Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 129
-rw-r--r--  mm/Makefile | 6
-rw-r--r--  mm/backing-dev.c | 9
-rw-r--r--  mm/balloon_compaction.c | 27
-rw-r--r--  mm/cma.c | 388
-rw-r--r--  mm/cma.h | 6
-rw-r--r--  mm/cma_debug.c | 10
-rw-r--r--  mm/compaction.c | 70
-rw-r--r--  mm/damon/Kconfig | 19
-rw-r--r--  mm/damon/Makefile | 1
-rw-r--r--  mm/damon/core.c | 397
-rw-r--r--  mm/damon/lru_sort.c | 127
-rw-r--r--  mm/damon/modules-common.c | 2
-rw-r--r--  mm/damon/modules-common.h | 2
-rw-r--r--  mm/damon/ops-common.c | 285
-rw-r--r--  mm/damon/ops-common.h | 9
-rw-r--r--  mm/damon/paddr.c | 398
-rw-r--r--  mm/damon/reclaim.c | 123
-rw-r--r--  mm/damon/stat.c | 272
-rw-r--r--  mm/damon/sysfs-common.c | 2
-rw-r--r--  mm/damon/sysfs-common.h | 2
-rw-r--r--  mm/damon/sysfs-schemes.c | 546
-rw-r--r--  mm/damon/sysfs.c | 240
-rw-r--r--  mm/damon/tests/core-kunit.h | 112
-rw-r--r--  mm/damon/tests/vaddr-kunit.h | 4
-rw-r--r--  mm/damon/vaddr.c | 360
-rw-r--r--  mm/debug.c | 54
-rw-r--r--  mm/debug_page_alloc.c | 2
-rw-r--r--  mm/debug_vm_pgtable.c | 140
-rw-r--r--  mm/dmapool.c | 19
-rw-r--r--  mm/execmem.c | 243
-rw-r--r--  mm/filemap.c | 263
-rw-r--r--  mm/gup.c | 544
-rw-r--r--  mm/highmem.c | 10
-rw-r--r--  mm/hmm.c | 347
-rw-r--r--  mm/huge_memory.c | 759
-rw-r--r--  mm/hugetlb.c | 643
-rw-r--r--  mm/hugetlb_cma.c | 14
-rw-r--r--  mm/hugetlb_cma.h | 6
-rw-r--r--  mm/hugetlb_vmemmap.c | 2
-rw-r--r--  mm/hwpoison-inject.c | 91
-rw-r--r--  mm/internal.h | 185
-rw-r--r--  mm/io-mapping.c | 29
-rw-r--r--  mm/kasan/Makefile | 3
-rw-r--r--  mm/kasan/common.c | 47
-rw-r--r--  mm/kasan/generic.c | 19
-rw-r--r--  mm/kasan/hw_tags.c | 54
-rw-r--r--  mm/kasan/init.c | 16
-rw-r--r--  mm/kasan/kasan.h | 15
-rw-r--r--  mm/kasan/kasan_test_c.c | 257
-rw-r--r--  mm/kasan/report.c | 47
-rw-r--r--  mm/kasan/shadow.c | 145
-rw-r--r--  mm/kasan/sw_tags.c | 1
-rw-r--r--  mm/kasan/tags.c | 2
-rw-r--r--  mm/kfence/core.c | 16
-rw-r--r--  mm/khugepaged.c | 323
-rw-r--r--  mm/kmemleak.c | 60
-rw-r--r--  mm/kmsan/core.c | 22
-rw-r--r--  mm/kmsan/hooks.c | 18
-rw-r--r--  mm/kmsan/init.c | 3
-rw-r--r--  mm/kmsan/instrumentation.c | 4
-rw-r--r--  mm/kmsan/kmsan.h | 1
-rw-r--r--  mm/kmsan/kmsan_test.c | 17
-rw-r--r--  mm/kmsan/report.c | 6
-rw-r--r--  mm/kmsan/shadow.c | 7
-rw-r--r--  mm/ksm.c | 129
-rw-r--r--  mm/list_lru.c | 34
-rw-r--r--  mm/maccess.c | 3
-rw-r--r--  mm/madvise.c | 937
-rw-r--r--  mm/mapping_dirty_helpers.c | 6
-rw-r--r--  mm/memblock.c | 402
-rw-r--r--  mm/memcontrol-v1.c | 17
-rw-r--r--  mm/memcontrol.c | 904
-rw-r--r--  mm/memfd.c | 43
-rw-r--r--  mm/memory-failure.c | 183
-rw-r--r--  mm/memory-tiers.c | 31
-rw-r--r--  mm/memory.c | 1006
-rw-r--r--  mm/memory_hotplug.c | 249
-rw-r--r--  mm/mempolicy.c | 547
-rw-r--r--  mm/mempool.c | 40
-rw-r--r--  mm/memremap.c | 67
-rw-r--r--  mm/migrate.c | 530
-rw-r--r--  mm/migrate_device.c | 4
-rw-r--r--  mm/mincore.c | 93
-rw-r--r--  mm/mlock.c | 10
-rw-r--r--  mm/mm_init.c | 294
-rw-r--r--  mm/mmap.c | 338
-rw-r--r--  mm/mmap_lock.c | 453
-rw-r--r--  mm/mmu_gather.c | 5
-rw-r--r--  mm/mmu_notifier.c | 2
-rw-r--r--  mm/mmzone.c | 4
-rw-r--r--  mm/mprotect.c | 310
-rw-r--r--  mm/mremap.c | 687
-rw-r--r--  mm/mseal.c | 166
-rw-r--r--  mm/nommu.c | 75
-rw-r--r--  mm/numa.c | 4
-rw-r--r--  mm/numa_emulation.c | 4
-rw-r--r--  mm/numa_memblks.c | 28
-rw-r--r--  mm/oom_kill.c | 52
-rw-r--r--  mm/page-writeback.c | 134
-rw-r--r--  mm/page_alloc.c | 658
-rw-r--r--  mm/page_ext.c | 17
-rw-r--r--  mm/page_idle.c | 2
-rw-r--r--  mm/page_io.c | 84
-rw-r--r--  mm/page_isolation.c | 112
-rw-r--r--  mm/page_owner.c | 6
-rw-r--r--  mm/page_table_check.c | 34
-rw-r--r--  mm/page_vma_mapped.c | 6
-rw-r--r--  mm/pagewalk.c | 122
-rw-r--r--  mm/percpu-km.c | 2
-rw-r--r--  mm/percpu-stats.c | 1
-rw-r--r--  mm/percpu.c | 28
-rw-r--r--  mm/pgtable-generic.c | 7
-rw-r--r--  mm/ptdump.c | 67
-rw-r--r--  mm/readahead.c | 64
-rw-r--r--  mm/rmap.c | 335
-rw-r--r--  mm/secretmem.c | 59
-rw-r--r--  mm/shmem.c | 584
-rw-r--r--  mm/show_mem.c | 41
-rw-r--r--  mm/slab.h | 52
-rw-r--r--  mm/slab_common.c | 39
-rw-r--r--  mm/slub.c | 2682
-rw-r--r--  mm/sparse-vmemmap.c | 11
-rw-r--r--  mm/sparse.c | 21
-rw-r--r--  mm/swap.c | 104
-rw-r--r--  mm/swap.h | 367
-rw-r--r--  mm/swap_state.c | 497
-rw-r--r--  mm/swap_table.h | 130
-rw-r--r--  mm/swapfile.c | 808
-rw-r--r--  mm/truncate.c | 22
-rw-r--r--  mm/userfaultfd.c | 378
-rw-r--r--  mm/util.c | 243
-rw-r--r--  mm/vma.c | 411
-rw-r--r--  mm/vma.h | 107
-rw-r--r--  mm/vma_exec.c | 161
-rw-r--r--  mm/vma_init.c | 152
-rw-r--r--  mm/vmalloc.c | 341
-rw-r--r--  mm/vmpressure.c | 2
-rw-r--r--  mm/vmscan.c | 679
-rw-r--r--  mm/vmstat.c | 461
-rw-r--r--  mm/workingset.c | 6
-rw-r--r--  mm/zpdesc.h | 36
-rw-r--r--  mm/zpool.c | 326
-rw-r--r--  mm/zsmalloc.c | 151
-rw-r--r--  mm/zswap.c | 274
145 files changed, 16982 insertions, 9479 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index e113f713b493..0e26f4fc8717 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -9,9 +9,6 @@ menu "Memory Management options"
config ARCH_NO_SWAP
bool
-config ZPOOL
- bool
-
menuconfig SWAP
bool "Support for paging of anonymous memory (swap)"
depends on MMU && BLOCK && !ARCH_NO_SWAP
@@ -26,7 +23,7 @@ config ZSWAP
bool "Compressed cache for swap pages"
depends on SWAP
select CRYPTO
- select ZPOOL
+ select ZSMALLOC
help
A lightweight compressed cache for swap pages. It takes
pages that are in the process of being swapped out and attempts to
@@ -125,45 +122,18 @@ config ZSWAP_COMPRESSOR_DEFAULT
default "zstd" if ZSWAP_COMPRESSOR_DEFAULT_ZSTD
default ""
-choice
- prompt "Default allocator"
- depends on ZSWAP
- default ZSWAP_ZPOOL_DEFAULT_ZSMALLOC if MMU
- help
- Selects the default allocator for the compressed cache for
- swap pages.
- The default is 'zbud' for compatibility, however please do
- read the description of each of the allocators below before
- making a right choice.
-
- The selection made here can be overridden by using the kernel
- command line 'zswap.zpool=' option.
+config ZSMALLOC
+ tristate
-config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
- bool "zsmalloc"
- select ZSMALLOC
- help
- Use the zsmalloc allocator as the default allocator.
-endchoice
+if ZSMALLOC
-config ZSWAP_ZPOOL_DEFAULT
- string
- depends on ZSWAP
- default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
- default ""
+menu "Zsmalloc allocator options"
+ depends on ZSMALLOC
-config ZSMALLOC
- tristate
- prompt "N:1 compression allocator (zsmalloc)" if (ZSWAP || ZRAM)
- depends on MMU
- help
- zsmalloc is a slab-based memory allocator designed to store
- pages of various compression levels efficiently. It achieves
- the highest storage density with the least amount of fragmentation.
+comment "Zsmalloc is a common backend allocator for zswap & zram"
config ZSMALLOC_STAT
bool "Export zsmalloc statistics"
- depends on ZSMALLOC
select DEBUG_FS
help
This option enables code in the zsmalloc to collect various
@@ -175,7 +145,6 @@ config ZSMALLOC_CHAIN_SIZE
int "Maximum number of physical pages per-zspage"
default 8
range 4 16
- depends on ZSMALLOC
help
This option sets the upper limit on the number of physical pages
that a zmalloc page (zspage) can consist of. The optimal zspage
@@ -190,10 +159,15 @@ config ZSMALLOC_CHAIN_SIZE
For more information, see zsmalloc documentation.
+endmenu
+
+endif
+
menu "Slab allocator options"
config SLUB
def_bool y
+ select IRQ_WORK
config KVFREE_RCU_BATCHED
def_bool y
@@ -439,9 +413,8 @@ config SPARSEMEM_VMEMMAP_ENABLE
bool
config SPARSEMEM_VMEMMAP
- bool "Sparse Memory virtual memmap"
+ def_bool y
depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE
- default y
help
SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise
pfn_to_page and page_to_pfn operations. This is the most
@@ -469,6 +442,10 @@ config HAVE_GUP_FAST
depends on MMU
bool
+# Enable memblock support for scratch memory which is needed for kexec handover
+config MEMBLOCK_KHO_SCRATCH
+ bool
+
# Don't discard allocated memory used to track "memory" and "reserved" memblocks
# after early boot, so it can still be used to test for validity of memory.
# Also, memblocks are updated with memory hot(un)plug.
@@ -772,7 +749,6 @@ config MEMORY_FAILURE
depends on MMU
depends on ARCH_SUPPORTS_MEMORY_FAILURE
bool "Enable recovery from hardware memory errors"
- select MEMORY_ISOLATION
select RAS
help
Enables code to recover from some memory failures on systems
@@ -819,6 +795,22 @@ config ARCH_WANT_GENERAL_HUGETLB
config ARCH_WANTS_THP_SWAP
def_bool n
+config PERSISTENT_HUGE_ZERO_FOLIO
+ bool "Allocate a PMD sized folio for zeroing"
+ depends on TRANSPARENT_HUGEPAGE
+ help
+ Enable this option to reduce the runtime refcounting overhead
+ of the huge zero folio and expand the places in the kernel
+ that can use huge zero folios. For instance, block I/O benefits
+ from access to large folios for zeroing memory.
+
+ With this option enabled, the huge zero folio is allocated
+ once and never freed. One full huge page's worth of memory shall
+ be used.
+
+ Say Y if your system has lots of memory. Say N if you are
+ memory constrained.
+
config MM_ID
def_bool n
@@ -882,7 +874,7 @@ config THP_SWAP
config READ_ONLY_THP_FOR_FS
bool "Read-only THP for filesystems (EXPERIMENTAL)"
- depends on TRANSPARENT_HUGEPAGE && SHMEM
+ depends on TRANSPARENT_HUGEPAGE
help
Allow khugepaged to put read-only file-backed pages in THP.
@@ -930,6 +922,13 @@ config ARCH_SUPPORTS_PUD_PFNMAP
depends on ARCH_SUPPORTS_HUGE_PFNMAP && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
#
+# Architectures that always use weak definitions for percpu
+# variables in modules should set this.
+#
+config ARCH_MODULE_NEEDS_WEAK_PER_CPU
+ bool
+
+#
# UP and nommu archs use km based percpu allocator
#
config NEED_PER_CPU_KM
@@ -989,6 +988,41 @@ config CMA_AREAS
If unsure, leave the default value "8" in UMA and "20" in NUMA.
+#
+# Select this config option from the architecture Kconfig, if available, to set
+# the max page order for physically contiguous allocations.
+#
+config ARCH_FORCE_MAX_ORDER
+ int
+
+#
+# When ARCH_FORCE_MAX_ORDER is not defined,
+# the default page block order is MAX_PAGE_ORDER (10) as per
+# include/linux/mmzone.h.
+#
+config PAGE_BLOCK_MAX_ORDER
+ int "Page Block Order Upper Limit"
+ range 1 10 if ARCH_FORCE_MAX_ORDER = 0
+ default 10 if ARCH_FORCE_MAX_ORDER = 0
+ range 1 ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0
+ default ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0
+ help
+ The page block order refers to the power of two number of pages that
+ are physically contiguous and can have a migrate type associated to
+ them. The maximum size of the page block order is at least limited by
+ ARCH_FORCE_MAX_ORDER/MAX_PAGE_ORDER.
+
+ This config adds a new upper limit of default page block
+ order when the page block order is required to be smaller than
+ ARCH_FORCE_MAX_ORDER/MAX_PAGE_ORDER or other limits
+ (see include/linux/pageblock-flags.h for details).
+
+ Reducing pageblock order can negatively impact THP generation
+ success rate. If your workloads use THP heavily, please use this
+ option with caution.
+
+ Don't change if unsure.
+
config MEM_SOFT_DIRTY
bool "Track memory changes"
depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS
@@ -1071,9 +1105,6 @@ config ARCH_HAS_CURRENT_STACK_POINTER
register alias named "current_stack_pointer", this config can be
selected.
-config ARCH_HAS_PTE_DEVMAP
- bool
-
config ARCH_HAS_ZONE_DMA_SET
bool
@@ -1091,7 +1122,6 @@ config ZONE_DEVICE
depends on MEMORY_HOTPLUG
depends on MEMORY_HOTREMOVE
depends on SPARSEMEM_VMEMMAP
- depends on ARCH_HAS_PTE_DEVMAP
select XARRAY_MULTI
help
@@ -1200,10 +1230,6 @@ config KMAP_LOCAL
config KMAP_LOCAL_NON_LINEAR_PTE_ARRAY
bool
-# struct io_mapping based helper. Selected by drivers that need them
-config IO_MAPPING
- bool
-
config MEMFD_CREATE
bool "Enable memfd_create() system call" if EXPERT
@@ -1317,6 +1343,7 @@ config NUMA_MEMBLKS
config NUMA_EMU
bool "NUMA emulation"
depends on NUMA_MEMBLKS
+ depends on X86 || GENERIC_ARCH_NUMA
help
Enable NUMA emulation. A flat machine will be split
into virtual nodes when booted with "numa=fake=N", where N is the
@@ -1342,6 +1369,8 @@ config PT_RECLAIM
Note: now only empty user PTE page table pages will be reclaimed.
+config FIND_NORMAL_PAGE
+ def_bool n
source "mm/damon/Kconfig"
diff --git a/mm/Makefile b/mm/Makefile
index e7f6bbf8ae5f..21abb3353550 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -37,7 +37,7 @@ mmu-y := nommu.o
mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
msync.o page_vma_mapped.o pagewalk.o \
- pgtable-generic.o rmap.o vmalloc.o vma.o
+ pgtable-generic.o rmap.o vmalloc.o vma.o vma_exec.o
ifdef CONFIG_CROSS_MEMORY_ATTACH
@@ -55,7 +55,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
mm_init.o percpu.o slab_common.o \
compaction.o show_mem.o \
interval_tree.o list_lru.o workingset.o \
- debug.o gup.o mmap_lock.o $(mmu-y)
+ debug.o gup.o mmap_lock.o vma_init.o $(mmu-y)
# Give 'page_alloc' its own module-parameter namespace
page-alloc-y := page_alloc.o
@@ -115,7 +115,6 @@ obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o
obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
obj-$(CONFIG_PAGE_OWNER) += page_owner.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
-obj-$(CONFIG_ZPOOL) += zpool.o
obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
obj-$(CONFIG_CMA) += cma.o
@@ -141,7 +140,6 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o
obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
obj-$(CONFIG_PTDUMP) += ptdump.o
obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
-obj-$(CONFIG_IO_MAPPING) += io-mapping.o
obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 783904d8c5ef..41b6c9386b69 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -510,7 +510,7 @@ static void wb_update_bandwidth_workfn(struct work_struct *work)
/*
* Initial write bandwidth: 100 MB/s
*/
-#define INIT_BW (100 << (20 - PAGE_SHIFT))
+#define INIT_BW MB_TO_PAGES(100)
static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
gfp_t gfp)
@@ -633,6 +633,7 @@ static void cgwb_release_workfn(struct work_struct *work)
wb_exit(wb);
bdi_put(bdi);
WARN_ON_ONCE(!list_empty(&wb->b_attached));
+ WARN_ON_ONCE(work_pending(&wb->switch_work));
call_rcu(&wb->rcu, cgwb_free_rcu);
}
@@ -709,6 +710,8 @@ static int cgwb_create(struct backing_dev_info *bdi,
wb->memcg_css = memcg_css;
wb->blkcg_css = blkcg_css;
INIT_LIST_HEAD(&wb->b_attached);
+ INIT_WORK(&wb->switch_work, inode_switch_wbs_work_fn);
+ init_llist_head(&wb->switch_wbs_ctxs);
INIT_WORK(&wb->release_work, cgwb_release_workfn);
set_bit(WB_registered, &wb->state);
bdi_get(bdi);
@@ -839,6 +842,8 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
if (!ret) {
bdi->wb.memcg_css = &root_mem_cgroup->css;
bdi->wb.blkcg_css = blkcg_root_css;
+ INIT_WORK(&bdi->wb.switch_work, inode_switch_wbs_work_fn);
+ init_llist_head(&bdi->wb.switch_wbs_ctxs);
}
return ret;
}
@@ -1026,7 +1031,7 @@ struct backing_dev_info *bdi_alloc(int node_id)
kfree(bdi);
return NULL;
}
- bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT;
+ bdi->capabilities = BDI_CAP_WRITEBACK;
bdi->ra_pages = VM_READAHEAD_PAGES;
bdi->io_pages = VM_READAHEAD_PAGES;
timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0);
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index d3e00731e262..03c5dbabb156 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -94,13 +94,8 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
if (!trylock_page(page))
continue;
- if (IS_ENABLED(CONFIG_BALLOON_COMPACTION) &&
- PageIsolated(page)) {
- /* raced with isolation */
- unlock_page(page);
- continue;
- }
- balloon_page_delete(page);
+ list_del(&page->lru);
+ balloon_page_finalize(page);
__count_vm_event(BALLOON_DEFLATE);
list_add(&page->lru, pages);
unlock_page(page);
@@ -211,6 +206,9 @@ static bool balloon_page_isolate(struct page *page, isolate_mode_t mode)
struct balloon_dev_info *b_dev_info = balloon_page_device(page);
unsigned long flags;
+ if (!b_dev_info)
+ return false;
+
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
list_del(&page->lru);
b_dev_info->isolated_pages++;
@@ -224,6 +222,10 @@ static void balloon_page_putback(struct page *page)
struct balloon_dev_info *b_dev_info = balloon_page_device(page);
unsigned long flags;
+ /* Isolated balloon pages cannot get deflated. */
+ if (WARN_ON_ONCE(!b_dev_info))
+ return;
+
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
list_add(&page->lru, &b_dev_info->pages);
b_dev_info->isolated_pages--;
@@ -239,6 +241,10 @@ static int balloon_page_migrate(struct page *newpage, struct page *page,
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
+ /* Isolated balloon pages cannot get deflated. */
+ if (WARN_ON_ONCE(!balloon))
+ return -EAGAIN;
+
return balloon->migratepage(balloon, newpage, page, mode);
}
@@ -247,6 +253,11 @@ const struct movable_operations balloon_mops = {
.isolate_page = balloon_page_isolate,
.putback_page = balloon_page_putback,
};
-EXPORT_SYMBOL_GPL(balloon_mops);
+
+static int __init balloon_init(void)
+{
+ return set_movable_ops(&balloon_mops, PGTY_offline);
+}
+core_initcall(balloon_init);
#endif /* CONFIG_BALLOON_COMPACTION */
diff --git a/mm/cma.c b/mm/cma.c
index 15632939f20a..813e6dc7b095 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -22,6 +22,7 @@
#include <linux/mm.h>
#include <linux/sizes.h>
#include <linux/slab.h>
+#include <linux/string_choices.h>
#include <linux/log2.h>
#include <linux/cma.h>
#include <linux/highmem.h>
@@ -35,12 +36,6 @@
struct cma cma_areas[MAX_CMA_AREAS];
unsigned int cma_area_count;
-static int __init __cma_declare_contiguous_nid(phys_addr_t *basep,
- phys_addr_t size, phys_addr_t limit,
- phys_addr_t alignment, unsigned int order_per_bit,
- bool fixed, const char *name, struct cma **res_cma,
- int nid);
-
phys_addr_t cma_get_base(const struct cma *cma)
{
WARN_ON_ONCE(cma->nranges != 1);
@@ -143,13 +138,14 @@ bool cma_validate_zones(struct cma *cma)
static void __init cma_activate_area(struct cma *cma)
{
- unsigned long pfn, end_pfn;
+ unsigned long pfn, end_pfn, early_pfn[CMA_MAX_RANGES];
int allocrange, r;
struct cma_memrange *cmr;
unsigned long bitmap_count, count;
for (allocrange = 0; allocrange < cma->nranges; allocrange++) {
cmr = &cma->ranges[allocrange];
+ early_pfn[allocrange] = cmr->early_pfn;
cmr->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma, cmr),
GFP_KERNEL);
if (!cmr->bitmap)
@@ -161,13 +157,13 @@ static void __init cma_activate_area(struct cma *cma)
for (r = 0; r < cma->nranges; r++) {
cmr = &cma->ranges[r];
- if (cmr->early_pfn != cmr->base_pfn) {
- count = cmr->early_pfn - cmr->base_pfn;
+ if (early_pfn[r] != cmr->base_pfn) {
+ count = early_pfn[r] - cmr->base_pfn;
bitmap_count = cma_bitmap_pages_to_bits(cma, count);
bitmap_set(cmr->bitmap, 0, bitmap_count);
}
- for (pfn = cmr->early_pfn; pfn < cmr->base_pfn + cmr->count;
+ for (pfn = early_pfn[r]; pfn < cmr->base_pfn + cmr->count;
pfn += pageblock_nr_pages)
init_cma_reserved_pageblock(pfn_to_page(pfn));
}
@@ -193,7 +189,7 @@ cleanup:
for (r = 0; r < allocrange; r++) {
cmr = &cma->ranges[r];
end_pfn = cmr->base_pfn + cmr->count;
- for (pfn = cmr->early_pfn; pfn < end_pfn; pfn++)
+ for (pfn = early_pfn[r]; pfn < end_pfn; pfn++)
free_reserved_page(pfn_to_page(pfn));
}
}
@@ -357,6 +353,168 @@ static void __init list_insert_sorted(
}
}
+static int __init cma_fixed_reserve(phys_addr_t base, phys_addr_t size)
+{
+ if (IS_ENABLED(CONFIG_HIGHMEM)) {
+ phys_addr_t highmem_start = __pa(high_memory - 1) + 1;
+
+ /*
+ * If allocating at a fixed base the request region must not
+ * cross the low/high memory boundary.
+ */
+ if (base < highmem_start && base + size > highmem_start) {
+ pr_err("Region at %pa defined on low/high memory boundary (%pa)\n",
+ &base, &highmem_start);
+ return -EINVAL;
+ }
+ }
+
+ if (memblock_is_region_reserved(base, size) ||
+ memblock_reserve(base, size) < 0) {
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+static phys_addr_t __init cma_alloc_mem(phys_addr_t base, phys_addr_t size,
+ phys_addr_t align, phys_addr_t limit, int nid)
+{
+ phys_addr_t addr = 0;
+
+ /*
+ * If there is enough memory, try a bottom-up allocation first.
+ * It will place the new cma area close to the start of the node
+ * and guarantee that the compaction is moving pages out of the
+ * cma area and not into it.
+ * Avoid using first 4GB to not interfere with constrained zones
+ * like DMA/DMA32.
+ */
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+ if (!memblock_bottom_up() && limit >= SZ_4G + size) {
+ memblock_set_bottom_up(true);
+ addr = memblock_alloc_range_nid(size, align, SZ_4G, limit,
+ nid, true);
+ memblock_set_bottom_up(false);
+ }
+#endif
+
+ /*
+ * On systems with HIGHMEM try allocating from there before consuming
+ * memory in lower zones.
+ */
+ if (!addr && IS_ENABLED(CONFIG_HIGHMEM)) {
+ phys_addr_t highmem = __pa(high_memory - 1) + 1;
+
+ /*
+ * All pages in the reserved area must come from the same zone.
+ * If the requested region crosses the low/high memory boundary,
+ * try allocating from high memory first and fall back to low
+ * memory in case of failure.
+ */
+ if (base < highmem && limit > highmem) {
+ addr = memblock_alloc_range_nid(size, align, highmem,
+ limit, nid, true);
+ limit = highmem;
+ }
+ }
+
+ if (!addr)
+ addr = memblock_alloc_range_nid(size, align, base, limit, nid,
+ true);
+
+ return addr;
+}
+
+static int __init __cma_declare_contiguous_nid(phys_addr_t *basep,
+ phys_addr_t size, phys_addr_t limit,
+ phys_addr_t alignment, unsigned int order_per_bit,
+ bool fixed, const char *name, struct cma **res_cma,
+ int nid)
+{
+ phys_addr_t memblock_end = memblock_end_of_DRAM();
+ phys_addr_t base = *basep;
+ int ret;
+
+ pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n",
+ __func__, &size, &base, &limit, &alignment);
+
+ if (cma_area_count == ARRAY_SIZE(cma_areas)) {
+ pr_err("Not enough slots for CMA reserved regions!\n");
+ return -ENOSPC;
+ }
+
+ if (!size)
+ return -EINVAL;
+
+ if (alignment && !is_power_of_2(alignment))
+ return -EINVAL;
+
+ if (!IS_ENABLED(CONFIG_NUMA))
+ nid = NUMA_NO_NODE;
+
+ /* Sanitise input arguments. */
+ alignment = max_t(phys_addr_t, alignment, CMA_MIN_ALIGNMENT_BYTES);
+ if (fixed && base & (alignment - 1)) {
+ pr_err("Region at %pa must be aligned to %pa bytes\n",
+ &base, &alignment);
+ return -EINVAL;
+ }
+ base = ALIGN(base, alignment);
+ size = ALIGN(size, alignment);
+ limit &= ~(alignment - 1);
+
+ if (!base)
+ fixed = false;
+
+ /* size should be aligned with order_per_bit */
+ if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit))
+ return -EINVAL;
+
+
+ /*
+ * If the limit is unspecified or above the memblock end, its effective
+ * value will be the memblock end. Set it explicitly to simplify further
+ * checks.
+ */
+ if (limit == 0 || limit > memblock_end)
+ limit = memblock_end;
+
+ if (base + size > limit) {
+ pr_err("Size (%pa) of region at %pa exceeds limit (%pa)\n",
+ &size, &base, &limit);
+ return -EINVAL;
+ }
+
+ /* Reserve memory */
+ if (fixed) {
+ ret = cma_fixed_reserve(base, size);
+ if (ret)
+ return ret;
+ } else {
+ base = cma_alloc_mem(base, size, alignment, limit, nid);
+ if (!base)
+ return -ENOMEM;
+
+ /*
+ * kmemleak scans/reads tracked objects for pointers to other
+ * objects but this address isn't mapped and accessible
+ */
+ kmemleak_ignore_phys(base);
+ }
+
+ ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma);
+ if (ret) {
+ memblock_phys_free(base, size);
+ return ret;
+ }
+
+ (*res_cma)->nid = nid;
+ *basep = base;
+
+ return 0;
+}
+
/*
* Create CMA areas with a total size of @total_size. A normal allocation
* for one area is tried first. If that fails, the biggest memblock
@@ -547,8 +705,7 @@ out:
(unsigned long)total_size / SZ_1M);
else
pr_info("Reserved %lu MiB in %d range%s\n",
- (unsigned long)total_size / SZ_1M, nr,
- nr > 1 ? "s" : "");
+ (unsigned long)total_size / SZ_1M, nr, str_plural(nr));
return ret;
}
@@ -592,151 +749,9 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
return ret;
}
-static int __init __cma_declare_contiguous_nid(phys_addr_t *basep,
- phys_addr_t size, phys_addr_t limit,
- phys_addr_t alignment, unsigned int order_per_bit,
- bool fixed, const char *name, struct cma **res_cma,
- int nid)
-{
- phys_addr_t memblock_end = memblock_end_of_DRAM();
- phys_addr_t highmem_start, base = *basep;
- int ret;
-
- /*
- * We can't use __pa(high_memory) directly, since high_memory
- * isn't a valid direct map VA, and DEBUG_VIRTUAL will (validly)
- * complain. Find the boundary by adding one to the last valid
- * address.
- */
- highmem_start = __pa(high_memory - 1) + 1;
- pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n",
- __func__, &size, &base, &limit, &alignment);
-
- if (cma_area_count == ARRAY_SIZE(cma_areas)) {
- pr_err("Not enough slots for CMA reserved regions!\n");
- return -ENOSPC;
- }
-
- if (!size)
- return -EINVAL;
-
- if (alignment && !is_power_of_2(alignment))
- return -EINVAL;
-
- if (!IS_ENABLED(CONFIG_NUMA))
- nid = NUMA_NO_NODE;
-
- /* Sanitise input arguments. */
- alignment = max_t(phys_addr_t, alignment, CMA_MIN_ALIGNMENT_BYTES);
- if (fixed && base & (alignment - 1)) {
- pr_err("Region at %pa must be aligned to %pa bytes\n",
- &base, &alignment);
- return -EINVAL;
- }
- base = ALIGN(base, alignment);
- size = ALIGN(size, alignment);
- limit &= ~(alignment - 1);
-
- if (!base)
- fixed = false;
-
- /* size should be aligned with order_per_bit */
- if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit))
- return -EINVAL;
-
- /*
- * If allocating at a fixed base the request region must not cross the
- * low/high memory boundary.
- */
- if (fixed && base < highmem_start && base + size > highmem_start) {
- pr_err("Region at %pa defined on low/high memory boundary (%pa)\n",
- &base, &highmem_start);
- return -EINVAL;
- }
-
- /*
- * If the limit is unspecified or above the memblock end, its effective
- * value will be the memblock end. Set it explicitly to simplify further
- * checks.
- */
- if (limit == 0 || limit > memblock_end)
- limit = memblock_end;
-
- if (base + size > limit) {
- pr_err("Size (%pa) of region at %pa exceeds limit (%pa)\n",
- &size, &base, &limit);
- return -EINVAL;
- }
-
- /* Reserve memory */
- if (fixed) {
- if (memblock_is_region_reserved(base, size) ||
- memblock_reserve(base, size) < 0) {
- return -EBUSY;
- }
- } else {
- phys_addr_t addr = 0;
-
- /*
- * If there is enough memory, try a bottom-up allocation first.
- * It will place the new cma area close to the start of the node
- * and guarantee that the compaction is moving pages out of the
- * cma area and not into it.
- * Avoid using first 4GB to not interfere with constrained zones
- * like DMA/DMA32.
- */
-#ifdef CONFIG_PHYS_ADDR_T_64BIT
- if (!memblock_bottom_up() && memblock_end >= SZ_4G + size) {
- memblock_set_bottom_up(true);
- addr = memblock_alloc_range_nid(size, alignment, SZ_4G,
- limit, nid, true);
- memblock_set_bottom_up(false);
- }
-#endif
-
- /*
- * All pages in the reserved area must come from the same zone.
- * If the requested region crosses the low/high memory boundary,
- * try allocating from high memory first and fall back to low
- * memory in case of failure.
- */
- if (!addr && base < highmem_start && limit > highmem_start) {
- addr = memblock_alloc_range_nid(size, alignment,
- highmem_start, limit, nid, true);
- limit = highmem_start;
- }
-
- if (!addr) {
- addr = memblock_alloc_range_nid(size, alignment, base,
- limit, nid, true);
- if (!addr)
- return -ENOMEM;
- }
-
- /*
- * kmemleak scans/reads tracked objects for pointers to other
- * objects but this address isn't mapped and accessible
- */
- kmemleak_ignore_phys(addr);
- base = addr;
- }
-
- ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma);
- if (ret) {
- memblock_phys_free(base, size);
- return ret;
- }
-
- (*res_cma)->nid = nid;
- *basep = base;
-
- return 0;
-}
-
static void cma_debug_show_areas(struct cma *cma)
{
- unsigned long next_zero_bit, next_set_bit, nr_zero;
- unsigned long start;
+ unsigned long start, end;
unsigned long nr_part;
unsigned long nbits;
int r;
@@ -747,22 +762,12 @@ static void cma_debug_show_areas(struct cma *cma)
for (r = 0; r < cma->nranges; r++) {
cmr = &cma->ranges[r];
- start = 0;
nbits = cma_bitmap_maxno(cma, cmr);
pr_info("range %d: ", r);
- for (;;) {
- next_zero_bit = find_next_zero_bit(cmr->bitmap,
- nbits, start);
- if (next_zero_bit >= nbits)
- break;
- next_set_bit = find_next_bit(cmr->bitmap, nbits,
- next_zero_bit);
- nr_zero = next_set_bit - next_zero_bit;
- nr_part = nr_zero << cma->order_per_bit;
- pr_cont("%s%lu@%lu", start ? "+" : "", nr_part,
- next_zero_bit);
- start = next_zero_bit + nr_zero;
+ for_each_clear_bitrange(start, end, cmr->bitmap, nbits) {
+ nr_part = (end - start) << cma->order_per_bit;
+ pr_cont("%s%lu@%lu", start ? "+" : "", nr_part, start);
}
pr_info("\n");
}
@@ -775,10 +780,8 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr,
unsigned long count, unsigned int align,
struct page **pagep, gfp_t gfp)
{
- unsigned long mask, offset;
- unsigned long pfn = -1;
- unsigned long start = 0;
unsigned long bitmap_maxno, bitmap_no, bitmap_count;
+ unsigned long start, pfn, mask, offset;
int ret = -EBUSY;
struct page *page = NULL;
@@ -790,7 +793,7 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr,
if (bitmap_count > bitmap_maxno)
goto out;
- for (;;) {
+ for (start = 0; ; start = bitmap_no + mask + 1) {
spin_lock_irq(&cma->lock);
/*
* If the request is larger than the available number
@@ -807,6 +810,22 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr,
spin_unlock_irq(&cma->lock);
break;
}
+
+ pfn = cmr->base_pfn + (bitmap_no << cma->order_per_bit);
+ page = pfn_to_page(pfn);
+
+ /*
+ * Do not hand out page ranges that are not contiguous, so
+ * callers can just iterate the pages without having to worry
+ * about these corner cases.
+ */
+ if (!page_range_contiguous(page, count)) {
+ spin_unlock_irq(&cma->lock);
+ pr_warn_ratelimited("%s: %s: skipping incompatible area [0x%lx-0x%lx]",
+ __func__, cma->name, pfn, pfn + count - 1);
+ continue;
+ }
+
bitmap_set(cmr->bitmap, bitmap_no, bitmap_count);
cma->available_count -= count;
/*
@@ -816,29 +835,24 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr,
*/
spin_unlock_irq(&cma->lock);
- pfn = cmr->base_pfn + (bitmap_no << cma->order_per_bit);
mutex_lock(&cma->alloc_mutex);
- ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, gfp);
+ ret = alloc_contig_range(pfn, pfn + count, ACR_FLAGS_CMA, gfp);
mutex_unlock(&cma->alloc_mutex);
- if (ret == 0) {
- page = pfn_to_page(pfn);
+ if (!ret)
break;
- }
cma_clear_bitmap(cma, cmr, pfn, count);
if (ret != -EBUSY)
break;
pr_debug("%s(): memory range at pfn 0x%lx %p is busy, retrying\n",
- __func__, pfn, pfn_to_page(pfn));
+ __func__, pfn, page);
- trace_cma_alloc_busy_retry(cma->name, pfn, pfn_to_page(pfn),
- count, align);
- /* try again with a bit different memory target */
- start = bitmap_no + mask + 1;
+ trace_cma_alloc_busy_retry(cma->name, pfn, page, count, align);
}
out:
- *pagep = page;
+ if (!ret)
+ *pagep = page;
return ret;
}
@@ -850,8 +864,6 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count,
unsigned long i;
const char *name = cma ? cma->name : NULL;
- trace_cma_alloc_start(name, count, align);
-
if (!cma || !cma->count)
return page;
@@ -861,6 +873,8 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count,
if (!count)
return page;
+ trace_cma_alloc_start(name, count, cma->available_count, cma->count, align);
+
for (r = 0; r < cma->nranges; r++) {
page = NULL;
@@ -877,7 +891,7 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count,
*/
if (page) {
for (i = 0; i < count; i++)
- page_kasan_tag_reset(nth_page(page, i));
+ page_kasan_tag_reset(page + i);
}
if (ret && !(gfp & __GFP_NOWARN)) {
diff --git a/mm/cma.h b/mm/cma.h
index 41a3ab0ec3de..c70180c36559 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -25,9 +25,11 @@ struct cma_kobject {
*/
struct cma_memrange {
unsigned long base_pfn;
- unsigned long early_pfn;
unsigned long count;
- unsigned long *bitmap;
+ union {
+ unsigned long early_pfn;
+ unsigned long *bitmap;
+ };
#ifdef CONFIG_CMA_DEBUGFS
struct debugfs_u32_array dfs_bitmap;
#endif
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index fdf899532ca0..8c7d7f8e8fbd 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -56,16 +56,8 @@ static int cma_maxchunk_get(void *data, u64 *val)
for (r = 0; r < cma->nranges; r++) {
cmr = &cma->ranges[r];
bitmap_maxno = cma_bitmap_maxno(cma, cmr);
- end = 0;
- for (;;) {
- start = find_next_zero_bit(cmr->bitmap,
- bitmap_maxno, end);
- if (start >= bitmap_maxno)
- break;
- end = find_next_bit(cmr->bitmap, bitmap_maxno,
- start);
+ for_each_clear_bitrange(start, end, cmr->bitmap, bitmap_maxno)
maxchunk = max(end - start, maxchunk);
- }
}
spin_unlock_irq(&cma->lock);
*val = (u64)maxchunk << cma->order_per_bit;
diff --git a/mm/compaction.c b/mm/compaction.c
index ca71fd3c3181..1e8f8eca318c 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -114,39 +114,6 @@ static unsigned long release_free_list(struct list_head *freepages)
}
#ifdef CONFIG_COMPACTION
-bool PageMovable(struct page *page)
-{
- const struct movable_operations *mops;
-
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- if (!__PageMovable(page))
- return false;
-
- mops = page_movable_ops(page);
- if (mops)
- return true;
-
- return false;
-}
-
-void __SetPageMovable(struct page *page, const struct movable_operations *mops)
-{
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE((unsigned long)mops & PAGE_MAPPING_MOVABLE, page);
- page->mapping = (void *)((unsigned long)mops | PAGE_MAPPING_MOVABLE);
-}
-EXPORT_SYMBOL(__SetPageMovable);
-
-void __ClearPageMovable(struct page *page)
-{
- VM_BUG_ON_PAGE(!PageMovable(page), page);
- /*
- * This page still has the type of a movable page, but it's
- * actually not movable any more.
- */
- page->mapping = (void *)PAGE_MAPPING_MOVABLE;
-}
-EXPORT_SYMBOL(__ClearPageMovable);
/* Do not skip compaction more than 64 times */
#define COMPACT_MAX_DEFER_SHIFT 6
@@ -1001,10 +968,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
locked = NULL;
}
- ret = isolate_or_dissolve_huge_page(page, &cc->migratepages);
+ folio = page_folio(page);
+ ret = isolate_or_dissolve_huge_folio(folio, &cc->migratepages);
/*
- * Fail isolation in case isolate_or_dissolve_huge_page()
+ * Fail isolation in case isolate_or_dissolve_huge_folio()
* reports an error. In case of -ENOMEM, abort right away.
*/
if (ret < 0) {
@@ -1016,13 +984,12 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
goto isolate_fail;
}
- if (PageHuge(page)) {
+ if (folio_test_hugetlb(folio)) {
/*
* Hugepage was successfully isolated and placed
* on the cc->migratepages list.
*/
- folio = page_folio(page);
- low_pfn += folio_nr_pages(folio) - 1;
+ low_pfn += folio_nr_pages(folio) - folio_page_idx(folio, page) - 1;
goto isolate_success_no_list;
}
@@ -1082,18 +1049,15 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* Skip any other type of page
*/
if (!PageLRU(page)) {
- /*
- * __PageMovable can return false positive so we need
- * to verify it under page_lock.
- */
- if (unlikely(__PageMovable(page)) &&
- !PageIsolated(page)) {
+ /* Isolation code will deal with any races. */
+ if (unlikely(page_has_movable_ops(page)) &&
+ !PageMovableOpsIsolated(page)) {
if (locked) {
unlock_page_lruvec_irqrestore(locked, flags);
locked = NULL;
}
- if (isolate_movable_page(page, mode)) {
+ if (isolate_movable_ops_page(page, mode)) {
folio = page_folio(page);
goto isolate_success;
}
@@ -2249,15 +2213,11 @@ static unsigned int fragmentation_score_node(pg_data_t *pgdat)
static unsigned int fragmentation_score_wmark(bool low)
{
- unsigned int wmark_low;
+ unsigned int wmark_low, leeway;
- /*
- * Cap the low watermark to avoid excessive compaction
- * activity in case a user sets the proactiveness tunable
- * close to 100 (maximum).
- */
- wmark_low = max(100U - sysctl_compaction_proactiveness, 5U);
- return low ? wmark_low : min(wmark_low + 10, 100U);
+ wmark_low = 100U - sysctl_compaction_proactiveness;
+ leeway = min(10U, wmark_low / 2);
+ return low ? wmark_low : min(wmark_low + leeway, 100U);
}
static bool should_proactive_compact_node(pg_data_t *pgdat)
@@ -2348,7 +2308,6 @@ static enum compact_result __compact_finished(struct compact_control *cc)
ret = COMPACT_NO_SUITABLE_PAGE;
for (order = cc->order; order < NR_PAGE_ORDERS; order++) {
struct free_area *area = &cc->zone->free_area[order];
- bool claim_block;
/* Job done if page is free of the right migratetype */
if (!free_area_empty(area, migratetype))
@@ -2364,8 +2323,7 @@ static enum compact_result __compact_finished(struct compact_control *cc)
* Job done if allocation would steal freepages from
* other migratetype buddy lists.
*/
- if (find_suitable_fallback(area, order, migratetype,
- true, &claim_block) != -1)
+ if (find_suitable_fallback(area, order, migratetype, true) >= 0)
/*
* Movable pages are OK in any pageblock. If we are
* stealing for a non-movable allocation, make sure
diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index c213cf8b5638..8c868f7035fc 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -28,6 +28,7 @@ config DAMON_VADDR
bool "Data access monitoring operations for virtual address spaces"
depends on DAMON && MMU
select PAGE_IDLE_FLAG
+ default DAMON
help
This builds the default data access monitoring operations for DAMON
that work for virtual address spaces.
@@ -36,6 +37,7 @@ config DAMON_PADDR
bool "Data access monitoring operations for the physical address space"
depends on DAMON && MMU
select PAGE_IDLE_FLAG
+ default DAMON
help
This builds the default data access monitoring operations for DAMON
that works for the physical address space.
@@ -55,6 +57,7 @@ config DAMON_VADDR_KUNIT_TEST
config DAMON_SYSFS
bool "DAMON sysfs interface"
depends on DAMON && SYSFS
+ default DAMON
help
This builds the sysfs interface for DAMON. The user space can use
the interface for arbitrary data access monitoring.
@@ -91,4 +94,20 @@ config DAMON_LRU_SORT
protect frequently accessed (hot) pages while rarely accessed (cold)
pages reclaimed first under memory pressure.
+config DAMON_STAT
+ bool "Build data access monitoring stat (DAMON_STAT)"
+ depends on DAMON_PADDR
+ help
+ This builds the DAMON-based access monitoring statistics subsystem.
+ It runs DAMON and expose access monitoring results in simple stat
+ metrics.
+
+config DAMON_STAT_ENABLED_DEFAULT
+ bool "Enable DAMON_STAT by default"
+ depends on DAMON_STAT
+ default DAMON_STAT
+ help
+ Whether to enable DAMON_STAT by default. Users can disable it in
+ boot or runtime using its 'enabled' parameter.
+
endmenu
diff --git a/mm/damon/Makefile b/mm/damon/Makefile
index 8b49012ba8c3..d8d6bf5f8bff 100644
--- a/mm/damon/Makefile
+++ b/mm/damon/Makefile
@@ -6,3 +6,4 @@ obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o
obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs-schemes.o sysfs.o
obj-$(CONFIG_DAMON_RECLAIM) += modules-common.o reclaim.o
obj-$(CONFIG_DAMON_LRU_SORT) += modules-common.o lru_sort.o
+obj-$(CONFIG_DAMON_STAT) += modules-common.o stat.o
diff --git a/mm/damon/core.c b/mm/damon/core.c
index f0c1676f0599..93848b4c6944 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -201,6 +201,7 @@ static int damon_fill_regions_holes(struct damon_region *first,
* @t: the given target.
* @ranges: array of new monitoring target ranges.
* @nr_ranges: length of @ranges.
+ * @min_sz_region: minimum region size.
*
* This function adds new regions to, or modify existing regions of a
* monitoring target to fit in specific ranges.
@@ -208,7 +209,7 @@ static int damon_fill_regions_holes(struct damon_region *first,
* Return: 0 if success, or negative error code otherwise.
*/
int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
- unsigned int nr_ranges)
+ unsigned int nr_ranges, unsigned long min_sz_region)
{
struct damon_region *r, *next;
unsigned int i;
@@ -245,16 +246,16 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
/* no region intersects with this range */
newr = damon_new_region(
ALIGN_DOWN(range->start,
- DAMON_MIN_REGION),
- ALIGN(range->end, DAMON_MIN_REGION));
+ min_sz_region),
+ ALIGN(range->end, min_sz_region));
if (!newr)
return -ENOMEM;
damon_insert_region(newr, damon_prev_region(r), r, t);
} else {
/* resize intersecting regions to fit in this range */
first->ar.start = ALIGN_DOWN(range->start,
- DAMON_MIN_REGION);
- last->ar.end = ALIGN(range->end, DAMON_MIN_REGION);
+ min_sz_region);
+ last->ar.end = ALIGN(range->end, min_sz_region);
/* fill possible holes in the range */
err = damon_fill_regions_holes(first, last, t);
@@ -407,6 +408,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
scheme->wmarks = *wmarks;
scheme->wmarks.activated = true;
+ scheme->migrate_dests = (struct damos_migrate_dests){};
scheme->target_nid = target_nid;
return scheme;
@@ -449,6 +451,9 @@ void damon_destroy_scheme(struct damos *s)
damos_for_each_filter_safe(f, next, s)
damos_destroy_filter(f);
+
+ kfree(s->migrate_dests.node_id_arr);
+ kfree(s->migrate_dests.weight_arr);
damon_del_scheme(s);
damon_free_scheme(s);
}
@@ -498,8 +503,12 @@ void damon_free_target(struct damon_target *t)
kfree(t);
}
-void damon_destroy_target(struct damon_target *t)
+void damon_destroy_target(struct damon_target *t, struct damon_ctx *ctx)
{
+
+ if (ctx && ctx->ops.cleanup_target)
+ ctx->ops.cleanup_target(t);
+
damon_del_target(t);
damon_free_target(t);
}
@@ -529,12 +538,16 @@ struct damon_ctx *damon_new_ctx(void)
ctx->next_ops_update_sis = 0;
mutex_init(&ctx->kdamond_lock);
- mutex_init(&ctx->call_control_lock);
+ INIT_LIST_HEAD(&ctx->call_controls);
+ mutex_init(&ctx->call_controls_lock);
mutex_init(&ctx->walk_control_lock);
ctx->attrs.min_nr_regions = 10;
ctx->attrs.max_nr_regions = 1000;
+ ctx->addr_unit = 1;
+ ctx->min_sz_region = DAMON_MIN_REGION;
+
INIT_LIST_HEAD(&ctx->adaptive_targets);
INIT_LIST_HEAD(&ctx->schemes);
@@ -545,13 +558,8 @@ static void damon_destroy_targets(struct damon_ctx *ctx)
{
struct damon_target *t, *next_t;
- if (ctx->ops.cleanup) {
- ctx->ops.cleanup(ctx);
- return;
- }
-
damon_for_each_target_safe(t, next_t, ctx)
- damon_destroy_target(t);
+ damon_destroy_target(t, ctx);
}
void damon_destroy_ctx(struct damon_ctx *ctx)
@@ -566,6 +574,23 @@ void damon_destroy_ctx(struct damon_ctx *ctx)
kfree(ctx);
}
+static bool damon_attrs_equals(const struct damon_attrs *attrs1,
+ const struct damon_attrs *attrs2)
+{
+ const struct damon_intervals_goal *ig1 = &attrs1->intervals_goal;
+ const struct damon_intervals_goal *ig2 = &attrs2->intervals_goal;
+
+ return attrs1->sample_interval == attrs2->sample_interval &&
+ attrs1->aggr_interval == attrs2->aggr_interval &&
+ attrs1->ops_update_interval == attrs2->ops_update_interval &&
+ attrs1->min_nr_regions == attrs2->min_nr_regions &&
+ attrs1->max_nr_regions == attrs2->max_nr_regions &&
+ ig1->access_bp == ig2->access_bp &&
+ ig1->aggrs == ig2->aggrs &&
+ ig1->min_sample_us == ig2->min_sample_us &&
+ ig1->max_sample_us == ig2->max_sample_us;
+}
+
static unsigned int damon_age_for_new_attrs(unsigned int age,
struct damon_attrs *old_attrs, struct damon_attrs *new_attrs)
{
@@ -676,9 +701,7 @@ static bool damon_valid_intervals_goal(struct damon_attrs *attrs)
* @attrs: monitoring attributes
*
* This function should be called while the kdamond is not running, an access
- * check results aggregation is not ongoing (e.g., from &struct
- * damon_callback->after_aggregation or &struct
- * damon_callback->after_wmarks_check callbacks), or from damon_call().
+ * check results aggregation is not ongoing (e.g., from damon_call().
*
* Every time interval is in micro-seconds.
*
@@ -754,6 +777,19 @@ static struct damos_quota_goal *damos_nth_quota_goal(
return NULL;
}
+static void damos_commit_quota_goal_union(
+ struct damos_quota_goal *dst, struct damos_quota_goal *src)
+{
+ switch (dst->metric) {
+ case DAMOS_QUOTA_NODE_MEM_USED_BP:
+ case DAMOS_QUOTA_NODE_MEM_FREE_BP:
+ dst->nid = src->nid;
+ break;
+ default:
+ break;
+ }
+}
+
static void damos_commit_quota_goal(
struct damos_quota_goal *dst, struct damos_quota_goal *src)
{
@@ -762,6 +798,7 @@ static void damos_commit_quota_goal(
if (dst->metric == DAMOS_QUOTA_USER_INPUT)
dst->current_value = src->current_value;
/* keep last_psi_total as is, since it will be updated in next cycle */
+ damos_commit_quota_goal_union(dst, src);
}
/**
@@ -774,7 +811,7 @@ static void damos_commit_quota_goal(
* DAMON contexts, instead of manual in-place updates.
*
* This function should be called from parameters-update safe context, like
- * DAMON callbacks.
+ * damon_call().
*/
int damos_commit_quota_goals(struct damos_quota *dst, struct damos_quota *src)
{
@@ -795,6 +832,7 @@ int damos_commit_quota_goals(struct damos_quota *dst, struct damos_quota *src)
src_goal->metric, src_goal->target_value);
if (!new_goal)
return -ENOMEM;
+ damos_commit_quota_goal_union(new_goal, src_goal);
damos_add_quota_goal(dst, new_goal);
}
return 0;
@@ -828,6 +866,18 @@ static struct damos_filter *damos_nth_filter(int n, struct damos *s)
return NULL;
}
+static struct damos_filter *damos_nth_ops_filter(int n, struct damos *s)
+{
+ struct damos_filter *filter;
+ int i = 0;
+
+ damos_for_each_ops_filter(filter, s) {
+ if (i++ == n)
+ return filter;
+ }
+ return NULL;
+}
+
static void damos_commit_filter_arg(
struct damos_filter *dst, struct damos_filter *src)
{
@@ -854,6 +904,7 @@ static void damos_commit_filter(
{
dst->type = src->type;
dst->matching = src->matching;
+ dst->allow = src->allow;
damos_commit_filter_arg(dst, src);
}
@@ -891,7 +942,7 @@ static int damos_commit_ops_filters(struct damos *dst, struct damos *src)
int i = 0, j = 0;
damos_for_each_ops_filter_safe(dst_filter, next, dst) {
- src_filter = damos_nth_filter(i++, src);
+ src_filter = damos_nth_ops_filter(i++, src);
if (src_filter)
damos_commit_filter(dst_filter, src_filter);
else
@@ -939,6 +990,41 @@ static void damos_set_filters_default_reject(struct damos *s)
damos_filters_default_reject(&s->ops_filters);
}
+static int damos_commit_dests(struct damos *dst, struct damos *src)
+{
+ struct damos_migrate_dests *dst_dests, *src_dests;
+
+ dst_dests = &dst->migrate_dests;
+ src_dests = &src->migrate_dests;
+
+ if (dst_dests->nr_dests != src_dests->nr_dests) {
+ kfree(dst_dests->node_id_arr);
+ kfree(dst_dests->weight_arr);
+
+ dst_dests->node_id_arr = kmalloc_array(src_dests->nr_dests,
+ sizeof(*dst_dests->node_id_arr), GFP_KERNEL);
+ if (!dst_dests->node_id_arr) {
+ dst_dests->weight_arr = NULL;
+ return -ENOMEM;
+ }
+
+ dst_dests->weight_arr = kmalloc_array(src_dests->nr_dests,
+ sizeof(*dst_dests->weight_arr), GFP_KERNEL);
+ if (!dst_dests->weight_arr) {
+ /* ->node_id_arr will be freed by scheme destruction */
+ return -ENOMEM;
+ }
+ }
+
+ dst_dests->nr_dests = src_dests->nr_dests;
+ for (int i = 0; i < src_dests->nr_dests; i++) {
+ dst_dests->node_id_arr[i] = src_dests->node_id_arr[i];
+ dst_dests->weight_arr[i] = src_dests->weight_arr[i];
+ }
+
+ return 0;
+}
+
static int damos_commit_filters(struct damos *dst, struct damos *src)
{
int err;
@@ -978,6 +1064,11 @@ static int damos_commit(struct damos *dst, struct damos *src)
return err;
dst->wmarks = src->wmarks;
+ dst->target_nid = src->target_nid;
+
+ err = damos_commit_dests(dst, src);
+ if (err)
+ return err;
err = damos_commit_filters(dst, src);
return err;
@@ -1038,8 +1129,8 @@ static struct damon_target *damon_nth_target(int n, struct damon_ctx *ctx)
*
* If @src has no region, @dst keeps current regions.
*/
-static int damon_commit_target_regions(
- struct damon_target *dst, struct damon_target *src)
+static int damon_commit_target_regions(struct damon_target *dst,
+ struct damon_target *src, unsigned long src_min_sz_region)
{
struct damon_region *src_region;
struct damon_addr_range *ranges;
@@ -1056,18 +1147,19 @@ static int damon_commit_target_regions(
i = 0;
damon_for_each_region(src_region, src)
ranges[i++] = src_region->ar;
- err = damon_set_regions(dst, ranges, i);
+ err = damon_set_regions(dst, ranges, i, src_min_sz_region);
kfree(ranges);
return err;
}
static int damon_commit_target(
struct damon_target *dst, bool dst_has_pid,
- struct damon_target *src, bool src_has_pid)
+ struct damon_target *src, bool src_has_pid,
+ unsigned long src_min_sz_region)
{
int err;
- err = damon_commit_target_regions(dst, src);
+ err = damon_commit_target_regions(dst, src, src_min_sz_region);
if (err)
return err;
if (dst_has_pid)
@@ -1089,13 +1181,20 @@ static int damon_commit_targets(
if (src_target) {
err = damon_commit_target(
dst_target, damon_target_has_pid(dst),
- src_target, damon_target_has_pid(src));
+ src_target, damon_target_has_pid(src),
+ src->min_sz_region);
if (err)
return err;
} else {
- if (damon_target_has_pid(dst))
- put_pid(dst_target->pid);
- damon_destroy_target(dst_target);
+ struct damos *s;
+
+ damon_destroy_target(dst_target, dst);
+ damon_for_each_scheme(s, dst) {
+ if (s->quota.charge_target_from == dst_target) {
+ s->quota.charge_target_from = NULL;
+ s->quota.charge_addr_from = 0;
+ }
+ }
}
}
@@ -1106,9 +1205,10 @@ static int damon_commit_targets(
if (!new_target)
return -ENOMEM;
err = damon_commit_target(new_target, false,
- src_target, damon_target_has_pid(src));
+ src_target, damon_target_has_pid(src),
+ src->min_sz_region);
if (err) {
- damon_destroy_target(new_target);
+ damon_destroy_target(new_target, NULL);
return err;
}
damon_add_target(dst, new_target);
@@ -1127,7 +1227,7 @@ static int damon_commit_targets(
* in-place updates.
*
* This function should be called from parameters-update safe context, like
- * DAMON callbacks.
+ * damon_call().
*/
int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src)
{
@@ -1146,10 +1246,14 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src)
* 2. ops update should be done after pid handling is done (target
* committing require putting pids).
*/
- err = damon_set_attrs(dst, &src->attrs);
- if (err)
- return err;
+ if (!damon_attrs_equals(&dst->attrs, &src->attrs)) {
+ err = damon_set_attrs(dst, &src->attrs);
+ if (err)
+ return err;
+ }
dst->ops = src->ops;
+ dst->addr_unit = src->addr_unit;
+ dst->min_sz_region = src->min_sz_region;
return 0;
}
@@ -1182,8 +1286,8 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx)
if (ctx->attrs.min_nr_regions)
sz /= ctx->attrs.min_nr_regions;
- if (sz < DAMON_MIN_REGION)
- sz = DAMON_MIN_REGION;
+ if (sz < ctx->min_sz_region)
+ sz = ctx->min_sz_region;
return sz;
}
@@ -1303,7 +1407,13 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs)
return err;
}
-static bool damon_is_running(struct damon_ctx *ctx)
+/**
+ * damon_is_running() - Returns if a given DAMON context is running.
+ * @ctx: The DAMON context to see if running.
+ *
+ * Return: true if @ctx is running, false otherwise.
+ */
+bool damon_is_running(struct damon_ctx *ctx)
{
bool running;
@@ -1320,8 +1430,9 @@ static bool damon_is_running(struct damon_ctx *ctx)
*
* Ask DAMON worker thread (kdamond) of @ctx to call a function with an
* argument data that respectively passed via &damon_call_control->fn and
- * &damon_call_control->data of @control, and wait until the kdamond finishes
- * handling of the request.
+ * &damon_call_control->data of @control. If &damon_call_control->repeat of
+ * @control is set, further wait until the kdamond finishes handling of the
+ * request. Otherwise, return as soon as the request is made.
*
* The kdamond executes the function with the argument in the main loop, just
* after a sampling of the iteration is finished. The function can hence
@@ -1333,18 +1444,18 @@ static bool damon_is_running(struct damon_ctx *ctx)
*/
int damon_call(struct damon_ctx *ctx, struct damon_call_control *control)
{
- init_completion(&control->completion);
+ if (!control->repeat)
+ init_completion(&control->completion);
control->canceled = false;
+ INIT_LIST_HEAD(&control->list);
- mutex_lock(&ctx->call_control_lock);
- if (ctx->call_control) {
- mutex_unlock(&ctx->call_control_lock);
- return -EBUSY;
- }
- ctx->call_control = control;
- mutex_unlock(&ctx->call_control_lock);
+ mutex_lock(&ctx->call_controls_lock);
+ list_add_tail(&ctx->call_controls, &control->list);
+ mutex_unlock(&ctx->call_controls_lock);
if (!damon_is_running(ctx))
return -EINVAL;
+ if (control->repeat)
+ return 0;
wait_for_completion(&control->completion);
if (control->canceled)
return -ECANCELED;
@@ -1392,6 +1503,19 @@ int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control)
}
/*
+ * Warn and fix corrupted ->nr_accesses[_bp] for investigations and preventing
+ * the problem being propagated.
+ */
+static void damon_warn_fix_nr_accesses_corruption(struct damon_region *r)
+{
+ if (r->nr_accesses_bp == r->nr_accesses * 10000)
+ return;
+ WARN_ONCE(true, "invalid nr_accesses_bp at reset: %u %u\n",
+ r->nr_accesses_bp, r->nr_accesses);
+ r->nr_accesses_bp = r->nr_accesses * 10000;
+}
+
+/*
* Reset the aggregated monitoring results ('nr_accesses' of each region).
*/
static void kdamond_reset_aggregated(struct damon_ctx *c)
@@ -1404,6 +1528,7 @@ static void kdamond_reset_aggregated(struct damon_ctx *c)
damon_for_each_region(r, t) {
trace_damon_aggregated(ti, r, damon_nr_regions(t));
+ damon_warn_fix_nr_accesses_corruption(r);
r->last_nr_accesses = r->nr_accesses;
r->nr_accesses = 0;
}
@@ -1427,6 +1552,7 @@ static unsigned long damon_get_intervals_score(struct damon_ctx *c)
}
}
target_access_events = max_access_events * goal_bp / 10000;
+ target_access_events = target_access_events ? : 1;
return access_events * 10000 / target_access_events;
}
@@ -1467,6 +1593,7 @@ static void kdamond_tune_intervals(struct damon_ctx *c)
new_attrs.sample_interval);
new_attrs.aggr_interval = new_attrs.sample_interval *
c->attrs.aggr_samples;
+ trace_damon_monitor_intervals_tune(new_attrs.sample_interval);
damon_set_attrs(c, &new_attrs);
}
@@ -1504,6 +1631,7 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t,
* @t: The target of the region.
* @rp: The pointer to the region.
* @s: The scheme to be applied.
+ * @min_sz_region: minimum region size.
*
* If a quota of a scheme has exceeded in a quota charge window, the scheme's
* action would applied to only a part of the target access pattern fulfilling
@@ -1521,7 +1649,7 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t,
* Return: true if the region should be entirely skipped, false otherwise.
*/
static bool damos_skip_charged_region(struct damon_target *t,
- struct damon_region **rp, struct damos *s)
+ struct damon_region **rp, struct damos *s, unsigned long min_sz_region)
{
struct damon_region *r = *rp;
struct damos_quota *quota = &s->quota;
@@ -1543,11 +1671,11 @@ static bool damos_skip_charged_region(struct damon_target *t,
if (quota->charge_addr_from && r->ar.start <
quota->charge_addr_from) {
sz_to_skip = ALIGN_DOWN(quota->charge_addr_from -
- r->ar.start, DAMON_MIN_REGION);
+ r->ar.start, min_sz_region);
if (!sz_to_skip) {
- if (damon_sz_region(r) <= DAMON_MIN_REGION)
+ if (damon_sz_region(r) <= min_sz_region)
return true;
- sz_to_skip = DAMON_MIN_REGION;
+ sz_to_skip = min_sz_region;
}
damon_split_region_at(t, r, sz_to_skip);
r = damon_next_region(r);
@@ -1572,7 +1700,8 @@ static void damos_update_stat(struct damos *s,
}
static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t,
- struct damon_region *r, struct damos_filter *filter)
+ struct damon_region *r, struct damos_filter *filter,
+ unsigned long min_sz_region)
{
bool matched = false;
struct damon_target *ti;
@@ -1589,8 +1718,8 @@ static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t,
matched = target_idx == filter->target_idx;
break;
case DAMOS_FILTER_TYPE_ADDR:
- start = ALIGN_DOWN(filter->addr_range.start, DAMON_MIN_REGION);
- end = ALIGN_DOWN(filter->addr_range.end, DAMON_MIN_REGION);
+ start = ALIGN_DOWN(filter->addr_range.start, min_sz_region);
+ end = ALIGN_DOWN(filter->addr_range.end, min_sz_region);
/* inside the range */
if (start <= r->ar.start && r->ar.end <= end) {
@@ -1626,7 +1755,7 @@ static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t,
s->core_filters_allowed = false;
damos_for_each_filter(filter, s) {
- if (damos_filter_match(ctx, t, r, filter)) {
+ if (damos_filter_match(ctx, t, r, filter, ctx->min_sz_region)) {
if (filter->allow)
s->core_filters_allowed = true;
return !filter->allow;
@@ -1761,7 +1890,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
if (c->ops.apply_scheme) {
if (quota->esz && quota->charged_sz + sz > quota->esz) {
sz = ALIGN_DOWN(quota->esz - quota->charged_sz,
- DAMON_MIN_REGION);
+ c->min_sz_region);
if (!sz)
goto update_stat;
damon_split_region_at(t, r, sz);
@@ -1809,7 +1938,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
if (quota->esz && quota->charged_sz >= quota->esz)
continue;
- if (damos_skip_charged_region(t, &r, s))
+ if (damos_skip_charged_region(t, &r, s, c->min_sz_region))
continue;
if (!damos_valid_target(c, t, r, s))
@@ -1889,6 +2018,29 @@ static inline u64 damos_get_some_mem_psi_total(void)
#endif /* CONFIG_PSI */
+#ifdef CONFIG_NUMA
+static __kernel_ulong_t damos_get_node_mem_bp(
+ struct damos_quota_goal *goal)
+{
+ struct sysinfo i;
+ __kernel_ulong_t numerator;
+
+ si_meminfo_node(&i, goal->nid);
+ if (goal->metric == DAMOS_QUOTA_NODE_MEM_USED_BP)
+ numerator = i.totalram - i.freeram;
+ else /* DAMOS_QUOTA_NODE_MEM_FREE_BP */
+ numerator = i.freeram;
+ return numerator * 10000 / i.totalram;
+}
+#else
+static __kernel_ulong_t damos_get_node_mem_bp(
+ struct damos_quota_goal *goal)
+{
+ return 0;
+}
+#endif
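For reference, a worked example of the basis-point value returned above, using hypothetical sysinfo numbers:

/*
 * Hypothetical node: i.totalram == 4194304 pages (16 GiB with 4 KiB pages),
 * i.freeram == 1048576 pages (4 GiB).
 *   DAMOS_QUOTA_NODE_MEM_USED_BP:
 *     (4194304 - 1048576) * 10000 / 4194304 == 7500   (75.00% used)
 *   DAMOS_QUOTA_NODE_MEM_FREE_BP:
 *     1048576 * 10000 / 4194304 == 2500               (25.00% free)
 */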
+
static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal)
{
u64 now_psi_total;
@@ -1902,6 +2054,10 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal)
goal->current_value = now_psi_total - goal->last_psi_total;
goal->last_psi_total = now_psi_total;
break;
+ case DAMOS_QUOTA_NODE_MEM_USED_BP:
+ case DAMOS_QUOTA_NODE_MEM_FREE_BP:
+ goal->current_value = damos_get_node_mem_bp(goal);
+ break;
default:
break;
}
@@ -1947,8 +2103,8 @@ static void damos_set_effective_quota(struct damos_quota *quota)
if (quota->ms) {
if (quota->total_charged_ns)
- throughput = quota->total_charged_sz * 1000000 /
- quota->total_charged_ns;
+ throughput = mult_frac(quota->total_charged_sz, 1000000,
+ quota->total_charged_ns);
else
throughput = PAGE_SIZE * 1024;
esz = min(throughput * quota->ms, esz);
@@ -1960,17 +2116,37 @@ static void damos_set_effective_quota(struct damos_quota *quota)
quota->esz = esz;
}
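The switch to mult_frac() above keeps the intermediate product from overflowing; a short note on what it expands to and why that matters:

/*
 * mult_frac(x, n, d) evaluates as (x / d) * n + ((x % d) * n) / d, so the
 * factor of 1000000 is never multiplied into the full total_charged_sz.
 * With a 32-bit unsigned long, the old "total_charged_sz * 1000000 /
 * total_charged_ns" form overflows once total_charged_sz exceeds roughly
 * 4294 bytes; the mult_frac() form still yields the throughput in bytes
 * per millisecond.
 */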
+static void damos_trace_esz(struct damon_ctx *c, struct damos *s,
+ struct damos_quota *quota)
+{
+ unsigned int cidx = 0, sidx = 0;
+ struct damos *siter;
+
+ damon_for_each_scheme(siter, c) {
+ if (siter == s)
+ break;
+ sidx++;
+ }
+ trace_damos_esz(cidx, sidx, quota->esz);
+}
+
static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
{
struct damos_quota *quota = &s->quota;
struct damon_target *t;
struct damon_region *r;
- unsigned long cumulated_sz;
+ unsigned long cumulated_sz, cached_esz;
unsigned int score, max_score = 0;
if (!quota->ms && !quota->sz && list_empty(&quota->goals))
return;
+ /* First charge window */
+ if (!quota->total_charged_sz && !quota->charged_from) {
+ quota->charged_from = jiffies;
+ damos_set_effective_quota(quota);
+ }
+
/* New charge window starts */
if (time_after_eq(jiffies, quota->charged_from +
msecs_to_jiffies(quota->reset_interval))) {
@@ -1979,7 +2155,11 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
quota->total_charged_sz += quota->charged_sz;
quota->charged_from = jiffies;
quota->charged_sz = 0;
+ if (trace_damos_esz_enabled())
+ cached_esz = quota->esz;
damos_set_effective_quota(quota);
+ if (trace_damos_esz_enabled() && quota->esz != cached_esz)
+ damos_trace_esz(c, s, quota);
}
if (!c->ops.get_scheme_score)
@@ -2083,6 +2263,8 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres,
damon_for_each_region_safe(r, next, t) {
if (abs(r->nr_accesses - r->last_nr_accesses) > thres)
r->age = 0;
+ else if ((r->nr_accesses == 0) != (r->last_nr_accesses == 0))
+ r->age = 0;
else
r->age++;
@@ -2158,7 +2340,8 @@ static void damon_split_region_at(struct damon_target *t,
}
/* Split every region in the given target into 'nr_subs' regions */
-static void damon_split_regions_of(struct damon_target *t, int nr_subs)
+static void damon_split_regions_of(struct damon_target *t, int nr_subs,
+ unsigned long min_sz_region)
{
struct damon_region *r, *next;
unsigned long sz_region, sz_sub = 0;
@@ -2168,13 +2351,13 @@ static void damon_split_regions_of(struct damon_target *t, int nr_subs)
sz_region = damon_sz_region(r);
for (i = 0; i < nr_subs - 1 &&
- sz_region > 2 * DAMON_MIN_REGION; i++) {
+ sz_region > 2 * min_sz_region; i++) {
/*
* Randomly select size of left sub-region to be at
* least 10 percent and at most 90% of original region
*/
sz_sub = ALIGN_DOWN(damon_rand(1, 10) *
- sz_region / 10, DAMON_MIN_REGION);
+ sz_region / 10, min_sz_region);
/* Do not allow blank region */
if (sz_sub == 0 || sz_sub >= sz_region)
continue;
@@ -2214,7 +2397,7 @@ static void kdamond_split_regions(struct damon_ctx *ctx)
nr_subregions = 3;
damon_for_each_target(t, ctx)
- damon_split_regions_of(t, nr_subregions);
+ damon_split_regions_of(t, nr_subregions, ctx->min_sz_region);
last_nr_regions = nr_regions;
}
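A worked example of the left sub-region size chosen in damon_split_regions_of() above, with a hypothetical random draw:

/*
 * sz_region == 40 MiB, min_sz_region == 4096, and a hypothetical
 * damon_rand(1, 10) result of 7:
 *   sz_sub = ALIGN_DOWN(7 * 40 MiB / 10, 4096) == 28 MiB
 * so the region is split into a 28 MiB left part and a 12 MiB right part,
 * and the loop keeps splitting the remainder while it is still larger
 * than 2 * min_sz_region.
 */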
@@ -2300,36 +2483,53 @@ static void kdamond_usleep(unsigned long usecs)
}
/*
- * kdamond_call() - handle damon_call_control.
+ * kdamond_call() - handle damon_call_control objects.
* @ctx: The &struct damon_ctx of the kdamond.
* @cancel: Whether to cancel the invocation of the function.
*
- * If there is a &struct damon_call_control request that registered via
+ * If there are &struct damon_call_control requests that were registered via
* &damon_call() on @ctx, do or cancel the invocation of the function depending
- * on @cancel. @cancel is set when the kdamond is deactivated by DAMOS
- * watermarks, or the kdamond is already out of the main loop and therefore
- * will be terminated.
+ * on @cancel. @cancel is set when the kdamond is already out of the main loop
+ * and therefore will be terminated.
*/
static void kdamond_call(struct damon_ctx *ctx, bool cancel)
{
struct damon_call_control *control;
+ LIST_HEAD(repeat_controls);
int ret = 0;
- mutex_lock(&ctx->call_control_lock);
- control = ctx->call_control;
- mutex_unlock(&ctx->call_control_lock);
- if (!control)
- return;
- if (cancel) {
- control->canceled = true;
- } else {
- ret = control->fn(control->data);
- control->return_code = ret;
+ while (true) {
+ mutex_lock(&ctx->call_controls_lock);
+ control = list_first_entry_or_null(&ctx->call_controls,
+ struct damon_call_control, list);
+ mutex_unlock(&ctx->call_controls_lock);
+ if (!control)
+ break;
+ if (cancel) {
+ control->canceled = true;
+ } else {
+ ret = control->fn(control->data);
+ control->return_code = ret;
+ }
+ mutex_lock(&ctx->call_controls_lock);
+ list_del(&control->list);
+ mutex_unlock(&ctx->call_controls_lock);
+ if (!control->repeat) {
+ complete(&control->completion);
+ } else if (control->canceled && control->dealloc_on_cancel) {
+ kfree(control);
+ continue;
+ } else {
+ list_add(&control->list, &repeat_controls);
+ }
}
- complete(&control->completion);
- mutex_lock(&ctx->call_control_lock);
- ctx->call_control = NULL;
- mutex_unlock(&ctx->call_control_lock);
+ control = list_first_entry_or_null(&repeat_controls,
+ struct damon_call_control, list);
+ if (!control || cancel)
+ return;
+ mutex_lock(&ctx->call_controls_lock);
+ list_add_tail(&control->list, &ctx->call_controls);
+ mutex_unlock(&ctx->call_controls_lock);
}
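For reference, a minimal caller-side sketch of how a repeating request reaches the loop above; it mirrors the module usage later in this patch (names are illustrative, error handling and teardown are elided):

static int my_call_fn(void *data)
{
	struct damon_ctx *ctx = data;

	/* runs from the kdamond main loop, once per iteration */
	return 0;
}

static struct damon_call_control my_control = {
	.fn = my_call_fn,
	.repeat = true,	/* re-queued by kdamond_call() after each run */
};

	/* after damon_start() succeeded: */
	my_control.data = ctx;
	return damon_call(ctx, &my_control);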
/* Returns negative error code if it's not activated but should return */
@@ -2353,10 +2553,7 @@ static int kdamond_wait_activation(struct damon_ctx *ctx)
kdamond_usleep(min_wait_time);
- if (ctx->callback.after_wmarks_check &&
- ctx->callback.after_wmarks_check(ctx))
- break;
- kdamond_call(ctx, true);
+ kdamond_call(ctx, false);
damos_walk_cancel(ctx);
}
return -EBUSY;
@@ -2412,10 +2609,9 @@ static int kdamond_fn(void *data)
while (!kdamond_need_stop(ctx)) {
/*
* ctx->attrs and ctx->next_{aggregation,ops_update}_sis could
- * be changed from after_wmarks_check() or after_aggregation()
- * callbacks. Read the values here, and use those for this
- * iteration. That is, damon_set_attrs() updated new values
- * are respected from next iteration.
+ * be changed from kdamond_call(). Read the values here, and
+ * use those for this iteration. That is, damon_set_attrs()
+ * updated new values are respected from next iteration.
*/
unsigned long next_aggregation_sis = ctx->next_aggregation_sis;
unsigned long next_ops_update_sis = ctx->next_ops_update_sis;
@@ -2433,14 +2629,10 @@ static int kdamond_fn(void *data)
if (ctx->ops.check_accesses)
max_nr_accesses = ctx->ops.check_accesses(ctx);
- if (ctx->passed_sample_intervals >= next_aggregation_sis) {
+ if (ctx->passed_sample_intervals >= next_aggregation_sis)
kdamond_merge_regions(ctx,
max_nr_accesses / 10,
sz_limit);
- if (ctx->callback.after_aggregation &&
- ctx->callback.after_aggregation(ctx))
- break;
- }
/*
* do kdamond_call() and kdamond_apply_schemes() after
@@ -2506,8 +2698,6 @@ done:
damon_destroy_region(r, t);
}
- if (ctx->callback.before_terminate)
- ctx->callback.before_terminate(ctx);
if (ctx->ops.cleanup)
ctx->ops.cleanup(ctx);
kfree(ctx->regions_score_histogram);
@@ -2526,6 +2716,7 @@ done:
running_exclusive_ctxs = false;
mutex_unlock(&damon_lock);
+ damon_destroy_targets(ctx);
return 0;
}
@@ -2599,7 +2790,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t,
addr_range.start = *start;
addr_range.end = *end;
- return damon_set_regions(t, &addr_range, 1);
+ return damon_set_regions(t, &addr_range, 1, DAMON_MIN_REGION);
}
/*
@@ -2672,6 +2863,16 @@ void damon_update_region_access_rate(struct damon_region *r, bool accessed,
r->nr_accesses++;
}
+/**
+ * damon_initialized() - Return if DAMON is ready to be used.
+ *
+ * Return: true if DAMON is ready to be used, false otherwise.
+ */
+bool damon_initialized(void)
+{
+ return damon_region_cache != NULL;
+}
+
static int __init damon_init(void)
{
damon_region_cache = KMEM_CACHE(damon_region, 0);
diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c
index 4af8fd4a390b..42b9a656f9de 100644
--- a/mm/damon/lru_sort.c
+++ b/mm/damon/lru_sort.c
@@ -112,6 +112,13 @@ static unsigned long monitor_region_end __read_mostly;
module_param(monitor_region_end, ulong, 0600);
/*
+ * Scale factor for DAMON_LRU_SORT to ops address conversion.
+ *
+ * This parameter must not be set to 0.
+ */
+static unsigned long addr_unit __read_mostly = 1;
+
+/*
* PID of the DAMON thread
*
* If DAMON_LRU_SORT is enabled, this becomes the PID of the worker thread.
@@ -198,7 +205,21 @@ static int damon_lru_sort_apply_parameters(void)
if (err)
return err;
- err = damon_set_attrs(ctx, &damon_lru_sort_mon_attrs);
+ /*
+ * If monitor_region_start/end are unset, always silently
+ * reset addr_unit to 1.
+ */
+ if (!monitor_region_start && !monitor_region_end)
+ addr_unit = 1;
+ param_ctx->addr_unit = addr_unit;
+ param_ctx->min_sz_region = max(DAMON_MIN_REGION / addr_unit, 1);
+
+ if (!damon_lru_sort_mon_attrs.sample_interval) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = damon_set_attrs(param_ctx, &damon_lru_sort_mon_attrs);
if (err)
goto out;
@@ -230,6 +251,39 @@ out:
return err;
}
+static int damon_lru_sort_handle_commit_inputs(void)
+{
+ int err;
+
+ if (!commit_inputs)
+ return 0;
+
+ err = damon_lru_sort_apply_parameters();
+ commit_inputs = false;
+ return err;
+}
+
+static int damon_lru_sort_damon_call_fn(void *arg)
+{
+ struct damon_ctx *c = arg;
+ struct damos *s;
+
+ /* update the stats parameter */
+ damon_for_each_scheme(s, c) {
+ if (s->action == DAMOS_LRU_PRIO)
+ damon_lru_sort_hot_stat = s->stat;
+ else if (s->action == DAMOS_LRU_DEPRIO)
+ damon_lru_sort_cold_stat = s->stat;
+ }
+
+ return damon_lru_sort_handle_commit_inputs();
+}
+
+static struct damon_call_control call_control = {
+ .fn = damon_lru_sort_damon_call_fn,
+ .repeat = true,
+};
+
static int damon_lru_sort_turn(bool on)
{
int err;
@@ -249,9 +303,33 @@ static int damon_lru_sort_turn(bool on)
if (err)
return err;
kdamond_pid = ctx->kdamond->pid;
+ return damon_call(ctx, &call_control);
+}
+
+static int damon_lru_sort_addr_unit_store(const char *val,
+ const struct kernel_param *kp)
+{
+ unsigned long input_addr_unit;
+ int err = kstrtoul(val, 0, &input_addr_unit);
+
+ if (err)
+ return err;
+ if (!input_addr_unit)
+ return -EINVAL;
+
+ addr_unit = input_addr_unit;
return 0;
}
+static const struct kernel_param_ops addr_unit_param_ops = {
+ .set = damon_lru_sort_addr_unit_store,
+ .get = param_get_ulong,
+};
+
+module_param_cb(addr_unit, &addr_unit_param_ops, &addr_unit, 0600);
+MODULE_PARM_DESC(addr_unit,
+ "Scale factor for DAMON_LRU_SORT to ops address conversion (default: 1)");
+
static int damon_lru_sort_enabled_store(const char *val,
const struct kernel_param *kp)
{
@@ -267,7 +345,7 @@ static int damon_lru_sort_enabled_store(const char *val,
return 0;
/* Called before init function. The function will handle this. */
- if (!ctx)
+ if (!damon_initialized())
goto set_param_out;
err = damon_lru_sort_turn(enable);
@@ -288,52 +366,27 @@ module_param_cb(enabled, &enabled_param_ops, &enabled, 0600);
MODULE_PARM_DESC(enabled,
"Enable or disable DAMON_LRU_SORT (default: disabled)");
-static int damon_lru_sort_handle_commit_inputs(void)
+static int __init damon_lru_sort_init(void)
{
int err;
- if (!commit_inputs)
- return 0;
-
- err = damon_lru_sort_apply_parameters();
- commit_inputs = false;
- return err;
-}
-
-static int damon_lru_sort_after_aggregation(struct damon_ctx *c)
-{
- struct damos *s;
-
- /* update the stats parameter */
- damon_for_each_scheme(s, c) {
- if (s->action == DAMOS_LRU_PRIO)
- damon_lru_sort_hot_stat = s->stat;
- else if (s->action == DAMOS_LRU_DEPRIO)
- damon_lru_sort_cold_stat = s->stat;
+ if (!damon_initialized()) {
+ err = -ENOMEM;
+ goto out;
}
-
- return damon_lru_sort_handle_commit_inputs();
-}
-
-static int damon_lru_sort_after_wmarks_check(struct damon_ctx *c)
-{
- return damon_lru_sort_handle_commit_inputs();
-}
-
-static int __init damon_lru_sort_init(void)
-{
- int err = damon_modules_new_paddr_ctx_target(&ctx, &target);
-
+ err = damon_modules_new_paddr_ctx_target(&ctx, &target);
if (err)
- return err;
+ goto out;
- ctx->callback.after_wmarks_check = damon_lru_sort_after_wmarks_check;
- ctx->callback.after_aggregation = damon_lru_sort_after_aggregation;
+ call_control.data = ctx;
/* 'enabled' has been set before this function, probably via the command line */
if (enabled)
err = damon_lru_sort_turn(true);
+out:
+ if (err && enabled)
+ enabled = false;
return err;
}
diff --git a/mm/damon/modules-common.c b/mm/damon/modules-common.c
index 7cf96574cde7..86d58f8c4f63 100644
--- a/mm/damon/modules-common.c
+++ b/mm/damon/modules-common.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Common Primitives for DAMON Modules
+ * Common Code for DAMON Modules
*
* Author: SeongJae Park <sj@kernel.org>
*/
diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h
index f49cdb417005..f103ad556368 100644
--- a/mm/damon/modules-common.h
+++ b/mm/damon/modules-common.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
- * Common Primitives for DAMON Modules
+ * Common Code for DAMON Modules
*
* Author: SeongJae Park <sj@kernel.org>
*/
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index 0db1fc70c84d..998c5180a603 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -1,10 +1,11 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Common Primitives for Data Access Monitoring
+ * Common Code for Data Access Monitoring
*
* Author: SeongJae Park <sj@kernel.org>
*/
+#include <linux/migrate.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
#include <linux/pagemap.h>
@@ -12,6 +13,7 @@
#include <linux/swap.h>
#include <linux/swapops.h>
+#include "../internal.h"
#include "ops-common.h"
/*
@@ -138,3 +140,284 @@ int damon_cold_score(struct damon_ctx *c, struct damon_region *r,
/* Return coldness of the region */
return DAMOS_MAX_SCORE - hotness;
}
+
+static bool damon_folio_mkold_one(struct folio *folio,
+ struct vm_area_struct *vma, unsigned long addr, void *arg)
+{
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
+
+ while (page_vma_mapped_walk(&pvmw)) {
+ addr = pvmw.address;
+ if (pvmw.pte)
+ damon_ptep_mkold(pvmw.pte, vma, addr);
+ else
+ damon_pmdp_mkold(pvmw.pmd, vma, addr);
+ }
+ return true;
+}
+
+void damon_folio_mkold(struct folio *folio)
+{
+ struct rmap_walk_control rwc = {
+ .rmap_one = damon_folio_mkold_one,
+ .anon_lock = folio_lock_anon_vma_read,
+ };
+ bool need_lock;
+
+ if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
+ folio_set_idle(folio);
+ return;
+ }
+
+ need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
+ if (need_lock && !folio_trylock(folio))
+ return;
+
+ rmap_walk(folio, &rwc);
+
+ if (need_lock)
+ folio_unlock(folio);
+
+}
+
+static bool damon_folio_young_one(struct folio *folio,
+ struct vm_area_struct *vma, unsigned long addr, void *arg)
+{
+ bool *accessed = arg;
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
+ pte_t pte;
+
+ *accessed = false;
+ while (page_vma_mapped_walk(&pvmw)) {
+ addr = pvmw.address;
+ if (pvmw.pte) {
+ pte = ptep_get(pvmw.pte);
+
+ /*
+ * PFN swap PTEs, such as device-exclusive ones, that
+ * actually map pages are "old" from a CPU perspective.
+ * The MMU notifier takes care of any device aspects.
+ */
+ *accessed = (pte_present(pte) && pte_young(pte)) ||
+ !folio_test_idle(folio) ||
+ mmu_notifier_test_young(vma->vm_mm, addr);
+ } else {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ *accessed = pmd_young(pmdp_get(pvmw.pmd)) ||
+ !folio_test_idle(folio) ||
+ mmu_notifier_test_young(vma->vm_mm, addr);
+#else
+ WARN_ON_ONCE(1);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ }
+ if (*accessed) {
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ }
+
+ /* If accessed, stop walking */
+ return *accessed == false;
+}
+
+bool damon_folio_young(struct folio *folio)
+{
+ bool accessed = false;
+ struct rmap_walk_control rwc = {
+ .arg = &accessed,
+ .rmap_one = damon_folio_young_one,
+ .anon_lock = folio_lock_anon_vma_read,
+ };
+ bool need_lock;
+
+ if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
+ if (folio_test_idle(folio))
+ return false;
+ else
+ return true;
+ }
+
+ need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
+ if (need_lock && !folio_trylock(folio))
+ return false;
+
+ rmap_walk(folio, &rwc);
+
+ if (need_lock)
+ folio_unlock(folio);
+
+ return accessed;
+}
+
+bool damos_folio_filter_match(struct damos_filter *filter, struct folio *folio)
+{
+ bool matched = false;
+ struct mem_cgroup *memcg;
+ size_t folio_sz;
+
+ switch (filter->type) {
+ case DAMOS_FILTER_TYPE_ANON:
+ matched = folio_test_anon(folio);
+ break;
+ case DAMOS_FILTER_TYPE_ACTIVE:
+ matched = folio_test_active(folio);
+ break;
+ case DAMOS_FILTER_TYPE_MEMCG:
+ rcu_read_lock();
+ memcg = folio_memcg_check(folio);
+ if (!memcg)
+ matched = false;
+ else
+ matched = filter->memcg_id == mem_cgroup_id(memcg);
+ rcu_read_unlock();
+ break;
+ case DAMOS_FILTER_TYPE_YOUNG:
+ matched = damon_folio_young(folio);
+ if (matched)
+ damon_folio_mkold(folio);
+ break;
+ case DAMOS_FILTER_TYPE_HUGEPAGE_SIZE:
+ folio_sz = folio_size(folio);
+ matched = filter->sz_range.min <= folio_sz &&
+ folio_sz <= filter->sz_range.max;
+ break;
+ case DAMOS_FILTER_TYPE_UNMAPPED:
+ matched = !folio_mapped(folio) || !folio_raw_mapping(folio);
+ break;
+ default:
+ break;
+ }
+
+ return matched == filter->matching;
+}
+
+static unsigned int __damon_migrate_folio_list(
+ struct list_head *migrate_folios, struct pglist_data *pgdat,
+ int target_nid)
+{
+ unsigned int nr_succeeded = 0;
+ struct migration_target_control mtc = {
+ /*
+ * Allocate from 'node', or fail quickly and quietly.
+ * When this happens, 'page' will likely just be discarded
+ * instead of migrated.
+ */
+ .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
+ __GFP_NOMEMALLOC | GFP_NOWAIT,
+ .nid = target_nid,
+ };
+
+ if (pgdat->node_id == target_nid || target_nid == NUMA_NO_NODE)
+ return 0;
+
+ if (list_empty(migrate_folios))
+ return 0;
+
+ /* Migration ignores all cpuset and mempolicy settings */
+ migrate_pages(migrate_folios, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_ASYNC, MR_DAMON,
+ &nr_succeeded);
+
+ return nr_succeeded;
+}
+
+static unsigned int damon_migrate_folio_list(struct list_head *folio_list,
+ struct pglist_data *pgdat,
+ int target_nid)
+{
+ unsigned int nr_migrated = 0;
+ struct folio *folio;
+ LIST_HEAD(ret_folios);
+ LIST_HEAD(migrate_folios);
+
+ while (!list_empty(folio_list)) {
+ struct folio *folio;
+
+ cond_resched();
+
+ folio = lru_to_folio(folio_list);
+ list_del(&folio->lru);
+
+ if (!folio_trylock(folio))
+ goto keep;
+
+ /* Relocate its contents to another node. */
+ list_add(&folio->lru, &migrate_folios);
+ folio_unlock(folio);
+ continue;
+keep:
+ list_add(&folio->lru, &ret_folios);
+ }
+ /* 'folio_list' is always empty here */
+
+ /* Migrate folios selected for migration */
+ nr_migrated += __damon_migrate_folio_list(
+ &migrate_folios, pgdat, target_nid);
+ /*
+ * Folios that could not be migrated are still in @migrate_folios. Add
+ * those back on @folio_list
+ */
+ if (!list_empty(&migrate_folios))
+ list_splice_init(&migrate_folios, folio_list);
+
+ try_to_unmap_flush();
+
+ list_splice(&ret_folios, folio_list);
+
+ while (!list_empty(folio_list)) {
+ folio = lru_to_folio(folio_list);
+ list_del(&folio->lru);
+ folio_putback_lru(folio);
+ }
+
+ return nr_migrated;
+}
+
+unsigned long damon_migrate_pages(struct list_head *folio_list, int target_nid)
+{
+ int nid;
+ unsigned long nr_migrated = 0;
+ LIST_HEAD(node_folio_list);
+ unsigned int noreclaim_flag;
+
+ if (list_empty(folio_list))
+ return nr_migrated;
+
+ if (target_nid < 0 || target_nid >= MAX_NUMNODES ||
+ !node_state(target_nid, N_MEMORY))
+ return nr_migrated;
+
+ noreclaim_flag = memalloc_noreclaim_save();
+
+ nid = folio_nid(lru_to_folio(folio_list));
+ do {
+ struct folio *folio = lru_to_folio(folio_list);
+
+ if (nid == folio_nid(folio)) {
+ list_move(&folio->lru, &node_folio_list);
+ continue;
+ }
+
+ nr_migrated += damon_migrate_folio_list(&node_folio_list,
+ NODE_DATA(nid),
+ target_nid);
+ nid = folio_nid(lru_to_folio(folio_list));
+ } while (!list_empty(folio_list));
+
+ nr_migrated += damon_migrate_folio_list(&node_folio_list,
+ NODE_DATA(nid),
+ target_nid);
+
+ memalloc_noreclaim_restore(noreclaim_flag);
+
+ return nr_migrated;
+}
+
+bool damos_ops_has_filter(struct damos *s)
+{
+ struct damos_filter *f;
+
+ damos_for_each_ops_filter(f, s)
+ return true;
+ return false;
+}
diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h
index 18d837d11bce..5efa5b5970de 100644
--- a/mm/damon/ops-common.h
+++ b/mm/damon/ops-common.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
- * Common Primitives for Data Access Monitoring
+ * Common Code for Data Access Monitoring
*
* Author: SeongJae Park <sj@kernel.org>
*/
@@ -11,8 +11,15 @@ struct folio *damon_get_folio(unsigned long pfn);
void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr);
void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr);
+void damon_folio_mkold(struct folio *folio);
+bool damon_folio_young(struct folio *folio);
int damon_cold_score(struct damon_ctx *c, struct damon_region *r,
struct damos *s);
int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
struct damos *s);
+
+bool damos_folio_filter_match(struct damos_filter *filter, struct folio *folio);
+unsigned long damon_migrate_pages(struct list_head *folio_list, int target_nid);
+
+bool damos_ops_has_filter(struct damos *s);
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 1b70d3f36046..07a8aead439e 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * DAMON Primitives for The Physical Address Space
+ * DAMON Code for The Physical Address Space
*
* Author: SeongJae Park <sj@kernel.org>
*/
@@ -13,52 +13,31 @@
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/memory-tiers.h>
-#include <linux/migrate.h>
#include <linux/mm_inline.h>
#include "../internal.h"
#include "ops-common.h"
-static bool damon_folio_mkold_one(struct folio *folio,
- struct vm_area_struct *vma, unsigned long addr, void *arg)
+static phys_addr_t damon_pa_phys_addr(
+ unsigned long addr, unsigned long addr_unit)
{
- DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
-
- while (page_vma_mapped_walk(&pvmw)) {
- addr = pvmw.address;
- if (pvmw.pte)
- damon_ptep_mkold(pvmw.pte, vma, addr);
- else
- damon_pmdp_mkold(pvmw.pmd, vma, addr);
- }
- return true;
+ return (phys_addr_t)addr * addr_unit;
}
-static void damon_folio_mkold(struct folio *folio)
+static unsigned long damon_pa_core_addr(
+ phys_addr_t pa, unsigned long addr_unit)
{
- struct rmap_walk_control rwc = {
- .rmap_one = damon_folio_mkold_one,
- .anon_lock = folio_lock_anon_vma_read,
- };
- bool need_lock;
-
- if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
- folio_set_idle(folio);
- return;
- }
-
- need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
- if (need_lock && !folio_trylock(folio))
- return;
-
- rmap_walk(folio, &rwc);
-
- if (need_lock)
- folio_unlock(folio);
-
+ /*
+ * Use div_u64() to avoid linking errors related to __udivdi3,
+ * __aeabi_uldivmod, or similar symbols on 32-bit builds.  It can also
+ * be faster than a plain 64-bit division (see the div_u64() comment
+ * for details).
+ */
+ if (sizeof(pa) == 8 && sizeof(addr_unit) == 4)
+ return div_u64(pa, addr_unit);
+ return pa / addr_unit;
}
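A worked example of the two helpers above, assuming a hypothetical addr_unit of 4096:

/*
 * With addr_unit == 4096, core (monitoring) address 0x100000 corresponds
 * to physical address
 *   damon_pa_phys_addr(0x100000, 4096) == 0x100000000 (4 GiB),
 * and damon_pa_core_addr(0x100000000, 4096) == 0x100000 maps it back.
 * This is what lets a 32-bit unsigned long core address space describe
 * physical ranges beyond 4 GiB.
 */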
-static void damon_pa_mkold(unsigned long paddr)
+static void damon_pa_mkold(phys_addr_t paddr)
{
struct folio *folio = damon_get_folio(PHYS_PFN(paddr));
@@ -69,11 +48,12 @@ static void damon_pa_mkold(unsigned long paddr)
folio_put(folio);
}
-static void __damon_pa_prepare_access_check(struct damon_region *r)
+static void __damon_pa_prepare_access_check(struct damon_region *r,
+ unsigned long addr_unit)
{
r->sampling_addr = damon_rand(r->ar.start, r->ar.end);
- damon_pa_mkold(r->sampling_addr);
+ damon_pa_mkold(damon_pa_phys_addr(r->sampling_addr, addr_unit));
}
static void damon_pa_prepare_access_checks(struct damon_ctx *ctx)
@@ -83,80 +63,11 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx)
damon_for_each_target(t, ctx) {
damon_for_each_region(r, t)
- __damon_pa_prepare_access_check(r);
+ __damon_pa_prepare_access_check(r, ctx->addr_unit);
}
}
-static bool damon_folio_young_one(struct folio *folio,
- struct vm_area_struct *vma, unsigned long addr, void *arg)
-{
- bool *accessed = arg;
- DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
- pte_t pte;
-
- *accessed = false;
- while (page_vma_mapped_walk(&pvmw)) {
- addr = pvmw.address;
- if (pvmw.pte) {
- pte = ptep_get(pvmw.pte);
-
- /*
- * PFN swap PTEs, such as device-exclusive ones, that
- * actually map pages are "old" from a CPU perspective.
- * The MMU notifier takes care of any device aspects.
- */
- *accessed = (pte_present(pte) && pte_young(pte)) ||
- !folio_test_idle(folio) ||
- mmu_notifier_test_young(vma->vm_mm, addr);
- } else {
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- *accessed = pmd_young(pmdp_get(pvmw.pmd)) ||
- !folio_test_idle(folio) ||
- mmu_notifier_test_young(vma->vm_mm, addr);
-#else
- WARN_ON_ONCE(1);
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
- }
- if (*accessed) {
- page_vma_mapped_walk_done(&pvmw);
- break;
- }
- }
-
- /* If accessed, stop walking */
- return *accessed == false;
-}
-
-static bool damon_folio_young(struct folio *folio)
-{
- bool accessed = false;
- struct rmap_walk_control rwc = {
- .arg = &accessed,
- .rmap_one = damon_folio_young_one,
- .anon_lock = folio_lock_anon_vma_read,
- };
- bool need_lock;
-
- if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
- if (folio_test_idle(folio))
- return false;
- else
- return true;
- }
-
- need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
- if (need_lock && !folio_trylock(folio))
- return false;
-
- rmap_walk(folio, &rwc);
-
- if (need_lock)
- folio_unlock(folio);
-
- return accessed;
-}
-
-static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz)
+static bool damon_pa_young(phys_addr_t paddr, unsigned long *folio_sz)
{
struct folio *folio = damon_get_folio(PHYS_PFN(paddr));
bool accessed;
@@ -171,23 +82,25 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz)
}
static void __damon_pa_check_access(struct damon_region *r,
- struct damon_attrs *attrs)
+ struct damon_attrs *attrs, unsigned long addr_unit)
{
- static unsigned long last_addr;
+ static phys_addr_t last_addr;
static unsigned long last_folio_sz = PAGE_SIZE;
static bool last_accessed;
+ phys_addr_t sampling_addr = damon_pa_phys_addr(
+ r->sampling_addr, addr_unit);
/* If the region is in the last checked page, reuse the result */
if (ALIGN_DOWN(last_addr, last_folio_sz) ==
- ALIGN_DOWN(r->sampling_addr, last_folio_sz)) {
+ ALIGN_DOWN(sampling_addr, last_folio_sz)) {
damon_update_region_access_rate(r, last_accessed, attrs);
return;
}
- last_accessed = damon_pa_young(r->sampling_addr, &last_folio_sz);
+ last_accessed = damon_pa_young(sampling_addr, &last_folio_sz);
damon_update_region_access_rate(r, last_accessed, attrs);
- last_addr = r->sampling_addr;
+ last_addr = sampling_addr;
}
static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
@@ -198,7 +111,8 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
damon_for_each_target(t, ctx) {
damon_for_each_region(r, t) {
- __damon_pa_check_access(r, &ctx->attrs);
+ __damon_pa_check_access(
+ r, &ctx->attrs, ctx->addr_unit);
max_nr_accesses = max(r->nr_accesses, max_nr_accesses);
}
}
@@ -206,49 +120,6 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
return max_nr_accesses;
}
-static bool damos_pa_filter_match(struct damos_filter *filter,
- struct folio *folio)
-{
- bool matched = false;
- struct mem_cgroup *memcg;
- size_t folio_sz;
-
- switch (filter->type) {
- case DAMOS_FILTER_TYPE_ANON:
- matched = folio_test_anon(folio);
- break;
- case DAMOS_FILTER_TYPE_ACTIVE:
- matched = folio_test_active(folio);
- break;
- case DAMOS_FILTER_TYPE_MEMCG:
- rcu_read_lock();
- memcg = folio_memcg_check(folio);
- if (!memcg)
- matched = false;
- else
- matched = filter->memcg_id == mem_cgroup_id(memcg);
- rcu_read_unlock();
- break;
- case DAMOS_FILTER_TYPE_YOUNG:
- matched = damon_folio_young(folio);
- if (matched)
- damon_folio_mkold(folio);
- break;
- case DAMOS_FILTER_TYPE_HUGEPAGE_SIZE:
- folio_sz = folio_size(folio);
- matched = filter->sz_range.min <= folio_sz &&
- folio_sz <= filter->sz_range.max;
- break;
- case DAMOS_FILTER_TYPE_UNMAPPED:
- matched = !folio_mapped(folio) || !folio_raw_mapping(folio);
- break;
- default:
- break;
- }
-
- return matched == filter->matching;
-}
-
/*
* damos_pa_filter_out - Return true if the page should be filtered out.
*/
@@ -260,7 +131,7 @@ static bool damos_pa_filter_out(struct damos *scheme, struct folio *folio)
return false;
damos_for_each_ops_filter(filter, scheme) {
- if (damos_pa_filter_match(filter, folio))
+ if (damos_folio_filter_match(filter, folio))
return !filter->allow;
}
return scheme->ops_filters_default_reject;
@@ -277,10 +148,11 @@ static bool damon_pa_invalid_damos_folio(struct folio *folio, struct damos *s)
return false;
}
-static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s,
+static unsigned long damon_pa_pageout(struct damon_region *r,
+ unsigned long addr_unit, struct damos *s,
unsigned long *sz_filter_passed)
{
- unsigned long addr, applied;
+ phys_addr_t addr, applied;
LIST_HEAD(folio_list);
bool install_young_filter = true;
struct damos_filter *filter;
@@ -301,8 +173,8 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s,
damos_add_filter(s, filter);
}
- addr = r->ar.start;
- while (addr < r->ar.end) {
+ addr = damon_pa_phys_addr(r->ar.start, addr_unit);
+ while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) {
folio = damon_get_folio(PHYS_PFN(addr));
if (damon_pa_invalid_damos_folio(folio, s)) {
addr += PAGE_SIZE;
@@ -312,7 +184,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s,
if (damos_pa_filter_out(s, folio))
goto put_folio;
else
- *sz_filter_passed += folio_size(folio);
+ *sz_filter_passed += folio_size(folio) / addr_unit;
folio_clear_referenced(folio);
folio_test_clear_young(folio);
@@ -331,18 +203,19 @@ put_folio:
applied = reclaim_pages(&folio_list);
cond_resched();
s->last_applied = folio;
- return applied * PAGE_SIZE;
+ return damon_pa_core_addr(applied * PAGE_SIZE, addr_unit);
}
static inline unsigned long damon_pa_mark_accessed_or_deactivate(
- struct damon_region *r, struct damos *s, bool mark_accessed,
+ struct damon_region *r, unsigned long addr_unit,
+ struct damos *s, bool mark_accessed,
unsigned long *sz_filter_passed)
{
- unsigned long addr, applied = 0;
+ phys_addr_t addr, applied = 0;
struct folio *folio;
- addr = r->ar.start;
- while (addr < r->ar.end) {
+ addr = damon_pa_phys_addr(r->ar.start, addr_unit);
+ while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) {
folio = damon_get_folio(PHYS_PFN(addr));
if (damon_pa_invalid_damos_folio(folio, s)) {
addr += PAGE_SIZE;
@@ -352,7 +225,7 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate(
if (damos_pa_filter_out(s, folio))
goto put_folio;
else
- *sz_filter_passed += folio_size(folio);
+ *sz_filter_passed += folio_size(folio) / addr_unit;
if (mark_accessed)
folio_mark_accessed(folio);
@@ -364,153 +237,35 @@ put_folio:
folio_put(folio);
}
s->last_applied = folio;
- return applied * PAGE_SIZE;
+ return damon_pa_core_addr(applied * PAGE_SIZE, addr_unit);
}
static unsigned long damon_pa_mark_accessed(struct damon_region *r,
- struct damos *s, unsigned long *sz_filter_passed)
+ unsigned long addr_unit, struct damos *s,
+ unsigned long *sz_filter_passed)
{
- return damon_pa_mark_accessed_or_deactivate(r, s, true,
+ return damon_pa_mark_accessed_or_deactivate(r, addr_unit, s, true,
sz_filter_passed);
}
static unsigned long damon_pa_deactivate_pages(struct damon_region *r,
- struct damos *s, unsigned long *sz_filter_passed)
+ unsigned long addr_unit, struct damos *s,
+ unsigned long *sz_filter_passed)
{
- return damon_pa_mark_accessed_or_deactivate(r, s, false,
+ return damon_pa_mark_accessed_or_deactivate(r, addr_unit, s, false,
sz_filter_passed);
}
-static unsigned int __damon_pa_migrate_folio_list(
- struct list_head *migrate_folios, struct pglist_data *pgdat,
- int target_nid)
-{
- unsigned int nr_succeeded = 0;
- nodemask_t allowed_mask = NODE_MASK_NONE;
- struct migration_target_control mtc = {
- /*
- * Allocate from 'node', or fail quickly and quietly.
- * When this happens, 'page' will likely just be discarded
- * instead of migrated.
- */
- .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
- __GFP_NOWARN | __GFP_NOMEMALLOC | GFP_NOWAIT,
- .nid = target_nid,
- .nmask = &allowed_mask
- };
-
- if (pgdat->node_id == target_nid || target_nid == NUMA_NO_NODE)
- return 0;
-
- if (list_empty(migrate_folios))
- return 0;
-
- /* Migration ignores all cpuset and mempolicy settings */
- migrate_pages(migrate_folios, alloc_migrate_folio, NULL,
- (unsigned long)&mtc, MIGRATE_ASYNC, MR_DAMON,
- &nr_succeeded);
-
- return nr_succeeded;
-}
-
-static unsigned int damon_pa_migrate_folio_list(struct list_head *folio_list,
- struct pglist_data *pgdat,
- int target_nid)
-{
- unsigned int nr_migrated = 0;
- struct folio *folio;
- LIST_HEAD(ret_folios);
- LIST_HEAD(migrate_folios);
-
- while (!list_empty(folio_list)) {
- struct folio *folio;
-
- cond_resched();
-
- folio = lru_to_folio(folio_list);
- list_del(&folio->lru);
-
- if (!folio_trylock(folio))
- goto keep;
-
- /* Relocate its contents to another node. */
- list_add(&folio->lru, &migrate_folios);
- folio_unlock(folio);
- continue;
-keep:
- list_add(&folio->lru, &ret_folios);
- }
- /* 'folio_list' is always empty here */
-
- /* Migrate folios selected for migration */
- nr_migrated += __damon_pa_migrate_folio_list(
- &migrate_folios, pgdat, target_nid);
- /*
- * Folios that could not be migrated are still in @migrate_folios. Add
- * those back on @folio_list
- */
- if (!list_empty(&migrate_folios))
- list_splice_init(&migrate_folios, folio_list);
-
- try_to_unmap_flush();
-
- list_splice(&ret_folios, folio_list);
-
- while (!list_empty(folio_list)) {
- folio = lru_to_folio(folio_list);
- list_del(&folio->lru);
- folio_putback_lru(folio);
- }
-
- return nr_migrated;
-}
-
-static unsigned long damon_pa_migrate_pages(struct list_head *folio_list,
- int target_nid)
-{
- int nid;
- unsigned long nr_migrated = 0;
- LIST_HEAD(node_folio_list);
- unsigned int noreclaim_flag;
-
- if (list_empty(folio_list))
- return nr_migrated;
-
- noreclaim_flag = memalloc_noreclaim_save();
-
- nid = folio_nid(lru_to_folio(folio_list));
- do {
- struct folio *folio = lru_to_folio(folio_list);
-
- if (nid == folio_nid(folio)) {
- list_move(&folio->lru, &node_folio_list);
- continue;
- }
-
- nr_migrated += damon_pa_migrate_folio_list(&node_folio_list,
- NODE_DATA(nid),
- target_nid);
- nid = folio_nid(lru_to_folio(folio_list));
- } while (!list_empty(folio_list));
-
- nr_migrated += damon_pa_migrate_folio_list(&node_folio_list,
- NODE_DATA(nid),
- target_nid);
-
- memalloc_noreclaim_restore(noreclaim_flag);
-
- return nr_migrated;
-}
-
-static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s,
+static unsigned long damon_pa_migrate(struct damon_region *r,
+ unsigned long addr_unit, struct damos *s,
unsigned long *sz_filter_passed)
{
- unsigned long addr, applied;
+ phys_addr_t addr, applied;
LIST_HEAD(folio_list);
struct folio *folio;
- addr = r->ar.start;
- while (addr < r->ar.end) {
+ addr = damon_pa_phys_addr(r->ar.start, addr_unit);
+ while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) {
folio = damon_get_folio(PHYS_PFN(addr));
if (damon_pa_invalid_damos_folio(folio, s)) {
addr += PAGE_SIZE;
@@ -520,7 +275,7 @@ static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s,
if (damos_pa_filter_out(s, folio))
goto put_folio;
else
- *sz_filter_passed += folio_size(folio);
+ *sz_filter_passed += folio_size(folio) / addr_unit;
if (!folio_isolate_lru(folio))
goto put_folio;
@@ -529,33 +284,24 @@ put_folio:
addr += folio_size(folio);
folio_put(folio);
}
- applied = damon_pa_migrate_pages(&folio_list, s->target_nid);
+ applied = damon_migrate_pages(&folio_list, s->target_nid);
cond_resched();
s->last_applied = folio;
- return applied * PAGE_SIZE;
-}
-
-static bool damon_pa_scheme_has_filter(struct damos *s)
-{
- struct damos_filter *f;
-
- damos_for_each_ops_filter(f, s)
- return true;
- return false;
+ return damon_pa_core_addr(applied * PAGE_SIZE, addr_unit);
}
-static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s,
+static unsigned long damon_pa_stat(struct damon_region *r,
+ unsigned long addr_unit, struct damos *s,
unsigned long *sz_filter_passed)
{
- unsigned long addr;
- LIST_HEAD(folio_list);
+ phys_addr_t addr;
struct folio *folio;
- if (!damon_pa_scheme_has_filter(s))
+ if (!damos_ops_has_filter(s))
return 0;
- addr = r->ar.start;
- while (addr < r->ar.end) {
+ addr = damon_pa_phys_addr(r->ar.start, addr_unit);
+ while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) {
folio = damon_get_folio(PHYS_PFN(addr));
if (damon_pa_invalid_damos_folio(folio, s)) {
addr += PAGE_SIZE;
@@ -563,7 +309,7 @@ static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s,
}
if (!damos_pa_filter_out(s, folio))
- *sz_filter_passed += folio_size(folio);
+ *sz_filter_passed += folio_size(folio) / addr_unit;
addr += folio_size(folio);
folio_put(folio);
}
@@ -575,18 +321,22 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
struct damon_target *t, struct damon_region *r,
struct damos *scheme, unsigned long *sz_filter_passed)
{
+ unsigned long aunit = ctx->addr_unit;
+
switch (scheme->action) {
case DAMOS_PAGEOUT:
- return damon_pa_pageout(r, scheme, sz_filter_passed);
+ return damon_pa_pageout(r, aunit, scheme, sz_filter_passed);
case DAMOS_LRU_PRIO:
- return damon_pa_mark_accessed(r, scheme, sz_filter_passed);
+ return damon_pa_mark_accessed(r, aunit, scheme,
+ sz_filter_passed);
case DAMOS_LRU_DEPRIO:
- return damon_pa_deactivate_pages(r, scheme, sz_filter_passed);
+ return damon_pa_deactivate_pages(r, aunit, scheme,
+ sz_filter_passed);
case DAMOS_MIGRATE_HOT:
case DAMOS_MIGRATE_COLD:
- return damon_pa_migrate(r, scheme, sz_filter_passed);
+ return damon_pa_migrate(r, aunit, scheme, sz_filter_passed);
case DAMOS_STAT:
- return damon_pa_stat(r, scheme, sz_filter_passed);
+ return damon_pa_stat(r, aunit, scheme, sz_filter_passed);
default:
/* DAMOS actions that not yet supported by 'paddr'. */
break;
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index a675150965e0..7ba3d0f9a19a 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -129,6 +129,13 @@ static unsigned long monitor_region_end __read_mostly;
module_param(monitor_region_end, ulong, 0600);
/*
+ * Scale factor for DAMON_RECLAIM to ops address conversion.
+ *
+ * This parameter must not be set to 0.
+ */
+static unsigned long addr_unit __read_mostly = 1;
+
+/*
* Skip anonymous pages reclamation.
*
* If this parameter is set as ``Y``, DAMON_RECLAIM does not reclaim anonymous
@@ -194,7 +201,21 @@ static int damon_reclaim_apply_parameters(void)
if (err)
return err;
- err = damon_set_attrs(ctx, &damon_reclaim_mon_attrs);
+ /*
+ * If monitor_region_start/end are unset, always silently
+ * reset addr_unit to 1.
+ */
+ if (!monitor_region_start && !monitor_region_end)
+ addr_unit = 1;
+ param_ctx->addr_unit = addr_unit;
+ param_ctx->min_sz_region = max(DAMON_MIN_REGION / addr_unit, 1);
+
+ if (!damon_reclaim_mon_attrs.aggr_interval) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = damon_set_attrs(param_ctx, &damon_reclaim_mon_attrs);
if (err)
goto out;
@@ -202,7 +223,7 @@ static int damon_reclaim_apply_parameters(void)
scheme = damon_reclaim_new_scheme();
if (!scheme)
goto out;
- damon_set_schemes(ctx, &scheme, 1);
+ damon_set_schemes(param_ctx, &scheme, 1);
if (quota_mem_pressure_us) {
goal = damos_new_quota_goal(DAMOS_QUOTA_SOME_MEM_PSI_US,
@@ -238,6 +259,35 @@ out:
return err;
}
+static int damon_reclaim_handle_commit_inputs(void)
+{
+ int err;
+
+ if (!commit_inputs)
+ return 0;
+
+ err = damon_reclaim_apply_parameters();
+ commit_inputs = false;
+ return err;
+}
+
+static int damon_reclaim_damon_call_fn(void *arg)
+{
+ struct damon_ctx *c = arg;
+ struct damos *s;
+
+ /* update the stats parameter */
+ damon_for_each_scheme(s, c)
+ damon_reclaim_stat = s->stat;
+
+ return damon_reclaim_handle_commit_inputs();
+}
+
+static struct damon_call_control call_control = {
+ .fn = damon_reclaim_damon_call_fn,
+ .repeat = true,
+};
+
static int damon_reclaim_turn(bool on)
{
int err;
@@ -257,9 +307,33 @@ static int damon_reclaim_turn(bool on)
if (err)
return err;
kdamond_pid = ctx->kdamond->pid;
+ return damon_call(ctx, &call_control);
+}
+
+static int damon_reclaim_addr_unit_store(const char *val,
+ const struct kernel_param *kp)
+{
+ unsigned long input_addr_unit;
+ int err = kstrtoul(val, 0, &input_addr_unit);
+
+ if (err)
+ return err;
+ if (!input_addr_unit)
+ return -EINVAL;
+
+ addr_unit = input_addr_unit;
return 0;
}
+static const struct kernel_param_ops addr_unit_param_ops = {
+ .set = damon_reclaim_addr_unit_store,
+ .get = param_get_ulong,
+};
+
+module_param_cb(addr_unit, &addr_unit_param_ops, &addr_unit, 0600);
+MODULE_PARM_DESC(addr_unit,
+ "Scale factor for DAMON_RECLAIM to ops address conversion (default: 1)");
+
static int damon_reclaim_enabled_store(const char *val,
const struct kernel_param *kp)
{
@@ -275,7 +349,7 @@ static int damon_reclaim_enabled_store(const char *val,
return 0;
/* Called before init function. The function will handle this. */
- if (!ctx)
+ if (!damon_initialized())
goto set_param_out;
err = damon_reclaim_turn(enable);
@@ -296,48 +370,27 @@ module_param_cb(enabled, &enabled_param_ops, &enabled, 0600);
MODULE_PARM_DESC(enabled,
"Enable or disable DAMON_RECLAIM (default: disabled)");
-static int damon_reclaim_handle_commit_inputs(void)
-{
- int err;
-
- if (!commit_inputs)
- return 0;
-
- err = damon_reclaim_apply_parameters();
- commit_inputs = false;
- return err;
-}
-
-static int damon_reclaim_after_aggregation(struct damon_ctx *c)
-{
- struct damos *s;
-
- /* update the stats parameter */
- damon_for_each_scheme(s, c)
- damon_reclaim_stat = s->stat;
-
- return damon_reclaim_handle_commit_inputs();
-}
-
-static int damon_reclaim_after_wmarks_check(struct damon_ctx *c)
-{
- return damon_reclaim_handle_commit_inputs();
-}
-
static int __init damon_reclaim_init(void)
{
- int err = damon_modules_new_paddr_ctx_target(&ctx, &target);
+ int err;
+ if (!damon_initialized()) {
+ err = -ENOMEM;
+ goto out;
+ }
+ err = damon_modules_new_paddr_ctx_target(&ctx, &target);
if (err)
- return err;
+ goto out;
- ctx->callback.after_wmarks_check = damon_reclaim_after_wmarks_check;
- ctx->callback.after_aggregation = damon_reclaim_after_aggregation;
+ call_control.data = ctx;
/* 'enabled' has been set before this function, probably via the command line */
if (enabled)
err = damon_reclaim_turn(true);
+out:
+ if (err && enabled)
+ enabled = false;
return err;
}
diff --git a/mm/damon/stat.c b/mm/damon/stat.c
new file mode 100644
index 000000000000..d8010968bbed
--- /dev/null
+++ b/mm/damon/stat.c
@@ -0,0 +1,272 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shows data access monitoring results in simple metrics.
+ */
+
+#define pr_fmt(fmt) "damon-stat: " fmt
+
+#include <linux/damon.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sort.h>
+
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "damon_stat."
+
+static int damon_stat_enabled_store(
+ const char *val, const struct kernel_param *kp);
+
+static const struct kernel_param_ops enabled_param_ops = {
+ .set = damon_stat_enabled_store,
+ .get = param_get_bool,
+};
+
+static bool enabled __read_mostly = IS_ENABLED(
+ CONFIG_DAMON_STAT_ENABLED_DEFAULT);
+module_param_cb(enabled, &enabled_param_ops, &enabled, 0600);
+MODULE_PARM_DESC(enabled, "Enable or disable DAMON_STAT");
+
+static unsigned long estimated_memory_bandwidth __read_mostly;
+module_param(estimated_memory_bandwidth, ulong, 0400);
+MODULE_PARM_DESC(estimated_memory_bandwidth,
+ "Estimated memory bandwidth usage in bytes per second");
+
+static long memory_idle_ms_percentiles[101] __read_mostly = {0,};
+module_param_array(memory_idle_ms_percentiles, long, NULL, 0400);
+MODULE_PARM_DESC(memory_idle_ms_percentiles,
+ "Memory idle time percentiles in milliseconds");
+
+static unsigned long aggr_interval_us;
+module_param(aggr_interval_us, ulong, 0400);
+MODULE_PARM_DESC(aggr_interval_us,
+ "Current tuned aggregation interval in microseconds");
+
+static struct damon_ctx *damon_stat_context;
+
+static void damon_stat_set_estimated_memory_bandwidth(struct damon_ctx *c)
+{
+ struct damon_target *t;
+ struct damon_region *r;
+ unsigned long access_bytes = 0;
+
+ damon_for_each_target(t, c) {
+ damon_for_each_region(r, t)
+ access_bytes += (r->ar.end - r->ar.start) *
+ r->nr_accesses;
+ }
+ estimated_memory_bandwidth = access_bytes * USEC_PER_MSEC *
+ MSEC_PER_SEC / c->attrs.aggr_interval;
+}
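A worked example of the estimate above, using hypothetical region and interval values:

/*
 * One monitored region of 1 GiB with nr_accesses == 2, and
 * aggr_interval == 100000 us (100 ms):
 *   access_bytes = 1 GiB * 2 = 2 GiB
 *   estimated_memory_bandwidth = 2 GiB * 1000 * 1000 / 100000 = 20 GiB/s
 * i.e. bytes observed as accessed per aggregation interval, scaled to
 * bytes per second.
 */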
+
+static int damon_stat_idletime(const struct damon_region *r)
+{
+ if (r->nr_accesses)
+ return -1 * (r->age + 1);
+ return r->age + 1;
+}
+
+static int damon_stat_cmp_regions(const void *a, const void *b)
+{
+ const struct damon_region *ra = *(const struct damon_region **)a;
+ const struct damon_region *rb = *(const struct damon_region **)b;
+
+ return damon_stat_idletime(ra) - damon_stat_idletime(rb);
+}
+
+static int damon_stat_sort_regions(struct damon_ctx *c,
+ struct damon_region ***sorted_ptr, int *nr_regions_ptr,
+ unsigned long *total_sz_ptr)
+{
+ struct damon_target *t;
+ struct damon_region *r;
+ struct damon_region **region_pointers;
+ unsigned int nr_regions = 0;
+ unsigned long total_sz = 0;
+
+ damon_for_each_target(t, c) {
+ /* there is only one target */
+ region_pointers = kmalloc_array(damon_nr_regions(t),
+ sizeof(*region_pointers), GFP_KERNEL);
+ if (!region_pointers)
+ return -ENOMEM;
+ damon_for_each_region(r, t) {
+ region_pointers[nr_regions++] = r;
+ total_sz += r->ar.end - r->ar.start;
+ }
+ }
+ sort(region_pointers, nr_regions, sizeof(*region_pointers),
+ damon_stat_cmp_regions, NULL);
+ *sorted_ptr = region_pointers;
+ *nr_regions_ptr = nr_regions;
+ *total_sz_ptr = total_sz;
+ return 0;
+}
+
+static void damon_stat_set_idletime_percentiles(struct damon_ctx *c)
+{
+ struct damon_region **sorted_regions, *region;
+ int nr_regions;
+ unsigned long total_sz, accounted_bytes = 0;
+ int err, i, next_percentile = 0;
+
+ err = damon_stat_sort_regions(c, &sorted_regions, &nr_regions,
+ &total_sz);
+ if (err)
+ return;
+ for (i = 0; i < nr_regions; i++) {
+ region = sorted_regions[i];
+ accounted_bytes += region->ar.end - region->ar.start;
+ while (next_percentile <= accounted_bytes * 100 / total_sz)
+ memory_idle_ms_percentiles[next_percentile++] =
+ damon_stat_idletime(region) *
+ (long)c->attrs.aggr_interval / USEC_PER_MSEC;
+ }
+ kfree(sorted_regions);
+}
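A worked example of the percentile entries filled above, following the sign convention of damon_stat_idletime() and a hypothetical aggregation interval:

/*
 * With aggr_interval == 100000 us (100 ms):
 *   - an idle region (nr_accesses == 0) of age 4 contributes
 *     (4 + 1) * 100000 / 1000 == 500 ms, while
 *   - an accessed region of age 2 contributes
 *     -(2 + 1) * 100000 / 1000 == -300 ms,
 * so negative percentile values mean "continuously accessed for that
 * long".  Regions are sorted by this value, and percentile p takes the
 * value of the region covering the p-th percent of the total monitored
 * bytes.
 */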
+
+static int damon_stat_damon_call_fn(void *data)
+{
+ struct damon_ctx *c = data;
+ static unsigned long last_refresh_jiffies;
+
+ /* avoid unnecessarily frequent stat update */
+ if (time_before_eq(jiffies, last_refresh_jiffies +
+ msecs_to_jiffies(5 * MSEC_PER_SEC)))
+ return 0;
+ last_refresh_jiffies = jiffies;
+
+ aggr_interval_us = c->attrs.aggr_interval;
+ damon_stat_set_estimated_memory_bandwidth(c);
+ damon_stat_set_idletime_percentiles(c);
+ return 0;
+}
+
+static struct damon_ctx *damon_stat_build_ctx(void)
+{
+ struct damon_ctx *ctx;
+ struct damon_attrs attrs;
+ struct damon_target *target;
+ unsigned long start = 0, end = 0;
+
+ ctx = damon_new_ctx();
+ if (!ctx)
+ return NULL;
+ attrs = (struct damon_attrs) {
+ .sample_interval = 5 * USEC_PER_MSEC,
+ .aggr_interval = 100 * USEC_PER_MSEC,
+ .ops_update_interval = 60 * USEC_PER_MSEC * MSEC_PER_SEC,
+ .min_nr_regions = 10,
+ .max_nr_regions = 1000,
+ };
+ /*
+ * auto-tune sampling and aggregation interval aiming 4% DAMON-observed
+ * accesses ratio, keeping sampling interval in [5ms, 10s] range.
+ */
+ attrs.intervals_goal = (struct damon_intervals_goal) {
+ .access_bp = 400, .aggrs = 3,
+ .min_sample_us = 5000, .max_sample_us = 10000000,
+ };
+ if (damon_set_attrs(ctx, &attrs))
+ goto free_out;
+
+ /*
+ * auto-tune sampling and aggregation interval aiming 4% DAMON-observed
+ * accesses ratio, keeping sampling interval in [5ms, 10s] range.
+ */
+ ctx->attrs.intervals_goal = (struct damon_intervals_goal) {
+ .access_bp = 400, .aggrs = 3,
+ .min_sample_us = 5000, .max_sample_us = 10000000,
+ };
+ if (damon_select_ops(ctx, DAMON_OPS_PADDR))
+ goto free_out;
+
+ target = damon_new_target();
+ if (!target)
+ goto free_out;
+ damon_add_target(ctx, target);
+ if (damon_set_region_biggest_system_ram_default(target, &start, &end))
+ goto free_out;
+ return ctx;
+free_out:
+ damon_destroy_ctx(ctx);
+ return NULL;
+}
+
+static struct damon_call_control call_control = {
+ .fn = damon_stat_damon_call_fn,
+ .repeat = true,
+};
+
+static int damon_stat_start(void)
+{
+ int err;
+
+ damon_stat_context = damon_stat_build_ctx();
+ if (!damon_stat_context)
+ return -ENOMEM;
+ err = damon_start(&damon_stat_context, 1, true);
+ if (err)
+ return err;
+ call_control.data = damon_stat_context;
+ return damon_call(damon_stat_context, &call_control);
+}
+
+static void damon_stat_stop(void)
+{
+ damon_stop(&damon_stat_context, 1);
+ damon_destroy_ctx(damon_stat_context);
+}
+
+static int damon_stat_enabled_store(
+ const char *val, const struct kernel_param *kp)
+{
+ bool is_enabled = enabled;
+ int err;
+
+ err = kstrtobool(val, &enabled);
+ if (err)
+ return err;
+
+ if (is_enabled == enabled)
+ return 0;
+
+ if (!damon_initialized())
+ /*
+ * probably called from command line parsing (parse_args()).
+ * Cannot call damon_new_ctx(). Let damon_stat_init() handle.
+ */
+ return 0;
+
+ if (enabled) {
+ err = damon_stat_start();
+ if (err)
+ enabled = false;
+ return err;
+ }
+ damon_stat_stop();
+ return 0;
+}
+
+static int __init damon_stat_init(void)
+{
+ int err = 0;
+
+ if (!damon_initialized()) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* probably set via command line */
+ if (enabled)
+ err = damon_stat_start();
+
+out:
+ if (err && enabled)
+ enabled = false;
+ return err;
+}
+
+module_init(damon_stat_init);
diff --git a/mm/damon/sysfs-common.c b/mm/damon/sysfs-common.c
index 70edf45c2174..ffaf285e241a 100644
--- a/mm/damon/sysfs-common.c
+++ b/mm/damon/sysfs-common.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Common Primitives for DAMON Sysfs Interface
+ * Common Code for DAMON Sysfs Interface
*
* Author: SeongJae Park <sj@kernel.org>
*/
diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h
index 70d84bdc9f5f..2099adee11d0 100644
--- a/mm/damon/sysfs-common.h
+++ b/mm/damon/sysfs-common.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
- * Common Primitives for DAMON Sysfs Interface
+ * Common Code for DAMON Sysfs Interface
*
* Author: SeongJae Park <sj@kernel.org>
*/
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 23b562df0839..6536f16006c9 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -341,16 +341,45 @@ static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(
return filter;
}
-/* Should match with enum damos_filter_type */
-static const char * const damon_sysfs_scheme_filter_type_strs[] = {
- "anon",
- "active",
- "memcg",
- "young",
- "hugepage_size",
- "unmapped",
- "addr",
- "target",
+struct damos_sysfs_filter_type_name {
+ enum damos_filter_type type;
+ char *name;
+};
+
+static const struct damos_sysfs_filter_type_name
+damos_sysfs_filter_type_names[] = {
+ {
+ .type = DAMOS_FILTER_TYPE_ANON,
+ .name = "anon",
+ },
+ {
+ .type = DAMOS_FILTER_TYPE_ACTIVE,
+ .name = "active",
+ },
+ {
+ .type = DAMOS_FILTER_TYPE_MEMCG,
+ .name = "memcg",
+ },
+ {
+ .type = DAMOS_FILTER_TYPE_YOUNG,
+ .name = "young",
+ },
+ {
+ .type = DAMOS_FILTER_TYPE_HUGEPAGE_SIZE,
+ .name = "hugepage_size",
+ },
+ {
+ .type = DAMOS_FILTER_TYPE_UNMAPPED,
+ .name = "unmapped",
+ },
+ {
+ .type = DAMOS_FILTER_TYPE_ADDR,
+ .name = "addr",
+ },
+ {
+ .type = DAMOS_FILTER_TYPE_TARGET,
+ .name = "target",
+ },
};
static ssize_t type_show(struct kobject *kobj,
@@ -358,9 +387,16 @@ static ssize_t type_show(struct kobject *kobj,
{
struct damon_sysfs_scheme_filter *filter = container_of(kobj,
struct damon_sysfs_scheme_filter, kobj);
+ int i;
- return sysfs_emit(buf, "%s\n",
- damon_sysfs_scheme_filter_type_strs[filter->type]);
+ for (i = 0; i < ARRAY_SIZE(damos_sysfs_filter_type_names); i++) {
+ const struct damos_sysfs_filter_type_name *type_name;
+
+ type_name = &damos_sysfs_filter_type_names[i];
+ if (type_name->type == filter->type)
+ return sysfs_emit(buf, "%s\n", type_name->name);
+ }
+ return -EINVAL;
}
static bool damos_sysfs_scheme_filter_valid_type(
@@ -385,16 +421,19 @@ static ssize_t type_store(struct kobject *kobj,
{
struct damon_sysfs_scheme_filter *filter = container_of(kobj,
struct damon_sysfs_scheme_filter, kobj);
- enum damos_filter_type type;
ssize_t ret = -EINVAL;
+ int i;
- for (type = 0; type < NR_DAMOS_FILTER_TYPES; type++) {
- if (sysfs_streq(buf, damon_sysfs_scheme_filter_type_strs[
- type])) {
+ for (i = 0; i < ARRAY_SIZE(damos_sysfs_filter_type_names); i++) {
+ const struct damos_sysfs_filter_type_name *type_name;
+
+ type_name = &damos_sysfs_filter_type_names[i];
+ if (sysfs_streq(buf, type_name->name)) {
if (!damos_sysfs_scheme_filter_valid_type(
- filter->handle_layer, type))
+ filter->handle_layer,
+ type_name->type))
break;
- filter->type = type;
+ filter->type = type_name->type;
ret = count;
break;
}
@@ -465,12 +504,14 @@ static ssize_t memcg_path_store(struct kobject *kobj,
{
struct damon_sysfs_scheme_filter *filter = container_of(kobj,
struct damon_sysfs_scheme_filter, kobj);
- char *path = kmalloc(sizeof(*path) * (count + 1), GFP_KERNEL);
+ char *path = kmalloc_array(size_add(count, 1), sizeof(*path),
+ GFP_KERNEL);
if (!path)
return -ENOMEM;
strscpy(path, buf, count + 1);
+ kfree(filter->memcg_path);
filter->memcg_path = path;
return count;
}
@@ -783,10 +824,21 @@ static struct damon_sysfs_watermarks *damon_sysfs_watermarks_alloc(
return watermarks;
}
-/* Should match with enum damos_wmark_metric */
-static const char * const damon_sysfs_wmark_metric_strs[] = {
- "none",
- "free_mem_rate",
+struct damos_sysfs_wmark_metric_name {
+ enum damos_wmark_metric metric;
+ char *name;
+};
+
+static const struct damos_sysfs_wmark_metric_name
+damos_sysfs_wmark_metric_names[] = {
+ {
+ .metric = DAMOS_WMARK_NONE,
+ .name = "none",
+ },
+ {
+ .metric = DAMOS_WMARK_FREE_MEM_RATE,
+ .name = "free_mem_rate",
+ },
};
static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr,
@@ -794,9 +846,16 @@ static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr,
{
struct damon_sysfs_watermarks *watermarks = container_of(kobj,
struct damon_sysfs_watermarks, kobj);
+ int i;
- return sysfs_emit(buf, "%s\n",
- damon_sysfs_wmark_metric_strs[watermarks->metric]);
+ for (i = 0; i < ARRAY_SIZE(damos_sysfs_wmark_metric_names); i++) {
+ const struct damos_sysfs_wmark_metric_name *metric_name;
+
+ metric_name = &damos_sysfs_wmark_metric_names[i];
+ if (metric_name->metric == watermarks->metric)
+ return sysfs_emit(buf, "%s\n", metric_name->name);
+ }
+ return -EINVAL;
}
static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -804,11 +863,14 @@ static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr,
{
struct damon_sysfs_watermarks *watermarks = container_of(kobj,
struct damon_sysfs_watermarks, kobj);
- enum damos_wmark_metric metric;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(damos_sysfs_wmark_metric_names); i++) {
+ const struct damos_sysfs_wmark_metric_name *metric_name;
- for (metric = 0; metric < NR_DAMOS_WMARK_METRICS; metric++) {
- if (sysfs_streq(buf, damon_sysfs_wmark_metric_strs[metric])) {
- watermarks->metric = metric;
+ metric_name = &damos_sysfs_wmark_metric_names[i];
+ if (sysfs_streq(buf, metric_name->name)) {
+ watermarks->metric = metric_name->metric;
return count;
}
}
@@ -936,12 +998,7 @@ struct damos_sysfs_quota_goal {
enum damos_quota_goal_metric metric;
unsigned long target_value;
unsigned long current_value;
-};
-
-/* This should match with enum damos_action */
-static const char * const damos_sysfs_quota_goal_metric_strs[] = {
- "user_input",
- "some_mem_psi_us",
+ int nid;
};
static struct damos_sysfs_quota_goal *damos_sysfs_quota_goal_alloc(void)
@@ -949,14 +1006,46 @@ static struct damos_sysfs_quota_goal *damos_sysfs_quota_goal_alloc(void)
return kzalloc(sizeof(struct damos_sysfs_quota_goal), GFP_KERNEL);
}
+struct damos_sysfs_qgoal_metric_name {
+ enum damos_quota_goal_metric metric;
+ char *name;
+};
+
+static
+struct damos_sysfs_qgoal_metric_name damos_sysfs_qgoal_metric_names[] = {
+ {
+ .metric = DAMOS_QUOTA_USER_INPUT,
+ .name = "user_input",
+ },
+ {
+ .metric = DAMOS_QUOTA_SOME_MEM_PSI_US,
+ .name = "some_mem_psi_us",
+ },
+ {
+ .metric = DAMOS_QUOTA_NODE_MEM_USED_BP,
+ .name = "node_mem_used_bp",
+ },
+ {
+ .metric = DAMOS_QUOTA_NODE_MEM_FREE_BP,
+ .name = "node_mem_free_bp",
+ },
+};
+
static ssize_t target_metric_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
struct damos_sysfs_quota_goal *goal = container_of(kobj,
struct damos_sysfs_quota_goal, kobj);
+ int i;
- return sysfs_emit(buf, "%s\n",
- damos_sysfs_quota_goal_metric_strs[goal->metric]);
+ for (i = 0; i < ARRAY_SIZE(damos_sysfs_qgoal_metric_names); i++) {
+ struct damos_sysfs_qgoal_metric_name *metric_name;
+
+ metric_name = &damos_sysfs_qgoal_metric_names[i];
+ if (metric_name->metric == goal->metric)
+ return sysfs_emit(buf, "%s\n", metric_name->name);
+ }
+ return -EINVAL;
}
static ssize_t target_metric_store(struct kobject *kobj,
@@ -964,11 +1053,14 @@ static ssize_t target_metric_store(struct kobject *kobj,
{
struct damos_sysfs_quota_goal *goal = container_of(kobj,
struct damos_sysfs_quota_goal, kobj);
- enum damos_quota_goal_metric m;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(damos_sysfs_qgoal_metric_names); i++) {
+ struct damos_sysfs_qgoal_metric_name *metric_name;
- for (m = 0; m < NR_DAMOS_QUOTA_GOAL_METRICS; m++) {
- if (sysfs_streq(buf, damos_sysfs_quota_goal_metric_strs[m])) {
- goal->metric = m;
+ metric_name = &damos_sysfs_qgoal_metric_names[i];
+ if (sysfs_streq(buf, metric_name->name)) {
+ goal->metric = metric_name->metric;
return count;
}
}
@@ -1014,6 +1106,28 @@ static ssize_t current_value_store(struct kobject *kobj,
return err ? err : count;
}
+static ssize_t nid_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damos_sysfs_quota_goal *goal = container_of(kobj, struct
+ damos_sysfs_quota_goal, kobj);
+
+ /* todo: return error if the goal is not using nid */
+
+ return sysfs_emit(buf, "%d\n", goal->nid);
+}
+
+static ssize_t nid_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damos_sysfs_quota_goal *goal = container_of(kobj, struct
+ damos_sysfs_quota_goal, kobj);
+ int err = kstrtoint(buf, 0, &goal->nid);
+
+ /* feed callback should check existence of this file and read value */
+ return err ? err : count;
+}
+
static void damos_sysfs_quota_goal_release(struct kobject *kobj)
{
/* or, notify this release to the feed callback */
@@ -1029,10 +1143,14 @@ static struct kobj_attribute damos_sysfs_quota_goal_target_value_attr =
static struct kobj_attribute damos_sysfs_quota_goal_current_value_attr =
__ATTR_RW_MODE(current_value, 0600);
+static struct kobj_attribute damos_sysfs_quota_goal_nid_attr =
+ __ATTR_RW_MODE(nid, 0600);
+
static struct attribute *damos_sysfs_quota_goal_attrs[] = {
&damos_sysfs_quota_goal_target_metric_attr.attr,
&damos_sysfs_quota_goal_target_value_attr.attr,
&damos_sysfs_quota_goal_current_value_attr.attr,
+ &damos_sysfs_quota_goal_nid_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damos_sysfs_quota_goal);
@@ -1538,6 +1656,204 @@ static const struct kobj_type damon_sysfs_access_pattern_ktype = {
};
/*
+ * dest (action destination) directory
+ */
+
+struct damos_sysfs_dest {
+ struct kobject kobj;
+ unsigned int id;
+ unsigned int weight;
+};
+
+static struct damos_sysfs_dest *damos_sysfs_dest_alloc(void)
+{
+ return kzalloc(sizeof(struct damos_sysfs_dest), GFP_KERNEL);
+}
+
+static ssize_t id_show(
+ struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ struct damos_sysfs_dest *dest = container_of(kobj,
+ struct damos_sysfs_dest, kobj);
+
+ return sysfs_emit(buf, "%u\n", dest->id);
+}
+
+static ssize_t id_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damos_sysfs_dest *dest = container_of(kobj,
+ struct damos_sysfs_dest, kobj);
+ int err = kstrtouint(buf, 0, &dest->id);
+
+ return err ? err : count;
+}
+
+static ssize_t weight_show(
+ struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ struct damos_sysfs_dest *dest = container_of(kobj,
+ struct damos_sysfs_dest, kobj);
+
+ return sysfs_emit(buf, "%u\n", dest->weight);
+}
+
+static ssize_t weight_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damos_sysfs_dest *dest = container_of(kobj,
+ struct damos_sysfs_dest, kobj);
+ int err = kstrtouint(buf, 0, &dest->weight);
+
+ return err ? err : count;
+}
+
+static void damos_sysfs_dest_release(struct kobject *kobj)
+{
+ struct damos_sysfs_dest *dest = container_of(kobj,
+ struct damos_sysfs_dest, kobj);
+ kfree(dest);
+}
+
+static struct kobj_attribute damos_sysfs_dest_id_attr =
+ __ATTR_RW_MODE(id, 0600);
+
+static struct kobj_attribute damos_sysfs_dest_weight_attr =
+ __ATTR_RW_MODE(weight, 0600);
+
+static struct attribute *damos_sysfs_dest_attrs[] = {
+ &damos_sysfs_dest_id_attr.attr,
+ &damos_sysfs_dest_weight_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damos_sysfs_dest);
+
+static const struct kobj_type damos_sysfs_dest_ktype = {
+ .release = damos_sysfs_dest_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damos_sysfs_dest_groups,
+};
+
+/*
+ * dests (action destinations) directory
+ */
+
+struct damos_sysfs_dests {
+ struct kobject kobj;
+ struct damos_sysfs_dest **dests_arr;
+ int nr;
+};
+
+static struct damos_sysfs_dests *
+damos_sysfs_dests_alloc(void)
+{
+ return kzalloc(sizeof(struct damos_sysfs_dests), GFP_KERNEL);
+}
+
+static void damos_sysfs_dests_rm_dirs(
+ struct damos_sysfs_dests *dests)
+{
+ struct damos_sysfs_dest **dests_arr = dests->dests_arr;
+ int i;
+
+ for (i = 0; i < dests->nr; i++)
+ kobject_put(&dests_arr[i]->kobj);
+ dests->nr = 0;
+ kfree(dests_arr);
+ dests->dests_arr = NULL;
+}
+
+static int damos_sysfs_dests_add_dirs(
+ struct damos_sysfs_dests *dests, int nr_dests)
+{
+ struct damos_sysfs_dest **dests_arr, *dest;
+ int err, i;
+
+ damos_sysfs_dests_rm_dirs(dests);
+ if (!nr_dests)
+ return 0;
+
+ dests_arr = kmalloc_array(nr_dests, sizeof(*dests_arr),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!dests_arr)
+ return -ENOMEM;
+ dests->dests_arr = dests_arr;
+
+ for (i = 0; i < nr_dests; i++) {
+ dest = damos_sysfs_dest_alloc();
+ if (!dest) {
+ damos_sysfs_dests_rm_dirs(dests);
+ return -ENOMEM;
+ }
+
+ err = kobject_init_and_add(&dest->kobj,
+ &damos_sysfs_dest_ktype,
+ &dests->kobj, "%d", i);
+ if (err) {
+ kobject_put(&dest->kobj);
+ damos_sysfs_dests_rm_dirs(dests);
+ return err;
+ }
+
+ dests_arr[i] = dest;
+ dests->nr++;
+ }
+ return 0;
+}
+
+static ssize_t nr_dests_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damos_sysfs_dests *dests = container_of(kobj,
+ struct damos_sysfs_dests, kobj);
+
+ return sysfs_emit(buf, "%d\n", dests->nr);
+}
+
+static ssize_t nr_dests_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damos_sysfs_dests *dests;
+ int nr, err = kstrtoint(buf, 0, &nr);
+
+ if (err)
+ return err;
+ if (nr < 0)
+ return -EINVAL;
+
+ dests = container_of(kobj, struct damos_sysfs_dests, kobj);
+
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+ err = damos_sysfs_dests_add_dirs(dests, nr);
+ mutex_unlock(&damon_sysfs_lock);
+ if (err)
+ return err;
+
+ return count;
+}
+
+static void damos_sysfs_dests_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damos_sysfs_dests, kobj));
+}
+
+static struct kobj_attribute damos_sysfs_dests_nr_attr =
+ __ATTR_RW_MODE(nr_dests, 0600);
+
+static struct attribute *damos_sysfs_dests_attrs[] = {
+ &damos_sysfs_dests_nr_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damos_sysfs_dests);
+
+static const struct kobj_type damos_sysfs_dests_ktype = {
+ .release = damos_sysfs_dests_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damos_sysfs_dests_groups,
+};
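/*
 * Usage sketch for the 'dests' directory set up above, assuming the
 * standard DAMON sysfs layout; the path and values are illustrative only,
 * not taken from the patch:
 *
 *   # cd /sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/schemes/0/dests
 *   echo 2 > nr_dests                    # creates subdirectories 0/ and 1/
 *   echo 0 > 0/id;  echo 3 > 0/weight    # destination node 0, weight 3
 *   echo 1 > 1/id;  echo 1 > 1/weight    # destination node 1, weight 1
 */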
+
+/*
* scheme directory
*/
@@ -1554,20 +1870,55 @@ struct damon_sysfs_scheme {
struct damon_sysfs_stats *stats;
struct damon_sysfs_scheme_regions *tried_regions;
int target_nid;
+ struct damos_sysfs_dests *dests;
+};
+
+struct damos_sysfs_action_name {
+ enum damos_action action;
+ char *name;
};
-/* This should match with enum damos_action */
-static const char * const damon_sysfs_damos_action_strs[] = {
- "willneed",
- "cold",
- "pageout",
- "hugepage",
- "nohugepage",
- "lru_prio",
- "lru_deprio",
- "migrate_hot",
- "migrate_cold",
- "stat",
+static struct damos_sysfs_action_name damos_sysfs_action_names[] = {
+ {
+ .action = DAMOS_WILLNEED,
+ .name = "willneed",
+ },
+ {
+ .action = DAMOS_COLD,
+ .name = "cold",
+ },
+ {
+ .action = DAMOS_PAGEOUT,
+ .name = "pageout",
+ },
+ {
+ .action = DAMOS_HUGEPAGE,
+ .name = "hugepage",
+ },
+ {
+ .action = DAMOS_NOHUGEPAGE,
+ .name = "nohugepage",
+ },
+ {
+ .action = DAMOS_LRU_PRIO,
+ .name = "lru_prio",
+ },
+ {
+ .action = DAMOS_LRU_DEPRIO,
+ .name = "lru_deprio",
+ },
+ {
+ .action = DAMOS_MIGRATE_HOT,
+ .name = "migrate_hot",
+ },
+ {
+ .action = DAMOS_MIGRATE_COLD,
+ .name = "migrate_cold",
+ },
+ {
+ .action = DAMOS_STAT,
+ .name = "stat",
+ },
};
static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc(
@@ -1610,6 +1961,22 @@ out:
return err;
}
+static int damos_sysfs_set_dests(struct damon_sysfs_scheme *scheme)
+{
+ struct damos_sysfs_dests *dests = damos_sysfs_dests_alloc();
+ int err;
+
+ if (!dests)
+ return -ENOMEM;
+ err = kobject_init_and_add(&dests->kobj, &damos_sysfs_dests_ktype,
+ &scheme->kobj, "dests");
+ if (err)
+ kobject_put(&dests->kobj);
+ else
+ scheme->dests = dests;
+ return err;
+}
+
static int damon_sysfs_scheme_set_quotas(struct damon_sysfs_scheme *scheme)
{
struct damon_sysfs_quotas *quotas = damon_sysfs_quotas_alloc();
@@ -1742,9 +2109,12 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme)
err = damon_sysfs_scheme_set_access_pattern(scheme);
if (err)
return err;
- err = damon_sysfs_scheme_set_quotas(scheme);
+ err = damos_sysfs_set_dests(scheme);
if (err)
goto put_access_pattern_out;
+ err = damon_sysfs_scheme_set_quotas(scheme);
+ if (err)
+ goto put_dests_out;
err = damon_sysfs_scheme_set_watermarks(scheme);
if (err)
goto put_quotas_access_pattern_out;
@@ -1775,6 +2145,9 @@ put_watermarks_quotas_access_pattern_out:
put_quotas_access_pattern_out:
kobject_put(&scheme->quotas->kobj);
scheme->quotas = NULL;
+put_dests_out:
+ kobject_put(&scheme->dests->kobj);
+ scheme->dests = NULL;
put_access_pattern_out:
kobject_put(&scheme->access_pattern->kobj);
scheme->access_pattern = NULL;
@@ -1785,6 +2158,8 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme)
{
damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern);
kobject_put(&scheme->access_pattern->kobj);
+ damos_sysfs_dests_rm_dirs(scheme->dests);
+ kobject_put(&scheme->dests->kobj);
damon_sysfs_quotas_rm_dirs(scheme->quotas);
kobject_put(&scheme->quotas->kobj);
kobject_put(&scheme->watermarks->kobj);
@@ -1804,9 +2179,16 @@ static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr,
{
struct damon_sysfs_scheme *scheme = container_of(kobj,
struct damon_sysfs_scheme, kobj);
+ int i;
- return sysfs_emit(buf, "%s\n",
- damon_sysfs_damos_action_strs[scheme->action]);
+ for (i = 0; i < ARRAY_SIZE(damos_sysfs_action_names); i++) {
+ struct damos_sysfs_action_name *action_name;
+
+ action_name = &damos_sysfs_action_names[i];
+ if (action_name->action == scheme->action)
+ return sysfs_emit(buf, "%s\n", action_name->name);
+ }
+ return -EINVAL;
}
static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -1814,11 +2196,14 @@ static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr,
{
struct damon_sysfs_scheme *scheme = container_of(kobj,
struct damon_sysfs_scheme, kobj);
- enum damos_action action;
+ int i;
- for (action = 0; action < NR_DAMOS_ACTIONS; action++) {
- if (sysfs_streq(buf, damon_sysfs_damos_action_strs[action])) {
- scheme->action = action;
+ for (i = 0; i < ARRAY_SIZE(damos_sysfs_action_names); i++) {
+ struct damos_sysfs_action_name *action_name;
+
+ action_name = &damos_sysfs_action_names[i];
+ if (sysfs_streq(buf, action_name->name)) {
+ scheme->action = action_name->action;
return count;
}
}
@@ -2035,7 +2420,7 @@ static int damon_sysfs_memcg_path_to_id(char *memcg_path, unsigned short *id)
if (!memcg_path)
return -EINVAL;
- path = kmalloc(sizeof(*path) * PATH_MAX, GFP_KERNEL);
+ path = kmalloc_array(PATH_MAX, sizeof(*path), GFP_KERNEL);
if (!path)
return -ENOMEM;
@@ -2120,8 +2505,17 @@ static int damos_sysfs_add_quota_score(
sysfs_goal->target_value);
if (!goal)
return -ENOMEM;
- if (sysfs_goal->metric == DAMOS_QUOTA_USER_INPUT)
+ switch (sysfs_goal->metric) {
+ case DAMOS_QUOTA_USER_INPUT:
goal->current_value = sysfs_goal->current_value;
+ break;
+ case DAMOS_QUOTA_NODE_MEM_USED_BP:
+ case DAMOS_QUOTA_NODE_MEM_FREE_BP:
+ goal->nid = sysfs_goal->nid;
+ break;
+ default:
+ break;
+ }
damos_add_quota_goal(quota, goal);
}
return 0;
@@ -2182,6 +2576,29 @@ void damos_sysfs_update_effective_quotas(
}
}
+static int damos_sysfs_add_migrate_dest(struct damos *scheme,
+ struct damos_sysfs_dests *sysfs_dests)
+{
+ struct damos_migrate_dests *dests = &scheme->migrate_dests;
+ int i;
+
+ dests->node_id_arr = kmalloc_array(sysfs_dests->nr,
+ sizeof(*dests->node_id_arr), GFP_KERNEL);
+ if (!dests->node_id_arr)
+ return -ENOMEM;
+ dests->weight_arr = kmalloc_array(sysfs_dests->nr,
+ sizeof(*dests->weight_arr), GFP_KERNEL);
+ if (!dests->weight_arr)
+ /* ->node_id_arr will be freed by scheme destruction */
+ return -ENOMEM;
+ for (i = 0; i < sysfs_dests->nr; i++) {
+ dests->node_id_arr[i] = sysfs_dests->dests_arr[i]->id;
+ dests->weight_arr[i] = sysfs_dests->dests_arr[i]->weight;
+ }
+ dests->nr_dests = sysfs_dests->nr;
+ return 0;
+}
+
static struct damos *damon_sysfs_mk_scheme(
struct damon_sysfs_scheme *sysfs_scheme)
{
@@ -2244,6 +2661,11 @@ static struct damos *damon_sysfs_mk_scheme(
damon_destroy_scheme(scheme);
return NULL;
}
+ err = damos_sysfs_add_migrate_dest(scheme, sysfs_scheme->dests);
+ if (err) {
+ damon_destroy_scheme(scheme);
+ return NULL;
+ }
return scheme;
}
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 1af6aff35d84..2fc722f998f8 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -811,16 +811,30 @@ static const struct kobj_type damon_sysfs_attrs_ktype = {
* context directory
*/
-/* This should match with enum damon_ops_id */
-static const char * const damon_sysfs_ops_strs[] = {
- "vaddr",
- "fvaddr",
- "paddr",
+struct damon_sysfs_ops_name {
+ enum damon_ops_id ops_id;
+ char *name;
+};
+
+static const struct damon_sysfs_ops_name damon_sysfs_ops_names[] = {
+ {
+ .ops_id = DAMON_OPS_VADDR,
+ .name = "vaddr",
+ },
+ {
+ .ops_id = DAMON_OPS_FVADDR,
+ .name = "fvaddr",
+ },
+ {
+ .ops_id = DAMON_OPS_PADDR,
+ .name = "paddr",
+ },
};
struct damon_sysfs_context {
struct kobject kobj;
enum damon_ops_id ops_id;
+ unsigned long addr_unit;
struct damon_sysfs_attrs *attrs;
struct damon_sysfs_targets *targets;
struct damon_sysfs_schemes *schemes;
@@ -836,6 +850,7 @@ static struct damon_sysfs_context *damon_sysfs_context_alloc(
return NULL;
context->kobj = (struct kobject){};
context->ops_id = ops_id;
+ context->addr_unit = 1;
return context;
}
@@ -934,14 +949,16 @@ static void damon_sysfs_context_rm_dirs(struct damon_sysfs_context *context)
static ssize_t avail_operations_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- enum damon_ops_id id;
int len = 0;
+ int i;
- for (id = 0; id < NR_DAMON_OPS; id++) {
- if (!damon_is_registered_ops(id))
+ for (i = 0; i < ARRAY_SIZE(damon_sysfs_ops_names); i++) {
+ const struct damon_sysfs_ops_name *ops_name;
+
+ ops_name = &damon_sysfs_ops_names[i];
+ if (!damon_is_registered_ops(ops_name->ops_id))
continue;
- len += sysfs_emit_at(buf, len, "%s\n",
- damon_sysfs_ops_strs[id]);
+ len += sysfs_emit_at(buf, len, "%s\n", ops_name->name);
}
return len;
}
@@ -951,8 +968,16 @@ static ssize_t operations_show(struct kobject *kobj,
{
struct damon_sysfs_context *context = container_of(kobj,
struct damon_sysfs_context, kobj);
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(damon_sysfs_ops_names); i++) {
+ const struct damon_sysfs_ops_name *ops_name;
- return sysfs_emit(buf, "%s\n", damon_sysfs_ops_strs[context->ops_id]);
+ ops_name = &damon_sysfs_ops_names[i];
+ if (ops_name->ops_id == context->ops_id)
+ return sysfs_emit(buf, "%s\n", ops_name->name);
+ }
+ return -EINVAL;
}
static ssize_t operations_store(struct kobject *kobj,
@@ -960,17 +985,46 @@ static ssize_t operations_store(struct kobject *kobj,
{
struct damon_sysfs_context *context = container_of(kobj,
struct damon_sysfs_context, kobj);
- enum damon_ops_id id;
+ int i;
- for (id = 0; id < NR_DAMON_OPS; id++) {
- if (sysfs_streq(buf, damon_sysfs_ops_strs[id])) {
- context->ops_id = id;
+ for (i = 0; i < ARRAY_SIZE(damon_sysfs_ops_names); i++) {
+ const struct damon_sysfs_ops_name *ops_name;
+
+ ops_name = &damon_sysfs_ops_names[i];
+ if (sysfs_streq(buf, ops_name->name)) {
+ context->ops_id = ops_name->ops_id;
return count;
}
}
return -EINVAL;
}
+static ssize_t addr_unit_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_context *context = container_of(kobj,
+ struct damon_sysfs_context, kobj);
+
+ return sysfs_emit(buf, "%lu\n", context->addr_unit);
+}
+
+static ssize_t addr_unit_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_context *context = container_of(kobj,
+ struct damon_sysfs_context, kobj);
+ unsigned long input_addr_unit;
+ int err = kstrtoul(buf, 0, &input_addr_unit);
+
+ if (err)
+ return err;
+ if (!input_addr_unit)
+ return -EINVAL;
+
+ context->addr_unit = input_addr_unit;
+ return count;
+}
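/*
 * Illustration of the new addr_unit knob, assuming it scales addresses
 * given through sysfs for the physical address space operations set (the
 * apply-time handling later in this patch only uses it for DAMON_OPS_PADDR):
 * with addr_unit == 1024, a monitoring region written as [4096, 8192)
 * would correspond to physical addresses [4 MiB, 8 MiB).
 */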
+
static void damon_sysfs_context_release(struct kobject *kobj)
{
kfree(container_of(kobj, struct damon_sysfs_context, kobj));
@@ -982,9 +1036,13 @@ static struct kobj_attribute damon_sysfs_context_avail_operations_attr =
static struct kobj_attribute damon_sysfs_context_operations_attr =
__ATTR_RW_MODE(operations, 0600);
+static struct kobj_attribute damon_sysfs_context_addr_unit_attr =
+ __ATTR_RW_MODE(addr_unit, 0600);
+
static struct attribute *damon_sysfs_context_attrs[] = {
&damon_sysfs_context_avail_operations_attr.attr,
&damon_sysfs_context_operations_attr.attr,
+ &damon_sysfs_context_addr_unit_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damon_sysfs_context);
@@ -1129,6 +1187,7 @@ struct damon_sysfs_kdamond {
struct kobject kobj;
struct damon_sysfs_contexts *contexts;
struct damon_ctx *damon_ctx;
+ unsigned int refresh_ms;
};
static struct damon_sysfs_kdamond *damon_sysfs_kdamond_alloc(void)
@@ -1163,16 +1222,6 @@ static void damon_sysfs_kdamond_rm_dirs(struct damon_sysfs_kdamond *kdamond)
kobject_put(&kdamond->contexts->kobj);
}
-static bool damon_sysfs_ctx_running(struct damon_ctx *ctx)
-{
- bool running;
-
- mutex_lock(&ctx->kdamond_lock);
- running = ctx->kdamond != NULL;
- mutex_unlock(&ctx->kdamond_lock);
- return running;
-}
-
/*
* enum damon_sysfs_cmd - Commands for a specific kdamond.
*/
@@ -1243,13 +1292,17 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
{
struct damon_sysfs_kdamond *kdamond = container_of(kobj,
struct damon_sysfs_kdamond, kobj);
- struct damon_ctx *ctx = kdamond->damon_ctx;
- bool running;
+ struct damon_ctx *ctx;
+ bool running = false;
- if (!ctx)
- running = false;
- else
- running = damon_sysfs_ctx_running(ctx);
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+
+ ctx = kdamond->damon_ctx;
+ if (ctx)
+ running = damon_is_running(ctx);
+
+ mutex_unlock(&damon_sysfs_lock);
return sysfs_emit(buf, "%s\n", running ?
damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_ON] :
@@ -1279,20 +1332,9 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx,
return damon_set_attrs(ctx, &attrs);
}
-static void damon_sysfs_destroy_targets(struct damon_ctx *ctx)
-{
- struct damon_target *t, *next;
- bool has_pid = damon_target_has_pid(ctx);
-
- damon_for_each_target_safe(t, next, ctx) {
- if (has_pid)
- put_pid(t->pid);
- damon_destroy_target(t);
- }
-}
-
static int damon_sysfs_set_regions(struct damon_target *t,
- struct damon_sysfs_regions *sysfs_regions)
+ struct damon_sysfs_regions *sysfs_regions,
+ unsigned long min_sz_region)
{
struct damon_addr_range *ranges = kmalloc_array(sysfs_regions->nr,
sizeof(*ranges), GFP_KERNEL | __GFP_NOWARN);
@@ -1314,7 +1356,7 @@ static int damon_sysfs_set_regions(struct damon_target *t,
if (ranges[i - 1].end > ranges[i].start)
goto out;
}
- err = damon_set_regions(t, ranges, sysfs_regions->nr);
+ err = damon_set_regions(t, ranges, sysfs_regions->nr, min_sz_region);
out:
kfree(ranges);
return err;
@@ -1325,7 +1367,6 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target,
struct damon_ctx *ctx)
{
struct damon_target *t = damon_new_target();
- int err = -EINVAL;
if (!t)
return -ENOMEM;
@@ -1333,16 +1374,10 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target,
if (damon_target_has_pid(ctx)) {
t->pid = find_get_pid(sys_target->pid);
if (!t->pid)
- goto destroy_targets_out;
+ /* caller will destroy targets */
+ return -EINVAL;
}
- err = damon_sysfs_set_regions(t, sys_target->regions);
- if (err)
- goto destroy_targets_out;
- return 0;
-
-destroy_targets_out:
- damon_sysfs_destroy_targets(ctx);
- return err;
+ return damon_sysfs_set_regions(t, sys_target->regions, ctx->min_sz_region);
}
static int damon_sysfs_add_targets(struct damon_ctx *ctx,
@@ -1364,21 +1399,6 @@ static int damon_sysfs_add_targets(struct damon_ctx *ctx,
return 0;
}
-static void damon_sysfs_before_terminate(struct damon_ctx *ctx)
-{
- struct damon_target *t, *next;
-
- if (!damon_target_has_pid(ctx))
- return;
-
- mutex_lock(&ctx->kdamond_lock);
- damon_for_each_target_safe(t, next, ctx) {
- put_pid(t->pid);
- damon_destroy_target(t);
- }
- mutex_unlock(&ctx->kdamond_lock);
-}
-
/*
* damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files.
* @data: The kobject wrapper that associated to the kdamond thread.
@@ -1403,7 +1423,7 @@ static inline bool damon_sysfs_kdamond_running(
struct damon_sysfs_kdamond *kdamond)
{
return kdamond->damon_ctx &&
- damon_sysfs_ctx_running(kdamond->damon_ctx);
+ damon_is_running(kdamond->damon_ctx);
}
static int damon_sysfs_apply_inputs(struct damon_ctx *ctx,
@@ -1414,6 +1434,11 @@ static int damon_sysfs_apply_inputs(struct damon_ctx *ctx,
err = damon_select_ops(ctx, sys_ctx->ops_id);
if (err)
return err;
+ ctx->addr_unit = sys_ctx->addr_unit;
+ /* addr_unit is respected only by DAMON_OPS_PADDR */

+ if (sys_ctx->ops_id == DAMON_OPS_PADDR)
+ ctx->min_sz_region = max(
+ DAMON_MIN_REGION / sys_ctx->addr_unit, 1);
err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs);
if (err)
return err;
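/*
 * Worked example for the min_sz_region clamping above, assuming
 * DAMON_MIN_REGION equals PAGE_SIZE (4096 bytes on common configurations):
 *   addr_unit == 1       -> max(4096 / 1, 1)       = 4096
 *   addr_unit == 1024    -> max(4096 / 1024, 1)    = 4
 *   addr_unit == 1 << 20 -> max(4096 / 1048576, 1) = 1 (clamped)
 */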
@@ -1450,13 +1475,11 @@ static int damon_sysfs_commit_input(void *data)
test_ctx = damon_new_ctx();
err = damon_commit_ctx(test_ctx, param_ctx);
if (err) {
- damon_sysfs_destroy_targets(test_ctx);
damon_destroy_ctx(test_ctx);
goto out;
}
err = damon_commit_ctx(kdamond->damon_ctx, param_ctx);
out:
- damon_sysfs_destroy_targets(param_ctx);
damon_destroy_ctx(param_ctx);
return err;
}
@@ -1525,13 +1548,34 @@ static struct damon_ctx *damon_sysfs_build_ctx(
return ERR_PTR(err);
}
- ctx->callback.before_terminate = damon_sysfs_before_terminate;
return ctx;
}
+static int damon_sysfs_repeat_call_fn(void *data)
+{
+ struct damon_sysfs_kdamond *sysfs_kdamond = data;
+ static unsigned long next_update_jiffies;
+
+ if (!sysfs_kdamond->refresh_ms)
+ return 0;
+ if (time_before(jiffies, next_update_jiffies))
+ return 0;
+ next_update_jiffies = jiffies +
+ msecs_to_jiffies(sysfs_kdamond->refresh_ms);
+
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return 0;
+ damon_sysfs_upd_tuned_intervals(sysfs_kdamond);
+ damon_sysfs_upd_schemes_stats(sysfs_kdamond);
+ damon_sysfs_upd_schemes_effective_quotas(sysfs_kdamond);
+ mutex_unlock(&damon_sysfs_lock);
+ return 0;
+}
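/*
 * For instance, with refresh_ms == 1000 the repeat callback above runs on
 * every invocation of the registered damon_call() control, but it refreshes
 * the tuned intervals, scheme stats and effective quotas files at most once
 * per second, and silently skips a refresh when damon_sysfs_lock cannot be
 * taken without blocking.
 */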
+
static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond)
{
struct damon_ctx *ctx;
+ struct damon_call_control *repeat_call_control;
int err;
if (damon_sysfs_kdamond_running(kdamond))
@@ -1544,15 +1588,29 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond)
damon_destroy_ctx(kdamond->damon_ctx);
kdamond->damon_ctx = NULL;
+ repeat_call_control = kmalloc(sizeof(*repeat_call_control),
+ GFP_KERNEL);
+ if (!repeat_call_control)
+ return -ENOMEM;
+
ctx = damon_sysfs_build_ctx(kdamond->contexts->contexts_arr[0]);
- if (IS_ERR(ctx))
+ if (IS_ERR(ctx)) {
+ kfree(repeat_call_control);
return PTR_ERR(ctx);
+ }
err = damon_start(&ctx, 1, false);
if (err) {
+ kfree(repeat_call_control);
damon_destroy_ctx(ctx);
return err;
}
kdamond->damon_ctx = ctx;
+
+ repeat_call_control->fn = damon_sysfs_repeat_call_fn;
+ repeat_call_control->data = kdamond;
+ repeat_call_control->repeat = true;
+ repeat_call_control->dealloc_on_cancel = true;
+ damon_call(ctx, repeat_call_control);
return err;
}
@@ -1572,12 +1630,14 @@ static int damon_sysfs_damon_call(int (*fn)(void *data),
struct damon_sysfs_kdamond *kdamond)
{
struct damon_call_control call_control = {};
+ int err;
if (!kdamond->damon_ctx)
return -EINVAL;
call_control.fn = fn;
call_control.data = kdamond;
- return damon_call(kdamond->damon_ctx, &call_control);
+ err = damon_call(kdamond->damon_ctx, &call_control);
+ return err ? err : call_control.return_code;
}
struct damon_sysfs_schemes_walk_data {
@@ -1711,6 +1771,30 @@ out:
return sysfs_emit(buf, "%d\n", pid);
}
+static ssize_t refresh_ms_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_kdamond *kdamond = container_of(kobj,
+ struct damon_sysfs_kdamond, kobj);
+
+ return sysfs_emit(buf, "%u\n", kdamond->refresh_ms);
+}
+
+static ssize_t refresh_ms_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_kdamond *kdamond = container_of(kobj,
+ struct damon_sysfs_kdamond, kobj);
+ unsigned int nr;
+ int err = kstrtouint(buf, 0, &nr);
+
+ if (err)
+ return err;
+
+ kdamond->refresh_ms = nr;
+ return count;
+}
+
static void damon_sysfs_kdamond_release(struct kobject *kobj)
{
struct damon_sysfs_kdamond *kdamond = container_of(kobj,
@@ -1727,9 +1811,13 @@ static struct kobj_attribute damon_sysfs_kdamond_state_attr =
static struct kobj_attribute damon_sysfs_kdamond_pid_attr =
__ATTR_RO_MODE(pid, 0400);
+static struct kobj_attribute damon_sysfs_kdamond_refresh_ms_attr =
+ __ATTR_RW_MODE(refresh_ms, 0600);
+
static struct attribute *damon_sysfs_kdamond_attrs[] = {
&damon_sysfs_kdamond_state_attr.attr,
&damon_sysfs_kdamond_pid_attr.attr,
+ &damon_sysfs_kdamond_refresh_ms_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damon_sysfs_kdamond);
diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h
index be0fea9ee5fc..51369e35298b 100644
--- a/mm/damon/tests/core-kunit.h
+++ b/mm/damon/tests/core-kunit.h
@@ -58,7 +58,7 @@ static void damon_test_target(struct kunit *test)
damon_add_target(c, t);
KUNIT_EXPECT_EQ(test, 1u, nr_damon_targets(c));
- damon_destroy_target(t);
+ damon_destroy_target(t, c);
KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c));
damon_destroy_ctx(c);
@@ -230,14 +230,14 @@ static void damon_test_split_regions_of(struct kunit *test)
t = damon_new_target();
r = damon_new_region(0, 22);
damon_add_region(r, t);
- damon_split_regions_of(t, 2);
+ damon_split_regions_of(t, 2, DAMON_MIN_REGION);
KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u);
damon_free_target(t);
t = damon_new_target();
r = damon_new_region(0, 220);
damon_add_region(r, t);
- damon_split_regions_of(t, 4);
+ damon_split_regions_of(t, 4, DAMON_MIN_REGION);
KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u);
damon_free_target(t);
damon_destroy_ctx(c);
@@ -303,14 +303,14 @@ static void damon_test_set_regions(struct kunit *test)
damon_add_region(r1, t);
damon_add_region(r2, t);
- damon_set_regions(t, &range, 1);
+ damon_set_regions(t, &range, 1, DAMON_MIN_REGION);
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 3);
damon_for_each_region(r, t) {
KUNIT_EXPECT_EQ(test, r->ar.start, expects[expect_idx++]);
KUNIT_EXPECT_EQ(test, r->ar.end, expects[expect_idx++]);
}
- damon_destroy_target(t);
+ damon_destroy_target(t, NULL);
}
static void damon_test_nr_accesses_to_accesses_bp(struct kunit *test)
@@ -419,6 +419,22 @@ static void damos_test_new_filter(struct kunit *test)
damos_destroy_filter(filter);
}
+static void damos_test_commit_filter(struct kunit *test)
+{
+ struct damos_filter *src_filter = damos_new_filter(
+ DAMOS_FILTER_TYPE_ANON, true, true);
+ struct damos_filter *dst_filter = damos_new_filter(
+ DAMOS_FILTER_TYPE_ACTIVE, false, false);
+
+ damos_commit_filter(dst_filter, src_filter);
+ KUNIT_EXPECT_EQ(test, dst_filter->type, src_filter->type);
+ KUNIT_EXPECT_EQ(test, dst_filter->matching, src_filter->matching);
+ KUNIT_EXPECT_EQ(test, dst_filter->allow, src_filter->allow);
+
+ damos_destroy_filter(src_filter);
+ damos_destroy_filter(dst_filter);
+}
+
static void damos_test_filter_out(struct kunit *test)
{
struct damon_target *t;
@@ -434,25 +450,29 @@ static void damos_test_filter_out(struct kunit *test)
damon_add_region(r, t);
/* region in the range */
- KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f));
+ KUNIT_EXPECT_TRUE(test,
+ damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION));
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
/* region before the range */
r->ar.start = DAMON_MIN_REGION * 1;
r->ar.end = DAMON_MIN_REGION * 2;
- KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f));
+ KUNIT_EXPECT_FALSE(test,
+ damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION));
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
/* region after the range */
r->ar.start = DAMON_MIN_REGION * 6;
r->ar.end = DAMON_MIN_REGION * 8;
- KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f));
+ KUNIT_EXPECT_FALSE(test,
+ damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION));
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
/* region started before the range */
r->ar.start = DAMON_MIN_REGION * 1;
r->ar.end = DAMON_MIN_REGION * 4;
- KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f));
+ KUNIT_EXPECT_FALSE(test,
+ damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION));
/* filter should have split the region */
KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 1);
KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 2);
@@ -465,7 +485,8 @@ static void damos_test_filter_out(struct kunit *test)
/* region started in the range */
r->ar.start = DAMON_MIN_REGION * 2;
r->ar.end = DAMON_MIN_REGION * 8;
- KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f));
+ KUNIT_EXPECT_TRUE(test,
+ damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION));
/* filter should have split the region */
KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 2);
KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 6);
@@ -510,6 +531,75 @@ static void damon_test_feed_loop_next_input(struct kunit *test)
damon_feed_loop_next_input(last_input, 2000));
}
+static void damon_test_set_filters_default_reject(struct kunit *test)
+{
+ struct damos scheme;
+ struct damos_filter *target_filter, *anon_filter;
+
+ INIT_LIST_HEAD(&scheme.filters);
+ INIT_LIST_HEAD(&scheme.ops_filters);
+
+ damos_set_filters_default_reject(&scheme);
+ /*
+ * No filter is installed. Allow by default on both core and ops layer
+ * filtering stages, since there are no filters at all.
+ */
+ KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false);
+ KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false);
+
+ target_filter = damos_new_filter(DAMOS_FILTER_TYPE_TARGET, true, true);
+ damos_add_filter(&scheme, target_filter);
+ damos_set_filters_default_reject(&scheme);
+ /*
+ * A core-handled allow-filter is installed.
+ * Rejct by default on core layer filtering stage due to the last
+ * core-layer-filter's behavior.
+ * Allow by default on ops layer filtering stage due to the absence of
+ * ops layer filters.
+ */
+ KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, true);
+ KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false);
+
+ target_filter->allow = false;
+ damos_set_filters_default_reject(&scheme);
+ /*
+ * A core-handled reject-filter is installed.
+ * Allow by default on core layer filtering stage due to the last
+ * core-layer-filter's behavior.
+ * Allow by default on ops layer filtering stage due to the absence of
+ * ops layer filters.
+ */
+ KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false);
+ KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false);
+
+ anon_filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, true);
+ damos_add_filter(&scheme, anon_filter);
+
+ damos_set_filters_default_reject(&scheme);
+ /*
+ * A core-handled reject-filter and ops-handled allow-filter are installed.
+ * Allow by default on core layer filtering stage due to the existence
+ * of the ops-handled filter.
+ * Reject by default on ops layer filtering stage due to the last
+ * ops-layer-filter's behavior.
+ */
+ KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false);
+ KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, true);
+
+ target_filter->allow = true;
+ damos_set_filters_default_reject(&scheme);
+ /*
+ * A core-handled allow-filter and ops-handled allow-filter are
+ * installed.
+ * Allow by default on core layer filtering stage due to the existence
+ * of the ops-handled filter.
+ * Reject by default on ops layer filtering stage due to the last
+ * ops-layer-filter's behavior.
+ */
+ KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false);
+ KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, true);
+}
+
static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_test_target),
KUNIT_CASE(damon_test_regions),
@@ -525,8 +615,10 @@ static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_test_set_attrs),
KUNIT_CASE(damon_test_moving_sum),
KUNIT_CASE(damos_test_new_filter),
+ KUNIT_CASE(damos_test_commit_filter),
KUNIT_CASE(damos_test_filter_out),
KUNIT_CASE(damon_test_feed_loop_next_input),
+ KUNIT_CASE(damon_test_set_filters_default_reject),
{},
};
diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h
index 7cd944266a92..fce38dd53cf8 100644
--- a/mm/damon/tests/vaddr-kunit.h
+++ b/mm/damon/tests/vaddr-kunit.h
@@ -141,7 +141,7 @@ static void damon_do_test_apply_three_regions(struct kunit *test,
damon_add_region(r, t);
}
- damon_set_regions(t, three_regions, 3);
+ damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION);
for (i = 0; i < nr_expected / 2; i++) {
r = __nth_region_of(t, i);
@@ -149,7 +149,7 @@ static void damon_do_test_apply_three_regions(struct kunit *test,
KUNIT_EXPECT_EQ(test, r->ar.end, expected[i * 2 + 1]);
}
- damon_destroy_target(t);
+ damon_destroy_target(t, NULL);
}
/*
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index e6d99106a7f9..7e834467b2d8 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * DAMON Primitives for Virtual Address Spaces
+ * DAMON Code for Virtual Address Spaces
*
* Author: SeongJae Park <sj@kernel.org>
*/
@@ -15,6 +15,7 @@
#include <linux/pagewalk.h>
#include <linux/sched/mm.h>
+#include "../internal.h"
#include "ops-common.h"
#ifdef CONFIG_DAMON_VADDR_KUNIT_TEST
@@ -298,7 +299,7 @@ static void damon_va_update(struct damon_ctx *ctx)
damon_for_each_target(t, ctx) {
if (damon_va_three_regions(t, three_regions))
continue;
- damon_set_regions(t, three_regions, 3);
+ damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION);
}
}
@@ -327,10 +328,8 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr,
}
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
- if (!pte) {
- walk->action = ACTION_AGAIN;
+ if (!pte)
return 0;
- }
if (!pte_present(ptep_get(pte)))
goto out;
damon_ptep_mkold(pte, walk->vma, addr);
@@ -480,10 +479,8 @@ regular_page:
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
- if (!pte) {
- walk->action = ACTION_AGAIN;
+ if (!pte)
return 0;
- }
ptent = ptep_get(pte);
if (!pte_present(ptent))
goto out;
@@ -610,6 +607,187 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx)
return max_nr_accesses;
}
+static bool damos_va_filter_young_match(struct damos_filter *filter,
+ struct folio *folio, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, pmd_t *pmdp)
+{
+ bool young = false;
+
+ if (ptep)
+ young = pte_young(ptep_get(ptep));
+ else if (pmdp)
+ young = pmd_young(pmdp_get(pmdp));
+
+ young = young || !folio_test_idle(folio) ||
+ mmu_notifier_test_young(vma->vm_mm, addr);
+
+ if (young && ptep)
+ damon_ptep_mkold(ptep, vma, addr);
+ else if (young && pmdp)
+ damon_pmdp_mkold(pmdp, vma, addr);
+
+ return young == filter->matching;
+}
+
+static bool damos_va_filter_out(struct damos *scheme, struct folio *folio,
+ struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pmd_t *pmdp)
+{
+ struct damos_filter *filter;
+ bool matched;
+
+ if (scheme->core_filters_allowed)
+ return false;
+
+ damos_for_each_ops_filter(filter, scheme) {
+ /*
+ * damos_folio_filter_match checks the young filter by doing an
+ * rmap on the folio to find its page table. However, being the
+ * vaddr scheme, we have direct access to the page tables, so
+ * use that instead.
+ */
+ if (filter->type == DAMOS_FILTER_TYPE_YOUNG)
+ matched = damos_va_filter_young_match(filter, folio,
+ vma, addr, ptep, pmdp);
+ else
+ matched = damos_folio_filter_match(filter, folio);
+
+ if (matched)
+ return !filter->allow;
+ }
+ return scheme->ops_filters_default_reject;
+}
+
+struct damos_va_migrate_private {
+ struct list_head *migration_lists;
+ struct damos *scheme;
+};
+
+/*
+ * Place the given folio in the migration_list corresponding to where the folio
+ * should be migrated.
+ *
+ * The algorithm used here is similar to weighted_interleave_nid()
+ */
+static void damos_va_migrate_dests_add(struct folio *folio,
+ struct vm_area_struct *vma, unsigned long addr,
+ struct damos_migrate_dests *dests,
+ struct list_head *migration_lists)
+{
+ pgoff_t ilx;
+ int order;
+ unsigned int target;
+ unsigned int weight_total = 0;
+ int i;
+
+ /*
+ * If dests is empty, there is only one migration list corresponding
+ * to s->target_nid.
+ */
+ if (!dests->nr_dests) {
+ i = 0;
+ goto isolate;
+ }
+
+ order = folio_order(folio);
+ ilx = vma->vm_pgoff >> order;
+ ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
+
+ for (i = 0; i < dests->nr_dests; i++)
+ weight_total += dests->weight_arr[i];
+
+ /* If the total weights are somehow 0, don't migrate at all */
+ if (!weight_total)
+ return;
+
+ target = ilx % weight_total;
+ for (i = 0; i < dests->nr_dests; i++) {
+ if (target < dests->weight_arr[i])
+ break;
+ target -= dests->weight_arr[i];
+ }
+
+ /* If the folio is already in the right node, don't do anything */
+ if (folio_nid(folio) == dests->node_id_arr[i])
+ return;
+
+isolate:
+ if (!folio_isolate_lru(folio))
+ return;
+
+ list_add(&folio->lru, &migration_lists[i]);
+}
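/*
 * Worked example for the weighted selection above, with illustrative
 * numbers: given node_id_arr == {0, 1} and weight_arr == {3, 1},
 * weight_total is 4 and target = ilx % 4.  Interleave indexes that yield
 * target 0..2 select the node-0 migration list and target 3 selects the
 * node-1 list, spreading folios 3:1 by position, much like
 * weighted_interleave_nid().
 */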
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int damos_va_migrate_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct damos_va_migrate_private *priv = walk->private;
+ struct list_head *migration_lists = priv->migration_lists;
+ struct damos *s = priv->scheme;
+ struct damos_migrate_dests *dests = &s->migrate_dests;
+ struct folio *folio;
+ spinlock_t *ptl;
+ pmd_t pmde;
+
+ ptl = pmd_lock(walk->mm, pmd);
+ pmde = pmdp_get(pmd);
+
+ if (!pmd_present(pmde) || !pmd_trans_huge(pmde))
+ goto unlock;
+
+ /* Tell page walk code to not split the PMD */
+ walk->action = ACTION_CONTINUE;
+
+ folio = damon_get_folio(pmd_pfn(pmde));
+ if (!folio)
+ goto unlock;
+
+ if (damos_va_filter_out(s, folio, walk->vma, addr, NULL, pmd))
+ goto put_folio;
+
+ damos_va_migrate_dests_add(folio, walk->vma, addr, dests,
+ migration_lists);
+
+put_folio:
+ folio_put(folio);
+unlock:
+ spin_unlock(ptl);
+ return 0;
+}
+#else
+#define damos_va_migrate_pmd_entry NULL
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+static int damos_va_migrate_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct damos_va_migrate_private *priv = walk->private;
+ struct list_head *migration_lists = priv->migration_lists;
+ struct damos *s = priv->scheme;
+ struct damos_migrate_dests *dests = &s->migrate_dests;
+ struct folio *folio;
+ pte_t ptent;
+
+ ptent = ptep_get(pte);
+ if (pte_none(ptent) || !pte_present(ptent))
+ return 0;
+
+ folio = damon_get_folio(pte_pfn(ptent));
+ if (!folio)
+ return 0;
+
+ if (damos_va_filter_out(s, folio, walk->vma, addr, pte, NULL))
+ goto put_folio;
+
+ damos_va_migrate_dests_add(folio, walk->vma, addr, dests,
+ migration_lists);
+
+put_folio:
+ folio_put(folio);
+ return 0;
+}
+
/*
* Functions for the target validity check and cleanup
*/
@@ -627,6 +805,11 @@ static bool damon_va_target_valid(struct damon_target *t)
return false;
}
+static void damon_va_cleanup_target(struct damon_target *t)
+{
+ put_pid(t->pid);
+}
+
#ifndef CONFIG_ADVISE_SYSCALLS
static unsigned long damos_madvise(struct damon_target *target,
struct damon_region *r, int behavior)
@@ -653,6 +836,157 @@ static unsigned long damos_madvise(struct damon_target *target,
}
#endif /* CONFIG_ADVISE_SYSCALLS */
+static unsigned long damos_va_migrate(struct damon_target *target,
+ struct damon_region *r, struct damos *s,
+ unsigned long *sz_filter_passed)
+{
+ LIST_HEAD(folio_list);
+ struct damos_va_migrate_private priv;
+ struct mm_struct *mm;
+ int nr_dests;
+ int nid;
+ bool use_target_nid;
+ unsigned long applied = 0;
+ struct damos_migrate_dests *dests = &s->migrate_dests;
+ struct mm_walk_ops walk_ops = {
+ .pmd_entry = damos_va_migrate_pmd_entry,
+ .pte_entry = damos_va_migrate_pte_entry,
+ .walk_lock = PGWALK_RDLOCK,
+ };
+
+ use_target_nid = dests->nr_dests == 0;
+ nr_dests = use_target_nid ? 1 : dests->nr_dests;
+ priv.scheme = s;
+ priv.migration_lists = kmalloc_array(nr_dests,
+ sizeof(*priv.migration_lists), GFP_KERNEL);
+ if (!priv.migration_lists)
+ return 0;
+
+ for (int i = 0; i < nr_dests; i++)
+ INIT_LIST_HEAD(&priv.migration_lists[i]);
+
+ mm = damon_get_mm(target);
+ if (!mm)
+ goto free_lists;
+
+ mmap_read_lock(mm);
+ walk_page_range(mm, r->ar.start, r->ar.end, &walk_ops, &priv);
+ mmap_read_unlock(mm);
+ mmput(mm);
+
+ for (int i = 0; i < nr_dests; i++) {
+ nid = use_target_nid ? s->target_nid : dests->node_id_arr[i];
+ applied += damon_migrate_pages(&priv.migration_lists[i], nid);
+ cond_resched();
+ }
+
+free_lists:
+ kfree(priv.migration_lists);
+ return applied * PAGE_SIZE;
+}
+
+struct damos_va_stat_private {
+ struct damos *scheme;
+ unsigned long *sz_filter_passed;
+};
+
+static inline bool damos_va_invalid_folio(struct folio *folio,
+ struct damos *s)
+{
+ return !folio || folio == s->last_applied;
+}
+
+static int damos_va_stat_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct damos_va_stat_private *priv = walk->private;
+ struct damos *s = priv->scheme;
+ unsigned long *sz_filter_passed = priv->sz_filter_passed;
+ struct vm_area_struct *vma = walk->vma;
+ struct folio *folio;
+ spinlock_t *ptl;
+ pte_t *start_pte, *pte, ptent;
+ int nr;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (pmd_trans_huge(*pmd)) {
+ pmd_t pmde;
+
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (!ptl)
+ return 0;
+ pmde = pmdp_get(pmd);
+ if (!pmd_present(pmde))
+ goto huge_unlock;
+
+ folio = vm_normal_folio_pmd(vma, addr, pmde);
+
+ if (damos_va_invalid_folio(folio, s))
+ goto huge_unlock;
+
+ if (!damos_va_filter_out(s, folio, vma, addr, NULL, pmd))
+ *sz_filter_passed += folio_size(folio);
+ s->last_applied = folio;
+
+huge_unlock:
+ spin_unlock(ptl);
+ return 0;
+ }
+#endif
+ start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (!start_pte)
+ return 0;
+
+ for (; addr < next; pte += nr, addr += nr * PAGE_SIZE) {
+ nr = 1;
+ ptent = ptep_get(pte);
+
+ if (pte_none(ptent) || !pte_present(ptent))
+ continue;
+
+ folio = vm_normal_folio(vma, addr, ptent);
+
+ if (damos_va_invalid_folio(folio, s))
+ continue;
+
+ if (!damos_va_filter_out(s, folio, vma, addr, pte, NULL))
+ *sz_filter_passed += folio_size(folio);
+ nr = folio_nr_pages(folio);
+ s->last_applied = folio;
+ }
+ pte_unmap_unlock(start_pte, ptl);
+ return 0;
+}
+
+static unsigned long damos_va_stat(struct damon_target *target,
+ struct damon_region *r, struct damos *s,
+ unsigned long *sz_filter_passed)
+{
+ struct damos_va_stat_private priv;
+ struct mm_struct *mm;
+ struct mm_walk_ops walk_ops = {
+ .pmd_entry = damos_va_stat_pmd_entry,
+ .walk_lock = PGWALK_RDLOCK,
+ };
+
+ priv.scheme = s;
+ priv.sz_filter_passed = sz_filter_passed;
+
+ if (!damos_ops_has_filter(s))
+ return 0;
+
+ mm = damon_get_mm(target);
+ if (!mm)
+ return 0;
+
+ mmap_read_lock(mm);
+ walk_page_range(mm, r->ar.start, r->ar.end, &walk_ops, &priv);
+ mmap_read_unlock(mm);
+ mmput(mm);
+ return 0;
+}
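/*
 * Note on the stat path above: unlike the migrate path it modifies nothing,
 * so it always reports zero applied bytes; its only effect is accumulating
 * into *sz_filter_passed the size of folios that pass the ops-layer filters,
 * and it skips the walk entirely when no such filter is installed.
 */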
+
static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
struct damon_target *t, struct damon_region *r,
struct damos *scheme, unsigned long *sz_filter_passed)
@@ -675,8 +1009,11 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
case DAMOS_NOHUGEPAGE:
madv_action = MADV_NOHUGEPAGE;
break;
+ case DAMOS_MIGRATE_HOT:
+ case DAMOS_MIGRATE_COLD:
+ return damos_va_migrate(t, r, scheme, sz_filter_passed);
case DAMOS_STAT:
- return 0;
+ return damos_va_stat(t, r, scheme, sz_filter_passed);
default:
/*
* DAMOS actions that are not yet supported by 'vaddr'.
@@ -695,6 +1032,10 @@ static int damon_va_scheme_score(struct damon_ctx *context,
switch (scheme->action) {
case DAMOS_PAGEOUT:
return damon_cold_score(context, r, scheme);
+ case DAMOS_MIGRATE_HOT:
+ return damon_hot_score(context, r, scheme);
+ case DAMOS_MIGRATE_COLD:
+ return damon_cold_score(context, r, scheme);
default:
break;
}
@@ -711,6 +1052,7 @@ static int __init damon_va_initcall(void)
.prepare_access_checks = damon_va_prepare_access_checks,
.check_accesses = damon_va_check_accesses,
.target_valid = damon_va_target_valid,
+ .cleanup_target = damon_va_cleanup_target,
.cleanup = NULL,
.apply_scheme = damon_va_apply_scheme,
.get_scheme_score = damon_va_scheme_score,
diff --git a/mm/debug.c b/mm/debug.c
index db83e381a8ae..64ddb0c4b4be 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -71,10 +71,12 @@ static void __dump_folio(struct folio *folio, struct page *page,
unsigned long pfn, unsigned long idx)
{
struct address_space *mapping = folio_mapping(folio);
- int mapcount = atomic_read(&page->_mapcount);
+ int mapcount = atomic_read(&page->_mapcount) + 1;
char *type = "";
- mapcount = page_mapcount_is_type(mapcount) ? 0 : mapcount + 1;
+ if (page_mapcount_is_type(mapcount))
+ mapcount = 0;
+
pr_warn("page: refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n",
folio_ref_count(folio), mapcount, mapping,
folio->index + idx, pfn);
@@ -127,47 +129,13 @@ static void __dump_folio(struct folio *folio, struct page *page,
static void __dump_page(const struct page *page)
{
- struct folio *foliop, folio;
- struct page precise;
- unsigned long head;
- unsigned long pfn = page_to_pfn(page);
- unsigned long idx, nr_pages = 1;
- int loops = 5;
-
-again:
- memcpy(&precise, page, sizeof(*page));
- head = precise.compound_head;
- if ((head & 1) == 0) {
- foliop = (struct folio *)&precise;
- idx = 0;
- if (!folio_test_large(foliop))
- goto dump;
- foliop = (struct folio *)page;
- } else {
- foliop = (struct folio *)(head - 1);
- idx = folio_page_idx(foliop, page);
- }
+ struct page_snapshot ps;
- if (idx < MAX_FOLIO_NR_PAGES) {
- memcpy(&folio, foliop, 2 * sizeof(struct page));
- nr_pages = folio_nr_pages(&folio);
- if (nr_pages > 1)
- memcpy(&folio.__page_2, &foliop->__page_2,
- sizeof(struct page));
- foliop = &folio;
- }
-
- if (idx > nr_pages) {
- if (loops-- > 0)
- goto again;
+ snapshot_page(&ps, page);
+ if (!snapshot_page_is_faithful(&ps))
pr_warn("page does not match folio\n");
- precise.compound_head &= ~1UL;
- foliop = (struct folio *)&precise;
- idx = 0;
- }
-dump:
- __dump_folio(foliop, &precise, pfn, idx);
+ __dump_folio(&ps.folio_snapshot, &ps.page_snapshot, ps.pfn, ps.idx);
}
void dump_page(const struct page *page, const char *reason)
@@ -214,7 +182,7 @@ void dump_mm(const struct mm_struct *mm)
"start_code %lx end_code %lx start_data %lx end_data %lx\n"
"start_brk %lx brk %lx start_stack %lx\n"
"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
- "binfmt %px flags %lx\n"
+ "binfmt %px flags %*pb\n"
#ifdef CONFIG_AIO
"ioctx_table %px\n"
#endif
@@ -243,7 +211,7 @@ void dump_mm(const struct mm_struct *mm)
mm->start_code, mm->end_code, mm->start_data, mm->end_data,
mm->start_brk, mm->brk, mm->start_stack,
mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
- mm->binfmt, mm->flags,
+ mm->binfmt, NUM_MM_FLAG_BITS, __mm_flags_get_bitmap(mm),
#ifdef CONFIG_AIO
mm->ioctx_table,
#endif
@@ -288,7 +256,7 @@ void dump_vmg(const struct vma_merge_struct *vmg, const char *reason)
vmg->vmi, vmg->vmi ? vma_iter_addr(vmg->vmi) : 0,
vmg->vmi ? vma_iter_end(vmg->vmi) : 0,
vmg->prev, vmg->middle, vmg->next, vmg->target,
- vmg->start, vmg->end, vmg->flags,
+ vmg->start, vmg->end, vmg->vm_flags,
vmg->file, vmg->anon_vma, vmg->policy,
#ifdef CONFIG_USERFAULTFD
vmg->uffd_ctx.ctx,
diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c
index d46acf989dde..6a26eca546c3 100644
--- a/mm/debug_page_alloc.c
+++ b/mm/debug_page_alloc.c
@@ -23,7 +23,7 @@ static int __init debug_guardpage_minorder_setup(char *buf)
unsigned long res;
if (kstrtoul(buf, 10, &res) < 0 || res > MAX_PAGE_ORDER / 2) {
- pr_err("Bad debug_guardpage_minorder value\n");
+ pr_err("Bad debug_guardpage_minorder value: %s\n", buf);
return 0;
}
_debug_guardpage_minorder = res;
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index bc748f700a9e..830107b6dd08 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -20,7 +20,6 @@
#include <linux/mman.h>
#include <linux/mm_types.h>
#include <linux/module.h>
-#include <linux/pfn_t.h>
#include <linux/printk.h>
#include <linux/pgtable.h>
#include <linux/random.h>
@@ -73,6 +72,8 @@ struct pgtable_debug_args {
unsigned long fixed_pud_pfn;
unsigned long fixed_pmd_pfn;
unsigned long fixed_pte_pfn;
+
+ swp_entry_t swp_entry;
};
static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx)
@@ -348,12 +349,6 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args)
vaddr &= HPAGE_PUD_MASK;
pud = pfn_pud(args->pud_pfn, args->page_prot);
- /*
- * Some architectures have debug checks to make sure
- * huge pud mapping are only found with devmap entries
- * For now test with only devmap entries.
- */
- pud = pud_mkdevmap(pud);
set_pud_at(args->mm, vaddr, args->pudp, pud);
flush_dcache_page(page);
pudp_set_wrprotect(args->mm, vaddr, args->pudp);
@@ -366,7 +361,6 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args)
WARN_ON(!pud_none(pud));
#endif /* __PAGETABLE_PMD_FOLDED */
pud = pfn_pud(args->pud_pfn, args->page_prot);
- pud = pud_mkdevmap(pud);
pud = pud_wrprotect(pud);
pud = pud_mkclean(pud);
set_pud_at(args->mm, vaddr, args->pudp, pud);
@@ -384,7 +378,6 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args)
#endif /* __PAGETABLE_PMD_FOLDED */
pud = pfn_pud(args->pud_pfn, args->page_prot);
- pud = pud_mkdevmap(pud);
pud = pud_mkyoung(pud);
set_pud_at(args->mm, vaddr, args->pudp, pud);
flush_dcache_page(page);
@@ -693,53 +686,6 @@ static void __init pmd_protnone_tests(struct pgtable_debug_args *args)
static void __init pmd_protnone_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
-static void __init pte_devmap_tests(struct pgtable_debug_args *args)
-{
- pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
-
- pr_debug("Validating PTE devmap\n");
- WARN_ON(!pte_devmap(pte_mkdevmap(pte)));
-}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void __init pmd_devmap_tests(struct pgtable_debug_args *args)
-{
- pmd_t pmd;
-
- if (!has_transparent_hugepage())
- return;
-
- pr_debug("Validating PMD devmap\n");
- pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
- WARN_ON(!pmd_devmap(pmd_mkdevmap(pmd)));
-}
-
-#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static void __init pud_devmap_tests(struct pgtable_debug_args *args)
-{
- pud_t pud;
-
- if (!has_transparent_pud_hugepage())
- return;
-
- pr_debug("Validating PUD devmap\n");
- pud = pfn_pud(args->fixed_pud_pfn, args->page_prot);
- WARN_ON(!pud_devmap(pud_mkdevmap(pud)));
-}
-#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-static void __init pud_devmap_tests(struct pgtable_debug_args *args) { }
-#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-#else /* CONFIG_TRANSPARENT_HUGEPAGE */
-static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { }
-static void __init pud_devmap_tests(struct pgtable_debug_args *args) { }
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-#else
-static void __init pte_devmap_tests(struct pgtable_debug_args *args) { }
-static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { }
-static void __init pud_devmap_tests(struct pgtable_debug_args *args) { }
-#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */
-
static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args)
{
pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
@@ -754,12 +700,15 @@ static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args)
static void __init pte_swap_soft_dirty_tests(struct pgtable_debug_args *args)
{
- pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
+ pte_t pte;
if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
return;
pr_debug("Validating PTE swap soft dirty\n");
+ pte = swp_entry_to_pte(args->swp_entry);
+ WARN_ON(!is_swap_pte(pte));
+
WARN_ON(!pte_swp_soft_dirty(pte_swp_mksoft_dirty(pte)));
WARN_ON(pte_swp_soft_dirty(pte_swp_clear_soft_dirty(pte)));
}
@@ -793,7 +742,9 @@ static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args)
return;
pr_debug("Validating PMD swap soft dirty\n");
- pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
+ pmd = swp_entry_to_pmd(args->swp_entry);
+ WARN_ON(!is_swap_pmd(pmd));
+
WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd)));
WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd)));
}
@@ -804,17 +755,11 @@ static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) {
static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args)
{
- unsigned long max_swap_offset;
swp_entry_t entry, entry2;
pte_t pte;
pr_debug("Validating PTE swap exclusive\n");
-
- /* See generic_max_swapfile_size(): probe the maximum offset */
- max_swap_offset = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL))));
-
- /* Create a swp entry with all possible bits set */
- entry = swp_entry((1 << MAX_SWAPFILES_SHIFT) - 1, max_swap_offset);
+ entry = args->swp_entry;
pte = swp_entry_to_pte(entry);
WARN_ON(pte_swp_exclusive(pte));
@@ -838,30 +783,34 @@ static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args)
static void __init pte_swap_tests(struct pgtable_debug_args *args)
{
- swp_entry_t swp;
- pte_t pte;
+ swp_entry_t arch_entry;
+ pte_t pte1, pte2;
pr_debug("Validating PTE swap\n");
- pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
- swp = __pte_to_swp_entry(pte);
- pte = __swp_entry_to_pte(swp);
- WARN_ON(args->fixed_pte_pfn != pte_pfn(pte));
+ pte1 = swp_entry_to_pte(args->swp_entry);
+ WARN_ON(!is_swap_pte(pte1));
+
+ arch_entry = __pte_to_swp_entry(pte1);
+ pte2 = __swp_entry_to_pte(arch_entry);
+ WARN_ON(memcmp(&pte1, &pte2, sizeof(pte1)));
}
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
static void __init pmd_swap_tests(struct pgtable_debug_args *args)
{
- swp_entry_t swp;
- pmd_t pmd;
+ swp_entry_t arch_entry;
+ pmd_t pmd1, pmd2;
if (!has_transparent_hugepage())
return;
pr_debug("Validating PMD swap\n");
- pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
- swp = __pmd_to_swp_entry(pmd);
- pmd = __swp_entry_to_pmd(swp);
- WARN_ON(args->fixed_pmd_pfn != pmd_pfn(pmd));
+ pmd1 = swp_entry_to_pmd(args->swp_entry);
+ WARN_ON(!is_swap_pmd(pmd1));
+
+ arch_entry = __pmd_to_swp_entry(pmd1);
+ pmd2 = __swp_entry_to_pmd(arch_entry);
+ WARN_ON(memcmp(&pmd1, &pmd2, sizeof(pmd1)));
}
#else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
static void __init pmd_swap_tests(struct pgtable_debug_args *args) { }
@@ -910,26 +859,18 @@ static void __init swap_migration_tests(struct pgtable_debug_args *args)
#ifdef CONFIG_HUGETLB_PAGE
static void __init hugetlb_basic_tests(struct pgtable_debug_args *args)
{
- struct page *page;
pte_t pte;
pr_debug("Validating HugeTLB basic\n");
- /*
- * Accessing the page associated with the pfn is safe here,
- * as it was previously derived from a real kernel symbol.
- */
- page = pfn_to_page(args->fixed_pmd_pfn);
- pte = mk_huge_pte(page, args->page_prot);
+ pte = pfn_pte(args->fixed_pmd_pfn, args->page_prot);
+ pte = arch_make_huge_pte(pte, PMD_SHIFT, VM_ACCESS_FLAGS);
+#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
+ WARN_ON(!pte_huge(pte));
+#endif
WARN_ON(!huge_pte_dirty(huge_pte_mkdirty(pte)));
WARN_ON(!huge_pte_write(huge_pte_mkwrite(huge_pte_wrprotect(pte))));
WARN_ON(huge_pte_write(huge_pte_wrprotect(huge_pte_mkwrite(pte))));
-
-#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
- pte = pfn_pte(args->fixed_pmd_pfn, args->page_prot);
-
- WARN_ON(!pte_huge(arch_make_huge_pte(pte, PMD_SHIFT, VM_ACCESS_FLAGS)));
-#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
}
#else /* !CONFIG_HUGETLB_PAGE */
static void __init hugetlb_basic_tests(struct pgtable_debug_args *args) { }
@@ -1049,29 +990,34 @@ static void __init destroy_args(struct pgtable_debug_args *args)
/* Free page table entries */
if (args->start_ptep) {
+ pmd_clear(args->pmdp);
pte_free(args->mm, args->start_ptep);
mm_dec_nr_ptes(args->mm);
}
if (args->start_pmdp) {
+ pud_clear(args->pudp);
pmd_free(args->mm, args->start_pmdp);
mm_dec_nr_pmds(args->mm);
}
if (args->start_pudp) {
+ p4d_clear(args->p4dp);
pud_free(args->mm, args->start_pudp);
mm_dec_nr_puds(args->mm);
}
- if (args->start_p4dp)
+ if (args->start_p4dp) {
+ pgd_clear(args->pgdp);
p4d_free(args->mm, args->start_p4dp);
+ }
/* Free vma and mm struct */
if (args->vma)
vm_area_free(args->vma);
if (args->mm)
- mmdrop(args->mm);
+ mmput(args->mm);
}
static struct page * __init
@@ -1174,6 +1120,7 @@ static void __init init_fixed_pfns(struct pgtable_debug_args *args)
static int __init init_args(struct pgtable_debug_args *args)
{
+ unsigned long max_swap_offset;
struct page *page = NULL;
int ret = 0;
@@ -1256,6 +1203,11 @@ static int __init init_args(struct pgtable_debug_args *args)
init_fixed_pfns(args);
+ /* See generic_max_swapfile_size(): probe the maximum offset */
+ max_swap_offset = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL))));
+ /* Create a swp entry with all possible bits set */
+ args->swp_entry = swp_entry((1 << MAX_SWAPFILES_SHIFT) - 1, max_swap_offset);
+
/*
* Allocate (huge) pages because some of the tests need to access
* the data in the pages. The corresponding tests will be skipped
@@ -1341,10 +1293,6 @@ static int __init debug_vm_pgtable(void)
pte_protnone_tests(&args);
pmd_protnone_tests(&args);
- pte_devmap_tests(&args);
- pmd_devmap_tests(&args);
- pud_devmap_tests(&args);
-
pte_soft_dirty_tests(&args);
pmd_soft_dirty_tests(&args);
pte_swap_soft_dirty_tests(&args);
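The debug_vm_pgtable hunks above consolidate the maximal swap entry used by the swap tests: the probe that used to live in pte_swap_exclusive_tests() now runs once in init_args() and is cached in args->swp_entry. A minimal sketch of that probe, assuming the usual <linux/swapops.h> helpers; the standalone function name below is illustrative, not part of the patch:

static swp_entry_t debug_vm_probe_max_swp_entry(void)
{
	unsigned long max_swap_offset;

	/* See generic_max_swapfile_size(): round-trip an all-ones offset. */
	max_swap_offset = swp_offset(pte_to_swp_entry(
			swp_entry_to_pte(swp_entry(0, ~0UL))));

	/* A swap entry with every type bit and every offset bit set. */
	return swp_entry((1 << MAX_SWAPFILES_SHIFT) - 1, max_swap_offset);
}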
diff --git a/mm/dmapool.c b/mm/dmapool.c
index f0bfc6c490f4..5d8af6e29127 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -56,6 +56,7 @@ struct dma_pool { /* the pool */
unsigned int size;
unsigned int allocation;
unsigned int boundary;
+ int node;
char name[32];
struct list_head pools;
};
@@ -199,16 +200,17 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block,
/**
- * dma_pool_create - Creates a pool of consistent memory blocks, for dma.
+ * dma_pool_create_node - Creates a pool of coherent DMA memory blocks.
* @name: name of pool, for diagnostics
* @dev: device that will be doing the DMA
* @size: size of the blocks in this pool.
* @align: alignment requirement for blocks; must be a power of two
* @boundary: returned blocks won't cross this power of two boundary
+ * @node: optional NUMA node to allocate structs 'dma_pool' and 'dma_page' on
* Context: not in_interrupt()
*
* Given one of these pools, dma_pool_alloc()
- * may be used to allocate memory. Such memory will all have "consistent"
+ * may be used to allocate memory. Such memory will all have coherent
* DMA mappings, accessible by the device and its driver without using
* cache flushing primitives. The actual size of blocks allocated may be
* larger than requested because of alignment.
@@ -221,8 +223,8 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block,
* Return: a dma allocation pool with the requested characteristics, or
* %NULL if one can't be created.
*/
-struct dma_pool *dma_pool_create(const char *name, struct device *dev,
- size_t size, size_t align, size_t boundary)
+struct dma_pool *dma_pool_create_node(const char *name, struct device *dev,
+ size_t size, size_t align, size_t boundary, int node)
{
struct dma_pool *retval;
size_t allocation;
@@ -251,7 +253,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
boundary = min(boundary, allocation);
- retval = kzalloc(sizeof(*retval), GFP_KERNEL);
+ retval = kzalloc_node(sizeof(*retval), GFP_KERNEL, node);
if (!retval)
return retval;
@@ -264,6 +266,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
retval->size = size;
retval->boundary = boundary;
retval->allocation = allocation;
+ retval->node = node;
INIT_LIST_HEAD(&retval->pools);
/*
@@ -295,7 +298,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
mutex_unlock(&pools_reg_lock);
return retval;
}
-EXPORT_SYMBOL(dma_pool_create);
+EXPORT_SYMBOL(dma_pool_create_node);
static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page)
{
@@ -335,7 +338,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
{
struct dma_page *page;
- page = kmalloc(sizeof(*page), mem_flags);
+ page = kmalloc_node(sizeof(*page), mem_flags, pool->node);
if (!page)
return NULL;
@@ -392,7 +395,7 @@ void dma_pool_destroy(struct dma_pool *pool)
EXPORT_SYMBOL(dma_pool_destroy);
/**
- * dma_pool_alloc - get a block of consistent memory
+ * dma_pool_alloc - get a block of coherent memory
* @pool: dma pool that will produce the block
* @mem_flags: GFP_* bitmask
* @handle: pointer to dma address of block
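With dma_pool_create_node() exported above, existing dma_pool_create() callers presumably keep working through a thin wrapper that passes NUMA_NO_NODE; that wrapper would live in include/linux/dmapool.h and is not part of this hunk. A hedged sketch of its likely shape:

static inline struct dma_pool *dma_pool_create(const char *name,
		struct device *dev, size_t size, size_t align, size_t boundary)
{
	/* No NUMA preference: let the allocator pick the node. */
	return dma_pool_create_node(name, dev, size, align, boundary,
				    NUMA_NO_NODE);
}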
diff --git a/mm/execmem.c b/mm/execmem.c
index 6f7a2653b280..810a4ba9c924 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -38,9 +38,6 @@ static void *execmem_vmalloc(struct execmem_range *range, size_t size,
if (kasan)
vm_flags |= VM_DEFER_KMEMLEAK;
- if (vm_flags & VM_ALLOW_HUGE_VMAP)
- align = PMD_SIZE;
-
p = __vmalloc_node_range(size, align, start, end, gfp_flags,
pgprot, vm_flags, NUMA_NO_NODE,
__builtin_return_address(0));
@@ -93,8 +90,15 @@ struct execmem_cache {
struct mutex mutex;
struct maple_tree busy_areas;
struct maple_tree free_areas;
+ unsigned int pending_free_cnt; /* protected by mutex */
};
+/* delay used to schedule the asynchronous free when the fast-path free fails */
+#define FREE_DELAY (msecs_to_jiffies(10))
+
+/* mark entries in busy_areas that should be freed asynchronously */
+#define PENDING_FREE_MASK (1 << (PAGE_SHIFT - 1))
+
static struct execmem_cache execmem_cache = {
.mutex = __MUTEX_INITIALIZER(execmem_cache.mutex),
.busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN,
@@ -130,6 +134,27 @@ err_restore:
return err;
}
+static int execmem_force_rw(void *ptr, size_t size)
+{
+ unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned long addr = (unsigned long)ptr;
+ int ret;
+
+ ret = set_memory_nx(addr, nr);
+ if (ret)
+ return ret;
+
+ return set_memory_rw(addr, nr);
+}
+
+int execmem_restore_rox(void *ptr, size_t size)
+{
+ unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned long addr = (unsigned long)ptr;
+
+ return set_memory_rox(addr, nr);
+}
+
static void execmem_cache_clean(struct work_struct *work)
{
struct maple_tree *free_areas = &execmem_cache.free_areas;
@@ -155,20 +180,17 @@ static void execmem_cache_clean(struct work_struct *work)
static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean);
-static int execmem_cache_add(void *ptr, size_t size)
+static int execmem_cache_add_locked(void *ptr, size_t size, gfp_t gfp_mask)
{
struct maple_tree *free_areas = &execmem_cache.free_areas;
- struct mutex *mutex = &execmem_cache.mutex;
unsigned long addr = (unsigned long)ptr;
MA_STATE(mas, free_areas, addr - 1, addr + 1);
unsigned long lower, upper;
void *area = NULL;
- int err;
lower = addr;
upper = addr + size - 1;
- mutex_lock(mutex);
area = mas_walk(&mas);
if (area && mas.last == addr - 1)
lower = mas.index;
@@ -178,12 +200,14 @@ static int execmem_cache_add(void *ptr, size_t size)
upper = mas.last;
mas_set_range(&mas, lower, upper);
- err = mas_store_gfp(&mas, (void *)lower, GFP_KERNEL);
- mutex_unlock(mutex);
- if (err)
- return err;
+ return mas_store_gfp(&mas, (void *)lower, gfp_mask);
+}
- return 0;
+static int execmem_cache_add(void *ptr, size_t size, gfp_t gfp_mask)
+{
+ guard(mutex)(&execmem_cache.mutex);
+
+ return execmem_cache_add_locked(ptr, size, gfp_mask);
}
static bool within_range(struct execmem_range *range, struct ma_state *mas,
@@ -254,34 +278,6 @@ out_unlock:
return ptr;
}
-static bool execmem_cache_rox = false;
-
-void execmem_cache_make_ro(void)
-{
- struct maple_tree *free_areas = &execmem_cache.free_areas;
- struct maple_tree *busy_areas = &execmem_cache.busy_areas;
- MA_STATE(mas_free, free_areas, 0, ULONG_MAX);
- MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX);
- struct mutex *mutex = &execmem_cache.mutex;
- void *area;
-
- execmem_cache_rox = true;
-
- mutex_lock(mutex);
-
- mas_for_each(&mas_free, area, ULONG_MAX) {
- unsigned long pages = mas_range_len(&mas_free) >> PAGE_SHIFT;
- set_memory_ro(mas_free.index, pages);
- }
-
- mas_for_each(&mas_busy, area, ULONG_MAX) {
- unsigned long pages = mas_range_len(&mas_busy) >> PAGE_SHIFT;
- set_memory_ro(mas_busy.index, pages);
- }
-
- mutex_unlock(mutex);
-}
-
static int execmem_cache_populate(struct execmem_range *range, size_t size)
{
unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
@@ -292,6 +288,11 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size)
alloc_size = round_up(size, PMD_SIZE);
p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags);
+ if (!p) {
+ alloc_size = size;
+ p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags);
+ }
+
if (!p)
return err;
@@ -300,19 +301,13 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size)
goto err_free_mem;
/* fill memory with instructions that will trap */
- execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true);
+ execmem_fill_trapping_insns(p, alloc_size);
- if (execmem_cache_rox) {
- err = set_memory_rox((unsigned long)p, vm->nr_pages);
- if (err)
- goto err_free_mem;
- } else {
- err = set_memory_x((unsigned long)p, vm->nr_pages);
- if (err)
- goto err_free_mem;
- }
+ err = set_memory_rox((unsigned long)p, vm->nr_pages);
+ if (err)
+ goto err_free_mem;
- err = execmem_cache_add(p, alloc_size);
+ err = execmem_cache_add(p, alloc_size, GFP_KERNEL);
if (err)
goto err_reset_direct_map;
@@ -341,57 +336,117 @@ static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
return __execmem_cache_alloc(range, size);
}
-static bool execmem_cache_free(void *ptr)
+static inline bool is_pending_free(void *ptr)
{
- struct maple_tree *busy_areas = &execmem_cache.busy_areas;
- struct mutex *mutex = &execmem_cache.mutex;
- unsigned long addr = (unsigned long)ptr;
- MA_STATE(mas, busy_areas, addr, addr);
- size_t size;
- void *area;
+ return ((unsigned long)ptr & PENDING_FREE_MASK);
+}
- mutex_lock(mutex);
- area = mas_walk(&mas);
- if (!area) {
- mutex_unlock(mutex);
- return false;
- }
- size = mas_range_len(&mas);
+static inline void *pending_free_set(void *ptr)
+{
+ return (void *)((unsigned long)ptr | PENDING_FREE_MASK);
+}
- mas_store_gfp(&mas, NULL, GFP_KERNEL);
- mutex_unlock(mutex);
+static inline void *pending_free_clear(void *ptr)
+{
+ return (void *)((unsigned long)ptr & ~PENDING_FREE_MASK);
+}
- execmem_fill_trapping_insns(ptr, size, /* writable = */ false);
+static int __execmem_cache_free(struct ma_state *mas, void *ptr, gfp_t gfp_mask)
+{
+ size_t size = mas_range_len(mas);
+ int err;
- execmem_cache_add(ptr, size);
+ err = execmem_force_rw(ptr, size);
+ if (err)
+ return err;
- schedule_work(&execmem_cache_clean_work);
+ execmem_fill_trapping_insns(ptr, size);
+ execmem_restore_rox(ptr, size);
- return true;
+ err = execmem_cache_add_locked(ptr, size, gfp_mask);
+ if (err)
+ return err;
+
+ mas_store_gfp(mas, NULL, gfp_mask);
+ return 0;
}
-int execmem_make_temp_rw(void *ptr, size_t size)
+static void execmem_cache_free_slow(struct work_struct *work);
+static DECLARE_DELAYED_WORK(execmem_cache_free_work, execmem_cache_free_slow);
+
+static void execmem_cache_free_slow(struct work_struct *work)
{
- unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
- unsigned long addr = (unsigned long)ptr;
- int ret;
+ struct maple_tree *busy_areas = &execmem_cache.busy_areas;
+ MA_STATE(mas, busy_areas, 0, ULONG_MAX);
+ void *area;
- ret = set_memory_nx(addr, nr);
- if (ret)
- return ret;
+ guard(mutex)(&execmem_cache.mutex);
- return set_memory_rw(addr, nr);
+ if (!execmem_cache.pending_free_cnt)
+ return;
+
+ mas_for_each(&mas, area, ULONG_MAX) {
+ if (!is_pending_free(area))
+ continue;
+
+ area = pending_free_clear(area);
+ if (__execmem_cache_free(&mas, area, GFP_KERNEL))
+ continue;
+
+ execmem_cache.pending_free_cnt--;
+ }
+
+ if (execmem_cache.pending_free_cnt)
+ schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY);
+ else
+ schedule_work(&execmem_cache_clean_work);
}
-int execmem_restore_rox(void *ptr, size_t size)
+static bool execmem_cache_free(void *ptr)
{
- unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ struct maple_tree *busy_areas = &execmem_cache.busy_areas;
unsigned long addr = (unsigned long)ptr;
+ MA_STATE(mas, busy_areas, addr, addr);
+ void *area;
+ int err;
- return set_memory_rox(addr, nr);
+ guard(mutex)(&execmem_cache.mutex);
+
+ area = mas_walk(&mas);
+ if (!area)
+ return false;
+
+ err = __execmem_cache_free(&mas, area, GFP_KERNEL | __GFP_NORETRY);
+ if (err) {
+ /*
+ * mas points to the exact slot we got the area from; nothing
+ * else can modify the tree because of the mutex, so there
+ * won't be any allocations in mas_store_gfp() and it will just
+ * change the pointer.
+ */
+ area = pending_free_set(area);
+ mas_store_gfp(&mas, area, GFP_KERNEL);
+ execmem_cache.pending_free_cnt++;
+ schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY);
+ return true;
+ }
+
+ schedule_work(&execmem_cache_clean_work);
+
+ return true;
}
#else /* CONFIG_ARCH_HAS_EXECMEM_ROX */
+/*
+ * When the ROX cache is not used, the permissions that architectures define
+ * for execmem ranges which are updated before use (e.g. EXECMEM_MODULE_TEXT)
+ * must already be writable.
+ */
+static inline int execmem_force_rw(void *ptr, size_t size)
+{
+ return 0;
+}
+
static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
{
return NULL;
@@ -409,7 +464,9 @@ void *execmem_alloc(enum execmem_type type, size_t size)
bool use_cache = range->flags & EXECMEM_ROX_CACHE;
unsigned long vm_flags = VM_FLUSH_RESET_PERMS;
pgprot_t pgprot = range->pgprot;
- void *p;
+ void *p = NULL;
+
+ size = PAGE_ALIGN(size);
if (use_cache)
p = execmem_cache_alloc(range, size);
@@ -419,6 +476,21 @@ void *execmem_alloc(enum execmem_type type, size_t size)
return kasan_reset_tag(p);
}
+void *execmem_alloc_rw(enum execmem_type type, size_t size)
+{
+ void *p __free(execmem) = execmem_alloc(type, size);
+ int err;
+
+ if (!p)
+ return NULL;
+
+ err = execmem_force_rw(p, size);
+ if (err)
+ return NULL;
+
+ return no_free_ptr(p);
+}
+
void execmem_free(void *ptr)
{
/*
@@ -431,11 +503,6 @@ void execmem_free(void *ptr)
vfree(ptr);
}
-void *execmem_update_copy(void *dst, const void *src, size_t size)
-{
- return text_poke_copy(dst, src, size);
-}
-
bool execmem_is_rox(enum execmem_type type)
{
return !!(execmem_info->ranges[type].flags & EXECMEM_ROX_CACHE);
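Taken together, the execmem changes replace the execmem_make_temp_rw()/execmem_update_copy() dance with a write-then-seal flow built on execmem_alloc_rw() and execmem_restore_rox(). A hedged usage sketch under those assumptions (the wrapper function, its error handling, and the use of EXECMEM_MODULE_TEXT as the type are illustrative, not taken from this diff):

static void *example_load_text(const void *insns, size_t size)
{
	/* Returns a mapping that is guaranteed to be writable. */
	void *p = execmem_alloc_rw(EXECMEM_MODULE_TEXT, size);

	if (!p)
		return NULL;

	memcpy(p, insns, size);		/* populate code while it is still RW */

	/* Seal back to read-only + execute where the range uses the ROX cache. */
	if (execmem_is_rox(EXECMEM_MODULE_TEXT))
		execmem_restore_rox(p, size);

	return p;
}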
diff --git a/mm/filemap.c b/mm/filemap.c
index 7b90cbeb4a1a..13f0259d993c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -142,7 +142,7 @@ static void page_cache_delete(struct address_space *mapping,
xas_init_marks(&xas);
folio->mapping = NULL;
- /* Leave page->index set: truncation lookup relies upon it */
+ /* Leave folio->index set: truncation lookup relies upon it */
mapping->nrpages -= nr;
}
@@ -190,6 +190,9 @@ static void filemap_unaccount_folio(struct address_space *mapping,
__lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
filemap_nr_thps_dec(mapping);
}
+ if (test_bit(AS_KERNEL_FILE, &folio->mapping->flags))
+ mod_node_page_state(folio_pgdat(folio),
+ NR_KERNEL_FILE_PAGES, -nr);
/*
* At this point folio must be either written or cleaned by
@@ -949,7 +952,7 @@ unlock:
return 0;
error:
folio->mapping = NULL;
- /* Leave page->index set: truncation relies upon it */
+ /* Leave folio->index set: truncation relies upon it */
folio_put_refs(folio, nr);
return xas_error(&xas);
}
@@ -960,8 +963,14 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
{
void *shadow = NULL;
int ret;
+ struct mem_cgroup *tmp;
+ bool kernel_file = test_bit(AS_KERNEL_FILE, &mapping->flags);
+ if (kernel_file)
+ tmp = set_active_memcg(root_mem_cgroup);
ret = mem_cgroup_charge(folio, NULL, gfp);
+ if (kernel_file)
+ set_active_memcg(tmp);
if (ret)
return ret;
@@ -983,6 +992,10 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
if (!(gfp & __GFP_WRITE) && shadow)
workingset_refault(folio, shadow);
folio_add_lru(folio);
+ if (kernel_file)
+ mod_node_page_state(folio_pgdat(folio),
+ NR_KERNEL_FILE_PAGES,
+ folio_nr_pages(folio));
}
return ret;
}
@@ -1140,10 +1153,10 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
*/
flags = wait->flags;
if (flags & WQ_FLAG_EXCLUSIVE) {
- if (test_bit(key->bit_nr, &key->folio->flags))
+ if (test_bit(key->bit_nr, &key->folio->flags.f))
return -1;
if (flags & WQ_FLAG_CUSTOM) {
- if (test_and_set_bit(key->bit_nr, &key->folio->flags))
+ if (test_and_set_bit(key->bit_nr, &key->folio->flags.f))
return -1;
flags |= WQ_FLAG_DONE;
}
@@ -1226,9 +1239,9 @@ static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
struct wait_queue_entry *wait)
{
if (wait->flags & WQ_FLAG_EXCLUSIVE) {
- if (test_and_set_bit(bit_nr, &folio->flags))
+ if (test_and_set_bit(bit_nr, &folio->flags.f))
return false;
- } else if (test_bit(bit_nr, &folio->flags))
+ } else if (test_bit(bit_nr, &folio->flags.f))
return false;
wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
@@ -1589,13 +1602,30 @@ int folio_wait_private_2_killable(struct folio *folio)
}
EXPORT_SYMBOL(folio_wait_private_2_killable);
+static void filemap_end_dropbehind(struct folio *folio)
+{
+ struct address_space *mapping = folio->mapping;
+
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
+ if (folio_test_writeback(folio) || folio_test_dirty(folio))
+ return;
+ if (!folio_test_clear_dropbehind(folio))
+ return;
+ if (mapping)
+ folio_unmap_invalidate(mapping, folio, 0);
+}
+
/*
* If folio was marked as dropbehind, then pages should be dropped when writeback
* completes. Do that now. If we fail, it's likely because of a big folio -
* just reset dropbehind for that case and later completions should invalidate.
*/
-static void folio_end_dropbehind_write(struct folio *folio)
+void folio_end_dropbehind(struct folio *folio)
{
+ if (!folio_test_dropbehind(folio))
+ return;
+
/*
* Hitting !in_task() should not happen off RWF_DONTCACHE writeback,
* but can happen if normal writeback just happens to find dirty folios
@@ -1604,24 +1634,23 @@ static void folio_end_dropbehind_write(struct folio *folio)
* invalidation in that case.
*/
if (in_task() && folio_trylock(folio)) {
- if (folio->mapping)
- folio_unmap_invalidate(folio->mapping, folio, 0);
+ filemap_end_dropbehind(folio);
folio_unlock(folio);
}
}
+EXPORT_SYMBOL_GPL(folio_end_dropbehind);
/**
- * folio_end_writeback - End writeback against a folio.
+ * folio_end_writeback_no_dropbehind - End writeback against a folio.
* @folio: The folio.
*
* The folio must actually be under writeback.
+ * This call is intended for filesystems that need to defer dropbehind.
*
* Context: May be called from process or interrupt context.
*/
-void folio_end_writeback(struct folio *folio)
+void folio_end_writeback_no_dropbehind(struct folio *folio)
{
- bool folio_dropbehind = false;
-
VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);
/*
@@ -1636,6 +1665,25 @@ void folio_end_writeback(struct folio *folio)
folio_rotate_reclaimable(folio);
}
+ if (__folio_end_writeback(folio))
+ folio_wake_bit(folio, PG_writeback);
+
+ acct_reclaim_writeback(folio);
+}
+EXPORT_SYMBOL_GPL(folio_end_writeback_no_dropbehind);
+
+/**
+ * folio_end_writeback - End writeback against a folio.
+ * @folio: The folio.
+ *
+ * The folio must actually be under writeback.
+ *
+ * Context: May be called from process or interrupt context.
+ */
+void folio_end_writeback(struct folio *folio)
+{
+ VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);
+
/*
* Writeback does not hold a folio reference of its own, relying
* on truncation to wait for the clearing of PG_writeback.
@@ -1643,14 +1691,8 @@ void folio_end_writeback(struct folio *folio)
* reused before the folio_wake_bit().
*/
folio_get(folio);
- if (!folio_test_dirty(folio))
- folio_dropbehind = folio_test_clear_dropbehind(folio);
- if (__folio_end_writeback(folio))
- folio_wake_bit(folio, PG_writeback);
- acct_reclaim_writeback(folio);
-
- if (folio_dropbehind)
- folio_end_dropbehind_write(folio);
+ folio_end_writeback_no_dropbehind(folio);
+ folio_end_dropbehind(folio);
folio_put(folio);
}
EXPORT_SYMBOL(folio_end_writeback);
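The folio_end_writeback() split above lets a filesystem that needs to defer dropbehind handling end writeback first and invalidate later; folio_end_writeback() itself keeps doing both steps for everyone else. A hedged sketch of that calling sequence (the surrounding completion handler is hypothetical; the get/put pair mirrors what folio_end_writeback() does internally):

	/* Keep the folio alive across the PG_writeback wake-up. */
	folio_get(folio);
	folio_end_writeback_no_dropbehind(folio);	/* clear PG_writeback, wake waiters */
	/* ... filesystem-specific completion bookkeeping ... */
	folio_end_dropbehind(folio);			/* invalidate if dropbehind was requested */
	folio_put(folio);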
@@ -1767,8 +1809,9 @@ pgoff_t page_cache_next_miss(struct address_space *mapping,
pgoff_t index, unsigned long max_scan)
{
XA_STATE(xas, &mapping->i_pages, index);
+ unsigned long nr = max_scan;
- while (max_scan--) {
+ while (nr--) {
void *entry = xas_next(&xas);
if (!entry || xa_is_value(entry))
return xas.xa_index;
@@ -1949,7 +1992,7 @@ no_page:
gfp &= ~__GFP_FS;
if (fgp_flags & FGP_NOWAIT) {
gfp &= ~GFP_KERNEL;
- gfp |= GFP_NOWAIT | __GFP_NOWARN;
+ gfp |= GFP_NOWAIT;
}
if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
fgp_flags |= FGP_LOCK;
@@ -2435,6 +2478,9 @@ static bool filemap_range_uptodate(struct address_space *mapping,
pos -= folio_pos(folio);
}
+ if (pos == 0 && count >= folio_size(folio))
+ return false;
+
return mapping->a_ops->is_partially_uptodate(folio, pos, count);
}
@@ -2572,8 +2618,9 @@ static int filemap_get_pages(struct kiocb *iocb, size_t count,
unsigned int flags;
int err = 0;
- /* "last_index" is the index of the page beyond the end of the read */
- last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE);
+ /* "last_index" is the index of the folio beyond the end of the read */
+ last_index = round_up(iocb->ki_pos + count,
+ mapping_min_folio_nrbytes(mapping)) >> PAGE_SHIFT;
retry:
if (fatal_signal_pending(current))
return -EINTR;
@@ -2607,9 +2654,10 @@ retry:
goto err;
}
if (!folio_test_uptodate(folio)) {
- if ((iocb->ki_flags & IOCB_WAITQ) &&
- folio_batch_count(fbatch) > 1)
- iocb->ki_flags |= IOCB_NOWAIT;
+ if (folio_batch_count(fbatch) > 1) {
+ err = -EAGAIN;
+ goto err;
+ }
err = filemap_update_page(iocb, mapping, count, folio,
need_uptodate);
if (err)
@@ -2635,16 +2683,14 @@ static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio)
return (pos1 >> shift == pos2 >> shift);
}
-static void filemap_end_dropbehind_read(struct address_space *mapping,
- struct folio *folio)
+static void filemap_end_dropbehind_read(struct folio *folio)
{
if (!folio_test_dropbehind(folio))
return;
if (folio_test_writeback(folio) || folio_test_dirty(folio))
return;
if (folio_trylock(folio)) {
- if (folio_test_clear_dropbehind(folio))
- folio_unmap_invalidate(mapping, folio, 0);
+ filemap_end_dropbehind(folio);
folio_unlock(folio);
}
}
@@ -2765,7 +2811,7 @@ put_folios:
for (i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
- filemap_end_dropbehind_read(mapping, folio);
+ filemap_end_dropbehind_read(folio);
folio_put(folio);
}
folio_batch_init(&fbatch);
@@ -3206,8 +3252,8 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
struct address_space *mapping = file->f_mapping;
DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
struct file *fpin = NULL;
- unsigned long vm_flags = vmf->vma->vm_flags;
- unsigned int mmap_miss;
+ vm_flags_t vm_flags = vmf->vma->vm_flags;
+ unsigned short mmap_miss;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Use the readahead code, even if readahead is disabled */
@@ -3222,13 +3268,17 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
if (!(vm_flags & VM_RAND_READ))
ra->size *= 2;
ra->async_size = HPAGE_PMD_NR;
- page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER);
+ ra->order = HPAGE_PMD_ORDER;
+ page_cache_ra_order(&ractl, ra);
return fpin;
}
#endif
- /* If we don't want any read-ahead, don't bother */
- if (vm_flags & VM_RAND_READ)
+ /*
+ * If we don't want any read-ahead, don't bother. VM_EXEC case below is
+ * already intended for random access.
+ */
+ if ((vm_flags & (VM_RAND_READ | VM_EXEC)) == VM_RAND_READ)
return fpin;
if (!ra->ra_pages)
return fpin;
@@ -3251,15 +3301,43 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
if (mmap_miss > MMAP_LOTSAMISS)
return fpin;
- /*
- * mmap read-around
- */
+ if (vm_flags & VM_EXEC) {
+ /*
+ * Allow arch to request a preferred minimum folio order for
+ * executable memory. This can often be beneficial to
+ * performance if (e.g.) arm64 can contpte-map the folio.
+ * Executable memory rarely benefits from readahead, due to its
+ * random access nature, so set async_size to 0.
+ *
+ * Limit to the boundaries of the VMA to avoid reading in any
+ * pad that might exist between sections, which would be a waste
+ * of memory.
+ */
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long start = vma->vm_pgoff;
+ unsigned long end = start + vma_pages(vma);
+ unsigned long ra_end;
+
+ ra->order = exec_folio_order();
+ ra->start = round_down(vmf->pgoff, 1UL << ra->order);
+ ra->start = max(ra->start, start);
+ ra_end = round_up(ra->start + ra->ra_pages, 1UL << ra->order);
+ ra_end = min(ra_end, end);
+ ra->size = ra_end - ra->start;
+ ra->async_size = 0;
+ } else {
+ /*
+ * mmap read-around
+ */
+ ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
+ ra->size = ra->ra_pages;
+ ra->async_size = ra->ra_pages / 4;
+ ra->order = 0;
+ }
+
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
- ra->size = ra->ra_pages;
- ra->async_size = ra->ra_pages / 4;
ractl._index = ra->start;
- page_cache_ra_order(&ractl, ra, 0);
+ page_cache_ra_order(&ractl, ra);
return fpin;
}
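The VM_EXEC branch above aligns the readahead window to the preferred executable folio order and clips it to the VMA. The same arithmetic, pulled out as a standalone illustration (the helper name and parameters are ours, not the kernel's):

static void exec_ra_window(unsigned long pgoff, unsigned long vma_start,
			   unsigned long vma_end, unsigned long ra_pages,
			   unsigned int order,
			   unsigned long *ra_start, unsigned long *ra_size)
{
	unsigned long end;

	/* Align the window start down to the folio order, but stay in the VMA. */
	*ra_start = round_down(pgoff, 1UL << order);
	*ra_start = max(*ra_start, vma_start);

	/* Align the end up to the folio order, then clip to the VMA end. */
	end = round_up(*ra_start + ra_pages, 1UL << order);
	end = min(end, vma_end);

	*ra_size = end - *ra_start;
}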
@@ -3275,15 +3353,23 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
struct file_ra_state *ra = &file->f_ra;
DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);
struct file *fpin = NULL;
- unsigned int mmap_miss;
+ unsigned short mmap_miss;
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
return fpin;
- mmap_miss = READ_ONCE(ra->mmap_miss);
- if (mmap_miss)
- WRITE_ONCE(ra->mmap_miss, --mmap_miss);
+ /*
+ * If the folio is locked, we're likely racing against another fault.
+ * Don't touch the mmap_miss counter, to avoid decreasing it multiple
+ * times for a single folio and breaking the balance with the mmap_miss
+ * increase in do_sync_mmap_readahead().
+ */
+ if (likely(!folio_test_locked(folio))) {
+ mmap_miss = READ_ONCE(ra->mmap_miss);
+ if (mmap_miss)
+ WRITE_ONCE(ra->mmap_miss, --mmap_miss);
+ }
if (folio_test_readahead(folio)) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
@@ -3533,7 +3619,7 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) {
struct page *page = folio_file_page(folio, start);
- vm_fault_t ret = do_set_pmd(vmf, page);
+ vm_fault_t ret = do_set_pmd(vmf, folio, page);
if (!ret) {
/* The page is mapped successfully, reference consumed. */
folio_unlock(folio);
@@ -3595,12 +3681,28 @@ skip:
static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
struct folio *folio, unsigned long start,
unsigned long addr, unsigned int nr_pages,
- unsigned long *rss, unsigned int *mmap_miss)
+ unsigned long *rss, unsigned short *mmap_miss)
{
+ unsigned int ref_from_caller = 1;
vm_fault_t ret = 0;
struct page *page = folio_page(folio, start);
unsigned int count = 0;
pte_t *old_ptep = vmf->pte;
+ unsigned long addr0;
+
+ /*
+ * Map the large folio fully where possible.
+ *
+ * The folio must not cross a VMA or page table boundary.
+ */
+ addr0 = addr - start * PAGE_SIZE;
+ if (folio_within_vma(folio, vmf->vma) &&
+ (addr0 & PMD_MASK) == ((addr0 + folio_size(folio) - 1) & PMD_MASK)) {
+ vmf->pte -= start;
+ page -= start;
+ addr = addr0;
+ nr_pages = folio_nr_pages(folio);
+ }
do {
if (PageHWPoison(page + count))
@@ -3630,7 +3732,8 @@ skip:
if (count) {
set_pte_range(vmf, folio, page, count, addr);
*rss += count;
- folio_ref_add(folio, count);
+ folio_ref_add(folio, count - ref_from_caller);
+ ref_from_caller = 0;
if (in_range(vmf->address, addr, count * PAGE_SIZE))
ret = VM_FAULT_NOPAGE;
}
@@ -3645,25 +3748,29 @@ skip:
if (count) {
set_pte_range(vmf, folio, page, count, addr);
*rss += count;
- folio_ref_add(folio, count);
+ folio_ref_add(folio, count - ref_from_caller);
+ ref_from_caller = 0;
if (in_range(vmf->address, addr, count * PAGE_SIZE))
ret = VM_FAULT_NOPAGE;
}
vmf->pte = old_ptep;
+ if (ref_from_caller)
+ /* Locked folios cannot get truncated. */
+ folio_ref_dec(folio);
return ret;
}
static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
struct folio *folio, unsigned long addr,
- unsigned long *rss, unsigned int *mmap_miss)
+ unsigned long *rss, unsigned short *mmap_miss)
{
vm_fault_t ret = 0;
struct page *page = &folio->page;
if (PageHWPoison(page))
- return ret;
+ goto out;
/* See comment of filemap_map_folio_range() */
if (!folio_test_workingset(folio))
@@ -3675,15 +3782,18 @@ static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
* the fault-around logic.
*/
if (!pte_none(ptep_get(vmf->pte)))
- return ret;
+ goto out;
if (vmf->address == addr)
ret = VM_FAULT_NOPAGE;
set_pte_range(vmf, folio, page, 1, addr);
(*rss)++;
- folio_ref_inc(folio);
+ return ret;
+out:
+ /* Locked folios cannot get truncated. */
+ folio_ref_dec(folio);
return ret;
}
@@ -3699,7 +3809,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
struct folio *folio;
vm_fault_t ret = 0;
unsigned long rss = 0;
- unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved, folio_type;
+ unsigned int nr_pages = 0, folio_type;
+ unsigned short mmap_miss = 0, mmap_miss_saved;
rcu_read_lock();
folio = next_uptodate_folio(&xas, mapping, end_pgoff);
@@ -3742,7 +3853,6 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
nr_pages, &rss, &mmap_miss);
folio_unlock(folio);
- folio_put(folio);
} while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
add_mm_counter(vma->vm_mm, folio_type, rss);
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -3805,6 +3915,18 @@ int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
+int generic_file_mmap_prepare(struct vm_area_desc *desc)
+{
+ struct file *file = desc->file;
+ struct address_space *mapping = file->f_mapping;
+
+ if (!mapping->a_ops->read_folio)
+ return -ENOEXEC;
+ file_accessed(file);
+ desc->vm_ops = &generic_file_vm_ops;
+ return 0;
+}
+
/*
* This is for filesystems which do not implement ->writepage.
*/
@@ -3814,6 +3936,13 @@ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
return -EINVAL;
return generic_file_mmap(file, vma);
}
+
+int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc)
+{
+ if (is_shared_maywrite(desc->vm_flags))
+ return -EINVAL;
+ return generic_file_mmap_prepare(desc);
+}
#else
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
@@ -3823,15 +3952,25 @@ int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
return -ENOSYS;
}
+int generic_file_mmap_prepare(struct vm_area_desc *desc)
+{
+ return -ENOSYS;
+}
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
return -ENOSYS;
}
+int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc)
+{
+ return -ENOSYS;
+}
#endif /* CONFIG_MMU */
EXPORT_SYMBOL(filemap_page_mkwrite);
EXPORT_SYMBOL(generic_file_mmap);
+EXPORT_SYMBOL(generic_file_mmap_prepare);
EXPORT_SYMBOL(generic_file_readonly_mmap);
+EXPORT_SYMBOL(generic_file_readonly_mmap_prepare);
static struct folio *do_read_cache_folio(struct address_space *mapping,
pgoff_t index, filler_t filler, struct file *file, gfp_t gfp)
@@ -4100,7 +4239,7 @@ retry:
break;
}
- status = a_ops->write_begin(file, mapping, pos, bytes,
+ status = a_ops->write_begin(iocb, mapping, pos, bytes,
&folio, &fsdata);
if (unlikely(status < 0))
break;
@@ -4121,7 +4260,7 @@ retry:
copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
flush_dcache_folio(folio);
- status = a_ops->write_end(file, mapping, pos, bytes, copied,
+ status = a_ops->write_end(iocb, mapping, pos, bytes, copied,
folio, fsdata);
if (unlikely(status != copied)) {
iov_iter_revert(i, copied - max(status, 0L));
@@ -4419,7 +4558,7 @@ static void filemap_cachestat(struct address_space *mapping,
* invalidation, so there might not be
* a shadow in the swapcache (yet).
*/
- shadow = get_shadow_from_swap_cache(swp);
+ shadow = swap_cache_get_shadow(swp);
if (!shadow)
goto resched;
}
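The AS_KERNEL_FILE handling in filemap_add_folio() relies on the standard set_active_memcg() save/restore idiom so the charge lands on root_mem_cgroup rather than the current task's memcg. The general pattern, sketched with a hypothetical target memcg:

	struct mem_cgroup *old_memcg;
	int ret;

	old_memcg = set_active_memcg(target_memcg);	/* redirect charging */
	ret = mem_cgroup_charge(folio, NULL, gfp);
	set_active_memcg(old_memcg);			/* always restore the previous memcg */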
diff --git a/mm/gup.c b/mm/gup.c
index 84461d384ae2..a8ba5112e4d0 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -26,11 +26,7 @@
#include <asm/tlbflush.h>
#include "internal.h"
-
-struct follow_page_context {
- struct dev_pagemap *pgmap;
- unsigned int page_mask;
-};
+#include "swap.h"
static inline void sanity_check_pinned_pages(struct page **pages,
unsigned long npages)
@@ -63,11 +59,11 @@ static inline void sanity_check_pinned_pages(struct page **pages,
!folio_test_anon(folio))
continue;
if (!folio_test_large(folio) || folio_test_hugetlb(folio))
- VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page);
+ VM_WARN_ON_ONCE_FOLIO(!PageAnonExclusive(&folio->page), folio);
else
/* Either a PTE-mapped or a PMD-mapped THP. */
- VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page) &&
- !PageAnonExclusive(page), page);
+ VM_WARN_ON_ONCE_PAGE(!PageAnonExclusive(&folio->page) &&
+ !PageAnonExclusive(page), page);
}
}
@@ -147,7 +143,7 @@ int __must_check try_grab_folio(struct folio *folio, int refs,
if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
return -ENOMEM;
- if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(&folio->page)))
+ if (unlikely(!(flags & FOLL_PCI_P2PDMA) && folio_is_pci_p2pdma(folio)))
return -EREMOTEIO;
if (flags & FOLL_GET)
@@ -236,7 +232,7 @@ void folio_add_pin(struct folio *folio)
static inline struct folio *gup_folio_range_next(struct page *start,
unsigned long npages, unsigned long i, unsigned int *ntails)
{
- struct page *next = nth_page(start, i);
+ struct page *next = start + i;
struct folio *folio = page_folio(next);
unsigned int nr = 1;
@@ -341,6 +337,10 @@ EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
* "gup-pinned page range" refers to a range of pages that has had one of the
* pin_user_pages() variants called on that page.
*
+ * The page range must be truly physically contiguous: it corresponds to a
+ * contiguous PFN range and all pages can be iterated naturally.
+ *
* For the page ranges defined by [page .. page+npages], make that range (or
* its head pages, if a compound page) dirty, if @make_dirty is true, and if the
* page range was previously listed as clean.
@@ -358,6 +358,8 @@ void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
struct folio *folio;
unsigned int nr;
+ VM_WARN_ON_ONCE(!page_range_contiguous(page, npages));
+
for (i = 0; i < npages; i += nr) {
folio = gup_folio_range_next(page, npages, i, &nr);
if (make_dirty && !folio_test_dirty(folio)) {
@@ -474,29 +476,15 @@ EXPORT_SYMBOL_GPL(unpin_folios);
* lifecycle. Avoid setting the bit unless necessary, or it might cause write
* cache bouncing on large SMP machines for concurrent pinned gups.
*/
-static inline void mm_set_has_pinned_flag(unsigned long *mm_flags)
+static inline void mm_set_has_pinned_flag(struct mm_struct *mm)
{
- if (!test_bit(MMF_HAS_PINNED, mm_flags))
- set_bit(MMF_HAS_PINNED, mm_flags);
+ if (!mm_flags_test(MMF_HAS_PINNED, mm))
+ mm_flags_set(MMF_HAS_PINNED, mm);
}
#ifdef CONFIG_MMU
#ifdef CONFIG_HAVE_GUP_FAST
-static int record_subpages(struct page *page, unsigned long sz,
- unsigned long addr, unsigned long end,
- struct page **pages)
-{
- struct page *start_page;
- int nr;
-
- start_page = nth_page(page, (addr & (sz - 1)) >> PAGE_SHIFT);
- for (nr = 0; addr != end; nr++, addr += PAGE_SIZE)
- pages[nr] = nth_page(start_page, nr);
-
- return nr;
-}
-
/**
* try_grab_folio_fast() - Attempt to get or pin a folio in fast path.
* @page: pointer to page to be grabbed
@@ -660,7 +648,7 @@ static inline bool can_follow_write_pud(pud_t pud, struct page *page,
static struct page *follow_huge_pud(struct vm_area_struct *vma,
unsigned long addr, pud_t *pudp,
- int flags, struct follow_page_context *ctx)
+ int flags, unsigned long *page_mask)
{
struct mm_struct *mm = vma->vm_mm;
struct page *page;
@@ -678,38 +666,16 @@ static struct page *follow_huge_pud(struct vm_area_struct *vma,
return NULL;
pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
-
- if (IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
- pud_devmap(pud)) {
- /*
- * device mapped pages can only be returned if the caller
- * will manage the page reference count.
- *
- * At least one of FOLL_GET | FOLL_PIN must be set, so
- * assert that here:
- */
- if (!(flags & (FOLL_GET | FOLL_PIN)))
- return ERR_PTR(-EEXIST);
-
- if (flags & FOLL_TOUCH)
- touch_pud(vma, addr, pudp, flags & FOLL_WRITE);
-
- ctx->pgmap = get_dev_pagemap(pfn, ctx->pgmap);
- if (!ctx->pgmap)
- return ERR_PTR(-EFAULT);
- }
-
page = pfn_to_page(pfn);
- if (!pud_devmap(pud) && !pud_write(pud) &&
- gup_must_unshare(vma, flags, page))
+ if (!pud_write(pud) && gup_must_unshare(vma, flags, page))
return ERR_PTR(-EMLINK);
ret = try_grab_folio(page_folio(page), 1, flags);
if (ret)
page = ERR_PTR(ret);
else
- ctx->page_mask = HPAGE_PUD_NR - 1;
+ *page_mask = HPAGE_PUD_NR - 1;
return page;
}
@@ -735,7 +701,7 @@ static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
static struct page *follow_huge_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmd,
unsigned int flags,
- struct follow_page_context *ctx)
+ unsigned long *page_mask)
{
struct mm_struct *mm = vma->vm_mm;
pmd_t pmdval = *pmd;
@@ -759,8 +725,8 @@ static struct page *follow_huge_pmd(struct vm_area_struct *vma,
if (!pmd_write(pmdval) && gup_must_unshare(vma, flags, page))
return ERR_PTR(-EMLINK);
- VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
- !PageAnonExclusive(page), page);
+ VM_WARN_ON_ONCE_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
+ !PageAnonExclusive(page), page);
ret = try_grab_folio(page_folio(page), 1, flags);
if (ret)
@@ -772,7 +738,7 @@ static struct page *follow_huge_pmd(struct vm_area_struct *vma,
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
- ctx->page_mask = HPAGE_PMD_NR - 1;
+ *page_mask = HPAGE_PMD_NR - 1;
return page;
}
@@ -780,7 +746,7 @@ static struct page *follow_huge_pmd(struct vm_area_struct *vma,
#else /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */
static struct page *follow_huge_pud(struct vm_area_struct *vma,
unsigned long addr, pud_t *pudp,
- int flags, struct follow_page_context *ctx)
+ int flags, unsigned long *page_mask)
{
return NULL;
}
@@ -788,7 +754,7 @@ static struct page *follow_huge_pud(struct vm_area_struct *vma,
static struct page *follow_huge_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmd,
unsigned int flags,
- struct follow_page_context *ctx)
+ unsigned long *page_mask)
{
return NULL;
}
@@ -834,8 +800,7 @@ static inline bool can_follow_write_pte(pte_t pte, struct page *page,
}
static struct page *follow_page_pte(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd, unsigned int flags,
- struct dev_pagemap **pgmap)
+ unsigned long address, pmd_t *pmd, unsigned int flags)
{
struct mm_struct *mm = vma->vm_mm;
struct folio *folio;
@@ -844,11 +809,6 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
pte_t *ptep, pte;
int ret;
- /* FOLL_GET and FOLL_PIN are mutually exclusive. */
- if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
- (FOLL_PIN | FOLL_GET)))
- return ERR_PTR(-EINVAL);
-
ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!ptep)
return no_page_table(vma, flags, address);
@@ -861,8 +821,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
page = vm_normal_page(vma, address, pte);
/*
- * We only care about anon pages in can_follow_write_pte() and don't
- * have to worry about pte_devmap() because they are never anon.
+ * We only care about anon pages in can_follow_write_pte().
*/
if ((flags & FOLL_WRITE) &&
!can_follow_write_pte(pte, page, vma, flags)) {
@@ -870,18 +829,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
goto out;
}
- if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
- /*
- * Only return device mapping pages in the FOLL_GET or FOLL_PIN
- * case since they are only valid while holding the pgmap
- * reference.
- */
- *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
- if (*pgmap)
- page = pte_page(pte);
- else
- goto no_page;
- } else if (unlikely(!page)) {
+ if (unlikely(!page)) {
if (flags & FOLL_DUMP) {
/* Avoid special (like zero) pages in core dumps */
page = ERR_PTR(-EFAULT);
@@ -903,8 +851,8 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
goto out;
}
- VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
- !PageAnonExclusive(page), page);
+ VM_WARN_ON_ONCE_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
+ !PageAnonExclusive(page), page);
/* try_grab_folio() does nothing unless FOLL_GET or FOLL_PIN is set. */
ret = try_grab_folio(folio, 1, flags);
@@ -950,7 +898,7 @@ no_page:
static struct page *follow_pmd_mask(struct vm_area_struct *vma,
unsigned long address, pud_t *pudp,
unsigned int flags,
- struct follow_page_context *ctx)
+ unsigned long *page_mask)
{
pmd_t *pmd, pmdval;
spinlock_t *ptl;
@@ -963,16 +911,8 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
return no_page_table(vma, flags, address);
if (!pmd_present(pmdval))
return no_page_table(vma, flags, address);
- if (pmd_devmap(pmdval)) {
- ptl = pmd_lock(mm, pmd);
- page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
- spin_unlock(ptl);
- if (page)
- return page;
- return no_page_table(vma, flags, address);
- }
if (likely(!pmd_leaf(pmdval)))
- return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
+ return follow_page_pte(vma, address, pmd, flags);
if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags))
return no_page_table(vma, flags, address);
@@ -985,16 +925,16 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
}
if (unlikely(!pmd_leaf(pmdval))) {
spin_unlock(ptl);
- return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
+ return follow_page_pte(vma, address, pmd, flags);
}
if (pmd_trans_huge(pmdval) && (flags & FOLL_SPLIT_PMD)) {
spin_unlock(ptl);
split_huge_pmd(vma, pmd, address);
/* If pmd was left empty, stuff a page table in there quickly */
return pte_alloc(mm, pmd) ? ERR_PTR(-ENOMEM) :
- follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
+ follow_page_pte(vma, address, pmd, flags);
}
- page = follow_huge_pmd(vma, address, pmd, flags, ctx);
+ page = follow_huge_pmd(vma, address, pmd, flags, page_mask);
spin_unlock(ptl);
return page;
}
@@ -1002,7 +942,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
static struct page *follow_pud_mask(struct vm_area_struct *vma,
unsigned long address, p4d_t *p4dp,
unsigned int flags,
- struct follow_page_context *ctx)
+ unsigned long *page_mask)
{
pud_t *pudp, pud;
spinlock_t *ptl;
@@ -1015,7 +955,7 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
return no_page_table(vma, flags, address);
if (pud_leaf(pud)) {
ptl = pud_lock(mm, pudp);
- page = follow_huge_pud(vma, address, pudp, flags, ctx);
+ page = follow_huge_pud(vma, address, pudp, flags, page_mask);
spin_unlock(ptl);
if (page)
return page;
@@ -1024,13 +964,13 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
if (unlikely(pud_bad(pud)))
return no_page_table(vma, flags, address);
- return follow_pmd_mask(vma, address, pudp, flags, ctx);
+ return follow_pmd_mask(vma, address, pudp, flags, page_mask);
}
static struct page *follow_p4d_mask(struct vm_area_struct *vma,
unsigned long address, pgd_t *pgdp,
unsigned int flags,
- struct follow_page_context *ctx)
+ unsigned long *page_mask)
{
p4d_t *p4dp, p4d;
@@ -1041,7 +981,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
if (!p4d_present(p4d) || p4d_bad(p4d))
return no_page_table(vma, flags, address);
- return follow_pud_mask(vma, address, p4dp, flags, ctx);
+ return follow_pud_mask(vma, address, p4dp, flags, page_mask);
}
/**
@@ -1049,20 +989,16 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
* @vma: vm_area_struct mapping @address
* @address: virtual address to look up
* @flags: flags modifying lookup behaviour
- * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
- * pointer to output page_mask
+ * @page_mask: a pointer to output page_mask
*
* @flags can have FOLL_ flags set, defined in <linux/mm.h>
*
- * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
- * the device's dev_pagemap metadata to avoid repeating expensive lookups.
- *
* When getting an anonymous page and the caller has to trigger unsharing
* of a shared anonymous page first, -EMLINK is returned. The caller should
* trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only
* relevant with FOLL_PIN and !FOLL_WRITE.
*
- * On output, the @ctx->page_mask is set according to the size of the page.
+ * On output, @page_mask is set according to the size of the page.
*
* Return: the mapped (struct page *), %NULL if no mapping exists, or
* an error pointer if there is a mapping to something not represented
@@ -1070,7 +1006,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
*/
static struct page *follow_page_mask(struct vm_area_struct *vma,
unsigned long address, unsigned int flags,
- struct follow_page_context *ctx)
+ unsigned long *page_mask)
{
pgd_t *pgd;
struct mm_struct *mm = vma->vm_mm;
@@ -1078,13 +1014,13 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
vma_pgtable_walk_begin(vma);
- ctx->page_mask = 0;
+ *page_mask = 0;
pgd = pgd_offset(mm, address);
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
page = no_page_table(vma, flags, address);
else
- page = follow_p4d_mask(vma, address, pgd, flags, ctx);
+ page = follow_p4d_mask(vma, address, pgd, flags, page_mask);
vma_pgtable_walk_end(vma);
@@ -1106,10 +1042,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
/* user gate pages are read-only */
if (gup_flags & FOLL_WRITE)
return -EFAULT;
- if (address > TASK_SIZE)
- pgd = pgd_offset_k(address);
- else
- pgd = pgd_offset_gate(mm, address);
+ pgd = pgd_offset(mm, address);
if (pgd_none(*pgd))
return -EFAULT;
p4d = p4d_offset(pgd, address);
@@ -1187,7 +1120,7 @@ static int faultin_page(struct vm_area_struct *vma,
if (unshare) {
fault_flags |= FAULT_FLAG_UNSHARE;
/* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */
- VM_BUG_ON(fault_flags & FAULT_FLAG_WRITE);
+ VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_WRITE);
}
ret = handle_mm_fault(vma, address, fault_flags, NULL);
@@ -1425,14 +1358,18 @@ static long __get_user_pages(struct mm_struct *mm,
{
long ret = 0, i = 0;
struct vm_area_struct *vma = NULL;
- struct follow_page_context ctx = { NULL };
+ unsigned long page_mask = 0;
if (!nr_pages)
return 0;
start = untagged_addr_remote(mm, start);
- VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
+ VM_WARN_ON_ONCE(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
+
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ VM_WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) ==
+ (FOLL_PIN | FOLL_GET));
do {
struct page *page;
@@ -1463,7 +1400,7 @@ static long __get_user_pages(struct mm_struct *mm,
pages ? &page : NULL);
if (ret)
goto out;
- ctx.page_mask = 0;
+ page_mask = 0;
goto next_page;
}
@@ -1486,7 +1423,7 @@ retry:
}
cond_resched();
- page = follow_page_mask(vma, start, gup_flags, &ctx);
+ page = follow_page_mask(vma, start, gup_flags, &page_mask);
if (!page || PTR_ERR(page) == -EMLINK) {
ret = faultin_page(vma, start, gup_flags,
PTR_ERR(page) == -EMLINK, locked);
@@ -1519,7 +1456,7 @@ retry:
goto out;
}
next_page:
- page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
+ page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
if (page_increm > nr_pages)
page_increm = nr_pages;
@@ -1557,7 +1494,7 @@ next_page:
}
for (j = 0; j < page_increm; j++) {
- subpage = nth_page(page, j);
+ subpage = page + j;
pages[i + j] = subpage;
flush_anon_page(vma, subpage, start + j * PAGE_SIZE);
flush_dcache_page(subpage);
@@ -1569,8 +1506,6 @@ next_page:
nr_pages -= page_increm;
} while (nr_pages);
out:
- if (ctx.pgmap)
- put_dev_pagemap(ctx.pgmap);
return i ? i : ret;
}
@@ -1738,7 +1673,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
mmap_assert_locked(mm);
if (flags & FOLL_PIN)
- mm_set_has_pinned_flag(&mm->flags);
+ mm_set_has_pinned_flag(mm);
/*
* FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
@@ -1763,10 +1698,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
}
/* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */
- if (!*locked) {
- BUG_ON(ret < 0);
- BUG_ON(ret >= nr_pages);
- }
+ VM_WARN_ON_ONCE(!*locked && (ret < 0 || ret >= nr_pages));
if (ret > 0) {
nr_pages -= ret;
@@ -1811,7 +1743,6 @@ retry:
ret = mmap_read_lock_killable(mm);
if (ret) {
- BUG_ON(ret > 0);
if (!pages_done)
pages_done = ret;
break;
@@ -1822,11 +1753,11 @@ retry:
pages, locked);
if (!*locked) {
/* Continue to retry until we succeeded */
- BUG_ON(ret != 0);
+ VM_WARN_ON_ONCE(ret != 0);
goto retry;
}
if (ret != 1) {
- BUG_ON(ret > 1);
+ VM_WARN_ON_ONCE(ret > 1);
if (!pages_done)
pages_done = ret;
break;
@@ -1888,10 +1819,10 @@ long populate_vma_page_range(struct vm_area_struct *vma,
int gup_flags;
long ret;
- VM_BUG_ON(!PAGE_ALIGNED(start));
- VM_BUG_ON(!PAGE_ALIGNED(end));
- VM_BUG_ON_VMA(start < vma->vm_start, vma);
- VM_BUG_ON_VMA(end > vma->vm_end, vma);
+ VM_WARN_ON_ONCE(!PAGE_ALIGNED(start));
+ VM_WARN_ON_ONCE(!PAGE_ALIGNED(end));
+ VM_WARN_ON_ONCE_VMA(start < vma->vm_start, vma);
+ VM_WARN_ON_ONCE_VMA(end > vma->vm_end, vma);
mmap_assert_locked(mm);
/*
@@ -1960,8 +1891,8 @@ long faultin_page_range(struct mm_struct *mm, unsigned long start,
int gup_flags;
long ret;
- VM_BUG_ON(!PAGE_ALIGNED(start));
- VM_BUG_ON(!PAGE_ALIGNED(end));
+ VM_WARN_ON_ONCE(!PAGE_ALIGNED(start));
+ VM_WARN_ON_ONCE(!PAGE_ALIGNED(end));
mmap_assert_locked(mm);
/*
@@ -2051,7 +1982,7 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
{
struct vm_area_struct *vma;
bool must_unlock = false;
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
long i;
if (!nr_pages)
@@ -2114,28 +2045,22 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
*/
size_t fault_in_writeable(char __user *uaddr, size_t size)
{
- char __user *start = uaddr, *end;
+ const unsigned long start = (unsigned long)uaddr;
+ const unsigned long end = start + size;
+ unsigned long cur;
if (unlikely(size == 0))
return 0;
if (!user_write_access_begin(uaddr, size))
return size;
- if (!PAGE_ALIGNED(uaddr)) {
- unsafe_put_user(0, uaddr, out);
- uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
- }
- end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
- if (unlikely(end < start))
- end = NULL;
- while (uaddr != end) {
- unsafe_put_user(0, uaddr, out);
- uaddr += PAGE_SIZE;
- }
+ /* Stop once we overflow to 0. */
+ for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
+ unsafe_put_user(0, (char __user *)cur, out);
out:
user_write_access_end();
- if (size > uaddr - start)
- return size - (uaddr - start);
+ if (size > cur - start)
+ return size - (cur - start);
return 0;
}
EXPORT_SYMBOL(fault_in_writeable);
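The rewritten fault-in helpers above all share one stepping pattern: touch one byte per page, advance to the next page boundary with PAGE_ALIGN_DOWN(cur + PAGE_SIZE), and stop if the address wraps to zero. Reduced to a skeleton (illustrative only; the per-page probe and the size == 0 guard are elided):

	unsigned long cur;
	const unsigned long start = (unsigned long)uaddr;
	const unsigned long end = start + size;

	/* The first iteration handles a misaligned start; a wrap to 0 ends the walk. */
	for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE)) {
		/* probe one byte of the page at 'cur' (put_user/get_user/fixup variant) */
	}

	/* On early exit, report the bytes that were not faulted in. */
	return size > cur - start ? size - (cur - start) : 0;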
@@ -2189,26 +2114,24 @@ EXPORT_SYMBOL(fault_in_subpage_writeable);
*/
size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
{
- unsigned long start = (unsigned long)uaddr, end;
+ const unsigned long start = (unsigned long)uaddr;
+ const unsigned long end = start + size;
+ unsigned long cur;
struct mm_struct *mm = current->mm;
bool unlocked = false;
if (unlikely(size == 0))
return 0;
- end = PAGE_ALIGN(start + size);
- if (end < start)
- end = 0;
mmap_read_lock(mm);
- do {
- if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked))
+ /* Stop once we overflow to 0. */
+ for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
+ if (fixup_user_fault(mm, cur, FAULT_FLAG_WRITE, &unlocked))
break;
- start = (start + PAGE_SIZE) & PAGE_MASK;
- } while (start != end);
mmap_read_unlock(mm);
- if (size > start - (unsigned long)uaddr)
- return size - (start - (unsigned long)uaddr);
+ if (size > cur - start)
+ return size - (cur - start);
return 0;
}
EXPORT_SYMBOL(fault_in_safe_writeable);
@@ -2223,30 +2146,24 @@ EXPORT_SYMBOL(fault_in_safe_writeable);
*/
size_t fault_in_readable(const char __user *uaddr, size_t size)
{
- const char __user *start = uaddr, *end;
+ const unsigned long start = (unsigned long)uaddr;
+ const unsigned long end = start + size;
+ unsigned long cur;
volatile char c;
if (unlikely(size == 0))
return 0;
if (!user_read_access_begin(uaddr, size))
return size;
- if (!PAGE_ALIGNED(uaddr)) {
- unsafe_get_user(c, uaddr, out);
- uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
- }
- end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
- if (unlikely(end < start))
- end = NULL;
- while (uaddr != end) {
- unsafe_get_user(c, uaddr, out);
- uaddr += PAGE_SIZE;
- }
+ /* Stop once we overflow to 0. */
+ for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
+ unsafe_get_user(c, (const char __user *)cur, out);
out:
user_read_access_end();
(void)c;
- if (size > uaddr - start)
- return size - (uaddr - start);
+ if (size > cur - start)
+ return size - (cur - start);
return 0;
}
EXPORT_SYMBOL(fault_in_readable);
@@ -2317,27 +2234,51 @@ static void pofs_unpin(struct pages_or_folios *pofs)
unpin_user_pages(pofs->pages, pofs->nr_entries);
}
+static struct folio *pofs_next_folio(struct folio *folio,
+ struct pages_or_folios *pofs, long *index_ptr)
+{
+ long i = *index_ptr + 1;
+
+ if (!pofs->has_folios && folio_test_large(folio)) {
+ const unsigned long start_pfn = folio_pfn(folio);
+ const unsigned long end_pfn = start_pfn + folio_nr_pages(folio);
+
+ for (; i < pofs->nr_entries; i++) {
+ unsigned long pfn = page_to_pfn(pofs->pages[i]);
+
+ /* Is this page part of this folio? */
+ if (pfn < start_pfn || pfn >= end_pfn)
+ break;
+ }
+ }
+
+ if (unlikely(i == pofs->nr_entries))
+ return NULL;
+ *index_ptr = i;
+
+ return pofs_get_folio(pofs, i);
+}
+
/*
* Returns the number of collected folios. Return value is always >= 0.
*/
-static void collect_longterm_unpinnable_folios(
+static unsigned long collect_longterm_unpinnable_folios(
struct list_head *movable_folio_list,
struct pages_or_folios *pofs)
{
- struct folio *prev_folio = NULL;
- bool drain_allow = true;
- unsigned long i;
-
- for (i = 0; i < pofs->nr_entries; i++) {
- struct folio *folio = pofs_get_folio(pofs, i);
+ unsigned long collected = 0;
+ struct folio *folio;
+ int drained = 0;
+ long i = 0;
- if (folio == prev_folio)
- continue;
- prev_folio = folio;
+ for (folio = pofs_get_folio(pofs, i); folio;
+ folio = pofs_next_folio(folio, pofs, &i)) {
if (folio_is_longterm_pinnable(folio))
continue;
+ collected++;
+
if (folio_is_device_coherent(folio))
continue;
@@ -2346,9 +2287,17 @@ static void collect_longterm_unpinnable_folios(
continue;
}
- if (!folio_test_lru(folio) && drain_allow) {
+ if (drained == 0 && folio_may_be_lru_cached(folio) &&
+ folio_ref_count(folio) !=
+ folio_expected_ref_count(folio) + 1) {
+ lru_add_drain();
+ drained = 1;
+ }
+ if (drained == 1 && folio_may_be_lru_cached(folio) &&
+ folio_ref_count(folio) !=
+ folio_expected_ref_count(folio) + 1) {
lru_add_drain_all();
- drain_allow = false;
+ drained = 2;
}
if (!folio_isolate_lru(folio))
@@ -2359,6 +2308,8 @@ static void collect_longterm_unpinnable_folios(
NR_ISOLATED_ANON + folio_is_file_lru(folio),
folio_nr_pages(folio));
}
+
+ return collected;
}
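The collection loop now escalates LRU draining in two stages instead of unconditionally draining all CPUs once: first the local pagevecs, then every CPU, and only while a folio still looks LRU-cached with unexpected references. A sketch of that escalation factored out for clarity; folio_needs_drain() is illustrative shorthand, not a kernel helper:

static bool folio_needs_drain(struct folio *folio)
{
	return folio_may_be_lru_cached(folio) &&
	       folio_ref_count(folio) != folio_expected_ref_count(folio) + 1;
}

static void drain_for_folio(struct folio *folio, int *drained)
{
	if (*drained == 0 && folio_needs_drain(folio)) {
		lru_add_drain();	/* cheap: this CPU only */
		*drained = 1;
	}
	if (*drained == 1 && folio_needs_drain(folio)) {
		lru_add_drain_all();	/* expensive: work on all CPUs */
		*drained = 2;
	}
}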
/*
@@ -2435,9 +2386,11 @@ static long
check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs)
{
LIST_HEAD(movable_folio_list);
+ unsigned long collected;
- collect_longterm_unpinnable_folios(&movable_folio_list, pofs);
- if (list_empty(&movable_folio_list))
+ collected = collect_longterm_unpinnable_folios(&movable_folio_list,
+ pofs);
+ if (!collected)
return 0;
return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs);
@@ -2839,9 +2792,9 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
return false;
/* Anonymous folios pose no problem. */
- mapping_flags = (unsigned long)mapping & PAGE_MAPPING_FLAGS;
+ mapping_flags = (unsigned long)mapping & FOLIO_MAPPING_FLAGS;
if (mapping_flags)
- return mapping_flags & PAGE_MAPPING_ANON;
+ return mapping_flags & FOLIO_MAPPING_ANON;
/*
* At this point, we know the mapping is non-null and points to an
@@ -2888,8 +2841,7 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
unsigned long end, unsigned int flags, struct page **pages,
int *nr)
{
- struct dev_pagemap *pgmap = NULL;
- int nr_start = *nr, ret = 0;
+ int ret = 0;
pte_t *ptep, *ptem;
ptem = ptep = pte_offset_map(&pmd, addr);
@@ -2913,19 +2865,11 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
if (!pte_access_permitted(pte, flags & FOLL_WRITE))
goto pte_unmap;
- if (pte_devmap(pte)) {
- if (unlikely(flags & FOLL_LONGTERM))
- goto pte_unmap;
-
- pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
- if (unlikely(!pgmap)) {
- gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
- goto pte_unmap;
- }
- } else if (pte_special(pte))
+ if (pte_special(pte))
goto pte_unmap;
- VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+ /* If it's not marked as special it must have a valid memmap. */
+ VM_WARN_ON_ONCE(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
folio = try_grab_folio_fast(page, 1, flags);
@@ -2954,12 +2898,9 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
* see Documentation/core-api/pin_user_pages.rst for
* details.
*/
- if (flags & FOLL_PIN) {
- ret = arch_make_folio_accessible(folio);
- if (ret) {
- gup_put_folio(folio, 1, flags);
- goto pte_unmap;
- }
+ if ((flags & FOLL_PIN) && arch_make_folio_accessible(folio)) {
+ gup_put_folio(folio, 1, flags);
+ goto pte_unmap;
}
folio_set_referenced(folio);
pages[*nr] = page;
@@ -2969,8 +2910,6 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
ret = 1;
pte_unmap:
- if (pgmap)
- put_dev_pagemap(pgmap);
pte_unmap(ptem);
return ret;
}
@@ -2993,91 +2932,6 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
}
#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
-#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
-static int gup_fast_devmap_leaf(unsigned long pfn, unsigned long addr,
- unsigned long end, unsigned int flags, struct page **pages, int *nr)
-{
- int nr_start = *nr;
- struct dev_pagemap *pgmap = NULL;
-
- do {
- struct folio *folio;
- struct page *page = pfn_to_page(pfn);
-
- pgmap = get_dev_pagemap(pfn, pgmap);
- if (unlikely(!pgmap)) {
- gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
- break;
- }
-
- folio = try_grab_folio_fast(page, 1, flags);
- if (!folio) {
- gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
- break;
- }
- folio_set_referenced(folio);
- pages[*nr] = page;
- (*nr)++;
- pfn++;
- } while (addr += PAGE_SIZE, addr != end);
-
- put_dev_pagemap(pgmap);
- return addr == end;
-}
-
-static int gup_fast_devmap_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
- unsigned long end, unsigned int flags, struct page **pages,
- int *nr)
-{
- unsigned long fault_pfn;
- int nr_start = *nr;
-
- fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- if (!gup_fast_devmap_leaf(fault_pfn, addr, end, flags, pages, nr))
- return 0;
-
- if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
- gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
- return 0;
- }
- return 1;
-}
-
-static int gup_fast_devmap_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
- unsigned long end, unsigned int flags, struct page **pages,
- int *nr)
-{
- unsigned long fault_pfn;
- int nr_start = *nr;
-
- fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- if (!gup_fast_devmap_leaf(fault_pfn, addr, end, flags, pages, nr))
- return 0;
-
- if (unlikely(pud_val(orig) != pud_val(*pudp))) {
- gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
- return 0;
- }
- return 1;
-}
-#else
-static int gup_fast_devmap_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
- unsigned long end, unsigned int flags, struct page **pages,
- int *nr)
-{
- BUILD_BUG();
- return 0;
-}
-
-static int gup_fast_devmap_pud_leaf(pud_t pud, pud_t *pudp, unsigned long addr,
- unsigned long end, unsigned int flags, struct page **pages,
- int *nr)
-{
- BUILD_BUG();
- return 0;
-}
-#endif
-
static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, unsigned int flags, struct page **pages,
int *nr)
@@ -3092,15 +2946,8 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
if (pmd_special(orig))
return 0;
- if (pmd_devmap(orig)) {
- if (unlikely(flags & FOLL_LONGTERM))
- return 0;
- return gup_fast_devmap_pmd_leaf(orig, pmdp, addr, end, flags,
- pages, nr);
- }
-
- page = pmd_page(orig);
- refs = record_subpages(page, PMD_SIZE, addr, end, pages + *nr);
+ refs = (end - addr) >> PAGE_SHIFT;
+ page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
folio = try_grab_folio_fast(page, refs, flags);
if (!folio)
@@ -3120,7 +2967,10 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return 0;
}
+ pages += *nr;
*nr += refs;
+ for (; refs; refs--)
+ *(pages++) = page++;
folio_set_referenced(folio);
return 1;
}
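The leaf helpers now derive the subpage count and starting page directly from the faulting range instead of going through record_subpages(). A worked example of the arithmetic, assuming 4 KiB pages and a 2 MiB PMD:

/*
 * For a GUP-fast walk entering the PMD mapping at addr = haddr + 1 MiB
 * and ending at end = haddr + 2 MiB:
 *
 *	refs = (end - addr) >> PAGE_SHIFT                  = 256
 *	page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT)
 *	     = pmd_page(orig) + 256
 *
 * so the fill loop records the upper half of the PMD folio's subpages
 * into pages[*nr .. *nr + 255].
 */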
@@ -3139,15 +2989,8 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
if (pud_special(orig))
return 0;
- if (pud_devmap(orig)) {
- if (unlikely(flags & FOLL_LONGTERM))
- return 0;
- return gup_fast_devmap_pud_leaf(orig, pudp, addr, end, flags,
- pages, nr);
- }
-
- page = pud_page(orig);
- refs = record_subpages(page, PUD_SIZE, addr, end, pages + *nr);
+ refs = (end - addr) >> PAGE_SHIFT;
+ page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
folio = try_grab_folio_fast(page, refs, flags);
if (!folio)
@@ -3168,47 +3011,10 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
return 0;
}
+ pages += *nr;
*nr += refs;
- folio_set_referenced(folio);
- return 1;
-}
-
-static int gup_fast_pgd_leaf(pgd_t orig, pgd_t *pgdp, unsigned long addr,
- unsigned long end, unsigned int flags, struct page **pages,
- int *nr)
-{
- int refs;
- struct page *page;
- struct folio *folio;
-
- if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
- return 0;
-
- BUILD_BUG_ON(pgd_devmap(orig));
-
- page = pgd_page(orig);
- refs = record_subpages(page, PGDIR_SIZE, addr, end, pages + *nr);
-
- folio = try_grab_folio_fast(page, refs, flags);
- if (!folio)
- return 0;
-
- if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
- gup_put_folio(folio, refs, flags);
- return 0;
- }
-
- if (!pgd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
- gup_put_folio(folio, refs, flags);
- return 0;
- }
-
- if (!gup_fast_folio_allowed(folio, flags)) {
- gup_put_folio(folio, refs, flags);
- return 0;
- }
-
- *nr += refs;
+ for (; refs; refs--)
+ *(pages++) = page++;
folio_set_referenced(folio);
return 1;
}
@@ -3307,12 +3113,9 @@ static void gup_fast_pgd_range(unsigned long addr, unsigned long end,
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
return;
- if (unlikely(pgd_leaf(pgd))) {
- if (!gup_fast_pgd_leaf(pgd, pgdp, addr, next, flags,
- pages, nr))
- return;
- } else if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags,
- pages, nr))
+ BUILD_BUG_ON(pgd_leaf(pgd));
+ if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags,
+ pages, nr))
return;
} while (pgdp++, addr = next, addr != end);
}
@@ -3359,7 +3162,7 @@ static unsigned long gup_fast(unsigned long start, unsigned long end,
* include/asm-generic/tlb.h for more details.
*
* We do not adopt an rcu_read_lock() here as we also want to block IPIs
- * that come from THPs splitting.
+ * that come from callers of tlb_remove_table_sync_one().
*/
local_irq_save(flags);
gup_fast_pgd_range(start, end, gup_flags, pages, &nr_pinned);
@@ -3395,7 +3198,7 @@ static int gup_fast_fallback(unsigned long start, unsigned long nr_pages,
return -EINVAL;
if (gup_flags & FOLL_PIN)
- mm_set_has_pinned_flag(&current->mm->flags);
+ mm_set_has_pinned_flag(current->mm);
if (!(gup_flags & FOLL_FAST_ONLY))
might_lock_read(&current->mm->mmap_lock);
@@ -3647,7 +3450,7 @@ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
{
unsigned int flags, nr_folios, nr_found;
unsigned int i, pgshift = PAGE_SHIFT;
- pgoff_t start_idx, end_idx, next_idx;
+ pgoff_t start_idx, end_idx;
struct folio *folio = NULL;
struct folio_batch fbatch;
struct hstate *h;
@@ -3697,20 +3500,8 @@ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
folio = NULL;
}
- next_idx = 0;
for (i = 0; i < nr_found; i++) {
- /*
- * As there can be multiple entries for a
- * given folio in the batch returned by
- * filemap_get_folios_contig(), the below
- * check is to ensure that we pin and return a
- * unique set of folios between start and end.
- */
- if (next_idx &&
- next_idx != folio_index(fbatch.folios[i]))
- continue;
-
- folio = page_folio(&fbatch.folios[i]->page);
+ folio = fbatch.folios[i];
if (try_grab_folio(folio, 1, FOLL_PIN)) {
folio_batch_release(&fbatch);
@@ -3722,7 +3513,6 @@ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
*offset = offset_in_folio(folio, start);
folios[nr_folios] = folio;
- next_idx = folio_next_index(folio);
if (++nr_folios == max_folios)
break;
}
diff --git a/mm/highmem.c b/mm/highmem.c
index ef3189b36cad..b5c8e4c2d5d4 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -61,7 +61,7 @@ static inline int kmap_local_calc_idx(int idx)
/*
* Determine color of virtual address where the page should be mapped.
*/
-static inline unsigned int get_pkmap_color(struct page *page)
+static inline unsigned int get_pkmap_color(const struct page *page)
{
return 0;
}
@@ -334,7 +334,7 @@ EXPORT_SYMBOL(kmap_high);
*
* This can be called from any context.
*/
-void *kmap_high_get(struct page *page)
+void *kmap_high_get(const struct page *page)
{
unsigned long vaddr, flags;
@@ -356,7 +356,7 @@ void *kmap_high_get(struct page *page)
* If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called
* only from user context.
*/
-void kunmap_high(struct page *page)
+void kunmap_high(const struct page *page)
{
unsigned long vaddr;
unsigned long nr;
@@ -508,7 +508,7 @@ static inline void kmap_local_idx_pop(void)
#endif
#ifndef arch_kmap_local_high_get
-static inline void *arch_kmap_local_high_get(struct page *page)
+static inline void *arch_kmap_local_high_get(const struct page *page)
{
return NULL;
}
@@ -572,7 +572,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot)
}
EXPORT_SYMBOL_GPL(__kmap_local_pfn_prot);
-void *__kmap_local_page_prot(struct page *page, pgprot_t prot)
+void *__kmap_local_page_prot(const struct page *page, pgprot_t prot)
{
void *kmap;
diff --git a/mm/hmm.c b/mm/hmm.c
index 082f7b7c0b9e..87562914670a 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -10,6 +10,7 @@
*/
#include <linux/pagewalk.h>
#include <linux/hmm.h>
+#include <linux/hmm-dma.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
@@ -23,6 +24,7 @@
#include <linux/sched/mm.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
+#include <linux/pci-p2pdma.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>
@@ -39,13 +41,21 @@ enum {
HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
};
+enum {
+ /* These flags are carried from input-to-output */
+ HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA |
+ HMM_PFN_P2PDMA_BUS,
+};
+
static int hmm_pfns_fill(unsigned long addr, unsigned long end,
struct hmm_range *range, unsigned long cpu_flags)
{
unsigned long i = (addr - range->start) >> PAGE_SHIFT;
- for (; addr < end; addr += PAGE_SIZE, i++)
- range->hmm_pfns[i] = cpu_flags;
+ for (; addr < end; addr += PAGE_SIZE, i++) {
+ range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ range->hmm_pfns[i] |= cpu_flags;
+ }
return 0;
}
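Every site that writes an output entry now masks in HMM_PFN_INOUT_FLAGS first, so that DMA mapping state set by hmm_dma_map_pfn() survives a later re-fault of the range. The recurring idiom, expressed as a hypothetical helper (the patch open-codes it at each assignment site):

static inline void hmm_pfn_assign(unsigned long *hmm_pfn, unsigned long value)
{
	*hmm_pfn = (*hmm_pfn & HMM_PFN_INOUT_FLAGS) | value;
}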
@@ -173,6 +183,7 @@ static inline unsigned long hmm_pfn_flags_order(unsigned long order)
return order << HMM_PFN_ORDER_SHIFT;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range,
pmd_t pmd)
{
@@ -183,7 +194,6 @@ static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range,
hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT);
}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
unsigned long end, unsigned long hmm_pfns[],
pmd_t pmd)
@@ -202,8 +212,10 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
return hmm_vma_fault(addr, end, required_fault, walk);
pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
- hmm_pfns[i] = pfn | cpu_flags;
+ for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+ hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ hmm_pfns[i] |= pfn | cpu_flags;
+ }
return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -230,14 +242,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
unsigned long cpu_flags;
pte_t pte = ptep_get(ptep);
uint64_t pfn_req_flags = *hmm_pfn;
+ uint64_t new_pfn_flags = 0;
if (pte_none_mostly(pte)) {
required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
if (required_fault)
goto fault;
- *hmm_pfn = 0;
- return 0;
+ goto out;
}
if (!pte_present(pte)) {
@@ -253,16 +265,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
cpu_flags = HMM_PFN_VALID;
if (is_writable_device_private_entry(entry))
cpu_flags |= HMM_PFN_WRITE;
- *hmm_pfn = swp_offset_pfn(entry) | cpu_flags;
- return 0;
+ new_pfn_flags = swp_offset_pfn(entry) | cpu_flags;
+ goto out;
}
required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
- if (!required_fault) {
- *hmm_pfn = 0;
- return 0;
- }
+ if (!required_fault)
+ goto out;
if (!non_swap_entry(entry))
goto fault;
@@ -292,23 +302,22 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
goto fault;
/*
- * Bypass devmap pte such as DAX page when all pfn requested
- * flags(pfn_req_flags) are fulfilled.
* Since each architecture defines a struct page for the zero page, just
* fall through and treat it like a normal page.
*/
if (!vm_normal_page(walk->vma, addr, pte) &&
- !pte_devmap(pte) &&
!is_zero_pfn(pte_pfn(pte))) {
if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) {
pte_unmap(ptep);
return -EFAULT;
}
- *hmm_pfn = HMM_PFN_ERROR;
- return 0;
+ new_pfn_flags = HMM_PFN_ERROR;
+ goto out;
}
- *hmm_pfn = pte_pfn(pte) | cpu_flags;
+ new_pfn_flags = pte_pfn(pte) | cpu_flags;
+out:
+ *hmm_pfn = (*hmm_pfn & HMM_PFN_INOUT_FLAGS) | new_pfn_flags;
return 0;
fault:
@@ -317,6 +326,68 @@ fault:
return hmm_vma_fault(addr, end, required_fault, walk);
}
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
+ unsigned long end, unsigned long *hmm_pfns,
+ pmd_t pmd)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ unsigned long npages = (end - start) >> PAGE_SHIFT;
+ unsigned long addr = start;
+ swp_entry_t entry = pmd_to_swp_entry(pmd);
+ unsigned int required_fault;
+
+ if (is_device_private_entry(entry) &&
+ pfn_swap_entry_folio(entry)->pgmap->owner ==
+ range->dev_private_owner) {
+ unsigned long cpu_flags = HMM_PFN_VALID |
+ hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT);
+ unsigned long pfn = swp_offset_pfn(entry);
+ unsigned long i;
+
+ if (is_writable_device_private_entry(entry))
+ cpu_flags |= HMM_PFN_WRITE;
+
+ /*
+		 * Fully populate the PFN list even though subsequent PFNs
+		 * could be inferred, because drivers that are not yet aware
+		 * of large folios probably do not support sparsely populated
+		 * PFN lists.
+ */
+ for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+ hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ hmm_pfns[i] |= pfn | cpu_flags;
+ }
+
+ return 0;
+ }
+
+ required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns,
+ npages, 0);
+ if (required_fault) {
+ if (is_device_private_entry(entry))
+ return hmm_vma_fault(addr, end, required_fault, walk);
+ else
+ return -EFAULT;
+ }
+
+ return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
+}
+#else
+static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
+ unsigned long end, unsigned long *hmm_pfns,
+ pmd_t pmd)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ unsigned long npages = (end - start) >> PAGE_SHIFT;
+
+ if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
+ return -EFAULT;
+ return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
+}
+#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
+
static int hmm_vma_walk_pmd(pmd_t *pmdp,
unsigned long start,
unsigned long end,
@@ -345,13 +416,11 @@ again:
return hmm_pfns_fill(start, end, range, 0);
}
- if (!pmd_present(pmd)) {
- if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
- return -EFAULT;
- return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
- }
+ if (!pmd_present(pmd))
+ return hmm_vma_handle_absent_pmd(walk, start, end, hmm_pfns,
+ pmd);
- if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
+ if (pmd_trans_huge(pmd)) {
/*
* No need to take pmd_lock here, even if some other thread
* is splitting the huge pmd we will get that event through
@@ -362,7 +431,7 @@ again:
* values.
*/
pmd = pmdp_get_lockless(pmdp);
- if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
+ if (!pmd_trans_huge(pmd))
goto again;
return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd);
@@ -396,8 +465,7 @@ again:
return 0;
}
-#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
- defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+#if defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range,
pud_t pud)
{
@@ -429,7 +497,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
return hmm_vma_walk_hole(start, end, -1, walk);
}
- if (pud_leaf(pud) && pud_devmap(pud)) {
+ if (pud_leaf(pud)) {
unsigned long i, npages, pfn;
unsigned int required_fault;
unsigned long *hmm_pfns;
@@ -448,8 +516,10 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
}
pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- for (i = 0; i < npages; ++i, ++pfn)
- hmm_pfns[i] = pfn | cpu_flags;
+ for (i = 0; i < npages; ++i, ++pfn) {
+ hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ hmm_pfns[i] |= pfn | cpu_flags;
+ }
goto out_unlock;
}
@@ -507,8 +577,10 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
}
pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
- for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
- range->hmm_pfns[i] = pfn | cpu_flags;
+ for (; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+ range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ range->hmm_pfns[i] |= pfn | cpu_flags;
+ }
spin_unlock(ptl);
return 0;
@@ -607,3 +679,212 @@ int hmm_range_fault(struct hmm_range *range)
return ret;
}
EXPORT_SYMBOL(hmm_range_fault);
+
+/**
+ * hmm_dma_map_alloc - Allocate HMM map structure
+ * @dev: device to allocate structure for
+ * @map: HMM map to allocate
+ * @nr_entries: number of entries in the map
+ * @dma_entry_size: size of the DMA entry in the map
+ *
+ * Allocate the HMM map structure and all the lists it contains.
+ * Return 0 on success, -EOPNOTSUPP if the device requires DMA syncing or
+ * bounce buffering, or -ENOMEM on allocation failure.
+ */
+int hmm_dma_map_alloc(struct device *dev, struct hmm_dma_map *map,
+ size_t nr_entries, size_t dma_entry_size)
+{
+ bool dma_need_sync = false;
+ bool use_iova;
+
+ WARN_ON_ONCE(!(nr_entries * PAGE_SIZE / dma_entry_size));
+
+ /*
+ * The HMM API violates our normal DMA buffer ownership rules and can't
+ * transfer buffer ownership. The dma_addressing_limited() check is a
+ * best approximation to ensure no swiotlb buffering happens.
+ */
+#ifdef CONFIG_DMA_NEED_SYNC
+ dma_need_sync = !dev->dma_skip_sync;
+#endif /* CONFIG_DMA_NEED_SYNC */
+ if (dma_need_sync || dma_addressing_limited(dev))
+ return -EOPNOTSUPP;
+
+ map->dma_entry_size = dma_entry_size;
+ map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!map->pfn_list)
+ return -ENOMEM;
+
+ use_iova = dma_iova_try_alloc(dev, &map->state, 0,
+ nr_entries * PAGE_SIZE);
+ if (!use_iova && dma_need_unmap(dev)) {
+ map->dma_list = kvcalloc(nr_entries, sizeof(*map->dma_list),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!map->dma_list)
+ goto err_dma;
+ }
+ return 0;
+
+err_dma:
+ kvfree(map->pfn_list);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_alloc);
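A minimal usage sketch for the allocation half of the new interface, assuming a driver that uses one PAGE_SIZE DMA entry per page of the range (the drv_ name is illustrative):

static int drv_setup_map(struct device *dev, struct hmm_dma_map *map,
			 unsigned long npages)
{
	int ret;

	ret = hmm_dma_map_alloc(dev, map, npages, PAGE_SIZE);
	if (ret)
		return ret;	/* -EOPNOTSUPP or -ENOMEM */

	/*
	 * The caller typically points range->hmm_pfns at map->pfn_list
	 * before hmm_range_fault(), so faulted PFNs land directly in the
	 * array hmm_dma_map_pfn() consumes. hmm_dma_map_free() undoes
	 * this allocation once all entries have been unmapped.
	 */
	return 0;
}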
+
+/**
+ * hmm_dma_map_free - Free HMM map structure
+ * @dev: device to free structure from
+ * @map: HMM map containing the various lists and state
+ *
+ * Free the HMM map structure and all the lists it contains.
+ */
+void hmm_dma_map_free(struct device *dev, struct hmm_dma_map *map)
+{
+ if (dma_use_iova(&map->state))
+ dma_iova_free(dev, &map->state);
+ kvfree(map->pfn_list);
+ kvfree(map->dma_list);
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_free);
+
+/**
+ * hmm_dma_map_pfn - Map a physical HMM page to DMA address
+ * @dev: Device to map the page for
+ * @map: HMM map
+ * @idx: Index into the PFN and dma address arrays
+ * @p2pdma_state: PCI P2P state.
+ *
+ * Map the HMM PFN stored at @idx in @map to a DMA address for @dev. When
+ * hmm_dma_map_alloc() obtained an IOVA range, the page is linked at offset
+ * idx * dma_entry_size within that range; otherwise it is mapped with
+ * dma_map_phys(). If the entry is already mapped (HMM_PFN_DMA_MAPPED), the
+ * existing address is returned (or the stateless mapping is simply redone),
+ * so the function may be called again on the same entry, e.g. to upgrade a
+ * read-only prefetch-time mapping to a writable one.
+ *
+ * Returns the DMA address on success, or DMA_MAPPING_ERROR on failure.
+ */
+dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map,
+ size_t idx,
+ struct pci_p2pdma_map_state *p2pdma_state)
+{
+ struct dma_iova_state *state = &map->state;
+ dma_addr_t *dma_addrs = map->dma_list;
+ unsigned long *pfns = map->pfn_list;
+ struct page *page = hmm_pfn_to_page(pfns[idx]);
+ phys_addr_t paddr = hmm_pfn_to_phys(pfns[idx]);
+ size_t offset = idx * map->dma_entry_size;
+ unsigned long attrs = 0;
+ dma_addr_t dma_addr;
+ int ret;
+
+ if ((pfns[idx] & HMM_PFN_DMA_MAPPED) &&
+ !(pfns[idx] & HMM_PFN_P2PDMA_BUS)) {
+ /*
+		 * We reach this path when the flags need to be resynced, for
+		 * example when the page was already linked by a prefetch call
+		 * with the READ flag and the WRITE flag must now be added.
+		 *
+		 * The page is already programmed into the HW, so there is no
+		 * need to unlink and relink it just to resync the flags.
+ */
+ if (dma_use_iova(state))
+ return state->addr + offset;
+
+ /*
+ * Without dma_need_unmap, the dma_addrs array is NULL, thus we
+ * need to regenerate the address below even if there already
+		 * was a mapping. But !dma_need_unmap implies that the
+		 * mapping is stateless, so this is fine.
+ */
+ if (dma_need_unmap(dev))
+ return dma_addrs[idx];
+
+ /* Continue to remapping */
+ }
+
+ switch (pci_p2pdma_state(p2pdma_state, dev, page)) {
+ case PCI_P2PDMA_MAP_NONE:
+ break;
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ attrs |= DMA_ATTR_MMIO;
+ pfns[idx] |= HMM_PFN_P2PDMA;
+ break;
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ pfns[idx] |= HMM_PFN_P2PDMA_BUS | HMM_PFN_DMA_MAPPED;
+ return pci_p2pdma_bus_addr_map(p2pdma_state, paddr);
+ default:
+ return DMA_MAPPING_ERROR;
+ }
+
+ if (dma_use_iova(state)) {
+ ret = dma_iova_link(dev, state, paddr, offset,
+ map->dma_entry_size, DMA_BIDIRECTIONAL,
+ attrs);
+ if (ret)
+ goto error;
+
+ ret = dma_iova_sync(dev, state, offset, map->dma_entry_size);
+ if (ret) {
+ dma_iova_unlink(dev, state, offset, map->dma_entry_size,
+ DMA_BIDIRECTIONAL, attrs);
+ goto error;
+ }
+
+ dma_addr = state->addr + offset;
+ } else {
+ if (WARN_ON_ONCE(dma_need_unmap(dev) && !dma_addrs))
+ goto error;
+
+ dma_addr = dma_map_phys(dev, paddr, map->dma_entry_size,
+ DMA_BIDIRECTIONAL, attrs);
+ if (dma_mapping_error(dev, dma_addr))
+ goto error;
+
+ if (dma_need_unmap(dev))
+ dma_addrs[idx] = dma_addr;
+ }
+ pfns[idx] |= HMM_PFN_DMA_MAPPED;
+ return dma_addr;
+error:
+ pfns[idx] &= ~HMM_PFN_P2PDMA;
+	return DMA_MAPPING_ERROR;
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_pfn);
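A sketch of the per-range mapping loop a driver might run after hmm_range_fault() has filled map->pfn_list; the P2P state is zero-initialised once per batch and the drv_ name is illustrative:

static int drv_dma_map_range(struct device *dev, struct hmm_dma_map *map,
			     unsigned long npages)
{
	struct pci_p2pdma_map_state p2pdma_state = {};
	unsigned long i;

	for (i = 0; i < npages; i++) {
		dma_addr_t dma;

		if (!(map->pfn_list[i] & HMM_PFN_VALID))
			continue;

		dma = hmm_dma_map_pfn(dev, map, i, &p2pdma_state);
		if (dma_mapping_error(dev, dma))
			return -EIO;

		/* hand 'dma' to the device's page-table update here */
	}
	return 0;
}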
+
+/**
+ * hmm_dma_unmap_pfn - Unmap a physical HMM page from DMA address
+ * @dev: Device to unmap the page from
+ * @map: HMM map
+ * @idx: Index of the PFN to unmap
+ *
+ * Returns true if the PFN was mapped and has been unmapped, false otherwise.
+ */
+bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx)
+{
+ const unsigned long valid_dma = HMM_PFN_VALID | HMM_PFN_DMA_MAPPED;
+ struct dma_iova_state *state = &map->state;
+ dma_addr_t *dma_addrs = map->dma_list;
+ unsigned long *pfns = map->pfn_list;
+ unsigned long attrs = 0;
+
+ if ((pfns[idx] & valid_dma) != valid_dma)
+ return false;
+
+ if (pfns[idx] & HMM_PFN_P2PDMA)
+ attrs |= DMA_ATTR_MMIO;
+
+ if (pfns[idx] & HMM_PFN_P2PDMA_BUS)
+ ; /* no need to unmap bus address P2P mappings */
+ else if (dma_use_iova(state))
+ dma_iova_unlink(dev, state, idx * map->dma_entry_size,
+ map->dma_entry_size, DMA_BIDIRECTIONAL, attrs);
+ else if (dma_need_unmap(dev))
+ dma_unmap_phys(dev, dma_addrs[idx], map->dma_entry_size,
+ DMA_BIDIRECTIONAL, attrs);
+
+ pfns[idx] &=
+ ~(HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA | HMM_PFN_P2PDMA_BUS);
+ return true;
+}
+EXPORT_SYMBOL_GPL(hmm_dma_unmap_pfn);
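Teardown mirrors the mapping loop; hmm_dma_unmap_pfn() simply returns false for entries that never got mapped, so the caller can walk the whole range unconditionally (sketch only):

static void drv_dma_unmap_range(struct device *dev, struct hmm_dma_map *map,
				unsigned long npages)
{
	unsigned long i;

	for (i = 0; i < npages; i++)
		hmm_dma_unmap_pfn(dev, map, i);
}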
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 47d76d03ce30..1b81680b4225 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -22,7 +22,6 @@
#include <linux/mm_types.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
-#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
@@ -99,13 +98,13 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
}
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
- unsigned long vm_flags,
- unsigned long tva_flags,
+ vm_flags_t vm_flags,
+ enum tva_type type,
unsigned long orders)
{
- bool smaps = tva_flags & TVA_SMAPS;
- bool in_pf = tva_flags & TVA_IN_PF;
- bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
+ const bool smaps = type == TVA_SMAPS;
+ const bool in_pf = type == TVA_PAGEFAULT;
+ const bool forced_collapse = type == TVA_FORCED_COLLAPSE;
unsigned long supported_orders;
/* Check the intersection of requested and supported orders. */
@@ -123,7 +122,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
if (!vma->vm_mm) /* vdso */
return 0;
- if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags))
+ if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags, forced_collapse))
return 0;
/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
@@ -166,16 +165,16 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
* own flags.
*/
if (!in_pf && shmem_file(vma->vm_file))
- return shmem_allowable_huge_orders(file_inode(vma->vm_file),
+ return orders & shmem_allowable_huge_orders(file_inode(vma->vm_file),
vma, vma->vm_pgoff, 0,
- !enforce_sysfs);
+ forced_collapse);
if (!vma_is_anonymous(vma)) {
/*
- * Enforce sysfs THP requirements as necessary. Anonymous vmas
+ * Enforce THP collapse requirements as necessary. Anonymous vmas
* were already handled in thp_vma_allowable_orders().
*/
- if (enforce_sysfs &&
+ if (!forced_collapse &&
(!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
!hugepage_global_always())))
return 0;
@@ -208,7 +207,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
return orders;
}
-static bool get_huge_zero_page(void)
+static bool get_huge_zero_folio(void)
{
struct folio *zero_folio;
retry:
@@ -238,7 +237,7 @@ retry:
return true;
}
-static void put_huge_zero_page(void)
+static void put_huge_zero_folio(void)
{
/*
* Counter should never go to zero here. Only shrinker can put
@@ -249,33 +248,39 @@ static void put_huge_zero_page(void)
struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
{
- if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+ if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO))
+ return huge_zero_folio;
+
+ if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm))
return READ_ONCE(huge_zero_folio);
- if (!get_huge_zero_page())
+ if (!get_huge_zero_folio())
return NULL;
- if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
- put_huge_zero_page();
+ if (mm_flags_test_and_set(MMF_HUGE_ZERO_FOLIO, mm))
+ put_huge_zero_folio();
return READ_ONCE(huge_zero_folio);
}
void mm_put_huge_zero_folio(struct mm_struct *mm)
{
- if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
- put_huge_zero_page();
+ if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO))
+ return;
+
+ if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm))
+ put_huge_zero_folio();
}
-static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
- struct shrink_control *sc)
+static unsigned long shrink_huge_zero_folio_count(struct shrinker *shrink,
+ struct shrink_control *sc)
{
/* we can free zero page only if last reference remains */
return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
}
-static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
- struct shrink_control *sc)
+static unsigned long shrink_huge_zero_folio_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
{
if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
@@ -288,7 +293,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
return 0;
}
-static struct shrinker *huge_zero_page_shrinker;
+static struct shrinker *huge_zero_folio_shrinker;
#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
@@ -850,33 +855,47 @@ static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
static int __init thp_shrinker_init(void)
{
- huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
- if (!huge_zero_page_shrinker)
- return -ENOMEM;
-
deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
SHRINKER_MEMCG_AWARE |
SHRINKER_NONSLAB,
"thp-deferred_split");
- if (!deferred_split_shrinker) {
- shrinker_free(huge_zero_page_shrinker);
+ if (!deferred_split_shrinker)
return -ENOMEM;
- }
-
- huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
- huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
- shrinker_register(huge_zero_page_shrinker);
deferred_split_shrinker->count_objects = deferred_split_count;
deferred_split_shrinker->scan_objects = deferred_split_scan;
shrinker_register(deferred_split_shrinker);
+ if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) {
+ /*
+ * Bump the reference of the huge_zero_folio and do not
+ * initialize the shrinker.
+ *
+ * huge_zero_folio will always be NULL on failure. We assume
+ * that get_huge_zero_folio() will most likely not fail as
+ * thp_shrinker_init() is invoked early on during boot.
+ */
+ if (!get_huge_zero_folio())
+ pr_warn("Allocating persistent huge zero folio failed\n");
+ return 0;
+ }
+
+ huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero");
+ if (!huge_zero_folio_shrinker) {
+ shrinker_free(deferred_split_shrinker);
+ return -ENOMEM;
+ }
+
+ huge_zero_folio_shrinker->count_objects = shrink_huge_zero_folio_count;
+ huge_zero_folio_shrinker->scan_objects = shrink_huge_zero_folio_scan;
+ shrinker_register(huge_zero_folio_shrinker);
+
return 0;
}
static void __init thp_shrinker_exit(void)
{
- shrinker_free(huge_zero_page_shrinker);
+ shrinker_free(huge_zero_folio_shrinker);
shrinker_free(deferred_split_shrinker);
}
@@ -912,7 +931,7 @@ static int __init hugepage_init(void)
* where the extra memory used could hurt more than TLB overhead
* is likely to save. The admin can still enable it through /sys.
*/
- if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
+ if (totalram_pages() < MB_TO_PAGES(512)) {
transparent_hugepage_flags = 0;
return 0;
}
@@ -1126,7 +1145,7 @@ static unsigned long __thp_get_unmapped_area(struct file *filp,
off_sub = (off - ret) & (size - 1);
- if (test_bit(MMF_TOPDOWN, &current->mm->flags) && !off_sub)
+ if (mm_flags_test(MMF_TOPDOWN, current->mm) && !off_sub)
return ret + size;
ret += off_sub;
@@ -1203,7 +1222,7 @@ static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd,
{
pmd_t entry;
- entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
+ entry = folio_mk_pmd(folio, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
folio_add_lru_vma(folio, vma);
@@ -1309,8 +1328,8 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
struct folio *zero_folio)
{
pmd_t entry;
- entry = mk_pmd(&zero_folio->page, vma->vm_page_prot);
- entry = pmd_mkhuge(entry);
+ entry = folio_mk_pmd(zero_folio, vma->vm_page_prot);
+ entry = pmd_mkspecial(entry);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
mm_inc_nr_ptes(mm);
@@ -1373,35 +1392,64 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
return __do_huge_pmd_anonymous_page(vmf);
}
-static int insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
- pgtable_t pgtable)
+struct folio_or_pfn {
+ union {
+ struct folio *folio;
+ unsigned long pfn;
+ };
+ bool is_folio;
+};
+
+static vm_fault_t insert_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, struct folio_or_pfn fop, pgprot_t prot,
+ bool write)
{
struct mm_struct *mm = vma->vm_mm;
+ pgtable_t pgtable = NULL;
+ spinlock_t *ptl;
pmd_t entry;
- lockdep_assert_held(pmd_lockptr(mm, pmd));
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ if (arch_needs_pgtable_deposit()) {
+ pgtable = pte_alloc_one(vma->vm_mm);
+ if (!pgtable)
+ return VM_FAULT_OOM;
+ }
+ ptl = pmd_lock(mm, pmd);
if (!pmd_none(*pmd)) {
+ const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) :
+ fop.pfn;
+
if (write) {
- if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
+ if (pmd_pfn(*pmd) != pfn) {
WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
- return -EEXIST;
+ goto out_unlock;
}
entry = pmd_mkyoung(*pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
update_mmu_cache_pmd(vma, addr, pmd);
}
-
- return -EEXIST;
+ goto out_unlock;
}
- entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
- if (pfn_t_devmap(pfn))
- entry = pmd_mkdevmap(entry);
- else
+ if (fop.is_folio) {
+ entry = folio_mk_pmd(fop.folio, vma->vm_page_prot);
+
+ if (is_huge_zero_folio(fop.folio)) {
+ entry = pmd_mkspecial(entry);
+ } else {
+ folio_get(fop.folio);
+ folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma);
+ add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PMD_NR);
+ }
+ } else {
+ entry = pmd_mkhuge(pfn_pmd(fop.pfn, prot));
entry = pmd_mkspecial(entry);
+ }
if (write) {
entry = pmd_mkyoung(pmd_mkdirty(entry));
entry = maybe_pmd_mkwrite(entry, vma);
@@ -1410,11 +1458,17 @@ static int insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
if (pgtable) {
pgtable_trans_huge_deposit(mm, pmd, pgtable);
mm_inc_nr_ptes(mm);
+ pgtable = NULL;
}
set_pmd_at(mm, addr, pmd, entry);
update_mmu_cache_pmd(vma, addr, pmd);
- return 0;
+
+out_unlock:
+ spin_unlock(ptl);
+ if (pgtable)
+ pte_free(mm, pgtable);
+ return VM_FAULT_NOPAGE;
}
/**
@@ -1427,44 +1481,29 @@ static int insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
*
* Return: vm_fault_t value.
*/
-vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
+vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn,
+ bool write)
{
unsigned long addr = vmf->address & PMD_MASK;
struct vm_area_struct *vma = vmf->vma;
pgprot_t pgprot = vma->vm_page_prot;
- pgtable_t pgtable = NULL;
- spinlock_t *ptl;
- int error;
+ struct folio_or_pfn fop = {
+ .pfn = pfn,
+ };
/*
* If we had pmd_special, we could avoid all these restrictions,
* but we need to be consistent with PTEs and architectures that
* can't support a 'special' bit.
*/
- BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
- !pfn_t_devmap(pfn));
+ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
(VM_PFNMAP|VM_MIXEDMAP));
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
- if (addr < vma->vm_start || addr >= vma->vm_end)
- return VM_FAULT_SIGBUS;
-
- if (arch_needs_pgtable_deposit()) {
- pgtable = pte_alloc_one(vma->vm_mm);
- if (!pgtable)
- return VM_FAULT_OOM;
- }
+ pfnmap_setup_cachemode_pfn(pfn, &pgprot);
- track_pfn_insert(vma, &pgprot, pfn);
- ptl = pmd_lock(vma->vm_mm, vmf->pmd);
- error = insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write,
- pgtable);
- spin_unlock(ptl);
- if (error && pgtable)
- pte_free(vma->vm_mm, pgtable);
-
- return VM_FAULT_NOPAGE;
+ return insert_pmd(vma, addr, vmf->pmd, fop, pgprot, write);
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
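With the pfn_t plumbing gone, a PFNMAP driver's PMD fault path passes a plain pfn. A condensed sketch under that assumption; drv_addr_to_pfn() is an illustrative driver-internal lookup, not a kernel API:

static vm_fault_t drv_map_pmd(struct vm_fault *vmf, bool write)
{
	unsigned long pfn;

	/* Resolve the PMD-aligned user address to the backing pfn. */
	pfn = drv_addr_to_pfn(vmf->vma, vmf->address & PMD_MASK);
	return vmf_insert_pfn_pmd(vmf, pfn, write);
}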
@@ -1473,37 +1512,15 @@ vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio,
{
struct vm_area_struct *vma = vmf->vma;
unsigned long addr = vmf->address & PMD_MASK;
- struct mm_struct *mm = vma->vm_mm;
- spinlock_t *ptl;
- pgtable_t pgtable = NULL;
- int error;
-
- if (addr < vma->vm_start || addr >= vma->vm_end)
- return VM_FAULT_SIGBUS;
+ struct folio_or_pfn fop = {
+ .folio = folio,
+ .is_folio = true,
+ };
if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER))
return VM_FAULT_SIGBUS;
- if (arch_needs_pgtable_deposit()) {
- pgtable = pte_alloc_one(vma->vm_mm);
- if (!pgtable)
- return VM_FAULT_OOM;
- }
-
- ptl = pmd_lock(mm, vmf->pmd);
- if (pmd_none(*vmf->pmd)) {
- folio_get(folio);
- folio_add_file_rmap_pmd(folio, &folio->page, vma);
- add_mm_counter(mm, mm_counter_file(folio), HPAGE_PMD_NR);
- }
- error = insert_pfn_pmd(vma, addr, vmf->pmd,
- pfn_to_pfn_t(folio_pfn(folio)), vma->vm_page_prot,
- write, pgtable);
- spin_unlock(ptl);
- if (error && pgtable)
- pte_free(mm, pgtable);
-
- return VM_FAULT_NOPAGE;
+ return insert_pmd(vma, addr, vmf->pmd, fop, vma->vm_page_prot, write);
}
EXPORT_SYMBOL_GPL(vmf_insert_folio_pmd);
@@ -1515,36 +1532,51 @@ static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
return pud;
}
-static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud, pfn_t pfn, bool write)
+static vm_fault_t insert_pud(struct vm_area_struct *vma, unsigned long addr,
+ pud_t *pud, struct folio_or_pfn fop, pgprot_t prot, bool write)
{
struct mm_struct *mm = vma->vm_mm;
- pgprot_t prot = vma->vm_page_prot;
+ spinlock_t *ptl;
pud_t entry;
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ ptl = pud_lock(mm, pud);
if (!pud_none(*pud)) {
+ const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) :
+ fop.pfn;
+
if (write) {
- if (WARN_ON_ONCE(pud_pfn(*pud) != pfn_t_to_pfn(pfn)))
- return;
+ if (WARN_ON_ONCE(pud_pfn(*pud) != pfn))
+ goto out_unlock;
entry = pud_mkyoung(*pud);
entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
if (pudp_set_access_flags(vma, addr, pud, entry, 1))
update_mmu_cache_pud(vma, addr, pud);
}
- return;
+ goto out_unlock;
}
- entry = pud_mkhuge(pfn_t_pud(pfn, prot));
- if (pfn_t_devmap(pfn))
- entry = pud_mkdevmap(entry);
- else
+ if (fop.is_folio) {
+ entry = folio_mk_pud(fop.folio, vma->vm_page_prot);
+
+ folio_get(fop.folio);
+ folio_add_file_rmap_pud(fop.folio, &fop.folio->page, vma);
+ add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PUD_NR);
+ } else {
+ entry = pud_mkhuge(pfn_pud(fop.pfn, prot));
entry = pud_mkspecial(entry);
+ }
if (write) {
entry = pud_mkyoung(pud_mkdirty(entry));
entry = maybe_pud_mkwrite(entry, vma);
}
set_pud_at(mm, addr, pud, entry);
update_mmu_cache_pud(vma, addr, pud);
+out_unlock:
+ spin_unlock(ptl);
+ return VM_FAULT_NOPAGE;
}
/**
@@ -1557,34 +1589,29 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
*
* Return: vm_fault_t value.
*/
-vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
+vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn,
+ bool write)
{
unsigned long addr = vmf->address & PUD_MASK;
struct vm_area_struct *vma = vmf->vma;
pgprot_t pgprot = vma->vm_page_prot;
- spinlock_t *ptl;
+ struct folio_or_pfn fop = {
+ .pfn = pfn,
+ };
/*
* If we had pud_special, we could avoid all these restrictions,
* but we need to be consistent with PTEs and architectures that
* can't support a 'special' bit.
*/
- BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
- !pfn_t_devmap(pfn));
+ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
(VM_PFNMAP|VM_MIXEDMAP));
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
- if (addr < vma->vm_start || addr >= vma->vm_end)
- return VM_FAULT_SIGBUS;
-
- track_pfn_insert(vma, &pgprot, pfn);
-
- ptl = pud_lock(vma->vm_mm, vmf->pud);
- insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
- spin_unlock(ptl);
+ pfnmap_setup_cachemode_pfn(pfn, &pgprot);
- return VM_FAULT_NOPAGE;
+ return insert_pud(vma, addr, vmf->pud, fop, pgprot, write);
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
@@ -1601,34 +1628,15 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
{
struct vm_area_struct *vma = vmf->vma;
unsigned long addr = vmf->address & PUD_MASK;
- pud_t *pud = vmf->pud;
- struct mm_struct *mm = vma->vm_mm;
- spinlock_t *ptl;
-
- if (addr < vma->vm_start || addr >= vma->vm_end)
- return VM_FAULT_SIGBUS;
+ struct folio_or_pfn fop = {
+ .folio = folio,
+ .is_folio = true,
+ };
if (WARN_ON_ONCE(folio_order(folio) != PUD_ORDER))
return VM_FAULT_SIGBUS;
- ptl = pud_lock(mm, pud);
-
- /*
- * If there is already an entry present we assume the folio is
- * already mapped, hence no need to take another reference. We
- * still call insert_pfn_pud() though in case the mapping needs
- * upgrading to writeable.
- */
- if (pud_none(*vmf->pud)) {
- folio_get(folio);
- folio_add_file_rmap_pud(folio, &folio->page, vma);
- add_mm_counter(mm, mm_counter_file(folio), HPAGE_PUD_NR);
- }
- insert_pfn_pud(vma, addr, vmf->pud, pfn_to_pfn_t(folio_pfn(folio)),
- write);
- spin_unlock(ptl);
-
- return VM_FAULT_NOPAGE;
+ return insert_pud(vma, addr, vmf->pud, fop, vma->vm_page_prot, write);
}
EXPORT_SYMBOL_GPL(vmf_insert_folio_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
@@ -1646,46 +1654,6 @@ void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
update_mmu_cache_pmd(vma, addr, pmd);
}
-struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
-{
- unsigned long pfn = pmd_pfn(*pmd);
- struct mm_struct *mm = vma->vm_mm;
- struct page *page;
- int ret;
-
- assert_spin_locked(pmd_lockptr(mm, pmd));
-
- if (flags & FOLL_WRITE && !pmd_write(*pmd))
- return NULL;
-
- if (pmd_present(*pmd) && pmd_devmap(*pmd))
- /* pass */;
- else
- return NULL;
-
- if (flags & FOLL_TOUCH)
- touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
-
- /*
- * device mapped pages can only be returned if the
- * caller will manage the page reference count.
- */
- if (!(flags & (FOLL_GET | FOLL_PIN)))
- return ERR_PTR(-EEXIST);
-
- pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
- *pgmap = get_dev_pagemap(pfn, *pgmap);
- if (!*pgmap)
- return ERR_PTR(-EFAULT);
- page = pfn_to_page(pfn);
- ret = try_grab_folio(page_folio(page), 1, flags);
- if (ret)
- page = ERR_PTR(ret);
-
- return page;
-}
-
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
@@ -1698,7 +1666,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
int ret = -ENOMEM;
pmd = pmdp_get_lockless(src_pmd);
- if (unlikely(pmd_present(pmd) && pmd_special(pmd))) {
+ if (unlikely(pmd_present(pmd) && pmd_special(pmd) &&
+ !is_huge_zero_pmd(pmd))) {
dst_ptl = pmd_lock(dst_mm, dst_pmd);
src_ptl = pmd_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
@@ -1786,7 +1755,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_free(dst_mm, pgtable);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
- __split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
+ __split_huge_pmd(src_vma, src_pmd, addr, false);
return -EAGAIN;
}
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
@@ -1837,7 +1806,7 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
ret = -EAGAIN;
pud = *src_pud;
- if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
+ if (unlikely(!pud_trans_huge(pud)))
goto out_unlock;
/*
@@ -2008,7 +1977,7 @@ unlock_fallback:
folio_unlock(folio);
spin_unlock(vmf->ptl);
fallback:
- __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
+ __split_huge_pmd(vma, vmf->pmd, vmf->address, false);
return VM_FAULT_FALLBACK;
}
@@ -2260,6 +2229,14 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, mm_counter_file(folio),
-HPAGE_PMD_NR);
+
+ /*
+ * Use flush_needed to indicate whether the PMD entry
+ * is present, instead of checking pmd_present() again.
+ */
+ if (flush_needed && pmd_young(orig_pmd) &&
+ likely(vma_has_recency(vma)))
+ folio_mark_accessed(folio);
}
spin_unlock(ptl);
@@ -2653,12 +2630,12 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
folio_move_anon_rmap(src_folio, dst_vma);
src_folio->index = linear_page_index(dst_vma, dst_addr);
- _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
+ _dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot);
/* Follow mremap() behavior and treat the entry dirty after the move */
_dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
} else {
src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
- _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot);
+ _dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot);
}
set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);
@@ -2691,8 +2668,7 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
{
spinlock_t *ptl;
ptl = pmd_lock(vma->vm_mm, pmd);
- if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
- pmd_devmap(*pmd)))
+ if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)))
return ptl;
spin_unlock(ptl);
return NULL;
@@ -2709,7 +2685,7 @@ spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
spinlock_t *ptl;
ptl = pud_lock(vma->vm_mm, pud);
- if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
+ if (likely(pud_trans_huge(*pud)))
return ptl;
spin_unlock(ptl);
return NULL;
@@ -2761,7 +2737,7 @@ static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
- VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
+ VM_BUG_ON(!pud_trans_huge(*pud));
count_vm_event(THP_SPLIT_PUD);
@@ -2794,7 +2770,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
(address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
mmu_notifier_invalidate_range_start(&range);
ptl = pud_lock(vma->vm_mm, pud);
- if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
+ if (unlikely(!pud_trans_huge(*pud)))
goto out;
__split_huge_pud_locked(vma, pud, range.start);
@@ -2867,8 +2843,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
- VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
- && !pmd_devmap(*pmd));
+ VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd));
count_vm_event(THP_SPLIT_PMD);
@@ -3073,33 +3048,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
}
void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmd, bool freeze, struct folio *folio)
+ pmd_t *pmd, bool freeze)
{
- bool pmd_migration = is_pmd_migration_entry(*pmd);
-
- VM_WARN_ON_ONCE(folio && !folio_test_pmd_mappable(folio));
VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
- VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
- VM_BUG_ON(freeze && !folio);
-
- /*
- * When the caller requests to set up a migration entry, we
- * require a folio to check the PMD against. Otherwise, there
- * is a risk of replacing the wrong folio.
- */
- if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) || pmd_migration) {
- /*
- * Do not apply pmd_folio() to a migration entry; and folio lock
- * guarantees that it must be of the wrong folio anyway.
- */
- if (folio && (pmd_migration || folio != pmd_folio(*pmd)))
- return;
+ if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd))
__split_huge_pmd_locked(vma, pmd, address, freeze);
- }
}
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long address, bool freeze, struct folio *folio)
+ unsigned long address, bool freeze)
{
spinlock_t *ptl;
struct mmu_notifier_range range;
@@ -3109,20 +3066,20 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
ptl = pmd_lock(vma->vm_mm, pmd);
- split_huge_pmd_locked(vma, range.start, pmd, freeze, folio);
+ split_huge_pmd_locked(vma, range.start, pmd, freeze);
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(&range);
}
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
- bool freeze, struct folio *folio)
+ bool freeze)
{
pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);
if (!pmd)
return;
- __split_huge_pmd(vma, pmd, address, freeze, folio);
+ __split_huge_pmd(vma, pmd, address, freeze);
}
static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
@@ -3134,7 +3091,7 @@ static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned
if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
ALIGN(address, HPAGE_PMD_SIZE)))
- split_huge_pmd_address(vma, address, false, NULL);
+ split_huge_pmd_address(vma, address, false);
}
void vma_adjust_trans_huge(struct vm_area_struct *vma,
@@ -3345,8 +3302,8 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
* unreferenced sub-pages of an anonymous THP: we can simply drop
* PG_anon_exclusive (-> PG_mappedtodisk) for these here.
*/
- new_folio->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
- new_folio->flags |= (folio->flags &
+ new_folio->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ new_folio->flags.f |= (folio->flags.f &
((1L << PG_referenced) |
(1L << PG_swapbacked) |
(1L << PG_swapcache) |
@@ -3420,10 +3377,6 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
* order - 1 to new_order).
* @split_at: in buddy allocator like split, the folio containing @split_at
* will be split until its order becomes @new_order.
- * @lock_at: the folio containing @lock_at is left locked for caller.
- * @list: the after split folios will be added to @list if it is not NULL,
- * otherwise to LRU lists.
- * @end: the end of the file @folio maps to. -1 if @folio is anonymous memory.
* @xas: xa_state pointing to folio->mapping->i_pages and locked by caller
* @mapping: @folio->mapping
* @uniform_split: if the split is uniform or not (buddy allocator like split)
@@ -3449,52 +3402,26 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
* @page, which is split in next for loop.
*
* After splitting, the caller's folio reference will be transferred to the
- * folio containing @page. The other folios may be freed if they are not mapped.
- *
- * In terms of locking, after splitting,
- * 1. uniform split leaves @page (or the folio contains it) locked;
- * 2. buddy allocator like (non-uniform) split leaves @folio locked.
- *
+ * folio containing @page. The caller needs to unlock and/or free after-split
+ * folios if necessary.
*
* For !uniform_split, when -ENOMEM is returned, the original folio might be
* split. The caller needs to check the input folio.
*/
static int __split_unmapped_folio(struct folio *folio, int new_order,
- struct page *split_at, struct page *lock_at,
- struct list_head *list, pgoff_t end,
- struct xa_state *xas, struct address_space *mapping,
- bool uniform_split)
+ struct page *split_at, struct xa_state *xas,
+ struct address_space *mapping, bool uniform_split)
{
- struct lruvec *lruvec;
- struct address_space *swap_cache = NULL;
- struct folio *origin_folio = folio;
- struct folio *next_folio = folio_next(folio);
- struct folio *new_folio;
- struct folio *next;
int order = folio_order(folio);
- int split_order;
int start_order = uniform_split ? new_order : order - 1;
- int nr_dropped = 0;
- int ret = 0;
bool stop_split = false;
-
- if (folio_test_swapcache(folio)) {
- VM_BUG_ON(mapping);
-
- /* a swapcache folio can only be uniformly split to order-0 */
- if (!uniform_split || new_order != 0)
- return -EINVAL;
-
- swap_cache = swap_address_space(folio->swap);
- xa_lock(&swap_cache->i_pages);
- }
+ struct folio *next;
+ int split_order;
+ int ret = 0;
if (folio_test_anon(folio))
mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
- /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
- lruvec = folio_lruvec_lock(folio);
-
folio_clear_has_hwpoisoned(folio);
/*
@@ -3504,9 +3431,9 @@ static int __split_unmapped_folio(struct folio *folio, int new_order,
for (split_order = start_order;
split_order >= new_order && !stop_split;
split_order--) {
- int old_order = folio_order(folio);
- struct folio *release;
struct folio *end_folio = folio_next(folio);
+ int old_order = folio_order(folio);
+ struct folio *new_folio;
/* order-1 anonymous folio is not supported */
if (folio_test_anon(folio) && split_order == 1)
@@ -3528,126 +3455,45 @@ static int __split_unmapped_folio(struct folio *folio, int new_order,
if (xas_error(xas)) {
ret = xas_error(xas);
stop_split = true;
- goto after_split;
}
}
}
- folio_split_memcg_refs(folio, old_order, split_order);
- split_page_owner(&folio->page, old_order, split_order);
- pgalloc_tag_split(folio, old_order, split_order);
+ if (!stop_split) {
+ folio_split_memcg_refs(folio, old_order, split_order);
+ split_page_owner(&folio->page, old_order, split_order);
+ pgalloc_tag_split(folio, old_order, split_order);
- __split_folio_to_order(folio, old_order, split_order);
+ __split_folio_to_order(folio, old_order, split_order);
+ }
-after_split:
/*
- * Iterate through after-split folios and perform related
- * operations. But in buddy allocator like split, the folio
+ * Iterate through after-split folios and update folio stats.
+ * But in buddy allocator like split, the folio
* containing the specified page is skipped until its order
* is new_order, since the folio will be worked on in next
* iteration.
*/
- for (release = folio; release != end_folio; release = next) {
- next = folio_next(release);
+ for (new_folio = folio; new_folio != end_folio; new_folio = next) {
+ next = folio_next(new_folio);
/*
- * for buddy allocator like split, the folio containing
- * page will be split next and should not be released,
- * until the folio's order is new_order or stop_split
- * is set to true by the above xas_split() failure.
+			 * For a buddy-allocator-like split, the new_folio
+			 * containing the @split_at page could be split again,
+			 * so do not change the stats yet. Wait until
+			 * new_folio's order is @new_order, or stop_split is
+			 * set to true by the xas_split() failure above.
*/
- if (release == page_folio(split_at)) {
- folio = release;
+ if (new_folio == page_folio(split_at)) {
+ folio = new_folio;
if (split_order != new_order && !stop_split)
continue;
}
- if (folio_test_anon(release)) {
- mod_mthp_stat(folio_order(release),
- MTHP_STAT_NR_ANON, 1);
- }
-
- /*
- * origin_folio should be kept frozon until page cache
- * entries are updated with all the other after-split
- * folios to prevent others seeing stale page cache
- * entries.
- */
- if (release == origin_folio)
- continue;
-
- folio_ref_unfreeze(release, 1 +
- ((mapping || swap_cache) ?
- folio_nr_pages(release) : 0));
-
- lru_add_split_folio(origin_folio, release, lruvec,
- list);
-
- /* Some pages can be beyond EOF: drop them from cache */
- if (release->index >= end) {
- if (shmem_mapping(mapping))
- nr_dropped += folio_nr_pages(release);
- else if (folio_test_clear_dirty(release))
- folio_account_cleaned(release,
- inode_to_wb(mapping->host));
- __filemap_remove_folio(release, NULL);
- folio_put_refs(release, folio_nr_pages(release));
- } else if (mapping) {
- __xa_store(&mapping->i_pages,
- release->index, release, 0);
- } else if (swap_cache) {
- __xa_store(&swap_cache->i_pages,
- swap_cache_index(release->swap),
- release, 0);
- }
+ if (folio_test_anon(new_folio))
+ mod_mthp_stat(folio_order(new_folio),
+ MTHP_STAT_NR_ANON, 1);
}
}
- /*
- * Unfreeze origin_folio only after all page cache entries, which used
- * to point to it, have been updated with new folios. Otherwise,
- * a parallel folio_try_get() can grab origin_folio and its caller can
- * see stale page cache entries.
- */
- folio_ref_unfreeze(origin_folio, 1 +
- ((mapping || swap_cache) ? folio_nr_pages(origin_folio) : 0));
-
- unlock_page_lruvec(lruvec);
-
- if (swap_cache)
- xa_unlock(&swap_cache->i_pages);
- if (mapping)
- xa_unlock(&mapping->i_pages);
-
- /* Caller disabled irqs, so they are still disabled here */
- local_irq_enable();
-
- if (nr_dropped)
- shmem_uncharge(mapping->host, nr_dropped);
-
- remap_page(origin_folio, 1 << order,
- folio_test_anon(origin_folio) ?
- RMP_USE_SHARED_ZEROPAGE : 0);
-
- /*
- * At this point, folio should contain the specified page.
- * For uniform split, it is left for caller to unlock.
- * For buddy allocator like split, the first after-split folio is left
- * for caller to unlock.
- */
- for (new_folio = origin_folio; new_folio != next_folio; new_folio = next) {
- next = folio_next(new_folio);
- if (new_folio == page_folio(lock_at))
- continue;
-
- folio_unlock(new_folio);
- /*
- * Subpages may be freed if there wasn't any mapping
- * like if add_to_swap() is running on a lru page that
- * had its mapping zapped. And freeing these pages
- * requires taking the lru_lock so we do the put_page
- * of the tail pages after the split is complete.
- */
- free_page_and_swap_cache(&new_folio->page);
- }
return ret;
}
@@ -3721,6 +3567,11 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order,
* It is in charge of checking whether the split is supported or not and
* preparing @folio for __split_unmapped_folio().
*
+ * After splitting, the after-split folio containing @lock_at remains locked
+ * and others are unlocked:
+ * 1. for uniform split, @lock_at points to one of @folio's subpages;
+ * 2. for buddy-allocator-like (non-uniform) split, @lock_at points to @folio.
+ *
* return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be
* split but not to @new_order, the caller needs to check)
*/
@@ -3730,16 +3581,20 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
{
struct deferred_split *ds_queue = get_deferred_split_queue(folio);
XA_STATE(xas, &folio->mapping->i_pages, folio->index);
+ struct folio *end_folio = folio_next(folio);
bool is_anon = folio_test_anon(folio);
struct address_space *mapping = NULL;
struct anon_vma *anon_vma = NULL;
int order = folio_order(folio);
+ struct folio *new_folio, *next;
+ int nr_shmem_dropped = 0;
+ int remap_flags = 0;
int extra_pins, ret;
pgoff_t end;
bool is_hzp;
- VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio);
if (folio != page_folio(split_at) || folio != page_folio(lock_at))
return -EINVAL;
@@ -3777,7 +3632,6 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
ret = -EBUSY;
goto out;
}
- end = -1;
mapping = NULL;
anon_vma_lock_write(anon_vma);
} else {
@@ -3857,13 +3711,19 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
*/
xas_lock(&xas);
xas_reset(&xas);
- if (xas_load(&xas) != folio)
+ if (xas_load(&xas) != folio) {
+ ret = -EAGAIN;
goto fail;
+ }
}
/* Prevent deferred_split_scan() touching ->_refcount */
spin_lock(&ds_queue->split_queue_lock);
if (folio_ref_freeze(folio, 1 + extra_pins)) {
+ struct swap_cluster_info *ci = NULL;
+ struct lruvec *lruvec;
+ int expected_refs;
+
if (folio_order(folio) > 1 &&
!list_empty(&folio->_deferred_list)) {
ds_queue->split_queue_len--;
@@ -3897,18 +3757,119 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
}
}
- ret = __split_unmapped_folio(folio, new_order,
- split_at, lock_at, list, end, &xas, mapping,
- uniform_split);
+ if (folio_test_swapcache(folio)) {
+ if (mapping) {
+ VM_WARN_ON_ONCE_FOLIO(mapping, folio);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ ci = swap_cluster_get_and_lock(folio);
+ }
+
+ /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
+ lruvec = folio_lruvec_lock(folio);
+
+ ret = __split_unmapped_folio(folio, new_order, split_at, &xas,
+ mapping, uniform_split);
+
+ /*
+ * Unfreeze after-split folios and put them back to the right
+		 * list. @folio should be kept frozen until page cache
+ * entries are updated with all the other after-split folios
+ * to prevent others seeing stale page cache entries.
+ * As a result, new_folio starts from the next folio of
+ * @folio.
+ */
+ for (new_folio = folio_next(folio); new_folio != end_folio;
+ new_folio = next) {
+ unsigned long nr_pages = folio_nr_pages(new_folio);
+
+ next = folio_next(new_folio);
+
+ expected_refs = folio_expected_ref_count(new_folio) + 1;
+ folio_ref_unfreeze(new_folio, expected_refs);
+
+ lru_add_split_folio(folio, new_folio, lruvec, list);
+
+ /*
+ * Anonymous folio with swap cache.
+ * NOTE: shmem in swap cache is not supported yet.
+ */
+ if (ci) {
+ __swap_cache_replace_folio(ci, folio, new_folio);
+ continue;
+ }
+
+ /* Anonymous folio without swap cache */
+ if (!mapping)
+ continue;
+
+ /* Add the new folio to the page cache. */
+ if (new_folio->index < end) {
+ __xa_store(&mapping->i_pages, new_folio->index,
+ new_folio, 0);
+ continue;
+ }
+
+ /* Drop folio beyond EOF: ->index >= end */
+ if (shmem_mapping(mapping))
+ nr_shmem_dropped += nr_pages;
+ else if (folio_test_clear_dirty(new_folio))
+ folio_account_cleaned(
+ new_folio, inode_to_wb(mapping->host));
+ __filemap_remove_folio(new_folio, NULL);
+ folio_put_refs(new_folio, nr_pages);
+ }
+ /*
+ * Unfreeze @folio only after all page cache entries, which
+ * used to point to it, have been updated with new folios.
+ * Otherwise, a parallel folio_try_get() can grab @folio
+ * and its caller can see stale page cache entries.
+ */
+ expected_refs = folio_expected_ref_count(folio) + 1;
+ folio_ref_unfreeze(folio, expected_refs);
+
+ unlock_page_lruvec(lruvec);
+
+ if (ci)
+ swap_cluster_unlock(ci);
} else {
spin_unlock(&ds_queue->split_queue_lock);
-fail:
- if (mapping)
- xas_unlock(&xas);
- local_irq_enable();
- remap_page(folio, folio_nr_pages(folio), 0);
ret = -EAGAIN;
}
+fail:
+ if (mapping)
+ xas_unlock(&xas);
+
+ local_irq_enable();
+
+ if (nr_shmem_dropped)
+ shmem_uncharge(mapping->host, nr_shmem_dropped);
+
+ if (!ret && is_anon)
+ remap_flags = RMP_USE_SHARED_ZEROPAGE;
+ remap_page(folio, 1 << order, remap_flags);
+
+ /*
+ * Unlock all after-split folios except the one containing
+ * @lock_at page. If @folio is not split, it will be kept locked.
+ */
+ for (new_folio = folio; new_folio != end_folio; new_folio = next) {
+ next = folio_next(new_folio);
+ if (new_folio == page_folio(lock_at))
+ continue;
+
+ folio_unlock(new_folio);
+ /*
+		 * Subpages may be freed if there wasn't any mapping,
+		 * e.g. if add_to_swap() is running on an LRU page that
+		 * had its mapping zapped. And freeing these pages
+ * requires taking the lru_lock so we do the put_page
+ * of the tail pages after the split is complete.
+ */
+ free_folio_and_swap_cache(new_folio);
+ }
out_unlock:
if (anon_vma) {
@@ -4143,32 +4104,23 @@ static unsigned long deferred_split_count(struct shrinker *shrink,
static bool thp_underused(struct folio *folio)
{
int num_zero_pages = 0, num_filled_pages = 0;
- void *kaddr;
int i;
if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
return false;
for (i = 0; i < folio_nr_pages(folio); i++) {
- kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
- if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
- num_zero_pages++;
- if (num_zero_pages > khugepaged_max_ptes_none) {
- kunmap_local(kaddr);
+ if (pages_identical(folio_page(folio, i), ZERO_PAGE(0))) {
+ if (++num_zero_pages > khugepaged_max_ptes_none)
return true;
- }
} else {
/*
* Another path for early exit once the number
* of non-zero filled pages exceeds threshold.
*/
- num_filled_pages++;
- if (num_filled_pages >= HPAGE_PMD_NR - khugepaged_max_ptes_none) {
- kunmap_local(kaddr);
+ if (++num_filled_pages >= HPAGE_PMD_NR - khugepaged_max_ptes_none)
return false;
- }
}
- kunmap_local(kaddr);
}
return false;
}
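The reworked thp_underused() above counts subpages that are identical to the zero page and bails out early once the threshold is crossed. Below is a hedged illustration of the same counting heuristic from userspace (ordinary C, not kernel code; the 2 MiB folio size and the threshold of 256 are assumptions made for the example), scanning a buffer one 4 KiB page at a time:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SUBPAGE_SIZE	4096UL
#define HPAGE_SIZE	(2UL * 1024 * 1024)		/* assumed PMD-sized THP */
#define NR_SUBPAGES	(HPAGE_SIZE / SUBPAGE_SIZE)

/* Return 1 once more than max_zero subpages are zero-filled (early exit,
 * mirroring the kernel loop above). */
static int buffer_underused(const unsigned char *buf, unsigned long max_zero)
{
	static const unsigned char zero[SUBPAGE_SIZE];
	unsigned long i, nr_zero = 0;

	for (i = 0; i < NR_SUBPAGES; i++) {
		if (!memcmp(buf + i * SUBPAGE_SIZE, zero, SUBPAGE_SIZE) &&
		    ++nr_zero > max_zero)
			return 1;
	}
	return 0;
}

int main(void)
{
	unsigned char *buf = calloc(1, HPAGE_SIZE);	/* zero-filled buffer */

	if (!buf)
		return 1;
	buf[0] = 1;	/* dirty a single subpage */
	printf("underused: %d\n", buffer_underused(buf, 256));
	free(buf);
	return 0;
}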
@@ -4214,6 +4166,13 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
bool underused = false;
if (!folio_test_partially_mapped(folio)) {
+ /*
+ * See try_to_map_unused_to_zeropage(): we cannot
+ * optimize zero-filled pages after splitting an
+ * mlocked folio.
+ */
+ if (folio_test_mlocked(folio))
+ goto next;
underused = thp_underused(folio);
if (!underused)
goto next;
@@ -4355,8 +4314,8 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
goto out;
}
- pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
- pid, vaddr_start, vaddr_end);
+ pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx], new_order: %u, in_folio_offset: %ld\n",
+ pid, vaddr_start, vaddr_end, new_order, in_folio_offset);
mmap_read_lock(mm);
/*
@@ -4466,8 +4425,8 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
if (IS_ERR(candidate))
goto out;
- pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
- file_path, off_start, off_end);
+ pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx], new_order: %u, in_folio_offset: %ld\n",
+ file_path, off_start, off_end, new_order, in_folio_offset);
mapping = candidate->f_mapping;
min_order = mapping_min_folio_order(mapping);
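Both pr_debug() strings above now report the requested new_order and in_folio_offset, which come from the debugfs split interface. The sketch below shows one way to drive that interface from userspace; the /sys/kernel/debug/split_huge_pages path and the comma-separated "<pid>,<vaddr_start>,<vaddr_end>,<new_order>" input format are assumptions taken from these debug messages (debugfs and THP split debugging must be enabled), so check the transparent hugepage documentation of the running kernel before relying on it.

#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

/*
 * Ask the THP split debugfs knob to split the huge pages backing
 * [start, end) of a given pid down to new_order. Path and input format
 * are assumptions, see the note above.
 */
static int request_thp_split(pid_t pid, unsigned long start,
			     unsigned long end, unsigned int new_order)
{
	FILE *f = fopen("/sys/kernel/debug/split_huge_pages", "w");

	if (!f)
		return -1;
	fprintf(f, "%d,0x%lx,0x%lx,%u\n", (int)pid, start, end, new_order);
	return fclose(f);
}

int main(void)
{
	/* Split an assumed 2 MiB range of our own address space to order 0. */
	unsigned long start = 0x700000000000UL;

	return request_thp_split(getpid(), start, start + 0x200000UL, 0) ? 1 : 0;
}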
@@ -4680,7 +4639,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
entry = pmd_to_swp_entry(*pvmw->pmd);
folio_get(folio);
- pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
+ pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot));
if (pmd_swp_soft_dirty(*pvmw->pmd))
pmde = pmd_mksoft_dirty(pmde);
if (is_writable_migration_entry(entry))
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7ae38bfb9096..795ee393eac0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -25,6 +25,7 @@
#include <linux/mmdebug.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
+#include <linux/string_choices.h>
#include <linux/string_helpers.h>
#include <linux/swap.h>
#include <linux/swapops.h>
@@ -58,6 +59,7 @@ int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
+__initdata nodemask_t hugetlb_bootmem_nodes;
__initdata struct list_head huge_boot_pages[MAX_NUMNODES];
static unsigned long hstate_boot_nrinvalid[HUGE_MAX_HSTATE] __initdata;
@@ -120,7 +122,7 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
- unsigned long start, unsigned long end);
+ unsigned long start, unsigned long end, bool take_locks);
static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
static void hugetlb_free_folio(struct folio *folio)
@@ -283,11 +285,6 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
return ret;
}
-static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
-{
- return HUGETLBFS_SB(inode->i_sb)->spool;
-}
-
static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
{
return subpool_inode(file_inode(vma->vm_file));
@@ -1250,7 +1247,7 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma)
/*
* Reset and decrement one ref on hugepage private reservation.
* Called with mm->mmap_lock writer semaphore held.
- * This function should be only used by move_vma() and operate on
+ * This function should be only used by mremap and operate on
* same sized vma. It should never come here with last ref on the
* reservation.
*/
@@ -1476,17 +1473,14 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
#ifdef CONFIG_CONTIG_ALLOC
-static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
+static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
struct folio *folio;
- int order = huge_page_order(h);
bool retried = false;
- if (nid == NUMA_NO_NODE)
- nid = numa_mem_id();
retry:
- folio = hugetlb_cma_alloc_folio(h, gfp_mask, nid, nodemask);
+ folio = hugetlb_cma_alloc_folio(order, gfp_mask, nid, nodemask);
if (!folio) {
if (hugetlb_cma_exclusive_alloc())
return NULL;
@@ -1509,16 +1503,16 @@ retry:
}
#else /* !CONFIG_CONTIG_ALLOC */
-static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
- int nid, nodemask_t *nodemask)
+static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, int nid,
+ nodemask_t *nodemask)
{
return NULL;
}
#endif /* CONFIG_CONTIG_ALLOC */
#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
-static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
- int nid, nodemask_t *nodemask)
+static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, int nid,
+ nodemask_t *nodemask)
{
return NULL;
}
@@ -1893,14 +1887,14 @@ void free_huge_folio(struct folio *folio)
/*
* Must be called with the hugetlb lock held
*/
-static void __prep_account_new_huge_page(struct hstate *h, int nid)
+static void account_new_hugetlb_folio(struct hstate *h, struct folio *folio)
{
lockdep_assert_held(&hugetlb_lock);
h->nr_huge_pages++;
- h->nr_huge_pages_node[nid]++;
+ h->nr_huge_pages_node[folio_nid(folio)]++;
}
-static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio)
+static void init_new_hugetlb_folio(struct folio *folio)
{
__folio_set_hugetlb(folio);
INIT_LIST_HEAD(&folio->lru);
@@ -1909,20 +1903,6 @@ static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio)
set_hugetlb_cgroup_rsvd(folio, NULL);
}
-static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio)
-{
- init_new_hugetlb_folio(h, folio);
- hugetlb_vmemmap_optimize_folio(h, folio);
-}
-
-static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid)
-{
- __prep_new_hugetlb_folio(h, folio);
- spin_lock_irq(&hugetlb_lock);
- __prep_account_new_huge_page(h, nid);
- spin_unlock_irq(&hugetlb_lock);
-}
-
/*
* Find and lock address space (mapping) in write mode.
*
@@ -1943,14 +1923,11 @@ struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio)
return NULL;
}
-static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
- gfp_t gfp_mask, int nid, nodemask_t *nmask,
- nodemask_t *node_alloc_noretry)
+static struct folio *alloc_buddy_hugetlb_folio(int order, gfp_t gfp_mask,
+ int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry)
{
- int order = huge_page_order(h);
struct folio *folio;
bool alloc_try_hard = true;
- bool retry = true;
/*
* By default we always try hard to allocate the folio with
@@ -1963,24 +1940,8 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
alloc_try_hard = false;
if (alloc_try_hard)
gfp_mask |= __GFP_RETRY_MAYFAIL;
- if (nid == NUMA_NO_NODE)
- nid = numa_mem_id();
-retry:
- folio = __folio_alloc(gfp_mask, order, nid, nmask);
- /* Ensure hugetlb folio won't have large_rmappable flag set. */
- if (folio)
- folio_clear_large_rmappable(folio);
- if (folio && !folio_ref_freeze(folio, 1)) {
- folio_put(folio);
- if (retry) { /* retry once */
- retry = false;
- goto retry;
- }
- /* WOW! twice in a row. */
- pr_warn("HugeTLB unexpected inflated folio ref count\n");
- folio = NULL;
- }
+ folio = (struct folio *)__alloc_frozen_pages(gfp_mask, order, nid, nmask);
/*
* If we did not specify __GFP_RETRY_MAYFAIL, but still got a
@@ -2012,36 +1973,36 @@ static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h,
nodemask_t *node_alloc_noretry)
{
struct folio *folio;
+ int order = huge_page_order(h);
- if (hstate_is_gigantic(h))
- folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask);
+ if (nid == NUMA_NO_NODE)
+ nid = numa_mem_id();
+
+ if (order_is_gigantic(order))
+ folio = alloc_gigantic_folio(order, gfp_mask, nid, nmask);
else
- folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, node_alloc_noretry);
+ folio = alloc_buddy_hugetlb_folio(order, gfp_mask, nid, nmask,
+ node_alloc_noretry);
if (folio)
- init_new_hugetlb_folio(h, folio);
+ init_new_hugetlb_folio(folio);
return folio;
}
/*
- * Common helper to allocate a fresh hugetlb page. All specific allocators
- * should use this function to get new hugetlb pages
+ * Common helper to allocate a fresh hugetlb folio. All specific allocators
+ * should use this function to get new hugetlb folios
*
- * Note that returned page is 'frozen': ref count of head page and all tail
- * pages is zero.
+ * Note that the returned folio is 'frozen': ref count of head page and all tail
+ * pages is zero, and the accounting must be done in the caller.
*/
static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h,
gfp_t gfp_mask, int nid, nodemask_t *nmask)
{
struct folio *folio;
- if (hstate_is_gigantic(h))
- folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask);
- else
- folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
- if (!folio)
- return NULL;
-
- prep_new_hugetlb_folio(h, folio, folio_nid(folio));
+ folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
+ if (folio)
+ hugetlb_vmemmap_optimize_folio(h, folio);
return folio;
}
@@ -2057,7 +2018,7 @@ static void prep_and_add_allocated_folios(struct hstate *h,
/* Add all new pool pages to free lists in one lock cycle */
spin_lock_irqsave(&hugetlb_lock, flags);
list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
- __prep_account_new_huge_page(h, folio_nid(folio));
+ account_new_hugetlb_folio(h, folio);
enqueue_hugetlb_folio(h, folio);
}
spin_unlock_irqrestore(&hugetlb_lock, flags);
@@ -2259,19 +2220,17 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h,
goto out_unlock;
spin_unlock_irq(&hugetlb_lock);
- folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
+ folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask);
if (!folio)
return NULL;
- hugetlb_vmemmap_optimize_folio(h, folio);
-
spin_lock_irq(&hugetlb_lock);
/*
* nr_huge_pages needs to be adjusted within the same lock cycle
* as surplus_pages, otherwise it might confuse
* persistent_huge_pages() momentarily.
*/
- __prep_account_new_huge_page(h, folio_nid(folio));
+ account_new_hugetlb_folio(h, folio);
/*
* We could have raced with the pool size change.
@@ -2308,6 +2267,10 @@ static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mas
if (!folio)
return NULL;
+ spin_lock_irq(&hugetlb_lock);
+ account_new_hugetlb_folio(h, folio);
+ spin_unlock_irq(&hugetlb_lock);
+
/* fresh huge pages are frozen */
folio_ref_unfreeze(folio, 1);
/*
@@ -2354,12 +2317,15 @@ struct folio *alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid,
struct folio *folio;
spin_lock_irq(&hugetlb_lock);
+ if (!h->resv_huge_pages) {
+ spin_unlock_irq(&hugetlb_lock);
+ return NULL;
+ }
+
folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, preferred_nid,
nmask);
- if (folio) {
- VM_BUG_ON(!h->resv_huge_pages);
+ if (folio)
h->resv_huge_pages--;
- }
spin_unlock_irq(&hugetlb_lock);
return folio;
@@ -2419,7 +2385,6 @@ static int gather_surplus_pages(struct hstate *h, long delta)
long i;
long needed, allocated;
bool alloc_ok = true;
- int node;
nodemask_t *mbind_nodemask, alloc_nodemask;
mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h));
@@ -2443,21 +2408,12 @@ retry:
for (i = 0; i < needed; i++) {
folio = NULL;
- /* Prioritize current node */
- if (node_isset(numa_mem_id(), alloc_nodemask))
- folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
- numa_mem_id(), NULL);
-
- if (!folio) {
- for_each_node_mask(node, alloc_nodemask) {
- if (node == numa_mem_id())
- continue;
- folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
- node, NULL);
- if (folio)
- break;
- }
- }
+ /*
+		 * It is okay to pass NUMA_NO_NODE here: numa_mem_id() is used
+		 * further down the road to pick the current node in that case.
+ */
+ folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
+ NUMA_NO_NODE, &alloc_nodemask);
if (!folio) {
alloc_ok = false;
break;
@@ -2811,20 +2767,24 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
/*
* alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve
* the old one
- * @h: struct hstate old page belongs to
* @old_folio: Old folio to dissolve
* @list: List to isolate the page in case we need to
* Returns 0 on success, otherwise negated error.
*/
-static int alloc_and_dissolve_hugetlb_folio(struct hstate *h,
- struct folio *old_folio, struct list_head *list)
+static int alloc_and_dissolve_hugetlb_folio(struct folio *old_folio,
+ struct list_head *list)
{
- gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+ gfp_t gfp_mask;
+ struct hstate *h;
int nid = folio_nid(old_folio);
struct folio *new_folio = NULL;
int ret = 0;
retry:
+ /*
+ * The old_folio might have been dissolved from under our feet, so make sure
+ * to carefully check the state under the lock.
+ */
spin_lock_irq(&hugetlb_lock);
if (!folio_test_hugetlb(old_folio)) {
/*
@@ -2853,20 +2813,21 @@ retry:
cond_resched();
goto retry;
} else {
+ h = folio_hstate(old_folio);
if (!new_folio) {
spin_unlock_irq(&hugetlb_lock);
- new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid,
- NULL, NULL);
+ gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+ new_folio = alloc_fresh_hugetlb_folio(h, gfp_mask,
+ nid, NULL);
if (!new_folio)
return -ENOMEM;
- __prep_new_hugetlb_folio(h, new_folio);
goto retry;
}
/*
* Ok, old_folio is still a genuine free hugepage. Remove it from
* the freelist and decrease the counters. These will be
- * incremented again when calling __prep_account_new_huge_page()
+ * incremented again when calling account_new_hugetlb_folio()
* and enqueue_hugetlb_folio() for new_folio. The counters will
* remain stable since this happens under the lock.
*/
@@ -2876,7 +2837,7 @@ retry:
* Ref count on new_folio is already zero as it was dropped
* earlier. It can be directly added to the pool free list.
*/
- __prep_account_new_huge_page(h, nid);
+ account_new_hugetlb_folio(h, new_folio);
enqueue_hugetlb_folio(h, new_folio);
/*
@@ -2896,38 +2857,26 @@ free_new:
return ret;
}
-int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
+int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list)
{
- struct hstate *h;
- struct folio *folio = page_folio(page);
int ret = -EBUSY;
- /*
- * The page might have been dissolved from under our feet, so make sure
- * to carefully check the state under the lock.
- * Return success when racing as if we dissolved the page ourselves.
- */
- spin_lock_irq(&hugetlb_lock);
- if (folio_test_hugetlb(folio)) {
- h = folio_hstate(folio);
- } else {
- spin_unlock_irq(&hugetlb_lock);
+	/* Avoid disrupting the normal path by needlessly taking hugetlb_lock */
+ if (!folio_test_hugetlb(folio))
return 0;
- }
- spin_unlock_irq(&hugetlb_lock);
/*
* Fence off gigantic pages as there is a cyclic dependency between
* alloc_contig_range and them. Return -ENOMEM as this has the effect
* of bailing out right away without further retrying.
*/
- if (hstate_is_gigantic(h))
+ if (order_is_gigantic(folio_order(folio)))
return -ENOMEM;
if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list))
ret = 0;
else if (!folio_ref_count(folio))
- ret = alloc_and_dissolve_hugetlb_folio(h, folio, list);
+ ret = alloc_and_dissolve_hugetlb_folio(folio, list);
return ret;
}
@@ -2941,7 +2890,6 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
*/
int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
{
- struct hstate *h;
struct folio *folio;
int ret = 0;
@@ -2949,16 +2897,10 @@ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
while (start_pfn < end_pfn) {
folio = pfn_folio(start_pfn);
- if (folio_test_hugetlb(folio)) {
- h = folio_hstate(folio);
- } else {
- start_pfn++;
- continue;
- }
- if (!folio_ref_count(folio)) {
- ret = alloc_and_dissolve_hugetlb_folio(h, folio,
- &isolate_list);
+		/* Avoid disrupting the normal path by needlessly taking hugetlb_lock */
+ if (folio_test_hugetlb(folio) && !folio_ref_count(folio)) {
+ ret = alloc_and_dissolve_hugetlb_folio(folio, &isolate_list);
if (ret)
break;
@@ -3245,7 +3187,8 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
}
/* allocate from next node when distributing huge pages */
- for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_ONLINE]) {
+ for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node,
+ &hugetlb_bootmem_nodes) {
m = alloc_bootmem(h, node, false);
if (!m)
return 0;
@@ -3274,17 +3217,18 @@ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio,
{
enum zone_type zone = zone_idx(folio_zone(folio));
int nid = folio_nid(folio);
+ struct page *page = folio_page(folio, start_page_number);
unsigned long head_pfn = folio_pfn(folio);
unsigned long pfn, end_pfn = head_pfn + end_page_number;
- int ret;
-
- for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) {
- struct page *page = pfn_to_page(pfn);
+ /*
+ * As we marked all tail pages with memblock_reserved_mark_noinit(),
+ * we must initialize them ourselves here.
+ */
+ for (pfn = head_pfn + start_page_number; pfn < end_pfn; page++, pfn++) {
__init_single_page(page, pfn, zone, nid);
prep_compound_tail((struct page *)folio, pfn - head_pfn);
- ret = page_ref_freeze(page, 1);
- VM_BUG_ON(!ret);
+ set_page_count(page, 0);
}
}
@@ -3294,12 +3238,15 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio,
{
int ret;
- /* Prepare folio head */
+ /*
+ * This is an open-coded prep_compound_page() whereby we avoid
+ * walking pages twice by initializing/preparing+freezing them in the
+ * same go.
+ */
__folio_clear_reserved(folio);
__folio_set_head(folio);
ret = folio_ref_freeze(folio, 1);
VM_BUG_ON(!ret);
- /* Initialize the necessary tail struct pages */
hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages);
prep_compound_head((struct page *)folio, huge_page_order(h));
}
@@ -3335,8 +3282,8 @@ static void __init hugetlb_bootmem_init_migratetype(struct folio *folio,
if (folio_test_hugetlb_cma(folio))
init_cma_pageblock(folio_page(folio, i));
else
- set_pageblock_migratetype(folio_page(folio, i),
- MIGRATE_MOVABLE);
+ init_pageblock_migratetype(folio_page(folio, i),
+ MIGRATE_MOVABLE, false);
}
}
@@ -3364,7 +3311,7 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
hugetlb_bootmem_init_migratetype(folio, h);
/* Subdivide locks to achieve better parallel performance */
spin_lock_irqsave(&hugetlb_lock, flags);
- __prep_account_new_huge_page(h, folio_nid(folio));
+ account_new_hugetlb_folio(h, folio);
enqueue_hugetlb_folio(h, folio);
spin_unlock_irqrestore(&hugetlb_lock, flags);
}
@@ -3460,7 +3407,7 @@ static void __init gather_bootmem_prealloc_node(unsigned long nid)
hugetlb_folio_init_vmemmap(folio, h,
HUGETLB_VMEMMAP_RESERVE_PAGES);
- init_new_hugetlb_folio(h, folio);
+ init_new_hugetlb_folio(folio);
if (hugetlb_bootmem_page_prehvo(m))
/*
@@ -3591,7 +3538,14 @@ static void __init hugetlb_pages_alloc_boot_node(unsigned long start, unsigned l
nodes_clear(node_alloc_noretry);
for (i = 0; i < num; ++i) {
- struct folio *folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY],
+ struct folio *folio;
+
+ if (hugetlb_vmemmap_optimizable_size(h) &&
+ (si_mem_available() == 0) && !list_empty(&folio_list)) {
+ prep_and_add_allocated_folios(h, &folio_list);
+ INIT_LIST_HEAD(&folio_list);
+ }
+ folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY],
&node_alloc_noretry, &next_node);
if (!folio)
break;
@@ -3626,10 +3580,9 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h)
unsigned long jiffies_start;
unsigned long jiffies_end;
+ unsigned long remaining;
job.thread_fn = hugetlb_pages_alloc_boot_node;
- job.start = 0;
- job.size = h->max_huge_pages;
/*
* job.max_threads is 25% of the available cpu threads by default.
@@ -3653,10 +3606,29 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h)
}
job.max_threads = hugepage_allocation_threads;
- job.min_chunk = h->max_huge_pages / hugepage_allocation_threads;
jiffies_start = jiffies;
- padata_do_multithreaded(&job);
+ do {
+ remaining = h->max_huge_pages - h->nr_huge_pages;
+
+ job.start = h->nr_huge_pages;
+ job.size = remaining;
+ job.min_chunk = remaining / hugepage_allocation_threads;
+ padata_do_multithreaded(&job);
+
+ if (h->nr_huge_pages == h->max_huge_pages)
+ break;
+
+ /*
+ * Retry only if the vmemmap optimization might have been able to free
+ * some memory back to the system.
+ */
+ if (!hugetlb_vmemmap_optimizable(h))
+ break;
+
+ /* Continue if progress was made in last iteration */
+ } while (remaining != (h->max_huge_pages - h->nr_huge_pages));
+
jiffies_end = jiffies;
pr_info("HugeTLB: allocation took %dms with hugepage_allocation_threads=%ld\n",
@@ -3691,6 +3663,9 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
return;
}
+ if (!h->max_huge_pages)
+ return;
+
/* do node specific alloc */
if (hugetlb_hstate_alloc_pages_specific_nodes(h))
return;
@@ -3709,6 +3684,15 @@ static void __init hugetlb_init_hstates(void)
struct hstate *h, *h2;
for_each_hstate(h) {
+ /*
+ * Always reset to first_memory_node here, even if
+ * next_nid_to_alloc was set before - we can't
+ * reference hugetlb_bootmem_nodes after init, and
+ * first_memory_node is right for all further allocations.
+ */
+ h->next_nid_to_alloc = first_memory_node;
+ h->next_nid_to_free = first_memory_node;
+
/* oversize hugepages were init'ed in early boot */
if (!hstate_is_gigantic(h))
hugetlb_hstate_alloc_pages(h);
@@ -3748,10 +3732,10 @@ static void __init report_hugepages(void)
string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n",
- buf, h->free_huge_pages);
+ buf, h->nr_huge_pages);
if (nrinvalid)
pr_info("HugeTLB: %s page size: %lu invalid page%s discarded\n",
- buf, nrinvalid, nrinvalid > 1 ? "s" : "");
+ buf, nrinvalid, str_plural(nrinvalid));
pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n",
hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf);
}
@@ -4063,7 +4047,7 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst,
prep_compound_page(page, dst->order);
new_folio->mapping = NULL;
- init_new_hugetlb_folio(dst, new_folio);
+ init_new_hugetlb_folio(new_folio);
/* Copy the CMA flag so that it is freed correctly */
if (cma)
folio_set_hugetlb_cma(new_folio);
@@ -4682,6 +4666,7 @@ static int __init hugetlb_init(void)
BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE <
__NR_HPAGEFLAGS);
+ BUILD_BUG_ON_INVALID(HUGETLB_PAGE_ORDER > MAX_FOLIO_ORDER);
if (!hugepages_supported()) {
if (hugetlb_max_hstate || default_hstate_max_huge_pages)
@@ -4765,6 +4750,7 @@ void __init hugetlb_add_hstate(unsigned int order)
}
BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
BUG_ON(order < order_base_2(__NR_USED_SUBPAGE));
+ WARN_ON(order > MAX_FOLIO_ORDER);
h = &hstates[hugetlb_max_hstate++];
__mutex_init(&h->resize_lock, "resize mutex", &h->resize_key);
h->order = order;
@@ -5021,6 +5007,20 @@ static int __init default_hugepagesz_setup(char *s)
}
hugetlb_early_param("default_hugepagesz", default_hugepagesz_setup);
+void __init hugetlb_bootmem_set_nodes(void)
+{
+ int i, nid;
+ unsigned long start_pfn, end_pfn;
+
+ if (!nodes_empty(hugetlb_bootmem_nodes))
+ return;
+
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+ if (end_pfn > start_pfn)
+ node_set(nid, hugetlb_bootmem_nodes);
+ }
+}
+
static bool __hugetlb_bootmem_allocated __initdata;
bool __init hugetlb_bootmem_allocated(void)
@@ -5036,6 +5036,8 @@ void __init hugetlb_bootmem_alloc(void)
if (__hugetlb_bootmem_allocated)
return;
+ hugetlb_bootmem_set_nodes();
+
for (i = 0; i < MAX_NUMNODES; i++)
INIT_LIST_HEAD(&huge_boot_pages[i]);
@@ -5043,7 +5045,6 @@ void __init hugetlb_bootmem_alloc(void)
for_each_hstate(h) {
h->next_nid_to_alloc = first_online_node;
- h->next_nid_to_free = first_online_node;
if (hstate_is_gigantic(h))
hugetlb_hstate_alloc_pages(h);
@@ -5418,26 +5419,40 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
{
if (addr & ~(huge_page_mask(hstate_vma(vma))))
return -EINVAL;
+ return 0;
+}
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
+{
/*
* PMD sharing is only possible for PUD_SIZE-aligned address ranges
* in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
* split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
+ * This function is called in the middle of a VMA split operation, with
+ * MM, VMA and rmap all write-locked to prevent concurrent page table
+ * walks (except hardware and gup_fast()).
*/
+ vma_assert_write_locked(vma);
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
if (addr & ~PUD_MASK) {
- /*
- * hugetlb_vm_op_split is called right before we attempt to
- * split the VMA. We will need to unshare PMDs in the old and
- * new VMAs, so let's unshare before we split.
- */
unsigned long floor = addr & PUD_MASK;
unsigned long ceil = floor + PUD_SIZE;
- if (floor >= vma->vm_start && ceil <= vma->vm_end)
- hugetlb_unshare_pmds(vma, floor, ceil);
+ if (floor >= vma->vm_start && ceil <= vma->vm_end) {
+ /*
+ * Locking:
+ * Use take_locks=false here.
+ * The file rmap lock is already held.
+ * The hugetlb VMA lock can't be taken when we already
+ * hold the file rmap lock, and we don't need it because
+ * its purpose is to synchronize against concurrent page
+ * table walks, which are not possible thanks to the
+ * locks held by our caller.
+ */
+ hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
+ }
}
-
- return 0;
}
static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
@@ -5472,18 +5487,16 @@ const struct vm_operations_struct hugetlb_vm_ops = {
.pagesize = hugetlb_vm_op_pagesize,
};
-static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
+static pte_t make_huge_pte(struct vm_area_struct *vma, struct folio *folio,
bool try_mkwrite)
{
- pte_t entry;
+ pte_t entry = folio_mk_pte(folio, vma->vm_page_prot);
unsigned int shift = huge_page_shift(hstate_vma(vma));
if (try_mkwrite && (vma->vm_flags & VM_WRITE)) {
- entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
- vma->vm_page_prot)));
+ entry = pte_mkwrite_novma(pte_mkdirty(entry));
} else {
- entry = huge_pte_wrprotect(mk_huge_pte(page,
- vma->vm_page_prot));
+ entry = pte_wrprotect(entry);
}
entry = pte_mkyoung(entry);
entry = arch_make_huge_pte(entry, shift, vma->vm_flags);
@@ -5538,7 +5551,7 @@ static void
hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
struct folio *new_folio, pte_t old, unsigned long sz)
{
- pte_t newpte = make_huge_pte(vma, &new_folio->page, true);
+ pte_t newpte = make_huge_pte(vma, new_folio, true);
__folio_mark_uptodate(new_folio);
hugetlb_add_new_anon_rmap(new_folio, vma, addr);
@@ -5595,18 +5608,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
break;
}
- /*
- * If the pagetables are shared don't copy or take references.
- *
- * dst_pte == src_pte is the common case of src/dest sharing.
- * However, src could have 'unshared' and dst shares with
- * another vma. So page_count of ptep page is checked instead
- * to reliably determine whether pte is shared.
- */
- if (page_count(virt_to_page(dst_pte)) > 1) {
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
+ /* If the pagetables are shared, there is nothing to do */
+ if (ptdesc_pmd_is_shared(virt_to_ptdesc(dst_pte))) {
addr |= last_addr_mask;
continue;
}
+#endif
dst_ptl = huge_pte_lock(h, dst, dst_pte);
src_ptl = huge_pte_lockptr(h, src, src_pte);
@@ -5842,17 +5850,17 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long start, unsigned long end,
- struct page *ref_page, zap_flags_t zap_flags)
+ struct folio *folio, zap_flags_t zap_flags)
{
struct mm_struct *mm = vma->vm_mm;
+ const bool folio_provided = !!folio;
unsigned long address;
pte_t *ptep;
pte_t pte;
spinlock_t *ptl;
- struct page *page;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
- bool adjust_reservation = false;
+ bool adjust_reservation;
unsigned long last_addr_mask;
bool force_flush = false;
@@ -5913,14 +5921,13 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
continue;
}
- page = pte_page(pte);
/*
- * If a reference page is supplied, it is because a specific
- * page is being unmapped, not a range. Ensure the page we
- * are about to unmap is the actual page of interest.
+ * If a folio is supplied, it is because a specific
+ * folio is being unmapped, not a range. Ensure the folio we
+ * are about to unmap is the actual folio of interest.
*/
- if (ref_page) {
- if (page != ref_page) {
+ if (folio_provided) {
+ if (folio != page_folio(pte_page(pte))) {
spin_unlock(ptl);
continue;
}
@@ -5930,12 +5937,14 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
* looking like data was lost
*/
set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
+ } else {
+ folio = page_folio(pte_page(pte));
}
pte = huge_ptep_get_and_clear(mm, address, ptep, sz);
tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
if (huge_pte_dirty(pte))
- set_page_dirty(page);
+ folio_mark_dirty(folio);
/* Leave a uffd-wp pte marker if needed */
if (huge_pte_uffd_wp(pte) &&
!(zap_flags & ZAP_FLAG_DROP_MARKER))
@@ -5943,7 +5952,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
make_pte_marker(PTE_MARKER_UFFD_WP),
sz);
hugetlb_count_sub(pages_per_huge_page(h), mm);
- hugetlb_remove_rmap(page_folio(page));
+ hugetlb_remove_rmap(folio);
+ spin_unlock(ptl);
/*
* Restore the reservation for anonymous page, otherwise the
@@ -5951,14 +5961,16 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
* If there we are freeing a surplus, do not set the restore
* reservation bit.
*/
+ adjust_reservation = false;
+
+ spin_lock_irq(&hugetlb_lock);
if (!h->surplus_huge_pages && __vma_private_lock(vma) &&
- folio_test_anon(page_folio(page))) {
- folio_set_hugetlb_restore_reserve(page_folio(page));
+ folio_test_anon(folio)) {
+ folio_set_hugetlb_restore_reserve(folio);
/* Reservation to be adjusted after the spin lock */
adjust_reservation = true;
}
-
- spin_unlock(ptl);
+ spin_unlock_irq(&hugetlb_lock);
/*
* Adjust the reservation for the region that will have the
@@ -5977,16 +5989,17 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
* count will not be incremented by free_huge_folio.
* Act as if we consumed the reservation.
*/
- folio_clear_hugetlb_restore_reserve(page_folio(page));
+ folio_clear_hugetlb_restore_reserve(folio);
else if (rc)
vma_add_reservation(h, vma, address);
}
- tlb_remove_page_size(tlb, page, huge_page_size(h));
+ tlb_remove_page_size(tlb, folio_page(folio, 0),
+ folio_size(folio));
/*
- * Bail out after unmapping reference page if supplied
+ * If we were instructed to unmap a specific folio, we're done.
*/
- if (ref_page)
+ if (folio_provided)
break;
}
tlb_end_vma(tlb, vma);
@@ -6048,7 +6061,7 @@ void __hugetlb_zap_end(struct vm_area_struct *vma,
}
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, struct page *ref_page,
+ unsigned long end, struct folio *folio,
zap_flags_t zap_flags)
{
struct mmu_notifier_range range;
@@ -6060,7 +6073,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
mmu_notifier_invalidate_range_start(&range);
tlb_gather_mmu(&tlb, vma->vm_mm);
- __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
+ __unmap_hugepage_range(&tlb, vma, start, end,
+ folio, zap_flags);
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb);
@@ -6073,7 +6087,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
* same region.
*/
static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
- struct page *page, unsigned long address)
+ struct folio *folio, unsigned long address)
{
struct hstate *h = hstate_vma(vma);
struct vm_area_struct *iter_vma;
@@ -6117,7 +6131,8 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
*/
if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
unmap_hugepage_range(iter_vma, address,
- address + huge_page_size(h), page, 0);
+ address + huge_page_size(h),
+ folio, 0);
}
i_mmap_unlock_write(mapping);
}
@@ -6128,8 +6143,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
* cannot race with other handlers or page migration.
* Keep the pte_same checks anyway to make transition from the mutex easier.
*/
-static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
- struct vm_fault *vmf)
+static vm_fault_t hugetlb_wp(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
@@ -6191,16 +6205,17 @@ retry_avoidcopy:
PageAnonExclusive(&old_folio->page), &old_folio->page);
/*
- * If the process that created a MAP_PRIVATE mapping is about to
- * perform a COW due to a shared page count, attempt to satisfy
- * the allocation without using the existing reserves. The pagecache
- * page is used to determine if the reserve at this address was
- * consumed or not. If reserves were used, a partial faulted mapping
- * at the time of fork() could consume its reserves on COW instead
- * of the full address range.
+ * If the process that created a MAP_PRIVATE mapping is about to perform
+ * a COW due to a shared page count, attempt to satisfy the allocation
+ * without using the existing reserves.
+	 * In order to determine whether this is a COW on a MAP_PRIVATE mapping,
+	 * it is enough to check whether the old_folio is anonymous. This means
+	 * that the reserve for this address was consumed. If reserves were used,
+	 * a partially faulted mapping at the time of fork() could consume its
+	 * reserves on COW instead of the full address range.
*/
if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
- old_folio != pagecache_folio)
+ folio_test_anon(old_folio))
cow_from_owner = true;
folio_get(old_folio);
@@ -6240,8 +6255,7 @@ retry_avoidcopy:
hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- unmap_ref_private(mm, vma, &old_folio->page,
- vmf->address);
+ unmap_ref_private(mm, vma, old_folio, vmf->address);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
hugetlb_vma_lock_read(vma);
@@ -6288,7 +6302,7 @@ retry_avoidcopy:
spin_lock(vmf->ptl);
vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h));
if (likely(vmf->pte && pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte))) {
- pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare);
+ pte_t newpte = make_huge_pte(vma, new_folio, !unshare);
/* Break COW or unshare */
huge_ptep_clear_flush(vma, vmf->address, vmf->pte);
@@ -6404,16 +6418,16 @@ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, unsigned
static vm_fault_t hugetlb_no_page(struct address_space *mapping,
struct vm_fault *vmf)
{
+ u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff);
+ bool new_folio, new_anon_folio = false;
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
struct hstate *h = hstate_vma(vma);
vm_fault_t ret = VM_FAULT_SIGBUS;
- int anon_rmap = 0;
- unsigned long size;
+ bool folio_locked = true;
struct folio *folio;
+ unsigned long size;
pte_t new_pte;
- bool new_folio, new_pagecache_folio = false;
- u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff);
/*
* Currently, we are forced to kill the process in the event the
@@ -6512,10 +6526,9 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
ret = VM_FAULT_SIGBUS;
goto out;
}
- new_pagecache_folio = true;
} else {
+ new_anon_folio = true;
folio_lock(folio);
- anon_rmap = 1;
}
} else {
/*
@@ -6564,11 +6577,11 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
if (!pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), vmf->orig_pte))
goto backout;
- if (anon_rmap)
+ if (new_anon_folio)
hugetlb_add_new_anon_rmap(folio, vma, vmf->address);
else
hugetlb_add_file_rmap(folio);
- new_pte = make_huge_pte(vma, &folio->page, vma->vm_flags & VM_SHARED);
+ new_pte = make_huge_pte(vma, folio, vma->vm_flags & VM_SHARED);
/*
* If this pte was previously wr-protected, keep it wr-protected even
* if populated.
@@ -6579,8 +6592,16 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
hugetlb_count_add(pages_per_huge_page(h), mm);
if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+ /*
+ * No need to keep file folios locked. See comment in
+ * hugetlb_fault().
+ */
+ if (!new_anon_folio) {
+ folio_locked = false;
+ folio_unlock(folio);
+ }
/* Optimization, do the COW without a second fault */
- ret = hugetlb_wp(folio, vmf);
+ ret = hugetlb_wp(vmf);
}
spin_unlock(vmf->ptl);
@@ -6593,7 +6614,8 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
if (new_folio)
folio_set_hugetlb_migratable(folio);
- folio_unlock(folio);
+ if (folio_locked)
+ folio_unlock(folio);
out:
hugetlb_vma_unlock_read(vma);
@@ -6610,7 +6632,8 @@ out:
backout:
spin_unlock(vmf->ptl);
backout_unlocked:
- if (new_folio && !new_pagecache_folio)
+ /* We only need to restore reservations for private mappings */
+ if (new_anon_folio)
restore_reserve_on_error(h, vma, vmf->address, folio);
folio_unlock(folio);
@@ -6648,10 +6671,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
vm_fault_t ret;
u32 hash;
struct folio *folio = NULL;
- struct folio *pagecache_folio = NULL;
struct hstate *h = hstate_vma(vma);
struct address_space *mapping;
- int need_wait_lock = 0;
+ bool need_wait_lock = false;
struct vm_fault vmf = {
.vma = vma,
.address = address & huge_page_mask(h),
@@ -6717,15 +6739,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ret = 0;
- /*
- * vmf.orig_pte could be a migration/hwpoison vmf.orig_pte at this
- * point, so this check prevents the kernel from going below assuming
- * that we have an active hugepage in pagecache. This goto expects
- * the 2nd page fault, and is_hugetlb_entry_(migration|hwpoisoned)
- * check will properly handle it.
- */
+ /* Not present, either a migration or a hwpoisoned entry */
if (!pte_present(vmf.orig_pte)) {
- if (unlikely(is_hugetlb_entry_migration(vmf.orig_pte))) {
+ if (is_hugetlb_entry_migration(vmf.orig_pte)) {
/*
* Release the hugetlb fault lock now, but retain
* the vma lock, because it is needed to guard the
@@ -6736,7 +6752,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
migration_entry_wait_huge(vma, vmf.address, vmf.pte);
return 0;
- } else if (unlikely(is_hugetlb_entry_hwpoisoned(vmf.orig_pte)))
+ } else if (is_hugetlb_entry_hwpoisoned(vmf.orig_pte))
ret = VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
goto out_mutex;
@@ -6746,8 +6762,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* If we are going to COW/unshare the mapping later, we examine the
* pending reservations for this page now. This will ensure that any
* allocations necessary to record that reservation occur outside the
- * spinlock. Also lookup the pagecache page now as it is used to
- * determine if a reservation has been consumed.
+ * spinlock.
*/
if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
!(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) {
@@ -6757,11 +6772,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
/* Just decrements count, does not deallocate */
vma_end_reservation(h, vma, vmf.address);
-
- pagecache_folio = filemap_lock_hugetlb_folio(h, mapping,
- vmf.pgoff);
- if (IS_ERR(pagecache_folio))
- pagecache_folio = NULL;
}
vmf.ptl = huge_pte_lock(h, mm, vmf.pte);
@@ -6775,10 +6785,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
(flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) {
if (!userfaultfd_wp_async(vma)) {
spin_unlock(vmf.ptl);
- if (pagecache_folio) {
- folio_unlock(pagecache_folio);
- folio_put(pagecache_folio);
- }
hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
return handle_userfault(&vmf, VM_UFFD_WP);
@@ -6790,24 +6796,24 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/* Fallthrough to CoW */
}
- /*
- * hugetlb_wp() requires page locks of pte_page(vmf.orig_pte) and
- * pagecache_folio, so here we need take the former one
- * when folio != pagecache_folio or !pagecache_folio.
- */
- folio = page_folio(pte_page(vmf.orig_pte));
- if (folio != pagecache_folio)
- if (!folio_trylock(folio)) {
- need_wait_lock = 1;
- goto out_ptl;
- }
-
- folio_get(folio);
-
if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
if (!huge_pte_write(vmf.orig_pte)) {
- ret = hugetlb_wp(pagecache_folio, &vmf);
- goto out_put_page;
+ /*
+			 * Anonymous folios need to be locked since hugetlb_wp()
+ * checks whether we can re-use the folio exclusively
+ * for us in case we are the only user of it.
+ */
+ folio = page_folio(pte_page(vmf.orig_pte));
+ if (folio_test_anon(folio) && !folio_trylock(folio)) {
+ need_wait_lock = true;
+ goto out_ptl;
+ }
+ folio_get(folio);
+ ret = hugetlb_wp(&vmf);
+ if (folio_test_anon(folio))
+ folio_unlock(folio);
+ folio_put(folio);
+ goto out_ptl;
} else if (likely(flags & FAULT_FLAG_WRITE)) {
vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte);
}
@@ -6816,17 +6822,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte,
flags & FAULT_FLAG_WRITE))
update_mmu_cache(vma, vmf.address, vmf.pte);
-out_put_page:
- if (folio != pagecache_folio)
- folio_unlock(folio);
- folio_put(folio);
out_ptl:
spin_unlock(vmf.ptl);
-
- if (pagecache_folio) {
- folio_unlock(pagecache_folio);
- folio_put(pagecache_folio);
- }
out_mutex:
hugetlb_vma_unlock_read(vma);
@@ -6839,11 +6836,16 @@ out_mutex:
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
/*
- * Generally it's safe to hold refcount during waiting page lock. But
- * here we just wait to defer the next page fault to avoid busy loop and
- * the page is not used after unlocked before returning from the current
- * page fault. So we are safe from accessing freed page, even if we wait
- * here without taking refcount.
+	 * hugetlb_wp() drops all the locks except the folio lock before trying
+	 * to unmap the folio from other processes. During that window, if
+	 * another process mapping that folio faults in, it will take the mutex
+	 * and then wait on folio_lock, causing an ABBA deadlock.
+	 * Use trylock instead and bail out if we fail.
+	 *
+	 * Ideally, we would hold a refcount on the folio we wait for, but we do
+	 * not want to use the folio after it becomes unlocked; we only want to
+	 * wait for it to become unlocked, so hopefully the next fault succeeds
+	 * on the trylock.
*/
if (need_wait_lock)
folio_wait_locked(folio);
@@ -6939,6 +6941,11 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
folio = alloc_hugetlb_folio(dst_vma, dst_addr, false);
if (IS_ERR(folio)) {
+ pte_t *actual_pte = hugetlb_walk(dst_vma, dst_addr, PMD_SIZE);
+ if (actual_pte) {
+ ret = -EEXIST;
+ goto out;
+ }
ret = -ENOMEM;
goto out;
}
@@ -7053,7 +7060,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
* For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
* with wp flag set, don't set pte write bit.
*/
- _dst_pte = make_huge_pte(dst_vma, &folio->page,
+ _dst_pte = make_huge_pte(dst_vma, folio,
!wp_enabled && !(is_continue && !vm_shared));
/*
* Always mark UFFDIO_COPY page dirty; note that this may not be
@@ -7163,11 +7170,11 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
/* Nothing to do. */
} else if (unlikely(is_hugetlb_entry_migration(pte))) {
swp_entry_t entry = pte_to_swp_entry(pte);
- struct page *page = pfn_swap_entry_to_page(entry);
+ struct folio *folio = pfn_swap_entry_folio(entry);
pte_t newpte = pte;
if (is_writable_migration_entry(entry)) {
- if (PageAnon(page))
+ if (folio_test_anon(folio))
entry = make_readable_exclusive_migration_entry(
swp_offset(entry));
else
@@ -7215,6 +7222,8 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
psize);
}
spin_unlock(ptl);
+
+ cond_resched();
}
/*
* Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
@@ -7241,8 +7250,15 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
return pages > 0 ? (pages << h->order) : pages;
}
-/* Return true if reservation was successful, false otherwise. */
-bool hugetlb_reserve_pages(struct inode *inode,
+/*
+ * Update the reservation map for the range [from, to].
+ *
+ * Returns the number of entries that would be added to the reservation map
+ * associated with the range [from, to]. This number is greater or equal to
+ * zero. -EINVAL or -ENOMEM is returned in case of any errors.
+ */
+
+long hugetlb_reserve_pages(struct inode *inode,
long from, long to,
struct vm_area_struct *vma,
vm_flags_t vm_flags)
@@ -7257,7 +7273,7 @@ bool hugetlb_reserve_pages(struct inode *inode,
/* This should never happen */
if (from > to) {
VM_WARN(1, "%s called with a negative range\n", __func__);
- return false;
+ return -EINVAL;
}
/*
@@ -7272,7 +7288,7 @@ bool hugetlb_reserve_pages(struct inode *inode,
* without using reserves
*/
if (vm_flags & VM_NORESERVE)
- return true;
+ return 0;
/*
* Shared mappings base their reservation on the number of pages that
@@ -7379,7 +7395,7 @@ bool hugetlb_reserve_pages(struct inode *inode,
hugetlb_cgroup_put_rsvd_cgroup(h_cg);
}
}
- return true;
+ return chg;
out_put_pages:
spool_resv = chg - gbl_reserve;
@@ -7407,7 +7423,7 @@ out_err:
kref_put(&resv_map->refs, resv_map_release);
set_vma_resv_map(vma, NULL);
}
- return false;
+ return chg < 0 ? chg : add < 0 ? add : -EINVAL;
}
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
@@ -7462,8 +7478,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
unsigned long s_end = sbase + PUD_SIZE;
/* Allow segments to share if only one is marked locked */
- unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK;
- unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK;
+ vm_flags_t vm_flags = vma->vm_flags & ~VM_LOCKED_MASK;
+ vm_flags_t svm_flags = svma->vm_flags & ~VM_LOCKED_MASK;
/*
* match the virtual addresses, permission and the alignment of the
@@ -7602,10 +7618,17 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
hugetlb_vma_assert_locked(vma);
if (sz != PMD_SIZE)
return 0;
- if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep)))
+ if (!ptdesc_pmd_is_shared(virt_to_ptdesc(ptep)))
return 0;
pud_clear(pud);
+ /*
+ * Once our caller drops the rmap lock, some other process might be
+ * using this page table as a normal, non-hugetlb page table.
+ * Wait for pending gup_fast() in other threads to finish before letting
+ * that happen.
+ */
+ tlb_remove_table_sync_one();
ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
mm_dec_nr_pmds(mm);
return 1;
@@ -7831,7 +7854,7 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
struct hstate *h = folio_hstate(old_folio);
hugetlb_cgroup_migrate(old_folio, new_folio);
- set_page_owner_migrate_reason(&new_folio->page, reason);
+ folio_set_owner_migrate_reason(new_folio, reason);
/*
* transfer temporary state of the new hugetlb folio. This is
@@ -7876,9 +7899,16 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
spin_unlock_irq(&hugetlb_lock);
}
+/*
+ * If @take_locks is false, the caller must ensure that no concurrent page table
+ * access can happen (except for gup_fast() and hardware page walks).
+ * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
+ * concurrent page fault handling) and the file rmap lock.
+ */
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long start,
- unsigned long end)
+ unsigned long end,
+ bool take_locks)
{
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
@@ -7902,8 +7932,12 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
start, end);
mmu_notifier_invalidate_range_start(&range);
- hugetlb_vma_lock_write(vma);
- i_mmap_lock_write(vma->vm_file->f_mapping);
+ if (take_locks) {
+ hugetlb_vma_lock_write(vma);
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ } else {
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+ }
for (address = start; address < end; address += PUD_SIZE) {
ptep = hugetlb_walk(vma, address, sz);
if (!ptep)
@@ -7913,8 +7947,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
spin_unlock(ptl);
}
flush_hugetlb_tlb_range(vma, start, end);
- i_mmap_unlock_write(vma->vm_file->f_mapping);
- hugetlb_vma_unlock_write(vma);
+ if (take_locks) {
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
+ }
/*
* No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
* Documentation/mm/mmu_notifier.rst.
@@ -7929,5 +7965,20 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
{
hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
- ALIGN_DOWN(vma->vm_end, PUD_SIZE));
+ ALIGN_DOWN(vma->vm_end, PUD_SIZE),
+ /* take_locks = */ true);
+}
+
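
[Illustrative sketch, not part of the diff] The take_locks parameter lets a caller that already holds the hugetlb VMA lock and the file rmap lock reuse the unshare path without re-taking them; hugetlb_unshare_all_pmds() below shows the take_locks=true case. A hedged sketch of the other convention; the example_ function is hypothetical:

static void example_unshare_already_locked(struct vm_area_struct *vma,
					   unsigned long start, unsigned long end)
{
	/*
	 * The caller has already taken the hugetlb VMA lock and the i_mmap
	 * write lock and excluded concurrent page table access, so the helper
	 * only asserts the rmap lock rather than taking it.
	 */
	hugetlb_unshare_pmds(vma, start, end, /* take_locks = */ false);
}
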
+/*
+ * For hugetlb, mremap() is an odd edge case - while the VMA copying is
+ * performed, we permit both the old and new VMAs to reference the same
+ * reservation.
+ *
+ * We fix this up after the operation succeeds, or if a newly allocated VMA
+ * is closed as a result of a failure to allocate memory.
+ */
+void fixup_hugetlb_reservations(struct vm_area_struct *vma)
+{
+ if (is_vm_hugetlb_page(vma))
+ clear_vma_resv_huge_pages(vma);
}
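
[Illustrative sketch, not part of the diff] Because fixup_hugetlb_reservations() performs the is_vm_hugetlb_page() check itself, an mremap-style caller can invoke it unconditionally on the affected VMA once the move has succeeded or the newly created VMA is being torn down. A hedged sketch; the example_ function is hypothetical:

static void example_finish_move_vma(struct vm_area_struct *vma)
{
	/* No-op unless this is a hugetlb VMA still sharing a reservation. */
	fixup_hugetlb_reservations(vma);
}
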
diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c
index e0f2d5c3a84c..e8e4dc7182d5 100644
--- a/mm/hugetlb_cma.c
+++ b/mm/hugetlb_cma.c
@@ -26,11 +26,10 @@ void hugetlb_cma_free_folio(struct folio *folio)
}
-struct folio *hugetlb_cma_alloc_folio(struct hstate *h, gfp_t gfp_mask,
+struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
int node;
- int order = huge_page_order(h);
struct folio *folio = NULL;
if (hugetlb_cma[nid])
@@ -66,7 +65,7 @@ hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid, bool node_exact)
if (node_exact)
return NULL;
- for_each_online_node(node) {
+ for_each_node_mask(node, hugetlb_bootmem_nodes) {
cma = hugetlb_cma[node];
if (!cma || node == *nid)
continue;
@@ -153,11 +152,13 @@ void __init hugetlb_cma_reserve(int order)
if (!hugetlb_cma_size)
return;
+ hugetlb_bootmem_set_nodes();
+
for (nid = 0; nid < MAX_NUMNODES; nid++) {
if (hugetlb_cma_size_in_node[nid] == 0)
continue;
- if (!node_online(nid)) {
+ if (!node_isset(nid, hugetlb_bootmem_nodes)) {
pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
hugetlb_cma_size_in_node[nid] = 0;
@@ -190,13 +191,14 @@ void __init hugetlb_cma_reserve(int order)
* If 3 GB area is requested on a machine with 4 numa nodes,
* let's allocate 1 GB on first three nodes and ignore the last one.
*/
- per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
+ per_node = DIV_ROUND_UP(hugetlb_cma_size,
+ nodes_weight(hugetlb_bootmem_nodes));
pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
}
reserved = 0;
- for_each_online_node(nid) {
+ for_each_node_mask(nid, hugetlb_bootmem_nodes) {
int res;
char name[CMA_MAX_NAME];
diff --git a/mm/hugetlb_cma.h b/mm/hugetlb_cma.h
index f7d7fb9880a2..2c2ec8a7e134 100644
--- a/mm/hugetlb_cma.h
+++ b/mm/hugetlb_cma.h
@@ -4,7 +4,7 @@
#ifdef CONFIG_CMA
void hugetlb_cma_free_folio(struct folio *folio);
-struct folio *hugetlb_cma_alloc_folio(struct hstate *h, gfp_t gfp_mask,
+struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask,
int nid, nodemask_t *nodemask);
struct huge_bootmem_page *hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid,
bool node_exact);
@@ -18,8 +18,8 @@ static inline void hugetlb_cma_free_folio(struct folio *folio)
{
}
-static inline struct folio *hugetlb_cma_alloc_folio(struct hstate *h,
- gfp_t gfp_mask, int nid, nodemask_t *nodemask)
+static inline struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask)
{
return NULL;
}
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 27245e86df25..ba0fb1b6a5a8 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -166,7 +166,7 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
VM_BUG_ON(!PAGE_ALIGNED(start | end));
mmap_read_lock(&init_mm);
- ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops,
+ ret = walk_kernel_page_table_range(start, end, &vmemmap_remap_ops,
NULL, walk);
mmap_read_unlock(&init_mm);
if (ret)
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 7ecaa1900137..a11222572f97 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -7,8 +7,96 @@
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/hugetlb.h>
+#include <linux/page-flags.h>
+#include <linux/memcontrol.h>
#include "internal.h"
+static u32 hwpoison_filter_enable;
+static u32 hwpoison_filter_dev_major = ~0U;
+static u32 hwpoison_filter_dev_minor = ~0U;
+static u64 hwpoison_filter_flags_mask;
+static u64 hwpoison_filter_flags_value;
+
+static int hwpoison_filter_dev(struct page *p)
+{
+ struct folio *folio = page_folio(p);
+ struct address_space *mapping;
+ dev_t dev;
+
+ if (hwpoison_filter_dev_major == ~0U &&
+ hwpoison_filter_dev_minor == ~0U)
+ return 0;
+
+ mapping = folio_mapping(folio);
+ if (mapping == NULL || mapping->host == NULL)
+ return -EINVAL;
+
+ dev = mapping->host->i_sb->s_dev;
+ if (hwpoison_filter_dev_major != ~0U &&
+ hwpoison_filter_dev_major != MAJOR(dev))
+ return -EINVAL;
+ if (hwpoison_filter_dev_minor != ~0U &&
+ hwpoison_filter_dev_minor != MINOR(dev))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int hwpoison_filter_flags(struct page *p)
+{
+ if (!hwpoison_filter_flags_mask)
+ return 0;
+
+ if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
+ hwpoison_filter_flags_value)
+ return 0;
+ else
+ return -EINVAL;
+}
+
+/*
+ * This allows stress tests to limit test scope to a collection of tasks
+ * by putting them under some memcg. This prevents killing unrelated/important
+ * processes such as /sbin/init. Note that the target task may share clean
+ * pages with init (e.g. libc text), which is harmless. If the target task
+ * shares _dirty_ pages with another task B, the test scheme must make sure B
+ * is also included in the memcg. Lastly, due to race conditions, this filter
+ * can only guarantee that the page either belongs to the memcg tasks, or is
+ * a freed page.
+ */
+#ifdef CONFIG_MEMCG
+static u64 hwpoison_filter_memcg;
+static int hwpoison_filter_task(struct page *p)
+{
+ if (!hwpoison_filter_memcg)
+ return 0;
+
+ if (page_cgroup_ino(p) != hwpoison_filter_memcg)
+ return -EINVAL;
+
+ return 0;
+}
+#else
+static int hwpoison_filter_task(struct page *p) { return 0; }
+#endif
+
+static int hwpoison_filter(struct page *p)
+{
+ if (!hwpoison_filter_enable)
+ return 0;
+
+ if (hwpoison_filter_dev(p))
+ return -EINVAL;
+
+ if (hwpoison_filter_flags(p))
+ return -EINVAL;
+
+ if (hwpoison_filter_task(p))
+ return -EINVAL;
+
+ return 0;
+}
+
static struct dentry *hwpoison_dir;
static int hwpoison_inject(void *data, u64 val)
@@ -67,6 +155,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n");
static void __exit pfn_inject_exit(void)
{
hwpoison_filter_enable = 0;
+ hwpoison_filter_unregister();
debugfs_remove_recursive(hwpoison_dir);
}
@@ -105,6 +194,8 @@ static int __init pfn_inject_init(void)
&hwpoison_filter_memcg);
#endif
+ hwpoison_filter_register(hwpoison_filter);
+
return 0;
}
diff --git a/mm/internal.h b/mm/internal.h
index 5c7a2b43ad76..1561fc2ff5b8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -149,7 +149,7 @@ static inline void *folio_raw_mapping(const struct folio *folio)
{
unsigned long mapping = (unsigned long)folio->mapping;
- return (void *)(mapping & ~PAGE_MAPPING_FLAGS);
+ return (void *)(mapping & ~FOLIO_MAPPING_FLAGS);
}
/*
@@ -164,7 +164,7 @@ static inline void *folio_raw_mapping(const struct folio *folio)
*/
static inline int mmap_file(struct file *file, struct vm_area_struct *vma)
{
- int err = call_mmap(file, vma);
+ int err = vfs_mmap(file, vma);
if (likely(!err))
return 0;
@@ -202,94 +202,106 @@ static inline void vma_close(struct vm_area_struct *vma)
/* Flags for folio_pte_batch(). */
typedef int __bitwise fpb_t;
-/* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */
-#define FPB_IGNORE_DIRTY ((__force fpb_t)BIT(0))
+/* Compare PTEs respecting the dirty bit. */
+#define FPB_RESPECT_DIRTY ((__force fpb_t)BIT(0))
-/* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */
-#define FPB_IGNORE_SOFT_DIRTY ((__force fpb_t)BIT(1))
+/* Compare PTEs respecting the soft-dirty bit. */
+#define FPB_RESPECT_SOFT_DIRTY ((__force fpb_t)BIT(1))
+
+/* Compare PTEs respecting the writable bit. */
+#define FPB_RESPECT_WRITE ((__force fpb_t)BIT(2))
+
+/*
+ * Merge PTE write bits: if any PTE in the batch is writable, modify the
+ * PTE at @ptentp to be writable.
+ */
+#define FPB_MERGE_WRITE ((__force fpb_t)BIT(3))
+
+/*
+ * Merge PTE young and dirty bits: if any PTE in the batch is young or dirty,
+ * modify the PTE at @ptentp to be young or dirty, respectively.
+ */
+#define FPB_MERGE_YOUNG_DIRTY ((__force fpb_t)BIT(4))
static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
{
- if (flags & FPB_IGNORE_DIRTY)
+ if (!(flags & FPB_RESPECT_DIRTY))
pte = pte_mkclean(pte);
- if (likely(flags & FPB_IGNORE_SOFT_DIRTY))
+ if (likely(!(flags & FPB_RESPECT_SOFT_DIRTY)))
pte = pte_clear_soft_dirty(pte);
- return pte_wrprotect(pte_mkold(pte));
+ if (likely(!(flags & FPB_RESPECT_WRITE)))
+ pte = pte_wrprotect(pte);
+ return pte_mkold(pte);
}
/**
- * folio_pte_batch - detect a PTE batch for a large folio
+ * folio_pte_batch_flags - detect a PTE batch for a large folio
* @folio: The large folio to detect a PTE batch for.
- * @addr: The user virtual address the first page is mapped at.
- * @start_ptep: Page table pointer for the first entry.
- * @pte: Page table entry for the first page.
+ * @vma: The VMA. Only relevant with FPB_MERGE_WRITE, otherwise can be NULL.
+ * @ptep: Page table pointer for the first entry.
+ * @ptentp: Pointer to a COPY of the first page table entry whose flags this
+ * function updates based on @flags if appropriate.
* @max_nr: The maximum number of table entries to consider.
* @flags: Flags to modify the PTE batch semantics.
- * @any_writable: Optional pointer to indicate whether any entry except the
- * first one is writable.
- * @any_young: Optional pointer to indicate whether any entry except the
- * first one is young.
- * @any_dirty: Optional pointer to indicate whether any entry except the
- * first one is dirty.
*
* Detect a PTE batch: consecutive (present) PTEs that map consecutive
- * pages of the same large folio.
+ * pages of the same large folio in a single VMA and a single page table.
*
* All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
- * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and
- * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY).
+ * the accessed bit, writable bit, dirty bit (unless FPB_RESPECT_DIRTY is set)
+ * and soft-dirty bit (unless FPB_RESPECT_SOFT_DIRTY is set).
+ *
+ * @ptep must map any page of the folio. max_nr must be at least one and
+ * must be limited by the caller so scanning cannot exceed a single VMA and
+ * a single page table.
*
- * start_ptep must map any page of the folio. max_nr must be at least one and
- * must be limited by the caller so scanning cannot exceed a single page table.
+ * Depending on the FPB_MERGE_* flags, the pte stored at @ptentp will
+ * be updated: it's crucial that a pointer to a COPY of the first
+ * page table entry, obtained through ptep_get(), is provided as @ptentp.
+ *
+ * This function will be inlined to optimize based on the input parameters;
+ * consider using folio_pte_batch() instead if applicable.
*
* Return: the number of table entries in the batch.
*/
-static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
- pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
- bool *any_writable, bool *any_young, bool *any_dirty)
+static inline unsigned int folio_pte_batch_flags(struct folio *folio,
+ struct vm_area_struct *vma, pte_t *ptep, pte_t *ptentp,
+ unsigned int max_nr, fpb_t flags)
{
- pte_t expected_pte, *ptep;
- bool writable, young, dirty;
- int nr, cur_nr;
-
- if (any_writable)
- *any_writable = false;
- if (any_young)
- *any_young = false;
- if (any_dirty)
- *any_dirty = false;
+ bool any_writable = false, any_young = false, any_dirty = false;
+ pte_t expected_pte, pte = *ptentp;
+ unsigned int nr, cur_nr;
VM_WARN_ON_FOLIO(!pte_present(pte), folio);
VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);
+ /*
+ * Ensure this is a pointer to a copy, not a pointer into a page table.
+ * If this is a stack value, it won't be a valid virtual address, but
+ * that's fine because it also cannot be pointing into the page table.
+ */
+ VM_WARN_ON(virt_addr_valid(ptentp) && PageTable(virt_to_page(ptentp)));
/* Limit max_nr to the actual remaining PFNs in the folio we could batch. */
max_nr = min_t(unsigned long, max_nr,
folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte));
- nr = pte_batch_hint(start_ptep, pte);
+ nr = pte_batch_hint(ptep, pte);
expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
- ptep = start_ptep + nr;
+ ptep = ptep + nr;
while (nr < max_nr) {
pte = ptep_get(ptep);
- if (any_writable)
- writable = !!pte_write(pte);
- if (any_young)
- young = !!pte_young(pte);
- if (any_dirty)
- dirty = !!pte_dirty(pte);
- pte = __pte_batch_clear_ignored(pte, flags);
- if (!pte_same(pte, expected_pte))
+ if (!pte_same(__pte_batch_clear_ignored(pte, flags), expected_pte))
break;
- if (any_writable)
- *any_writable |= writable;
- if (any_young)
- *any_young |= young;
- if (any_dirty)
- *any_dirty |= dirty;
+ if (flags & FPB_MERGE_WRITE)
+ any_writable |= pte_write(pte);
+ if (flags & FPB_MERGE_YOUNG_DIRTY) {
+ any_young |= pte_young(pte);
+ any_dirty |= pte_dirty(pte);
+ }
cur_nr = pte_batch_hint(ptep, pte);
expected_pte = pte_advance_pfn(expected_pte, cur_nr);
@@ -297,9 +309,19 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
nr += cur_nr;
}
+ if (any_writable)
+ *ptentp = pte_mkwrite(*ptentp, vma);
+ if (any_young)
+ *ptentp = pte_mkyoung(*ptentp);
+ if (any_dirty)
+ *ptentp = pte_mkdirty(*ptentp);
+
return min(nr, max_nr);
}
+unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte,
+ unsigned int max_nr);
+
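
[Illustrative sketch, not part of the diff] The calling convention described above passes a copy of the first PTE, obtained via ptep_get(), by pointer so that the FPB_MERGE_* flags can fold the batch's write/young/dirty bits back into it. A hedged sketch of both entry points; the example_ functions are hypothetical:

static unsigned int example_batch(struct folio *folio, struct vm_area_struct *vma,
				  pte_t *ptep, unsigned int max_nr)
{
	pte_t pte = ptep_get(ptep);	/* a copy, never a pointer into the table */
	unsigned int nr;

	nr = folio_pte_batch_flags(folio, vma, ptep, &pte, max_nr,
				   FPB_MERGE_WRITE | FPB_MERGE_YOUNG_DIRTY);
	/*
	 * On return, pte carries the merged write/young/dirty bits if any PTE
	 * in the batch had them; nr is the number of batched entries.
	 */
	return nr;
}

static unsigned int example_batch_simple(struct folio *folio, pte_t *ptep,
					 pte_t pte, unsigned int max_nr)
{
	/* No flag handling needed: use the out-of-line wrapper instead. */
	return folio_pte_batch(folio, ptep, pte, max_nr);
}
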
/**
* pte_move_swp_offset - Move the swap entry offset field of a swap pte
* forward or backward by delta
@@ -430,11 +452,13 @@ void unmap_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
struct zap_details *details);
+void zap_page_range_single_batched(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long addr,
+ unsigned long size, struct zap_details *details);
int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
gfp_t gfp);
-void page_cache_ra_order(struct readahead_control *, struct file_ra_state *,
- unsigned int order);
+void page_cache_ra_order(struct readahead_control *, struct file_ra_state *);
void force_page_cache_ra(struct readahead_control *, unsigned long nr);
static inline void force_page_cache_readahead(struct address_space *mapping,
struct file *file, pgoff_t index, unsigned long nr_to_read)
@@ -514,6 +538,16 @@ extern unsigned long highest_memmap_pfn;
bool folio_isolate_lru(struct folio *folio);
void folio_putback_lru(struct folio *folio);
extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);
+#ifdef CONFIG_NUMA
+int user_proactive_reclaim(char *buf,
+ struct mem_cgroup *memcg, pg_data_t *pgdat);
+#else
+static inline int user_proactive_reclaim(char *buf,
+ struct mem_cgroup *memcg, pg_data_t *pgdat)
+{
+ return 0;
+}
+#endif
/*
* in mm/rmap.c:
@@ -721,6 +755,7 @@ static inline void folio_set_order(struct folio *folio, unsigned int order)
{
if (WARN_ON_ONCE(!order || !folio_test_large(folio)))
return;
+ VM_WARN_ON_ONCE(order > MAX_FOLIO_ORDER);
folio->_flags_1 = (folio->_flags_1 & ~0xffUL) | order;
#ifdef NR_PAGES_IN_LARGE_FOLIO
@@ -808,6 +843,10 @@ static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int ord
#define alloc_frozen_pages(...) \
alloc_hooks(alloc_frozen_pages_noprof(__VA_ARGS__))
+struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order);
+#define alloc_frozen_pages_nolock(...) \
+ alloc_hooks(alloc_frozen_pages_nolock_noprof(__VA_ARGS__))
+
extern void zone_pcp_reset(struct zone *zone);
extern void zone_pcp_disable(struct zone *zone);
extern void zone_pcp_enable(struct zone *zone);
@@ -818,7 +857,8 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
int nid, bool exact_nid);
void memmap_init_range(unsigned long, int, unsigned long, unsigned long,
- unsigned long, enum meminit_context, struct vmem_altmap *, int);
+ unsigned long, enum meminit_context, struct vmem_altmap *, int,
+ bool);
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
@@ -910,7 +950,7 @@ static inline void init_cma_pageblock(struct page *page)
int find_suitable_fallback(struct free_area *area, unsigned int order,
- int migratetype, bool claim_only, bool *claim_block);
+ int migratetype, bool claimable);
static inline bool free_area_empty(struct free_area *area, int migratetype)
{
@@ -926,8 +966,8 @@ extern long populate_vma_page_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end, int *locked);
extern long faultin_page_range(struct mm_struct *mm, unsigned long start,
unsigned long end, bool write, int *locked);
-extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
- unsigned long bytes);
+bool mlock_future_ok(const struct mm_struct *mm, vm_flags_t vm_flags,
+ unsigned long bytes);
/*
* NOTE: This function can't tell whether the folio is "fully mapped" in the
@@ -1116,6 +1156,8 @@ DECLARE_STATIC_KEY_TRUE(deferred_pages);
bool __init deferred_grow_zone(struct zone *zone, unsigned int order);
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+void init_deferred_page(unsigned long pfn, int nid);
+
enum mminit_level {
MMINIT_WARNING,
MMINIT_VERIFY,
@@ -1190,14 +1232,10 @@ static inline bool node_reclaim_enabled(void)
#ifdef CONFIG_MEMORY_FAILURE
int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill);
void shake_folio(struct folio *folio);
-extern int hwpoison_filter(struct page *p);
-
-extern u32 hwpoison_filter_dev_major;
-extern u32 hwpoison_filter_dev_minor;
-extern u64 hwpoison_filter_flags_mask;
-extern u64 hwpoison_filter_flags_value;
-extern u64 hwpoison_filter_memcg;
-extern u32 hwpoison_filter_enable;
+typedef int hwpoison_filter_func_t(struct page *p);
+void hwpoison_filter_register(hwpoison_filter_func_t *filter);
+void hwpoison_filter_unregister(void);
+
#define MAGIC_HWPOISON 0x48575053U /* HWPS */
void SetPageHWPoisonTakenOff(struct page *page);
void ClearPageHWPoisonTakenOff(struct page *page);
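
[Illustrative sketch, not part of the diff] With the filter turned into a runtime-registered hook, mm/hwpoison-inject.c above registers its predicate at init and removes it at exit. A hedged restatement of that pattern with hypothetical names; returning non-zero (e.g. -EINVAL) from the predicate filters the page out of injection:

static int example_hwpoison_filter(struct page *p)
{
	return 0;	/* 0: let the poison injection proceed */
}

static int __init example_filter_init(void)
{
	hwpoison_filter_register(example_hwpoison_filter);
	return 0;
}

static void __exit example_filter_exit(void)
{
	hwpoison_filter_unregister();
}
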
@@ -1222,7 +1260,6 @@ extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long,
unsigned long, unsigned long);
extern void set_pageblock_order(void);
-struct folio *alloc_migrate_folio(struct folio *src, unsigned long private);
unsigned long reclaim_pages(struct list_head *folio_list);
unsigned int reclaim_clean_pages_from_list(struct zone *zone,
struct list_head *folio_list);
@@ -1297,11 +1334,6 @@ extern const struct trace_print_flags pageflag_names[];
extern const struct trace_print_flags vmaflag_names[];
extern const struct trace_print_flags gfpflag_names[];
-static inline bool is_migrate_highatomic(enum migratetype migratetype)
-{
- return migratetype == MIGRATE_HIGHATOMIC;
-}
-
void setup_zone_pageset(struct zone *zone);
struct migration_target_control {
@@ -1355,7 +1387,7 @@ int migrate_device_coherent_folio(struct folio *folio);
struct vm_struct *__get_vm_area_node(unsigned long size,
unsigned long align, unsigned long shift,
- unsigned long flags, unsigned long start,
+ unsigned long vm_flags, unsigned long start,
unsigned long end, int node, gfp_t gfp_mask,
const void *caller);
@@ -1600,6 +1632,9 @@ static inline void accept_page(struct page *page)
int walk_page_range_mm(struct mm_struct *mm, unsigned long start,
unsigned long end, const struct mm_walk_ops *ops,
void *private);
+int walk_page_range_debug(struct mm_struct *mm, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ pgd_t *pgd, void *private);
/* pt_reclaim.c */
bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval);
@@ -1619,5 +1654,7 @@ static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
}
#endif /* CONFIG_PT_RECLAIM */
+void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm);
+int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm);
#endif /* __MM_INTERNAL_H */
diff --git a/mm/io-mapping.c b/mm/io-mapping.c
deleted file mode 100644
index 01b362799930..000000000000
--- a/mm/io-mapping.c
+++ /dev/null
@@ -1,29 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-
-#include <linux/mm.h>
-#include <linux/io-mapping.h>
-
-/**
- * io_mapping_map_user - remap an I/O mapping to userspace
- * @iomap: the source io_mapping
- * @vma: user vma to map to
- * @addr: target user address to start at
- * @pfn: physical address of kernel memory
- * @size: size of map area
- *
- * Note: this is only safe if the mm semaphore is held when called.
- */
-int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma,
- unsigned long addr, unsigned long pfn, unsigned long size)
-{
- vm_flags_t expected_flags = VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
-
- if (WARN_ON_ONCE((vma->vm_flags & expected_flags) != expected_flags))
- return -EINVAL;
-
- /* We rely on prevalidation of the io-mapping to skip track_pfn(). */
- return remap_pfn_range_notrack(vma, addr, pfn, size,
- __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) |
- (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK)));
-}
-EXPORT_SYMBOL_GPL(io_mapping_map_user);
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 1a958e7c8a46..dd93ae8a6beb 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -35,7 +35,7 @@ CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
-CFLAGS_KASAN_TEST := $(CFLAGS_KASAN) $(call cc-disable-warning, vla)
+CFLAGS_KASAN_TEST := $(CFLAGS_KASAN)
ifndef CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX
# If compiler instruments memintrinsics by prefixing them with __asan/__hwasan,
# we need to treat them normally (as builtins), otherwise the compiler won't
@@ -44,6 +44,7 @@ ifndef CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX
CFLAGS_KASAN_TEST += -fno-builtin
endif
+CFLAGS_REMOVE_kasan_test_c.o += $(call cc-option, -Wvla-larger-than=1)
CFLAGS_kasan_test_c.o := $(CFLAGS_KASAN_TEST)
RUSTFLAGS_kasan_test_rust.o := $(RUSTFLAGS_KASAN)
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index ed4873e18c75..d4c14359feaf 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -32,6 +32,15 @@
#include "kasan.h"
#include "../slab.h"
+#if defined(CONFIG_ARCH_DEFER_KASAN) || defined(CONFIG_KASAN_HW_TAGS)
+/*
+ * Definition of the unified static key declared in kasan-enabled.h.
+ * This provides consistent runtime enable/disable across KASAN modes.
+ */
+DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled);
+EXPORT_SYMBOL_GPL(kasan_flag_enabled);
+#endif
+
struct slab *kasan_addr_to_slab(const void *addr)
{
if (virt_addr_valid(addr))
@@ -230,16 +239,12 @@ static bool check_slab_allocation(struct kmem_cache *cache, void *object,
}
static inline void poison_slab_object(struct kmem_cache *cache, void *object,
- bool init, bool still_accessible)
+ bool init)
{
void *tagged_object = object;
object = kasan_reset_tag(object);
- /* RCU slabs could be legally used after free within the RCU period. */
- if (unlikely(still_accessible))
- return;
-
kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE),
KASAN_SLAB_FREE, init);
@@ -250,18 +255,36 @@ static inline void poison_slab_object(struct kmem_cache *cache, void *object,
bool __kasan_slab_pre_free(struct kmem_cache *cache, void *object,
unsigned long ip)
{
- if (!kasan_arch_is_ready() || is_kfence_address(object))
+ if (is_kfence_address(object))
return false;
return check_slab_allocation(cache, object, ip);
}
bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init,
- bool still_accessible)
+ bool still_accessible, bool no_quarantine)
{
- if (!kasan_arch_is_ready() || is_kfence_address(object))
+ if (is_kfence_address(object))
return false;
- poison_slab_object(cache, object, init, still_accessible);
+ /*
+ * If this point is reached with an object that must still be
+ * accessible under RCU, we can't poison it; in that case, also skip the
+ * quarantine. This should mostly only happen when CONFIG_SLUB_RCU_DEBUG
+ * has been disabled manually.
+ *
+ * Putting the object on the quarantine wouldn't help catch UAFs (since
+ * we can't poison it here), and it would mask bugs caused by
+ * SLAB_TYPESAFE_BY_RCU users not being careful enough about object
+ * reuse; so overall, putting the object into the quarantine here would
+ * be counterproductive.
+ */
+ if (still_accessible)
+ return false;
+
+ poison_slab_object(cache, object, init);
+
+ if (no_quarantine)
+ return false;
/*
* If the object is put into quarantine, do not let slab put the object
@@ -282,7 +305,7 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init,
static inline bool check_page_allocation(void *ptr, unsigned long ip)
{
- if (!kasan_arch_is_ready())
+ if (!kasan_enabled())
return false;
if (ptr != page_address(virt_to_head_page(ptr))) {
@@ -511,7 +534,7 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip)
return true;
}
- if (is_kfence_address(ptr) || !kasan_arch_is_ready())
+ if (is_kfence_address(ptr))
return true;
slab = folio_slab(folio);
@@ -519,7 +542,7 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip)
if (check_slab_allocation(slab->slab_cache, ptr, ip))
return false;
- poison_slab_object(slab->slab_cache, ptr, false, false);
+ poison_slab_object(slab->slab_cache, ptr, false);
return true;
}
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index d54e89f8c3e7..b413c46b3e04 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -37,6 +37,17 @@
#include "../slab.h"
/*
+ * Initialize Generic KASAN and enable runtime checks.
+ * This should be called from arch kasan_init() once shadow memory is ready.
+ */
+void __init kasan_init_generic(void)
+{
+ kasan_enable();
+
+ pr_info("KernelAddressSanitizer initialized (generic)\n");
+}
+
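
[Illustrative sketch, not part of the diff] The comment above says kasan_init_generic() is to be called from the architecture's kasan_init() once shadow memory is ready; a hedged sketch of that arch-side call, with the shadow setup elided:

void __init kasan_init(void)
{
	/* ... map and populate the KASAN shadow region for this arch ... */

	/* Flip the unified kasan_flag_enabled static key and print the banner. */
	kasan_init_generic();
}
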
+/*
 * All functions below are always inlined so the compiler can
 * perform better optimizations in each of __asan_loadX/__asan_storeX
* depending on memory access size X.
@@ -165,7 +176,7 @@ static __always_inline bool check_region_inline(const void *addr,
size_t size, bool write,
unsigned long ret_ip)
{
- if (!kasan_arch_is_ready())
+ if (!kasan_enabled())
return true;
if (unlikely(size == 0))
@@ -193,7 +204,7 @@ bool kasan_byte_accessible(const void *addr)
{
s8 shadow_byte;
- if (!kasan_arch_is_ready())
+ if (!kasan_enabled())
return true;
shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr));
@@ -495,7 +506,7 @@ static void release_alloc_meta(struct kasan_alloc_meta *meta)
static void release_free_meta(const void *object, struct kasan_free_meta *meta)
{
- if (!kasan_arch_is_ready())
+ if (!kasan_enabled())
return;
/* Check if free meta is valid. */
@@ -562,7 +573,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
kasan_save_track(&alloc_meta->alloc_track, flags);
}
-void kasan_save_free_info(struct kmem_cache *cache, void *object)
+void __kasan_save_free_info(struct kmem_cache *cache, void *object)
{
struct kasan_free_meta *free_meta;
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index 9a6927394b54..1c373cc4b3fa 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -46,13 +46,6 @@ static enum kasan_arg_mode kasan_arg_mode __ro_after_init;
static enum kasan_arg_vmalloc kasan_arg_vmalloc __initdata;
/*
- * Whether KASAN is enabled at all.
- * The value remains false until KASAN is initialized by kasan_init_hw_tags().
- */
-DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled);
-EXPORT_SYMBOL(kasan_flag_enabled);
-
-/*
* Whether the selected mode is synchronous, asynchronous, or asymmetric.
* Defaults to KASAN_MODE_SYNC.
*/
@@ -67,6 +60,9 @@ DEFINE_STATIC_KEY_FALSE(kasan_flag_vmalloc);
#endif
EXPORT_SYMBOL_GPL(kasan_flag_vmalloc);
+/* Whether to check write accesses only. */
+static bool kasan_flag_write_only = false;
+
#define PAGE_ALLOC_SAMPLE_DEFAULT 1
#define PAGE_ALLOC_SAMPLE_ORDER_DEFAULT 3
@@ -141,6 +137,23 @@ static int __init early_kasan_flag_vmalloc(char *arg)
}
early_param("kasan.vmalloc", early_kasan_flag_vmalloc);
+/* kasan.write_only=off/on */
+static int __init early_kasan_flag_write_only(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (!strcmp(arg, "off"))
+ kasan_flag_write_only = false;
+ else if (!strcmp(arg, "on"))
+ kasan_flag_write_only = true;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+early_param("kasan.write_only", early_kasan_flag_write_only);
+
static inline const char *kasan_mode_info(void)
{
if (kasan_mode == KASAN_MODE_ASYNC)
@@ -260,12 +273,13 @@ void __init kasan_init_hw_tags(void)
kasan_init_tags();
/* KASAN is now initialized, enable it. */
- static_branch_enable(&kasan_flag_enabled);
+ kasan_enable();
- pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, vmalloc=%s, stacktrace=%s)\n",
+ pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, vmalloc=%s, stacktrace=%s, write_only=%s)\n",
kasan_mode_info(),
str_on_off(kasan_vmalloc_enabled()),
- str_on_off(kasan_stack_collection_enabled()));
+ str_on_off(kasan_stack_collection_enabled()),
+ str_on_off(kasan_flag_write_only));
}
#ifdef CONFIG_KASAN_VMALLOC
@@ -392,6 +406,20 @@ void kasan_enable_hw_tags(void)
hw_enable_tag_checks_asymm();
else
hw_enable_tag_checks_sync();
+
+ /*
+ * CPUs can only be in one of two states:
+ * - All CPUs support the write_only feature
+ * - No CPUs support the write_only feature
+ *
+ * If the first CPU attempts hw_enable_tag_checks_write_only() and
+ * finds the feature unsupported, kasan_flag_write_only is set to OFF
+ * to avoid further unnecessary calls on other CPUs.
+ */
+ if (kasan_flag_write_only && hw_enable_tag_checks_write_only()) {
+ kasan_flag_write_only = false;
+ pr_err_once("write-only mode is not supported and thus not enabled\n");
+ }
}
#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
@@ -404,4 +432,10 @@ VISIBLE_IF_KUNIT void kasan_force_async_fault(void)
}
EXPORT_SYMBOL_IF_KUNIT(kasan_force_async_fault);
+VISIBLE_IF_KUNIT bool kasan_write_only_enabled(void)
+{
+ return kasan_flag_write_only;
+}
+EXPORT_SYMBOL_IF_KUNIT(kasan_write_only_enabled);
+
#endif
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index ced6b29fcf76..f084e7a5df1e 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -13,9 +13,9 @@
#include <linux/mm.h>
#include <linux/pfn.h>
#include <linux/slab.h>
+#include <linux/pgalloc.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
#include "kasan.h"
@@ -191,7 +191,7 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
pud_t *pud;
pmd_t *pmd;
- p4d_populate(&init_mm, p4d,
+ p4d_populate_kernel(addr, p4d,
lm_alias(kasan_early_shadow_pud));
pud = pud_offset(p4d, addr);
pud_populate(&init_mm, pud,
@@ -212,7 +212,7 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
} else {
p = early_alloc(PAGE_SIZE, NUMA_NO_NODE);
pud_init(p);
- p4d_populate(&init_mm, p4d, p);
+ p4d_populate_kernel(addr, p4d, p);
}
}
zero_pud_populate(p4d, addr, next);
@@ -251,10 +251,10 @@ int __ref kasan_populate_early_shadow(const void *shadow_start,
* puds,pmds, so pgd_populate(), pud_populate()
* is noops.
*/
- pgd_populate(&init_mm, pgd,
+ pgd_populate_kernel(addr, pgd,
lm_alias(kasan_early_shadow_p4d));
p4d = p4d_offset(pgd, addr);
- p4d_populate(&init_mm, p4d,
+ p4d_populate_kernel(addr, p4d,
lm_alias(kasan_early_shadow_pud));
pud = pud_offset(p4d, addr);
pud_populate(&init_mm, pud,
@@ -266,14 +266,12 @@ int __ref kasan_populate_early_shadow(const void *shadow_start,
}
if (pgd_none(*pgd)) {
- p4d_t *p;
if (slab_is_available()) {
- p = p4d_alloc(&init_mm, pgd, addr);
- if (!p)
+ if (!p4d_alloc(&init_mm, pgd, addr))
return -ENOMEM;
} else {
- pgd_populate(&init_mm, pgd,
+ pgd_populate_kernel(addr, pgd,
early_alloc(PAGE_SIZE, NUMA_NO_NODE));
}
}
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 129178be5e64..07fa7375a848 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -398,7 +398,13 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags, depot_flags_t depot_flags);
void kasan_set_track(struct kasan_track *track, depot_stack_handle_t stack);
void kasan_save_track(struct kasan_track *track, gfp_t flags);
void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags);
-void kasan_save_free_info(struct kmem_cache *cache, void *object);
+
+void __kasan_save_free_info(struct kmem_cache *cache, void *object);
+static inline void kasan_save_free_info(struct kmem_cache *cache, void *object)
+{
+ if (kasan_enabled())
+ __kasan_save_free_info(cache, object);
+}
#ifdef CONFIG_KASAN_GENERIC
bool kasan_quarantine_put(struct kmem_cache *cache, void *object);
@@ -431,6 +437,7 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#define hw_suppress_tag_checks_start() arch_suppress_tag_checks_start()
#define hw_suppress_tag_checks_stop() arch_suppress_tag_checks_stop()
#define hw_force_async_tag_fault() arch_force_async_tag_fault()
+#define hw_enable_tag_checks_write_only() arch_enable_tag_checks_write_only()
#define hw_get_random_tag() arch_get_random_tag()
#define hw_get_mem_tag(addr) arch_get_mem_tag(addr)
#define hw_set_mem_tag_range(addr, size, tag, init) \
@@ -451,11 +458,17 @@ void __init kasan_init_tags(void);
#if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
void kasan_force_async_fault(void);
+bool kasan_write_only_enabled(void);
#else /* CONFIG_KASAN_HW_TAGS && CONFIG_KASAN_KUNIT_TEST */
static inline void kasan_force_async_fault(void) { }
+static inline bool kasan_write_only_enabled(void)
+{
+ return false;
+}
+
#endif /* CONFIG_KASAN_HW_TAGS && CONFIG_KASAN_KUNIT_TEST */
#ifdef CONFIG_KASAN_SW_TAGS
diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c
index 5f922dd38ffa..2cafca31b092 100644
--- a/mm/kasan/kasan_test_c.c
+++ b/mm/kasan/kasan_test_c.c
@@ -47,7 +47,7 @@ static struct {
* Some tests use these global variables to store return values from function
* calls that could otherwise be eliminated by the compiler as dead code.
*/
-static volatile void *kasan_ptr_result;
+static void *volatile kasan_ptr_result;
static volatile int kasan_int_result;
/* Probe for console output: obtains test_status lines of interest. */
@@ -94,11 +94,14 @@ static void kasan_test_exit(struct kunit *test)
}
/**
- * KUNIT_EXPECT_KASAN_FAIL - check that the executed expression produces a
- * KASAN report; causes a KUnit test failure otherwise.
+ * KUNIT_EXPECT_KASAN_RESULT - checks whether the executed expression
+ * produces a KASAN report; causes a KUnit test failure when the result
+ * is different from @fail.
*
* @test: Currently executing KUnit test.
- * @expression: Expression that must produce a KASAN report.
+ * @expr: Expression to be tested.
+ * @expr_str: Expression to be tested encoded as a string.
+ * @fail: Whether expression should produce a KASAN report.
*
* For hardware tag-based KASAN, when a synchronous tag fault happens, tag
* checking is auto-disabled. When this happens, this test handler reenables
@@ -110,25 +113,29 @@ static void kasan_test_exit(struct kunit *test)
* Use READ/WRITE_ONCE() for the accesses and compiler barriers around the
* expression to prevent that.
*
- * In between KUNIT_EXPECT_KASAN_FAIL checks, test_status.report_found is kept
+ * In between KUNIT_EXPECT_KASAN_RESULT checks, test_status.report_found is kept
* as false. This allows detecting KASAN reports that happen outside of the
* checks by asserting !test_status.report_found at the start of
- * KUNIT_EXPECT_KASAN_FAIL and in kasan_test_exit.
+ * KUNIT_EXPECT_KASAN_RESULT and in kasan_test_exit.
*/
-#define KUNIT_EXPECT_KASAN_FAIL(test, expression) do { \
+#define KUNIT_EXPECT_KASAN_RESULT(test, expr, expr_str, fail) \
+do { \
if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \
kasan_sync_fault_possible()) \
migrate_disable(); \
KUNIT_EXPECT_FALSE(test, READ_ONCE(test_status.report_found)); \
barrier(); \
- expression; \
+ expr; \
barrier(); \
if (kasan_async_fault_possible()) \
kasan_force_async_fault(); \
- if (!READ_ONCE(test_status.report_found)) { \
- KUNIT_FAIL(test, KUNIT_SUBTEST_INDENT "KASAN failure " \
- "expected in \"" #expression \
- "\", but none occurred"); \
+ if (READ_ONCE(test_status.report_found) != fail) { \
+ KUNIT_FAIL(test, KUNIT_SUBTEST_INDENT "KASAN failure" \
+ "%sexpected in \"" expr_str \
+ "\", but %soccurred", \
+ (fail ? " " : " not "), \
+ (test_status.report_found ? \
+ "" : "none ")); \
} \
if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \
kasan_sync_fault_possible()) { \
@@ -141,6 +148,34 @@ static void kasan_test_exit(struct kunit *test)
WRITE_ONCE(test_status.async_fault, false); \
} while (0)
+/*
+ * KUNIT_EXPECT_KASAN_FAIL - check that the executed expression produces a
+ * KASAN report; causes a KUnit test failure otherwise.
+ *
+ * @test: Currently executing KUnit test.
+ * @expr: Expression that must produce a KASAN report.
+ */
+#define KUNIT_EXPECT_KASAN_FAIL(test, expr) \
+ KUNIT_EXPECT_KASAN_RESULT(test, expr, #expr, true)
+
+/*
+ * KUNIT_EXPECT_KASAN_FAIL_READ - check that the executed expression
+ * produces a KASAN report when the write-only mode is not enabled;
+ * causes a KUnit test failure otherwise.
+ *
+ * Note: At the moment, this macro does not check whether the produced
+ * KASAN report is a report about a bad read access. It is only intended
+ * for checking the write-only KASAN mode functionality without failing
+ * KASAN tests.
+ *
+ * @test: Currently executing KUnit test.
+ * @expr: Expression that must only produce a KASAN report
+ * when the write-only mode is not enabled.
+ */
+#define KUNIT_EXPECT_KASAN_FAIL_READ(test, expr) \
+ KUNIT_EXPECT_KASAN_RESULT(test, expr, #expr, \
+ !kasan_write_only_enabled())
+
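
[Illustrative sketch, not part of the diff] A minimal hypothetical KUnit case (not part of the suite) contrasting the two macros: out-of-bounds writes must always be reported, while out-of-bounds reads are only reported when write-only mode is off:

static void example_oob_read_write(struct kunit *test)
{
	char *ptr = kmalloc(16, GFP_KERNEL);

	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
	OPTIMIZER_HIDE_VAR(ptr);

	/* A write past the object is reported in every KASAN mode. */
	KUNIT_EXPECT_KASAN_FAIL(test, ptr[16] = 'x');
	/* A read past the object is only reported when write-only mode is off. */
	KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_int_result = ptr[16]);

	kfree(ptr);
}
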
#define KASAN_TEST_NEEDS_CONFIG_ON(test, config) do { \
if (!IS_ENABLED(config)) \
kunit_skip((test), "Test requires " #config "=y"); \
@@ -183,8 +218,8 @@ static void kmalloc_oob_right(struct kunit *test)
KUNIT_EXPECT_KASAN_FAIL(test, ptr[size + 5] = 'y');
/* Out-of-bounds access past the aligned kmalloc object. */
- KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] =
- ptr[size + KASAN_GRANULE_SIZE + 5]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ptr[0] =
+ ptr[size + KASAN_GRANULE_SIZE + 5]);
kfree(ptr);
}
@@ -198,7 +233,7 @@ static void kmalloc_oob_left(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
OPTIMIZER_HIDE_VAR(ptr);
- KUNIT_EXPECT_KASAN_FAIL(test, *ptr = *(ptr - 1));
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, *ptr = *(ptr - 1));
kfree(ptr);
}
@@ -211,7 +246,7 @@ static void kmalloc_node_oob_right(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
OPTIMIZER_HIDE_VAR(ptr);
- KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = ptr[size]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ptr[0] = ptr[size]);
kfree(ptr);
}
@@ -291,7 +326,7 @@ static void kmalloc_large_uaf(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
kfree(ptr);
- KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[0]);
}
static void kmalloc_large_invalid_free(struct kunit *test)
@@ -323,7 +358,7 @@ static void page_alloc_oob_right(struct kunit *test)
ptr = page_address(pages);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
- KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = ptr[size]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ptr[0] = ptr[size]);
free_pages((unsigned long)ptr, order);
}
@@ -338,7 +373,7 @@ static void page_alloc_uaf(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
free_pages((unsigned long)ptr, order);
- KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[0]);
}
static void krealloc_more_oob_helper(struct kunit *test,
@@ -458,7 +493,7 @@ static void krealloc_uaf(struct kunit *test)
KUNIT_EXPECT_KASAN_FAIL(test, ptr2 = krealloc(ptr1, size2, GFP_KERNEL));
KUNIT_ASSERT_NULL(test, ptr2);
- KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)ptr1);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, *(volatile char *)ptr1);
}
static void kmalloc_oob_16(struct kunit *test)
@@ -501,7 +536,7 @@ static void kmalloc_uaf_16(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
kfree(ptr2);
- KUNIT_EXPECT_KASAN_FAIL(test, *ptr1 = *ptr2);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, *ptr1 = *ptr2);
kfree(ptr1);
}
@@ -640,8 +675,8 @@ static void kmalloc_memmove_invalid_size(struct kunit *test)
memset((char *)ptr, 0, 64);
OPTIMIZER_HIDE_VAR(ptr);
OPTIMIZER_HIDE_VAR(invalid_size);
- KUNIT_EXPECT_KASAN_FAIL(test,
- memmove((char *)ptr, (char *)ptr + 4, invalid_size));
+ KUNIT_EXPECT_KASAN_FAIL_READ(test,
+ memmove((char *)ptr, (char *)ptr + 4, invalid_size));
kfree(ptr);
}
@@ -654,7 +689,7 @@ static void kmalloc_uaf(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
kfree(ptr);
- KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[8]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[8]);
}
static void kmalloc_uaf_memset(struct kunit *test)
@@ -701,7 +736,7 @@ again:
goto again;
}
- KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[40]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr1)[40]);
KUNIT_EXPECT_PTR_NE(test, ptr1, ptr2);
kfree(ptr2);
@@ -727,19 +762,19 @@ static void kmalloc_uaf3(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
kfree(ptr2);
- KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[8]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr1)[8]);
}
static void kasan_atomics_helper(struct kunit *test, void *unsafe, void *safe)
{
int *i_unsafe = unsafe;
- KUNIT_EXPECT_KASAN_FAIL(test, READ_ONCE(*i_unsafe));
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, READ_ONCE(*i_unsafe));
KUNIT_EXPECT_KASAN_FAIL(test, WRITE_ONCE(*i_unsafe, 42));
- KUNIT_EXPECT_KASAN_FAIL(test, smp_load_acquire(i_unsafe));
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, smp_load_acquire(i_unsafe));
KUNIT_EXPECT_KASAN_FAIL(test, smp_store_release(i_unsafe, 42));
- KUNIT_EXPECT_KASAN_FAIL(test, atomic_read(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, atomic_read(unsafe));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_set(unsafe, 42));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_add(42, unsafe));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_sub(42, unsafe));
@@ -752,18 +787,31 @@ static void kasan_atomics_helper(struct kunit *test, void *unsafe, void *safe)
KUNIT_EXPECT_KASAN_FAIL(test, atomic_xchg(unsafe, 42));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_cmpxchg(unsafe, 21, 42));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_try_cmpxchg(unsafe, safe, 42));
- KUNIT_EXPECT_KASAN_FAIL(test, atomic_try_cmpxchg(safe, unsafe, 42));
+ /*
+ * The result of the test below may vary due to garbage values of
+ * unsafe in write-only mode.
+ * Therefore, skip this test when KASAN is configured in write-only mode.
+ */
+ if (!kasan_write_only_enabled())
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_try_cmpxchg(safe, unsafe, 42));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_sub_and_test(42, unsafe));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_and_test(unsafe));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_and_test(unsafe));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_add_negative(42, unsafe));
- KUNIT_EXPECT_KASAN_FAIL(test, atomic_add_unless(unsafe, 21, 42));
- KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_not_zero(unsafe));
- KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_unless_negative(unsafe));
- KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_unless_positive(unsafe));
- KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_if_positive(unsafe));
+ /*
+ * The result of the test below may vary due to garbage values of
+ * unsafe in write-only mode.
+ * Therefore, skip this test when KASAN is configured in write-only mode.
+ */
+ if (!kasan_write_only_enabled()) {
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_add_unless(unsafe, 21, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_not_zero(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_unless_negative(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_unless_positive(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_if_positive(unsafe));
+ }
- KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_read(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, atomic_long_read(unsafe));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_set(unsafe, 42));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add(42, unsafe));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_sub(42, unsafe));
@@ -776,16 +824,29 @@ static void kasan_atomics_helper(struct kunit *test, void *unsafe, void *safe)
KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_xchg(unsafe, 42));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_cmpxchg(unsafe, 21, 42));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_try_cmpxchg(unsafe, safe, 42));
- KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_try_cmpxchg(safe, unsafe, 42));
+ /*
+ * The result of the test below may vary due to garbage values of
+ * unsafe in write-only mode.
+ * Therefore, skip this test when KASAN is configured in write-only mode.
+ */
+ if (!kasan_write_only_enabled())
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_try_cmpxchg(safe, unsafe, 42));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_sub_and_test(42, unsafe));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_and_test(unsafe));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_and_test(unsafe));
KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add_negative(42, unsafe));
- KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add_unless(unsafe, 21, 42));
- KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_not_zero(unsafe));
- KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_unless_negative(unsafe));
- KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_unless_positive(unsafe));
- KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_if_positive(unsafe));
+ /*
+ * The result of the test below may vary due to garbage values of
+ * unsafe in write-only mode.
+ * Therefore, skip this test when KASAN is configured in write-only mode.
+ */
+ if (!kasan_write_only_enabled()) {
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add_unless(unsafe, 21, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_not_zero(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_unless_negative(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_unless_positive(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_if_positive(unsafe));
+ }
}
static void kasan_atomics(struct kunit *test)
@@ -842,8 +903,8 @@ static void ksize_unpoisons_memory(struct kunit *test)
/* These must trigger a KASAN report. */
if (IS_ENABLED(CONFIG_KASAN_GENERIC))
KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size]);
- KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size + 5]);
- KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[real_size - 1]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[size + 5]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[real_size - 1]);
kfree(ptr);
}
@@ -863,8 +924,8 @@ static void ksize_uaf(struct kunit *test)
OPTIMIZER_HIDE_VAR(ptr);
KUNIT_EXPECT_KASAN_FAIL(test, ksize(ptr));
- KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]);
- KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[0]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[size]);
}
/*
@@ -899,9 +960,9 @@ static void rcu_uaf(struct kunit *test)
global_rcu_ptr = rcu_dereference_protected(
(struct kasan_rcu_info __rcu *)ptr, NULL);
- KUNIT_EXPECT_KASAN_FAIL(test,
- call_rcu(&global_rcu_ptr->rcu, rcu_uaf_reclaim);
- rcu_barrier());
+ KUNIT_EXPECT_KASAN_FAIL_READ(test,
+ call_rcu(&global_rcu_ptr->rcu, rcu_uaf_reclaim);
+ rcu_barrier());
}
static void workqueue_uaf_work(struct work_struct *work)
@@ -924,8 +985,8 @@ static void workqueue_uaf(struct kunit *test)
queue_work(workqueue, work);
destroy_workqueue(workqueue);
- KUNIT_EXPECT_KASAN_FAIL(test,
- ((volatile struct work_struct *)work)->data);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test,
+ ((volatile struct work_struct *)work)->data);
}
static void kfree_via_page(struct kunit *test)
@@ -972,7 +1033,7 @@ static void kmem_cache_oob(struct kunit *test)
return;
}
- KUNIT_EXPECT_KASAN_FAIL(test, *p = p[size + OOB_TAG_OFF]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, *p = p[size + OOB_TAG_OFF]);
kmem_cache_free(cache, p);
kmem_cache_destroy(cache);
@@ -1068,8 +1129,47 @@ static void kmem_cache_rcu_uaf(struct kunit *test)
*/
rcu_barrier();
- KUNIT_EXPECT_KASAN_FAIL(test, READ_ONCE(*p));
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, READ_ONCE(*p));
+
+ kmem_cache_destroy(cache);
+}
+
+/*
+ * Check that SLAB_TYPESAFE_BY_RCU objects are immediately reused when
+ * CONFIG_SLUB_RCU_DEBUG is off, and stay at the same address.
+ * Without this, KASAN builds would be unable to trigger bugs caused by
+ * SLAB_TYPESAFE_BY_RCU users handling recycled objects improperly.
+ */
+static void kmem_cache_rcu_reuse(struct kunit *test)
+{
+ char *p, *p2;
+ struct kmem_cache *cache;
+
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_SLUB_RCU_DEBUG);
+
+ cache = kmem_cache_create("test_cache", 16, 0, SLAB_TYPESAFE_BY_RCU,
+ NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
+
+ migrate_disable();
+ p = kmem_cache_alloc(cache, GFP_KERNEL);
+ if (!p) {
+ kunit_err(test, "Allocation failed: %s\n", __func__);
+ goto out;
+ }
+
+ kmem_cache_free(cache, p);
+ p2 = kmem_cache_alloc(cache, GFP_KERNEL);
+ if (!p2) {
+ kunit_err(test, "Allocation failed: %s\n", __func__);
+ goto out;
+ }
+ KUNIT_EXPECT_PTR_EQ(test, p, p2);
+
+ kmem_cache_free(cache, p2);
+out:
+ migrate_enable();
kmem_cache_destroy(cache);
}
@@ -1207,7 +1307,7 @@ static void mempool_oob_right_helper(struct kunit *test, mempool_t *pool, size_t
KUNIT_EXPECT_KASAN_FAIL(test,
((volatile char *)&elem[size])[0]);
else
- KUNIT_EXPECT_KASAN_FAIL(test,
+ KUNIT_EXPECT_KASAN_FAIL_READ(test,
((volatile char *)&elem[round_up(size, KASAN_GRANULE_SIZE)])[0]);
mempool_free(elem, pool);
@@ -1273,7 +1373,7 @@ static void mempool_uaf_helper(struct kunit *test, mempool_t *pool, bool page)
mempool_free(elem, pool);
ptr = page ? page_address((struct page *)elem) : elem;
- KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[0]);
}
static void mempool_kmalloc_uaf(struct kunit *test)
@@ -1532,7 +1632,7 @@ static void kasan_memchr(struct kunit *test)
OPTIMIZER_HIDE_VAR(ptr);
OPTIMIZER_HIDE_VAR(size);
- KUNIT_EXPECT_KASAN_FAIL(test,
+ KUNIT_EXPECT_KASAN_FAIL_READ(test,
kasan_ptr_result = memchr(ptr, '1', size + 1));
kfree(ptr);
@@ -1559,7 +1659,7 @@ static void kasan_memcmp(struct kunit *test)
OPTIMIZER_HIDE_VAR(ptr);
OPTIMIZER_HIDE_VAR(size);
- KUNIT_EXPECT_KASAN_FAIL(test,
+ KUNIT_EXPECT_KASAN_FAIL_READ(test,
kasan_int_result = memcmp(ptr, arr, size+1));
kfree(ptr);
}
@@ -1578,9 +1678,11 @@ static void kasan_strings(struct kunit *test)
ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ OPTIMIZER_HIDE_VAR(ptr);
src = kmalloc(KASAN_GRANULE_SIZE, GFP_KERNEL | __GFP_ZERO);
strscpy(src, "f0cacc1a0000000", KASAN_GRANULE_SIZE);
+ OPTIMIZER_HIDE_VAR(src);
/*
* Make sure that strscpy() does not trigger KASAN if it overreads into
@@ -1594,7 +1696,7 @@ static void kasan_strings(struct kunit *test)
strscpy(ptr, src + 1, KASAN_GRANULE_SIZE));
/* strscpy should fail if the first byte is unreadable. */
- KUNIT_EXPECT_KASAN_FAIL(test, strscpy(ptr, src + KASAN_GRANULE_SIZE,
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, strscpy(ptr, src + KASAN_GRANULE_SIZE,
KASAN_GRANULE_SIZE));
kfree(src);
@@ -1607,17 +1709,17 @@ static void kasan_strings(struct kunit *test)
* will likely point to zeroed byte.
*/
ptr += 16;
- KUNIT_EXPECT_KASAN_FAIL(test, kasan_ptr_result = strchr(ptr, '1'));
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_ptr_result = strchr(ptr, '1'));
- KUNIT_EXPECT_KASAN_FAIL(test, kasan_ptr_result = strrchr(ptr, '1'));
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_ptr_result = strrchr(ptr, '1'));
- KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strcmp(ptr, "2"));
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_int_result = strcmp(ptr, "2"));
- KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strncmp(ptr, "2", 1));
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_int_result = strncmp(ptr, "2", 1));
- KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strlen(ptr));
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_int_result = strlen(ptr));
- KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strnlen(ptr, 1));
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_int_result = strnlen(ptr, 1));
}
static void kasan_bitops_modify(struct kunit *test, int nr, void *addr)
@@ -1636,12 +1738,18 @@ static void kasan_bitops_test_and_modify(struct kunit *test, int nr, void *addr)
{
KUNIT_EXPECT_KASAN_FAIL(test, test_and_set_bit(nr, addr));
KUNIT_EXPECT_KASAN_FAIL(test, __test_and_set_bit(nr, addr));
- KUNIT_EXPECT_KASAN_FAIL(test, test_and_set_bit_lock(nr, addr));
+ /*
+ * In write-only mode, test_and_set_bit_lock() may skip the write (and thus
+ * the fault) if the out-of-bounds bit happens to already be set.
+ * Therefore, skip the test_and_set_bit_lock test in write-only mode.
+ */
+ if (!kasan_write_only_enabled())
+ KUNIT_EXPECT_KASAN_FAIL(test, test_and_set_bit_lock(nr, addr));
KUNIT_EXPECT_KASAN_FAIL(test, test_and_clear_bit(nr, addr));
KUNIT_EXPECT_KASAN_FAIL(test, __test_and_clear_bit(nr, addr));
KUNIT_EXPECT_KASAN_FAIL(test, test_and_change_bit(nr, addr));
KUNIT_EXPECT_KASAN_FAIL(test, __test_and_change_bit(nr, addr));
- KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = test_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_int_result = test_bit(nr, addr));
if (nr < 7)
KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result =
xor_unlock_is_negative_byte(1 << nr, addr));
@@ -1765,7 +1873,7 @@ static void vmalloc_oob(struct kunit *test)
KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size]);
/* An aligned access into the first out-of-bounds granule. */
- KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size + 5]);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)v_ptr)[size + 5]);
/* Check that in-bounds accesses to the physical page are valid. */
page = vmalloc_to_page(v_ptr);
@@ -1977,6 +2085,11 @@ static void rust_uaf(struct kunit *test)
KUNIT_EXPECT_KASAN_FAIL(test, kasan_test_rust_uaf());
}
+/*
+ * copy_to_kernel_nofault() is an internal helper that is only usable when
+ * the KASAN test is built into the kernel, so the test below must be
+ * compiled out of module builds.
+ */
+#ifndef MODULE
static void copy_to_kernel_nofault_oob(struct kunit *test)
{
char *ptr;
@@ -2011,6 +2124,7 @@ static void copy_to_kernel_nofault_oob(struct kunit *test)
kfree(ptr);
}
+#endif /* !MODULE */
static void copy_user_test_oob(struct kunit *test)
{
@@ -2036,15 +2150,15 @@ static void copy_user_test_oob(struct kunit *test)
KUNIT_EXPECT_KASAN_FAIL(test,
unused = copy_from_user(kmem, usermem, size + 1));
- KUNIT_EXPECT_KASAN_FAIL(test,
+ KUNIT_EXPECT_KASAN_FAIL_READ(test,
unused = copy_to_user(usermem, kmem, size + 1));
KUNIT_EXPECT_KASAN_FAIL(test,
unused = __copy_from_user(kmem, usermem, size + 1));
- KUNIT_EXPECT_KASAN_FAIL(test,
+ KUNIT_EXPECT_KASAN_FAIL_READ(test,
unused = __copy_to_user(usermem, kmem, size + 1));
KUNIT_EXPECT_KASAN_FAIL(test,
unused = __copy_from_user_inatomic(kmem, usermem, size + 1));
- KUNIT_EXPECT_KASAN_FAIL(test,
+ KUNIT_EXPECT_KASAN_FAIL_READ(test,
unused = __copy_to_user_inatomic(usermem, kmem, size + 1));
/*
@@ -2098,6 +2212,7 @@ static struct kunit_case kasan_kunit_test_cases[] = {
KUNIT_CASE(kmem_cache_double_free),
KUNIT_CASE(kmem_cache_invalid_free),
KUNIT_CASE(kmem_cache_rcu_uaf),
+ KUNIT_CASE(kmem_cache_rcu_reuse),
KUNIT_CASE(kmem_cache_double_destroy),
KUNIT_CASE(kmem_cache_accounted),
KUNIT_CASE(kmem_cache_bulk),
@@ -2131,7 +2246,9 @@ static struct kunit_case kasan_kunit_test_cases[] = {
KUNIT_CASE(match_all_not_assigned),
KUNIT_CASE(match_all_ptr_tag),
KUNIT_CASE(match_all_mem_tag),
+#ifndef MODULE
KUNIT_CASE(copy_to_kernel_nofault_oob),
+#endif
KUNIT_CASE(rust_uaf),
KUNIT_CASE(copy_user_test_oob),
{}
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 8357e1a33699..62c01b4527eb 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -370,36 +370,6 @@ static inline bool init_task_stack_addr(const void *addr)
sizeof(init_thread_union.stack));
}
-/*
- * This function is invoked with report_lock (a raw_spinlock) held. A
- * PREEMPT_RT kernel cannot call find_vm_area() as it will acquire a sleeping
- * rt_spinlock.
- *
- * For !RT kernel, the PROVE_RAW_LOCK_NESTING config option will print a
- * lockdep warning for this raw_spinlock -> spinlock dependency. This config
- * option is enabled by default to ensure better test coverage to expose this
- * kind of RT kernel problem. This lockdep splat, however, can be suppressed
- * by using DEFINE_WAIT_OVERRIDE_MAP() if it serves a useful purpose and the
- * invalid PREEMPT_RT case has been taken care of.
- */
-static inline struct vm_struct *kasan_find_vm_area(void *addr)
-{
- static DEFINE_WAIT_OVERRIDE_MAP(vmalloc_map, LD_WAIT_SLEEP);
- struct vm_struct *va;
-
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- return NULL;
-
- /*
- * Suppress lockdep warning and fetch vmalloc area of the
- * offending address.
- */
- lock_map_acquire_try(&vmalloc_map);
- va = find_vm_area(addr);
- lock_map_release(&vmalloc_map);
- return va;
-}
-
static void print_address_description(void *addr, u8 tag,
struct kasan_report_info *info)
{
@@ -429,19 +399,10 @@ static void print_address_description(void *addr, u8 tag,
}
if (is_vmalloc_addr(addr)) {
- struct vm_struct *va = kasan_find_vm_area(addr);
-
- if (va) {
- pr_err("The buggy address belongs to the virtual mapping at\n"
- " [%px, %px) created by:\n"
- " %pS\n",
- va->addr, va->addr + va->size, va->caller);
- pr_err("\n");
-
- page = vmalloc_to_page(addr);
- } else {
- pr_err("The buggy address %px belongs to a vmalloc virtual mapping\n", addr);
- }
+ pr_err("The buggy address belongs to a");
+ if (!vmalloc_dump_obj(addr))
+ pr_cont(" vmalloc virtual mapping\n");
+ page = vmalloc_to_page(addr);
}
if (page) {
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 88d1c9dcb507..5d2a876035d6 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -125,7 +125,7 @@ void kasan_poison(const void *addr, size_t size, u8 value, bool init)
{
void *shadow_start, *shadow_end;
- if (!kasan_arch_is_ready())
+ if (!kasan_enabled())
return;
/*
@@ -150,7 +150,7 @@ EXPORT_SYMBOL_GPL(kasan_poison);
#ifdef CONFIG_KASAN_GENERIC
void kasan_poison_last_granule(const void *addr, size_t size)
{
- if (!kasan_arch_is_ready())
+ if (!kasan_enabled())
return;
if (size & KASAN_GRANULE_MASK) {
@@ -292,39 +292,123 @@ void __init __weak kasan_populate_early_vm_area_shadow(void *start,
{
}
+struct vmalloc_populate_data {
+ unsigned long start;
+ struct page **pages;
+};
+
static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
- void *unused)
+ void *_data)
{
- unsigned long page;
+ struct vmalloc_populate_data *data = _data;
+ struct page *page;
pte_t pte;
+ int index;
- if (likely(!pte_none(ptep_get(ptep))))
- return 0;
-
- page = __get_free_page(GFP_KERNEL);
- if (!page)
- return -ENOMEM;
+ arch_leave_lazy_mmu_mode();
- __memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE);
- pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL);
+ index = PFN_DOWN(addr - data->start);
+ page = data->pages[index];
+ __memset(page_to_virt(page), KASAN_VMALLOC_INVALID, PAGE_SIZE);
+ pte = pfn_pte(page_to_pfn(page), PAGE_KERNEL);
spin_lock(&init_mm.page_table_lock);
if (likely(pte_none(ptep_get(ptep)))) {
set_pte_at(&init_mm, addr, ptep, pte);
- page = 0;
+ data->pages[index] = NULL;
}
spin_unlock(&init_mm.page_table_lock);
- if (page)
- free_page(page);
+
+ arch_enter_lazy_mmu_mode();
+
return 0;
}
-int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
+static void ___free_pages_bulk(struct page **pages, int nr_pages)
+{
+ int i;
+
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i]) {
+ __free_pages(pages[i], 0);
+ pages[i] = NULL;
+ }
+ }
+}
+
+static int ___alloc_pages_bulk(struct page **pages, int nr_pages, gfp_t gfp_mask)
+{
+ unsigned long nr_populated, nr_total = nr_pages;
+ struct page **page_array = pages;
+
+ while (nr_pages) {
+ nr_populated = alloc_pages_bulk(gfp_mask, nr_pages, pages);
+ if (!nr_populated) {
+ ___free_pages_bulk(page_array, nr_total - nr_pages);
+ return -ENOMEM;
+ }
+ pages += nr_populated;
+ nr_pages -= nr_populated;
+ }
+
+ return 0;
+}
+
+static int __kasan_populate_vmalloc(unsigned long start, unsigned long end, gfp_t gfp_mask)
+{
+ unsigned long nr_pages, nr_total = PFN_UP(end - start);
+ struct vmalloc_populate_data data;
+ unsigned int flags;
+ int ret = 0;
+
+ data.pages = (struct page **)__get_free_page(gfp_mask | __GFP_ZERO);
+ if (!data.pages)
+ return -ENOMEM;
+
+ while (nr_total) {
+ nr_pages = min(nr_total, PAGE_SIZE / sizeof(data.pages[0]));
+ ret = ___alloc_pages_bulk(data.pages, nr_pages, gfp_mask);
+ if (ret)
+ break;
+
+ data.start = start;
+
+ /*
+ * Page table allocations ignore the external gfp mask, so enforce
+ * it via the scope API.
+ */
+ if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
+ flags = memalloc_nofs_save();
+ else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
+ flags = memalloc_noio_save();
+
+ ret = apply_to_page_range(&init_mm, start, nr_pages * PAGE_SIZE,
+ kasan_populate_vmalloc_pte, &data);
+
+ if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
+ memalloc_nofs_restore(flags);
+ else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
+ memalloc_noio_restore(flags);
+
+ ___free_pages_bulk(data.pages, nr_pages);
+ if (ret)
+ break;
+
+ start += nr_pages * PAGE_SIZE;
+ nr_total -= nr_pages;
+ }
+
+ free_page((unsigned long)data.pages);
+
+ return ret;
+}
+
+int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask)
{
unsigned long shadow_start, shadow_end;
int ret;
- if (!kasan_arch_is_ready())
+ if (!kasan_enabled())
return 0;
if (!is_vmalloc_or_module_addr((void *)addr))
@@ -348,9 +432,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
shadow_start = PAGE_ALIGN_DOWN(shadow_start);
shadow_end = PAGE_ALIGN(shadow_end);
- ret = apply_to_page_range(&init_mm, shadow_start,
- shadow_end - shadow_start,
- kasan_populate_vmalloc_pte, NULL);
+ ret = __kasan_populate_vmalloc(shadow_start, shadow_end, gfp_mask);
if (ret)
return ret;
@@ -397,18 +479,23 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
void *unused)
{
- unsigned long page;
+ pte_t pte;
+ int none;
- page = (unsigned long)__va(pte_pfn(ptep_get(ptep)) << PAGE_SHIFT);
+ arch_leave_lazy_mmu_mode();
spin_lock(&init_mm.page_table_lock);
-
- if (likely(!pte_none(ptep_get(ptep)))) {
+ pte = ptep_get(ptep);
+ none = pte_none(pte);
+ if (likely(!none))
pte_clear(&init_mm, addr, ptep);
- free_page(page);
- }
spin_unlock(&init_mm.page_table_lock);
+ if (likely(!none))
+ __free_page(pfn_to_page(pte_pfn(pte)));
+
+ arch_enter_lazy_mmu_mode();
+
return 0;
}
@@ -496,7 +583,7 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
unsigned long region_start, region_end;
unsigned long size;
- if (!kasan_arch_is_ready())
+ if (!kasan_enabled())
return;
region_start = ALIGN(start, KASAN_MEMORY_PER_SHADOW_PAGE);
@@ -547,7 +634,7 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
* with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored.
*/
- if (!kasan_arch_is_ready())
+ if (!kasan_enabled())
return (void *)start;
if (!is_vmalloc_or_module_addr(start))
@@ -572,7 +659,7 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
*/
void __kasan_poison_vmalloc(const void *start, unsigned long size)
{
- if (!kasan_arch_is_ready())
+ if (!kasan_enabled())
return;
if (!is_vmalloc_or_module_addr(start))
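For reference, the scope API used in __kasan_populate_vmalloc() above is the standard way to make nested allocations (including page-table allocations, which ignore the caller's gfp mask) honour a GFP_NOFS or GFP_NOIO context. A minimal stand-alone sketch of the pattern, assuming a GFP_NOFS caller:

        unsigned int flags;

        flags = memalloc_nofs_save();   /* nested allocations now behave as GFP_NOFS */
        /* ... work whose internal allocations must not recurse into the FS ... */
        memalloc_nofs_restore(flags);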
diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c
index b9382b5b6a37..c75741a74602 100644
--- a/mm/kasan/sw_tags.c
+++ b/mm/kasan/sw_tags.c
@@ -44,6 +44,7 @@ void __init kasan_init_sw_tags(void)
per_cpu(prng_state, cpu) = (u32)get_cycles();
kasan_init_tags();
+ kasan_enable();
pr_info("KernelAddressSanitizer initialized (sw-tags, stacktrace=%s)\n",
str_on_off(kasan_stack_collection_enabled()));
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c
index d65d48b85f90..b9f31293622b 100644
--- a/mm/kasan/tags.c
+++ b/mm/kasan/tags.c
@@ -142,7 +142,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
save_stack_info(cache, object, flags, false);
}
-void kasan_save_free_info(struct kmem_cache *cache, void *object)
+void __kasan_save_free_info(struct kmem_cache *cache, void *object)
{
save_stack_info(cache, object, 0, true);
}
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 102048821c22..727c20c94ac5 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -594,30 +594,30 @@ static void rcu_guarded_free(struct rcu_head *h)
*/
static unsigned long kfence_init_pool(void)
{
- unsigned long addr;
- struct page *pages;
+ unsigned long addr, start_pfn;
int i;
if (!arch_kfence_init_pool())
return (unsigned long)__kfence_pool;
addr = (unsigned long)__kfence_pool;
- pages = virt_to_page(__kfence_pool);
+ start_pfn = PHYS_PFN(virt_to_phys(__kfence_pool));
/*
- * Set up object pages: they must have PG_slab set, to avoid freeing
- * these as real pages.
+ * Set up object pages: they must have PGTY_slab set to avoid freeing
+ * them as real pages.
*
* We also want to avoid inserting kfence_free() in the kfree()
* fast-path in SLUB, and therefore need to ensure kfree() correctly
* enters __slab_free() slow-path.
*/
for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
- struct slab *slab = page_slab(nth_page(pages, i));
+ struct slab *slab;
if (!i || (i % 2))
continue;
+ slab = page_slab(pfn_to_page(start_pfn + i));
__folio_set_slab(slab_folio(slab));
#ifdef CONFIG_MEMCG
slab->obj_exts = (unsigned long)&kfence_metadata_init[i / 2 - 1].obj_exts |
@@ -665,10 +665,12 @@ static unsigned long kfence_init_pool(void)
reset_slab:
for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
- struct slab *slab = page_slab(nth_page(pages, i));
+ struct slab *slab;
if (!i || (i % 2))
continue;
+
+ slab = page_slab(pfn_to_page(start_pfn + i));
#ifdef CONFIG_MEMCG
slab->obj_exts = 0;
#endif
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index cc945c6ab3bd..abe54f0043c7 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -39,7 +39,6 @@ enum scan_result {
SCAN_PTE_NON_PRESENT,
SCAN_PTE_UFFD_WP,
SCAN_PTE_MAPPED_HUGEPAGE,
- SCAN_PAGE_RO,
SCAN_LACK_REFERENCED_PAGE,
SCAN_PAGE_NULL,
SCAN_SCAN_ABORT,
@@ -105,14 +104,6 @@ struct collapse_control {
};
/**
- * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned
- * @slot: hash lookup from mm to mm_slot
- */
-struct khugepaged_mm_slot {
- struct mm_slot slot;
-};
-
-/**
* struct khugepaged_scan - cursor for scanning
* @mm_head: the head of the mm list to scan
* @mm_slot: the current mm_slot we are scanning
@@ -122,7 +113,7 @@ struct khugepaged_mm_slot {
*/
struct khugepaged_scan {
struct list_head mm_head;
- struct khugepaged_mm_slot *mm_slot;
+ struct mm_slot *mm_slot;
unsigned long address;
};
@@ -347,7 +338,7 @@ struct attribute_group khugepaged_attr_group = {
#endif /* CONFIG_SYSFS */
int hugepage_madvise(struct vm_area_struct *vma,
- unsigned long *vm_flags, int advice)
+ vm_flags_t *vm_flags, int advice)
{
switch (advice) {
case MADV_HUGEPAGE:
@@ -385,7 +376,7 @@ int hugepage_madvise(struct vm_area_struct *vma,
int __init khugepaged_init(void)
{
- mm_slot_cache = KMEM_CACHE(khugepaged_mm_slot, 0);
+ mm_slot_cache = KMEM_CACHE(mm_slot, 0);
if (!mm_slot_cache)
return -ENOMEM;
@@ -410,7 +401,7 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm)
static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
{
return hpage_collapse_test_exit(mm) ||
- test_bit(MMF_DISABLE_THP, &mm->flags);
+ mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
}
static bool hugepage_pmd_enabled(void)
@@ -439,21 +430,18 @@ static bool hugepage_pmd_enabled(void)
void __khugepaged_enter(struct mm_struct *mm)
{
- struct khugepaged_mm_slot *mm_slot;
struct mm_slot *slot;
int wakeup;
/* __khugepaged_exit() must not run from under us */
VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
- if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags)))
+ if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm)))
return;
- mm_slot = mm_slot_alloc(mm_slot_cache);
- if (!mm_slot)
+ slot = mm_slot_alloc(mm_slot_cache);
+ if (!slot)
return;
- slot = &mm_slot->slot;
-
spin_lock(&khugepaged_mm_lock);
mm_slot_insert(mm_slots_hash, mm, slot);
/*
@@ -470,26 +458,23 @@ void __khugepaged_enter(struct mm_struct *mm)
}
void khugepaged_enter_vma(struct vm_area_struct *vma,
- unsigned long vm_flags)
+ vm_flags_t vm_flags)
{
- if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
+ if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
hugepage_pmd_enabled()) {
- if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS,
- PMD_ORDER))
+ if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
__khugepaged_enter(vma->vm_mm);
}
}
void __khugepaged_exit(struct mm_struct *mm)
{
- struct khugepaged_mm_slot *mm_slot;
struct mm_slot *slot;
int free = 0;
spin_lock(&khugepaged_mm_lock);
slot = mm_slot_lookup(mm_slots_hash, mm);
- mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
- if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
+ if (slot && khugepaged_scan.mm_slot != slot) {
hash_del(&slot->hash);
list_del(&slot->mm_node);
free = 1;
@@ -497,10 +482,10 @@ void __khugepaged_exit(struct mm_struct *mm)
spin_unlock(&khugepaged_mm_lock);
if (free) {
- clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
- mm_slot_free(mm_slot_cache, mm_slot);
+ mm_flags_clear(MMF_VM_HUGEPAGE, mm);
+ mm_slot_free(mm_slot_cache, slot);
mmdrop(mm);
- } else if (mm_slot) {
+ } else if (slot) {
/*
* This is required to serialize against
* hpage_collapse_test_exit() (which is guaranteed to run
@@ -548,33 +533,20 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
}
}
-static bool is_refcount_suitable(struct folio *folio)
-{
- int expected_refcount = folio_mapcount(folio);
-
- if (!folio_test_anon(folio) || folio_test_swapcache(folio))
- expected_refcount += folio_nr_pages(folio);
-
- if (folio_test_private(folio))
- expected_refcount++;
-
- return folio_ref_count(folio) == expected_refcount;
-}
-
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
- unsigned long address,
+ unsigned long start_addr,
pte_t *pte,
struct collapse_control *cc,
struct list_head *compound_pagelist)
{
struct page *page = NULL;
struct folio *folio = NULL;
+ unsigned long addr = start_addr;
pte_t *_pte;
int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
- bool writable = false;
for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
- _pte++, address += PAGE_SIZE) {
+ _pte++, addr += PAGE_SIZE) {
pte_t pteval = ptep_get(_pte);
if (pte_none(pteval) || (pte_present(pteval) &&
is_zero_pfn(pte_pfn(pteval)))) {
@@ -597,7 +569,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
result = SCAN_PTE_UFFD_WP;
goto out;
}
- page = vm_normal_page(vma, address, pteval);
+ page = vm_normal_page(vma, addr, pteval);
if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
result = SCAN_PAGE_NULL;
goto out;
@@ -652,7 +624,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
* but not from this process. The other process cannot write to
* the page, only trigger CoW.
*/
- if (!is_refcount_suitable(folio)) {
+ if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
folio_unlock(folio);
result = SCAN_PAGE_COUNT;
goto out;
@@ -682,28 +654,23 @@ next:
*/
if (cc->is_khugepaged &&
(pte_young(pteval) || folio_test_young(folio) ||
- folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
- address)))
+ folio_test_referenced(folio) ||
+ mmu_notifier_test_young(vma->vm_mm, addr)))
referenced++;
-
- if (pte_write(pteval))
- writable = true;
}
- if (unlikely(!writable)) {
- result = SCAN_PAGE_RO;
- } else if (unlikely(cc->is_khugepaged && !referenced)) {
+ if (unlikely(cc->is_khugepaged && !referenced)) {
result = SCAN_LACK_REFERENCED_PAGE;
} else {
result = SCAN_SUCCEED;
- trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
- referenced, writable, result);
+ trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
+ referenced, result);
return result;
}
out:
release_pte_pages(pte, _pte, compound_pagelist);
- trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
- referenced, writable, result);
+ trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
+ referenced, result);
return result;
}
@@ -713,12 +680,15 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
spinlock_t *ptl,
struct list_head *compound_pagelist)
{
+ unsigned long end = address + HPAGE_PMD_SIZE;
struct folio *src, *tmp;
- pte_t *_pte;
pte_t pteval;
+ pte_t *_pte;
+ unsigned int nr_ptes;
- for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
- _pte++, address += PAGE_SIZE) {
+ for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes,
+ address += nr_ptes * PAGE_SIZE) {
+ nr_ptes = 1;
pteval = ptep_get(_pte);
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
@@ -735,18 +705,26 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
struct page *src_page = pte_page(pteval);
src = page_folio(src_page);
- if (!folio_test_large(src))
+
+ if (folio_test_large(src)) {
+ unsigned int max_nr_ptes = (end - address) >> PAGE_SHIFT;
+
+ nr_ptes = folio_pte_batch(src, _pte, pteval, max_nr_ptes);
+ } else {
release_pte_folio(src);
+ }
+
/*
* ptl mostly unnecessary, but preempt has to
* be disabled to update the per-cpu stats
* inside folio_remove_rmap_pte().
*/
spin_lock(ptl);
- ptep_clear(vma->vm_mm, address, _pte);
- folio_remove_rmap_pte(src, src_page, vma);
+ clear_ptes(vma->vm_mm, address, _pte, nr_ptes);
+ folio_remove_rmap_ptes(src, src_page, nr_ptes, vma);
spin_unlock(ptl);
- free_page_and_swap_cache(src_page);
+ free_swap_cache(src);
+ folio_put_refs(src, nr_ptes);
}
}
@@ -923,7 +901,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
struct collapse_control *cc)
{
struct vm_area_struct *vma;
- unsigned long tva_flags = cc->is_khugepaged ? TVA_ENFORCE_SYSFS : 0;
+ enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED :
+ TVA_FORCED_COLLAPSE;
if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
return SCAN_ANY_PROCESS;
@@ -934,7 +913,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
return SCAN_ADDRESS_RANGE;
- if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER))
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER))
return SCAN_VMA_CHECK;
/*
* Anon VMA expected, the address may be unmapped then
@@ -954,12 +933,18 @@ static inline int check_pmd_state(pmd_t *pmd)
if (pmd_none(pmde))
return SCAN_PMD_NONE;
+
+ /*
+ * The folio may be under migration when khugepaged is trying to
+ * collapse it. Migration success or failure will eventually end
+ * up with a present PMD mapping a folio again.
+ */
+ if (is_pmd_migration_entry(pmde))
+ return SCAN_PMD_MAPPED;
if (!pmd_present(pmde))
return SCAN_PMD_NULL;
if (pmd_trans_huge(pmde))
return SCAN_PMD_MAPPED;
- if (pmd_devmap(pmde))
- return SCAN_PMD_NULL;
if (pmd_bad(pmde))
return SCAN_PMD_NULL;
return SCAN_SUCCEED;
@@ -999,21 +984,21 @@ static int check_pmd_still_valid(struct mm_struct *mm,
*/
static int __collapse_huge_page_swapin(struct mm_struct *mm,
struct vm_area_struct *vma,
- unsigned long haddr, pmd_t *pmd,
+ unsigned long start_addr, pmd_t *pmd,
int referenced)
{
int swapped_in = 0;
vm_fault_t ret = 0;
- unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
+ unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE);
int result;
pte_t *pte = NULL;
spinlock_t *ptl;
- for (address = haddr; address < end; address += PAGE_SIZE) {
+ for (addr = start_addr; addr < end; addr += PAGE_SIZE) {
struct vm_fault vmf = {
.vma = vma,
- .address = address,
- .pgoff = linear_page_index(vma, address),
+ .address = addr,
+ .pgoff = linear_page_index(vma, addr),
.flags = FAULT_FLAG_ALLOW_RETRY,
.pmd = pmd,
};
@@ -1023,7 +1008,7 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
* Here the ptl is only used to check pte_same() in
* do_swap_page(), so readonly version is enough.
*/
- pte = pte_offset_map_ro_nolock(mm, pmd, address, &ptl);
+ pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl);
if (!pte) {
mmap_read_unlock(mm);
result = SCAN_PMD_NULL;
@@ -1168,11 +1153,11 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
if (result != SCAN_SUCCEED)
goto out_up_write;
/* check if the pmd is still valid */
+ vma_start_write(vma);
result = check_pmd_still_valid(mm, address, pmd);
if (result != SCAN_SUCCEED)
goto out_up_write;
- vma_start_write(vma);
anon_vma_lock_write(vma->anon_vma);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
@@ -1239,7 +1224,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
__folio_mark_uptodate(folio);
pgtable = pmd_pgtable(_pmd);
- _pmd = mk_huge_pmd(&folio->page, vma->vm_page_prot);
+ _pmd = folio_mk_pmd(folio, vma->vm_page_prot);
_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
spin_lock(pmd_ptl);
@@ -1266,7 +1251,7 @@ out_nolock:
static int hpage_collapse_scan_pmd(struct mm_struct *mm,
struct vm_area_struct *vma,
- unsigned long address, bool *mmap_locked,
+ unsigned long start_addr, bool *mmap_locked,
struct collapse_control *cc)
{
pmd_t *pmd;
@@ -1275,27 +1260,26 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
int none_or_zero = 0, shared = 0;
struct page *page = NULL;
struct folio *folio = NULL;
- unsigned long _address;
+ unsigned long addr;
spinlock_t *ptl;
int node = NUMA_NO_NODE, unmapped = 0;
- bool writable = false;
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK);
- result = find_pmd_or_thp_or_none(mm, address, &pmd);
+ result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
if (result != SCAN_SUCCEED)
goto out;
memset(cc->node_load, 0, sizeof(cc->node_load));
nodes_clear(cc->alloc_nmask);
- pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
if (!pte) {
result = SCAN_PMD_NULL;
goto out;
}
- for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
- _pte++, _address += PAGE_SIZE) {
+ for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
+ _pte++, addr += PAGE_SIZE) {
pte_t pteval = ptep_get(_pte);
if (is_swap_pte(pteval)) {
++unmapped;
@@ -1342,10 +1326,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
result = SCAN_PTE_UFFD_WP;
goto out_unmap;
}
- if (pte_write(pteval))
- writable = true;
- page = vm_normal_page(vma, _address, pteval);
+ page = vm_normal_page(vma, addr, pteval);
if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
result = SCAN_PAGE_NULL;
goto out_unmap;
@@ -1402,7 +1384,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
* has excessive GUP pins (i.e. 512). Anyway the same check
* will be done again later the risk seems low.
*/
- if (!is_refcount_suitable(folio)) {
+ if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
result = SCAN_PAGE_COUNT;
goto out_unmap;
}
@@ -1413,13 +1395,11 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
*/
if (cc->is_khugepaged &&
(pte_young(pteval) || folio_test_young(folio) ||
- folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
- address)))
+ folio_test_referenced(folio) ||
+ mmu_notifier_test_young(vma->vm_mm, addr)))
referenced++;
}
- if (!writable) {
- result = SCAN_PAGE_RO;
- } else if (cc->is_khugepaged &&
+ if (cc->is_khugepaged &&
(!referenced ||
(unmapped && referenced < HPAGE_PMD_NR / 2))) {
result = SCAN_LACK_REFERENCED_PAGE;
@@ -1429,20 +1409,19 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
out_unmap:
pte_unmap_unlock(pte, ptl);
if (result == SCAN_SUCCEED) {
- result = collapse_huge_page(mm, address, referenced,
+ result = collapse_huge_page(mm, start_addr, referenced,
unmapped, cc);
/* collapse_huge_page will return with the mmap_lock released */
*mmap_locked = false;
}
out:
- trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced,
+ trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
none_or_zero, result, unmapped);
return result;
}
-static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
+static void collect_mm_slot(struct mm_slot *slot)
{
- struct mm_slot *slot = &mm_slot->slot;
struct mm_struct *mm = slot->mm;
lockdep_assert_held(&khugepaged_mm_lock);
@@ -1455,34 +1434,49 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
/*
* Not strictly needed because the mm exited already.
*
- * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+ * mm_flags_clear(MMF_VM_HUGEPAGE, mm);
*/
/* khugepaged_mm_lock actually not necessary for the below */
- mm_slot_free(mm_slot_cache, mm_slot);
+ mm_slot_free(mm_slot_cache, slot);
mmdrop(mm);
}
}
-#ifdef CONFIG_SHMEM
-/* hpage must be locked, and mmap_lock must be held */
+/* folio must be locked, and mmap_lock must be held */
static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmdp, struct page *hpage)
+ pmd_t *pmdp, struct folio *folio, struct page *page)
{
+ struct mm_struct *mm = vma->vm_mm;
struct vm_fault vmf = {
.vma = vma,
.address = addr,
.flags = 0,
- .pmd = pmdp,
};
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp;
- VM_BUG_ON(!PageTransHuge(hpage));
mmap_assert_locked(vma->vm_mm);
- if (do_set_pmd(&vmf, hpage))
+ if (!pmdp) {
+ pgdp = pgd_offset(mm, addr);
+ p4dp = p4d_alloc(mm, pgdp, addr);
+ if (!p4dp)
+ return SCAN_FAIL;
+ pudp = pud_alloc(mm, p4dp, addr);
+ if (!pudp)
+ return SCAN_FAIL;
+ pmdp = pmd_alloc(mm, pudp, addr);
+ if (!pmdp)
+ return SCAN_FAIL;
+ }
+
+ vmf.pmd = pmdp;
+ if (do_set_pmd(&vmf, folio, page))
return SCAN_FAIL;
- get_page(hpage);
+ folio_get(folio);
return SCAN_SUCCEED;
}
@@ -1501,15 +1495,17 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
bool install_pmd)
{
+ int nr_mapped_ptes = 0, result = SCAN_FAIL;
+ unsigned int nr_batch_ptes;
struct mmu_notifier_range range;
bool notified = false;
unsigned long haddr = addr & HPAGE_PMD_MASK;
+ unsigned long end = haddr + HPAGE_PMD_SIZE;
struct vm_area_struct *vma = vma_lookup(mm, haddr);
struct folio *folio;
pte_t *start_pte, *pte;
pmd_t *pmd, pgt_pmd;
spinlock_t *pml = NULL, *ptl;
- int nr_ptes = 0, result = SCAN_FAIL;
int i;
mmap_assert_locked(mm);
@@ -1529,9 +1525,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
* in the page cache with a single hugepage. If a mm were to fault-in
* this memory (mapped by a suitably aligned VMA), we'd get the hugepage
* and map it by a PMD, regardless of sysfs THP settings. As such, let's
- * analogously elide sysfs THP settings here.
+ * analogously elide sysfs THP settings here and force collapse.
*/
- if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
return SCAN_VMA_CHECK;
/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
@@ -1552,6 +1548,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
switch (result) {
case SCAN_SUCCEED:
break;
+ case SCAN_PMD_NULL:
case SCAN_PMD_NONE:
/*
* All pte entries have been removed and pmd cleared.
@@ -1623,11 +1620,15 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
goto abort;
/* step 2: clear page table and adjust rmap */
- for (i = 0, addr = haddr, pte = start_pte;
- i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+ for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR;
+ i += nr_batch_ptes, addr += nr_batch_ptes * PAGE_SIZE,
+ pte += nr_batch_ptes) {
+ unsigned int max_nr_batch_ptes = (end - addr) >> PAGE_SHIFT;
struct page *page;
pte_t ptent = ptep_get(pte);
+ nr_batch_ptes = 1;
+
if (pte_none(ptent))
continue;
/*
@@ -1641,26 +1642,29 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
goto abort;
}
page = vm_normal_page(vma, addr, ptent);
+
if (folio_page(folio, i) != page)
goto abort;
+ nr_batch_ptes = folio_pte_batch(folio, pte, ptent, max_nr_batch_ptes);
+
/*
* Must clear entry, or a racing truncate may re-remove it.
* TLB flush can be left until pmdp_collapse_flush() does it.
* PTE dirty? Shmem page is already dirty; file is read-only.
*/
- ptep_clear(mm, addr, pte);
- folio_remove_rmap_pte(folio, page, vma);
- nr_ptes++;
+ clear_ptes(mm, addr, pte, nr_batch_ptes);
+ folio_remove_rmap_ptes(folio, page, nr_batch_ptes, vma);
+ nr_mapped_ptes += nr_batch_ptes;
}
if (!pml)
spin_unlock(ptl);
/* step 3: set proper refcount and mm_counters. */
- if (nr_ptes) {
- folio_ref_sub(folio, nr_ptes);
- add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
+ if (nr_mapped_ptes) {
+ folio_ref_sub(folio, nr_mapped_ptes);
+ add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes);
}
/* step 4: remove empty page table */
@@ -1689,14 +1693,14 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
maybe_install_pmd:
/* step 5: install pmd entry */
result = install_pmd
- ? set_huge_pmd(vma, haddr, pmd, &folio->page)
+ ? set_huge_pmd(vma, haddr, pmd, folio, &folio->page)
: SCAN_SUCCEED;
goto drop_folio;
abort:
- if (nr_ptes) {
+ if (nr_mapped_ptes) {
flush_tlb_mm(mm);
- folio_ref_sub(folio, nr_ptes);
- add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
+ folio_ref_sub(folio, nr_mapped_ptes);
+ add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes);
}
unlock:
if (start_pte)
@@ -2295,6 +2299,17 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
continue;
}
+ if (!folio_try_get(folio)) {
+ xas_reset(&xas);
+ continue;
+ }
+
+ if (unlikely(folio != xas_reload(&xas))) {
+ folio_put(folio);
+ xas_reset(&xas);
+ continue;
+ }
+
if (folio_order(folio) == HPAGE_PMD_ORDER &&
folio->index == start) {
/* Maybe PMD-mapped */
@@ -2305,23 +2320,27 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
* it's safe to skip LRU and refcount checks before
* returning.
*/
+ folio_put(folio);
break;
}
node = folio_nid(folio);
if (hpage_collapse_scan_abort(node, cc)) {
result = SCAN_SCAN_ABORT;
+ folio_put(folio);
break;
}
cc->node_load[node]++;
if (!folio_test_lru(folio)) {
result = SCAN_PAGE_LRU;
+ folio_put(folio);
break;
}
- if (!is_refcount_suitable(folio)) {
+ if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) {
result = SCAN_PAGE_COUNT;
+ folio_put(folio);
break;
}
@@ -2333,6 +2352,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
*/
present += folio_nr_pages(folio);
+ folio_put(folio);
if (need_resched()) {
xas_pause(&xas);
@@ -2354,14 +2374,6 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result);
return result;
}
-#else
-static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
- struct file *file, pgoff_t start,
- struct collapse_control *cc)
-{
- BUILD_BUG();
-}
-#endif
static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
struct collapse_control *cc)
@@ -2369,7 +2381,6 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
__acquires(&khugepaged_mm_lock)
{
struct vma_iterator vmi;
- struct khugepaged_mm_slot *mm_slot;
struct mm_slot *slot;
struct mm_struct *mm;
struct vm_area_struct *vma;
@@ -2380,14 +2391,12 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
*result = SCAN_FAIL;
if (khugepaged_scan.mm_slot) {
- mm_slot = khugepaged_scan.mm_slot;
- slot = &mm_slot->slot;
+ slot = khugepaged_scan.mm_slot;
} else {
- slot = list_entry(khugepaged_scan.mm_head.next,
+ slot = list_first_entry(&khugepaged_scan.mm_head,
struct mm_slot, mm_node);
- mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
khugepaged_scan.address = 0;
- khugepaged_scan.mm_slot = mm_slot;
+ khugepaged_scan.mm_slot = slot;
}
spin_unlock(&khugepaged_mm_lock);
@@ -2413,8 +2422,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
progress++;
break;
}
- if (!thp_vma_allowable_order(vma, vma->vm_flags,
- TVA_ENFORCE_SYSFS, PMD_ORDER)) {
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
skip:
progress++;
continue;
@@ -2437,7 +2445,7 @@ skip:
VM_BUG_ON(khugepaged_scan.address < hstart ||
khugepaged_scan.address + HPAGE_PMD_SIZE >
hend);
- if (IS_ENABLED(CONFIG_SHMEM) && !vma_is_anonymous(vma)) {
+ if (!vma_is_anonymous(vma)) {
struct file *file = get_file(vma->vm_file);
pgoff_t pgoff = linear_page_index(vma,
khugepaged_scan.address);
@@ -2486,7 +2494,7 @@ breakouterloop:
breakouterloop_mmap_lock:
spin_lock(&khugepaged_mm_lock);
- VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
+ VM_BUG_ON(khugepaged_scan.mm_slot != slot);
/*
* Release the current mm_slot if this mm is about to die, or
* if we scanned all vmas of this mm.
@@ -2497,18 +2505,15 @@ breakouterloop_mmap_lock:
* khugepaged runs here, khugepaged_exit will find
* mm_slot not pointing to the exiting mm.
*/
- if (slot->mm_node.next != &khugepaged_scan.mm_head) {
- slot = list_entry(slot->mm_node.next,
- struct mm_slot, mm_node);
- khugepaged_scan.mm_slot =
- mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
+ if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) {
+ khugepaged_scan.mm_slot = list_next_entry(slot, mm_node);
khugepaged_scan.address = 0;
} else {
khugepaged_scan.mm_slot = NULL;
khugepaged_full_scans++;
}
- collect_mm_slot(mm_slot);
+ collect_mm_slot(slot);
}
return progress;
@@ -2595,7 +2600,7 @@ static void khugepaged_wait_work(void)
static int khugepaged(void *none)
{
- struct khugepaged_mm_slot *mm_slot;
+ struct mm_slot *slot;
set_freezable();
set_user_nice(current, MAX_NICE);
@@ -2606,10 +2611,10 @@ static int khugepaged(void *none)
}
spin_lock(&khugepaged_mm_lock);
- mm_slot = khugepaged_scan.mm_slot;
+ slot = khugepaged_scan.mm_slot;
khugepaged_scan.mm_slot = NULL;
- if (mm_slot)
- collect_mm_slot(mm_slot);
+ if (slot)
+ collect_mm_slot(slot);
spin_unlock(&khugepaged_mm_lock);
return 0;
}
@@ -2736,8 +2741,8 @@ static int madvise_collapse_errno(enum scan_result r)
}
}
-int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
- unsigned long start, unsigned long end)
+int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, bool *lock_dropped)
{
struct collapse_control *cc;
struct mm_struct *mm = vma->vm_mm;
@@ -2748,9 +2753,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
BUG_ON(vma->vm_start > start);
BUG_ON(vma->vm_end < end);
- *prev = vma;
-
- if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
return -EINVAL;
cc = kmalloc(sizeof(*cc), GFP_KERNEL);
@@ -2783,7 +2786,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
mmap_assert_locked(mm);
memset(cc->node_load, 0, sizeof(cc->node_load));
nodes_clear(cc->alloc_nmask);
- if (IS_ENABLED(CONFIG_SHMEM) && !vma_is_anonymous(vma)) {
+ if (!vma_is_anonymous(vma)) {
struct file *file = get_file(vma->vm_file);
pgoff_t pgoff = linear_page_index(vma, addr);
@@ -2797,7 +2800,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
&mmap_locked, cc);
}
if (!mmap_locked)
- *prev = NULL; /* Tell caller we dropped mmap_lock */
+ *lock_dropped = true;
handle_result:
switch (result) {
@@ -2807,7 +2810,6 @@ handle_result:
break;
case SCAN_PTE_MAPPED_HUGEPAGE:
BUG_ON(mmap_locked);
- BUG_ON(*prev);
mmap_read_lock(mm);
result = collapse_pte_mapped_thp(mm, addr, true);
mmap_read_unlock(mm);
@@ -2816,7 +2818,6 @@ handle_result:
case SCAN_PMD_NULL:
case SCAN_PTE_NON_PRESENT:
case SCAN_PTE_UFFD_WP:
- case SCAN_PAGE_RO:
case SCAN_LACK_REFERENCED_PAGE:
case SCAN_PAGE_NULL:
case SCAN_PAGE_COUNT:
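As the hunks above show, madvise_collapse() now reports a dropped mmap_lock through a boolean out-parameter instead of clearing *prev. A sketch of the caller-side shape this implies (variable names are assumptions, not the actual mm/madvise.c hunk):

        bool lock_dropped = false;
        int err;

        err = madvise_collapse(vma, start, end, &lock_dropped);
        /* If lock_dropped is set, mmap_lock was released and the VMA must be re-looked up. */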
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index c12cef3eeb32..1ac56ceb29b6 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -210,13 +210,11 @@ static struct kmem_cache *object_cache;
static struct kmem_cache *scan_area_cache;
/* set if tracing memory operations is enabled */
-static int kmemleak_enabled = 1;
+static int kmemleak_enabled __read_mostly = 1;
/* same as above but only for the kmemleak_free() callback */
-static int kmemleak_free_enabled = 1;
+static int kmemleak_free_enabled __read_mostly = 1;
/* set in the late_initcall if there were no errors */
static int kmemleak_late_initialized;
-/* set if a kmemleak warning was issued */
-static int kmemleak_warning;
/* set if a fatal kmemleak error has occurred */
static int kmemleak_error;
@@ -254,7 +252,6 @@ static void kmemleak_disable(void);
#define kmemleak_warn(x...) do { \
pr_warn(x); \
dump_stack(); \
- kmemleak_warning = 1; \
} while (0)
/*
@@ -325,8 +322,6 @@ static void hex_dump_object(struct seq_file *seq,
* sufficient references to it (count >= min_count)
* - black - ignore, it doesn't contain references (e.g. text section)
* (min_count == -1). No function defined for this color.
- * Newly created objects don't have any color assigned (object->count == -1)
- * before the next memory scan when they become white.
*/
static bool color_white(const struct kmemleak_object *object)
{
@@ -442,9 +437,15 @@ static struct kmemleak_object *__lookup_object(unsigned long ptr, int alias,
else if (untagged_objp == untagged_ptr || alias)
return object;
else {
+ /*
+ * Defer printk() while kmemleak_lock is held,
+ * in order to avoid a deadlock.
+ */
+ printk_deferred_enter();
kmemleak_warn("Found object by alias at 0x%08lx\n",
ptr);
dump_object_info(object);
+ printk_deferred_exit();
break;
}
}
@@ -475,6 +476,7 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp)
{
unsigned long flags;
struct kmemleak_object *object;
+ bool warn = false;
/* try the slab allocator first */
if (object_cache) {
@@ -493,8 +495,10 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp)
else if (mem_pool_free_count)
object = &mem_pool[--mem_pool_free_count];
else
- pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n");
+ warn = true;
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
+ if (warn)
+ pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n");
return object;
}
@@ -738,6 +742,11 @@ static int __link_object(struct kmemleak_object *object, unsigned long ptr,
else if (untagged_objp + parent->size <= untagged_ptr)
link = &parent->rb_node.rb_right;
else {
+ /*
+ * Defer printk() while kmemleak_lock is held,
+ * in order to avoid a deadlock.
+ */
+ printk_deferred_enter();
kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n",
ptr);
/*
@@ -745,6 +754,7 @@ static int __link_object(struct kmemleak_object *object, unsigned long ptr,
* be freed while the kmemleak_lock is held.
*/
dump_object_info(parent);
+ printk_deferred_exit();
return -EEXIST;
}
}
@@ -858,13 +868,8 @@ static void delete_object_part(unsigned long ptr, size_t size,
raw_spin_lock_irqsave(&kmemleak_lock, flags);
object = __find_and_remove_object(ptr, 1, objflags);
- if (!object) {
-#ifdef DEBUG
- kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n",
- ptr, size);
-#endif
+ if (!object)
goto unlock;
- }
/*
* Create one or two objects that may result from the memory block
@@ -884,8 +889,14 @@ static void delete_object_part(unsigned long ptr, size_t size,
unlock:
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
- if (object)
+ if (object) {
__delete_object(object);
+ } else {
+#ifdef DEBUG
+ kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n",
+ ptr, size);
+#endif
+ }
out:
if (object_l)
@@ -1252,6 +1263,20 @@ void __ref kmemleak_transient_leak(const void *ptr)
EXPORT_SYMBOL(kmemleak_transient_leak);
/**
+ * kmemleak_ignore_percpu - similar to kmemleak_ignore but taking a percpu
+ * address argument
+ * @ptr: percpu address of the object
+ */
+void __ref kmemleak_ignore_percpu(const void __percpu *ptr)
+{
+ pr_debug("%s(0x%px)\n", __func__, ptr);
+
+ if (kmemleak_enabled && ptr && !IS_ERR_PCPU(ptr))
+ make_black_object((unsigned long)ptr, OBJECT_PERCPU);
+}
+EXPORT_SYMBOL_GPL(kmemleak_ignore_percpu);
+
+/**
* kmemleak_ignore - ignore an allocated object
* @ptr: pointer to beginning of the object
*
@@ -2172,6 +2197,7 @@ static const struct file_operations kmemleak_fops = {
static void __kmemleak_do_cleanup(void)
{
struct kmemleak_object *object, *tmp;
+ unsigned int cnt = 0;
/*
* Kmemleak has already been disabled, no need for RCU list traversal
@@ -2180,6 +2206,10 @@ static void __kmemleak_do_cleanup(void)
list_for_each_entry_safe(object, tmp, &object_list, object_list) {
__remove_object(object);
__delete_object(object);
+
+ /* Call cond_resched() once per 64 iterations to avoid soft lockup */
+ if (!(++cnt & 0x3f))
+ cond_resched();
}
}
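For context, the new kmemleak_ignore_percpu() helper lets callers exclude intentionally long-lived percpu allocations from leak scanning and reporting. A minimal usage sketch (the struct and init function below are hypothetical):

        static struct my_stats __percpu *stats;        /* hypothetical percpu object */

        static int __init my_stats_init(void)
        {
                stats = alloc_percpu(struct my_stats);
                if (!stats)
                        return -ENOMEM;
                /* Kept for the lifetime of the kernel by design; don't report it. */
                kmemleak_ignore_percpu(stats);
                return 0;
        }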
diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c
index a495debf1436..8bca7fece47f 100644
--- a/mm/kmsan/core.c
+++ b/mm/kmsan/core.c
@@ -159,8 +159,8 @@ depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id)
* Make sure we have enough spare bits in @id to hold the UAF bit and
* the chain depth.
*/
- BUILD_BUG_ON(
- (1 << STACK_DEPOT_EXTRA_BITS) <= (KMSAN_MAX_ORIGIN_DEPTH << 1));
+ BUILD_BUG_ON((1 << STACK_DEPOT_EXTRA_BITS) <=
+ (KMSAN_MAX_ORIGIN_DEPTH << 1));
extra_bits = stack_depot_get_extra_bits(id);
depth = kmsan_depth_from_eb(extra_bits);
@@ -195,7 +195,8 @@ void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b,
u32 origin, bool checked)
{
u64 address = (u64)addr;
- u32 *shadow_start, *origin_start;
+ void *shadow_start;
+ u32 *aligned_shadow, *origin_start;
size_t pad = 0;
KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(addr, size));
@@ -214,9 +215,12 @@ void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b,
}
__memset(shadow_start, b, size);
- if (!IS_ALIGNED(address, KMSAN_ORIGIN_SIZE)) {
+ if (IS_ALIGNED(address, KMSAN_ORIGIN_SIZE)) {
+ aligned_shadow = shadow_start;
+ } else {
pad = address % KMSAN_ORIGIN_SIZE;
address -= pad;
+ aligned_shadow = shadow_start - pad;
size += pad;
}
size = ALIGN(size, KMSAN_ORIGIN_SIZE);
@@ -230,7 +234,7 @@ void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b,
* corresponding shadow slot is zero.
*/
for (int i = 0; i < size / KMSAN_ORIGIN_SIZE; i++) {
- if (origin || !shadow_start[i])
+ if (origin || !aligned_shadow[i])
origin_start[i] = origin;
}
}
@@ -274,11 +278,9 @@ void kmsan_internal_check_memory(void *addr, size_t size,
* bytes before, report them.
*/
if (cur_origin) {
- kmsan_enter_runtime();
kmsan_report(cur_origin, addr, size,
cur_off_start, pos - 1, user_addr,
reason);
- kmsan_leave_runtime();
}
cur_origin = 0;
cur_off_start = -1;
@@ -292,11 +294,9 @@ void kmsan_internal_check_memory(void *addr, size_t size,
* poisoned bytes before, report them.
*/
if (cur_origin) {
- kmsan_enter_runtime();
kmsan_report(cur_origin, addr, size,
cur_off_start, pos + i - 1,
user_addr, reason);
- kmsan_leave_runtime();
}
cur_origin = 0;
cur_off_start = -1;
@@ -312,11 +312,9 @@ void kmsan_internal_check_memory(void *addr, size_t size,
*/
if (cur_origin != new_origin) {
if (cur_origin) {
- kmsan_enter_runtime();
kmsan_report(cur_origin, addr, size,
cur_off_start, pos + i - 1,
user_addr, reason);
- kmsan_leave_runtime();
}
cur_origin = new_origin;
cur_off_start = pos + i;
@@ -326,10 +324,8 @@ void kmsan_internal_check_memory(void *addr, size_t size,
}
KMSAN_WARN_ON(pos != size);
if (cur_origin) {
- kmsan_enter_runtime();
kmsan_report(cur_origin, addr, size, cur_off_start, pos - 1,
user_addr, reason);
- kmsan_leave_runtime();
}
}
diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c
index 3df45c25c1f6..2cee59d89c80 100644
--- a/mm/kmsan/hooks.c
+++ b/mm/kmsan/hooks.c
@@ -114,9 +114,7 @@ void kmsan_kfree_large(const void *ptr)
kmsan_enter_runtime();
page = virt_to_head_page((void *)ptr);
KMSAN_WARN_ON(ptr != page_address(page));
- kmsan_internal_poison_memory((void *)ptr,
- page_size(page),
- GFP_KERNEL,
+ kmsan_internal_poison_memory((void *)ptr, page_size(page), GFP_KERNEL,
KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
kmsan_leave_runtime();
}
@@ -277,8 +275,10 @@ void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy,
* Don't check anything, just copy the shadow of the copied
* bytes.
*/
+ kmsan_enter_runtime();
kmsan_internal_memmove_metadata((void *)to, (void *)from,
to_copy - left);
+ kmsan_leave_runtime();
}
user_access_restore(ua_flags);
}
@@ -336,14 +336,15 @@ static void kmsan_handle_dma_page(const void *addr, size_t size,
}
/* Helper function to handle DMA data transfers. */
-void kmsan_handle_dma(struct page *page, size_t offset, size_t size,
+void kmsan_handle_dma(phys_addr_t phys, size_t size,
enum dma_data_direction dir)
{
- u64 page_offset, to_go, addr;
+ u64 page_offset, to_go;
+ void *addr;
- if (PageHighMem(page))
+ if (PhysHighMem(phys))
return;
- addr = (u64)page_address(page) + offset;
+ addr = phys_to_virt(phys);
/*
* The kernel may occasionally give us adjacent DMA pages not belonging
* to the same allocation. Process them separately to avoid triggering
@@ -366,8 +367,7 @@ void kmsan_handle_dma_sg(struct scatterlist *sg, int nents,
int i;
for_each_sg(sg, item, nents, i)
- kmsan_handle_dma(sg_page(item), item->offset, item->length,
- dir);
+ kmsan_handle_dma(sg_phys(item), item->length, dir);
}
/* Functions from kmsan-checks.h follow. */
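With kmsan_handle_dma() now taking a physical address instead of a page plus offset, a DMA mapping path converts before the call; roughly (illustrative only, surrounding variables assumed):

        /* page/offset-based callers now pass the physical address instead */
        phys_addr_t phys = page_to_phys(page) + offset;

        kmsan_handle_dma(phys, size, dir);      /* dir is the enum dma_data_direction */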
diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c
index 10f52c085e6c..b14ce3417e65 100644
--- a/mm/kmsan/init.c
+++ b/mm/kmsan/init.c
@@ -35,8 +35,7 @@ static void __init kmsan_record_future_shadow_range(void *start, void *end)
KMSAN_WARN_ON(future_index == NUM_FUTURE_RANGES);
KMSAN_WARN_ON((nstart >= nend) ||
/* Virtual address 0 is valid on s390. */
- (!IS_ENABLED(CONFIG_S390) && !nstart) ||
- !nend);
+ (!IS_ENABLED(CONFIG_S390) && !nstart) || !nend);
nstart = ALIGN_DOWN(nstart, PAGE_SIZE);
nend = ALIGN(nend, PAGE_SIZE);
diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c
index 02a405e55d6c..69f0a57a401c 100644
--- a/mm/kmsan/instrumentation.c
+++ b/mm/kmsan/instrumentation.c
@@ -312,13 +312,9 @@ EXPORT_SYMBOL(__msan_unpoison_alloca);
void __msan_warning(u32 origin);
void __msan_warning(u32 origin)
{
- if (!kmsan_enabled || kmsan_in_runtime())
- return;
- kmsan_enter_runtime();
kmsan_report(origin, /*address*/ NULL, /*size*/ 0,
/*off_first*/ 0, /*off_last*/ 0, /*user_addr*/ NULL,
REASON_ANY);
- kmsan_leave_runtime();
}
EXPORT_SYMBOL(__msan_warning);
diff --git a/mm/kmsan/kmsan.h b/mm/kmsan/kmsan.h
index 29555a8bc315..bc3d1810f352 100644
--- a/mm/kmsan/kmsan.h
+++ b/mm/kmsan/kmsan.h
@@ -121,7 +121,6 @@ static __always_inline void kmsan_leave_runtime(void)
KMSAN_WARN_ON(--ctx->kmsan_in_runtime);
}
-depot_stack_handle_t kmsan_save_stack(void);
depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags,
unsigned int extra_bits);
diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c
index 9733a22c46c1..902ec48b1e3e 100644
--- a/mm/kmsan/kmsan_test.c
+++ b/mm/kmsan/kmsan_test.c
@@ -556,6 +556,21 @@ DEFINE_TEST_MEMSETXX(16)
DEFINE_TEST_MEMSETXX(32)
DEFINE_TEST_MEMSETXX(64)
+/* Test case: ensure that KMSAN does not access shadow memory out of bounds. */
+static void test_memset_on_guarded_buffer(struct kunit *test)
+{
+ void *buf = vmalloc(PAGE_SIZE);
+
+ kunit_info(test,
+ "memset() on ends of guarded buffer should not crash\n");
+
+ for (size_t size = 0; size <= 128; size++) {
+ memset(buf, 0xff, size);
+ memset(buf + PAGE_SIZE - size, 0xff, size);
+ }
+ vfree(buf);
+}
+
static noinline void fibonacci(int *array, int size, int start)
{
if (start < 2 || (start == size))
@@ -677,6 +692,7 @@ static struct kunit_case kmsan_test_cases[] = {
KUNIT_CASE(test_memset16),
KUNIT_CASE(test_memset32),
KUNIT_CASE(test_memset64),
+ KUNIT_CASE(test_memset_on_guarded_buffer),
KUNIT_CASE(test_long_origin_chain),
KUNIT_CASE(test_stackdepot_roundtrip),
KUNIT_CASE(test_unpoison_memory),
@@ -732,3 +748,4 @@ kunit_test_suites(&kmsan_test_suite);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Alexander Potapenko <glider@google.com>");
+MODULE_DESCRIPTION("Test cases for KMSAN");
diff --git a/mm/kmsan/report.c b/mm/kmsan/report.c
index 94a3303fb65e..d6853ce08954 100644
--- a/mm/kmsan/report.c
+++ b/mm/kmsan/report.c
@@ -157,14 +157,14 @@ void kmsan_report(depot_stack_handle_t origin, void *address, int size,
unsigned long ua_flags;
bool is_uaf;
- if (!kmsan_enabled)
+ if (!kmsan_enabled || kmsan_in_runtime())
return;
if (current->kmsan_ctx.depth)
return;
if (!origin)
return;
- kmsan_disable_current();
+ kmsan_enter_runtime();
ua_flags = user_access_save();
raw_spin_lock(&kmsan_report_lock);
pr_err("=====================================================\n");
@@ -217,5 +217,5 @@ void kmsan_report(depot_stack_handle_t origin, void *address, int size,
if (panic_on_kmsan)
panic("kmsan.panic set ...\n");
user_access_restore(ua_flags);
- kmsan_enable_current();
+ kmsan_leave_runtime();
}
diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c
index 1bb505a08415..54f3c3c962f0 100644
--- a/mm/kmsan/shadow.c
+++ b/mm/kmsan/shadow.c
@@ -207,8 +207,7 @@ void kmsan_free_page(struct page *page, unsigned int order)
if (!kmsan_enabled || kmsan_in_runtime())
return;
kmsan_enter_runtime();
- kmsan_internal_poison_memory(page_address(page),
- page_size(page),
+ kmsan_internal_poison_memory(page_address(page), page_size(page),
GFP_KERNEL,
KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
kmsan_leave_runtime();
@@ -248,17 +247,19 @@ int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
kmsan_enter_runtime();
mapped = __vmap_pages_range_noflush(shadow_start, shadow_end, prot,
s_pages, page_shift);
+ kmsan_leave_runtime();
if (mapped) {
err = mapped;
goto ret;
}
+ kmsan_enter_runtime();
mapped = __vmap_pages_range_noflush(origin_start, origin_end, prot,
o_pages, page_shift);
+ kmsan_leave_runtime();
if (mapped) {
err = mapped;
goto ret;
}
- kmsan_leave_runtime();
flush_tlb_kernel_range(shadow_start, shadow_end);
flush_tlb_kernel_range(origin_start, origin_end);
flush_cache_vmap(shadow_start, shadow_end);
diff --git a/mm/ksm.c b/mm/ksm.c
index 8583fb91ef13..7bc726b50b2f 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -677,28 +677,32 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_v
return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}
-static bool vma_ksm_compatible(struct vm_area_struct *vma)
+static bool ksm_compatible(const struct file *file, vm_flags_t vm_flags)
{
- if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE | VM_PFNMAP |
- VM_IO | VM_DONTEXPAND | VM_HUGETLB |
- VM_MIXEDMAP| VM_DROPPABLE))
+ if (vm_flags & (VM_SHARED | VM_MAYSHARE | VM_SPECIAL |
+ VM_HUGETLB | VM_DROPPABLE))
return false; /* just ignore the advice */
- if (vma_is_dax(vma))
+ if (file_is_dax(file))
return false;
#ifdef VM_SAO
- if (vma->vm_flags & VM_SAO)
+ if (vm_flags & VM_SAO)
return false;
#endif
#ifdef VM_SPARC_ADI
- if (vma->vm_flags & VM_SPARC_ADI)
+ if (vm_flags & VM_SPARC_ADI)
return false;
#endif
return true;
}
+static bool vma_ksm_compatible(struct vm_area_struct *vma)
+{
+ return ksm_compatible(vma->vm_file, vma->vm_flags);
+}
+
static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
unsigned long addr)
{
@@ -889,7 +893,7 @@ static struct folio *ksm_get_folio(struct ksm_stable_node *stable_node,
unsigned long kpfn;
expected_mapping = (void *)((unsigned long)stable_node |
- PAGE_MAPPING_KSM);
+ FOLIO_MAPPING_KSM);
again:
kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */
folio = pfn_folio(kpfn);
@@ -1057,16 +1061,11 @@ struct ksm_stable_node *folio_stable_node(const struct folio *folio)
return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL;
}
-static inline struct ksm_stable_node *page_stable_node(struct page *page)
-{
- return folio_stable_node(page_folio(page));
-}
-
static inline void folio_set_stable_node(struct folio *folio,
struct ksm_stable_node *stable_node)
{
VM_WARN_ON_FOLIO(folio_test_anon(folio) && PageAnonExclusive(&folio->page), folio);
- folio->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
+ folio->mapping = (void *)((unsigned long)stable_node | FOLIO_MAPPING_KSM);
}
#ifdef CONFIG_SYSFS
@@ -1213,8 +1212,8 @@ mm_exiting:
spin_unlock(&ksm_mmlist_lock);
mm_slot_free(mm_slot_cache, mm_slot);
- clear_bit(MMF_VM_MERGEABLE, &mm->flags);
- clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
+ mm_flags_clear(MMF_VM_MERGEABLE, mm);
+ mm_flags_clear(MMF_VM_MERGE_ANY, mm);
mmdrop(mm);
} else
spin_unlock(&ksm_mmlist_lock);
@@ -2221,6 +2220,7 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item,
*/
static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item)
{
+ struct folio *folio = page_folio(page);
struct ksm_rmap_item *tree_rmap_item;
struct page *tree_page = NULL;
struct ksm_stable_node *stable_node;
@@ -2229,7 +2229,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite
int err;
bool max_page_sharing_bypass = false;
- stable_node = page_stable_node(page);
+ stable_node = folio_stable_node(folio);
if (stable_node) {
if (stable_node->head != &migrate_nodes &&
get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
@@ -2268,7 +2268,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite
/* Start by searching for the folio in the stable tree */
kfolio = stable_tree_search(page);
- if (&kfolio->page == page && rmap_item->head == stable_node) {
+ if (kfolio == folio && rmap_item->head == stable_node) {
folio_put(kfolio);
return;
}
@@ -2349,10 +2349,11 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite
* the page is locked, it is better to skip it and
* perhaps try again later.
*/
- if (!trylock_page(page))
+ if (!folio_trylock(folio))
return;
split_huge_page(page);
- unlock_page(page);
+ folio = page_folio(page);
+ folio_unlock(folio);
}
}
}
@@ -2616,8 +2617,8 @@ no_vmas:
spin_unlock(&ksm_mmlist_lock);
mm_slot_free(mm_slot_cache, mm_slot);
- clear_bit(MMF_VM_MERGEABLE, &mm->flags);
- clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
+ mm_flags_clear(MMF_VM_MERGEABLE, mm);
+ mm_flags_clear(MMF_VM_MERGE_ANY, mm);
mmap_read_unlock(mm);
mmdrop(mm);
} else {
@@ -2696,14 +2697,17 @@ static int ksm_scan_thread(void *nothing)
return 0;
}
-static void __ksm_add_vma(struct vm_area_struct *vma)
+static bool __ksm_should_add_vma(const struct file *file, vm_flags_t vm_flags)
{
- unsigned long vm_flags = vma->vm_flags;
-
if (vm_flags & VM_MERGEABLE)
- return;
+ return false;
- if (vma_ksm_compatible(vma))
+ return ksm_compatible(file, vm_flags);
+}
+
+static void __ksm_add_vma(struct vm_area_struct *vma)
+{
+ if (__ksm_should_add_vma(vma->vm_file, vma->vm_flags))
vm_flags_set(vma, VM_MERGEABLE);
}
@@ -2724,16 +2728,22 @@ static int __ksm_del_vma(struct vm_area_struct *vma)
return 0;
}
/**
- * ksm_add_vma - Mark vma as mergeable if compatible
+ * ksm_vma_flags - Update VMA flags to mark as mergeable if compatible
*
- * @vma: Pointer to vma
+ * @mm: Proposed VMA's mm_struct
+ * @file: Proposed VMA's file-backed mapping, if any.
+ * @vm_flags: Proposed VMA's flags.
+ *
+ * Returns: @vm_flags possibly updated to mark mergeable.
*/
-void ksm_add_vma(struct vm_area_struct *vma)
+vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file,
+ vm_flags_t vm_flags)
{
- struct mm_struct *mm = vma->vm_mm;
+ if (mm_flags_test(MMF_VM_MERGE_ANY, mm) &&
+ __ksm_should_add_vma(file, vm_flags))
+ vm_flags |= VM_MERGEABLE;
- if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
- __ksm_add_vma(vma);
+ return vm_flags;
}
static void ksm_add_vmas(struct mm_struct *mm)
@@ -2771,16 +2781,16 @@ int ksm_enable_merge_any(struct mm_struct *mm)
{
int err;
- if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+ if (mm_flags_test(MMF_VM_MERGE_ANY, mm))
return 0;
- if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
+ if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) {
err = __ksm_enter(mm);
if (err)
return err;
}
- set_bit(MMF_VM_MERGE_ANY, &mm->flags);
+ mm_flags_set(MMF_VM_MERGE_ANY, mm);
ksm_add_vmas(mm);
return 0;
@@ -2802,7 +2812,7 @@ int ksm_disable_merge_any(struct mm_struct *mm)
{
int err;
- if (!test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+ if (!mm_flags_test(MMF_VM_MERGE_ANY, mm))
return 0;
err = ksm_del_vmas(mm);
@@ -2811,7 +2821,7 @@ int ksm_disable_merge_any(struct mm_struct *mm)
return err;
}
- clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
+ mm_flags_clear(MMF_VM_MERGE_ANY, mm);
return 0;
}
@@ -2819,15 +2829,15 @@ int ksm_disable(struct mm_struct *mm)
{
mmap_assert_write_locked(mm);
- if (!test_bit(MMF_VM_MERGEABLE, &mm->flags))
+ if (!mm_flags_test(MMF_VM_MERGEABLE, mm))
return 0;
- if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+ if (mm_flags_test(MMF_VM_MERGE_ANY, mm))
return ksm_disable_merge_any(mm);
return ksm_del_vmas(mm);
}
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, int advice, unsigned long *vm_flags)
+ unsigned long end, int advice, vm_flags_t *vm_flags)
{
struct mm_struct *mm = vma->vm_mm;
int err;
@@ -2839,7 +2849,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
if (!vma_ksm_compatible(vma))
return 0;
- if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
+ if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) {
err = __ksm_enter(mm);
if (err)
return err;
@@ -2899,7 +2909,7 @@ int __ksm_enter(struct mm_struct *mm)
list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node);
spin_unlock(&ksm_mmlist_lock);
- set_bit(MMF_VM_MERGEABLE, &mm->flags);
+ mm_flags_set(MMF_VM_MERGEABLE, mm);
mmgrab(mm);
if (needs_wakeup)
@@ -2911,7 +2921,7 @@ int __ksm_enter(struct mm_struct *mm)
void __ksm_exit(struct mm_struct *mm)
{
- struct ksm_mm_slot *mm_slot;
+ struct ksm_mm_slot *mm_slot = NULL;
struct mm_slot *slot;
int easy_to_free = 0;
@@ -2926,23 +2936,26 @@ void __ksm_exit(struct mm_struct *mm)
spin_lock(&ksm_mmlist_lock);
slot = mm_slot_lookup(mm_slots_hash, mm);
+ if (!slot)
+ goto unlock;
mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
- if (mm_slot && ksm_scan.mm_slot != mm_slot) {
- if (!mm_slot->rmap_list) {
- hash_del(&slot->hash);
- list_del(&slot->mm_node);
- easy_to_free = 1;
- } else {
- list_move(&slot->mm_node,
- &ksm_scan.mm_slot->slot.mm_node);
- }
+ if (ksm_scan.mm_slot == mm_slot)
+ goto unlock;
+ if (!mm_slot->rmap_list) {
+ hash_del(&slot->hash);
+ list_del(&slot->mm_node);
+ easy_to_free = 1;
+ } else {
+ list_move(&slot->mm_node,
+ &ksm_scan.mm_slot->slot.mm_node);
}
+unlock:
spin_unlock(&ksm_mmlist_lock);
if (easy_to_free) {
mm_slot_free(mm_slot_cache, mm_slot);
- clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
- clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+ mm_flags_clear(MMF_VM_MERGE_ANY, mm);
+ mm_flags_clear(MMF_VM_MERGEABLE, mm);
mmdrop(mm);
} else if (mm_slot) {
mmap_write_lock(mm);
@@ -3669,10 +3682,10 @@ static ssize_t advisor_mode_show(struct kobject *kobj,
{
const char *output;
- if (ksm_advisor == KSM_ADVISOR_NONE)
- output = "[none] scan-time";
- else if (ksm_advisor == KSM_ADVISOR_SCAN_TIME)
+ if (ksm_advisor == KSM_ADVISOR_SCAN_TIME)
output = "none [scan-time]";
+ else
+ output = "[none] scan-time";
return sysfs_emit(buf, "%s\n", output);
}
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 490473af3122..ec48b5dadf51 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -60,30 +60,34 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
return &lru->node[nid].lru;
}
+static inline bool lock_list_lru(struct list_lru_one *l, bool irq)
+{
+ if (irq)
+ spin_lock_irq(&l->lock);
+ else
+ spin_lock(&l->lock);
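+ /* nr_items == LONG_MIN marks an lru that was reparented and is now dead. */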
+ if (unlikely(READ_ONCE(l->nr_items) == LONG_MIN)) {
+ if (irq)
+ spin_unlock_irq(&l->lock);
+ else
+ spin_unlock(&l->lock);
+ return false;
+ }
+ return true;
+}
+
static inline struct list_lru_one *
lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
bool irq, bool skip_empty)
{
struct list_lru_one *l;
- long nr_items;
rcu_read_lock();
again:
l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
- if (likely(l)) {
- if (irq)
- spin_lock_irq(&l->lock);
- else
- spin_lock(&l->lock);
- nr_items = READ_ONCE(l->nr_items);
- if (likely(nr_items != LONG_MIN)) {
- rcu_read_unlock();
- return l;
- }
- if (irq)
- spin_unlock_irq(&l->lock);
- else
- spin_unlock(&l->lock);
+ if (likely(l) && lock_list_lru(l, irq)) {
+ rcu_read_unlock();
+ return l;
}
/*
* Caller may simply bail out if raced with reparenting or
diff --git a/mm/maccess.c b/mm/maccess.c
index 8f0906180a94..486559d68858 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -82,7 +82,6 @@ Efault:
pagefault_enable();
return -EFAULT;
}
-EXPORT_SYMBOL_GPL(copy_to_kernel_nofault);
long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
{
@@ -196,7 +195,7 @@ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
if (ret >= count) {
ret = count;
dst[ret - 1] = '\0';
- } else if (ret > 0) {
+ } else if (ret >= 0) {
ret++;
}
diff --git a/mm/madvise.c b/mm/madvise.c
index b17f684322ad..fb1c86e630b6 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -19,6 +19,7 @@
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mm_inline.h>
+#include <linux/mmu_context.h>
#include <linux/string.h>
#include <linux/uio.h>
#include <linux/ksm.h>
@@ -37,6 +38,8 @@
#include "internal.h"
#include "swap.h"
+#define __MADV_SET_ANON_VMA_NAME (-1)
+
/*
* Maximum number of attempts we make to install guard pages before we give up
* and return -ERESTARTNOINTR to have userspace try again.
@@ -48,34 +51,39 @@ struct madvise_walk_private {
bool pageout;
};
-/*
- * Any behaviour which results in changes to the vma->vm_flags needs to
- * take mmap_lock for writing. Others, which simply traverse vmas, need
- * to only take it for reading.
- */
-static int madvise_need_mmap_write(int behavior)
-{
- switch (behavior) {
- case MADV_REMOVE:
- case MADV_WILLNEED:
- case MADV_DONTNEED:
- case MADV_DONTNEED_LOCKED:
- case MADV_COLD:
- case MADV_PAGEOUT:
- case MADV_FREE:
- case MADV_POPULATE_READ:
- case MADV_POPULATE_WRITE:
- case MADV_COLLAPSE:
- case MADV_GUARD_INSTALL:
- case MADV_GUARD_REMOVE:
- return 0;
- default:
- /* be safe, default to 1. list exceptions explicitly */
- return 1;
- }
-}
+enum madvise_lock_mode {
+ MADVISE_NO_LOCK,
+ MADVISE_MMAP_READ_LOCK,
+ MADVISE_MMAP_WRITE_LOCK,
+ MADVISE_VMA_READ_LOCK,
+};
+
+struct madvise_behavior_range {
+ unsigned long start;
+ unsigned long end;
+};
+
+struct madvise_behavior {
+ struct mm_struct *mm;
+ int behavior;
+ struct mmu_gather *tlb;
+ enum madvise_lock_mode lock_mode;
+ struct anon_vma_name *anon_name;
+
+ /*
+ * The range over which the behaviour is currently being applied. If
+ * traversing multiple VMAs, this is updated for each.
+ */
+ struct madvise_behavior_range range;
+ /* The VMA and VMA preceding it (if applicable) currently targeted. */
+ struct vm_area_struct *prev;
+ struct vm_area_struct *vma;
+ bool lock_dropped;
+};
#ifdef CONFIG_ANON_VMA_NAME
+static int madvise_walk_vmas(struct madvise_behavior *madv_behavior);
+
struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
struct anon_vma_name *anon_name;
@@ -101,7 +109,8 @@ void anon_vma_name_free(struct kref *kref)
struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
- mmap_assert_locked(vma->vm_mm);
+ if (!rwsem_is_locked(&vma->vm_mm->mmap_lock))
+ vma_assert_locked(vma);
return vma->anon_name;
}
@@ -137,40 +146,39 @@ static int replace_anon_vma_name(struct vm_area_struct *vma,
}
#endif /* CONFIG_ANON_VMA_NAME */
/*
- * Update the vm_flags on region of a vma, splitting it or merging it as
- * necessary. Must be called with mmap_lock held for writing;
- * Caller should ensure anon_name stability by raising its refcount even when
- * anon_name belongs to a valid vma because this function might free that vma.
+ * Update the vm_flags or anon_name on region of a vma, splitting it or merging
+ * it as necessary. Must be called with mmap_lock held for writing.
*/
-static int madvise_update_vma(struct vm_area_struct *vma,
- struct vm_area_struct **prev, unsigned long start,
- unsigned long end, unsigned long new_flags,
- struct anon_vma_name *anon_name)
+static int madvise_update_vma(vm_flags_t new_flags,
+ struct madvise_behavior *madv_behavior)
{
- struct mm_struct *mm = vma->vm_mm;
- int error;
- VMA_ITERATOR(vmi, mm, start);
-
- if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
- *prev = vma;
+ struct vm_area_struct *vma = madv_behavior->vma;
+ struct madvise_behavior_range *range = &madv_behavior->range;
+ struct anon_vma_name *anon_name = madv_behavior->anon_name;
+ bool set_new_anon_name = madv_behavior->behavior == __MADV_SET_ANON_VMA_NAME;
+ VMA_ITERATOR(vmi, madv_behavior->mm, range->start);
+
+ if (new_flags == vma->vm_flags && (!set_new_anon_name ||
+ anon_vma_name_eq(anon_vma_name(vma), anon_name)))
return 0;
- }
- vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags,
- anon_name);
+ if (set_new_anon_name)
+ vma = vma_modify_name(&vmi, madv_behavior->prev, vma,
+ range->start, range->end, anon_name);
+ else
+ vma = vma_modify_flags(&vmi, madv_behavior->prev, vma,
+ range->start, range->end, new_flags);
+
if (IS_ERR(vma))
return PTR_ERR(vma);
- *prev = vma;
+ madv_behavior->vma = vma;
/* vm_flags is protected by the mmap_lock held in write mode. */
vma_start_write(vma);
vm_flags_reset(vma, new_flags);
- if (!vma->vm_file || vma_is_anon_shmem(vma)) {
- error = replace_anon_vma_name(vma, anon_name);
- if (error)
- return error;
- }
+ if (set_new_anon_name)
+ return replace_anon_vma_name(vma, anon_name);
return 0;
}
@@ -263,21 +271,27 @@ static void shmem_swapin_range(struct vm_area_struct *vma,
}
#endif /* CONFIG_SWAP */
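+/* Record that the mmap lock was dropped so the VMA walk re-looks up VMAs. */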
+static void mark_mmap_lock_dropped(struct madvise_behavior *madv_behavior)
+{
+ VM_WARN_ON_ONCE(madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK);
+ madv_behavior->lock_dropped = true;
+}
+
/*
* Schedule all required I/O operations. Do not wait for completion.
*/
-static long madvise_willneed(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end)
+static long madvise_willneed(struct madvise_behavior *madv_behavior)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *vma = madv_behavior->vma;
+ struct mm_struct *mm = madv_behavior->mm;
struct file *file = vma->vm_file;
+ unsigned long start = madv_behavior->range.start;
+ unsigned long end = madv_behavior->range.end;
loff_t offset;
- *prev = vma;
#ifdef CONFIG_SWAP
if (!file) {
- walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
+ walk_page_range_vma(vma, start, end, &swapin_walk_ops, vma);
lru_add_drain(); /* Push any new pages onto the LRU now */
return 0;
}
@@ -303,7 +317,7 @@ static long madvise_willneed(struct vm_area_struct *vma,
* vma's reference to the file) can go away as soon as we drop
* mmap_lock.
*/
- *prev = NULL; /* tell sys_madvise we drop mmap_lock */
+ mark_mmap_lock_dropped(madv_behavior);
get_file(file);
offset = (loff_t)(start - vma->vm_start)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
@@ -331,14 +345,12 @@ static inline bool can_do_file_pageout(struct vm_area_struct *vma)
static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end,
struct folio *folio, pte_t *ptep,
- pte_t pte, bool *any_young,
- bool *any_dirty)
+ pte_t *ptentp)
{
- const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
int max_nr = (end - addr) / PAGE_SIZE;
- return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
- any_young, any_dirty);
+ return folio_pte_batch_flags(folio, NULL, ptep, ptentp, max_nr,
+ FPB_MERGE_YOUNG_DIRTY);
}
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
@@ -476,13 +488,7 @@ restart:
* next pte in the range.
*/
if (folio_test_large(folio)) {
- bool any_young;
-
- nr = madvise_folio_pte_batch(addr, end, folio, pte,
- ptent, &any_young, NULL);
- if (any_young)
- ptent = pte_mkyoung(ptent);
-
+ nr = madvise_folio_pte_batch(addr, end, folio, pte, &ptent);
if (nr < folio_nr_pages(folio)) {
int err;
@@ -503,6 +509,7 @@ restart:
pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!start_pte)
break;
+ flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
if (!err)
nr = 0;
@@ -567,16 +574,19 @@ static const struct mm_walk_ops cold_walk_ops = {
};
static void madvise_cold_page_range(struct mmu_gather *tlb,
- struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
+ struct madvise_behavior *madv_behavior)
+
{
+ struct vm_area_struct *vma = madv_behavior->vma;
+ struct madvise_behavior_range *range = &madv_behavior->range;
struct madvise_walk_private walk_private = {
.pageout = false,
.tlb = tlb,
};
tlb_start_vma(tlb, vma);
- walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
+ walk_page_range_vma(vma, range->start, range->end, &cold_walk_ops,
+ &walk_private);
tlb_end_vma(tlb, vma);
}
@@ -585,28 +595,25 @@ static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
}
-static long madvise_cold(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start_addr, unsigned long end_addr)
+static long madvise_cold(struct madvise_behavior *madv_behavior)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *vma = madv_behavior->vma;
struct mmu_gather tlb;
- *prev = vma;
if (!can_madv_lru_vma(vma))
return -EINVAL;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm);
- madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
+ tlb_gather_mmu(&tlb, madv_behavior->mm);
+ madvise_cold_page_range(&tlb, madv_behavior);
tlb_finish_mmu(&tlb);
return 0;
}
static void madvise_pageout_page_range(struct mmu_gather *tlb,
- struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
+ struct vm_area_struct *vma,
+ struct madvise_behavior_range *range)
{
struct madvise_walk_private walk_private = {
.pageout = true,
@@ -614,18 +621,16 @@ static void madvise_pageout_page_range(struct mmu_gather *tlb,
};
tlb_start_vma(tlb, vma);
- walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
+ walk_page_range_vma(vma, range->start, range->end, &cold_walk_ops,
+ &walk_private);
tlb_end_vma(tlb, vma);
}
-static long madvise_pageout(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start_addr, unsigned long end_addr)
+static long madvise_pageout(struct madvise_behavior *madv_behavior)
{
- struct mm_struct *mm = vma->vm_mm;
struct mmu_gather tlb;
+ struct vm_area_struct *vma = madv_behavior->vma;
- *prev = vma;
if (!can_madv_lru_vma(vma))
return -EINVAL;
@@ -640,8 +645,8 @@ static long madvise_pageout(struct vm_area_struct *vma,
return 0;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm);
- madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
+ tlb_gather_mmu(&tlb, madv_behavior->mm);
+ madvise_pageout_page_range(&tlb, vma, &madv_behavior->range);
tlb_finish_mmu(&tlb);
return 0;
@@ -713,11 +718,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
* next pte in the range.
*/
if (folio_test_large(folio)) {
- bool any_young, any_dirty;
-
- nr = madvise_folio_pte_batch(addr, end, folio, pte,
- ptent, &any_young, &any_dirty);
-
+ nr = madvise_folio_pte_batch(addr, end, folio, pte, &ptent);
if (nr < folio_nr_pages(folio)) {
int err;
@@ -736,16 +737,12 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
start_pte = pte;
if (!start_pte)
break;
+ flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
if (!err)
nr = 0;
continue;
}
-
- if (any_young)
- ptent = pte_mkyoung(ptent);
- if (any_dirty)
- ptent = pte_mkdirty(ptent);
}
if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
@@ -789,17 +786,31 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
}
-static const struct mm_walk_ops madvise_free_walk_ops = {
- .pmd_entry = madvise_free_pte_range,
- .walk_lock = PGWALK_RDLOCK,
-};
+static inline enum page_walk_lock get_walk_lock(enum madvise_lock_mode mode)
+{
+ switch (mode) {
+ case MADVISE_VMA_READ_LOCK:
+ return PGWALK_VMA_RDLOCK_VERIFY;
+ case MADVISE_MMAP_READ_LOCK:
+ return PGWALK_RDLOCK;
+ default:
+ /* Other modes don't require fixing up the walk_lock */
+ WARN_ON_ONCE(1);
+ return PGWALK_RDLOCK;
+ }
+}
-static int madvise_free_single_vma(struct vm_area_struct *vma,
- unsigned long start_addr, unsigned long end_addr)
+static int madvise_free_single_vma(struct madvise_behavior *madv_behavior)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct mm_struct *mm = madv_behavior->mm;
+ struct vm_area_struct *vma = madv_behavior->vma;
+ unsigned long start_addr = madv_behavior->range.start;
+ unsigned long end_addr = madv_behavior->range.end;
struct mmu_notifier_range range;
- struct mmu_gather tlb;
+ struct mmu_gather *tlb = madv_behavior->tlb;
+ struct mm_walk_ops walk_ops = {
+ .pmd_entry = madvise_free_pte_range,
+ };
/* MADV_FREE works for only anon vma at the moment */
if (!vma_is_anonymous(vma))
@@ -815,17 +826,15 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
range.start, range.end);
lru_add_drain();
- tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(&range);
- tlb_start_vma(&tlb, vma);
- walk_page_range(vma->vm_mm, range.start, range.end,
- &madvise_free_walk_ops, &tlb);
- tlb_end_vma(&tlb, vma);
+ tlb_start_vma(tlb, vma);
+ walk_ops.walk_lock = get_walk_lock(madv_behavior->lock_mode);
+ walk_page_range_vma(vma, range.start, range.end,
+ &walk_ops, tlb);
+ tlb_end_vma(tlb, vma);
mmu_notifier_invalidate_range_end(&range);
- tlb_finish_mmu(&tlb);
-
return 0;
}
@@ -848,23 +857,28 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
* An interface that causes the system to free clean pages and flush
* dirty pages is already available as msync(MS_INVALIDATE).
*/
-static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
+static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior)
+
{
+ struct madvise_behavior_range *range = &madv_behavior->range;
struct zap_details details = {
.reclaim_pt = true,
.even_cows = true,
};
- zap_page_range_single(vma, start, end - start, &details);
+ zap_page_range_single_batched(
+ madv_behavior->tlb, madv_behavior->vma, range->start,
+ range->end - range->start, &details);
return 0;
}
-static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
- unsigned long start,
- unsigned long *end,
- int behavior)
+static
+bool madvise_dontneed_free_valid_vma(struct madvise_behavior *madv_behavior)
{
+ struct vm_area_struct *vma = madv_behavior->vma;
+ int behavior = madv_behavior->behavior;
+ struct madvise_behavior_range *range = &madv_behavior->range;
+
if (!is_vm_hugetlb_page(vma)) {
unsigned int forbidden = VM_PFNMAP;
@@ -876,7 +890,7 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED)
return false;
- if (start & ~huge_page_mask(hstate_vma(vma)))
+ if (range->start & ~huge_page_mask(hstate_vma(vma)))
return false;
/*
@@ -885,40 +899,38 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
* Avoid unexpected data loss by rounding down the number of
* huge pages freed.
*/
- *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma)));
+ range->end = ALIGN_DOWN(range->end, huge_page_size(hstate_vma(vma)));
return true;
}
-static long madvise_dontneed_free(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end,
- int behavior)
+static long madvise_dontneed_free(struct madvise_behavior *madv_behavior)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct mm_struct *mm = madv_behavior->mm;
+ struct madvise_behavior_range *range = &madv_behavior->range;
+ int behavior = madv_behavior->behavior;
- *prev = vma;
- if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
+ if (!madvise_dontneed_free_valid_vma(madv_behavior))
return -EINVAL;
- if (start == end)
+ if (range->start == range->end)
return 0;
- if (!userfaultfd_remove(vma, start, end)) {
- *prev = NULL; /* mmap_lock has been dropped, prev is stale */
+ if (!userfaultfd_remove(madv_behavior->vma, range->start, range->end)) {
+ struct vm_area_struct *vma;
+ mark_mmap_lock_dropped(madv_behavior);
mmap_read_lock(mm);
- vma = vma_lookup(mm, start);
+ madv_behavior->vma = vma = vma_lookup(mm, range->start);
if (!vma)
return -ENOMEM;
/*
* Potential end adjustment for hugetlb vma is OK as
* the check below keeps end within vma.
*/
- if (!madvise_dontneed_free_valid_vma(vma, start, &end,
- behavior))
+ if (!madvise_dontneed_free_valid_vma(madv_behavior))
return -EINVAL;
- if (end > vma->vm_end) {
+ if (range->end > vma->vm_end) {
/*
* Don't fail if end > vma->vm_end. If the old
* vma was split while the mmap_lock was
@@ -931,7 +943,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
* end-vma->vm_end range, but the manager can
* handle a repetition fine.
*/
- end = vma->vm_end;
+ range->end = vma->vm_end;
}
/*
* If the memory region between start and end was
@@ -940,24 +952,26 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
* the adjustment for hugetlb vma above may have rounded
* end down to the start address.
*/
- if (start == end)
+ if (range->start == range->end)
return 0;
- VM_WARN_ON(start > end);
+ VM_WARN_ON(range->start > range->end);
}
if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
- return madvise_dontneed_single_vma(vma, start, end);
+ return madvise_dontneed_single_vma(madv_behavior);
else if (behavior == MADV_FREE)
- return madvise_free_single_vma(vma, start, end);
+ return madvise_free_single_vma(madv_behavior);
else
return -EINVAL;
}
-static long madvise_populate(struct mm_struct *mm, unsigned long start,
- unsigned long end, int behavior)
+static long madvise_populate(struct madvise_behavior *madv_behavior)
{
- const bool write = behavior == MADV_POPULATE_WRITE;
+ struct mm_struct *mm = madv_behavior->mm;
+ const bool write = madv_behavior->behavior == MADV_POPULATE_WRITE;
int locked = 1;
+ unsigned long start = madv_behavior->range.start;
+ unsigned long end = madv_behavior->range.end;
long pages;
while (start < end) {
@@ -994,16 +1008,17 @@ static long madvise_populate(struct mm_struct *mm, unsigned long start,
* Application wants to free up the pages and associated backing store.
* This is effectively punching a hole into the middle of a file.
*/
-static long madvise_remove(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end)
+static long madvise_remove(struct madvise_behavior *madv_behavior)
{
loff_t offset;
int error;
struct file *f;
- struct mm_struct *mm = vma->vm_mm;
+ struct mm_struct *mm = madv_behavior->mm;
+ struct vm_area_struct *vma = madv_behavior->vma;
+ unsigned long start = madv_behavior->range.start;
+ unsigned long end = madv_behavior->range.end;
- *prev = NULL; /* tell sys_madvise we drop mmap_lock */
+ mark_mmap_lock_dropped(madv_behavior);
if (vma->vm_flags & VM_LOCKED)
return -EINVAL;
@@ -1056,8 +1071,8 @@ static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked)
static bool is_guard_pte_marker(pte_t ptent)
{
- return is_pte_marker(ptent) &&
- is_guard_swp_entry(pte_to_swp_entry(ptent));
+ return is_swap_pte(ptent) &&
+ is_guard_swp_entry(pte_to_swp_entry(ptent));
}
static int guard_install_pud_entry(pud_t *pud, unsigned long addr,
@@ -1066,7 +1081,7 @@ static int guard_install_pud_entry(pud_t *pud, unsigned long addr,
pud_t pudval = pudp_get(pud);
/* If huge return >0 so we abort the operation + zap. */
- return pud_trans_huge(pudval) || pud_devmap(pudval);
+ return pud_trans_huge(pudval);
}
static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -1075,7 +1090,7 @@ static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr,
pmd_t pmdval = pmdp_get(pmd);
/* If huge return >0 so we abort the operation + zap. */
- return pmd_trans_huge(pmdval) || pmd_devmap(pmdval);
+ return pmd_trans_huge(pmdval);
}
static int guard_install_pte_entry(pte_t *pte, unsigned long addr,
@@ -1115,14 +1130,13 @@ static const struct mm_walk_ops guard_install_walk_ops = {
.walk_lock = PGWALK_RDLOCK,
};
-static long madvise_guard_install(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end)
+static long madvise_guard_install(struct madvise_behavior *madv_behavior)
{
+ struct vm_area_struct *vma = madv_behavior->vma;
+ struct madvise_behavior_range *range = &madv_behavior->range;
long err;
int i;
- *prev = vma;
if (!is_valid_guard_vma(vma, /* allow_locked = */false))
return -EINVAL;
@@ -1153,13 +1167,14 @@ static long madvise_guard_install(struct vm_area_struct *vma,
unsigned long nr_pages = 0;
/* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
- err = walk_page_range_mm(vma->vm_mm, start, end,
+ err = walk_page_range_mm(vma->vm_mm, range->start, range->end,
&guard_install_walk_ops, &nr_pages);
if (err < 0)
return err;
if (err == 0) {
- unsigned long nr_expected_pages = PHYS_PFN(end - start);
+ unsigned long nr_expected_pages =
+ PHYS_PFN(range->end - range->start);
VM_WARN_ON(nr_pages != nr_expected_pages);
return 0;
@@ -1169,7 +1184,8 @@ static long madvise_guard_install(struct vm_area_struct *vma,
* OK some of the range have non-guard pages mapped, zap
* them. This leaves existing guard pages in place.
*/
- zap_page_range_single(vma, start, end - start, NULL);
+ zap_page_range_single(vma, range->start,
+ range->end - range->start, NULL);
}
/*
@@ -1186,7 +1202,7 @@ static int guard_remove_pud_entry(pud_t *pud, unsigned long addr,
pud_t pudval = pudp_get(pud);
/* If huge, cannot have guard pages present, so no-op - skip. */
- if (pud_trans_huge(pudval) || pud_devmap(pudval))
+ if (pud_trans_huge(pudval))
walk->action = ACTION_CONTINUE;
return 0;
@@ -1198,7 +1214,7 @@ static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr,
pmd_t pmdval = pmdp_get(pmd);
/* If huge, cannot have guard pages present, so no-op - skip. */
- if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval))
+ if (pmd_trans_huge(pmdval))
walk->action = ACTION_CONTINUE;
return 0;
@@ -1225,11 +1241,11 @@ static const struct mm_walk_ops guard_remove_walk_ops = {
.walk_lock = PGWALK_RDLOCK,
};
-static long madvise_guard_remove(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end)
+static long madvise_guard_remove(struct madvise_behavior *madv_behavior)
{
- *prev = vma;
+ struct vm_area_struct *vma = madv_behavior->vma;
+ struct madvise_behavior_range *range = &madv_behavior->range;
+
/*
* We're ok with removing guards in mlock()'d ranges, as this is a
* non-destructive action.
@@ -1237,40 +1253,117 @@ static long madvise_guard_remove(struct vm_area_struct *vma,
if (!is_valid_guard_vma(vma, /* allow_locked = */true))
return -EINVAL;
- return walk_page_range(vma->vm_mm, start, end,
+ return walk_page_range_vma(vma, range->start, range->end,
&guard_remove_walk_ops, NULL);
}
+#ifdef CONFIG_64BIT
+/* Does the madvise operation result in discarding of mapped data? */
+static bool is_discard(int behavior)
+{
+ switch (behavior) {
+ case MADV_FREE:
+ case MADV_DONTNEED:
+ case MADV_DONTNEED_LOCKED:
+ case MADV_REMOVE:
+ case MADV_DONTFORK:
+ case MADV_WIPEONFORK:
+ case MADV_GUARD_INSTALL:
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * We are restricted from madvise()'ing mseal()'d VMAs only in very particular
+ * circumstances - discarding of data from read-only anonymous SEALED mappings.
+ *
+ * This is because users cannot trivially discard data from these VMAs, and may
+ * only do so via an appropriate madvise() call.
+ */
+static bool can_madvise_modify(struct madvise_behavior *madv_behavior)
+{
+ struct vm_area_struct *vma = madv_behavior->vma;
+
+ /* If the VMA isn't sealed we're good. */
+ if (!vma_is_sealed(vma))
+ return true;
+
+ /* For a sealed VMA, we only care about discard operations. */
+ if (!is_discard(madv_behavior->behavior))
+ return true;
+
+ /*
+ * We explicitly permit all file-backed mappings, whether MAP_SHARED or
+ * MAP_PRIVATE.
+ *
+ * The latter causes some complications: one can mmap() a MAP_PRIVATE
+ * mapping read/write, write to it, then mprotect() it read-only and
+ * mseal() it, and a discard will still be permitted.
+ *
+ * However, in order to avoid issues with potential use of madvise(...,
+ * MADV_DONTNEED) of mseal()'d .text mappings we, for the time being,
+ * permit this.
+ */
+ if (!vma_is_anonymous(vma))
+ return true;
+
+ /* If the user could write to the mapping anyway, then this is fine. */
+ if ((vma->vm_flags & VM_WRITE) &&
+ arch_vma_access_permitted(vma, /* write= */ true,
+ /* execute= */ false, /* foreign= */ false))
+ return true;
+
+ /* Otherwise, we are not permitted to perform this operation. */
+ return false;
+}
+#else
+static bool can_madvise_modify(struct madvise_behavior *madv_behavior)
+{
+ return true;
+}
+#endif
+
/*
* Apply an madvise behavior to a region of a vma. madvise_update_vma
* will handle splitting a vm area into separate areas, each area with its own
* behavior.
*/
-static int madvise_vma_behavior(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end,
- unsigned long behavior)
+static int madvise_vma_behavior(struct madvise_behavior *madv_behavior)
{
+ int behavior = madv_behavior->behavior;
+ struct vm_area_struct *vma = madv_behavior->vma;
+ vm_flags_t new_flags = vma->vm_flags;
+ struct madvise_behavior_range *range = &madv_behavior->range;
int error;
- struct anon_vma_name *anon_name;
- unsigned long new_flags = vma->vm_flags;
- if (unlikely(!can_modify_vma_madv(vma, behavior)))
+ if (unlikely(!can_madvise_modify(madv_behavior)))
return -EPERM;
switch (behavior) {
case MADV_REMOVE:
- return madvise_remove(vma, prev, start, end);
+ return madvise_remove(madv_behavior);
case MADV_WILLNEED:
- return madvise_willneed(vma, prev, start, end);
+ return madvise_willneed(madv_behavior);
case MADV_COLD:
- return madvise_cold(vma, prev, start, end);
+ return madvise_cold(madv_behavior);
case MADV_PAGEOUT:
- return madvise_pageout(vma, prev, start, end);
+ return madvise_pageout(madv_behavior);
case MADV_FREE:
case MADV_DONTNEED:
case MADV_DONTNEED_LOCKED:
- return madvise_dontneed_free(vma, prev, start, end, behavior);
+ return madvise_dontneed_free(madv_behavior);
+ case MADV_COLLAPSE:
+ return madvise_collapse(vma, range->start, range->end,
+ &madv_behavior->lock_dropped);
+ case MADV_GUARD_INSTALL:
+ return madvise_guard_install(madv_behavior);
+ case MADV_GUARD_REMOVE:
+ return madvise_guard_remove(madv_behavior);
+
+ /* The below behaviours update VMAs via madvise_update_vma(). */
+
case MADV_NORMAL:
new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
break;
@@ -1284,18 +1377,18 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
new_flags |= VM_DONTCOPY;
break;
case MADV_DOFORK:
- if (vma->vm_flags & VM_IO)
+ if (new_flags & VM_IO)
return -EINVAL;
new_flags &= ~VM_DONTCOPY;
break;
case MADV_WIPEONFORK:
/* MADV_WIPEONFORK is only supported on anonymous memory. */
- if (vma->vm_file || vma->vm_flags & VM_SHARED)
+ if (vma->vm_file || new_flags & VM_SHARED)
return -EINVAL;
new_flags |= VM_WIPEONFORK;
break;
case MADV_KEEPONFORK:
- if (vma->vm_flags & VM_DROPPABLE)
+ if (new_flags & VM_DROPPABLE)
return -EINVAL;
new_flags &= ~VM_WIPEONFORK;
break;
@@ -1303,14 +1396,15 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
new_flags |= VM_DONTDUMP;
break;
case MADV_DODUMP:
- if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) ||
- (vma->vm_flags & VM_DROPPABLE))
+ if ((!is_vm_hugetlb_page(vma) && (new_flags & VM_SPECIAL)) ||
+ (new_flags & VM_DROPPABLE))
return -EINVAL;
new_flags &= ~VM_DONTDUMP;
break;
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
- error = ksm_madvise(vma, start, end, behavior, &new_flags);
+ error = ksm_madvise(vma, range->start, range->end,
+ behavior, &new_flags);
if (error)
goto out;
break;
@@ -1320,20 +1414,17 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
if (error)
goto out;
break;
- case MADV_COLLAPSE:
- return madvise_collapse(vma, prev, start, end);
- case MADV_GUARD_INSTALL:
- return madvise_guard_install(vma, prev, start, end);
- case MADV_GUARD_REMOVE:
- return madvise_guard_remove(vma, prev, start, end);
+ case __MADV_SET_ANON_VMA_NAME:
+ /* Only anonymous mappings can be named */
+ if (vma->vm_file && !vma_is_anon_shmem(vma))
+ return -EBADF;
+ break;
}
- anon_name = anon_vma_name(vma);
- anon_vma_name_get(anon_name);
- error = madvise_update_vma(vma, prev, start, end, new_flags,
- anon_name);
- anon_vma_name_put(anon_name);
+ /* This is a write operation. */
+ VM_WARN_ON_ONCE(madv_behavior->lock_mode != MADVISE_MMAP_WRITE_LOCK);
+ error = madvise_update_vma(new_flags, madv_behavior);
out:
/*
* madvise() returns EAGAIN if kernel resources, such as
@@ -1348,15 +1439,15 @@ out:
/*
* Error injection support for memory error handling.
*/
-static int madvise_inject_error(int behavior,
- unsigned long start, unsigned long end)
+static int madvise_inject_error(struct madvise_behavior *madv_behavior)
{
unsigned long size;
+ unsigned long start = madv_behavior->range.start;
+ unsigned long end = madv_behavior->range.end;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
-
for (; start < end; start += size) {
unsigned long pfn;
struct page *page;
@@ -1374,7 +1465,7 @@ static int madvise_inject_error(int behavior,
*/
size = page_size(compound_head(page));
- if (behavior == MADV_SOFT_OFFLINE) {
+ if (madv_behavior->behavior == MADV_SOFT_OFFLINE) {
pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
pfn, start);
ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
@@ -1393,9 +1484,9 @@ static int madvise_inject_error(int behavior,
return 0;
}
-static bool is_memory_failure(int behavior)
+static bool is_memory_failure(struct madvise_behavior *madv_behavior)
{
- switch (behavior) {
+ switch (madv_behavior->behavior) {
case MADV_HWPOISON:
case MADV_SOFT_OFFLINE:
return true;
@@ -1406,13 +1497,12 @@ static bool is_memory_failure(int behavior)
#else
-static int madvise_inject_error(int behavior,
- unsigned long start, unsigned long end)
+static int madvise_inject_error(struct madvise_behavior *madv_behavior)
{
return 0;
}
-static bool is_memory_failure(int behavior)
+static bool is_memory_failure(struct madvise_behavior *madv_behavior)
{
return false;
}
@@ -1478,145 +1568,226 @@ static bool process_madvise_remote_valid(int behavior)
}
/*
- * Walk the vmas in range [start,end), and call the visit function on each one.
- * The visit function will get start and end parameters that cover the overlap
- * between the current vma and the original range. Any unmapped regions in the
- * original range will result in this function returning -ENOMEM while still
- * calling the visit function on all of the existing vmas in the range.
- * Must be called with the mmap_lock held for reading or writing.
+ * Try to acquire a VMA read lock if possible.
+ *
+ * We only support this lock over a single VMA, which the input range must
+ * span either partially or fully.
+ *
+ * This function always returns with an appropriate lock held. If a VMA read
+ * lock could be acquired, we return true and set madv_behavior state
+ * accordingly.
+ *
+ * If a VMA read lock could not be acquired, we return false and expect the
+ * caller to fall back to mmap lock behaviour.
*/
-static
-int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
- unsigned long end, unsigned long arg,
- int (*visit)(struct vm_area_struct *vma,
- struct vm_area_struct **prev, unsigned long start,
- unsigned long end, unsigned long arg))
+static bool try_vma_read_lock(struct madvise_behavior *madv_behavior)
{
+ struct mm_struct *mm = madv_behavior->mm;
struct vm_area_struct *vma;
- struct vm_area_struct *prev;
- unsigned long tmp;
+
+ vma = lock_vma_under_rcu(mm, madv_behavior->range.start);
+ if (!vma)
+ goto take_mmap_read_lock;
+ /*
+ * Must span only a single VMA; uffd and remote processes are
+ * unsupported.
+ */
+ if (madv_behavior->range.end > vma->vm_end || current->mm != mm ||
+ userfaultfd_armed(vma)) {
+ vma_end_read(vma);
+ goto take_mmap_read_lock;
+ }
+ madv_behavior->vma = vma;
+ return true;
+
+take_mmap_read_lock:
+ mmap_read_lock(mm);
+ madv_behavior->lock_mode = MADVISE_MMAP_READ_LOCK;
+ return false;
+}
+
+/*
+ * Walk the vmas in range [start,end), and call the madvise_vma_behavior
+ * function on each one. The function will get start and end parameters that
+ * cover the overlap between the current vma and the original range. Any
+ * unmapped regions in the original range will result in this function returning
+ * -ENOMEM while still calling the madvise_vma_behavior function on all of the
+ * existing vmas in the range. Must be called with the mmap_lock held for
+ * reading or writing.
+ */
+static
+int madvise_walk_vmas(struct madvise_behavior *madv_behavior)
+{
+ struct mm_struct *mm = madv_behavior->mm;
+ struct madvise_behavior_range *range = &madv_behavior->range;
+ /* range is updated to span each VMA, so store end of entire range. */
+ unsigned long last_end = range->end;
int unmapped_error = 0;
+ int error;
+ struct vm_area_struct *prev, *vma;
/*
- * If the interval [start,end) covers some unmapped address
- * ranges, just ignore them, but return -ENOMEM at the end.
- * - different from the way of handling in mlock etc.
+ * If VMA read lock is supported, apply madvise to a single VMA
+ * tentatively, avoiding walking VMAs.
*/
- vma = find_vma_prev(mm, start, &prev);
- if (vma && start > vma->vm_start)
+ if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK &&
+ try_vma_read_lock(madv_behavior)) {
+ error = madvise_vma_behavior(madv_behavior);
+ vma_end_read(madv_behavior->vma);
+ return error;
+ }
+
+ vma = find_vma_prev(mm, range->start, &prev);
+ if (vma && range->start > vma->vm_start)
prev = vma;
for (;;) {
- int error;
-
/* Still start < end. */
if (!vma)
return -ENOMEM;
- /* Here start < (end|vma->vm_end). */
- if (start < vma->vm_start) {
+ /* Here start < (last_end|vma->vm_end). */
+ if (range->start < vma->vm_start) {
+ /*
+ * This indicates a gap between VMAs in the input
+ * range. This does not cause the operation to abort,
+ * rather we simply return -ENOMEM to indicate that this
+ * has happened, but carry on.
+ */
unmapped_error = -ENOMEM;
- start = vma->vm_start;
- if (start >= end)
+ range->start = vma->vm_start;
+ if (range->start >= last_end)
break;
}
- /* Here vma->vm_start <= start < (end|vma->vm_end) */
- tmp = vma->vm_end;
- if (end < tmp)
- tmp = end;
+ /* Here vma->vm_start <= range->start < (last_end|vma->vm_end) */
+ range->end = min(vma->vm_end, last_end);
- /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
- error = visit(vma, &prev, start, tmp, arg);
+ /* Here vma->vm_start <= range->start < range->end <= (last_end|vma->vm_end). */
+ madv_behavior->prev = prev;
+ madv_behavior->vma = vma;
+ error = madvise_vma_behavior(madv_behavior);
if (error)
return error;
- start = tmp;
- if (prev && start < prev->vm_end)
- start = prev->vm_end;
- if (start >= end)
+ if (madv_behavior->lock_dropped) {
+ /* We dropped the mmap lock, we can't ref the VMA. */
+ prev = NULL;
+ vma = NULL;
+ madv_behavior->lock_dropped = false;
+ } else {
+ vma = madv_behavior->vma;
+ prev = vma;
+ }
+
+ if (vma && range->end < vma->vm_end)
+ range->end = vma->vm_end;
+ if (range->end >= last_end)
break;
- if (prev)
- vma = find_vma(mm, prev->vm_end);
- else /* madvise_remove dropped mmap_lock */
- vma = find_vma(mm, start);
+
+ vma = find_vma(mm, vma ? vma->vm_end : range->end);
+ range->start = range->end;
}
return unmapped_error;
}
-#ifdef CONFIG_ANON_VMA_NAME
-static int madvise_vma_anon_name(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end,
- unsigned long anon_name)
+/*
+ * Any behaviour which results in changes to the vma->vm_flags needs to
+ * take mmap_lock for writing. Others, which simply traverse vmas, need
+ * to only take it for reading.
+ */
+static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavior)
{
- int error;
-
- /* Only anonymous mappings can be named */
- if (vma->vm_file && !vma_is_anon_shmem(vma))
- return -EBADF;
-
- error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
- (struct anon_vma_name *)anon_name);
+ if (is_memory_failure(madv_behavior))
+ return MADVISE_NO_LOCK;
- /*
- * madvise() returns EAGAIN if kernel resources, such as
- * slab, are temporarily unavailable.
- */
- if (error == -ENOMEM)
- error = -EAGAIN;
- return error;
+ switch (madv_behavior->behavior) {
+ case MADV_REMOVE:
+ case MADV_WILLNEED:
+ case MADV_COLD:
+ case MADV_PAGEOUT:
+ case MADV_POPULATE_READ:
+ case MADV_POPULATE_WRITE:
+ case MADV_COLLAPSE:
+ case MADV_GUARD_INSTALL:
+ case MADV_GUARD_REMOVE:
+ return MADVISE_MMAP_READ_LOCK;
+ case MADV_DONTNEED:
+ case MADV_DONTNEED_LOCKED:
+ case MADV_FREE:
+ return MADVISE_VMA_READ_LOCK;
+ default:
+ return MADVISE_MMAP_WRITE_LOCK;
+ }
}
-int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
- unsigned long len_in, struct anon_vma_name *anon_name)
+static int madvise_lock(struct madvise_behavior *madv_behavior)
{
- unsigned long end;
- unsigned long len;
+ struct mm_struct *mm = madv_behavior->mm;
+ enum madvise_lock_mode lock_mode = get_lock_mode(madv_behavior);
- if (start & ~PAGE_MASK)
- return -EINVAL;
- len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+ switch (lock_mode) {
+ case MADVISE_NO_LOCK:
+ break;
+ case MADVISE_MMAP_WRITE_LOCK:
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+ break;
+ case MADVISE_MMAP_READ_LOCK:
+ mmap_read_lock(mm);
+ break;
+ case MADVISE_VMA_READ_LOCK:
+ /* We will acquire the lock per-VMA in madvise_walk_vmas(). */
+ break;
+ }
- /* Check to see whether len was rounded up from small -ve to zero */
- if (len_in && !len)
- return -EINVAL;
+ madv_behavior->lock_mode = lock_mode;
+ return 0;
+}
- end = start + len;
- if (end < start)
- return -EINVAL;
+static void madvise_unlock(struct madvise_behavior *madv_behavior)
+{
+ struct mm_struct *mm = madv_behavior->mm;
- if (end == start)
- return 0;
+ switch (madv_behavior->lock_mode) {
+ case MADVISE_NO_LOCK:
+ return;
+ case MADVISE_MMAP_WRITE_LOCK:
+ mmap_write_unlock(mm);
+ break;
+ case MADVISE_MMAP_READ_LOCK:
+ mmap_read_unlock(mm);
+ break;
+ case MADVISE_VMA_READ_LOCK:
+ /* We will drop the lock per-VMA in madvise_walk_vmas(). */
+ break;
+ }
- return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
- madvise_vma_anon_name);
+ madv_behavior->lock_mode = MADVISE_NO_LOCK;
}
-#endif /* CONFIG_ANON_VMA_NAME */
-static int madvise_lock(struct mm_struct *mm, int behavior)
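+/* These behaviours tear down PTEs, so batch their TLB flushes via one mmu_gather. */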
+static bool madvise_batch_tlb_flush(int behavior)
{
- if (is_memory_failure(behavior))
- return 0;
-
- if (madvise_need_mmap_write(behavior)) {
- if (mmap_write_lock_killable(mm))
- return -EINTR;
- } else {
- mmap_read_lock(mm);
+ switch (behavior) {
+ case MADV_DONTNEED:
+ case MADV_DONTNEED_LOCKED:
+ case MADV_FREE:
+ return true;
+ default:
+ return false;
}
- return 0;
}
-static void madvise_unlock(struct mm_struct *mm, int behavior)
+static void madvise_init_tlb(struct madvise_behavior *madv_behavior)
{
- if (is_memory_failure(behavior))
- return;
+ if (madvise_batch_tlb_flush(madv_behavior->behavior))
+ tlb_gather_mmu(madv_behavior->tlb, madv_behavior->mm);
+}
- if (madvise_need_mmap_write(behavior))
- mmap_write_unlock(mm);
- else
- mmap_read_unlock(mm);
+static void madvise_finish_tlb(struct madvise_behavior *madv_behavior)
+{
+ if (madvise_batch_tlb_flush(madv_behavior->behavior))
+ tlb_finish_mmu(madv_behavior->tlb);
}
static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior)
@@ -1665,9 +1836,9 @@ static bool madvise_should_skip(unsigned long start, size_t len_in,
return false;
}
-static bool is_madvise_populate(int behavior)
+static bool is_madvise_populate(struct madvise_behavior *madv_behavior)
{
- switch (behavior) {
+ switch (madv_behavior->behavior) {
case MADV_POPULATE_READ:
case MADV_POPULATE_WRITE:
return true;
@@ -1676,24 +1847,42 @@ static bool is_madvise_populate(int behavior)
}
}
-static int madvise_do_behavior(struct mm_struct *mm,
- unsigned long start, size_t len_in, int behavior)
+/*
+ * untagged_addr_remote() assumes mmap_lock is already held. On
+ * architectures like x86 and RISC-V, tagging is tricky because each
+ * mm may have a different tagging mask. However, we might only hold
+ * the per-VMA lock (currently only local processes are supported),
+ * so untagged_addr is used to avoid the mmap_lock assertion for
+ * local processes.
+ */
+static inline unsigned long get_untagged_addr(struct mm_struct *mm,
+ unsigned long start)
+{
+ return current->mm == mm ? untagged_addr(start) :
+ untagged_addr_remote(mm, start);
+}
+
+static int madvise_do_behavior(unsigned long start, size_t len_in,
+ struct madvise_behavior *madv_behavior)
{
struct blk_plug plug;
- unsigned long end;
int error;
+ struct madvise_behavior_range *range = &madv_behavior->range;
+
+ if (is_memory_failure(madv_behavior)) {
+ range->start = start;
+ range->end = start + len_in;
+ return madvise_inject_error(madv_behavior);
+ }
- if (is_memory_failure(behavior))
- return madvise_inject_error(behavior, start, start + len_in);
- start = untagged_addr_remote(mm, start);
- end = start + PAGE_ALIGN(len_in);
+ range->start = get_untagged_addr(madv_behavior->mm, start);
+ range->end = range->start + PAGE_ALIGN(len_in);
blk_start_plug(&plug);
- if (is_madvise_populate(behavior))
- error = madvise_populate(mm, start, end, behavior);
+ if (is_madvise_populate(madv_behavior))
+ error = madvise_populate(madv_behavior);
else
- error = madvise_walk_vmas(mm, start, end, behavior,
- madvise_vma_behavior);
+ error = madvise_walk_vmas(madv_behavior);
blk_finish_plug(&plug);
return error;
}
@@ -1773,14 +1962,22 @@ static int madvise_do_behavior(struct mm_struct *mm,
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
int error;
+ struct mmu_gather tlb;
+ struct madvise_behavior madv_behavior = {
+ .mm = mm,
+ .behavior = behavior,
+ .tlb = &tlb,
+ };
if (madvise_should_skip(start, len_in, behavior, &error))
return error;
- error = madvise_lock(mm, behavior);
+ error = madvise_lock(&madv_behavior);
if (error)
return error;
- error = madvise_do_behavior(mm, start, len_in, behavior);
- madvise_unlock(mm, behavior);
+ madvise_init_tlb(&madv_behavior);
+ error = madvise_do_behavior(start, len_in, &madv_behavior);
+ madvise_finish_tlb(&madv_behavior);
+ madvise_unlock(&madv_behavior);
return error;
}
@@ -1796,12 +1993,19 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
{
ssize_t ret = 0;
size_t total_len;
+ struct mmu_gather tlb;
+ struct madvise_behavior madv_behavior = {
+ .mm = mm,
+ .behavior = behavior,
+ .tlb = &tlb,
+ };
total_len = iov_iter_count(iter);
- ret = madvise_lock(mm, behavior);
+ ret = madvise_lock(&madv_behavior);
if (ret)
return ret;
+ madvise_init_tlb(&madv_behavior);
while (iov_iter_count(iter)) {
unsigned long start = (unsigned long)iter_iov_addr(iter);
@@ -1811,7 +2015,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
if (madvise_should_skip(start, len_in, behavior, &error))
ret = error;
else
- ret = madvise_do_behavior(mm, start, len_in, behavior);
+ ret = madvise_do_behavior(start, len_in, &madv_behavior);
/*
* An madvise operation is attempting to restart the syscall,
* but we cannot proceed as it would not be correct to repeat
@@ -1829,16 +2033,22 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
}
/* Drop and reacquire lock to unwind race. */
- madvise_unlock(mm, behavior);
- madvise_lock(mm, behavior);
+ madvise_finish_tlb(&madv_behavior);
+ madvise_unlock(&madv_behavior);
+ ret = madvise_lock(&madv_behavior);
+ if (ret)
+ goto out;
+ madvise_init_tlb(&madv_behavior);
continue;
}
if (ret < 0)
break;
iov_iter_advance(iter, iter_iov_len(iter));
}
- madvise_unlock(mm, behavior);
+ madvise_finish_tlb(&madv_behavior);
+ madvise_unlock(&madv_behavior);
+out:
ret = (total_len - iov_iter_count(iter)) ? : ret;
return ret;
@@ -1907,3 +2117,88 @@ free_iov:
out:
return ret;
}
+
+#ifdef CONFIG_ANON_VMA_NAME
+
+#define ANON_VMA_NAME_MAX_LEN 80
+#define ANON_VMA_NAME_INVALID_CHARS "\\`$[]"
+
+static inline bool is_valid_name_char(char ch)
+{
+ /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */
+ return ch > 0x1f && ch < 0x7f &&
+ !strchr(ANON_VMA_NAME_INVALID_CHARS, ch);
+}
+
+static int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
+ unsigned long len_in, struct anon_vma_name *anon_name)
+{
+ unsigned long end;
+ unsigned long len;
+ int error;
+ struct madvise_behavior madv_behavior = {
+ .mm = mm,
+ .behavior = __MADV_SET_ANON_VMA_NAME,
+ .anon_name = anon_name,
+ };
+
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+ len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+
+ /* Check to see whether len was rounded up from small -ve to zero */
+ if (len_in && !len)
+ return -EINVAL;
+
+ end = start + len;
+ if (end < start)
+ return -EINVAL;
+
+ if (end == start)
+ return 0;
+
+ madv_behavior.range.start = start;
+ madv_behavior.range.end = end;
+
+ error = madvise_lock(&madv_behavior);
+ if (error)
+ return error;
+ error = madvise_walk_vmas(&madv_behavior);
+ madvise_unlock(&madv_behavior);
+
+ return error;
+}
+
+int set_anon_vma_name(unsigned long addr, unsigned long size,
+ const char __user *uname)
+{
+ struct anon_vma_name *anon_name = NULL;
+ struct mm_struct *mm = current->mm;
+ int error;
+
+ if (uname) {
+ char *name, *pch;
+
+ name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ for (pch = name; *pch != '\0'; pch++) {
+ if (!is_valid_name_char(*pch)) {
+ kfree(name);
+ return -EINVAL;
+ }
+ }
+ /* anon_vma has its own copy */
+ anon_name = anon_vma_name_alloc(name);
+ kfree(name);
+ if (!anon_name)
+ return -ENOMEM;
+ }
+
+ error = madvise_set_anon_name(mm, addr, size, anon_name);
+ anon_vma_name_put(anon_name);
+
+ return error;
+}
+#endif
diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c
index 2f8829b3541a..c193de6cb23a 100644
--- a/mm/mapping_dirty_helpers.c
+++ b/mm/mapping_dirty_helpers.c
@@ -129,7 +129,7 @@ static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end,
pmd_t pmdval = pmdp_get_lockless(pmd);
/* Do not split a huge pmd, present or migrated */
- if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval)) {
+ if (pmd_trans_huge(pmdval)) {
WARN_ON(pmd_write(pmdval) || pmd_dirty(pmdval));
walk->action = ACTION_CONTINUE;
}
@@ -152,7 +152,7 @@ static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end,
pud_t pudval = READ_ONCE(*pud);
/* Do not split a huge pud */
- if (pud_trans_huge(pudval) || pud_devmap(pudval)) {
+ if (pud_trans_huge(pudval)) {
WARN_ON(pud_write(pudval) || pud_dirty(pudval));
walk->action = ACTION_CONTINUE;
}
@@ -218,7 +218,7 @@ static void wp_clean_post_vma(struct mm_walk *walk)
static int wp_clean_test_walk(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
- unsigned long vm_flags = READ_ONCE(walk->vma->vm_flags);
+ vm_flags_t vm_flags = READ_ONCE(walk->vma->vm_flags);
/* Skip non-applicable VMAs */
if ((vm_flags & (VM_SHARED | VM_MAYWRITE | VM_HUGETLB)) !=
diff --git a/mm/memblock.c b/mm/memblock.c
index 0e9ebb8aa7fe..e23e16618e9b 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -18,6 +18,11 @@
#include <linux/memblock.h>
#include <linux/mutex.h>
+#ifdef CONFIG_KEXEC_HANDOVER
+#include <linux/libfdt.h>
+#include <linux/kexec_handover.h>
+#endif /* CONFIG_KEXEC_HANDOVER */
+
#include <asm/sections.h>
#include <linux/io.h>
@@ -107,6 +112,13 @@ unsigned long min_low_pfn;
unsigned long max_pfn;
unsigned long long max_possible_pfn;
+#ifdef CONFIG_MEMBLOCK_KHO_SCRATCH
+/* When set to true, only allocate from MEMBLOCK_KHO_SCRATCH ranges */
+static bool kho_scratch_only;
+#else
+#define kho_scratch_only false
+#endif
+
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_MEMORY_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock;
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
@@ -166,6 +178,10 @@ bool __init_memblock memblock_has_mirror(void)
static enum memblock_flags __init_memblock choose_memblock_flags(void)
{
+ /* skip non-scratch memory for kho early boot allocations */
+ if (kho_scratch_only)
+ return MEMBLOCK_KHO_SCRATCH;
+
return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE;
}
@@ -499,7 +515,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
* needn't do it
*/
if (!use_slab)
- BUG_ON(memblock_reserve(addr, new_alloc_size));
+ BUG_ON(memblock_reserve_kern(addr, new_alloc_size));
/* Update slab flag */
*in_slab = use_slab;
@@ -649,7 +665,7 @@ repeat:
#ifdef CONFIG_NUMA
WARN_ON(nid != memblock_get_region_node(rgn));
#endif
- WARN_ON(flags != rgn->flags);
+ WARN_ON(flags != MEMBLOCK_NONE && flags != rgn->flags);
nr_new++;
if (insert) {
if (start_rgn == -1)
@@ -764,9 +780,9 @@ bool __init_memblock memblock_validate_numa_coverage(unsigned long threshold_byt
}
if ((nr_pages << PAGE_SHIFT) > threshold_bytes) {
- mem_size_mb = memblock_phys_mem_size() >> 20;
+ mem_size_mb = memblock_phys_mem_size() / SZ_1M;
pr_err("NUMA: no nodes coverage for %luMB of %luMB RAM\n",
- (nr_pages << PAGE_SHIFT) >> 20, mem_size_mb);
+ (nr_pages << PAGE_SHIFT) / SZ_1M, mem_size_mb);
return false;
}
@@ -909,14 +925,15 @@ int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size)
return memblock_remove_range(&memblock.reserved, base, size);
}
-int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
+int __init_memblock __memblock_reserve(phys_addr_t base, phys_addr_t size,
+ int nid, enum memblock_flags flags)
{
phys_addr_t end = base + size - 1;
- memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
- &base, &end, (void *)_RET_IP_);
+ memblock_dbg("%s: [%pa-%pa] nid=%d flags=%x %pS\n", __func__,
+ &base, &end, nid, flags, (void *)_RET_IP_);
- return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0);
+ return memblock_add_range(&memblock.reserved, base, size, nid, flags);
}
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
@@ -931,6 +948,40 @@ int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size)
}
#endif
+#ifdef CONFIG_MEMBLOCK_KHO_SCRATCH
+__init void memblock_set_kho_scratch_only(void)
+{
+ kho_scratch_only = true;
+}
+
+__init void memblock_clear_kho_scratch_only(void)
+{
+ kho_scratch_only = false;
+}
+
+__init void memmap_init_kho_scratch_pages(void)
+{
+ phys_addr_t start, end;
+ unsigned long pfn;
+ int nid;
+ u64 i;
+
+ if (!IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT))
+ return;
+
+ /*
+ * Initialize struct pages for free scratch memory.
+ * The struct pages for reserved scratch memory will be set up in
+ * reserve_bootmem_region()
+ */
+ __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
+ MEMBLOCK_KHO_SCRATCH, &start, &end, &nid) {
+ for (pfn = PFN_UP(start); pfn < PFN_DOWN(end); pfn++)
+ init_deferred_page(pfn, nid);
+ }
+}
+#endif
+
/**
* memblock_setclr_flag - set or clear flag for a memory region
* @type: memblock type to set/clear flag for
@@ -1040,13 +1091,20 @@ int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size)
/**
* memblock_reserved_mark_noinit - Mark a reserved memory region with flag
- * MEMBLOCK_RSRV_NOINIT which results in the struct pages not being initialized
- * for this region.
+ * MEMBLOCK_RSRV_NOINIT
+ *
* @base: the base phys addr of the region
* @size: the size of the region
*
- * struct pages will not be initialized for reserved memory regions marked with
- * %MEMBLOCK_RSRV_NOINIT.
+ * The struct pages for reserved regions marked %MEMBLOCK_RSRV_NOINIT will not
+ * be fully initialized, allowing the caller to optimize their initialization.
+ *
+ * When %CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled, setting this flag
+ * completely bypasses the initialization of struct pages for such region.
+ *
+ * When %CONFIG_DEFERRED_STRUCT_PAGE_INIT is disabled, struct pages in this
+ * region will be initialized with default values but won't be marked as
+ * reserved.
*
* Return: 0 on success, -errno on failure.
*/
@@ -1056,6 +1114,36 @@ int __init_memblock memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t
MEMBLOCK_RSRV_NOINIT);
}
+/**
+ * memblock_mark_kho_scratch - Mark a memory region as MEMBLOCK_KHO_SCRATCH.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Only memory regions marked with %MEMBLOCK_KHO_SCRATCH will be considered
+ * for allocations during early boot with kexec handover.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+__init int memblock_mark_kho_scratch(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_setclr_flag(&memblock.memory, base, size, 1,
+ MEMBLOCK_KHO_SCRATCH);
+}
+
+/**
+ * memblock_clear_kho_scratch - Clear MEMBLOCK_KHO_SCRATCH flag for a
+ * specified region.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+__init int memblock_clear_kho_scratch(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_setclr_flag(&memblock.memory, base, size, 0,
+ MEMBLOCK_KHO_SCRATCH);
+}
+
static bool should_skip_region(struct memblock_type *type,
struct memblock_region *m,
int nid, int flags)
@@ -1087,6 +1175,13 @@ static bool should_skip_region(struct memblock_type *type,
if (!(flags & MEMBLOCK_DRIVER_MANAGED) && memblock_is_driver_managed(m))
return true;
+ /*
+ * In early alloc during kexec handover, we can only consider
+ * MEMBLOCK_KHO_SCRATCH regions for the allocations
+ */
+ if ((flags & MEMBLOCK_KHO_SCRATCH) && !memblock_is_kho_scratch(m))
+ return true;
+
return false;
}
@@ -1350,70 +1445,6 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
return 0;
}
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-/**
- * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone()
- *
- * @idx: pointer to u64 loop variable
- * @zone: zone in which all of the memory blocks reside
- * @out_spfn: ptr to ulong for start pfn of the range, can be %NULL
- * @out_epfn: ptr to ulong for end pfn of the range, can be %NULL
- *
- * This function is meant to be a zone/pfn specific wrapper for the
- * for_each_mem_range type iterators. Specifically they are used in the
- * deferred memory init routines and as such we were duplicating much of
- * this logic throughout the code. So instead of having it in multiple
- * locations it seemed like it would make more sense to centralize this to
- * one new iterator that does everything they need.
- */
-void __init_memblock
-__next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
- unsigned long *out_spfn, unsigned long *out_epfn)
-{
- int zone_nid = zone_to_nid(zone);
- phys_addr_t spa, epa;
-
- __next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
- &memblock.memory, &memblock.reserved,
- &spa, &epa, NULL);
-
- while (*idx != U64_MAX) {
- unsigned long epfn = PFN_DOWN(epa);
- unsigned long spfn = PFN_UP(spa);
-
- /*
- * Verify the end is at least past the start of the zone and
- * that we have at least one PFN to initialize.
- */
- if (zone->zone_start_pfn < epfn && spfn < epfn) {
- /* if we went too far just stop searching */
- if (zone_end_pfn(zone) <= spfn) {
- *idx = U64_MAX;
- break;
- }
-
- if (out_spfn)
- *out_spfn = max(zone->zone_start_pfn, spfn);
- if (out_epfn)
- *out_epfn = min(zone_end_pfn(zone), epfn);
-
- return;
- }
-
- __next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
- &memblock.memory, &memblock.reserved,
- &spa, &epa, NULL);
- }
-
- /* signal end of iteration */
- if (out_spfn)
- *out_spfn = ULONG_MAX;
- if (out_epfn)
- *out_epfn = 0;
-}
-
-#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
-
/**
* memblock_alloc_range_nid - allocate boot memory block
* @size: size of memory block to be allocated in bytes
@@ -1467,14 +1498,14 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
again:
found = memblock_find_in_range_node(size, align, start, end, nid,
flags);
- if (found && !memblock_reserve(found, size))
+ if (found && !__memblock_reserve(found, size, nid, MEMBLOCK_RSRV_KERN))
goto done;
if (numa_valid_node(nid) && !exact_nid) {
found = memblock_find_in_range_node(size, align, start,
end, NUMA_NO_NODE,
flags);
- if (found && !memblock_reserve(found, size))
+ if (found && !memblock_reserve_kern(found, size))
goto done;
}
@@ -1759,6 +1790,28 @@ phys_addr_t __init_memblock memblock_reserved_size(void)
return memblock.reserved.total_size;
}
+phys_addr_t __init_memblock memblock_reserved_kern_size(phys_addr_t limit, int nid)
+{
+ struct memblock_region *r;
+ phys_addr_t total = 0;
+
+ for_each_reserved_mem_region(r) {
+ phys_addr_t size = r->size;
+
+ if (r->base > limit)
+ break;
+
+ if (r->base + r->size > limit)
+ size = limit - r->base;
+
+ if (nid == memblock_get_region_node(r) || !numa_valid_node(nid))
+ if (r->flags & MEMBLOCK_RSRV_KERN)
+ total += size;
+ }
+
+ return total;
+}
+
/**
* memblock_estimated_nr_free_pages - return estimated number of free pages
* from memblock point of view
@@ -2289,6 +2342,7 @@ void __init memblock_free_all(void)
free_unused_memmap();
reset_all_zones_managed_pages();
+ memblock_clear_kho_scratch_only();
pages = free_low_memory_core_early();
totalram_pages_add(pages);
}
@@ -2386,6 +2440,191 @@ int reserve_mem_release_by_name(const char *name)
return 1;
}
+#ifdef CONFIG_KEXEC_HANDOVER
+#define MEMBLOCK_KHO_FDT "memblock"
+#define MEMBLOCK_KHO_NODE_COMPATIBLE "memblock-v1"
+#define RESERVE_MEM_KHO_NODE_COMPATIBLE "reserve-mem-v1"
+static struct page *kho_fdt;
+
+static int reserve_mem_kho_finalize(struct kho_serialization *ser)
+{
+ int err = 0, i;
+
+ for (i = 0; i < reserved_mem_count; i++) {
+ struct reserve_mem_table *map = &reserved_mem_table[i];
+ struct page *page = phys_to_page(map->start);
+ unsigned int nr_pages = map->size >> PAGE_SHIFT;
+
+ err |= kho_preserve_pages(page, nr_pages);
+ }
+
+ err |= kho_preserve_folio(page_folio(kho_fdt));
+ err |= kho_add_subtree(ser, MEMBLOCK_KHO_FDT, page_to_virt(kho_fdt));
+
+ return notifier_from_errno(err);
+}
+
+static int reserve_mem_kho_notifier(struct notifier_block *self,
+ unsigned long cmd, void *v)
+{
+ switch (cmd) {
+ case KEXEC_KHO_FINALIZE:
+ return reserve_mem_kho_finalize((struct kho_serialization *)v);
+ case KEXEC_KHO_ABORT:
+ return NOTIFY_DONE;
+ default:
+ return NOTIFY_BAD;
+ }
+}
+
+static struct notifier_block reserve_mem_kho_nb = {
+ .notifier_call = reserve_mem_kho_notifier,
+};
+
+static int __init prepare_kho_fdt(void)
+{
+ int err = 0, i;
+ void *fdt;
+
+ kho_fdt = alloc_page(GFP_KERNEL);
+ if (!kho_fdt)
+ return -ENOMEM;
+
+ fdt = page_to_virt(kho_fdt);
+
+ err |= fdt_create(fdt, PAGE_SIZE);
+ err |= fdt_finish_reservemap(fdt);
+
+ err |= fdt_begin_node(fdt, "");
+ err |= fdt_property_string(fdt, "compatible", MEMBLOCK_KHO_NODE_COMPATIBLE);
+ for (i = 0; i < reserved_mem_count; i++) {
+ struct reserve_mem_table *map = &reserved_mem_table[i];
+
+ err |= fdt_begin_node(fdt, map->name);
+ err |= fdt_property_string(fdt, "compatible", RESERVE_MEM_KHO_NODE_COMPATIBLE);
+ err |= fdt_property(fdt, "start", &map->start, sizeof(map->start));
+ err |= fdt_property(fdt, "size", &map->size, sizeof(map->size));
+ err |= fdt_end_node(fdt);
+ }
+ err |= fdt_end_node(fdt);
+
+ err |= fdt_finish(fdt);
+
+ if (err) {
+ pr_err("failed to prepare memblock FDT for KHO: %d\n", err);
+ put_page(kho_fdt);
+ kho_fdt = NULL;
+ }
+
+ return err;
+}
+
+static int __init reserve_mem_init(void)
+{
+ int err;
+
+ if (!kho_is_enabled() || !reserved_mem_count)
+ return 0;
+
+ err = prepare_kho_fdt();
+ if (err)
+ return err;
+
+ err = register_kho_notifier(&reserve_mem_kho_nb);
+ if (err) {
+ put_page(kho_fdt);
+ kho_fdt = NULL;
+ }
+
+ return err;
+}
+late_initcall(reserve_mem_init);
+
+static void *__init reserve_mem_kho_retrieve_fdt(void)
+{
+ phys_addr_t fdt_phys;
+ static void *fdt;
+ int err;
+
+ if (fdt)
+ return fdt;
+
+ err = kho_retrieve_subtree(MEMBLOCK_KHO_FDT, &fdt_phys);
+ if (err) {
+ if (err != -ENOENT)
+ pr_warn("failed to retrieve FDT '%s' from KHO: %d\n",
+ MEMBLOCK_KHO_FDT, err);
+ return NULL;
+ }
+
+ fdt = phys_to_virt(fdt_phys);
+
+ err = fdt_node_check_compatible(fdt, 0, MEMBLOCK_KHO_NODE_COMPATIBLE);
+ if (err) {
+ pr_warn("FDT '%s' is incompatible with '%s': %d\n",
+ MEMBLOCK_KHO_FDT, MEMBLOCK_KHO_NODE_COMPATIBLE, err);
+ fdt = NULL;
+ }
+
+ return fdt;
+}
+
+static bool __init reserve_mem_kho_revive(const char *name, phys_addr_t size,
+ phys_addr_t align)
+{
+ int err, len_start, len_size, offset;
+ const phys_addr_t *p_start, *p_size;
+ const void *fdt;
+
+ fdt = reserve_mem_kho_retrieve_fdt();
+ if (!fdt)
+ return false;
+
+ offset = fdt_subnode_offset(fdt, 0, name);
+ if (offset < 0) {
+ pr_warn("FDT '%s' has no child '%s': %d\n",
+ MEMBLOCK_KHO_FDT, name, offset);
+ return false;
+ }
+ err = fdt_node_check_compatible(fdt, offset, RESERVE_MEM_KHO_NODE_COMPATIBLE);
+ if (err) {
+ pr_warn("Node '%s' is incompatible with '%s': %d\n",
+ name, RESERVE_MEM_KHO_NODE_COMPATIBLE, err);
+ return false;
+ }
+
+ p_start = fdt_getprop(fdt, offset, "start", &len_start);
+ p_size = fdt_getprop(fdt, offset, "size", &len_size);
+ if (!p_start || len_start != sizeof(*p_start) || !p_size ||
+ len_size != sizeof(*p_size)) {
+ return false;
+ }
+
+ if (*p_start & (align - 1)) {
+ pr_warn("KHO reserve-mem '%s' has wrong alignment (0x%lx, 0x%lx)\n",
+ name, (long)align, (long)*p_start);
+ return false;
+ }
+
+ if (*p_size != size) {
+ pr_warn("KHO reserve-mem '%s' has wrong size (0x%lx != 0x%lx)\n",
+ name, (long)*p_size, (long)size);
+ return false;
+ }
+
+ reserved_mem_add(*p_start, size, name);
+ pr_info("Revived memory reservation '%s' from KHO\n", name);
+
+ return true;
+}
+#else
+static bool __init reserve_mem_kho_revive(const char *name, phys_addr_t size,
+ phys_addr_t align)
+{
+ return false;
+}
+#endif /* CONFIG_KEXEC_HANDOVER */
+
/*
* Parse reserve_mem=nn:align:name
*/
@@ -2441,6 +2680,11 @@ static int __init reserve_mem(char *p)
if (reserve_mem_find_by_name(name, &start, &tmp))
return -EBUSY;
+ /* Pick previous allocations up from KHO if available */
+ if (reserve_mem_kho_revive(name, size, align))
+ return 1;
+
+ /* TODO: Allocation must be outside of scratch region */
start = memblock_phys_alloc(size, align);
if (!start)
return -ENOMEM;
@@ -2458,6 +2702,8 @@ static const char * const flagname[] = {
[ilog2(MEMBLOCK_NOMAP)] = "NOMAP",
[ilog2(MEMBLOCK_DRIVER_MANAGED)] = "DRV_MNG",
[ilog2(MEMBLOCK_RSRV_NOINIT)] = "RSV_NIT",
+ [ilog2(MEMBLOCK_RSRV_KERN)] = "RSV_KERN",
+ [ilog2(MEMBLOCK_KHO_SCRATCH)] = "KHO_SCRATCH",
};
static int memblock_debug_show(struct seq_file *m, void *private)
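
The net effect of the KHO pieces above is that a reserve_mem= region keeps its physical placement across kexec: the outgoing kernel records name, start, and size in a small FDT, and the incoming kernel revives the reservation (after checking compatibility, alignment, and size) instead of allocating a fresh one. A hedged consumer sketch; the region name, size, and initcall are made up:

/*
 * Illustrative only: e.g. both kernels booted with
 * reserve_mem=16M:4096:tracebuf. Whether the range was freshly allocated
 * or revived from KHO is transparent to the lookup.
 */
static int __init example_use_tracebuf(void)
{
	phys_addr_t start, size;

	if (!reserve_mem_find_by_name("tracebuf", &start, &size))
		return -ENODEV;

	pr_info("tracebuf persists at %pa (%pa bytes) across kexec\n",
		&start, &size);
	return 0;
}
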
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 4a9cf27a70af..6eed14bff742 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -512,9 +512,9 @@ static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
{
/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
- __count_memcg_events(memcg, PGPGIN, 1);
+ count_memcg_events(memcg, PGPGIN, 1);
else {
- __count_memcg_events(memcg, PGPGOUT, 1);
+ count_memcg_events(memcg, PGPGOUT, 1);
nr_pages = -nr_pages; /* for event */
}
@@ -689,7 +689,7 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
unsigned long flags;
local_irq_save(flags);
- __count_memcg_events(memcg, PGPGOUT, pgpgout);
+ count_memcg_events(memcg, PGPGOUT, pgpgout);
__this_cpu_add(memcg->events_percpu->nr_page_events, nr_memory);
memcg1_check_events(memcg, nid);
local_irq_restore(flags);
@@ -761,7 +761,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
size = thresholds->primary ? thresholds->primary->size + 1 : 1;
/* Allocate memory for new array of thresholds */
- new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
+ new = kmalloc(struct_size(new, entries, size), GFP_KERNEL_ACCOUNT);
if (!new) {
ret = -ENOMEM;
goto unlock;
@@ -924,7 +924,7 @@ static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
{
struct mem_cgroup_eventfd_list *event;
- event = kmalloc(sizeof(*event), GFP_KERNEL);
+ event = kmalloc(sizeof(*event), GFP_KERNEL_ACCOUNT);
if (!event)
return -ENOMEM;
@@ -1087,7 +1087,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
CLASS(fd, cfile)(cfd);
- event = kzalloc(sizeof(*event), GFP_KERNEL);
+ event = kzalloc(sizeof(*event), GFP_KERNEL_ACCOUNT);
if (!event)
return -ENOMEM;
@@ -2053,7 +2053,7 @@ struct cftype mem_cgroup_legacy_files[] = {
{
.name = "cgroup.event_control", /* XXX: for compat */
.write = memcg_write_event_control,
- .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
+ .flags = CFTYPE_NO_PREFIX,
},
{
.name = "swappiness",
@@ -2198,8 +2198,7 @@ bool memcg1_alloc_events(struct mem_cgroup *memcg)
void memcg1_free_events(struct mem_cgroup *memcg)
{
- if (memcg->events_percpu)
- free_percpu(memcg->events_percpu);
+ free_percpu(memcg->events_percpu);
}
static int __init memcg1_init(void)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c96c1f2b9cf5..4deda33625f4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -29,6 +29,7 @@
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
+#include <linux/cpuset.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
@@ -50,7 +51,6 @@
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
-#include <linux/parser.h>
#include <linux/vmpressure.h>
#include <linux/memremap.h>
#include <linux/mm_inline.h>
@@ -95,6 +95,9 @@ static bool cgroup_memory_nokmem __ro_after_init;
/* BPF memory accounting disabled? */
static bool cgroup_memory_nobpf __ro_after_init;
+static struct kmem_cache *memcg_cachep;
+static struct kmem_cache *memcg_pn_cachep;
+
#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif
@@ -129,8 +132,7 @@ bool mem_cgroup_kmem_disabled(void)
return cgroup_memory_nokmem;
}
-static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
- unsigned int nr_pages);
+static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages);
static void obj_cgroup_release(struct percpu_ref *ref)
{
@@ -163,8 +165,16 @@ static void obj_cgroup_release(struct percpu_ref *ref)
WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
nr_pages = nr_bytes >> PAGE_SHIFT;
- if (nr_pages)
- obj_cgroup_uncharge_pages(objcg, nr_pages);
+ if (nr_pages) {
+ struct mem_cgroup *memcg;
+
+ memcg = get_mem_cgroup_from_objcg(objcg);
+ mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
+ memcg1_account_kmem(memcg, -nr_pages);
+ if (!mem_cgroup_is_root(memcg))
+ memcg_uncharge(memcg, nr_pages);
+ mem_cgroup_put(memcg);
+ }
spin_lock_irqsave(&objcg_lock, flags);
list_del(&objcg->list);
@@ -277,6 +287,7 @@ ino_t page_cgroup_ino(struct page *page)
rcu_read_unlock();
return ino;
}
+EXPORT_SYMBOL_GPL(page_cgroup_ino);
/* Subset of node_stat_item for memcg stats */
static const unsigned int memcg_node_stat_items[] = {
@@ -492,8 +503,8 @@ struct memcg_vmstats_percpu {
unsigned int stats_updates;
/* Cached pointers for fast iteration in memcg_rstat_updated() */
- struct memcg_vmstats_percpu *parent;
- struct memcg_vmstats *vmstats;
+ struct memcg_vmstats_percpu __percpu *parent_pcpu;
+ struct memcg_vmstats *vmstats;
/* The above should fit a single cacheline for memcg_rstat_updated() */
@@ -520,7 +531,7 @@ struct memcg_vmstats {
unsigned long events_pending[NR_MEMCG_EVENTS];
/* Stats updates since the last flush */
- atomic64_t stats_updates;
+ atomic_t stats_updates;
};
/*
@@ -544,60 +555,41 @@ static u64 flush_last_time;
#define FLUSH_TIME (2UL*HZ)
-/*
- * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can
- * not rely on this as part of an acquired spinlock_t lock. These functions are
- * never used in hardirq context on PREEMPT_RT and therefore disabling preemtion
- * is sufficient.
- */
-static void memcg_stats_lock(void)
-{
- preempt_disable_nested();
- VM_WARN_ON_IRQS_ENABLED();
-}
-
-static void __memcg_stats_lock(void)
-{
- preempt_disable_nested();
-}
-
-static void memcg_stats_unlock(void)
-{
- preempt_enable_nested();
-}
-
-
static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats)
{
- return atomic64_read(&vmstats->stats_updates) >
+ return atomic_read(&vmstats->stats_updates) >
MEMCG_CHARGE_BATCH * num_online_cpus();
}
-static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
+static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val,
+ int cpu)
{
+ struct memcg_vmstats_percpu __percpu *statc_pcpu;
struct memcg_vmstats_percpu *statc;
- int cpu = smp_processor_id();
unsigned int stats_updates;
if (!val)
return;
- cgroup_rstat_updated(memcg->css.cgroup, cpu);
- statc = this_cpu_ptr(memcg->vmstats_percpu);
- for (; statc; statc = statc->parent) {
- stats_updates = READ_ONCE(statc->stats_updates) + abs(val);
- WRITE_ONCE(statc->stats_updates, stats_updates);
+ css_rstat_updated(&memcg->css, cpu);
+ statc_pcpu = memcg->vmstats_percpu;
+ for (; statc_pcpu; statc_pcpu = statc->parent_pcpu) {
+ statc = this_cpu_ptr(statc_pcpu);
+ /*
+ * If @memcg is already flushable then all its ancestors are
+ * flushable as well, so there is no need to increase
+ * stats_updates.
+ */
+ if (memcg_vmstats_needs_flush(statc->vmstats))
+ break;
+
+ stats_updates = this_cpu_add_return(statc_pcpu->stats_updates,
+ abs(val));
if (stats_updates < MEMCG_CHARGE_BATCH)
continue;
- /*
- * If @memcg is already flush-able, increasing stats_updates is
- * redundant. Avoid the overhead of the atomic update.
- */
- if (!memcg_vmstats_needs_flush(statc->vmstats))
- atomic64_add(stats_updates,
- &statc->vmstats->stats_updates);
- WRITE_ONCE(statc->stats_updates, 0);
+ stats_updates = this_cpu_xchg(statc_pcpu->stats_updates, 0);
+ atomic_add(stats_updates, &statc->vmstats->stats_updates);
}
}
@@ -605,7 +597,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force)
{
bool needs_flush = memcg_vmstats_needs_flush(memcg->vmstats);
- trace_memcg_flush_stats(memcg, atomic64_read(&memcg->vmstats->stats_updates),
+ trace_memcg_flush_stats(memcg, atomic_read(&memcg->vmstats->stats_updates),
force, needs_flush);
if (!force && !needs_flush)
@@ -614,7 +606,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force)
if (mem_cgroup_is_root(memcg))
WRITE_ONCE(flush_last_time, jiffies_64);
- cgroup_rstat_flush(memcg->css.cgroup);
+ css_rstat_flush(&memcg->css);
}
/*
@@ -687,15 +679,16 @@ static int memcg_state_val_in_pages(int idx, int val)
}
/**
- * __mod_memcg_state - update cgroup memory statistics
+ * mod_memcg_state - update cgroup memory statistics
* @memcg: the memory cgroup
* @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
* @val: delta to add to the counter, can be negative
*/
-void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
+void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
int val)
{
int i = memcg_stats_index(idx);
+ int cpu;
if (mem_cgroup_disabled())
return;
@@ -703,10 +696,14 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
- __this_cpu_add(memcg->vmstats_percpu->state[i], val);
+ cpu = get_cpu();
+
+ this_cpu_add(memcg->vmstats_percpu->state[i], val);
val = memcg_state_val_in_pages(idx, val);
- memcg_rstat_updated(memcg, val);
+ memcg_rstat_updated(memcg, val, cpu);
trace_mod_memcg_state(memcg, idx, val);
+
+ put_cpu();
}
#ifdef CONFIG_MEMCG_V1
@@ -728,13 +725,14 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
}
#endif
-static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
+static void mod_memcg_lruvec_state(struct lruvec *lruvec,
enum node_stat_item idx,
int val)
{
struct mem_cgroup_per_node *pn;
struct mem_cgroup *memcg;
int i = memcg_stats_index(idx);
+ int cpu;
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
@@ -742,35 +740,19 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
memcg = pn->memcg;
- /*
- * The caller from rmap relies on disabled preemption because they never
- * update their counter from in-interrupt context. For these two
- * counters we check that the update is never performed from an
- * interrupt context while other caller need to have disabled interrupt.
- */
- __memcg_stats_lock();
- if (IS_ENABLED(CONFIG_DEBUG_VM)) {
- switch (idx) {
- case NR_ANON_MAPPED:
- case NR_FILE_MAPPED:
- case NR_ANON_THPS:
- WARN_ON_ONCE(!in_task());
- break;
- default:
- VM_WARN_ON_IRQS_ENABLED();
- }
- }
+ cpu = get_cpu();
/* Update memcg */
- __this_cpu_add(memcg->vmstats_percpu->state[i], val);
+ this_cpu_add(memcg->vmstats_percpu->state[i], val);
/* Update lruvec */
- __this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
+ this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
val = memcg_state_val_in_pages(idx, val);
- memcg_rstat_updated(memcg, val);
+ memcg_rstat_updated(memcg, val, cpu);
trace_mod_memcg_lruvec_state(memcg, idx, val);
- memcg_stats_unlock();
+
+ put_cpu();
}
/**
@@ -791,7 +773,7 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
/* Update memcg and lruvec */
if (!mem_cgroup_disabled())
- __mod_memcg_lruvec_state(lruvec, idx, val);
+ mod_memcg_lruvec_state(lruvec, idx, val);
}
void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx,
@@ -841,15 +823,16 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
}
/**
- * __count_memcg_events - account VM events in a cgroup
+ * count_memcg_events - account VM events in a cgroup
* @memcg: the memory cgroup
* @idx: the event item
* @count: the number of events that occurred
*/
-void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
+void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
unsigned long count)
{
int i = memcg_events_index(idx);
+ int cpu;
if (mem_cgroup_disabled())
return;
@@ -857,11 +840,13 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
- memcg_stats_lock();
- __this_cpu_add(memcg->vmstats_percpu->events[i], count);
- memcg_rstat_updated(memcg, count);
+ cpu = get_cpu();
+
+ this_cpu_add(memcg->vmstats_percpu->events[i], count);
+ memcg_rstat_updated(memcg, count, cpu);
trace_count_memcg_events(memcg, idx, count);
- memcg_stats_unlock();
+
+ put_cpu();
}
unsigned long memcg_events(struct mem_cgroup *memcg, int event)
@@ -1168,7 +1153,6 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
{
struct mem_cgroup *iter;
int ret = 0;
- int i = 0;
BUG_ON(mem_cgroup_is_root(memcg));
@@ -1178,10 +1162,9 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
while (!ret && (task = css_task_iter_next(&it))) {
- /* Avoid potential softlockup warning */
- if ((++i & 1023) == 0)
- cond_resched();
ret = fn(task, arg);
+ /* Avoid potential softlockup warning */
+ cond_resched();
}
css_task_iter_end(&it);
if (ret) {
@@ -1664,7 +1647,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
* A few threads which were not waiting at mutex_lock_killable() can
* fail to bail out. Therefore, check again after holding oom_lock.
*/
- ret = task_is_dying() || out_of_memory(&oc);
+ ret = out_of_memory(&oc);
unlock:
mutex_unlock(&oom_lock);
@@ -1758,155 +1741,234 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
pr_cont(" are going to be killed due to memory.oom.group set\n");
}
+/*
+ * The value of NR_MEMCG_STOCK is selected to keep the cached memcgs and their
+ * nr_pages in a single cacheline. This may change in future.
+ */
+#define NR_MEMCG_STOCK 7
+#define FLUSHING_CACHED_CHARGE 0
struct memcg_stock_pcp {
- local_trylock_t stock_lock;
- struct mem_cgroup *cached; /* this never be root cgroup */
- unsigned int nr_pages;
+ local_trylock_t lock;
+ uint8_t nr_pages[NR_MEMCG_STOCK];
+ struct mem_cgroup *cached[NR_MEMCG_STOCK];
+
+ struct work_struct work;
+ unsigned long flags;
+};
+static DEFINE_PER_CPU_ALIGNED(struct memcg_stock_pcp, memcg_stock) = {
+ .lock = INIT_LOCAL_TRYLOCK(lock),
+};
+
+struct obj_stock_pcp {
+ local_trylock_t lock;
+ unsigned int nr_bytes;
struct obj_cgroup *cached_objcg;
struct pglist_data *cached_pgdat;
- unsigned int nr_bytes;
int nr_slab_reclaimable_b;
int nr_slab_unreclaimable_b;
struct work_struct work;
unsigned long flags;
-#define FLUSHING_CACHED_CHARGE 0
};
-static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
- .stock_lock = INIT_LOCAL_TRYLOCK(stock_lock),
+
+static DEFINE_PER_CPU_ALIGNED(struct obj_stock_pcp, obj_stock) = {
+ .lock = INIT_LOCAL_TRYLOCK(lock),
};
+
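
The NR_MEMCG_STOCK comment above works out as follows on a typical 64-bit build (the cache-line and pointer sizes are assumptions, not stated in the patch): seven 8-byte memcg pointers take 56 bytes and the seven one-byte counters add 7 more, 63 bytes in total, so the two arrays plus a small trylock still fit a 64-byte line, while an eighth slot would already need 72 bytes. A hedged compile-time restatement of that arithmetic:

/*
 * Illustrative only; assumes 64-byte cache lines and 8-byte pointers,
 * and ignores the size of the lock itself.
 */
static_assert(NR_MEMCG_STOCK * (sizeof(struct mem_cgroup *) + sizeof(uint8_t)) <= 64,
	      "cached[] and nr_pages[] would spill out of one cache line");
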
static DEFINE_MUTEX(percpu_charge_mutex);
-static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
-static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+static void drain_obj_stock(struct obj_stock_pcp *stock);
+static bool obj_stock_flush_required(struct obj_stock_pcp *stock,
struct mem_cgroup *root_memcg);
/**
* consume_stock: Try to consume stocked charge on this cpu.
* @memcg: memcg to consume from.
* @nr_pages: how many pages to charge.
- * @gfp_mask: allocation mask.
*
- * The charges will only happen if @memcg matches the current cpu's memcg
- * stock, and at least @nr_pages are available in that stock. Failure to
- * service an allocation will refill the stock.
+ * Consume the cached charge if enough nr_pages are present; otherwise return
+ * failure. Also return failure for a charge request larger than
+ * MEMCG_CHARGE_BATCH or if the local lock is already taken.
*
* returns true if successful, false otherwise.
*/
-static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages,
- gfp_t gfp_mask)
+static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock;
- unsigned int stock_pages;
- unsigned long flags;
+ uint8_t stock_pages;
bool ret = false;
+ int i;
- if (nr_pages > MEMCG_CHARGE_BATCH)
- return ret;
-
- if (gfpflags_allow_spinning(gfp_mask))
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
- else if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags))
+ if (nr_pages > MEMCG_CHARGE_BATCH ||
+ !local_trylock(&memcg_stock.lock))
return ret;
stock = this_cpu_ptr(&memcg_stock);
- stock_pages = READ_ONCE(stock->nr_pages);
- if (memcg == READ_ONCE(stock->cached) && stock_pages >= nr_pages) {
- WRITE_ONCE(stock->nr_pages, stock_pages - nr_pages);
- ret = true;
+
+ for (i = 0; i < NR_MEMCG_STOCK; ++i) {
+ if (memcg != READ_ONCE(stock->cached[i]))
+ continue;
+
+ stock_pages = READ_ONCE(stock->nr_pages[i]);
+ if (stock_pages >= nr_pages) {
+ WRITE_ONCE(stock->nr_pages[i], stock_pages - nr_pages);
+ ret = true;
+ }
+ break;
}
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ local_unlock(&memcg_stock.lock);
return ret;
}
+static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+ page_counter_uncharge(&memcg->memory, nr_pages);
+ if (do_memsw_account())
+ page_counter_uncharge(&memcg->memsw, nr_pages);
+}
+
/*
* Returns stocks cached in percpu and reset cached information.
*/
-static void drain_stock(struct memcg_stock_pcp *stock)
+static void drain_stock(struct memcg_stock_pcp *stock, int i)
{
- unsigned int stock_pages = READ_ONCE(stock->nr_pages);
- struct mem_cgroup *old = READ_ONCE(stock->cached);
+ struct mem_cgroup *old = READ_ONCE(stock->cached[i]);
+ uint8_t stock_pages;
if (!old)
return;
+ stock_pages = READ_ONCE(stock->nr_pages[i]);
if (stock_pages) {
- page_counter_uncharge(&old->memory, stock_pages);
- if (do_memsw_account())
- page_counter_uncharge(&old->memsw, stock_pages);
-
- WRITE_ONCE(stock->nr_pages, 0);
+ memcg_uncharge(old, stock_pages);
+ WRITE_ONCE(stock->nr_pages[i], 0);
}
css_put(&old->css);
- WRITE_ONCE(stock->cached, NULL);
+ WRITE_ONCE(stock->cached[i], NULL);
+}
+
+static void drain_stock_fully(struct memcg_stock_pcp *stock)
+{
+ int i;
+
+ for (i = 0; i < NR_MEMCG_STOCK; ++i)
+ drain_stock(stock, i);
}
-static void drain_local_stock(struct work_struct *dummy)
+static void drain_local_memcg_stock(struct work_struct *dummy)
{
struct memcg_stock_pcp *stock;
- struct obj_cgroup *old = NULL;
- unsigned long flags;
- /*
- * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs.
- * drain_stock races is that we always operate on local CPU stock
- * here with IRQ disabled
- */
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ if (WARN_ONCE(!in_task(), "drain in non-task context"))
+ return;
+
+ local_lock(&memcg_stock.lock);
stock = this_cpu_ptr(&memcg_stock);
- old = drain_obj_stock(stock);
- drain_stock(stock);
+ drain_stock_fully(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
- obj_cgroup_put(old);
+ local_unlock(&memcg_stock.lock);
}
-/*
- * Cache charges(val) to local per_cpu area.
- * This will be consumed by consume_stock() function, later.
- */
-static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+static void drain_local_obj_stock(struct work_struct *dummy)
{
- struct memcg_stock_pcp *stock;
- unsigned int stock_pages;
+ struct obj_stock_pcp *stock;
- stock = this_cpu_ptr(&memcg_stock);
- if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */
- drain_stock(stock);
- css_get(&memcg->css);
- WRITE_ONCE(stock->cached, memcg);
- }
- stock_pages = READ_ONCE(stock->nr_pages) + nr_pages;
- WRITE_ONCE(stock->nr_pages, stock_pages);
+ if (WARN_ONCE(!in_task(), "drain in non-task context"))
+ return;
+
+ local_lock(&obj_stock.lock);
+
+ stock = this_cpu_ptr(&obj_stock);
+ drain_obj_stock(stock);
+ clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
- if (stock_pages > MEMCG_CHARGE_BATCH)
- drain_stock(stock);
+ local_unlock(&obj_stock.lock);
}
static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
- unsigned long flags;
+ struct memcg_stock_pcp *stock;
+ struct mem_cgroup *cached;
+ uint8_t stock_pages;
+ bool success = false;
+ int empty_slot = -1;
+ int i;
+
+ /*
+ * For now, limit MEMCG_CHARGE_BATCH to 127 or less. If we later decide
+ * to increase it beyond 127, we will need more careful handling of
+ * nr_pages[] in struct memcg_stock_pcp.
+ */
+ BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S8_MAX);
+
+ VM_WARN_ON_ONCE(mem_cgroup_is_root(memcg));
- if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
+ if (nr_pages > MEMCG_CHARGE_BATCH ||
+ !local_trylock(&memcg_stock.lock)) {
/*
- * In case of unlikely failure to lock percpu stock_lock
- * uncharge memcg directly.
+ * In case of a larger-than-batch refill or an unlikely failure to
+ * lock the percpu memcg_stock.lock, uncharge the memcg directly.
*/
- if (mem_cgroup_is_root(memcg))
- return;
- page_counter_uncharge(&memcg->memory, nr_pages);
- if (do_memsw_account())
- page_counter_uncharge(&memcg->memsw, nr_pages);
+ memcg_uncharge(memcg, nr_pages);
return;
}
- __refill_stock(memcg, nr_pages);
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+
+ stock = this_cpu_ptr(&memcg_stock);
+ for (i = 0; i < NR_MEMCG_STOCK; ++i) {
+ cached = READ_ONCE(stock->cached[i]);
+ if (!cached && empty_slot == -1)
+ empty_slot = i;
+ if (memcg == READ_ONCE(stock->cached[i])) {
+ stock_pages = READ_ONCE(stock->nr_pages[i]) + nr_pages;
+ WRITE_ONCE(stock->nr_pages[i], stock_pages);
+ if (stock_pages > MEMCG_CHARGE_BATCH)
+ drain_stock(stock, i);
+ success = true;
+ break;
+ }
+ }
+
+ if (!success) {
+ i = empty_slot;
+ if (i == -1) {
+ i = get_random_u32_below(NR_MEMCG_STOCK);
+ drain_stock(stock, i);
+ }
+ css_get(&memcg->css);
+ WRITE_ONCE(stock->cached[i], memcg);
+ WRITE_ONCE(stock->nr_pages[i], nr_pages);
+ }
+
+ local_unlock(&memcg_stock.lock);
+}
+
+static bool is_memcg_drain_needed(struct memcg_stock_pcp *stock,
+ struct mem_cgroup *root_memcg)
+{
+ struct mem_cgroup *memcg;
+ bool flush = false;
+ int i;
+
+ rcu_read_lock();
+ for (i = 0; i < NR_MEMCG_STOCK; ++i) {
+ memcg = READ_ONCE(stock->cached[i]);
+ if (!memcg)
+ continue;
+
+ if (READ_ONCE(stock->nr_pages[i]) &&
+ mem_cgroup_is_descendant(memcg, root_memcg)) {
+ flush = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return flush;
}
/*
@@ -1929,25 +1991,27 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
migrate_disable();
curcpu = smp_processor_id();
for_each_online_cpu(cpu) {
- struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
- struct mem_cgroup *memcg;
- bool flush = false;
+ struct memcg_stock_pcp *memcg_st = &per_cpu(memcg_stock, cpu);
+ struct obj_stock_pcp *obj_st = &per_cpu(obj_stock, cpu);
- rcu_read_lock();
- memcg = READ_ONCE(stock->cached);
- if (memcg && READ_ONCE(stock->nr_pages) &&
- mem_cgroup_is_descendant(memcg, root_memcg))
- flush = true;
- else if (obj_stock_flush_required(stock, root_memcg))
- flush = true;
- rcu_read_unlock();
+ if (!test_bit(FLUSHING_CACHED_CHARGE, &memcg_st->flags) &&
+ is_memcg_drain_needed(memcg_st, root_memcg) &&
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE,
+ &memcg_st->flags)) {
+ if (cpu == curcpu)
+ drain_local_memcg_stock(&memcg_st->work);
+ else if (!cpu_is_isolated(cpu))
+ schedule_work_on(cpu, &memcg_st->work);
+ }
- if (flush &&
- !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+ if (!test_bit(FLUSHING_CACHED_CHARGE, &obj_st->flags) &&
+ obj_stock_flush_required(obj_st, root_memcg) &&
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE,
+ &obj_st->flags)) {
if (cpu == curcpu)
- drain_local_stock(&stock->work);
+ drain_local_obj_stock(&obj_st->work);
else if (!cpu_is_isolated(cpu))
- schedule_work_on(cpu, &stock->work);
+ schedule_work_on(cpu, &obj_st->work);
}
}
migrate_enable();
@@ -1956,19 +2020,9 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
- struct memcg_stock_pcp *stock;
- struct obj_cgroup *old;
- unsigned long flags;
-
- stock = &per_cpu(memcg_stock, cpu);
-
- /* drain_obj_stock requires stock_lock */
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
- old = drain_obj_stock(stock);
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
-
- drain_stock(stock);
- obj_cgroup_put(old);
+ /* no need for the local lock */
+ drain_obj_stock(&per_cpu(obj_stock, cpu));
+ drain_stock_fully(&per_cpu(memcg_stock, cpu));
return 0;
}
@@ -2150,7 +2204,7 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
* try_charge() (context permitting), as well as from the userland
* return path where reclaim is always able to block.
*/
-void mem_cgroup_handle_over_high(gfp_t gfp_mask)
+void __mem_cgroup_handle_over_high(gfp_t gfp_mask)
{
unsigned long penalty_jiffies;
unsigned long pflags;
@@ -2160,9 +2214,6 @@ void mem_cgroup_handle_over_high(gfp_t gfp_mask)
struct mem_cgroup *memcg;
bool in_retry = false;
- if (likely(!nr_pages))
- return;
-
memcg = get_mem_cgroup_from_mm(current->mm);
current->memcg_nr_pages_over_high = 0;
@@ -2256,12 +2307,13 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
bool drained = false;
bool raised_max_event = false;
unsigned long pflags;
+ bool allow_spinning = gfpflags_allow_spinning(gfp_mask);
retry:
- if (consume_stock(memcg, nr_pages, gfp_mask))
+ if (consume_stock(memcg, nr_pages))
return 0;
- if (!gfpflags_allow_spinning(gfp_mask))
+ if (!allow_spinning)
/* Avoid the refill and flush of the older stock */
batch = nr_pages;
@@ -2297,7 +2349,7 @@ retry:
if (!gfpflags_allow_blocking(gfp_mask))
goto nomem;
- memcg_memory_event(mem_over_limit, MEMCG_MAX);
+ __memcg_memory_event(mem_over_limit, MEMCG_MAX, allow_spinning);
raised_max_event = true;
psi_memstall_enter(&pflags);
@@ -2364,7 +2416,7 @@ force:
* a MEMCG_MAX event.
*/
if (!raised_max_event)
- memcg_memory_event(mem_over_limit, MEMCG_MAX);
+ __memcg_memory_event(mem_over_limit, MEMCG_MAX, allow_spinning);
/*
* The allocation either can't fail or will lead to more memory
@@ -2433,7 +2485,7 @@ done_restock:
if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
!(current->flags & PF_MEMALLOC) &&
gfpflags_allow_blocking(gfp_mask))
- mem_cgroup_handle_over_high(gfp_mask);
+ __mem_cgroup_handle_over_high(gfp_mask);
return 0;
}
@@ -2459,17 +2511,48 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
folio->memcg_data = (unsigned long)memcg;
}
-static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg,
+#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
+static inline void account_slab_nmi_safe(struct mem_cgroup *memcg,
+ struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr)
+{
+ struct lruvec *lruvec;
+
+ if (likely(!in_nmi())) {
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ mod_memcg_lruvec_state(lruvec, idx, nr);
+ } else {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[pgdat->node_id];
+
+ /* preemption is disabled in_nmi(). */
+ css_rstat_updated(&memcg->css, smp_processor_id());
+ if (idx == NR_SLAB_RECLAIMABLE_B)
+ atomic_add(nr, &pn->slab_reclaimable);
+ else
+ atomic_add(nr, &pn->slab_unreclaimable);
+ }
+}
+#else
+static inline void account_slab_nmi_safe(struct mem_cgroup *memcg,
+ struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr)
+{
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ mod_memcg_lruvec_state(lruvec, idx, nr);
+}
+#endif
+
+static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
struct pglist_data *pgdat,
enum node_stat_item idx, int nr)
{
struct mem_cgroup *memcg;
- struct lruvec *lruvec;
rcu_read_lock();
memcg = obj_cgroup_memcg(objcg);
- lruvec = mem_cgroup_lruvec(memcg, pgdat);
- __mod_memcg_lruvec_state(lruvec, idx, nr);
+ account_slab_nmi_safe(memcg, pgdat, idx, nr);
rcu_read_unlock();
}
@@ -2594,6 +2677,9 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void)
struct mem_cgroup *memcg;
struct obj_cgroup *objcg;
+ if (IS_ENABLED(CONFIG_MEMCG_NMI_UNSAFE) && in_nmi())
+ return NULL;
+
if (in_task()) {
memcg = current->active_memcg;
if (unlikely(memcg))
@@ -2656,6 +2742,24 @@ struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
return objcg;
}
+#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
+static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val)
+{
+ if (likely(!in_nmi())) {
+ mod_memcg_state(memcg, MEMCG_KMEM, val);
+ } else {
+ /* preemption is disabled in_nmi(). */
+ css_rstat_updated(&memcg->css, smp_processor_id());
+ atomic_add(val, &memcg->kmem_stat);
+ }
+}
+#else
+static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val)
+{
+ mod_memcg_state(memcg, MEMCG_KMEM, val);
+}
+#endif
+
/*
* obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg
* @objcg: object cgroup to uncharge
@@ -2668,7 +2772,7 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
memcg = get_mem_cgroup_from_objcg(objcg);
- mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
+ account_kmem_nmi_safe(memcg, -nr_pages);
memcg1_account_kmem(memcg, -nr_pages);
if (!mem_cgroup_is_root(memcg))
refill_stock(memcg, nr_pages);
@@ -2696,7 +2800,7 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
if (ret)
goto out;
- mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
+ account_kmem_nmi_safe(memcg, nr_pages);
memcg1_account_kmem(memcg, nr_pages);
out:
css_put(&memcg->css);
@@ -2764,50 +2868,27 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
obj_cgroup_put(objcg);
}
-/* Replace the stock objcg with objcg, return the old objcg */
-static struct obj_cgroup *replace_stock_objcg(struct memcg_stock_pcp *stock,
- struct obj_cgroup *objcg)
-{
- struct obj_cgroup *old = NULL;
-
- old = drain_obj_stock(stock);
- obj_cgroup_get(objcg);
- stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
- ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
- WRITE_ONCE(stock->cached_objcg, objcg);
- return old;
-}
-
-static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
- enum node_stat_item idx, int nr)
+static void __account_obj_stock(struct obj_cgroup *objcg,
+ struct obj_stock_pcp *stock, int nr,
+ struct pglist_data *pgdat, enum node_stat_item idx)
{
- struct memcg_stock_pcp *stock;
- struct obj_cgroup *old = NULL;
- unsigned long flags;
int *bytes;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
- stock = this_cpu_ptr(&memcg_stock);
-
/*
* Save vmstat data in stock and skip vmstat array update unless
- * accumulating over a page of vmstat data or when pgdat or idx
- * changes.
+ * accumulating over a page of vmstat data or when pgdat changes.
*/
- if (READ_ONCE(stock->cached_objcg) != objcg) {
- old = replace_stock_objcg(stock, objcg);
- stock->cached_pgdat = pgdat;
- } else if (stock->cached_pgdat != pgdat) {
+ if (stock->cached_pgdat != pgdat) {
/* Flush the existing cached vmstat data */
struct pglist_data *oldpg = stock->cached_pgdat;
if (stock->nr_slab_reclaimable_b) {
- __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
+ mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
stock->nr_slab_reclaimable_b);
stock->nr_slab_reclaimable_b = 0;
}
if (stock->nr_slab_unreclaimable_b) {
- __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
+ mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
stock->nr_slab_unreclaimable_b);
stock->nr_slab_unreclaimable_b = 0;
}
@@ -2833,37 +2914,38 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
}
}
if (nr)
- __mod_objcg_mlstate(objcg, pgdat, idx, nr);
-
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
- obj_cgroup_put(old);
+ mod_objcg_mlstate(objcg, pgdat, idx, nr);
}
-static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
+static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
+ struct pglist_data *pgdat, enum node_stat_item idx)
{
- struct memcg_stock_pcp *stock;
- unsigned long flags;
+ struct obj_stock_pcp *stock;
bool ret = false;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ if (!local_trylock(&obj_stock.lock))
+ return ret;
- stock = this_cpu_ptr(&memcg_stock);
+ stock = this_cpu_ptr(&obj_stock);
if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
stock->nr_bytes -= nr_bytes;
ret = true;
+
+ if (pgdat)
+ __account_obj_stock(objcg, stock, nr_bytes, pgdat, idx);
}
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ local_unlock(&obj_stock.lock);
return ret;
}
-static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
+static void drain_obj_stock(struct obj_stock_pcp *stock)
{
struct obj_cgroup *old = READ_ONCE(stock->cached_objcg);
if (!old)
- return NULL;
+ return;
if (stock->nr_bytes) {
unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
@@ -2876,7 +2958,8 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
memcg1_account_kmem(memcg, -nr_pages);
- __refill_stock(memcg, nr_pages);
+ if (!mem_cgroup_is_root(memcg))
+ memcg_uncharge(memcg, nr_pages);
css_put(&memcg->css);
}
@@ -2900,13 +2983,13 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
*/
if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
if (stock->nr_slab_reclaimable_b) {
- __mod_objcg_mlstate(old, stock->cached_pgdat,
+ mod_objcg_mlstate(old, stock->cached_pgdat,
NR_SLAB_RECLAIMABLE_B,
stock->nr_slab_reclaimable_b);
stock->nr_slab_reclaimable_b = 0;
}
if (stock->nr_slab_unreclaimable_b) {
- __mod_objcg_mlstate(old, stock->cached_pgdat,
+ mod_objcg_mlstate(old, stock->cached_pgdat,
NR_SLAB_UNRECLAIMABLE_B,
stock->nr_slab_unreclaimable_b);
stock->nr_slab_unreclaimable_b = 0;
@@ -2915,63 +2998,76 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
}
WRITE_ONCE(stock->cached_objcg, NULL);
- /*
- * The `old' objects needs to be released by the caller via
- * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
- */
- return old;
+ obj_cgroup_put(old);
}
-static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+static bool obj_stock_flush_required(struct obj_stock_pcp *stock,
struct mem_cgroup *root_memcg)
{
struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg);
struct mem_cgroup *memcg;
+ bool flush = false;
+ rcu_read_lock();
if (objcg) {
memcg = obj_cgroup_memcg(objcg);
if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
- return true;
+ flush = true;
}
+ rcu_read_unlock();
- return false;
+ return flush;
}
static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
- bool allow_uncharge)
+ bool allow_uncharge, int nr_acct, struct pglist_data *pgdat,
+ enum node_stat_item idx)
{
- struct memcg_stock_pcp *stock;
- struct obj_cgroup *old = NULL;
- unsigned long flags;
+ struct obj_stock_pcp *stock;
unsigned int nr_pages = 0;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ if (!local_trylock(&obj_stock.lock)) {
+ if (pgdat)
+ mod_objcg_mlstate(objcg, pgdat, idx, nr_bytes);
+ nr_pages = nr_bytes >> PAGE_SHIFT;
+ nr_bytes = nr_bytes & (PAGE_SIZE - 1);
+ atomic_add(nr_bytes, &objcg->nr_charged_bytes);
+ goto out;
+ }
- stock = this_cpu_ptr(&memcg_stock);
+ stock = this_cpu_ptr(&obj_stock);
if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
- old = replace_stock_objcg(stock, objcg);
+ drain_obj_stock(stock);
+ obj_cgroup_get(objcg);
+ stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
+ ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
+ WRITE_ONCE(stock->cached_objcg, objcg);
+
allow_uncharge = true; /* Allow uncharge when objcg changes */
}
stock->nr_bytes += nr_bytes;
+ if (pgdat)
+ __account_obj_stock(objcg, stock, nr_acct, pgdat, idx);
+
if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) {
nr_pages = stock->nr_bytes >> PAGE_SHIFT;
stock->nr_bytes &= (PAGE_SIZE - 1);
}
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
- obj_cgroup_put(old);
-
+ local_unlock(&obj_stock.lock);
+out:
if (nr_pages)
obj_cgroup_uncharge_pages(objcg, nr_pages);
}
-int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
+static int obj_cgroup_charge_account(struct obj_cgroup *objcg, gfp_t gfp, size_t size,
+ struct pglist_data *pgdat, enum node_stat_item idx)
{
unsigned int nr_pages, nr_bytes;
int ret;
- if (consume_obj_stock(objcg, size))
+ if (likely(consume_obj_stock(objcg, size, pgdat, idx)))
return 0;
/*
@@ -3004,15 +3100,21 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
nr_pages += 1;
ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
- if (!ret && nr_bytes)
- refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false);
+ if (!ret && (nr_bytes || pgdat))
+ refill_obj_stock(objcg, nr_bytes ? PAGE_SIZE - nr_bytes : 0,
+ false, size, pgdat, idx);
return ret;
}
+int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
+{
+ return obj_cgroup_charge_account(objcg, gfp, size, NULL, 0);
+}
+
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
{
- refill_obj_stock(objcg, size, true);
+ refill_obj_stock(objcg, size, true, 0, NULL, 0);
}
static inline size_t obj_full_size(struct kmem_cache *s)
@@ -3064,23 +3166,32 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
return false;
}
- if (obj_cgroup_charge(objcg, flags, size * obj_full_size(s)))
- return false;
-
for (i = 0; i < size; i++) {
slab = virt_to_slab(p[i]);
if (!slab_obj_exts(slab) &&
alloc_slab_obj_exts(slab, s, flags, false)) {
- obj_cgroup_uncharge(objcg, obj_full_size(s));
continue;
}
+ /*
+ * if we fail and size is 1, memcg_alloc_abort_single() will
+ * just free the object, which is ok as we have not assigned
+ * objcg to its obj_ext yet
+ *
+ * for larger sizes, kmem_cache_free_bulk() will uncharge
+ * any objects that were already charged and obj_ext assigned
+ *
+ * TODO: we could batch this until slab_pgdat(slab) changes
+ * between iterations, with a more complicated undo
+ */
+ if (obj_cgroup_charge_account(objcg, flags, obj_full_size(s),
+ slab_pgdat(slab), cache_vmstat_idx(s)))
+ return false;
+
off = obj_to_index(s, slab, p[i]);
obj_cgroup_get(objcg);
slab_obj_exts(slab)[off].objcg = objcg;
- mod_objcg_state(objcg, slab_pgdat(slab),
- cache_vmstat_idx(s), obj_full_size(s));
}
return true;
@@ -3089,6 +3200,8 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
void **p, int objects, struct slabobj_ext *obj_exts)
{
+ size_t obj_size = obj_full_size(s);
+
for (int i = 0; i < objects; i++) {
struct obj_cgroup *objcg;
unsigned int off;
@@ -3099,9 +3212,8 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
continue;
obj_exts[off].objcg = NULL;
- obj_cgroup_uncharge(objcg, obj_full_size(s));
- mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s),
- -obj_full_size(s));
+ refill_obj_stock(objcg, obj_size, true, -obj_size,
+ slab_pgdat(slab), cache_vmstat_idx(s));
obj_cgroup_put(objcg);
}
}
@@ -3543,7 +3655,8 @@ static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn;
- pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node);
+ pn = kmem_cache_alloc_node(memcg_pn_cachep, GFP_KERNEL | __GFP_ZERO,
+ node);
if (!pn)
return false;
@@ -3590,13 +3703,14 @@ static void mem_cgroup_free(struct mem_cgroup *memcg)
static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
{
- struct memcg_vmstats_percpu *statc, *pstatc;
+ struct memcg_vmstats_percpu *statc;
+ struct memcg_vmstats_percpu __percpu *pstatc_pcpu;
struct mem_cgroup *memcg;
int node, cpu;
int __maybe_unused i;
long error;
- memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL);
+ memcg = kmem_cache_zalloc(memcg_cachep, GFP_KERNEL);
if (!memcg)
return ERR_PTR(-ENOMEM);
@@ -3621,9 +3735,9 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
for_each_possible_cpu(cpu) {
if (parent)
- pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu);
+ pstatc_pcpu = parent->vmstats_percpu;
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
- statc->parent = parent ? pstatc : NULL;
+ statc->parent_pcpu = parent ? pstatc_pcpu : NULL;
statc->vmstats = memcg->vmstats;
}
@@ -3639,7 +3753,10 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
INIT_LIST_HEAD(&memcg->memory_peaks);
INIT_LIST_HEAD(&memcg->swap_peaks);
spin_lock_init(&memcg->peaks_lock);
- memcg->socket_pressure = jiffies;
+ memcg->socket_pressure = get_jiffies_64();
+#if BITS_PER_LONG < 64
+ seqlock_init(&memcg->socket_pressure_seqlock);
+#endif
memcg1_memcg_init(memcg);
memcg->kmemcg_id = -1;
INIT_LIST_HEAD(&memcg->objcg_list);
@@ -3897,6 +4014,53 @@ static void mem_cgroup_stat_aggregate(struct aggregate_control *ac)
}
}
+#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
+static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent,
+ int cpu)
+{
+ int nid;
+
+ if (atomic_read(&memcg->kmem_stat)) {
+ int kmem = atomic_xchg(&memcg->kmem_stat, 0);
+ int index = memcg_stats_index(MEMCG_KMEM);
+
+ memcg->vmstats->state[index] += kmem;
+ if (parent)
+ parent->vmstats->state_pending[index] += kmem;
+ }
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+ struct lruvec_stats *lstats = pn->lruvec_stats;
+ struct lruvec_stats *plstats = NULL;
+
+ if (parent)
+ plstats = parent->nodeinfo[nid]->lruvec_stats;
+
+ if (atomic_read(&pn->slab_reclaimable)) {
+ int slab = atomic_xchg(&pn->slab_reclaimable, 0);
+ int index = memcg_stats_index(NR_SLAB_RECLAIMABLE_B);
+
+ lstats->state[index] += slab;
+ if (plstats)
+ plstats->state_pending[index] += slab;
+ }
+ if (atomic_read(&pn->slab_unreclaimable)) {
+ int slab = atomic_xchg(&pn->slab_unreclaimable, 0);
+ int index = memcg_stats_index(NR_SLAB_UNRECLAIMABLE_B);
+
+ lstats->state[index] += slab;
+ if (plstats)
+ plstats->state_pending[index] += slab;
+ }
+ }
+}
+#else
+static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent,
+ int cpu)
+{}
+#endif
+
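
The NMI-safe helpers above all follow one pattern: when in_nmi() rules out the normal per-cpu stat path, the delta is parked in a dedicated atomic and folded back in during the per-cpu rstat flush below. Reduced to a single counter, the shape is roughly as follows; the names are invented for illustration:

/* Illustrative only -- not part of this patch. */
static atomic_t nmi_deferred;	/* written from NMI context */
static long example_stat;	/* owned by the normal flush path */

static void example_stat_add(int delta)
{
	if (in_nmi())
		atomic_add(delta, &nmi_deferred);	/* park the update */
	else
		example_stat += delta;			/* regular path */
}

static void example_stat_flush(void)
{
	/* Fold the parked NMI updates in exactly once. */
	example_stat += atomic_xchg(&nmi_deferred, 0);
}
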
static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -3905,6 +4069,8 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
struct aggregate_control ac;
int nid;
+ flush_nmi_stats(memcg, parent, cpu);
+
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
ac = (struct aggregate_control) {
@@ -3954,8 +4120,8 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
}
WRITE_ONCE(statc->stats_updates, 0);
/* We are in a per-cpu loop here, only do the atomic write once */
- if (atomic64_read(&memcg->vmstats->stats_updates))
- atomic64_set(&memcg->vmstats->stats_updates, 0);
+ if (atomic_read(&memcg->vmstats->stats_updates))
+ atomic_set(&memcg->vmstats->stats_updates, 0);
}
static void mem_cgroup_fork(struct task_struct *task)
@@ -4196,6 +4362,9 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
page_counter_set_high(&memcg->memory, high);
+ if (of->file->f_flags & O_NONBLOCK)
+ goto out;
+
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
unsigned long reclaimed;
@@ -4218,7 +4387,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
if (!reclaimed && !nr_retries--)
break;
}
-
+out:
memcg_wb_domain_size_changed(memcg);
return nbytes;
}
@@ -4245,6 +4414,9 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
xchg(&memcg->memory.max, max);
+ if (of->file->f_flags & O_NONBLOCK)
+ goto out;
+
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
@@ -4272,7 +4444,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
break;
cond_resched();
}
-
+out:
memcg_wb_domain_size_changed(memcg);
return nbytes;
}
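
Both memory.high and memory.max now honor O_NONBLOCK: the new limit is installed, but the synchronous reclaim loop that normally runs before the write returns is skipped. A hedged userspace sketch of the non-blocking variant; the cgroup path and value are made up:

/* Illustrative only: set a memory limit without waiting for reclaim. */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int set_limit_nonblock(const char *path, const char *bytes)
{
	int fd = open(path, O_WRONLY | O_NONBLOCK);

	if (fd < 0)
		return -1;
	if (write(fd, bytes, strlen(bytes)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

/* e.g. set_limit_nonblock("/sys/fs/cgroup/test/memory.high", "1G"); */
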
@@ -4393,78 +4565,15 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
return nbytes;
}
-enum {
- MEMORY_RECLAIM_SWAPPINESS = 0,
- MEMORY_RECLAIM_NULL,
-};
-
-static const match_table_t tokens = {
- { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"},
- { MEMORY_RECLAIM_NULL, NULL },
-};
-
static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- unsigned int nr_retries = MAX_RECLAIM_RETRIES;
- unsigned long nr_to_reclaim, nr_reclaimed = 0;
- int swappiness = -1;
- unsigned int reclaim_options;
- char *old_buf, *start;
- substring_t args[MAX_OPT_ARGS];
-
- buf = strstrip(buf);
-
- old_buf = buf;
- nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE;
- if (buf == old_buf)
- return -EINVAL;
-
- buf = strstrip(buf);
-
- while ((start = strsep(&buf, " ")) != NULL) {
- if (!strlen(start))
- continue;
- switch (match_token(start, tokens, args)) {
- case MEMORY_RECLAIM_SWAPPINESS:
- if (match_int(&args[0], &swappiness))
- return -EINVAL;
- if (swappiness < MIN_SWAPPINESS || swappiness > MAX_SWAPPINESS)
- return -EINVAL;
- break;
- default:
- return -EINVAL;
- }
- }
-
- reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
- while (nr_reclaimed < nr_to_reclaim) {
- /* Will converge on zero, but reclaim enforces a minimum */
- unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4;
- unsigned long reclaimed;
-
- if (signal_pending(current))
- return -EINTR;
-
- /*
- * This is the final attempt, drain percpu lru caches in the
- * hope of introducing more evictable pages for
- * try_to_free_mem_cgroup_pages().
- */
- if (!nr_retries)
- lru_add_drain_all();
-
- reclaimed = try_to_free_mem_cgroup_pages(memcg,
- batch_size, GFP_KERNEL,
- reclaim_options,
- swappiness == -1 ? NULL : &swappiness);
-
- if (!reclaimed && !nr_retries--)
- return -EAGAIN;
+ int ret;
- nr_reclaimed += reclaimed;
- }
+ ret = user_proactive_reclaim(buf, memcg, NULL);
+ if (ret)
+ return ret;
return nbytes;
}
@@ -4697,9 +4806,7 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug)
static void uncharge_batch(const struct uncharge_gather *ug)
{
if (ug->nr_memory) {
- page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
- if (do_memsw_account())
- page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
+ memcg_uncharge(ug->memcg, ug->nr_memory);
if (ug->nr_kmem) {
mod_memcg_state(ug->memcg, MEMCG_KMEM, -ug->nr_kmem);
memcg1_account_kmem(ug->memcg, -ug->nr_kmem);
@@ -4912,22 +5019,42 @@ out:
void mem_cgroup_sk_free(struct sock *sk)
{
- if (sk->sk_memcg)
- css_put(&sk->sk_memcg->css);
+ struct mem_cgroup *memcg = mem_cgroup_from_sk(sk);
+
+ if (memcg)
+ css_put(&memcg->css);
+}
+
+void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk)
+{
+ struct mem_cgroup *memcg;
+
+ if (sk->sk_memcg == newsk->sk_memcg)
+ return;
+
+ mem_cgroup_sk_free(newsk);
+
+ memcg = mem_cgroup_from_sk(sk);
+ if (memcg)
+ css_get(&memcg->css);
+
+ newsk->sk_memcg = sk->sk_memcg;
}
/**
- * mem_cgroup_charge_skmem - charge socket memory
- * @memcg: memcg to charge
+ * mem_cgroup_sk_charge - charge socket memory
+ * @sk: socket in memcg to charge
* @nr_pages: number of pages to charge
* @gfp_mask: reclaim mode
*
* Charges @nr_pages to @memcg. Returns %true if the charge fit within
* @memcg's configured limit, %false if it doesn't.
*/
-bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
- gfp_t gfp_mask)
+bool mem_cgroup_sk_charge(const struct sock *sk, unsigned int nr_pages,
+ gfp_t gfp_mask)
{
+ struct mem_cgroup *memcg = mem_cgroup_from_sk(sk);
+
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return memcg1_charge_skmem(memcg, nr_pages, gfp_mask);
@@ -4940,12 +5067,14 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
}
/**
- * mem_cgroup_uncharge_skmem - uncharge socket memory
- * @memcg: memcg to uncharge
+ * mem_cgroup_sk_uncharge - uncharge socket memory
+ * @sk: socket in memcg to uncharge
* @nr_pages: number of pages to uncharge
*/
-void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
+void mem_cgroup_sk_uncharge(const struct sock *sk, unsigned int nr_pages)
{
+ struct mem_cgroup *memcg = mem_cgroup_from_sk(sk);
+
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
memcg1_uncharge_skmem(memcg, nr_pages);
return;
@@ -4975,15 +5104,16 @@ static int __init cgroup_memory(char *s)
__setup("cgroup.memory=", cgroup_memory);
/*
- * subsys_initcall() for memory controller.
+ * Memory controller init before cgroup_init() initialize root_mem_cgroup.
*
* Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
* context because of lock dependencies (cgroup_lock -> cpu hotplug) but
* basically everything that doesn't depend on a specific mem_cgroup structure
* should be initialized from here.
*/
-static int __init mem_cgroup_init(void)
+int __init mem_cgroup_init(void)
{
+ unsigned int memcg_size;
int cpu;
/*
@@ -4997,13 +5127,22 @@ static int __init mem_cgroup_init(void)
cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
memcg_hotplug_cpu_dead);
- for_each_possible_cpu(cpu)
+ for_each_possible_cpu(cpu) {
INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
- drain_local_stock);
+ drain_local_memcg_stock);
+ INIT_WORK(&per_cpu_ptr(&obj_stock, cpu)->work,
+ drain_local_obj_stock);
+ }
+
+ memcg_size = struct_size_t(struct mem_cgroup, nodeinfo, nr_node_ids);
+ memcg_cachep = kmem_cache_create("mem_cgroup", memcg_size, 0,
+ SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL);
+
+ memcg_pn_cachep = KMEM_CACHE(mem_cgroup_per_node,
+ SLAB_PANIC | SLAB_HWCACHE_ALIGN);
return 0;
}
-subsys_initcall(mem_cgroup_init);
#ifdef CONFIG_SWAP
/**
@@ -5457,3 +5596,8 @@ static int __init mem_cgroup_swap_init(void)
subsys_initcall(mem_cgroup_swap_init);
#endif /* CONFIG_SWAP */
+
+bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
+{
+ return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true;
+}
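The init hunk above sizes the new mem_cgroup slab with struct_size_t(), so the flexible nodeinfo[] array at the tail of the structure gets one slot per possible node. As a standalone illustration only (hypothetical struct, and without the saturating overflow checks that struct_size_t() from <linux/overflow.h> performs), the arithmetic it stands for is:

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for a structure ending in a flexible array. */
struct demo_cgroup {
	long state[4];
	void *nodeinfo[];	/* one pointer per node, allocated inline */
};

int main(void)
{
	size_t nr_node_ids = 8;	/* assumed node count for the sketch */

	/* struct_size_t(struct demo_cgroup, nodeinfo, nr_node_ids) boils down to: */
	size_t obj_size = sizeof(struct demo_cgroup) +
			  nr_node_ids * sizeof(((struct demo_cgroup *)0)->nodeinfo[0]);

	printf("slab object size: %zu bytes\n", obj_size);
	return EXIT_SUCCESS;
}

Creating the cache with this size means every mem_cgroup allocated from it already embeds the per-node pointer array, while the per-node structures themselves still come from the separate memcg_pn_cachep cache.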
diff --git a/mm/memfd.c b/mm/memfd.c
index c64df1343059..1d109c1acf21 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -20,6 +20,7 @@
#include <linux/memfd.h>
#include <linux/pid_namespace.h>
#include <uapi/linux/memfd.h>
+#include "swap.h"
/*
* We need a tag: a new tag would expand every xa_node by 8 bytes,
@@ -31,8 +32,7 @@
static bool memfd_folio_has_extra_refs(struct folio *folio)
{
- return folio_ref_count(folio) - folio_mapcount(folio) !=
- folio_nr_pages(folio);
+ return folio_ref_count(folio) != folio_expected_ref_count(folio);
}
static void memfd_tag_pins(struct xa_state *xas)
@@ -70,7 +70,6 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
#ifdef CONFIG_HUGETLB_PAGE
struct folio *folio;
gfp_t gfp_mask;
- int err;
if (is_file_hugepages(memfd)) {
/*
@@ -79,12 +78,19 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
* alloc from. Also, the folio will be pinned for an indefinite
* amount of time, so it is not expected to be migrated away.
*/
+ struct inode *inode = file_inode(memfd);
struct hstate *h = hstate_file(memfd);
+ int err = -ENOMEM;
+ long nr_resv;
gfp_mask = htlb_alloc_mask(h);
gfp_mask &= ~(__GFP_HIGHMEM | __GFP_MOVABLE);
idx >>= huge_page_order(h);
+ nr_resv = hugetlb_reserve_pages(inode, idx, idx + 1, NULL, 0);
+ if (nr_resv < 0)
+ return ERR_PTR(nr_resv);
+
folio = alloc_hugetlb_folio_reserve(h,
numa_node_id(),
NULL,
@@ -95,12 +101,17 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
idx);
if (err) {
folio_put(folio);
- return ERR_PTR(err);
+ goto err_unresv;
}
+
+ hugetlb_set_folio_subpool(folio, subpool_inode(inode));
folio_unlock(folio);
return folio;
}
- return ERR_PTR(-ENOMEM);
+err_unresv:
+ if (nr_resv > 0)
+ hugetlb_unreserve_pages(inode, idx, idx + 1, 0);
+ return ERR_PTR(err);
}
#endif
return shmem_read_folio(memfd->f_mapping, idx);
@@ -332,10 +343,10 @@ static inline bool is_write_sealed(unsigned int seals)
return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE);
}
-static int check_write_seal(unsigned long *vm_flags_ptr)
+static int check_write_seal(vm_flags_t *vm_flags_ptr)
{
- unsigned long vm_flags = *vm_flags_ptr;
- unsigned long mask = vm_flags & (VM_SHARED | VM_WRITE);
+ vm_flags_t vm_flags = *vm_flags_ptr;
+ vm_flags_t mask = vm_flags & (VM_SHARED | VM_WRITE);
/* If a private mapping then writability is irrelevant. */
if (!(mask & VM_SHARED))
@@ -357,7 +368,7 @@ static int check_write_seal(unsigned long *vm_flags_ptr)
return 0;
}
-int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr)
+int memfd_check_seals_mmap(struct file *file, vm_flags_t *vm_flags_ptr)
{
int err = 0;
unsigned int *seals_ptr = memfd_file_seals_ptr(file);
@@ -374,11 +385,11 @@ static int sanitize_flags(unsigned int *flags_ptr)
unsigned int flags = *flags_ptr;
if (!(flags & MFD_HUGETLB)) {
- if (flags & ~(unsigned int)MFD_ALL_FLAGS)
+ if (flags & ~MFD_ALL_FLAGS)
return -EINVAL;
} else {
/* Allow huge page size encoding in flags. */
- if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
+ if (flags & ~(MFD_ALL_FLAGS |
(MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
return -EINVAL;
}
@@ -400,7 +411,7 @@ static char *alloc_name(const char __user *uname)
if (!name)
return ERR_PTR(-ENOMEM);
- strcpy(name, MFD_NAME_PREFIX);
+ memcpy(name, MFD_NAME_PREFIX, MFD_NAME_PREFIX_LEN);
/* returned length does not include terminating zero */
len = strncpy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, MFD_NAME_MAX_LEN + 1);
if (len < 0) {
@@ -474,22 +485,22 @@ SYSCALL_DEFINE2(memfd_create,
fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
if (fd < 0) {
error = fd;
- goto err_name;
+ goto err_free_name;
}
file = alloc_file(name, flags);
if (IS_ERR(file)) {
error = PTR_ERR(file);
- goto err_fd;
+ goto err_free_fd;
}
fd_install(fd, file);
kfree(name);
return fd;
-err_fd:
+err_free_fd:
put_unused_fd(fd);
-err_name:
+err_free_name:
kfree(name);
return error;
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index b91a33fb6c69..3edebb0cda30 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -212,106 +212,34 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo
return true;
}
-#if IS_ENABLED(CONFIG_HWPOISON_INJECT)
+static hwpoison_filter_func_t __rcu *hwpoison_filter_func __read_mostly;
-u32 hwpoison_filter_enable = 0;
-u32 hwpoison_filter_dev_major = ~0U;
-u32 hwpoison_filter_dev_minor = ~0U;
-u64 hwpoison_filter_flags_mask;
-u64 hwpoison_filter_flags_value;
-EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
-EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
-EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
-EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
-EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
-
-static int hwpoison_filter_dev(struct page *p)
+void hwpoison_filter_register(hwpoison_filter_func_t *filter)
{
- struct folio *folio = page_folio(p);
- struct address_space *mapping;
- dev_t dev;
-
- if (hwpoison_filter_dev_major == ~0U &&
- hwpoison_filter_dev_minor == ~0U)
- return 0;
-
- mapping = folio_mapping(folio);
- if (mapping == NULL || mapping->host == NULL)
- return -EINVAL;
-
- dev = mapping->host->i_sb->s_dev;
- if (hwpoison_filter_dev_major != ~0U &&
- hwpoison_filter_dev_major != MAJOR(dev))
- return -EINVAL;
- if (hwpoison_filter_dev_minor != ~0U &&
- hwpoison_filter_dev_minor != MINOR(dev))
- return -EINVAL;
-
- return 0;
+ rcu_assign_pointer(hwpoison_filter_func, filter);
}
+EXPORT_SYMBOL_GPL(hwpoison_filter_register);
-static int hwpoison_filter_flags(struct page *p)
+void hwpoison_filter_unregister(void)
{
- if (!hwpoison_filter_flags_mask)
- return 0;
-
- if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
- hwpoison_filter_flags_value)
- return 0;
- else
- return -EINVAL;
+ RCU_INIT_POINTER(hwpoison_filter_func, NULL);
+ synchronize_rcu();
}
+EXPORT_SYMBOL_GPL(hwpoison_filter_unregister);
-/*
- * This allows stress tests to limit test scope to a collection of tasks
- * by putting them under some memcg. This prevents killing unrelated/important
- * processes such as /sbin/init. Note that the target task may share clean
- * pages with init (eg. libc text), which is harmless. If the target task
- * share _dirty_ pages with another task B, the test scheme must make sure B
- * is also included in the memcg. At last, due to race conditions this filter
- * can only guarantee that the page either belongs to the memcg tasks, or is
- * a freed page.
- */
-#ifdef CONFIG_MEMCG
-u64 hwpoison_filter_memcg;
-EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
-static int hwpoison_filter_task(struct page *p)
+static int hwpoison_filter(struct page *p)
{
- if (!hwpoison_filter_memcg)
- return 0;
-
- if (page_cgroup_ino(p) != hwpoison_filter_memcg)
- return -EINVAL;
-
- return 0;
-}
-#else
-static int hwpoison_filter_task(struct page *p) { return 0; }
-#endif
-
-int hwpoison_filter(struct page *p)
-{
- if (!hwpoison_filter_enable)
- return 0;
-
- if (hwpoison_filter_dev(p))
- return -EINVAL;
-
- if (hwpoison_filter_flags(p))
- return -EINVAL;
+ int ret = 0;
+ hwpoison_filter_func_t *filter;
- if (hwpoison_filter_task(p))
- return -EINVAL;
+ rcu_read_lock();
+ filter = rcu_dereference(hwpoison_filter_func);
+ if (filter)
+ ret = filter(p);
+ rcu_read_unlock();
- return 0;
-}
-EXPORT_SYMBOL_GPL(hwpoison_filter);
-#else
-int hwpoison_filter(struct page *p)
-{
- return 0;
+ return ret;
}
-#endif
/*
* Kill all processes that have a poisoned page mapped and then isolate
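The rework above replaces the exported hwpoison_filter_* tunables with one RCU-protected callback that mm/hwpoison-inject.c can plug in. A minimal sketch of a module using the new hooks; this is hedged, since the exact header exporting hwpoison_filter_register()/hwpoison_filter_unregister() and hwpoison_filter_func_t is assumed here:

#include <linux/module.h>
#include <linux/mm.h>
/* hwpoison_filter_register()/unregister() declarations assumed visible here. */

/* Filter everything out: useful only for exercising the filtered path. */
static int demo_hwpoison_filter(struct page *p)
{
	return -EINVAL;
}

static int __init demo_filter_init(void)
{
	hwpoison_filter_register(demo_hwpoison_filter);
	return 0;
}

static void __exit demo_filter_exit(void)
{
	/* Unregistering waits for in-flight hwpoison_filter() callers via synchronize_rcu(). */
	hwpoison_filter_unregister();
}

module_init(demo_filter_init);
module_exit(demo_filter_exit);
MODULE_DESCRIPTION("hwpoison filter registration sketch");
MODULE_LICENSE("GPL");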
@@ -837,19 +765,33 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
struct mm_walk *walk)
{
struct hwpoison_walk *hwp = walk->private;
- pte_t pte = huge_ptep_get(walk->mm, addr, ptep);
struct hstate *h = hstate_vma(walk->vma);
+ spinlock_t *ptl;
+ pte_t pte;
+ int ret;
- return check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
- hwp->pfn, &hwp->tk);
+ ptl = huge_pte_lock(h, walk->mm, ptep);
+ pte = huge_ptep_get(walk->mm, addr, ptep);
+ ret = check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
+ hwp->pfn, &hwp->tk);
+ spin_unlock(ptl);
+ return ret;
}
#else
#define hwpoison_hugetlb_range NULL
#endif
+static int hwpoison_test_walk(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ /* We also want to consider pages mapped into VM_PFNMAP. */
+ return 0;
+}
+
static const struct mm_walk_ops hwpoison_walk_ops = {
.pmd_entry = hwpoison_pte_range,
.hugetlb_entry = hwpoison_hugetlb_range,
+ .test_walk = hwpoison_test_walk,
.walk_lock = PGWALK_RDLOCK,
};
@@ -942,7 +884,7 @@ static const char * const action_page_types[] = {
[MF_MSG_BUDDY] = "free buddy page",
[MF_MSG_DAX] = "dax page",
[MF_MSG_UNSPLIT_THP] = "unsplit thp",
- [MF_MSG_ALREADY_POISONED] = "already poisoned",
+ [MF_MSG_ALREADY_POISONED] = "already poisoned page",
[MF_MSG_UNKNOWN] = "unknown page",
};
@@ -1185,7 +1127,7 @@ static int me_swapcache_clean(struct page_state *ps, struct page *p)
struct folio *folio = page_folio(p);
int ret;
- delete_from_swap_cache(folio);
+ swap_cache_del_folio(folio);
ret = delete_from_lru_cache(folio) ? MF_FAILED : MF_RECOVERED;
folio_unlock(folio);
@@ -1335,9 +1277,10 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type,
{
trace_memory_failure_event(pfn, type, result);
- num_poisoned_pages_inc(pfn);
-
- update_per_node_mf_stats(pfn, result);
+ if (type != MF_MSG_ALREADY_POISONED) {
+ num_poisoned_pages_inc(pfn);
+ update_per_node_mf_stats(pfn, result);
+ }
pr_err("%#lx: recovery action for %s: %s\n",
pfn, action_page_types[type], action_name[result]);
@@ -1388,8 +1331,8 @@ static inline bool HWPoisonHandlable(struct page *page, unsigned long flags)
if (PageSlab(page))
return false;
- /* Soft offline could migrate non-LRU movable pages */
- if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page))
+ /* Soft offline could migrate movable_ops pages */
+ if ((flags & MF_SOFT_OFFLINE) && page_has_movable_ops(page))
return true;
return PageLRU(page) || is_free_buddy_page(page);
@@ -1561,6 +1504,10 @@ static int get_hwpoison_page(struct page *p, unsigned long flags)
return ret;
}
+/*
+ * The caller must guarantee the folio isn't a large folio, except for hugetlb.
+ * try_to_unmap() can't handle large folios.
+ */
int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill)
{
enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
@@ -1689,10 +1636,10 @@ static int identify_page_state(unsigned long pfn, struct page *p,
* carried out only if the first check can't determine the page status.
*/
for (ps = error_states;; ps++)
- if ((p->flags & ps->mask) == ps->res)
+ if ((p->flags.f & ps->mask) == ps->res)
break;
- page_flags |= (p->flags & (1UL << PG_dirty));
+ page_flags |= (p->flags.f & (1UL << PG_dirty));
if (!ps->mask)
for (ps = error_states;; ps++)
@@ -2076,12 +2023,11 @@ retry:
*hugetlb = 0;
return 0;
} else if (res == -EHWPOISON) {
- pr_err("%#lx: already hardware poisoned\n", pfn);
if (flags & MF_ACTION_REQUIRED) {
folio = page_folio(p);
res = kill_accessing_process(current, folio_pfn(folio), flags);
- action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
}
+ action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
return res;
} else if (res == -EBUSY) {
if (!(flags & MF_NO_RETRY)) {
@@ -2119,7 +2065,7 @@ retry:
return action_result(pfn, MF_MSG_FREE_HUGE, res);
}
- page_flags = folio->flags;
+ page_flags = folio->flags.f;
if (!hwpoison_user_mappings(folio, p, pfn, flags)) {
folio_unlock(folio);
@@ -2248,7 +2194,7 @@ int memory_failure(unsigned long pfn, int flags)
goto unlock_mutex;
if (pfn_valid(pfn)) {
- pgmap = get_dev_pagemap(pfn, NULL);
+ pgmap = get_dev_pagemap(pfn);
put_ref_page(pfn, flags);
if (pgmap) {
res = memory_failure_dev_pagemap(pfn, flags,
@@ -2267,7 +2213,6 @@ try_again:
goto unlock_mutex;
if (TestSetPageHWPoison(p)) {
- pr_err("%#lx: already hardware poisoned\n", pfn);
res = -EHWPOISON;
if (flags & MF_ACTION_REQUIRED)
res = kill_accessing_process(current, pfn, flags);
@@ -2380,7 +2325,7 @@ try_again:
* folio_remove_rmap_*() in try_to_unmap_one(). So to determine page
* status correctly, we save a copy of the page flags at this time.
*/
- page_flags = folio->flags;
+ page_flags = folio->flags.f;
/*
* __munlock_folio() may clear a writeback folio's LRU flag without
@@ -2503,19 +2448,6 @@ static void memory_failure_work_func(struct work_struct *work)
}
}
-/*
- * Process memory_failure work queued on the specified CPU.
- * Used to avoid return-to-userspace racing with the memory_failure workqueue.
- */
-void memory_failure_queue_kick(int cpu)
-{
- struct memory_failure_cpu *mf_cpu;
-
- mf_cpu = &per_cpu(memory_failure_cpu, cpu);
- cancel_work_sync(&mf_cpu->work);
- memory_failure_work_func(&mf_cpu->work);
-}
-
static int __init memory_failure_init(void)
{
struct memory_failure_cpu *mf_cpu;
@@ -2564,10 +2496,9 @@ int unpoison_memory(unsigned long pfn)
static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
- if (!pfn_valid(pfn))
- return -ENXIO;
-
- p = pfn_to_page(pfn);
+ p = pfn_to_online_page(pfn);
+ if (!p)
+ return -EIO;
folio = page_folio(p);
mutex_lock(&mf_mutex);
@@ -2739,13 +2670,13 @@ static int soft_offline_in_use_page(struct page *page)
putback_movable_pages(&pagelist);
pr_info("%#lx: %s migration failed %ld, type %pGp\n",
- pfn, msg_page[huge], ret, &page->flags);
+ pfn, msg_page[huge], ret, &page->flags.f);
if (ret > 0)
ret = -EBUSY;
}
} else {
pr_info("%#lx: %s isolation failed, page count %d, type %pGp\n",
- pfn, msg_page[huge], page_count(page), &page->flags);
+ pfn, msg_page[huge], page_count(page), &page->flags.f);
ret = -EBUSY;
}
return ret;
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index fc14fe53e9b7..0ea5c13f10a2 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -872,25 +872,18 @@ static int __meminit memtier_hotplug_callback(struct notifier_block *self,
unsigned long action, void *_arg)
{
struct memory_tier *memtier;
- struct memory_notify *arg = _arg;
-
- /*
- * Only update the node migration order when a node is
- * changing status, like online->offline.
- */
- if (arg->status_change_nid < 0)
- return notifier_from_errno(0);
+ struct node_notify *nn = _arg;
switch (action) {
- case MEM_OFFLINE:
+ case NODE_REMOVED_LAST_MEMORY:
mutex_lock(&memory_tier_lock);
- if (clear_node_memory_tier(arg->status_change_nid))
+ if (clear_node_memory_tier(nn->nid))
establish_demotion_targets();
mutex_unlock(&memory_tier_lock);
break;
- case MEM_ONLINE:
+ case NODE_ADDED_FIRST_MEMORY:
mutex_lock(&memory_tier_lock);
- memtier = set_node_memory_tier(arg->status_change_nid);
+ memtier = set_node_memory_tier(nn->nid);
if (!IS_ERR(memtier))
establish_demotion_targets();
mutex_unlock(&memory_tier_lock);
@@ -929,7 +922,7 @@ static int __init memory_tier_init(void)
nodes_and(default_dram_nodes, node_states[N_MEMORY],
node_states[N_CPU]);
- hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
+ hotplug_node_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
return 0;
}
subsys_initcall(memory_tier_init);
@@ -949,11 +942,23 @@ static ssize_t demotion_enabled_store(struct kobject *kobj,
const char *buf, size_t count)
{
ssize_t ret;
+ bool before = numa_demotion_enabled;
ret = kstrtobool(buf, &numa_demotion_enabled);
if (ret)
return ret;
+ /*
+ * Reset kswapd_failures statistics. They may no longer be
+ * valid since the policy for kswapd has changed.
+ */
+ if (before == false && numa_demotion_enabled == true) {
+ struct pglist_data *pgdat;
+
+ for_each_online_pgdat(pgdat)
+ atomic_set(&pgdat->kswapd_failures, 0);
+ }
+
return count;
}
diff --git a/mm/memory.c b/mm/memory.c
index 49199410805c..74b45e258323 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -57,7 +57,6 @@
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
-#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
@@ -125,6 +124,24 @@ int randomize_va_space __read_mostly =
2;
#endif
+static const struct ctl_table mmu_sysctl_table[] = {
+ {
+ .procname = "randomize_va_space",
+ .data = &randomize_va_space,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+};
+
+static int __init init_mm_sysctl(void)
+{
+ register_sysctl_init("kernel", mmu_sysctl_table);
+ return 0;
+}
+
+subsys_initcall(init_mm_sysctl);
+
#ifndef arch_wants_old_prefaulted_pte
static inline bool arch_wants_old_prefaulted_pte(void)
{
@@ -278,8 +295,17 @@ static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
p4d_free_tlb(tlb, p4d, start);
}
-/*
- * This function frees user-level page tables of a process.
+/**
+ * free_pgd_range - Unmap and free page tables in the range
+ * @tlb: the mmu_gather containing pending TLB flush info
+ * @addr: virtual address start
+ * @end: virtual address end
+ * @floor: lowest address boundary
+ * @ceiling: highest address boundary
+ *
+ * This function tears down all user-level page tables in the
+ * specified virtual address range [@addr..@end). It is part of
+ * the memory unmap flow.
*/
void free_pgd_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
@@ -349,6 +375,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
{
struct unlink_vma_file_batch vb;
+ tlb_free_vmas(tlb);
+
do {
unsigned long addr = vma->vm_start;
struct vm_area_struct *next;
@@ -369,32 +397,26 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
vma_start_write(vma);
unlink_anon_vmas(vma);
- if (is_vm_hugetlb_page(vma)) {
- unlink_file_vma(vma);
- hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
- floor, next ? next->vm_start : ceiling);
- } else {
- unlink_file_vma_batch_init(&vb);
- unlink_file_vma_batch_add(&vb, vma);
+ unlink_file_vma_batch_init(&vb);
+ unlink_file_vma_batch_add(&vb, vma);
- /*
- * Optimization: gather nearby vmas into one call down
- */
- while (next && next->vm_start <= vma->vm_end + PMD_SIZE
- && !is_vm_hugetlb_page(next)) {
- vma = next;
- next = mas_find(mas, ceiling - 1);
- if (unlikely(xa_is_zero(next)))
- next = NULL;
- if (mm_wr_locked)
- vma_start_write(vma);
- unlink_anon_vmas(vma);
- unlink_file_vma_batch_add(&vb, vma);
- }
- unlink_file_vma_batch_final(&vb);
- free_pgd_range(tlb, addr, vma->vm_end,
- floor, next ? next->vm_start : ceiling);
+ /*
+ * Optimization: gather nearby vmas into one call down
+ */
+ while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
+ vma = next;
+ next = mas_find(mas, ceiling - 1);
+ if (unlikely(xa_is_zero(next)))
+ next = NULL;
+ if (mm_wr_locked)
+ vma_start_write(vma);
+ unlink_anon_vmas(vma);
+ unlink_file_vma_batch_add(&vb, vma);
}
+ unlink_file_vma_batch_final(&vb);
+
+ free_pgd_range(tlb, addr, vma->vm_end,
+ floor, next ? next->vm_start : ceiling);
vma = next;
} while (vma);
}
@@ -469,22 +491,8 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
add_mm_counter(mm, i, rss[i]);
}
-/*
- * This function is called to print an error when a bad pte
- * is found. For example, we might have a PFN-mapped pte in
- * a region that doesn't allow it.
- *
- * The calling function must still handle the error.
- */
-static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
- pte_t pte, struct page *page)
+static bool is_bad_page_map_ratelimited(void)
{
- pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
- p4d_t *p4d = p4d_offset(pgd, addr);
- pud_t *pud = pud_offset(p4d, addr);
- pmd_t *pmd = pmd_offset(pud, addr);
- struct address_space *mapping;
- pgoff_t index;
static unsigned long resume;
static unsigned long nr_shown;
static unsigned long nr_unshown;
@@ -496,7 +504,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
if (nr_shown == 60) {
if (time_before(jiffies, resume)) {
nr_unshown++;
- return;
+ return true;
}
if (nr_unshown) {
pr_alert("BUG: Bad page map: %lu messages suppressed\n",
@@ -507,37 +515,135 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
}
if (nr_shown++ == 0)
resume = jiffies + 60 * HZ;
+ return false;
+}
+
+static void __print_bad_page_map_pgtable(struct mm_struct *mm, unsigned long addr)
+{
+ unsigned long long pgdv, p4dv, pudv, pmdv;
+ p4d_t p4d, *p4dp;
+ pud_t pud, *pudp;
+ pmd_t pmd, *pmdp;
+ pgd_t *pgdp;
+
+ /*
+ * Although this looks like a fully lockless pgtable walk, it is not:
+ * see locking requirements for print_bad_page_map().
+ */
+ pgdp = pgd_offset(mm, addr);
+ pgdv = pgd_val(*pgdp);
+
+ if (!pgd_present(*pgdp) || pgd_leaf(*pgdp)) {
+ pr_alert("pgd:%08llx\n", pgdv);
+ return;
+ }
+
+ p4dp = p4d_offset(pgdp, addr);
+ p4d = p4dp_get(p4dp);
+ p4dv = p4d_val(p4d);
+
+ if (!p4d_present(p4d) || p4d_leaf(p4d)) {
+ pr_alert("pgd:%08llx p4d:%08llx\n", pgdv, p4dv);
+ return;
+ }
+
+ pudp = pud_offset(p4dp, addr);
+ pud = pudp_get(pudp);
+ pudv = pud_val(pud);
+
+ if (!pud_present(pud) || pud_leaf(pud)) {
+ pr_alert("pgd:%08llx p4d:%08llx pud:%08llx\n", pgdv, p4dv, pudv);
+ return;
+ }
+
+ pmdp = pmd_offset(pudp, addr);
+ pmd = pmdp_get(pmdp);
+ pmdv = pmd_val(pmd);
+
+ /*
+ * Dumping the PTE would be nice, but it's tricky with CONFIG_HIGHPTE,
+ * because the table should already be mapped by the caller and
+ * doing another map would be bad. print_bad_page_map() should
+ * already take care of printing the PTE.
+ */
+ pr_alert("pgd:%08llx p4d:%08llx pud:%08llx pmd:%08llx\n", pgdv,
+ p4dv, pudv, pmdv);
+}
+
+/*
+ * This function is called to print an error when a bad page table entry (e.g.,
+ * corrupted page table entry) is found. For example, we might have a
+ * PFN-mapped pte in a region that doesn't allow it.
+ *
+ * The calling function must still handle the error.
+ *
+ * This function must be called during a proper page table walk, as it will
+ * re-walk the page table to dump information: the caller MUST prevent page
+ * table teardown (by holding mmap, vma or rmap lock) and MUST hold the leaf
+ * page table lock.
+ */
+static void print_bad_page_map(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long long entry, struct page *page,
+ enum pgtable_level level)
+{
+ struct address_space *mapping;
+ pgoff_t index;
+
+ if (is_bad_page_map_ratelimited())
+ return;
mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
index = linear_page_index(vma, addr);
- pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
- current->comm,
- (long long)pte_val(pte), (long long)pmd_val(*pmd));
+ pr_alert("BUG: Bad page map in process %s %s:%08llx", current->comm,
+ pgtable_level_to_str(level), entry);
+ __print_bad_page_map_pgtable(vma->vm_mm, addr);
if (page)
- dump_page(page, "bad pte");
+ dump_page(page, "bad page map");
pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
- pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n",
+ pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n",
vma->vm_file,
vma->vm_ops ? vma->vm_ops->fault : NULL,
vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
+ vma->vm_file ? vma->vm_file->f_op->mmap_prepare : NULL,
mapping ? mapping->a_ops->read_folio : NULL);
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
+#define print_bad_pte(vma, addr, pte, page) \
+ print_bad_page_map(vma, addr, pte_val(pte), page, PGTABLE_LEVEL_PTE)
-/*
- * vm_normal_page -- This function gets the "struct page" associated with a pte.
+/**
+ * __vm_normal_page() - Get the "struct page" associated with a page table entry.
+ * @vma: The VMA mapping the page table entry.
+ * @addr: The address where the page table entry is mapped.
+ * @pfn: The PFN stored in the page table entry.
+ * @special: Whether the page table entry is marked "special".
+ * @level: The page table level for error reporting purposes only.
+ * @entry: The page table entry value for error reporting purposes only.
*
* "Special" mappings do not wish to be associated with a "struct page" (either
* it doesn't exist, or it exists but they don't want to touch it). In this
- * case, NULL is returned here. "Normal" mappings do have a struct page.
+ * case, NULL is returned here. "Normal" mappings do have a struct page and
+ * are ordinarily refcounted.
+ *
+ * Page mappings of the shared zero folios are always considered "special", as
+ * they are not ordinarily refcounted: neither the refcount nor the mapcount
+ * of these folios is adjusted when mapping them into user page tables.
+ * Selected page table walkers (such as GUP) can still identify mappings of the
+ * shared zero folios and work with the underlying "struct page".
+ *
+ * There are 2 broad cases. Firstly, an architecture may define a "special"
+ * page table entry bit, such as pte_special(), in which case this function is
+ * trivial. Secondly, an architecture may not have a spare page table
+ * entry bit, which requires a more complicated scheme, described below.
*
- * There are 2 broad cases. Firstly, an architecture may define a pte_special()
- * pte bit, in which case this function is trivial. Secondly, an architecture
- * may not have a spare pte bit, which requires a more complicated scheme,
- * described below.
+ * With CONFIG_FIND_NORMAL_PAGE, we might have the "special" bit set on
+ * page table entries that actually map "normal" pages: however, that page
+ * cannot be looked up through the PFN stored in the page table entry, but
+ * instead will be looked up through vm_ops->find_normal_page(). So far, this
+ * only applies to PTEs.
*
* A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
* special mapping (even if there are underlying and valid "struct pages").
@@ -562,82 +668,104 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
*
* VM_MIXEDMAP mappings can likewise contain memory with or without "struct
* page" backing, however the difference is that _all_ pages with a struct
- * page (that is, those where pfn_valid is true) are refcounted and considered
- * normal pages by the VM. The only exception are zeropages, which are
- * *never* refcounted.
+ * page (that is, those where pfn_valid is true, except the shared zero
+ * folios) are refcounted and considered normal pages by the VM.
*
* The disadvantage is that pages are refcounted (which can be slower and
* simply not an option for some PFNMAP users). The advantage is that we
* don't have to follow the strict linearity rule of PFNMAP mappings in
* order to support COWable mappings.
*
+ * Return: Returns the "struct page" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
*/
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
- pte_t pte)
+static inline struct page *__vm_normal_page(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long pfn, bool special,
+ unsigned long long entry, enum pgtable_level level)
{
- unsigned long pfn = pte_pfn(pte);
-
if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
- if (likely(!pte_special(pte)))
- goto check_pfn;
- if (vma->vm_ops && vma->vm_ops->find_special_page)
- return vma->vm_ops->find_special_page(vma, addr);
- if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
- return NULL;
- if (is_zero_pfn(pfn))
+ if (unlikely(special)) {
+#ifdef CONFIG_FIND_NORMAL_PAGE
+ if (vma->vm_ops && vma->vm_ops->find_normal_page)
+ return vma->vm_ops->find_normal_page(vma, addr);
+#endif /* CONFIG_FIND_NORMAL_PAGE */
+ if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
+ return NULL;
+ if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn))
+ return NULL;
+
+ print_bad_page_map(vma, addr, entry, NULL, level);
return NULL;
- if (pte_devmap(pte))
+ }
/*
- * NOTE: New users of ZONE_DEVICE will not set pte_devmap()
- * and will have refcounts incremented on their struct pages
- * when they are inserted into PTEs, thus they are safe to
- * return here. Legacy ZONE_DEVICE pages that set pte_devmap()
- * do not have refcounts. Example of legacy ZONE_DEVICE is
- * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
+ * With CONFIG_ARCH_HAS_PTE_SPECIAL, any special page table
+ * mappings (incl. shared zero folios) are marked accordingly.
*/
- return NULL;
-
- print_bad_pte(vma, addr, pte, NULL);
- return NULL;
- }
-
- /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */
+ } else {
+ if (unlikely(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) {
+ if (vma->vm_flags & VM_MIXEDMAP) {
+ /* If it has a "struct page", it's "normal". */
+ if (!pfn_valid(pfn))
+ return NULL;
+ } else {
+ unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
- if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
- if (vma->vm_flags & VM_MIXEDMAP) {
- if (!pfn_valid(pfn))
- return NULL;
- if (is_zero_pfn(pfn))
- return NULL;
- goto out;
- } else {
- unsigned long off;
- off = (addr - vma->vm_start) >> PAGE_SHIFT;
- if (pfn == vma->vm_pgoff + off)
- return NULL;
- if (!is_cow_mapping(vma->vm_flags))
- return NULL;
+ /* Only CoW'ed anon folios are "normal". */
+ if (pfn == vma->vm_pgoff + off)
+ return NULL;
+ if (!is_cow_mapping(vma->vm_flags))
+ return NULL;
+ }
}
- }
- if (is_zero_pfn(pfn))
- return NULL;
+ if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn))
+ return NULL;
+ }
-check_pfn:
if (unlikely(pfn > highest_memmap_pfn)) {
- print_bad_pte(vma, addr, pte, NULL);
+ /* Corrupted page table entry. */
+ print_bad_page_map(vma, addr, entry, NULL, level);
return NULL;
}
-
/*
* NOTE! We still have PageReserved() pages in the page tables.
- * eg. VDSO mappings can cause them to exist.
+ * For example, VDSO mappings can cause them to exist.
*/
-out:
- VM_WARN_ON_ONCE(is_zero_pfn(pfn));
+ VM_WARN_ON_ONCE(is_zero_pfn(pfn) || is_huge_zero_pfn(pfn));
return pfn_to_page(pfn);
}
+/**
+ * vm_normal_page() - Get the "struct page" associated with a PTE
+ * @vma: The VMA mapping the @pte.
+ * @addr: The address where the @pte is mapped.
+ * @pte: The PTE.
+ *
+ * Get the "struct page" associated with a PTE. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct page" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
+ */
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte)
+{
+ return __vm_normal_page(vma, addr, pte_pfn(pte), pte_special(pte),
+ pte_val(pte), PGTABLE_LEVEL_PTE);
+}
+
+/**
+ * vm_normal_folio() - Get the "struct folio" associated with a PTE
+ * @vma: The VMA mapping the @pte.
+ * @addr: The address where the @pte is mapped.
+ * @pte: The PTE.
+ *
+ * Get the "struct folio" associated with a PTE. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct folio" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
+ */
struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
pte_t pte)
{
@@ -649,45 +777,37 @@ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
}
#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
+/**
+ * vm_normal_page_pmd() - Get the "struct page" associated with a PMD
+ * @vma: The VMA mapping the @pmd.
+ * @addr: The address where the @pmd is mapped.
+ * @pmd: The PMD.
+ *
+ * Get the "struct page" associated with a PTE. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct page" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
+ */
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd)
{
- unsigned long pfn = pmd_pfn(pmd);
-
- /* Currently it's only used for huge pfnmaps */
- if (unlikely(pmd_special(pmd)))
- return NULL;
-
- if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
- if (vma->vm_flags & VM_MIXEDMAP) {
- if (!pfn_valid(pfn))
- return NULL;
- goto out;
- } else {
- unsigned long off;
- off = (addr - vma->vm_start) >> PAGE_SHIFT;
- if (pfn == vma->vm_pgoff + off)
- return NULL;
- if (!is_cow_mapping(vma->vm_flags))
- return NULL;
- }
- }
-
- if (pmd_devmap(pmd))
- return NULL;
- if (is_huge_zero_pmd(pmd))
- return NULL;
- if (unlikely(pfn > highest_memmap_pfn))
- return NULL;
-
- /*
- * NOTE! We still have PageReserved() pages in the page tables.
- * eg. VDSO mappings can cause them to exist.
- */
-out:
- return pfn_to_page(pfn);
+ return __vm_normal_page(vma, addr, pmd_pfn(pmd), pmd_special(pmd),
+ pmd_val(pmd), PGTABLE_LEVEL_PMD);
}
+/**
+ * vm_normal_folio_pmd() - Get the "struct folio" associated with a PMD
+ * @vma: The VMA mapping the @pmd.
+ * @addr: The address where the @pmd is mapped.
+ * @pmd: The PMD.
+ *
+ * Get the "struct folio" associated with a PTE. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct folio" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
+ */
struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd)
{
@@ -697,6 +817,25 @@ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
return page_folio(page);
return NULL;
}
+
+/**
+ * vm_normal_page_pud() - Get the "struct page" associated with a PUD
+ * @vma: The VMA mapping the @pud.
+ * @addr: The address where the @pud is mapped.
+ * @pud: The PUD.
+ *
+ * Get the "struct page" associated with a PUD. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct page" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
+ */
+struct page *vm_normal_page_pud(struct vm_area_struct *vma,
+ unsigned long addr, pud_t pud)
+{
+ return __vm_normal_page(vma, addr, pud_pfn(pud), pud_special(pud),
+ pud_val(pud), PGTABLE_LEVEL_PUD);
+}
#endif
/**
@@ -785,7 +924,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
struct vm_area_struct *src_vma, unsigned long addr, int *rss)
{
- unsigned long vm_flags = dst_vma->vm_flags;
+ vm_flags_t vm_flags = dst_vma->vm_flags;
pte_t orig_pte = ptep_get(src_pte);
pte_t pte = orig_pte;
struct folio *folio;
@@ -929,7 +1068,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
rss[MM_ANONPAGES]++;
/* All done, just insert the new page copy in the child */
- pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot);
+ pte = folio_mk_pte(new_folio, dst_vma->vm_page_prot);
pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte)))
/* Uffd-wp needs to be delivered to dest pte as well */
@@ -973,10 +1112,9 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr,
int max_nr, int *rss, struct folio **prealloc)
{
+ fpb_t flags = FPB_MERGE_WRITE;
struct page *page;
struct folio *folio;
- bool any_writable;
- fpb_t flags = 0;
int err, nr;
page = vm_normal_page(src_vma, addr, pte);
@@ -991,13 +1129,12 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
* by keeping the batching logic separate.
*/
if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) {
- if (src_vma->vm_flags & VM_SHARED)
- flags |= FPB_IGNORE_DIRTY;
- if (!vma_soft_dirty_enabled(src_vma))
- flags |= FPB_IGNORE_SOFT_DIRTY;
+ if (!(src_vma->vm_flags & VM_SHARED))
+ flags |= FPB_RESPECT_DIRTY;
+ if (vma_soft_dirty_enabled(src_vma))
+ flags |= FPB_RESPECT_SOFT_DIRTY;
- nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags,
- &any_writable, NULL, NULL);
+ nr = folio_pte_batch_flags(folio, src_vma, src_pte, &pte, max_nr, flags);
folio_ref_add(folio, nr);
if (folio_test_anon(folio)) {
if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
@@ -1011,8 +1148,6 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
folio_dup_file_rmap_ptes(folio, page, nr, dst_vma);
rss[mm_counter_file(folio)] += nr;
}
- if (any_writable)
- pte = pte_mkwrite(pte, src_vma);
__copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte,
addr, nr);
return nr;
@@ -1238,8 +1373,7 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
src_pmd = pmd_offset(src_pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
- || pmd_devmap(*src_pmd)) {
+ if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)) {
int err;
VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
@@ -1275,7 +1409,7 @@ copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
src_pud = pud_offset(src_p4d, addr);
do {
next = pud_addr_end(addr, end);
- if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
+ if (pud_trans_huge(*src_pud)) {
int err;
VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
@@ -1361,7 +1495,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
struct mm_struct *dst_mm = dst_vma->vm_mm;
struct mm_struct *src_mm = src_vma->vm_mm;
struct mmu_notifier_range range;
- unsigned long next, pfn = 0;
+ unsigned long next;
bool is_cow;
int ret;
@@ -1371,12 +1505,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
if (is_vm_hugetlb_page(src_vma))
return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
- if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
- ret = track_pfn_copy(dst_vma, src_vma, &pfn);
- if (ret)
- return ret;
- }
-
/*
* We need to invalidate the secondary MMU mappings only when
* there could be a permission downgrade on the ptes of the
@@ -1418,8 +1546,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
raw_write_seqcount_end(&src_mm->write_protect_seq);
mmu_notifier_invalidate_range_end(&range);
}
- if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP))
- untrack_pfn_copy(dst_vma, pfn);
return ret;
}
@@ -1545,7 +1671,6 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
struct zap_details *details, int *rss, bool *force_flush,
bool *force_break, bool *any_skipped)
{
- const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
struct mm_struct *mm = tlb->mm;
struct folio *folio;
struct page *page;
@@ -1575,9 +1700,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
* by keeping the batching logic separate.
*/
if (unlikely(folio_test_large(folio) && max_nr != 1)) {
- nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags,
- NULL, NULL, NULL);
-
+ nr = folio_pte_batch(folio, pte, ptent, max_nr);
zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
addr, details, rss, force_flush,
force_break, any_skipped);
@@ -1797,9 +1920,9 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
+ if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
- __split_huge_pmd(vma, pmd, addr, false, NULL);
+ __split_huge_pmd(vma, pmd, addr, false);
else if (zap_huge_pmd(tlb, vma, pmd, addr)) {
addr = next;
continue;
@@ -1839,7 +1962,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
- if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
+ if (pud_trans_huge(*pud)) {
if (next - addr != HPAGE_PUD_SIZE) {
mmap_assert_locked(tlb->mm);
split_huge_pud(vma, pud, addr);
@@ -1914,9 +2037,6 @@ static void unmap_single_vma(struct mmu_gather *tlb,
if (vma->vm_file)
uprobe_munmap(vma, start, end);
- if (unlikely(vma->vm_flags & VM_PFNMAP))
- untrack_pfn(vma, 0, 0, mm_wr_locked);
-
if (start != end) {
if (unlikely(is_vm_hugetlb_page(vma))) {
/*
@@ -1990,35 +2110,64 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
}
/**
- * zap_page_range_single - remove user pages in a given range
+ * zap_page_range_single_batched - remove user pages in a given range
+ * @tlb: pointer to the caller's struct mmu_gather
* @vma: vm_area_struct holding the applicable pages
- * @address: starting address of pages to zap
- * @size: number of bytes to zap
+ * @address: starting address of pages to remove
+ * @size: number of bytes to remove
* @details: details of shared cache invalidation
*
- * The range must fit into one VMA.
+ * @tlb shouldn't be NULL. The range must fit into one VMA. If @vma is for
+ * hugetlb, @tlb is flushed and re-initialized by this function.
*/
-void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+void zap_page_range_single_batched(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details)
{
const unsigned long end = address + size;
struct mmu_notifier_range range;
- struct mmu_gather tlb;
+
+ VM_WARN_ON_ONCE(!tlb || tlb->mm != vma->vm_mm);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
address, end);
hugetlb_zap_begin(vma, &range.start, &range.end);
- tlb_gather_mmu(&tlb, vma->vm_mm);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
/*
* unmap 'address-end' not 'range.start-range.end' as range
* could have been expanded for hugetlb pmd sharing.
*/
- unmap_single_vma(&tlb, vma, address, end, details, false);
+ unmap_single_vma(tlb, vma, address, end, details, false);
mmu_notifier_invalidate_range_end(&range);
+ if (is_vm_hugetlb_page(vma)) {
+ /*
+ * flush tlb and free resources before hugetlb_zap_end(), to
+ * avoid concurrent page faults' allocation failure.
+ */
+ tlb_finish_mmu(tlb);
+ hugetlb_zap_end(vma, details);
+ tlb_gather_mmu(tlb, vma->vm_mm);
+ }
+}
+
+/**
+ * zap_page_range_single - remove user pages in a given range
+ * @vma: vm_area_struct holding the applicable pages
+ * @address: starting address of pages to zap
+ * @size: number of bytes to zap
+ * @details: details of shared cache invalidation
+ *
+ * The range must fit into one VMA.
+ */
+void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+ unsigned long size, struct zap_details *details)
+{
+ struct mmu_gather tlb;
+
+ tlb_gather_mmu(&tlb, vma->vm_mm);
+ zap_page_range_single_batched(&tlb, vma, address, size, details);
tlb_finish_mmu(&tlb);
- hugetlb_zap_end(vma, details);
}
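zap_page_range_single_batched() exists so a caller can reuse one mmu_gather across several single-VMA zaps instead of paying for TLB-batch setup and teardown on every call (the madvise side of this series is the intended user). A hedged sketch of that pattern; the VMA array, count, and length are illustrative, and the required mmap/VMA locking is omitted:

#include <linux/mm.h>
#include <asm/tlb.h>

/* Zap the first @len bytes of each VMA under a single TLB batch. */
static void demo_zap_vma_heads(struct mm_struct *mm,
			       struct vm_area_struct **vmas, int nr,
			       unsigned long len)
{
	struct mmu_gather tlb;
	int i;

	tlb_gather_mmu(&tlb, mm);
	for (i = 0; i < nr; i++) {
		/* @len is assumed to fit within each VMA, as the API requires. */
		zap_page_range_single_batched(&tlb, vmas[i],
					      vmas[i]->vm_start, len, NULL);
	}
	tlb_finish_mmu(&tlb);
}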
/**
@@ -2117,8 +2266,7 @@ static int validate_page_before_insert(struct vm_area_struct *vma,
return -EINVAL;
return 0;
}
- if (folio_test_anon(folio) || folio_test_slab(folio) ||
- page_has_type(page))
+ if (folio_test_anon(folio) || page_has_type(page))
return -EINVAL;
flush_dcache_folio(folio);
return 0;
@@ -2418,7 +2566,7 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
EXPORT_SYMBOL(vm_map_pages_zero);
static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn, pgprot_t prot, bool mkwrite)
+ unsigned long pfn, pgprot_t prot, bool mkwrite)
{
struct mm_struct *mm = vma->vm_mm;
pte_t *pte, entry;
@@ -2440,7 +2588,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
* allocation and mapping invalidation so just skip the
* update.
*/
- if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) {
+ if (pte_pfn(entry) != pfn) {
WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
goto out_unlock;
}
@@ -2453,10 +2601,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
}
/* Ok, finally just insert the thing.. */
- if (pfn_t_devmap(pfn))
- entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
- else
- entry = pte_mkspecial(pfn_t_pte(pfn, prot));
+ entry = pte_mkspecial(pfn_pte(pfn, prot));
if (mkwrite) {
entry = pte_mkyoung(entry);
@@ -2525,10 +2670,9 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
if (!pfn_modify_allowed(pfn, pgprot))
return VM_FAULT_SIGBUS;
- track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
+ pfnmap_setup_cachemode_pfn(pfn, &pgprot);
- return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
- false);
+ return insert_pfn(vma, addr, pfn, pgprot, false);
}
EXPORT_SYMBOL(vmf_insert_pfn_prot);
@@ -2559,25 +2703,22 @@ vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
}
EXPORT_SYMBOL(vmf_insert_pfn);
-static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn, bool mkwrite)
+static bool vm_mixed_ok(struct vm_area_struct *vma, unsigned long pfn,
+ bool mkwrite)
{
- if (unlikely(is_zero_pfn(pfn_t_to_pfn(pfn))) &&
+ if (unlikely(is_zero_pfn(pfn)) &&
(mkwrite || !vm_mixed_zeropage_allowed(vma)))
return false;
/* these checks mirror the abort conditions in vm_normal_page */
if (vma->vm_flags & VM_MIXEDMAP)
return true;
- if (pfn_t_devmap(pfn))
- return true;
- if (pfn_t_special(pfn))
- return true;
- if (is_zero_pfn(pfn_t_to_pfn(pfn)))
+ if (is_zero_pfn(pfn))
return true;
return false;
}
static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
- unsigned long addr, pfn_t pfn, bool mkwrite)
+ unsigned long addr, unsigned long pfn, bool mkwrite)
{
pgprot_t pgprot = vma->vm_page_prot;
int err;
@@ -2588,9 +2729,9 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
if (addr < vma->vm_start || addr >= vma->vm_end)
return VM_FAULT_SIGBUS;
- track_pfn_insert(vma, &pgprot, pfn);
+ pfnmap_setup_cachemode_pfn(pfn, &pgprot);
- if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
+ if (!pfn_modify_allowed(pfn, pgprot))
return VM_FAULT_SIGBUS;
/*
@@ -2600,8 +2741,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
* than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
* without pte special, it would there be refcounted as a normal page.
*/
- if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
- !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pfn_valid(pfn)) {
struct page *page;
/*
@@ -2609,7 +2749,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
* regardless of whether the caller specified flags that
* result in pfn_t_has_page() == false.
*/
- page = pfn_to_page(pfn_t_to_pfn(pfn));
+ page = pfn_to_page(pfn);
err = insert_page(vma, addr, page, pgprot, mkwrite);
} else {
return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
@@ -2644,7 +2784,7 @@ vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page,
EXPORT_SYMBOL_GPL(vmf_insert_page_mkwrite);
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn)
+ unsigned long pfn)
{
return __vm_insert_mixed(vma, addr, pfn, false);
}
@@ -2656,7 +2796,7 @@ EXPORT_SYMBOL(vmf_insert_mixed);
* the same entry was actually inserted.
*/
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
- unsigned long addr, pfn_t pfn)
+ unsigned long addr, unsigned long pfn)
{
return __vm_insert_mixed(vma, addr, pfn, true);
}
@@ -2833,6 +2973,36 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
return error;
}
+#ifdef __HAVE_PFNMAP_TRACKING
+static inline struct pfnmap_track_ctx *pfnmap_track_ctx_alloc(unsigned long pfn,
+ unsigned long size, pgprot_t *prot)
+{
+ struct pfnmap_track_ctx *ctx;
+
+ if (pfnmap_track(pfn, size, prot))
+ return ERR_PTR(-EINVAL);
+
+ ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+ if (unlikely(!ctx)) {
+ pfnmap_untrack(pfn, size);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ ctx->pfn = pfn;
+ ctx->size = size;
+ kref_init(&ctx->kref);
+ return ctx;
+}
+
+void pfnmap_track_ctx_release(struct kref *ref)
+{
+ struct pfnmap_track_ctx *ctx = container_of(ref, struct pfnmap_track_ctx, kref);
+
+ pfnmap_untrack(ctx->pfn, ctx->size);
+ kfree(ctx);
+}
+#endif /* __HAVE_PFNMAP_TRACKING */
+
/**
* remap_pfn_range - remap kernel memory to userspace
* @vma: user vma to map to
@@ -2845,20 +3015,51 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
*
* Return: %0 on success, negative error code otherwise.
*/
+#ifdef __HAVE_PFNMAP_TRACKING
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t prot)
{
+ struct pfnmap_track_ctx *ctx = NULL;
int err;
- err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
- if (err)
+ size = PAGE_ALIGN(size);
+
+ /*
+ * If we cover the full VMA, we'll perform actual tracking, and
+ * remember to untrack when the last reference to our tracking
+ * context from a VMA goes away. We'll keep tracking the whole pfn
+ * range even during VMA splits and partial unmapping.
+ *
+ * If we only cover parts of the VMA, we'll only setup the cachemode
+ * in the pgprot for the pfn range.
+ */
+ if (addr == vma->vm_start && addr + size == vma->vm_end) {
+ if (vma->pfnmap_track_ctx)
+ return -EINVAL;
+ ctx = pfnmap_track_ctx_alloc(pfn, size, &prot);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+ } else if (pfnmap_setup_cachemode(pfn, size, &prot)) {
return -EINVAL;
+ }
err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
- if (err)
- untrack_pfn(vma, pfn, PAGE_ALIGN(size), true);
+ if (ctx) {
+ if (err)
+ kref_put(&ctx->kref, pfnmap_track_ctx_release);
+ else
+ vma->pfnmap_track_ctx = ctx;
+ }
return err;
}
+
+#else
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+ return remap_pfn_range_notrack(vma, addr, pfn, size, prot);
+}
+#endif
EXPORT_SYMBOL(remap_pfn_range);
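The pfnmap_track_ctx above is kref-managed so that VMA splits and duplications can share a single tracking registration, with pfnmap_untrack() deferred to pfnmap_track_ctx_release() when the last VMA reference goes away. The same idiom in isolation, with hypothetical names (the VMA-side get/put lives outside this hunk):

#include <linux/kref.h>
#include <linux/slab.h>

struct demo_track_ctx {
	unsigned long pfn;
	unsigned long size;
	struct kref kref;
};

static void demo_track_ctx_release(struct kref *ref)
{
	struct demo_track_ctx *ctx = container_of(ref, struct demo_track_ctx, kref);

	/* The real release calls pfnmap_untrack(ctx->pfn, ctx->size) first. */
	kfree(ctx);
}

static struct demo_track_ctx *demo_track_ctx_alloc(unsigned long pfn,
						   unsigned long size)
{
	struct demo_track_ctx *ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);

	if (!ctx)
		return NULL;
	ctx->pfn = pfn;
	ctx->size = size;
	kref_init(&ctx->kref);	/* refcount starts at 1: the owning VMA */
	return ctx;
}

/* VMA duplication or split: the new VMA shares the tracking context. */
static void demo_track_ctx_share(struct demo_track_ctx *ctx)
{
	kref_get(&ctx->kref);
}

/* VMA teardown: untracking happens only when the last user drops out. */
static void demo_track_ctx_put(struct demo_track_ctx *ctx)
{
	kref_put(&ctx->kref, demo_track_ctx_release);
}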
/**
@@ -3523,7 +3724,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
inc_mm_counter(mm, MM_ANONPAGES);
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
- entry = mk_pte(&new_folio->page, vma->vm_page_prot);
+ entry = folio_mk_pte(new_folio, vma->vm_page_prot);
entry = pte_sw_mkyoung(entry);
if (unlikely(unshare)) {
if (pte_soft_dirty(vmf->orig_pte))
@@ -3730,7 +3931,7 @@ static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
* If all folio references are from mappings, and all mappings are in
* the page tables of this MM, then this folio is exclusive to this MM.
*/
- if (folio_test_large_maybe_mapped_shared(folio))
+ if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids))
return false;
VM_WARN_ON_ONCE(folio_test_ksm(folio));
@@ -3753,7 +3954,7 @@ static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
folio_lock_large_mapcount(folio);
VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_ref_count(folio), folio);
- if (folio_test_large_maybe_mapped_shared(folio))
+ if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids))
goto unlock;
if (folio_large_mapcount(folio) != folio_ref_count(folio))
goto unlock;
@@ -4224,26 +4425,6 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
-{
- struct swap_info_struct *si = swp_swap_info(entry);
- pgoff_t offset = swp_offset(entry);
- int i;
-
- /*
- * While allocating a large folio and doing swap_read_folio, which is
- * the case the being faulted pte doesn't have swapcache. We need to
- * ensure all PTEs have no cache as well, otherwise, we might go to
- * swap devices while the content is in swapcache.
- */
- for (i = 0; i < max_nr; i++) {
- if ((si->swap_map[offset + i] & SWAP_HAS_CACHE))
- return i;
- }
-
- return i;
-}
-
/*
* Check if the PTEs within a range are contiguous swap entries
* and have consistent swapcache, zeromap.
@@ -4333,8 +4514,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
* Get a list of all the (large) orders below PMD_ORDER that are enabled
* and suitable for swapping THP.
*/
- orders = thp_vma_allowable_orders(vma, vma->vm_flags,
- TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
+ orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT,
+ BIT(PMD_ORDER) - 1);
orders = thp_vma_suitable_orders(vma, vmf->address, orders);
orders = thp_swap_suitable_orders(swp_offset(entry),
vmf->address, orders);
@@ -4478,9 +4659,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (unlikely(!si))
goto out;
- folio = swap_cache_get_folio(entry, vma, vmf->address);
+ folio = swap_cache_get_folio(entry);
if (folio)
- page = folio_file_page(folio, swp_offset(entry));
+ swap_update_readahead(folio, vma, vmf->address);
swapcache = folio;
if (!folio) {
@@ -4517,7 +4698,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
memcg1_swapin(entry, nr_pages);
- shadow = get_shadow_from_swap_cache(entry);
+ shadow = swap_cache_get_shadow(entry);
if (shadow)
workingset_refault(folio, shadow);
@@ -4551,20 +4732,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
ret = VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
- page = folio_file_page(folio, swp_offset(entry));
- } else if (PageHWPoison(page)) {
- /*
- * hwpoisoned dirty swapcache pages are kept for killing
- * owner processes (which may be unknown at hwpoison time)
- */
- ret = VM_FAULT_HWPOISON;
- goto out_release;
}
ret |= folio_lock_or_retry(folio, vmf);
if (ret & VM_FAULT_RETRY)
goto out_release;
+ page = folio_file_page(folio, swp_offset(entry));
if (swapcache) {
/*
* Make sure folio_free_swap() or swapoff did not release the
@@ -4573,14 +4747,22 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* swapcache, we need to check that the page's swap has not
* changed.
*/
- if (unlikely(!folio_test_swapcache(folio) ||
- page_swap_entry(page).val != entry.val))
+ if (unlikely(!folio_matches_swap_entry(folio, entry)))
+ goto out_page;
+
+ if (unlikely(PageHWPoison(page))) {
+ /*
+ * hwpoisoned dirty swapcache pages are kept for killing
+ * owner processes (which may be unknown at hwpoison time)
+ */
+ ret = VM_FAULT_HWPOISON;
goto out_page;
+ }
/*
* KSM sometimes has to copy on read faults, for example, if
- * page->index of !PageKSM() pages would be nonlinear inside the
- * anon VMA -- PageKSM() is lost on actual swapout.
+ * folio->index of non-ksm folios would be nonlinear inside the
+ * anon VMA -- the ksm flag is lost on actual swapout.
*/
folio = ksm_might_need_to_copy(folio, vma, vmf->address);
if (unlikely(!folio)) {
@@ -4881,8 +5063,8 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
* for this vma. Then filter out the orders that can't be allocated over
* the faulting address and still be fully contained in the vma.
*/
- orders = thp_vma_allowable_orders(vma, vma->vm_flags,
- TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
+ orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT,
+ BIT(PMD_ORDER) - 1);
orders = thp_vma_suitable_orders(vma, vmf->address, orders);
if (!orders)
@@ -5013,7 +5195,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
*/
__folio_mark_uptodate(folio);
- entry = mk_pte(&folio->page, vma->vm_page_prot);
+ entry = folio_mk_pte(folio, vma->vm_page_prot);
entry = pte_sw_mkyoung(entry);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry), vma);
@@ -5138,9 +5320,8 @@ static void deposit_prealloc_pte(struct vm_fault *vmf)
vmf->prealloc_pte = NULL;
}
-vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page)
{
- struct folio *folio = page_folio(page);
struct vm_area_struct *vma = vmf->vma;
bool write = vmf->flags & FAULT_FLAG_WRITE;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
@@ -5151,9 +5332,11 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
* It is too late to allocate a small folio, we already have a large
* folio in the pagecache: especially s390 KVM cannot tolerate any
* PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any
- * PMD mappings if THPs are disabled.
+ * PMD mappings if THPs are disabled. As we already have a THP,
+ * behave as if we are forcing a collapse.
*/
- if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags))
+ if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags,
+ /* forced_collapse=*/ true))
return ret;
if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
@@ -5188,7 +5371,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
flush_icache_pages(vma, page, HPAGE_PMD_NR);
- entry = mk_huge_pmd(page, vma->vm_page_prot);
+ entry = folio_mk_pmd(folio, vma->vm_page_prot);
if (write)
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -5213,7 +5396,7 @@ out:
return ret;
}
#else
-vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page)
{
return VM_FAULT_FALLBACK;
}
@@ -5245,6 +5428,8 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ else if (pte_write(entry) && folio_test_dirty(folio))
+ entry = pte_mkdirty(entry);
if (unlikely(vmf_orig_pte_uffd_wp(vmf)))
entry = pte_mkuffd_wp(entry);
/* copy-on-write page */
@@ -5305,6 +5490,7 @@ fallback:
else
page = vmf->page;
+ folio = page_folio(page);
/*
* check even for read faults because we might have lost our CoWed
* page
@@ -5316,8 +5502,8 @@ fallback:
}
if (pmd_none(*vmf->pmd)) {
- if (PageTransCompound(page)) {
- ret = do_set_pmd(vmf, page);
+ if (folio_test_pmd_mappable(folio)) {
+ ret = do_set_pmd(vmf, folio, page);
if (ret != VM_FAULT_FALLBACK)
return ret;
}
@@ -5328,16 +5514,10 @@ fallback:
return VM_FAULT_OOM;
}
- folio = page_folio(page);
nr_pages = folio_nr_pages(folio);
- /*
- * Using per-page fault to maintain the uffd semantics, and same
- * approach also applies to non-anonymous-shmem faults to avoid
- * inflating the RSS of the process.
- */
- if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma)) ||
- unlikely(needs_fallback)) {
+ /* Using per-page fault to maintain the uffd semantics */
+ if (unlikely(userfaultfd_armed(vma)) || unlikely(needs_fallback)) {
nr_pages = 1;
} else if (nr_pages > 1) {
pgoff_t idx = folio_page_idx(folio, page);
@@ -5892,7 +6072,7 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
split:
/* COW or write-notify handled on pte level: split pmd. */
- __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
+ __split_huge_pmd(vma, vmf->pmd, vmf->address, false);
return VM_FAULT_FALLBACK;
}
@@ -6056,7 +6236,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
.gfp_mask = __get_fault_gfp_mask(vma),
};
struct mm_struct *mm = vma->vm_mm;
- unsigned long vm_flags = vma->vm_flags;
+ vm_flags_t vm_flags = vma->vm_flags;
pgd_t *pgd;
p4d_t *p4d;
vm_fault_t ret;
@@ -6071,8 +6251,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
return VM_FAULT_OOM;
retry_pud:
if (pud_none(*vmf.pud) &&
- thp_vma_allowable_order(vma, vm_flags,
- TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) {
+ thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PUD_ORDER)) {
ret = create_huge_pud(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -6080,7 +6259,7 @@ retry_pud:
pud_t orig_pud = *vmf.pud;
barrier();
- if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
+ if (pud_trans_huge(orig_pud)) {
/*
* TODO once we support anonymous PUDs: NUMA case and
@@ -6106,8 +6285,7 @@ retry_pud:
goto retry_pud;
if (pmd_none(*vmf.pmd) &&
- thp_vma_allowable_order(vma, vm_flags,
- TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) {
+ thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PMD_ORDER)) {
ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -6121,7 +6299,7 @@ retry_pud:
pmd_migration_entry_wait(mm, vmf.pmd);
return 0;
}
- if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) {
+ if (pmd_trans_huge(vmf.orig_pmd)) {
if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
return do_huge_pmd_numa_page(&vmf);
@@ -6338,258 +6516,6 @@ out:
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
-#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
-#include <linux/extable.h>
-
-static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
-{
- if (likely(mmap_read_trylock(mm)))
- return true;
-
- if (regs && !user_mode(regs)) {
- unsigned long ip = exception_ip(regs);
- if (!search_exception_tables(ip))
- return false;
- }
-
- return !mmap_read_lock_killable(mm);
-}
-
-static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
-{
- /*
- * We don't have this operation yet.
- *
- * It should be easy enough to do: it's basically a
- * atomic_long_try_cmpxchg_acquire()
- * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
- * it also needs the proper lockdep magic etc.
- */
- return false;
-}
-
-static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
-{
- mmap_read_unlock(mm);
- if (regs && !user_mode(regs)) {
- unsigned long ip = exception_ip(regs);
- if (!search_exception_tables(ip))
- return false;
- }
- return !mmap_write_lock_killable(mm);
-}
-
-/*
- * Helper for page fault handling.
- *
- * This is kind of equivalent to "mmap_read_lock()" followed
- * by "find_extend_vma()", except it's a lot more careful about
- * the locking (and will drop the lock on failure).
- *
- * For example, if we have a kernel bug that causes a page
- * fault, we don't want to just use mmap_read_lock() to get
- * the mm lock, because that would deadlock if the bug were
- * to happen while we're holding the mm lock for writing.
- *
- * So this checks the exception tables on kernel faults in
- * order to only do this all for instructions that are actually
- * expected to fault.
- *
- * We can also actually take the mm lock for writing if we
- * need to extend the vma, which helps the VM layer a lot.
- */
-struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
- unsigned long addr, struct pt_regs *regs)
-{
- struct vm_area_struct *vma;
-
- if (!get_mmap_lock_carefully(mm, regs))
- return NULL;
-
- vma = find_vma(mm, addr);
- if (likely(vma && (vma->vm_start <= addr)))
- return vma;
-
- /*
- * Well, dang. We might still be successful, but only
- * if we can extend a vma to do so.
- */
- if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
- mmap_read_unlock(mm);
- return NULL;
- }
-
- /*
- * We can try to upgrade the mmap lock atomically,
- * in which case we can continue to use the vma
- * we already looked up.
- *
- * Otherwise we'll have to drop the mmap lock and
- * re-take it, and also look up the vma again,
- * re-checking it.
- */
- if (!mmap_upgrade_trylock(mm)) {
- if (!upgrade_mmap_lock_carefully(mm, regs))
- return NULL;
-
- vma = find_vma(mm, addr);
- if (!vma)
- goto fail;
- if (vma->vm_start <= addr)
- goto success;
- if (!(vma->vm_flags & VM_GROWSDOWN))
- goto fail;
- }
-
- if (expand_stack_locked(vma, addr))
- goto fail;
-
-success:
- mmap_write_downgrade(mm);
- return vma;
-
-fail:
- mmap_write_unlock(mm);
- return NULL;
-}
-#endif
-
-#ifdef CONFIG_PER_VMA_LOCK
-static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
-{
- unsigned int tgt_refcnt = VMA_LOCK_OFFSET;
-
- /* Additional refcnt if the vma is attached. */
- if (!detaching)
- tgt_refcnt++;
-
- /*
- * If vma is detached then only vma_mark_attached() can raise the
- * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
- */
- if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
- return false;
-
- rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
- rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
- refcount_read(&vma->vm_refcnt) == tgt_refcnt,
- TASK_UNINTERRUPTIBLE);
- lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
-
- return true;
-}
-
-static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
-{
- *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
- rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
-}
-
-void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
-{
- bool locked;
-
- /*
- * __vma_enter_locked() returns false immediately if the vma is not
- * attached, otherwise it waits until refcnt is indicating that vma
- * is attached with no readers.
- */
- locked = __vma_enter_locked(vma, false);
-
- /*
- * We should use WRITE_ONCE() here because we can have concurrent reads
- * from the early lockless pessimistic check in vma_start_read().
- * We don't really care about the correctness of that early check, but
- * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
- */
- WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
-
- if (locked) {
- bool detached;
-
- __vma_exit_locked(vma, &detached);
- WARN_ON_ONCE(detached); /* vma should remain attached */
- }
-}
-EXPORT_SYMBOL_GPL(__vma_start_write);
-
-void vma_mark_detached(struct vm_area_struct *vma)
-{
- vma_assert_write_locked(vma);
- vma_assert_attached(vma);
-
- /*
- * We are the only writer, so no need to use vma_refcount_put().
- * The condition below is unlikely because the vma has been already
- * write-locked and readers can increment vm_refcnt only temporarily
- * before they check vm_lock_seq, realize the vma is locked and drop
- * back the vm_refcnt. That is a narrow window for observing a raised
- * vm_refcnt.
- */
- if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
- /* Wait until vma is detached with no readers. */
- if (__vma_enter_locked(vma, true)) {
- bool detached;
-
- __vma_exit_locked(vma, &detached);
- WARN_ON_ONCE(!detached);
- }
- }
-}
-
-/*
- * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
- * stable and not isolated. If the VMA is not found or is being modified the
- * function returns NULL.
- */
-struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
- unsigned long address)
-{
- MA_STATE(mas, &mm->mm_mt, address, address);
- struct vm_area_struct *vma;
-
- rcu_read_lock();
-retry:
- vma = mas_walk(&mas);
- if (!vma)
- goto inval;
-
- vma = vma_start_read(mm, vma);
- if (IS_ERR_OR_NULL(vma)) {
- /* Check if the VMA got isolated after we found it */
- if (PTR_ERR(vma) == -EAGAIN) {
- count_vm_vma_lock_event(VMA_LOCK_MISS);
- /* The area was replaced with another one */
- goto retry;
- }
-
- /* Failed to lock the VMA */
- goto inval;
- }
- /*
- * At this point, we have a stable reference to a VMA: The VMA is
- * locked and we know it hasn't already been isolated.
- * From here on, we can access the VMA without worrying about which
- * fields are accessible for RCU readers.
- */
-
- /* Check if the vma we locked is the right one. */
- if (unlikely(vma->vm_mm != mm ||
- address < vma->vm_start || address >= vma->vm_end))
- goto inval_end_read;
-
- rcu_read_unlock();
- return vma;
-
-inval_end_read:
- vma_end_read(vma);
-inval:
- rcu_read_unlock();
- count_vm_vma_lock_event(VMA_LOCK_ABORT);
- return NULL;
-}
-#endif /* CONFIG_PER_VMA_LOCK */
-
#ifndef __PAGETABLE_P4D_FOLDED
/*
* Allocate p4d page table.
@@ -6900,6 +6826,7 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
while (len) {
int bytes, offset;
void *maddr;
+ struct folio *folio;
struct vm_area_struct *vma = NULL;
struct page *page = get_user_page_vma_remote(mm, addr,
gup_flags, &vma);
@@ -6931,21 +6858,22 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
if (bytes <= 0)
break;
} else {
+ folio = page_folio(page);
bytes = len;
offset = addr & (PAGE_SIZE-1);
if (bytes > PAGE_SIZE-offset)
bytes = PAGE_SIZE-offset;
- maddr = kmap_local_page(page);
+ maddr = kmap_local_folio(folio, folio_page_idx(folio, page) * PAGE_SIZE);
if (write) {
copy_to_user_page(vma, page, addr,
maddr + offset, buf, bytes);
- set_page_dirty_lock(page);
+ folio_mark_dirty_lock(folio);
} else {
copy_from_user_page(vma, page, addr,
buf, maddr + offset, bytes);
}
- unmap_and_put_page(page, maddr);
+ folio_release_kmap(folio, maddr);
}
len -= bytes;
buf += bytes;
@@ -7024,6 +6952,7 @@ static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr,
while (len) {
int bytes, offset, retval;
void *maddr;
+ struct folio *folio;
struct page *page;
struct vm_area_struct *vma = NULL;
@@ -7039,17 +6968,18 @@ static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr,
goto out;
}
+ folio = page_folio(page);
bytes = len;
offset = addr & (PAGE_SIZE - 1);
if (bytes > PAGE_SIZE - offset)
bytes = PAGE_SIZE - offset;
- maddr = kmap_local_page(page);
+ maddr = kmap_local_folio(folio, folio_page_idx(folio, page) * PAGE_SIZE);
retval = strscpy(buf, maddr + offset, bytes);
if (retval >= 0) {
/* Found the end of the string */
buf += retval;
- unmap_and_put_page(page, maddr);
+ folio_release_kmap(folio, maddr);
break;
}
@@ -7067,7 +6997,7 @@ static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr,
}
len -= bytes;
- unmap_and_put_page(page, maddr);
+ folio_release_kmap(folio, maddr);
}
out:
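
The remote-access conversions above (__access_remote_vm() and __copy_remote_vm_str()) replace the page-based kmap helpers with their folio counterparts. The following is a hedged, condensed sketch of the resulting pattern as a hypothetical stand-alone helper (read_user_page is an illustrative name, not part of the patch): kmap_local_folio() takes a byte offset into the folio, so the page's index within a possibly-large folio is scaled by PAGE_SIZE, and folio_release_kmap() drops both the local mapping and the folio reference in one call.

#include <linux/highmem.h>
#include <linux/mm.h>

static void read_user_page(struct vm_area_struct *vma, struct page *page,
			   unsigned long addr, void *buf, int offset, int bytes)
{
	struct folio *folio = page_folio(page);
	void *maddr = kmap_local_folio(folio,
				       folio_page_idx(folio, page) * PAGE_SIZE);

	/* Mirror of the read side above; the write side would use
	 * copy_to_user_page() plus folio_mark_dirty_lock(). */
	copy_from_user_page(vma, page, addr, buf, maddr + offset, bytes);
	folio_release_kmap(folio, maddr);	/* kunmap_local() + folio_put() */
}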
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 8305483de38b..0be83039c3b5 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -35,6 +35,7 @@
#include <linux/compaction.h>
#include <linux/rmap.h>
#include <linux/module.h>
+#include <linux/node.h>
#include <asm/tlbflush.h>
@@ -374,7 +375,7 @@ struct page *pfn_to_online_page(unsigned long pfn)
* the section may be 'offline' but 'valid'. Only
* get_dev_pagemap() can determine sub-section online status.
*/
- pgmap = get_dev_pagemap(pfn, NULL);
+ pgmap = get_dev_pagemap(pfn);
put_dev_pagemap(pgmap);
/* The presence of a pgmap indicates ZONE_DEVICE offline pfn */
@@ -699,30 +700,6 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
online_mem_sections(start_pfn, end_pfn);
}
-/* check which state of node_states will be changed when online memory */
-static void node_states_check_changes_online(unsigned long nr_pages,
- struct zone *zone, struct memory_notify *arg)
-{
- int nid = zone_to_nid(zone);
-
- arg->status_change_nid = NUMA_NO_NODE;
- arg->status_change_nid_normal = NUMA_NO_NODE;
-
- if (!node_state(nid, N_MEMORY))
- arg->status_change_nid = nid;
- if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
- arg->status_change_nid_normal = nid;
-}
-
-static void node_states_set_node(int node, struct memory_notify *arg)
-{
- if (arg->status_change_nid_normal >= 0)
- node_set_state(node, N_NORMAL_MEMORY);
-
- if (arg->status_change_nid >= 0)
- node_set_state(node, N_MEMORY);
-}
-
static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages)
{
@@ -770,7 +747,8 @@ static inline void section_taint_zone_device(unsigned long pfn)
*/
void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages,
- struct vmem_altmap *altmap, int migratetype)
+ struct vmem_altmap *altmap, int migratetype,
+ bool isolate_pageblock)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int nid = pgdat->node_id;
@@ -797,12 +775,13 @@ void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
/*
* TODO now we have a visible range of pages which are not associated
- * with their zone properly. Not nice but set_pfnblock_flags_mask
+ * with their zone properly. Not nice but set_pfnblock_migratetype()
* expects the zone spans the pfn range. All the pages in the range
* are reserved so nobody should be touching them so we should be safe
*/
memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0,
- MEMINIT_HOTPLUG, altmap, migratetype);
+ MEMINIT_HOTPLUG, altmap, migratetype,
+ isolate_pageblock);
set_zone_contiguous(zone);
}
@@ -976,7 +955,7 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn
* effectively unused by the kernel, yet they account to "present pages".
* Fortunately, these allocations are comparatively small in relevant setups
* (e.g., fraction of system memory).
- * b) Some hotplugged memory blocks in virtualized environments, esecially
+ * b) Some hotplugged memory blocks in virtualized environments, especially
* hotplugged by virtio-mem, look like they are completely present, however,
* only parts of the memory block are actually currently usable.
* "present pages" is an upper limit that can get reached at runtime. As
@@ -1127,7 +1106,8 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
if (mhp_off_inaccessible)
page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages);
- move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE,
+ false);
for (i = 0; i < nr_pages; i++) {
struct page *page = pfn_to_page(pfn + i);
@@ -1173,11 +1153,17 @@ void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
int online_pages(unsigned long pfn, unsigned long nr_pages,
struct zone *zone, struct memory_group *group)
{
- unsigned long flags;
- int need_zonelists_rebuild = 0;
+ struct memory_notify mem_arg = {
+ .start_pfn = pfn,
+ .nr_pages = nr_pages,
+ };
+ struct node_notify node_arg = {
+ .nid = NUMA_NO_NODE,
+ };
const int nid = zone_to_nid(zone);
+ int need_zonelists_rebuild = 0;
+ unsigned long flags;
int ret;
- struct memory_notify arg;
/*
* {on,off}lining is constrained to full memory sections (or more
@@ -1192,13 +1178,19 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
/* associate pfn range with the zone */
- move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
-
- arg.start_pfn = pfn;
- arg.nr_pages = nr_pages;
- node_states_check_changes_online(nr_pages, zone, &arg);
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_MOVABLE,
+ true);
+
+ if (!node_state(nid, N_MEMORY)) {
+ /* Adding memory to the node for the first time */
+ node_arg.nid = nid;
+ ret = node_notify(NODE_ADDING_FIRST_MEMORY, &node_arg);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ goto failed_addition;
+ }
- ret = memory_notify(MEM_GOING_ONLINE, &arg);
+ ret = memory_notify(MEM_GOING_ONLINE, &mem_arg);
ret = notifier_to_errno(ret);
if (ret)
goto failed_addition;
@@ -1224,12 +1216,13 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
online_pages_range(pfn, nr_pages);
adjust_present_page_count(pfn_to_page(pfn), group, nr_pages);
- node_states_set_node(nid, &arg);
+ if (node_arg.nid >= 0)
+ node_set_state(nid, N_MEMORY);
if (need_zonelists_rebuild)
build_all_zonelists(NULL);
/* Basic onlining is complete, allow allocation of onlined pages. */
- undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
+ undo_isolate_page_range(pfn, pfn + nr_pages);
/*
* Freshly onlined pages aren't shuffled (e.g., all pages are placed to
@@ -1245,16 +1238,22 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
kswapd_run(nid);
kcompactd_run(nid);
+ if (node_arg.nid >= 0)
+ /* First memory added successfully. Notify consumers. */
+ node_notify(NODE_ADDED_FIRST_MEMORY, &node_arg);
+
writeback_set_ratelimit();
- memory_notify(MEM_ONLINE, &arg);
+ memory_notify(MEM_ONLINE, &mem_arg);
return 0;
failed_addition:
pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
(unsigned long long) pfn << PAGE_SHIFT,
(((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
- memory_notify(MEM_CANCEL_ONLINE, &arg);
+ memory_notify(MEM_CANCEL_ONLINE, &mem_arg);
+ if (node_arg.nid != NUMA_NO_NODE)
+ node_notify(NODE_CANCEL_ADDING_FIRST_MEMORY, &node_arg);
remove_pfn_range_from_zone(zone, pfn, nr_pages);
return ret;
}
@@ -1478,7 +1477,7 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
}
/* create memory block devices after memory was added */
- ret = create_memory_block_devices(cur_start, memblock_size,
+ ret = create_memory_block_devices(cur_start, memblock_size, nid,
params.altmap, group);
if (ret) {
arch_remove_memory(cur_start, memblock_size, NULL);
@@ -1540,8 +1539,16 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
ret = __try_online_node(nid, false);
if (ret < 0)
- goto error;
- new_node = ret;
+ goto error_memblock_remove;
+ if (ret) {
+ node_set_online(nid);
+ ret = register_one_node(nid);
+ if (WARN_ON(ret)) {
+ node_set_offline(nid);
+ goto error_memblock_remove;
+ }
+ new_node = true;
+ }
/*
* Self hosted memmap array
@@ -1557,27 +1564,15 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
goto error;
/* create memory block devices after memory was added */
- ret = create_memory_block_devices(start, size, NULL, group);
+ ret = create_memory_block_devices(start, size, nid, NULL, group);
if (ret) {
arch_remove_memory(start, size, params.altmap);
goto error;
}
}
- if (new_node) {
- /* If sysfs file of new node can't be created, cpu on the node
- * can't be hot-added. There is no rollback way now.
- * So, check by BUG_ON() to catch it reluctantly..
- * We online node here. We can't roll back from here.
- */
- node_set_online(nid);
- ret = __register_one_node(nid);
- BUG_ON(ret);
- }
-
- register_memory_blocks_under_node(nid, PFN_DOWN(start),
- PFN_UP(start + size - 1),
- MEMINIT_HOTPLUG);
+ register_memory_blocks_under_node_hotplug(nid, PFN_DOWN(start),
+ PFN_UP(start + size - 1));
/* create new memmap entry */
if (!strcmp(res->name, "System RAM"))
@@ -1599,6 +1594,11 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
return ret;
error:
+ if (new_node) {
+ node_set_offline(nid);
+ unregister_one_node(nid);
+ }
+error_memblock_remove:
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
memblock_remove(start, size);
error_mem_hotplug_end:
@@ -1741,8 +1741,8 @@ bool mhp_range_allowed(u64 start, u64 size, bool need_mapping)
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
- * Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
- * non-lru movable pages and hugepages). Will skip over most unmovable
+ * Scan pfn range [start,end) to find movable/migratable pages (LRU and
+ * hugetlb folios, movable_ops pages). Will skip over most unmovable
* pages (esp., pages that can be skipped when offlining), but bail out on
* definitely unmovable pages.
*
@@ -1756,20 +1756,16 @@ static int scan_movable_pages(unsigned long start, unsigned long end,
{
unsigned long pfn;
- for (pfn = start; pfn < end; pfn++) {
+ for_each_valid_pfn(pfn, start, end) {
struct page *page;
struct folio *folio;
- if (!pfn_valid(pfn))
- continue;
page = pfn_to_page(pfn);
- if (PageLRU(page))
- goto found;
- if (__PageMovable(page))
+ if (PageLRU(page) || page_has_movable_ops(page))
goto found;
/*
- * PageOffline() pages that are not marked __PageMovable() and
+ * PageOffline() pages that do not have movable_ops and
* have a reference count > 0 (after MEM_GOING_OFFLINE) are
* definitely unmovable. If their reference count would be 0,
* they could at least be skipped when offlining memory.
@@ -1805,11 +1801,9 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ for_each_valid_pfn(pfn, start_pfn, end_pfn) {
struct page *page;
- if (!pfn_valid(pfn))
- continue;
page = pfn_to_page(pfn);
folio = page_folio(page);
@@ -1823,8 +1817,14 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1;
if (folio_contain_hwpoisoned_page(folio)) {
- if (WARN_ON(folio_test_lru(folio)))
- folio_isolate_lru(folio);
+ /*
+ * unmap_poisoned_folio() cannot handle large folios
+ * in all cases yet.
+ */
+ if (folio_test_large(folio) && !folio_test_hugetlb(folio))
+ goto put_folio;
+ if (folio_test_lru(folio) && !folio_isolate_lru(folio))
+ goto put_folio;
if (folio_mapped(folio)) {
folio_lock(folio);
unmap_poisoned_folio(folio, pfn, false);
@@ -1890,54 +1890,6 @@ static int __init cmdline_parse_movable_node(char *p)
}
early_param("movable_node", cmdline_parse_movable_node);
-/* check which state of node_states will be changed when offline memory */
-static void node_states_check_changes_offline(unsigned long nr_pages,
- struct zone *zone, struct memory_notify *arg)
-{
- struct pglist_data *pgdat = zone->zone_pgdat;
- unsigned long present_pages = 0;
- enum zone_type zt;
-
- arg->status_change_nid = NUMA_NO_NODE;
- arg->status_change_nid_normal = NUMA_NO_NODE;
-
- /*
- * Check whether node_states[N_NORMAL_MEMORY] will be changed.
- * If the memory to be offline is within the range
- * [0..ZONE_NORMAL], and it is the last present memory there,
- * the zones in that range will become empty after the offlining,
- * thus we can determine that we need to clear the node from
- * node_states[N_NORMAL_MEMORY].
- */
- for (zt = 0; zt <= ZONE_NORMAL; zt++)
- present_pages += pgdat->node_zones[zt].present_pages;
- if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
- arg->status_change_nid_normal = zone_to_nid(zone);
-
- /*
- * We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM
- * does not apply as we don't support 32bit.
- * Here we count the possible pages from ZONE_MOVABLE.
- * If after having accounted all the pages, we see that the nr_pages
- * to be offlined is over or equal to the accounted pages,
- * we know that the node will become empty, and so, we can clear
- * it for N_MEMORY as well.
- */
- present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages;
-
- if (nr_pages >= present_pages)
- arg->status_change_nid = zone_to_nid(zone);
-}
-
-static void node_states_clear_node(int node, struct memory_notify *arg)
-{
- if (arg->status_change_nid_normal >= 0)
- node_clear_state(node, N_NORMAL_MEMORY);
-
- if (arg->status_change_nid >= 0)
- node_clear_state(node, N_MEMORY);
-}
-
static int count_system_ram_pages_cb(unsigned long start_pfn,
unsigned long nr_pages, void *data)
{
@@ -1953,11 +1905,18 @@ static int count_system_ram_pages_cb(unsigned long start_pfn,
int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
struct zone *zone, struct memory_group *group)
{
- const unsigned long end_pfn = start_pfn + nr_pages;
unsigned long pfn, managed_pages, system_ram_pages = 0;
+ const unsigned long end_pfn = start_pfn + nr_pages;
+ struct pglist_data *pgdat = zone->zone_pgdat;
const int node = zone_to_nid(zone);
+ struct memory_notify mem_arg = {
+ .start_pfn = start_pfn,
+ .nr_pages = nr_pages,
+ };
+ struct node_notify node_arg = {
+ .nid = NUMA_NO_NODE,
+ };
unsigned long flags;
- struct memory_notify arg;
char *reason;
int ret;
@@ -2009,18 +1968,28 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
- MIGRATE_MOVABLE,
- MEMORY_OFFLINE | REPORT_FAILURE);
+ PB_ISOLATE_MODE_MEM_OFFLINE);
if (ret) {
reason = "failure to isolate range";
goto failed_removal_pcplists_disabled;
}
- arg.start_pfn = start_pfn;
- arg.nr_pages = nr_pages;
- node_states_check_changes_offline(nr_pages, zone, &arg);
+ /*
+ * Check whether the node will have no present pages after we offline
+ * 'nr_pages' more. If so, we know that the node will become empty, and
+ * so we will clear N_MEMORY for it.
+ */
+ if (nr_pages >= pgdat->node_present_pages) {
+ node_arg.nid = node;
+ ret = node_notify(NODE_REMOVING_LAST_MEMORY, &node_arg);
+ ret = notifier_to_errno(ret);
+ if (ret) {
+ reason = "node notifier failure";
+ goto failed_removal_isolated;
+ }
+ }
- ret = memory_notify(MEM_GOING_OFFLINE, &arg);
+ ret = memory_notify(MEM_GOING_OFFLINE, &mem_arg);
ret = notifier_to_errno(ret);
if (ret) {
reason = "notifier failure";
@@ -2069,7 +2038,8 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
goto failed_removal_isolated;
}
- ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);
+ ret = test_pages_isolated(start_pfn, end_pfn,
+ PB_ISOLATE_MODE_MEM_OFFLINE);
} while (ret);
@@ -2100,27 +2070,32 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
* Make sure to mark the node as memory-less before rebuilding the zone
* list. Otherwise this node would still appear in the fallback lists.
*/
- node_states_clear_node(node, &arg);
+ if (node_arg.nid >= 0)
+ node_clear_state(node, N_MEMORY);
if (!populated_zone(zone)) {
zone_pcp_reset(zone);
build_all_zonelists(NULL);
}
- if (arg.status_change_nid >= 0) {
+ if (node_arg.nid >= 0) {
kcompactd_stop(node);
kswapd_stop(node);
+ /* Node went memoryless. Notify consumers */
+ node_notify(NODE_REMOVED_LAST_MEMORY, &node_arg);
}
writeback_set_ratelimit();
- memory_notify(MEM_OFFLINE, &arg);
+ memory_notify(MEM_OFFLINE, &mem_arg);
remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
return 0;
failed_removal_isolated:
/* pushback to free area */
- undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
- memory_notify(MEM_CANCEL_OFFLINE, &arg);
+ undo_isolate_page_range(start_pfn, end_pfn);
+ memory_notify(MEM_CANCEL_OFFLINE, &mem_arg);
+ if (node_arg.nid != NUMA_NO_NODE)
+ node_notify(NODE_CANCEL_REMOVING_LAST_MEMORY, &node_arg);
failed_removal_pcplists_disabled:
lru_cache_enable();
zone_pcp_enable(zone);
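
The open-coded node_states_check_changes_{online,offline}() bookkeeping is replaced above by dedicated node notifier events (NODE_ADDING_FIRST_MEMORY, NODE_ADDED_FIRST_MEMORY, NODE_REMOVING_LAST_MEMORY, NODE_REMOVED_LAST_MEMORY, plus their CANCEL variants). Below is a hedged sketch of what a consumer can look like, modelled on the weighted-interleave notifier added later in this same series; demo_alloc_state()/demo_free_state() are hypothetical per-node setup/teardown helpers, not kernel APIs.

#include <linux/memory.h>
#include <linux/node.h>
#include <linux/notifier.h>

static int demo_alloc_state(int nid) { /* hypothetical per-node setup */ return 0; }
static void demo_free_state(int nid) { /* hypothetical per-node teardown */ }

static int demo_node_notifier(struct notifier_block *nb,
			      unsigned long action, void *data)
{
	struct node_notify *nn = data;

	switch (action) {
	case NODE_ADDING_FIRST_MEMORY:
		/* May veto onlining; online_pages() propagates the error. */
		if (demo_alloc_state(nn->nid))
			return notifier_from_errno(-ENOMEM);
		break;
	case NODE_CANCEL_ADDING_FIRST_MEMORY:
	case NODE_REMOVED_LAST_MEMORY:
		demo_free_state(nn->nid);
		break;
	}
	return NOTIFY_OK;
}

static int __init demo_init(void)
{
	/* Same registration call the weighted-interleave code uses. */
	hotplug_node_notifier(demo_node_notifier, DEFAULT_CALLBACK_PRI);
	return 0;
}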
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b28a1e6ae096..eb83cff7db8c 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -109,10 +109,12 @@
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>
+#include <linux/gcd.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <linux/uaccess.h>
+#include <linux/memory.h>
#include "internal.h"
@@ -139,31 +141,138 @@ static struct mempolicy default_policy = {
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
/*
- * iw_table is the sysfs-set interleave weight table, a value of 0 denotes
- * system-default value should be used. A NULL iw_table also denotes that
- * system-default values should be used. Until the system-default table
- * is implemented, the system-default is always 1.
- *
- * iw_table is RCU protected
+ * weightiness balances the tradeoff between small weights (cycles through nodes
+ * faster, more fair/even distribution) and large weights (smaller errors
+ * between actual bandwidth ratios and weight ratios). 32 is a number that has
+ * been found to strike a reasonable compromise between the two goals.
+ */
+static const int weightiness = 32;
+
+/*
+ * A null weighted_interleave_state is interpreted as having .mode="auto",
+ * and .iw_table is interpreted as an array of 1s with length nr_node_ids.
*/
-static u8 __rcu *iw_table;
-static DEFINE_MUTEX(iw_table_lock);
+struct weighted_interleave_state {
+ bool mode_auto;
+ u8 iw_table[];
+};
+static struct weighted_interleave_state __rcu *wi_state;
+static unsigned int *node_bw_table;
+
+/*
+ * wi_state_lock protects both wi_state and node_bw_table.
+ * node_bw_table is only used by writers to update wi_state.
+ */
+static DEFINE_MUTEX(wi_state_lock);
static u8 get_il_weight(int node)
{
- u8 *table;
- u8 weight;
+ struct weighted_interleave_state *state;
+ u8 weight = 1;
rcu_read_lock();
- table = rcu_dereference(iw_table);
- /* if no iw_table, use system default */
- weight = table ? table[node] : 1;
- /* if value in iw_table is 0, use system default */
- weight = weight ? weight : 1;
+ state = rcu_dereference(wi_state);
+ if (state)
+ weight = state->iw_table[node];
rcu_read_unlock();
return weight;
}
+/*
+ * Convert bandwidth values into weighted interleave weights.
+ * Call with wi_state_lock held.
+ */
+static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw)
+{
+ u64 sum_bw = 0;
+ unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0;
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY)
+ sum_bw += bw[nid];
+
+ /* Scale bandwidths to whole numbers in the range [1, weightiness] */
+ for_each_node_state(nid, N_MEMORY) {
+ /*
+ * Try not to perform 64-bit division.
+ * If sum_bw < scaling_factor, then sum_bw < U32_MAX.
+ * If sum_bw > scaling_factor, then round the weight up to 1.
+ */
+ scaling_factor = weightiness * bw[nid];
+ if (bw[nid] && sum_bw < scaling_factor) {
+ cast_sum_bw = (unsigned int)sum_bw;
+ new_iw[nid] = scaling_factor / cast_sum_bw;
+ } else {
+ new_iw[nid] = 1;
+ }
+ if (!iw_gcd)
+ iw_gcd = new_iw[nid];
+ iw_gcd = gcd(iw_gcd, new_iw[nid]);
+ }
+
+ /* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */
+ for_each_node_state(nid, N_MEMORY)
+ new_iw[nid] /= iw_gcd;
+}
+
+int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
+{
+ struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
+ unsigned int *old_bw, *new_bw;
+ unsigned int bw_val;
+ int i;
+
+ bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
+ new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
+ if (!new_bw)
+ return -ENOMEM;
+
+ new_wi_state = kmalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+ GFP_KERNEL);
+ if (!new_wi_state) {
+ kfree(new_bw);
+ return -ENOMEM;
+ }
+ new_wi_state->mode_auto = true;
+ for (i = 0; i < nr_node_ids; i++)
+ new_wi_state->iw_table[i] = 1;
+
+ /*
+ * Update bandwidth info, even in manual mode. That way, when switching
+ * to auto mode in the future, iw_table can be overwritten using
+ * accurate bw data.
+ */
+ mutex_lock(&wi_state_lock);
+
+ old_bw = node_bw_table;
+ if (old_bw)
+ memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw));
+ new_bw[node] = bw_val;
+ node_bw_table = new_bw;
+
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
+ if (old_wi_state && !old_wi_state->mode_auto) {
+ /* Manual mode; skip reducing weights and updating wi_state */
+ mutex_unlock(&wi_state_lock);
+ kfree(new_wi_state);
+ goto out;
+ }
+
+ /* NULL wi_state assumes auto=true; reduce weights and update wi_state */
+ reduce_interleave_weights(new_bw, new_wi_state->iw_table);
+ rcu_assign_pointer(wi_state, new_wi_state);
+
+ mutex_unlock(&wi_state_lock);
+ if (old_wi_state) {
+ synchronize_rcu();
+ kfree(old_wi_state);
+ }
+out:
+ kfree(old_bw);
+ return 0;
+}
+
/**
* numa_nearest_node - Find nearest node by state
* @node: Node id to start the search
@@ -573,6 +682,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
pte_t *pte, *mapped_pte;
pte_t ptent;
spinlock_t *ptl;
+ int max_nr, nr;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
@@ -586,7 +696,9 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
walk->action = ACTION_AGAIN;
return 0;
}
- for (; addr != end; pte++, addr += PAGE_SIZE) {
+ for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) {
+ max_nr = (end - addr) >> PAGE_SHIFT;
+ nr = 1;
ptent = ptep_get(pte);
if (pte_none(ptent))
continue;
@@ -598,6 +710,8 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
folio = vm_normal_folio(vma, addr, ptent);
if (!folio || folio_is_zone_device(folio))
continue;
+ if (folio_test_large(folio) && max_nr != 1)
+ nr = folio_pte_batch(folio, pte, ptent, max_nr);
/*
* vm_normal_folio() filters out zero pages, but there might
* still be reserved folios to skip, perhaps in a VDSO.
@@ -630,7 +744,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
!vma_migratable(vma) ||
!migrate_folio_add(folio, qp->pagelist, flags)) {
- qp->nr_failed++;
+ qp->nr_failed += nr;
if (strictly_unmovable(flags))
break;
}
@@ -2014,26 +2128,28 @@ static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
+ struct weighted_interleave_state *state;
nodemask_t nodemask;
unsigned int target, nr_nodes;
- u8 *table;
+ u8 *table = NULL;
unsigned int weight_total = 0;
u8 weight;
- int nid;
+ int nid = 0;
nr_nodes = read_once_policy_nodemask(pol, &nodemask);
if (!nr_nodes)
return numa_node_id();
rcu_read_lock();
- table = rcu_dereference(iw_table);
+
+ state = rcu_dereference(wi_state);
+ /* Uninitialized wi_state means we should assume all weights are 1 */
+ if (state)
+ table = state->iw_table;
+
/* calculate the total weight */
- for_each_node_mask(nid, nodemask) {
- /* detect system default usage */
- weight = table ? table[nid] : 1;
- weight = weight ? weight : 1;
- weight_total += weight;
- }
+ for_each_node_mask(nid, nodemask)
+ weight_total += table ? table[nid] : 1;
/* Calculate the node offset based on totals */
target = ilx % weight_total;
@@ -2041,7 +2157,6 @@ static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
while (target) {
/* detect system default usage */
weight = table ? table[nid] : 1;
- weight = weight ? weight : 1;
if (target < weight)
break;
target -= weight;
@@ -2442,13 +2557,14 @@ static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
struct mempolicy *pol, unsigned long nr_pages,
struct page **page_array)
{
+ struct weighted_interleave_state *state;
struct task_struct *me = current;
unsigned int cpuset_mems_cookie;
unsigned long total_allocated = 0;
unsigned long nr_allocated = 0;
unsigned long rounds;
unsigned long node_pages, delta;
- u8 *table, *weights, weight;
+ u8 *weights, weight;
unsigned int weight_total = 0;
unsigned long rem_pages = nr_pages;
nodemask_t nodes;
@@ -2498,17 +2614,19 @@ static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
return total_allocated;
rcu_read_lock();
- table = rcu_dereference(iw_table);
- if (table)
- memcpy(weights, table, nr_node_ids);
- rcu_read_unlock();
+ state = rcu_dereference(wi_state);
+ if (state) {
+ memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8));
+ rcu_read_unlock();
+ } else {
+ rcu_read_unlock();
+ for (i = 0; i < nr_node_ids; i++)
+ weights[i] = 1;
+ }
/* calculate total, detect system default usage */
- for_each_node_mask(node, nodes) {
- if (!weights[node])
- weights[node] = 1;
+ for_each_node_mask(node, nodes)
weight_total += weights[node];
- }
/*
* Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
@@ -3419,6 +3537,14 @@ struct iw_node_attr {
int nid;
};
+struct sysfs_wi_group {
+ struct kobject wi_kobj;
+ struct mutex kobj_lock;
+ struct iw_node_attr *nattrs[];
+};
+
+static struct sysfs_wi_group *wi_group;
+
static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
@@ -3433,177 +3559,310 @@ static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
+ struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
struct iw_node_attr *node_attr;
- u8 *new;
- u8 *old;
u8 weight = 0;
+ int i;
node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
- if (count == 0 || sysfs_streq(buf, ""))
- weight = 0;
- else if (kstrtou8(buf, 0, &weight))
+ if (count == 0 || sysfs_streq(buf, "") ||
+ kstrtou8(buf, 0, &weight) || weight == 0)
return -EINVAL;
- new = kzalloc(nr_node_ids, GFP_KERNEL);
- if (!new)
+ new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+ GFP_KERNEL);
+ if (!new_wi_state)
return -ENOMEM;
- mutex_lock(&iw_table_lock);
- old = rcu_dereference_protected(iw_table,
- lockdep_is_held(&iw_table_lock));
- if (old)
- memcpy(new, old, nr_node_ids);
- new[node_attr->nid] = weight;
- rcu_assign_pointer(iw_table, new);
- mutex_unlock(&iw_table_lock);
- synchronize_rcu();
- kfree(old);
+ mutex_lock(&wi_state_lock);
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
+ if (old_wi_state) {
+ memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
+ nr_node_ids * sizeof(u8));
+ } else {
+ for (i = 0; i < nr_node_ids; i++)
+ new_wi_state->iw_table[i] = 1;
+ }
+ new_wi_state->iw_table[node_attr->nid] = weight;
+ new_wi_state->mode_auto = false;
+
+ rcu_assign_pointer(wi_state, new_wi_state);
+ mutex_unlock(&wi_state_lock);
+ if (old_wi_state) {
+ synchronize_rcu();
+ kfree(old_wi_state);
+ }
return count;
}
-static struct iw_node_attr **node_attrs;
-
-static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
- struct kobject *parent)
+static ssize_t weighted_interleave_auto_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
{
- if (!node_attr)
- return;
- sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
- kfree(node_attr->kobj_attr.attr.name);
- kfree(node_attr);
+ struct weighted_interleave_state *state;
+ bool wi_auto = true;
+
+ rcu_read_lock();
+ state = rcu_dereference(wi_state);
+ if (state)
+ wi_auto = state->mode_auto;
+ rcu_read_unlock();
+
+ return sysfs_emit(buf, "%s\n", str_true_false(wi_auto));
}
-static void sysfs_wi_release(struct kobject *wi_kobj)
+static ssize_t weighted_interleave_auto_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
{
+ struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
+ unsigned int *bw;
+ bool input;
int i;
+ if (kstrtobool(buf, &input))
+ return -EINVAL;
+
+ new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+ GFP_KERNEL);
+ if (!new_wi_state)
+ return -ENOMEM;
for (i = 0; i < nr_node_ids; i++)
- sysfs_wi_node_release(node_attrs[i], wi_kobj);
- kobject_put(wi_kobj);
+ new_wi_state->iw_table[i] = 1;
+
+ mutex_lock(&wi_state_lock);
+ if (!input) {
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
+ if (!old_wi_state)
+ goto update_wi_state;
+ if (input == old_wi_state->mode_auto) {
+ mutex_unlock(&wi_state_lock);
+ return count;
+ }
+
+ memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
+ nr_node_ids * sizeof(u8));
+ goto update_wi_state;
+ }
+
+ bw = node_bw_table;
+ if (!bw) {
+ mutex_unlock(&wi_state_lock);
+ kfree(new_wi_state);
+ return -ENODEV;
+ }
+
+ new_wi_state->mode_auto = true;
+ reduce_interleave_weights(bw, new_wi_state->iw_table);
+
+update_wi_state:
+ rcu_assign_pointer(wi_state, new_wi_state);
+ mutex_unlock(&wi_state_lock);
+ if (old_wi_state) {
+ synchronize_rcu();
+ kfree(old_wi_state);
+ }
+ return count;
+}
+
+static void sysfs_wi_node_delete(int nid)
+{
+ struct iw_node_attr *attr;
+
+ if (nid < 0 || nid >= nr_node_ids)
+ return;
+
+ mutex_lock(&wi_group->kobj_lock);
+ attr = wi_group->nattrs[nid];
+ if (!attr) {
+ mutex_unlock(&wi_group->kobj_lock);
+ return;
+ }
+
+ wi_group->nattrs[nid] = NULL;
+ mutex_unlock(&wi_group->kobj_lock);
+
+ sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr);
+ kfree(attr->kobj_attr.attr.name);
+ kfree(attr);
+}
+
+static void sysfs_wi_node_delete_all(void)
+{
+ int nid;
+
+ for (nid = 0; nid < nr_node_ids; nid++)
+ sysfs_wi_node_delete(nid);
+}
+
+static void wi_state_free(void)
+{
+ struct weighted_interleave_state *old_wi_state;
+
+ mutex_lock(&wi_state_lock);
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
+ rcu_assign_pointer(wi_state, NULL);
+ mutex_unlock(&wi_state_lock);
+
+ if (old_wi_state) {
+ synchronize_rcu();
+ kfree(old_wi_state);
+ }
+}
+
+static struct kobj_attribute wi_auto_attr =
+ __ATTR(auto, 0664, weighted_interleave_auto_show,
+ weighted_interleave_auto_store);
+
+static void wi_cleanup(void) {
+ sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
+ sysfs_wi_node_delete_all();
+ wi_state_free();
+}
+
+static void wi_kobj_release(struct kobject *wi_kobj)
+{
+ kfree(wi_group);
}
static const struct kobj_type wi_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
- .release = sysfs_wi_release,
+ .release = wi_kobj_release,
};
-static int add_weight_node(int nid, struct kobject *wi_kobj)
+static int sysfs_wi_node_add(int nid)
{
- struct iw_node_attr *node_attr;
+ int ret;
char *name;
+ struct iw_node_attr *new_attr;
- node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL);
- if (!node_attr)
+ if (nid < 0 || nid >= nr_node_ids) {
+ pr_err("invalid node id: %d\n", nid);
+ return -EINVAL;
+ }
+
+ new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL);
+ if (!new_attr)
return -ENOMEM;
name = kasprintf(GFP_KERNEL, "node%d", nid);
if (!name) {
- kfree(node_attr);
+ kfree(new_attr);
return -ENOMEM;
}
- sysfs_attr_init(&node_attr->kobj_attr.attr);
- node_attr->kobj_attr.attr.name = name;
- node_attr->kobj_attr.attr.mode = 0644;
- node_attr->kobj_attr.show = node_show;
- node_attr->kobj_attr.store = node_store;
- node_attr->nid = nid;
+ sysfs_attr_init(&new_attr->kobj_attr.attr);
+ new_attr->kobj_attr.attr.name = name;
+ new_attr->kobj_attr.attr.mode = 0644;
+ new_attr->kobj_attr.show = node_show;
+ new_attr->kobj_attr.store = node_store;
+ new_attr->nid = nid;
- if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) {
- kfree(node_attr->kobj_attr.attr.name);
- kfree(node_attr);
- pr_err("failed to add attribute to weighted_interleave\n");
- return -ENOMEM;
+ mutex_lock(&wi_group->kobj_lock);
+ if (wi_group->nattrs[nid]) {
+ mutex_unlock(&wi_group->kobj_lock);
+ ret = -EEXIST;
+ goto out;
}
- node_attrs[nid] = node_attr;
+ ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr);
+ if (ret) {
+ mutex_unlock(&wi_group->kobj_lock);
+ goto out;
+ }
+ wi_group->nattrs[nid] = new_attr;
+ mutex_unlock(&wi_group->kobj_lock);
return 0;
+
+out:
+ kfree(new_attr->kobj_attr.attr.name);
+ kfree(new_attr);
+ return ret;
+}
+
+static int wi_node_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ int err;
+ struct node_notify *nn = data;
+ int nid = nn->nid;
+
+ switch (action) {
+ case NODE_ADDED_FIRST_MEMORY:
+ err = sysfs_wi_node_add(nid);
+ if (err)
+ pr_err("failed to add sysfs for node%d during hotplug: %d\n",
+ nid, err);
+ break;
+ case NODE_REMOVED_LAST_MEMORY:
+ sysfs_wi_node_delete(nid);
+ break;
+ }
+
+ return NOTIFY_OK;
}
-static int add_weighted_interleave_group(struct kobject *root_kobj)
+static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
{
- struct kobject *wi_kobj;
int nid, err;
- wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
- if (!wi_kobj)
+ wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids),
+ GFP_KERNEL);
+ if (!wi_group)
return -ENOMEM;
+ mutex_init(&wi_group->kobj_lock);
- err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj,
+ err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj,
"weighted_interleave");
- if (err) {
- kfree(wi_kobj);
- return err;
- }
+ if (err)
+ goto err_put_kobj;
- for_each_node_state(nid, N_POSSIBLE) {
- err = add_weight_node(nid, wi_kobj);
+ err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
+ if (err)
+ goto err_put_kobj;
+
+ for_each_online_node(nid) {
+ if (!node_state(nid, N_MEMORY))
+ continue;
+
+ err = sysfs_wi_node_add(nid);
if (err) {
- pr_err("failed to add sysfs [node%d]\n", nid);
- break;
+ pr_err("failed to add sysfs for node%d during init: %d\n",
+ nid, err);
+ goto err_cleanup_kobj;
}
}
- if (err)
- kobject_put(wi_kobj);
- return 0;
-}
-static void mempolicy_kobj_release(struct kobject *kobj)
-{
- u8 *old;
+ hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
+ return 0;
- mutex_lock(&iw_table_lock);
- old = rcu_dereference_protected(iw_table,
- lockdep_is_held(&iw_table_lock));
- rcu_assign_pointer(iw_table, NULL);
- mutex_unlock(&iw_table_lock);
- synchronize_rcu();
- kfree(old);
- kfree(node_attrs);
- kfree(kobj);
+err_cleanup_kobj:
+ wi_cleanup();
+ kobject_del(&wi_group->wi_kobj);
+err_put_kobj:
+ kobject_put(&wi_group->wi_kobj);
+ return err;
}
-static const struct kobj_type mempolicy_ktype = {
- .release = mempolicy_kobj_release
-};
-
static int __init mempolicy_sysfs_init(void)
{
int err;
static struct kobject *mempolicy_kobj;
- mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL);
- if (!mempolicy_kobj) {
- err = -ENOMEM;
- goto err_out;
- }
-
- node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *),
- GFP_KERNEL);
- if (!node_attrs) {
- err = -ENOMEM;
- goto mempol_out;
- }
+ mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj);
+ if (!mempolicy_kobj)
+ return -ENOMEM;
- err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj,
- "mempolicy");
+ err = add_weighted_interleave_group(mempolicy_kobj);
if (err)
- goto node_out;
+ goto err_kobj;
- err = add_weighted_interleave_group(mempolicy_kobj);
- if (err) {
- pr_err("mempolicy sysfs structure failed to initialize\n");
- kobject_put(mempolicy_kobj);
- return err;
- }
+ return 0;
- return err;
-node_out:
- kfree(node_attrs);
-mempol_out:
- kfree(mempolicy_kobj);
-err_out:
- pr_err("failed to add mempolicy kobject to the system\n");
+err_kobj:
+ kobject_del(mempolicy_kobj);
+ kobject_put(mempolicy_kobj);
return err;
}
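
To make the bandwidth-to-weight conversion above concrete, here is a small stand-alone user-space sketch (plain C, not kernel code) that mirrors reduce_interleave_weights() and the weighted_interleave_nid() walk: bandwidths are scaled into [1, weightiness], reduced by their GCD, and an interleave index is then mapped to a node by walking the cumulative weights. The node count and bandwidth figures are made up for illustration.

#include <stdio.h>

#define WEIGHTINESS 32ULL

static unsigned int gcd_u32(unsigned int a, unsigned int b)
{
	while (b) {
		unsigned int t = a % b;
		a = b;
		b = t;
	}
	return a;
}

/* Mirrors reduce_interleave_weights(): bandwidths -> small integer weights. */
static void reduce_weights(const unsigned int *bw, unsigned char *iw, int nr)
{
	unsigned long long sum_bw = 0;
	unsigned int iw_gcd = 0;
	int nid;

	for (nid = 0; nid < nr; nid++)
		sum_bw += bw[nid];

	for (nid = 0; nid < nr; nid++) {
		unsigned long long scale = WEIGHTINESS * bw[nid];

		iw[nid] = (bw[nid] && sum_bw < scale) ? scale / sum_bw : 1;
		iw_gcd = iw_gcd ? gcd_u32(iw_gcd, iw[nid]) : iw[nid];
	}
	for (nid = 0; nid < nr; nid++)
		iw[nid] /= iw_gcd;
}

/* Mirrors weighted_interleave_nid(): map an interleave index to a node. */
static int pick_node(const unsigned char *iw, int nr, unsigned long ilx)
{
	unsigned int total = 0, target;
	int nid;

	for (nid = 0; nid < nr; nid++)
		total += iw[nid];
	target = ilx % total;
	for (nid = 0; target >= iw[nid]; nid++)
		target -= iw[nid];
	return nid;
}

int main(void)
{
	/* Made-up bandwidths (MB/s): fast DRAM node 0, slower CXL node 1. */
	unsigned int bw[2] = { 300000, 100000 };
	unsigned char iw[2];
	unsigned long ilx;

	reduce_weights(bw, iw, 2);
	/* 24:8 before GCD reduction, published as 3:1 */
	printf("weights: node0=%u node1=%u\n", (unsigned)iw[0], (unsigned)iw[1]);
	for (ilx = 0; ilx < 8; ilx++)
		printf("ilx %lu -> node %d\n", ilx, pick_node(iw, 2, ilx));
	return 0;
}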
diff --git a/mm/mempool.c b/mm/mempool.c
index 3223337135d0..1c38e873e546 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -136,7 +136,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element)
static __always_inline void add_element(mempool_t *pool, void *element)
{
- BUG_ON(pool->curr_nr >= pool->min_nr);
+ BUG_ON(pool->min_nr != 0 && pool->curr_nr >= pool->min_nr);
poison_element(pool, element);
if (kasan_poison_element(pool, element))
pool->elements[pool->curr_nr++] = element;
@@ -202,16 +202,20 @@ int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
pool->alloc = alloc_fn;
pool->free = free_fn;
init_waitqueue_head(&pool->wait);
-
- pool->elements = kmalloc_array_node(min_nr, sizeof(void *),
+ /*
+ * max() is used here to ensure storage for at least one element, to
+ * support zero-minimum pools.
+ */
+ pool->elements = kmalloc_array_node(max(1, min_nr), sizeof(void *),
gfp_mask, node_id);
if (!pool->elements)
return -ENOMEM;
/*
- * First pre-allocate the guaranteed number of buffers.
+ * First pre-allocate the guaranteed number of buffers,
+ * and pre-allocate one element for a zero-minimum pool.
*/
- while (pool->curr_nr < pool->min_nr) {
+ while (pool->curr_nr < max(1, pool->min_nr)) {
void *element;
element = pool->alloc(gfp_mask, pool->pool_data);
@@ -540,11 +544,35 @@ void mempool_free(void *element, mempool_t *pool)
if (likely(pool->curr_nr < pool->min_nr)) {
add_element(pool, element);
spin_unlock_irqrestore(&pool->lock, flags);
- wake_up(&pool->wait);
+ if (wq_has_sleeper(&pool->wait))
+ wake_up(&pool->wait);
+ return;
+ }
+ spin_unlock_irqrestore(&pool->lock, flags);
+ }
+
+ /*
+ * Handle the min_nr = 0 edge case:
+ *
+ * For zero-minimum pools, curr_nr < min_nr (0 < 0) never succeeds,
+ * so waiters sleeping on pool->wait would never be woken by the
+ * wake-up path of the previous test. This explicit check keeps the
+ * freed element in the pool when both min_nr and curr_nr are 0, and
+ * ensures any active waiters are properly awakened.
+ */
+ if (unlikely(pool->min_nr == 0 &&
+ READ_ONCE(pool->curr_nr) == 0)) {
+ spin_lock_irqsave(&pool->lock, flags);
+ if (likely(pool->curr_nr == 0)) {
+ add_element(pool, element);
+ spin_unlock_irqrestore(&pool->lock, flags);
+ if (wq_has_sleeper(&pool->wait))
+ wake_up(&pool->wait);
return;
}
spin_unlock_irqrestore(&pool->lock, flags);
}
+
pool->free(element, pool->pool_data);
}
EXPORT_SYMBOL(mempool_free);
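
With the min_nr == 0 handling above, a pool no longer has to reserve a fixed number of elements to be usable; one element is still pre-allocated internally so the first allocation and the wake-up path have something to hand out. A hedged, illustrative sketch of such a caller follows; the demo_req structure, the cache, and the function names are hypothetical and not part of the patch.

#include <linux/mempool.h>
#include <linux/slab.h>

struct demo_req {
	int id;
};

static struct kmem_cache *demo_cache;	/* hypothetical slab cache */
static mempool_t *demo_pool;

static int demo_pool_init(void)
{
	demo_cache = KMEM_CACHE(demo_req, 0);
	if (!demo_cache)
		return -ENOMEM;

	/*
	 * min_nr == 0: nothing beyond the single internal element is
	 * reserved. mempool_alloc() falls back to the slab allocator and,
	 * under memory pressure, sleeps until mempool_free() returns an
	 * element (the new curr_nr == 0 path wakes such waiters).
	 */
	demo_pool = mempool_create(0, mempool_alloc_slab, mempool_free_slab,
				   demo_cache);
	if (!demo_pool) {
		kmem_cache_destroy(demo_cache);
		return -ENOMEM;
	}
	return 0;
}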
diff --git a/mm/memremap.c b/mm/memremap.c
index 2aebc1b192da..46cb1b0b6f72 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -5,7 +5,6 @@
#include <linux/kasan.h>
#include <linux/memory_hotplug.h>
#include <linux/memremap.h>
-#include <linux/pfn_t.h>
#include <linux/swap.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
@@ -39,30 +38,6 @@ unsigned long memremap_compat_align(void)
EXPORT_SYMBOL_GPL(memremap_compat_align);
#endif
-#ifdef CONFIG_FS_DAX
-DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
-EXPORT_SYMBOL(devmap_managed_key);
-
-static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
-{
- if (pgmap->type == MEMORY_DEVICE_FS_DAX)
- static_branch_dec(&devmap_managed_key);
-}
-
-static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
-{
- if (pgmap->type == MEMORY_DEVICE_FS_DAX)
- static_branch_inc(&devmap_managed_key);
-}
-#else
-static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
-{
-}
-static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
-{
-}
-#endif /* CONFIG_FS_DAX */
-
static void pgmap_array_delete(struct range *range)
{
xa_store_range(&pgmap_array, PHYS_PFN(range->start), PHYS_PFN(range->end),
@@ -130,7 +105,7 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
}
mem_hotplug_done();
- untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true);
+ pfnmap_untrack(PHYS_PFN(range->start), range_len(range));
pgmap_array_delete(range);
}
@@ -151,7 +126,6 @@ void memunmap_pages(struct dev_pagemap *pgmap)
percpu_ref_exit(&pgmap->ref);
WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n");
- devmap_managed_enable_put(pgmap);
}
EXPORT_SYMBOL_GPL(memunmap_pages);
@@ -179,14 +153,14 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
"altmap not supported for multiple ranges\n"))
return -EINVAL;
- conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->start), NULL);
+ conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->start));
if (conflict_pgmap) {
WARN(1, "Conflicting mapping in same section\n");
put_dev_pagemap(conflict_pgmap);
return -ENOMEM;
}
- conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->end), NULL);
+ conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->end));
if (conflict_pgmap) {
WARN(1, "Conflicting mapping in same section\n");
put_dev_pagemap(conflict_pgmap);
@@ -211,8 +185,8 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
if (nid < 0)
nid = numa_mem_id();
- error = track_pfn_remap(NULL, &params->pgprot, PHYS_PFN(range->start), 0,
- range_len(range));
+ error = pfnmap_track(PHYS_PFN(range->start), range_len(range),
+ &params->pgprot);
if (error)
goto err_pfn_remap;
@@ -254,7 +228,7 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
move_pfn_range_to_zone(zone, PHYS_PFN(range->start),
PHYS_PFN(range_len(range)), params->altmap,
- MIGRATE_MOVABLE);
+ MIGRATE_MOVABLE, false);
}
mem_hotplug_done();
@@ -277,7 +251,7 @@ err_add_memory:
if (!is_private)
kasan_remove_zero_shadow(__va(range->start), range_len(range));
err_kasan:
- untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true);
+ pfnmap_untrack(PHYS_PFN(range->start), range_len(range));
err_pfn_remap:
pgmap_array_delete(range);
return error;
@@ -301,6 +275,9 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
if (WARN_ONCE(!nr_range, "nr_range must be specified\n"))
return ERR_PTR(-EINVAL);
+ if (WARN_ONCE(pgmap->vmemmap_shift > MAX_FOLIO_ORDER,
+ "requested folio size unsupported\n"))
+ return ERR_PTR(-EINVAL);
switch (pgmap->type) {
case MEMORY_DEVICE_PRIVATE:
@@ -332,10 +309,6 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
}
break;
case MEMORY_DEVICE_FS_DAX:
- if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
- WARN(1, "File system DAX not supported\n");
- return ERR_PTR(-EINVAL);
- }
params.pgprot = pgprot_decrypted(params.pgprot);
break;
case MEMORY_DEVICE_GENERIC:
@@ -354,8 +327,6 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
if (error)
return ERR_PTR(error);
- devmap_managed_enable_get(pgmap);
-
/*
* Clear the pgmap nr_range as it will be incremented for each
* successfully processed range. This communicates how many
@@ -426,26 +397,12 @@ EXPORT_SYMBOL_GPL(devm_memunmap_pages);
/**
* get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn
* @pfn: page frame number to lookup page_map
- * @pgmap: optional known pgmap that already has a reference
- *
- * If @pgmap is non-NULL and covers @pfn it will be returned as-is. If @pgmap
- * is non-NULL but does not cover @pfn the reference to it will be released.
*/
-struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
- struct dev_pagemap *pgmap)
+struct dev_pagemap *get_dev_pagemap(unsigned long pfn)
{
+ struct dev_pagemap *pgmap;
resource_size_t phys = PFN_PHYS(pfn);
- /*
- * In the cached case we're already holding a live reference.
- */
- if (pgmap) {
- if (phys >= pgmap->range.start && phys <= pgmap->range.end)
- return pgmap;
- put_dev_pagemap(pgmap);
- }
-
- /* fall back to slow path lookup */
rcu_read_lock();
pgmap = xa_load(&pgmap_array, PHYS_PFN(phys));
if (pgmap && !percpu_ref_tryget_live_rcu(&pgmap->ref))
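
With the cached-pgmap argument gone, get_dev_pagemap() is a plain pfn lookup that returns either NULL or a pgmap with a live reference. A hedged sketch of the simplified caller pattern; pfn_is_device_private() is a hypothetical helper, not from the patch.

#include <linux/memremap.h>

static bool pfn_is_device_private(unsigned long pfn)
{
	struct dev_pagemap *pgmap = get_dev_pagemap(pfn);
	bool ret;

	if (!pgmap)
		return false;
	ret = pgmap->type == MEMORY_DEVICE_PRIVATE;
	put_dev_pagemap(pgmap);	/* drop the reference the lookup took */
	return ret;
}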
diff --git a/mm/migrate.c b/mm/migrate.c
index 676d9cfc7059..e3065c9edb55 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -35,7 +35,6 @@
#include <linux/compat.h>
#include <linux/hugetlb.h>
#include <linux/gfp.h>
-#include <linux/pfn_t.h>
#include <linux/page_idle.h>
#include <linux/page_owner.h>
#include <linux/sched/mm.h>
@@ -50,9 +49,73 @@
#include <trace/events/migrate.h>
#include "internal.h"
+#include "swap.h"
-bool isolate_movable_page(struct page *page, isolate_mode_t mode)
+static const struct movable_operations *offline_movable_ops;
+static const struct movable_operations *zsmalloc_movable_ops;
+
+int set_movable_ops(const struct movable_operations *ops, enum pagetype type)
+{
+ /*
+ * We only allow for selected types and don't handle concurrent
+ * registration attempts yet.
+ */
+ switch (type) {
+ case PGTY_offline:
+ if (offline_movable_ops && ops)
+ return -EBUSY;
+ offline_movable_ops = ops;
+ break;
+ case PGTY_zsmalloc:
+ if (zsmalloc_movable_ops && ops)
+ return -EBUSY;
+ zsmalloc_movable_ops = ops;
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(set_movable_ops);
+
+static const struct movable_operations *page_movable_ops(struct page *page)
+{
+ VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page);
+
+ /*
+ * If we enable page migration for a page of a certain type by marking
+ * it as movable, the page type must be sticky until the page gets freed
+ * back to the buddy.
+ */
+ if (PageOffline(page))
+ /* Only balloon compaction sets PageOffline pages movable. */
+ return offline_movable_ops;
+ if (PageZsmalloc(page))
+ return zsmalloc_movable_ops;
+
+ return NULL;
+}
+
+/**
+ * isolate_movable_ops_page - isolate a movable_ops page for migration
+ * @page: The page.
+ * @mode: The isolation mode.
+ *
+ * Try to isolate a movable_ops page for migration. Will fail if the page is
+ * not a movable_ops page, if the page is already isolated for migration
+ * or if the page was just released by its owner.
+ *
+ * Once isolated, the page cannot get freed until it is either putback
+ * or migrated.
+ *
+ * Returns true if isolation succeeded, otherwise false.
+ */
+bool isolate_movable_ops_page(struct page *page, isolate_mode_t mode)
{
+ /*
+ * TODO: these pages will not be folios in the future. All
+ * folio dependencies will have to be removed.
+ */
struct folio *folio = folio_get_nontail_page(page);
const struct movable_operations *mops;
@@ -69,11 +132,14 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode)
goto out;
/*
- * Check movable flag before taking the page lock because
+ * Check for movable_ops pages before taking the page lock because
* we use non-atomic bitops on newly allocated page flags so
* unconditionally grabbing the lock ruins page's owner side.
+ *
+ * Note that once a page has movable_ops, it will stay that way
+ * until the page is freed.
*/
- if (unlikely(!__folio_test_movable(folio)))
+ if (unlikely(!page_has_movable_ops(page)))
goto out_putfolio;
/*
@@ -90,18 +156,20 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode)
if (unlikely(!folio_trylock(folio)))
goto out_putfolio;
- if (!folio_test_movable(folio) || folio_test_isolated(folio))
+ VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page);
+ if (PageMovableOpsIsolated(page))
goto out_no_isolated;
- mops = folio_movable_ops(folio);
- VM_BUG_ON_FOLIO(!mops, folio);
+ mops = page_movable_ops(page);
+ if (WARN_ON_ONCE(!mops))
+ goto out_no_isolated;
- if (!mops->isolate_page(&folio->page, mode))
+ if (!mops->isolate_page(page, mode))
goto out_no_isolated;
/* Driver shouldn't use the isolated flag */
- WARN_ON_ONCE(folio_test_isolated(folio));
- folio_set_isolated(folio);
+ VM_WARN_ON_ONCE_PAGE(PageMovableOpsIsolated(page), page);
+ SetPageMovableOpsIsolated(page);
folio_unlock(folio);
return true;
@@ -114,12 +182,68 @@ out:
return false;
}
-static void putback_movable_folio(struct folio *folio)
+/**
+ * putback_movable_ops_page - putback an isolated movable_ops page
+ * @page: The isolated page.
+ *
+ * Putback an isolated movable_ops page.
+ *
+ * After the page was putback, it might get freed instantly.
+ */
+static void putback_movable_ops_page(struct page *page)
{
- const struct movable_operations *mops = folio_movable_ops(folio);
+ /*
+ * TODO: these pages will not be folios in the future. All
+ * folio dependencies will have to be removed.
+ */
+ struct folio *folio = page_folio(page);
- mops->putback_page(&folio->page);
- folio_clear_isolated(folio);
+ VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page);
+ VM_WARN_ON_ONCE_PAGE(!PageMovableOpsIsolated(page), page);
+ folio_lock(folio);
+ page_movable_ops(page)->putback_page(page);
+ ClearPageMovableOpsIsolated(page);
+ folio_unlock(folio);
+ folio_put(folio);
+}
+
+/**
+ * migrate_movable_ops_page - migrate an isolated movable_ops page
+ * @dst: The destination page.
+ * @src: The source page.
+ * @mode: The migration mode.
+ *
+ * Migrate an isolated movable_ops page.
+ *
+ * If the src page was already released by its owner, the src page is
+ * un-isolated (putback) and migration succeeds; the migration core will be the
+ * owner of both pages.
+ *
+ * If the src page was not released by its owner and the migration was
+ * successful, the owner of the src page and the dst page are swapped and
+ * the src page is un-isolated.
+ *
+ * If migration fails, the ownership stays unmodified and the src page
+ * remains isolated: migration may be retried later or the page can be putback.
+ *
+ * TODO: migration core will treat both pages as folios and lock them before
+ * this call and unlock them after this call. Further, the folio refcounts on
+ * src and dst are also released by migration core. These pages will not be
+ * folios in the future, so that must be reworked.
+ *
+ * Returns 0 on success, otherwise a negative error code.
+ */
+static int migrate_movable_ops_page(struct page *dst, struct page *src,
+ enum migrate_mode mode)
+{
+ int rc;
+
+ VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(src), src);
+ VM_WARN_ON_ONCE_PAGE(!PageMovableOpsIsolated(src), src);
+ rc = page_movable_ops(src)->migrate_page(dst, src, mode);
+ if (!rc)
+ ClearPageMovableOpsIsolated(src);
+ return rc;
}
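Illustrative sketch (not part of the diff): how the three helpers above fit together inside the migration core, with locking, batching and statistics omitted.

/*
 * Hedged outline of the movable_ops flow in the migration core:
 *
 *	if (!isolate_movable_ops_page(page, mode))
 *		return;		// not movable_ops, already isolated,
 *				// or just released by its owner
 *	...
 *	rc = migrate_movable_ops_page(dst, page, mode);
 *	if (rc)
 *		putback_movable_ops_page(page);	// still isolated; hand it back
 *	// on success the isolated flag is cleared and dst replaces page
 */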
/*
@@ -141,20 +265,8 @@ void putback_movable_pages(struct list_head *l)
continue;
}
list_del(&folio->lru);
- /*
- * We isolated non-lru movable folio so here we can use
- * __folio_test_movable because LRU folio's mapping cannot
- * have PAGE_MAPPING_MOVABLE.
- */
- if (unlikely(__folio_test_movable(folio))) {
- VM_BUG_ON_FOLIO(!folio_test_isolated(folio), folio);
- folio_lock(folio);
- if (folio_test_movable(folio))
- putback_movable_folio(folio);
- else
- folio_clear_isolated(folio);
- folio_unlock(folio);
- folio_put(folio);
+ if (unlikely(page_has_movable_ops(&folio->page))) {
+ putback_movable_ops_page(&folio->page);
} else {
node_stat_mod_folio(folio, NR_ISOLATED_ANON +
folio_is_file_lru(folio), -folio_nr_pages(folio));
@@ -166,43 +278,34 @@ void putback_movable_pages(struct list_head *l)
/* Must be called with an elevated refcount on the non-hugetlb folio */
bool isolate_folio_to_list(struct folio *folio, struct list_head *list)
{
- bool isolated, lru;
-
if (folio_test_hugetlb(folio))
return folio_isolate_hugetlb(folio, list);
- lru = !__folio_test_movable(folio);
- if (lru)
- isolated = folio_isolate_lru(folio);
- else
- isolated = isolate_movable_page(&folio->page,
- ISOLATE_UNEVICTABLE);
-
- if (!isolated)
- return false;
-
- list_add(&folio->lru, list);
- if (lru)
+ if (page_has_movable_ops(&folio->page)) {
+ if (!isolate_movable_ops_page(&folio->page,
+ ISOLATE_UNEVICTABLE))
+ return false;
+ } else {
+ if (!folio_isolate_lru(folio))
+ return false;
node_stat_add_folio(folio, NR_ISOLATED_ANON +
folio_is_file_lru(folio));
-
+ }
+ list_add(&folio->lru, list);
return true;
}
static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
- struct folio *folio,
- unsigned long idx)
+ struct folio *folio, pte_t old_pte, unsigned long idx)
{
struct page *page = folio_page(folio, idx);
- bool contains_data;
pte_t newpte;
- void *addr;
if (PageCompound(page))
return false;
VM_BUG_ON_PAGE(!PageAnon(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(pte_present(ptep_get(pvmw->pte)), page);
+ VM_BUG_ON_PAGE(pte_present(old_pte), page);
if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) ||
mm_forbids_zeropage(pvmw->vma->vm_mm))
@@ -213,15 +316,17 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
* this subpage has been non present. If the subpage is only zero-filled
* then map it to the shared zeropage.
*/
- addr = kmap_local_page(page);
- contains_data = memchr_inv(addr, 0, PAGE_SIZE);
- kunmap_local(addr);
-
- if (contains_data)
+ if (!pages_identical(page, ZERO_PAGE(0)))
return false;
newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address),
pvmw->vma->vm_page_prot));
+
+ if (pte_swp_soft_dirty(old_pte))
+ newpte = pte_mksoft_dirty(newpte);
+ if (pte_swp_uffd_wp(old_pte))
+ newpte = pte_mkuffd_wp(newpte);
+
set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte);
dec_mm_counter(pvmw->vma->vm_mm, mm_counter(folio));
@@ -264,13 +369,13 @@ static bool remove_migration_pte(struct folio *folio,
continue;
}
#endif
+ old_pte = ptep_get(pvmw.pte);
if (rmap_walk_arg->map_unused_to_zeropage &&
- try_to_map_unused_to_zeropage(&pvmw, folio, idx))
+ try_to_map_unused_to_zeropage(&pvmw, folio, old_pte, idx))
continue;
folio_get(folio);
pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
- old_pte = ptep_get(pvmw.pte);
entry = pte_to_swp_entry(old_pte);
if (!is_migration_entry_young(entry))
@@ -445,20 +550,6 @@ unlock:
}
#endif
-static int folio_expected_refs(struct address_space *mapping,
- struct folio *folio)
-{
- int refs = 1;
- if (!mapping)
- return refs;
-
- refs += folio_nr_pages(folio);
- if (folio_test_private(folio))
- refs++;
-
- return refs;
-}
-
/*
* Replace the folio in the mapping.
*
@@ -471,10 +562,10 @@ static int __folio_migrate_mapping(struct address_space *mapping,
struct folio *newfolio, struct folio *folio, int expected_count)
{
XA_STATE(xas, &mapping->i_pages, folio_index(folio));
+ struct swap_cluster_info *ci = NULL;
struct zone *oldzone, *newzone;
int dirty;
long nr = folio_nr_pages(folio);
- long entries, i;
if (!mapping) {
/* Take off deferred split queue while frozen and memcg set */
@@ -494,15 +585,22 @@ static int __folio_migrate_mapping(struct address_space *mapping,
if (folio_test_swapbacked(folio))
__folio_set_swapbacked(newfolio);
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
oldzone = folio_zone(folio);
newzone = folio_zone(newfolio);
- xas_lock_irq(&xas);
+ if (folio_test_swapcache(folio))
+ ci = swap_cluster_get_and_lock_irq(folio);
+ else
+ xas_lock_irq(&xas);
+
if (!folio_ref_freeze(folio, expected_count)) {
- xas_unlock_irq(&xas);
+ if (ci)
+ swap_cluster_unlock_irq(ci);
+ else
+ xas_unlock_irq(&xas);
return -EAGAIN;
}
@@ -523,9 +621,6 @@ static int __folio_migrate_mapping(struct address_space *mapping,
if (folio_test_swapcache(folio)) {
folio_set_swapcache(newfolio);
newfolio->private = folio_get_private(folio);
- entries = nr;
- } else {
- entries = 1;
}
/* Move dirty while folio refs frozen and newfolio not yet exposed */
@@ -535,11 +630,10 @@ static int __folio_migrate_mapping(struct address_space *mapping,
folio_set_dirty(newfolio);
}
- /* Swap cache still stores N entries instead of a high-order entry */
- for (i = 0; i < entries; i++) {
+ if (folio_test_swapcache(folio))
+ __swap_cache_replace_folio(ci, folio, newfolio);
+ else
xas_store(&xas, newfolio);
- xas_next(&xas);
- }
/*
* Drop cache reference from old folio by unfreezing
@@ -548,8 +642,11 @@ static int __folio_migrate_mapping(struct address_space *mapping,
*/
folio_ref_unfreeze(folio, expected_count - nr);
- xas_unlock(&xas);
/* Leave irq disabled to prevent preemption while updating stats */
+ if (ci)
+ swap_cluster_unlock(ci);
+ else
+ xas_unlock(&xas);
/*
* If moved to a different zone then also account
@@ -595,13 +692,13 @@ static int __folio_migrate_mapping(struct address_space *mapping,
}
local_irq_enable();
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
int folio_migrate_mapping(struct address_space *mapping,
struct folio *newfolio, struct folio *folio, int extra_count)
{
- int expected_count = folio_expected_refs(mapping, folio) + extra_count;
+ int expected_count = folio_expected_ref_count(folio) + extra_count + 1;
if (folio_ref_count(folio) != expected_count)
return -EAGAIN;
@@ -618,7 +715,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
struct folio *dst, struct folio *src)
{
XA_STATE(xas, &mapping->i_pages, folio_index(src));
- int rc, expected_count = folio_expected_refs(mapping, src);
+ int rc, expected_count = folio_expected_ref_count(src) + 1;
if (folio_ref_count(src) != expected_count)
return -EAGAIN;
@@ -644,7 +741,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
xas_unlock_irq(&xas);
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
/*
@@ -749,7 +846,7 @@ static int __migrate_folio(struct address_space *mapping, struct folio *dst,
struct folio *src, void *src_private,
enum migrate_mode mode)
{
- int rc, expected_count = folio_expected_refs(mapping, src);
+ int rc, expected_count = folio_expected_ref_count(src) + 1;
/* Check whether src does not have extra refs before we do more work */
if (folio_ref_count(src) != expected_count)
@@ -760,14 +857,14 @@ static int __migrate_folio(struct address_space *mapping, struct folio *dst,
return rc;
rc = __folio_migrate_mapping(mapping, dst, src, expected_count);
- if (rc != MIGRATEPAGE_SUCCESS)
+ if (rc)
return rc;
if (src_private)
folio_attach_private(dst, folio_detach_private(src));
folio_migrate_flags(dst, src);
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
/**
@@ -837,7 +934,7 @@ static int __buffer_migrate_folio(struct address_space *mapping,
return migrate_folio(mapping, dst, src, mode);
/* Check whether page does not have extra refs before we do more work */
- expected_count = folio_expected_refs(mapping, src);
+ expected_count = folio_expected_ref_count(src) + 1;
if (folio_ref_count(src) != expected_count)
return -EAGAIN;
@@ -874,7 +971,7 @@ recheck_buffers:
}
rc = filemap_migrate_folio(mapping, dst, src, mode);
- if (rc != MIGRATEPAGE_SUCCESS)
+ if (rc)
goto unlock_buffers;
bh = head;
@@ -947,66 +1044,20 @@ int filemap_migrate_folio(struct address_space *mapping,
EXPORT_SYMBOL_GPL(filemap_migrate_folio);
/*
- * Writeback a folio to clean the dirty state
- */
-static int writeout(struct address_space *mapping, struct folio *folio)
-{
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- .nr_to_write = 1,
- .range_start = 0,
- .range_end = LLONG_MAX,
- .for_reclaim = 1
- };
- int rc;
-
- if (!mapping->a_ops->writepage)
- /* No write method for the address space */
- return -EINVAL;
-
- if (!folio_clear_dirty_for_io(folio))
- /* Someone else already triggered a write */
- return -EAGAIN;
-
- /*
- * A dirty folio may imply that the underlying filesystem has
- * the folio on some queue. So the folio must be clean for
- * migration. Writeout may mean we lose the lock and the
- * folio state is no longer what we checked for earlier.
- * At this point we know that the migration attempt cannot
- * be successful.
- */
- remove_migration_ptes(folio, folio, 0);
-
- rc = mapping->a_ops->writepage(&folio->page, &wbc);
-
- if (rc != AOP_WRITEPAGE_ACTIVATE)
- /* unlocked. Relock */
- folio_lock(folio);
-
- return (rc < 0) ? -EIO : -EAGAIN;
-}
-
-/*
* Default handling if a filesystem does not provide a migration function.
*/
static int fallback_migrate_folio(struct address_space *mapping,
struct folio *dst, struct folio *src, enum migrate_mode mode)
{
- if (folio_test_dirty(src)) {
- /* Only writeback folios in full synchronous migration */
- switch (mode) {
- case MIGRATE_SYNC:
- break;
- default:
- return -EBUSY;
- }
- return writeout(mapping, src);
- }
+ WARN_ONCE(mapping->a_ops->writepages,
+ "%ps does not implement migrate_folio\n",
+ mapping->a_ops);
+ if (folio_test_dirty(src))
+ return -EBUSY;
/*
- * Buffers may be managed in a filesystem specific way.
- * We must have no buffers or drop them.
+ * Filesystem may have private data at folio->private that we
+ * can't migrate automatically.
*/
if (!filemap_release_folio(src, GFP_KERNEL))
return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
@@ -1015,91 +1066,54 @@ static int fallback_migrate_folio(struct address_space *mapping,
}
/*
- * Move a page to a newly allocated page
- * The page is locked and all ptes have been successfully removed.
+ * Move a src folio to a newly allocated dst folio.
+ *
+ * The src and dst folios are locked and the src folio was unmapped from
+ * the page tables.
*
- * The new page will have replaced the old page if this function
- * is successful.
+ * On success, the src folio was replaced by the dst folio.
*
* Return value:
* < 0 - error code
- * MIGRATEPAGE_SUCCESS - success
+ * 0 - success
*/
static int move_to_new_folio(struct folio *dst, struct folio *src,
enum migrate_mode mode)
{
+ struct address_space *mapping = folio_mapping(src);
int rc = -EAGAIN;
- bool is_lru = !__folio_test_movable(src);
VM_BUG_ON_FOLIO(!folio_test_locked(src), src);
VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst);
- if (likely(is_lru)) {
- struct address_space *mapping = folio_mapping(src);
-
- if (!mapping)
- rc = migrate_folio(mapping, dst, src, mode);
- else if (mapping_inaccessible(mapping))
- rc = -EOPNOTSUPP;
- else if (mapping->a_ops->migrate_folio)
- /*
- * Most folios have a mapping and most filesystems
- * provide a migrate_folio callback. Anonymous folios
- * are part of swap space which also has its own
- * migrate_folio callback. This is the most common path
- * for page migration.
- */
- rc = mapping->a_ops->migrate_folio(mapping, dst, src,
- mode);
- else
- rc = fallback_migrate_folio(mapping, dst, src, mode);
- } else {
- const struct movable_operations *mops;
-
+ if (!mapping)
+ rc = migrate_folio(mapping, dst, src, mode);
+ else if (mapping_inaccessible(mapping))
+ rc = -EOPNOTSUPP;
+ else if (mapping->a_ops->migrate_folio)
/*
- * In case of non-lru page, it could be released after
- * isolation step. In that case, we shouldn't try migration.
+ * Most folios have a mapping and most filesystems
+ * provide a migrate_folio callback. Anonymous folios
+ * are part of swap space which also has its own
+ * migrate_folio callback. This is the most common path
+ * for page migration.
*/
- VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);
- if (!folio_test_movable(src)) {
- rc = MIGRATEPAGE_SUCCESS;
- folio_clear_isolated(src);
- goto out;
- }
-
- mops = folio_movable_ops(src);
- rc = mops->migrate_page(&dst->page, &src->page, mode);
- WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
- !folio_test_isolated(src));
- }
-
- /*
- * When successful, old pagecache src->mapping must be cleared before
- * src is freed; but stats require that PageAnon be left as PageAnon.
- */
- if (rc == MIGRATEPAGE_SUCCESS) {
- if (__folio_test_movable(src)) {
- VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);
-
- /*
- * We clear PG_movable under page_lock so any compactor
- * cannot try to migrate this page.
- */
- folio_clear_isolated(src);
- }
+ rc = mapping->a_ops->migrate_folio(mapping, dst, src,
+ mode);
+ else
+ rc = fallback_migrate_folio(mapping, dst, src, mode);
+ if (!rc) {
/*
- * Anonymous and movable src->mapping will be cleared by
- * free_pages_prepare so don't reset it here for keeping
- * the type to work PageAnon, for example.
+ * For pagecache folios, src->mapping must be cleared before src
+ * is freed. Anonymous folios must stay anonymous until freed.
*/
- if (!folio_mapping_flags(src))
+ if (!folio_test_anon(src))
src->mapping = NULL;
if (likely(!folio_is_zone_device(dst)))
flush_dcache_folio(dst);
}
-out:
return rc;
}
@@ -1166,12 +1180,7 @@ static void migrate_folio_undo_dst(struct folio *dst, bool locked,
static void migrate_folio_done(struct folio *src,
enum migrate_reason reason)
{
- /*
- * Compaction can migrate also non-LRU pages which are
- * not accounted to NR_ISOLATED_*. They can be recognized
- * as __folio_test_movable
- */
- if (likely(!__folio_test_movable(src)) && reason != MR_DEMOTION)
+ if (likely(!page_has_movable_ops(&src->page)) && reason != MR_DEMOTION)
mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON +
folio_is_file_lru(src), -folio_nr_pages(src));
@@ -1184,26 +1193,15 @@ static void migrate_folio_done(struct folio *src,
static int migrate_folio_unmap(new_folio_t get_new_folio,
free_folio_t put_new_folio, unsigned long private,
struct folio *src, struct folio **dstp, enum migrate_mode mode,
- enum migrate_reason reason, struct list_head *ret)
+ struct list_head *ret)
{
struct folio *dst;
int rc = -EAGAIN;
int old_page_state = 0;
struct anon_vma *anon_vma = NULL;
- bool is_lru = data_race(!__folio_test_movable(src));
bool locked = false;
bool dst_locked = false;
- if (folio_ref_count(src) == 1) {
- /* Folio was freed from under us. So we are done. */
- folio_clear_active(src);
- folio_clear_unevictable(src);
- /* free_pages_prepare() will clear PG_isolated. */
- list_del(&src->lru);
- migrate_folio_done(src, reason);
- return MIGRATEPAGE_SUCCESS;
- }
-
dst = get_new_folio(src, private);
if (!dst)
return -ENOMEM;
@@ -1291,9 +1289,9 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
goto out;
dst_locked = true;
- if (unlikely(!is_lru)) {
+ if (unlikely(page_has_movable_ops(&src->page))) {
__migrate_folio_record(dst, old_page_state, anon_vma);
- return MIGRATEPAGE_UNMAP;
+ return 0;
}
/*
@@ -1323,7 +1321,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
if (!folio_mapped(src)) {
__migrate_folio_record(dst, old_page_state, anon_vma);
- return MIGRATEPAGE_UNMAP;
+ return 0;
}
out:
@@ -1350,20 +1348,23 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
int rc;
int old_page_state = 0;
struct anon_vma *anon_vma = NULL;
- bool is_lru = !__folio_test_movable(src);
struct list_head *prev;
__migrate_folio_extract(dst, &old_page_state, &anon_vma);
prev = dst->lru.prev;
list_del(&dst->lru);
+ if (unlikely(page_has_movable_ops(&src->page))) {
+ rc = migrate_movable_ops_page(&dst->page, &src->page, mode);
+ if (rc)
+ goto out;
+ goto out_unlock_both;
+ }
+
rc = move_to_new_folio(dst, src, mode);
if (rc)
goto out;
- if (unlikely(!is_lru))
- goto out_unlock_both;
-
/*
* When successful, push dst to LRU immediately: so that if it
* turns out to be an mlocked page, remove_migration_ptes() will
@@ -1382,7 +1383,7 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
out_unlock_both:
folio_unlock(dst);
- set_page_owner_migrate_reason(&dst->page, reason);
+ folio_set_owner_migrate_reason(dst, reason);
/*
* If migration is successful, decrease refcount of dst,
* which will not free the page because new page owner increased
@@ -1452,7 +1453,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
if (folio_ref_count(src) == 1) {
/* page was freed from under us. So we are done. */
folio_putback_hugetlb(src);
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
dst = get_new_folio(src, private);
@@ -1515,8 +1516,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
rc = move_to_new_folio(dst, src, mode);
if (page_was_mapped)
- remove_migration_ptes(src,
- rc == MIGRATEPAGE_SUCCESS ? dst : src, 0);
+ remove_migration_ptes(src, !rc ? dst : src, 0);
unlock_put_anon:
folio_unlock(dst);
@@ -1525,7 +1525,7 @@ put_anon:
if (anon_vma)
put_anon_vma(anon_vma);
- if (rc == MIGRATEPAGE_SUCCESS) {
+ if (!rc) {
move_hugetlb_state(src, dst, reason);
put_new_folio = NULL;
}
@@ -1533,7 +1533,7 @@ put_anon:
out_unlock:
folio_unlock(src);
out:
- if (rc == MIGRATEPAGE_SUCCESS)
+ if (!rc)
folio_putback_hugetlb(src);
else if (rc != -EAGAIN)
list_move_tail(&src->lru, ret);
@@ -1643,7 +1643,7 @@ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio,
reason, ret_folios);
/*
* The rules are:
- * Success: hugetlb folio will be put back
+ * 0: hugetlb folio will be put back
* -EAGAIN: stay on the from list
* -ENOMEM: stay on the from list
* Other errno: put on ret_folios list
@@ -1660,7 +1660,7 @@ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio,
retry++;
nr_retry_pages += nr_pages;
break;
- case MIGRATEPAGE_SUCCESS:
+ case 0:
stats->nr_succeeded += nr_pages;
break;
default:
@@ -1714,7 +1714,7 @@ static void migrate_folios_move(struct list_head *src_folios,
reason, ret_folios);
/*
* The rules are:
- * Success: folio will be freed
+ * 0: folio will be freed
* -EAGAIN: stay on the unmap_folios list
* Other errno: put on ret_folios list
*/
@@ -1724,7 +1724,7 @@ static void migrate_folios_move(struct list_head *src_folios,
*thp_retry += is_thp;
*nr_retry_pages += nr_pages;
break;
- case MIGRATEPAGE_SUCCESS:
+ case 0:
stats->nr_succeeded += nr_pages;
stats->nr_thp_succeeded += is_thp;
break;
@@ -1863,14 +1863,27 @@ static int migrate_pages_batch(struct list_head *from,
continue;
}
+ /*
+ * If we are holding the last folio reference, the folio
+ * was freed from under us, so just drop our reference.
+ */
+ if (likely(!page_has_movable_ops(&folio->page)) &&
+ folio_ref_count(folio) == 1) {
+ folio_clear_active(folio);
+ folio_clear_unevictable(folio);
+ list_del(&folio->lru);
+ migrate_folio_done(folio, reason);
+ stats->nr_succeeded += nr_pages;
+ stats->nr_thp_succeeded += is_thp;
+ continue;
+ }
+
rc = migrate_folio_unmap(get_new_folio, put_new_folio,
- private, folio, &dst, mode, reason,
- ret_folios);
+ private, folio, &dst, mode, ret_folios);
/*
* The rules are:
- * Success: folio will be freed
- * Unmap: folio will be put on unmap_folios list,
- * dst folio put on dst_folios list
+ * 0: folio will be put on unmap_folios list,
+ * dst folio put on dst_folios list
* -EAGAIN: stay on the from list
* -ENOMEM: stay on the from list
* Other errno: put on ret_folios list
@@ -1920,11 +1933,7 @@ static int migrate_pages_batch(struct list_head *from,
thp_retry += is_thp;
nr_retry_pages += nr_pages;
break;
- case MIGRATEPAGE_SUCCESS:
- stats->nr_succeeded += nr_pages;
- stats->nr_thp_succeeded += is_thp;
- break;
- case MIGRATEPAGE_UNMAP:
+ case 0:
list_move_tail(&folio->lru, &unmap_folios);
list_add_tail(&dst->lru, &dst_folios);
break;
@@ -2378,13 +2387,6 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
}
/*
- * The move_pages() man page does not have an -EEXIST choice, so
- * use -EFAULT instead.
- */
- if (err == -EEXIST)
- err = -EFAULT;
-
- /*
* If the page is already on the target node (!err), store the
* node, otherwise, store the err.
*/
@@ -2458,6 +2460,7 @@ set_status:
static int get_compat_pages_array(const void __user *chunk_pages[],
const void __user * __user *pages,
+ unsigned long chunk_offset,
unsigned long chunk_nr)
{
compat_uptr_t __user *pages32 = (compat_uptr_t __user *)pages;
@@ -2465,7 +2468,7 @@ static int get_compat_pages_array(const void __user *chunk_pages[],
int i;
for (i = 0; i < chunk_nr; i++) {
- if (get_user(p, pages32 + i))
+ if (get_user(p, pages32 + chunk_offset + i))
return -EFAULT;
chunk_pages[i] = compat_ptr(p);
}
@@ -2484,27 +2487,28 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
#define DO_PAGES_STAT_CHUNK_NR 16UL
const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
int chunk_status[DO_PAGES_STAT_CHUNK_NR];
+ unsigned long chunk_offset = 0;
while (nr_pages) {
unsigned long chunk_nr = min(nr_pages, DO_PAGES_STAT_CHUNK_NR);
if (in_compat_syscall()) {
if (get_compat_pages_array(chunk_pages, pages,
- chunk_nr))
+ chunk_offset, chunk_nr))
break;
} else {
- if (copy_from_user(chunk_pages, pages,
+ if (copy_from_user(chunk_pages, pages + chunk_offset,
chunk_nr * sizeof(*chunk_pages)))
break;
}
do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
- if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
+ if (copy_to_user(status + chunk_offset, chunk_status,
+ chunk_nr * sizeof(*status)))
break;
- pages += chunk_nr;
- status += chunk_nr;
+ chunk_offset += chunk_nr;
nr_pages -= chunk_nr;
}
return nr_pages ? -EFAULT : 0;
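Hedged reading of the do_pages_stat() hunk above (inferred from the code, not stated in the diff): keeping the user array base pointers fixed and indexing with chunk_offset makes the stride correct for both ABIs, because the offset is applied in array elements.

/*
 *   native:  copy_from_user(chunk_pages, pages + chunk_offset, ...)
 *            advances by chunk_offset * sizeof(void __user *)
 *   compat:  get_user(p, pages32 + chunk_offset + i)
 *            advances by (chunk_offset + i) * sizeof(compat_uptr_t)
 *
 * The old code advanced 'pages' itself by chunk_nr, i.e. by the native
 * pointer stride, even though a compat caller's buffer holds 32-bit
 * pointers, so compat status queries beyond the first 16-entry chunk
 * read from the wrong user addresses.
 */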
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 3158afe7eb23..abd9f6850db6 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -615,7 +615,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
pmdp = pmd_alloc(mm, pudp, addr);
if (!pmdp)
goto abort;
- if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
+ if (pmd_trans_huge(*pmdp))
goto abort;
if (pte_alloc(mm, pmdp))
goto abort;
@@ -778,7 +778,7 @@ static void __migrate_device_pages(unsigned long *src_pfns,
if (migrate && migrate->fault_page == page)
extra_cnt = 1;
r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt);
- if (r != MIGRATEPAGE_SUCCESS)
+ if (r)
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
else
folio_migrate_flags(newfolio, folio);
diff --git a/mm/mincore.c b/mm/mincore.c
index 832f29f46767..8ec4719370e1 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -21,6 +21,7 @@
#include <linux/uaccess.h>
#include "swap.h"
+#include "internal.h"
static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
unsigned long end, struct mm_walk *walk)
@@ -28,7 +29,9 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
#ifdef CONFIG_HUGETLB_PAGE
unsigned char present;
unsigned char *vec = walk->private;
+ spinlock_t *ptl;
+ ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
/*
* Hugepages under user process are always in RAM and never
* swapped out, but theoretically it needs to be checked.
@@ -37,12 +40,54 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
for (; addr != end; vec++, addr += PAGE_SIZE)
*vec = present;
walk->private = vec;
+ spin_unlock(ptl);
#else
BUG();
#endif
return 0;
}
+static unsigned char mincore_swap(swp_entry_t entry, bool shmem)
+{
+ struct swap_info_struct *si;
+ struct folio *folio = NULL;
+ unsigned char present = 0;
+
+ if (!IS_ENABLED(CONFIG_SWAP)) {
+ WARN_ON(1);
+ return 0;
+ }
+
+ /*
+ * Shmem mapping may contain swapin error entries, which are
+ * absent. Page table may contain migration or hwpoison
+ * entries which are always uptodate.
+ */
+ if (non_swap_entry(entry))
+ return !shmem;
+
+ /*
+ * Shmem mapping lookup is lockless, so we need to grab the swap
+ * device. The mincore page table walk holds the PTL, which keeps the swap
+ * device stable, so avoid touching the si there for better performance.
+ */
+ if (shmem) {
+ si = get_swap_device(entry);
+ if (!si)
+ return 0;
+ }
+ folio = swap_cache_get_folio(entry);
+ if (shmem)
+ put_swap_device(si);
+ /* The swap cache space contains either a folio, a shadow entry or NULL */
+ if (folio && !xa_is_value(folio)) {
+ present = folio_test_uptodate(folio);
+ folio_put(folio);
+ }
+
+ return present;
+}
+
/*
* Later we can get more picky about what "in core" means precisely.
* For now, simply check to see if the page is in the page cache,
@@ -60,8 +105,15 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t index)
* any other file mapping (ie. marked !present and faulted in with
* tmpfs's .fault). So swapped out tmpfs mappings are tested here.
*/
- folio = filemap_get_incore_folio(mapping, index);
- if (!IS_ERR(folio)) {
+ folio = filemap_get_entry(mapping, index);
+ if (folio) {
+ if (xa_is_value(folio)) {
+ if (shmem_mapping(mapping))
+ return mincore_swap(radix_to_swp_entry(folio),
+ true);
+ else
+ return 0;
+ }
present = folio_test_uptodate(folio);
folio_put(folio);
}
@@ -105,6 +157,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *ptep;
unsigned char *vec = walk->private;
int nr = (end - addr) >> PAGE_SHIFT;
+ int step, i;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
@@ -118,35 +171,29 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
walk->action = ACTION_AGAIN;
return 0;
}
- for (; addr != end; ptep++, addr += PAGE_SIZE) {
+ for (; addr != end; ptep += step, addr += step * PAGE_SIZE) {
pte_t pte = ptep_get(ptep);
+ step = 1;
/* We need to do cache lookup too for pte markers */
if (pte_none_mostly(pte))
__mincore_unmapped_range(addr, addr + PAGE_SIZE,
vma, vec);
- else if (pte_present(pte))
- *vec = 1;
- else { /* pte is a swap entry */
- swp_entry_t entry = pte_to_swp_entry(pte);
-
- if (non_swap_entry(entry)) {
- /*
- * migration or hwpoison entries are always
- * uptodate
- */
- *vec = 1;
- } else {
-#ifdef CONFIG_SWAP
- *vec = mincore_page(swap_address_space(entry),
- swap_cache_index(entry));
-#else
- WARN_ON(1);
- *vec = 1;
-#endif
+ else if (pte_present(pte)) {
+ unsigned int batch = pte_batch_hint(ptep, pte);
+
+ if (batch > 1) {
+ unsigned int max_nr = (end - addr) >> PAGE_SHIFT;
+
+ step = min_t(unsigned int, batch, max_nr);
}
+
+ for (i = 0; i < step; i++)
+ vec[i] = 1;
+ } else { /* pte is a swap entry */
+ *vec = mincore_swap(pte_to_swp_entry(pte), false);
}
- vec++;
+ vec += step;
}
pte_unmap_unlock(ptep - 1, ptl);
out:
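Illustrative worked example (not part of the diff), assuming an architecture whose pte_batch_hint() reports contiguous mappings, e.g. arm64 contiguous PTEs with 4 KiB pages where one cont-block spans 16 PTEs.

/*
 * For a fully mapped 64 KiB run inside the requested range:
 *
 *	batch = pte_batch_hint(ptep, pte);	// 16 on such an arch
 *	step  = min(batch, pages_left);		// clamp to the caller's range
 *	vec[0..step-1] = 1;			// 16 residency bytes at once
 *	ptep += step; addr += step * PAGE_SIZE;
 *
 * Architectures without a hint return 1 from pte_batch_hint(), so the loop
 * falls back to the previous one-PTE-at-a-time behaviour.
 */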
diff --git a/mm/mlock.c b/mm/mlock.c
index 3cb72b579ffd..bb0776f5ef7c 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -255,7 +255,7 @@ void mlock_folio(struct folio *folio)
folio_get(folio);
if (!folio_batch_add(fbatch, mlock_lru(folio)) ||
- folio_test_large(folio) || lru_cache_disabled())
+ !folio_may_be_lru_cached(folio) || lru_cache_disabled())
mlock_folio_batch(fbatch);
local_unlock(&mlock_fbatch.lock);
}
@@ -278,7 +278,7 @@ void mlock_new_folio(struct folio *folio)
folio_get(folio);
if (!folio_batch_add(fbatch, mlock_new(folio)) ||
- folio_test_large(folio) || lru_cache_disabled())
+ !folio_may_be_lru_cached(folio) || lru_cache_disabled())
mlock_folio_batch(fbatch);
local_unlock(&mlock_fbatch.lock);
}
@@ -299,7 +299,7 @@ void munlock_folio(struct folio *folio)
*/
folio_get(folio);
if (!folio_batch_add(fbatch, folio) ||
- folio_test_large(folio) || lru_cache_disabled())
+ !folio_may_be_lru_cached(folio) || lru_cache_disabled())
mlock_folio_batch(fbatch);
local_unlock(&mlock_fbatch.lock);
}
@@ -307,15 +307,13 @@ void munlock_folio(struct folio *folio)
static inline unsigned int folio_mlock_step(struct folio *folio,
pte_t *pte, unsigned long addr, unsigned long end)
{
- const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
unsigned int count = (end - addr) >> PAGE_SHIFT;
pte_t ptent = ptep_get(pte);
if (!folio_test_large(folio))
return 1;
- return folio_pte_batch(folio, addr, pte, ptent, count, fpb_flags, NULL,
- NULL, NULL);
+ return folio_pte_batch(folio, pte, ptent, count);
}
static inline bool allow_mlock_munlock(struct folio *folio,
diff --git a/mm/mm_init.c b/mm/mm_init.c
index eedce9321e13..3db2dea7db4c 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -30,6 +30,7 @@
#include <linux/crash_dump.h>
#include <linux/execmem.h>
#include <linux/vmstat.h>
+#include <linux/kexec_handover.h>
#include <linux/hugetlb.h>
#include "internal.h"
#include "slab.h"
@@ -684,7 +685,8 @@ void __meminit __init_page_from_nid(unsigned long pfn, int nid)
__init_single_page(pfn_to_page(pfn), pfn, zid, nid);
if (pageblock_aligned(pfn))
- set_pageblock_migratetype(pfn_to_page(pfn), MIGRATE_MOVABLE);
+ init_pageblock_migratetype(pfn_to_page(pfn), MIGRATE_MOVABLE,
+ false);
}
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -743,7 +745,7 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
-static void __meminit init_deferred_page(unsigned long pfn, int nid)
+static void __meminit __init_deferred_page(unsigned long pfn, int nid)
{
if (early_page_initialised(pfn, nid))
return;
@@ -763,11 +765,16 @@ static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
-static inline void init_deferred_page(unsigned long pfn, int nid)
+static inline void __init_deferred_page(unsigned long pfn, int nid)
{
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+void __meminit init_deferred_page(unsigned long pfn, int nid)
+{
+ __init_deferred_page(pfn, nid);
+}
+
/*
* Initialised pages do not have PageReserved set. This function is
* called for each range allocated by the bootmem allocator and
@@ -777,22 +784,19 @@ static inline void init_deferred_page(unsigned long pfn, int nid)
void __meminit reserve_bootmem_region(phys_addr_t start,
phys_addr_t end, int nid)
{
- unsigned long start_pfn = PFN_DOWN(start);
- unsigned long end_pfn = PFN_UP(end);
+ unsigned long pfn;
- for (; start_pfn < end_pfn; start_pfn++) {
- if (pfn_valid(start_pfn)) {
- struct page *page = pfn_to_page(start_pfn);
+ for_each_valid_pfn(pfn, PFN_DOWN(start), PFN_UP(end)) {
+ struct page *page = pfn_to_page(pfn);
- init_deferred_page(start_pfn, nid);
+ __init_deferred_page(pfn, nid);
- /*
- * no need for atomic set_bit because the struct
- * page is not visible yet so nobody should
- * access it yet.
- */
- __SetPageReserved(page);
- }
+ /*
+ * no need for atomic set_bit because the struct
+ * page is not visible yet so nobody should
+ * access it yet.
+ */
+ __SetPageReserved(page);
}
}
@@ -828,7 +832,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
* - physical memory bank size is not necessarily the exact multiple of the
* arbitrary section size
* - early reserved memory may not be listed in memblock.memory
- * - non-memory regions covered by the contigious flatmem mapping
+ * - non-memory regions covered by the contiguous flatmem mapping
* - memory layouts defined with memmap= kernel parameter may not align
* nicely with memmap sections
*
@@ -848,11 +852,7 @@ static void __init init_unavailable_range(unsigned long spfn,
unsigned long pfn;
u64 pgcnt = 0;
- for (pfn = spfn; pfn < epfn; pfn++) {
- if (!pfn_valid(pageblock_start_pfn(pfn))) {
- pfn = pageblock_end_pfn(pfn) - 1;
- continue;
- }
+ for_each_valid_pfn(pfn, spfn, epfn) {
__init_single_page(pfn_to_page(pfn), pfn, zone, node);
__SetPageReserved(pfn_to_page(pfn));
pgcnt++;
@@ -875,7 +875,8 @@ static void __init init_unavailable_range(unsigned long spfn,
void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn, unsigned long zone_end_pfn,
enum meminit_context context,
- struct vmem_altmap *altmap, int migratetype)
+ struct vmem_altmap *altmap, int migratetype,
+ bool isolate_pageblock)
{
unsigned long pfn, end_pfn = start_pfn + size;
struct page *page;
@@ -932,7 +933,8 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
* over the place during system boot.
*/
if (pageblock_aligned(pfn)) {
- set_pageblock_migratetype(page, migratetype);
+ init_pageblock_migratetype(page, migratetype,
+ isolate_pageblock);
cond_resched();
}
pfn++;
@@ -955,7 +957,8 @@ static void __init memmap_init_zone_range(struct zone *zone,
return;
memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn,
- zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
+ zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE,
+ false);
if (*hole_pfn < start_pfn)
init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
@@ -1036,7 +1039,7 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
* because this is done early in section_activate()
*/
if (pageblock_aligned(pfn)) {
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ init_pageblock_migratetype(page, MIGRATE_MOVABLE, false);
cond_resched();
}
@@ -1088,6 +1091,12 @@ static void __ref memmap_init_compound(struct page *head,
unsigned long pfn, end_pfn = head_pfn + nr_pages;
unsigned int order = pgmap->vmemmap_shift;
+ /*
+ * We have to initialize the pages, including setting up page links.
+ * prep_compound_page() does not take care of that, so we open-code it
+ * here and take care of initializing the pages in the same pass.
+ */
__SetPageHead(head);
for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) {
struct page *page = pfn_to_page(pfn);
@@ -1095,15 +1104,8 @@ static void __ref memmap_init_compound(struct page *head,
__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
prep_compound_tail(head, pfn - head_pfn);
set_page_count(page, 0);
-
- /*
- * The first tail page stores important compound page info.
- * Call prep_compound_head() after the first tail page has
- * been initialized, to not have the data overwritten.
- */
- if (pfn == head_pfn + 1)
- prep_compound_head(head, order);
}
+ prep_compound_head(head, order);
}
void __ref memmap_init_zone_device(struct zone *zone,
@@ -1510,7 +1512,7 @@ static inline void setup_usemap(struct zone *zone) {}
/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
void __init set_pageblock_order(void)
{
- unsigned int order = MAX_PAGE_ORDER;
+ unsigned int order = PAGE_BLOCK_MAX_ORDER;
/* Check that pageblock_nr_pages has not already been setup */
if (pageblock_order)
@@ -1907,7 +1909,7 @@ void __init free_area_init(unsigned long *max_zone_pfn)
free_area_init_node(nid);
/*
- * No sysfs hierarcy will be created via register_one_node()
+ * No sysfs hierarchy will be created via register_one_node()
*for memory-less node because here it's not marked as N_MEMORY
*and won't be set online later. The benefit is userspace
*program won't be confused by sysfs files/directories of
@@ -1997,7 +1999,8 @@ static void __init deferred_free_pages(unsigned long pfn,
/* Free a large naturally-aligned chunk if possible */
if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) {
for (i = 0; i < nr_pages; i += pageblock_nr_pages)
- set_pageblock_migratetype(page + i, MIGRATE_MOVABLE);
+ init_pageblock_migratetype(page + i, MIGRATE_MOVABLE,
+ false);
__free_pages_core(page, MAX_PAGE_ORDER, MEMINIT_EARLY);
return;
}
@@ -2007,7 +2010,8 @@ static void __init deferred_free_pages(unsigned long pfn,
for (i = 0; i < nr_pages; i++, page++, pfn++) {
if (pageblock_aligned(pfn))
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ init_pageblock_migratetype(page, MIGRATE_MOVABLE,
+ false);
__free_pages_core(page, 0, MEMINIT_EARLY);
}
}
@@ -2041,112 +2045,63 @@ static unsigned long __init deferred_init_pages(struct zone *zone,
}
/*
- * This function is meant to pre-load the iterator for the zone init from
- * a given point.
- * Specifically it walks through the ranges starting with initial index
- * passed to it until we are caught up to the first_init_pfn value and
- * exits there. If we never encounter the value we return false indicating
- * there are no valid ranges left.
- */
-static bool __init
-deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
- unsigned long *spfn, unsigned long *epfn,
- unsigned long first_init_pfn)
-{
- u64 j = *i;
-
- if (j == 0)
- __next_mem_pfn_range_in_zone(&j, zone, spfn, epfn);
-
- /*
- * Start out by walking through the ranges in this zone that have
- * already been initialized. We don't need to do anything with them
- * so we just need to flush them out of the system.
- */
- for_each_free_mem_pfn_range_in_zone_from(j, zone, spfn, epfn) {
- if (*epfn <= first_init_pfn)
- continue;
- if (*spfn < first_init_pfn)
- *spfn = first_init_pfn;
- *i = j;
- return true;
- }
-
- return false;
-}
-
-/*
- * Initialize and free pages. We do it in two loops: first we initialize
- * struct page, then free to buddy allocator, because while we are
- * freeing pages we can access pages that are ahead (computing buddy
- * page in __free_one_page()).
+ * Initialize and free pages.
+ *
+ * At this point reserved pages and struct pages that correspond to holes in
* memblock.memory are already initialized so every free range has a valid
+ * memory map around it.
+ * This ensures that access of pages that are ahead of the range being
+ * initialized (computing buddy page in __free_one_page()) always reads a valid
+ * struct page.
*
- * In order to try and keep some memory in the cache we have the loop
- * broken along max page order boundaries. This way we will not cause
- * any issues with the buddy page computation.
+ * In order to try and improve CPU cache locality we have the loop broken along
+ * max page order boundaries.
*/
static unsigned long __init
-deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
- unsigned long *end_pfn)
+deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
+ struct zone *zone)
{
- unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
- unsigned long spfn = *start_pfn, epfn = *end_pfn;
+ int nid = zone_to_nid(zone);
unsigned long nr_pages = 0;
- u64 j = *i;
+ phys_addr_t start, end;
+ u64 i = 0;
- /* First we loop through and initialize the page values */
- for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
- unsigned long t;
+ for_each_free_mem_range(i, nid, 0, &start, &end, NULL) {
+ unsigned long spfn = PFN_UP(start);
+ unsigned long epfn = PFN_DOWN(end);
- if (mo_pfn <= *start_pfn)
+ if (spfn >= end_pfn)
break;
- t = min(mo_pfn, *end_pfn);
- nr_pages += deferred_init_pages(zone, *start_pfn, t);
+ spfn = max(spfn, start_pfn);
+ epfn = min(epfn, end_pfn);
- if (mo_pfn < *end_pfn) {
- *start_pfn = mo_pfn;
- break;
- }
- }
+ while (spfn < epfn) {
+ unsigned long mo_pfn = ALIGN(spfn + 1, MAX_ORDER_NR_PAGES);
+ unsigned long chunk_end = min(mo_pfn, epfn);
- /* Reset values and now loop through freeing pages as needed */
- swap(j, *i);
+ nr_pages += deferred_init_pages(zone, spfn, chunk_end);
+ deferred_free_pages(spfn, chunk_end - spfn);
- for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
- unsigned long t;
+ spfn = chunk_end;
- if (mo_pfn <= spfn)
- break;
-
- t = min(mo_pfn, epfn);
- deferred_free_pages(spfn, t - spfn);
-
- if (mo_pfn <= epfn)
- break;
+ if (irqs_disabled())
+ touch_nmi_watchdog();
+ else
+ cond_resched();
+ }
}
return nr_pages;
}
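Illustrative worked example (not part of the diff) for deferred_init_memmap_chunk(), assuming MAX_ORDER_NR_PAGES == 0x400 (1024 pages, i.e. 4 MiB with 4 KiB pages).

/*
 * Free memblock range [0x10000, 0x10900) clipped by a job span of
 * [0x10200, 0x20000):
 *
 *	spfn = max(0x10000, 0x10200) = 0x10200
 *	epfn = min(0x10900, 0x20000) = 0x10900
 *	pass 1: init + free [0x10200, 0x10400)   (next MAX_ORDER boundary)
 *	pass 2: init + free [0x10400, 0x10800)
 *	pass 3: init + free [0x10800, 0x10900)
 *
 * Working in MAX_ORDER-sized passes keeps the struct pages that were just
 * initialized hot in the cache when they are immediately freed to the
 * buddy allocator, as the comment above notes.
 */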
static void __init
-deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
- void *arg)
+deferred_init_memmap_job(unsigned long start_pfn, unsigned long end_pfn,
+ void *arg)
{
- unsigned long spfn, epfn;
struct zone *zone = arg;
- u64 i = 0;
-
- deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
- /*
- * Initialize and free pages in MAX_PAGE_ORDER sized increments so that
- * we can avoid introducing any issues with the buddy allocator.
- */
- while (spfn < end_pfn) {
- deferred_init_maxorder(&i, zone, &spfn, &epfn);
- cond_resched();
- }
+ deferred_init_memmap_chunk(start_pfn, end_pfn, zone);
}
static unsigned int __init
@@ -2160,12 +2115,10 @@ static int __init deferred_init_memmap(void *data)
{
pg_data_t *pgdat = data;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
- unsigned long spfn = 0, epfn = 0;
- unsigned long first_init_pfn, flags;
+ int max_threads = deferred_page_init_max_threads(cpumask);
+ unsigned long first_init_pfn, last_pfn, flags;
unsigned long start = jiffies;
struct zone *zone;
- int max_threads;
- u64 i = 0;
/* Bind memory initialisation thread to a local node if possible */
if (!cpumask_empty(cpumask))
@@ -2193,24 +2146,20 @@ static int __init deferred_init_memmap(void *data)
/* Only the highest zone is deferred */
zone = pgdat->node_zones + pgdat->nr_zones - 1;
+ last_pfn = SECTION_ALIGN_UP(zone_end_pfn(zone));
- max_threads = deferred_page_init_max_threads(cpumask);
-
- while (deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, first_init_pfn)) {
- first_init_pfn = ALIGN(epfn, PAGES_PER_SECTION);
- struct padata_mt_job job = {
- .thread_fn = deferred_init_memmap_chunk,
- .fn_arg = zone,
- .start = spfn,
- .size = first_init_pfn - spfn,
- .align = PAGES_PER_SECTION,
- .min_chunk = PAGES_PER_SECTION,
- .max_threads = max_threads,
- .numa_aware = false,
- };
+ struct padata_mt_job job = {
+ .thread_fn = deferred_init_memmap_job,
+ .fn_arg = zone,
+ .start = first_init_pfn,
+ .size = last_pfn - first_init_pfn,
+ .align = PAGES_PER_SECTION,
+ .min_chunk = PAGES_PER_SECTION,
+ .max_threads = max_threads,
+ .numa_aware = false,
+ };
- padata_do_multithreaded(&job);
- }
+ padata_do_multithreaded(&job);
/* Sanity check that the next zone really is unpopulated */
WARN_ON(pgdat->nr_zones < MAX_NR_ZONES && populated_zone(++zone));
@@ -2235,12 +2184,11 @@ static int __init deferred_init_memmap(void *data)
*/
bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
{
- unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
+ unsigned long nr_pages_needed = SECTION_ALIGN_UP(1 << order);
pg_data_t *pgdat = zone->zone_pgdat;
unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
unsigned long spfn, epfn, flags;
unsigned long nr_pages = 0;
- u64 i = 0;
/* Only the last zone may have deferred pages */
if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
@@ -2257,37 +2205,26 @@ bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
return true;
}
- /* If the zone is empty somebody else may have cleared out the zone */
- if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
- first_deferred_pfn)) {
- pgdat->first_deferred_pfn = ULONG_MAX;
- pgdat_resize_unlock(pgdat, &flags);
- /* Retry only once. */
- return first_deferred_pfn != ULONG_MAX;
+ /*
+ * Initialize at least nr_pages_needed in section chunks.
+ * If a section has less free memory than nr_pages_needed, the next
+ * section will also be initialized.
+ * Note that this still does not guarantee that an allocation of the
+ * requested order can be satisfied if the sections are fragmented by
+ * memblock allocations.
+ */
+ for (spfn = first_deferred_pfn, epfn = SECTION_ALIGN_UP(spfn + 1);
+ nr_pages < nr_pages_needed && spfn < zone_end_pfn(zone);
+ spfn = epfn, epfn += PAGES_PER_SECTION) {
+ nr_pages += deferred_init_memmap_chunk(spfn, epfn, zone);
}
/*
- * Initialize and free pages in MAX_PAGE_ORDER sized increments so
- * that we can avoid introducing any issues with the buddy
- * allocator.
+ * If there were no pages to initialize and free, the zone's memory
+ * map is completely initialized.
*/
- while (spfn < epfn) {
- /* update our first deferred PFN for this section */
- first_deferred_pfn = spfn;
+ pgdat->first_deferred_pfn = nr_pages ? spfn : ULONG_MAX;
- nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
- touch_nmi_watchdog();
-
- /* We should only stop along section boundaries */
- if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
- continue;
-
- /* If our quota has been met we can stop here */
- if (nr_pages >= nr_pages_needed)
- break;
- }
-
- pgdat->first_deferred_pfn = spfn;
pgdat_resize_unlock(pgdat, &flags);
return nr_pages > 0;
@@ -2306,7 +2243,7 @@ void __init init_cma_reserved_pageblock(struct page *page)
set_page_count(p, 0);
} while (++p, --i);
- set_pageblock_migratetype(page, MIGRATE_CMA);
+ init_pageblock_migratetype(page, MIGRATE_CMA, false);
set_page_refcounted(page);
/* pages were reserved and not allocated */
clear_page_tag_ref(page);
@@ -2320,7 +2257,7 @@ void __init init_cma_reserved_pageblock(struct page *page)
*/
void __init init_cma_pageblock(struct page *page)
{
- set_pageblock_migratetype(page, MIGRATE_CMA);
+ init_pageblock_migratetype(page, MIGRATE_CMA, false);
adjust_managed_page_count(page, pageblock_nr_pages);
page_zone(page)->cma_pages += pageblock_nr_pages;
}
@@ -2667,12 +2604,6 @@ static void __init report_meminit(void)
stack = "all(pattern)";
else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO))
stack = "all(zero)";
- else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL))
- stack = "byref_all(zero)";
- else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF))
- stack = "byref(zero)";
- else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER))
- stack = "__user(zero)";
else
stack = "off";
@@ -2765,6 +2696,13 @@ void __init mm_core_init(void)
report_meminit();
kmsan_init_shadow();
stack_depot_early_init();
+
+ /*
+ * KHO memory setup must happen while memblock is still active, but
+ * as close as possible to buddy initialization
+ */
+ kho_memory_init();
+
memblock_free_all();
mem_init();
kmem_cache_init();
diff --git a/mm/mmap.c b/mm/mmap.c
index bd210aaf7ebd..5fd3b80fda1d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -80,7 +80,7 @@ core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
/* Update vma->vm_page_prot to reflect vma->vm_flags. */
void vma_set_page_prot(struct vm_area_struct *vma)
{
- unsigned long vm_flags = vma->vm_flags;
+ vm_flags_t vm_flags = vma->vm_flags;
pgprot_t vm_page_prot;
vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
@@ -127,18 +127,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
origbrk = mm->brk;
+ min_brk = mm->start_brk;
#ifdef CONFIG_COMPAT_BRK
/*
* CONFIG_COMPAT_BRK can still be overridden by setting
* randomize_va_space to 2, which will still cause mm->start_brk
* to be arbitrarily shifted
*/
- if (current->brk_randomized)
- min_brk = mm->start_brk;
- else
+ if (!current->brk_randomized)
min_brk = mm->end_data;
-#else
- min_brk = mm->start_brk;
#endif
if (brk < min_brk)
goto out;
@@ -228,12 +225,12 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
return hint;
}
-bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
+bool mlock_future_ok(const struct mm_struct *mm, vm_flags_t vm_flags,
unsigned long bytes)
{
unsigned long locked_pages, limit_pages;
- if (!(flags & VM_LOCKED) || capable(CAP_IPC_LOCK))
+ if (!(vm_flags & VM_LOCKED) || capable(CAP_IPC_LOCK))
return true;
locked_pages = bytes >> PAGE_SHIFT;
@@ -475,7 +472,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
vm_flags &= ~VM_MAYEXEC;
}
- if (!file->f_op->mmap)
+ if (!can_mmap_file(file))
return -ENODEV;
if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
return -EINVAL;
@@ -805,7 +802,7 @@ unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *fi
unsigned long pgoff, unsigned long flags,
vm_flags_t vm_flags)
{
- if (test_bit(MMF_TOPDOWN, &mm->flags))
+ if (mm_flags_test(MMF_TOPDOWN, mm))
return arch_get_unmapped_area_topdown(filp, addr, len, pgoff,
flags, vm_flags);
return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags);
@@ -871,9 +868,8 @@ mm_get_unmapped_area(struct mm_struct *mm, struct file *file,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
- if (test_bit(MMF_TOPDOWN, &mm->flags))
- return arch_get_unmapped_area_topdown(file, addr, len, pgoff, flags, 0);
- return arch_get_unmapped_area(file, addr, len, pgoff, flags, 0);
+ return mm_get_unmapped_area_vmflags(mm, file, addr, len,
+ pgoff, flags, 0);
}
EXPORT_SYMBOL(mm_get_unmapped_area);
@@ -1207,7 +1203,7 @@ out:
return ret;
}
-int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
+int vm_brk_flags(unsigned long addr, unsigned long request, vm_flags_t vm_flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
@@ -1224,7 +1220,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
return 0;
/* Until we need other flags, refuse anything except VM_EXEC. */
- if ((flags & (~VM_EXEC)) != 0)
+ if ((vm_flags & (~VM_EXEC)) != 0)
return -EINVAL;
if (mmap_write_lock_killable(mm))
@@ -1239,7 +1235,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
goto munmap_failed;
vma = vma_prev(&vmi);
- ret = do_brk_flags(&vmi, vma, addr, len, flags);
+ ret = do_brk_flags(&vmi, vma, addr, len, vm_flags);
populate = ((mm->def_flags & VM_LOCKED) != 0);
mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
@@ -1288,7 +1284,7 @@ void exit_mmap(struct mm_struct *mm)
* Set MMF_OOM_SKIP to hide this task from the oom killer/reaper
* because the memory has been already freed.
*/
- set_bit(MMF_OOM_SKIP, &mm->flags);
+ mm_flags_set(MMF_OOM_SKIP, mm);
mmap_write_lock(mm);
mt_clear_in_rcu(&mm->mm_mt);
vma_iter_set(&vmi, vma->vm_end);
@@ -1321,48 +1317,6 @@ destroy:
vm_unacct_memory(nr_accounted);
}
-/* Insert vm structure into process list sorted by address
- * and into the inode's i_mmap tree. If vm_file is non-NULL
- * then i_mmap_rwsem is taken here.
- */
-int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
-{
- unsigned long charged = vma_pages(vma);
-
-
- if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
- return -ENOMEM;
-
- if ((vma->vm_flags & VM_ACCOUNT) &&
- security_vm_enough_memory_mm(mm, charged))
- return -ENOMEM;
-
- /*
- * The vm_pgoff of a purely anonymous vma should be irrelevant
- * until its first write fault, when page's anon_vma and index
- * are set. But now set the vm_pgoff it will almost certainly
- * end up with (unless mremap moves it elsewhere before that
- * first wfault), so /proc/pid/maps tells a consistent story.
- *
- * By setting it to reflect the virtual start address of the
- * vma, merges and splits can happen in a seamless way, just
- * using the existing file pgoff checks and manipulations.
- * Similarly in do_mmap and in do_brk_flags.
- */
- if (vma_is_anonymous(vma)) {
- BUG_ON(vma->anon_vma);
- vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
- }
-
- if (vma_link(mm, vma)) {
- if (vma->vm_flags & VM_ACCOUNT)
- vm_unacct_memory(charged);
- return -ENOMEM;
- }
-
- return 0;
-}
-
/*
* Return true if the calling process may expand its vm space by the passed
* number of pages
@@ -1486,7 +1440,7 @@ static vm_fault_t special_mapping_fault(struct vm_fault *vmf)
static struct vm_area_struct *__install_special_mapping(
struct mm_struct *mm,
unsigned long addr, unsigned long len,
- unsigned long vm_flags, void *priv,
+ vm_flags_t vm_flags, void *priv,
const struct vm_operations_struct *ops)
{
int ret;
@@ -1538,7 +1492,7 @@ bool vma_is_special_mapping(const struct vm_area_struct *vma,
struct vm_area_struct *_install_special_mapping(
struct mm_struct *mm,
unsigned long addr, unsigned long len,
- unsigned long vm_flags, const struct vm_special_mapping *spec)
+ vm_flags_t vm_flags, const struct vm_special_mapping *spec)
{
return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec,
&special_mapping_vmops);
@@ -1596,7 +1550,7 @@ static const struct ctl_table mmap_table[] = {
#endif /* CONFIG_SYSCTL */
/*
- * initialise the percpu counter for VM
+ * initialise the percpu counter for VM, initialise VMA state.
*/
void __init mmap_init(void)
{
@@ -1607,6 +1561,7 @@ void __init mmap_init(void)
#ifdef CONFIG_SYSCTL
register_sysctl_init("vm", mmap_table);
#endif
+ vma_state_init();
}
/*
@@ -1718,90 +1673,6 @@ static int __meminit init_reserve_notifier(void)
subsys_initcall(init_reserve_notifier);
/*
- * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between
- * this VMA and its relocated range, which will now reside at [vma->vm_start -
- * shift, vma->vm_end - shift).
- *
- * This function is almost certainly NOT what you want for anything other than
- * early executable temporary stack relocation.
- */
-int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
-{
- /*
- * The process proceeds as follows:
- *
- * 1) Use shift to calculate the new vma endpoints.
- * 2) Extend vma to cover both the old and new ranges. This ensures the
- * arguments passed to subsequent functions are consistent.
- * 3) Move vma's page tables to the new range.
- * 4) Free up any cleared pgd range.
- * 5) Shrink the vma to cover only the new range.
- */
-
- struct mm_struct *mm = vma->vm_mm;
- unsigned long old_start = vma->vm_start;
- unsigned long old_end = vma->vm_end;
- unsigned long length = old_end - old_start;
- unsigned long new_start = old_start - shift;
- unsigned long new_end = old_end - shift;
- VMA_ITERATOR(vmi, mm, new_start);
- VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff);
- struct vm_area_struct *next;
- struct mmu_gather tlb;
- PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length);
-
- BUG_ON(new_start > new_end);
-
- /*
- * ensure there are no vmas between where we want to go
- * and where we are
- */
- if (vma != vma_next(&vmi))
- return -EFAULT;
-
- vma_iter_prev_range(&vmi);
- /*
- * cover the whole range: [new_start, old_end)
- */
- vmg.middle = vma;
- if (vma_expand(&vmg))
- return -ENOMEM;
-
- /*
- * move the page tables downwards, on failure we rely on
- * process cleanup to remove whatever mess we made.
- */
- pmc.for_stack = true;
- if (length != move_page_tables(&pmc))
- return -ENOMEM;
-
- tlb_gather_mmu(&tlb, mm);
- next = vma_next(&vmi);
- if (new_end > old_start) {
- /*
- * when the old and new regions overlap clear from new_end.
- */
- free_pgd_range(&tlb, new_end, old_end, new_end,
- next ? next->vm_start : USER_PGTABLES_CEILING);
- } else {
- /*
- * otherwise, clean from old_start; this is done to not touch
- * the address space in [new_end, old_start) some architectures
- * have constraints on va-space that make this illegal (IA64) -
- * for the others its just a little faster.
- */
- free_pgd_range(&tlb, old_start, old_end, new_end,
- next ? next->vm_start : USER_PGTABLES_CEILING);
- }
- tlb_finish_mmu(&tlb);
-
- vma_prev(&vmi);
- /* Shrink the vma to just the new range */
- return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
-}
-
-#ifdef CONFIG_MMU
-/*
* Obtain a read lock on mm->mmap_lock, if the specified address is below the
* start of the VMA, the intent is to perform a write, and it is a
* downward-growing stack, then attempt to expand the stack to contain it.
@@ -1844,10 +1715,175 @@ bool mmap_read_lock_maybe_expand(struct mm_struct *mm,
mmap_write_downgrade(mm);
return true;
}
-#else
-bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, bool write)
+
+__latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
- return false;
+ struct vm_area_struct *mpnt, *tmp;
+ int retval;
+ unsigned long charge = 0;
+ LIST_HEAD(uf);
+ VMA_ITERATOR(vmi, mm, 0);
+
+ if (mmap_write_lock_killable(oldmm))
+ return -EINTR;
+ flush_cache_dup_mm(oldmm);
+ uprobe_dup_mmap(oldmm, mm);
+ /*
+ * Not linked in yet - no deadlock potential:
+ */
+ mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
+
+ /* No ordering required: file already has been exposed. */
+ dup_mm_exe_file(mm, oldmm);
+
+ mm->total_vm = oldmm->total_vm;
+ mm->data_vm = oldmm->data_vm;
+ mm->exec_vm = oldmm->exec_vm;
+ mm->stack_vm = oldmm->stack_vm;
+
+ /* Use __mt_dup() to efficiently build an identical maple tree. */
+ retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
+ if (unlikely(retval))
+ goto out;
+
+ mt_clear_in_rcu(vmi.mas.tree);
+ for_each_vma(vmi, mpnt) {
+ struct file *file;
+
+ vma_start_write(mpnt);
+ if (mpnt->vm_flags & VM_DONTCOPY) {
+ retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
+ mpnt->vm_end, GFP_KERNEL);
+ if (retval)
+ goto loop_out;
+
+ vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
+ continue;
+ }
+ charge = 0;
+ /*
+ * Don't duplicate many vmas if we've been oom-killed (for
+ * example)
+ */
+ if (fatal_signal_pending(current)) {
+ retval = -EINTR;
+ goto loop_out;
+ }
+ if (mpnt->vm_flags & VM_ACCOUNT) {
+ unsigned long len = vma_pages(mpnt);
+
+ if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
+ goto fail_nomem;
+ charge = len;
+ }
+
+ tmp = vm_area_dup(mpnt);
+ if (!tmp)
+ goto fail_nomem;
+ retval = vma_dup_policy(mpnt, tmp);
+ if (retval)
+ goto fail_nomem_policy;
+ tmp->vm_mm = mm;
+ retval = dup_userfaultfd(tmp, &uf);
+ if (retval)
+ goto fail_nomem_anon_vma_fork;
+ if (tmp->vm_flags & VM_WIPEONFORK) {
+ /*
+ * VM_WIPEONFORK gets a clean slate in the child.
+ * Don't prepare anon_vma until fault since we don't
+ * copy page for current vma.
+ */
+ tmp->anon_vma = NULL;
+ } else if (anon_vma_fork(tmp, mpnt))
+ goto fail_nomem_anon_vma_fork;
+ vm_flags_clear(tmp, VM_LOCKED_MASK);
+ /*
+ * Copy/update hugetlb private vma information.
+ */
+ if (is_vm_hugetlb_page(tmp))
+ hugetlb_dup_vma_private(tmp);
+
+ /*
+ * Link the vma into the MT. After using __mt_dup(), memory
+ * allocation is not necessary here, so it cannot fail.
+ */
+ vma_iter_bulk_store(&vmi, tmp);
+
+ mm->map_count++;
+
+ if (tmp->vm_ops && tmp->vm_ops->open)
+ tmp->vm_ops->open(tmp);
+
+ file = tmp->vm_file;
+ if (file) {
+ struct address_space *mapping = file->f_mapping;
+
+ get_file(file);
+ i_mmap_lock_write(mapping);
+ if (vma_is_shared_maywrite(tmp))
+ mapping_allow_writable(mapping);
+ flush_dcache_mmap_lock(mapping);
+ /* insert tmp into the share list, just after mpnt */
+ vma_interval_tree_insert_after(tmp, mpnt,
+ &mapping->i_mmap);
+ flush_dcache_mmap_unlock(mapping);
+ i_mmap_unlock_write(mapping);
+ }
+
+ if (!(tmp->vm_flags & VM_WIPEONFORK))
+ retval = copy_page_range(tmp, mpnt);
+
+ if (retval) {
+ mpnt = vma_next(&vmi);
+ goto loop_out;
+ }
+ }
+ /* a new mm has just been created */
+ retval = arch_dup_mmap(oldmm, mm);
+loop_out:
+ vma_iter_free(&vmi);
+ if (!retval) {
+ mt_set_in_rcu(vmi.mas.tree);
+ ksm_fork(mm, oldmm);
+ khugepaged_fork(mm, oldmm);
+ } else {
+
+ /*
+ * The entire maple tree has already been duplicated. If the
+ * mmap duplication fails, mark the failure point with
+ * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
+ * stop releasing VMAs that have not been duplicated after this
+ * point.
+ */
+ if (mpnt) {
+ mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
+ mas_store(&vmi.mas, XA_ZERO_ENTRY);
+ /* Avoid OOM iterating a broken tree */
+ mm_flags_set(MMF_OOM_SKIP, mm);
+ }
+ /*
+		 * The mm_struct is going to exit, but the locks will be dropped
+		 * first. Marking the mm_struct as unstable is advisable, as it
+		 * is not fully initialised.
+ */
+ mm_flags_set(MMF_UNSTABLE, mm);
+ }
+out:
+ mmap_write_unlock(mm);
+ flush_tlb_mm(oldmm);
+ mmap_write_unlock(oldmm);
+ if (!retval)
+ dup_userfaultfd_complete(&uf);
+ else
+ dup_userfaultfd_fail(&uf);
+ return retval;
+
+fail_nomem_anon_vma_fork:
+ mpol_put(vma_policy(tmp));
+fail_nomem_policy:
+ vm_area_free(tmp);
+fail_nomem:
+ retval = -ENOMEM;
+ vm_unacct_memory(charge);
+ goto loop_out;
}
-#endif
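
A note on the error path in the dup_mmap() hunk above: because __mt_dup() copies the
whole maple tree up front, a mid-loop failure leaves tree entries whose VMAs were never
actually duplicated, which is why the failure point is marked with XA_ZERO_ENTRY and
exit_mmap() stops releasing VMAs there. The following minimal userspace sketch models
only that bookkeeping idea (duplicate in order, record where duplication stopped, tear
down only what was actually created); toy_vma/dup_one are illustrative names, not
kernel APIs.

#include <stdio.h>
#include <stdlib.h>

struct toy_vma { int id; };

#define NR_VMAS 8

/* Duplicate one entry; simulate -ENOMEM at a chosen index. */
static struct toy_vma *dup_one(const struct toy_vma *src, int fail_at, int idx)
{
	struct toy_vma *v;

	if (idx == fail_at)
		return NULL;
	v = malloc(sizeof(*v));
	if (v)
		v->id = src->id;
	return v;
}

int main(void)
{
	struct toy_vma old[NR_VMAS], *child[NR_VMAS] = { NULL };
	int failed_idx = -1;
	int limit, i;

	for (i = 0; i < NR_VMAS; i++)
		old[i].id = i;

	for (i = 0; i < NR_VMAS; i++) {
		child[i] = dup_one(&old[i], /* fail_at = */ 5, i);
		if (!child[i]) {
			failed_idx = i;	/* analogue of storing XA_ZERO_ENTRY */
			break;
		}
	}

	/*
	 * Teardown stops at the recorded failure point, mirroring how
	 * exit_mmap() stops when it meets the marker entry.
	 */
	limit = (failed_idx < 0) ? NR_VMAS : failed_idx;
	for (i = 0; i < limit; i++)
		free(child[i]);

	printf("duplicated %d of %d entries\n", limit, NR_VMAS);
	return failed_idx < 0 ? 0 : 1;
}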
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index e7dbaf96aa17..0a0db5849b8e 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -42,3 +42,456 @@ void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
#endif /* CONFIG_TRACING */
+
+#ifdef CONFIG_MMU
+#ifdef CONFIG_PER_VMA_LOCK
+static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
+{
+ unsigned int tgt_refcnt = VMA_LOCK_OFFSET;
+
+ /* Additional refcnt if the vma is attached. */
+ if (!detaching)
+ tgt_refcnt++;
+
+ /*
+ * If vma is detached then only vma_mark_attached() can raise the
+ * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
+ */
+ if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
+ return false;
+
+ rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
+ rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
+ refcount_read(&vma->vm_refcnt) == tgt_refcnt,
+ TASK_UNINTERRUPTIBLE);
+ lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
+
+ return true;
+}
+
+static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
+{
+ *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
+ rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
+}
+
+void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
+{
+ bool locked;
+
+ /*
+ * __vma_enter_locked() returns false immediately if the vma is not
+ * attached, otherwise it waits until refcnt is indicating that vma
+ * is attached with no readers.
+ */
+ locked = __vma_enter_locked(vma, false);
+
+ /*
+ * We should use WRITE_ONCE() here because we can have concurrent reads
+ * from the early lockless pessimistic check in vma_start_read().
+ * We don't really care about the correctness of that early check, but
+ * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
+ */
+ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
+
+ if (locked) {
+ bool detached;
+
+ __vma_exit_locked(vma, &detached);
+ WARN_ON_ONCE(detached); /* vma should remain attached */
+ }
+}
+EXPORT_SYMBOL_GPL(__vma_start_write);
+
+void vma_mark_detached(struct vm_area_struct *vma)
+{
+ vma_assert_write_locked(vma);
+ vma_assert_attached(vma);
+
+ /*
+ * We are the only writer, so no need to use vma_refcount_put().
+ * The condition below is unlikely because the vma has been already
+ * write-locked and readers can increment vm_refcnt only temporarily
+ * before they check vm_lock_seq, realize the vma is locked and drop
+ * back the vm_refcnt. That is a narrow window for observing a raised
+ * vm_refcnt.
+ */
+ if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
+ /* Wait until vma is detached with no readers. */
+ if (__vma_enter_locked(vma, true)) {
+ bool detached;
+
+ __vma_exit_locked(vma, &detached);
+ WARN_ON_ONCE(!detached);
+ }
+ }
+}
+
+/*
+ * Try to read-lock a vma. The function is allowed to occasionally yield false
+ * locked result to avoid performance overhead, in which case we fall back to
+ * using mmap_lock. The function should never yield false unlocked result.
+ * False locked result is possible if mm_lock_seq overflows or if vma gets
+ * reused and attached to a different mm before we lock it.
+ * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got
+ * detached.
+ *
+ * IMPORTANT: RCU lock must be held upon entering the function, but upon error
+ * IT IS RELEASED. The caller must handle this correctly.
+ */
+static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
+ struct vm_area_struct *vma)
+{
+ struct mm_struct *other_mm;
+ int oldcnt;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held");
+ /*
+ * Check before locking. A race might cause false locked result.
+ * We can use READ_ONCE() for the mm_lock_seq here, and don't need
+ * ACQUIRE semantics, because this is just a lockless check whose result
+ * we don't rely on for anything - the mm_lock_seq read against which we
+ * need ordering is below.
+ */
+ if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) {
+ vma = NULL;
+ goto err;
+ }
+
+ /*
+ * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
+ * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
+ * Acquire fence is required here to avoid reordering against later
+ * vm_lock_seq check and checks inside lock_vma_under_rcu().
+ */
+ if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
+ VMA_REF_LIMIT))) {
+ /* return EAGAIN if vma got detached from under us */
+ vma = oldcnt ? NULL : ERR_PTR(-EAGAIN);
+ goto err;
+ }
+
+ rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
+
+ if (unlikely(vma->vm_mm != mm))
+ goto err_unstable;
+
+ /*
+ * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
+ * False unlocked result is impossible because we modify and check
+ * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
+ * modification invalidates all existing locks.
+ *
+ * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
+ * racing with vma_end_write_all(), we only start reading from the VMA
+ * after it has been unlocked.
+ * This pairs with RELEASE semantics in vma_end_write_all().
+ */
+ if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
+ vma_refcount_put(vma);
+ vma = NULL;
+ goto err;
+ }
+
+ return vma;
+err:
+ rcu_read_unlock();
+
+ return vma;
+err_unstable:
+ /*
+ * If vma got attached to another mm from under us, that mm is not
+ * stable and can be freed in the narrow window after vma->vm_refcnt
+ * is dropped and before rcuwait_wake_up(mm) is called. Grab it before
+ * releasing vma->vm_refcnt.
+ */
+ other_mm = vma->vm_mm; /* use a copy as vma can be freed after we drop vm_refcnt */
+
+ /* __mmdrop() is a heavy operation, do it after dropping RCU lock. */
+ rcu_read_unlock();
+ mmgrab(other_mm);
+ vma_refcount_put(vma);
+ mmdrop(other_mm);
+
+ return NULL;
+}
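
For readers less familiar with the vm_refcnt scheme used by __vma_enter_locked() and
vma_start_read() above, here is a minimal single-threaded userspace model of the
counting trick (C11 atomics): readers take a reference only while the count stays below
a limit, and a writer adds a large offset so that new readers fail, then waits for
existing readers to drain. The constants and the spin-wait stand in for
VMA_LOCK_OFFSET/VMA_REF_LIMIT and rcuwait; this is an illustrative sketch, not the
kernel's values or APIs.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define TOY_LOCK_OFFSET	0x40000000
#define TOY_REF_LIMIT	0x3fffffff

static atomic_int refcnt = 1;	/* 1 == attached, no readers */

static bool reader_trylock(void)
{
	int old = atomic_load(&refcnt);

	do {
		/* Detached (0) or writer present (>= limit): fail. */
		if (old == 0 || old >= TOY_REF_LIMIT)
			return false;
	} while (!atomic_compare_exchange_weak(&refcnt, &old, old + 1));
	return true;
}

static void reader_unlock(void)
{
	atomic_fetch_sub(&refcnt, 1);
}

static void writer_lock(void)
{
	atomic_fetch_add(&refcnt, TOY_LOCK_OFFSET);
	/* Spin instead of rcuwait: wait for "attached + offset" only. */
	while (atomic_load(&refcnt) != TOY_LOCK_OFFSET + 1)
		;
}

static void writer_unlock(void)
{
	atomic_fetch_sub(&refcnt, TOY_LOCK_OFFSET);
}

int main(void)
{
	printf("read lock before writer: %d\n", reader_trylock());	/* 1 */
	reader_unlock();
	writer_lock();
	printf("read lock under writer:  %d\n", reader_trylock());	/* 0 */
	writer_unlock();
	return 0;
}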
+
+/*
+ * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
+ * stable and not isolated. If the VMA is not found or is being modified the
+ * function returns NULL.
+ */
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+ unsigned long address)
+{
+ MA_STATE(mas, &mm->mm_mt, address, address);
+ struct vm_area_struct *vma;
+
+retry:
+ rcu_read_lock();
+ vma = mas_walk(&mas);
+ if (!vma) {
+ rcu_read_unlock();
+ goto inval;
+ }
+
+ vma = vma_start_read(mm, vma);
+ if (IS_ERR_OR_NULL(vma)) {
+ /* Check if the VMA got isolated after we found it */
+ if (PTR_ERR(vma) == -EAGAIN) {
+ count_vm_vma_lock_event(VMA_LOCK_MISS);
+ /* The area was replaced with another one */
+ goto retry;
+ }
+
+ /* Failed to lock the VMA */
+ goto inval;
+ }
+ /*
+ * At this point, we have a stable reference to a VMA: The VMA is
+ * locked and we know it hasn't already been isolated.
+ * From here on, we can access the VMA without worrying about which
+ * fields are accessible for RCU readers.
+ */
+ rcu_read_unlock();
+
+ /* Check if the vma we locked is the right one. */
+ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
+ vma_end_read(vma);
+ goto inval;
+ }
+
+ return vma;
+
+inval:
+ count_vm_vma_lock_event(VMA_LOCK_ABORT);
+ return NULL;
+}
+
+static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm,
+ struct vma_iterator *vmi,
+ unsigned long from_addr)
+{
+ struct vm_area_struct *vma;
+ int ret;
+
+ ret = mmap_read_lock_killable(mm);
+ if (ret)
+ return ERR_PTR(ret);
+
+ /* Lookup the vma at the last position again under mmap_read_lock */
+ vma_iter_set(vmi, from_addr);
+ vma = vma_next(vmi);
+ if (vma) {
+ /* Very unlikely vma->vm_refcnt overflow case */
+ if (unlikely(!vma_start_read_locked(vma)))
+ vma = ERR_PTR(-EAGAIN);
+ }
+
+ mmap_read_unlock(mm);
+
+ return vma;
+}
+
+struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
+ struct vma_iterator *vmi,
+ unsigned long from_addr)
+{
+ struct vm_area_struct *vma;
+ unsigned int mm_wr_seq;
+ bool mmap_unlocked;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
+retry:
+ /* Start mmap_lock speculation in case we need to verify the vma later */
+ mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
+ vma = vma_next(vmi);
+ if (!vma)
+ return NULL;
+
+ vma = vma_start_read(mm, vma);
+ if (IS_ERR_OR_NULL(vma)) {
+ /*
+ * Retry immediately if the vma gets detached from under us.
+ * Infinite loop should not happen because the vma we find will
+ * have to be constantly knocked out from under us.
+ */
+ if (PTR_ERR(vma) == -EAGAIN) {
+ /* reset to search from the last address */
+ rcu_read_lock();
+ vma_iter_set(vmi, from_addr);
+ goto retry;
+ }
+
+ goto fallback;
+ }
+
+ /* Verify the vma is not behind the last search position. */
+ if (unlikely(from_addr >= vma->vm_end))
+ goto fallback_unlock;
+
+ /*
+ * vma can be ahead of the last search position but we need to verify
+ * it was not shrunk after we found it and another vma has not been
+ * installed ahead of it. Otherwise we might observe a gap that should
+ * not be there.
+ */
+ if (from_addr < vma->vm_start) {
+ /* Verify only if the address space might have changed since vma lookup. */
+ if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
+ vma_iter_set(vmi, from_addr);
+ if (vma != vma_next(vmi))
+ goto fallback_unlock;
+ }
+ }
+
+ return vma;
+
+fallback_unlock:
+ rcu_read_unlock();
+ vma_end_read(vma);
+fallback:
+ vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
+ rcu_read_lock();
+ /* Reinitialize the iterator after re-entering rcu read section */
+ vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end);
+
+ return vma;
+}
+#endif /* CONFIG_PER_VMA_LOCK */
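
The speculation in lock_next_vma() above relies on mmap_lock_speculate_try_begin() and
mmap_lock_speculate_retry(), which are sequence-count based. As a rough userspace model
of that idea (a sketch, not the kernel implementation): writers bump a counter around
modifications, and a speculative reader samples the counter up front and re-checks it
later to learn whether the address space may have changed in between.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_uint mm_seq;	/* even: idle, odd: writer in progress */

static void writer_begin(void) { atomic_fetch_add(&mm_seq, 1); }
static void writer_end(void)   { atomic_fetch_add(&mm_seq, 1); }

/* Analogue of mmap_lock_speculate_try_begin(): succeed only if no writer is active. */
static bool speculate_try_begin(unsigned int *seq)
{
	*seq = atomic_load(&mm_seq);
	return !(*seq & 1);
}

/* Analogue of mmap_lock_speculate_retry(): true means a writer ran since the sample. */
static bool speculate_retry(unsigned int seq)
{
	return atomic_load(&mm_seq) != seq;
}

int main(void)
{
	unsigned int seq;

	if (speculate_try_begin(&seq))
		printf("retry needed before any write: %d\n", speculate_retry(seq));	/* 0 */

	writer_begin();
	writer_end();

	/* The earlier sample is now stale, so any speculative result must be re-validated. */
	printf("retry needed after a write:    %d\n", speculate_retry(seq));	/* 1 */
	return 0;
}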
+
+#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
+#include <linux/extable.h>
+
+static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
+{
+ if (likely(mmap_read_trylock(mm)))
+ return true;
+
+ if (regs && !user_mode(regs)) {
+ unsigned long ip = exception_ip(regs);
+ if (!search_exception_tables(ip))
+ return false;
+ }
+
+ return !mmap_read_lock_killable(mm);
+}
+
+static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
+{
+ /*
+ * We don't have this operation yet.
+ *
+ * It should be easy enough to do: it's basically a
+ * atomic_long_try_cmpxchg_acquire()
+ * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
+ * it also needs the proper lockdep magic etc.
+ */
+ return false;
+}
+
+static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
+{
+ mmap_read_unlock(mm);
+ if (regs && !user_mode(regs)) {
+ unsigned long ip = exception_ip(regs);
+ if (!search_exception_tables(ip))
+ return false;
+ }
+ return !mmap_write_lock_killable(mm);
+}
+
+/*
+ * Helper for page fault handling.
+ *
+ * This is kind of equivalent to "mmap_read_lock()" followed
+ * by "find_extend_vma()", except it's a lot more careful about
+ * the locking (and will drop the lock on failure).
+ *
+ * For example, if we have a kernel bug that causes a page
+ * fault, we don't want to just use mmap_read_lock() to get
+ * the mm lock, because that would deadlock if the bug were
+ * to happen while we're holding the mm lock for writing.
+ *
+ * So this checks the exception tables on kernel faults in
+ * order to only do this all for instructions that are actually
+ * expected to fault.
+ *
+ * We can also actually take the mm lock for writing if we
+ * need to extend the vma, which helps the VM layer a lot.
+ */
+struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
+ unsigned long addr, struct pt_regs *regs)
+{
+ struct vm_area_struct *vma;
+
+ if (!get_mmap_lock_carefully(mm, regs))
+ return NULL;
+
+ vma = find_vma(mm, addr);
+ if (likely(vma && (vma->vm_start <= addr)))
+ return vma;
+
+ /*
+ * Well, dang. We might still be successful, but only
+ * if we can extend a vma to do so.
+ */
+ if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
+ mmap_read_unlock(mm);
+ return NULL;
+ }
+
+ /*
+ * We can try to upgrade the mmap lock atomically,
+ * in which case we can continue to use the vma
+ * we already looked up.
+ *
+ * Otherwise we'll have to drop the mmap lock and
+ * re-take it, and also look up the vma again,
+ * re-checking it.
+ */
+ if (!mmap_upgrade_trylock(mm)) {
+ if (!upgrade_mmap_lock_carefully(mm, regs))
+ return NULL;
+
+ vma = find_vma(mm, addr);
+ if (!vma)
+ goto fail;
+ if (vma->vm_start <= addr)
+ goto success;
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ goto fail;
+ }
+
+ if (expand_stack_locked(vma, addr))
+ goto fail;
+
+success:
+ mmap_write_downgrade(mm);
+ return vma;
+
+fail:
+ mmap_write_unlock(mm);
+ return NULL;
+}
+#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */
+
+#else /* CONFIG_MMU */
+
+/*
+ * At least xtensa ends up having protection faults even with no
+ * MMU.. No stack expansion, at least.
+ */
+struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
+ unsigned long addr, struct pt_regs *regs)
+{
+ struct vm_area_struct *vma;
+
+ mmap_read_lock(mm);
+ vma = vma_lookup(mm, addr);
+ if (!vma)
+ mmap_read_unlock(mm);
+ return vma;
+}
+
+#endif /* CONFIG_MMU */
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index db7ba4a725d6..374aa6f021c6 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -32,7 +32,7 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
return false;
- batch = (void *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+ batch = (void *)__get_free_page(GFP_NOWAIT);
if (!batch)
return false;
@@ -364,7 +364,7 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
struct mmu_table_batch **batch = &tlb->batch;
if (*batch == NULL) {
- *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+ *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT);
if (*batch == NULL) {
tlb_table_invalidate(tlb);
tlb_remove_table_one(table);
@@ -424,6 +424,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
tlb->page_size = 0;
#endif
+ tlb->vma_pfn = 0;
__tlb_reset_range(tlb);
inc_tlb_flush_pending(tlb->mm);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index fc18fe274505..8e0125dc0522 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -4,7 +4,7 @@
*
* Copyright (C) 2008 Qumranet, Inc.
* Copyright (C) 2008 SGI
- * Christoph Lameter <cl@linux.com>
+ * Christoph Lameter <cl@gentwo.org>
*/
#include <linux/rculist.h>
diff --git a/mm/mmzone.c b/mm/mmzone.c
index f9baa8882fbf..0c8f181d9d50 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -99,14 +99,14 @@ int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
unsigned long old_flags, flags;
int last_cpupid;
- old_flags = READ_ONCE(folio->flags);
+ old_flags = READ_ONCE(folio->flags.f);
do {
flags = old_flags;
last_cpupid = (flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
- } while (unlikely(!try_cmpxchg(&folio->flags, &old_flags, flags)));
+ } while (unlikely(!try_cmpxchg(&folio->flags.f, &old_flags, flags)));
return last_cpupid;
}
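
The folio_xchg_last_cpupid() hunk above is a try_cmpxchg() read-modify-write on a
packed flags word. A minimal userspace model of the same pattern follows; the shift and
mask values are made up for illustration and are not the kernel's LAST_CPUPID_*
constants.

#include <stdatomic.h>
#include <stdio.h>

#define CPUPID_SHIFT	8
#define CPUPID_MASK	0xffffUL

static _Atomic unsigned long flags_word = (0x1234UL << CPUPID_SHIFT) | 0x3;

/* Swap the packed cpupid field, leaving the other flag bits untouched. */
static unsigned long xchg_cpupid(unsigned long cpupid)
{
	unsigned long old_flags = atomic_load(&flags_word);
	unsigned long new_flags, last;

	do {
		last = (old_flags >> CPUPID_SHIFT) & CPUPID_MASK;
		new_flags = old_flags & ~(CPUPID_MASK << CPUPID_SHIFT);
		new_flags |= (cpupid & CPUPID_MASK) << CPUPID_SHIFT;
	} while (!atomic_compare_exchange_weak(&flags_word, &old_flags, new_flags));

	return last;
}

int main(void)
{
	printf("previous cpupid: 0x%lx\n", xchg_cpupid(0xabcd));	/* 0x1234 */
	printf("flags word now:  0x%lx\n", atomic_load(&flags_word));
	return 0;
}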
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 62c1f7945741..113b48985834 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -40,11 +40,8 @@
#include "internal.h"
-bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
- pte_t pte)
+static bool maybe_change_pte_writable(struct vm_area_struct *vma, pte_t pte)
{
- struct page *page;
-
if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
return false;
@@ -60,16 +57,32 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
if (userfaultfd_pte_wp(vma, pte))
return false;
- if (!(vma->vm_flags & VM_SHARED)) {
- /*
- * Writable MAP_PRIVATE mapping: We can only special-case on
- * exclusive anonymous pages, because we know that our
- * write-fault handler similarly would map them writable without
- * any additional checks while holding the PT lock.
- */
- page = vm_normal_page(vma, addr, pte);
- return page && PageAnon(page) && PageAnonExclusive(page);
- }
+ return true;
+}
+
+static bool can_change_private_pte_writable(struct vm_area_struct *vma,
+ unsigned long addr, pte_t pte)
+{
+ struct page *page;
+
+ if (!maybe_change_pte_writable(vma, pte))
+ return false;
+
+ /*
+ * Writable MAP_PRIVATE mapping: We can only special-case on
+ * exclusive anonymous pages, because we know that our
+ * write-fault handler similarly would map them writable without
+ * any additional checks while holding the PT lock.
+ */
+ page = vm_normal_page(vma, addr, pte);
+ return page && PageAnon(page) && PageAnonExclusive(page);
+}
+
+static bool can_change_shared_pte_writable(struct vm_area_struct *vma,
+ pte_t pte)
+{
+ if (!maybe_change_pte_writable(vma, pte))
+ return false;
VM_WARN_ON_ONCE(is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte));
@@ -83,6 +96,179 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
return pte_dirty(pte);
}
+bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte)
+{
+ if (!(vma->vm_flags & VM_SHARED))
+ return can_change_private_pte_writable(vma, addr, pte);
+
+ return can_change_shared_pte_writable(vma, pte);
+}
+
+static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
+ pte_t pte, int max_nr_ptes, fpb_t flags)
+{
+ /* No underlying folio, so cannot batch */
+ if (!folio)
+ return 1;
+
+ if (!folio_test_large(folio))
+ return 1;
+
+ return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr_ptes, flags);
+}
+
+static bool prot_numa_skip(struct vm_area_struct *vma, unsigned long addr,
+ pte_t oldpte, pte_t *pte, int target_node,
+ struct folio *folio)
+{
+ bool ret = true;
+ bool toptier;
+ int nid;
+
+ /* Avoid TLB flush if possible */
+ if (pte_protnone(oldpte))
+ goto skip;
+
+ if (!folio)
+ goto skip;
+
+ if (folio_is_zone_device(folio) || folio_test_ksm(folio))
+ goto skip;
+
+ /* Also skip shared copy-on-write pages */
+ if (is_cow_mapping(vma->vm_flags) &&
+ (folio_maybe_dma_pinned(folio) || folio_maybe_mapped_shared(folio)))
+ goto skip;
+
+ /*
+ * While migration can move some dirty pages,
+ * it cannot move them all from MIGRATE_ASYNC
+ * context.
+ */
+ if (folio_is_file_lru(folio) && folio_test_dirty(folio))
+ goto skip;
+
+ /*
+ * Don't mess with PTEs if page is already on the node
+ * a single-threaded process is running on.
+ */
+ nid = folio_nid(folio);
+ if (target_node == nid)
+ goto skip;
+
+ toptier = node_is_toptier(nid);
+
+ /*
+ * Skip scanning top tier node if normal numa
+ * balancing is disabled
+ */
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && toptier)
+ goto skip;
+
+ ret = false;
+ if (folio_use_access_time(folio))
+ folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));
+
+skip:
+ return ret;
+}
+
+/* Set nr_ptes number of ptes, starting from idx */
+static void prot_commit_flush_ptes(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t oldpte, pte_t ptent, int nr_ptes,
+ int idx, bool set_write, struct mmu_gather *tlb)
+{
+ /*
+ * Advance the position in the batch by idx; note that if idx > 0,
+ * then the nr_ptes passed here is <= batch size - idx.
+ */
+ addr += idx * PAGE_SIZE;
+ ptep += idx;
+ oldpte = pte_advance_pfn(oldpte, idx);
+ ptent = pte_advance_pfn(ptent, idx);
+
+ if (set_write)
+ ptent = pte_mkwrite(ptent, vma);
+
+ modify_prot_commit_ptes(vma, addr, ptep, oldpte, ptent, nr_ptes);
+ if (pte_needs_flush(oldpte, ptent))
+ tlb_flush_pte_range(tlb, addr, nr_ptes * PAGE_SIZE);
+}
+
+/*
+ * Get max length of consecutive ptes pointing to PageAnonExclusive() pages or
+ * !PageAnonExclusive() pages, starting from start_idx. Caller must enforce
+ * that the ptes point to consecutive pages of the same anon large folio.
+ */
+static int page_anon_exclusive_sub_batch(int start_idx, int max_len,
+ struct page *first_page, bool expected_anon_exclusive)
+{
+ int idx;
+
+ for (idx = start_idx + 1; idx < start_idx + max_len; ++idx) {
+ if (expected_anon_exclusive != PageAnonExclusive(first_page + idx))
+ break;
+ }
+ return idx - start_idx;
+}
+
+/*
+ * This function is a result of trying our very best to retain the
+ * "avoid the write-fault handler" optimization. In can_change_pte_writable(),
+ * if the vma is a private vma, and we cannot determine whether to change
+ * the pte to writable just from the vma and the pte, we then need to look
+ * at the actual page pointed to by the pte. Unfortunately, if we have a
+ * batch of ptes pointing to consecutive pages of the same anon large folio,
+ * the anon-exclusivity (or the negation) of the first page does not guarantee
+ * the anon-exclusivity (or the negation) of the other pages corresponding to
+ * the pte batch; hence in this case it is incorrect to decide to change or
+ * not change the ptes to writable just by using information from the first
+ * pte of the batch. Therefore, we must individually check all pages and
+ * retrieve sub-batches.
+ */
+static void commit_anon_folio_batch(struct vm_area_struct *vma,
+ struct folio *folio, struct page *first_page, unsigned long addr, pte_t *ptep,
+ pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb)
+{
+ bool expected_anon_exclusive;
+ int sub_batch_idx = 0;
+ int len;
+
+ while (nr_ptes) {
+ expected_anon_exclusive = PageAnonExclusive(first_page + sub_batch_idx);
+ len = page_anon_exclusive_sub_batch(sub_batch_idx, nr_ptes,
+ first_page, expected_anon_exclusive);
+ prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent, len,
+ sub_batch_idx, expected_anon_exclusive, tlb);
+ sub_batch_idx += len;
+ nr_ptes -= len;
+ }
+}
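
The sub-batching above is easiest to see in isolation: split a batch into maximal runs
whose PageAnonExclusive() value is the same, then commit each run with a single call.
The standalone sketch below performs the same run-splitting over a plain array of
flags; the names and the printf stand-in for prot_commit_flush_ptes() are illustrative
only.

#include <stdbool.h>
#include <stdio.h>

/* Length of the run of equal flags starting at start, capped at max_len entries. */
static int run_length(const bool *flags, int start, int max_len)
{
	int idx;

	for (idx = start + 1; idx < start + max_len; idx++)
		if (flags[idx] != flags[start])
			break;
	return idx - start;
}

int main(void)
{
	/* One flag per page in the batch, e.g. PageAnonExclusive(). */
	bool anon_exclusive[] = { true, true, false, false, false, true };
	int nr = 6, idx = 0;

	while (nr) {
		int len = run_length(anon_exclusive, idx, nr);

		/* Kernel analogue: one prot_commit_flush_ptes(..., len, idx, set_write, ...). */
		printf("commit %d pte(s) at index %d, set_write=%d\n",
		       len, idx, anon_exclusive[idx]);
		idx += len;
		nr -= len;
	}
	return 0;
}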
+
+static void set_write_prot_commit_flush_ptes(struct vm_area_struct *vma,
+ struct folio *folio, struct page *page, unsigned long addr, pte_t *ptep,
+ pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb)
+{
+ bool set_write;
+
+ if (vma->vm_flags & VM_SHARED) {
+ set_write = can_change_shared_pte_writable(vma, ptent);
+ prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent, nr_ptes,
+ /* idx = */ 0, set_write, tlb);
+ return;
+ }
+
+ set_write = maybe_change_pte_writable(vma, ptent) &&
+ (folio && folio_test_anon(folio));
+ if (!set_write) {
+ prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent, nr_ptes,
+ /* idx = */ 0, set_write, tlb);
+ return;
+ }
+ commit_anon_folio_batch(vma, folio, page, addr, ptep, oldpte, ptent, nr_ptes, tlb);
+}
+
static long change_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
unsigned long end, pgprot_t newprot, unsigned long cp_flags)
@@ -94,6 +280,7 @@ static long change_pte_range(struct mmu_gather *tlb,
bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+ int nr_ptes;
tlb_change_page_size(tlb, PAGE_SIZE);
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -108,65 +295,37 @@ static long change_pte_range(struct mmu_gather *tlb,
flush_tlb_batched_pending(vma->vm_mm);
arch_enter_lazy_mmu_mode();
do {
+ nr_ptes = 1;
oldpte = ptep_get(pte);
if (pte_present(oldpte)) {
+ const fpb_t flags = FPB_RESPECT_SOFT_DIRTY | FPB_RESPECT_WRITE;
+ int max_nr_ptes = (end - addr) >> PAGE_SHIFT;
+ struct folio *folio = NULL;
+ struct page *page;
pte_t ptent;
+ page = vm_normal_page(vma, addr, oldpte);
+ if (page)
+ folio = page_folio(page);
/*
* Avoid trapping faults against the zero or KSM
* pages. See similar comment in change_huge_pmd.
*/
if (prot_numa) {
- struct folio *folio;
- int nid;
- bool toptier;
-
- /* Avoid TLB flush if possible */
- if (pte_protnone(oldpte))
- continue;
-
- folio = vm_normal_folio(vma, addr, oldpte);
- if (!folio || folio_is_zone_device(folio) ||
- folio_test_ksm(folio))
- continue;
-
- /* Also skip shared copy-on-write pages */
- if (is_cow_mapping(vma->vm_flags) &&
- (folio_maybe_dma_pinned(folio) ||
- folio_maybe_mapped_shared(folio)))
- continue;
-
- /*
- * While migration can move some dirty pages,
- * it cannot move them all from MIGRATE_ASYNC
- * context.
- */
- if (folio_is_file_lru(folio) &&
- folio_test_dirty(folio))
- continue;
-
- /*
- * Don't mess with PTEs if page is already on the node
- * a single-threaded process is running on.
- */
- nid = folio_nid(folio);
- if (target_node == nid)
- continue;
- toptier = node_is_toptier(nid);
+ int ret = prot_numa_skip(vma, addr, oldpte, pte,
+ target_node, folio);
+ if (ret) {
- /*
- * Skip scanning top tier node if normal numa
- * balancing is disabled
- */
- if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
- toptier)
+ /* determine batch to skip */
+ nr_ptes = mprotect_folio_pte_batch(folio,
+ pte, oldpte, max_nr_ptes, /* flags = */ 0);
continue;
- if (folio_use_access_time(folio))
- folio_xchg_access_time(folio,
- jiffies_to_msecs(jiffies));
+ }
}
- oldpte = ptep_modify_prot_start(vma, addr, pte);
+ nr_ptes = mprotect_folio_pte_batch(folio, pte, oldpte, max_nr_ptes, flags);
+
+ oldpte = modify_prot_start_ptes(vma, addr, pte, nr_ptes);
ptent = pte_modify(oldpte, newprot);
if (uffd_wp)
@@ -188,14 +347,13 @@ static long change_pte_range(struct mmu_gather *tlb,
* COW or special handling is required.
*/
if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
- !pte_write(ptent) &&
- can_change_pte_writable(vma, addr, ptent))
- ptent = pte_mkwrite(ptent, vma);
-
- ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
- if (pte_needs_flush(oldpte, ptent))
- tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
- pages++;
+ !pte_write(ptent))
+ set_write_prot_commit_flush_ptes(vma, folio, page,
+ addr, pte, oldpte, ptent, nr_ptes, tlb);
+ else
+ prot_commit_flush_ptes(vma, addr, pte, oldpte, ptent,
+ nr_ptes, /* idx = */ 0, /* set_write = */ false, tlb);
+ pages += nr_ptes;
} else if (is_swap_pte(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
pte_t newpte;
@@ -280,7 +438,7 @@ static long change_pte_range(struct mmu_gather *tlb,
pages++;
}
}
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ } while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
@@ -376,10 +534,10 @@ again:
goto next;
_pmd = pmdp_get_lockless(pmd);
- if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd) || pmd_devmap(_pmd)) {
+ if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd)) {
if ((next - addr != HPAGE_PMD_SIZE) ||
pgtable_split_needed(vma, cp_flags)) {
- __split_huge_pmd(vma, pmd, addr, false, NULL);
+ __split_huge_pmd(vma, pmd, addr, false);
/*
* For file-backed, the pmd could have been
* cleared; make sure pmd populated if
@@ -596,16 +754,16 @@ static const struct mm_walk_ops prot_none_walk_ops = {
int
mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
struct vm_area_struct *vma, struct vm_area_struct **pprev,
- unsigned long start, unsigned long end, unsigned long newflags)
+ unsigned long start, unsigned long end, vm_flags_t newflags)
{
struct mm_struct *mm = vma->vm_mm;
- unsigned long oldflags = READ_ONCE(vma->vm_flags);
+ vm_flags_t oldflags = READ_ONCE(vma->vm_flags);
long nrpages = (end - start) >> PAGE_SHIFT;
unsigned int mm_cp_flags = 0;
unsigned long charged = 0;
int error;
- if (!can_modify_vma(vma))
+ if (vma_is_sealed(vma))
return -EPERM;
if (newflags == oldflags) {
@@ -774,8 +932,8 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
nstart = start;
tmp = vma->vm_start;
for_each_vma_range(vmi, vma, end) {
- unsigned long mask_off_old_flags;
- unsigned long newflags;
+ vm_flags_t mask_off_old_flags;
+ vm_flags_t newflags;
int new_vma_pkey;
if (vma->vm_start != tmp) {
diff --git a/mm/mremap.c b/mm/mremap.c
index 7db9da609c84..35de0a7b910e 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -52,7 +52,7 @@ struct vma_remap_struct {
unsigned long addr; /* User-specified address from which we remap. */
unsigned long old_len; /* Length of range being remapped. */
unsigned long new_len; /* Desired new length of mapping. */
- unsigned long flags; /* user-specified MREMAP_* flags. */
+ const unsigned long flags; /* user-specified MREMAP_* flags. */
unsigned long new_addr; /* Optionally, desired new address. */
/* uffd state. */
@@ -65,10 +65,11 @@ struct vma_remap_struct {
/* Internal state, determined in do_mremap(). */
unsigned long delta; /* Absolute delta of old_len,new_len. */
- bool mlocked; /* Was the VMA mlock()'d? */
+ bool populate_expand; /* mlock()'d expanded, must populate. */
enum mremap_type remap_type; /* expand, shrink, etc. */
bool mmap_locked; /* Is mm currently write-locked? */
unsigned long charged; /* If VM_ACCOUNT, # pages to account. */
+ bool vmi_needs_invalidate; /* Is the VMA iterator invalidated? */
};
static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
@@ -170,13 +171,33 @@ static pte_t move_soft_dirty_pte(pte_t pte)
return pte;
}
+static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t pte, int max_nr)
+{
+ struct folio *folio;
+
+ if (max_nr == 1)
+ return 1;
+
+ /* Avoid expensive folio lookup if we stand no chance of benefit. */
+ if (pte_batch_hint(ptep, pte) == 1)
+ return 1;
+
+ folio = vm_normal_folio(vma, addr, pte);
+ if (!folio || !folio_test_large(folio))
+ return 1;
+
+ return folio_pte_batch(folio, ptep, pte, max_nr);
+}
+
static int move_ptes(struct pagetable_move_control *pmc,
unsigned long extent, pmd_t *old_pmd, pmd_t *new_pmd)
{
struct vm_area_struct *vma = pmc->old;
bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
struct mm_struct *mm = vma->vm_mm;
- pte_t *old_pte, *new_pte, pte;
+ pte_t *old_ptep, *new_ptep;
+ pte_t old_pte, pte;
pmd_t dummy_pmdval;
spinlock_t *old_ptl, *new_ptl;
bool force_flush = false;
@@ -184,6 +205,8 @@ static int move_ptes(struct pagetable_move_control *pmc,
unsigned long new_addr = pmc->new_addr;
unsigned long old_end = old_addr + extent;
unsigned long len = old_end - old_addr;
+ int max_nr_ptes;
+ int nr_ptes;
int err = 0;
/*
@@ -211,8 +234,8 @@ static int move_ptes(struct pagetable_move_control *pmc,
* We don't have to worry about the ordering of src and dst
* pte locks because exclusive mmap_lock prevents deadlock.
*/
- old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
- if (!old_pte) {
+ old_ptep = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
+ if (!old_ptep) {
err = -EAGAIN;
goto out;
}
@@ -223,10 +246,10 @@ static int move_ptes(struct pagetable_move_control *pmc,
* mmap_lock, so this new_pte page is stable, so there is no need to get
* pmdval and do pmd_same() check.
*/
- new_pte = pte_offset_map_rw_nolock(mm, new_pmd, new_addr, &dummy_pmdval,
+ new_ptep = pte_offset_map_rw_nolock(mm, new_pmd, new_addr, &dummy_pmdval,
&new_ptl);
- if (!new_pte) {
- pte_unmap_unlock(old_pte, old_ptl);
+ if (!new_ptep) {
+ pte_unmap_unlock(old_ptep, old_ptl);
err = -EAGAIN;
goto out;
}
@@ -235,12 +258,16 @@ static int move_ptes(struct pagetable_move_control *pmc,
flush_tlb_batched_pending(vma->vm_mm);
arch_enter_lazy_mmu_mode();
- for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
- new_pte++, new_addr += PAGE_SIZE) {
- if (pte_none(ptep_get(old_pte)))
+ for (; old_addr < old_end; old_ptep += nr_ptes, old_addr += nr_ptes * PAGE_SIZE,
+ new_ptep += nr_ptes, new_addr += nr_ptes * PAGE_SIZE) {
+ VM_WARN_ON_ONCE(!pte_none(*new_ptep));
+
+ nr_ptes = 1;
+ max_nr_ptes = (old_end - old_addr) >> PAGE_SHIFT;
+ old_pte = ptep_get(old_ptep);
+ if (pte_none(old_pte))
continue;
- pte = ptep_get_and_clear(mm, old_addr, old_pte);
/*
* If we are remapping a valid PTE, make sure
* to flush TLB before we drop the PTL for the
@@ -252,13 +279,17 @@ static int move_ptes(struct pagetable_move_control *pmc,
* the TLB entry for the old mapping has been
* flushed.
*/
- if (pte_present(pte))
+ if (pte_present(old_pte)) {
+ nr_ptes = mremap_folio_pte_batch(vma, old_addr, old_ptep,
+ old_pte, max_nr_ptes);
force_flush = true;
+ }
+ pte = get_and_clear_ptes(mm, old_addr, old_ptep, nr_ptes);
pte = move_pte(pte, old_addr, new_addr);
pte = move_soft_dirty_pte(pte);
if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
- pte_clear(mm, new_addr, new_pte);
+ pte_clear(mm, new_addr, new_ptep);
else {
if (need_clear_uffd_wp) {
if (pte_present(pte))
@@ -266,7 +297,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
else if (is_swap_pte(pte))
pte = pte_swp_clear_uffd_wp(pte);
}
- set_pte_at(mm, new_addr, new_pte, pte);
+ set_ptes(mm, new_addr, new_ptep, pte, nr_ptes);
}
}
@@ -275,8 +306,8 @@ static int move_ptes(struct pagetable_move_control *pmc,
flush_tlb_range(vma, old_end - len, old_end);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
- pte_unmap(new_pte - 1);
- pte_unmap_unlock(old_pte - 1, old_ptl);
+ pte_unmap(new_ptep - 1);
+ pte_unmap_unlock(old_ptep - 1, old_ptl);
out:
if (pmc->need_rmap_locks)
drop_rmap_locks(vma);
@@ -292,6 +323,25 @@ static inline bool arch_supports_page_table_move(void)
}
#endif
+static inline bool uffd_supports_page_table_move(struct pagetable_move_control *pmc)
+{
+ /*
+ * If we are moving a VMA that has uffd-wp registered but with
+ * remap events disabled (new VMA will not be registered with uffd), we
+ * need to ensure that the uffd-wp state is cleared from all pgtables.
+ * This means recursing into lower page tables in move_page_tables().
+ *
+ * We might get called with VMAs reversed when recovering from a
+ * failed page table move. In that case, the
+ * "old"-but-actually-"originally new" VMA during recovery will not have
+ * a uffd context. Recursing into lower page tables during the original
+ * move but not during the recovery move will cause trouble, because we
+ * run into already-existing page tables. So check both VMAs.
+ */
+ return !vma_has_uffd_without_event_remap(pmc->old) &&
+ !vma_has_uffd_without_event_remap(pmc->new);
+}
+
#ifdef CONFIG_HAVE_MOVE_PMD
static bool move_normal_pmd(struct pagetable_move_control *pmc,
pmd_t *old_pmd, pmd_t *new_pmd)
@@ -304,6 +354,8 @@ static bool move_normal_pmd(struct pagetable_move_control *pmc,
if (!arch_supports_page_table_move())
return false;
+ if (!uffd_supports_page_table_move(pmc))
+ return false;
/*
* The destination pmd shouldn't be established, free_pgtables()
* should have released it.
@@ -330,15 +382,6 @@ static bool move_normal_pmd(struct pagetable_move_control *pmc,
if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
return false;
- /* If this pmd belongs to a uffd vma with remap events disabled, we need
- * to ensure that the uffd-wp state is cleared from all pgtables. This
- * means recursing into lower page tables in move_page_tables(), and we
- * can reuse the existing code if we simply treat the entry as "not
- * moved".
- */
- if (vma_has_uffd_without_event_remap(vma))
- return false;
-
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
@@ -387,6 +430,8 @@ static bool move_normal_pud(struct pagetable_move_control *pmc,
if (!arch_supports_page_table_move())
return false;
+ if (!uffd_supports_page_table_move(pmc))
+ return false;
/*
* The destination pud shouldn't be established, free_pgtables()
* should have released it.
@@ -394,15 +439,6 @@ static bool move_normal_pud(struct pagetable_move_control *pmc,
if (WARN_ON_ONCE(!pud_none(*new_pud)))
return false;
- /* If this pud belongs to a uffd vma with remap events disabled, we need
- * to ensure that the uffd-wp state is cleared from all pgtables. This
- * means recursing into lower page tables in move_page_tables(), and we
- * can reuse the existing code if we simply treat the entry as "not
- * moved".
- */
- if (vma_has_uffd_without_event_remap(vma))
- return false;
-
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
@@ -792,7 +828,7 @@ unsigned long move_page_tables(struct pagetable_move_control *pmc)
new_pud = alloc_new_pud(mm, pmc->new_addr);
if (!new_pud)
break;
- if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
+ if (pud_trans_huge(*old_pud)) {
if (extent == HPAGE_PUD_SIZE) {
move_pgt_entry(pmc, HPAGE_PUD, old_pud, new_pud);
/* We ignore and continue on error? */
@@ -811,8 +847,7 @@ unsigned long move_page_tables(struct pagetable_move_control *pmc)
if (!new_pmd)
break;
again:
- if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
- pmd_devmap(*old_pmd)) {
+ if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd)) {
if (extent == HPAGE_PMD_SIZE &&
move_pgt_entry(pmc, HPAGE_PMD, old_pmd, new_pmd))
continue;
@@ -884,7 +919,11 @@ static bool vrm_overlaps(struct vma_remap_struct *vrm)
return false;
}
-/* Do the mremap() flags require that the new_addr parameter be specified? */
+/*
+ * Will a new address definitely be assigned? This is the case either if the
+ * user specifies it via MREMAP_FIXED, or if MREMAP_DONTUNMAP is used,
+ * indicating we will always determine a target address.
+ */
static bool vrm_implies_new_addr(struct vma_remap_struct *vrm)
{
return vrm->flags & (MREMAP_FIXED | MREMAP_DONTUNMAP);
@@ -930,7 +969,7 @@ static unsigned long vrm_set_new_addr(struct vma_remap_struct *vrm)
*
* Returns true on success, false if insufficient memory to charge.
*/
-static bool vrm_charge(struct vma_remap_struct *vrm)
+static bool vrm_calc_charge(struct vma_remap_struct *vrm)
{
unsigned long charged;
@@ -981,10 +1020,8 @@ static void vrm_stat_account(struct vma_remap_struct *vrm,
struct vm_area_struct *vma = vrm->vma;
vm_stat_account(mm, vma->vm_flags, pages);
- if (vma->vm_flags & VM_LOCKED) {
+ if (vma->vm_flags & VM_LOCKED)
mm->locked_vm += pages;
- vrm->mlocked = true;
- }
}
/*
@@ -997,7 +1034,7 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm)
struct vm_area_struct *vma = vrm->vma;
unsigned long old_addr = vrm->addr;
unsigned long old_len = vrm->old_len;
- unsigned long dummy = vma->vm_flags;
+ vm_flags_t dummy = vma->vm_flags;
/*
* We'd prefer to avoid failure later on in do_munmap:
@@ -1084,6 +1121,7 @@ static void unmap_source_vma(struct vma_remap_struct *vrm)
err = do_vmi_munmap(&vmi, mm, addr, len, vrm->uf_unmap, /* unlock= */false);
vrm->vma = NULL; /* Invalidated. */
+ vrm->vmi_needs_invalidate = true;
if (err) {
/* OOM: unable to split vma, just get accounts right */
vm_acct_memory(len >> PAGE_SHIFT);
@@ -1159,6 +1197,10 @@ static int copy_vma_and_data(struct vma_remap_struct *vrm,
*new_vma_ptr = NULL;
return -ENOMEM;
}
+ /* By merging, we may have invalidated any iterator in use. */
+ if (vma != vrm->vma)
+ vrm->vmi_needs_invalidate = true;
+
vrm->vma = vma;
pmc.old = vma;
pmc.new = new_vma;
@@ -1188,12 +1230,7 @@ static int copy_vma_and_data(struct vma_remap_struct *vrm,
mremap_userfaultfd_prep(new_vma, vrm->uf);
}
- if (is_vm_hugetlb_page(vma))
- clear_vma_resv_huge_pages(vma);
-
- /* Tell pfnmap has moved from this vma */
- if (unlikely(vma->vm_flags & VM_PFNMAP))
- untrack_pfn_clear(vma);
+ fixup_hugetlb_reservations(vma);
*new_vma_ptr = new_vma;
return err;
@@ -1240,8 +1277,11 @@ static unsigned long move_vma(struct vma_remap_struct *vrm)
if (err)
return err;
- /* If accounted, charge the number of bytes the operation will use. */
- if (!vrm_charge(vrm))
+ /*
+ * If accounted, determine the number of bytes the operation will
+ * charge.
+ */
+ if (!vrm_calc_charge(vrm))
return -ENOMEM;
/* We don't want racing faults. */
@@ -1280,64 +1320,6 @@ static unsigned long move_vma(struct vma_remap_struct *vrm)
}
/*
- * resize_is_valid() - Ensure the vma can be resized to the new length at the give
- * address.
- *
- * Return 0 on success, error otherwise.
- */
-static int resize_is_valid(struct vma_remap_struct *vrm)
-{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma = vrm->vma;
- unsigned long addr = vrm->addr;
- unsigned long old_len = vrm->old_len;
- unsigned long new_len = vrm->new_len;
- unsigned long pgoff;
-
- /*
- * !old_len is a special case where an attempt is made to 'duplicate'
- * a mapping. This makes no sense for private mappings as it will
- * instead create a fresh/new mapping unrelated to the original. This
- * is contrary to the basic idea of mremap which creates new mappings
- * based on the original. There are no known use cases for this
- * behavior. As a result, fail such attempts.
- */
- if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
- pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n",
- current->comm, current->pid);
- return -EINVAL;
- }
-
- if ((vrm->flags & MREMAP_DONTUNMAP) &&
- (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
- return -EINVAL;
-
- /* We can't remap across vm area boundaries */
- if (old_len > vma->vm_end - addr)
- return -EFAULT;
-
- if (new_len == old_len)
- return 0;
-
- /* Need to be careful about a growing mapping */
- pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
- pgoff += vma->vm_pgoff;
- if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
- return -EINVAL;
-
- if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
- return -EFAULT;
-
- if (!mlock_future_ok(mm, vma->vm_flags, vrm->delta))
- return -EAGAIN;
-
- if (!may_expand_vm(mm, vma->vm_flags, vrm->delta >> PAGE_SHIFT))
- return -ENOMEM;
-
- return 0;
-}
-
-/*
* The user has requested that the VMA be shrunk (i.e., old_len > new_len), so
* execute this, optionally dropping the mmap lock when we do so.
*
@@ -1386,14 +1368,6 @@ static unsigned long mremap_to(struct vma_remap_struct *vrm)
struct mm_struct *mm = current->mm;
unsigned long err;
- /* Is the new length or address silly? */
- if (vrm->new_len > TASK_SIZE ||
- vrm->new_addr > TASK_SIZE - vrm->new_len)
- return -EINVAL;
-
- if (vrm_overlaps(vrm))
- return -EINVAL;
-
if (vrm->flags & MREMAP_FIXED) {
/*
* In mremap_to().
@@ -1403,6 +1377,7 @@ static unsigned long mremap_to(struct vma_remap_struct *vrm)
err = do_munmap(mm, vrm->new_addr, vrm->new_len,
vrm->uf_unmap_early);
vrm->vma = NULL; /* Invalidated. */
+ vrm->vmi_needs_invalidate = true;
if (err)
return err;
@@ -1424,10 +1399,6 @@ static unsigned long mremap_to(struct vma_remap_struct *vrm)
vrm->old_len = vrm->new_len;
}
- err = resize_is_valid(vrm);
- if (err)
- return err;
-
/* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
if (vrm->flags & MREMAP_DONTUNMAP) {
vm_flags_t vm_flags = vrm->vma->vm_flags;
@@ -1476,68 +1447,6 @@ static bool vrm_can_expand_in_place(struct vma_remap_struct *vrm)
}
/*
- * Are the parameters passed to mremap() valid? If so return 0, otherwise return
- * error.
- */
-static unsigned long check_mremap_params(struct vma_remap_struct *vrm)
-
-{
- unsigned long addr = vrm->addr;
- unsigned long flags = vrm->flags;
-
- /* Ensure no unexpected flag values. */
- if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
- return -EINVAL;
-
- /* Start address must be page-aligned. */
- if (offset_in_page(addr))
- return -EINVAL;
-
- /*
- * We allow a zero old-len as a special case
- * for DOS-emu "duplicate shm area" thing. But
- * a zero new-len is nonsensical.
- */
- if (!PAGE_ALIGN(vrm->new_len))
- return -EINVAL;
-
- /* Remainder of checks are for cases with specific new_addr. */
- if (!vrm_implies_new_addr(vrm))
- return 0;
-
- /* The new address must be page-aligned. */
- if (offset_in_page(vrm->new_addr))
- return -EINVAL;
-
- /* A fixed address implies a move. */
- if (!(flags & MREMAP_MAYMOVE))
- return -EINVAL;
-
- /* MREMAP_DONTUNMAP does not allow resizing in the process. */
- if (flags & MREMAP_DONTUNMAP && vrm->old_len != vrm->new_len)
- return -EINVAL;
-
- /*
- * move_vma() need us to stay 4 maps below the threshold, otherwise
- * it will bail out at the very beginning.
- * That is a problem if we have already unmaped the regions here
- * (new_addr, and old_addr), because userspace will not know the
- * state of the vma's after it gets -ENOMEM.
- * So, to avoid such scenario we can pre-compute if the whole
- * operation has high chances to success map-wise.
- * Worst-scenario case is when both vma's (new_addr and old_addr) get
- * split in 3 before unmapping it.
- * That means 2 more maps (1 for each) to the ones we already hold.
- * Check whether current map count plus 2 still leads us to 4 maps below
- * the threshold, otherwise return -ENOMEM here to be more safe.
- */
- if ((current->mm->map_count + 2) >= sysctl_max_map_count - 3)
- return -ENOMEM;
-
- return 0;
-}
-
-/*
* We know we can expand the VMA in-place by delta pages, so do so.
*
* If we discover the VMA is locked, update mm_struct statistics accordingly and
@@ -1549,7 +1458,7 @@ static unsigned long expand_vma_in_place(struct vma_remap_struct *vrm)
struct vm_area_struct *vma = vrm->vma;
VMA_ITERATOR(vmi, mm, vma->vm_end);
- if (!vrm_charge(vrm))
+ if (!vrm_calc_charge(vrm))
return -ENOMEM;
/*
@@ -1593,8 +1502,6 @@ static bool align_hugetlb(struct vma_remap_struct *vrm)
if (vrm->new_len > vrm->old_len)
return false;
- vrm_set_delta(vrm);
-
return true;
}
@@ -1608,11 +1515,6 @@ static bool align_hugetlb(struct vma_remap_struct *vrm)
static unsigned long expand_vma(struct vma_remap_struct *vrm)
{
unsigned long err;
- unsigned long addr = vrm->addr;
-
- err = resize_is_valid(vrm);
- if (err)
- return err;
/*
* [addr, old_len) spans precisely to the end of the VMA, so try to
@@ -1623,16 +1525,8 @@ static unsigned long expand_vma(struct vma_remap_struct *vrm)
if (err)
return err;
- /*
- * We want to populate the newly expanded portion of the VMA to
- * satisfy the expectation that mlock()'ing a VMA maintains all
- * of its pages in memory.
- */
- if (vrm->mlocked)
- vrm->new_addr = addr;
-
/* OK we're done! */
- return addr;
+ return vrm->addr;
}
/*
@@ -1683,64 +1577,381 @@ static unsigned long mremap_at(struct vma_remap_struct *vrm)
return expand_vma(vrm);
}
- BUG();
+ /* Should not be possible. */
+ WARN_ON_ONCE(1);
+ return -EINVAL;
}
-static unsigned long do_mremap(struct vma_remap_struct *vrm)
+/*
+ * Will this operation result in the VMA being expanded or moved and thus need
+ * to map a new portion of virtual address space?
+ */
+static bool vrm_will_map_new(struct vma_remap_struct *vrm)
+{
+ if (vrm->remap_type == MREMAP_EXPAND)
+ return true;
+
+ if (vrm_implies_new_addr(vrm))
+ return true;
+
+ return false;
+}
+
+/* Does this remap ONLY move mappings? */
+static bool vrm_move_only(struct vma_remap_struct *vrm)
+{
+ if (!(vrm->flags & MREMAP_FIXED))
+ return false;
+
+ if (vrm->old_len != vrm->new_len)
+ return false;
+
+ return true;
+}
+
+static void notify_uffd(struct vma_remap_struct *vrm, bool failed)
+{
+ struct mm_struct *mm = current->mm;
+
+ /* Regardless of success/failure, we always notify of any unmaps. */
+ userfaultfd_unmap_complete(mm, vrm->uf_unmap_early);
+ if (failed)
+ mremap_userfaultfd_fail(vrm->uf);
+ else
+ mremap_userfaultfd_complete(vrm->uf, vrm->addr,
+ vrm->new_addr, vrm->old_len);
+ userfaultfd_unmap_complete(mm, vrm->uf_unmap);
+}
+
+static bool vma_multi_allowed(struct vm_area_struct *vma)
+{
+ struct file *file = vma->vm_file;
+
+ /*
+ * We can't support moving multiple uffd VMAs as notify requires
+ * mmap lock to be dropped.
+ */
+ if (userfaultfd_armed(vma))
+ return false;
+
+ /*
+ * Custom get unmapped area might result in MREMAP_FIXED not
+ * being obeyed.
+ */
+ if (!file || !file->f_op->get_unmapped_area)
+ return true;
+ /* Known good. */
+ if (vma_is_shmem(vma))
+ return true;
+ if (is_vm_hugetlb_page(vma))
+ return true;
+ if (file->f_op->get_unmapped_area == thp_get_unmapped_area)
+ return true;
+
+ return false;
+}
+
+static int check_prep_vma(struct vma_remap_struct *vrm)
{
+ struct vm_area_struct *vma = vrm->vma;
struct mm_struct *mm = current->mm;
+ unsigned long addr = vrm->addr;
+ unsigned long old_len, new_len, pgoff;
+
+ if (!vma)
+ return -EFAULT;
+
+ /* If mseal()'d, mremap() is prohibited. */
+ if (vma_is_sealed(vma))
+ return -EPERM;
+
+ /* Align to hugetlb page size, if required. */
+ if (is_vm_hugetlb_page(vma) && !align_hugetlb(vrm))
+ return -EINVAL;
+
+ vrm_set_delta(vrm);
+ vrm->remap_type = vrm_remap_type(vrm);
+ /* For convenience, we set new_addr even if VMA won't move. */
+ if (!vrm_implies_new_addr(vrm))
+ vrm->new_addr = addr;
+
+ /* Below only meaningful if we expand or move a VMA. */
+ if (!vrm_will_map_new(vrm))
+ return 0;
+
+ old_len = vrm->old_len;
+ new_len = vrm->new_len;
+
+ /*
+ * !old_len is a special case where an attempt is made to 'duplicate'
+ * a mapping. This makes no sense for private mappings as it will
+ * instead create a fresh/new mapping unrelated to the original. This
+ * is contrary to the basic idea of mremap which creates new mappings
+ * based on the original. There are no known use cases for this
+ * behavior. As a result, fail such attempts.
+ */
+ if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
+ pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n",
+ current->comm, current->pid);
+ return -EINVAL;
+ }
+
+ if ((vrm->flags & MREMAP_DONTUNMAP) &&
+ (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
+ return -EINVAL;
+
+ /*
+ * We permit crossing of boundaries for the range being unmapped due to
+ * a shrink.
+ */
+ if (vrm->remap_type == MREMAP_SHRINK)
+ old_len = new_len;
+
+ /*
+ * We can't remap across the end of VMAs, as another VMA may be
+ * adjacent:
+ *
+ * addr vma->vm_end
+ * |-----.----------|
+ * | . |
+ * |-----.----------|
+ * .<--------->xxx>
+ * old_len
+ *
+ * We also require that vma->vm_start <= addr < vma->vm_end.
+ */
+ if (old_len > vma->vm_end - addr)
+ return -EFAULT;
+
+ if (new_len == old_len)
+ return 0;
+
+ /* We are expanding and the VMA is mlock()'d so we need to populate. */
+ if (vma->vm_flags & VM_LOCKED)
+ vrm->populate_expand = true;
+
+ /* Need to be careful about a growing mapping */
+ pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
+ pgoff += vma->vm_pgoff;
+ if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
+ return -EINVAL;
+
+ if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
+ return -EFAULT;
+
+ if (!mlock_future_ok(mm, vma->vm_flags, vrm->delta))
+ return -EAGAIN;
+
+ if (!may_expand_vm(mm, vma->vm_flags, vrm->delta >> PAGE_SHIFT))
+ return -ENOMEM;
+
+ return 0;
+}
+
+/*
+ * Are the parameters passed to mremap() valid? If so return 0, otherwise return
+ * error.
+ */
+static unsigned long check_mremap_params(struct vma_remap_struct *vrm)
+
+{
+ unsigned long addr = vrm->addr;
+ unsigned long flags = vrm->flags;
+
+ /* Ensure no unexpected flag values. */
+ if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
+ return -EINVAL;
+
+ /* Start address must be page-aligned. */
+ if (offset_in_page(addr))
+ return -EINVAL;
+
+ /*
+ * We allow a zero old-len as a special case
+ * for DOS-emu "duplicate shm area" thing. But
+ * a zero new-len is nonsensical.
+ */
+ if (!vrm->new_len)
+ return -EINVAL;
+
+ /* Is the new length silly? */
+ if (vrm->new_len > TASK_SIZE)
+ return -EINVAL;
+
+ /* Remainder of checks are for cases with specific new_addr. */
+ if (!vrm_implies_new_addr(vrm))
+ return 0;
+
+ /* Is the new address silly? */
+ if (vrm->new_addr > TASK_SIZE - vrm->new_len)
+ return -EINVAL;
+
+ /* The new address must be page-aligned. */
+ if (offset_in_page(vrm->new_addr))
+ return -EINVAL;
+
+ /* A fixed address implies a move. */
+ if (!(flags & MREMAP_MAYMOVE))
+ return -EINVAL;
+
+ /* MREMAP_DONTUNMAP does not allow resizing in the process. */
+ if (flags & MREMAP_DONTUNMAP && vrm->old_len != vrm->new_len)
+ return -EINVAL;
+
+ /* Target VMA must not overlap source VMA. */
+ if (vrm_overlaps(vrm))
+ return -EINVAL;
+
+ /*
+ * move_vma() needs us to stay 4 maps below the threshold, otherwise
+ * it will bail out at the very beginning.
+ * That is a problem if we have already unmapped the regions here
+ * (new_addr and old_addr), because userspace will not know the
+ * state of the VMAs after it gets -ENOMEM.
+ * So, to avoid such a scenario, we can pre-compute whether the whole
+ * operation has a high chance of succeeding map-wise.
+ * The worst case is when both VMAs (new_addr and old_addr) get
+ * split in 3 before unmapping them.
+ * That means 2 more maps (1 for each) on top of the ones we already hold.
+ * Check whether the current map count plus 2 still leaves us 4 maps below
+ * the threshold, otherwise return -ENOMEM here to be safe.
+ */
+ if ((current->mm->map_count + 2) >= sysctl_max_map_count - 3)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static unsigned long remap_move(struct vma_remap_struct *vrm)
+{
struct vm_area_struct *vma;
- unsigned long ret;
+ unsigned long start = vrm->addr;
+ unsigned long end = vrm->addr + vrm->old_len;
+ unsigned long new_addr = vrm->new_addr;
+ unsigned long target_addr = new_addr;
+ unsigned long res = -EFAULT;
+ unsigned long last_end;
+ bool seen_vma = false;
+
+ VMA_ITERATOR(vmi, current->mm, start);
- ret = check_mremap_params(vrm);
- if (ret)
- return ret;
+ /*
+ * When moving VMAs we allow for batched moves across multiple VMAs,
+ * with all VMAs in the input range [addr, addr + old_len) being moved
+ * (and split as necessary).
+ */
+ for_each_vma_range(vmi, vma, end) {
+ /* Account for start, end not aligned with VMA start, end. */
+ unsigned long addr = max(vma->vm_start, start);
+ unsigned long len = min(end, vma->vm_end) - addr;
+ unsigned long offset, res_vma;
+ bool multi_allowed;
+
+ /* No gap permitted at the start of the range. */
+ if (!seen_vma && start < vma->vm_start)
+ return -EFAULT;
+
+ /*
+ * To sensibly move multiple VMAs, accounting for the fact that
+ * get_unmapped_area() may align even MAP_FIXED moves, we simply
+ * attempt to move such that the gaps between source VMAs remain
+ * consistent in destination VMAs, e.g.:
+ *
+ * X Y X Y
+ * <---> <-> <---> <->
+ * |-------| |-----| |-----| |-------| |-----| |-----|
+ * | A | | B | | C | ---> | A' | | B' | | C' |
+ * |-------| |-----| |-----| |-------| |-----| |-----|
+ * new_addr
+ *
+ * So we map B' at A'->vm_end + X, and C' at B'->vm_end + Y.
+ */
+ offset = seen_vma ? vma->vm_start - last_end : 0;
+ last_end = vma->vm_end;
+
+ vrm->vma = vma;
+ vrm->addr = addr;
+ vrm->new_addr = target_addr + offset;
+ vrm->old_len = vrm->new_len = len;
+
+ multi_allowed = vma_multi_allowed(vma);
+ if (!multi_allowed) {
+ /* This is not the first VMA, abort immediately. */
+ if (seen_vma)
+ return -EFAULT;
+ /* This is the first, but there are more, abort. */
+ if (vma->vm_end < end)
+ return -EFAULT;
+ }
+
+ res_vma = check_prep_vma(vrm);
+ if (!res_vma)
+ res_vma = mremap_to(vrm);
+ if (IS_ERR_VALUE(res_vma))
+ return res_vma;
+
+ if (!seen_vma) {
+ VM_WARN_ON_ONCE(multi_allowed && res_vma != new_addr);
+ res = res_vma;
+ }
+
+ /* mmap lock is only dropped on shrink. */
+ VM_WARN_ON_ONCE(!vrm->mmap_locked);
+ /* This is a move, no expand should occur. */
+ VM_WARN_ON_ONCE(vrm->populate_expand);
+
+ if (vrm->vmi_needs_invalidate) {
+ vma_iter_invalidate(&vmi);
+ vrm->vmi_needs_invalidate = false;
+ }
+ seen_vma = true;
+ target_addr = res_vma + vrm->new_len;
+ }
+
+ return res;
+}
+
+static unsigned long do_mremap(struct vma_remap_struct *vrm)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long res;
+ bool failed;
vrm->old_len = PAGE_ALIGN(vrm->old_len);
vrm->new_len = PAGE_ALIGN(vrm->new_len);
- vrm_set_delta(vrm);
+
+ res = check_mremap_params(vrm);
+ if (res)
+ return res;
if (mmap_write_lock_killable(mm))
return -EINTR;
vrm->mmap_locked = true;
- vma = vrm->vma = vma_lookup(mm, vrm->addr);
- if (!vma) {
- ret = -EFAULT;
- goto out;
- }
-
- /* If mseal()'d, mremap() is prohibited. */
- if (!can_modify_vma(vma)) {
- ret = -EPERM;
- goto out;
- }
+ if (vrm_move_only(vrm)) {
+ res = remap_move(vrm);
+ } else {
+ vrm->vma = vma_lookup(current->mm, vrm->addr);
+ res = check_prep_vma(vrm);
+ if (res)
+ goto out;
- /* Align to hugetlb page size, if required. */
- if (is_vm_hugetlb_page(vma) && !align_hugetlb(vrm)) {
- ret = -EINVAL;
- goto out;
+ /* Actually execute mremap. */
+ res = vrm_implies_new_addr(vrm) ? mremap_to(vrm) : mremap_at(vrm);
}
- vrm->remap_type = vrm_remap_type(vrm);
-
- /* Actually execute mremap. */
- ret = vrm_implies_new_addr(vrm) ? mremap_to(vrm) : mremap_at(vrm);
-
out:
- if (vrm->mmap_locked) {
- mmap_write_unlock(mm);
- vrm->mmap_locked = false;
+ failed = IS_ERR_VALUE(res);
- if (!offset_in_page(ret) && vrm->mlocked && vrm->new_len > vrm->old_len)
- mm_populate(vrm->new_addr + vrm->old_len, vrm->delta);
- }
+ if (vrm->mmap_locked)
+ mmap_write_unlock(mm);
- userfaultfd_unmap_complete(mm, vrm->uf_unmap_early);
- mremap_userfaultfd_complete(vrm->uf, vrm->addr, ret, vrm->old_len);
- userfaultfd_unmap_complete(mm, vrm->uf_unmap);
+ /* VMA mlock'd + was expanded, so populate the expanded region. */
+ if (!failed && vrm->populate_expand)
+ mm_populate(vrm->new_addr + vrm->old_len, vrm->delta);
- return ret;
+ notify_uffd(vrm, failed);
+ return res;
}
/*
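The remap_move() loop above batches a move across every VMA in [addr, addr + old_len), preserving the inter-VMA gaps at the destination. A minimal userspace sketch of the call this enables follows; the helper name and layout are illustrative and not part of the patch:

#define _GNU_SOURCE
#include <sys/mman.h>
#include <stddef.h>

/*
 * Sketch only: relocate a span that covers more than one adjacent mapping
 * with a single call. The batched remap_move() path moves every VMA inside
 * [src, src + len) and keeps the gaps between them at the destination;
 * previously such a call failed with EFAULT at the first VMA boundary.
 */
static void *move_span(void *src, size_t len, void *dst)
{
	/* MREMAP_FIXED requires MREMAP_MAYMOVE. */
	return mremap(src, len, len, MREMAP_MAYMOVE | MREMAP_FIXED, dst);
}
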
diff --git a/mm/mseal.c b/mm/mseal.c
index c27197ac04e8..e5b205562d2e 100644
--- a/mm/mseal.c
+++ b/mm/mseal.c
@@ -11,148 +11,74 @@
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
-#include <linux/mmu_context.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include "internal.h"
-static inline void set_vma_sealed(struct vm_area_struct *vma)
-{
- vm_flags_set(vma, VM_SEALED);
-}
-
-static bool is_madv_discard(int behavior)
-{
- switch (behavior) {
- case MADV_FREE:
- case MADV_DONTNEED:
- case MADV_DONTNEED_LOCKED:
- case MADV_REMOVE:
- case MADV_DONTFORK:
- case MADV_WIPEONFORK:
- case MADV_GUARD_INSTALL:
- return true;
- }
-
- return false;
-}
-
-static bool is_ro_anon(struct vm_area_struct *vma)
-{
- /* check anonymous mapping. */
- if (vma->vm_file || vma->vm_flags & VM_SHARED)
- return false;
-
- /*
- * check for non-writable:
- * PROT=RO or PKRU is not writeable.
- */
- if (!(vma->vm_flags & VM_WRITE) ||
- !arch_vma_access_permitted(vma, true, false, false))
- return true;
-
- return false;
-}
-
/*
- * Check if a vma is allowed to be modified by madvise.
+ * mseal() disallows an input range which contains unmapped ranges (VMA holes).
+ *
+ * It disallows unmapped regions from start to end whether they exist at the
+ * start, in the middle, or at the end of the range, or any combination thereof.
+ *
+ * This is because after sealing a range, there's nothing to stop memory mapping
+ * of ranges in the remaining gaps later, meaning that the user might then
+ * wrongly consider the entirety of the mseal()'d range to be sealed when it
+ * in fact isn't.
*/
-bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
-{
- if (!is_madv_discard(behavior))
- return true;
-
- if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma)))
- return false;
-
- /* Allow by default. */
- return true;
-}
-
-static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
- struct vm_area_struct **prev, unsigned long start,
- unsigned long end, vm_flags_t newflags)
-{
- int ret = 0;
- vm_flags_t oldflags = vma->vm_flags;
-
- if (newflags == oldflags)
- goto out;
-
- vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
- if (IS_ERR(vma)) {
- ret = PTR_ERR(vma);
- goto out;
- }
-
- set_vma_sealed(vma);
-out:
- *prev = vma;
- return ret;
-}
/*
- * Check for do_mseal:
- * 1> start is part of a valid vma.
- * 2> end is part of a valid vma.
- * 3> No gap (unallocated address) between start and end.
- * 4> map is sealable.
+ * Does the [start, end) range contain any unmapped memory?
+ *
+ * We ensure that:
+ * - start is part of a valid VMA.
+ * - end is part of a valid VMA.
+ * - no gap (unallocated memory) exists between start and end.
*/
-static int check_mm_seal(unsigned long start, unsigned long end)
+static bool range_contains_unmapped(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
{
struct vm_area_struct *vma;
- unsigned long nstart = start;
-
+ unsigned long prev_end = start;
VMA_ITERATOR(vmi, current->mm, start);
- /* going through each vma to check. */
for_each_vma_range(vmi, vma, end) {
- if (vma->vm_start > nstart)
- /* unallocated memory found. */
- return -ENOMEM;
-
- if (vma->vm_end >= end)
- return 0;
+ if (vma->vm_start > prev_end)
+ return true;
- nstart = vma->vm_end;
+ prev_end = vma->vm_end;
}
- return -ENOMEM;
+ return prev_end < end;
}
-/*
- * Apply sealing.
- */
-static int apply_mm_seal(unsigned long start, unsigned long end)
+static int mseal_apply(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
{
- unsigned long nstart;
struct vm_area_struct *vma, *prev;
+ unsigned long curr_start = start;
+ VMA_ITERATOR(vmi, mm, start);
- VMA_ITERATOR(vmi, current->mm, start);
-
+ /* We know there are no gaps so this will be non-NULL. */
vma = vma_iter_load(&vmi);
- /*
- * Note: check_mm_seal should already checked ENOMEM case.
- * so vma should not be null, same for the other ENOMEM cases.
- */
prev = vma_prev(&vmi);
if (start > vma->vm_start)
prev = vma;
- nstart = start;
for_each_vma_range(vmi, vma, end) {
- int error;
- unsigned long tmp;
- vm_flags_t newflags;
+ unsigned long curr_end = MIN(vma->vm_end, end);
- newflags = vma->vm_flags | VM_SEALED;
- tmp = vma->vm_end;
- if (tmp > end)
- tmp = end;
- error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
- if (error)
- return error;
- nstart = vma_iter_end(&vmi);
+ if (!(vma->vm_flags & VM_SEALED)) {
+ vma = vma_modify_flags(&vmi, prev, vma,
+ curr_start, curr_end,
+ vma->vm_flags | VM_SEALED);
+ if (IS_ERR(vma))
+ return PTR_ERR(vma);
+ vm_flags_set(vma, VM_SEALED);
+ }
+
+ prev = vma;
+ curr_start = curr_end;
}
return 0;
@@ -240,14 +166,10 @@ int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
if (mmap_write_lock_killable(mm))
return -EINTR;
- /*
- * First pass, this helps to avoid
- * partial sealing in case of error in input address range,
- * e.g. ENOMEM error.
- */
- ret = check_mm_seal(start, end);
- if (ret)
+ if (range_contains_unmapped(mm, start, end)) {
+ ret = -ENOMEM;
goto out;
+ }
/*
* Second pass, this should success, unless there are errors
@@ -255,10 +177,10 @@ int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
* reaching the max supported VMAs, however, those cases shall
* be rare.
*/
- ret = apply_mm_seal(start, end);
+ ret = mseal_apply(mm, start, end);
out:
- mmap_write_unlock(current->mm);
+ mmap_write_unlock(mm);
return ret;
}
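range_contains_unmapped() rejects the whole request before mseal_apply() touches any VMA. A small userspace sketch of the resulting semantics; the wrapper below assumes kernel headers that define __NR_mseal and is not part of the patch:

#include <unistd.h>
#include <sys/syscall.h>

/* Sketch only: call mseal() directly in case libc has no wrapper yet. */
static int seal_range(void *addr, size_t len)
{
	return syscall(__NR_mseal, addr, len, 0UL);
}

A hole anywhere in [addr, addr + len) makes the call fail with ENOMEM and leaves every VMA untouched; a fully mapped range is then sealed VMA by VMA under the mmap write lock.
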
diff --git a/mm/nommu.c b/mm/nommu.c
index 617e7ba8022f..c3a23b082adb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -64,7 +64,7 @@ const struct vm_operations_struct generic_file_vm_ops = {
*/
unsigned int kobjsize(const void *objp)
{
- struct page *page;
+ struct folio *folio;
/*
* If the object we have should not have ksize performed on it,
@@ -73,22 +73,22 @@ unsigned int kobjsize(const void *objp)
if (!objp || !virt_addr_valid(objp))
return 0;
- page = virt_to_head_page(objp);
+ folio = virt_to_folio(objp);
/*
* If the allocator sets PageSlab, we know the pointer came from
* kmalloc().
*/
- if (PageSlab(page))
+ if (folio_test_slab(folio))
return ksize(objp);
/*
- * If it's not a compound page, see if we have a matching VMA
+ * If it's not a large folio, see if we have a matching VMA
* region. This test is intentionally done in reverse order,
* so if there's no VMA, we still fall through and hand back
- * PAGE_SIZE for 0-order pages.
+ * PAGE_SIZE for 0-order folios.
*/
- if (!PageCompound(page)) {
+ if (!folio_test_large(folio)) {
struct vm_area_struct *vma;
vma = find_vma(current->mm, (unsigned long)objp);
@@ -100,7 +100,7 @@ unsigned int kobjsize(const void *objp)
* The ksize() function is only guaranteed to work for pointers
* returned by kmalloc(). So handle arbitrary pointers here.
*/
- return page_size(page);
+ return folio_size(folio);
}
void vfree(const void *addr)
@@ -119,7 +119,8 @@ void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask)
}
EXPORT_SYMBOL(__vmalloc_noprof);
-void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
+void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align,
+ gfp_t flags, int node)
{
return krealloc_noprof(p, size, (flags | __GFP_COMP) & ~__GFP_HIGHMEM);
}
@@ -200,7 +201,23 @@ void *vmalloc_noprof(unsigned long size)
}
EXPORT_SYMBOL(vmalloc_noprof);
-void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc_noprof);
+/*
+ * vmalloc_huge_node - allocate virtually contiguous memory, on a node
+ *
+ * @size: allocation size
+ * @gfp_mask: flags for the page level allocator
+ * @node: node to use for allocation or NUMA_NO_NODE
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * Due to NOMMU implications the node argument and HUGE page attribute are
+ * ignored.
+ */
+void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node)
+{
+ return __vmalloc_noprof(size, gfp_mask);
+}
/*
* vzalloc - allocate virtually contiguous memory with zero fill
@@ -399,7 +416,8 @@ static const struct ctl_table nommu_table[] = {
};
/*
- * initialise the percpu counter for VM and region record slabs
+ * initialise the percpu counter for VM and region record slabs, initialise VMA
+ * state.
*/
void __init mmap_init(void)
{
@@ -409,6 +427,7 @@ void __init mmap_init(void)
VM_BUG_ON(ret);
vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
register_sysctl_init("vm", nommu_table);
+ vma_state_init();
}
/*
@@ -627,22 +646,6 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
EXPORT_SYMBOL(find_vma);
/*
- * At least xtensa ends up having protection faults even with no
- * MMU.. No stack expansion, at least.
- */
-struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
- unsigned long addr, struct pt_regs *regs)
-{
- struct vm_area_struct *vma;
-
- mmap_read_lock(mm);
- vma = vma_lookup(mm, addr);
- if (!vma)
- mmap_read_unlock(mm);
- return vma;
-}
-
-/*
* expand a stack to a given address
* - not supported under NOMMU conditions
*/
@@ -717,7 +720,7 @@ static int validate_mmap_request(struct file *file,
if (file) {
/* files must support mmap */
- if (!file->f_op->mmap)
+ if (!can_mmap_file(file))
return -ENODEV;
/* work out if what we've got could possibly be shared
@@ -842,12 +845,12 @@ static int validate_mmap_request(struct file *file,
* we've determined that we can make the mapping, now translate what we
* now know into VMA flags
*/
-static unsigned long determine_vm_flags(struct file *file,
- unsigned long prot,
- unsigned long flags,
- unsigned long capabilities)
+static vm_flags_t determine_vm_flags(struct file *file,
+ unsigned long prot,
+ unsigned long flags,
+ unsigned long capabilities)
{
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(file, flags);
@@ -1890,3 +1893,11 @@ static int __meminit init_admin_reserve(void)
return 0;
}
subsys_initcall(init_admin_reserve);
+
+int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+ mmap_write_lock(oldmm);
+ dup_mm_exe_file(mm, oldmm);
+ mmap_write_unlock(oldmm);
+ return 0;
+}
diff --git a/mm/numa.c b/mm/numa.c
index f1787d7713a6..7d5e06fe5bd4 100644
--- a/mm/numa.c
+++ b/mm/numa.c
@@ -13,7 +13,6 @@ void __init alloc_node_data(int nid)
{
const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
u64 nd_pa;
- void *nd;
int tnid;
/* Allocate node data. Try node-local memory and then any node. */
@@ -21,7 +20,6 @@ void __init alloc_node_data(int nid)
if (!nd_pa)
panic("Cannot allocate %zu bytes for node %d data\n",
nd_size, nid);
- nd = __va(nd_pa);
/* report and initialize */
pr_info("NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
@@ -30,7 +28,7 @@ void __init alloc_node_data(int nid)
if (tnid != nid)
pr_info(" NODE_DATA(%d) on node %d\n", nid, tnid);
- node_data[nid] = nd;
+ node_data[nid] = __va(nd_pa);
memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
}
diff --git a/mm/numa_emulation.c b/mm/numa_emulation.c
index 9d55679d99ce..703c8fa05048 100644
--- a/mm/numa_emulation.c
+++ b/mm/numa_emulation.c
@@ -73,7 +73,7 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei,
}
printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
- nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
+ nid, eb->start, eb->end - 1, (eb->end - eb->start) / SZ_1M);
return 0;
}
@@ -264,7 +264,7 @@ static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei,
min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE);
if (size < min_size) {
pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
- size >> 20, min_size >> 20);
+ size / SZ_1M, min_size / SZ_1M);
size = min_size;
}
size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE);
diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c
index ff4054f4334d..5b009a9cd8b4 100644
--- a/mm/numa_memblks.c
+++ b/mm/numa_memblks.c
@@ -76,7 +76,7 @@ static int __init numa_alloc_distance(void)
for (j = 0; j < cnt; j++)
numa_distance[i * cnt + j] = i == j ?
LOCAL_DISTANCE : REMOTE_DISTANCE;
- printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
+ pr_debug("NUMA: Initialized distance table, cnt=%d\n", cnt);
return 0;
}
@@ -201,6 +201,28 @@ int __init numa_add_memblk(int nid, u64 start, u64 end)
}
/**
+ * numa_add_reserved_memblk - Add one numa_memblk to numa_reserved_meminfo
+ * @nid: NUMA node ID of the new memblk
+ * @start: Start address of the new memblk
+ * @end: End address of the new memblk
+ *
+ * Add a new memblk to the numa_reserved_meminfo.
+ *
+ * Use case: numa_cleanup_meminfo() reconciles all numa_memblk instances
+ * against memblock_type information and moves any that intersect reserved
+ * ranges to numa_reserved_meminfo. However, when that information is known
+ * ahead of time, we use numa_add_reserved_memblk() to add the numa_memblk
+ * to numa_reserved_meminfo directly.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init numa_add_reserved_memblk(int nid, u64 start, u64 end)
+{
+ return numa_add_memblk_to(nid, start, end, &numa_reserved_meminfo);
+}
+
+/**
* numa_cleanup_meminfo - Cleanup a numa_meminfo
* @mi: numa_meminfo to clean up
*
@@ -405,9 +427,9 @@ static int __init numa_register_meminfo(struct numa_meminfo *mi)
unsigned long pfn_align = node_map_pfn_alignment();
if (pfn_align && pfn_align < PAGES_PER_SECTION) {
- unsigned long node_align_mb = PFN_PHYS(pfn_align) >> 20;
+ unsigned long node_align_mb = PFN_PHYS(pfn_align) / SZ_1M;
- unsigned long sect_align_mb = PFN_PHYS(PAGES_PER_SECTION) >> 20;
+ unsigned long sect_align_mb = PFN_PHYS(PAGES_PER_SECTION) / SZ_1M;
pr_warn("Node alignment %luMB < min %luMB, rejecting NUMA config\n",
node_align_mb, sect_align_mb);
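numa_add_reserved_memblk() is meant for code that already knows a range is reserved before numa_cleanup_meminfo() reconciles the memblks. A hedged sketch of such a caller; the function name, node and addresses below are hypothetical:

#include <linux/numa_memblks.h>
#include <linux/init.h>
#include <linux/types.h>

/* Hypothetical early-boot caller; node 0 and the range below are made up. */
static int __init reserve_example_range(void)
{
	u64 start = 0x100000000ULL;	/* illustrative start */
	u64 end   = 0x140000000ULL;	/* illustrative end */

	/* Record the already-known reserved span before numa_cleanup_meminfo() runs. */
	return numa_add_reserved_memblk(0, start, end);
}
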
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 25923cfec9c6..c145b0feecc1 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/oom_kill.c
- *
+ *
* Copyright (C) 1998,2000 Rik van Riel
* Thanks go out to Claus Fischer for some serious inspiration and
* for goading me into coding this file...
@@ -218,7 +218,7 @@ long oom_badness(struct task_struct *p, unsigned long totalpages)
*/
adj = (long)p->signal->oom_score_adj;
if (adj == OOM_SCORE_ADJ_MIN ||
- test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
+ mm_flags_test(MMF_OOM_SKIP, p->mm) ||
in_vfork(p)) {
task_unlock(p);
return LONG_MIN;
@@ -325,7 +325,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
* any memory is quite low.
*/
if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
- if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
+ if (mm_flags_test(MMF_OOM_SKIP, task->signal->oom_mm))
goto next;
goto abort;
}
@@ -490,12 +490,12 @@ static bool oom_killer_disabled __read_mostly;
* task's threads: if one of those is using this mm then this task was also
* using it.
*/
-bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
+bool process_shares_mm(const struct task_struct *p, const struct mm_struct *mm)
{
- struct task_struct *t;
+ const struct task_struct *t;
for_each_thread(p, t) {
- struct mm_struct *t_mm = READ_ONCE(t->mm);
+ const struct mm_struct *t_mm = READ_ONCE(t->mm);
if (t_mm)
return t_mm == mm;
}
@@ -516,7 +516,7 @@ static bool __oom_reap_task_mm(struct mm_struct *mm)
{
struct vm_area_struct *vma;
bool ret = true;
- VMA_ITERATOR(vmi, mm, 0);
+ MA_STATE(mas, &mm->mm_mt, ULONG_MAX, ULONG_MAX);
/*
* Tell all users of get_user/copy_from_user etc... that the content
@@ -524,9 +524,15 @@ static bool __oom_reap_task_mm(struct mm_struct *mm)
* should imply barriers already and the reader would hit a page fault
* if it stumbled over a reaped memory.
*/
- set_bit(MMF_UNSTABLE, &mm->flags);
+ mm_flags_set(MMF_UNSTABLE, mm);
- for_each_vma(vmi, vma) {
+ /*
+ * It might start racing with the dying task and compete for shared
+ * resources - e.g. page table lock contention has been observed.
+ * Reduce those races by reaping the oom victim from the other end
+ * of the address space.
+ */
+ mas_for_each_rev(&mas, vma, 0) {
if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP))
continue;
@@ -583,7 +589,7 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
* under mmap_lock for reading because it serializes against the
* mmap_write_lock();mmap_write_unlock() cycle in exit_mmap().
*/
- if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
+ if (mm_flags_test(MMF_OOM_SKIP, mm)) {
trace_skip_task_reaping(tsk->pid);
goto out_unlock;
}
@@ -619,7 +625,7 @@ static void oom_reap_task(struct task_struct *tsk)
schedule_timeout_idle(HZ/10);
if (attempts <= MAX_OOM_REAP_RETRIES ||
- test_bit(MMF_OOM_SKIP, &mm->flags))
+ mm_flags_test(MMF_OOM_SKIP, mm))
goto done;
pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
@@ -634,7 +640,7 @@ done:
* Hide this mm from OOM killer because it has been either reaped or
* somebody can't call mmap_write_unlock(mm).
*/
- set_bit(MMF_OOM_SKIP, &mm->flags);
+ mm_flags_set(MMF_OOM_SKIP, mm);
/* Drop a reference taken by queue_oom_reaper */
put_task_struct(tsk);
@@ -670,7 +676,7 @@ static void wake_oom_reaper(struct timer_list *timer)
unsigned long flags;
/* The victim managed to terminate on its own - see exit_mmap */
- if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
+ if (mm_flags_test(MMF_OOM_SKIP, mm)) {
put_task_struct(tsk);
return;
}
@@ -695,7 +701,7 @@ static void wake_oom_reaper(struct timer_list *timer)
static void queue_oom_reaper(struct task_struct *tsk)
{
/* mm is already queued? */
- if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
+ if (mm_flags_test_and_set(MMF_OOM_REAP_QUEUED, tsk->signal->oom_mm))
return;
get_task_struct(tsk);
@@ -772,12 +778,12 @@ static void mark_oom_victim(struct task_struct *tsk)
mmgrab(tsk->signal->oom_mm);
/*
- * Make sure that the task is woken up from uninterruptible sleep
- * if it is frozen because OOM killer wouldn't be able to free
- * any memory and livelock. freezing_slow_path will tell the freezer
- * that TIF_MEMDIE tasks should be ignored.
+ * Make sure that the process is woken up from uninterruptible sleep
+ * if it is frozen, because the OOM killer wouldn't be able to free any
+ * memory and would livelock. The freezer will thaw the tasks that are
+ * OOM victims regardless of the PM freezing and cgroup freezing states.
*/
- __thaw_task(tsk);
+ thaw_process(tsk);
atomic_inc(&oom_victims);
cred = get_task_cred(tsk);
trace_mark_victim(tsk, cred->uid.val);
@@ -892,7 +898,7 @@ static bool task_will_free_mem(struct task_struct *task)
* This task has already been drained by the oom reaper so there are
* only small chances it will free some more
*/
- if (test_bit(MMF_OOM_SKIP, &mm->flags))
+ if (mm_flags_test(MMF_OOM_SKIP, mm))
return false;
if (atomic_read(&mm->mm_users) <= 1)
@@ -977,7 +983,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
continue;
if (is_global_init(p)) {
can_oom_reap = false;
- set_bit(MMF_OOM_SKIP, &mm->flags);
+ mm_flags_set(MMF_OOM_SKIP, mm);
pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
task_pid_nr(victim), victim->comm,
task_pid_nr(p), p->comm);
@@ -1235,7 +1241,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
reap = true;
else {
/* Error only if the work has not been done already */
- if (!test_bit(MMF_OOM_SKIP, &mm->flags))
+ if (!mm_flags_test(MMF_OOM_SKIP, mm))
ret = -EINVAL;
}
task_unlock(p);
@@ -1251,7 +1257,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
* Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure
* possible change in exit_mmap is seen
*/
- if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm))
+ if (!mm_flags_test(MMF_OOM_SKIP, mm) && !__oom_reap_task_mm(mm))
ret = -EAGAIN;
mmap_read_unlock(mm);
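__oom_reap_task_mm() now seeds the maple state at ULONG_MAX and walks the victim's VMAs backwards to reduce contention with the exiting task. A minimal sketch of that reverse-iteration pattern on its own, assuming the MA_STATE()/mas_for_each_rev() helpers used in the hunk above:

#include <linux/maple_tree.h>
#include <linux/mm_types.h>

/* Sketch only: visit an mm's VMAs from the highest address down to 0. */
static void walk_vmas_reverse(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	MA_STATE(mas, &mm->mm_mt, ULONG_MAX, ULONG_MAX);

	mas_for_each_rev(&mas, vma, 0) {
		/* Highest-addressed VMAs are seen first, mirroring the reaper. */
	}
}
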
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index c81624bc3969..757bc4d3b5b5 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -38,6 +38,7 @@
#include <linux/sched/rt.h>
#include <linux/sched/signal.h>
#include <linux/mm_inline.h>
+#include <linux/shmem_fs.h>
#include <trace/events/writeback.h>
#include "internal.h"
@@ -520,8 +521,8 @@ static int dirty_ratio_handler(const struct ctl_table *table, int write, void *b
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
- writeback_set_ratelimit();
vm_dirty_bytes = 0;
+ writeback_set_ratelimit();
}
return ret;
}
@@ -607,7 +608,7 @@ EXPORT_SYMBOL_GPL(wb_writeout_inc);
*/
static void writeout_period(struct timer_list *t)
{
- struct wb_domain *dom = from_timer(dom, t, period_timer);
+ struct wb_domain *dom = timer_container_of(dom, t, period_timer);
int miss_periods = (jiffies - dom->period_time) /
VM_COMPLETIONS_PERIOD_LEN;
@@ -1100,9 +1101,7 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc)
* such filesystems balance_dirty_pages always checks wb counters
* against wb limits. Even if global "nr_dirty" is under "freerun".
* This is especially important for fuse which sets bdi->max_ratio to
- * 1% by default. Without strictlimit feature, fuse writeback may
- * consume arbitrary amount of RAM because it is accounted in
- * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
+ * 1% by default.
*
* Here, in wb_position_ratio(), we calculate pos_ratio based on
* two values: wb_dirty and wb_thresh. Let's consider an example:
@@ -2202,7 +2201,7 @@ static int dirty_writeback_centisecs_handler(const struct ctl_table *table, int
void laptop_mode_timer_fn(struct timer_list *t)
{
struct backing_dev_info *backing_dev_info =
- from_timer(backing_dev_info, t, laptop_mode_wb_timer);
+ timer_container_of(backing_dev_info, t, laptop_mode_wb_timer);
wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER);
}
@@ -2564,11 +2563,11 @@ struct folio *writeback_iter(struct address_space *mapping,
if (!folio) {
/*
* To avoid deadlocks between range_cyclic writeback and callers
- * that hold pages in PageWriteback to aggregate I/O until
+ * that hold folios in writeback to aggregate I/O until
* the writeback iteration finishes, we do not loop back to the
- * start of the file. Doing so causes a page lock/page
+ * start of the file. Doing so causes a folio lock/folio
* writeback access order inversion - we should only ever lock
- * multiple pages in ascending page->index order, and looping
+ * multiple folios in ascending folio->index order, and looping
* back to the start of the file violates that rule and causes
* deadlocks.
*/
@@ -2591,57 +2590,6 @@ done:
}
EXPORT_SYMBOL_GPL(writeback_iter);
-/**
- * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
- * @mapping: address space structure to write
- * @wbc: subtract the number of written pages from *@wbc->nr_to_write
- * @writepage: function called for each page
- * @data: data passed to writepage function
- *
- * Return: %0 on success, negative error code otherwise
- *
- * Note: please use writeback_iter() instead.
- */
-int write_cache_pages(struct address_space *mapping,
- struct writeback_control *wbc, writepage_t writepage,
- void *data)
-{
- struct folio *folio = NULL;
- int error;
-
- while ((folio = writeback_iter(mapping, wbc, folio, &error))) {
- error = writepage(folio, wbc, data);
- if (error == AOP_WRITEPAGE_ACTIVATE) {
- folio_unlock(folio);
- error = 0;
- }
- }
-
- return error;
-}
-EXPORT_SYMBOL(write_cache_pages);
-
-static int writeback_use_writepage(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- struct folio *folio = NULL;
- struct blk_plug plug;
- int err;
-
- blk_start_plug(&plug);
- while ((folio = writeback_iter(mapping, wbc, folio, &err))) {
- err = mapping->a_ops->writepage(&folio->page, wbc);
- if (err == AOP_WRITEPAGE_ACTIVATE) {
- folio_unlock(folio);
- err = 0;
- }
- mapping_set_error(mapping, err);
- }
- blk_finish_plug(&plug);
-
- return err;
-}
-
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
@@ -2652,14 +2600,11 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
wb = inode_to_wb_wbc(mapping->host, wbc);
wb_bandwidth_estimate_start(wb);
while (1) {
- if (mapping->a_ops->writepages) {
+ if (mapping->a_ops->writepages)
ret = mapping->a_ops->writepages(mapping, wbc);
- } else if (mapping->a_ops->writepage) {
- ret = writeback_use_writepage(mapping, wbc);
- } else {
+ else
/* deal with chardevs and other special files */
ret = 0;
- }
if (ret != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL)
break;
@@ -2760,12 +2705,18 @@ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping,
{
unsigned long flags;
+ /*
+ * Shmem writeback relies on swap, and swap writeback is LRU based,
+ * not using the dirty mark.
+ */
+ VM_WARN_ON_ONCE(folio_test_swapcache(folio) || shmem_mapping(mapping));
+
xa_lock_irqsave(&mapping->i_pages, flags);
if (folio->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !folio_test_uptodate(folio));
folio_account_dirtied(folio, mapping);
- __xa_set_mark(&mapping->i_pages, folio_index(folio),
- PAGECACHE_TAG_DIRTY);
+ __xa_set_mark(&mapping->i_pages, folio->index,
+ PAGECACHE_TAG_DIRTY);
}
xa_unlock_irqrestore(&mapping->i_pages, flags);
}
@@ -3039,26 +2990,23 @@ bool __folio_end_writeback(struct folio *folio)
if (mapping && mapping_use_writeback_tags(mapping)) {
struct inode *inode = mapping->host;
- struct backing_dev_info *bdi = inode_to_bdi(inode);
+ struct bdi_writeback *wb;
unsigned long flags;
xa_lock_irqsave(&mapping->i_pages, flags);
ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback);
- __xa_clear_mark(&mapping->i_pages, folio_index(folio),
+ __xa_clear_mark(&mapping->i_pages, folio->index,
PAGECACHE_TAG_WRITEBACK);
- if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
- struct bdi_writeback *wb = inode_to_wb(inode);
- wb_stat_mod(wb, WB_WRITEBACK, -nr);
- __wb_writeout_add(wb, nr);
- if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
- wb_inode_writeback_end(wb);
+ wb = inode_to_wb(inode);
+ wb_stat_mod(wb, WB_WRITEBACK, -nr);
+ __wb_writeout_add(wb, nr);
+ if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
+ wb_inode_writeback_end(wb);
+ if (mapping->host)
+ sb_clear_inode_writeback(mapping->host);
}
- if (mapping->host && !mapping_tagged(mapping,
- PAGECACHE_TAG_WRITEBACK))
- sb_clear_inode_writeback(mapping->host);
-
xa_unlock_irqrestore(&mapping->i_pages, flags);
} else {
ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback);
@@ -3081,9 +3029,9 @@ void __folio_start_writeback(struct folio *folio, bool keep_write)
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
if (mapping && mapping_use_writeback_tags(mapping)) {
- XA_STATE(xas, &mapping->i_pages, folio_index(folio));
+ XA_STATE(xas, &mapping->i_pages, folio->index);
struct inode *inode = mapping->host;
- struct backing_dev_info *bdi = inode_to_bdi(inode);
+ struct bdi_writeback *wb;
unsigned long flags;
bool on_wblist;
@@ -3094,21 +3042,19 @@ void __folio_start_writeback(struct folio *folio, bool keep_write)
on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
- if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
- struct bdi_writeback *wb = inode_to_wb(inode);
-
- wb_stat_mod(wb, WB_WRITEBACK, nr);
- if (!on_wblist)
- wb_inode_writeback_start(wb);
+ wb = inode_to_wb(inode);
+ wb_stat_mod(wb, WB_WRITEBACK, nr);
+ if (!on_wblist) {
+ wb_inode_writeback_start(wb);
+ /*
+ * We can come through here when swapping anonymous
+ * folios, so we don't necessarily have an inode to
+ * track for sync.
+ */
+ if (mapping->host)
+ sb_mark_inode_writeback(mapping->host);
}
- /*
- * We can come through here when swapping anonymous
- * folios, so we don't necessarily have an inode to
- * track for sync.
- */
- if (mapping->host && !on_wblist)
- sb_mark_inode_writeback(mapping->host);
if (!folio_test_dirty(folio))
xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
if (!keep_write)
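With write_cache_pages() and the ->writepage fallback removed, address spaces are expected to implement ->writepages on top of writeback_iter(). A sketch mirroring the removed helper's loop; my_write_folio() stands in for the filesystem's own write path and is not a real API:

#include <linux/fs.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>

/* my_write_folio() is a hypothetical per-folio write routine. */
static int my_writepages(struct address_space *mapping,
			 struct writeback_control *wbc)
{
	struct folio *folio = NULL;
	int error = 0;

	while ((folio = writeback_iter(mapping, wbc, folio, &error))) {
		error = my_write_folio(folio, wbc);
		if (error == AOP_WRITEPAGE_ACTIVATE) {
			/* Folio was deferred; unlock it and keep iterating. */
			folio_unlock(folio);
			error = 0;
		}
	}
	return error;
}
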
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8258349e49ac..600d9e981c23 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -353,81 +353,224 @@ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
}
+static __always_inline bool is_standalone_pb_bit(enum pageblock_bits pb_bit)
+{
+ return pb_bit >= PB_compact_skip && pb_bit < __NR_PAGEBLOCK_BITS;
+}
+
+static __always_inline void
+get_pfnblock_bitmap_bitidx(const struct page *page, unsigned long pfn,
+ unsigned long **bitmap_word, unsigned long *bitidx)
+{
+ unsigned long *bitmap;
+ unsigned long word_bitidx;
+
+#ifdef CONFIG_MEMORY_ISOLATION
+ BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 8);
+#else
+ BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
+#endif
+ BUILD_BUG_ON(__MIGRATE_TYPE_END > MIGRATETYPE_MASK);
+ VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
+
+ bitmap = get_pageblock_bitmap(page, pfn);
+ *bitidx = pfn_to_bitidx(page, pfn);
+ word_bitidx = *bitidx / BITS_PER_LONG;
+ *bitidx &= (BITS_PER_LONG - 1);
+ *bitmap_word = &bitmap[word_bitidx];
+}
+
/**
- * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
+ * __get_pfnblock_flags_mask - Return the requested group of flags for
+ * a pageblock_nr_pages block of pages
* @page: The page within the block of interest
* @pfn: The target page frame number
* @mask: mask of bits that the caller is interested in
*
* Return: pageblock_bits flags
*/
-unsigned long get_pfnblock_flags_mask(const struct page *page,
- unsigned long pfn, unsigned long mask)
+static unsigned long __get_pfnblock_flags_mask(const struct page *page,
+ unsigned long pfn,
+ unsigned long mask)
{
- unsigned long *bitmap;
- unsigned long bitidx, word_bitidx;
+ unsigned long *bitmap_word;
+ unsigned long bitidx;
unsigned long word;
- bitmap = get_pageblock_bitmap(page, pfn);
- bitidx = pfn_to_bitidx(page, pfn);
- word_bitidx = bitidx / BITS_PER_LONG;
- bitidx &= (BITS_PER_LONG-1);
+ get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
/*
- * This races, without locks, with set_pfnblock_flags_mask(). Ensure
+ * This races, without locks, with set_pfnblock_migratetype(). Ensure
* a consistent read of the memory array, so that results, even though
* racy, are not corrupted.
*/
- word = READ_ONCE(bitmap[word_bitidx]);
+ word = READ_ONCE(*bitmap_word);
return (word >> bitidx) & mask;
}
-static __always_inline int get_pfnblock_migratetype(const struct page *page,
- unsigned long pfn)
+/**
+ * get_pfnblock_bit - Check if a standalone bit of a pageblock is set
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @pb_bit: pageblock bit to check
+ *
+ * Return: true if the bit is set, otherwise false
+ */
+bool get_pfnblock_bit(const struct page *page, unsigned long pfn,
+ enum pageblock_bits pb_bit)
{
- return get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
+ unsigned long *bitmap_word;
+ unsigned long bitidx;
+
+ if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
+ return false;
+
+ get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
+
+ return test_bit(bitidx + pb_bit, bitmap_word);
}
/**
- * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
+ * get_pfnblock_migratetype - Return the migratetype of a pageblock
* @page: The page within the block of interest
- * @flags: The flags to set
* @pfn: The target page frame number
- * @mask: mask of bits that the caller is interested in
+ *
+ * Return: The migratetype of the pageblock
+ *
+ * Use get_pfnblock_migratetype() if caller already has both @page and @pfn
+ * to save a call to page_to_pfn().
*/
-void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
- unsigned long pfn,
- unsigned long mask)
+__always_inline enum migratetype
+get_pfnblock_migratetype(const struct page *page, unsigned long pfn)
{
- unsigned long *bitmap;
- unsigned long bitidx, word_bitidx;
- unsigned long word;
+ unsigned long mask = MIGRATETYPE_AND_ISO_MASK;
+ unsigned long flags;
- BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
- BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
+ flags = __get_pfnblock_flags_mask(page, pfn, mask);
- bitmap = get_pageblock_bitmap(page, pfn);
- bitidx = pfn_to_bitidx(page, pfn);
- word_bitidx = bitidx / BITS_PER_LONG;
- bitidx &= (BITS_PER_LONG-1);
+#ifdef CONFIG_MEMORY_ISOLATION
+ if (flags & BIT(PB_migrate_isolate))
+ return MIGRATE_ISOLATE;
+#endif
+ return flags & MIGRATETYPE_MASK;
+}
- VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
+/**
+ * __set_pfnblock_flags_mask - Set the requested group of flags for
+ * a pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @flags: The flags to set
+ * @mask: mask of bits that the caller is interested in
+ */
+static void __set_pfnblock_flags_mask(struct page *page, unsigned long pfn,
+ unsigned long flags, unsigned long mask)
+{
+ unsigned long *bitmap_word;
+ unsigned long bitidx;
+ unsigned long word;
+
+ get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
mask <<= bitidx;
flags <<= bitidx;
- word = READ_ONCE(bitmap[word_bitidx]);
+ word = READ_ONCE(*bitmap_word);
do {
- } while (!try_cmpxchg(&bitmap[word_bitidx], &word, (word & ~mask) | flags));
+ } while (!try_cmpxchg(bitmap_word, &word, (word & ~mask) | flags));
}
-void set_pageblock_migratetype(struct page *page, int migratetype)
+/**
+ * set_pfnblock_bit - Set a standalone bit of a pageblock
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @pb_bit: pageblock bit to set
+ */
+void set_pfnblock_bit(const struct page *page, unsigned long pfn,
+ enum pageblock_bits pb_bit)
{
+ unsigned long *bitmap_word;
+ unsigned long bitidx;
+
+ if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
+ return;
+
+ get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
+
+ set_bit(bitidx + pb_bit, bitmap_word);
+}
+
+/**
+ * clear_pfnblock_bit - Clear a standalone bit of a pageblock
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @pb_bit: pageblock bit to clear
+ */
+void clear_pfnblock_bit(const struct page *page, unsigned long pfn,
+ enum pageblock_bits pb_bit)
+{
+ unsigned long *bitmap_word;
+ unsigned long bitidx;
+
+ if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
+ return;
+
+ get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
+
+ clear_bit(bitidx + pb_bit, bitmap_word);
+}
+
+/**
+ * set_pageblock_migratetype - Set the migratetype of a pageblock
+ * @page: The page within the block of interest
+ * @migratetype: migratetype to set
+ */
+static void set_pageblock_migratetype(struct page *page,
+ enum migratetype migratetype)
+{
+ if (unlikely(page_group_by_mobility_disabled &&
+ migratetype < MIGRATE_PCPTYPES))
+ migratetype = MIGRATE_UNMOVABLE;
+
+#ifdef CONFIG_MEMORY_ISOLATION
+ if (migratetype == MIGRATE_ISOLATE) {
+ VM_WARN_ONCE(1,
+ "Use set_pageblock_isolate() for pageblock isolation");
+ return;
+ }
+ VM_WARN_ONCE(get_pageblock_isolate(page),
+ "Use clear_pageblock_isolate() to unisolate pageblock");
+ /* MIGRATETYPE_AND_ISO_MASK clears PB_migrate_isolate if it is set */
+#endif
+ __set_pfnblock_flags_mask(page, page_to_pfn(page),
+ (unsigned long)migratetype,
+ MIGRATETYPE_AND_ISO_MASK);
+}
+
+void __meminit init_pageblock_migratetype(struct page *page,
+ enum migratetype migratetype,
+ bool isolate)
+{
+ unsigned long flags;
+
if (unlikely(page_group_by_mobility_disabled &&
migratetype < MIGRATE_PCPTYPES))
migratetype = MIGRATE_UNMOVABLE;
- set_pfnblock_flags_mask(page, (unsigned long)migratetype,
- page_to_pfn(page), MIGRATETYPE_MASK);
+ flags = migratetype;
+
+#ifdef CONFIG_MEMORY_ISOLATION
+ if (migratetype == MIGRATE_ISOLATE) {
+ VM_WARN_ONCE(
+ 1,
+ "Set isolate=true to isolate pageblock with a migratetype");
+ return;
+ }
+ if (isolate)
+ flags |= BIT(PB_migrate_isolate);
+#endif
+ __set_pfnblock_flags_mask(page, page_to_pfn(page), flags,
+ MIGRATETYPE_AND_ISO_MASK);
}
#ifdef CONFIG_DEBUG_VM
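The standalone-bit helpers above reduce to test_bit()/set_bit()/clear_bit() on the block's bitmap word, independent of the migratetype bits. A hedged sketch of a caller flipping PB_compact_skip, the first standalone bit named in is_standalone_pb_bit():

#include <linux/pageblock-flags.h>
#include <linux/mm.h>

/* Sketch only: set or clear the standalone PB_compact_skip bit of @page's block. */
static void note_block_skip(struct page *page, bool skip)
{
	unsigned long pfn = page_to_pfn(page);

	if (skip)
		set_pfnblock_bit(page, pfn, PB_compact_skip);
	else
		clear_pfnblock_bit(page, pfn, PB_compact_skip);
}

get_pfnblock_bit(page, pfn, PB_compact_skip) reads the bit back without touching the migratetype bits in the same word.
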
@@ -653,7 +796,7 @@ static inline void account_freepages(struct zone *zone, int nr_pages,
if (is_migrate_cma(migratetype))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
- else if (is_migrate_highatomic(migratetype))
+ else if (migratetype == MIGRATE_HIGHATOMIC)
WRITE_ONCE(zone->nr_free_highatomic,
zone->nr_free_highatomic + nr_pages);
}
@@ -667,7 +810,7 @@ static inline void __add_to_free_list(struct page *page, struct zone *zone,
int nr_pages = 1 << order;
VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
- "page type is %lu, passed migratetype is %d (nr=%d)\n",
+ "page type is %d, passed migratetype is %d (nr=%d)\n",
get_pageblock_migratetype(page), migratetype, nr_pages);
if (tail)
@@ -693,7 +836,7 @@ static inline void move_to_free_list(struct page *page, struct zone *zone,
/* Free page moving can fail, so it happens before the type update */
VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt,
- "page type is %lu, passed migratetype is %d (nr=%d)\n",
+ "page type is %d, passed migratetype is %d (nr=%d)\n",
get_pageblock_migratetype(page), old_mt, nr_pages);
list_move_tail(&page->buddy_list, &area->free_list[new_mt]);
@@ -715,7 +858,7 @@ static inline void __del_page_from_free_list(struct page *page, struct zone *zon
int nr_pages = 1 << order;
VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
- "page type is %lu, passed migratetype is %d (nr=%d)\n",
+ "page type is %d, passed migratetype is %d (nr=%d)\n",
get_pageblock_migratetype(page), migratetype, nr_pages);
/* clear reported state and update reported page count */
@@ -806,7 +949,7 @@ static inline void __free_one_page(struct page *page,
bool to_tail;
VM_BUG_ON(!zone_is_initialized(zone));
- VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
+ VM_BUG_ON_PAGE(page->flags.f & PAGE_FLAGS_CHECK_AT_PREP, page);
VM_BUG_ON(migratetype == -1);
VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
@@ -898,10 +1041,8 @@ static inline bool page_expected_state(struct page *page,
#ifdef CONFIG_MEMCG
page->memcg_data |
#endif
-#ifdef CONFIG_PAGE_POOL
- ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) |
-#endif
- (page->flags & check_flags)))
+ page_pool_page_is_pp(page) |
+ (page->flags.f & check_flags)))
return false;
return true;
@@ -917,7 +1058,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
bad_reason = "non-NULL mapping";
if (unlikely(page_ref_count(page) != 0))
bad_reason = "nonzero _refcount";
- if (unlikely(page->flags & flags)) {
+ if (unlikely(page->flags.f & flags)) {
if (flags == PAGE_FLAGS_CHECK_AT_PREP)
bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
else
@@ -927,26 +1068,18 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
if (unlikely(page->memcg_data))
bad_reason = "page still charged to cgroup";
#endif
-#ifdef CONFIG_PAGE_POOL
- if (unlikely((page->pp_magic & ~0x3UL) == PP_SIGNATURE))
+ if (unlikely(page_pool_page_is_pp(page)))
bad_reason = "page_pool leak";
-#endif
return bad_reason;
}
-static void free_page_is_bad_report(struct page *page)
-{
- bad_page(page,
- page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
-}
-
static inline bool free_page_is_bad(struct page *page)
{
if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
return false;
/* Something has gone sideways, find it */
- free_page_is_bad_report(page);
+ bad_page(page, page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
return true;
}
@@ -1224,7 +1357,7 @@ __always_inline bool free_pages_prepare(struct page *page,
int i;
if (compound) {
- page[1].flags &= ~PAGE_FLAGS_SECOND;
+ page[1].flags.f &= ~PAGE_FLAGS_SECOND;
#ifdef NR_PAGES_IN_LARGE_FOLIO
folio->_nr_pages = 0;
#endif
@@ -1238,14 +1371,17 @@ __always_inline bool free_pages_prepare(struct page *page,
continue;
}
}
- (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ (page + i)->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
}
}
- if (PageMappingFlags(page)) {
- if (PageAnon(page))
- mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
- page->mapping = NULL;
+ if (folio_test_anon(folio)) {
+ mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
+ folio->mapping = NULL;
}
+ if (unlikely(page_has_type(page)))
+ /* Reset the page_type (which overlays _mapcount) */
+ page->page_type = UINT_MAX;
+
if (is_check_pages_enabled()) {
if (free_page_is_bad(page))
bad++;
@@ -1254,7 +1390,7 @@ __always_inline bool free_pages_prepare(struct page *page,
}
page_cpupid_reset_last(page);
- page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ page->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
reset_page_owner(page, order);
page_table_check_free(page, order);
pgalloc_tag_sub(page, 1 << order);
@@ -1384,7 +1520,7 @@ static void add_page_to_zone_llist(struct zone *zone, struct page *page,
unsigned int order)
{
/* Remember the order */
- page->order = order;
+ page->private = order;
/* Add the page to the free list */
llist_add(&page->pcp_llist, &zone->trylock_free_pages);
}
@@ -1413,7 +1549,7 @@ static void free_one_page(struct zone *zone, struct page *page,
llnode = llist_del_all(llhead);
llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) {
- unsigned int p_order = p->order;
+ unsigned int p_order = p->private;
split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags);
__count_vm_events(PGFREE, 1 << p_order);
@@ -1791,8 +1927,8 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
#endif
/*
- * Change the type of a block and move all its free pages to that
- * type's freelist.
+ * Move all free pages of a block to the new type's freelist. The caller needs
+ * to change the block type.
*/
static int __move_freepages_block(struct zone *zone, unsigned long start_pfn,
int old_mt, int new_mt)
@@ -1824,8 +1960,6 @@ static int __move_freepages_block(struct zone *zone, unsigned long start_pfn,
pages_moved += 1 << order;
}
- set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt);
-
return pages_moved;
}
@@ -1870,7 +2004,7 @@ static bool prep_move_freepages_block(struct zone *zone, struct page *page,
* migration are movable. But we don't actually try
* isolating, as that would be expensive.
*/
- if (PageLRU(page) || __PageMovable(page))
+ if (PageLRU(page) || page_has_movable_ops(page))
(*num_movable)++;
pfn++;
}
@@ -1883,18 +2017,29 @@ static int move_freepages_block(struct zone *zone, struct page *page,
int old_mt, int new_mt)
{
unsigned long start_pfn;
+ int res;
if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL))
return -1;
- return __move_freepages_block(zone, start_pfn, old_mt, new_mt);
+ res = __move_freepages_block(zone, start_pfn, old_mt, new_mt);
+ set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt);
+
+ return res;
}
#ifdef CONFIG_MEMORY_ISOLATION
/* Look for a buddy that straddles start_pfn */
static unsigned long find_large_buddy(unsigned long start_pfn)
{
- int order = 0;
+ /*
+ * If start_pfn is not an order-0 PageBuddy, next PageBuddy containing
+ * start_pfn has minimal order of __ffs(start_pfn) + 1. Start checking
+ * the order with __ffs(start_pfn). If start_pfn is order-0 PageBuddy,
+ * the starting order does not matter.
+ */
+ int order = start_pfn ? __ffs(start_pfn) : MAX_PAGE_ORDER;
struct page *page;
unsigned long pfn = start_pfn;
@@ -1915,11 +2060,19 @@ static unsigned long find_large_buddy(unsigned long start_pfn)
return start_pfn;
}
+static inline void toggle_pageblock_isolate(struct page *page, bool isolate)
+{
+ if (isolate)
+ set_pageblock_isolate(page);
+ else
+ clear_pageblock_isolate(page);
+}
+
/**
- * move_freepages_block_isolate - move free pages in block for page isolation
+ * __move_freepages_block_isolate - move free pages in block for page isolation
* @zone: the zone
* @page: the pageblock page
- * @migratetype: migratetype to set on the pageblock
+ * @isolate: to isolate the given pageblock or unisolate it
*
* This is similar to move_freepages_block(), but handles the special
* case encountered in page isolation, where the block of interest
@@ -1934,10 +2087,19 @@ static unsigned long find_large_buddy(unsigned long start_pfn)
*
* Returns %true if pages could be moved, %false otherwise.
*/
-bool move_freepages_block_isolate(struct zone *zone, struct page *page,
- int migratetype)
+static bool __move_freepages_block_isolate(struct zone *zone,
+ struct page *page, bool isolate)
{
- unsigned long start_pfn, pfn;
+ unsigned long start_pfn, buddy_pfn;
+ int from_mt;
+ int to_mt;
+ struct page *buddy;
+
+ if (isolate == get_pageblock_isolate(page)) {
+ VM_WARN_ONCE(1, "%s a pageblock that is already in that state",
+ isolate ? "Isolate" : "Unisolate");
+ return false;
+ }
if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL))
return false;
@@ -1946,35 +2108,47 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page,
if (pageblock_order == MAX_PAGE_ORDER)
goto move;
- /* We're a tail block in a larger buddy */
- pfn = find_large_buddy(start_pfn);
- if (pfn != start_pfn) {
- struct page *buddy = pfn_to_page(pfn);
+ buddy_pfn = find_large_buddy(start_pfn);
+ buddy = pfn_to_page(buddy_pfn);
+ /* We're a part of a larger buddy */
+ if (PageBuddy(buddy) && buddy_order(buddy) > pageblock_order) {
int order = buddy_order(buddy);
del_page_from_free_list(buddy, zone, order,
- get_pfnblock_migratetype(buddy, pfn));
- set_pageblock_migratetype(page, migratetype);
- split_large_buddy(zone, buddy, pfn, order, FPI_NONE);
+ get_pfnblock_migratetype(buddy, buddy_pfn));
+ toggle_pageblock_isolate(page, isolate);
+ split_large_buddy(zone, buddy, buddy_pfn, order, FPI_NONE);
return true;
}
- /* We're the starting block of a larger buddy */
- if (PageBuddy(page) && buddy_order(page) > pageblock_order) {
- int order = buddy_order(page);
-
- del_page_from_free_list(page, zone, order,
- get_pfnblock_migratetype(page, pfn));
- set_pageblock_migratetype(page, migratetype);
- split_large_buddy(zone, page, pfn, order, FPI_NONE);
- return true;
- }
move:
- __move_freepages_block(zone, start_pfn,
- get_pfnblock_migratetype(page, start_pfn),
- migratetype);
+ /* Use MIGRATETYPE_MASK to get non-isolate migratetype */
+ if (isolate) {
+ from_mt = __get_pfnblock_flags_mask(page, page_to_pfn(page),
+ MIGRATETYPE_MASK);
+ to_mt = MIGRATE_ISOLATE;
+ } else {
+ from_mt = MIGRATE_ISOLATE;
+ to_mt = __get_pfnblock_flags_mask(page, page_to_pfn(page),
+ MIGRATETYPE_MASK);
+ }
+
+ __move_freepages_block(zone, start_pfn, from_mt, to_mt);
+ toggle_pageblock_isolate(pfn_to_page(start_pfn), isolate);
+
return true;
}
+
+bool pageblock_isolate_and_move_free_pages(struct zone *zone, struct page *page)
+{
+ return __move_freepages_block_isolate(zone, page, true);
+}
+
+bool pageblock_unisolate_and_move_free_pages(struct zone *zone, struct page *page)
+{
+ return __move_freepages_block_isolate(zone, page, false);
+}
+
#endif /* CONFIG_MEMORY_ISOLATION */
static void change_pageblock_range(struct page *pageblock_page,
@@ -2074,31 +2248,25 @@ static bool should_try_claim_block(unsigned int order, int start_mt)
/*
* Check whether there is a suitable fallback freepage with requested order.
- * Sets *claim_block to instruct the caller whether it should convert a whole
- * pageblock to the returned migratetype.
- * If only_claim is true, this function returns fallback_mt only if
+ * If claimable is true, this function returns fallback_mt only if
* we would do this whole-block claiming. This would help to reduce
* fragmentation due to mixed migratetype pages in one pageblock.
*/
int find_suitable_fallback(struct free_area *area, unsigned int order,
- int migratetype, bool only_claim, bool *claim_block)
+ int migratetype, bool claimable)
{
int i;
- int fallback_mt;
+
+ if (claimable && !should_try_claim_block(order, migratetype))
+ return -2;
if (area->nr_free == 0)
return -1;
- *claim_block = false;
for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) {
- fallback_mt = fallbacks[migratetype][i];
- if (free_area_empty(area, fallback_mt))
- continue;
+ int fallback_mt = fallbacks[migratetype][i];
- if (should_try_claim_block(order, migratetype))
- *claim_block = true;
-
- if (*claim_block || !only_claim)
+ if (!free_area_empty(area, fallback_mt))
return fallback_mt;
}
@@ -2172,6 +2340,7 @@ try_to_claim_block(struct zone *zone, struct page *page,
if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
page_group_by_mobility_disabled) {
__move_freepages_block(zone, start_pfn, block_type, start_type);
+ set_pageblock_migratetype(pfn_to_page(start_pfn), start_type);
return __rmqueue_smallest(zone, order, start_type);
}
@@ -2195,7 +2364,6 @@ __rmqueue_claim(struct zone *zone, int order, int start_migratetype,
int min_order = order;
struct page *page;
int fallback_mt;
- bool claim_block;
/*
* Do not steal pages from freelists belonging to other pageblocks
@@ -2214,11 +2382,14 @@ __rmqueue_claim(struct zone *zone, int order, int start_migratetype,
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
- start_migratetype, false, &claim_block);
+ start_migratetype, true);
+
+ /* No block in that order */
if (fallback_mt == -1)
continue;
- if (!claim_block)
+ /* Advanced into orders too low to claim, abort */
+ if (fallback_mt == -2)
break;
page = get_page_from_free_area(area, fallback_mt);
@@ -2246,12 +2417,11 @@ __rmqueue_steal(struct zone *zone, int order, int start_migratetype)
int current_order;
struct page *page;
int fallback_mt;
- bool claim_block;
for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
- start_migratetype, false, &claim_block);
+ start_migratetype, false);
if (fallback_mt == -1)
continue;
@@ -2672,10 +2842,10 @@ static void free_frozen_page_commit(struct zone *zone,
* stops will be drained from vmstat refresh context.
*/
if (order && order <= PAGE_ALLOC_COSTLY_ORDER) {
- free_high = (pcp->free_count >= batch &&
+ free_high = (pcp->free_count >= (batch + pcp->high_min / 2) &&
(pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) &&
(!(pcp->flags & PCPF_FREE_HIGH_BATCH) ||
- pcp->count >= READ_ONCE(batch)));
+ pcp->count >= batch));
pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER;
} else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) {
pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER;
@@ -2690,14 +2860,29 @@ static void free_frozen_page_commit(struct zone *zone,
*/
return;
}
+
high = nr_pcp_high(pcp, zone, batch, free_high);
- if (pcp->count >= high) {
- free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
- pcp, pindex);
- if (test_bit(ZONE_BELOW_HIGH, &zone->flags) &&
- zone_watermark_ok(zone, 0, high_wmark_pages(zone),
- ZONE_MOVABLE, 0))
- clear_bit(ZONE_BELOW_HIGH, &zone->flags);
+ if (pcp->count < high)
+ return;
+
+ free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
+ pcp, pindex);
+ if (test_bit(ZONE_BELOW_HIGH, &zone->flags) &&
+ zone_watermark_ok(zone, 0, high_wmark_pages(zone),
+ ZONE_MOVABLE, 0)) {
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ clear_bit(ZONE_BELOW_HIGH, &zone->flags);
+
+ /*
+ * Assume that memory pressure on this node is gone and may be
+ * in a reclaimable state. If a memory fallback node exists,
+ * direct reclaim may not have been triggered, causing a
+ * 'hopeless node' to stay in that state for a while. Let
+ * kswapd work again by resetting kswapd_failures.
+ */
+ if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES &&
+ next_memory_node(pgdat->node_id) < MAX_NUMNODES)
+ atomic_set(&pgdat->kswapd_failures, 0);
}
}
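
A minimal standalone model of the reset performed above: once the zone climbs back over its high watermark, a node whose kswapd had given up (kswapd_failures at the retry limit) gets its counter cleared, provided a fallback node exists. The constants and types below are illustrative only:

#include <stdbool.h>

#define TOY_MAX_RECLAIM_RETRIES 16      /* stand-in for MAX_RECLAIM_RETRIES */

struct toy_node { int kswapd_failures; bool has_fallback_node; };

/* Called when free pages rise back above the high watermark. */
static void toy_zone_above_high_watermark(struct toy_node *node)
{
    if (node->kswapd_failures >= TOY_MAX_RECLAIM_RETRIES &&
        node->has_fallback_node)
        node->kswapd_failures = 0;      /* kswapd wake-ups are honoured again */
}
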
@@ -3138,7 +3323,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
/*
* Do not instrument rmqueue() with KMSAN. This function may call
- * __msan_poison_alloca() through a call to set_pfnblock_flags_mask().
+ * __msan_poison_alloca() through a call to set_pfnblock_migratetype().
* If __msan_poison_alloca() attempts to allocate pages for the stack depot, it
* may call rmqueue() again, which will result in a deadlock.
*/
@@ -3550,11 +3735,13 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
struct pglist_data *last_pgdat = NULL;
bool last_pgdat_dirty_ok = false;
bool no_fallback;
+ bool skip_kswapd_nodes = nr_online_nodes > 1;
+ bool skipped_kswapd_nodes = false;
retry:
/*
* Scan zonelist, looking for a zone with enough free.
- * See also cpuset_node_allowed() comment in kernel/cgroup/cpuset.c.
+ * See also cpuset_current_node_allowed() comment in kernel/cgroup/cpuset.c.
*/
no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
z = ac->preferred_zoneref;
@@ -3612,6 +3799,19 @@ retry:
}
}
+ /*
+ * If kswapd is already active on a node, keep looking
+ * for other nodes that might be idle. This can happen
+ * if another process has NUMA bindings and is causing
+ * kswapd wakeups on only some nodes. Avoid accidental
+ * "node_reclaim_mode"-like behavior in this case.
+ */
+ if (skip_kswapd_nodes &&
+ !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) {
+ skipped_kswapd_nodes = true;
+ continue;
+ }
+
cond_accept_memory(zone, order, alloc_flags);
/*
@@ -3704,6 +3904,15 @@ try_this_zone:
}
/*
+ * If we skipped over nodes with active kswapds and found no
+ * idle nodes, retry and place anywhere the watermarks permit.
+ */
+ if (skip_kswapd_nodes && skipped_kswapd_nodes) {
+ skip_kswapd_nodes = false;
+ goto retry;
+ }
+
+ /*
* It's possible on a UMA machine to get through all zones that are
* fragmented. If avoiding fragmentation, reset and try again.
*/
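
A simplified standalone sketch of the two-pass scan introduced above: the first pass skips zones whose node already has kswapd running, and only if at least one candidate was skipped does the scan repeat without the filter. The real code walks a zonelist with watermark checks; this model keeps only the skip-and-retry shape:

#include <stdbool.h>
#include <stddef.h>

struct toy_zone { bool kswapd_active; bool has_free_pages; };

static struct toy_zone *toy_pick_zone(struct toy_zone *zones, size_t nr)
{
    bool skip_busy = nr > 1;            /* only worth filtering with several nodes */
    bool skipped = false;

retry:
    for (size_t i = 0; i < nr; i++) {
        if (skip_busy && zones[i].kswapd_active) {
            skipped = true;             /* remember we passed over a node */
            continue;
        }
        if (zones[i].has_free_pages)
            return &zones[i];
    }
    if (skip_busy && skipped) {
        skip_busy = false;              /* second pass: take any node that fits */
        goto retry;
    }
    return NULL;
}
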
@@ -4008,7 +4217,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
}
static inline bool
-should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
+should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
enum compact_result compact_result,
enum compact_priority *compact_priority,
int *compaction_retries)
@@ -4234,14 +4443,14 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
if (!(gfp_mask & __GFP_NOMEMALLOC)) {
alloc_flags |= ALLOC_NON_BLOCK;
- if (order > 0)
+ if (order > 0 && (alloc_flags & ALLOC_MIN_RESERVE))
alloc_flags |= ALLOC_HIGHATOMIC;
}
/*
* Ignore cpuset mems for non-blocking __GFP_HIGH (probably
* GFP_ATOMIC) rather than fail, see the comment for
- * cpuset_node_allowed().
+ * cpuset_current_node_allowed().
*/
if (alloc_flags & ALLOC_MIN_RESERVE)
alloc_flags &= ~ALLOC_CPUSET;
@@ -4562,6 +4771,14 @@ restart:
}
retry:
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+ * infinite retries.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
+
/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
@@ -5035,11 +5252,35 @@ unsigned long get_zeroed_page_noprof(gfp_t gfp_mask)
}
EXPORT_SYMBOL(get_zeroed_page_noprof);
+static void ___free_pages(struct page *page, unsigned int order,
+ fpi_t fpi_flags)
+{
+ /* get PageHead before we drop reference */
+ int head = PageHead(page);
+ /* get alloc tag in case the page is released by others */
+ struct alloc_tag *tag = pgalloc_tag_get(page);
+
+ if (put_page_testzero(page))
+ __free_frozen_pages(page, order, fpi_flags);
+ else if (!head) {
+ pgalloc_tag_sub_pages(tag, (1 << order) - 1);
+ while (order-- > 0) {
+ /*
+ * The "tail" pages of this non-compound high-order
+ * page will have no code tags, so to avoid warnings
+ * mark them as empty.
+ */
+ clear_page_tag_ref(page + (1 << order));
+ __free_frozen_pages(page + (1 << order), order,
+ fpi_flags);
+ }
+ }
+}
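
The tail-freeing loop above returns the non-head pages of a non-compound high-order allocation in halves of decreasing order while the head page stays pinned by the extra reference. A small worked example of that decomposition for order 3 (purely illustrative):

#include <stdio.h>

int main(void)
{
    unsigned int order = 3;             /* 8 pages: head plus 7 tail pages */

    while (order-- > 0)
        printf("free pages [%u..%u) at order %u\n",
               1u << order, 2u << order, order);
    /* Prints [4..8) at order 2, [2..4) at order 1, [1..2) at order 0.
     * The head page (index 0) is freed later, when its refcount drops. */
    return 0;
}
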
+
/**
- * ___free_pages - Free pages allocated with alloc_pages().
+ * __free_pages - Free pages allocated with alloc_pages().
* @page: The page pointer returned from alloc_pages().
* @order: The order of the allocation.
- * @fpi_flags: Free Page Internal flags.
*
* This function can free multi-page allocations that are not compound
* pages. It does not check that the @order passed in matches that of
@@ -5056,23 +5297,6 @@ EXPORT_SYMBOL(get_zeroed_page_noprof);
* Context: May be called in interrupt context or while holding a normal
* spinlock, but not in NMI context or while holding a raw spinlock.
*/
-static void ___free_pages(struct page *page, unsigned int order,
- fpi_t fpi_flags)
-{
- /* get PageHead before we drop reference */
- int head = PageHead(page);
- /* get alloc tag in case the page is released by others */
- struct alloc_tag *tag = pgalloc_tag_get(page);
-
- if (put_page_testzero(page))
- __free_frozen_pages(page, order, fpi_flags);
- else if (!head) {
- pgalloc_tag_sub_pages(tag, (1 << order) - 1);
- while (order-- > 0)
- __free_frozen_pages(page + (1 << order), order,
- fpi_flags);
- }
-}
void __free_pages(struct page *page, unsigned int order)
{
___free_pages(page, order, FPI_NONE);
@@ -5081,13 +5305,22 @@ EXPORT_SYMBOL(__free_pages);
/*
* Can be called while holding raw_spin_lock or from IRQ and NMI for any
- * page type (not only those that came from try_alloc_pages)
+ * page type (not only those that came from alloc_pages_nolock)
*/
void free_pages_nolock(struct page *page, unsigned int order)
{
___free_pages(page, order, FPI_TRYLOCK);
}
+/**
+ * free_pages - Free pages allocated with __get_free_pages().
+ * @addr: The virtual address tied to a page returned from __get_free_pages().
+ * @order: The order of the allocation.
+ *
+ * This function behaves the same as __free_pages(). Use this function
+ * to free pages when you only have a valid virtual address. If you have
+ * the page, call __free_pages() instead.
+ */
void free_pages(unsigned long addr, unsigned int order)
{
if (addr != 0) {
@@ -5764,7 +5997,6 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
pcp->high_min = BOOT_PAGESET_HIGH;
pcp->high_max = BOOT_PAGESET_HIGH;
pcp->batch = BOOT_PAGESET_BATCH;
- pcp->free_count = 0;
}
static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high_min,
@@ -6054,16 +6286,13 @@ static void calculate_totalreserve_pages(void)
unsigned long managed_pages = zone_managed_pages(zone);
/* Find valid and maximum lowmem_reserve in the zone */
- for (j = i; j < MAX_NR_ZONES; j++) {
- if (zone->lowmem_reserve[j] > max)
- max = zone->lowmem_reserve[j];
- }
+ for (j = i; j < MAX_NR_ZONES; j++)
+ max = max(max, zone->lowmem_reserve[j]);
/* we treat the high watermark as reserved pages. */
max += high_wmark_pages(zone);
- if (max > managed_pages)
- max = managed_pages;
+ max = min_t(unsigned long, max, managed_pages);
pgdat->totalreserve_pages += max;
@@ -6512,13 +6741,9 @@ static void alloc_contig_dump_pages(struct list_head *page_list)
}
}
-/*
- * [start, end) must belong to a single zone.
- * @migratetype: using migratetype to filter the type of migration in
- * trace_mm_alloc_contig_migrate_range_info.
- */
+/* [start, end) must belong to a single zone. */
static int __alloc_contig_migrate_range(struct compact_control *cc,
- unsigned long start, unsigned long end, int migratetype)
+ unsigned long start, unsigned long end)
{
/* This function is based on compact_zone() from compaction.c. */
unsigned int nr_reclaimed;
@@ -6530,10 +6755,6 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
.gfp_mask = cc->gfp_mask,
.reason = MR_CONTIG_RANGE,
};
- struct page *page;
- unsigned long total_mapped = 0;
- unsigned long total_migrated = 0;
- unsigned long total_reclaimed = 0;
lru_cache_disable();
@@ -6559,22 +6780,9 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
&cc->migratepages);
cc->nr_migratepages -= nr_reclaimed;
- if (trace_mm_alloc_contig_migrate_range_info_enabled()) {
- total_reclaimed += nr_reclaimed;
- list_for_each_entry(page, &cc->migratepages, lru) {
- struct folio *folio = page_folio(page);
-
- total_mapped += folio_mapped(folio) *
- folio_nr_pages(folio);
- }
- }
-
ret = migrate_pages(&cc->migratepages, alloc_migration_target,
NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL);
- if (trace_mm_alloc_contig_migrate_range_info_enabled() && !ret)
- total_migrated += cc->nr_migratepages;
-
/*
* On -ENOMEM, migrate_pages() bails out right away. It is pointless
* to retry again over this error, so do the same here.
@@ -6590,10 +6798,6 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
putback_movable_pages(&cc->migratepages);
}
- trace_mm_alloc_contig_migrate_range_info(start, end, migratetype,
- total_migrated,
- total_reclaimed,
- total_mapped);
return (ret < 0) ? ret : 0;
}
@@ -6661,10 +6865,7 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask)
* alloc_contig_range() -- tries to allocate given range of pages
* @start: start PFN to allocate
* @end: one-past-the-last PFN to allocate
- * @migratetype: migratetype of the underlying pageblocks (either
- * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
- * in range must have the same migratetype and it must
- * be either of the two.
+ * @alloc_flags: allocation information
* @gfp_mask: GFP mask. Node/zone/placement hints are ignored; only some
* action and reclaim modifiers are supported. Reclaim modifiers
* control allocation behavior during compaction/migration/reclaim.
@@ -6681,8 +6882,9 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask)
* need to be freed with free_contig_range().
*/
int alloc_contig_range_noprof(unsigned long start, unsigned long end,
- unsigned migratetype, gfp_t gfp_mask)
+ acr_flags_t alloc_flags, gfp_t gfp_mask)
{
+ const unsigned int order = ilog2(end - start);
unsigned long outer_start, outer_end;
int ret = 0;
@@ -6696,6 +6898,17 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
.alloc_contig = true,
};
INIT_LIST_HEAD(&cc.migratepages);
+ enum pb_isolate_mode mode = (alloc_flags & ACR_FLAGS_CMA) ?
+ PB_ISOLATE_MODE_CMA_ALLOC :
+ PB_ISOLATE_MODE_OTHER;
+
+ /*
+ * In contrast to the buddy, we allow for orders here that exceed
+ * MAX_PAGE_ORDER, so we must manually make sure that we are not
+ * exceeding the maximum folio order.
+ */
+ if (WARN_ON_ONCE((gfp_mask & __GFP_COMP) && order > MAX_FOLIO_ORDER))
+ return -EINVAL;
gfp_mask = current_gfp_context(gfp_mask);
if (__alloc_contig_verify_gfp_mask(gfp_mask, (gfp_t *)&cc.gfp_mask))
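
A standalone sketch of the new guard: the request order is derived as ilog2(end - start), and a __GFP_COMP request is rejected once that order exceeds the maximum folio order. MAX_FOLIO_ORDER is architecture dependent; the value below is a stand-in, and the ilog2 helper uses a GCC/Clang builtin:

#include <stdbool.h>
#include <limits.h>

#define TOY_MAX_FOLIO_ORDER 18          /* stand-in, arch dependent in reality */

static unsigned int toy_ilog2(unsigned long v)  /* v must be non-zero */
{
    return (sizeof(v) * CHAR_BIT - 1) - __builtin_clzl(v);
}

static bool toy_contig_order_ok(unsigned long start_pfn, unsigned long end_pfn,
                                bool compound)
{
    unsigned int order = toy_ilog2(end_pfn - start_pfn);

    return !compound || order <= TOY_MAX_FOLIO_ORDER;
}
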
@@ -6722,7 +6935,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
* put back to page allocator so that buddy can use them.
*/
- ret = start_isolate_page_range(start, end, migratetype, 0);
+ ret = start_isolate_page_range(start, end, mode);
if (ret)
goto done;
@@ -6738,7 +6951,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
* allocated. So, if we fall through be sure to clear ret so that
* -EBUSY is not accidentally used or returned to caller.
*/
- ret = __alloc_contig_migrate_range(&cc, start, end, migratetype);
+ ret = __alloc_contig_migrate_range(&cc, start, end);
if (ret && ret != -EBUSY)
goto done;
@@ -6772,7 +6985,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
outer_start = find_large_buddy(start);
/* Make sure the range is really isolated. */
- if (test_pages_isolated(outer_start, end, 0)) {
+ if (test_pages_isolated(outer_start, end, mode)) {
ret = -EBUSY;
goto done;
}
@@ -6794,7 +7007,6 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
free_contig_range(end, outer_end - end);
} else if (start == outer_start && end == outer_end && is_power_of_2(end - start)) {
struct page *head = pfn_to_page(start);
- int order = ilog2(end - start);
check_new_pages(head, order);
prep_new_page(head, order, gfp_mask, 0);
@@ -6805,7 +7017,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
start, end, outer_start, outer_end);
}
done:
- undo_isolate_page_range(start, end, migratetype);
+ undo_isolate_page_range(start, end);
return ret;
}
EXPORT_SYMBOL(alloc_contig_range_noprof);
@@ -6815,8 +7027,8 @@ static int __alloc_contig_pages(unsigned long start_pfn,
{
unsigned long end_pfn = start_pfn + nr_pages;
- return alloc_contig_range_noprof(start_pfn, end_pfn, MIGRATE_MOVABLE,
- gfp_mask);
+ return alloc_contig_range_noprof(start_pfn, end_pfn, ACR_FLAGS_NONE,
+ gfp_mask);
}
static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
@@ -7321,21 +7533,7 @@ static bool __free_unaccepted(struct page *page)
#endif /* CONFIG_UNACCEPTED_MEMORY */
-/**
- * try_alloc_pages - opportunistic reentrant allocation from any context
- * @nid: node to allocate from
- * @order: allocation order size
- *
- * Allocates pages of a given order from the given node. This is safe to
- * call from any context (from atomic, NMI, and also reentrant
- * allocator -> tracepoint -> try_alloc_pages_noprof).
- * Allocation is best effort and to be expected to fail easily so nobody should
- * rely on the success. Failures are not reported via warn_alloc().
- * See always fail conditions below.
- *
- * Return: allocated page or NULL on failure.
- */
-struct page *try_alloc_pages_noprof(int nid, unsigned int order)
+struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order)
{
/*
* Do not specify __GFP_DIRECT_RECLAIM, since direct claim is not allowed.
@@ -7344,7 +7542,7 @@ struct page *try_alloc_pages_noprof(int nid, unsigned int order)
*
* These two are the conditions for gfpflags_allow_spinning() being true.
*
- * Specify __GFP_NOWARN since failing try_alloc_pages() is not a reason
+ * Specify __GFP_NOWARN since failing alloc_pages_nolock() is not a reason
* to warn. Also warn would trigger printk() which is unsafe from
* various contexts. We cannot use printk_deferred_enter() to mitigate,
* since the running context is unknown.
@@ -7354,15 +7552,16 @@ struct page *try_alloc_pages_noprof(int nid, unsigned int order)
* BPF use cases.
*
* Though __GFP_NOMEMALLOC is not checked in the code path below,
- * specify it here to highlight that try_alloc_pages()
+ * specify it here to highlight that alloc_pages_nolock()
* doesn't want to deplete reserves.
*/
- gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC
- | __GFP_ACCOUNT;
+ gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC | __GFP_COMP
+ | gfp_flags;
unsigned int alloc_flags = ALLOC_TRYLOCK;
struct alloc_context ac = { };
struct page *page;
+ VM_WARN_ON_ONCE(gfp_flags & ~__GFP_ACCOUNT);
/*
* In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is
* unsafe in NMI. If spin_trylock() is called from hard IRQ the current
@@ -7397,15 +7596,38 @@ struct page *try_alloc_pages_noprof(int nid, unsigned int order)
/* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */
- if (page)
- set_page_refcounted(page);
-
- if (memcg_kmem_online() && page &&
+ if (memcg_kmem_online() && page && (gfp_flags & __GFP_ACCOUNT) &&
unlikely(__memcg_kmem_charge_page(page, alloc_gfp, order) != 0)) {
- free_pages_nolock(page, order);
+ __free_frozen_pages(page, order, FPI_TRYLOCK);
page = NULL;
}
trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
kmsan_alloc_page(page, order, alloc_gfp);
return page;
}
+/**
+ * alloc_pages_nolock - opportunistic reentrant allocation from any context
+ * @gfp_flags: GFP flags. Only __GFP_ACCOUNT allowed.
+ * @nid: node to allocate from
+ * @order: allocation order size
+ *
+ * Allocates pages of a given order from the given node. This is safe to
+ * call from any context (from atomic, NMI, and also reentrant
+ * allocator -> tracepoint -> alloc_pages_nolock_noprof).
+ * Allocation is best effort and to be expected to fail easily so nobody should
+ * rely on the success. Failures are not reported via warn_alloc().
+ * See always fail conditions below.
+ *
+ * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN.
+ * It means ENOMEM. There is no reason to call it again and expect !NULL.
+ */
+struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order)
+{
+ struct page *page;
+
+ page = alloc_frozen_pages_nolock_noprof(gfp_flags, nid, order);
+ if (page)
+ set_page_refcounted(page);
+ return page;
+}
+EXPORT_SYMBOL_GPL(alloc_pages_nolock_noprof);
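
A hypothetical caller sketch (kernel-context pseudocode, not part of the patch) of the renamed entry point, assuming the usual alloc_hooks-style alloc_pages_nolock() wrapper around the _noprof variant documented above. Only __GFP_ACCOUNT may be passed in gfp_flags, and a NULL result is an ENOMEM-like failure that retrying will not fix:

/* Opportunistic, any-context allocation that tolerates failure. */
static void *toy_grab_scratch_page(int nid)
{
    struct page *page;

    /* Only __GFP_ACCOUNT is allowed in gfp_flags; 0 means "no accounting". */
    page = alloc_pages_nolock(0, nid, 0);
    if (!page)
        return NULL;                    /* best effort: the caller must cope */
    return page_address(page);
}
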
diff --git a/mm/page_ext.c b/mm/page_ext.c
index c351fdfe9e9a..d7396a8970e5 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -369,25 +369,15 @@ static void __invalidate_page_ext(unsigned long pfn)
}
static int __meminit online_page_ext(unsigned long start_pfn,
- unsigned long nr_pages,
- int nid)
+ unsigned long nr_pages)
{
+ int nid = pfn_to_nid(start_pfn);
unsigned long start, end, pfn;
int fail = 0;
start = SECTION_ALIGN_DOWN(start_pfn);
end = SECTION_ALIGN_UP(start_pfn + nr_pages);
- if (nid == NUMA_NO_NODE) {
- /*
- * In this case, "nid" already exists and contains valid memory.
- * "start_pfn" passed to us is a pfn which is an arg for
- * online__pages(), and start_pfn should exist.
- */
- nid = pfn_to_nid(start_pfn);
- VM_BUG_ON(!node_online(nid));
- }
-
for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION)
fail = init_section_page_ext(pfn, nid);
if (!fail)
@@ -435,8 +425,7 @@ static int __meminit page_ext_callback(struct notifier_block *self,
switch (action) {
case MEM_GOING_ONLINE:
- ret = online_page_ext(mn->start_pfn,
- mn->nr_pages, mn->status_change_nid);
+ ret = online_page_ext(mn->start_pfn, mn->nr_pages);
break;
case MEM_OFFLINE:
offline_page_ext(mn->start_pfn,
diff --git a/mm/page_idle.c b/mm/page_idle.c
index 408aaf29a3ea..a82b340dc204 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -208,7 +208,7 @@ static const struct bin_attribute *const page_idle_bin_attrs[] = {
};
static const struct attribute_group page_idle_attr_group = {
- .bin_attrs_new = page_idle_bin_attrs,
+ .bin_attrs = page_idle_bin_attrs,
.name = "page_idle",
};
diff --git a/mm/page_io.c b/mm/page_io.c
index 4bce19df557b..3c342db77ce3 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -204,7 +204,7 @@ static bool is_folio_zero_filled(struct folio *folio)
static void swap_zeromap_folio_set(struct folio *folio)
{
struct obj_cgroup *objcg = get_obj_cgroup_from_folio(folio);
- struct swap_info_struct *sis = swp_swap_info(folio->swap);
+ struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
int nr_pages = folio_nr_pages(folio);
swp_entry_t entry;
unsigned int i;
@@ -223,7 +223,7 @@ static void swap_zeromap_folio_set(struct folio *folio)
static void swap_zeromap_folio_clear(struct folio *folio)
{
- struct swap_info_struct *sis = swp_swap_info(folio->swap);
+ struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
swp_entry_t entry;
unsigned int i;
@@ -237,15 +237,13 @@ static void swap_zeromap_folio_clear(struct folio *folio)
* We may have stale swap cache pages in memory: notice
* them here and get rid of the unnecessary final write.
*/
-int swap_writepage(struct page *page, struct writeback_control *wbc)
+int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
{
- struct folio *folio = page_folio(page);
- int ret;
+ int ret = 0;
+
+ if (folio_free_swap(folio))
+ goto out_unlock;
- if (folio_free_swap(folio)) {
- folio_unlock(folio);
- return 0;
- }
/*
* Arch code may have to preserve more data than just the page
* contents, e.g. memory tags.
@@ -253,8 +251,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
ret = arch_prepare_to_swap(folio);
if (ret) {
folio_mark_dirty(folio);
- folio_unlock(folio);
- return ret;
+ goto out_unlock;
}
/*
@@ -265,28 +262,30 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
*/
if (is_folio_zero_filled(folio)) {
swap_zeromap_folio_set(folio);
- folio_unlock(folio);
- return 0;
- } else {
- /*
- * Clear bits this folio occupies in the zeromap to prevent
- * zero data being read in from any previous zero writes that
- * occupied the same swap entries.
- */
- swap_zeromap_folio_clear(folio);
+ goto out_unlock;
}
+
+ /*
+ * Clear bits this folio occupies in the zeromap to prevent zero data
+ * being read in from any previous zero writes that occupied the same
+ * swap entries.
+ */
+ swap_zeromap_folio_clear(folio);
+
if (zswap_store(folio)) {
count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT);
- folio_unlock(folio);
- return 0;
+ goto out_unlock;
}
if (!mem_cgroup_zswap_writeback_enabled(folio_memcg(folio))) {
folio_mark_dirty(folio);
return AOP_WRITEPAGE_ACTIVATE;
}
- __swap_writepage(folio, wbc);
+ __swap_writepage(folio, swap_plug);
return 0;
+out_unlock:
+ folio_unlock(folio);
+ return ret;
}
static inline void count_swpout_vm_event(struct folio *folio)
@@ -372,18 +371,16 @@ static void sio_write_complete(struct kiocb *iocb, long ret)
mempool_free(sio, sio_pool);
}
-static void swap_writepage_fs(struct folio *folio, struct writeback_control *wbc)
+static void swap_writepage_fs(struct folio *folio, struct swap_iocb **swap_plug)
{
- struct swap_iocb *sio = NULL;
- struct swap_info_struct *sis = swp_swap_info(folio->swap);
+ struct swap_iocb *sio = swap_plug ? *swap_plug : NULL;
+ struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
struct file *swap_file = sis->swap_file;
loff_t pos = swap_dev_pos(folio->swap);
count_swpout_vm_event(folio);
folio_start_writeback(folio);
folio_unlock(folio);
- if (wbc->swap_plug)
- sio = *wbc->swap_plug;
if (sio) {
if (sio->iocb.ki_filp != swap_file ||
sio->iocb.ki_pos + sio->len != pos) {
@@ -402,22 +399,21 @@ static void swap_writepage_fs(struct folio *folio, struct writeback_control *wbc
bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0);
sio->len += folio_size(folio);
sio->pages += 1;
- if (sio->pages == ARRAY_SIZE(sio->bvec) || !wbc->swap_plug) {
+ if (sio->pages == ARRAY_SIZE(sio->bvec) || !swap_plug) {
swap_write_unplug(sio);
sio = NULL;
}
- if (wbc->swap_plug)
- *wbc->swap_plug = sio;
+ if (swap_plug)
+ *swap_plug = sio;
}
static void swap_writepage_bdev_sync(struct folio *folio,
- struct writeback_control *wbc, struct swap_info_struct *sis)
+ struct swap_info_struct *sis)
{
struct bio_vec bv;
struct bio bio;
- bio_init(&bio, sis->bdev, &bv, 1,
- REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc));
+ bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_WRITE | REQ_SWAP);
bio.bi_iter.bi_sector = swap_folio_sector(folio);
bio_add_folio_nofail(&bio, folio, folio_size(folio), 0);
@@ -432,13 +428,11 @@ static void swap_writepage_bdev_sync(struct folio *folio,
}
static void swap_writepage_bdev_async(struct folio *folio,
- struct writeback_control *wbc, struct swap_info_struct *sis)
+ struct swap_info_struct *sis)
{
struct bio *bio;
- bio = bio_alloc(sis->bdev, 1,
- REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc),
- GFP_NOIO);
+ bio = bio_alloc(sis->bdev, 1, REQ_OP_WRITE | REQ_SWAP, GFP_NOIO);
bio->bi_iter.bi_sector = swap_folio_sector(folio);
bio->bi_end_io = end_swap_bio_write;
bio_add_folio_nofail(bio, folio, folio_size(folio), 0);
@@ -450,9 +444,9 @@ static void swap_writepage_bdev_async(struct folio *folio,
submit_bio(bio);
}
-void __swap_writepage(struct folio *folio, struct writeback_control *wbc)
+void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
{
- struct swap_info_struct *sis = swp_swap_info(folio->swap);
+ struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
/*
@@ -461,16 +455,16 @@ void __swap_writepage(struct folio *folio, struct writeback_control *wbc)
* is safe.
*/
if (data_race(sis->flags & SWP_FS_OPS))
- swap_writepage_fs(folio, wbc);
+ swap_writepage_fs(folio, swap_plug);
/*
* ->flags can be updated non-atomicially (scan_swap_map_slots),
* but that will never affect SWP_SYNCHRONOUS_IO, so the data_race
* is safe.
*/
else if (data_race(sis->flags & SWP_SYNCHRONOUS_IO))
- swap_writepage_bdev_sync(folio, wbc, sis);
+ swap_writepage_bdev_sync(folio, sis);
else
- swap_writepage_bdev_async(folio, wbc, sis);
+ swap_writepage_bdev_async(folio, sis);
}
void swap_write_unplug(struct swap_iocb *sio)
@@ -543,7 +537,7 @@ static bool swap_read_folio_zeromap(struct folio *folio)
static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug)
{
- struct swap_info_struct *sis = swp_swap_info(folio->swap);
+ struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
struct swap_iocb *sio = NULL;
loff_t pos = swap_dev_pos(folio->swap);
@@ -614,7 +608,7 @@ static void swap_read_folio_bdev_async(struct folio *folio,
void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
{
- struct swap_info_struct *sis = swp_swap_info(folio->swap);
+ struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
bool synchronous = sis->flags & SWP_SYNCHRONOUS_IO;
bool workingset = folio_test_workingset(folio);
unsigned long pflags;
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index b2fc5266e3d2..f72b6cd38b95 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -21,9 +21,9 @@
* consequently belong to a single zone.
*
* PageLRU check without isolation or lru_lock could race so that
- * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
- * check without lock_page also may miss some movable non-lru pages at
- * race condition. So you can't expect this function should be exact.
+ * MIGRATE_MOVABLE block might include unmovable pages. Similarly, pages
+ * with movable_ops can only be identified some time after they were
+ * allocated. So you can't expect this function should be exact.
*
* Returns a page without holding a reference. If the caller wants to
* dereference that page (e.g., dumping), it has to make sure that it
@@ -31,7 +31,7 @@
*
*/
static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long end_pfn,
- int migratetype, int flags)
+ enum pb_isolate_mode mode)
{
struct page *page = pfn_to_page(start_pfn);
struct zone *zone = page_zone(page);
@@ -46,7 +46,7 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
* isolate CMA pageblocks even when they are not movable in fact
* so consider them movable here.
*/
- if (is_migrate_cma(migratetype))
+ if (mode == PB_ISOLATE_MODE_CMA_ALLOC)
return NULL;
return page;
@@ -92,7 +92,7 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
h = size_to_hstate(folio_size(folio));
if (h && !hugepage_migration_supported(h))
return page;
- } else if (!folio_test_lru(folio) && !__folio_test_movable(folio)) {
+ } else if (!folio_test_lru(folio)) {
return page;
}
@@ -117,7 +117,7 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
* The HWPoisoned page may be not in buddy system, and
* page_count() is not 0.
*/
- if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
+ if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageHWPoison(page))
continue;
/*
@@ -130,10 +130,10 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
* move these pages that still have a reference count > 0.
* (false negatives in this function only)
*/
- if ((flags & MEMORY_OFFLINE) && PageOffline(page))
+ if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageOffline(page))
continue;
- if (__PageMovable(page) || PageLRU(page))
+ if (PageLRU(page) || page_has_movable_ops(page))
continue;
/*
@@ -151,7 +151,7 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
* present in [start_pfn, end_pfn). The pageblock must intersect with
* [start_pfn, end_pfn).
*/
-static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags,
+static int set_migratetype_isolate(struct page *page, enum pb_isolate_mode mode,
unsigned long start_pfn, unsigned long end_pfn)
{
struct zone *zone = page_zone(page);
@@ -186,9 +186,9 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
end_pfn);
unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end,
- migratetype, isol_flags);
+ mode);
if (!unmovable) {
- if (!move_freepages_block_isolate(zone, page, MIGRATE_ISOLATE)) {
+ if (!pageblock_isolate_and_move_free_pages(zone, page)) {
spin_unlock_irqrestore(&zone->lock, flags);
return -EBUSY;
}
@@ -198,7 +198,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
}
spin_unlock_irqrestore(&zone->lock, flags);
- if (isol_flags & REPORT_FAILURE) {
+ if (mode == PB_ISOLATE_MODE_MEM_OFFLINE) {
/*
* printk() with zone->lock held will likely trigger a
* lockdep splat, so defer it here.
@@ -209,7 +209,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
return -EBUSY;
}
-static void unset_migratetype_isolate(struct page *page, int migratetype)
+static void unset_migratetype_isolate(struct page *page)
{
struct zone *zone;
unsigned long flags;
@@ -262,10 +262,10 @@ static void unset_migratetype_isolate(struct page *page, int migratetype)
* Isolating this block already succeeded, so this
* should not fail on zone boundaries.
*/
- WARN_ON_ONCE(!move_freepages_block_isolate(zone, page, migratetype));
+ WARN_ON_ONCE(!pageblock_unisolate_and_move_free_pages(zone, page));
} else {
- set_pageblock_migratetype(page, migratetype);
- __putback_isolated_page(page, order, migratetype);
+ clear_pageblock_isolate(page);
+ __putback_isolated_page(page, order, get_pageblock_migratetype(page));
}
zone->nr_isolate_pageblock--;
out:
@@ -292,11 +292,10 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* isolate_single_pageblock() -- tries to isolate a pageblock that might be
* within a free or in-use page.
* @boundary_pfn: pageblock-aligned pfn that a page might cross
- * @flags: isolation flags
+ * @mode: isolation mode
* @isolate_before: isolate the pageblock before the boundary_pfn
* @skip_isolation: the flag to skip the pageblock isolation in second
* isolate_single_pageblock()
- * @migratetype: migrate type to set in error recovery.
*
* Free and in-use pages can be as big as MAX_PAGE_ORDER and contain more than one
* pageblock. When not all pageblocks within a page are isolated at the same
@@ -311,8 +310,9 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* either. The function handles this by splitting the free page or migrating
* the in-use page then splitting the free page.
*/
-static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
- bool isolate_before, bool skip_isolation, int migratetype)
+static int isolate_single_pageblock(unsigned long boundary_pfn,
+ enum pb_isolate_mode mode, bool isolate_before,
+ bool skip_isolation)
{
unsigned long start_pfn;
unsigned long isolate_pageblock;
@@ -338,12 +338,11 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
zone->zone_start_pfn);
if (skip_isolation) {
- int mt __maybe_unused = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
-
- VM_BUG_ON(!is_migrate_isolate(mt));
+ VM_BUG_ON(!get_pageblock_isolate(pfn_to_page(isolate_pageblock)));
} else {
- ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype,
- flags, isolate_pageblock, isolate_pageblock + pageblock_nr_pages);
+ ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock),
+ mode, isolate_pageblock,
+ isolate_pageblock + pageblock_nr_pages);
if (ret)
return ret;
@@ -383,7 +382,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
if (PageBuddy(page)) {
int order = buddy_order(page);
- /* move_freepages_block_isolate() handled this */
+ /* pageblock_isolate_and_move_free_pages() handled this */
VM_WARN_ON_ONCE(pfn + (1 << order) > boundary_pfn);
pfn += 1UL << order;
@@ -422,7 +421,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
* proper free and split handling for them.
*/
VM_WARN_ON_ONCE_PAGE(PageLRU(page), page);
- VM_WARN_ON_ONCE_PAGE(__PageMovable(page), page);
+ VM_WARN_ON_ONCE_PAGE(page_has_movable_ops(page), page);
goto failed;
}
@@ -433,7 +432,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
failed:
/* restore the original migratetype */
if (!skip_isolation)
- unset_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype);
+ unset_migratetype_isolate(pfn_to_page(isolate_pageblock));
return -EBUSY;
}
@@ -441,14 +440,7 @@ failed:
* start_isolate_page_range() - mark page range MIGRATE_ISOLATE
* @start_pfn: The first PFN of the range to be isolated.
* @end_pfn: The last PFN of the range to be isolated.
- * @migratetype: Migrate type to set in error recovery.
- * @flags: The following flags are allowed (they can be combined in
- * a bit mask)
- * MEMORY_OFFLINE - isolate to offline (!allocate) memory
- * e.g., skip over PageHWPoison() pages
- * and PageOffline() pages.
- * REPORT_FAILURE - report details about the failure to
- * isolate the range
+ * @mode: isolation mode
*
* Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
* the range will never be allocated. Any free pages and pages freed in the
@@ -481,7 +473,7 @@ failed:
* Return: 0 on success and -EBUSY if any part of range cannot be isolated.
*/
int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
- int migratetype, int flags)
+ enum pb_isolate_mode mode)
{
unsigned long pfn;
struct page *page;
@@ -492,8 +484,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
bool skip_isolation = false;
/* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */
- ret = isolate_single_pageblock(isolate_start, flags, false,
- skip_isolation, migratetype);
+ ret = isolate_single_pageblock(isolate_start, mode, false,
+ skip_isolation);
if (ret)
return ret;
@@ -501,10 +493,9 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
skip_isolation = true;
/* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */
- ret = isolate_single_pageblock(isolate_end, flags, true,
- skip_isolation, migratetype);
+ ret = isolate_single_pageblock(isolate_end, mode, true, skip_isolation);
if (ret) {
- unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype);
+ unset_migratetype_isolate(pfn_to_page(isolate_start));
return ret;
}
@@ -513,12 +504,11 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
pfn < isolate_end - pageblock_nr_pages;
pfn += pageblock_nr_pages) {
page = __first_valid_page(pfn, pageblock_nr_pages);
- if (page && set_migratetype_isolate(page, migratetype, flags,
- start_pfn, end_pfn)) {
- undo_isolate_page_range(isolate_start, pfn, migratetype);
+ if (page && set_migratetype_isolate(page, mode, start_pfn,
+ end_pfn)) {
+ undo_isolate_page_range(isolate_start, pfn);
unset_migratetype_isolate(
- pfn_to_page(isolate_end - pageblock_nr_pages),
- migratetype);
+ pfn_to_page(isolate_end - pageblock_nr_pages));
return -EBUSY;
}
}
@@ -529,13 +519,10 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
* undo_isolate_page_range - undo effects of start_isolate_page_range()
* @start_pfn: The first PFN of the isolated range
* @end_pfn: The last PFN of the isolated range
- * @migratetype: New migrate type to set on the range
*
- * This finds every MIGRATE_ISOLATE page block in the given range
- * and switches it to @migratetype.
+ * This finds and unsets every MIGRATE_ISOLATE page block in the given range
*/
-void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
- int migratetype)
+void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long pfn;
struct page *page;
@@ -548,7 +535,7 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
page = __first_valid_page(pfn, pageblock_nr_pages);
if (!page || !is_migrate_isolate_page(page))
continue;
- unset_migratetype_isolate(page, migratetype);
+ unset_migratetype_isolate(page);
}
}
/*
@@ -560,7 +547,7 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
*/
static unsigned long
__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
- int flags)
+ enum pb_isolate_mode mode)
{
struct page *page;
@@ -573,11 +560,12 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
* simple way to verify that as VM_BUG_ON(), though.
*/
pfn += 1 << buddy_order(page);
- else if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
+ else if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) &&
+ PageHWPoison(page))
/* A HWPoisoned page cannot be also PageBuddy */
pfn++;
- else if ((flags & MEMORY_OFFLINE) && PageOffline(page) &&
- !page_count(page))
+ else if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) &&
+ PageOffline(page) && !page_count(page))
/*
* The responsible driver agreed to skip PageOffline()
* pages when offlining memory by dropping its
@@ -595,11 +583,11 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
* test_pages_isolated - check if pageblocks in range are isolated
* @start_pfn: The first PFN of the isolated range
* @end_pfn: The first PFN *after* the isolated range
- * @isol_flags: Testing mode flags
+ * @mode: Testing mode
*
* This tests if all in the specified range are free.
*
- * If %MEMORY_OFFLINE is specified in @flags, it will consider
+ * If %PB_ISOLATE_MODE_MEM_OFFLINE is specified in @mode, it will consider
* poisoned and offlined pages free as well.
*
* Caller must ensure the requested range doesn't span zones.
@@ -607,7 +595,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
* Returns 0 if true, -EBUSY if one or more pages are in use.
*/
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
- int isol_flags)
+ enum pb_isolate_mode mode)
{
unsigned long pfn, flags;
struct page *page;
@@ -643,7 +631,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
/* Check all pages are free or marked as ISOLATED */
zone = page_zone(page);
spin_lock_irqsave(&zone->lock, flags);
- pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, isol_flags);
+ pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, mode);
spin_unlock_irqrestore(&zone->lock, flags);
ret = pfn < end_pfn ? -EBUSY : 0;
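
A kernel-context sketch (not from the patch) of the reworked isolation API as a CMA-style caller would now use it, mirroring the alloc_contig_range() changes earlier in this diff; the migratetype argument is gone and a single pb_isolate_mode drives both the isolation and the isolation test:

static int toy_isolate_then_release(unsigned long start_pfn, unsigned long end_pfn)
{
    enum pb_isolate_mode mode = PB_ISOLATE_MODE_CMA_ALLOC;
    int ret;

    ret = start_isolate_page_range(start_pfn, end_pfn, mode);
    if (ret)
        return ret;

    /* Returns 0 if the whole range is free/isolated, -EBUSY otherwise. */
    if (test_pages_isolated(start_pfn, end_pfn, mode))
        ret = -EBUSY;

    /* No migratetype argument any more: each pageblock remembers its own. */
    undo_isolate_page_range(start_pfn, end_pfn);
    return ret;
}
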
diff --git a/mm/page_owner.c b/mm/page_owner.c
index cc4a6916eec6..c3ca21132c2c 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -302,7 +302,7 @@ void __reset_page_owner(struct page *page, unsigned short order)
/*
* Do not specify GFP_NOWAIT to make gfpflags_allow_spinning() == false
* to prevent issues in stack_depot_save().
- * This is similar to try_alloc_pages() gfp flags, but only used
+ * This is similar to alloc_pages_nolock() gfp flags, but only used
* to signal stack_depot to avoid spin_locks.
*/
handle = save_stack(__GFP_NOWARN);
@@ -333,9 +333,9 @@ noinline void __set_page_owner(struct page *page, unsigned short order,
inc_stack_record_count(handle, gfp_mask, 1 << order);
}
-void __set_page_owner_migrate_reason(struct page *page, int reason)
+void __folio_set_owner_migrate_reason(struct folio *folio, int reason)
{
- struct page_ext *page_ext = page_ext_get(page);
+ struct page_ext *page_ext = page_ext_get(&folio->page);
struct page_owner *page_owner;
if (unlikely(!page_ext))
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index 68109ee93841..4eeca782b888 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -218,33 +218,39 @@ static inline void page_table_check_pmd_flags(pmd_t pmd)
WARN_ON_ONCE(swap_cached_writable(pmd_to_swp_entry(pmd)));
}
-void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd)
+void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd,
+ unsigned int nr)
{
+ unsigned long stride = PMD_SIZE >> PAGE_SHIFT;
+ unsigned int i;
+
if (&init_mm == mm)
return;
page_table_check_pmd_flags(pmd);
- __page_table_check_pmd_clear(mm, *pmdp);
- if (pmd_user_accessible_page(pmd)) {
- page_table_check_set(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT,
- pmd_write(pmd));
- }
+ for (i = 0; i < nr; i++)
+ __page_table_check_pmd_clear(mm, *(pmdp + i));
+ if (pmd_user_accessible_page(pmd))
+ page_table_check_set(pmd_pfn(pmd), stride * nr, pmd_write(pmd));
}
-EXPORT_SYMBOL(__page_table_check_pmd_set);
+EXPORT_SYMBOL(__page_table_check_pmds_set);
-void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud)
+void __page_table_check_puds_set(struct mm_struct *mm, pud_t *pudp, pud_t pud,
+ unsigned int nr)
{
+ unsigned long stride = PUD_SIZE >> PAGE_SHIFT;
+ unsigned int i;
+
if (&init_mm == mm)
return;
- __page_table_check_pud_clear(mm, *pudp);
- if (pud_user_accessible_page(pud)) {
- page_table_check_set(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT,
- pud_write(pud));
- }
+ for (i = 0; i < nr; i++)
+ __page_table_check_pud_clear(mm, *(pudp + i));
+ if (pud_user_accessible_page(pud))
+ page_table_check_set(pud_pfn(pud), stride * nr, pud_write(pud));
}
-EXPORT_SYMBOL(__page_table_check_pud_set);
+EXPORT_SYMBOL(__page_table_check_puds_set);
void __page_table_check_pte_clear_range(struct mm_struct *mm,
unsigned long addr,
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index e463c3be934a..c498a91b6706 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -246,8 +246,7 @@ restart:
*/
pmde = pmdp_get_lockless(pvmw->pmd);
- if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde) ||
- (pmd_present(pmde) && pmd_devmap(pmde))) {
+ if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
pvmw->ptl = pmd_lock(mm, pvmw->pmd);
pmde = *pvmw->pmd;
if (!pmd_present(pmde)) {
@@ -262,7 +261,7 @@ restart:
return not_found(pvmw);
return true;
}
- if (likely(pmd_trans_huge(pmde) || pmd_devmap(pmde))) {
+ if (likely(pmd_trans_huge(pmde))) {
if (pvmw->flags & PVMW_MIGRATION)
return not_found(pvmw);
if (!check_pmd(pmd_pfn(pmde), pvmw))
@@ -310,6 +309,7 @@ next_pte:
}
pte_unmap(pvmw->pte);
pvmw->pte = NULL;
+ pvmw->flags |= PVMW_PGTABLE_CROSSED;
goto restart;
}
pvmw->pte++;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index e478777c86e1..9f91cf85a5be 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -143,8 +143,7 @@ again:
* We are ONLY installing, so avoid unnecessarily
* splitting a present huge page.
*/
- if (pmd_present(*pmd) &&
- (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
+ if (pmd_present(*pmd) && pmd_trans_huge(*pmd))
continue;
}
@@ -210,8 +209,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
* We are ONLY installing, so avoid unnecessarily
* splitting a present huge page.
*/
- if (pud_present(*pud) &&
- (pud_trans_huge(*pud) || pud_devmap(*pud)))
+ if (pud_present(*pud) && pud_trans_huge(*pud))
continue;
}
@@ -422,7 +420,7 @@ static inline void process_mm_walk_lock(struct mm_struct *mm,
{
if (walk_lock == PGWALK_RDLOCK)
mmap_assert_locked(mm);
- else
+ else if (walk_lock != PGWALK_VMA_RDLOCK_VERIFY)
mmap_assert_write_locked(mm);
}
@@ -437,6 +435,9 @@ static inline void process_vma_walk_lock(struct vm_area_struct *vma,
case PGWALK_WRLOCK_VERIFY:
vma_assert_write_locked(vma);
break;
+ case PGWALK_VMA_RDLOCK_VERIFY:
+ vma_assert_locked(vma);
+ break;
case PGWALK_RDLOCK:
/* PGWALK_RDLOCK is handled by process_mm_walk_lock */
break;
@@ -585,8 +586,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
}
/**
- * walk_page_range_novma - walk a range of pagetables not backed by a vma
- * @mm: mm_struct representing the target process of page table walk
+ * walk_kernel_page_table_range - walk a range of kernel pagetables.
* @start: start address of the virtual address range
* @end: end address of the virtual address range
* @ops: operation to call during the walk
@@ -596,17 +596,73 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
* Similar to walk_page_range() but can walk any page tables even if they are
* not backed by VMAs. Because 'unusual' entries may be walked this function
* will also not lock the PTEs for the pte_entry() callback. This is useful for
- * walking the kernel pages tables or page tables for firmware.
+ * walking kernel page tables or page tables for firmware.
*
* Note: Be careful to walk the kernel pages tables, the caller may be need to
* take other effective approaches (mmap lock may be insufficient) to prevent
* the intermediate kernel page tables belonging to the specified address range
* from being freed (e.g. memory hot-remove).
*/
-int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
+int walk_kernel_page_table_range(unsigned long start, unsigned long end,
+ const struct mm_walk_ops *ops, pgd_t *pgd, void *private)
+{
+ /*
+ * Kernel intermediate page tables are usually not freed, so the mmap
+ * read lock is sufficient. But there are some exceptions.
+ * E.g. memory hot-remove. In which case, the mmap lock is insufficient
+ * to prevent the intermediate kernel page tables belonging to the
+ * specified address range from being freed. The caller should take
+ * other actions to prevent this race.
+ */
+ mmap_assert_locked(&init_mm);
+
+ return walk_kernel_page_table_range_lockless(start, end, ops, pgd,
+ private);
+}
+
+/*
+ * Use this function to walk the kernel page tables locklessly. It should be
+ * guaranteed that the caller has exclusive access over the range they are
+ * operating on - that there should be no concurrent access, for example,
+ * changing permissions for vmalloc objects.
+ */
+int walk_kernel_page_table_range_lockless(unsigned long start, unsigned long end,
+ const struct mm_walk_ops *ops, pgd_t *pgd, void *private)
+{
+ struct mm_walk walk = {
+ .ops = ops,
+ .mm = &init_mm,
+ .pgd = pgd,
+ .private = private,
+ .no_vma = true
+ };
+
+ if (start >= end)
+ return -EINVAL;
+ if (!check_ops_valid(ops))
+ return -EINVAL;
+
+ return walk_pgd_range(start, end, &walk);
+}
+
+/**
+ * walk_page_range_debug - walk a range of pagetables not backed by a vma
+ * @mm: mm_struct representing the target process of page table walk
+ * @start: start address of the virtual address range
+ * @end: end address of the virtual address range
+ * @ops: operation to call during the walk
+ * @pgd: pgd to walk if different from mm->pgd
+ * @private: private data for callbacks' usage
+ *
+ * Similar to walk_page_range() but can walk any page tables even if they are
+ * not backed by VMAs. Because 'unusual' entries may be walked this function
+ * will also not lock the PTEs for the pte_entry() callback.
+ *
+ * This is for debugging purposes ONLY.
+ */
+int walk_page_range_debug(struct mm_struct *mm, unsigned long start,
unsigned long end, const struct mm_walk_ops *ops,
- pgd_t *pgd,
- void *private)
+ pgd_t *pgd, void *private)
{
struct mm_walk walk = {
.ops = ops,
@@ -616,34 +672,24 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
.no_vma = true
};
+ /* For convenience, we allow traversal of kernel mappings. */
+ if (mm == &init_mm)
+ return walk_kernel_page_table_range(start, end, ops,
+ pgd, private);
if (start >= end || !walk.mm)
return -EINVAL;
if (!check_ops_valid(ops))
return -EINVAL;
/*
- * 1) For walking the user virtual address space:
- *
* The mmap lock protects the page walker from changes to the page
* tables during the walk. However a read lock is insufficient to
* protect those areas which don't have a VMA as munmap() detaches
* the VMAs before downgrading to a read lock and actually tearing
* down PTEs/page tables. In which case, the mmap write lock should
- * be hold.
- *
- * 2) For walking the kernel virtual address space:
- *
- * The kernel intermediate page tables usually do not be freed, so
- * the mmap map read lock is sufficient. But there are some exceptions.
- * E.g. memory hot-remove. In which case, the mmap lock is insufficient
- * to prevent the intermediate kernel pages tables belonging to the
- * specified address range from being freed. The caller should take
- * other actions to prevent this race.
+ * be held.
*/
- if (mm == &init_mm)
- mmap_assert_locked(walk.mm);
- else
- mmap_assert_write_locked(walk.mm);
+ mmap_assert_write_locked(mm);
return walk_pgd_range(start, end, &walk);
}
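
A kernel-context sketch (not from the patch) of how a caller might use the kernel walker split out above; walk_kernel_page_table_range() asserts that init_mm's mmap lock is held, so a read lock suffices unless the intermediate tables can be freed underneath the walk (for example by memory hot-remove). The ops and private arguments are whatever the caller already has:

static int toy_walk_kernel_range(unsigned long start, unsigned long end,
                                 const struct mm_walk_ops *ops, void *priv)
{
    int ret;

    mmap_read_lock(&init_mm);
    /* NULL pgd means "use init_mm's pgd", as in the walker above. */
    ret = walk_kernel_page_table_range(start, end, ops, NULL, priv);
    mmap_read_unlock(&init_mm);
    return ret;
}
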
@@ -868,23 +914,23 @@ struct folio *folio_walk_start(struct folio_walk *fw,
fw->pudp = pudp;
fw->pud = pud;
- /*
- * TODO: FW_MIGRATION support for PUD migration entries
- * once there are relevant users.
- */
- if (!pud_present(pud) || pud_devmap(pud) || pud_special(pud)) {
+ if (pud_none(pud)) {
spin_unlock(ptl);
goto not_found;
- } else if (!pud_leaf(pud)) {
+ } else if (pud_present(pud) && !pud_leaf(pud)) {
spin_unlock(ptl);
goto pmd_table;
+ } else if (pud_present(pud)) {
+ page = vm_normal_page_pud(vma, addr, pud);
+ if (page)
+ goto found;
}
/*
- * TODO: vm_normal_page_pud() will be handy once we want to
- * support PUD mappings in VM_PFNMAP|VM_MIXEDMAP VMAs.
+ * TODO: FW_MIGRATION support for PUD migration entries
+ * once there are relevant users.
*/
- page = pud_page(pud);
- goto found;
+ spin_unlock(ptl);
+ goto not_found;
}
pmd_table:
@@ -970,7 +1016,7 @@ not_found:
found:
if (expose_page)
/* Note: Offset from the mapped page, not the folio start. */
- fw->page = nth_page(page, (addr & (entry_size - 1)) >> PAGE_SHIFT);
+ fw->page = page + ((addr & (entry_size - 1)) >> PAGE_SHIFT);
else
fw->page = NULL;
fw->ptl = ptl;
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index fe31aa19db81..4efa74a495cb 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -69,7 +69,7 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
}
for (i = 0; i < nr_pages; i++)
- pcpu_set_page_chunk(nth_page(pages, i), chunk);
+ pcpu_set_page_chunk(pages + i, chunk);
chunk->data = pages;
chunk->base_addr = page_address(pages);
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
index dd3590dfc23d..9b9d5d6accae 100644
--- a/mm/percpu-stats.c
+++ b/mm/percpu-stats.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * mm/percpu-debug.c
*
* Copyright (C) 2017 Facebook Inc.
* Copyright (C) 2017 Dennis Zhou <dennis@kernel.org>
diff --git a/mm/percpu.c b/mm/percpu.c
index b35494c8ede2..81462ce5866e 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1734,7 +1734,7 @@ void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved,
bool is_atomic;
bool do_warn;
struct obj_cgroup *objcg = NULL;
- static int warn_limit = 10;
+ static atomic_t warn_limit = ATOMIC_INIT(10);
struct pcpu_chunk *chunk, *next;
const char *err;
int slot, off, cpu, ret;
@@ -1904,13 +1904,17 @@ fail_unlock:
fail:
trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
- if (do_warn && warn_limit) {
- pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
- size, align, is_atomic, err);
- if (!is_atomic)
- dump_stack();
- if (!--warn_limit)
- pr_info("limit reached, disable warning\n");
+ if (do_warn) {
+ int remaining = atomic_dec_if_positive(&warn_limit);
+
+ if (remaining >= 0) {
+ pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
+ size, align, is_atomic, err);
+ if (!is_atomic)
+ dump_stack();
+ if (remaining == 0)
+ pr_info("limit reached, disable warning\n");
+ }
}
if (is_atomic) {
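
The warn_limit change above replaces a racy static counter with atomic_dec_if_positive(), so at most ten failures warn and exactly one caller prints the cut-off message even under concurrency. A standalone C11 model of the same pattern, with the dec-if-positive helper re-implemented as a CAS loop:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int warn_limit = 10;

/* Decrement only while positive; returns the decremented value, or a negative
 * number if the counter was already exhausted (models atomic_dec_if_positive). */
static int toy_dec_if_positive(atomic_int *v)
{
    int old = atomic_load(v);

    while (old > 0 && !atomic_compare_exchange_weak(v, &old, old - 1))
        ;
    return old - 1;
}

static void toy_ratelimited_warn(const char *msg)
{
    int remaining = toy_dec_if_positive(&warn_limit);

    if (remaining >= 0) {
        fprintf(stderr, "warn: %s\n", msg);
        if (remaining == 0)
            fprintf(stderr, "limit reached, disable warning\n");
    }
}
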
@@ -3108,7 +3112,7 @@ out_free:
#endif /* BUILD_EMBED_FIRST_CHUNK */
#ifdef BUILD_PAGE_FIRST_CHUNK
-#include <asm/pgalloc.h>
+#include <linux/pgalloc.h>
#ifndef P4D_TABLE_SIZE
#define P4D_TABLE_SIZE PAGE_SIZE
@@ -3134,13 +3138,13 @@ void __init __weak pcpu_populate_pte(unsigned long addr)
if (pgd_none(*pgd)) {
p4d = memblock_alloc_or_panic(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
- pgd_populate(&init_mm, pgd, p4d);
+ pgd_populate_kernel(addr, pgd, p4d);
}
p4d = p4d_offset(pgd, addr);
if (p4d_none(*p4d)) {
pud = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
- p4d_populate(&init_mm, p4d, pud);
+ p4d_populate_kernel(addr, p4d, pud);
}
pud = pud_offset(p4d, addr);
@@ -3355,7 +3359,7 @@ void __init setup_per_cpu_areas(void)
*/
unsigned long pcpu_nr_pages(void)
{
- return pcpu_nr_populated * pcpu_nr_units;
+ return data_race(READ_ONCE(pcpu_nr_populated)) * pcpu_nr_units;
}
/*
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 5a882f2b10f9..567e2d084071 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -139,8 +139,7 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
{
pmd_t pmd;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
- !pmd_devmap(*pmdp));
+ VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp));
pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return pmd;
@@ -153,7 +152,7 @@ pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
pud_t pud;
VM_BUG_ON(address & ~HPAGE_PUD_MASK);
- VM_BUG_ON(!pud_trans_huge(*pudp) && !pud_devmap(*pudp));
+ VM_BUG_ON(!pud_trans_huge(*pudp));
pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp);
flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
return pud;
@@ -293,7 +292,7 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
*pmdvalp = pmdval;
if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
goto nomap;
- if (unlikely(pmd_trans_huge(pmdval) || pmd_devmap(pmdval)))
+ if (unlikely(pmd_trans_huge(pmdval)))
goto nomap;
if (unlikely(pmd_bad(pmdval))) {
pmd_clear_bad(pmd);
diff --git a/mm/ptdump.c b/mm/ptdump.c
index 106e1d66e9f9..b600c7f864b8 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -4,6 +4,7 @@
#include <linux/debugfs.h>
#include <linux/ptdump.h>
#include <linux/kasan.h>
+#include "internal.h"
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
/*
@@ -18,7 +19,7 @@ static inline int note_kasan_page_table(struct mm_walk *walk,
{
struct ptdump_state *st = walk->private;
- st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0]));
+ st->note_page_pte(st, addr, kasan_early_shadow_pte[0]);
walk->action = ACTION_CONTINUE;
@@ -38,11 +39,11 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
- if (st->effective_prot)
- st->effective_prot(st, 0, pgd_val(val));
+ if (st->effective_prot_pgd)
+ st->effective_prot_pgd(st, val);
if (pgd_leaf(val)) {
- st->note_page(st, addr, 0, pgd_val(val));
+ st->note_page_pgd(st, addr, val);
walk->action = ACTION_CONTINUE;
}
@@ -61,11 +62,11 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
- if (st->effective_prot)
- st->effective_prot(st, 1, p4d_val(val));
+ if (st->effective_prot_p4d)
+ st->effective_prot_p4d(st, val);
if (p4d_leaf(val)) {
- st->note_page(st, addr, 1, p4d_val(val));
+ st->note_page_p4d(st, addr, val);
walk->action = ACTION_CONTINUE;
}
@@ -84,11 +85,11 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
- if (st->effective_prot)
- st->effective_prot(st, 2, pud_val(val));
+ if (st->effective_prot_pud)
+ st->effective_prot_pud(st, val);
if (pud_leaf(val)) {
- st->note_page(st, addr, 2, pud_val(val));
+ st->note_page_pud(st, addr, val);
walk->action = ACTION_CONTINUE;
}
@@ -106,10 +107,10 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
- if (st->effective_prot)
- st->effective_prot(st, 3, pmd_val(val));
+ if (st->effective_prot_pmd)
+ st->effective_prot_pmd(st, val);
if (pmd_leaf(val)) {
- st->note_page(st, addr, 3, pmd_val(val));
+ st->note_page_pmd(st, addr, val);
walk->action = ACTION_CONTINUE;
}
@@ -122,10 +123,10 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr,
struct ptdump_state *st = walk->private;
pte_t val = ptep_get_lockless(pte);
- if (st->effective_prot)
- st->effective_prot(st, 4, pte_val(val));
+ if (st->effective_prot_pte)
+ st->effective_prot_pte(st, val);
- st->note_page(st, addr, 4, pte_val(val));
+ st->note_page_pte(st, addr, val);
return 0;
}
@@ -134,9 +135,31 @@ static int ptdump_hole(unsigned long addr, unsigned long next,
int depth, struct mm_walk *walk)
{
struct ptdump_state *st = walk->private;
-
- st->note_page(st, addr, depth, 0);
-
+ pte_t pte_zero = {0};
+ pmd_t pmd_zero = {0};
+ pud_t pud_zero = {0};
+ p4d_t p4d_zero = {0};
+ pgd_t pgd_zero = {0};
+
+ switch (depth) {
+ case 4:
+ st->note_page_pte(st, addr, pte_zero);
+ break;
+ case 3:
+ st->note_page_pmd(st, addr, pmd_zero);
+ break;
+ case 2:
+ st->note_page_pud(st, addr, pud_zero);
+ break;
+ case 1:
+ st->note_page_p4d(st, addr, p4d_zero);
+ break;
+ case 0:
+ st->note_page_pgd(st, addr, pgd_zero);
+ break;
+ default:
+ break;
+ }
return 0;
}
@@ -153,16 +176,18 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd)
{
const struct ptdump_range *range = st->range;
+ get_online_mems();
mmap_write_lock(mm);
while (range->start != range->end) {
- walk_page_range_novma(mm, range->start, range->end,
+ walk_page_range_debug(mm, range->start, range->end,
&ptdump_ops, pgd, st);
range++;
}
mmap_write_unlock(mm);
+ put_online_mems();
/* Flush out the last page */
- st->note_page(st, 0, -1, 0);
+ st->note_page_flush(st);
}
static int check_wx_show(struct seq_file *m, void *v)
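The ptdump hunks above replace the single depth-indexed note_page() callback with per-level hooks (note_page_pte/pmd/pud/p4d/pgd plus note_page_flush), so ptdump_hole() now dispatches on the walk depth. The standalone userspace sketch below is illustrative only — every name in it is invented and it is not kernel code — but it shows the same dispatch pattern in miniature:

/*
 * Userspace mock of the per-level callback dispatch used by ptdump_hole()
 * above. demo_state, demo_hole and the print_* helpers are invented names.
 */
#include <stdio.h>

struct demo_state {
        void (*note_pte)(unsigned long addr);
        void (*note_pmd)(unsigned long addr);
        void (*note_pgd)(unsigned long addr);
};

static void print_pte(unsigned long addr) { printf("pte hole at %#lx\n", addr); }
static void print_pmd(unsigned long addr) { printf("pmd hole at %#lx\n", addr); }
static void print_pgd(unsigned long addr) { printf("pgd hole at %#lx\n", addr); }

/* Mirrors the switch in ptdump_hole(): pick the callback by walk depth. */
static void demo_hole(struct demo_state *st, unsigned long addr, int depth)
{
        switch (depth) {
        case 4: st->note_pte(addr); break;
        case 3: st->note_pmd(addr); break;
        case 0: st->note_pgd(addr); break;
        default: break;     /* p4d/pud levels omitted in this sketch */
        }
}

int main(void)
{
        struct demo_state st = { print_pte, print_pmd, print_pgd };

        demo_hole(&st, 0x1000, 4);
        demo_hole(&st, 0x200000, 3);
        return 0;
}

The design point carried by the hunks is that per-level callbacks let each architecture receive a typed entry (pte_t, pmd_t, ...) rather than a raw unsigned long plus a depth argument.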
diff --git a/mm/readahead.c b/mm/readahead.c
index 6a4e96b69702..3a4b5d58eeb6 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -129,6 +129,9 @@
#include <linux/fadvise.h>
#include <linux/sched/mm.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/readahead.h>
+
#include "internal.h"
/*
@@ -225,6 +228,8 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
*/
unsigned int nofs = memalloc_nofs_save();
+ trace_page_cache_ra_unbounded(mapping->host, index, nr_to_read,
+ lookahead_size);
filemap_invalidate_lock_shared(mapping);
index = mapping_align_index(mapping, index);
@@ -457,7 +462,7 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
}
void page_cache_ra_order(struct readahead_control *ractl,
- struct file_ra_state *ra, unsigned int new_order)
+ struct file_ra_state *ra)
{
struct address_space *mapping = ractl->mapping;
pgoff_t start = readahead_index(ractl);
@@ -468,24 +473,22 @@ void page_cache_ra_order(struct readahead_control *ractl,
unsigned int nofs;
int err = 0;
gfp_t gfp = readahead_gfp_mask(mapping);
- unsigned int min_ra_size = max(4, mapping_min_folio_nrpages(mapping));
+ unsigned int new_order = ra->order;
- /*
- * Fallback when size < min_nrpages as each folio should be
- * at least min_nrpages anyway.
- */
- if (!mapping_large_folio_support(mapping) || ra->size < min_ra_size)
+ trace_page_cache_ra_order(mapping->host, start, ra);
+ if (!mapping_large_folio_support(mapping)) {
+ ra->order = 0;
goto fallback;
+ }
limit = min(limit, index + ra->size - 1);
- if (new_order < mapping_max_folio_order(mapping))
- new_order += 2;
-
new_order = min(mapping_max_folio_order(mapping), new_order);
new_order = min_t(unsigned int, new_order, ilog2(ra->size));
new_order = max(new_order, min_order);
+ ra->order = new_order;
+
/* See comment in page_cache_ra_unbounded() */
nofs = memalloc_nofs_save();
filemap_invalidate_lock_shared(mapping);
@@ -557,6 +560,7 @@ void page_cache_sync_ra(struct readahead_control *ractl,
unsigned long max_pages, contig_count;
pgoff_t prev_index, miss;
+ trace_page_cache_sync_ra(ractl->mapping->host, index, ra, req_count);
/*
* Even if readahead is disabled, issue this request as readahead
* as we'll need it to satisfy the requested range. The forced
@@ -617,8 +621,9 @@ void page_cache_sync_ra(struct readahead_control *ractl,
ra->size = min(contig_count + req_count, max_pages);
ra->async_size = 1;
readit:
+ ra->order = 0;
ractl->_index = ra->start;
- page_cache_ra_order(ractl, ra, 0);
+ page_cache_ra_order(ractl, ra);
}
EXPORT_SYMBOL_GPL(page_cache_sync_ra);
@@ -628,8 +633,7 @@ void page_cache_async_ra(struct readahead_control *ractl,
unsigned long max_pages;
struct file_ra_state *ra = ractl->ra;
pgoff_t index = readahead_index(ractl);
- pgoff_t expected, start;
- unsigned int order = folio_order(folio);
+ pgoff_t expected, start, end, aligned_end, align;
/* no readahead */
if (!ra->ra_pages)
@@ -641,6 +645,7 @@ void page_cache_async_ra(struct readahead_control *ractl,
if (folio_test_writeback(folio))
return;
+ trace_page_cache_async_ra(ractl->mapping->host, index, ra, req_count);
folio_clear_readahead(folio);
if (blk_cgroup_congested())
@@ -652,7 +657,7 @@ void page_cache_async_ra(struct readahead_control *ractl,
* Ramp up sizes, and push forward the readahead window.
*/
expected = round_down(ra->start + ra->size - ra->async_size,
- 1UL << order);
+ folio_nr_pages(folio));
if (index == expected) {
ra->start += ra->size;
/*
@@ -660,7 +665,6 @@ void page_cache_async_ra(struct readahead_control *ractl,
* the readahead window.
*/
ra->size = max(ra->size, get_next_ra_size(ra, max_pages));
- ra->async_size = ra->size;
goto readit;
}
@@ -681,18 +685,30 @@ void page_cache_async_ra(struct readahead_control *ractl,
ra->size = start - index; /* old async_size */
ra->size += req_count;
ra->size = get_next_ra_size(ra, max_pages);
- ra->async_size = ra->size;
readit:
+ ra->order += 2;
+ align = 1UL << min(ra->order, ffs(max_pages) - 1);
+ end = ra->start + ra->size;
+ aligned_end = round_down(end, align);
+ if (aligned_end > ra->start)
+ ra->size -= end - aligned_end;
+ ra->async_size = ra->size;
ractl->_index = ra->start;
- page_cache_ra_order(ractl, ra, order);
+ page_cache_ra_order(ractl, ra);
}
EXPORT_SYMBOL_GPL(page_cache_async_ra);
ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
+ struct file *file;
+ const struct inode *inode;
+
CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
- if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ))
+ file = fd_file(f);
+ if (!(file->f_mode & FMODE_READ))
return -EBADF;
/*
@@ -700,9 +716,15 @@ ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
* that can execute readahead. If readahead is not possible
* on this file, then we must return -EINVAL.
*/
- if (!fd_file(f)->f_mapping || !fd_file(f)->f_mapping->a_ops ||
- (!S_ISREG(file_inode(fd_file(f))->i_mode) &&
- !S_ISBLK(file_inode(fd_file(f))->i_mode)))
+ if (!file->f_mapping)
+ return -EINVAL;
+ if (!file->f_mapping->a_ops)
+ return -EINVAL;
+
+ inode = file_inode(file);
+ if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
+ return -EINVAL;
+ if (IS_ANON_FILE(inode))
return -EINVAL;
return vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED);
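The ksys_readahead() hunk above unpacks the combined fd checks into separate -EBADF and -EINVAL returns and additionally rejects anonymous inodes. From userspace this path is reached through readahead(2); a minimal sketch of a call against a regular file (the path used here is only a placeholder) looks like this:

/*
 * Minimal userspace sketch of readahead(2), whose fd checks are reorganised
 * in the ksys_readahead() hunk above. Error handling only.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>

int main(void)
{
        int fd = open("/etc/hostname", O_RDONLY);   /* any regular file */

        if (fd < 0) {
                perror("open");
                return 1;
        }

        /*
         * Ask the kernel to populate the page cache for the first 64KiB.
         * EBADF:  fd invalid or not opened for reading;
         * EINVAL: no mapping/a_ops, not a regular or block file,
         *         or (with the hunk above) an anonymous inode.
         */
        if (readahead(fd, 0, 64 * 1024) != 0)
                fprintf(stderr, "readahead: %s\n", strerror(errno));

        close(fd);
        return 0;
}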
diff --git a/mm/rmap.c b/mm/rmap.c
index 67bb273dfb80..ac4f783d6ec2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -79,7 +79,6 @@
#include <asm/tlbflush.h>
#define CREATE_TRACE_POINTS
-#include <trace/events/tlb.h>
#include <trace/events/migrate.h>
#include "internal.h"
@@ -285,7 +284,7 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma;
- avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
+ avc = anon_vma_chain_alloc(GFP_NOWAIT);
if (unlikely(!avc)) {
unlock_anon_vma_root(root);
root = NULL;
@@ -503,12 +502,12 @@ struct anon_vma *folio_get_anon_vma(const struct folio *folio)
rcu_read_lock();
anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
- if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
+ if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
goto out;
if (!folio_mapped(folio))
goto out;
- anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
+ anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON);
if (!atomic_inc_not_zero(&anon_vma->refcount)) {
anon_vma = NULL;
goto out;
@@ -550,12 +549,12 @@ struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
retry:
rcu_read_lock();
anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
- if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
+ if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
goto out;
if (!folio_mapped(folio))
goto out;
- anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
+ anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON);
root_anon_vma = READ_ONCE(anon_vma->root);
if (down_read_trylock(&root_anon_vma->rwsem)) {
/*
@@ -746,7 +745,7 @@ void flush_tlb_batched_pending(struct mm_struct *mm)
int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT;
if (pending != flushed) {
- arch_flush_tlb_batched_pending(mm);
+ flush_tlb_mm(mm);
/*
* If the new TLB flushing is pending during flushing, leave
* mm->tlb_flush_batched as is, to avoid losing flushing.
@@ -774,7 +773,7 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
* @vma: The VMA we need to know the address in.
*
* Calculates the user virtual address of this page in the specified VMA.
- * It is the caller's responsibililty to check the page is actually
+ * It is the caller's responsibility to check the page is actually
* within the VMA. There may not currently be a PTE pointing at this
* page, but if a page fault occurs at this address, this is the page
* which will be accessed.
@@ -789,13 +788,13 @@ unsigned long page_address_in_vma(const struct folio *folio,
const struct page *page, const struct vm_area_struct *vma)
{
if (folio_test_anon(folio)) {
- struct anon_vma *page__anon_vma = folio_anon_vma(folio);
+ struct anon_vma *anon_vma = folio_anon_vma(folio);
/*
* Note: swapoff's unuse_vma() is more efficient with this
* check, and needs it to match anon_vma when KSM is active.
*/
- if (!vma->anon_vma || !page__anon_vma ||
- vma->anon_vma->root != page__anon_vma->root)
+ if (!vma->anon_vma || !anon_vma ||
+ vma->anon_vma->root != anon_vma->root)
return -EFAULT;
} else if (!vma->vm_file) {
return -EFAULT;
@@ -803,7 +802,7 @@ unsigned long page_address_in_vma(const struct folio *folio,
return -EFAULT;
}
- /* KSM folios don't reach here because of the !page__anon_vma check */
+ /* KSM folios don't reach here because of the !anon_vma check */
return vma_address(vma, page_pgoff(folio, page), 1);
}
@@ -839,7 +838,7 @@ out:
struct folio_referenced_arg {
int mapcount;
int referenced;
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
struct mem_cgroup *memcg;
};
@@ -851,34 +850,34 @@ static bool folio_referenced_one(struct folio *folio,
{
struct folio_referenced_arg *pra = arg;
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
- int referenced = 0;
- unsigned long start = address, ptes = 0;
+ int ptes = 0, referenced = 0;
while (page_vma_mapped_walk(&pvmw)) {
address = pvmw.address;
if (vma->vm_flags & VM_LOCKED) {
- if (!folio_test_large(folio) || !pvmw.pte) {
- /* Restore the mlock which got missed */
- mlock_vma_folio(folio, vma);
- page_vma_mapped_walk_done(&pvmw);
- pra->vm_flags |= VM_LOCKED;
- return false; /* To break the loop */
- }
+ ptes++;
+ pra->mapcount--;
+
+ /* Only mlock fully mapped pages */
+ if (pvmw.pte && ptes != pvmw.nr_pages)
+ continue;
+
/*
- * For large folio fully mapped to VMA, will
- * be handled after the pvmw loop.
+ * All PTEs must be protected by page table lock in
+ * order to mlock the page.
*
- * For large folio cross VMA boundaries, it's
- * expected to be picked by page reclaim. But
- * should skip reference of pages which are in
- * the range of VM_LOCKED vma. As page reclaim
- * should just count the reference of pages out
- * the range of VM_LOCKED vma.
+			 * If the page table boundary has been crossed, the
+			 * current ptl only protects part of the ptes.
*/
- ptes++;
- pra->mapcount--;
- continue;
+ if (pvmw.flags & PVMW_PGTABLE_CROSSED)
+ continue;
+
+ /* Restore the mlock which got missed */
+ mlock_vma_folio(folio, vma);
+ page_vma_mapped_walk_done(&pvmw);
+ pra->vm_flags |= VM_LOCKED;
+ return false; /* To break the loop */
}
/*
@@ -914,23 +913,6 @@ static bool folio_referenced_one(struct folio *folio,
pra->mapcount--;
}
- if ((vma->vm_flags & VM_LOCKED) &&
- folio_test_large(folio) &&
- folio_within_vma(folio, vma)) {
- unsigned long s_align, e_align;
-
- s_align = ALIGN_DOWN(start, PMD_SIZE);
- e_align = ALIGN_DOWN(start + folio_size(folio) - 1, PMD_SIZE);
-
- /* folio doesn't cross page table boundary and fully mapped */
- if ((s_align == e_align) && (ptes == folio_nr_pages(folio))) {
- /* Restore the mlock which got missed */
- mlock_vma_folio(folio, vma);
- pra->vm_flags |= VM_LOCKED;
- return false; /* To break the loop */
- }
- }
-
if (referenced)
folio_clear_idle(folio);
if (folio_test_clear_young(folio))
@@ -984,7 +966,7 @@ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
* the function bailed out due to rmap lock contention.
*/
int folio_referenced(struct folio *folio, int is_locked,
- struct mem_cgroup *memcg, unsigned long *vm_flags)
+ struct mem_cgroup *memcg, vm_flags_t *vm_flags)
{
bool we_locked = false;
struct folio_referenced_arg pra = {
@@ -1241,18 +1223,40 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
return page_vma_mkclean_one(&pvmw);
}
-static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
+static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped)
+{
+ int idx;
+
+ if (nr) {
+ idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
+ __lruvec_stat_mod_folio(folio, idx, nr);
+ }
+ if (nr_pmdmapped) {
+ if (folio_test_anon(folio)) {
+ idx = NR_ANON_THPS;
+ __lruvec_stat_mod_folio(folio, idx, nr_pmdmapped);
+ } else {
+ /* NR_*_PMDMAPPED are not maintained per-memcg */
+ idx = folio_test_swapbacked(folio) ?
+ NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED;
+ __mod_node_page_state(folio_pgdat(folio), idx,
+ nr_pmdmapped);
+ }
+ }
+}
+
+static __always_inline void __folio_add_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *vma,
- enum rmap_level level, int *nr_pmdmapped)
+ enum pgtable_level level)
{
atomic_t *mapped = &folio->_nr_pages_mapped;
const int orig_nr_pages = nr_pages;
- int first = 0, nr = 0;
+ int first = 0, nr = 0, nr_pmdmapped = 0;
__folio_rmap_sanity_checks(folio, page, nr_pages, level);
switch (level) {
- case RMAP_LEVEL_PTE:
+ case PGTABLE_LEVEL_PTE:
if (!folio_test_large(folio)) {
nr = atomic_inc_and_test(&folio->_mapcount);
break;
@@ -1278,12 +1282,12 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
folio_add_large_mapcount(folio, orig_nr_pages, vma);
break;
- case RMAP_LEVEL_PMD:
- case RMAP_LEVEL_PUD:
+ case PGTABLE_LEVEL_PMD:
+ case PGTABLE_LEVEL_PUD:
first = atomic_inc_and_test(&folio->_entire_mapcount);
if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
- if (level == RMAP_LEVEL_PMD && first)
- *nr_pmdmapped = folio_large_nr_pages(folio);
+ if (level == PGTABLE_LEVEL_PMD && first)
+ nr_pmdmapped = folio_large_nr_pages(folio);
nr = folio_inc_return_large_mapcount(folio, vma);
if (nr == 1)
/* Was completely unmapped. */
@@ -1301,8 +1305,8 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
* We only track PMD mappings of PMD-sized
* folios separately.
*/
- if (level == RMAP_LEVEL_PMD)
- *nr_pmdmapped = nr_pages;
+ if (level == PGTABLE_LEVEL_PMD)
+ nr_pmdmapped = nr_pages;
nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
/* Raced ahead of a remove and another add? */
if (unlikely(nr < 0))
@@ -1314,8 +1318,10 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
}
folio_inc_large_mapcount(folio, vma);
break;
+ default:
+ BUILD_BUG();
}
- return nr;
+ __folio_mod_stat(folio, nr, nr_pmdmapped);
}
/**
@@ -1334,9 +1340,9 @@ void folio_move_anon_rmap(struct folio *folio, struct vm_area_struct *vma)
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_VMA(!anon_vma, vma);
- anon_vma += PAGE_MAPPING_ANON;
+ anon_vma += FOLIO_MAPPING_ANON;
/*
- * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
+ * Ensure that anon_vma and the FOLIO_MAPPING_ANON bit are written
* simultaneously, so a concurrent reader (eg folio_referenced()'s
* folio_test_anon()) will not see one without the other.
*/
@@ -1367,10 +1373,10 @@ static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma,
/*
* page_idle does a lockless/optimistic rmap scan on folio->mapping.
* Make sure the compiler doesn't split the stores of anon_vma and
- * the PAGE_MAPPING_ANON type identifier, otherwise the rmap code
+ * the FOLIO_MAPPING_ANON type identifier, otherwise the rmap code
* could mistake the mapping for a struct address_space and crash.
*/
- anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+ anon_vma = (void *) anon_vma + FOLIO_MAPPING_ANON;
WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma);
folio->index = linear_page_index(vma, address);
}
@@ -1403,59 +1409,37 @@ static void __page_check_anon_rmap(const struct folio *folio,
page);
}
-static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped)
-{
- int idx;
-
- if (nr) {
- idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
- __lruvec_stat_mod_folio(folio, idx, nr);
- }
- if (nr_pmdmapped) {
- if (folio_test_anon(folio)) {
- idx = NR_ANON_THPS;
- __lruvec_stat_mod_folio(folio, idx, nr_pmdmapped);
- } else {
- /* NR_*_PMDMAPPED are not maintained per-memcg */
- idx = folio_test_swapbacked(folio) ?
- NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED;
- __mod_node_page_state(folio_pgdat(folio), idx,
- nr_pmdmapped);
- }
- }
-}
-
static __always_inline void __folio_add_anon_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *vma,
- unsigned long address, rmap_t flags, enum rmap_level level)
+ unsigned long address, rmap_t flags, enum pgtable_level level)
{
- int i, nr, nr_pmdmapped = 0;
+ int i;
VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
- nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped);
+ __folio_add_rmap(folio, page, nr_pages, vma, level);
if (likely(!folio_test_ksm(folio)))
__page_check_anon_rmap(folio, page, vma, address);
- __folio_mod_stat(folio, nr, nr_pmdmapped);
-
if (flags & RMAP_EXCLUSIVE) {
switch (level) {
- case RMAP_LEVEL_PTE:
+ case PGTABLE_LEVEL_PTE:
for (i = 0; i < nr_pages; i++)
SetPageAnonExclusive(page + i);
break;
- case RMAP_LEVEL_PMD:
+ case PGTABLE_LEVEL_PMD:
SetPageAnonExclusive(page);
break;
- case RMAP_LEVEL_PUD:
+ case PGTABLE_LEVEL_PUD:
/*
* Keep the compiler happy, we don't support anonymous
* PUD mappings.
*/
WARN_ON_ONCE(1);
break;
+ default:
+ BUILD_BUG();
}
}
@@ -1479,12 +1463,12 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio,
}
/*
- * For large folio, only mlock it if it's fully mapped to VMA. It's
- * not easy to check whether the large folio is fully mapped to VMA
- * here. Only mlock normal 4K folio and leave page reclaim to handle
- * large folio.
+ * Only mlock it if the folio is fully mapped to the VMA.
+ *
+	 * Partially mapped folios can be split on reclaim, and the part
+	 * outside of the mlocked VMA can be evicted or freed.
*/
- if (!folio_test_large(folio))
+ if (folio_nr_pages(folio) == nr_pages)
mlock_vma_folio(folio, vma);
}
@@ -1509,7 +1493,7 @@ void folio_add_anon_rmap_ptes(struct folio *folio, struct page *page,
rmap_t flags)
{
__folio_add_anon_rmap(folio, page, nr_pages, vma, address, flags,
- RMAP_LEVEL_PTE);
+ PGTABLE_LEVEL_PTE);
}
/**
@@ -1530,7 +1514,7 @@ void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page,
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
__folio_add_anon_rmap(folio, page, HPAGE_PMD_NR, vma, address, flags,
- RMAP_LEVEL_PMD);
+ PGTABLE_LEVEL_PMD);
#else
WARN_ON_ONCE(true);
#endif
@@ -1611,17 +1595,19 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
static __always_inline void __folio_add_file_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *vma,
- enum rmap_level level)
+ enum pgtable_level level)
{
- int nr, nr_pmdmapped = 0;
-
VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);
- nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped);
- __folio_mod_stat(folio, nr, nr_pmdmapped);
+ __folio_add_rmap(folio, page, nr_pages, vma, level);
- /* See comments in folio_add_anon_rmap_*() */
- if (!folio_test_large(folio))
+ /*
+ * Only mlock it if the folio is fully mapped to the VMA.
+ *
+	 * Partially mapped folios can be split on reclaim, and the part
+	 * outside of the mlocked VMA can be evicted or freed.
+ */
+ if (folio_nr_pages(folio) == nr_pages)
mlock_vma_folio(folio, vma);
}
@@ -1639,7 +1625,7 @@ static __always_inline void __folio_add_file_rmap(struct folio *folio,
void folio_add_file_rmap_ptes(struct folio *folio, struct page *page,
int nr_pages, struct vm_area_struct *vma)
{
- __folio_add_file_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE);
+ __folio_add_file_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE);
}
/**
@@ -1656,7 +1642,7 @@ void folio_add_file_rmap_pmd(struct folio *folio, struct page *page,
struct vm_area_struct *vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD);
+ __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD);
#else
WARN_ON_ONCE(true);
#endif
@@ -1677,7 +1663,7 @@ void folio_add_file_rmap_pud(struct folio *folio, struct page *page,
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
- __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
+ __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD);
#else
WARN_ON_ONCE(true);
#endif
@@ -1685,7 +1671,7 @@ void folio_add_file_rmap_pud(struct folio *folio, struct page *page,
static __always_inline void __folio_remove_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *vma,
- enum rmap_level level)
+ enum pgtable_level level)
{
atomic_t *mapped = &folio->_nr_pages_mapped;
int last = 0, nr = 0, nr_pmdmapped = 0;
@@ -1694,7 +1680,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
__folio_rmap_sanity_checks(folio, page, nr_pages, level);
switch (level) {
- case RMAP_LEVEL_PTE:
+ case PGTABLE_LEVEL_PTE:
if (!folio_test_large(folio)) {
nr = atomic_add_negative(-1, &folio->_mapcount);
break;
@@ -1704,7 +1690,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
nr = folio_sub_return_large_mapcount(folio, nr_pages, vma);
if (!nr) {
/* Now completely unmapped. */
- nr = folio_nr_pages(folio);
+ nr = folio_large_nr_pages(folio);
} else {
partially_mapped = nr < folio_large_nr_pages(folio) &&
!folio_entire_mapcount(folio);
@@ -1724,11 +1710,11 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
partially_mapped = nr && atomic_read(mapped);
break;
- case RMAP_LEVEL_PMD:
- case RMAP_LEVEL_PUD:
+ case PGTABLE_LEVEL_PMD:
+ case PGTABLE_LEVEL_PUD:
if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
last = atomic_add_negative(-1, &folio->_entire_mapcount);
- if (level == RMAP_LEVEL_PMD && last)
+ if (level == PGTABLE_LEVEL_PMD && last)
nr_pmdmapped = folio_large_nr_pages(folio);
nr = folio_dec_return_large_mapcount(folio, vma);
if (!nr) {
@@ -1748,9 +1734,9 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped);
if (likely(nr < ENTIRELY_MAPPED)) {
nr_pages = folio_large_nr_pages(folio);
- if (level == RMAP_LEVEL_PMD)
+ if (level == PGTABLE_LEVEL_PMD)
nr_pmdmapped = nr_pages;
- nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
+ nr = nr_pages - nr;
/* Raced ahead of another remove and an add? */
if (unlikely(nr < 0))
nr = 0;
@@ -1762,6 +1748,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
partially_mapped = nr && nr < nr_pmdmapped;
break;
+ default:
+ BUILD_BUG();
}
/*
@@ -1801,7 +1789,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
void folio_remove_rmap_ptes(struct folio *folio, struct page *page,
int nr_pages, struct vm_area_struct *vma)
{
- __folio_remove_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE);
+ __folio_remove_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE);
}
/**
@@ -1818,7 +1806,7 @@ void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
struct vm_area_struct *vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- __folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD);
+ __folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD);
#else
WARN_ON_ONCE(true);
#endif
@@ -1839,29 +1827,36 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
- __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
+ __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD);
#else
WARN_ON_ONCE(true);
#endif
}
-/* We support batch unmapping of PTEs for lazyfree large folios */
-static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
- struct folio *folio, pte_t *ptep)
+static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
+ struct page_vma_mapped_walk *pvmw,
+ enum ttu_flags flags, pte_t pte)
{
- const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
- int max_nr = folio_nr_pages(folio);
- pte_t pte = ptep_get(ptep);
+ unsigned long end_addr, addr = pvmw->address;
+ struct vm_area_struct *vma = pvmw->vma;
+ unsigned int max_nr;
+ if (flags & TTU_HWPOISON)
+ return 1;
+ if (!folio_test_large(folio))
+ return 1;
+
+ /* We may only batch within a single VMA and a single page table. */
+ end_addr = pmd_addr_end(addr, vma->vm_end);
+ max_nr = (end_addr - addr) >> PAGE_SHIFT;
+
+ /* We only support lazyfree batching for now ... */
if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
- return false;
+ return 1;
if (pte_unused(pte))
- return false;
- if (pte_pfn(pte) != folio_pfn(folio))
- return false;
+ return 1;
- return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
- NULL, NULL) == max_nr;
+ return folio_pte_batch(folio, pvmw->pte, pte, max_nr);
}
/*
@@ -1880,6 +1875,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
unsigned long nr_pages = 1, end_addr;
unsigned long pfn;
unsigned long hsz = 0;
+ int ptes = 0;
/*
* When racing against e.g. zap_pte_range() on another cpu,
@@ -1920,10 +1916,34 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*/
if (!(flags & TTU_IGNORE_MLOCK) &&
(vma->vm_flags & VM_LOCKED)) {
+ ptes++;
+
+ /*
+ * Set 'ret' to indicate the page cannot be unmapped.
+ *
+			 * Do not jump to walk_abort immediately, as an
+			 * additional iteration might be required to detect a
+			 * fully mapped folio and mlock it.
+ */
+ ret = false;
+
+ /* Only mlock fully mapped pages */
+ if (pvmw.pte && ptes != pvmw.nr_pages)
+ continue;
+
+ /*
+ * All PTEs must be protected by page table lock in
+ * order to mlock the page.
+ *
+			 * If the page table boundary has been crossed, the
+			 * current ptl only protects part of the ptes.
+ */
+ if (pvmw.flags & PVMW_PGTABLE_CROSSED)
+ goto walk_done;
+
/* Restore the mlock which got missed */
- if (!folio_test_large(folio))
- mlock_vma_folio(folio, vma);
- goto walk_abort;
+ mlock_vma_folio(folio, vma);
+ goto walk_done;
}
if (!pvmw.pte) {
@@ -1944,7 +1964,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* restart so we can process the PTE-mapped THP.
*/
split_huge_pmd_locked(vma, pvmw.address,
- pvmw.pmd, false, folio);
+ pvmw.pmd, false);
flags &= ~TTU_SPLIT_HUGE_PMD;
page_vma_mapped_walk_restart(&pvmw);
continue;
@@ -2024,14 +2044,12 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
if (pte_dirty(pteval))
folio_mark_dirty(folio);
} else if (likely(pte_present(pteval))) {
- if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
- can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
- nr_pages = folio_nr_pages(folio);
+ nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval);
end_addr = address + nr_pages * PAGE_SIZE;
flush_cache_range(vma, address, end_addr);
/* Nuke the page table entry. */
- pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0);
+ pteval = get_and_clear_ptes(mm, address, pvmw.pte, nr_pages);
/*
* We clear the PTE but do not flush so potentially
* a remote CPU could still be writing to the folio.
@@ -2206,13 +2224,16 @@ discard:
hugetlb_remove_rmap(folio);
} else {
folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
- folio_ref_sub(folio, nr_pages - 1);
}
if (vma->vm_flags & VM_LOCKED)
mlock_drain_local();
- folio_put(folio);
- /* We have already batched the entire folio */
- if (nr_pages > 1)
+ folio_put_refs(folio, nr_pages);
+
+ /*
+ * If we are sure that we batched the entire folio and cleared
+ * all PTEs, we can just optimize and stop right here.
+ */
+ if (nr_pages == folio_nr_pages(folio))
goto walk_done;
continue;
walk_abort:
@@ -2292,13 +2313,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
pvmw.flags = PVMW_SYNC;
/*
- * unmap_page() in mm/huge_memory.c is the only user of migration with
- * TTU_SPLIT_HUGE_PMD and it wants to freeze.
- */
- if (flags & TTU_SPLIT_HUGE_PMD)
- split_huge_pmd_address(vma, address, true, folio);
-
- /*
* For THP, we have to assume the worse case ie pmd for invalidation.
* For hugetlb, it could be much worse if we need to do pud
* invalidation in the case of pmd sharing.
@@ -2323,9 +2337,16 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
/* PMD-mapped THP migration entry */
if (!pvmw.pte) {
+ if (flags & TTU_SPLIT_HUGE_PMD) {
+ split_huge_pmd_locked(vma, pvmw.address,
+ pvmw.pmd, true);
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
subpage = folio_page(folio,
pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
@@ -2337,8 +2358,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
break;
}
continue;
- }
#endif
+ }
/* Unexpected PMD-mapped THP? */
VM_BUG_ON_FOLIO(!pvmw.pte, folio);
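The new folio_unmap_pte_batch() above clamps a PTE batch so it never crosses the current page table: end_addr = pmd_addr_end(addr, vma->vm_end) and max_nr = (end_addr - addr) >> PAGE_SHIFT. The small userspace arithmetic sketch below is not kernel code; the constants assume x86-64 with 4KiB pages and 2MiB PMDs, and demo_pmd_addr_end is a simplified stand-in for the kernel helper:

/*
 * Userspace arithmetic sketch of the clamping done in folio_unmap_pte_batch():
 * the batch end address is limited to the next PMD boundary.
 */
#include <stdio.h>

#define PAGE_SHIFT      12UL
#define PMD_SHIFT       21UL
#define PMD_SIZE        (1UL << PMD_SHIFT)
#define PMD_MASK        (~(PMD_SIZE - 1))

/* Same idea as the kernel helper: next PMD boundary, capped at 'end'. */
static unsigned long demo_pmd_addr_end(unsigned long addr, unsigned long end)
{
        unsigned long boundary = (addr + PMD_SIZE) & PMD_MASK;

        return boundary < end ? boundary : end;
}

int main(void)
{
        unsigned long vm_end = 0x800000;    /* hypothetical VMA end */
        unsigned long addr   = 0x3ff000;    /* 4KiB below a 2MiB boundary */
        unsigned long end    = demo_pmd_addr_end(addr, vm_end);
        unsigned long max_nr = (end - addr) >> PAGE_SHIFT;

        /* Only one PTE fits before the PMD boundary, so the batch is 1. */
        printf("addr=%#lx end=%#lx max_nr=%lu\n", addr, end, max_nr);
        return 0;
}

Run as-is it prints max_nr=1: even though the VMA extends much further, only one PTE fits between addr and the next PMD boundary, so the batch is capped there.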
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 1b0a214ee558..60137305bc20 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -54,7 +54,6 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf)
pgoff_t offset = vmf->pgoff;
gfp_t gfp = vmf->gfp_mask;
unsigned long addr;
- struct page *page;
struct folio *folio;
vm_fault_t ret;
int err;
@@ -65,16 +64,15 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf)
filemap_invalidate_lock_shared(mapping);
retry:
- page = find_lock_page(mapping, offset);
- if (!page) {
+ folio = filemap_lock_folio(mapping, offset);
+ if (IS_ERR(folio)) {
folio = folio_alloc(gfp | __GFP_ZERO, 0);
if (!folio) {
ret = VM_FAULT_OOM;
goto out;
}
- page = &folio->page;
- err = set_direct_map_invalid_noflush(page);
+ err = set_direct_map_invalid_noflush(folio_page(folio, 0));
if (err) {
folio_put(folio);
ret = vmf_error(err);
@@ -90,7 +88,7 @@ retry:
* already happened when we marked the page invalid
* which guarantees that this call won't fail
*/
- set_direct_map_default_noflush(page);
+ set_direct_map_default_noflush(folio_page(folio, 0));
if (err == -EEXIST)
goto retry;
@@ -98,11 +96,11 @@ retry:
goto out;
}
- addr = (unsigned long)page_address(page);
+ addr = (unsigned long)folio_address(folio);
flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
}
- vmf->page = page;
+ vmf->page = folio_file_page(folio, vmf->pgoff);
ret = VM_FAULT_LOCKED;
out:
@@ -120,18 +118,18 @@ static int secretmem_release(struct inode *inode, struct file *file)
return 0;
}
-static int secretmem_mmap(struct file *file, struct vm_area_struct *vma)
+static int secretmem_mmap_prepare(struct vm_area_desc *desc)
{
- unsigned long len = vma->vm_end - vma->vm_start;
+ const unsigned long len = desc->end - desc->start;
- if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0)
+ if ((desc->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0)
return -EINVAL;
- if (!mlock_future_ok(vma->vm_mm, vma->vm_flags | VM_LOCKED, len))
+ if (!mlock_future_ok(desc->mm, desc->vm_flags | VM_LOCKED, len))
return -EAGAIN;
- vm_flags_set(vma, VM_LOCKED | VM_DONTDUMP);
- vma->vm_ops = &secretmem_vm_ops;
+ desc->vm_flags |= VM_LOCKED | VM_DONTDUMP;
+ desc->vm_ops = &secretmem_vm_ops;
return 0;
}
@@ -143,7 +141,7 @@ bool vma_is_secretmem(struct vm_area_struct *vma)
static const struct file_operations secretmem_fops = {
.release = secretmem_release,
- .mmap = secretmem_mmap,
+ .mmap_prepare = secretmem_mmap_prepare,
};
static int secretmem_migrate_folio(struct address_space *mapping,
@@ -154,7 +152,7 @@ static int secretmem_migrate_folio(struct address_space *mapping,
static void secretmem_free_folio(struct folio *folio)
{
- set_direct_map_default_noflush(&folio->page);
+ set_direct_map_default_noflush(folio_page(folio, 0));
folio_zero_segment(folio, 0, folio_size(folio));
}
@@ -195,20 +193,13 @@ static struct file *secretmem_file_create(unsigned long flags)
struct file *file;
struct inode *inode;
const char *anon_name = "[secretmem]";
- int err;
- inode = alloc_anon_inode(secretmem_mnt->mnt_sb);
+ inode = anon_inode_make_secure_inode(secretmem_mnt->mnt_sb, anon_name, NULL);
if (IS_ERR(inode))
return ERR_CAST(inode);
- err = security_inode_init_security_anon(inode, &QSTR(anon_name), NULL);
- if (err) {
- file = ERR_PTR(err);
- goto err_free_inode;
- }
-
file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem",
- O_RDWR, &secretmem_fops);
+ O_RDWR | O_LARGEFILE, &secretmem_fops);
if (IS_ERR(file))
goto err_free_inode;
@@ -222,6 +213,8 @@ static struct file *secretmem_file_create(unsigned long flags)
inode->i_mode |= S_IFREG;
inode->i_size = 0;
+ atomic_inc(&secretmem_users);
+
return file;
err_free_inode:
@@ -255,9 +248,6 @@ SYSCALL_DEFINE1(memfd_secret, unsigned int, flags)
goto err_put_fd;
}
- file->f_flags |= O_LARGEFILE;
-
- atomic_inc(&secretmem_users);
fd_install(fd, file);
return fd;
@@ -268,7 +258,15 @@ err_put_fd:
static int secretmem_init_fs_context(struct fs_context *fc)
{
- return init_pseudo(fc, SECRETMEM_MAGIC) ? 0 : -ENOMEM;
+ struct pseudo_fs_context *ctx;
+
+ ctx = init_pseudo(fc, SECRETMEM_MAGIC);
+ if (!ctx)
+ return -ENOMEM;
+
+ fc->s_iflags |= SB_I_NOEXEC;
+ fc->s_iflags |= SB_I_NODEV;
+ return 0;
}
static struct file_system_type secretmem_fs = {
@@ -286,9 +284,6 @@ static int __init secretmem_init(void)
if (IS_ERR(secretmem_mnt))
return PTR_ERR(secretmem_mnt);
- /* prevent secretmem mappings from ever getting PROT_EXEC */
- secretmem_mnt->mnt_flags |= MNT_NOEXEC;
-
return 0;
}
fs_initcall(secretmem_init);
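The secretmem hunks above fold O_LARGEFILE and the secretmem_users increment into secretmem_file_create(), switch the file to .mmap_prepare, and keep the rule that mappings must be shared and fit within RLIMIT_MEMLOCK. From userspace this is memfd_secret(2); the sketch below shows the usage pattern those checks apply to. glibc has no wrapper, so the raw syscall is used, and the fallback syscall number is the x86-64 one; the kernel must support secretmem or the call fails with ENOSYS.

/*
 * Minimal userspace sketch of memfd_secret(2). The mapping must be
 * MAP_SHARED, counts against RLIMIT_MEMLOCK, and cannot be executable.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_memfd_secret
#define __NR_memfd_secret 447   /* x86-64 */
#endif

int main(void)
{
        size_t len = 4096;
        int fd = syscall(__NR_memfd_secret, 0);

        if (fd < 0) {
                perror("memfd_secret");
                return 1;
        }
        if (ftruncate(fd, len) < 0) {
                perror("ftruncate");
                return 1;
        }

        /*
         * MAP_SHARED is mandatory; the pages are locked and removed from
         * the kernel's direct map while this mapping exists.
         */
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }

        strcpy(p, "hidden from the direct map");
        printf("%s\n", p);

        munmap(p, len);
        close(fd);
        return 0;
}

If secretmem is disabled or unsupported, the syscall fails with ENOSYS; an overly small RLIMIT_MEMLOCK shows up as EAGAIN from mmap(), matching the mlock_future_ok() check in the hunk above.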
diff --git a/mm/shmem.c b/mm/shmem.c
index 99327c30507c..b9081b817d28 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -98,7 +98,7 @@ static struct vfsmount *shm_mnt __ro_after_init;
#define SHORT_SYMLINK_LEN 128
/*
- * shmem_fallocate communicates with shmem_fault or shmem_writepage via
+ * shmem_fallocate communicates with shmem_fault or shmem_writeout via
* inode->i_private (with i_rwsem making sure that it has only one user at
* a time): we would prefer not to enlarge the shmem inode just for that.
*/
@@ -107,7 +107,7 @@ struct shmem_falloc {
pgoff_t start; /* start of range currently being fallocated */
pgoff_t next; /* the next page offset to be fallocated */
pgoff_t nr_falloced; /* how many new pages have been fallocated */
- pgoff_t nr_unswapped; /* how often writepage refused to swap out */
+ pgoff_t nr_unswapped; /* how often writeout refused to swap out */
};
struct shmem_options {
@@ -275,24 +275,24 @@ static const struct vm_operations_struct shmem_vm_ops;
static const struct vm_operations_struct shmem_anon_vm_ops;
static struct file_system_type shmem_fs_type;
-bool shmem_mapping(struct address_space *mapping)
+bool shmem_mapping(const struct address_space *mapping)
{
return mapping->a_ops == &shmem_aops;
}
EXPORT_SYMBOL_GPL(shmem_mapping);
-bool vma_is_anon_shmem(struct vm_area_struct *vma)
+bool vma_is_anon_shmem(const struct vm_area_struct *vma)
{
return vma->vm_ops == &shmem_anon_vm_ops;
}
-bool vma_is_shmem(struct vm_area_struct *vma)
+bool vma_is_shmem(const struct vm_area_struct *vma)
{
return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops;
}
static LIST_HEAD(shmem_swaplist);
-static DEFINE_MUTEX(shmem_swaplist_mutex);
+static DEFINE_SPINLOCK(shmem_swaplist_lock);
#ifdef CONFIG_TMPFS_QUOTA
@@ -432,10 +432,13 @@ static void shmem_free_inode(struct super_block *sb, size_t freed_ispace)
*
* But normally info->alloced == inode->i_mapping->nrpages + info->swapped
* So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
+ *
+ * Return: true if swapped was incremented from 0, for shmem_writeout().
*/
-static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
+static bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
{
struct shmem_inode_info *info = SHMEM_I(inode);
+ bool first_swapped = false;
long freed;
spin_lock(&info->lock);
@@ -446,12 +449,15 @@ static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
/*
* Special case: whereas normally shmem_recalc_inode() is called
* after i_mapping->nrpages has already been adjusted (up or down),
- * shmem_writepage() has to raise swapped before nrpages is lowered -
+ * shmem_writeout() has to raise swapped before nrpages is lowered -
* to stop a racing shmem_recalc_inode() from thinking that a page has
* been freed. Compensate here, to avoid the need for a followup call.
*/
- if (swapped > 0)
+ if (swapped > 0) {
+ if (info->swapped == swapped)
+ first_swapped = true;
freed += swapped;
+ }
if (freed > 0)
info->alloced -= freed;
spin_unlock(&info->lock);
@@ -459,6 +465,7 @@ static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
/* The quota case may block */
if (freed > 0)
shmem_inode_unacct_blocks(inode, freed);
+ return first_swapped;
}
bool shmem_charge(struct inode *inode, long pages)
@@ -505,15 +512,27 @@ static int shmem_replace_entry(struct address_space *mapping,
/*
* Sometimes, before we decide whether to proceed or to fail, we must check
- * that an entry was not already brought back from swap by a racing thread.
+ * that an entry was not already brought back or split by a racing thread.
*
* Checking folio is not enough: by the time a swapcache folio is locked, it
* might be reused, and again be swapcache, using the same swap as before.
+ * Returns the swap entry's order if it is still present, else returns -1.
*/
-static bool shmem_confirm_swap(struct address_space *mapping,
- pgoff_t index, swp_entry_t swap)
+static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index,
+ swp_entry_t swap)
{
- return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
+ XA_STATE(xas, &mapping->i_pages, index);
+ int ret = -1;
+ void *entry;
+
+ rcu_read_lock();
+ do {
+ entry = xas_load(&xas);
+ if (entry == swp_to_radix_entry(swap))
+ ret = xas_get_order(&xas);
+ } while (xas_retry(&xas, entry));
+ rcu_read_unlock();
+ return ret;
}
/*
@@ -554,42 +573,6 @@ static bool shmem_confirm_swap(struct address_space *mapping,
static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
static int tmpfs_huge __read_mostly = SHMEM_HUGE_NEVER;
-/**
- * shmem_mapping_size_orders - Get allowable folio orders for the given file size.
- * @mapping: Target address_space.
- * @index: The page index.
- * @write_end: end of a write, could extend inode size.
- *
- * This returns huge orders for folios (when supported) based on the file size
- * which the mapping currently allows at the given index. The index is relevant
- * due to alignment considerations the mapping might have. The returned order
- * may be less than the size passed.
- *
- * Return: The orders.
- */
-static inline unsigned int
-shmem_mapping_size_orders(struct address_space *mapping, pgoff_t index, loff_t write_end)
-{
- unsigned int order;
- size_t size;
-
- if (!mapping_large_folio_support(mapping) || !write_end)
- return 0;
-
- /* Calculate the write size based on the write_end */
- size = write_end - (index << PAGE_SHIFT);
- order = filemap_get_order(size);
- if (!order)
- return 0;
-
- /* If we're not aligned, allocate a smaller folio */
- if (index & ((1UL << order) - 1))
- order = __ffs(index);
-
- order = min_t(size_t, order, MAX_PAGECACHE_ORDER);
- return order > 0 ? BIT(order + 1) - 1 : 0;
-}
-
static unsigned int shmem_get_orders_within_size(struct inode *inode,
unsigned long within_size_orders, pgoff_t index,
loff_t write_end)
@@ -615,7 +598,7 @@ static unsigned int shmem_get_orders_within_size(struct inode *inode,
static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
loff_t write_end, bool shmem_huge_force,
struct vm_area_struct *vma,
- unsigned long vm_flags)
+ vm_flags_t vm_flags)
{
unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ?
0 : BIT(HPAGE_PMD_ORDER);
@@ -636,22 +619,21 @@ static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index
* For tmpfs mmap()'s huge order, we still use PMD-sized order to
* allocate huge pages due to lack of a write size hint.
*
- * Otherwise, tmpfs will allow getting a highest order hint based on
- * the size of write and fallocate paths, then will try each allowable
- * huge orders.
+ * For tmpfs with 'huge=always' or 'huge=within_size' mount option,
+	 * we will always try the PMD-sized order first. If that fails, it will
+	 * fall back to smaller large folios.
*/
switch (SHMEM_SB(inode->i_sb)->huge) {
case SHMEM_HUGE_ALWAYS:
if (vma)
return maybe_pmd_order;
- return shmem_mapping_size_orders(inode->i_mapping, index, write_end);
+ return THP_ORDERS_ALL_FILE_DEFAULT;
case SHMEM_HUGE_WITHIN_SIZE:
if (vma)
within_size_orders = maybe_pmd_order;
else
- within_size_orders = shmem_mapping_size_orders(inode->i_mapping,
- index, write_end);
+ within_size_orders = THP_ORDERS_ALL_FILE_DEFAULT;
within_size_orders = shmem_get_orders_within_size(inode, within_size_orders,
index, write_end);
@@ -862,7 +844,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
loff_t write_end, bool shmem_huge_force,
struct vm_area_struct *vma,
- unsigned long vm_flags)
+ vm_flags_t vm_flags)
{
return 0;
}
@@ -884,7 +866,9 @@ static int shmem_add_to_page_cache(struct folio *folio,
pgoff_t index, void *expected, gfp_t gfp)
{
XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
- long nr = folio_nr_pages(folio);
+ unsigned long nr = folio_nr_pages(folio);
+ swp_entry_t iter, swap;
+ void *entry;
VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
@@ -896,14 +880,25 @@ static int shmem_add_to_page_cache(struct folio *folio,
gfp &= GFP_RECLAIM_MASK;
folio_throttle_swaprate(folio, gfp);
+ swap = radix_to_swp_entry(expected);
do {
+ iter = swap;
xas_lock_irq(&xas);
- if (expected != xas_find_conflict(&xas)) {
- xas_set_err(&xas, -EEXIST);
- goto unlock;
+ xas_for_each_conflict(&xas, entry) {
+ /*
+ * The range must either be empty, or filled with
+			 * expected swap entries. Shmem swap entries are never
+			 * partially freed without splitting both the entry and
+			 * the folio, so there shouldn't be any holes.
+ */
+ if (!expected || entry != swp_to_radix_entry(iter)) {
+ xas_set_err(&xas, -EEXIST);
+ goto unlock;
+ }
+ iter.val += 1 << xas_get_order(&xas);
}
- if (expected && xas_find_conflict(&xas)) {
+ if (expected && iter.val - nr != swap.val) {
xas_set_err(&xas, -EEXIST);
goto unlock;
}
@@ -974,15 +969,15 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
XA_STATE(xas, &mapping->i_pages, start);
- struct page *page;
+ struct folio *folio;
unsigned long swapped = 0;
unsigned long max = end - 1;
rcu_read_lock();
- xas_for_each(&xas, page, max) {
- if (xas_retry(&xas, page))
+ xas_for_each(&xas, folio, max) {
+ if (xas_retry(&xas, folio))
continue;
- if (xa_is_value(page))
+ if (xa_is_value(folio))
swapped += 1 << xas_get_order(&xas);
if (xas.xa_index == max)
break;
@@ -1375,11 +1370,11 @@ static void shmem_evict_inode(struct inode *inode)
/* Wait while shmem_unuse() is scanning this inode... */
wait_var_event(&info->stop_eviction,
!atomic_read(&info->stop_eviction));
- mutex_lock(&shmem_swaplist_mutex);
+ spin_lock(&shmem_swaplist_lock);
/* ...but beware of the race if we peeked too early */
if (!atomic_read(&info->stop_eviction))
list_del_init(&info->swaplist);
- mutex_unlock(&shmem_swaplist_mutex);
+ spin_unlock(&shmem_swaplist_lock);
}
}
@@ -1446,8 +1441,6 @@ static int shmem_unuse_swap_entries(struct inode *inode,
for (i = 0; i < folio_batch_count(fbatch); i++) {
struct folio *folio = fbatch->folios[i];
- if (!xa_is_value(folio))
- continue;
error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE,
mapping_gfp_mask(mapping), NULL, NULL);
if (error == 0) {
@@ -1504,7 +1497,8 @@ int shmem_unuse(unsigned int type)
if (list_empty(&shmem_swaplist))
return 0;
- mutex_lock(&shmem_swaplist_mutex);
+ spin_lock(&shmem_swaplist_lock);
+start_over:
list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
if (!info->swapped) {
list_del_init(&info->swaplist);
@@ -1517,31 +1511,38 @@ int shmem_unuse(unsigned int type)
* (igrab() would protect from unlink, but not from unmount).
*/
atomic_inc(&info->stop_eviction);
- mutex_unlock(&shmem_swaplist_mutex);
+ spin_unlock(&shmem_swaplist_lock);
error = shmem_unuse_inode(&info->vfs_inode, type);
cond_resched();
- mutex_lock(&shmem_swaplist_mutex);
- next = list_next_entry(info, swaplist);
- if (!info->swapped)
- list_del_init(&info->swaplist);
+ spin_lock(&shmem_swaplist_lock);
if (atomic_dec_and_test(&info->stop_eviction))
wake_up_var(&info->stop_eviction);
if (error)
break;
+ if (list_empty(&info->swaplist))
+ goto start_over;
+ next = list_next_entry(info, swaplist);
+ if (!info->swapped)
+ list_del_init(&info->swaplist);
}
- mutex_unlock(&shmem_swaplist_mutex);
+ spin_unlock(&shmem_swaplist_lock);
return error;
}
-/*
- * Move the page from the page cache to the swap cache.
+/**
+ * shmem_writeout - Write the folio to swap
+ * @folio: The folio to write
+ * @plug: swap plug
+ * @folio_list: list to put back folios on split
+ *
+ * Move the folio from the page cache to the swap cache.
*/
-static int shmem_writepage(struct page *page, struct writeback_control *wbc)
+int shmem_writeout(struct folio *folio, struct swap_iocb **plug,
+ struct list_head *folio_list)
{
- struct folio *folio = page_folio(page);
struct address_space *mapping = folio->mapping;
struct inode *inode = mapping->host;
struct shmem_inode_info *info = SHMEM_I(inode);
@@ -1550,16 +1551,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
int nr_pages;
bool split = false;
- /*
- * Our capabilities prevent regular writeback or sync from ever calling
- * shmem_writepage; but a stacking filesystem might use ->writepage of
- * its underlying filesystem, in which case tmpfs should write out to
- * swap only in response to memory pressure, and not for the writeback
- * threads or sync.
- */
- if (WARN_ON_ONCE(!wbc->for_reclaim))
- goto redirty;
-
if ((info->flags & VM_LOCKED) || sbinfo->noswap)
goto redirty;
@@ -1586,9 +1577,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
try_split:
/* Ensure the subpages are still dirty */
folio_test_set_dirty(folio);
- if (split_huge_page_to_list_to_order(page, wbc->list, 0))
+ if (split_folio_to_list(folio, folio_list))
goto redirty;
- folio = page_folio(page);
folio_clear_dirty(folio);
}
@@ -1627,39 +1617,66 @@ try_split:
folio_mark_uptodate(folio);
}
- /*
- * Add inode to shmem_unuse()'s list of swapped-out inodes,
- * if it's not already there. Do it now before the folio is
- * moved to swap cache, when its pagelock no longer protects
- * the inode from eviction. But don't unlock the mutex until
- * we've incremented swapped, because shmem_unuse_inode() will
- * prune a !swapped inode from the swaplist under this mutex.
- */
- mutex_lock(&shmem_swaplist_mutex);
- if (list_empty(&info->swaplist))
- list_add(&info->swaplist, &shmem_swaplist);
-
if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) {
- shmem_recalc_inode(inode, 0, nr_pages);
+ bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages);
+ int error;
+
+ /*
+ * Add inode to shmem_unuse()'s list of swapped-out inodes,
+ * if it's not already there. Do it now before the folio is
+ * removed from page cache, when its pagelock no longer
+ * protects the inode from eviction. And do it now, after
+ * we've incremented swapped, because shmem_unuse() will
+ * prune a !swapped inode from the swaplist.
+ */
+ if (first_swapped) {
+ spin_lock(&shmem_swaplist_lock);
+ if (list_empty(&info->swaplist))
+ list_add(&info->swaplist, &shmem_swaplist);
+ spin_unlock(&shmem_swaplist_lock);
+ }
+
swap_shmem_alloc(folio->swap, nr_pages);
shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap));
- mutex_unlock(&shmem_swaplist_mutex);
BUG_ON(folio_mapped(folio));
- return swap_writepage(&folio->page, wbc);
- }
+ error = swap_writeout(folio, plug);
+ if (error != AOP_WRITEPAGE_ACTIVATE) {
+ /* folio has been unlocked */
+ return error;
+ }
- list_del_init(&info->swaplist);
- mutex_unlock(&shmem_swaplist_mutex);
+ /*
+ * The intention here is to avoid holding on to the swap when
+ * zswap was unable to compress and unable to writeback; but
+ * it will be appropriate if other reactivate cases are added.
+ */
+ error = shmem_add_to_page_cache(folio, mapping, index,
+ swp_to_radix_entry(folio->swap),
+ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ /* Swap entry might be erased by racing shmem_free_swap() */
+ if (!error) {
+ shmem_recalc_inode(inode, 0, -nr_pages);
+ swap_free_nr(folio->swap, nr_pages);
+ }
+
+ /*
+ * The swap_cache_del_folio() below could be left for
+ * shrink_folio_list()'s folio_free_swap() to dispose of;
+ * but I'm a little nervous about letting this folio out of
+		 * shmem_writeout() in a hybrid half-tmpfs-half-swap state;
+		 * e.g. folio_mapping(folio) might give an unexpected answer.
+ */
+ swap_cache_del_folio(folio);
+ goto redirty;
+ }
if (nr_pages > 1)
goto try_split;
redirty:
folio_mark_dirty(folio);
- if (wbc->for_reclaim)
- return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */
- folio_unlock(folio);
- return 0;
+ return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */
}
+EXPORT_SYMBOL_GPL(shmem_writeout);
#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
@@ -1760,10 +1777,10 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
{
unsigned long mask = READ_ONCE(huge_shmem_orders_always);
unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size);
- unsigned long vm_flags = vma ? vma->vm_flags : 0;
+ vm_flags_t vm_flags = vma ? vma->vm_flags : 0;
unsigned int global_orders;
- if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags)))
+ if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags, shmem_huge_force)))
return 0;
global_orders = shmem_huge_global_enabled(inode, index, write_end,
@@ -1963,30 +1980,47 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode,
swp_entry_t entry, int order, gfp_t gfp)
{
struct shmem_inode_info *info = SHMEM_I(inode);
+ int nr_pages = 1 << order;
struct folio *new;
+ gfp_t alloc_gfp;
void *shadow;
- int nr_pages;
/*
* We have arrived here because our zones are constrained, so don't
* limit chance of success with further cpuset and node constraints.
*/
gfp &= ~GFP_CONSTRAINT_MASK;
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && order > 0) {
- gfp_t huge_gfp = vma_thp_gfp_mask(vma);
+ alloc_gfp = gfp;
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+ if (WARN_ON_ONCE(order))
+ return ERR_PTR(-EINVAL);
+ } else if (order) {
+ /*
+		 * If uffd is active for the vma, we need per-page fault
+		 * fidelity to maintain the uffd semantics, so fall back to
+		 * swapping in an order-0 folio; likewise for the zswap case.
+ * Any existing sub folio in the swap cache also blocks
+ * mTHP swapin.
+ */
+ if ((vma && unlikely(userfaultfd_armed(vma))) ||
+ !zswap_never_enabled() ||
+ non_swapcache_batch(entry, nr_pages) != nr_pages)
+ goto fallback;
- gfp = limit_gfp_mask(huge_gfp, gfp);
+ alloc_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp);
+ }
+retry:
+ new = shmem_alloc_folio(alloc_gfp, order, info, index);
+ if (!new) {
+ new = ERR_PTR(-ENOMEM);
+ goto fallback;
}
- new = shmem_alloc_folio(gfp, order, info, index);
- if (!new)
- return ERR_PTR(-ENOMEM);
-
- nr_pages = folio_nr_pages(new);
if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
- gfp, entry)) {
+ alloc_gfp, entry)) {
folio_put(new);
- return ERR_PTR(-ENOMEM);
+ new = ERR_PTR(-ENOMEM);
+ goto fallback;
}
/*
@@ -2001,7 +2035,9 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode,
*/
if (swapcache_prepare(entry, nr_pages)) {
folio_put(new);
- return ERR_PTR(-EEXIST);
+ new = ERR_PTR(-EEXIST);
+ /* Try smaller folio to avoid cache conflict */
+ goto fallback;
}
__folio_set_locked(new);
@@ -2009,12 +2045,21 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode,
new->swap = entry;
memcg1_swapin(entry, nr_pages);
- shadow = get_shadow_from_swap_cache(entry);
+ shadow = swap_cache_get_shadow(entry);
if (shadow)
workingset_refault(new, shadow);
folio_add_lru(new);
swap_read_folio(new, NULL);
return new;
+fallback:
+ /* Order 0 swapin failed, nothing to fallback to, abort */
+ if (!order)
+ return new;
+ entry.val += index - round_down(index, nr_pages);
+ alloc_gfp = gfp;
+ nr_pages = 1;
+ order = 0;
+ goto retry;
}
/*
@@ -2038,13 +2083,11 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index,
struct vm_area_struct *vma)
{
+ struct swap_cluster_info *ci;
struct folio *new, *old = *foliop;
swp_entry_t entry = old->swap;
- struct address_space *swap_mapping = swap_address_space(entry);
- pgoff_t swap_index = swap_cache_index(entry);
- XA_STATE(xas, &swap_mapping->i_pages, swap_index);
int nr_pages = folio_nr_pages(old);
- int error = 0, i;
+ int error = 0;
/*
* We have arrived here because our zones are constrained, so don't
@@ -2073,38 +2116,15 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
new->swap = entry;
folio_set_swapcache(new);
- /* Swap cache still stores N entries instead of a high-order entry */
- xa_lock_irq(&swap_mapping->i_pages);
- for (i = 0; i < nr_pages; i++) {
- void *item = xas_load(&xas);
+ ci = swap_cluster_get_and_lock_irq(old);
+ __swap_cache_replace_folio(ci, old, new);
+ mem_cgroup_replace_folio(old, new);
+ shmem_update_stats(new, nr_pages);
+ shmem_update_stats(old, -nr_pages);
+ swap_cluster_unlock_irq(ci);
- if (item != old) {
- error = -ENOENT;
- break;
- }
-
- xas_store(&xas, new);
- xas_next(&xas);
- }
- if (!error) {
- mem_cgroup_replace_folio(old, new);
- shmem_update_stats(new, nr_pages);
- shmem_update_stats(old, -nr_pages);
- }
- xa_unlock_irq(&swap_mapping->i_pages);
-
- if (unlikely(error)) {
- /*
- * Is this possible? I think not, now that our callers
- * check both the swapcache flag and folio->private
- * after getting the folio lock; but be defensive.
- * Reverse old to newpage for clear and free.
- */
- old = new;
- } else {
- folio_add_lru(new);
- *foliop = new;
- }
+ folio_add_lru(new);
+ *foliop = new;
folio_clear_swapcache(old);
old->private = NULL;
@@ -2138,7 +2158,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
nr_pages = folio_nr_pages(folio);
folio_wait_writeback(folio);
if (!skip_swapcache)
- delete_from_swap_cache(folio);
+ swap_cache_del_folio(folio);
/*
* Don't treat swapin error folio as alloced. Otherwise inode->i_blocks
* won't be 0 when inode is released and thus trigger WARN_ON(i_blocks)
@@ -2153,7 +2173,7 @@ static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
{
struct address_space *mapping = inode->i_mapping;
XA_STATE_ORDER(xas, &mapping->i_pages, index, 0);
- int split_order = 0, entry_order;
+ int split_order = 0;
int i;
/* Convert user data gfp flags to xarray node gfp flags */
@@ -2171,15 +2191,12 @@ static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
goto unlock;
}
- entry_order = xas_get_order(&xas);
-
- if (!entry_order)
+ cur_order = xas_get_order(&xas);
+ if (!cur_order)
goto unlock;
/* Try to split large swap entry in pagecache */
- cur_order = entry_order;
- swap_index = round_down(index, 1 << entry_order);
-
+ swap_index = round_down(index, 1 << cur_order);
split_order = xas_try_split_min_order(cur_order);
while (cur_order > 0) {
@@ -2220,7 +2237,7 @@ unlock:
if (xas_error(&xas))
return xas_error(&xas);
- return entry_order;
+ return 0;
}
/*
@@ -2237,127 +2254,111 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct address_space *mapping = inode->i_mapping;
struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
struct shmem_inode_info *info = SHMEM_I(inode);
+ swp_entry_t swap, index_entry;
struct swap_info_struct *si;
struct folio *folio = NULL;
bool skip_swapcache = false;
- swp_entry_t swap;
- int error, nr_pages, order, split_order;
+ int error, nr_pages, order;
+ pgoff_t offset;
VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
- swap = radix_to_swp_entry(*foliop);
+ index_entry = radix_to_swp_entry(*foliop);
+ swap = index_entry;
*foliop = NULL;
- if (is_poisoned_swp_entry(swap))
+ if (is_poisoned_swp_entry(index_entry))
return -EIO;
- si = get_swap_device(swap);
- if (!si) {
- if (!shmem_confirm_swap(mapping, index, swap))
+ si = get_swap_device(index_entry);
+ order = shmem_confirm_swap(mapping, index, index_entry);
+ if (unlikely(!si)) {
+ if (order < 0)
return -EEXIST;
else
return -EINVAL;
}
+ if (unlikely(order < 0)) {
+ put_swap_device(si);
+ return -EEXIST;
+ }
+
+	/* index may point to the middle of a large entry; get the sub-entry */
+ if (order) {
+ offset = index - round_down(index, 1 << order);
+ swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
+ }
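
The sub-entry computation above boils down to adding the page's distance from the start of the large entry to the swap offset. A minimal standalone sketch of the same arithmetic (plain C, illustrative only and not part of the patch; swp_entry_t here is a stand-in that ignores the swap type field packed into the real kernel type):

#include <assert.h>

#define round_down(x, y)	((x) & ~((unsigned long)(y) - 1))

typedef struct { unsigned long val; } swp_entry_t;	/* simplified stand-in */

static swp_entry_t shmem_sub_entry(swp_entry_t index_entry, unsigned long index,
				   unsigned int order)
{
	/* Distance of @index from the start of the order-@order entry. */
	unsigned long offset = index - round_down(index, 1UL << order);
	swp_entry_t swap = { .val = index_entry.val + offset };

	return swap;
}

int main(void)
{
	/* Order-2 entry covering indices 8..11, fault at index 10. */
	swp_entry_t base = { .val = 100 };

	assert(shmem_sub_entry(base, 10, 2).val == 102);	/* 100 + (10 - 8) */
	return 0;
}
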
/* Look it up and read it in.. */
- folio = swap_cache_get_folio(swap, NULL, 0);
- order = xa_get_order(&mapping->i_pages, index);
+ folio = swap_cache_get_folio(swap);
if (!folio) {
- bool fallback_order0 = false;
-
- /* Or update major stats only when swapin succeeds?? */
+ if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
+ /* Direct swapin skipping swap cache & readahead */
+ folio = shmem_swap_alloc_folio(inode, vma, index,
+ index_entry, order, gfp);
+ if (IS_ERR(folio)) {
+ error = PTR_ERR(folio);
+ folio = NULL;
+ goto failed;
+ }
+ skip_swapcache = true;
+ } else {
+ /* Cached swapin only supports order 0 folio */
+ folio = shmem_swapin_cluster(swap, gfp, info, index);
+ if (!folio) {
+ error = -ENOMEM;
+ goto failed;
+ }
+ }
if (fault_type) {
*fault_type |= VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(fault_mm, PGMAJFAULT);
}
+ } else {
+ swap_update_readahead(folio, NULL, 0);
+ }
+ if (order > folio_order(folio)) {
/*
- * If uffd is active for the vma, we need per-page fault
- * fidelity to maintain the uffd semantics, then fallback
- * to swapin order-0 folio, as well as for zswap case.
- */
- if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) ||
- !zswap_never_enabled()))
- fallback_order0 = true;
-
- /* Skip swapcache for synchronous device. */
- if (!fallback_order0 && data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
- folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp);
- if (!IS_ERR(folio)) {
- skip_swapcache = true;
- goto alloced;
- }
-
- /*
- * Fallback to swapin order-0 folio unless the swap entry
- * already exists.
- */
- error = PTR_ERR(folio);
- folio = NULL;
- if (error == -EEXIST)
- goto failed;
- }
-
- /*
- * Now swap device can only swap in order 0 folio, then we
- * should split the large swap entry stored in the pagecache
- * if necessary.
- */
- split_order = shmem_split_large_entry(inode, index, swap, gfp);
- if (split_order < 0) {
- error = split_order;
- goto failed;
- }
-
- /*
- * If the large swap entry has already been split, it is
- * necessary to recalculate the new swap entry based on
- * the old order alignment.
- */
- if (split_order > 0) {
- pgoff_t offset = index - round_down(index, 1 << split_order);
-
- swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
- }
-
- /* Here we actually start the io */
- folio = shmem_swapin_cluster(swap, gfp, info, index);
- if (!folio) {
- error = -ENOMEM;
- goto failed;
- }
- } else if (order != folio_order(folio)) {
- /*
- * Swap readahead may swap in order 0 folios into swapcache
+ * Swapin may get smaller folios due to various reasons:
+	 * It may fall back to order 0 due to memory pressure or race,
+	 * and swap readahead may swap in order 0 folios into swapcache
	 * asynchronously, while the shmem mapping can still store
* large swap entries. In such cases, we should split the
* large swap entry to prevent possible data corruption.
*/
- split_order = shmem_split_large_entry(inode, index, swap, gfp);
- if (split_order < 0) {
- error = split_order;
- goto failed;
- }
-
- /*
- * If the large swap entry has already been split, it is
- * necessary to recalculate the new swap entry based on
- * the old order alignment.
- */
- if (split_order > 0) {
- pgoff_t offset = index - round_down(index, 1 << split_order);
+ error = shmem_split_large_entry(inode, index, index_entry, gfp);
+ if (error)
+ goto failed_nolock;
+ }
- swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
- }
+ /*
+ * If the folio is large, round down swap and index by folio size.
+	 * No matter what race occurs, the swap layer ensures we either get
+	 * a valid folio that has its swap entry aligned by size, or a
+	 * temporarily invalid one, in which case we bail out soon after and retry.
+	 *
+	 * shmem_add_to_page_cache ensures the whole range contains the expected
+	 * entries and prevents any corruption, so a racing split is fine
+	 * too; it will succeed as long as the entries are still there.
+ */
+ nr_pages = folio_nr_pages(folio);
+ if (nr_pages > 1) {
+ swap.val = round_down(swap.val, nr_pages);
+ index = round_down(index, nr_pages);
}
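
A short worked example of the rounding above (illustrative only, not part of the patch): for a 4-page folio, both the swap value and the page cache index are aligned down by the folio size, so the folio is inserted at its natural, size-aligned position and the per-page entries line up again:

#include <assert.h>

#define round_down(x, y)	((x) & ~((unsigned long)(y) - 1))

int main(void)
{
	unsigned long nr_pages = 4;	/* folio_nr_pages(folio) */
	unsigned long index = 7;	/* faulting index inside the folio */
	unsigned long swap_val = 203;	/* swap entry value for that index */

	unsigned long aligned_index = round_down(index, nr_pages);	/* 4 */
	unsigned long aligned_swap = round_down(swap_val, nr_pages);	/* 200 */

	/*
	 * Because shmem stores size-aligned swap entries for large folios,
	 * index and swap value are congruent modulo nr_pages, so the same
	 * rounding keeps them pointing at the same folio-relative page.
	 */
	assert(index - aligned_index == swap_val - aligned_swap);
	return 0;
}
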
-alloced:
- /* We have to do this with folio locked to prevent races */
+ /*
+ * We have to do this with the folio locked to prevent races.
+	 * The shmem_confirm_swap below only checks whether the first swap
+	 * entry matches the folio; that's enough to ensure the folio
+ * is not used outside of shmem, as shmem swap entries
+ * and swap cache folios are never partially freed.
+ */
folio_lock(folio);
if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
- folio->swap.val != swap.val ||
- !shmem_confirm_swap(mapping, index, swap) ||
- xa_get_order(&mapping->i_pages, index) != folio_order(folio)) {
+ shmem_confirm_swap(mapping, index, swap) < 0 ||
+ folio->swap.val != swap.val) {
error = -EEXIST;
goto unlock;
}
@@ -2366,7 +2367,6 @@ alloced:
goto failed;
}
folio_wait_writeback(folio);
- nr_pages = folio_nr_pages(folio);
/*
* Some architectures may have to restore extra metadata to the
@@ -2380,8 +2380,7 @@ alloced:
goto failed;
}
- error = shmem_add_to_page_cache(folio, mapping,
- round_down(index, nr_pages),
+ error = shmem_add_to_page_cache(folio, mapping, index,
swp_to_radix_entry(swap), gfp);
if (error)
goto failed;
@@ -2395,7 +2394,7 @@ alloced:
folio->swap.val = 0;
swapcache_clear(si, swap, nr_pages);
} else {
- delete_from_swap_cache(folio);
+ swap_cache_del_folio(folio);
}
folio_mark_dirty(folio);
swap_free_nr(swap, nr_pages);
@@ -2404,18 +2403,19 @@ alloced:
*foliop = folio;
return 0;
failed:
- if (!shmem_confirm_swap(mapping, index, swap))
+ if (shmem_confirm_swap(mapping, index, swap) < 0)
error = -EEXIST;
if (error == -EIO)
shmem_set_folio_swapin_error(inode, index, folio, swap,
skip_swapcache);
unlock:
- if (skip_swapcache)
- swapcache_clear(si, swap, folio_nr_pages(folio));
- if (folio) {
+ if (folio)
folio_unlock(folio);
+failed_nolock:
+ if (skip_swapcache)
+ swapcache_clear(si, folio->swap, folio_nr_pages(folio));
+ if (folio)
folio_put(folio);
- }
put_swap_device(si);
return error;
@@ -3267,9 +3267,9 @@ static const struct inode_operations shmem_symlink_inode_operations;
static const struct inode_operations shmem_short_symlink_operations;
static int
-shmem_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+shmem_write_begin(const struct kiocb *iocb, struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
struct inode *inode = mapping->host;
struct shmem_inode_info *info = SHMEM_I(inode);
@@ -3301,9 +3301,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
}
static int
-shmem_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+shmem_write_end(const struct kiocb *iocb, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
@@ -3768,7 +3768,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
index--;
/*
- * Inform shmem_writepage() how far we have reached.
+ * Inform shmem_writeout() how far we have reached.
* No need for lock or barrier: we have the page lock.
*/
if (!folio_test_uptodate(folio))
@@ -4184,7 +4184,7 @@ static const char *shmem_get_link(struct dentry *dentry, struct inode *inode,
#ifdef CONFIG_TMPFS_XATTR
-static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+static int shmem_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
@@ -4194,7 +4194,7 @@ static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
}
static int shmem_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct shmem_inode_info *info = SHMEM_I(inode);
@@ -4981,7 +4981,6 @@ static void shmem_put_super(struct super_block *sb)
static const struct dentry_operations shmem_ci_dentry_ops = {
.d_hash = generic_ci_d_hash,
.d_compare = generic_ci_d_compare,
- .d_delete = always_delete_dentry,
};
#endif
@@ -5018,7 +5017,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_flags |= SB_NOUSER;
}
sb->s_export_op = &shmem_export_ops;
- sb->s_flags |= SB_NOSEC | SB_I_VERSION;
+ sb->s_flags |= SB_NOSEC;
#if IS_ENABLED(CONFIG_UNICODE)
if (!ctx->encoding && ctx->strict_encoding) {
@@ -5029,7 +5028,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
if (ctx->encoding) {
sb->s_encoding = ctx->encoding;
- sb->s_d_op = &shmem_ci_dentry_ops;
+ set_default_d_op(sb, &shmem_ci_dentry_ops);
if (ctx->strict_encoding)
sb->s_encoding_flags = SB_ENC_STRICT_MODE_FL;
}
@@ -5038,6 +5037,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
#else
sb->s_flags |= SB_NOUSER;
#endif /* CONFIG_TMPFS */
+ sb->s_d_flags |= DCACHE_DONTCACHE;
sbinfo->max_blocks = ctx->blocks;
sbinfo->max_inodes = ctx->inodes;
sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE;
@@ -5191,7 +5191,6 @@ static int shmem_error_remove_folio(struct address_space *mapping,
}
static const struct address_space_operations shmem_aops = {
- .writepage = shmem_writepage,
.dirty_folio = noop_dirty_folio,
#ifdef CONFIG_TMPFS
.write_begin = shmem_write_begin,
@@ -5278,7 +5277,7 @@ static const struct super_operations shmem_ops = {
.get_dquots = shmem_get_dquots,
#endif
.evict_inode = shmem_evict_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.put_super = shmem_put_super,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
.nr_cached_objects = shmem_unused_huge_count,
@@ -5322,6 +5321,9 @@ int shmem_init_fs_context(struct fs_context *fc)
fc->fs_private = ctx;
fc->ops = &shmem_fs_context_ops;
+#ifdef CONFIG_TMPFS
+ fc->sb_flags |= SB_I_VERSION;
+#endif
return 0;
}
@@ -5810,12 +5812,12 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
if (size < 0 || size > MAX_LFS_FILESIZE)
return ERR_PTR(-EINVAL);
- if (shmem_acct_size(flags, size))
- return ERR_PTR(-ENOMEM);
-
if (is_idmapped_mnt(mnt))
return ERR_PTR(-EINVAL);
+ if (shmem_acct_size(flags, size))
+ return ERR_PTR(-ENOMEM);
+
inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
S_IFREG | S_IRWXUGO, 0, flags);
if (IS_ERR(inode)) {
@@ -5926,8 +5928,8 @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping,
struct folio *folio;
int error;
- error = shmem_get_folio_gfp(inode, index, 0, &folio, SGP_CACHE,
- gfp, NULL, NULL);
+ error = shmem_get_folio_gfp(inode, index, i_size_read(inode),
+ &folio, SGP_CACHE, gfp, NULL, NULL);
if (error)
return ERR_PTR(error);
diff --git a/mm/show_mem.c b/mm/show_mem.c
index 6af13bcd2ab3..3a4b5207635d 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -94,26 +94,20 @@ void si_meminfo_node(struct sysinfo *val, int nid)
unsigned long free_highpages = 0;
pg_data_t *pgdat = NODE_DATA(nid);
- for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
- managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
- val->totalram = managed_pages;
- val->sharedram = node_page_state(pgdat, NR_SHMEM);
- val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
-#ifdef CONFIG_HIGHMEM
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
-
+ managed_pages += zone_managed_pages(zone);
if (is_highmem(zone)) {
managed_highpages += zone_managed_pages(zone);
free_highpages += zone_page_state(zone, NR_FREE_PAGES);
}
}
+
+ val->totalram = managed_pages;
+ val->sharedram = node_page_state(pgdat, NR_SHMEM);
+ val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
val->totalhigh = managed_highpages;
val->freehigh = free_highpages;
-#else
- val->totalhigh = managed_highpages;
- val->freehigh = free_highpages;
-#endif
val->mem_unit = PAGE_SIZE;
}
#endif
@@ -223,7 +217,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
global_node_page_state(NR_SHMEM),
global_node_page_state(NR_PAGETABLE),
global_node_page_state(NR_SECONDARY_PAGETABLE),
- global_zone_page_state(NR_BOUNCE),
+ 0UL,
global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
global_zone_page_state(NR_FREE_PAGES),
free_pcp,
@@ -252,7 +246,6 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
" shmem_pmdmapped:%lukB"
" anon_thp:%lukB"
#endif
- " writeback_tmp:%lukB"
" kernel_stack:%lukB"
#ifdef CONFIG_SHADOW_CALL_STACK
" shadow_call_stack:%lukB"
@@ -279,14 +272,14 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
K(node_page_state(pgdat, NR_ANON_THPS)),
#endif
- K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
node_page_state(pgdat, NR_KERNEL_STACK_KB),
#ifdef CONFIG_SHADOW_CALL_STACK
node_page_state(pgdat, NR_KERNEL_SCS_KB),
#endif
K(node_page_state(pgdat, NR_PAGETABLE)),
K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
- str_yes_no(pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES),
+ str_yes_no(atomic_read(&pgdat->kswapd_failures) >=
+ MAX_RECLAIM_RETRIES),
K(node_page_state(pgdat, NR_BALLOON_PAGES)));
}
@@ -311,12 +304,14 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
" low:%lukB"
" high:%lukB"
" reserved_highatomic:%luKB"
+ " free_highatomic:%luKB"
" active_anon:%lukB"
" inactive_anon:%lukB"
" active_file:%lukB"
" inactive_file:%lukB"
" unevictable:%lukB"
" writepending:%lukB"
+ " zspages:%lukB"
" present:%lukB"
" managed:%lukB"
" mlocked:%lukB"
@@ -332,16 +327,22 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
K(low_wmark_pages(zone)),
K(high_wmark_pages(zone)),
K(zone->nr_reserved_highatomic),
+ K(zone->nr_free_highatomic),
K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
+#if IS_ENABLED(CONFIG_ZSMALLOC)
+ K(zone_page_state(zone, NR_ZSPAGES)),
+#else
+ 0UL,
+#endif
K(zone->present_pages),
K(zone_managed_pages(zone)),
K(zone_page_state(zone, NR_MLOCK)),
- K(zone_page_state(zone, NR_BOUNCE)),
+ 0UL,
K(free_pcp),
K(this_cpu_read(zone->per_cpu_pageset->count)),
K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
@@ -425,13 +426,16 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
#endif
#ifdef CONFIG_MEM_ALLOC_PROFILING
- {
+ static DEFINE_SPINLOCK(mem_alloc_profiling_spinlock);
+
+ if (spin_trylock(&mem_alloc_profiling_spinlock)) {
struct codetag_bytes tags[10];
size_t i, nr;
nr = alloc_tag_top_users(tags, ARRAY_SIZE(tags), false);
if (nr) {
- pr_notice("Memory allocations:\n");
+ pr_notice("Memory allocations (profiling is currently turned %s):\n",
+ mem_alloc_profiling_enabled() ? "on" : "off");
for (i = 0; i < nr; i++) {
struct codetag *ct = tags[i].ct;
struct alloc_tag *tag = ct_to_alloc_tag(ct);
@@ -451,6 +455,7 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
ct->lineno, ct->function);
}
}
+ spin_unlock(&mem_alloc_profiling_spinlock);
}
#endif
}
diff --git a/mm/slab.h b/mm/slab.h
index 05a21dc796e0..078daecc7cf5 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -50,13 +50,17 @@ typedef union {
/* Reuses the bits in struct page */
struct slab {
- unsigned long __page_flags;
+ memdesc_flags_t flags;
struct kmem_cache *slab_cache;
union {
struct {
union {
struct list_head slab_list;
+ struct { /* For deferred deactivate_slab() */
+ struct llist_node llnode;
+ void *flush_freelist;
+ };
#ifdef CONFIG_SLUB_CPU_PARTIAL
struct {
struct slab *next;
@@ -99,7 +103,7 @@ struct slab {
#define SLAB_MATCH(pg, sl) \
static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl))
-SLAB_MATCH(flags, __page_flags);
+SLAB_MATCH(flags, flags);
SLAB_MATCH(compound_head, slab_cache); /* Ensure bit 0 is clear */
SLAB_MATCH(_refcount, __page_refcount);
#ifdef CONFIG_MEMCG
@@ -167,30 +171,6 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t)
*/
#define slab_page(s) folio_page(slab_folio(s), 0)
-/*
- * If network-based swap is enabled, sl*b must keep track of whether pages
- * were allocated from pfmemalloc reserves.
- */
-static inline bool slab_test_pfmemalloc(const struct slab *slab)
-{
- return folio_test_active(slab_folio(slab));
-}
-
-static inline void slab_set_pfmemalloc(struct slab *slab)
-{
- folio_set_active(slab_folio(slab));
-}
-
-static inline void slab_clear_pfmemalloc(struct slab *slab)
-{
- folio_clear_active(slab_folio(slab));
-}
-
-static inline void __slab_clear_pfmemalloc(struct slab *slab)
-{
- __folio_clear_active(slab_folio(slab));
-}
-
static inline void *slab_address(const struct slab *slab)
{
return folio_address(slab_folio(slab));
@@ -198,12 +178,12 @@ static inline void *slab_address(const struct slab *slab)
static inline int slab_nid(const struct slab *slab)
{
- return folio_nid(slab_folio(slab));
+ return memdesc_nid(slab->flags);
}
static inline pg_data_t *slab_pgdat(const struct slab *slab)
{
- return folio_pgdat(slab_folio(slab));
+ return NODE_DATA(slab_nid(slab));
}
static inline struct slab *virt_to_slab(const void *addr)
@@ -258,7 +238,9 @@ struct kmem_cache_order_objects {
struct kmem_cache {
#ifndef CONFIG_SLUB_TINY
struct kmem_cache_cpu __percpu *cpu_slab;
+ struct lock_class_key lock_key;
#endif
+ struct slub_percpu_sheaves __percpu *cpu_sheaves;
/* Used for retrieving partial slabs, etc. */
slab_flags_t flags;
unsigned long min_partial;
@@ -272,6 +254,7 @@ struct kmem_cache {
/* Number of per cpu partial slabs to keep around */
unsigned int cpu_partial_slabs;
#endif
+ unsigned int sheaf_capacity;
struct kmem_cache_order_objects oo;
/* Allocation and freeing of slabs */
@@ -457,6 +440,9 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s)
return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT));
}
+bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj);
+void flush_all_rcu_sheaves(void);
+
#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
SLAB_CACHE_DMA32 | SLAB_PANIC | \
SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \
@@ -550,8 +536,12 @@ static inline struct slabobj_ext *slab_obj_exts(struct slab *slab)
unsigned long obj_exts = READ_ONCE(slab->obj_exts);
#ifdef CONFIG_MEMCG
- VM_BUG_ON_PAGE(obj_exts && !(obj_exts & MEMCG_DATA_OBJEXTS),
- slab_page(slab));
+ /*
+	 * obj_exts should be either NULL, a valid pointer with the
+	 * MEMCG_DATA_OBJEXTS bit set, or equal to OBJEXTS_ALLOC_FAIL.
+ */
+ VM_BUG_ON_PAGE(obj_exts && !(obj_exts & MEMCG_DATA_OBJEXTS) &&
+ obj_exts != OBJEXTS_ALLOC_FAIL, slab_page(slab));
VM_BUG_ON_PAGE(obj_exts & MEMCG_DATA_KMEM, slab_page(slab));
#endif
return (struct slabobj_ext *)(obj_exts & ~OBJEXTS_FLAGS_MASK);
@@ -680,6 +670,8 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
void __check_heap_object(const void *ptr, unsigned long n,
const struct slab *slab, bool to_user);
+void defer_free_barrier(void);
+
static inline bool slub_debug_orig_size(struct kmem_cache *s)
{
return (kmem_cache_debug_flags(s, SLAB_STORE_USER) &&
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 5be257e03c7c..932d13ada36c 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -2,7 +2,7 @@
/*
* Slab allocator functions that are independent of the allocator strategy
*
- * (C) 2012 Christoph Lameter <cl@linux.com>
+ * (C) 2012 Christoph Lameter <cl@gentwo.org>
*/
#include <linux/slab.h>
@@ -163,6 +163,9 @@ int slab_unmergeable(struct kmem_cache *s)
return 1;
#endif
+ if (s->cpu_sheaves)
+ return 1;
+
/*
* We may have set a slab to be unmergeable during bootstrap.
*/
@@ -321,7 +324,7 @@ struct kmem_cache *__kmem_cache_create_args(const char *name,
object_size - args->usersize < args->useroffset))
args->usersize = args->useroffset = 0;
- if (!args->usersize)
+ if (!args->usersize && !args->sheaf_capacity)
s = __kmem_cache_alias(name, object_size, args->align, flags,
args->ctor);
if (s)
@@ -507,6 +510,9 @@ void kmem_cache_destroy(struct kmem_cache *s)
rcu_barrier();
}
+ /* Wait for deferred work from kmalloc/kfree_nolock() */
+ defer_free_barrier();
+
cpus_read_lock();
mutex_lock(&slab_mutex);
@@ -1605,6 +1611,30 @@ static void kfree_rcu_work(struct work_struct *work)
kvfree_rcu_list(head);
}
+static bool kfree_rcu_sheaf(void *obj)
+{
+ struct kmem_cache *s;
+ struct folio *folio;
+ struct slab *slab;
+
+ if (is_vmalloc_addr(obj))
+ return false;
+
+ folio = virt_to_folio(obj);
+ if (unlikely(!folio_test_slab(folio)))
+ return false;
+
+ slab = folio_slab(folio);
+ s = slab->slab_cache;
+ if (s->cpu_sheaves) {
+ if (likely(!IS_ENABLED(CONFIG_NUMA) ||
+ slab_nid(slab) == numa_mem_id()))
+ return __kfree_rcu_sheaf(s, obj);
+ }
+
+ return false;
+}
+
static bool
need_offload_krc(struct kfree_rcu_cpu *krcp)
{
@@ -1949,6 +1979,9 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
if (!head)
might_sleep();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && kfree_rcu_sheaf(ptr))
+ return;
+
// Queue the object but don't yet schedule the batch.
if (debug_rcu_head_queue(ptr)) {
// Probable double kfree_rcu(), just leak.
@@ -2023,6 +2056,8 @@ void kvfree_rcu_barrier(void)
bool queued;
int i, cpu;
+ flush_all_rcu_sheaves();
+
/*
* Firstly we detach objects and queue them over an RCU-batch
* for all CPUs. Finally queued works are flushed for each CPU.
diff --git a/mm/slub.c b/mm/slub.c
index be8b09e09d30..b1f15598fbfd 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -23,6 +23,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kasan.h>
+#include <linux/node.h>
#include <linux/kmsan.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
@@ -43,7 +44,8 @@
#include <kunit/test.h>
#include <kunit/test-bug.h>
#include <linux/sort.h>
-
+#include <linux/irq_work.h>
+#include <linux/kprobes.h>
#include <linux/debugfs.h>
#include <trace/events/kmem.h>
@@ -91,14 +93,14 @@
* The partially empty slabs cached on the CPU partial list are used
* for performance reasons, which speeds up the allocation process.
* These slabs are not frozen, but are also exempt from list management,
- * by clearing the PG_workingset flag when moving out of the node
+ * by clearing the SL_partial flag when moving out of the node
* partial list. Please see __slab_free() for more details.
*
* To sum up, the current scheme is:
- * - node partial slab: PG_Workingset && !frozen
- * - cpu partial slab: !PG_Workingset && !frozen
- * - cpu slab: !PG_Workingset && frozen
- * - full slab: !PG_Workingset && !frozen
+ * - node partial slab: SL_partial && !frozen
+ * - cpu partial slab: !SL_partial && !frozen
+ * - cpu slab: !SL_partial && frozen
+ * - full slab: !SL_partial && !frozen
*
* list_lock
*
@@ -183,6 +185,22 @@
* the fast path and disables lockless freelists.
*/
+/**
+ * enum slab_flags - How the slab flags bits are used.
+ * @SL_locked: Is locked with slab_lock()
+ * @SL_partial: On the per-node partial list
+ * @SL_pfmemalloc: Was allocated from PF_MEMALLOC reserves
+ *
+ * The slab flags share space with the page flags but some bits have
+ * different interpretations. The high bits are used for information
+ * like zone/node/section.
+ */
+enum slab_flags {
+ SL_locked = PG_locked,
+ SL_partial = PG_workingset, /* Historical reasons for this bit */
+ SL_pfmemalloc = PG_active, /* Historical reasons for this bit */
+};
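
Since the slab flags now alias the page flags word directly, the low state bits and the encoded node/zone/section fields must not step on each other. A tiny standalone illustration of that invariant (bit positions are made up here; the real ones come from the page flag layout):

#include <assert.h>

#define SL_PARTIAL	(1UL << 5)	/* hypothetical low state bit */
#define NODE_SHIFT	56		/* hypothetical node field position */
#define NODE_MASK	0xffUL

int main(void)
{
	unsigned long flags = 3UL << NODE_SHIFT;	/* slab on "node 3" */

	flags |= SL_PARTIAL;	/* like slab_set_node_partial() */
	assert(((flags >> NODE_SHIFT) & NODE_MASK) == 3);

	flags &= ~SL_PARTIAL;	/* like slab_clear_node_partial() */
	assert(((flags >> NODE_SHIFT) & NODE_MASK) == 3);
	return 0;
}
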
+
/*
* We could simply use migrate_disable()/enable() but as long as it's a
* function call even on !PREEMPT_RT, use inline preempt_disable() there.
@@ -346,8 +364,12 @@ static inline void debugfs_slab_add(struct kmem_cache *s) { }
#endif
enum stat_item {
+ ALLOC_PCS, /* Allocation from percpu sheaf */
ALLOC_FASTPATH, /* Allocation from cpu slab */
ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */
+ FREE_PCS, /* Free to percpu sheaf */
+ FREE_RCU_SHEAF, /* Free to rcu_free sheaf */
+ FREE_RCU_SHEAF_FAIL, /* Failed to free to a rcu_free sheaf */
FREE_FASTPATH, /* Free to cpu slab */
FREE_SLOWPATH, /* Freeing not to cpu slab */
FREE_FROZEN, /* Freeing to frozen slab */
@@ -372,6 +394,19 @@ enum stat_item {
CPU_PARTIAL_FREE, /* Refill cpu partial on free */
CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */
CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */
+ SHEAF_FLUSH, /* Objects flushed from a sheaf */
+ SHEAF_REFILL, /* Objects refilled to a sheaf */
+ SHEAF_ALLOC, /* Allocation of an empty sheaf */
+ SHEAF_FREE, /* Freeing of an empty sheaf */
+ BARN_GET, /* Got full sheaf from barn */
+ BARN_GET_FAIL, /* Failed to get full sheaf from barn */
+ BARN_PUT, /* Put full sheaf to barn */
+ BARN_PUT_FAIL, /* Failed to put full sheaf to barn */
+ SHEAF_PREFILL_FAST, /* Sheaf prefill grabbed the spare sheaf */
+ SHEAF_PREFILL_SLOW, /* Sheaf prefill found no spare sheaf */
+ SHEAF_PREFILL_OVERSIZE, /* Allocation of oversize sheaf for prefill */
+ SHEAF_RETURN_FAST, /* Sheaf return reattached spare sheaf */
+ SHEAF_RETURN_SLOW, /* Sheaf return could not reattach spare */
NR_SLUB_STAT_ITEMS
};
@@ -392,7 +427,7 @@ struct kmem_cache_cpu {
#ifdef CONFIG_SLUB_CPU_PARTIAL
struct slab *partial; /* Partially allocated slabs */
#endif
- local_lock_t lock; /* Protects the fields above */
+ local_trylock_t lock; /* Protects the fields above */
#ifdef CONFIG_SLUB_STATS
unsigned int stat[NR_SLUB_STAT_ITEMS];
#endif
@@ -418,6 +453,37 @@ void stat_add(const struct kmem_cache *s, enum stat_item si, int v)
#endif
}
+#define MAX_FULL_SHEAVES 10
+#define MAX_EMPTY_SHEAVES 10
+
+struct node_barn {
+ spinlock_t lock;
+ struct list_head sheaves_full;
+ struct list_head sheaves_empty;
+ unsigned int nr_full;
+ unsigned int nr_empty;
+};
+
+struct slab_sheaf {
+ union {
+ struct rcu_head rcu_head;
+ struct list_head barn_list;
+ /* only used for prefilled sheafs */
+ unsigned int capacity;
+ };
+ struct kmem_cache *cache;
+ unsigned int size;
+ int node; /* only used for rcu_sheaf */
+ void *objects[];
+};
+
+struct slub_percpu_sheaves {
+ local_trylock_t lock;
+ struct slab_sheaf *main; /* never NULL when unlocked */
+ struct slab_sheaf *spare; /* empty or full, may be NULL */
+ struct slab_sheaf *rcu_free; /* for batching kfree_rcu() */
+};
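
A simplified, single-threaded model of how the main and spare slots are meant to interact (illustrative only; the real SLUB code guards this with local_trylock, refills from the per-node barn, and handles rcu_free separately):

#include <assert.h>
#include <stddef.h>

#define CAPACITY 4

struct sheaf {
	unsigned int size;
	void *objects[CAPACITY];
};

struct pcs_model {
	struct sheaf *main;	/* never NULL, like pcs->main */
	struct sheaf *spare;	/* may be NULL, empty or full */
};

/* Allocation fast path: pop from main, else swap in a non-empty spare. */
static void *pcs_alloc(struct pcs_model *pcs)
{
	if (pcs->main->size)
		return pcs->main->objects[--pcs->main->size];

	if (pcs->spare && pcs->spare->size) {
		struct sheaf *tmp = pcs->main;

		pcs->main = pcs->spare;
		pcs->spare = tmp;
		return pcs->main->objects[--pcs->main->size];
	}

	return NULL;	/* SLUB would fall back to the barn or the slabs */
}

/* Free fast path: push to main while there is room. */
static int pcs_free(struct pcs_model *pcs, void *obj)
{
	if (pcs->main->size == CAPACITY)
		return -1;	/* SLUB would swap with spare or flush */

	pcs->main->objects[pcs->main->size++] = obj;
	return 0;
}

int main(void)
{
	int a, b;
	struct sheaf main_sheaf = { 0 }, spare_sheaf = { 0 };
	struct pcs_model pcs = { .main = &main_sheaf, .spare = &spare_sheaf };

	assert(pcs_free(&pcs, &a) == 0 && pcs_free(&pcs, &b) == 0);
	assert(pcs_alloc(&pcs) == &b && pcs_alloc(&pcs) == &a);
	assert(pcs_alloc(&pcs) == NULL);	/* both sheaves empty */
	return 0;
}
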
+
/*
* The slab lists for all objects.
*/
@@ -430,6 +496,7 @@ struct kmem_cache_node {
atomic_long_t total_objects;
struct list_head full;
#endif
+ struct node_barn *barn;
};
static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
@@ -438,6 +505,20 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
}
/*
+ * Get the barn of the current cpu's closest memory node. It may not exist on
+ * systems with memoryless nodes but without CONFIG_HAVE_MEMORYLESS_NODES.
+ */
+static inline struct node_barn *get_barn(struct kmem_cache *s)
+{
+ struct kmem_cache_node *n = get_node(s, numa_mem_id());
+
+ if (!n)
+ return NULL;
+
+ return n->barn;
+}
+
+/*
* Iterator over all nodes. The body will be executed for each node that has
* a kmem_cache_node structure allocated (which is true for all online nodes)
*/
@@ -447,18 +528,25 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
/*
* Tracks for which NUMA nodes we have kmem_cache_nodes allocated.
- * Corresponds to node_state[N_NORMAL_MEMORY], but can temporarily
+ * Corresponds to node_state[N_MEMORY], but can temporarily
* differ during memory hotplug/hotremove operations.
* Protected by slab_mutex.
*/
static nodemask_t slab_nodes;
-#ifndef CONFIG_SLUB_TINY
/*
* Workqueue used for flush_cpu_slab().
*/
static struct workqueue_struct *flushwq;
-#endif
+
+struct slub_flush_work {
+ struct work_struct work;
+ struct kmem_cache *s;
+ bool skip;
+};
+
+static DEFINE_MUTEX(flush_lock);
+static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
/********************************************************************
* Core slab cache functions
@@ -635,16 +723,35 @@ static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
#endif /* CONFIG_SLUB_CPU_PARTIAL */
/*
+ * If network-based swap is enabled, slub must keep track of whether memory
+ * was allocated from pfmemalloc reserves.
+ */
+static inline bool slab_test_pfmemalloc(const struct slab *slab)
+{
+ return test_bit(SL_pfmemalloc, &slab->flags.f);
+}
+
+static inline void slab_set_pfmemalloc(struct slab *slab)
+{
+ set_bit(SL_pfmemalloc, &slab->flags.f);
+}
+
+static inline void __slab_clear_pfmemalloc(struct slab *slab)
+{
+ __clear_bit(SL_pfmemalloc, &slab->flags.f);
+}
+
+/*
* Per slab locking using the pagelock
*/
static __always_inline void slab_lock(struct slab *slab)
{
- bit_spin_lock(PG_locked, &slab->__page_flags);
+ bit_spin_lock(SL_locked, &slab->flags.f);
}
static __always_inline void slab_unlock(struct slab *slab)
{
- bit_spin_unlock(PG_locked, &slab->__page_flags);
+ bit_spin_unlock(SL_locked, &slab->flags.f);
}
static inline bool
@@ -786,6 +893,16 @@ static inline unsigned int get_orig_size(struct kmem_cache *s, void *object)
}
#ifdef CONFIG_SLUB_DEBUG
+
+/*
+ * For debugging context when we want to check if the struct slab pointer
+ * appears to be valid.
+ */
+static inline bool validate_slab_ptr(struct slab *slab)
+{
+ return PageSlab(slab_page(slab));
+}
+
static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
static DEFINE_SPINLOCK(object_map_lock);
@@ -926,19 +1043,19 @@ static struct track *get_track(struct kmem_cache *s, void *object,
}
#ifdef CONFIG_STACKDEPOT
-static noinline depot_stack_handle_t set_track_prepare(void)
+static noinline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
{
depot_stack_handle_t handle;
unsigned long entries[TRACK_ADDRS_COUNT];
unsigned int nr_entries;
nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
- handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT);
+ handle = stack_depot_save(entries, nr_entries, gfp_flags);
return handle;
}
#else
-static inline depot_stack_handle_t set_track_prepare(void)
+static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
{
return 0;
}
@@ -960,9 +1077,9 @@ static void set_track_update(struct kmem_cache *s, void *object,
}
static __always_inline void set_track(struct kmem_cache *s, void *object,
- enum track_item alloc, unsigned long addr)
+ enum track_item alloc, unsigned long addr, gfp_t gfp_flags)
{
- depot_stack_handle_t handle = set_track_prepare();
+ depot_stack_handle_t handle = set_track_prepare(gfp_flags);
set_track_update(s, object, alloc, addr, handle);
}
@@ -1010,7 +1127,7 @@ static void print_slab_info(const struct slab *slab)
{
pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n",
slab, slab->objects, slab->inuse, slab->freelist,
- &slab->__page_flags);
+ &slab->flags.f);
}
void skip_orig_size_check(struct kmem_cache *s, const void *object)
@@ -1104,7 +1221,12 @@ static void object_err(struct kmem_cache *s, struct slab *slab,
return;
slab_bug(s, reason);
- print_trailer(s, slab, object);
+ if (!object || !check_valid_pointer(s, slab, object)) {
+ print_slab_info(slab);
+ pr_err("Invalid pointer 0x%p\n", object);
+ } else {
+ print_trailer(s, slab, object);
+ }
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
WARN_ON(1);
@@ -1408,15 +1530,15 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
return ret;
}
+/*
+ * Checks if the slab state looks sane. Assumes the struct slab pointer
+ * was either obtained in a way that ensures it's valid, or validated
+ * by validate_slab_ptr()
+ */
static int check_slab(struct kmem_cache *s, struct slab *slab)
{
int maxobj;
- if (!folio_test_slab(slab_folio(slab))) {
- slab_err(s, slab, "Not a valid slab page");
- return 0;
- }
-
maxobj = order_objects(slab_order(slab), s->size);
if (slab->objects > maxobj) {
slab_err(s, slab, "objects %u > max %u",
@@ -1612,17 +1734,15 @@ static noinline bool alloc_debug_processing(struct kmem_cache *s,
return true;
bad:
- if (folio_test_slab(slab_folio(slab))) {
- /*
- * If this is a slab page then lets do the best we can
- * to avoid issues in the future. Marking all objects
- * as used avoids touching the remaining objects.
- */
- slab_fix(s, "Marking all objects used");
- slab->inuse = slab->objects;
- slab->freelist = NULL;
- slab->frozen = 1; /* mark consistency-failed slab as frozen */
- }
+ /*
+ * Let's do the best we can to avoid issues in the future. Marking all
+ * objects as used avoids touching the remaining objects.
+ */
+ slab_fix(s, "Marking all objects used");
+ slab->inuse = slab->objects;
+ slab->freelist = NULL;
+ slab->frozen = 1; /* mark consistency-failed slab as frozen */
+
return false;
}
@@ -1643,10 +1763,7 @@ static inline int free_consistency_checks(struct kmem_cache *s,
return 0;
if (unlikely(s != slab->slab_cache)) {
- if (!folio_test_slab(slab_folio(slab))) {
- slab_err(s, slab, "Attempt to free object(0x%p) outside of slab",
- object);
- } else if (!slab->slab_cache) {
+ if (!slab->slab_cache) {
slab_err(NULL, slab, "No slab cache for object 0x%p",
object);
} else {
@@ -1885,9 +2002,9 @@ static inline bool free_debug_processing(struct kmem_cache *s,
static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {}
static inline int check_object(struct kmem_cache *s, struct slab *slab,
void *object, u8 val) { return 1; }
-static inline depot_stack_handle_t set_track_prepare(void) { return 0; }
+static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) { return 0; }
static inline void set_track(struct kmem_cache *s, void *object,
- enum track_item alloc, unsigned long addr) {}
+ enum track_item alloc, unsigned long addr, gfp_t gfp_flags) {}
static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
struct slab *slab) {}
static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
@@ -1948,7 +2065,7 @@ static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
* objects with no tag reference. Mark all references in this
* vector as empty to avoid warnings later on.
*/
- if (obj_exts & OBJEXTS_ALLOC_FAIL) {
+ if (obj_exts == OBJEXTS_ALLOC_FAIL) {
unsigned int i;
for (i = 0; i < objects; i++)
@@ -1981,6 +2098,7 @@ static inline void init_slab_obj_exts(struct slab *slab)
int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
gfp_t gfp, bool new_slab)
{
+ bool allow_spin = gfpflags_allow_spinning(gfp);
unsigned int objects = objs_per_slab(s, slab);
unsigned long new_exts;
unsigned long old_exts;
@@ -1989,17 +2107,32 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
gfp &= ~OBJCGS_CLEAR_MASK;
/* Prevent recursive extension vector allocation */
gfp |= __GFP_NO_OBJ_EXT;
- vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp,
- slab_nid(slab));
+
+ /*
+	 * Note that allow_spin may be false during early boot due to the
+	 * restricted GFP_BOOT_MASK. Since kmalloc_nolock() only supports
+ * architectures with cmpxchg16b, early obj_exts will be missing for
+ * very early allocations on those.
+ */
+ if (unlikely(!allow_spin)) {
+ size_t sz = objects * sizeof(struct slabobj_ext);
+
+ vec = kmalloc_nolock(sz, __GFP_ZERO | __GFP_NO_OBJ_EXT,
+ slab_nid(slab));
+ } else {
+ vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp,
+ slab_nid(slab));
+ }
if (!vec) {
/* Mark vectors which failed to allocate */
- if (new_slab)
- mark_failed_objexts_alloc(slab);
+ mark_failed_objexts_alloc(slab);
return -ENOMEM;
}
new_exts = (unsigned long)vec;
+ if (unlikely(!allow_spin))
+ new_exts |= OBJEXTS_NOSPIN_ALLOC;
#ifdef CONFIG_MEMCG
new_exts |= MEMCG_DATA_OBJEXTS;
#endif
@@ -2020,11 +2153,15 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
* objcg vector should be reused.
*/
mark_objexts_empty(vec);
- kfree(vec);
+ if (unlikely(!allow_spin))
+ kfree_nolock(vec);
+ else
+ kfree(vec);
return 0;
}
- kmemleak_not_leak(vec);
+ if (allow_spin)
+ kmemleak_not_leak(vec);
return 0;
}
@@ -2044,7 +2181,10 @@ static inline void free_slab_obj_exts(struct slab *slab)
* the extension for obj_exts is expected to be NULL.
*/
mark_objexts_empty(obj_exts);
- kfree(obj_exts);
+ if (unlikely(READ_ONCE(slab->obj_exts) & OBJEXTS_NOSPIN_ALLOC))
+ kfree_nolock(obj_exts);
+ else
+ kfree(obj_exts);
slab->obj_exts = 0;
}
@@ -2073,21 +2213,13 @@ prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p)
{
struct slab *slab;
- if (!p)
- return NULL;
-
- if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
- return NULL;
-
- if (flags & __GFP_NO_OBJ_EXT)
- return NULL;
-
slab = virt_to_slab(p);
if (!slab_obj_exts(slab) &&
- WARN(alloc_slab_obj_exts(slab, s, flags, false),
- "%s, %s: Failed to create slab extension vector!\n",
- __func__, s->name))
+ alloc_slab_obj_exts(slab, s, flags, false)) {
+ pr_warn_once("%s, %s: Failed to create slab extension vector!\n",
+ __func__, s->name);
return NULL;
+ }
return slab_obj_exts(slab) + obj_to_index(s, slab, p);
}
@@ -2098,6 +2230,15 @@ __alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
{
struct slabobj_ext *obj_exts;
+ if (!object)
+ return;
+
+ if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
+ return;
+
+ if (flags & __GFP_NO_OBJ_EXT)
+ return;
+
obj_exts = prepare_slab_obj_exts_hook(s, flags, object);
/*
* Currently obj_exts is used only for allocation profiling.
@@ -2106,6 +2247,8 @@ __alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
*/
if (likely(obj_exts))
alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
+ else
+ alloc_tag_set_inaccurate(current->alloc_tag);
}
static inline void
@@ -2377,7 +2520,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init,
}
/* KASAN might put x into memory quarantine, delaying its reuse. */
- return !kasan_slab_free(s, x, init, still_accessible);
+ return !kasan_slab_free(s, x, init, still_accessible, false);
}
static __fastpath_inline
@@ -2436,17 +2579,463 @@ static void *setup_object(struct kmem_cache *s, void *object)
return object;
}
+static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp)
+{
+ struct slab_sheaf *sheaf = kzalloc(struct_size(sheaf, objects,
+ s->sheaf_capacity), gfp);
+
+ if (unlikely(!sheaf))
+ return NULL;
+
+ sheaf->cache = s;
+
+ stat(s, SHEAF_ALLOC);
+
+ return sheaf;
+}
+
+static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf)
+{
+ kfree(sheaf);
+
+ stat(s, SHEAF_FREE);
+}
+
+static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
+ size_t size, void **p);
+
+
+static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
+ gfp_t gfp)
+{
+ int to_fill = s->sheaf_capacity - sheaf->size;
+ int filled;
+
+ if (!to_fill)
+ return 0;
+
+ filled = __kmem_cache_alloc_bulk(s, gfp, to_fill,
+ &sheaf->objects[sheaf->size]);
+
+ sheaf->size += filled;
+
+ stat_add(s, SHEAF_REFILL, filled);
+
+ if (filled < to_fill)
+ return -ENOMEM;
+
+ return 0;
+}
+
+
+static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp)
+{
+ struct slab_sheaf *sheaf = alloc_empty_sheaf(s, gfp);
+
+ if (!sheaf)
+ return NULL;
+
+ if (refill_sheaf(s, sheaf, gfp)) {
+ free_empty_sheaf(s, sheaf);
+ return NULL;
+ }
+
+ return sheaf;
+}
+
+/*
+ * Maximum number of objects freed during a single flush of main pcs sheaf.
+ * Translates directly to an on-stack array size.
+ */
+#define PCS_BATCH_MAX 32U
+
+static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p);
+
+/*
+ * Free all objects from the main sheaf. In order to perform
+ * __kmem_cache_free_bulk() outside of cpu_sheaves->lock, work in batches where
+ * object pointers are moved to an on-stack array under the lock. To bound the
+ * stack usage, limit each batch to PCS_BATCH_MAX.
+ *
+ * returns true if at least partially flushed
+ */
+static bool sheaf_flush_main(struct kmem_cache *s)
+{
+ struct slub_percpu_sheaves *pcs;
+ unsigned int batch, remaining;
+ void *objects[PCS_BATCH_MAX];
+ struct slab_sheaf *sheaf;
+ bool ret = false;
+
+next_batch:
+ if (!local_trylock(&s->cpu_sheaves->lock))
+ return ret;
+
+ pcs = this_cpu_ptr(s->cpu_sheaves);
+ sheaf = pcs->main;
+
+ batch = min(PCS_BATCH_MAX, sheaf->size);
+
+ sheaf->size -= batch;
+ memcpy(objects, sheaf->objects + sheaf->size, batch * sizeof(void *));
+
+ remaining = sheaf->size;
+
+ local_unlock(&s->cpu_sheaves->lock);
+
+ __kmem_cache_free_bulk(s, batch, &objects[0]);
+
+ stat_add(s, SHEAF_FLUSH, batch);
+
+ ret = true;
+
+ if (remaining)
+ goto next_batch;
+
+ return ret;
+}
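
As a back-of-the-envelope check (not in the patch), PCS_BATCH_MAX keeps the on-stack copy at 32 pointers, i.e. 256 bytes on a 64-bit build, which is why the flush works in batches rather than copying a whole sheaf at once:

#include <assert.h>	/* static_assert (C11) */

#define PCS_BATCH_MAX 32U

static_assert(PCS_BATCH_MAX * sizeof(void *) <= 256,
	      "per-batch on-stack array must stay small");
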
+
+/*
+ * Free all objects from a sheaf that's unused, i.e. not linked to any
+ * cpu_sheaves, so no locking or batching is needed. The locking is also not
+ * necessary when flushing a cpu's sheaves (both spare and main) during cpu
+ * hotremove as the cpu is not executing anymore.
+ */
+static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf)
+{
+ if (!sheaf->size)
+ return;
+
+ stat_add(s, SHEAF_FLUSH, sheaf->size);
+
+ __kmem_cache_free_bulk(s, sheaf->size, &sheaf->objects[0]);
+
+ sheaf->size = 0;
+}
+
+static void __rcu_free_sheaf_prepare(struct kmem_cache *s,
+ struct slab_sheaf *sheaf)
+{
+ bool init = slab_want_init_on_free(s);
+ void **p = &sheaf->objects[0];
+ unsigned int i = 0;
+
+ while (i < sheaf->size) {
+ struct slab *slab = virt_to_slab(p[i]);
+
+ memcg_slab_free_hook(s, slab, p + i, 1);
+ alloc_tagging_slab_free_hook(s, slab, p + i, 1);
+
+ if (unlikely(!slab_free_hook(s, p[i], init, true))) {
+ p[i] = p[--sheaf->size];
+ continue;
+ }
+
+ i++;
+ }
+}
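
The p[i] = p[--sheaf->size] step above is the usual swap-with-last removal for an unordered array: objects rejected by slab_free_hook() (for example when KASAN quarantines them) are dropped from the sheaf in O(1), and i is only advanced when the current slot is kept, so the element moved into it is still inspected. A standalone sketch of the pattern (illustrative only):

#include <assert.h>

/* Remove the element at @i from an unordered array; returns the new size. */
static unsigned int remove_unordered(int *arr, unsigned int size, unsigned int i)
{
	arr[i] = arr[--size];	/* order is not preserved, and that's fine */
	return size;
}

int main(void)
{
	int objs[] = { 10, 20, 30, 40 };
	unsigned int size = 4;

	size = remove_unordered(objs, size, 1);
	assert(size == 3 && objs[1] == 40);
	return 0;
}
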
+
+static void rcu_free_sheaf_nobarn(struct rcu_head *head)
+{
+ struct slab_sheaf *sheaf;
+ struct kmem_cache *s;
+
+ sheaf = container_of(head, struct slab_sheaf, rcu_head);
+ s = sheaf->cache;
+
+ __rcu_free_sheaf_prepare(s, sheaf);
+
+ sheaf_flush_unused(s, sheaf);
+
+ free_empty_sheaf(s, sheaf);
+}
+
+/*
+ * The caller needs to make sure migration is disabled in order to fully flush
+ * a single cpu's sheaves.
+ *
+ * must not be called from an irq
+ *
+ * flushing operations are rare so let's keep it simple and flush to slabs
+ * directly, skipping the barn
+ */
+static void pcs_flush_all(struct kmem_cache *s)
+{
+ struct slub_percpu_sheaves *pcs;
+ struct slab_sheaf *spare, *rcu_free;
+
+ local_lock(&s->cpu_sheaves->lock);
+ pcs = this_cpu_ptr(s->cpu_sheaves);
+
+ spare = pcs->spare;
+ pcs->spare = NULL;
+
+ rcu_free = pcs->rcu_free;
+ pcs->rcu_free = NULL;
+
+ local_unlock(&s->cpu_sheaves->lock);
+
+ if (spare) {
+ sheaf_flush_unused(s, spare);
+ free_empty_sheaf(s, spare);
+ }
+
+ if (rcu_free)
+ call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
+
+ sheaf_flush_main(s);
+}
+
+static void __pcs_flush_all_cpu(struct kmem_cache *s, unsigned int cpu)
+{
+ struct slub_percpu_sheaves *pcs;
+
+ pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
+
+ /* The cpu is not executing anymore so we don't need pcs->lock */
+ sheaf_flush_unused(s, pcs->main);
+ if (pcs->spare) {
+ sheaf_flush_unused(s, pcs->spare);
+ free_empty_sheaf(s, pcs->spare);
+ pcs->spare = NULL;
+ }
+
+ if (pcs->rcu_free) {
+ call_rcu(&pcs->rcu_free->rcu_head, rcu_free_sheaf_nobarn);
+ pcs->rcu_free = NULL;
+ }
+}
+
+static void pcs_destroy(struct kmem_cache *s)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct slub_percpu_sheaves *pcs;
+
+ pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
+
+ /* can happen when unwinding failed create */
+ if (!pcs->main)
+ continue;
+
+ /*
+ * We have already passed __kmem_cache_shutdown() so everything
+ * was flushed and there should be no objects allocated from
+ * slabs, otherwise kmem_cache_destroy() would have aborted.
+ * Therefore something would have to be really wrong if the
+ * warnings here trigger, and we should rather leave objects and
+ * sheaves to leak in that case.
+ */
+
+ WARN_ON(pcs->spare);
+ WARN_ON(pcs->rcu_free);
+
+ if (!WARN_ON(pcs->main->size)) {
+ free_empty_sheaf(s, pcs->main);
+ pcs->main = NULL;
+ }
+ }
+
+ free_percpu(s->cpu_sheaves);
+ s->cpu_sheaves = NULL;
+}
+
+static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn)
+{
+ struct slab_sheaf *empty = NULL;
+ unsigned long flags;
+
+ if (!data_race(barn->nr_empty))
+ return NULL;
+
+ spin_lock_irqsave(&barn->lock, flags);
+
+ if (likely(barn->nr_empty)) {
+ empty = list_first_entry(&barn->sheaves_empty,
+ struct slab_sheaf, barn_list);
+ list_del(&empty->barn_list);
+ barn->nr_empty--;
+ }
+
+ spin_unlock_irqrestore(&barn->lock, flags);
+
+ return empty;
+}
+
+/*
+ * The following two functions are used mainly in cases where we have to undo an
+ * intended action due to a race or cpu migration. Thus they do not check the
+ * empty or full sheaf limits for simplicity.
+ */
+
+static void barn_put_empty_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&barn->lock, flags);
+
+ list_add(&sheaf->barn_list, &barn->sheaves_empty);
+ barn->nr_empty++;
+
+ spin_unlock_irqrestore(&barn->lock, flags);
+}
+
+static void barn_put_full_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&barn->lock, flags);
+
+ list_add(&sheaf->barn_list, &barn->sheaves_full);
+ barn->nr_full++;
+
+ spin_unlock_irqrestore(&barn->lock, flags);
+}
+
+static struct slab_sheaf *barn_get_full_or_empty_sheaf(struct node_barn *barn)
+{
+ struct slab_sheaf *sheaf = NULL;
+ unsigned long flags;
+
+ if (!data_race(barn->nr_full) && !data_race(barn->nr_empty))
+ return NULL;
+
+ spin_lock_irqsave(&barn->lock, flags);
+
+ if (barn->nr_full) {
+ sheaf = list_first_entry(&barn->sheaves_full, struct slab_sheaf,
+ barn_list);
+ list_del(&sheaf->barn_list);
+ barn->nr_full--;
+ } else if (barn->nr_empty) {
+ sheaf = list_first_entry(&barn->sheaves_empty,
+ struct slab_sheaf, barn_list);
+ list_del(&sheaf->barn_list);
+ barn->nr_empty--;
+ }
+
+ spin_unlock_irqrestore(&barn->lock, flags);
+
+ return sheaf;
+}
+
+/*
+ * If a full sheaf is available, return it and put the supplied empty one to
+ * barn. We ignore the limit on empty sheaves as the number of sheaves doesn't
+ * change.
+ */
+static struct slab_sheaf *
+barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty)
+{
+ struct slab_sheaf *full = NULL;
+ unsigned long flags;
+
+ if (!data_race(barn->nr_full))
+ return NULL;
+
+ spin_lock_irqsave(&barn->lock, flags);
+
+ if (likely(barn->nr_full)) {
+ full = list_first_entry(&barn->sheaves_full, struct slab_sheaf,
+ barn_list);
+ list_del(&full->barn_list);
+ list_add(&empty->barn_list, &barn->sheaves_empty);
+ barn->nr_full--;
+ barn->nr_empty++;
+ }
+
+ spin_unlock_irqrestore(&barn->lock, flags);
+
+ return full;
+}
+
+/*
+ * If an empty sheaf is available, return it and put the supplied full one to
+ * barn. But if there are too many full sheaves, reject this with -E2BIG.
+ */
+static struct slab_sheaf *
+barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full)
+{
+ struct slab_sheaf *empty;
+ unsigned long flags;
+
+ /* we don't repeat this check under barn->lock as it's not critical */
+ if (data_race(barn->nr_full) >= MAX_FULL_SHEAVES)
+ return ERR_PTR(-E2BIG);
+ if (!data_race(barn->nr_empty))
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock_irqsave(&barn->lock, flags);
+
+ if (likely(barn->nr_empty)) {
+ empty = list_first_entry(&barn->sheaves_empty, struct slab_sheaf,
+ barn_list);
+ list_del(&empty->barn_list);
+ list_add(&full->barn_list, &barn->sheaves_full);
+ barn->nr_empty--;
+ barn->nr_full++;
+ } else {
+ empty = ERR_PTR(-ENOMEM);
+ }
+
+ spin_unlock_irqrestore(&barn->lock, flags);
+
+ return empty;
+}
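
These barn helpers all follow the same pattern: an unlocked data_race() peek at the counters to skip the spinlock entirely in the common empty case, then a re-check under the lock before touching the lists. A userspace sketch of that pattern (pthread mutex and C11 atomics standing in for the kernel primitives; illustrative only):

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>

struct barn_model {
	pthread_mutex_t lock;
	_Atomic unsigned int nr_full;	/* peeked without the lock */
	void *full[16];
};

static void *get_full(struct barn_model *b)
{
	void *sheaf = NULL;

	/* Optimistic unlocked peek; a stale value only costs a retry. */
	if (!atomic_load_explicit(&b->nr_full, memory_order_relaxed))
		return NULL;

	pthread_mutex_lock(&b->lock);
	if (b->nr_full)			/* re-check under the lock */
		sheaf = b->full[--b->nr_full];
	pthread_mutex_unlock(&b->lock);

	return sheaf;
}

int main(void)
{
	struct barn_model b = { .lock = PTHREAD_MUTEX_INITIALIZER };

	assert(get_full(&b) == NULL);	/* empty: lock never taken */
	b.full[0] = &b;
	atomic_store_explicit(&b.nr_full, 1, memory_order_relaxed);
	assert(get_full(&b) == &b);
	return 0;
}
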
+
+static void barn_init(struct node_barn *barn)
+{
+ spin_lock_init(&barn->lock);
+ INIT_LIST_HEAD(&barn->sheaves_full);
+ INIT_LIST_HEAD(&barn->sheaves_empty);
+ barn->nr_full = 0;
+ barn->nr_empty = 0;
+}
+
+static void barn_shrink(struct kmem_cache *s, struct node_barn *barn)
+{
+ struct list_head empty_list;
+ struct list_head full_list;
+ struct slab_sheaf *sheaf, *sheaf2;
+ unsigned long flags;
+
+ INIT_LIST_HEAD(&empty_list);
+ INIT_LIST_HEAD(&full_list);
+
+ spin_lock_irqsave(&barn->lock, flags);
+
+ list_splice_init(&barn->sheaves_full, &full_list);
+ barn->nr_full = 0;
+ list_splice_init(&barn->sheaves_empty, &empty_list);
+ barn->nr_empty = 0;
+
+ spin_unlock_irqrestore(&barn->lock, flags);
+
+ list_for_each_entry_safe(sheaf, sheaf2, &full_list, barn_list) {
+ sheaf_flush_unused(s, sheaf);
+ free_empty_sheaf(s, sheaf);
+ }
+
+ list_for_each_entry_safe(sheaf, sheaf2, &empty_list, barn_list)
+ free_empty_sheaf(s, sheaf);
+}
+
/*
* Slab allocation and freeing
*/
static inline struct slab *alloc_slab_page(gfp_t flags, int node,
- struct kmem_cache_order_objects oo)
+ struct kmem_cache_order_objects oo,
+ bool allow_spin)
{
struct folio *folio;
struct slab *slab;
unsigned int order = oo_order(oo);
- if (node == NUMA_NO_NODE)
+ if (unlikely(!allow_spin))
+ folio = (struct folio *)alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */,
+ node, order);
+ else if (node == NUMA_NO_NODE)
folio = (struct folio *)alloc_frozen_pages(flags, order);
else
folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL);
@@ -2596,6 +3185,7 @@ static __always_inline void unaccount_slab(struct slab *slab, int order,
static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
{
+ bool allow_spin = gfpflags_allow_spinning(flags);
struct slab *slab;
struct kmem_cache_order_objects oo = s->oo;
gfp_t alloc_gfp;
@@ -2615,7 +3205,11 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM;
- slab = alloc_slab_page(alloc_gfp, node, oo);
+ /*
+ * __GFP_RECLAIM could be cleared on the first allocation attempt,
+ * so pass allow_spin flag directly.
+ */
+ slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin);
if (unlikely(!slab)) {
oo = s->min;
alloc_gfp = flags;
@@ -2623,7 +3217,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
* Allocation may have failed due to fragmentation.
* Try a lower order alloc if possible
*/
- slab = alloc_slab_page(alloc_gfp, node, oo);
+ slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin);
if (unlikely(!slab))
return NULL;
stat(s, ORDER_FALLBACK);
@@ -2716,23 +3310,19 @@ static void discard_slab(struct kmem_cache *s, struct slab *slab)
free_slab(s, slab);
}
-/*
- * SLUB reuses PG_workingset bit to keep track of whether it's on
- * the per-node partial list.
- */
static inline bool slab_test_node_partial(const struct slab *slab)
{
- return folio_test_workingset(slab_folio(slab));
+ return test_bit(SL_partial, &slab->flags.f);
}
static inline void slab_set_node_partial(struct slab *slab)
{
- set_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
+ set_bit(SL_partial, &slab->flags.f);
}
static inline void slab_clear_node_partial(struct slab *slab)
{
- clear_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
+ clear_bit(SL_partial, &slab->flags.f);
}
/*
@@ -2778,13 +3368,21 @@ static void *alloc_single_from_partial(struct kmem_cache *s,
lockdep_assert_held(&n->list_lock);
+#ifdef CONFIG_SLUB_DEBUG
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (!validate_slab_ptr(slab)) {
+ slab_err(s, slab, "Not a valid slab page");
+ return NULL;
+ }
+ }
+#endif
+
object = slab->freelist;
slab->freelist = get_freepointer(s, object);
slab->inuse++;
if (!alloc_debug_processing(s, slab, object, orig_size)) {
- if (folio_test_slab(slab_folio(slab)))
- remove_partial(n, slab);
+ remove_partial(n, slab);
return NULL;
}
@@ -2796,33 +3394,47 @@ static void *alloc_single_from_partial(struct kmem_cache *s,
return object;
}
+static void defer_deactivate_slab(struct slab *slab, void *flush_freelist);
+
/*
* Called only for kmem_cache_debug() caches to allocate from a freshly
* allocated slab. Allocate a single object instead of whole freelist
* and put the slab to the partial (or full) list.
*/
-static void *alloc_single_from_new_slab(struct kmem_cache *s,
- struct slab *slab, int orig_size)
+static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab,
+ int orig_size, gfp_t gfpflags)
{
+ bool allow_spin = gfpflags_allow_spinning(gfpflags);
int nid = slab_nid(slab);
struct kmem_cache_node *n = get_node(s, nid);
unsigned long flags;
void *object;
+ if (!allow_spin && !spin_trylock_irqsave(&n->list_lock, flags)) {
+ /* Unlucky, discard newly allocated slab */
+ slab->frozen = 1;
+ defer_deactivate_slab(slab, NULL);
+ return NULL;
+ }
object = slab->freelist;
slab->freelist = get_freepointer(s, object);
slab->inuse = 1;
- if (!alloc_debug_processing(s, slab, object, orig_size))
+ if (!alloc_debug_processing(s, slab, object, orig_size)) {
/*
* It's not really expected that this would fail on a
* freshly allocated slab, but a concurrent memory
* corruption in theory could cause that.
+ * Leak memory of allocated slab.
*/
+ if (!allow_spin)
+ spin_unlock_irqrestore(&n->list_lock, flags);
return NULL;
+ }
- spin_lock_irqsave(&n->list_lock, flags);
+ if (allow_spin)
+ spin_lock_irqsave(&n->list_lock, flags);
if (slab->inuse == slab->objects)
add_full(s, n, slab);
@@ -2863,7 +3475,10 @@ static struct slab *get_partial_node(struct kmem_cache *s,
if (!n || !n->nr_partial)
return NULL;
- spin_lock_irqsave(&n->list_lock, flags);
+ if (gfpflags_allow_spinning(pc->flags))
+ spin_lock_irqsave(&n->list_lock, flags);
+ else if (!spin_trylock_irqsave(&n->list_lock, flags))
+ return NULL;
list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
if (!pfmemalloc_match(slab, pc->flags))
continue;
@@ -3031,30 +3646,46 @@ static inline void note_cmpxchg_failure(const char *n,
pr_info("%s %s: cmpxchg redo ", n, s->name);
-#ifdef CONFIG_PREEMPTION
- if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
+ if (IS_ENABLED(CONFIG_PREEMPTION) &&
+ tid_to_cpu(tid) != tid_to_cpu(actual_tid)) {
pr_warn("due to cpu change %d -> %d\n",
tid_to_cpu(tid), tid_to_cpu(actual_tid));
- else
-#endif
- if (tid_to_event(tid) != tid_to_event(actual_tid))
+ } else if (tid_to_event(tid) != tid_to_event(actual_tid)) {
pr_warn("due to cpu running other code. Event %ld->%ld\n",
tid_to_event(tid), tid_to_event(actual_tid));
- else
+ } else {
pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",
actual_tid, tid, next_tid(tid));
+ }
#endif
stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
}
static void init_kmem_cache_cpus(struct kmem_cache *s)
{
+#ifdef CONFIG_PREEMPT_RT
+ /*
+ * Register lockdep key for non-boot kmem caches to avoid
+	 * WARN_ON_ONCE(static_obj(key)) in lockdep_register_key()
+ */
+ bool finegrain_lockdep = !init_section_contains(s, 1);
+#else
+ /*
+ * Don't bother with different lockdep classes for each
+ * kmem_cache, since we only use local_trylock_irqsave().
+ */
+ bool finegrain_lockdep = false;
+#endif
int cpu;
struct kmem_cache_cpu *c;
+ if (finegrain_lockdep)
+ lockdep_register_key(&s->lock_key);
for_each_possible_cpu(cpu) {
c = per_cpu_ptr(s->cpu_slab, cpu);
- local_lock_init(&c->lock);
+ local_trylock_init(&c->lock);
+ if (finegrain_lockdep)
+ lockdep_set_class(&c->lock, &s->lock_key);
c->tid = init_tid(cpu);
}
}
@@ -3145,6 +3776,47 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
}
}
+/*
+ * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock
+ * can be acquired without a deadlock before invoking the function.
+ *
+ * Without LOCKDEP we trust the code to be correct. kmalloc_nolock() is
+ * using local_lock_is_locked() properly before calling local_lock_cpu_slab(),
+ * and kmalloc() is not used in an unsupported context.
+ *
+ * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave().
+ * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but
+ * lockdep_assert() will catch a bug in case:
+ * #1
+ * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock()
+ * or
+ * #2
+ * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock()
+ *
+ * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt
+ * disabled context. The lock will always be acquired and, if needed, it
+ * will block and sleep until the lock is available.
+ * #1 is possible in !PREEMPT_RT only.
+ * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock:
+ * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) ->
+ * tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B)
+ *
+ * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B
+ */
+#if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP)
+#define local_lock_cpu_slab(s, flags) \
+ local_lock_irqsave(&(s)->cpu_slab->lock, flags)
+#else
+#define local_lock_cpu_slab(s, flags) \
+ do { \
+ bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \
+ lockdep_assert(__l); \
+ } while (0)
+#endif
+
+#define local_unlock_cpu_slab(s, flags) \
+ local_unlock_irqrestore(&(s)->cpu_slab->lock, flags)
+
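
The LOCKDEP variant of local_lock_cpu_slab() above encodes the rule that the caller must already have verified the lock cannot be held on this CPU, so the trylock is expected to always succeed and the assert merely documents that invariant. A minimal userspace model of the same pattern, using pthreads instead of the kernel's local_lock API (illustrative sketch only, not kernel code):

#include <assert.h>
#include <pthread.h>

static pthread_mutex_t cpu_slab_lock = PTHREAD_MUTEX_INITIALIZER;

static void lock_cpu_slab_checked(void)
{
#ifdef DEBUG_LOCKING
	/* debug build: prove the "never contended here" invariant */
	int busy = pthread_mutex_trylock(&cpu_slab_lock);

	assert(!busy);
	(void)busy;
#else
	/* production build: plain lock, no extra checking */
	pthread_mutex_lock(&cpu_slab_lock);
#endif
}

static void unlock_cpu_slab(void)
{
	pthread_mutex_unlock(&cpu_slab_lock);
}
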
#ifdef CONFIG_SLUB_CPU_PARTIAL
static void __put_partials(struct kmem_cache *s, struct slab *partial_slab)
{
@@ -3229,7 +3901,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain)
unsigned long flags;
int slabs = 0;
- local_lock_irqsave(&s->cpu_slab->lock, flags);
+ local_lock_cpu_slab(s, flags);
oldslab = this_cpu_read(s->cpu_slab->partial);
@@ -3254,7 +3926,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain)
this_cpu_write(s->cpu_slab->partial, slab);
- local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ local_unlock_cpu_slab(s, flags);
if (slab_to_put) {
__put_partials(s, slab_to_put);
@@ -3311,11 +3983,40 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
put_partials_cpu(s, c);
}
-struct slub_flush_work {
- struct work_struct work;
- struct kmem_cache *s;
- bool skip;
-};
+static inline void flush_this_cpu_slab(struct kmem_cache *s)
+{
+ struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
+
+ if (c->slab)
+ flush_slab(s, c);
+
+ put_partials(s);
+}
+
+static bool has_cpu_slab(int cpu, struct kmem_cache *s)
+{
+ struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
+
+ return c->slab || slub_percpu_partial(c);
+}
+
+#else /* CONFIG_SLUB_TINY */
+static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { }
+static inline bool has_cpu_slab(int cpu, struct kmem_cache *s) { return false; }
+static inline void flush_this_cpu_slab(struct kmem_cache *s) { }
+#endif /* CONFIG_SLUB_TINY */
+
+static bool has_pcs_used(int cpu, struct kmem_cache *s)
+{
+ struct slub_percpu_sheaves *pcs;
+
+ if (!s->cpu_sheaves)
+ return false;
+
+ pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
+
+ return (pcs->spare || pcs->rcu_free || pcs->main->size);
+}
/*
* Flush cpu slab.
@@ -3325,30 +4026,18 @@ struct slub_flush_work {
static void flush_cpu_slab(struct work_struct *w)
{
struct kmem_cache *s;
- struct kmem_cache_cpu *c;
struct slub_flush_work *sfw;
sfw = container_of(w, struct slub_flush_work, work);
s = sfw->s;
- c = this_cpu_ptr(s->cpu_slab);
- if (c->slab)
- flush_slab(s, c);
+ if (s->cpu_sheaves)
+ pcs_flush_all(s);
- put_partials(s);
+ flush_this_cpu_slab(s);
}
-static bool has_cpu_slab(int cpu, struct kmem_cache *s)
-{
- struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
-
- return c->slab || slub_percpu_partial(c);
-}
-
-static DEFINE_MUTEX(flush_lock);
-static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
-
static void flush_all_cpus_locked(struct kmem_cache *s)
{
struct slub_flush_work *sfw;
@@ -3359,7 +4048,7 @@ static void flush_all_cpus_locked(struct kmem_cache *s)
for_each_online_cpu(cpu) {
sfw = &per_cpu(slub_flush, cpu);
- if (!has_cpu_slab(cpu, s)) {
+ if (!has_cpu_slab(cpu, s) && !has_pcs_used(cpu, s)) {
sfw->skip = true;
continue;
}
@@ -3386,6 +4075,74 @@ static void flush_all(struct kmem_cache *s)
cpus_read_unlock();
}
+static void flush_rcu_sheaf(struct work_struct *w)
+{
+ struct slub_percpu_sheaves *pcs;
+ struct slab_sheaf *rcu_free;
+ struct slub_flush_work *sfw;
+ struct kmem_cache *s;
+
+ sfw = container_of(w, struct slub_flush_work, work);
+ s = sfw->s;
+
+ local_lock(&s->cpu_sheaves->lock);
+ pcs = this_cpu_ptr(s->cpu_sheaves);
+
+ rcu_free = pcs->rcu_free;
+ pcs->rcu_free = NULL;
+
+ local_unlock(&s->cpu_sheaves->lock);
+
+ if (rcu_free)
+ call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
+}
+
+
+/* needed for kvfree_rcu_barrier() */
+void flush_all_rcu_sheaves(void)
+{
+ struct slub_flush_work *sfw;
+ struct kmem_cache *s;
+ unsigned int cpu;
+
+ cpus_read_lock();
+ mutex_lock(&slab_mutex);
+
+ list_for_each_entry(s, &slab_caches, list) {
+ if (!s->cpu_sheaves)
+ continue;
+
+ mutex_lock(&flush_lock);
+
+ for_each_online_cpu(cpu) {
+ sfw = &per_cpu(slub_flush, cpu);
+
+ /*
+			 * We don't check whether an rcu_free sheaf exists - a racing
+			 * __kfree_rcu_sheaf() might have just removed it. By executing
+			 * flush_rcu_sheaf() on the cpu we make sure any such
+			 * __kfree_rcu_sheaf() has finished its call_rcu().
+ */
+
+ INIT_WORK(&sfw->work, flush_rcu_sheaf);
+ sfw->s = s;
+ queue_work_on(cpu, flushwq, &sfw->work);
+ }
+
+ for_each_online_cpu(cpu) {
+ sfw = &per_cpu(slub_flush, cpu);
+ flush_work(&sfw->work);
+ }
+
+ mutex_unlock(&flush_lock);
+ }
+
+ mutex_unlock(&slab_mutex);
+ cpus_read_unlock();
+
+ rcu_barrier();
+}
+
/*
 * Use the cpu notifier to ensure that the cpu slabs are flushed when
* necessary.
@@ -3395,19 +4152,15 @@ static int slub_cpu_dead(unsigned int cpu)
struct kmem_cache *s;
mutex_lock(&slab_mutex);
- list_for_each_entry(s, &slab_caches, list)
+ list_for_each_entry(s, &slab_caches, list) {
__flush_cpu_slab(s, cpu);
+ if (s->cpu_sheaves)
+ __pcs_flush_all_cpu(s, cpu);
+ }
mutex_unlock(&slab_mutex);
return 0;
}
-#else /* CONFIG_SLUB_TINY */
-static inline void flush_all_cpus_locked(struct kmem_cache *s) { }
-static inline void flush_all(struct kmem_cache *s) { }
-static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { }
-static inline int slub_cpu_dead(unsigned int cpu) { return 0; }
-#endif /* CONFIG_SLUB_TINY */
-
/*
* Check if the objects in a per cpu structure fit numa
* locality expectations.
@@ -3688,6 +4441,7 @@ static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab)
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
{
+ bool allow_spin = gfpflags_allow_spinning(gfpflags);
void *freelist;
struct slab *slab;
unsigned long flags;
@@ -3713,9 +4467,21 @@ reread_slab:
if (unlikely(!node_match(slab, node))) {
/*
* same as above but node_match() being false already
- * implies node != NUMA_NO_NODE
+ * implies node != NUMA_NO_NODE.
+ *
+ * We don't strictly honor pfmemalloc and NUMA preferences
+ * when !allow_spin because:
+ *
+ * 1. Most kmalloc() users allocate objects on the local node,
+ * so kmalloc_nolock() tries not to interfere with them by
+ * deactivating the cpu slab.
+ *
+ * 2. Deactivating due to NUMA or pfmemalloc mismatch may cause
+ * unnecessary slab allocations even when n->partial list
+ * is not empty.
*/
- if (!node_isset(node, slab_nodes)) {
+ if (!node_isset(node, slab_nodes) ||
+ !allow_spin) {
node = NUMA_NO_NODE;
} else {
stat(s, ALLOC_NODE_MISMATCH);
@@ -3728,13 +4494,14 @@ reread_slab:
* PFMEMALLOC but right now, we are losing the pfmemalloc
* information when the page leaves the per-cpu allocator
*/
- if (unlikely(!pfmemalloc_match(slab, gfpflags)))
+ if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin))
goto deactivate_slab;
/* must check again c->slab in case we got preempted and it changed */
- local_lock_irqsave(&s->cpu_slab->lock, flags);
+ local_lock_cpu_slab(s, flags);
+
if (unlikely(slab != c->slab)) {
- local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ local_unlock_cpu_slab(s, flags);
goto reread_slab;
}
freelist = c->freelist;
@@ -3746,7 +4513,7 @@ reread_slab:
if (!freelist) {
c->slab = NULL;
c->tid = next_tid(c->tid);
- local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ local_unlock_cpu_slab(s, flags);
stat(s, DEACTIVATE_BYPASS);
goto new_slab;
}
@@ -3765,34 +4532,34 @@ load_freelist:
VM_BUG_ON(!c->slab->frozen);
c->freelist = get_freepointer(s, freelist);
c->tid = next_tid(c->tid);
- local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ local_unlock_cpu_slab(s, flags);
return freelist;
deactivate_slab:
- local_lock_irqsave(&s->cpu_slab->lock, flags);
+ local_lock_cpu_slab(s, flags);
if (slab != c->slab) {
- local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ local_unlock_cpu_slab(s, flags);
goto reread_slab;
}
freelist = c->freelist;
c->slab = NULL;
c->freelist = NULL;
c->tid = next_tid(c->tid);
- local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ local_unlock_cpu_slab(s, flags);
deactivate_slab(s, slab, freelist);
new_slab:
#ifdef CONFIG_SLUB_CPU_PARTIAL
while (slub_percpu_partial(c)) {
- local_lock_irqsave(&s->cpu_slab->lock, flags);
+ local_lock_cpu_slab(s, flags);
if (unlikely(c->slab)) {
- local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ local_unlock_cpu_slab(s, flags);
goto reread_slab;
}
if (unlikely(!slub_percpu_partial(c))) {
- local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ local_unlock_cpu_slab(s, flags);
/* we were preempted and partial list got empty */
goto new_objects;
}
@@ -3801,7 +4568,8 @@ new_slab:
slub_set_percpu_partial(c, slab);
if (likely(node_match(slab, node) &&
- pfmemalloc_match(slab, gfpflags))) {
+ pfmemalloc_match(slab, gfpflags)) ||
+ !allow_spin) {
c->slab = slab;
freelist = get_freelist(s, slab);
VM_BUG_ON(!freelist);
@@ -3809,7 +4577,7 @@ new_slab:
goto load_freelist;
}
- local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ local_unlock_cpu_slab(s, flags);
slab->next = NULL;
__put_partials(s, slab);
@@ -3831,8 +4599,13 @@ new_objects:
* allocating new page from other nodes
*/
if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
- && try_thisnode))
- pc.flags = GFP_NOWAIT | __GFP_THISNODE;
+ && try_thisnode)) {
+ if (unlikely(!allow_spin))
+ /* Do not upgrade gfp to NOWAIT from more restrictive mode */
+ pc.flags = gfpflags | __GFP_THISNODE;
+ else
+ pc.flags = GFP_NOWAIT | __GFP_THISNODE;
+ }
pc.orig_size = orig_size;
slab = get_partial(s, node, &pc);
@@ -3843,9 +4616,14 @@ new_objects:
* For debug caches here we had to go through
* alloc_single_from_partial() so just store the
* tracking info and return the object.
+ *
+ * Due to disabled preemption we need to disallow
+ * blocking. The flags are further adjusted by
+ * gfp_nested_mask() in stack_depot itself.
*/
if (s->flags & SLAB_STORE_USER)
- set_track(s, freelist, TRACK_ALLOC, addr);
+ set_track(s, freelist, TRACK_ALLOC, addr,
+ gfpflags & ~(__GFP_DIRECT_RECLAIM));
return freelist;
}
@@ -3871,13 +4649,14 @@ new_objects:
stat(s, ALLOC_SLAB);
if (kmem_cache_debug(s)) {
- freelist = alloc_single_from_new_slab(s, slab, orig_size);
+ freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
if (unlikely(!freelist))
goto new_objects;
if (s->flags & SLAB_STORE_USER)
- set_track(s, freelist, TRACK_ALLOC, addr);
+ set_track(s, freelist, TRACK_ALLOC, addr,
+ gfpflags & ~(__GFP_DIRECT_RECLAIM));
return freelist;
}
@@ -3893,7 +4672,7 @@ new_objects:
inc_slabs_node(s, slab_nid(slab), slab->objects);
- if (unlikely(!pfmemalloc_match(slab, gfpflags))) {
+ if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) {
/*
* For !pfmemalloc_match() case we don't load freelist so that
* we don't make further mismatched allocations easier.
@@ -3904,7 +4683,7 @@ new_objects:
retry_load_slab:
- local_lock_irqsave(&s->cpu_slab->lock, flags);
+ local_lock_cpu_slab(s, flags);
if (unlikely(c->slab)) {
void *flush_freelist = c->freelist;
struct slab *flush_slab = c->slab;
@@ -3913,9 +4692,14 @@ retry_load_slab:
c->freelist = NULL;
c->tid = next_tid(c->tid);
- local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+ local_unlock_cpu_slab(s, flags);
- deactivate_slab(s, flush_slab, flush_freelist);
+ if (unlikely(!allow_spin)) {
+ /* Reentrant slub cannot take locks, defer */
+ defer_deactivate_slab(flush_slab, flush_freelist);
+ } else {
+ deactivate_slab(s, flush_slab, flush_freelist);
+ }
stat(s, CPUSLAB_FLUSH);
@@ -3925,6 +4709,19 @@ retry_load_slab:
goto load_freelist;
}
+/*
+ * We disallow kprobes in ___slab_alloc() to prevent reentrance
+ *
+ * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of
+ * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf ->
+ * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast()
+ * manipulating c->freelist without lock.
+ *
+ * This does not prevent kprobe in functions called from ___slab_alloc() such as
+ * local_lock_irqsave() itself, and that is fine, we only need to protect the
+ * c->freelist manipulation in ___slab_alloc() itself.
+ */
+NOKPROBE_SYMBOL(___slab_alloc);
/*
* A wrapper for ___slab_alloc() for contexts where preemption is not yet
@@ -3944,8 +4741,19 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
*/
c = slub_get_cpu_ptr(s->cpu_slab);
#endif
-
+ if (unlikely(!gfpflags_allow_spinning(gfpflags))) {
+ if (local_lock_is_locked(&s->cpu_slab->lock)) {
+ /*
+ * EBUSY is an internal signal to kmalloc_nolock() to
+ * retry a different bucket. It's not propagated
+ * to the caller.
+ */
+ p = ERR_PTR(-EBUSY);
+ goto out;
+ }
+ }
p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
+out:
#ifdef CONFIG_PREEMPT_COUNT
slub_put_cpu_ptr(s->cpu_slab);
#endif
@@ -4069,7 +4877,7 @@ static void *__slab_alloc_node(struct kmem_cache *s,
return NULL;
}
- object = alloc_single_from_new_slab(s, slab, orig_size);
+ object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
return object;
}
@@ -4148,8 +4956,9 @@ bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
if (p[i] && init && (!kasan_init ||
!kasan_has_integrated_init()))
memset(p[i], 0, zero_size);
- kmemleak_alloc_recursive(p[i], s->object_size, 1,
- s->flags, init_flags);
+ if (gfpflags_allow_spinning(flags))
+ kmemleak_alloc_recursive(p[i], s->object_size, 1,
+ s->flags, init_flags);
kmsan_slab_alloc(s, p[i], init_flags);
alloc_tagging_slab_alloc_hook(s, p[i], flags);
}
@@ -4158,6 +4967,262 @@ bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
}
/*
+ * Replace the empty main sheaf with a (at least partially) full sheaf.
+ *
+ * Must be called with the cpu_sheaves local lock locked. If successful, returns
+ * the pcs pointer and the local lock locked (possibly on a different cpu than
+ * initially called). If not successful, returns NULL and the local lock
+ * unlocked.
+ */
+static struct slub_percpu_sheaves *
+__pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, gfp_t gfp)
+{
+ struct slab_sheaf *empty = NULL;
+ struct slab_sheaf *full;
+ struct node_barn *barn;
+ bool can_alloc;
+
+ lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
+
+ if (pcs->spare && pcs->spare->size > 0) {
+ swap(pcs->main, pcs->spare);
+ return pcs;
+ }
+
+ barn = get_barn(s);
+ if (!barn) {
+ local_unlock(&s->cpu_sheaves->lock);
+ return NULL;
+ }
+
+ full = barn_replace_empty_sheaf(barn, pcs->main);
+
+ if (full) {
+ stat(s, BARN_GET);
+ pcs->main = full;
+ return pcs;
+ }
+
+ stat(s, BARN_GET_FAIL);
+
+ can_alloc = gfpflags_allow_blocking(gfp);
+
+ if (can_alloc) {
+ if (pcs->spare) {
+ empty = pcs->spare;
+ pcs->spare = NULL;
+ } else {
+ empty = barn_get_empty_sheaf(barn);
+ }
+ }
+
+ local_unlock(&s->cpu_sheaves->lock);
+
+ if (!can_alloc)
+ return NULL;
+
+ if (empty) {
+ if (!refill_sheaf(s, empty, gfp)) {
+ full = empty;
+ } else {
+ /*
+ * we must be very low on memory so don't bother
+ * with the barn
+ */
+ free_empty_sheaf(s, empty);
+ }
+ } else {
+ full = alloc_full_sheaf(s, gfp);
+ }
+
+ if (!full)
+ return NULL;
+
+ /*
+	 * We can reach here only when gfpflags_allow_blocking() is true,
+	 * so we cannot be in irq context.
+ */
+ local_lock(&s->cpu_sheaves->lock);
+ pcs = this_cpu_ptr(s->cpu_sheaves);
+
+ /*
+	 * If we are returning an empty sheaf, we either got it from the
+	 * barn or had to allocate one. If we are returning a full
+	 * sheaf, it's due to racing or being migrated to a different
+	 * cpu. Breaching the barn's sheaf limits should thus be rare
+	 * enough, so just ignore them to simplify the recovery.
+ */
+
+ if (pcs->main->size == 0) {
+ barn_put_empty_sheaf(barn, pcs->main);
+ pcs->main = full;
+ return pcs;
+ }
+
+ if (!pcs->spare) {
+ pcs->spare = full;
+ return pcs;
+ }
+
+ if (pcs->spare->size == 0) {
+ barn_put_empty_sheaf(barn, pcs->spare);
+ pcs->spare = full;
+ return pcs;
+ }
+
+ barn_put_full_sheaf(barn, full);
+ stat(s, BARN_PUT);
+
+ return pcs;
+}
+
+static __fastpath_inline
+void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node)
+{
+ struct slub_percpu_sheaves *pcs;
+ bool node_requested;
+ void *object;
+
+#ifdef CONFIG_NUMA
+ if (static_branch_unlikely(&strict_numa) &&
+ node == NUMA_NO_NODE) {
+
+ struct mempolicy *mpol = current->mempolicy;
+
+ if (mpol) {
+ /*
+ * Special BIND rule support. If the local node
+ * is in permitted set then do not redirect
+ * to a particular node.
+ * Otherwise we apply the memory policy to get
+ * the node we need to allocate on.
+ */
+ if (mpol->mode != MPOL_BIND ||
+ !node_isset(numa_mem_id(), mpol->nodes))
+
+ node = mempolicy_slab_node();
+ }
+ }
+#endif
+
+ node_requested = IS_ENABLED(CONFIG_NUMA) && node != NUMA_NO_NODE;
+
+ /*
+ * We assume the percpu sheaves contain only local objects although it's
+ * not completely guaranteed, so we verify later.
+ */
+ if (unlikely(node_requested && node != numa_mem_id()))
+ return NULL;
+
+ if (!local_trylock(&s->cpu_sheaves->lock))
+ return NULL;
+
+ pcs = this_cpu_ptr(s->cpu_sheaves);
+
+ if (unlikely(pcs->main->size == 0)) {
+ pcs = __pcs_replace_empty_main(s, pcs, gfp);
+ if (unlikely(!pcs))
+ return NULL;
+ }
+
+ object = pcs->main->objects[pcs->main->size - 1];
+
+ if (unlikely(node_requested)) {
+ /*
+ * Verify that the object was from the node we want. This could
+ * be false because of cpu migration during an unlocked part of
+ * the current allocation or previous freeing process.
+ */
+ if (folio_nid(virt_to_folio(object)) != node) {
+ local_unlock(&s->cpu_sheaves->lock);
+ return NULL;
+ }
+ }
+
+ pcs->main->size--;
+
+ local_unlock(&s->cpu_sheaves->lock);
+
+ stat(s, ALLOC_PCS);
+
+ return object;
+}
+
+static __fastpath_inline
+unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+ struct slub_percpu_sheaves *pcs;
+ struct slab_sheaf *main;
+ unsigned int allocated = 0;
+ unsigned int batch;
+
+next_batch:
+ if (!local_trylock(&s->cpu_sheaves->lock))
+ return allocated;
+
+ pcs = this_cpu_ptr(s->cpu_sheaves);
+
+ if (unlikely(pcs->main->size == 0)) {
+
+ struct slab_sheaf *full;
+ struct node_barn *barn;
+
+ if (pcs->spare && pcs->spare->size > 0) {
+ swap(pcs->main, pcs->spare);
+ goto do_alloc;
+ }
+
+ barn = get_barn(s);
+ if (!barn) {
+ local_unlock(&s->cpu_sheaves->lock);
+ return allocated;
+ }
+
+ full = barn_replace_empty_sheaf(barn, pcs->main);
+
+ if (full) {
+ stat(s, BARN_GET);
+ pcs->main = full;
+ goto do_alloc;
+ }
+
+ stat(s, BARN_GET_FAIL);
+
+ local_unlock(&s->cpu_sheaves->lock);
+
+ /*
+ * Once full sheaves in barn are depleted, let the bulk
+ * allocation continue from slab pages, otherwise we would just
+ * be copying arrays of pointers twice.
+ */
+ return allocated;
+ }
+
+do_alloc:
+
+ main = pcs->main;
+ batch = min(size, main->size);
+
+ main->size -= batch;
+ memcpy(p, main->objects + main->size, batch * sizeof(void *));
+
+ local_unlock(&s->cpu_sheaves->lock);
+
+ stat_add(s, ALLOC_PCS, batch);
+
+ allocated += batch;
+
+ if (batch < size) {
+ p += batch;
+ size -= batch;
+ goto next_batch;
+ }
+
+ return allocated;
+}
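
Both fast paths above treat the main sheaf as a plain LIFO array of object pointers: a single allocation pops the last slot, while the bulk path moves a whole batch from the array tail with one memcpy. The following standalone sketch models just that array handling (illustrative only; the hypothetical struct omits locking, the spare sheaf and the barn fallback):

#include <stddef.h>
#include <string.h>

struct sheaf_model {
	unsigned int size;	/* number of objects currently stored */
	void *objects[32];	/* fixed capacity, just for the sketch */
};

/* Pop one object, as alloc_from_pcs() does from pcs->main. */
static void *sheaf_pop(struct sheaf_model *sheaf)
{
	if (sheaf->size == 0)
		return NULL;
	return sheaf->objects[--sheaf->size];
}

/* Pop up to @count objects in one batch, as alloc_from_pcs_bulk() does. */
static unsigned int sheaf_pop_bulk(struct sheaf_model *sheaf, size_t count,
				   void **p)
{
	unsigned int batch = count < sheaf->size ? count : sheaf->size;

	sheaf->size -= batch;
	memcpy(p, sheaf->objects + sheaf->size, batch * sizeof(void *));
	return batch;
}
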
+
+
+/*
* Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
* have the fastpath folded into their functions. So no function call
* overhead for requests that can be satisfied on the fastpath.
@@ -4181,7 +5246,11 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list
if (unlikely(object))
goto out;
- object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
+ if (s->cpu_sheaves)
+ object = alloc_from_pcs(s, gfpflags, node);
+
+ if (!object)
+ object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
maybe_wipe_obj_freeptr(s, object);
init = slab_want_init_on_alloc(gfpflags, s);
@@ -4254,6 +5323,232 @@ void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int nod
EXPORT_SYMBOL(kmem_cache_alloc_node_noprof);
/*
+ * Returns a sheaf that has at least the requested number of objects.
+ * When prefilling is needed, it is done with the given gfp flags.
+ *
+ * Returns NULL if the sheaf allocation or the prefilling failed.
+ */
+struct slab_sheaf *
+kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size)
+{
+ struct slub_percpu_sheaves *pcs;
+ struct slab_sheaf *sheaf = NULL;
+ struct node_barn *barn;
+
+ if (unlikely(size > s->sheaf_capacity)) {
+
+ /*
+ * slab_debug disables cpu sheaves intentionally so all
+ * prefilled sheaves become "oversize" and we give up on
+ * performance for the debugging. Same with SLUB_TINY.
+ * Creating a cache without sheaves and then requesting a
+ * prefilled sheaf is however not expected, so warn.
+ */
+ WARN_ON_ONCE(s->sheaf_capacity == 0 &&
+ !IS_ENABLED(CONFIG_SLUB_TINY) &&
+ !(s->flags & SLAB_DEBUG_FLAGS));
+
+ sheaf = kzalloc(struct_size(sheaf, objects, size), gfp);
+ if (!sheaf)
+ return NULL;
+
+ stat(s, SHEAF_PREFILL_OVERSIZE);
+ sheaf->cache = s;
+ sheaf->capacity = size;
+
+ if (!__kmem_cache_alloc_bulk(s, gfp, size,
+ &sheaf->objects[0])) {
+ kfree(sheaf);
+ return NULL;
+ }
+
+ sheaf->size = size;
+
+ return sheaf;
+ }
+
+ local_lock(&s->cpu_sheaves->lock);
+ pcs = this_cpu_ptr(s->cpu_sheaves);
+
+ if (pcs->spare) {
+ sheaf = pcs->spare;
+ pcs->spare = NULL;
+ stat(s, SHEAF_PREFILL_FAST);
+ } else {
+ barn = get_barn(s);
+
+ stat(s, SHEAF_PREFILL_SLOW);
+ if (barn)
+ sheaf = barn_get_full_or_empty_sheaf(barn);
+ if (sheaf && sheaf->size)
+ stat(s, BARN_GET);
+ else
+ stat(s, BARN_GET_FAIL);
+ }
+
+ local_unlock(&s->cpu_sheaves->lock);
+
+
+ if (!sheaf)
+ sheaf = alloc_empty_sheaf(s, gfp);
+
+ if (sheaf && sheaf->size < size) {
+ if (refill_sheaf(s, sheaf, gfp)) {
+ sheaf_flush_unused(s, sheaf);
+ free_empty_sheaf(s, sheaf);
+ sheaf = NULL;
+ }
+ }
+
+ if (sheaf)
+ sheaf->capacity = s->sheaf_capacity;
+
+ return sheaf;
+}
+
+/*
+ * Use this to return a sheaf obtained by kmem_cache_prefill_sheaf()
+ *
+ * If the sheaf cannot simply become the percpu spare sheaf, but there's space
+ * for a full sheaf in the barn, we try to refill the sheaf back to the cache's
+ * sheaf_capacity to avoid handling partially full sheaves.
+ *
+ * If the refill fails because gfp is e.g. GFP_NOWAIT, or the barn is full, the
+ * sheaf is instead flushed and freed.
+ */
+void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp,
+ struct slab_sheaf *sheaf)
+{
+ struct slub_percpu_sheaves *pcs;
+ struct node_barn *barn;
+
+ if (unlikely(sheaf->capacity != s->sheaf_capacity)) {
+ sheaf_flush_unused(s, sheaf);
+ kfree(sheaf);
+ return;
+ }
+
+ local_lock(&s->cpu_sheaves->lock);
+ pcs = this_cpu_ptr(s->cpu_sheaves);
+ barn = get_barn(s);
+
+ if (!pcs->spare) {
+ pcs->spare = sheaf;
+ sheaf = NULL;
+ stat(s, SHEAF_RETURN_FAST);
+ }
+
+ local_unlock(&s->cpu_sheaves->lock);
+
+ if (!sheaf)
+ return;
+
+ stat(s, SHEAF_RETURN_SLOW);
+
+ /*
+ * If the barn has too many full sheaves or we fail to refill the sheaf,
+ * simply flush and free it.
+ */
+ if (!barn || data_race(barn->nr_full) >= MAX_FULL_SHEAVES ||
+ refill_sheaf(s, sheaf, gfp)) {
+ sheaf_flush_unused(s, sheaf);
+ free_empty_sheaf(s, sheaf);
+ return;
+ }
+
+ barn_put_full_sheaf(barn, sheaf);
+ stat(s, BARN_PUT);
+}
+
+/*
+ * Refill a sheaf previously obtained by kmem_cache_prefill_sheaf() to at
+ * least the given size.
+ *
+ * The sheaf might be replaced by a new one when more than s->sheaf_capacity
+ * objects are requested and such a replacement is necessary. If the refill
+ * fails (returning -ENOMEM), the existing sheaf is left intact.
+ *
+ * In practice we always refill to full sheaf's capacity.
+ */
+int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp,
+ struct slab_sheaf **sheafp, unsigned int size)
+{
+ struct slab_sheaf *sheaf;
+
+ /*
+ * TODO: do we want to support *sheaf == NULL to be equivalent of
+ * kmem_cache_prefill_sheaf() ?
+ */
+ if (!sheafp || !(*sheafp))
+ return -EINVAL;
+
+ sheaf = *sheafp;
+ if (sheaf->size >= size)
+ return 0;
+
+ if (likely(sheaf->capacity >= size)) {
+ if (likely(sheaf->capacity == s->sheaf_capacity))
+ return refill_sheaf(s, sheaf, gfp);
+
+ if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size,
+ &sheaf->objects[sheaf->size])) {
+ return -ENOMEM;
+ }
+ sheaf->size = sheaf->capacity;
+
+ return 0;
+ }
+
+ /*
+ * We had a regular sized sheaf and need an oversize one, or we had an
+ * oversize one already but need a larger one now.
+ * This should be a very rare path so let's not complicate it.
+ */
+ sheaf = kmem_cache_prefill_sheaf(s, gfp, size);
+ if (!sheaf)
+ return -ENOMEM;
+
+ kmem_cache_return_sheaf(s, gfp, *sheafp);
+ *sheafp = sheaf;
+ return 0;
+}
+
+/*
+ * Allocate from a sheaf obtained by kmem_cache_prefill_sheaf()
+ *
+ * Guaranteed not to fail for as many allocations as the size the sheaf was
+ * prefilled with. Once the sheaf is emptied it fails - there is no fallback
+ * to the slab cache itself.
+ *
+ * The gfp parameter is meant only to specify __GFP_ZERO or __GFP_ACCOUNT;
+ * memcg charging is forced over the limit if necessary, to avoid failure.
+ */
+void *
+kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *s, gfp_t gfp,
+ struct slab_sheaf *sheaf)
+{
+ void *ret = NULL;
+ bool init;
+
+ if (sheaf->size == 0)
+ goto out;
+
+ ret = sheaf->objects[--sheaf->size];
+
+ init = slab_want_init_on_alloc(gfp, s);
+
+ /* add __GFP_NOFAIL to force successful memcg charging */
+ slab_post_alloc_hook(s, NULL, gfp | __GFP_NOFAIL, 1, &ret, init, s->object_size);
+out:
+ trace_kmem_cache_alloc(_RET_IP_, ret, s, gfp, NUMA_NO_NODE);
+
+ return ret;
+}
+
+unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf)
+{
+ return sheaf->size;
+}
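
The functions above form the prefilled-sheaf API: reserve a sheaf holding a guaranteed number of objects up front, allocate from it without any risk of failure, optionally top it up, and hand it back. A hedged usage sketch follows; the cache pointer and the count of 16 are hypothetical, and it calls the _noprof variant exactly as defined above:

static int use_prefilled_sheaf(struct kmem_cache *my_cache)
{
	struct slab_sheaf *sheaf;
	void *obj;
	int err;

	/* may sleep; reserves at least 16 objects for later allocation */
	sheaf = kmem_cache_prefill_sheaf(my_cache, GFP_KERNEL, 16);
	if (!sheaf)
		return -ENOMEM;

	/* cannot fail until the reserved objects are exhausted */
	obj = kmem_cache_alloc_from_sheaf_noprof(my_cache, __GFP_ZERO, sheaf);

	/* top the sheaf back up if more guaranteed allocations are needed */
	err = kmem_cache_refill_sheaf(my_cache, GFP_KERNEL, &sheaf, 16);

	/* return the sheaf; it may become the percpu spare or be flushed */
	kmem_cache_return_sheaf(my_cache, GFP_KERNEL, sheaf);

	kmem_cache_free(my_cache, obj);
	return err;
}
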
+/*
* To avoid unnecessary overhead, we pass through large allocation requests
* directly to the page allocator. We use __GFP_COMP, because we will need to
* know the allocation order to free the pages properly in kfree.
@@ -4268,7 +5563,12 @@ static void *___kmalloc_large_node(size_t size, gfp_t flags, int node)
flags = kmalloc_fix_flags(flags);
flags |= __GFP_COMP;
- folio = (struct folio *)alloc_pages_node_noprof(node, flags, order);
+
+ if (node == NUMA_NO_NODE)
+ folio = (struct folio *)alloc_frozen_pages_noprof(flags, order);
+ else
+ folio = (struct folio *)__alloc_frozen_pages_noprof(flags, order, node, NULL);
+
if (folio) {
ptr = folio_address(folio);
lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
@@ -4340,6 +5640,96 @@ void *__kmalloc_noprof(size_t size, gfp_t flags)
}
EXPORT_SYMBOL(__kmalloc_noprof);
+/**
+ * kmalloc_nolock - Allocate an object of given size from any context.
+ * @size: size to allocate
+ * @gfp_flags: GFP flags. Only __GFP_ACCOUNT, __GFP_ZERO, __GFP_NO_OBJ_EXT
+ * allowed.
+ * @node: node number of the target node.
+ *
+ * Return: pointer to the new object or NULL in case of error.
+ * NULL does not mean EBUSY or EAGAIN. It means ENOMEM.
+ * There is no reason to call it again and expect !NULL.
+ */
+void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
+{
+ gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags;
+ struct kmem_cache *s;
+ bool can_retry = true;
+ void *ret = ERR_PTR(-EBUSY);
+
+ VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO |
+ __GFP_NO_OBJ_EXT));
+
+ if (unlikely(!size))
+ return ZERO_SIZE_PTR;
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq()))
+ /* kmalloc_nolock() in PREEMPT_RT is not supported from irq */
+ return NULL;
+retry:
+ if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
+ return NULL;
+ s = kmalloc_slab(size, NULL, alloc_gfp, _RET_IP_);
+
+ if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s))
+ /*
+		 * kmalloc_nolock() is not supported on architectures that
+		 * don't implement cmpxchg16b. Debug caches are the exception:
+		 * they don't use the per-cpu slab or per-cpu partial slabs and
+		 * rely on kmem_cache_node->list_lock instead, so
+		 * kmalloc_nolock() can still attempt to allocate from them via
+		 * spin_trylock_irqsave(&n->list_lock, ...)
+ */
+ return NULL;
+
+ /*
+ * Do not call slab_alloc_node(), since trylock mode isn't
+ * compatible with slab_pre_alloc_hook/should_failslab and
+ * kfence_alloc. Hence call __slab_alloc_node() (at most twice)
+ * and slab_post_alloc_hook() directly.
+ *
+ * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair
+ * in irq saved region. It assumes that the same cpu will not
+ * __update_cpu_freelist_fast() into the same (freelist,tid) pair.
+ * Therefore use in_nmi() to check whether particular bucket is in
+ * irq protected section.
+ *
+ * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that
+ * this cpu was interrupted somewhere inside ___slab_alloc() after
+ * it did local_lock_irqsave(&s->cpu_slab->lock, flags).
+ * In this case fast path with __update_cpu_freelist_fast() is not safe.
+ */
+#ifndef CONFIG_SLUB_TINY
+ if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock))
+#endif
+ ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);
+
+ if (PTR_ERR(ret) == -EBUSY) {
+ if (can_retry) {
+ /* pick the next kmalloc bucket */
+ size = s->object_size + 1;
+ /*
+ * Another alternative is to
+ * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT;
+ * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT;
+ * to retry from bucket of the same size.
+ */
+ can_retry = false;
+ goto retry;
+ }
+ ret = NULL;
+ }
+
+ maybe_wipe_obj_freeptr(s, ret);
+ slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret,
+ slab_want_init_on_alloc(alloc_gfp, s), size);
+
+ ret = kasan_kmalloc(s, ret, size, alloc_gfp);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kmalloc_nolock_noprof);
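
A hedged usage sketch of the interface just defined: kmalloc_nolock() paired with kfree_nolock() (added further below in this patch) from a context where a regular kmalloc()/kfree() would be forbidden. The callback and its payload are hypothetical:

/* hypothetical callback running under a raw_spinlock_t or in NMI context */
static void record_event(u64 payload)
{
	u64 *rec;

	/* never sleeps and never spins on a held lock; NULL means ENOMEM */
	rec = kmalloc_nolock(sizeof(*rec), __GFP_ZERO, NUMA_NO_NODE);
	if (!rec)
		return;

	*rec = payload;

	/* ... hand the record off for later processing ... */

	/* only objects from kmalloc_nolock() may be freed this way */
	kfree_nolock(rec);
}
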
+
void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags,
int node, unsigned long caller)
{
@@ -4383,8 +5773,12 @@ static noinline void free_to_partial_list(
unsigned long flags;
depot_stack_handle_t handle = 0;
+ /*
+ * We cannot use GFP_NOWAIT as there are callsites where waking up
+ * kswapd could deadlock
+ */
if (s->flags & SLAB_STORE_USER)
- handle = set_track_prepare();
+ handle = set_track_prepare(__GFP_NOWARN);
spin_lock_irqsave(&n->list_lock, flags);
@@ -4553,6 +5947,558 @@ slab_empty:
discard_slab(s, slab);
}
+/*
+ * pcs is locked. We should have gotten rid of the spare sheaf and obtained an
+ * empty sheaf, while the main sheaf is full. We want to install the empty
+ * sheaf as the main sheaf, and make the current main sheaf the spare.
+ *
+ * However due to having relinquished the cpu_sheaves lock when obtaining
+ * the empty sheaf, we need to handle some unlikely but possible cases.
+ *
+ * If we put any sheaf to barn here, it's because we were interrupted or have
+ * been migrated to a different cpu, which should be rare enough so just ignore
+ * the barn's limits to simplify the handling.
+ *
+ * An alternative scenario that gets us here is when we fail
+ * barn_replace_full_sheaf(), because there's no empty sheaf available in the
+ * barn, so we had to allocate it by alloc_empty_sheaf(). But because we saw the
+ * limit on full sheaves was not exceeded, we assume it didn't change and just
+ * put the full sheaf there.
+ */
+static void __pcs_install_empty_sheaf(struct kmem_cache *s,
+ struct slub_percpu_sheaves *pcs, struct slab_sheaf *empty,
+ struct node_barn *barn)
+{
+ lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
+
+ /* This is what we expect to find if nobody interrupted us. */
+ if (likely(!pcs->spare)) {
+ pcs->spare = pcs->main;
+ pcs->main = empty;
+ return;
+ }
+
+ /*
+ * Unlikely because if the main sheaf had space, we would have just
+ * freed to it. Get rid of our empty sheaf.
+ */
+ if (pcs->main->size < s->sheaf_capacity) {
+ barn_put_empty_sheaf(barn, empty);
+ return;
+ }
+
+ /* Also unlikely for the same reason */
+ if (pcs->spare->size < s->sheaf_capacity) {
+ swap(pcs->main, pcs->spare);
+ barn_put_empty_sheaf(barn, empty);
+ return;
+ }
+
+ /*
+ * We probably failed barn_replace_full_sheaf() due to no empty sheaf
+ * available there, but we allocated one, so finish the job.
+ */
+ barn_put_full_sheaf(barn, pcs->main);
+ stat(s, BARN_PUT);
+ pcs->main = empty;
+}
+
+/*
+ * Replace the full main sheaf with a (at least partially) empty sheaf.
+ *
+ * Must be called with the cpu_sheaves local lock locked. If successful, returns
+ * the pcs pointer and the local lock locked (possibly on a different cpu than
+ * initially called). If not successful, returns NULL and the local lock
+ * unlocked.
+ */
+static struct slub_percpu_sheaves *
+__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs)
+{
+ struct slab_sheaf *empty;
+ struct node_barn *barn;
+ bool put_fail;
+
+restart:
+ lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
+
+ barn = get_barn(s);
+ if (!barn) {
+ local_unlock(&s->cpu_sheaves->lock);
+ return NULL;
+ }
+
+ put_fail = false;
+
+ if (!pcs->spare) {
+ empty = barn_get_empty_sheaf(barn);
+ if (empty) {
+ pcs->spare = pcs->main;
+ pcs->main = empty;
+ return pcs;
+ }
+ goto alloc_empty;
+ }
+
+ if (pcs->spare->size < s->sheaf_capacity) {
+ swap(pcs->main, pcs->spare);
+ return pcs;
+ }
+
+ empty = barn_replace_full_sheaf(barn, pcs->main);
+
+ if (!IS_ERR(empty)) {
+ stat(s, BARN_PUT);
+ pcs->main = empty;
+ return pcs;
+ }
+
+ if (PTR_ERR(empty) == -E2BIG) {
+ /* Since we got here, spare exists and is full */
+ struct slab_sheaf *to_flush = pcs->spare;
+
+ stat(s, BARN_PUT_FAIL);
+
+ pcs->spare = NULL;
+ local_unlock(&s->cpu_sheaves->lock);
+
+ sheaf_flush_unused(s, to_flush);
+ empty = to_flush;
+ goto got_empty;
+ }
+
+ /*
+	 * We could not replace the full sheaf because the barn had no empty
+	 * sheaves. We can still allocate an empty sheaf and let
+	 * __pcs_install_empty_sheaf() put the full one into the barn, but if
+	 * we fail to allocate it, make sure to count the failure.
+ */
+ put_fail = true;
+
+alloc_empty:
+ local_unlock(&s->cpu_sheaves->lock);
+
+ empty = alloc_empty_sheaf(s, GFP_NOWAIT);
+ if (empty)
+ goto got_empty;
+
+ if (put_fail)
+ stat(s, BARN_PUT_FAIL);
+
+ if (!sheaf_flush_main(s))
+ return NULL;
+
+ if (!local_trylock(&s->cpu_sheaves->lock))
+ return NULL;
+
+ pcs = this_cpu_ptr(s->cpu_sheaves);
+
+ /*
+ * we flushed the main sheaf so it should be empty now,
+ * but in case we got preempted or migrated, we need to
+ * check again
+ */
+ if (pcs->main->size == s->sheaf_capacity)
+ goto restart;
+
+ return pcs;
+
+got_empty:
+ if (!local_trylock(&s->cpu_sheaves->lock)) {
+ barn_put_empty_sheaf(barn, empty);
+ return NULL;
+ }
+
+ pcs = this_cpu_ptr(s->cpu_sheaves);
+ __pcs_install_empty_sheaf(s, pcs, empty, barn);
+
+ return pcs;
+}
+
+/*
+ * Free an object to the percpu sheaves.
+ * The object is expected to have passed slab_free_hook() already.
+ */
+static __fastpath_inline
+bool free_to_pcs(struct kmem_cache *s, void *object)
+{
+ struct slub_percpu_sheaves *pcs;
+
+ if (!local_trylock(&s->cpu_sheaves->lock))
+ return false;
+
+ pcs = this_cpu_ptr(s->cpu_sheaves);
+
+ if (unlikely(pcs->main->size == s->sheaf_capacity)) {
+
+ pcs = __pcs_replace_full_main(s, pcs);
+ if (unlikely(!pcs))
+ return false;
+ }
+
+ pcs->main->objects[pcs->main->size++] = object;
+
+ local_unlock(&s->cpu_sheaves->lock);
+
+ stat(s, FREE_PCS);
+
+ return true;
+}
+
+static void rcu_free_sheaf(struct rcu_head *head)
+{
+ struct kmem_cache_node *n;
+ struct slab_sheaf *sheaf;
+ struct node_barn *barn = NULL;
+ struct kmem_cache *s;
+
+ sheaf = container_of(head, struct slab_sheaf, rcu_head);
+
+ s = sheaf->cache;
+
+ /*
+	 * This may remove some objects due to slab_free_hook() returning false, so
+	 * the sheaf might no longer be completely full. But it's easier to handle it
+	 * as full (unless it became completely empty), as the code handles that fine.
+	 * The only downside is that the sheaf will serve fewer allocations when
+	 * reused. This only happens due to debugging, which is a performance hit
+	 * anyway.
+ */
+ __rcu_free_sheaf_prepare(s, sheaf);
+
+ n = get_node(s, sheaf->node);
+ if (!n)
+ goto flush;
+
+ barn = n->barn;
+
+ /* due to slab_free_hook() */
+ if (unlikely(sheaf->size == 0))
+ goto empty;
+
+ /*
+	 * Checking nr_full/nr_empty outside the lock avoids contention in case
+	 * the barn is at the respective limit. Due to the race we might go over
+	 * the limit but that should be rare and harmless.
+ */
+
+ if (data_race(barn->nr_full) < MAX_FULL_SHEAVES) {
+ stat(s, BARN_PUT);
+ barn_put_full_sheaf(barn, sheaf);
+ return;
+ }
+
+flush:
+ stat(s, BARN_PUT_FAIL);
+ sheaf_flush_unused(s, sheaf);
+
+empty:
+ if (barn && data_race(barn->nr_empty) < MAX_EMPTY_SHEAVES) {
+ barn_put_empty_sheaf(barn, sheaf);
+ return;
+ }
+
+ free_empty_sheaf(s, sheaf);
+}
+
+bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
+{
+ struct slub_percpu_sheaves *pcs;
+ struct slab_sheaf *rcu_sheaf;
+
+ if (!local_trylock(&s->cpu_sheaves->lock))
+ goto fail;
+
+ pcs = this_cpu_ptr(s->cpu_sheaves);
+
+ if (unlikely(!pcs->rcu_free)) {
+
+ struct slab_sheaf *empty;
+ struct node_barn *barn;
+
+ if (pcs->spare && pcs->spare->size == 0) {
+ pcs->rcu_free = pcs->spare;
+ pcs->spare = NULL;
+ goto do_free;
+ }
+
+ barn = get_barn(s);
+ if (!barn) {
+ local_unlock(&s->cpu_sheaves->lock);
+ goto fail;
+ }
+
+ empty = barn_get_empty_sheaf(barn);
+
+ if (empty) {
+ pcs->rcu_free = empty;
+ goto do_free;
+ }
+
+ local_unlock(&s->cpu_sheaves->lock);
+
+ empty = alloc_empty_sheaf(s, GFP_NOWAIT);
+
+ if (!empty)
+ goto fail;
+
+ if (!local_trylock(&s->cpu_sheaves->lock)) {
+ barn_put_empty_sheaf(barn, empty);
+ goto fail;
+ }
+
+ pcs = this_cpu_ptr(s->cpu_sheaves);
+
+ if (unlikely(pcs->rcu_free))
+ barn_put_empty_sheaf(barn, empty);
+ else
+ pcs->rcu_free = empty;
+ }
+
+do_free:
+
+ rcu_sheaf = pcs->rcu_free;
+
+ /*
+ * Since we flush immediately when size reaches capacity, we never reach
+ * this with size already at capacity, so no OOB write is possible.
+ */
+ rcu_sheaf->objects[rcu_sheaf->size++] = obj;
+
+ if (likely(rcu_sheaf->size < s->sheaf_capacity)) {
+ rcu_sheaf = NULL;
+ } else {
+ pcs->rcu_free = NULL;
+ rcu_sheaf->node = numa_mem_id();
+ }
+
+ /*
+ * we flush before local_unlock to make sure a racing
+ * flush_all_rcu_sheaves() doesn't miss this sheaf
+ */
+ if (rcu_sheaf)
+ call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
+
+ local_unlock(&s->cpu_sheaves->lock);
+
+ stat(s, FREE_RCU_SHEAF);
+ return true;
+
+fail:
+ stat(s, FREE_RCU_SHEAF_FAIL);
+ return false;
+}
+
+/*
+ * Bulk free objects to the percpu sheaves.
+ * Unlike free_to_pcs() this includes the calls to all necessary hooks
+ * and the fallback to freeing to slab pages.
+ */
+static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+ struct slub_percpu_sheaves *pcs;
+ struct slab_sheaf *main, *empty;
+ bool init = slab_want_init_on_free(s);
+ unsigned int batch, i = 0;
+ struct node_barn *barn;
+ void *remote_objects[PCS_BATCH_MAX];
+ unsigned int remote_nr = 0;
+ int node = numa_mem_id();
+
+next_remote_batch:
+ while (i < size) {
+ struct slab *slab = virt_to_slab(p[i]);
+
+ memcg_slab_free_hook(s, slab, p + i, 1);
+ alloc_tagging_slab_free_hook(s, slab, p + i, 1);
+
+ if (unlikely(!slab_free_hook(s, p[i], init, false))) {
+ p[i] = p[--size];
+ if (!size)
+ goto flush_remote;
+ continue;
+ }
+
+ if (unlikely(IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node)) {
+ remote_objects[remote_nr] = p[i];
+ p[i] = p[--size];
+ if (++remote_nr >= PCS_BATCH_MAX)
+ goto flush_remote;
+ continue;
+ }
+
+ i++;
+ }
+
+next_batch:
+ if (!local_trylock(&s->cpu_sheaves->lock))
+ goto fallback;
+
+ pcs = this_cpu_ptr(s->cpu_sheaves);
+
+ if (likely(pcs->main->size < s->sheaf_capacity))
+ goto do_free;
+
+ barn = get_barn(s);
+ if (!barn)
+ goto no_empty;
+
+ if (!pcs->spare) {
+ empty = barn_get_empty_sheaf(barn);
+ if (!empty)
+ goto no_empty;
+
+ pcs->spare = pcs->main;
+ pcs->main = empty;
+ goto do_free;
+ }
+
+ if (pcs->spare->size < s->sheaf_capacity) {
+ swap(pcs->main, pcs->spare);
+ goto do_free;
+ }
+
+ empty = barn_replace_full_sheaf(barn, pcs->main);
+ if (IS_ERR(empty)) {
+ stat(s, BARN_PUT_FAIL);
+ goto no_empty;
+ }
+
+ stat(s, BARN_PUT);
+ pcs->main = empty;
+
+do_free:
+ main = pcs->main;
+ batch = min(size, s->sheaf_capacity - main->size);
+
+ memcpy(main->objects + main->size, p, batch * sizeof(void *));
+ main->size += batch;
+
+ local_unlock(&s->cpu_sheaves->lock);
+
+ stat_add(s, FREE_PCS, batch);
+
+ if (batch < size) {
+ p += batch;
+ size -= batch;
+ goto next_batch;
+ }
+
+ return;
+
+no_empty:
+ local_unlock(&s->cpu_sheaves->lock);
+
+ /*
+ * if we depleted all empty sheaves in the barn or there are too
+ * many full sheaves, free the rest to slab pages
+ */
+fallback:
+ __kmem_cache_free_bulk(s, size, p);
+
+flush_remote:
+ if (remote_nr) {
+ __kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]);
+ if (i < size) {
+ remote_nr = 0;
+ goto next_remote_batch;
+ }
+ }
+}
+
+struct defer_free {
+ struct llist_head objects;
+ struct llist_head slabs;
+ struct irq_work work;
+};
+
+static void free_deferred_objects(struct irq_work *work);
+
+static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = {
+ .objects = LLIST_HEAD_INIT(objects),
+ .slabs = LLIST_HEAD_INIT(slabs),
+ .work = IRQ_WORK_INIT(free_deferred_objects),
+};
+
+/*
+ * In PREEMPT_RT irq_work runs in a per-cpu kthread, so it's safe
+ * to take sleeping spin_locks from __slab_free() and deactivate_slab().
+ * In !PREEMPT_RT irq_work will run after local_unlock_irqrestore().
+ */
+static void free_deferred_objects(struct irq_work *work)
+{
+ struct defer_free *df = container_of(work, struct defer_free, work);
+ struct llist_head *objs = &df->objects;
+ struct llist_head *slabs = &df->slabs;
+ struct llist_node *llnode, *pos, *t;
+
+ if (llist_empty(objs) && llist_empty(slabs))
+ return;
+
+ llnode = llist_del_all(objs);
+ llist_for_each_safe(pos, t, llnode) {
+ struct kmem_cache *s;
+ struct slab *slab;
+ void *x = pos;
+
+ slab = virt_to_slab(x);
+ s = slab->slab_cache;
+
+ /*
+ * We used freepointer in 'x' to link 'x' into df->objects.
+ * Clear it to NULL to avoid false positive detection
+ * of "Freepointer corruption".
+ */
+ *(void **)x = NULL;
+
+ /* Point 'x' back to the beginning of allocated object */
+ x -= s->offset;
+ __slab_free(s, slab, x, x, 1, _THIS_IP_);
+ }
+
+ llnode = llist_del_all(slabs);
+ llist_for_each_safe(pos, t, llnode) {
+ struct slab *slab = container_of(pos, struct slab, llnode);
+
+#ifdef CONFIG_SLUB_TINY
+ discard_slab(slab->slab_cache, slab);
+#else
+ deactivate_slab(slab->slab_cache, slab, slab->flush_freelist);
+#endif
+ }
+}
+
+static void defer_free(struct kmem_cache *s, void *head)
+{
+ struct defer_free *df;
+
+ guard(preempt)();
+
+ df = this_cpu_ptr(&defer_free_objects);
+ if (llist_add(head + s->offset, &df->objects))
+ irq_work_queue(&df->work);
+}
+
+static void defer_deactivate_slab(struct slab *slab, void *flush_freelist)
+{
+ struct defer_free *df;
+
+ slab->flush_freelist = flush_freelist;
+
+ guard(preempt)();
+
+ df = this_cpu_ptr(&defer_free_objects);
+ if (llist_add(&slab->llnode, &df->slabs))
+ irq_work_queue(&df->work);
+}
+
+void defer_free_barrier(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work);
+}
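
The deferral machinery above chains objects and slabs through per-CPU lock-free lists and drains them from irq_work, so a free that could not take locks in its original context is completed later from a safe one. A standalone userspace model of that idea (illustrative only; C11 atomics stand in for llist_add()/llist_del_all() plus irq_work, and the node is allocated separately instead of reusing the object's freepointer):

#include <stdatomic.h>
#include <stdlib.h>

struct deferred_node {
	struct deferred_node *next;
	void *object;
};

static _Atomic(struct deferred_node *) deferred_head;

/* Called from a context that must not free directly. */
static void defer_object(struct deferred_node *node, void *object)
{
	node->object = object;
	node->next = atomic_load_explicit(&deferred_head, memory_order_relaxed);
	while (!atomic_compare_exchange_weak_explicit(&deferred_head,
						      &node->next, node,
						      memory_order_release,
						      memory_order_relaxed))
		;
}

/* Called later from a context where freeing is allowed. */
static void drain_deferred(void)
{
	struct deferred_node *node =
		atomic_exchange_explicit(&deferred_head, NULL,
					 memory_order_acquire);

	while (node) {
		struct deferred_node *next = node->next;

		free(node->object);
		free(node);
		node = next;
	}
}
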
+
#ifndef CONFIG_SLUB_TINY
/*
* Fastpath with forced inlining to produce a kfree and kmem_cache_free that
@@ -4573,6 +6519,8 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
struct slab *slab, void *head, void *tail,
int cnt, unsigned long addr)
{
+ /* cnt == 0 signals that it's called from kfree_nolock() */
+ bool allow_spin = cnt;
struct kmem_cache_cpu *c;
unsigned long tid;
void **freelist;
@@ -4591,10 +6539,29 @@ redo:
barrier();
if (unlikely(slab != c->slab)) {
- __slab_free(s, slab, head, tail, cnt, addr);
+ if (unlikely(!allow_spin)) {
+ /*
+ * __slab_free() can locklessly cmpxchg16 into a slab,
+ * but then it might need to take spin_lock or local_lock
+ * in put_cpu_partial() for further processing.
+ * Avoid the complexity and simply add to a deferred list.
+ */
+ defer_free(s, head);
+ } else {
+ __slab_free(s, slab, head, tail, cnt, addr);
+ }
return;
}
+ if (unlikely(!allow_spin)) {
+ if ((in_nmi() || !USE_LOCKLESS_FAST_PATH()) &&
+ local_lock_is_locked(&s->cpu_slab->lock)) {
+ defer_free(s, head);
+ return;
+ }
+ cnt = 1; /* restore cnt. kfree_nolock() frees one object at a time */
+ }
+
if (USE_LOCKLESS_FAST_PATH()) {
freelist = READ_ONCE(c->freelist);
@@ -4605,11 +6572,13 @@ redo:
goto redo;
}
} else {
+ __maybe_unused unsigned long flags = 0;
+
/* Update the free list under the local lock */
- local_lock(&s->cpu_slab->lock);
+ local_lock_cpu_slab(s, flags);
c = this_cpu_ptr(s->cpu_slab);
if (unlikely(slab != c->slab)) {
- local_unlock(&s->cpu_slab->lock);
+ local_unlock_cpu_slab(s, flags);
goto redo;
}
tid = c->tid;
@@ -4619,7 +6588,7 @@ redo:
c->freelist = head;
c->tid = next_tid(tid);
- local_unlock(&s->cpu_slab->lock);
+ local_unlock_cpu_slab(s, flags);
}
stat_add(s, FREE_FASTPATH, cnt);
}
@@ -4639,8 +6608,16 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
memcg_slab_free_hook(s, slab, &object, 1);
alloc_tagging_slab_free_hook(s, slab, &object, 1);
- if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false)))
- do_slab_free(s, slab, object, object, 1, addr);
+ if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false)))
+ return;
+
+ if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) ||
+ slab_nid(slab) == numa_mem_id())) {
+ if (likely(free_to_pcs(s, object)))
+ return;
+ }
+
+ do_slab_free(s, slab, object, object, 1, addr);
}
#ifdef CONFIG_MEMCG
@@ -4764,7 +6741,7 @@ static void free_large_kmalloc(struct folio *folio, void *object)
lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
-(PAGE_SIZE << order));
__folio_clear_large_kmalloc(folio);
- folio_put(folio);
+ free_frozen_pages(&folio->page, order);
}
/*
@@ -4842,8 +6819,73 @@ void kfree(const void *object)
}
EXPORT_SYMBOL(kfree);
+/*
+ * Can be called while holding a raw_spinlock_t or from IRQ and NMI,
+ * but ONLY for objects allocated by kmalloc_nolock().
+ * Debug checks (like kmemleak and kfence) were skipped on allocation,
+ * hence
+ *   obj = kmalloc(); kfree_nolock(obj);
+ * will miss kmemleak/kfence bookkeeping and will cause false positives.
+ * large_kmalloc is not supported either.
+ */
+void kfree_nolock(const void *object)
+{
+ struct folio *folio;
+ struct slab *slab;
+ struct kmem_cache *s;
+ void *x = (void *)object;
+
+ if (unlikely(ZERO_OR_NULL_PTR(object)))
+ return;
+
+ folio = virt_to_folio(object);
+ if (unlikely(!folio_test_slab(folio))) {
+ WARN_ONCE(1, "large_kmalloc is not supported by kfree_nolock()");
+ return;
+ }
+
+ slab = folio_slab(folio);
+ s = slab->slab_cache;
+
+ memcg_slab_free_hook(s, slab, &x, 1);
+ alloc_tagging_slab_free_hook(s, slab, &x, 1);
+ /*
+ * Unlike slab_free() do NOT call the following:
+ * kmemleak_free_recursive(x, s->flags);
+ * debug_check_no_locks_freed(x, s->object_size);
+ * debug_check_no_obj_freed(x, s->object_size);
+ * __kcsan_check_access(x, s->object_size, ..);
+ * kfence_free(x);
+	 * since they either take spinlocks or are not safe from any context.
+ */
+ kmsan_slab_free(s, x);
+ /*
+ * If KASAN finds a kernel bug it will do kasan_report_invalid_free()
+ * which will call raw_spin_lock_irqsave() which is technically
+ * unsafe from NMI, but take chance and report kernel bug.
+ * The sequence of
+ * kasan_report_invalid_free() -> raw_spin_lock_irqsave() -> NMI
+ * -> kfree_nolock() -> kasan_report_invalid_free() on the same CPU
+ * is double buggy and deserves to deadlock.
+ */
+ if (kasan_slab_pre_free(s, x))
+ return;
+ /*
+ * memcg, kasan_slab_pre_free are done for 'x'.
+ * The only thing left is kasan_poison without quarantine,
+ * since kasan quarantine takes locks and not supported from NMI.
+ */
+ kasan_slab_free(s, x, false, false, /* skip quarantine */true);
+#ifndef CONFIG_SLUB_TINY
+ do_slab_free(s, slab, x, x, 0, _RET_IP_);
+#else
+ defer_free(s, x);
+#endif
+}
+EXPORT_SYMBOL_GPL(kfree_nolock);
+
static __always_inline __realloc_size(2) void *
-__do_krealloc(const void *p, size_t new_size, gfp_t flags)
+__do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags, int nid)
{
void *ret;
size_t ks = 0;
@@ -4857,6 +6899,16 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags)
if (!kasan_check_byte(p))
return NULL;
+ /*
+	 * If reallocation is not necessary (e.g. the new size is less
+	 * than the currently allocated size), the current allocation will be
+ * preserved unless __GFP_THISNODE is set. In the latter case a new
+ * allocation on the requested node will be attempted.
+ */
+ if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE &&
+ nid != page_to_nid(virt_to_page(p)))
+ goto alloc_new;
+
if (is_kfence_address(p)) {
ks = orig_size = kfence_ksize(p);
} else {
@@ -4879,6 +6931,10 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags)
if (new_size > ks)
goto alloc_new;
+ /* If the old object doesn't satisfy the new alignment, allocate a new one */
+ if (!IS_ALIGNED((unsigned long)p, align))
+ goto alloc_new;
+
/* Zero out spare memory. */
if (want_init_on_alloc(flags)) {
kasan_disable_current();
@@ -4901,7 +6957,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags)
return (void *)p;
alloc_new:
- ret = kmalloc_node_track_caller_noprof(new_size, flags, NUMA_NO_NODE, _RET_IP_);
+ ret = kmalloc_node_track_caller_noprof(new_size, flags, nid, _RET_IP_);
if (ret && p) {
/* Disable KASAN checks as the object's redzone is accessed. */
kasan_disable_current();
@@ -4913,14 +6969,19 @@ alloc_new:
}
/**
- * krealloc - reallocate memory. The contents will remain unchanged.
+ * krealloc_node_align - reallocate memory. The contents will remain unchanged.
* @p: object to reallocate memory for.
* @new_size: how many bytes of memory are required.
+ * @align: desired alignment.
* @flags: the type of memory to allocate.
+ * @nid: NUMA node or NUMA_NO_NODE
*
* If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size
* is 0 and @p is not a %NULL pointer, the object pointed to is freed.
*
+ * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
+ * Documentation/core-api/memory-allocation.rst for more details.
+ *
* If __GFP_ZERO logic is requested, callers must ensure that, starting with the
* initial memory allocation, every subsequent call to this API for the same
* memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
@@ -4929,12 +6990,12 @@ alloc_new:
* When slub_debug_orig_size() is off, krealloc() only knows about the bucket
* size of an allocation (but not the exact size it was allocated with) and
* hence implements the following semantics for shrinking and growing buffers
- * with __GFP_ZERO.
+ * with __GFP_ZERO::
*
- * new bucket
- * 0 size size
- * |--------|----------------|
- * | keep | zero |
+ * new bucket
+ * 0 size size
+ * |--------|----------------|
+ * | keep | zero |
*
* Otherwise, the original allocation size 'orig_size' could be used to
* precisely clear the requested size, and the new size will also be stored
@@ -4945,7 +7006,8 @@ alloc_new:
*
* Return: pointer to the allocated memory or %NULL in case of error
*/
-void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags)
+void *krealloc_node_align_noprof(const void *p, size_t new_size, unsigned long align,
+ gfp_t flags, int nid)
{
void *ret;
@@ -4954,13 +7016,13 @@ void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags)
return ZERO_SIZE_PTR;
}
- ret = __do_krealloc(p, new_size, flags);
+ ret = __do_krealloc(p, new_size, align, flags, nid);
if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret))
kfree(p);
return ret;
}
-EXPORT_SYMBOL(krealloc_noprof);
+EXPORT_SYMBOL(krealloc_node_align_noprof);
static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
{
@@ -4968,14 +7030,16 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
* We want to attempt a large physically contiguous block first because
* it is less likely to fragment multiple larger blocks and therefore
* contribute to a long term fragmentation less than vmalloc fallback.
- * However make sure that larger requests are not too disruptive - no
- * OOM killer and no allocation failure warnings as we have a fallback.
+ * However make sure that larger requests are not too disruptive - i.e.
+	 * do not direct reclaim unless physically contiguous memory is preferred
+	 * (__GFP_RETRY_MAYFAIL mode). We still kick in kswapd/kcompactd to
+	 * start working in the background.
*/
if (size > PAGE_SIZE) {
flags |= __GFP_NOWARN;
if (!(flags & __GFP_RETRY_MAYFAIL))
- flags |= __GFP_NORETRY;
+ flags &= ~__GFP_DIRECT_RECLAIM;
/* nofail semantic is implemented by the vmalloc fallback */
flags &= ~__GFP_NOFAIL;
@@ -4989,9 +7053,13 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
* failure, fall back to non-contiguous (vmalloc) allocation.
* @size: size of the request.
* @b: which set of kmalloc buckets to allocate from.
+ * @align: desired alignment.
* @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
* @node: numa node to allocate from
*
+ * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
+ * Documentation/core-api/memory-allocation.rst for more details.
+ *
* Uses kmalloc to get the memory but if the allocation fails then falls back
* to the vmalloc allocator. Use kvfree for freeing the memory.
*
@@ -5001,7 +7069,8 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
*
 * Return: pointer to the allocated memory or %NULL in case of failure
*/
-void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
+void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align,
+ gfp_t flags, int node)
{
void *ret;
@@ -5031,7 +7100,7 @@ void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
* about the resulting pointer, and cannot play
* protection games.
*/
- return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
+ return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END,
flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
node, __builtin_return_address(0));
}
@@ -5075,14 +7144,19 @@ void kvfree_sensitive(const void *addr, size_t len)
EXPORT_SYMBOL(kvfree_sensitive);
/**
- * kvrealloc - reallocate memory; contents remain unchanged
+ * kvrealloc_node_align - reallocate memory; contents remain unchanged
* @p: object to reallocate memory for
* @size: the size to reallocate
+ * @align: desired alignment
* @flags: the flags for the page level allocator
+ * @nid: NUMA node id
*
* If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0
* and @p is not a %NULL pointer, the object pointed to is freed.
*
+ * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
+ * Documentation/core-api/memory-allocation.rst for more details.
+ *
* If __GFP_ZERO logic is requested, callers must ensure that, starting with the
* initial memory allocation, every subsequent call to this API for the same
* memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
@@ -5096,17 +7170,18 @@ EXPORT_SYMBOL(kvfree_sensitive);
*
* Return: pointer to the allocated memory or %NULL in case of error
*/
-void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags)
+void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long align,
+ gfp_t flags, int nid)
{
void *n;
if (is_vmalloc_addr(p))
- return vrealloc_noprof(p, size, flags);
+ return vrealloc_node_align_noprof(p, size, align, flags, nid);
- n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size));
+ n = krealloc_node_align_noprof(p, size, align, kmalloc_gfp_adjust(flags, size), nid);
if (!n) {
/* We failed to krealloc(), fall back to kvmalloc(). */
- n = kvmalloc_noprof(size, flags);
+ n = kvmalloc_node_align_noprof(size, align, flags, nid);
if (!n)
return NULL;
@@ -5122,7 +7197,7 @@ void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags)
return n;
}
-EXPORT_SYMBOL(kvrealloc_noprof);
+EXPORT_SYMBOL(kvrealloc_node_align_noprof);
struct detached_freelist {
struct slab *slab;
@@ -5233,6 +7308,15 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
if (!size)
return;
+ /*
+	 * Freeing to sheaves is so incompatible with the detached freelist that
+	 * once we go that way, we have to do everything differently.
+ */
+ if (s && s->cpu_sheaves) {
+ free_to_pcs_bulk(s, size, p);
+ return;
+ }
+
do {
struct detached_freelist df;
@@ -5351,7 +7435,7 @@ error:
int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size,
void **p)
{
- int i;
+ unsigned int i = 0;
if (!size)
return 0;
@@ -5360,9 +7444,20 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size,
if (unlikely(!s))
return 0;
- i = __kmem_cache_alloc_bulk(s, flags, size, p);
- if (unlikely(i == 0))
- return 0;
+ if (s->cpu_sheaves)
+ i = alloc_from_pcs_bulk(s, size, p);
+
+ if (i < size) {
+ /*
+ * If we ran out of memory, don't bother with freeing back to
+ * the percpu sheaves, we have bigger problems.
+ */
+ if (unlikely(__kmem_cache_alloc_bulk(s, flags, size - i, p + i) == 0)) {
+ if (i > 0)
+ __kmem_cache_free_bulk(s, i, p);
+ return 0;
+ }
+ }
/*
* memcg and kmem_cache debug support and memory initialization.
@@ -5372,11 +7467,11 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size,
slab_want_init_on_alloc(flags, s), s->object_size))) {
return 0;
}
- return i;
+
+ return size;
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof);
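The sheaf-aware bulk path above preserves the all-or-nothing contract: the function returns either the full requested count or 0, never a partial batch, and frees back anything it managed to grab on failure. An illustrative caller (sketch only; the cache pointer is a hypothetical, already-created kmem_cache):

	void *objs[16];

	if (!kmem_cache_alloc_bulk(cache, GFP_KERNEL, ARRAY_SIZE(objs), objs))
		return -ENOMEM;	/* nothing was allocated */
	/* ... use the objects ... */
	kmem_cache_free_bulk(cache, ARRAY_SIZE(objs), objs);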
-
/*
* Object placement in a slab is made very easy because we always start at
* offset 0. If we tune the size of the object to the alignment then we can
@@ -5510,7 +7605,7 @@ static inline int calculate_order(unsigned int size)
}
static void
-init_kmem_cache_node(struct kmem_cache_node *n)
+init_kmem_cache_node(struct kmem_cache_node *n, struct node_barn *barn)
{
n->nr_partial = 0;
spin_lock_init(&n->list_lock);
@@ -5520,6 +7615,9 @@ init_kmem_cache_node(struct kmem_cache_node *n)
atomic_long_set(&n->total_objects, 0);
INIT_LIST_HEAD(&n->full);
#endif
+ n->barn = barn;
+ if (barn)
+ barn_init(barn);
}
#ifndef CONFIG_SLUB_TINY
@@ -5550,6 +7648,26 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
}
#endif /* CONFIG_SLUB_TINY */
+static int init_percpu_sheaves(struct kmem_cache *s)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct slub_percpu_sheaves *pcs;
+
+ pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
+
+ local_trylock_init(&pcs->lock);
+
+ pcs->main = alloc_empty_sheaf(s, GFP_KERNEL);
+
+ if (!pcs->main)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
static struct kmem_cache *kmem_cache_node;
/*
@@ -5585,7 +7703,7 @@ static void early_kmem_cache_node_alloc(int node)
slab->freelist = get_freepointer(kmem_cache_node, n);
slab->inuse = 1;
kmem_cache_node->node[node] = n;
- init_kmem_cache_node(n);
+ init_kmem_cache_node(n, NULL);
inc_slabs_node(kmem_cache_node, node, slab->objects);
/*
@@ -5601,6 +7719,13 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
struct kmem_cache_node *n;
for_each_kmem_cache_node(s, node, n) {
+ if (n->barn) {
+ WARN_ON(n->barn->nr_full);
+ WARN_ON(n->barn->nr_empty);
+ kfree(n->barn);
+ n->barn = NULL;
+ }
+
s->node[node] = NULL;
kmem_cache_free(kmem_cache_node, n);
}
@@ -5609,7 +7734,13 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
void __kmem_cache_release(struct kmem_cache *s)
{
cache_random_seq_destroy(s);
+ if (s->cpu_sheaves)
+ pcs_destroy(s);
#ifndef CONFIG_SLUB_TINY
+#ifdef CONFIG_PREEMPT_RT
+ if (s->cpu_slab)
+ lockdep_unregister_key(&s->lock_key);
+#endif
free_percpu(s->cpu_slab);
#endif
free_kmem_cache_nodes(s);
@@ -5621,20 +7752,29 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
for_each_node_mask(node, slab_nodes) {
struct kmem_cache_node *n;
+ struct node_barn *barn = NULL;
if (slab_state == DOWN) {
early_kmem_cache_node_alloc(node);
continue;
}
+
+ if (s->cpu_sheaves) {
+ barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node);
+
+ if (!barn)
+ return 0;
+ }
+
n = kmem_cache_alloc_node(kmem_cache_node,
GFP_KERNEL, node);
-
if (!n) {
- free_kmem_cache_nodes(s);
+ kfree(barn);
return 0;
}
- init_kmem_cache_node(n);
+ init_kmem_cache_node(n, barn);
+
s->node[node] = n;
}
return 1;
@@ -5889,8 +8029,15 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
struct kmem_cache_node *n;
flush_all_cpus_locked(s);
+
+ /* we might have rcu sheaves in flight */
+ if (s->cpu_sheaves)
+ rcu_barrier();
+
/* Attempt to free all objects */
for_each_kmem_cache_node(s, node, n) {
+ if (n->barn)
+ barn_shrink(s, n->barn);
free_partial(s, n);
if (n->nr_partial || node_nr_slabs(n))
return 1;
@@ -6094,6 +8241,9 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s)
for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
INIT_LIST_HEAD(promote + i);
+ if (n->barn)
+ barn_shrink(s, n->barn);
+
spin_lock_irqsave(&n->list_lock, flags);
/*
@@ -6146,7 +8296,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
return __kmem_cache_do_shrink(s);
}
-static int slab_mem_going_offline_callback(void *arg)
+static int slab_mem_going_offline_callback(void)
{
struct kmem_cache *s;
@@ -6160,58 +8310,37 @@ static int slab_mem_going_offline_callback(void *arg)
return 0;
}
-static void slab_mem_offline_callback(void *arg)
-{
- struct memory_notify *marg = arg;
- int offline_node;
-
- offline_node = marg->status_change_nid_normal;
-
- /*
- * If the node still has available memory. we need kmem_cache_node
- * for it yet.
- */
- if (offline_node < 0)
- return;
-
- mutex_lock(&slab_mutex);
- node_clear(offline_node, slab_nodes);
- /*
- * We no longer free kmem_cache_node structures here, as it would be
- * racy with all get_node() users, and infeasible to protect them with
- * slab_mutex.
- */
- mutex_unlock(&slab_mutex);
-}
-
-static int slab_mem_going_online_callback(void *arg)
+static int slab_mem_going_online_callback(int nid)
{
struct kmem_cache_node *n;
struct kmem_cache *s;
- struct memory_notify *marg = arg;
- int nid = marg->status_change_nid_normal;
int ret = 0;
/*
- * If the node's memory is already available, then kmem_cache_node is
- * already created. Nothing to do.
- */
- if (nid < 0)
- return 0;
-
- /*
* We are bringing a node online. No memory is available yet. We must
* allocate a kmem_cache_node structure in order to bring the node
* online.
*/
mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list) {
+ struct node_barn *barn = NULL;
+
/*
* The structure may already exist if the node was previously
* onlined and offlined.
*/
if (get_node(s, nid))
continue;
+
+ if (s->cpu_sheaves) {
+ barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid);
+
+ if (!barn) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
/*
* XXX: kmem_cache_alloc_node will fallback to other nodes
* since memory is not yet available from the node that
@@ -6219,10 +8348,13 @@ static int slab_mem_going_online_callback(void *arg)
*/
n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
if (!n) {
+ kfree(barn);
ret = -ENOMEM;
goto out;
}
- init_kmem_cache_node(n);
+
+ init_kmem_cache_node(n, barn);
+
s->node[nid] = n;
}
/*
@@ -6238,21 +8370,16 @@ out:
static int slab_memory_callback(struct notifier_block *self,
unsigned long action, void *arg)
{
+ struct node_notify *nn = arg;
+ int nid = nn->nid;
int ret = 0;
switch (action) {
- case MEM_GOING_ONLINE:
- ret = slab_mem_going_online_callback(arg);
- break;
- case MEM_GOING_OFFLINE:
- ret = slab_mem_going_offline_callback(arg);
+ case NODE_ADDING_FIRST_MEMORY:
+ ret = slab_mem_going_online_callback(nid);
break;
- case MEM_OFFLINE:
- case MEM_CANCEL_ONLINE:
- slab_mem_offline_callback(arg);
- break;
- case MEM_ONLINE:
- case MEM_CANCEL_OFFLINE:
+ case NODE_REMOVING_LAST_MEMORY:
+ ret = slab_mem_going_offline_callback();
break;
}
if (ret)
@@ -6310,9 +8437,8 @@ void __init kmem_cache_init(void)
if (debug_guardpage_minorder())
slub_max_order = 0;
- /* Print slub debugging pointers without hashing */
- if (__slub_debug_enabled())
- no_hash_pointers_enable(NULL);
+ /* Inform pointer hashing choice about slub debugging state. */
+ hash_pointers_finalize(__slub_debug_enabled());
kmem_cache_node = &boot_kmem_cache_node;
kmem_cache = &boot_kmem_cache;
@@ -6321,14 +8447,14 @@ void __init kmem_cache_init(void)
* Initialize the nodemask for which we will allocate per node
* structures. Here we don't need taking slab_mutex yet.
*/
- for_each_node_state(node, N_NORMAL_MEMORY)
+ for_each_node_state(node, N_MEMORY)
node_set(node, slab_nodes);
create_boot_cache(kmem_cache_node, "kmem_cache_node",
sizeof(struct kmem_cache_node),
SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);
- hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
+ hotplug_node_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
/* Able to allocate the per node structures */
slab_state = PARTIAL;
@@ -6441,6 +8567,17 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name,
set_cpu_partial(s);
+ if (args->sheaf_capacity && !IS_ENABLED(CONFIG_SLUB_TINY)
+ && !(s->flags & SLAB_DEBUG_FLAGS)) {
+ s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves);
+ if (!s->cpu_sheaves) {
+ err = -ENOMEM;
+ goto out;
+ }
+ // TODO: increase capacity to grow slab_sheaf up to next kmalloc size?
+ s->sheaf_capacity = args->sheaf_capacity;
+ }
+
#ifdef CONFIG_NUMA
s->remote_node_defrag_ratio = 1000;
#endif
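Sheaves are opt-in per cache and are skipped for SLUB_TINY builds and debug caches, as the check above shows. A hedged sketch of how a cache could request them, assuming the kmem_cache_args-based kmem_cache_create() variant and a hypothetical struct my_obj:

	struct kmem_cache_args args = {
		.sheaf_capacity = 32,	/* per-CPU sheaves holding up to 32 objects */
	};
	struct kmem_cache *cache;

	cache = kmem_cache_create("my_objs", sizeof(struct my_obj), &args, 0);
	if (!cache)
		return -ENOMEM;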
@@ -6457,6 +8594,12 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name,
if (!alloc_kmem_cache_cpus(s))
goto out;
+ if (s->cpu_sheaves) {
+ err = init_percpu_sheaves(s);
+ if (err)
+ goto out;
+ }
+
err = 0;
/* Mutex is not taken during early boot */
@@ -6498,6 +8641,11 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab,
void *p;
void *addr = slab_address(slab);
+ if (!validate_slab_ptr(slab)) {
+ slab_err(s, slab, "Not a valid slab page");
+ return;
+ }
+
if (!check_slab(s, slab) || !on_freelist(s, slab, NULL))
return;
@@ -6909,6 +9057,12 @@ static ssize_t order_show(struct kmem_cache *s, char *buf)
}
SLAB_ATTR_RO(order);
+static ssize_t sheaf_capacity_show(struct kmem_cache *s, char *buf)
+{
+ return sysfs_emit(buf, "%u\n", s->sheaf_capacity);
+}
+SLAB_ATTR_RO(sheaf_capacity);
+
static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
{
return sysfs_emit(buf, "%lu\n", s->min_partial);
@@ -7256,8 +9410,12 @@ static ssize_t text##_store(struct kmem_cache *s, \
} \
SLAB_ATTR(text); \
+STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf);
STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
+STAT_ATTR(FREE_PCS, free_cpu_sheaf);
+STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf);
+STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail);
STAT_ATTR(FREE_FASTPATH, free_fastpath);
STAT_ATTR(FREE_SLOWPATH, free_slowpath);
STAT_ATTR(FREE_FROZEN, free_frozen);
@@ -7282,6 +9440,19 @@ STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
+STAT_ATTR(SHEAF_FLUSH, sheaf_flush);
+STAT_ATTR(SHEAF_REFILL, sheaf_refill);
+STAT_ATTR(SHEAF_ALLOC, sheaf_alloc);
+STAT_ATTR(SHEAF_FREE, sheaf_free);
+STAT_ATTR(BARN_GET, barn_get);
+STAT_ATTR(BARN_GET_FAIL, barn_get_fail);
+STAT_ATTR(BARN_PUT, barn_put);
+STAT_ATTR(BARN_PUT_FAIL, barn_put_fail);
+STAT_ATTR(SHEAF_PREFILL_FAST, sheaf_prefill_fast);
+STAT_ATTR(SHEAF_PREFILL_SLOW, sheaf_prefill_slow);
+STAT_ATTR(SHEAF_PREFILL_OVERSIZE, sheaf_prefill_oversize);
+STAT_ATTR(SHEAF_RETURN_FAST, sheaf_return_fast);
+STAT_ATTR(SHEAF_RETURN_SLOW, sheaf_return_slow);
#endif /* CONFIG_SLUB_STATS */
#ifdef CONFIG_KFENCE
@@ -7312,6 +9483,7 @@ static struct attribute *slab_attrs[] = {
&object_size_attr.attr,
&objs_per_slab_attr.attr,
&order_attr.attr,
+ &sheaf_capacity_attr.attr,
&min_partial_attr.attr,
&cpu_partial_attr.attr,
&objects_partial_attr.attr,
@@ -7343,8 +9515,12 @@ static struct attribute *slab_attrs[] = {
&remote_node_defrag_ratio_attr.attr,
#endif
#ifdef CONFIG_SLUB_STATS
+ &alloc_cpu_sheaf_attr.attr,
&alloc_fastpath_attr.attr,
&alloc_slowpath_attr.attr,
+ &free_cpu_sheaf_attr.attr,
+ &free_rcu_sheaf_attr.attr,
+ &free_rcu_sheaf_fail_attr.attr,
&free_fastpath_attr.attr,
&free_slowpath_attr.attr,
&free_frozen_attr.attr,
@@ -7369,6 +9545,19 @@ static struct attribute *slab_attrs[] = {
&cpu_partial_free_attr.attr,
&cpu_partial_node_attr.attr,
&cpu_partial_drain_attr.attr,
+ &sheaf_flush_attr.attr,
+ &sheaf_refill_attr.attr,
+ &sheaf_alloc_attr.attr,
+ &sheaf_free_attr.attr,
+ &barn_get_attr.attr,
+ &barn_get_fail_attr.attr,
+ &barn_put_attr.attr,
+ &barn_put_fail_attr.attr,
+ &sheaf_prefill_fast_attr.attr,
+ &sheaf_prefill_slow_attr.attr,
+ &sheaf_prefill_oversize_attr.attr,
+ &sheaf_return_fast_attr.attr,
+ &sheaf_return_slow_attr.attr,
#endif
#ifdef CONFIG_FAILSLAB
&failslab_attr.attr,
@@ -7710,15 +9899,12 @@ static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos)
return NULL;
}
-static int cmp_loc_by_count(const void *a, const void *b, const void *data)
+static int cmp_loc_by_count(const void *a, const void *b)
{
struct location *loc1 = (struct location *)a;
struct location *loc2 = (struct location *)b;
- if (loc1->count > loc2->count)
- return -1;
- else
- return 1;
+ return cmp_int(loc2->count, loc1->count);
}
static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos)
@@ -7780,8 +9966,8 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep)
}
/* Sort locations by count */
- sort_r(t->loc, t->count, sizeof(struct location),
- cmp_loc_by_count, NULL, NULL);
+ sort(t->loc, t->count, sizeof(struct location),
+ cmp_loc_by_count, NULL);
bitmap_free(obj_map);
return 0;
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index fd2ab5118e13..dbd8daccade2 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -27,9 +27,9 @@
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
+#include <linux/pgalloc.h>
#include <asm/dma.h>
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"
@@ -229,7 +229,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
if (!p)
return NULL;
pud_init(p);
- p4d_populate(&init_mm, p4d, p);
+ p4d_populate_kernel(addr, p4d, p);
}
return p4d;
}
@@ -241,7 +241,7 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
if (!p)
return NULL;
- pgd_populate(&init_mm, pgd, p);
+ pgd_populate_kernel(addr, pgd, p);
}
return pgd;
}
@@ -578,11 +578,6 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn,
if (r < 0)
return NULL;
- if (system_state == SYSTEM_BOOTING)
- memmap_boot_pages_add(DIV_ROUND_UP(end - start, PAGE_SIZE));
- else
- memmap_pages_add(DIV_ROUND_UP(end - start, PAGE_SIZE));
-
return pfn_to_page(pfn);
}
diff --git a/mm/sparse.c b/mm/sparse.c
index 3c012cf83cc2..17c50a6415c2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -43,11 +43,11 @@ static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#endif
-int page_to_nid(const struct page *page)
+int memdesc_nid(memdesc_flags_t mdf)
{
- return section_to_node_table[page_to_section(page)];
+ return section_to_node_table[memdesc_section(mdf)];
}
-EXPORT_SYMBOL(page_to_nid);
+EXPORT_SYMBOL(memdesc_nid);
static void set_section_nid(unsigned long section_nr, int nid)
{
@@ -454,9 +454,6 @@ static void __init sparse_buffer_init(unsigned long size, int nid)
*/
sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true);
sparsemap_buf_end = sparsemap_buf + size;
-#ifndef CONFIG_SPARSEMEM_VMEMMAP
- memmap_boot_pages_add(DIV_ROUND_UP(size, PAGE_SIZE));
-#endif
}
static void __init sparse_buffer_fini(void)
@@ -567,6 +564,8 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
sparse_buffer_fini();
goto failed;
}
+ memmap_boot_pages_add(DIV_ROUND_UP(PAGES_PER_SECTION * sizeof(struct page),
+ PAGE_SIZE));
sparse_init_early_section(nid, map, pnum, 0);
}
}
@@ -680,7 +679,6 @@ static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
unsigned long start = (unsigned long) pfn_to_page(pfn);
unsigned long end = start + nr_pages * sizeof(struct page);
- memmap_pages_add(-1L * (DIV_ROUND_UP(end - start, PAGE_SIZE)));
vmemmap_free(start, end, altmap);
}
static void free_map_bootmem(struct page *memmap)
@@ -856,10 +854,14 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
* The memmap of early sections is always fully populated. See
* section_activate() and pfn_valid() .
*/
- if (!section_is_early)
+ if (!section_is_early) {
+ memmap_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE)));
depopulate_section_memmap(pfn, nr_pages, altmap);
- else if (memmap)
+ } else if (memmap) {
+ memmap_boot_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page),
+ PAGE_SIZE)));
free_map_bootmem(memmap);
+ }
if (empty)
ms->section_mem_map = (unsigned long)NULL;
@@ -904,6 +906,7 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn,
section_deactivate(pfn, nr_pages, altmap);
return ERR_PTR(-ENOMEM);
}
+ memmap_pages_add(DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE));
return memmap;
}
diff --git a/mm/swap.c b/mm/swap.c
index 77b2d5997873..2260dcd2775e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -164,6 +164,10 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
for (i = 0; i < folio_batch_count(fbatch); i++) {
struct folio *folio = fbatch->folios[i];
+ /* block memcg migration while the folio moves between lru */
+ if (move_fn != lru_add && !folio_test_clear_lru(folio))
+ continue;
+
folio_lruvec_relock_irqsave(folio, &lruvec, &flags);
move_fn(lruvec, folio);
@@ -176,14 +180,10 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
}
static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch,
- struct folio *folio, move_fn_t move_fn,
- bool on_lru, bool disable_irq)
+ struct folio *folio, move_fn_t move_fn, bool disable_irq)
{
unsigned long flags;
- if (on_lru && !folio_test_clear_lru(folio))
- return;
-
folio_get(folio);
if (disable_irq)
@@ -191,8 +191,8 @@ static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch,
else
local_lock(&cpu_fbatches.lock);
- if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || folio_test_large(folio) ||
- lru_cache_disabled())
+ if (!folio_batch_add(this_cpu_ptr(fbatch), folio) ||
+ !folio_may_be_lru_cached(folio) || lru_cache_disabled())
folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn);
if (disable_irq)
@@ -201,13 +201,13 @@ static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch,
local_unlock(&cpu_fbatches.lock);
}
-#define folio_batch_add_and_move(folio, op, on_lru) \
- __folio_batch_add_and_move( \
- &cpu_fbatches.op, \
- folio, \
- op, \
- on_lru, \
- offsetof(struct cpu_fbatches, op) >= offsetof(struct cpu_fbatches, lock_irq) \
+#define folio_batch_add_and_move(folio, op) \
+ __folio_batch_add_and_move( \
+ &cpu_fbatches.op, \
+ folio, \
+ op, \
+ offsetof(struct cpu_fbatches, op) >= \
+ offsetof(struct cpu_fbatches, lock_irq) \
)
static void lru_move_tail(struct lruvec *lruvec, struct folio *folio)
@@ -231,14 +231,15 @@ static void lru_move_tail(struct lruvec *lruvec, struct folio *folio)
void folio_rotate_reclaimable(struct folio *folio)
{
if (folio_test_locked(folio) || folio_test_dirty(folio) ||
- folio_test_unevictable(folio))
+ folio_test_unevictable(folio) || !folio_test_lru(folio))
return;
- folio_batch_add_and_move(folio, lru_move_tail, true);
+ folio_batch_add_and_move(folio, lru_move_tail);
}
-void lru_note_cost(struct lruvec *lruvec, bool file,
- unsigned int nr_io, unsigned int nr_rotated)
+void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file,
+ unsigned int nr_io, unsigned int nr_rotated)
+ __releases(lruvec->lru_lock)
{
unsigned long cost;
@@ -250,18 +251,14 @@ void lru_note_cost(struct lruvec *lruvec, bool file,
* different between them, adjust scan balance for CPU work.
*/
cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated;
+ if (!cost) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ return;
+ }
- do {
+ for (;;) {
unsigned long lrusize;
- /*
- * Hold lruvec->lru_lock is safe here, since
- * 1) The pinned lruvec in reclaim, or
- * 2) From a pre-LRU page during refault (which also holds the
- * rcu lock, so would be safe even if the page was on the LRU
- * and could move simultaneously to a new lruvec).
- */
- spin_lock_irq(&lruvec->lru_lock);
/* Record cost event */
if (file)
lruvec->file_cost += cost;
@@ -285,14 +282,22 @@ void lru_note_cost(struct lruvec *lruvec, bool file,
lruvec->file_cost /= 2;
lruvec->anon_cost /= 2;
}
+
spin_unlock_irq(&lruvec->lru_lock);
- } while ((lruvec = parent_lruvec(lruvec)));
+ lruvec = parent_lruvec(lruvec);
+ if (!lruvec)
+ break;
+ spin_lock_irq(&lruvec->lru_lock);
+ }
}
void lru_note_cost_refault(struct folio *folio)
{
- lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio),
- folio_nr_pages(folio), 0);
+ struct lruvec *lruvec;
+
+ lruvec = folio_lruvec_lock_irq(folio);
+ lru_note_cost_unlock_irq(lruvec, folio_is_file_lru(folio),
+ folio_nr_pages(folio), 0);
}
static void lru_activate(struct lruvec *lruvec, struct folio *folio)
@@ -309,7 +314,7 @@ static void lru_activate(struct lruvec *lruvec, struct folio *folio)
trace_mm_lru_activate(folio);
__count_vm_events(PGACTIVATE, nr_pages);
- __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages);
+ count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages);
}
#ifdef CONFIG_SMP
@@ -323,10 +328,11 @@ static void folio_activate_drain(int cpu)
void folio_activate(struct folio *folio)
{
- if (folio_test_active(folio) || folio_test_unevictable(folio))
+ if (folio_test_active(folio) || folio_test_unevictable(folio) ||
+ !folio_test_lru(folio))
return;
- folio_batch_add_and_move(folio, lru_activate, true);
+ folio_batch_add_and_move(folio, lru_activate);
}
#else
@@ -382,14 +388,14 @@ static void __lru_cache_activate_folio(struct folio *folio)
static void lru_gen_inc_refs(struct folio *folio)
{
- unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f);
if (folio_test_unevictable(folio))
return;
/* see the comment on LRU_REFS_FLAGS */
if (!folio_test_referenced(folio)) {
- set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
+ set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced));
return;
}
@@ -401,7 +407,7 @@ static void lru_gen_inc_refs(struct folio *folio)
}
new_flags = old_flags + BIT(LRU_REFS_PGOFF);
- } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+ } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags));
}
static bool lru_gen_clear_refs(struct folio *folio)
@@ -413,7 +419,7 @@ static bool lru_gen_clear_refs(struct folio *folio)
if (gen < 0)
return true;
- set_mask_bits(&folio->flags, LRU_REFS_FLAGS | BIT(PG_workingset), 0);
+ set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS | BIT(PG_workingset), 0);
lrugen = &folio_lruvec(folio)->lrugen;
/* whether can do without shuffling under the LRU lock */
@@ -502,7 +508,7 @@ void folio_add_lru(struct folio *folio)
lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
folio_set_active(folio);
- folio_batch_add_and_move(folio, lru_add, false);
+ folio_batch_add_and_move(folio, lru_add);
}
EXPORT_SYMBOL(folio_add_lru);
@@ -581,7 +587,7 @@ static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio)
if (active) {
__count_vm_events(PGDEACTIVATE, nr_pages);
- __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
+ count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
nr_pages);
}
}
@@ -599,7 +605,7 @@ static void lru_deactivate(struct lruvec *lruvec, struct folio *folio)
lruvec_add_folio(lruvec, folio);
__count_vm_events(PGDEACTIVATE, nr_pages);
- __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages);
+ count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages);
}
static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio)
@@ -625,7 +631,7 @@ static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio)
lruvec_add_folio(lruvec, folio);
__count_vm_events(PGLAZYFREE, nr_pages);
- __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages);
+ count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages);
}
/*
@@ -680,13 +686,13 @@ void lru_add_drain_cpu(int cpu)
void deactivate_file_folio(struct folio *folio)
{
/* Deactivating an unevictable folio will not accelerate reclaim */
- if (folio_test_unevictable(folio))
+ if (folio_test_unevictable(folio) || !folio_test_lru(folio))
return;
if (lru_gen_enabled() && lru_gen_clear_refs(folio))
return;
- folio_batch_add_and_move(folio, lru_deactivate_file, true);
+ folio_batch_add_and_move(folio, lru_deactivate_file);
}
/*
@@ -699,13 +705,13 @@ void deactivate_file_folio(struct folio *folio)
*/
void folio_deactivate(struct folio *folio)
{
- if (folio_test_unevictable(folio))
+ if (folio_test_unevictable(folio) || !folio_test_lru(folio))
return;
if (lru_gen_enabled() ? lru_gen_clear_refs(folio) : !folio_test_active(folio))
return;
- folio_batch_add_and_move(folio, lru_deactivate, true);
+ folio_batch_add_and_move(folio, lru_deactivate);
}
/**
@@ -718,10 +724,11 @@ void folio_deactivate(struct folio *folio)
void folio_mark_lazyfree(struct folio *folio)
{
if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) ||
+ !folio_test_lru(folio) ||
folio_test_swapcache(folio) || folio_test_unevictable(folio))
return;
- folio_batch_add_and_move(folio, lru_lazyfree, true);
+ folio_batch_add_and_move(folio, lru_lazyfree);
}
void lru_add_drain(void)
@@ -827,6 +834,9 @@ static inline void __lru_add_drain_all(bool force_all_cpus)
*/
this_gen = smp_load_acquire(&lru_drain_gen);
+ /* It helps everyone if we do our own local drain immediately. */
+ lru_add_drain();
+
mutex_lock(&lock);
/*
@@ -1091,7 +1101,7 @@ static const struct ctl_table swap_sysctl_table[] = {
*/
void __init swap_setup(void)
{
- unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT);
+ unsigned long megs = PAGES_TO_MB(totalram_pages());
/* Use a smaller cluster for small-memory machines */
if (megs < 16)
diff --git a/mm/swap.h b/mm/swap.h
index 6f4a3f927edb..8d8efdf1297a 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -2,13 +2,187 @@
#ifndef _MM_SWAP_H
#define _MM_SWAP_H
+#include <linux/atomic.h> /* for atomic_long_t */
struct mempolicy;
+struct swap_iocb;
+
extern int page_cluster;
+#ifdef CONFIG_THP_SWAP
+#define SWAPFILE_CLUSTER HPAGE_PMD_NR
+#define swap_entry_order(order) (order)
+#else
+#define SWAPFILE_CLUSTER 256
+#define swap_entry_order(order) 0
+#endif
+
+extern struct swap_info_struct *swap_info[];
+
+/*
+ * We use this to track usage of a cluster. A cluster is a block of swap disk
+ * space, SWAPFILE_CLUSTER pages long and naturally aligned on disk. All free
+ * clusters are organized into a list; we fetch an entry from the list to get
+ * a free cluster.
+ *
+ * The flags field determines if a cluster is free. This is protected by the
+ * cluster lock.
+ */
+struct swap_cluster_info {
+ spinlock_t lock; /*
+ * Protect swap_cluster_info fields
+ * other than list, and swap_info_struct->swap_map
+ * elements corresponding to the swap cluster.
+ */
+ u16 count;
+ u8 flags;
+ u8 order;
+ atomic_long_t __rcu *table; /* Swap table entries, see mm/swap_table.h */
+ struct list_head list;
+};
+
+/* All on-list clusters must have a non-zero flag. */
+enum swap_cluster_flags {
+ CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */
+ CLUSTER_FLAG_FREE,
+ CLUSTER_FLAG_NONFULL,
+ CLUSTER_FLAG_FRAG,
+ /* Clusters with flags above are allocatable */
+ CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG,
+ CLUSTER_FLAG_FULL,
+ CLUSTER_FLAG_DISCARD,
+ CLUSTER_FLAG_MAX,
+};
+
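To make the mapping concrete: with the non-THP default of SWAPFILE_CLUSTER = 256, a swap offset splits into a cluster index (which si->cluster_info[] entry) and an in-cluster slot (what swp_cluster_offset() below returns). Worked numbers, illustration only:

	/* e.g. swp_offset(entry) == 1000, SWAPFILE_CLUSTER == 256 */
	unsigned long cluster = 1000 / 256;	/* 3: index into si->cluster_info[] */
	unsigned int slot = 1000 % 256;		/* 232: value of swp_cluster_offset() */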
#ifdef CONFIG_SWAP
#include <linux/swapops.h> /* for swp_offset */
#include <linux/blk_types.h> /* for bio_end_io_t */
+static inline unsigned int swp_cluster_offset(swp_entry_t entry)
+{
+ return swp_offset(entry) % SWAPFILE_CLUSTER;
+}
+
+/*
+ * Callers of all helpers below must ensure the entry, type, or offset is
+ * valid, and protect the swap device with reference count or locks.
+ */
+static inline struct swap_info_struct *__swap_type_to_info(int type)
+{
+ struct swap_info_struct *si;
+
+ si = READ_ONCE(swap_info[type]); /* rcu_dereference() */
+ VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */
+ return si;
+}
+
+static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry)
+{
+ return __swap_type_to_info(swp_type(entry));
+}
+
+static inline struct swap_cluster_info *__swap_offset_to_cluster(
+ struct swap_info_struct *si, pgoff_t offset)
+{
+ VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */
+ VM_WARN_ON_ONCE(offset >= si->max);
+ return &si->cluster_info[offset / SWAPFILE_CLUSTER];
+}
+
+static inline struct swap_cluster_info *__swap_entry_to_cluster(swp_entry_t entry)
+{
+ return __swap_offset_to_cluster(__swap_entry_to_info(entry),
+ swp_offset(entry));
+}
+
+static __always_inline struct swap_cluster_info *__swap_cluster_lock(
+ struct swap_info_struct *si, unsigned long offset, bool irq)
+{
+ struct swap_cluster_info *ci = __swap_offset_to_cluster(si, offset);
+
+ /*
+ * Nothing modifies swap cache in an IRQ context. All access to
+ * swap cache is wrapped by swap_cache_* helpers, and swap cache
+ * writeback is handled outside of IRQs. Swapin or swapout never
+ * occurs in IRQ, and neither does in-place split or replace.
+ *
+ * Besides, modifying swap cache requires synchronization with
+ * swap_map, which was never IRQ safe.
+ */
+ VM_WARN_ON_ONCE(!in_task());
+ VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */
+ if (irq)
+ spin_lock_irq(&ci->lock);
+ else
+ spin_lock(&ci->lock);
+ return ci;
+}
+
+/**
+ * swap_cluster_lock - Lock and return the swap cluster of given offset.
+ * @si: swap device the cluster belongs to.
+ * @offset: the swap entry offset, pointing to a valid slot.
+ *
+ * Context: The caller must ensure the offset is in the valid range and
+ * protect the swap device with reference count or locks.
+ */
+static inline struct swap_cluster_info *swap_cluster_lock(
+ struct swap_info_struct *si, unsigned long offset)
+{
+ return __swap_cluster_lock(si, offset, false);
+}
+
+static inline struct swap_cluster_info *__swap_cluster_get_and_lock(
+ const struct folio *folio, bool irq)
+{
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
+ return __swap_cluster_lock(__swap_entry_to_info(folio->swap),
+ swp_offset(folio->swap), irq);
+}
+
+/**
+ * swap_cluster_get_and_lock - Locks the cluster that holds a folio's entries.
+ * @folio: The folio.
+ *
+ * This locks and returns the swap cluster that contains a folio's swap
+ * entries. The swap entries of a folio are always in one single cluster.
+ * The folio has to be locked so its swap entries won't change and the
+ * cluster won't be freed.
+ *
+ * Context: Caller must ensure the folio is locked and in the swap cache.
+ * Return: Pointer to the swap cluster.
+ */
+static inline struct swap_cluster_info *swap_cluster_get_and_lock(
+ const struct folio *folio)
+{
+ return __swap_cluster_get_and_lock(folio, false);
+}
+
+/**
+ * swap_cluster_get_and_lock_irq - Locks the cluster that holds a folio's entries.
+ * @folio: The folio.
+ *
+ * Same as swap_cluster_get_and_lock but also disable IRQ.
+ *
+ * Context: Caller must ensure the folio is locked and in the swap cache.
+ * Return: Pointer to the swap cluster.
+ */
+static inline struct swap_cluster_info *swap_cluster_get_and_lock_irq(
+ const struct folio *folio)
+{
+ return __swap_cluster_get_and_lock(folio, true);
+}
+
+static inline void swap_cluster_unlock(struct swap_cluster_info *ci)
+{
+ spin_unlock(&ci->lock);
+}
+
+static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci)
+{
+ spin_unlock_irq(&ci->lock);
+}
+
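In the converted swap cache code further on, these helpers take the place of the old xa_lock_irq() on the swapper address space. A typical pattern, sketched under the documented preconditions (folio locked and in the swap cache):

	struct swap_cluster_info *ci;

	ci = swap_cluster_get_and_lock(folio);
	/* ... update the swap table slots covered by this folio ... */
	swap_cluster_unlock(ci);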
/* linux/mm/page_io.c */
int sio_pool_init(void);
struct swap_iocb;
@@ -20,18 +194,15 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
__swap_read_unplug(plug);
}
void swap_write_unplug(struct swap_iocb *sio);
-int swap_writepage(struct page *page, struct writeback_control *wbc);
-void __swap_writepage(struct folio *folio, struct writeback_control *wbc);
+int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug);
+void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
/* linux/mm/swap_state.c */
-/* One swap address space for each 64M swap space */
-#define SWAP_ADDRESS_SPACE_SHIFT 14
-#define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT)
-#define SWAP_ADDRESS_SPACE_MASK (SWAP_ADDRESS_SPACE_PAGES - 1)
-extern struct address_space *swapper_spaces[];
-#define swap_address_space(entry) \
- (&swapper_spaces[swp_type(entry)][swp_offset(entry) \
- >> SWAP_ADDRESS_SPACE_SHIFT])
+extern struct address_space swap_space __ro_after_init;
+static inline struct address_space *swap_address_space(swp_entry_t entry)
+{
+ return &swap_space;
+}
/*
* Return the swap device position of the swap entry.
@@ -41,30 +212,52 @@ static inline loff_t swap_dev_pos(swp_entry_t entry)
return ((loff_t)swp_offset(entry)) << PAGE_SHIFT;
}
-/*
- * Return the swap cache index of the swap entry.
+/**
+ * folio_matches_swap_entry - Check if a folio matches a given swap entry.
+ * @folio: The folio.
+ * @entry: The swap entry to check against.
+ *
+ * Context: The caller should have the folio locked to ensure it's stable
+ * and nothing will move it in or out of the swap cache.
+ * Return: true or false.
*/
-static inline pgoff_t swap_cache_index(swp_entry_t entry)
+static inline bool folio_matches_swap_entry(const struct folio *folio,
+ swp_entry_t entry)
{
- BUILD_BUG_ON((SWP_OFFSET_MASK | SWAP_ADDRESS_SPACE_MASK) != SWP_OFFSET_MASK);
- return swp_offset(entry) & SWAP_ADDRESS_SPACE_MASK;
+ swp_entry_t folio_entry = folio->swap;
+ long nr_pages = folio_nr_pages(folio);
+
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+ if (!folio_test_swapcache(folio))
+ return false;
+ VM_WARN_ON_ONCE_FOLIO(!IS_ALIGNED(folio_entry.val, nr_pages), folio);
+ return folio_entry.val == round_down(entry.val, nr_pages);
}
+/*
+ * All swap cache helpers below require the caller to ensure the swap entries
+ * used are valid and stabilize the device in one of the following ways:
+ * - Hold a reference by get_swap_device(): this ensures a single entry is
+ * valid and increases the swap device's refcount.
+ * - Locking a folio in the swap cache: this ensures the folio's swap entries
+ * are valid and pinned, also implies reference to the device.
+ * - Locking anything referencing the swap entry: e.g. PTL that protects
+ * swap entries in the page table, similar to locking swap cache folio.
+ * - See the comment of get_swap_device() for more complex usage.
+ */
+struct folio *swap_cache_get_folio(swp_entry_t entry);
+void *swap_cache_get_shadow(swp_entry_t entry);
+void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow);
+void swap_cache_del_folio(struct folio *folio);
+/* Below helpers require the caller to lock and pass in the swap cluster. */
+void __swap_cache_del_folio(struct swap_cluster_info *ci,
+ struct folio *folio, swp_entry_t entry, void *shadow);
+void __swap_cache_replace_folio(struct swap_cluster_info *ci,
+ struct folio *old, struct folio *new);
+void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents);
+
void show_swap_cache_info(void);
-void *get_shadow_from_swap_cache(swp_entry_t entry);
-int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
- gfp_t gfp, void **shadowp);
-void __delete_from_swap_cache(struct folio *folio,
- swp_entry_t entry, void *shadow);
-void delete_from_swap_cache(struct folio *folio);
-void clear_shadow_from_swap_cache(int type, unsigned long begin,
- unsigned long end);
void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr);
-struct folio *swap_cache_get_folio(swp_entry_t entry,
- struct vm_area_struct *vma, unsigned long addr);
-struct folio *filemap_get_incore_folio(struct address_space *mapping,
- pgoff_t index);
-
struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr,
struct swap_iocb **plug);
@@ -75,10 +268,12 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
struct mempolicy *mpol, pgoff_t ilx);
struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag,
struct vm_fault *vmf);
+void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
+ unsigned long addr);
static inline unsigned int folio_swap_flags(struct folio *folio)
{
- return swp_swap_info(folio->swap)->flags;
+ return __swap_entry_to_info(folio->swap)->flags;
}
/*
@@ -89,7 +284,7 @@ static inline unsigned int folio_swap_flags(struct folio *folio)
static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
bool *is_zeromap)
{
- struct swap_info_struct *sis = swp_swap_info(entry);
+ struct swap_info_struct *sis = __swap_entry_to_info(entry);
unsigned long start = swp_offset(entry);
unsigned long end = start + max_nr;
bool first_bit;
@@ -106,8 +301,58 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
return find_next_bit(sis->zeromap, end, start) - start;
}
+static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
+{
+ struct swap_info_struct *si = __swap_entry_to_info(entry);
+ pgoff_t offset = swp_offset(entry);
+ int i;
+
+ /*
+ * While allocating a large folio and doing mTHP swapin, we need to
+ * ensure all entries are not cached, otherwise, the mTHP folio will
+ * be in conflict with the folio in swap cache.
+ */
+ for (i = 0; i < max_nr; i++) {
+ if ((si->swap_map[offset + i] & SWAP_HAS_CACHE))
+ return i;
+ }
+
+ return i;
+}
+
#else /* CONFIG_SWAP */
struct swap_iocb;
+static inline struct swap_cluster_info *swap_cluster_lock(
+ struct swap_info_struct *si, pgoff_t offset, bool irq)
+{
+ return NULL;
+}
+
+static inline struct swap_cluster_info *swap_cluster_get_and_lock(
+ struct folio *folio)
+{
+ return NULL;
+}
+
+static inline struct swap_cluster_info *swap_cluster_get_and_lock_irq(
+ struct folio *folio)
+{
+ return NULL;
+}
+
+static inline void swap_cluster_unlock(struct swap_cluster_info *ci)
+{
+}
+
+static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci)
+{
+}
+
+static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry)
+{
+ return NULL;
+}
+
static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
{
}
@@ -120,9 +365,9 @@ static inline struct address_space *swap_address_space(swp_entry_t entry)
return NULL;
}
-static inline pgoff_t swap_cache_index(swp_entry_t entry)
+static inline bool folio_matches_swap_entry(const struct folio *folio, swp_entry_t entry)
{
- return 0;
+ return false;
}
static inline void show_swap_cache_info(void)
@@ -141,50 +386,46 @@ static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
return NULL;
}
-static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
+static inline void swap_update_readahead(struct folio *folio,
+ struct vm_area_struct *vma, unsigned long addr)
{
- return 0;
}
-static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr)
+static inline int swap_writeout(struct folio *folio,
+ struct swap_iocb **swap_plug)
{
+ return 0;
}
-static inline struct folio *swap_cache_get_folio(swp_entry_t entry,
- struct vm_area_struct *vma, unsigned long addr)
+static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr)
{
- return NULL;
}
-static inline
-struct folio *filemap_get_incore_folio(struct address_space *mapping,
- pgoff_t index)
+static inline struct folio *swap_cache_get_folio(swp_entry_t entry)
{
- return filemap_get_folio(mapping, index);
+ return NULL;
}
-static inline void *get_shadow_from_swap_cache(swp_entry_t entry)
+static inline void *swap_cache_get_shadow(swp_entry_t entry)
{
return NULL;
}
-static inline int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
- gfp_t gfp_mask, void **shadowp)
+static inline void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow)
{
- return -1;
}
-static inline void __delete_from_swap_cache(struct folio *folio,
- swp_entry_t entry, void *shadow)
+static inline void swap_cache_del_folio(struct folio *folio)
{
}
-static inline void delete_from_swap_cache(struct folio *folio)
+static inline void __swap_cache_del_folio(struct swap_cluster_info *ci,
+ struct folio *folio, swp_entry_t entry, void *shadow)
{
}
-static inline void clear_shadow_from_swap_cache(int type, unsigned long begin,
- unsigned long end)
+static inline void __swap_cache_replace_folio(struct swap_cluster_info *ci,
+ struct folio *old, struct folio *new)
{
}
@@ -199,6 +440,30 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
return 0;
}
+static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
+{
+ return 0;
+}
#endif /* CONFIG_SWAP */
+/**
+ * folio_index - File index of a folio.
+ * @folio: The folio.
+ *
+ * For a folio which is either in the page cache or the swap cache,
+ * return its index within the address_space it belongs to. If you know
+ * the folio is definitely in the page cache, you can look at the folio's
+ * index directly.
+ *
+ * Return: The index (offset in units of pages) of a folio in its file.
+ */
+static inline pgoff_t folio_index(struct folio *folio)
+{
+#ifdef CONFIG_SWAP
+ if (unlikely(folio_test_swapcache(folio)))
+ return swp_offset(folio->swap);
+#endif
+ return folio->index;
+}
+
#endif /* _MM_SWAP_H */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 68fd981b514f..b13e9c4baa90 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -23,6 +23,7 @@
#include <linux/huge_mm.h>
#include <linux/shmem_fs.h>
#include "internal.h"
+#include "swap_table.h"
#include "swap.h"
/*
@@ -30,15 +31,17 @@
* vmscan's shrink_folio_list.
*/
static const struct address_space_operations swap_aops = {
- .writepage = swap_writepage,
.dirty_folio = noop_dirty_folio,
#ifdef CONFIG_MIGRATION
.migrate_folio = migrate_folio,
#endif
};
-struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
-static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
+/* Set swap_space as read only as swap cache is handled by swap table */
+struct address_space swap_space __ro_after_init = {
+ .a_ops = &swap_aops,
+};
+
static bool enable_vma_readahead __read_mostly = true;
#define SWAP_RA_ORDER_CEILING 5
@@ -70,150 +73,237 @@ void show_swap_cache_info(void)
printk("Total swap = %lukB\n", K(total_swap_pages));
}
-void *get_shadow_from_swap_cache(swp_entry_t entry)
+/**
+ * swap_cache_get_folio - Looks up a folio in the swap cache.
+ * @entry: swap entry used for the lookup.
+ *
+ * A found folio will be returned unlocked and with its refcount increased.
+ *
+ * Context: Caller must ensure @entry is valid and protect the swap device
+ * with reference count or locks.
+ * Return: Returns the found folio on success, NULL otherwise. The caller
+ * must lock nd check if the folio still matches the swap entry before
+ * use (e.g., folio_matches_swap_entry).
+ */
+struct folio *swap_cache_get_folio(swp_entry_t entry)
{
- struct address_space *address_space = swap_address_space(entry);
- pgoff_t idx = swap_cache_index(entry);
- void *shadow;
+ unsigned long swp_tb;
+ struct folio *folio;
+
+ for (;;) {
+ swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
+ swp_cluster_offset(entry));
+ if (!swp_tb_is_folio(swp_tb))
+ return NULL;
+ folio = swp_tb_to_folio(swp_tb);
+ if (likely(folio_try_get(folio)))
+ return folio;
+ }
- shadow = xa_load(&address_space->i_pages, idx);
- if (xa_is_value(shadow))
- return shadow;
return NULL;
}
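Because the lookup returns the folio unlocked, callers are expected to lock it and re-check the binding with folio_matches_swap_entry() before relying on it. A minimal sketch of that pattern (illustration only):

	struct folio *folio = swap_cache_get_folio(entry);

	if (folio) {
		folio_lock(folio);
		if (!folio_matches_swap_entry(folio, entry)) {
			/* raced with removal or replacement, drop and retry */
			folio_unlock(folio);
			folio_put(folio);
			folio = NULL;
		}
	}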
-/*
- * add_to_swap_cache resembles filemap_add_folio on swapper_space,
- * but sets SwapCache flag and 'swap' instead of mapping and index.
+/**
+ * swap_cache_get_shadow - Looks up a shadow in the swap cache.
+ * @entry: swap entry used for the lookup.
+ *
+ * Context: Caller must ensure @entry is valid and protect the swap device
+ * with reference count or locks.
+ * Return: Returns either NULL or an XA_VALUE (shadow).
*/
-int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
- gfp_t gfp, void **shadowp)
+void *swap_cache_get_shadow(swp_entry_t entry)
{
- struct address_space *address_space = swap_address_space(entry);
- pgoff_t idx = swap_cache_index(entry);
- XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio));
- unsigned long i, nr = folio_nr_pages(folio);
- void *old;
+ unsigned long swp_tb;
- xas_set_update(&xas, workingset_update_node);
+ swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
+ swp_cluster_offset(entry));
+ if (swp_tb_is_shadow(swp_tb))
+ return swp_tb_to_shadow(swp_tb);
+ return NULL;
+}
- VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
- VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
+/**
+ * swap_cache_add_folio - Add a folio into the swap cache.
+ * @folio: The folio to be added.
+ * @entry: The swap entry corresponding to the folio.
+ * @shadowp: If a shadow is found, return the shadow.
+ *
+ * Context: Caller must ensure @entry is valid and protect the swap device
+ * with reference count or locks.
+ * The caller also needs to update the corresponding swap_map slots with
+ * SWAP_HAS_CACHE bit to avoid race or conflict.
+ */
+void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadowp)
+{
+ void *shadow = NULL;
+ unsigned long old_tb, new_tb;
+ struct swap_cluster_info *ci;
+ unsigned int ci_start, ci_off, ci_end;
+ unsigned long nr_pages = folio_nr_pages(folio);
+
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio);
+
+ new_tb = folio_to_swp_tb(folio);
+ ci_start = swp_cluster_offset(entry);
+ ci_end = ci_start + nr_pages;
+ ci_off = ci_start;
+ ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry));
+ do {
+ old_tb = __swap_table_xchg(ci, ci_off, new_tb);
+ WARN_ON_ONCE(swp_tb_is_folio(old_tb));
+ if (swp_tb_is_shadow(old_tb))
+ shadow = swp_tb_to_shadow(old_tb);
+ } while (++ci_off < ci_end);
- folio_ref_add(folio, nr);
+ folio_ref_add(folio, nr_pages);
folio_set_swapcache(folio);
folio->swap = entry;
+ swap_cluster_unlock(ci);
- do {
- xas_lock_irq(&xas);
- xas_create_range(&xas);
- if (xas_error(&xas))
- goto unlock;
- for (i = 0; i < nr; i++) {
- VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio);
- if (shadowp) {
- old = xas_load(&xas);
- if (xa_is_value(old))
- *shadowp = old;
- }
- xas_store(&xas, folio);
- xas_next(&xas);
- }
- address_space->nrpages += nr;
- __node_stat_mod_folio(folio, NR_FILE_PAGES, nr);
- __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr);
-unlock:
- xas_unlock_irq(&xas);
- } while (xas_nomem(&xas, gfp));
-
- if (!xas_error(&xas))
- return 0;
+ node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
+ lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages);
- folio_clear_swapcache(folio);
- folio_ref_sub(folio, nr);
- return xas_error(&xas);
+ if (shadowp)
+ *shadowp = shadow;
}
-/*
- * This must be called only on folios that have
- * been verified to be in the swap cache.
+/**
+ * __swap_cache_del_folio - Removes a folio from the swap cache.
+ * @ci: The locked swap cluster.
+ * @folio: The folio.
+ * @entry: The first swap entry that the folio corresponds to.
+ * @shadow: shadow value to be filled in the swap cache.
+ *
+ * Removes a folio from the swap cache and fills a shadow in place.
+ * This won't put the folio's refcount. The caller has to do that.
+ *
+ * Context: Caller must ensure the folio is locked and in the swap cache
+ * using the index of @entry, and lock the cluster that holds the entries.
*/
-void __delete_from_swap_cache(struct folio *folio,
- swp_entry_t entry, void *shadow)
+void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio,
+ swp_entry_t entry, void *shadow)
{
- struct address_space *address_space = swap_address_space(entry);
- int i;
- long nr = folio_nr_pages(folio);
- pgoff_t idx = swap_cache_index(entry);
- XA_STATE(xas, &address_space->i_pages, idx);
-
- xas_set_update(&xas, workingset_update_node);
-
- VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
- VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
-
- for (i = 0; i < nr; i++) {
- void *entry = xas_store(&xas, shadow);
- VM_BUG_ON_PAGE(entry != folio, entry);
- xas_next(&xas);
- }
+ unsigned long old_tb, new_tb;
+ unsigned int ci_start, ci_off, ci_end;
+ unsigned long nr_pages = folio_nr_pages(folio);
+
+ VM_WARN_ON_ONCE(__swap_entry_to_cluster(entry) != ci);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio);
+
+ new_tb = shadow_swp_to_tb(shadow);
+ ci_start = swp_cluster_offset(entry);
+ ci_end = ci_start + nr_pages;
+ ci_off = ci_start;
+ do {
+ /* If shadow is NULL, we set an empty shadow */
+ old_tb = __swap_table_xchg(ci, ci_off, new_tb);
+ WARN_ON_ONCE(!swp_tb_is_folio(old_tb) ||
+ swp_tb_to_folio(old_tb) != folio);
+ } while (++ci_off < ci_end);
+
folio->swap.val = 0;
folio_clear_swapcache(folio);
- address_space->nrpages -= nr;
- __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
- __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr);
+ node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages);
+ lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages);
}
-/*
- * This must be called only on folios that have
- * been verified to be in the swap cache and locked.
- * It will never put the folio into the free list,
- * the caller has a reference on the folio.
+/**
+ * swap_cache_del_folio - Removes a folio from the swap cache.
+ * @folio: The folio.
+ *
+ * Same as __swap_cache_del_folio, but handles lock and refcount. The
+ * caller must ensure the folio is either clean or has a swap count
+ * equal to zero, or it may cause data loss.
+ *
+ * Context: Caller must ensure the folio is locked and in the swap cache.
*/
-void delete_from_swap_cache(struct folio *folio)
+void swap_cache_del_folio(struct folio *folio)
{
+ struct swap_cluster_info *ci;
swp_entry_t entry = folio->swap;
- struct address_space *address_space = swap_address_space(entry);
- xa_lock_irq(&address_space->i_pages);
- __delete_from_swap_cache(folio, entry, NULL);
- xa_unlock_irq(&address_space->i_pages);
+ ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry));
+ __swap_cache_del_folio(ci, folio, entry, NULL);
+ swap_cluster_unlock(ci);
put_swap_folio(folio, entry);
folio_ref_sub(folio, folio_nr_pages(folio));
}
-void clear_shadow_from_swap_cache(int type, unsigned long begin,
- unsigned long end)
+/**
+ * __swap_cache_replace_folio - Replace a folio in the swap cache.
+ * @ci: The locked swap cluster.
+ * @old: The old folio to be replaced.
+ * @new: The new folio.
+ *
+ * Replace an existing folio in the swap cache with a new folio. The
+ * caller is responsible for setting up the new folio's flag and swap
+ * entries. Replacement will take the new folio's swap entry value as
+ * the starting offset to override all slots covered by the new folio.
+ *
+ * Context: Caller must ensure both folios are locked, and lock the
+ * cluster that holds the old folio to be replaced.
+ */
+void __swap_cache_replace_folio(struct swap_cluster_info *ci,
+ struct folio *old, struct folio *new)
{
- unsigned long curr = begin;
- void *old;
-
- for (;;) {
- swp_entry_t entry = swp_entry(type, curr);
- unsigned long index = curr & SWAP_ADDRESS_SPACE_MASK;
- struct address_space *address_space = swap_address_space(entry);
- XA_STATE(xas, &address_space->i_pages, index);
-
- xas_set_update(&xas, workingset_update_node);
-
- xa_lock_irq(&address_space->i_pages);
- xas_for_each(&xas, old, min(index + (end - curr), SWAP_ADDRESS_SPACE_PAGES)) {
- if (!xa_is_value(old))
- continue;
- xas_store(&xas, NULL);
- }
- xa_unlock_irq(&address_space->i_pages);
+ swp_entry_t entry = new->swap;
+ unsigned long nr_pages = folio_nr_pages(new);
+ unsigned int ci_off = swp_cluster_offset(entry);
+ unsigned int ci_end = ci_off + nr_pages;
+ unsigned long old_tb, new_tb;
+
+ VM_WARN_ON_ONCE(!folio_test_swapcache(old) || !folio_test_swapcache(new));
+ VM_WARN_ON_ONCE(!folio_test_locked(old) || !folio_test_locked(new));
+ VM_WARN_ON_ONCE(!entry.val);
+
+ /* Swap cache still stores N entries instead of a high-order entry */
+ new_tb = folio_to_swp_tb(new);
+ do {
+ old_tb = __swap_table_xchg(ci, ci_off, new_tb);
+ WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old);
+ } while (++ci_off < ci_end);
- /* search the next swapcache until we meet end */
- curr = ALIGN((curr + 1), SWAP_ADDRESS_SPACE_PAGES);
- if (curr > end)
- break;
+ /*
+ * If the old folio is only partially replaced (e.g. when splitting a
+ * large folio, the old folio is shrunk and new split sub-folios replace
+ * the shrunk part), ensure the new folio doesn't overlap what remains.
+ */
+ if (IS_ENABLED(CONFIG_DEBUG_VM) &&
+ folio_order(old) != folio_order(new)) {
+ ci_off = swp_cluster_offset(old->swap);
+ ci_end = ci_off + folio_nr_pages(old);
+ while (ci_off++ < ci_end)
+ WARN_ON_ONCE(swp_tb_to_folio(__swap_table_get(ci, ci_off)) != old);
}
}
+/**
+ * __swap_cache_clear_shadow - Clears a set of shadows in the swap cache.
+ * @entry: The starting index entry.
+ * @nr_ents: How many slots need to be cleared.
+ *
+ * Context: Caller must ensure the range is valid, all within a single cluster,
+ * not occupied by any folio, and lock the cluster.
+ */
+void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents)
+{
+ struct swap_cluster_info *ci = __swap_entry_to_cluster(entry);
+ unsigned int ci_off = swp_cluster_offset(entry), ci_end;
+ unsigned long old;
+
+ ci_end = ci_off + nr_ents;
+ do {
+ old = __swap_table_xchg(ci, ci_off, null_to_swp_tb());
+ WARN_ON_ONCE(swp_tb_is_folio(old));
+ } while (++ci_off < ci_end);
+}
+
/*
* If we are the only user, then try to free up the swap cache.
*
@@ -232,13 +322,11 @@ void free_swap_cache(struct folio *folio)
}
/*
- * Perform a free_page(), also freeing any swap cache associated with
- * this page if it is the last user of the page.
+ * Free a folio, also freeing any swap cache associated with
+ * this folio if it is the last user.
*/
-void free_page_and_swap_cache(struct page *page)
+void free_folio_and_swap_cache(struct folio *folio)
{
- struct folio *folio = page_folio(page);
-
free_swap_cache(folio);
if (!is_huge_zero_folio(folio))
folio_put(folio);
@@ -275,100 +363,50 @@ static inline bool swap_use_vma_readahead(void)
return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
}
-/*
- * Lookup a swap entry in the swap cache. A found folio will be returned
- * unlocked and with its refcount incremented - we rely on the kernel
- * lock getting page table operations atomic even if we drop the folio
- * lock before returning.
- *
- * Caller must lock the swap device or hold a reference to keep it valid.
+/**
+ * swap_update_readahead - Update the readahead statistics of VMA or globally.
+ * @folio: the swap cache folio that just got hit.
+ * @vma: the VMA that should be updated, could be NULL for global update.
+ * @addr: the addr that triggered the swapin, ignored if @vma is NULL.
*/
-struct folio *swap_cache_get_folio(swp_entry_t entry,
- struct vm_area_struct *vma, unsigned long addr)
+void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
+ unsigned long addr)
{
- struct folio *folio;
-
- folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
- if (!IS_ERR(folio)) {
- bool vma_ra = swap_use_vma_readahead();
- bool readahead;
+ bool readahead, vma_ra = swap_use_vma_readahead();
- /*
- * At the moment, we don't support PG_readahead for anon THP
- * so let's bail out rather than confusing the readahead stat.
- */
- if (unlikely(folio_test_large(folio)))
- return folio;
-
- readahead = folio_test_clear_readahead(folio);
- if (vma && vma_ra) {
- unsigned long ra_val;
- int win, hits;
-
- ra_val = GET_SWAP_RA_VAL(vma);
- win = SWAP_RA_WIN(ra_val);
- hits = SWAP_RA_HITS(ra_val);
- if (readahead)
- hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
- atomic_long_set(&vma->swap_readahead_info,
- SWAP_RA_VAL(addr, win, hits));
- }
-
- if (readahead) {
- count_vm_event(SWAP_RA_HIT);
- if (!vma || !vma_ra)
- atomic_inc(&swapin_readahead_hits);
- }
- } else {
- folio = NULL;
+ /*
+ * At the moment, we don't support PG_readahead for anon THP
+ * so let's bail out rather than confusing the readahead stat.
+ */
+ if (unlikely(folio_test_large(folio)))
+ return;
+
+ readahead = folio_test_clear_readahead(folio);
+ if (vma && vma_ra) {
+ unsigned long ra_val;
+ int win, hits;
+
+ ra_val = GET_SWAP_RA_VAL(vma);
+ win = SWAP_RA_WIN(ra_val);
+ hits = SWAP_RA_HITS(ra_val);
+ if (readahead)
+ hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
+ atomic_long_set(&vma->swap_readahead_info,
+ SWAP_RA_VAL(addr, win, hits));
}
- return folio;
-}
-
-/**
- * filemap_get_incore_folio - Find and get a folio from the page or swap caches.
- * @mapping: The address_space to search.
- * @index: The page cache index.
- *
- * This differs from filemap_get_folio() in that it will also look for the
- * folio in the swap cache.
- *
- * Return: The found folio or %NULL.
- */
-struct folio *filemap_get_incore_folio(struct address_space *mapping,
- pgoff_t index)
-{
- swp_entry_t swp;
- struct swap_info_struct *si;
- struct folio *folio = filemap_get_entry(mapping, index);
-
- if (!folio)
- return ERR_PTR(-ENOENT);
- if (!xa_is_value(folio))
- return folio;
- if (!shmem_mapping(mapping))
- return ERR_PTR(-ENOENT);
-
- swp = radix_to_swp_entry(folio);
- /* There might be swapin error entries in shmem mapping. */
- if (non_swap_entry(swp))
- return ERR_PTR(-ENOENT);
- /* Prevent swapoff from happening to us */
- si = get_swap_device(swp);
- if (!si)
- return ERR_PTR(-ENOENT);
- index = swap_cache_index(swp);
- folio = filemap_get_folio(swap_address_space(swp), index);
- put_swap_device(si);
- return folio;
+ if (readahead) {
+ count_vm_event(SWAP_RA_HIT);
+ if (!vma || !vma_ra)
+ atomic_inc(&swapin_readahead_hits);
+ }
}
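
/*
 * Illustrative sketch, not part of this patch: with lookup and readahead
 * accounting now split, a swapin path is assumed to pair
 * swap_cache_get_folio() with swap_update_readahead() roughly as below.
 * The "example_" name is hypothetical and not in the source.
 */
static struct folio *example_swapin_lookup(swp_entry_t entry,
					   struct vm_area_struct *vma,
					   unsigned long addr)
{
	struct folio *folio = swap_cache_get_folio(entry);

	/* Only account a readahead hit when something was actually cached. */
	if (folio)
		swap_update_readahead(folio, vma, addr);
	return folio;
}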
struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated,
bool skip_if_exists)
{
- struct swap_info_struct *si = swp_swap_info(entry);
+ struct swap_info_struct *si = __swap_entry_to_info(entry);
struct folio *folio;
struct folio *new_folio = NULL;
struct folio *result = NULL;
@@ -377,14 +415,13 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
*new_page_allocated = false;
for (;;) {
int err;
+
/*
- * First check the swap cache. Since this is normally
- * called after swap_cache_get_folio() failed, re-calling
- * that would confuse statistics.
+ * Check the swap cache first; if a cached folio is found,
+ * return it unlocked. The caller will lock and check it.
*/
- folio = filemap_get_folio(swap_address_space(entry),
- swap_cache_index(entry));
- if (!IS_ERR(folio))
+ folio = swap_cache_get_folio(entry);
+ if (folio)
goto got_folio;
/*
@@ -426,7 +463,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
goto put_and_return;
/*
- * We might race against __delete_from_swap_cache(), and
+ * We might race against __swap_cache_del_folio(), and
* stumble across a swap_map entry whose SWAP_HAS_CACHE
* has not yet been cleared. Or race against another
* __read_swap_cache_async(), which has set SWAP_HAS_CACHE
@@ -444,10 +481,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
if (mem_cgroup_swapin_charge_folio(new_folio, NULL, gfp_mask, entry))
goto fail_unlock;
- /* May fail (-ENOMEM) if XArray node allocation failed. */
- if (add_to_swap_cache(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
- goto fail_unlock;
-
+ swap_cache_add_folio(new_folio, entry, &shadow);
memcg1_swapin(entry, 1);
if (shadow)
@@ -593,7 +627,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
unsigned long offset = entry_offset;
unsigned long start_offset, end_offset;
unsigned long mask;
- struct swap_info_struct *si = swp_swap_info(entry);
+ struct swap_info_struct *si = __swap_entry_to_info(entry);
struct blk_plug plug;
struct swap_iocb *splug = NULL;
bool page_allocated;
@@ -639,41 +673,6 @@ skip:
return folio;
}
-int init_swap_address_space(unsigned int type, unsigned long nr_pages)
-{
- struct address_space *spaces, *space;
- unsigned int i, nr;
-
- nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
- spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL);
- if (!spaces)
- return -ENOMEM;
- for (i = 0; i < nr; i++) {
- space = spaces + i;
- xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ);
- atomic_set(&space->i_mmap_writable, 0);
- space->a_ops = &swap_aops;
- /* swap cache doesn't use writeback related tags */
- mapping_set_no_writeback_tags(space);
- }
- nr_swapper_spaces[type] = nr;
- swapper_spaces[type] = spaces;
-
- return 0;
-}
-
-void exit_swap_address_space(unsigned int type)
-{
- int i;
- struct address_space *spaces = swapper_spaces[type];
-
- for (i = 0; i < nr_swapper_spaces[type]; i++)
- VM_WARN_ON_ONCE(!mapping_empty(&spaces[i]));
- kvfree(spaces);
- nr_swapper_spaces[type] = 0;
- swapper_spaces[type] = NULL;
-}
-
static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start,
unsigned long *end)
{
@@ -846,7 +845,7 @@ static const struct attribute_group swap_attr_group = {
.attrs = swap_attrs,
};
-static int __init swap_init_sysfs(void)
+static int __init swap_init(void)
{
int err;
struct kobject *swap_kobj;
@@ -861,11 +860,13 @@ static int __init swap_init_sysfs(void)
pr_err("failed to register swap group\n");
goto delete_obj;
}
+ /* Swap cache writeback is LRU based, no tags for it */
+ mapping_set_no_writeback_tags(&swap_space);
return 0;
delete_obj:
kobject_put(swap_kobj);
return err;
}
-subsys_initcall(swap_init_sysfs);
+subsys_initcall(swap_init);
#endif
diff --git a/mm/swap_table.h b/mm/swap_table.h
new file mode 100644
index 000000000000..ea244a57a5b7
--- /dev/null
+++ b/mm/swap_table.h
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MM_SWAP_TABLE_H
+#define _MM_SWAP_TABLE_H
+
+#include <linux/rcupdate.h>
+#include <linux/atomic.h>
+#include "swap.h"
+
+/* A typical flat array in each cluster as swap table */
+struct swap_table {
+ atomic_long_t entries[SWAPFILE_CLUSTER];
+};
+
+#define SWP_TABLE_USE_PAGE (sizeof(struct swap_table) == PAGE_SIZE)
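
/*
 * Worked example (illustrative, not part of this patch), assuming 4K
 * pages and CONFIG_THP_SWAP on x86-64 where SWAPFILE_CLUSTER ==
 * HPAGE_PMD_NR == 512: sizeof(struct swap_table) is then
 * 512 * sizeof(atomic_long_t) == 4096 bytes, so SWP_TABLE_USE_PAGE is
 * true and each table is backed by a whole page. With
 * SWAPFILE_CLUSTER == 256 it is 2048 bytes and a slab cache is used
 * instead.
 */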
+
+/*
+ * A swap table entry represents the status of a swap slot on a swap
+ * (physical or virtual) device. The swap table in each cluster is a
+ * 1:1 map of the swap slots in this cluster.
+ *
+ * Each swap table entry could be a pointer (folio), a XA_VALUE
+ * (shadow), or NULL.
+ */
+
+/*
+ * Helpers for casting one type of info into a swap table entry.
+ */
+static inline unsigned long null_to_swp_tb(void)
+{
+ BUILD_BUG_ON(sizeof(unsigned long) != sizeof(atomic_long_t));
+ return 0;
+}
+
+static inline unsigned long folio_to_swp_tb(struct folio *folio)
+{
+ BUILD_BUG_ON(sizeof(unsigned long) != sizeof(void *));
+ return (unsigned long)folio;
+}
+
+static inline unsigned long shadow_swp_to_tb(void *shadow)
+{
+ BUILD_BUG_ON((BITS_PER_XA_VALUE + 1) !=
+ BITS_PER_BYTE * sizeof(unsigned long));
+ VM_WARN_ON_ONCE(shadow && !xa_is_value(shadow));
+ return (unsigned long)shadow;
+}
+
+/*
+ * Helpers for swap table entry type checking.
+ */
+static inline bool swp_tb_is_null(unsigned long swp_tb)
+{
+ return !swp_tb;
+}
+
+static inline bool swp_tb_is_folio(unsigned long swp_tb)
+{
+ return !xa_is_value((void *)swp_tb) && !swp_tb_is_null(swp_tb);
+}
+
+static inline bool swp_tb_is_shadow(unsigned long swp_tb)
+{
+ return xa_is_value((void *)swp_tb);
+}
+
+/*
+ * Helpers for retrieving info from swap table.
+ */
+static inline struct folio *swp_tb_to_folio(unsigned long swp_tb)
+{
+ VM_WARN_ON(!swp_tb_is_folio(swp_tb));
+ return (void *)swp_tb;
+}
+
+static inline void *swp_tb_to_shadow(unsigned long swp_tb)
+{
+ VM_WARN_ON(!swp_tb_is_shadow(swp_tb));
+ return (void *)swp_tb;
+}
+
+/*
+ * Helpers for accessing or modifying the swap table of a cluster,
+ * the swap cluster must be locked.
+ */
+static inline void __swap_table_set(struct swap_cluster_info *ci,
+ unsigned int off, unsigned long swp_tb)
+{
+ atomic_long_t *table = rcu_dereference_protected(ci->table, true);
+
+ lockdep_assert_held(&ci->lock);
+ VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
+ atomic_long_set(&table[off], swp_tb);
+}
+
+static inline unsigned long __swap_table_xchg(struct swap_cluster_info *ci,
+ unsigned int off, unsigned long swp_tb)
+{
+ atomic_long_t *table = rcu_dereference_protected(ci->table, true);
+
+ lockdep_assert_held(&ci->lock);
+ VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
+ /* Ordering is guaranteed by cluster lock, relax */
+ return atomic_long_xchg_relaxed(&table[off], swp_tb);
+}
+
+static inline unsigned long __swap_table_get(struct swap_cluster_info *ci,
+ unsigned int off)
+{
+ atomic_long_t *table;
+
+ VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
+ table = rcu_dereference_check(ci->table, lockdep_is_held(&ci->lock));
+
+ return atomic_long_read(&table[off]);
+}
+
+static inline unsigned long swap_table_get(struct swap_cluster_info *ci,
+ unsigned int off)
+{
+ atomic_long_t *table;
+ unsigned long swp_tb;
+
+ rcu_read_lock();
+ table = rcu_dereference(ci->table);
+ swp_tb = table ? atomic_long_read(&table[off]) : null_to_swp_tb();
+ rcu_read_unlock();
+
+ return swp_tb;
+}
+#endif
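
/*
 * Illustrative sketch, not part of this patch: how a caller holding the
 * cluster lock could decode one swap table slot with the helpers above.
 * The "example_" name is hypothetical; "ci" and "ci_off" are assumed to
 * be a locked cluster and a valid in-cluster offset.
 */
static inline struct folio *example_slot_folio(struct swap_cluster_info *ci,
					       unsigned int ci_off)
{
	unsigned long swp_tb = __swap_table_get(ci, ci_off);

	if (swp_tb_is_null(swp_tb))
		return NULL;		/* free slot: no cache, no shadow */
	if (swp_tb_is_shadow(swp_tb))
		return NULL;		/* only a workingset shadow remains */
	return swp_tb_to_folio(swp_tb);	/* folio currently in the swap cache */
}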
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 412ccd6543b3..10760240a3a2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -46,21 +46,22 @@
#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/swap_cgroup.h>
+#include "swap_table.h"
#include "internal.h"
#include "swap.h"
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
-static void swap_entry_range_free(struct swap_info_struct *si,
- struct swap_cluster_info *ci,
- swp_entry_t entry, unsigned int nr_pages);
+static void swap_entries_free(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry, unsigned int nr_pages);
static void swap_range_alloc(struct swap_info_struct *si,
unsigned int nr_entries);
static bool folio_swapcache_freeable(struct folio *folio);
-static struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
- unsigned long offset);
-static inline void unlock_cluster(struct swap_cluster_info *ci);
+static void move_cluster(struct swap_info_struct *si,
+ struct swap_cluster_info *ci, struct list_head *list,
+ enum swap_cluster_flags new_flags);
static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
@@ -105,7 +106,9 @@ static PLIST_HEAD(swap_active_head);
static struct plist_head *swap_avail_heads;
static DEFINE_SPINLOCK(swap_avail_lock);
-static struct swap_info_struct *swap_info[MAX_SWAPFILES];
+struct swap_info_struct *swap_info[MAX_SWAPFILES];
+
+static struct kmem_cache *swap_table_cachep;
static DEFINE_MUTEX(swapon_mutex);
@@ -127,14 +130,20 @@ static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = {
.lock = INIT_LOCAL_LOCK(),
};
-static struct swap_info_struct *swap_type_to_swap_info(int type)
+/* May return NULL on invalid type, caller must check for NULL return */
+static struct swap_info_struct *swap_type_to_info(int type)
{
if (type >= MAX_SWAPFILES)
return NULL;
-
return READ_ONCE(swap_info[type]); /* rcu_dereference() */
}
+/* May return NULL on invalid entry, caller must check for NULL return */
+static struct swap_info_struct *swap_entry_to_info(swp_entry_t entry)
+{
+ return swap_type_to_info(swp_type(entry));
+}
+
static inline unsigned char swap_count(unsigned char ent)
{
return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */
@@ -192,7 +201,7 @@ static bool swap_is_last_map(struct swap_info_struct *si,
unsigned char *map_end = map + nr_pages;
unsigned char count = *map;
- if (swap_count(count) != 1)
+ if (swap_count(count) != 1 && swap_count(count) != SWAP_MAP_SHMEM)
return false;
while (++map < map_end) {
@@ -212,16 +221,15 @@ static bool swap_is_last_map(struct swap_info_struct *si,
static int __try_to_reclaim_swap(struct swap_info_struct *si,
unsigned long offset, unsigned long flags)
{
- swp_entry_t entry = swp_entry(si->type, offset);
- struct address_space *address_space = swap_address_space(entry);
+ const swp_entry_t entry = swp_entry(si->type, offset);
struct swap_cluster_info *ci;
struct folio *folio;
int ret, nr_pages;
bool need_reclaim;
again:
- folio = filemap_get_folio(address_space, swap_cache_index(entry));
- if (IS_ERR(folio))
+ folio = swap_cache_get_folio(entry);
+ if (!folio)
return 0;
nr_pages = folio_nr_pages(folio);
@@ -241,13 +249,12 @@ again:
* Offset could point to the middle of a large folio, or folio
* may no longer point to the expected offset before it's locked.
*/
- entry = folio->swap;
- if (offset < swp_offset(entry) || offset >= swp_offset(entry) + nr_pages) {
+ if (!folio_matches_swap_entry(folio, entry)) {
folio_unlock(folio);
folio_put(folio);
goto again;
}
- offset = swp_offset(entry);
+ offset = swp_offset(folio->swap);
need_reclaim = ((flags & TTRS_ANYWAY) ||
((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
@@ -260,13 +267,13 @@ again:
* swap_map is HAS_CACHE only, which means the slots have no page table
* reference or pending writeback, and can't be allocated to others.
*/
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
need_reclaim = swap_only_has_cache(si, offset, nr_pages);
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
if (!need_reclaim)
goto out_unlock;
- delete_from_swap_cache(folio);
+ swap_cache_del_folio(folio);
folio_set_dirty(folio);
ret = nr_pages;
out_unlock:
@@ -347,7 +354,7 @@ offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
sector_t swap_folio_sector(struct folio *folio)
{
- struct swap_info_struct *sis = swp_swap_info(folio->swap);
+ struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
struct swap_extent *se;
sector_t sector;
pgoff_t offset;
@@ -387,19 +394,6 @@ static void discard_swap_cluster(struct swap_info_struct *si,
}
}
-#ifdef CONFIG_THP_SWAP
-#define SWAPFILE_CLUSTER HPAGE_PMD_NR
-
-#define swap_entry_order(order) (order)
-#else
-#define SWAPFILE_CLUSTER 256
-
-/*
- * Define swap_entry_order() as constant to let compiler to optimize
- * out some code if !CONFIG_THP_SWAP
- */
-#define swap_entry_order(order) 0
-#endif
#define LATENCY_LIMIT 256
static inline bool cluster_is_empty(struct swap_cluster_info *info)
@@ -412,10 +406,17 @@ static inline bool cluster_is_discard(struct swap_cluster_info *info)
return info->flags == CLUSTER_FLAG_DISCARD;
}
+static inline bool cluster_table_is_alloced(struct swap_cluster_info *ci)
+{
+ return rcu_dereference_protected(ci->table, lockdep_is_held(&ci->lock));
+}
+
static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order)
{
if (unlikely(ci->flags > CLUSTER_FLAG_USABLE))
return false;
+ if (!cluster_table_is_alloced(ci))
+ return false;
if (!order)
return true;
return cluster_is_empty(ci) || order == ci->order;
@@ -427,32 +428,126 @@ static inline unsigned int cluster_index(struct swap_info_struct *si,
return ci - si->cluster_info;
}
-static inline struct swap_cluster_info *offset_to_cluster(struct swap_info_struct *si,
- unsigned long offset)
-{
- return &si->cluster_info[offset / SWAPFILE_CLUSTER];
-}
-
static inline unsigned int cluster_offset(struct swap_info_struct *si,
struct swap_cluster_info *ci)
{
return cluster_index(si, ci) * SWAPFILE_CLUSTER;
}
-static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
- unsigned long offset)
+static struct swap_table *swap_table_alloc(gfp_t gfp)
{
- struct swap_cluster_info *ci;
+ struct folio *folio;
- ci = offset_to_cluster(si, offset);
- spin_lock(&ci->lock);
+ if (!SWP_TABLE_USE_PAGE)
+ return kmem_cache_zalloc(swap_table_cachep, gfp);
- return ci;
+ folio = folio_alloc(gfp | __GFP_ZERO, 0);
+ if (folio)
+ return folio_address(folio);
+ return NULL;
+}
+
+static void swap_table_free_folio_rcu_cb(struct rcu_head *head)
+{
+ struct folio *folio;
+
+ folio = page_folio(container_of(head, struct page, rcu_head));
+ folio_put(folio);
}
-static inline void unlock_cluster(struct swap_cluster_info *ci)
+static void swap_table_free(struct swap_table *table)
{
+ if (!SWP_TABLE_USE_PAGE) {
+ kmem_cache_free(swap_table_cachep, table);
+ return;
+ }
+
+ call_rcu(&(folio_page(virt_to_folio(table), 0)->rcu_head),
+ swap_table_free_folio_rcu_cb);
+}
+
+static void swap_cluster_free_table(struct swap_cluster_info *ci)
+{
+ unsigned int ci_off;
+ struct swap_table *table;
+
+ /* Only an empty cluster's table is allowed to be freed */
+ lockdep_assert_held(&ci->lock);
+ VM_WARN_ON_ONCE(!cluster_is_empty(ci));
+ for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++)
+ VM_WARN_ON_ONCE(!swp_tb_is_null(__swap_table_get(ci, ci_off)));
+ table = (void *)rcu_dereference_protected(ci->table, true);
+ rcu_assign_pointer(ci->table, NULL);
+
+ swap_table_free(table);
+}
+
+/*
+ * Allocate swap table for one cluster. Attempt an atomic allocation first,
+ * then fall back to a sleeping allocation.
+ */
+static struct swap_cluster_info *
+swap_cluster_alloc_table(struct swap_info_struct *si,
+ struct swap_cluster_info *ci)
+{
+ struct swap_table *table;
+
+ /*
+ * Only cluster isolation from the allocator does table allocation.
+ * Swap allocator uses percpu clusters and holds the local lock.
+ */
+ lockdep_assert_held(&ci->lock);
+ lockdep_assert_held(&this_cpu_ptr(&percpu_swap_cluster)->lock);
+
+ /* The cluster must be free and was just isolated from the free list. */
+ VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci));
+
+ table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ if (table) {
+ rcu_assign_pointer(ci->table, table);
+ return ci;
+ }
+
+ /*
+ * Try a sleeping allocation. Each isolated free cluster may trigger
+ * one, but the number of isolated clusters is limited, so the
+ * potential recursive allocation is limited too.
+ */
spin_unlock(&ci->lock);
+ if (!(si->flags & SWP_SOLIDSTATE))
+ spin_unlock(&si->global_cluster_lock);
+ local_unlock(&percpu_swap_cluster.lock);
+
+ table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | GFP_KERNEL);
+
+ /*
+ * Back to atomic context. We might have migrated to a new CPU with a
+ * usable percpu cluster. But just keep using the isolated cluster to
+ * make things easier. Migration indicates a slight change of workload,
+ * so using a new free cluster might not be a bad idea, and the worst
+ * that can happen from ignoring the percpu cluster is fragmentation,
+ * which is acceptable since this fallback and race are rare.
+ */
+ local_lock(&percpu_swap_cluster.lock);
+ if (!(si->flags & SWP_SOLIDSTATE))
+ spin_lock(&si->global_cluster_lock);
+ spin_lock(&ci->lock);
+
+ /* Nothing except this helper should touch a dangling empty cluster. */
+ if (WARN_ON_ONCE(cluster_table_is_alloced(ci))) {
+ if (table)
+ swap_table_free(table);
+ return ci;
+ }
+
+ if (!table) {
+ move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
+ spin_unlock(&ci->lock);
+ return NULL;
+ }
+
+ rcu_assign_pointer(ci->table, table);
+ return ci;
}
static void move_cluster(struct swap_info_struct *si,
@@ -470,11 +565,6 @@ static void move_cluster(struct swap_info_struct *si,
else
list_move_tail(&ci->list, list);
spin_unlock(&si->lock);
-
- if (ci->flags == CLUSTER_FLAG_FRAG)
- atomic_long_dec(&si->frag_cluster_nr[ci->order]);
- else if (new_flags == CLUSTER_FLAG_FRAG)
- atomic_long_inc(&si->frag_cluster_nr[ci->order]);
ci->flags = new_flags;
}
@@ -489,7 +579,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
{
- lockdep_assert_held(&ci->lock);
+ swap_cluster_free_table(ci);
move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
ci->order = 0;
}
@@ -504,15 +594,11 @@ static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info
* this returns NULL for a non-empty list.
*/
static struct swap_cluster_info *isolate_lock_cluster(
- struct swap_info_struct *si, struct list_head *list)
+ struct swap_info_struct *si, struct list_head *list, int order)
{
- struct swap_cluster_info *ci, *ret = NULL;
+ struct swap_cluster_info *ci, *found = NULL;
spin_lock(&si->lock);
-
- if (unlikely(!(si->flags & SWP_WRITEOK)))
- goto out;
-
list_for_each_entry(ci, list, list) {
if (!spin_trylock(&ci->lock))
continue;
@@ -524,13 +610,19 @@ static struct swap_cluster_info *isolate_lock_cluster(
list_del(&ci->list);
ci->flags = CLUSTER_FLAG_NONE;
- ret = ci;
+ found = ci;
break;
}
-out:
spin_unlock(&si->lock);
- return ret;
+ if (found && !cluster_table_is_alloced(found)) {
+ /* Only an empty free cluster's swap table can be freed. */
+ VM_WARN_ON_ONCE(list != &si->free_clusters);
+ VM_WARN_ON_ONCE(!cluster_is_empty(found));
+ return swap_cluster_alloc_table(si, found);
+ }
+
+ return found;
}
/*
@@ -663,17 +755,27 @@ static void relocate_cluster(struct swap_info_struct *si,
* added to free cluster list and its usage counter will be increased by 1.
* Only used for initialization.
*/
-static void inc_cluster_info_page(struct swap_info_struct *si,
+static int inc_cluster_info_page(struct swap_info_struct *si,
struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
unsigned long idx = page_nr / SWAPFILE_CLUSTER;
+ struct swap_table *table;
struct swap_cluster_info *ci;
ci = cluster_info + idx;
+ if (!ci->table) {
+ table = swap_table_alloc(GFP_KERNEL);
+ if (!table)
+ return -ENOMEM;
+ rcu_assign_pointer(ci->table, table);
+ }
+
ci->count++;
VM_BUG_ON(ci->count > SWAPFILE_CLUSTER);
VM_BUG_ON(ci->flags);
+
+ return 0;
}
static bool cluster_reclaim_range(struct swap_info_struct *si,
@@ -742,6 +844,26 @@ static bool cluster_scan_range(struct swap_info_struct *si,
return true;
}
+/*
+ * Currently, the swap table is not used for count tracking; just
+ * do a sanity check here to ensure nothing leaked, since the swap
+ * table should be empty upon freeing.
+ */
+static void swap_cluster_assert_table_empty(struct swap_cluster_info *ci,
+ unsigned int start, unsigned int nr)
+{
+ unsigned int ci_off = start % SWAPFILE_CLUSTER;
+ unsigned int ci_end = ci_off + nr;
+ unsigned long swp_tb;
+
+ if (IS_ENABLED(CONFIG_DEBUG_VM)) {
+ do {
+ swp_tb = __swap_table_get(ci, ci_off);
+ VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb));
+ } while (++ci_off < ci_end);
+ }
+}
+
static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
unsigned int start, unsigned char usage,
unsigned int order)
@@ -761,6 +883,7 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster
ci->order = order;
memset(si->swap_map + start, usage, nr_pages);
+ swap_cluster_assert_table_empty(ci, start, nr_pages);
swap_range_alloc(si, nr_pages);
ci->count += nr_pages;
@@ -815,7 +938,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
}
out:
relocate_cluster(si, ci);
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
if (si->flags & SWP_SOLIDSTATE) {
this_cpu_write(percpu_swap_cluster.offset[order], next);
this_cpu_write(percpu_swap_cluster.si[order], si);
@@ -825,6 +948,29 @@ out:
return found;
}
+static unsigned int alloc_swap_scan_list(struct swap_info_struct *si,
+ struct list_head *list,
+ unsigned int order,
+ unsigned char usage,
+ bool scan_all)
+{
+ unsigned int found = SWAP_ENTRY_INVALID;
+
+ do {
+ struct swap_cluster_info *ci = isolate_lock_cluster(si, list, order);
+ unsigned long offset;
+
+ if (!ci)
+ break;
+ offset = cluster_offset(si, ci);
+ found = alloc_swap_scan_cluster(si, ci, offset, order, usage);
+ if (found)
+ break;
+ } while (scan_all);
+
+ return found;
+}
+
static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
{
long to_scan = 1;
@@ -836,7 +982,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
if (force)
to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER;
- while ((ci = isolate_lock_cluster(si, &si->full_clusters))) {
+ while ((ci = isolate_lock_cluster(si, &si->full_clusters, 0))) {
offset = cluster_offset(si, ci);
end = min(si->max, offset + SWAPFILE_CLUSTER);
to_scan--;
@@ -859,7 +1005,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
if (ci->flags == CLUSTER_FLAG_NONE)
relocate_cluster(si, ci);
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
if (to_scan <= 0)
break;
}
@@ -898,7 +1044,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
if (offset == SWAP_ENTRY_INVALID)
goto new_cluster;
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
/* Cluster could have been used by another order */
if (cluster_is_usable(ci, order)) {
if (cluster_is_empty(ci))
@@ -906,59 +1052,58 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
found = alloc_swap_scan_cluster(si, ci, offset,
order, usage);
} else {
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
}
if (found)
goto done;
}
new_cluster:
- ci = isolate_lock_cluster(si, &si->free_clusters);
- if (ci) {
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- order, usage);
+ /*
+ * If the device needs discard, prefer a new cluster over a nonfull
+ * one to spread out the writes.
+ */
+ if (si->flags & SWP_PAGE_DISCARD) {
+ found = alloc_swap_scan_list(si, &si->free_clusters, order, usage,
+ false);
if (found)
goto done;
}
- /* Try reclaim from full clusters if free clusters list is drained */
+ if (order < PMD_ORDER) {
+ found = alloc_swap_scan_list(si, &si->nonfull_clusters[order],
+ order, usage, true);
+ if (found)
+ goto done;
+ }
+
+ if (!(si->flags & SWP_PAGE_DISCARD)) {
+ found = alloc_swap_scan_list(si, &si->free_clusters, order, usage,
+ false);
+ if (found)
+ goto done;
+ }
+
+ /* Try reclaiming full clusters if the free and nonfull lists are drained */
if (vm_swap_full())
swap_reclaim_full_clusters(si, false);
if (order < PMD_ORDER) {
- unsigned int frags = 0, frags_existing;
-
- while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) {
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- order, usage);
- if (found)
- goto done;
- /* Clusters failed to allocate are moved to frag_clusters */
- frags++;
- }
-
- frags_existing = atomic_long_read(&si->frag_cluster_nr[order]);
- while (frags < frags_existing &&
- (ci = isolate_lock_cluster(si, &si->frag_clusters[order]))) {
- atomic_long_dec(&si->frag_cluster_nr[order]);
- /*
- * Rotate the frag list to iterate, they were all
- * failing high order allocation or moved here due to
- * per-CPU usage, but they could contain newly released
- * reclaimable (eg. lazy-freed swap cache) slots.
- */
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- order, usage);
- if (found)
- goto done;
- frags++;
- }
+ /*
+ * Scanning only one fragment cluster is good enough: order 0
+ * allocation will surely succeed, and a large allocation
+ * failure is not critical. Scanning one cluster still
+ * keeps the list rotated and reclaimed (for HAS_CACHE).
+ */
+ found = alloc_swap_scan_list(si, &si->frag_clusters[order], order,
+ usage, false);
+ if (found)
+ goto done;
}
/*
- * We don't have free cluster but have some clusters in
- * discarding, do discard now and reclaim them, then
- * reread cluster_next_cpu since we dropped si->lock
+ * We don't have a free cluster, but some clusters are pending
+ * discard; do the discard now and reclaim them.
*/
if ((si->flags & SWP_PAGE_DISCARD) && swap_do_scheduled_discard(si))
goto new_cluster;
@@ -972,24 +1117,20 @@ new_cluster:
* Clusters here have at least one usable slot and can't fail order 0
* allocation, but reclaim may drop si->lock and race with another user.
*/
- while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) {
- atomic_long_dec(&si->frag_cluster_nr[o]);
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- 0, usage);
- if (found)
- goto done;
- }
+ found = alloc_swap_scan_list(si, &si->frag_clusters[o],
+ 0, usage, true);
+ if (found)
+ goto done;
- while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[o]))) {
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- 0, usage);
- if (found)
- goto done;
- }
+ found = alloc_swap_scan_list(si, &si->nonfull_clusters[o],
+ 0, usage, true);
+ if (found)
+ goto done;
}
done:
if (!(si->flags & SWP_SOLIDSTATE))
spin_unlock(&si->global_cluster_lock);
+
return found;
}
@@ -1115,6 +1256,7 @@ static void swap_range_alloc(struct swap_info_struct *si,
if (vm_swap_full())
schedule_work(&si->reclaim_work);
}
+ atomic_long_sub(nr_entries, &nr_swap_pages);
}
static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
@@ -1145,7 +1287,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
swap_slot_free_notify(si->bdev, offset);
offset++;
}
- clear_shadow_from_swap_cache(si->type, begin, end);
+ __swap_cache_clear_shadow(swp_entry(si->type, begin), nr_entries);
/*
* Make sure that try_to_unuse() observes si->inuse_pages reaching 0
@@ -1192,7 +1334,7 @@ static bool swap_alloc_fast(swp_entry_t *entry,
if (!si || !offset || !get_swap_device_info(si))
return false;
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
if (cluster_is_usable(ci, order)) {
if (cluster_is_empty(ci))
offset = cluster_offset(si, ci);
@@ -1200,7 +1342,7 @@ static bool swap_alloc_fast(swp_entry_t *entry,
if (found)
*entry = swp_entry(si->type, found);
} else {
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
}
put_swap_device(si);
@@ -1302,18 +1444,8 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp)
if (!entry.val)
return -ENOMEM;
- /*
- * XArray node allocations from PF_MEMALLOC contexts could
- * completely exhaust the page allocator. __GFP_NOMEMALLOC
- * stops emergency reserves from being allocated.
- *
- * TODO: this could cause a theoretical memory reclaim
- * deadlock in the swap out path.
- */
- if (add_to_swap_cache(folio, entry, gfp | __GFP_NOMEMALLOC, NULL))
- goto out_free;
+ swap_cache_add_folio(folio, entry, NULL);
- atomic_long_sub(size, &nr_swap_pages);
return 0;
out_free:
@@ -1328,7 +1460,7 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
if (!entry.val)
goto out;
- si = swp_swap_info(entry);
+ si = swap_entry_to_info(entry);
if (!si)
goto bad_nofile;
if (data_race(!(si->flags & SWP_USED)))
@@ -1355,10 +1487,12 @@ out:
return NULL;
}
-static unsigned char __swap_entry_free_locked(struct swap_info_struct *si,
- unsigned long offset,
- unsigned char usage)
+static unsigned char swap_entry_put_locked(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry,
+ unsigned char usage)
{
+ unsigned long offset = swp_offset(entry);
unsigned char count;
unsigned char has_cache;
@@ -1390,7 +1524,7 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *si,
if (usage)
WRITE_ONCE(si->swap_map[offset], usage);
else
- WRITE_ONCE(si->swap_map[offset], SWAP_HAS_CACHE);
+ swap_entries_free(si, ci, entry, 1);
return usage;
}
@@ -1441,7 +1575,7 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
if (!entry.val)
goto out;
- si = swp_swap_info(entry);
+ si = swap_entry_to_info(entry);
if (!si)
goto bad_nofile;
if (!get_swap_device_info(si))
@@ -1461,89 +1595,122 @@ put_out:
return NULL;
}
-static unsigned char __swap_entry_free(struct swap_info_struct *si,
- swp_entry_t entry)
+static void swap_entries_put_cache(struct swap_info_struct *si,
+ swp_entry_t entry, int nr)
{
- struct swap_cluster_info *ci;
unsigned long offset = swp_offset(entry);
- unsigned char usage;
-
- ci = lock_cluster(si, offset);
- usage = __swap_entry_free_locked(si, offset, 1);
- if (!usage)
- swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1);
- unlock_cluster(ci);
+ struct swap_cluster_info *ci;
- return usage;
+ ci = swap_cluster_lock(si, offset);
+ if (swap_only_has_cache(si, offset, nr)) {
+ swap_entries_free(si, ci, entry, nr);
+ } else {
+ for (int i = 0; i < nr; i++, entry.val++)
+ swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE);
+ }
+ swap_cluster_unlock(ci);
}
-static bool __swap_entries_free(struct swap_info_struct *si,
- swp_entry_t entry, int nr)
+static bool swap_entries_put_map(struct swap_info_struct *si,
+ swp_entry_t entry, int nr)
{
unsigned long offset = swp_offset(entry);
- unsigned int type = swp_type(entry);
struct swap_cluster_info *ci;
bool has_cache = false;
unsigned char count;
int i;
- if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1)
+ if (nr <= 1)
goto fallback;
- /* cross into another cluster */
- if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER)
+ count = swap_count(data_race(si->swap_map[offset]));
+ if (count != 1 && count != SWAP_MAP_SHMEM)
goto fallback;
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
if (!swap_is_last_map(si, offset, nr, &has_cache)) {
- unlock_cluster(ci);
- goto fallback;
+ goto locked_fallback;
}
- for (i = 0; i < nr; i++)
- WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE);
if (!has_cache)
- swap_entry_range_free(si, ci, entry, nr);
- unlock_cluster(ci);
+ swap_entries_free(si, ci, entry, nr);
+ else
+ for (i = 0; i < nr; i++)
+ WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE);
+ swap_cluster_unlock(ci);
return has_cache;
fallback:
- for (i = 0; i < nr; i++) {
- if (data_race(si->swap_map[offset + i])) {
- count = __swap_entry_free(si, swp_entry(type, offset + i));
- if (count == SWAP_HAS_CACHE)
- has_cache = true;
- } else {
- WARN_ON_ONCE(1);
- }
+ ci = swap_cluster_lock(si, offset);
+locked_fallback:
+ for (i = 0; i < nr; i++, entry.val++) {
+ count = swap_entry_put_locked(si, ci, entry, 1);
+ if (count == SWAP_HAS_CACHE)
+ has_cache = true;
}
+ swap_cluster_unlock(ci);
return has_cache;
}
/*
- * Drop the last HAS_CACHE flag of swap entries, caller have to
- * ensure all entries belong to the same cgroup.
+ * Only functions with an "_nr" suffix can free entries spanning
+ * multiple clusters, so ensure the range stays within a single cluster
+ * when freeing entries with functions that lack the "_nr" suffix.
*/
-static void swap_entry_range_free(struct swap_info_struct *si,
- struct swap_cluster_info *ci,
- swp_entry_t entry, unsigned int nr_pages)
+static bool swap_entries_put_map_nr(struct swap_info_struct *si,
+ swp_entry_t entry, int nr)
+{
+ int cluster_nr, cluster_rest;
+ unsigned long offset = swp_offset(entry);
+ bool has_cache = false;
+
+ cluster_rest = SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER;
+ while (nr) {
+ cluster_nr = min(nr, cluster_rest);
+ has_cache |= swap_entries_put_map(si, entry, cluster_nr);
+ cluster_rest = SWAPFILE_CLUSTER;
+ nr -= cluster_nr;
+ entry.val += cluster_nr;
+ }
+
+ return has_cache;
+}
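
/*
 * Worked example (illustrative, not part of this patch), assuming
 * SWAPFILE_CLUSTER == 512: a call covering 6 entries starting at offset
 * 510 has cluster_rest == 2, so it is split into swap_entries_put_map()
 * calls for 2 entries at offset 510 and then 4 entries at offset 512,
 * keeping each call within a single cluster.
 */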
+
+/*
+ * Check if it's the last reference of a swap entry in the freeing path.
+ * Qualifying values are 1, SWAP_HAS_CACHE and SWAP_MAP_SHMEM.
+ */
+static inline bool __maybe_unused swap_is_last_ref(unsigned char count)
+{
+ return (count == SWAP_HAS_CACHE) || (count == 1) ||
+ (count == SWAP_MAP_SHMEM);
+}
+
+/*
+ * Drop the last reference of swap entries; the caller has to ensure all
+ * entries belong to the same cgroup and cluster.
+ */
+static void swap_entries_free(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry, unsigned int nr_pages)
{
unsigned long offset = swp_offset(entry);
unsigned char *map = si->swap_map + offset;
unsigned char *map_end = map + nr_pages;
/* It should never free entries across different clusters */
- VM_BUG_ON(ci != offset_to_cluster(si, offset + nr_pages - 1));
+ VM_BUG_ON(ci != __swap_offset_to_cluster(si, offset + nr_pages - 1));
VM_BUG_ON(cluster_is_empty(ci));
VM_BUG_ON(ci->count < nr_pages);
ci->count -= nr_pages;
do {
- VM_BUG_ON(*map != SWAP_HAS_CACHE);
+ VM_BUG_ON(!swap_is_last_ref(*map));
*map = 0;
} while (++map < map_end);
mem_cgroup_uncharge_swap(entry, nr_pages);
swap_range_free(si, offset, nr_pages);
+ swap_cluster_assert_table_empty(ci, offset, nr_pages);
if (!ci->count)
free_cluster(si, ci);
@@ -1551,21 +1718,6 @@ static void swap_entry_range_free(struct swap_info_struct *si,
partial_free_cluster(si, ci);
}
-static void cluster_swap_free_nr(struct swap_info_struct *si,
- unsigned long offset, int nr_pages,
- unsigned char usage)
-{
- struct swap_cluster_info *ci;
- unsigned long end = offset + nr_pages;
-
- ci = lock_cluster(si, offset);
- do {
- if (!__swap_entry_free_locked(si, offset, usage))
- swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1);
- } while (++offset < end);
- unlock_cluster(ci);
-}
-
/*
* Caller has made sure that the swap device corresponding to entry
* is still around or has not been recycled.
@@ -1582,7 +1734,7 @@ void swap_free_nr(swp_entry_t entry, int nr_pages)
while (nr_pages) {
nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
- cluster_swap_free_nr(sis, offset, nr, 1);
+ swap_entries_put_map(sis, swp_entry(sis->type, offset), nr);
offset += nr;
nr_pages -= nr;
}
@@ -1593,8 +1745,6 @@ void swap_free_nr(swp_entry_t entry, int nr_pages)
*/
void put_swap_folio(struct folio *folio, swp_entry_t entry)
{
- unsigned long offset = swp_offset(entry);
- struct swap_cluster_info *ci;
struct swap_info_struct *si;
int size = 1 << swap_entry_order(folio_order(folio));
@@ -1602,21 +1752,12 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
if (!si)
return;
- ci = lock_cluster(si, offset);
- if (swap_only_has_cache(si, offset, size))
- swap_entry_range_free(si, ci, entry, size);
- else {
- for (int i = 0; i < size; i++, entry.val++) {
- if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE))
- swap_entry_range_free(si, ci, entry, 1);
- }
- }
- unlock_cluster(ci);
+ swap_entries_put_cache(si, entry, size);
}
int __swap_count(swp_entry_t entry)
{
- struct swap_info_struct *si = swp_swap_info(entry);
+ struct swap_info_struct *si = __swap_entry_to_info(entry);
pgoff_t offset = swp_offset(entry);
return swap_count(si->swap_map[offset]);
@@ -1633,9 +1774,9 @@ bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry)
struct swap_cluster_info *ci;
int count;
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
count = swap_count(si->swap_map[offset]);
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
return !!count;
}
@@ -1658,7 +1799,7 @@ int swp_swapcount(swp_entry_t entry)
offset = swp_offset(entry);
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
count = swap_count(si->swap_map[offset]);
if (!(count & COUNT_CONTINUED))
@@ -1681,7 +1822,7 @@ int swp_swapcount(swp_entry_t entry)
n *= (SWAP_CONT_MAX + 1);
} while (tmp_count & COUNT_CONTINUED);
out:
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
return count;
}
@@ -1696,7 +1837,7 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
int i;
bool ret = false;
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
if (nr_pages == 1) {
if (swap_count(map[roffset]))
ret = true;
@@ -1709,7 +1850,7 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
}
}
unlock_out:
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
return ret;
}
@@ -1773,7 +1914,7 @@ bool folio_free_swap(struct folio *folio)
if (folio_swapped(folio))
return false;
- delete_from_swap_cache(folio);
+ swap_cache_del_folio(folio);
folio_set_dirty(folio);
return true;
}
@@ -1806,7 +1947,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
/*
* First free all entries in the range.
*/
- any_only_cache = __swap_entries_free(si, entry, nr);
+ any_only_cache = swap_entries_put_map_nr(si, entry, nr);
/*
* Short-circuit the below loop if none of the entries had their
@@ -1816,13 +1957,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
goto out;
/*
- * Now go back over the range trying to reclaim the swap cache. This is
- * more efficient for large folios because we will only try to reclaim
- * the swap once per folio in the common case. If we do
- * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the
- * latter will get a reference and lock the folio for every individual
- * page but will only succeed once the swap slot for every subpage is
- * zero.
+ * Now go back over the range trying to reclaim the swap cache.
*/
for (offset = start_offset; offset < end_offset; offset += nr) {
nr = 1;
@@ -1853,7 +1988,7 @@ out:
swp_entry_t get_swap_page_of_type(int type)
{
- struct swap_info_struct *si = swap_type_to_swap_info(type);
+ struct swap_info_struct *si = swap_type_to_info(type);
unsigned long offset;
swp_entry_t entry = {0};
@@ -1863,7 +1998,13 @@ swp_entry_t get_swap_page_of_type(int type)
/* This is called for allocating swap entry, not cache */
if (get_swap_device_info(si)) {
if (si->flags & SWP_WRITEOK) {
+ /*
+ * Grab the local lock to be compliant
+ * with swap table allocation.
+ */
+ local_lock(&percpu_swap_cluster.lock);
offset = cluster_alloc_swap_entry(si, 0, 1);
+ local_unlock(&percpu_swap_cluster.lock);
if (offset) {
entry = swp_entry(si->type, offset);
atomic_long_dec(&nr_swap_pages);
@@ -1934,7 +2075,7 @@ int find_first_swap(dev_t *device)
*/
sector_t swapdev_block(int type, pgoff_t offset)
{
- struct swap_info_struct *si = swap_type_to_swap_info(type);
+ struct swap_info_struct *si = swap_type_to_info(type);
struct swap_extent *se;
if (!si || !(si->flags & SWP_WRITEOK))
@@ -1990,6 +2131,13 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
bool hwpoisoned = false;
int ret = 1;
+ /*
+ * If the folio was removed from the swap cache by someone else, continue
+ * to unuse other PTEs. try_to_unuse() may try again if we missed this one.
+ */
+ if (!folio_matches_swap_entry(folio, entry))
+ return 0;
+
swapcache = folio;
folio = ksm_might_need_to_copy(folio, vma, addr);
if (unlikely(!folio))
@@ -2116,7 +2264,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
pte_unmap(pte);
pte = NULL;
- folio = swap_cache_get_folio(entry, vma, addr);
+ folio = swap_cache_get_folio(entry);
if (!folio) {
struct vm_fault vmf = {
.vma = vma,
@@ -2241,6 +2389,8 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type)
VMA_ITERATOR(vmi, mm, 0);
mmap_read_lock(mm);
+ if (check_stable_address_space(mm))
+ goto unlock;
for_each_vma(vmi, vma) {
if (vma->anon_vma && !is_vm_hugetlb_page(vma)) {
ret = unuse_vma(vma, type);
@@ -2250,6 +2400,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type)
cond_resched();
}
+unlock:
mmap_read_unlock(mm);
return ret;
}
@@ -2342,8 +2493,8 @@ retry:
(i = find_next_to_unuse(si, i)) != 0) {
entry = swp_entry(type, i);
- folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
- if (IS_ERR(folio))
+ folio = swap_cache_get_folio(entry);
+ if (!folio)
continue;
/*
@@ -2368,7 +2519,7 @@ retry:
* Limit the number of retries? No: when mmget_not_zero()
* above fails, that mm is likely to be freeing swap from
* exit_mmap(), which proceeds at its own independent pace;
- * and even shmem_writepage() could have been preempted after
+ * and even shmem_writeout() could have been preempted after
* folio_alloc_swap(), temporarily hiding that swap. It's easy
* and robust (though cpu-intensive) just to keep retrying.
*/
@@ -2642,11 +2793,32 @@ static void wait_for_allocation(struct swap_info_struct *si)
BUG_ON(si->flags & SWP_WRITEOK);
for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) {
- ci = lock_cluster(si, offset);
- unlock_cluster(ci);
+ ci = swap_cluster_lock(si, offset);
+ swap_cluster_unlock(ci);
}
}
+static void free_cluster_info(struct swap_cluster_info *cluster_info,
+ unsigned long maxpages)
+{
+ struct swap_cluster_info *ci;
+ int i, nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
+
+ if (!cluster_info)
+ return;
+ for (i = 0; i < nr_clusters; i++) {
+ ci = cluster_info + i;
+ /* Clusters marked with bad pages still have their table allocated */
+ spin_lock(&ci->lock);
+ if (rcu_dereference_protected(ci->table, true)) {
+ ci->count = 0;
+ swap_cluster_free_table(ci);
+ }
+ spin_unlock(&ci->lock);
+ }
+ kvfree(cluster_info);
+}
+
/*
* Called after swap device's reference count is dead, so
* neither scan nor allocation will use it.
@@ -2679,6 +2851,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
struct address_space *mapping;
struct inode *inode;
struct filename *pathname;
+ unsigned int maxpages;
int err, found = 0;
if (!capable(CAP_SYS_ADMIN))
@@ -2781,12 +2954,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
swap_file = p->swap_file;
p->swap_file = NULL;
- p->max = 0;
swap_map = p->swap_map;
p->swap_map = NULL;
zeromap = p->zeromap;
p->zeromap = NULL;
+ maxpages = p->max;
cluster_info = p->cluster_info;
+ p->max = 0;
p->cluster_info = NULL;
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
@@ -2797,10 +2971,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
p->global_cluster = NULL;
vfree(swap_map);
kvfree(zeromap);
- kvfree(cluster_info);
+ free_cluster_info(cluster_info, maxpages);
/* Destroy swap account information */
swap_cgroup_swapoff(p->type);
- exit_swap_address_space(p->type);
inode = mapping->host;
@@ -2856,7 +3029,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
if (!l)
return SEQ_START_TOKEN;
- for (type = 0; (si = swap_type_to_swap_info(type)); type++) {
+ for (type = 0; (si = swap_type_to_info(type)); type++) {
if (!(si->flags & SWP_USED) || !si->swap_map)
continue;
if (!--l)
@@ -2877,7 +3050,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
type = si->type + 1;
++(*pos);
- for (; (si = swap_type_to_swap_info(type)); type++) {
+ for (; (si = swap_type_to_info(type)); type++) {
if (!(si->flags & SWP_USED) || !si->swap_map)
continue;
return si;
@@ -3138,60 +3311,40 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
return maxpages;
}
-static int setup_swap_map_and_extents(struct swap_info_struct *si,
- union swap_header *swap_header,
- unsigned char *swap_map,
- unsigned long maxpages,
- sector_t *span)
+static int setup_swap_map(struct swap_info_struct *si,
+ union swap_header *swap_header,
+ unsigned char *swap_map,
+ unsigned long maxpages)
{
- unsigned int nr_good_pages;
unsigned long i;
- int nr_extents;
-
- nr_good_pages = maxpages - 1; /* omit header page */
+ swap_map[0] = SWAP_MAP_BAD; /* omit header page */
for (i = 0; i < swap_header->info.nr_badpages; i++) {
unsigned int page_nr = swap_header->info.badpages[i];
if (page_nr == 0 || page_nr > swap_header->info.last_page)
return -EINVAL;
if (page_nr < maxpages) {
swap_map[page_nr] = SWAP_MAP_BAD;
- nr_good_pages--;
+ si->pages--;
}
}
- if (nr_good_pages) {
- swap_map[0] = SWAP_MAP_BAD;
- si->max = maxpages;
- si->pages = nr_good_pages;
- nr_extents = setup_swap_extents(si, span);
- if (nr_extents < 0)
- return nr_extents;
- nr_good_pages = si->pages;
- }
- if (!nr_good_pages) {
+ if (!si->pages) {
pr_warn("Empty swap-file\n");
return -EINVAL;
}
- return nr_extents;
+ return 0;
}
-#define SWAP_CLUSTER_INFO_COLS \
- DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
-#define SWAP_CLUSTER_SPACE_COLS \
- DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
-#define SWAP_CLUSTER_COLS \
- max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
-
static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
union swap_header *swap_header,
unsigned long maxpages)
{
unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
struct swap_cluster_info *cluster_info;
- unsigned long i, j, idx;
int err = -ENOMEM;
+ unsigned long i;
cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL);
if (!cluster_info)
@@ -3214,15 +3367,26 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
* Mark unusable pages as unavailable. The clusters aren't
* marked free yet, so no list operations are involved yet.
*
- * See setup_swap_map_and_extents(): header page, bad pages,
+ * See setup_swap_map(): header page, bad pages,
* and the EOF part of the last cluster.
*/
- inc_cluster_info_page(si, cluster_info, 0);
- for (i = 0; i < swap_header->info.nr_badpages; i++)
- inc_cluster_info_page(si, cluster_info,
- swap_header->info.badpages[i]);
- for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
- inc_cluster_info_page(si, cluster_info, i);
+ err = inc_cluster_info_page(si, cluster_info, 0);
+ if (err)
+ goto err;
+ for (i = 0; i < swap_header->info.nr_badpages; i++) {
+ unsigned int page_nr = swap_header->info.badpages[i];
+
+ if (page_nr >= maxpages)
+ continue;
+ err = inc_cluster_info_page(si, cluster_info, page_nr);
+ if (err)
+ goto err;
+ }
+ for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) {
+ err = inc_cluster_info_page(si, cluster_info, i);
+ if (err)
+ goto err;
+ }
INIT_LIST_HEAD(&si->free_clusters);
INIT_LIST_HEAD(&si->full_clusters);
@@ -3231,34 +3395,23 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
for (i = 0; i < SWAP_NR_ORDERS; i++) {
INIT_LIST_HEAD(&si->nonfull_clusters[i]);
INIT_LIST_HEAD(&si->frag_clusters[i]);
- atomic_long_set(&si->frag_cluster_nr[i], 0);
}
- /*
- * Reduce false cache line sharing between cluster_info and
- * sharing same address space.
- */
- for (j = 0; j < SWAP_CLUSTER_COLS; j++) {
- for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
- struct swap_cluster_info *ci;
- idx = i * SWAP_CLUSTER_COLS + j;
- ci = cluster_info + idx;
- if (idx >= nr_clusters)
- continue;
- if (ci->count) {
- ci->flags = CLUSTER_FLAG_NONFULL;
- list_add_tail(&ci->list, &si->nonfull_clusters[0]);
- continue;
- }
+ for (i = 0; i < nr_clusters; i++) {
+ struct swap_cluster_info *ci = &cluster_info[i];
+
+ if (ci->count) {
+ ci->flags = CLUSTER_FLAG_NONFULL;
+ list_add_tail(&ci->list, &si->nonfull_clusters[0]);
+ } else {
ci->flags = CLUSTER_FLAG_FREE;
list_add_tail(&ci->list, &si->free_clusters);
}
}
return cluster_info;
-
err_free:
- kvfree(cluster_info);
+ free_cluster_info(cluster_info, maxpages);
err:
return ERR_PTR(err);
}
@@ -3360,6 +3513,21 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
goto bad_swap_unlock_inode;
}
+ si->max = maxpages;
+ si->pages = maxpages - 1;
+ nr_extents = setup_swap_extents(si, &span);
+ if (nr_extents < 0) {
+ error = nr_extents;
+ goto bad_swap_unlock_inode;
+ }
+ if (si->pages != si->max - 1) {
+ pr_err("swap:%u != (max:%u - 1)\n", si->pages, si->max);
+ error = -EINVAL;
+ goto bad_swap_unlock_inode;
+ }
+
+ maxpages = si->max;
+
/* OK, set up the swap map and apply the bad block list */
swap_map = vzalloc(maxpages);
if (!swap_map) {
@@ -3371,12 +3539,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (error)
goto bad_swap_unlock_inode;
- nr_extents = setup_swap_map_and_extents(si, swap_header, swap_map,
- maxpages, &span);
- if (unlikely(nr_extents < 0)) {
- error = nr_extents;
+ error = setup_swap_map(si, swap_header, swap_map, maxpages);
+ if (error)
goto bad_swap_unlock_inode;
- }
/*
* Use kvmalloc_array instead of bitmap_zalloc as the allocation order might
@@ -3440,13 +3605,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
}
}
- error = init_swap_address_space(si->type, maxpages);
- if (error)
- goto bad_swap_unlock_inode;
-
error = zswap_swapon(si->type, maxpages);
if (error)
- goto free_swap_address_space;
+ goto bad_swap_unlock_inode;
/*
* Flush any pending IO and dirty mappings before we start using this
@@ -3481,8 +3642,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
goto out;
free_swap_zswap:
zswap_swapoff(si->type);
-free_swap_address_space:
- exit_swap_address_space(si->type);
bad_swap_unlock_inode:
inode_unlock(inode);
bad_swap:
@@ -3497,7 +3656,8 @@ bad_swap:
spin_unlock(&swap_lock);
vfree(swap_map);
kvfree(zeromap);
- kvfree(cluster_info);
+ if (cluster_info)
+ free_cluster_info(cluster_info, maxpages);
if (inced_nr_rotate_swap)
atomic_dec(&nr_rotate_swap);
if (swap_file)
@@ -3548,7 +3708,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
unsigned char has_cache;
int err, i;
- si = swp_swap_info(entry);
+ si = swap_entry_to_info(entry);
if (WARN_ON_ONCE(!si)) {
pr_err("%s%08lx\n", Bad_file, entry.val);
return -EINVAL;
@@ -3557,7 +3717,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
offset = swp_offset(entry);
VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
VM_WARN_ON(usage == 1 && nr > 1);
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
err = 0;
for (i = 0; i < nr; i++) {
@@ -3612,7 +3772,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
}
unlock_out:
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
return err;
}
@@ -3654,32 +3814,14 @@ int swapcache_prepare(swp_entry_t entry, int nr)
return __swap_duplicate(entry, SWAP_HAS_CACHE, nr);
}
-void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr)
-{
- unsigned long offset = swp_offset(entry);
-
- cluster_swap_free_nr(si, offset, nr, SWAP_HAS_CACHE);
-}
-
-struct swap_info_struct *swp_swap_info(swp_entry_t entry)
-{
- return swap_type_to_swap_info(swp_type(entry));
-}
-
/*
- * out-of-line methods to avoid include hell.
+ * Caller should ensure entries belong to the same folio so
+ * the entries won't span cross cluster boundary.
*/
-struct address_space *swapcache_mapping(struct folio *folio)
-{
- return swp_swap_info(folio->swap)->swap_file->f_mapping;
-}
-EXPORT_SYMBOL_GPL(swapcache_mapping);
-
-pgoff_t __folio_swap_cache_index(struct folio *folio)
+void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr)
{
- return swap_cache_index(folio->swap);
+ swap_entries_put_cache(si, entry, nr);
}
-EXPORT_SYMBOL_GPL(__folio_swap_cache_index);
/*
* add_swap_count_continuation - called when a swap count is duplicated
@@ -3724,7 +3866,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
offset = swp_offset(entry);
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
count = swap_count(si->swap_map[offset]);
@@ -3784,7 +3926,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
out_unlock_cont:
spin_unlock(&si->cont_lock);
out:
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
put_swap_device(si);
outer:
if (page)
@@ -3798,7 +3940,7 @@ outer:
* into, carry if so, or else fail until a new continuation page is allocated;
* when the original swap_map count is decremented from 0 with continuation,
* borrow from the continuation and report whether it still holds more.
- * Called while __swap_duplicate() or caller of __swap_entry_free_locked()
+ * Called while __swap_duplicate() or caller of swap_entry_put_locked()
* holds cluster lock.
*/
static bool swap_count_continued(struct swap_info_struct *si,
@@ -3958,6 +4100,16 @@ static int __init swapfile_init(void)
swapfile_maximum_size = arch_max_swapfile_size();
+ /*
+ * Once a cluster is freed, its swap table content is read
+ * only, and all swap cache readers (swap_cache_*) verify
+ * the content before use. So it's safe to use an RCU slab here.
+ */
+ if (!SWP_TABLE_USE_PAGE)
+ swap_table_cachep = kmem_cache_create("swap_table",
+ sizeof(struct swap_table),
+ 0, SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, NULL);
+
#ifdef CONFIG_MIGRATION
if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS))
swap_migration_ad_supported = true;
diff --git a/mm/truncate.c b/mm/truncate.c
index 5d98054094d1..91eb92a5ce4f 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -191,6 +191,7 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
{
loff_t pos = folio_pos(folio);
+ size_t size = folio_size(folio);
unsigned int offset, length;
struct page *split_at, *split_at2;
@@ -198,14 +199,13 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
offset = start - pos;
else
offset = 0;
- length = folio_size(folio);
- if (pos + length <= (u64)end)
- length = length - offset;
+ if (pos + size <= (u64)end)
+ length = size - offset;
else
length = end + 1 - pos - offset;
folio_wait_writeback(folio);
- if (length == folio_size(folio)) {
+ if (length == size) {
truncate_inode_folio(folio->mapping, folio);
return true;
}
@@ -224,16 +224,20 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
return true;
split_at = folio_page(folio, PAGE_ALIGN_DOWN(offset) / PAGE_SIZE);
- split_at2 = folio_page(folio,
- PAGE_ALIGN_DOWN(offset + length) / PAGE_SIZE);
-
if (!try_folio_split(folio, split_at, NULL)) {
/*
* try to split at offset + length to make sure folios within
* the range can be dropped, especially to avoid memory waste
* for shmem truncate
*/
- struct folio *folio2 = page_folio(split_at2);
+ struct folio *folio2;
+
+ if (offset + length == size)
+ goto no_split;
+
+ split_at2 = folio_page(folio,
+ PAGE_ALIGN_DOWN(offset + length) / PAGE_SIZE);
+ folio2 = page_folio(split_at2);
if (!folio_try_get(folio2))
goto no_split;
@@ -421,7 +425,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
for (i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
- /* We rely upon deletion not changing page->index */
+ /* We rely upon deletion not changing folio->index */
if (xa_is_value(folio))
continue;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index e0db855c89b4..af61b95c89e4 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -561,7 +561,7 @@ retry:
}
while (src_addr < src_start + len) {
- BUG_ON(dst_addr >= dst_start + len);
+ VM_WARN_ON_ONCE(dst_addr >= dst_start + len);
/*
* Serialize via vma_lock and hugetlb_fault_mutex.
@@ -602,7 +602,7 @@ retry:
if (unlikely(err == -ENOENT)) {
up_read(&ctx->map_changing_lock);
uffd_mfill_unlock(dst_vma);
- BUG_ON(!folio);
+ VM_WARN_ON_ONCE(!folio);
err = copy_folio_from_user(folio,
(const void __user *)src_addr, true);
@@ -614,7 +614,7 @@ retry:
dst_vma = NULL;
goto retry;
} else
- BUG_ON(folio);
+ VM_WARN_ON_ONCE(folio);
if (!err) {
dst_addr += vma_hpagesize;
@@ -635,9 +635,9 @@ out_unlock_vma:
out:
if (folio)
folio_put(folio);
- BUG_ON(copied < 0);
- BUG_ON(err > 0);
- BUG_ON(!copied && !err);
+ VM_WARN_ON_ONCE(copied < 0);
+ VM_WARN_ON_ONCE(err > 0);
+ VM_WARN_ON_ONCE(!copied && !err);
return copied ? copied : err;
}
#else /* !CONFIG_HUGETLB_PAGE */
@@ -711,12 +711,12 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
/*
* Sanitize the command parameters:
*/
- BUG_ON(dst_start & ~PAGE_MASK);
- BUG_ON(len & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(len & ~PAGE_MASK);
/* Does the address range wrap, or is the span zero-sized? */
- BUG_ON(src_start + len <= src_start);
- BUG_ON(dst_start + len <= dst_start);
+ VM_WARN_ON_ONCE(src_start + len <= src_start);
+ VM_WARN_ON_ONCE(dst_start + len <= dst_start);
src_addr = src_start;
dst_addr = dst_start;
@@ -775,7 +775,7 @@ retry:
while (src_addr < src_start + len) {
pmd_t dst_pmdval;
- BUG_ON(dst_addr >= dst_start + len);
+ VM_WARN_ON_ONCE(dst_addr >= dst_start + len);
dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
if (unlikely(!dst_pmd)) {
@@ -795,8 +795,8 @@ retry:
* (This includes the case where the PMD used to be THP and
* changed back to none after __pte_alloc().)
*/
- if (unlikely(!pmd_present(dst_pmdval) || pmd_trans_huge(dst_pmdval) ||
- pmd_devmap(dst_pmdval))) {
+ if (unlikely(!pmd_present(dst_pmdval) ||
+ pmd_trans_huge(dst_pmdval))) {
err = -EEXIST;
break;
}
@@ -818,7 +818,7 @@ retry:
up_read(&ctx->map_changing_lock);
uffd_mfill_unlock(dst_vma);
- BUG_ON(!folio);
+ VM_WARN_ON_ONCE(!folio);
kaddr = kmap_local_folio(folio, 0);
err = copy_from_user(kaddr,
@@ -832,7 +832,7 @@ retry:
flush_dcache_folio(folio);
goto retry;
} else
- BUG_ON(folio);
+ VM_WARN_ON_ONCE(folio);
if (!err) {
dst_addr += PAGE_SIZE;
@@ -852,9 +852,9 @@ out_unlock:
out:
if (folio)
folio_put(folio);
- BUG_ON(copied < 0);
- BUG_ON(err > 0);
- BUG_ON(!copied && !err);
+ VM_WARN_ON_ONCE(copied < 0);
+ VM_WARN_ON_ONCE(err > 0);
+ VM_WARN_ON_ONCE(!copied && !err);
return copied ? copied : err;
}
@@ -940,11 +940,11 @@ int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
/*
* Sanitize the command parameters:
*/
- BUG_ON(start & ~PAGE_MASK);
- BUG_ON(len & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(start & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(len & ~PAGE_MASK);
/* Does the address range wrap, or is the span zero-sized? */
- BUG_ON(start + len <= start);
+ VM_WARN_ON_ONCE(start + len <= start);
mmap_read_lock(dst_mm);
@@ -1026,18 +1026,64 @@ static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte,
pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd));
}
-static int move_present_pte(struct mm_struct *mm,
- struct vm_area_struct *dst_vma,
- struct vm_area_struct *src_vma,
- unsigned long dst_addr, unsigned long src_addr,
- pte_t *dst_pte, pte_t *src_pte,
- pte_t orig_dst_pte, pte_t orig_src_pte,
- pmd_t *dst_pmd, pmd_t dst_pmdval,
- spinlock_t *dst_ptl, spinlock_t *src_ptl,
- struct folio *src_folio)
+/*
+ * Check whether the two PTEs and the corresponding folio are eligible for a
+ * batched move. If so, return a pointer to the locked folio; otherwise return NULL.
+ *
+ * NOTE: a folio reference is not required, as the whole operation takes place
+ * within the PTL's critical section.
+ */
+static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma,
+ unsigned long src_addr,
+ pte_t *src_pte, pte_t *dst_pte,
+ struct anon_vma *src_anon_vma)
+{
+ pte_t orig_dst_pte, orig_src_pte;
+ struct folio *folio;
+
+ orig_dst_pte = ptep_get(dst_pte);
+ if (!pte_none(orig_dst_pte))
+ return NULL;
+
+ orig_src_pte = ptep_get(src_pte);
+ if (!pte_present(orig_src_pte) || is_zero_pfn(pte_pfn(orig_src_pte)))
+ return NULL;
+
+ folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
+ if (!folio || !folio_trylock(folio))
+ return NULL;
+ if (!PageAnonExclusive(&folio->page) || folio_test_large(folio) ||
+ folio_anon_vma(folio) != src_anon_vma) {
+ folio_unlock(folio);
+ return NULL;
+ }
+ return folio;
+}
+
+/*
+ * Moves src folios to dst in a batch as long as they share the same
+ * anon_vma as the first folio, are not large, and can successfully
+ * take the lock via folio_trylock().
+ */
+static long move_present_ptes(struct mm_struct *mm,
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma,
+ unsigned long dst_addr, unsigned long src_addr,
+ pte_t *dst_pte, pte_t *src_pte,
+ pte_t orig_dst_pte, pte_t orig_src_pte,
+ pmd_t *dst_pmd, pmd_t dst_pmdval,
+ spinlock_t *dst_ptl, spinlock_t *src_ptl,
+ struct folio **first_src_folio, unsigned long len,
+ struct anon_vma *src_anon_vma)
{
int err = 0;
+ struct folio *src_folio = *first_src_folio;
+ unsigned long src_start = src_addr;
+ unsigned long src_end;
+ len = pmd_addr_end(dst_addr, dst_addr + len) - dst_addr;
+ src_end = pmd_addr_end(src_addr, src_addr + len);
+ flush_cache_range(src_vma, src_addr, src_end);
double_pt_lock(dst_ptl, src_ptl);
if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
@@ -1051,31 +1097,56 @@ static int move_present_pte(struct mm_struct *mm,
err = -EBUSY;
goto out;
}
+ /* It's safe to drop the reference now as the page-table is holding one. */
+ folio_put(*first_src_folio);
+ *first_src_folio = NULL;
+ arch_enter_lazy_mmu_mode();
+
+ while (true) {
+ orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
+ /* Folio got pinned from under us. Put it back and fail the move. */
+ if (folio_maybe_dma_pinned(src_folio)) {
+ set_pte_at(mm, src_addr, src_pte, orig_src_pte);
+ err = -EBUSY;
+ break;
+ }
- orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);
- /* Folio got pinned from under us. Put it back and fail the move. */
- if (folio_maybe_dma_pinned(src_folio)) {
- set_pte_at(mm, src_addr, src_pte, orig_src_pte);
- err = -EBUSY;
- goto out;
- }
-
- folio_move_anon_rmap(src_folio, dst_vma);
- src_folio->index = linear_page_index(dst_vma, dst_addr);
+ folio_move_anon_rmap(src_folio, dst_vma);
+ src_folio->index = linear_page_index(dst_vma, dst_addr);
- orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot);
- /* Set soft dirty bit so userspace can notice the pte was moved */
+ orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
+ /* Set soft dirty bit so userspace can notice the pte was moved */
#ifdef CONFIG_MEM_SOFT_DIRTY
- orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
+ orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
#endif
- if (pte_dirty(orig_src_pte))
- orig_dst_pte = pte_mkdirty(orig_dst_pte);
- orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
+ if (pte_dirty(orig_src_pte))
+ orig_dst_pte = pte_mkdirty(orig_dst_pte);
+ orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
+ set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
- set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
+ src_addr += PAGE_SIZE;
+ if (src_addr == src_end)
+ break;
+ dst_addr += PAGE_SIZE;
+ dst_pte++;
+ src_pte++;
+
+ folio_unlock(src_folio);
+ src_folio = check_ptes_for_batched_move(src_vma, src_addr, src_pte,
+ dst_pte, src_anon_vma);
+ if (!src_folio)
+ break;
+ }
+
+ arch_leave_lazy_mmu_mode();
+ if (src_addr > src_start)
+ flush_tlb_range(src_vma, src_start, src_addr);
+
+ if (src_folio)
+ folio_unlock(src_folio);
out:
double_pt_unlock(dst_ptl, src_ptl);
- return err;
+ return src_addr > src_start ? src_addr - src_start : err;
}
static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
@@ -1084,8 +1155,18 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
pte_t orig_dst_pte, pte_t orig_src_pte,
pmd_t *dst_pmd, pmd_t dst_pmdval,
spinlock_t *dst_ptl, spinlock_t *src_ptl,
- struct folio *src_folio)
+ struct folio *src_folio,
+ struct swap_info_struct *si, swp_entry_t entry)
{
+ /*
+ * Check if the folio still belongs to the target swap entry after
+ * acquiring the lock. Folio can be freed in the swap cache while
+ * not locked.
+ */
+ if (src_folio && unlikely(!folio_test_swapcache(src_folio) ||
+ entry.val != src_folio->swap.val))
+ return -EAGAIN;
+
double_pt_lock(dst_ptl, src_ptl);
if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
@@ -1102,6 +1183,25 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
if (src_folio) {
folio_move_anon_rmap(src_folio, dst_vma);
src_folio->index = linear_page_index(dst_vma, dst_addr);
+ } else {
+ /*
+ * Check if the swap entry is cached after acquiring the src_pte
+ * lock. Otherwise, we might miss a newly loaded swap cache folio.
+ *
+ * Check swap_map directly to minimize overhead; READ_ONCE is sufficient.
+ * We are trying to catch a newly added swap cache folio: the only possible
+ * case is when a folio is swapped in and out again, staying in the swap
+ * cache and using the same entry, before the PTE check above. The PTL is
+ * acquired and released twice, each time after updating the swap_map's
+ * flag, so holding the PTL here ensures we see the updated value. False
+ * positives are possible, e.g. an SWP_SYNCHRONOUS_IO swapin may set the
+ * flag without touching the cache, or during the tiny synchronization
+ * window between swap cache and swap_map, but they are gone very quickly
+ * and the worst result is retry jitter.
+ */
+ if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) {
+ double_pt_unlock(dst_ptl, src_ptl);
+ return -EAGAIN;
+ }
}
orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
@@ -1111,7 +1211,7 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
double_pt_unlock(dst_ptl, src_ptl);
- return 0;
+ return PAGE_SIZE;
}
static int move_zeropage_pte(struct mm_struct *mm,
@@ -1138,20 +1238,20 @@ static int move_zeropage_pte(struct mm_struct *mm,
set_pte_at(mm, dst_addr, dst_pte, zero_pte);
double_pt_unlock(dst_ptl, src_ptl);
- return 0;
+ return PAGE_SIZE;
}
/*
- * The mmap_lock for reading is held by the caller. Just move the page
- * from src_pmd to dst_pmd if possible, and return true if succeeded
- * in moving the page.
+ * The mmap_lock for reading is held by the caller. Just move the page(s)
+ * from src_pmd to dst_pmd if possible, and return the number of bytes moved.
+ * On failure, an error code is returned.
*/
-static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
- struct vm_area_struct *dst_vma,
- struct vm_area_struct *src_vma,
- unsigned long dst_addr, unsigned long src_addr,
- __u64 mode)
+static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma,
+ unsigned long dst_addr, unsigned long src_addr,
+ unsigned long len, __u64 mode)
{
swp_entry_t entry;
struct swap_info_struct *si = NULL;
@@ -1165,11 +1265,10 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
struct folio *src_folio = NULL;
struct anon_vma *src_anon_vma = NULL;
struct mmu_notifier_range range;
- int err = 0;
+ long ret = 0;
- flush_cache_range(src_vma, src_addr, src_addr + PAGE_SIZE);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
- src_addr, src_addr + PAGE_SIZE);
+ src_addr, src_addr + len);
mmu_notifier_invalidate_range_start(&range);
retry:
/*
@@ -1183,7 +1282,7 @@ retry:
/* Retry if a huge pmd materialized from under us */
if (unlikely(!dst_pte)) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
@@ -1202,14 +1301,14 @@ retry:
* transparent huge pages under us.
*/
if (unlikely(!src_pte)) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
/* Sanity checks before the operation */
if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) ||
pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) {
- err = -EINVAL;
+ ret = -EINVAL;
goto out;
}
@@ -1217,7 +1316,7 @@ retry:
orig_dst_pte = ptep_get(dst_pte);
spin_unlock(dst_ptl);
if (!pte_none(orig_dst_pte)) {
- err = -EEXIST;
+ ret = -EEXIST;
goto out;
}
@@ -1226,21 +1325,21 @@ retry:
spin_unlock(src_ptl);
if (pte_none(orig_src_pte)) {
if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES))
- err = -ENOENT;
+ ret = -ENOENT;
else /* nothing to do to move a hole */
- err = 0;
+ ret = PAGE_SIZE;
goto out;
}
/* If PTE changed after we locked the folio then start over */
if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
if (pte_present(orig_src_pte)) {
if (is_zero_pfn(pte_pfn(orig_src_pte))) {
- err = move_zeropage_pte(mm, dst_vma, src_vma,
+ ret = move_zeropage_pte(mm, dst_vma, src_vma,
dst_addr, src_addr, dst_pte, src_pte,
orig_dst_pte, orig_src_pte,
dst_pmd, dst_pmdval, dst_ptl, src_ptl);
@@ -1263,14 +1362,14 @@ retry:
spin_lock(src_ptl);
if (!pte_same(orig_src_pte, ptep_get(src_pte))) {
spin_unlock(src_ptl);
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
if (!folio || !PageAnonExclusive(&folio->page)) {
spin_unlock(src_ptl);
- err = -EBUSY;
+ ret = -EBUSY;
goto out;
}
@@ -1284,7 +1383,7 @@ retry:
*/
if (!locked && folio_test_large(folio)) {
spin_unlock(src_ptl);
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
@@ -1303,7 +1402,7 @@ retry:
}
if (WARN_ON_ONCE(!folio_test_anon(src_folio))) {
- err = -EBUSY;
+ ret = -EBUSY;
goto out;
}
}
@@ -1314,8 +1413,8 @@ retry:
pte_unmap(src_pte);
pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
- err = split_folio(src_folio);
- if (err)
+ ret = split_folio(src_folio);
+ if (ret)
goto out;
/* have to reacquire the folio after it got split */
folio_unlock(src_folio);
@@ -1333,7 +1432,7 @@ retry:
src_anon_vma = folio_get_anon_vma(src_folio);
if (!src_anon_vma) {
/* page was unmapped from under us */
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
if (!anon_vma_trylock_write(src_anon_vma)) {
@@ -1346,10 +1445,11 @@ retry:
}
}
- err = move_present_pte(mm, dst_vma, src_vma,
- dst_addr, src_addr, dst_pte, src_pte,
- orig_dst_pte, orig_src_pte, dst_pmd,
- dst_pmdval, dst_ptl, src_ptl, src_folio);
+ ret = move_present_ptes(mm, dst_vma, src_vma,
+ dst_addr, src_addr, dst_pte, src_pte,
+ orig_dst_pte, orig_src_pte, dst_pmd,
+ dst_pmdval, dst_ptl, src_ptl, &src_folio,
+ len, src_anon_vma);
} else {
struct folio *folio = NULL;
@@ -1360,20 +1460,20 @@ retry:
pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
migration_entry_wait(mm, src_pmd, src_addr);
- err = -EAGAIN;
+ ret = -EAGAIN;
} else
- err = -EFAULT;
+ ret = -EFAULT;
goto out;
}
if (!pte_swp_exclusive(orig_src_pte)) {
- err = -EBUSY;
+ ret = -EBUSY;
goto out;
}
si = get_swap_device(entry);
if (unlikely(!si)) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
/*
@@ -1389,11 +1489,10 @@ retry:
* separately to allow proper handling.
*/
if (!src_folio)
- folio = filemap_get_folio(swap_address_space(entry),
- swap_cache_index(entry));
- if (!IS_ERR_OR_NULL(folio)) {
+ folio = swap_cache_get_folio(entry);
+ if (folio) {
if (folio_test_large(folio)) {
- err = -EBUSY;
+ ret = -EBUSY;
folio_put(folio);
goto out;
}
@@ -1410,9 +1509,9 @@ retry:
goto retry;
}
}
- err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
+ ret = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
- dst_ptl, src_ptl, src_folio);
+ dst_ptl, src_ptl, src_folio, si, entry);
}
out:
@@ -1424,15 +1523,20 @@ out:
folio_unlock(src_folio);
folio_put(src_folio);
}
- if (dst_pte)
- pte_unmap(dst_pte);
+ /*
+ * Unmap in reverse order (LIFO) to maintain proper kmap_local
+ * index ordering when CONFIG_HIGHPTE is enabled. We mapped dst_pte
+ * first, then src_pte, so we must unmap src_pte first, then dst_pte.
+ */
if (src_pte)
pte_unmap(src_pte);
+ if (dst_pte)
+ pte_unmap(dst_pte);
mmu_notifier_invalidate_range_end(&range);
if (si)
put_swap_device(si);
- return err;
+ return ret;
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -1703,21 +1807,19 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
{
struct mm_struct *mm = ctx->mm;
struct vm_area_struct *src_vma, *dst_vma;
- unsigned long src_addr, dst_addr;
+ unsigned long src_addr, dst_addr, src_end;
pmd_t *src_pmd, *dst_pmd;
long err = -EINVAL;
ssize_t moved = 0;
/* Sanitize the command parameters. */
- if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
- WARN_ON_ONCE(dst_start & ~PAGE_MASK) ||
- WARN_ON_ONCE(len & ~PAGE_MASK))
- goto out;
+ VM_WARN_ON_ONCE(src_start & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(len & ~PAGE_MASK);
/* Does the address range wrap, or is the span zero-sized? */
- if (WARN_ON_ONCE(src_start + len <= src_start) ||
- WARN_ON_ONCE(dst_start + len <= dst_start))
- goto out;
+ VM_WARN_ON_ONCE(src_start + len < src_start);
+ VM_WARN_ON_ONCE(dst_start + len < dst_start);
err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma);
if (err)
@@ -1748,8 +1850,8 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
if (err)
goto out_unlock;
- for (src_addr = src_start, dst_addr = dst_start;
- src_addr < src_start + len;) {
+ for (src_addr = src_start, dst_addr = dst_start, src_end = src_start + len;
+ src_addr < src_end;) {
spinlock_t *ptl;
pmd_t dst_pmdval;
unsigned long step_size;
@@ -1791,22 +1893,19 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
ptl = pmd_trans_huge_lock(src_pmd, src_vma);
if (ptl) {
- if (pmd_devmap(*src_pmd)) {
- spin_unlock(ptl);
- err = -ENOENT;
- break;
- }
-
/* Check if we can move the pmd without splitting it. */
if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
!pmd_none(dst_pmdval)) {
- struct folio *folio = pmd_folio(*src_pmd);
-
- if (!folio || (!is_huge_zero_folio(folio) &&
- !PageAnonExclusive(&folio->page))) {
- spin_unlock(ptl);
- err = -EBUSY;
- break;
+ /* Can be a migration entry */
+ if (pmd_present(*src_pmd)) {
+ struct folio *folio = pmd_folio(*src_pmd);
+
+ if (!is_huge_zero_folio(folio) &&
+ !PageAnonExclusive(&folio->page)) {
+ spin_unlock(ptl);
+ err = -EBUSY;
+ break;
+ }
}
spin_unlock(ptl);
@@ -1820,6 +1919,8 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
dst_addr, src_addr);
step_size = HPAGE_PMD_SIZE;
} else {
+ long ret;
+
if (pmd_none(*src_pmd)) {
if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
err = -ENOENT;
@@ -1836,10 +1937,13 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
break;
}
- err = move_pages_pte(mm, dst_pmd, src_pmd,
- dst_vma, src_vma,
- dst_addr, src_addr, mode);
- step_size = PAGE_SIZE;
+ ret = move_pages_ptes(mm, dst_pmd, src_pmd,
+ dst_vma, src_vma, dst_addr,
+ src_addr, src_end - src_addr, mode);
+ if (ret < 0)
+ err = ret;
+ else
+ step_size = ret;
}
cond_resched();
@@ -1867,18 +1971,18 @@ out_unlock:
up_read(&ctx->map_changing_lock);
uffd_move_unlock(dst_vma, src_vma);
out:
- VM_WARN_ON(moved < 0);
- VM_WARN_ON(err > 0);
- VM_WARN_ON(!moved && !err);
+ VM_WARN_ON_ONCE(moved < 0);
+ VM_WARN_ON_ONCE(err > 0);
+ VM_WARN_ON_ONCE(!moved && !err);
return moved ? moved : err;
}
static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
- vm_flags_t flags)
+ vm_flags_t vm_flags)
{
- const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;
+ const bool uffd_wp_changed = (vma->vm_flags ^ vm_flags) & VM_UFFD_WP;
- vm_flags_reset(vma, flags);
+ vm_flags_reset(vma, vm_flags);
/*
* For shared mappings, we want to enable writenotify while
* userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
@@ -1890,12 +1994,12 @@ static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
static void userfaultfd_set_ctx(struct vm_area_struct *vma,
struct userfaultfd_ctx *ctx,
- unsigned long flags)
+ vm_flags_t vm_flags)
{
vma_start_write(vma);
vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx};
userfaultfd_set_vm_flags(vma,
- (vma->vm_flags & ~__VM_UFFD_FLAGS) | flags);
+ (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags);
}
void userfaultfd_reset_ctx(struct vm_area_struct *vma)
@@ -1941,14 +2045,14 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
/* Assumes mmap write lock taken, and mm_struct pinned. */
int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
struct vm_area_struct *vma,
- unsigned long vm_flags,
+ vm_flags_t vm_flags,
unsigned long start, unsigned long end,
bool wp_async)
{
VMA_ITERATOR(vmi, ctx->mm, start);
struct vm_area_struct *prev = vma_prev(&vmi);
unsigned long vma_end;
- unsigned long new_flags;
+ vm_flags_t new_flags;
if (vma->vm_start < start)
prev = vma;
@@ -1956,10 +2060,10 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
for_each_vma_range(vmi, vma, end) {
cond_resched();
- BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
- BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
- vma->vm_userfaultfd_ctx.ctx != ctx);
- WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
+ VM_WARN_ON_ONCE(!vma_can_userfault(vma, vm_flags, wp_async));
+ VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx &&
+ vma->vm_userfaultfd_ctx.ctx != ctx);
+ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE));
/*
* Nothing to do: this vma is already registered into this
@@ -2035,8 +2139,8 @@ void userfaultfd_release_all(struct mm_struct *mm,
prev = NULL;
for_each_vma(vmi, vma) {
cond_resched();
- BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
- !!(vma->vm_flags & __VM_UFFD_FLAGS));
+ VM_WARN_ON_ONCE(!!vma->vm_userfaultfd_ctx.ctx ^
+ !!(vma->vm_flags & __VM_UFFD_FLAGS));
if (vma->vm_userfaultfd_ctx.ctx != ctx) {
prev = vma;
continue;
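
For orientation (not part of the patch), the reworked PTE-batching path above is ultimately driven from userspace by the UFFDIO_MOVE ioctl. The sketch below is hedged: the helper name and the simplified error handling are assumptions; only the uffdio_move fields and the bytes-moved return convention mirror the interface this patch builds on.

/*
 * Hedged userspace sketch: drive the batched move path with one
 * multi-page UFFDIO_MOVE request. Assumes 'uffd' was created with
 * UFFD_FEATURE_MOVE and the destination range is registered.
 */
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>

static long uffd_move_range(int uffd, unsigned long dst, unsigned long src,
			    unsigned long len)
{
	struct uffdio_move mv = {
		.dst	= dst,
		.src	= src,
		.len	= len,	/* may now be satisfied in per-PMD PTE batches */
		.mode	= 0,
	};

	if (ioctl(uffd, UFFDIO_MOVE, &mv) == -1 && mv.move <= 0)
		return -1;	/* nothing moved; errno describes the failure */

	return mv.move;		/* number of bytes the kernel actually moved */
}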
diff --git a/mm/util.c b/mm/util.c
index 448117da071f..8989d5767528 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -25,6 +25,7 @@
#include <linux/sizes.h>
#include <linux/compat.h>
#include <linux/fsnotify.h>
+#include <linux/page_idle.h>
#include <linux/uaccess.h>
@@ -314,7 +315,7 @@ void *memdup_user_nul(const void __user *src, size_t len)
EXPORT_SYMBOL(memdup_user_nul);
/* Check if the vma is being used as a stack by this task */
-int vma_is_stack_for_current(struct vm_area_struct *vma)
+int vma_is_stack_for_current(const struct vm_area_struct *vma)
{
struct task_struct * __maybe_unused t = current;
@@ -409,7 +410,7 @@ unsigned long arch_mmap_rnd(void)
return rnd << PAGE_SHIFT;
}
-static int mmap_is_legacy(struct rlimit *rlim_stack)
+static int mmap_is_legacy(const struct rlimit *rlim_stack)
{
if (current->personality & ADDR_COMPAT_LAYOUT)
return 1;
@@ -430,7 +431,7 @@ static int mmap_is_legacy(struct rlimit *rlim_stack)
#define MIN_GAP (SZ_128M)
#define MAX_GAP (STACK_TOP / 6 * 5)
-static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
+static unsigned long mmap_base(const unsigned long rnd, const struct rlimit *rlim_stack)
{
#ifdef CONFIG_STACK_GROWSUP
/*
@@ -461,7 +462,7 @@ static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
#endif
}
-void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
+void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack)
{
unsigned long random_factor = 0UL;
@@ -470,17 +471,17 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
if (mmap_is_legacy(rlim_stack)) {
mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
- clear_bit(MMF_TOPDOWN, &mm->flags);
+ mm_flags_clear(MMF_TOPDOWN, mm);
} else {
mm->mmap_base = mmap_base(random_factor, rlim_stack);
- set_bit(MMF_TOPDOWN, &mm->flags);
+ mm_flags_set(MMF_TOPDOWN, mm);
}
}
#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
-void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
+void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack)
{
mm->mmap_base = TASK_UNMAPPED_BASE;
- clear_bit(MMF_TOPDOWN, &mm->flags);
+ mm_flags_clear(MMF_TOPDOWN, mm);
}
#endif
#ifdef CONFIG_MMU
@@ -503,7 +504,7 @@ EXPORT_SYMBOL_IF_KUNIT(arch_pick_mmap_layout);
* * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
*/
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
- struct task_struct *task, bool bypass_rlim)
+ const struct task_struct *task, bool bypass_rlim)
{
unsigned long locked_vm, limit;
int ret = 0;
@@ -565,6 +566,7 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long pgoff)
{
+ loff_t off = (loff_t)pgoff << PAGE_SHIFT;
unsigned long ret;
struct mm_struct *mm = current->mm;
unsigned long populate;
@@ -572,7 +574,7 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
ret = security_mmap_file(file, prot, flag);
if (!ret)
- ret = fsnotify_mmap_perm(file, prot, pgoff >> PAGE_SHIFT, len);
+ ret = fsnotify_mmap_perm(file, prot, off, len);
if (!ret) {
if (mmap_write_lock_killable(mm))
return -EINTR;
@@ -670,9 +672,9 @@ struct anon_vma *folio_anon_vma(const struct folio *folio)
{
unsigned long mapping = (unsigned long)folio->mapping;
- if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
+ if ((mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
return NULL;
- return (void *)(mapping - PAGE_MAPPING_ANON);
+ return (void *)(mapping - FOLIO_MAPPING_ANON);
}
/**
@@ -687,7 +689,7 @@ struct anon_vma *folio_anon_vma(const struct folio *folio)
* You can call this for folios which aren't in the swap cache or page
* cache and it will return NULL.
*/
-struct address_space *folio_mapping(struct folio *folio)
+struct address_space *folio_mapping(const struct folio *folio)
{
struct address_space *mapping;
@@ -699,7 +701,7 @@ struct address_space *folio_mapping(struct folio *folio)
return swap_address_space(folio->swap);
mapping = folio->mapping;
- if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
+ if ((unsigned long)mapping & FOLIO_MAPPING_FLAGS)
return NULL;
return mapping;
@@ -925,7 +927,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
* Note this is a helper function intended to be used by LSMs which
* wish to use this logic.
*/
-int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
+int __vm_enough_memory(const struct mm_struct *mm, long pages, int cap_sys_admin)
{
long allowed;
unsigned long bytes_failed;
@@ -1131,3 +1133,214 @@ void flush_dcache_folio(struct folio *folio)
}
EXPORT_SYMBOL(flush_dcache_folio);
#endif
+
+/**
+ * __compat_vma_mmap_prepare() - See description for compat_vma_mmap_prepare()
+ * for details. This is the same operation, only with a specific file operations
+ * struct which may or may not be the same as vma->vm_file->f_op.
+ * @f_op: The file operations whose .mmap_prepare() hook is specified.
+ * @file: The file which backs or will back the mapping.
+ * @vma: The VMA to apply the .mmap_prepare() hook to.
+ * Returns: 0 on success or error.
+ */
+int __compat_vma_mmap_prepare(const struct file_operations *f_op,
+ struct file *file, struct vm_area_struct *vma)
+{
+ struct vm_area_desc desc = {
+ .mm = vma->vm_mm,
+ .file = file,
+ .start = vma->vm_start,
+ .end = vma->vm_end,
+
+ .pgoff = vma->vm_pgoff,
+ .vm_file = vma->vm_file,
+ .vm_flags = vma->vm_flags,
+ .page_prot = vma->vm_page_prot,
+ };
+ int err;
+
+ err = f_op->mmap_prepare(&desc);
+ if (err)
+ return err;
+ set_vma_from_desc(vma, &desc);
+
+ return 0;
+}
+EXPORT_SYMBOL(__compat_vma_mmap_prepare);
+
+/**
+ * compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an
+ * existing VMA.
+ * @file: The file which possesses an f_op->mmap_prepare() hook.
+ * @vma: The VMA to apply the .mmap_prepare() hook to.
+ *
+ * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain
+ * stacked filesystems invoke a nested mmap hook of an underlying file.
+ *
+ * Until all filesystems are converted to use .mmap_prepare(), we must be
+ * conservative and continue to invoke these stacked filesystems using the
+ * deprecated .mmap() hook.
+ *
+ * However, we have a problem if the underlying filesystem possesses an
+ * .mmap_prepare() hook, as we are in a different context when we invoke the
+ * .mmap() hook, already having a VMA to deal with.
+ *
+ * compat_vma_mmap_prepare() is a compatibility function that takes VMA state,
+ * establishes a struct vm_area_desc descriptor, passes it to the underlying
+ * .mmap_prepare() hook and applies any changes performed by it.
+ *
+ * Once the conversion of filesystems is complete this function will no longer
+ * be required and will be removed.
+ *
+ * Returns: 0 on success or error.
+ */
+int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma)
+{
+ return __compat_vma_mmap_prepare(file->f_op, file, vma);
+}
+EXPORT_SYMBOL(compat_vma_mmap_prepare);
+
+static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio,
+ const struct page *page)
+{
+ /*
+ * Only the first page of a high-order buddy page has PageBuddy() set.
+ * So we have to check manually whether this page is part of a high-
+ * order buddy page.
+ */
+ if (PageBuddy(page))
+ ps->flags |= PAGE_SNAPSHOT_PG_BUDDY;
+ else if (page_count(page) == 0 && is_free_buddy_page(page))
+ ps->flags |= PAGE_SNAPSHOT_PG_BUDDY;
+
+ if (folio_test_idle(folio))
+ ps->flags |= PAGE_SNAPSHOT_PG_IDLE;
+}
+
+/**
+ * snapshot_page() - Create a snapshot of a struct page
+ * @ps: Pointer to a struct page_snapshot to store the page snapshot
+ * @page: The page to snapshot
+ *
+ * Create a snapshot of the page and store both its struct page and struct
+ * folio representations in @ps.
+ *
+ * A snapshot is marked as "faithful" if the compound state of @page was
+ * stable and allowed safe reconstruction of the folio representation. In
+ * rare cases where this is not possible (e.g. due to folio splitting),
+ * snapshot_page() falls back to treating @page as a single page and the
+ * snapshot is marked as "unfaithful". The snapshot_page_is_faithful()
+ * helper can be used to check for this condition.
+ */
+void snapshot_page(struct page_snapshot *ps, const struct page *page)
+{
+ unsigned long head, nr_pages = 1;
+ struct folio *foliop;
+ int loops = 5;
+
+ ps->pfn = page_to_pfn(page);
+ ps->flags = PAGE_SNAPSHOT_FAITHFUL;
+
+again:
+ memset(&ps->folio_snapshot, 0, sizeof(struct folio));
+ memcpy(&ps->page_snapshot, page, sizeof(*page));
+ head = ps->page_snapshot.compound_head;
+ if ((head & 1) == 0) {
+ ps->idx = 0;
+ foliop = (struct folio *)&ps->page_snapshot;
+ if (!folio_test_large(foliop)) {
+ set_ps_flags(ps, page_folio(page), page);
+ memcpy(&ps->folio_snapshot, foliop,
+ sizeof(struct page));
+ return;
+ }
+ foliop = (struct folio *)page;
+ } else {
+ foliop = (struct folio *)(head - 1);
+ ps->idx = folio_page_idx(foliop, page);
+ }
+
+ if (ps->idx < MAX_FOLIO_NR_PAGES) {
+ memcpy(&ps->folio_snapshot, foliop, 2 * sizeof(struct page));
+ nr_pages = folio_nr_pages(&ps->folio_snapshot);
+ if (nr_pages > 1)
+ memcpy(&ps->folio_snapshot.__page_2, &foliop->__page_2,
+ sizeof(struct page));
+ set_ps_flags(ps, foliop, page);
+ }
+
+ if (ps->idx > nr_pages) {
+ if (loops-- > 0)
+ goto again;
+ clear_compound_head(&ps->page_snapshot);
+ foliop = (struct folio *)&ps->page_snapshot;
+ memcpy(&ps->folio_snapshot, foliop, sizeof(struct page));
+ ps->flags = 0;
+ ps->idx = 0;
+ }
+}
+
+#ifdef CONFIG_MMU
+/**
+ * folio_pte_batch - detect a PTE batch for a large folio
+ * @folio: The large folio to detect a PTE batch for.
+ * @ptep: Page table pointer for the first entry.
+ * @pte: Page table entry for the first page.
+ * @max_nr: The maximum number of table entries to consider.
+ *
+ * This is a simplified variant of folio_pte_batch_flags().
+ *
+ * Detect a PTE batch: consecutive (present) PTEs that map consecutive
+ * pages of the same large folio in a single VMA and a single page table.
+ *
+ * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
+ * the accessed bit, writable bit, dirty bit and soft-dirty bit.
+ *
+ * ptep must map any page of the folio. max_nr must be at least one and
+ * must be limited by the caller so scanning cannot exceed a single VMA and
+ * a single page table.
+ *
+ * Return: the number of table entries in the batch.
+ */
+unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte,
+ unsigned int max_nr)
+{
+ return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, 0);
+}
+#endif /* CONFIG_MMU */
+
+#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
+/**
+ * page_range_contiguous - test whether the page range is contiguous
+ * @page: the start of the page range.
+ * @nr_pages: the number of pages in the range.
+ *
+ * Test whether the page range is contiguous, such that the pages can be
+ * iterated naively, corresponding to iterating a contiguous PFN range.
+ *
+ * This function should primarily only be used for debug checks, or when
+ * working with page ranges that are not guaranteed to be naturally
+ * contiguous (unlike, e.g., pages within a folio, which always are).
+ *
+ * Returns true if contiguous, otherwise false.
+ */
+bool page_range_contiguous(const struct page *page, unsigned long nr_pages)
+{
+ const unsigned long start_pfn = page_to_pfn(page);
+ const unsigned long end_pfn = start_pfn + nr_pages;
+ unsigned long pfn;
+
+ /*
+ * The memmap is allocated per memory section, so no need to check
+ * within the first section. However, we need to check each other
+ * spanned memory section once, making sure the first page in a
+ * section could similarly be reached by just iterating pages.
+ */
+ for (pfn = ALIGN(start_pfn, PAGES_PER_SECTION);
+ pfn < end_pfn; pfn += PAGES_PER_SECTION)
+ if (unlikely(page + (pfn - start_pfn) != pfn_to_page(pfn)))
+ return false;
+ return true;
+}
+EXPORT_SYMBOL(page_range_contiguous);
+#endif
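
Since the new folio_pte_batch() kerneldoc above spells out its preconditions, a minimal caller sketch may help. Everything here other than the folio_pte_batch() call itself — the walker, its name, and its locking assumptions — is illustrative, not taken from the patch.

/*
 * Sketch only: assumes ptep points at the mapped PTE slot for addr, the
 * PTL is held, and 'end' does not cross the VMA or the PTE table, so
 * (end - addr) >> PAGE_SHIFT is a valid max_nr (>= 1).
 */
static unsigned long scan_folio_batches(struct vm_area_struct *vma,
					unsigned long addr, unsigned long end,
					pte_t *ptep)
{
	while (addr < end) {
		pte_t pte = ptep_get(ptep);
		unsigned int nr = 1;

		if (pte_present(pte)) {
			struct folio *folio = vm_normal_folio(vma, addr, pte);

			if (folio && folio_test_large(folio))
				nr = folio_pte_batch(folio, ptep, pte,
						     (end - addr) >> PAGE_SHIFT);
			/* ... operate on 'nr' PTEs of 'folio' at once ... */
		}
		addr += nr * PAGE_SIZE;
		ptep += nr;
	}
	return addr;
}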
diff --git a/mm/vma.c b/mm/vma.c
index 839d12f02c88..abe0da33c844 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -15,11 +15,15 @@ struct mmap_state {
unsigned long end;
pgoff_t pgoff;
unsigned long pglen;
- unsigned long flags;
+ vm_flags_t vm_flags;
struct file *file;
+ pgprot_t page_prot;
+
+ /* User-defined fields, perhaps updated by .mmap_prepare(). */
+ const struct vm_operations_struct *vm_ops;
+ void *vm_private_data;
unsigned long charged;
- bool retry_merge;
struct vm_area_struct *prev;
struct vm_area_struct *next;
@@ -28,9 +32,12 @@ struct mmap_state {
struct vma_munmap_struct vms;
struct ma_state mas_detach;
struct maple_tree mt_detach;
+
+ /* Determine if we can check KSM flags early in mmap() logic. */
+ bool check_ksm_early;
};
-#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, flags_, file_) \
+#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vm_flags_, file_) \
struct mmap_state name = { \
.mm = mm_, \
.vmi = vmi_, \
@@ -38,8 +45,9 @@ struct mmap_state {
.end = (addr_) + (len_), \
.pgoff = pgoff_, \
.pglen = PHYS_PFN(len_), \
- .flags = flags_, \
+ .vm_flags = vm_flags_, \
.file = file_, \
+ .page_prot = vm_get_page_prot(vm_flags_), \
}
#define VMG_MMAP_STATE(name, map_, vma_) \
@@ -48,7 +56,7 @@ struct mmap_state {
.vmi = (map_)->vmi, \
.start = (map_)->addr, \
.end = (map_)->end, \
- .flags = (map_)->flags, \
+ .vm_flags = (map_)->vm_flags, \
.pgoff = (map_)->pgoff, \
.file = (map_)->file, \
.prev = (map_)->prev, \
@@ -57,6 +65,22 @@ struct mmap_state {
.state = VMA_MERGE_START, \
}
+/*
+ * If, at any point, the VMA had unCoW'd mappings from parents, it will maintain
+ * more than one anon_vma_chain connecting it to more than one anon_vma. A merge
+ * would mean a wider range of folios sharing the root anon_vma lock, and thus
+ * potential lock contention; we do not wish to encourage merging such that this
+ * scales into a problem.
+ */
+static bool vma_had_uncowed_parents(struct vm_area_struct *vma)
+{
+ /*
+ * The list_is_singular() test is to avoid merging VMA cloned from
+ * parents. This can improve scalability caused by anon_vma lock.
+ */
+ return vma && vma->anon_vma && !list_is_singular(&vma->anon_vma_chain);
+}
+
static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
{
struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev;
@@ -71,7 +95,7 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex
* the kernel to generate new VMAs when old one could be
* extended instead.
*/
- if ((vma->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY)
+ if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_SOFTDIRTY)
return false;
if (vma->vm_file != vmg->file)
return false;
@@ -82,24 +106,28 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex
return true;
}
-static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
- struct anon_vma *anon_vma2, struct vm_area_struct *vma)
+static bool is_mergeable_anon_vma(struct vma_merge_struct *vmg, bool merge_next)
{
+ struct vm_area_struct *tgt = merge_next ? vmg->next : vmg->prev;
+ struct vm_area_struct *src = vmg->middle; /* existing merge case. */
+ struct anon_vma *tgt_anon = tgt->anon_vma;
+ struct anon_vma *src_anon = vmg->anon_vma;
+
/*
- * The list_is_singular() test is to avoid merging VMA cloned from
- * parents. This can improve scalability caused by anon_vma lock.
+ * We _can_ have !src with vmg->anon_vma set, via copy_vma(). In this instance
+ * we will remove the existing VMA's anon_vmas, so there are no scalability
+ * concerns.
*/
- if ((!anon_vma1 || !anon_vma2) && (!vma ||
- list_is_singular(&vma->anon_vma_chain)))
- return true;
- return anon_vma1 == anon_vma2;
-}
+ VM_WARN_ON(src && src_anon != src->anon_vma);
-/* Are the anon_vma's belonging to each VMA compatible with one another? */
-static inline bool are_anon_vmas_compatible(struct vm_area_struct *vma1,
- struct vm_area_struct *vma2)
-{
- return is_mergeable_anon_vma(vma1->anon_vma, vma2->anon_vma, NULL);
+ /* Case 1 - we will dup_anon_vma() from src into tgt. */
+ if (!tgt_anon && src_anon)
+ return !vma_had_uncowed_parents(src);
+ /* Case 2 - we will simply use tgt's anon_vma. */
+ if (tgt_anon && !src_anon)
+ return !vma_had_uncowed_parents(tgt);
+ /* Case 3 - the anon_vma's are already shared. */
+ return src_anon == tgt_anon;
}
/*
@@ -144,6 +172,9 @@ static void init_multi_vma_prep(struct vma_prepare *vp,
vp->file = vma->vm_file;
if (vp->file)
vp->mapping = vma->vm_file->f_mapping;
+
+ if (vmg && vmg->skip_vma_uprobe)
+ vp->skip_vma_uprobe = true;
}
/*
@@ -164,7 +195,7 @@ static bool can_vma_merge_before(struct vma_merge_struct *vmg)
pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
if (is_mergeable_vma(vmg, /* merge_next = */ true) &&
- is_mergeable_anon_vma(vmg->anon_vma, vmg->next->anon_vma, vmg->next)) {
+ is_mergeable_anon_vma(vmg, /* merge_next = */ true)) {
if (vmg->next->vm_pgoff == vmg->pgoff + pglen)
return true;
}
@@ -184,7 +215,7 @@ static bool can_vma_merge_before(struct vma_merge_struct *vmg)
static bool can_vma_merge_after(struct vma_merge_struct *vmg)
{
if (is_mergeable_vma(vmg, /* merge_next = */ false) &&
- is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
+ is_mergeable_anon_vma(vmg, /* merge_next = */ false)) {
if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff)
return true;
}
@@ -333,10 +364,13 @@ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi,
if (vp->file) {
i_mmap_unlock_write(vp->mapping);
- uprobe_mmap(vp->vma);
- if (vp->adj_next)
- uprobe_mmap(vp->adj_next);
+ if (!vp->skip_vma_uprobe) {
+ uprobe_mmap(vp->vma);
+
+ if (vp->adj_next)
+ uprobe_mmap(vp->adj_next);
+ }
}
if (vp->remove) {
@@ -400,8 +434,10 @@ static bool can_vma_merge_left(struct vma_merge_struct *vmg)
static bool can_vma_merge_right(struct vma_merge_struct *vmg,
bool can_merge_left)
{
- if (!vmg->next || vmg->end != vmg->next->vm_start ||
- !can_vma_merge_before(vmg))
+ struct vm_area_struct *next = vmg->next;
+ struct vm_area_struct *prev;
+
+ if (!next || vmg->end != next->vm_start || !can_vma_merge_before(vmg))
return false;
if (!can_merge_left)
@@ -414,7 +450,9 @@ static bool can_vma_merge_right(struct vma_merge_struct *vmg,
*
* We therefore check this in addition to mergeability to either side.
*/
- return are_anon_vmas_compatible(vmg->prev, vmg->next);
+ prev = vmg->prev;
+ return !prev->anon_vma || !next->anon_vma ||
+ prev->anon_vma == next->anon_vma;
}
/*
@@ -510,7 +548,14 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
init_vma_prep(&vp, vma);
vp.insert = new;
vma_prepare(&vp);
+
+ /*
+ * Get rid of huge pages and shared page tables straddling the split
+ * boundary.
+ */
vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL);
+ if (is_vm_hugetlb_page(vma))
+ hugetlb_split(vma, addr);
if (new_below) {
vma->vm_start = addr;
@@ -554,7 +599,9 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
}
/*
- * dup_anon_vma() - Helper function to duplicate anon_vma
+ * dup_anon_vma() - Helper function to duplicate anon_vma on VMA merge in the
+ * instance that the destination VMA has no anon_vma but the source does.
+ *
* @dst: The destination VMA
* @src: The source VMA
* @dup: Pointer to the destination VMA when successful.
@@ -565,9 +612,18 @@ static int dup_anon_vma(struct vm_area_struct *dst,
struct vm_area_struct *src, struct vm_area_struct **dup)
{
/*
- * Easily overlooked: when mprotect shifts the boundary, make sure the
- * expanding vma has anon_vma set if the shrinking vma had, to cover any
- * anon pages imported.
+ * There are three cases to consider for correctly propagating
+ * anon_vmas on merge.
+ *
+ * The first is trivial: neither VMA has an anon_vma, so we need not do
+ * anything.
+ *
+ * The second, where both have an anon_vma, is also a no-op, as they must
+ * then be the same, so there is simply nothing to copy.
+ *
+ * Here we cover the third: if the destination VMA has no anon_vma, that
+ * is, it is unfaulted, we need to ensure that the newly merged range is
+ * referenced by the anon_vmas of the source.
*/
if (src->anon_vma && !dst->anon_vma) {
int ret;
@@ -787,7 +843,7 @@ static __must_check struct vm_area_struct *vma_merge_existing_range(
* furthermost left or right side of the VMA, then we have no chance of
* merging and should abort.
*/
- if (vmg->flags & VM_SPECIAL || (!left_side && !right_side))
+ if (vmg->vm_flags & VM_SPECIAL || (!left_side && !right_side))
return NULL;
if (left_side)
@@ -914,27 +970,10 @@ static __must_check struct vm_area_struct *vma_merge_existing_range(
err = dup_anon_vma(next, middle, &anon_dup);
}
- if (err)
+ if (err || commit_merge(vmg))
goto abort;
- err = commit_merge(vmg);
- if (err) {
- VM_WARN_ON(err != -ENOMEM);
-
- if (anon_dup)
- unlink_anon_vmas(anon_dup);
-
- /*
- * We've cleaned up any cloned anon_vma's, no VMAs have been
- * modified, no harm no foul if the user requests that we not
- * report this and just give up, leaving the VMAs unmerged.
- */
- if (!vmg->give_up_on_oom)
- vmg->state = VMA_MERGE_ERROR_NOMEM;
- return NULL;
- }
-
- khugepaged_enter_vma(vmg->target, vmg->flags);
+ khugepaged_enter_vma(vmg->target, vmg->vm_flags);
vmg->state = VMA_MERGE_SUCCESS;
return vmg->target;
@@ -942,6 +981,9 @@ abort:
vma_iter_set(vmg->vmi, start);
vma_iter_load(vmg->vmi);
+ if (anon_dup)
+ unlink_anon_vmas(anon_dup);
+
/*
* This means we have failed to clone anon_vma's correctly, but no
* actual changes to VMAs have occurred, so no harm no foul - if the
@@ -1006,13 +1048,14 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
mmap_assert_write_locked(vmg->mm);
VM_WARN_ON_VMG(vmg->middle, vmg);
+ VM_WARN_ON_VMG(vmg->target, vmg);
/* vmi must point at or before the gap. */
VM_WARN_ON_VMG(vma_iter_addr(vmg->vmi) > end, vmg);
vmg->state = VMA_MERGE_NOMERGE;
/* Special VMAs are unmergeable, also if no prev/next. */
- if ((vmg->flags & VM_SPECIAL) || (!prev && !next))
+ if ((vmg->vm_flags & VM_SPECIAL) || (!prev && !next))
return NULL;
can_merge_left = can_vma_merge_left(vmg);
@@ -1021,13 +1064,13 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
/* If we can merge with the next VMA, adjust vmg accordingly. */
if (can_merge_right) {
vmg->end = next->vm_end;
- vmg->middle = next;
+ vmg->target = next;
}
/* If we can merge with the previous VMA, adjust vmg accordingly. */
if (can_merge_left) {
vmg->start = prev->vm_start;
- vmg->middle = prev;
+ vmg->target = prev;
vmg->pgoff = prev->vm_pgoff;
/*
@@ -1049,10 +1092,10 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
* Now try to expand adjacent VMA(s). This takes care of removing the
* following VMA if we have VMAs on both sides.
*/
- if (vmg->middle && !vma_expand(vmg)) {
- khugepaged_enter_vma(vmg->middle, vmg->flags);
+ if (vmg->target && !vma_expand(vmg)) {
+ khugepaged_enter_vma(vmg->target, vmg->vm_flags);
vmg->state = VMA_MERGE_SUCCESS;
- return vmg->middle;
+ return vmg->target;
}
return NULL;
@@ -1064,27 +1107,29 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
* @vmg: Describes a VMA expansion operation.
*
* Expand @vma to vmg->start and vmg->end. Can expand off the start and end.
- * Will expand over vmg->next if it's different from vmg->middle and vmg->end ==
- * vmg->next->vm_end. Checking if the vmg->middle can expand and merge with
+ * Will expand over vmg->next if it's different from vmg->target and vmg->end ==
+ * vmg->next->vm_end. Checking if the vmg->target can expand and merge with
* vmg->next needs to be handled by the caller.
*
* Returns: 0 on success.
*
* ASSUMPTIONS:
- * - The caller must hold a WRITE lock on vmg->middle->mm->mmap_lock.
- * - The caller must have set @vmg->middle and @vmg->next.
+ * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
+ * - The caller must have set @vmg->target and @vmg->next.
*/
int vma_expand(struct vma_merge_struct *vmg)
{
struct vm_area_struct *anon_dup = NULL;
bool remove_next = false;
- struct vm_area_struct *middle = vmg->middle;
+ struct vm_area_struct *target = vmg->target;
struct vm_area_struct *next = vmg->next;
+ VM_WARN_ON_VMG(!target, vmg);
+
mmap_assert_write_locked(vmg->mm);
- vma_start_write(middle);
- if (next && (middle != next) && (vmg->end == next->vm_end)) {
+ vma_start_write(target);
+ if (next && (target != next) && (vmg->end == next->vm_end)) {
int ret;
remove_next = true;
@@ -1095,19 +1140,18 @@ int vma_expand(struct vma_merge_struct *vmg)
* In this case we don't report OOM, so vmg->give_up_on_mm is
* safe.
*/
- ret = dup_anon_vma(middle, next, &anon_dup);
+ ret = dup_anon_vma(target, next, &anon_dup);
if (ret)
return ret;
}
/* Not merging but overwriting any part of next is not handled. */
VM_WARN_ON_VMG(next && !remove_next &&
- next != middle && vmg->end > next->vm_start, vmg);
+ next != target && vmg->end > next->vm_start, vmg);
/* Only handles expanding */
- VM_WARN_ON_VMG(middle->vm_start < vmg->start ||
- middle->vm_end > vmg->end, vmg);
+ VM_WARN_ON_VMG(target->vm_start < vmg->start ||
+ target->vm_end > vmg->end, vmg);
- vmg->target = middle;
if (remove_next)
vmg->__remove_next = true;
@@ -1307,7 +1351,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
}
/* Don't bother splitting the VMA if we can't unmap it anyway */
- if (!can_modify_vma(vms->vma)) {
+ if (vma_is_sealed(vms->vma)) {
error = -EPERM;
goto start_split_failed;
}
@@ -1327,7 +1371,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
for_each_vma_range(*(vms->vmi), next, vms->end) {
long nrpages;
- if (!can_modify_vma(next)) {
+ if (vma_is_sealed(next)) {
error = -EPERM;
goto modify_vma_failed;
}
@@ -1596,27 +1640,25 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
struct vm_area_struct *vma_modify_flags(
struct vma_iterator *vmi, struct vm_area_struct *prev,
struct vm_area_struct *vma, unsigned long start, unsigned long end,
- unsigned long new_flags)
+ vm_flags_t vm_flags)
{
VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
- vmg.flags = new_flags;
+ vmg.vm_flags = vm_flags;
return vma_modify(&vmg);
}
struct vm_area_struct
-*vma_modify_flags_name(struct vma_iterator *vmi,
+*vma_modify_name(struct vma_iterator *vmi,
struct vm_area_struct *prev,
struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
- unsigned long new_flags,
struct anon_vma_name *new_name)
{
VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
- vmg.flags = new_flags;
vmg.anon_name = new_name;
return vma_modify(&vmg);
@@ -1641,13 +1683,13 @@ struct vm_area_struct
struct vm_area_struct *prev,
struct vm_area_struct *vma,
unsigned long start, unsigned long end,
- unsigned long new_flags,
+ vm_flags_t vm_flags,
struct vm_userfaultfd_ctx new_ctx,
bool give_up_on_oom)
{
VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
- vmg.flags = new_flags;
+ vmg.vm_flags = vm_flags;
vmg.uffd_ctx = new_ctx;
if (give_up_on_oom)
vmg.give_up_on_oom = true;
@@ -1783,6 +1825,14 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
faulted_in_anon_vma = false;
}
+ /*
+ * If the VMA we are copying might contain a uprobe PTE, ensure
+ * that we do not establish one upon merge. Otherwise, when mremap()
+ * moves page tables, it will orphan the newly created PTE.
+ */
+ if (vma->vm_file)
+ vmg.skip_vma_uprobe = true;
+
new_vma = find_vma_prev(mm, addr, &vmg.prev);
if (new_vma && new_vma->vm_start < addr + len)
return NULL; /* should never get here */
@@ -1834,6 +1884,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
return new_vma;
out_vma_link:
+ fixup_hugetlb_reservations(new_vma);
vma_close(new_vma);
if (new_vma->vm_file)
@@ -2272,6 +2323,11 @@ static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms,
vms_complete_munmap_vmas(vms, mas_detach);
}
+static void update_ksm_flags(struct mmap_state *map)
+{
+ map->vm_flags = ksm_vma_flags(map->mm, map->file, map->vm_flags);
+}
+
/*
* __mmap_prepare() - Prepare to gather any overlapping VMAs that need to be
* unmapped once the map operation is completed, check limits, account mapping
@@ -2314,11 +2370,11 @@ static int __mmap_prepare(struct mmap_state *map, struct list_head *uf)
}
/* Check against address space limit. */
- if (!may_expand_vm(map->mm, map->flags, map->pglen - vms->nr_pages))
+ if (!may_expand_vm(map->mm, map->vm_flags, map->pglen - vms->nr_pages))
return -ENOMEM;
/* Private writable mapping: check memory availability. */
- if (accountable_mapping(map->file, map->flags)) {
+ if (accountable_mapping(map->file, map->vm_flags)) {
map->charged = map->pglen;
map->charged -= vms->nr_accounted;
if (map->charged) {
@@ -2328,7 +2384,7 @@ static int __mmap_prepare(struct mmap_state *map, struct list_head *uf)
}
vms->nr_accounted = 0;
- map->flags |= VM_ACCOUNT;
+ map->vm_flags |= VM_ACCOUNT;
}
/*
@@ -2350,6 +2406,10 @@ static int __mmap_new_file_vma(struct mmap_state *map,
int error;
vma->vm_file = get_file(map->file);
+
+ if (!map->file->f_op->mmap)
+ return 0;
+
error = mmap_file(vma->vm_file, vma);
if (error) {
fput(vma->vm_file);
@@ -2368,13 +2428,12 @@ static int __mmap_new_file_vma(struct mmap_state *map,
* Drivers should not permit writability when previously it was
* disallowed.
*/
- VM_WARN_ON_ONCE(map->flags != vma->vm_flags &&
- !(map->flags & VM_MAYWRITE) &&
+ VM_WARN_ON_ONCE(map->vm_flags != vma->vm_flags &&
+ !(map->vm_flags & VM_MAYWRITE) &&
(vma->vm_flags & VM_MAYWRITE));
- /* If the flags change (and are mergeable), let's retry later. */
- map->retry_merge = vma->vm_flags != map->flags && !(vma->vm_flags & VM_SPECIAL);
- map->flags = vma->vm_flags;
+ map->file = vma->vm_file;
+ map->vm_flags = vma->vm_flags;
return 0;
}
@@ -2405,8 +2464,8 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
vma_iter_config(vmi, map->addr, map->end);
vma_set_range(vma, map->addr, map->end, map->pgoff);
- vm_flags_init(vma, map->flags);
- vma->vm_page_prot = vm_get_page_prot(map->flags);
+ vm_flags_init(vma, map->vm_flags);
+ vma->vm_page_prot = map->page_prot;
if (vma_iter_prealloc(vmi, vma)) {
error = -ENOMEM;
@@ -2415,7 +2474,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
if (map->file)
error = __mmap_new_file_vma(map, vma);
- else if (map->flags & VM_SHARED)
+ else if (map->vm_flags & VM_SHARED)
error = shmem_zero_setup(vma);
else
vma_set_anonymous(vma);
@@ -2423,9 +2482,14 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
if (error)
goto free_iter_vma;
+ if (!map->check_ksm_early) {
+ update_ksm_flags(map);
+ vm_flags_init(vma, map->vm_flags);
+ }
+
#ifdef CONFIG_SPARC64
/* TODO: Fix SPARC ADI! */
- WARN_ON_ONCE(!arch_validate_flags(map->flags));
+ WARN_ON_ONCE(!arch_validate_flags(map->vm_flags));
#endif
/* Lock the VMA since it is modified after insertion into VMA tree */
@@ -2439,8 +2503,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
* call covers the non-merge case.
*/
if (!vma_is_anonymous(vma))
- khugepaged_enter_vma(vma, map->flags);
- ksm_add_vma(vma);
+ khugepaged_enter_vma(vma, map->vm_flags);
*vmap = vma;
return 0;
@@ -2461,7 +2524,7 @@ free_vma:
static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma)
{
struct mm_struct *mm = map->mm;
- unsigned long vm_flags = vma->vm_flags;
+ vm_flags_t vm_flags = vma->vm_flags;
perf_event_mmap(vma);
@@ -2493,6 +2556,86 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma)
vma_set_page_prot(vma);
}
+/*
+ * Invoke the f_op->mmap_prepare() callback for a file-backed mapping that
+ * specifies it.
+ *
+ * This is called prior to any merge attempt, and updates whitelisted fields
+ * that are permitted to be updated by the caller.
+ *
+ * All but user-defined fields will be pre-populated with original values.
+ *
+ * Returns 0 on success, or an error code otherwise.
+ */
+static int call_mmap_prepare(struct mmap_state *map)
+{
+ int err;
+ struct vm_area_desc desc = {
+ .mm = map->mm,
+ .file = map->file,
+ .start = map->addr,
+ .end = map->end,
+
+ .pgoff = map->pgoff,
+ .vm_file = map->file,
+ .vm_flags = map->vm_flags,
+ .page_prot = map->page_prot,
+ };
+
+ /* Invoke the hook. */
+ err = vfs_mmap_prepare(map->file, &desc);
+ if (err)
+ return err;
+
+ /* Update fields permitted to be changed. */
+ map->pgoff = desc.pgoff;
+ map->file = desc.vm_file;
+ map->vm_flags = desc.vm_flags;
+ map->page_prot = desc.page_prot;
+ /* User-defined fields. */
+ map->vm_ops = desc.vm_ops;
+ map->vm_private_data = desc.private_data;
+
+ return 0;
+}
+
+static void set_vma_user_defined_fields(struct vm_area_struct *vma,
+ struct mmap_state *map)
+{
+ if (map->vm_ops)
+ vma->vm_ops = map->vm_ops;
+ vma->vm_private_data = map->vm_private_data;
+}
+
+/*
+ * Are we guaranteed no driver can change state such as to preclude KSM merging?
+ * If so, let's set the KSM mergeable flag early so we don't break VMA merging.
+ */
+static bool can_set_ksm_flags_early(struct mmap_state *map)
+{
+ struct file *file = map->file;
+
+ /* Anonymous mappings have no driver which can change them. */
+ if (!file)
+ return true;
+
+ /*
+ * If .mmap_prepare() is specified, then the driver will have already
+ * manipulated state prior to updating KSM flags. So no need to worry
+ * about mmap callbacks modifying VMA flags after the KSM flag has been
+ * updated here, which could otherwise affect KSM eligibility.
+ */
+ if (file->f_op->mmap_prepare)
+ return true;
+
+ /* shmem is safe. */
+ if (shmem_file(file))
+ return true;
+
+ /* Any other .mmap callback is not safe. */
+ return false;
+}
+
static unsigned long __mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)
@@ -2500,13 +2643,21 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
int error;
+ bool have_mmap_prepare = file && file->f_op->mmap_prepare;
VMA_ITERATOR(vmi, mm, addr);
MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file);
+ map.check_ksm_early = can_set_ksm_flags_early(&map);
+
error = __mmap_prepare(&map, uf);
+ if (!error && have_mmap_prepare)
+ error = call_mmap_prepare(&map);
if (error)
goto abort_munmap;
+ if (map.check_ksm_early)
+ update_ksm_flags(&map);
+
/* Attempt to merge with adjacent VMAs... */
if (map.prev || map.next) {
VMG_MMAP_STATE(vmg, &map, /* vma = */ NULL);
@@ -2521,16 +2672,8 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
goto unacct_error;
}
- /* If flags changed, we might be able to merge, so try again. */
- if (map.retry_merge) {
- struct vm_area_struct *merged;
- VMG_MMAP_STATE(vmg, &map, vma);
-
- vma_iter_config(map.vmi, map.addr, map.end);
- merged = vma_merge_existing_range(&vmg);
- if (merged)
- vma = merged;
- }
+ if (have_mmap_prepare)
+ set_vma_user_defined_fields(vma, &map);
__mmap_complete(&map, vma);
@@ -2610,14 +2753,14 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
* @addr: The start address
* @len: The length of the increase
* @vma: The vma,
- * @flags: The VMA Flags
+ * @vm_flags: The VMA Flags
*
* Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags
* do not match then create a new anonymous VMA. Eventually we may be able to
* do some brk-specific accounting here.
*/
int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long addr, unsigned long len, unsigned long flags)
+ unsigned long addr, unsigned long len, vm_flags_t vm_flags)
{
struct mm_struct *mm = current->mm;
@@ -2625,8 +2768,9 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
* Check against address space limits by the changed size
* Note: This happens *after* clearing old mappings in some code paths.
*/
- flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
- if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
+ vm_flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
+ vm_flags = ksm_vma_flags(mm, NULL, vm_flags);
+ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT))
return -ENOMEM;
if (mm->map_count > sysctl_max_map_count)
@@ -2640,7 +2784,7 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
* occur after forking, so the expand will only happen on new VMAs.
*/
if (vma && vma->vm_end == addr) {
- VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr));
+ VMG_STATE(vmg, mm, vmi, addr, addr + len, vm_flags, PHYS_PFN(addr));
vmg.prev = vma;
/* vmi is positioned at prev, which this mode expects. */
@@ -2661,20 +2805,19 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
vma_set_anonymous(vma);
vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT);
- vm_flags_init(vma, flags);
- vma->vm_page_prot = vm_get_page_prot(flags);
+ vm_flags_init(vma, vm_flags);
+ vma->vm_page_prot = vm_get_page_prot(vm_flags);
vma_start_write(vma);
if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL))
goto mas_store_fail;
mm->map_count++;
validate_mm(mm);
- ksm_add_vma(vma);
out:
perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
mm->data_vm += len >> PAGE_SHIFT;
- if (flags & VM_LOCKED)
+ if (vm_flags & VM_LOCKED)
mm->locked_vm += (len >> PAGE_SHIFT);
vm_flags_set(vma, VM_SOFTDIRTY);
return 0;
@@ -3017,3 +3160,45 @@ int __vm_munmap(unsigned long start, size_t len, bool unlock)
userfaultfd_unmap_complete(mm, &uf);
return ret;
}
+
+/* Insert vm structure into process list sorted by address
+ * and into the inode's i_mmap tree. If vm_file is non-NULL
+ * then i_mmap_rwsem is taken here.
+ */
+int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ unsigned long charged = vma_pages(vma);
+
+ if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
+ return -ENOMEM;
+
+ if ((vma->vm_flags & VM_ACCOUNT) &&
+ security_vm_enough_memory_mm(mm, charged))
+ return -ENOMEM;
+
+ /*
+ * The vm_pgoff of a purely anonymous vma should be irrelevant
+ * until its first write fault, when page's anon_vma and index
+ * are set. But now set the vm_pgoff it will almost certainly
+ * end up with (unless mremap moves it elsewhere before that
+ * first write fault), so /proc/pid/maps tells a consistent story.
+ *
+ * By setting it to reflect the virtual start address of the
+ * vma, merges and splits can happen in a seamless way, just
+ * using the existing file pgoff checks and manipulations.
+ * Similarly in do_mmap and in do_brk_flags.
+ */
+ if (vma_is_anonymous(vma)) {
+ BUG_ON(vma->anon_vma);
+ vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
+ }
+
+ if (vma_link(mm, vma)) {
+ if (vma->vm_flags & VM_ACCOUNT)
+ vm_unacct_memory(charged);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
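
The pgoff convention described in the comment above, as a standalone arithmetic sketch (addresses and sizes are invented, not taken from the patch): setting an anonymous VMA's pgoff from its start address makes neighbouring anonymous VMAs pass the same pgoff-adjacency rule used for file-backed merging.

#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1ULL << PAGE_SHIFT)

/* The convention set above: an anonymous VMA's pgoff mirrors its start. */
static unsigned long long anon_pgoff(unsigned long long vm_start)
{
	return vm_start >> PAGE_SHIFT;
}

int main(void)
{
	/* Two invented, virtually adjacent anonymous VMAs. */
	unsigned long long a_start = 0x7f0000000000ULL;
	unsigned long long a_end   = a_start + 4 * PAGE_SIZE;
	unsigned long long b_start = a_end;

	/*
	 * The adjacency rule file-backed merging relies on: the next VMA's
	 * pgoff equals the previous VMA's pgoff plus its length in pages.
	 */
	assert(anon_pgoff(b_start) ==
	       anon_pgoff(a_start) + ((a_end - a_start) >> PAGE_SHIFT));

	printf("pgoff(a)=%llu pgoff(b)=%llu -> merge compatible\n",
	       anon_pgoff(a_start), anon_pgoff(b_start));
	return 0;
}
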
diff --git a/mm/vma.h b/mm/vma.h
index 149926e8a6d1..9183fe549009 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -19,6 +19,8 @@ struct vma_prepare {
struct vm_area_struct *insert;
struct vm_area_struct *remove;
struct vm_area_struct *remove2;
+
+ bool skip_vma_uprobe :1;
};
struct unlink_vma_file_batch {
@@ -96,7 +98,7 @@ struct vma_merge_struct {
unsigned long end;
pgoff_t pgoff;
- unsigned long flags;
+ vm_flags_t vm_flags;
struct file *file;
struct anon_vma *anon_vma;
struct mempolicy *policy;
@@ -120,6 +122,11 @@ struct vma_merge_struct {
*/
bool give_up_on_oom :1;
+ /*
+ * If set, skip uprobe_mmap upon merged vma.
+ */
+ bool skip_vma_uprobe :1;
+
/* Internal flags set during merge process: */
/*
@@ -138,7 +145,7 @@ struct vma_merge_struct {
*/
bool __remove_middle :1;
/*
- * Internal flag used during the merge operationr to indicate we will
+ * Internal flag used during the merge operation to indicate we will
* remove vmg->next.
*/
bool __remove_next :1;
@@ -157,13 +164,13 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start);
}
-#define VMG_STATE(name, mm_, vmi_, start_, end_, flags_, pgoff_) \
+#define VMG_STATE(name, mm_, vmi_, start_, end_, vm_flags_, pgoff_) \
struct vma_merge_struct name = { \
.mm = mm_, \
.vmi = vmi_, \
.start = start_, \
.end = end_, \
- .flags = flags_, \
+ .vm_flags = vm_flags_, \
.pgoff = pgoff_, \
.state = VMA_MERGE_START, \
}
@@ -177,7 +184,7 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
.next = NULL, \
.start = start_, \
.end = end_, \
- .flags = vma_->vm_flags, \
+ .vm_flags = vma_->vm_flags, \
.pgoff = vma_pgoff_offset(vma_, start_), \
.file = vma_->vm_file, \
.anon_vma = vma_->anon_vma, \
@@ -215,6 +222,33 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
return 0;
}
+/*
+ * Temporary helper function for stacked mmap handlers which specify
+ * f_op->mmap() but whose underlying file system might implement
+ * f_op->mmap_prepare().
+ */
+static inline void set_vma_from_desc(struct vm_area_struct *vma,
+ struct vm_area_desc *desc)
+{
+ /*
+ * Since we're invoking .mmap_prepare() despite having a partially
+ * established VMA, we must take care to handle setting fields
+ * correctly.
+ */
+
+ /* Mutable fields. Populated with initial state. */
+ vma->vm_pgoff = desc->pgoff;
+ if (desc->vm_file != vma->vm_file)
+ vma_set_file(vma, desc->vm_file);
+ if (desc->vm_flags != vma->vm_flags)
+ vm_flags_set(vma, desc->vm_flags);
+ vma->vm_page_prot = desc->page_prot;
+
+ /* User-defined fields. */
+ vma->vm_ops = desc->vm_ops;
+ vma->vm_private_data = desc->private_data;
+}
+
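
A rough illustration of the call pattern this helper is meant for: a stacked ->mmap() handler forwarding to a backing file that implements ->mmap_prepare(). Only set_vma_from_desc() and the desc fields it copies come from this patch; get_backing_file() and init_desc_from_vma() are hypothetical placeholders, and the sketch assumes the hook takes the descriptor shown above.

/*
 * Hypothetical stacked ->mmap() handler (not part of this patch).
 */
static int stacked_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct file *backing = get_backing_file(file);	/* hypothetical */
	struct vm_area_desc desc;
	int err;

	/* Seed the descriptor from the partially set-up VMA. */
	init_desc_from_vma(&desc, vma, backing);	/* hypothetical */

	err = backing->f_op->mmap_prepare(&desc);
	if (err)
		return err;

	/* Copy any changes the underlying filesystem made back into the VMA. */
	set_vma_from_desc(vma, &desc);
	return 0;
}
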
int
do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct mm_struct *mm, unsigned long start,
@@ -234,17 +268,16 @@ __must_check struct vm_area_struct
*vma_modify_flags(struct vma_iterator *vmi,
struct vm_area_struct *prev, struct vm_area_struct *vma,
unsigned long start, unsigned long end,
- unsigned long new_flags);
+ vm_flags_t vm_flags);
-/* We are about to modify the VMA's flags and/or anon_name. */
+/* We are about to modify the VMA's anon_name. */
__must_check struct vm_area_struct
-*vma_modify_flags_name(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
- unsigned long start,
- unsigned long end,
- unsigned long new_flags,
- struct anon_vma_name *new_name);
+*vma_modify_name(struct vma_iterator *vmi,
+ struct vm_area_struct *prev,
+ struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end,
+ struct anon_vma_name *new_name);
/* We are about to modify the VMA's memory policy. */
__must_check struct vm_area_struct
@@ -260,7 +293,7 @@ __must_check struct vm_area_struct
struct vm_area_struct *prev,
struct vm_area_struct *vma,
unsigned long start, unsigned long end,
- unsigned long new_flags,
+ vm_flags_t vm_flags,
struct vm_userfaultfd_ctx new_ctx,
bool give_up_on_oom);
@@ -321,7 +354,7 @@ static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma
}
#ifdef CONFIG_MMU
-static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
+static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, vm_flags_t vm_flags)
{
return pgprot_modify(oldprot, vm_get_page_prot(vm_flags));
}
@@ -506,38 +539,15 @@ struct vm_area_struct *vma_iter_next_rewind(struct vma_iterator *vmi,
}
#ifdef CONFIG_64BIT
-
static inline bool vma_is_sealed(struct vm_area_struct *vma)
{
return (vma->vm_flags & VM_SEALED);
}
-
-/*
- * check if a vma is sealed for modification.
- * return true, if modification is allowed.
- */
-static inline bool can_modify_vma(struct vm_area_struct *vma)
-{
- if (unlikely(vma_is_sealed(vma)))
- return false;
-
- return true;
-}
-
-bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior);
-
#else
-
-static inline bool can_modify_vma(struct vm_area_struct *vma)
-{
- return true;
-}
-
-static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
+static inline bool vma_is_sealed(struct vm_area_struct *vma)
{
- return true;
+ return false;
}
-
#endif
#if defined(CONFIG_STACK_GROWSUP)
@@ -548,4 +558,19 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address);
int __vm_munmap(unsigned long start, size_t len, bool unlock);
+int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma);
+
+/* vma_init.h, shared between CONFIG_MMU and nommu. */
+void __init vma_state_init(void);
+struct vm_area_struct *vm_area_alloc(struct mm_struct *mm);
+struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig);
+void vm_area_free(struct vm_area_struct *vma);
+
+/* vma_exec.c */
+#ifdef CONFIG_MMU
+int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
+ unsigned long *top_mem_p);
+int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
+#endif
+
#endif /* __MM_VMA_H */
diff --git a/mm/vma_exec.c b/mm/vma_exec.c
new file mode 100644
index 000000000000..922ee51747a6
--- /dev/null
+++ b/mm/vma_exec.c
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Functions explicitly implemented for exec functionality which however are
+ * explicitly VMA-only logic.
+ */
+
+#include "vma_internal.h"
+#include "vma.h"
+
+/*
+ * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between
+ * this VMA and its relocated range, which will now reside at [vma->vm_start -
+ * shift, vma->vm_end - shift).
+ *
+ * This function is almost certainly NOT what you want for anything other than
+ * early executable temporary stack relocation.
+ */
+int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
+{
+ /*
+ * The process proceeds as follows:
+ *
+ * 1) Use shift to calculate the new vma endpoints.
+ * 2) Extend vma to cover both the old and new ranges. This ensures the
+ * arguments passed to subsequent functions are consistent.
+ * 3) Move vma's page tables to the new range.
+ * 4) Free up any cleared pgd range.
+ * 5) Shrink the vma to cover only the new range.
+ */
+
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long old_start = vma->vm_start;
+ unsigned long old_end = vma->vm_end;
+ unsigned long length = old_end - old_start;
+ unsigned long new_start = old_start - shift;
+ unsigned long new_end = old_end - shift;
+ VMA_ITERATOR(vmi, mm, new_start);
+ VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff);
+ struct vm_area_struct *next;
+ struct mmu_gather tlb;
+ PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length);
+
+ BUG_ON(new_start > new_end);
+
+ /*
+ * ensure there are no vmas between where we want to go
+ * and where we are
+ */
+ if (vma != vma_next(&vmi))
+ return -EFAULT;
+
+ vma_iter_prev_range(&vmi);
+ /*
+ * cover the whole range: [new_start, old_end)
+ */
+ vmg.target = vma;
+ if (vma_expand(&vmg))
+ return -ENOMEM;
+
+ /*
+ * move the page tables downwards; on failure we rely on
+ * process cleanup to remove whatever mess we made.
+ */
+ pmc.for_stack = true;
+ if (length != move_page_tables(&pmc))
+ return -ENOMEM;
+
+ tlb_gather_mmu(&tlb, mm);
+ next = vma_next(&vmi);
+ if (new_end > old_start) {
+ /*
+ * when the old and new regions overlap, clear from new_end.
+ */
+ free_pgd_range(&tlb, new_end, old_end, new_end,
+ next ? next->vm_start : USER_PGTABLES_CEILING);
+ } else {
+ /*
+ * otherwise, clean from old_start; this is done to not touch
+ * the address space in [new_end, old_start), as some architectures
+ * have constraints on va-space that make this illegal (IA64);
+ * for the others it's just a little faster.
+ */
+ free_pgd_range(&tlb, old_start, old_end, new_end,
+ next ? next->vm_start : USER_PGTABLES_CEILING);
+ }
+ tlb_finish_mmu(&tlb);
+
+ vma_prev(&vmi);
+ /* Shrink the vma to just the new range */
+ return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
+}
+
+/*
+ * Establish the stack VMA in an execve'd process, located temporarily at the
+ * maximum stack address provided by the architecture.
+ *
+ * We later relocate this downwards in relocate_vma_down().
+ *
+ * This function is almost certainly NOT what you want for anything other than
+ * early executable initialisation.
+ *
+ * On success, returns 0 and sets *vmap to the stack VMA and *top_mem_p to the
+ * maximum addressable location in the stack (that is capable of storing a
+ * system word of data).
+ */
+int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
+ unsigned long *top_mem_p)
+{
+ int err;
+ struct vm_area_struct *vma = vm_area_alloc(mm);
+
+ if (!vma)
+ return -ENOMEM;
+
+ vma_set_anonymous(vma);
+
+ if (mmap_write_lock_killable(mm)) {
+ err = -EINTR;
+ goto err_free;
+ }
+
+ /*
+ * ksm_execve() needs to be called with the mmap write
+ * lock held, to avoid racing with ksmd.
+ */
+ err = ksm_execve(mm);
+ if (err)
+ goto err_ksm;
+
+ /*
+ * Place the stack at the largest stack address the architecture
+ * supports. Later, we'll move this to an appropriate place. We don't
+ * use STACK_TOP because that can depend on attributes which aren't
+ * configured yet.
+ */
+ BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
+ vma->vm_end = STACK_TOP_MAX;
+ vma->vm_start = vma->vm_end - PAGE_SIZE;
+ vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP);
+ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+
+ err = insert_vm_struct(mm, vma);
+ if (err)
+ goto err;
+
+ mm->stack_vm = mm->total_vm = 1;
+ mmap_write_unlock(mm);
+ *vmap = vma;
+ *top_mem_p = vma->vm_end - sizeof(void *);
+ return 0;
+
+err:
+ ksm_exit(mm);
+err_ksm:
+ mmap_write_unlock(mm);
+err_free:
+ *vmap = NULL;
+ vm_area_free(vma);
+ return err;
+}
diff --git a/mm/vma_init.c b/mm/vma_init.c
new file mode 100644
index 000000000000..3c0b65950510
--- /dev/null
+++ b/mm/vma_init.c
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/*
+ * Functions for initializing, allocating, freeing and duplicating VMAs. Shared
+ * between CONFIG_MMU and non-CONFIG_MMU kernel configurations.
+ */
+
+#include "vma_internal.h"
+#include "vma.h"
+
+/* SLAB cache for vm_area_struct structures */
+static struct kmem_cache *vm_area_cachep;
+
+void __init vma_state_init(void)
+{
+ struct kmem_cache_args args = {
+ .use_freeptr_offset = true,
+ .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
+ .sheaf_capacity = 32,
+ };
+
+ vm_area_cachep = kmem_cache_create("vm_area_struct",
+ sizeof(struct vm_area_struct), &args,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
+ SLAB_ACCOUNT);
+}
+
+struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+
+ vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+ if (!vma)
+ return NULL;
+
+ vma_init(vma, mm);
+
+ return vma;
+}
+
+static void vm_area_init_from(const struct vm_area_struct *src,
+ struct vm_area_struct *dest)
+{
+ dest->vm_mm = src->vm_mm;
+ dest->vm_ops = src->vm_ops;
+ dest->vm_start = src->vm_start;
+ dest->vm_end = src->vm_end;
+ dest->anon_vma = src->anon_vma;
+ dest->vm_pgoff = src->vm_pgoff;
+ dest->vm_file = src->vm_file;
+ dest->vm_private_data = src->vm_private_data;
+ vm_flags_init(dest, src->vm_flags);
+ memcpy(&dest->vm_page_prot, &src->vm_page_prot,
+ sizeof(dest->vm_page_prot));
+ /*
+ * src->shared.rb may be modified concurrently when called from
+ * dup_mmap(), but the clone will reinitialize it.
+ */
+ data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
+ memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
+ sizeof(dest->vm_userfaultfd_ctx));
+#ifdef CONFIG_ANON_VMA_NAME
+ dest->anon_name = src->anon_name;
+#endif
+#ifdef CONFIG_SWAP
+ memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
+ sizeof(dest->swap_readahead_info));
+#endif
+#ifndef CONFIG_MMU
+ dest->vm_region = src->vm_region;
+#endif
+#ifdef CONFIG_NUMA
+ dest->vm_policy = src->vm_policy;
+#endif
+#ifdef __HAVE_PFNMAP_TRACKING
+ dest->pfnmap_track_ctx = NULL;
+#endif
+}
+
+#ifdef __HAVE_PFNMAP_TRACKING
+static inline int vma_pfnmap_track_ctx_dup(struct vm_area_struct *orig,
+ struct vm_area_struct *new)
+{
+ struct pfnmap_track_ctx *ctx = orig->pfnmap_track_ctx;
+
+ if (likely(!ctx))
+ return 0;
+
+ /*
+ * We don't expect to ever hit this. If ever required, we would have
+ * to duplicate the tracking.
+ */
+ if (unlikely(kref_read(&ctx->kref) >= REFCOUNT_MAX))
+ return -ENOMEM;
+ kref_get(&ctx->kref);
+ new->pfnmap_track_ctx = ctx;
+ return 0;
+}
+
+static inline void vma_pfnmap_track_ctx_release(struct vm_area_struct *vma)
+{
+ struct pfnmap_track_ctx *ctx = vma->pfnmap_track_ctx;
+
+ if (likely(!ctx))
+ return;
+
+ kref_put(&ctx->kref, pfnmap_track_ctx_release);
+ vma->pfnmap_track_ctx = NULL;
+}
+#else
+static inline int vma_pfnmap_track_ctx_dup(struct vm_area_struct *orig,
+ struct vm_area_struct *new)
+{
+ return 0;
+}
+static inline void vma_pfnmap_track_ctx_release(struct vm_area_struct *vma)
+{
+}
+#endif
+
+struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
+{
+ struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+
+ if (!new)
+ return NULL;
+
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+ vm_area_init_from(orig, new);
+
+ if (vma_pfnmap_track_ctx_dup(orig, new)) {
+ kmem_cache_free(vm_area_cachep, new);
+ return NULL;
+ }
+ vma_lock_init(new, true);
+ INIT_LIST_HEAD(&new->anon_vma_chain);
+ vma_numab_state_init(new);
+ dup_anon_vma_name(orig, new);
+
+ return new;
+}
+
+void vm_area_free(struct vm_area_struct *vma)
+{
+ /* The vma should be detached while being destroyed. */
+ vma_assert_detached(vma);
+ vma_numab_state_free(vma);
+ free_anon_vma_name(vma);
+ vma_pfnmap_track_ctx_release(vma);
+ kmem_cache_free(vm_area_cachep, vma);
+}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2d7511654831..798b2ed21e46 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -104,6 +104,9 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte = pte_alloc_kernel_track(pmd, addr, mask);
if (!pte)
return -ENOMEM;
+
+ arch_enter_lazy_mmu_mode();
+
do {
if (unlikely(!pte_none(ptep_get(pte)))) {
if (pfn_valid(pfn)) {
@@ -127,6 +130,8 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
pfn++;
} while (pte += PFN_DOWN(size), addr += size, addr != end);
+
+ arch_leave_lazy_mmu_mode();
*mask |= PGTBL_PTE_MODIFIED;
return 0;
}
@@ -350,12 +355,30 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pgtbl_mod_mask *mask)
{
pte_t *pte;
+ pte_t ptent;
+ unsigned long size = PAGE_SIZE;
pte = pte_offset_kernel(pmd, addr);
+ arch_enter_lazy_mmu_mode();
+
do {
- pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
+#ifdef CONFIG_HUGETLB_PAGE
+ size = arch_vmap_pte_range_unmap_size(addr, pte);
+ if (size != PAGE_SIZE) {
+ if (WARN_ON(!IS_ALIGNED(addr, size))) {
+ addr = ALIGN_DOWN(addr, size);
+ pte = PTR_ALIGN_DOWN(pte, sizeof(*pte) * (size >> PAGE_SHIFT));
+ }
+ ptent = huge_ptep_get_and_clear(&init_mm, addr, pte, size);
+ if (WARN_ON(end - addr < size))
+ size = end - addr;
+ } else
+#endif
+ ptent = ptep_get_and_clear(&init_mm, addr, pte);
WARN_ON(!pte_none(ptent) && !pte_present(ptent));
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ } while (pte += (size >> PAGE_SHIFT), addr += size, addr != end);
+
+ arch_leave_lazy_mmu_mode();
*mask |= PGTBL_PTE_MODIFIED;
}
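
A standalone sketch (not kernel code) of the re-alignment arithmetic above, for the case where arch_vmap_pte_range_unmap_size() reports a mapping larger than PAGE_SIZE. The 64 KiB contiguous size, 4 KiB pages and 512-entry page table below are assumed example values.

#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1ULL << PAGE_SHIFT)
#define ALIGN_DOWN(x, a)	((x) & ~((unsigned long long)(a) - 1))

int main(void)
{
	/* Assume the arch reports a 64 KiB contiguous PTE mapping. */
	unsigned long long size = 16 * PAGE_SIZE;
	/* A cursor that has ended up in the middle of that mapping. */
	unsigned long long addr = 0xffff800010250000ULL + 3 * PAGE_SIZE;
	/* PTE slot within a 512-entry table, one slot per PAGE_SIZE. */
	unsigned long long pte_idx = (addr >> PAGE_SHIFT) & 0x1ff;

	if (addr & (size - 1)) {
		/* Same fix-up as the WARN_ON path above: step back to the start. */
		addr = ALIGN_DOWN(addr, size);
		pte_idx = ALIGN_DOWN(pte_idx, size >> PAGE_SHIFT);
	}

	assert((addr & (size - 1)) == 0);
	printf("aligned addr=%#llx, pte slot=%llu, then advance %llu slots\n",
	       addr, pte_idx, size >> PAGE_SHIFT);
	return 0;
}
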
@@ -374,8 +397,10 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
if (cleared || pmd_bad(*pmd))
*mask |= PGTBL_PMD_MODIFIED;
- if (cleared)
+ if (cleared) {
+ WARN_ON(next - addr < PMD_SIZE);
continue;
+ }
if (pmd_none_or_clear_bad(pmd))
continue;
vunmap_pte_range(pmd, addr, next, mask);
@@ -399,8 +424,10 @@ static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
if (cleared || pud_bad(*pud))
*mask |= PGTBL_PUD_MODIFIED;
- if (cleared)
+ if (cleared) {
+ WARN_ON(next - addr < PUD_SIZE);
continue;
+ }
if (pud_none_or_clear_bad(pud))
continue;
vunmap_pmd_range(pud, addr, next, mask);
@@ -487,6 +514,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
+ int err = 0;
pte_t *pte;
/*
@@ -497,21 +525,33 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
pte = pte_alloc_kernel_track(pmd, addr, mask);
if (!pte)
return -ENOMEM;
+
+ arch_enter_lazy_mmu_mode();
+
do {
struct page *page = pages[*nr];
- if (WARN_ON(!pte_none(ptep_get(pte))))
- return -EBUSY;
- if (WARN_ON(!page))
- return -ENOMEM;
- if (WARN_ON(!pfn_valid(page_to_pfn(page))))
- return -EINVAL;
+ if (WARN_ON(!pte_none(ptep_get(pte)))) {
+ err = -EBUSY;
+ break;
+ }
+ if (WARN_ON(!page)) {
+ err = -ENOMEM;
+ break;
+ }
+ if (WARN_ON(!pfn_valid(page_to_pfn(page)))) {
+ err = -EINVAL;
+ break;
+ }
set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
(*nr)++;
} while (pte++, addr += PAGE_SIZE, addr != end);
+
+ arch_leave_lazy_mmu_mode();
*mask |= PGTBL_PTE_MODIFIED;
- return 0;
+
+ return err;
}
static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
@@ -900,6 +940,11 @@ static struct vmap_node *vmap_nodes = &single;
static __read_mostly unsigned int nr_vmap_nodes = 1;
static __read_mostly unsigned int vmap_zone_size = 1;
+/* A simple iterator over all vmap-nodes. */
+#define for_each_vmap_node(vn) \
+ for ((vn) = &vmap_nodes[0]; \
+ (vn) < &vmap_nodes[nr_vmap_nodes]; (vn)++)
+
static inline unsigned int
addr_to_node_id(unsigned long addr)
{
@@ -918,6 +963,19 @@ id_to_node(unsigned int id)
return &vmap_nodes[id % nr_vmap_nodes];
}
+static inline unsigned int
+node_to_id(struct vmap_node *node)
+{
+ /* Pointer arithmetic. */
+ unsigned int id = node - vmap_nodes;
+
+ if (likely(id < nr_vmap_nodes))
+ return id;
+
+ WARN_ONCE(1, "vmap node address 0x%p is out of bounds\n", node);
+ return 0;
+}
+
/*
* We use the value 0 to represent "no node", that is why
* an encoded value will be the node-id incremented by 1.
@@ -990,7 +1048,8 @@ static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
static void drain_vmap_area_work(struct work_struct *work);
static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);
-static atomic_long_t nr_vmalloc_pages;
+static __cacheline_aligned_in_smp atomic_long_t nr_vmalloc_pages;
+static __cacheline_aligned_in_smp atomic_long_t vmap_lazy_nr;
unsigned long vmalloc_nr_pages(void)
{
@@ -1056,12 +1115,11 @@ find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va)
{
unsigned long va_start_lowest;
struct vmap_node *vn;
- int i;
repeat:
- for (i = 0, va_start_lowest = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
+ va_start_lowest = 0;
+ for_each_vmap_node(vn) {
spin_lock(&vn->busy.lock);
*va = __find_vmap_area_exceed_addr(addr, &vn->busy.root);
@@ -1698,7 +1756,7 @@ va_clip(struct rb_root *root, struct list_head *head,
*/
lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
if (!lva)
- return -1;
+ return -ENOMEM;
}
/*
@@ -1712,7 +1770,7 @@ va_clip(struct rb_root *root, struct list_head *head,
*/
va->va_start = nva_start_addr + size;
} else {
- return -1;
+ return -EINVAL;
}
if (type != FL_FIT_TYPE) {
@@ -1741,19 +1799,19 @@ va_alloc(struct vmap_area *va,
/* Check the "vend" restriction. */
if (nva_start_addr + size > vend)
- return vend;
+ return -ERANGE;
/* Update the free vmap_area. */
ret = va_clip(root, head, va, nva_start_addr, size);
if (WARN_ON_ONCE(ret))
- return vend;
+ return ret;
return nva_start_addr;
}
/*
* Returns a start address of the newly allocated area, if success.
- * Otherwise a vend is returned that indicates failure.
+ * Otherwise an error value is returned that indicates failure.
*/
static __always_inline unsigned long
__alloc_vmap_area(struct rb_root *root, struct list_head *head,
@@ -1778,14 +1836,13 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head,
va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size);
if (unlikely(!va))
- return vend;
+ return -ENOENT;
nva_start_addr = va_alloc(va, root, head, size, align, vstart, vend);
- if (nva_start_addr == vend)
- return vend;
#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
- find_vmap_lowest_match_check(root, head, size, align);
+ if (!IS_ERR_VALUE(nva_start_addr))
+ find_vmap_lowest_match_check(root, head, size, align);
#endif
return nva_start_addr;
@@ -1915,7 +1972,7 @@ node_alloc(unsigned long size, unsigned long align,
struct vmap_area *va;
*vn_id = 0;
- *addr = vend;
+ *addr = -EINVAL;
/*
* Fallback to a global heap if not vmalloc or there
@@ -1969,6 +2026,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
if (unlikely(!vmap_initialized))
return ERR_PTR(-EBUSY);
+ /* Only reclaim behaviour flags are relevant. */
+ gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
might_sleep();
/*
@@ -1981,8 +2040,6 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
*/
va = node_alloc(size, align, vstart, vend, &addr, &vn_id);
if (!va) {
- gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
-
va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
if (unlikely(!va))
return ERR_PTR(-ENOMEM);
@@ -1995,20 +2052,26 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
}
retry:
- if (addr == vend) {
+ if (IS_ERR_VALUE(addr)) {
preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
size, align, vstart, vend);
spin_unlock(&free_vmap_area_lock);
+
+ /*
+ * This is not a fast path. Check if yielding is needed. This
+ * is the only reschedule point in the vmalloc() path.
+ */
+ cond_resched();
}
- trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend);
+ trace_alloc_vmap_area(addr, size, align, vstart, vend, IS_ERR_VALUE(addr));
/*
- * If an allocation fails, the "vend" address is
+ * If an allocation fails, the error value is
* returned. Therefore trigger the overflow path.
*/
- if (unlikely(addr == vend))
+ if (IS_ERR_VALUE(addr))
goto overflow;
va->va_start = addr;
@@ -2032,7 +2095,7 @@ retry:
BUG_ON(va->va_start < vstart);
BUG_ON(va->va_end > vend);
- ret = kasan_populate_vmalloc(addr, size);
+ ret = kasan_populate_vmalloc(addr, size, gfp_mask);
if (ret) {
free_vmap_area(va);
return ERR_PTR(ret);
@@ -2100,8 +2163,6 @@ static unsigned long lazy_max_pages(void)
return log * (32UL * 1024 * 1024 / PAGE_SIZE);
}
-static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
-
/*
* Serialize vmap purging. There is no actual critical section protected
* by this lock, but we want to avoid concurrent calls for performance
@@ -2111,7 +2172,6 @@ static DEFINE_MUTEX(vmap_purge_lock);
/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);
-static cpumask_t purge_nodes;
static void
reclaim_list_global(struct list_head *head)
@@ -2134,7 +2194,7 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
LIST_HEAD(decay_list);
struct rb_root decay_root = RB_ROOT;
struct vmap_area *va, *nva;
- unsigned long n_decay;
+ unsigned long n_decay, pool_len;
int i;
for (i = 0; i < MAX_VA_SIZE_PAGES; i++) {
@@ -2148,22 +2208,20 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
list_replace_init(&vn->pool[i].head, &tmp_list);
spin_unlock(&vn->pool_lock);
- if (full_decay)
- WRITE_ONCE(vn->pool[i].len, 0);
+ pool_len = n_decay = vn->pool[i].len;
+ WRITE_ONCE(vn->pool[i].len, 0);
/* Decay a pool by ~25% of the objects left in it. */
- n_decay = vn->pool[i].len >> 2;
+ if (!full_decay)
+ n_decay >>= 2;
+ pool_len -= n_decay;
list_for_each_entry_safe(va, nva, &tmp_list, list) {
+ if (!n_decay--)
+ break;
+
list_del_init(&va->list);
merge_or_add_vmap_area(va, &decay_root, &decay_list);
-
- if (!full_decay) {
- WRITE_ONCE(vn->pool[i].len, vn->pool[i].len - 1);
-
- if (!--n_decay)
- break;
- }
}
/*
@@ -2172,9 +2230,10 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
* can populate the pool therefore a simple list replace
* operation takes place here.
*/
- if (!full_decay && !list_empty(&tmp_list)) {
+ if (!list_empty(&tmp_list)) {
spin_lock(&vn->pool_lock);
list_replace_init(&tmp_list, &vn->pool[i].head);
+ WRITE_ONCE(vn->pool[i].len, pool_len);
spin_unlock(&vn->pool_lock);
}
}
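
For clarity, the decay arithmetic above as a standalone sketch (pool sizes invented): a partial decay releases ~25% of the objects left in a pool and the new length is written back in one go, while a full decay empties it.

#include <stdio.h>

/* Mirrors the partial/full decay maths in decay_va_pool_node(). */
static unsigned long decay(unsigned long pool_len, int full_decay,
			   unsigned long *nr_freed)
{
	unsigned long n_decay = pool_len;

	if (!full_decay)
		n_decay >>= 2;		/* release ~25% of what is left */

	*nr_freed = n_decay;
	return pool_len - n_decay;	/* new vn->pool[i].len */
}

int main(void)
{
	unsigned long freed, len;

	len = decay(64, /* full_decay = */ 0, &freed);
	printf("partial: freed %lu, %lu stay pooled\n", freed, len); /* 16 / 48 */

	len = decay(64, /* full_decay = */ 1, &freed);
	printf("full:    freed %lu, %lu stay pooled\n", freed, len); /* 64 / 0 */
	return 0;
}
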
@@ -2244,6 +2303,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
{
unsigned long nr_purged_areas = 0;
unsigned int nr_purge_helpers;
+ static cpumask_t purge_nodes;
unsigned int nr_purge_nodes;
struct vmap_node *vn;
int i;
@@ -2255,9 +2315,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
*/
purge_nodes = CPU_MASK_NONE;
- for (i = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
-
+ for_each_vmap_node(vn) {
INIT_LIST_HEAD(&vn->purge_list);
vn->skip_populate = full_pool_decay;
decay_va_pool_node(vn, full_pool_decay);
@@ -2276,7 +2334,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
end = max(end, list_last_entry(&vn->purge_list,
struct vmap_area, list)->va_end);
- cpumask_set_cpu(i, &purge_nodes);
+ cpumask_set_cpu(node_to_id(vn), &purge_nodes);
}
nr_purge_nodes = cpumask_weight(&purge_nodes);
@@ -2355,7 +2413,7 @@ static void free_vmap_area_noflush(struct vmap_area *va)
if (WARN_ON_ONCE(!list_empty(&va->list)))
return;
- nr_lazy = atomic_long_add_return(va_size(va) >> PAGE_SHIFT,
+ nr_lazy = atomic_long_add_return_relaxed(va_size(va) >> PAGE_SHIFT,
&vmap_lazy_nr);
/*
@@ -2421,7 +2479,7 @@ struct vmap_area *find_vmap_area(unsigned long addr)
if (va)
return va;
- } while ((i = (i + 1) % nr_vmap_nodes) != j);
+ } while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j);
return NULL;
}
@@ -2447,7 +2505,7 @@ static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
if (va)
return va;
- } while ((i = (i + 1) % nr_vmap_nodes) != j);
+ } while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j);
return NULL;
}
@@ -2916,10 +2974,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
*/
void vm_unmap_aliases(void)
{
- unsigned long start = ULONG_MAX, end = 0;
- int flush = 0;
-
- _vm_unmap_aliases(start, end, flush);
+ _vm_unmap_aliases(ULONG_MAX, 0, 0);
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
@@ -3100,7 +3155,7 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm)
/*
* Before removing VM_UNINITIALIZED,
* we should make sure that vm has proper values.
- * Pair with smp_rmb() in show_numa_info().
+ * Pair with smp_rmb() in vread_iter() and vmalloc_info_show().
*/
smp_wmb();
vm->flags &= ~VM_UNINITIALIZED;
@@ -3371,12 +3426,13 @@ void vfree(const void *addr)
if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
vm_reset_perms(vm);
+ /* All pages of vm should be charged to same memcg, so use first one. */
+ if (vm->nr_pages && !(vm->flags & VM_MAP_PUT_PAGES))
+ mod_memcg_page_state(vm->pages[0], MEMCG_VMALLOC, -vm->nr_pages);
for (i = 0; i < vm->nr_pages; i++) {
struct page *page = vm->pages[i];
BUG_ON(!page);
- if (!(vm->flags & VM_MAP_PUT_PAGES))
- mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
/*
* High-order allocs for huge vmallocs are split, so
* can be freed as an array of order-0 allocations
@@ -3572,7 +3628,6 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
pages + nr_allocated);
nr_allocated += nr;
- cond_resched();
/*
* If zero or pages were obtained partly,
@@ -3614,7 +3669,6 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
for (i = 0; i < (1U << order); i++)
pages[nr_allocated + i] = page + i;
- cond_resched();
nr_allocated += 1U << order;
}
@@ -3672,12 +3726,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
node, page_order, nr_small_pages, area->pages);
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
- if (gfp_mask & __GFP_ACCOUNT) {
- int i;
-
- for (i = 0; i < area->nr_pages; i++)
- mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1);
- }
+ /* All pages of vm should be charged to same memcg, so use first one. */
+ if (gfp_mask & __GFP_ACCOUNT && area->nr_pages)
+ mod_memcg_page_state(area->pages[0], MEMCG_VMALLOC,
+ area->nr_pages);
/*
* If not enough pages were obtained to accomplish an
@@ -3944,9 +3996,10 @@ void *vmalloc_noprof(unsigned long size)
EXPORT_SYMBOL(vmalloc_noprof);
/**
- * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
+ * vmalloc_huge_node - allocate virtually contiguous memory, allow huge pages
* @size: allocation size
* @gfp_mask: flags for the page level allocator
+ * @node: node to use for allocation or NUMA_NO_NODE
*
* Allocate enough pages to cover @size from the page level
* allocator and map them into contiguous kernel virtual space.
@@ -3955,13 +4008,13 @@ EXPORT_SYMBOL(vmalloc_noprof);
*
* Return: pointer to the allocated memory or %NULL on error
*/
-void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask)
+void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node)
{
return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
- gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
- NUMA_NO_NODE, __builtin_return_address(0));
+ gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+ node, __builtin_return_address(0));
}
-EXPORT_SYMBOL_GPL(vmalloc_huge_noprof);
+EXPORT_SYMBOL_GPL(vmalloc_huge_node_noprof);
/**
* vzalloc - allocate virtually contiguous memory with zero fill
@@ -4040,19 +4093,29 @@ void *vzalloc_node_noprof(unsigned long size, int node)
EXPORT_SYMBOL(vzalloc_node_noprof);
/**
- * vrealloc - reallocate virtually contiguous memory; contents remain unchanged
+ * vrealloc_node_align_noprof - reallocate virtually contiguous memory; contents
+ * remain unchanged
* @p: object to reallocate memory for
* @size: the size to reallocate
+ * @align: requested alignment
* @flags: the flags for the page level allocator
+ * @nid: node number of the target node
*
- * If @p is %NULL, vrealloc() behaves exactly like vmalloc(). If @size is 0 and
- * @p is not a %NULL pointer, the object pointed to is freed.
+ * If @p is %NULL, vrealloc_XXX() behaves exactly like vmalloc_XXX(). If @size
+ * is 0 and @p is not a %NULL pointer, the object pointed to is freed.
+ *
+ * If the caller wants the new memory to be on a specific node *only*,
+ * __GFP_THISNODE flag should be set, otherwise the function will try to avoid
+ * reallocation and possibly disregard the specified @nid.
*
* If __GFP_ZERO logic is requested, callers must ensure that, starting with the
* initial memory allocation, every subsequent call to this API for the same
* memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
* __GFP_ZERO is not fully honored by this API.
*
+ * Requesting an alignment that is bigger than the alignment of the existing
+ * allocation will fail.
+ *
* In any case, the contents of the object pointed to are preserved up to the
* lesser of the new and old sizes.
*
@@ -4062,7 +4125,8 @@ EXPORT_SYMBOL(vzalloc_node_noprof);
* Return: pointer to the allocated memory; %NULL if @size is zero or in case of
* failure
*/
-void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
+void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align,
+ gfp_t flags, int nid)
{
struct vm_struct *vm = NULL;
size_t alloced_size = 0;
@@ -4086,6 +4150,12 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
if (WARN(alloced_size < old_size,
"vrealloc() has mismatched area vs requested sizes (%p)\n", p))
return NULL;
+ if (WARN(!IS_ALIGNED((unsigned long)p, align),
+ "will not reallocate with a bigger alignment (0x%lx)\n", align))
+ return NULL;
+ if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE &&
+ nid != page_to_nid(vmalloc_to_page(p)))
+ goto need_realloc;
}
/*
@@ -4093,8 +4163,8 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
* would be a good heuristic for when to shrink the vm_area?
*/
if (size <= old_size) {
- /* Zero out "freed" memory. */
- if (want_init_on_free())
+ /* Zero out "freed" memory, potentially for future realloc. */
+ if (want_init_on_free() || want_init_on_alloc(flags))
memset((void *)p + size, 0, old_size - size);
vm->requested_size = size;
kasan_poison_vmalloc(p + size, old_size - size);
@@ -4107,14 +4177,19 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
if (size <= alloced_size) {
kasan_unpoison_vmalloc(p + old_size, size - old_size,
KASAN_VMALLOC_PROT_NORMAL);
- /* Zero out "alloced" memory. */
- if (want_init_on_alloc(flags))
- memset((void *)p + old_size, 0, size - old_size);
+ /*
+ * No need to zero memory here, as unused memory will have
+ * already been zeroed at initial allocation time or during
+ * realloc shrink time.
+ */
vm->requested_size = size;
+ return (void *)p;
}
+need_realloc:
/* TODO: Grow the vm_area, i.e. allocate and map additional pages. */
- n = __vmalloc_noprof(size, flags);
+ n = __vmalloc_node_noprof(size, align, flags, nid, __builtin_return_address(0));
+
if (!n)
return NULL;
@@ -4774,7 +4849,7 @@ retry:
/* populate the kasan shadow space */
for (area = 0; area < nr_vms; area++) {
- if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
+ if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area], GFP_KERNEL))
goto err_free_shadow;
}
@@ -4931,39 +5006,37 @@ bool vmalloc_dump_obj(void *object)
#endif
#ifdef CONFIG_PROC_FS
-static void show_numa_info(struct seq_file *m, struct vm_struct *v)
-{
- if (IS_ENABLED(CONFIG_NUMA)) {
- unsigned int nr, *counters = m->private;
- unsigned int step = 1U << vm_area_page_order(v);
- if (!counters)
- return;
+/*
+ * Print the number of pages allocated on each memory node.
+ *
+ * This function can only be called if CONFIG_NUMA is enabled
+ * and the VM_UNINITIALIZED bit in v->flags is not set.
+ */
+static void show_numa_info(struct seq_file *m, struct vm_struct *v,
+ unsigned int *counters)
+{
+ unsigned int nr;
+ unsigned int step = 1U << vm_area_page_order(v);
- if (v->flags & VM_UNINITIALIZED)
- return;
- /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
- smp_rmb();
+ if (!counters)
+ return;
- memset(counters, 0, nr_node_ids * sizeof(unsigned int));
+ memset(counters, 0, nr_node_ids * sizeof(unsigned int));
- for (nr = 0; nr < v->nr_pages; nr += step)
- counters[page_to_nid(v->pages[nr])] += step;
- for_each_node_state(nr, N_HIGH_MEMORY)
- if (counters[nr])
- seq_printf(m, " N%u=%u", nr, counters[nr]);
- }
+ for (nr = 0; nr < v->nr_pages; nr += step)
+ counters[page_to_nid(v->pages[nr])] += step;
+ for_each_node_state(nr, N_HIGH_MEMORY)
+ if (counters[nr])
+ seq_printf(m, " N%u=%u", nr, counters[nr]);
}
static void show_purge_info(struct seq_file *m)
{
struct vmap_node *vn;
struct vmap_area *va;
- int i;
-
- for (i = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
+ for_each_vmap_node(vn) {
spin_lock(&vn->lazy.lock);
list_for_each_entry(va, &vn->lazy.head, list) {
seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
@@ -4979,11 +5052,12 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
struct vmap_node *vn;
struct vmap_area *va;
struct vm_struct *v;
- int i;
+ unsigned int *counters;
- for (i = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
+ if (IS_ENABLED(CONFIG_NUMA))
+ counters = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
+ for_each_vmap_node(vn) {
spin_lock(&vn->busy.lock);
list_for_each_entry(va, &vn->busy.head, list) {
if (!va->vm) {
@@ -4996,6 +5070,11 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
}
v = va->vm;
+ if (v->flags & VM_UNINITIALIZED)
+ continue;
+
+ /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
+ smp_rmb();
seq_printf(m, "0x%pK-0x%pK %7ld",
v->addr, v->addr + v->size, v->size);
@@ -5030,7 +5109,9 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
if (is_vmalloc_addr(v->pages))
seq_puts(m, " vpages");
- show_numa_info(m, v);
+ if (IS_ENABLED(CONFIG_NUMA))
+ show_numa_info(m, v, counters);
+
seq_putc(m, '\n');
}
spin_unlock(&vn->busy.lock);
@@ -5040,19 +5121,14 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
* As a final step, dump "unpurged" areas.
*/
show_purge_info(m);
+ if (IS_ENABLED(CONFIG_NUMA))
+ kfree(counters);
return 0;
}
static int __init proc_vmalloc_init(void)
{
- void *priv_data = NULL;
-
- if (IS_ENABLED(CONFIG_NUMA))
- priv_data = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
-
- proc_create_single_data("vmallocinfo",
- 0400, NULL, vmalloc_info_show, priv_data);
-
+ proc_create_single("vmallocinfo", 0400, NULL, vmalloc_info_show);
return 0;
}
module_init(proc_vmalloc_init);
@@ -5104,7 +5180,7 @@ static void __init vmap_init_free_space(void)
static void vmap_init_nodes(void)
{
struct vmap_node *vn;
- int i, n;
+ int i;
#if BITS_PER_LONG == 64
/*
@@ -5121,10 +5197,10 @@ static void vmap_init_nodes(void)
* set of cores. Therefore a per-domain purging is supposed to
* be added as well as a per-domain balancing.
*/
- n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);
+ int n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);
if (n > 1) {
- vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN);
+ vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT);
if (vn) {
/* Node partition is 16 pages. */
vmap_zone_size = (1 << 4) * PAGE_SIZE;
@@ -5136,8 +5212,7 @@ static void vmap_init_nodes(void)
}
#endif
- for (n = 0; n < nr_vmap_nodes; n++) {
- vn = &vmap_nodes[n];
+ for_each_vmap_node(vn) {
vn->busy.root = RB_ROOT;
INIT_LIST_HEAD(&vn->busy.head);
spin_lock_init(&vn->busy.lock);
@@ -5158,15 +5233,13 @@ static void vmap_init_nodes(void)
static unsigned long
vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
- unsigned long count;
+ unsigned long count = 0;
struct vmap_node *vn;
- int i, j;
-
- for (count = 0, i = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
+ int i;
- for (j = 0; j < MAX_VA_SIZE_PAGES; j++)
- count += READ_ONCE(vn->pool[j].len);
+ for_each_vmap_node(vn) {
+ for (i = 0; i < MAX_VA_SIZE_PAGES; i++)
+ count += READ_ONCE(vn->pool[i].len);
}
return count ? count : SHRINK_EMPTY;
@@ -5175,10 +5248,10 @@ vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
static unsigned long
vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
- int i;
+ struct vmap_node *vn;
- for (i = 0; i < nr_vmap_nodes; i++)
- decay_va_pool_node(&vmap_nodes[i], true);
+ for_each_vmap_node(vn)
+ decay_va_pool_node(vn, true);
return SHRINK_STOP;
}
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index bd5183dfd879..c197ed47bcc4 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -316,7 +316,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
* asserted for a second in which subsequent
* pressure events can occur.
*/
- WRITE_ONCE(memcg->socket_pressure, jiffies + HZ);
+ mem_cgroup_set_socket_pressure(memcg);
}
}
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3783e45bfc92..b2fc8b626d3d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -57,6 +57,7 @@
#include <linux/rculist_nulls.h>
#include <linux/random.h>
#include <linux/mmu_notifier.h>
+#include <linux/parser.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -93,10 +94,8 @@ struct scan_control {
unsigned long anon_cost;
unsigned long file_cost;
-#ifdef CONFIG_MEMCG
/* Swappiness value for proactive reclaim. Always use sc_swappiness()! */
int *proactive_swappiness;
-#endif
/* Can active folios be deactivated as part of reclaim? */
#define DEACTIVATE_ANON 1
@@ -120,7 +119,7 @@ struct scan_control {
/* Has cache_trim_mode failed at least once? */
unsigned int cache_trim_mode_failed:1;
- /* Proactive reclaim invoked by userspace through memory.reclaim */
+ /* Proactive reclaim invoked by userspace */
unsigned int proactive:1;
/*
@@ -342,16 +341,22 @@ static void flush_reclaim_state(struct scan_control *sc)
}
}
-static bool can_demote(int nid, struct scan_control *sc)
+static bool can_demote(int nid, struct scan_control *sc,
+ struct mem_cgroup *memcg)
{
+ int demotion_nid;
+
if (!numa_demotion_enabled)
return false;
if (sc && sc->no_demotion)
return false;
- if (next_demotion_node(nid) == NUMA_NO_NODE)
+
+ demotion_nid = next_demotion_node(nid);
+ if (demotion_nid == NUMA_NO_NODE)
return false;
- return true;
+ /* If demotion node isn't in the cgroup's mems_allowed, fall back */
+ return mem_cgroup_node_allowed(memcg, demotion_nid);
}
static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
@@ -376,7 +381,7 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
*
* Can it be reclaimed from this node via demotion?
*/
- return can_demote(nid, sc);
+ return can_demote(nid, sc, memcg);
}
/*
@@ -393,14 +398,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
- /*
- * If there are no reclaimable file-backed or anonymous pages,
- * ensure zones with sufficient free pages are not skipped.
- * This prevents zones like DMA32 from being ignored in reclaim
- * scenarios where they can still help alleviate memory pressure.
- */
- if (nr == 0)
- nr = zone_page_state_snapshot(zone, NR_FREE_PAGES);
+
return nr;
}
@@ -520,7 +518,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat)
* If kswapd is disabled, reschedule if necessary but do not
* throttle as the system is likely near OOM.
*/
- if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+ if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
return true;
/*
@@ -646,23 +644,53 @@ typedef enum {
PAGE_CLEAN,
} pageout_t;
+static pageout_t writeout(struct folio *folio, struct address_space *mapping,
+ struct swap_iocb **plug, struct list_head *folio_list)
+{
+ int res;
+
+ folio_set_reclaim(folio);
+
+ /*
+ * The large shmem folio can be split if CONFIG_THP_SWAP is not enabled
+ * or we failed to allocate contiguous swap entries, in which case
+ * the split out folios get added back to folio_list.
+ */
+ if (shmem_mapping(mapping))
+ res = shmem_writeout(folio, plug, folio_list);
+ else
+ res = swap_writeout(folio, plug);
+
+ if (res < 0)
+ handle_write_error(mapping, folio, res);
+ if (res == AOP_WRITEPAGE_ACTIVATE) {
+ folio_clear_reclaim(folio);
+ return PAGE_ACTIVATE;
+ }
+
+ /* synchronous write? */
+ if (!folio_test_writeback(folio))
+ folio_clear_reclaim(folio);
+
+ trace_mm_vmscan_write_folio(folio);
+ node_stat_add_folio(folio, NR_VMSCAN_WRITE);
+ return PAGE_SUCCESS;
+}
+
/*
* pageout is called by shrink_folio_list() for each dirty folio.
- * Calls ->writepage().
*/
static pageout_t pageout(struct folio *folio, struct address_space *mapping,
struct swap_iocb **plug, struct list_head *folio_list)
{
/*
- * If the folio is dirty, only perform writeback if that write
- * will be non-blocking. To prevent this allocation from being
- * stalled by pagecache activity. But note that there may be
- * stalls if we need to run get_block(). We could test
- * PagePrivate for that.
- *
- * If this process is currently in __generic_file_write_iter() against
- * this folio's queue, we can perform writeback even if that
- * will block.
+ * We no longer attempt to write back filesystem folios here, other
+ * than tmpfs/shmem. That's taken care of in page-writeback.
+ * If we find a dirty filesystem folio at the end of the LRU list,
+ * typically that means the filesystem is saturating the storage
+ * with contiguous writes and telling it to write a folio here
+ * would only make the situation worse by injecting an element
+ * of random access.
*
* If the folio is swapcache, write it back even if that would
* block, for some throttling. This happens by accident, because
@@ -685,47 +713,12 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
}
return PAGE_KEEP;
}
- if (mapping->a_ops->writepage == NULL)
- return PAGE_ACTIVATE;
-
- if (folio_clear_dirty_for_io(folio)) {
- int res;
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- .nr_to_write = SWAP_CLUSTER_MAX,
- .range_start = 0,
- .range_end = LLONG_MAX,
- .for_reclaim = 1,
- .swap_plug = plug,
- };
-
- /*
- * The large shmem folio can be split if CONFIG_THP_SWAP is
- * not enabled or contiguous swap entries are failed to
- * allocate.
- */
- if (shmem_mapping(mapping) && folio_test_large(folio))
- wbc.list = folio_list;
-
- folio_set_reclaim(folio);
- res = mapping->a_ops->writepage(&folio->page, &wbc);
- if (res < 0)
- handle_write_error(mapping, folio, res);
- if (res == AOP_WRITEPAGE_ACTIVATE) {
- folio_clear_reclaim(folio);
- return PAGE_ACTIVATE;
- }
-
- if (!folio_test_writeback(folio)) {
- /* synchronous write or broken a_ops? */
- folio_clear_reclaim(folio);
- }
- trace_mm_vmscan_write_folio(folio);
- node_stat_add_folio(folio, NR_VMSCAN_WRITE);
- return PAGE_SUCCESS;
- }
- return PAGE_CLEAN;
+ if (!shmem_mapping(mapping) && !folio_test_anon(folio))
+ return PAGE_ACTIVATE;
+ if (!folio_clear_dirty_for_io(folio))
+ return PAGE_CLEAN;
+ return writeout(folio, mapping, plug, folio_list);
}
/*
@@ -737,13 +730,18 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
{
int refcount;
void *shadow = NULL;
+ struct swap_cluster_info *ci;
BUG_ON(!folio_test_locked(folio));
BUG_ON(mapping != folio_mapping(folio));
- if (!folio_test_swapcache(folio))
+ if (folio_test_swapcache(folio)) {
+ ci = swap_cluster_get_and_lock_irq(folio);
+ } else {
spin_lock(&mapping->host->i_lock);
- xa_lock_irq(&mapping->i_pages);
+ xa_lock_irq(&mapping->i_pages);
+ }
+
/*
* The non racy check for a busy folio.
*
@@ -783,9 +781,9 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(folio, target_memcg);
- __delete_from_swap_cache(folio, swap, shadow);
+ __swap_cache_del_folio(ci, folio, swap, shadow);
memcg1_swapout(folio, swap);
- xa_unlock_irq(&mapping->i_pages);
+ swap_cluster_unlock_irq(ci);
put_swap_folio(folio, swap);
} else {
void (*free_folio)(struct folio *);
@@ -823,9 +821,12 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
return 1;
cannot_free:
- xa_unlock_irq(&mapping->i_pages);
- if (!folio_test_swapcache(folio))
+ if (folio_test_swapcache(folio)) {
+ swap_cluster_unlock_irq(ci);
+ } else {
+ xa_unlock_irq(&mapping->i_pages);
spin_unlock(&mapping->host->i_lock);
+ }
return 0;
}
@@ -888,11 +889,11 @@ static bool lru_gen_set_refs(struct folio *folio)
{
/* see the comment on LRU_REFS_FLAGS */
if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
- set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
+ set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced));
return false;
}
- set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_workingset));
+ set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_workingset));
return true;
}
#else
@@ -906,7 +907,7 @@ static enum folio_references folio_check_references(struct folio *folio,
struct scan_control *sc)
{
int referenced_ptes, referenced_folio;
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup,
&vm_flags);
@@ -1005,7 +1006,8 @@ static void folio_check_dirty_writeback(struct folio *folio,
mapping->a_ops->is_dirty_writeback(folio, dirty, writeback);
}
-struct folio *alloc_migrate_folio(struct folio *src, unsigned long private)
+static struct folio *alloc_demote_folio(struct folio *src,
+ unsigned long private)
{
struct folio *dst;
nodemask_t *allowed_mask;
@@ -1068,7 +1070,7 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
node_get_allowed_targets(pgdat, &allowed_mask);
/* Demotion ignores all cpuset and mempolicy settings */
- migrate_pages(demote_folios, alloc_migrate_folio, NULL,
+ migrate_pages(demote_folios, alloc_demote_folio, NULL,
(unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
&nr_succeeded);
@@ -1096,7 +1098,8 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
*/
static unsigned int shrink_folio_list(struct list_head *folio_list,
struct pglist_data *pgdat, struct scan_control *sc,
- struct reclaim_stat *stat, bool ignore_references)
+ struct reclaim_stat *stat, bool ignore_references,
+ struct mem_cgroup *memcg)
{
struct folio_batch free_folios;
LIST_HEAD(ret_folios);
@@ -1109,7 +1112,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
folio_batch_init(&free_folios);
memset(stat, 0, sizeof(*stat));
cond_resched();
- do_demote_pass = can_demote(pgdat->node_id, sc);
+ do_demote_pass = can_demote(pgdat->node_id, sc, memcg);
retry:
while (!list_empty(folio_list)) {
@@ -1128,6 +1131,14 @@ retry:
goto keep;
if (folio_contain_hwpoisoned_page(folio)) {
+ /*
+ * unmap_poisoned_folio() can't handle large
+ * folios, so just skip them here. memory_failure() will
+ * handle them if the UCE is triggered again.
+ */
+ if (folio_test_large(folio))
+ goto keep_locked;
+
unmap_poisoned_folio(folio, folio_pfn(folio), false);
folio_unlock(folio);
folio_put(folio);
@@ -1187,8 +1198,10 @@ retry:
* 2) Global or new memcg reclaim encounters a folio that is
* not marked for immediate reclaim, or the caller does not
* have __GFP_FS (or __GFP_IO if it's simply going to swap,
- * not to fs). In this case mark the folio for immediate
- * reclaim and continue scanning.
+ * not to fs), or the folio belongs to a mapping where
+ * waiting on writeback during reclaim may lead to a deadlock.
+ * In this case mark the folio for immediate reclaim and
+ * continue scanning.
*
* Require may_enter_fs() because we would wait on fs, which
* may not have submitted I/O yet. And the loop driver might
@@ -1213,6 +1226,8 @@ retry:
* takes to write them to disk.
*/
if (folio_test_writeback(folio)) {
+ mapping = folio_mapping(folio);
+
/* Case 1 above */
if (current_is_kswapd() &&
folio_test_reclaim(folio) &&
@@ -1223,7 +1238,9 @@ retry:
/* Case 2 above */
} else if (writeback_throttling_sane(sc) ||
!folio_test_reclaim(folio) ||
- !may_enter_fs(folio, sc->gfp_mask)) {
+ !may_enter_fs(folio, sc->gfp_mask) ||
+ (mapping &&
+ mapping_writeback_may_deadlock_on_reclaim(mapping))) {
/*
* This is slightly racy -
* folio_end_writeback() might have
@@ -1642,9 +1659,11 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
unsigned int noreclaim_flag;
list_for_each_entry_safe(folio, next, folio_list, lru) {
+ /* TODO: these pages should not even appear in this list. */
+ if (page_has_movable_ops(&folio->page))
+ continue;
if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) &&
- !folio_test_dirty(folio) && !__folio_test_movable(folio) &&
- !folio_test_unevictable(folio)) {
+ !folio_test_dirty(folio) && !folio_test_unevictable(folio)) {
folio_clear_active(folio);
list_move(&folio->lru, &clean_folios);
}
@@ -1658,7 +1677,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
*/
noreclaim_flag = memalloc_noreclaim_save();
nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc,
- &stat, true);
+ &stat, true, NULL);
memalloc_noreclaim_restore(noreclaim_flag);
list_splice(&clean_folios, folio_list);
@@ -1725,13 +1744,11 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
unsigned long nr_taken = 0;
unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
- unsigned long skipped = 0;
- unsigned long scan, total_scan, nr_pages;
+ unsigned long skipped = 0, total_scan = 0, scan = 0;
+ unsigned long nr_pages;
unsigned long max_nr_skipped = 0;
LIST_HEAD(folios_skipped);
- total_scan = 0;
- scan = 0;
while (scan < nr_to_scan && !list_empty(src)) {
struct list_head *move_to = src;
struct folio *folio;
@@ -2023,7 +2040,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
item = PGSCAN_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_scanned);
- __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
+ count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
__count_vm_events(PGSCAN_ANON + file, nr_scanned);
spin_unlock_irq(&lruvec->lru_lock);
@@ -2031,7 +2048,8 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
if (nr_taken == 0)
return 0;
- nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
+ nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false,
+ lruvec_memcg(lruvec));
spin_lock_irq(&lruvec->lru_lock);
move_folios_to_lru(lruvec, &folio_list);
@@ -2042,11 +2060,11 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_reclaimed);
- __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
+ count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
- spin_unlock_irq(&lruvec->lru_lock);
- lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
+ lru_note_cost_unlock_irq(lruvec, file, stat.nr_pageout,
+ nr_scanned - nr_reclaimed);
/*
* If dirty folios are scanned that are not queued for IO, it
@@ -2112,7 +2130,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
{
unsigned long nr_taken;
unsigned long nr_scanned;
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
LIST_HEAD(l_hold); /* The folios which were snipped off */
LIST_HEAD(l_active);
LIST_HEAD(l_inactive);
@@ -2132,7 +2150,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
if (!cgroup_reclaim(sc))
__count_vm_events(PGREFILL, nr_scanned);
- __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
+ count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
spin_unlock_irq(&lruvec->lru_lock);
@@ -2189,13 +2207,11 @@ static void shrink_active_list(unsigned long nr_to_scan,
nr_deactivate = move_folios_to_lru(lruvec, &l_inactive);
__count_vm_events(PGDEACTIVATE, nr_deactivate);
- __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
+ count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
- spin_unlock_irq(&lruvec->lru_lock);
- if (nr_rotated)
- lru_note_cost(lruvec, file, 0, nr_rotated);
+ lru_note_cost_unlock_irq(lruvec, file, 0, nr_rotated);
trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
nr_deactivate, nr_rotated, sc->priority, file);
}
@@ -2214,7 +2230,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
.no_demotion = 1,
};
- nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true);
+ nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true, NULL);
while (!list_empty(folio_list)) {
folio = lru_to_folio(folio_list);
list_del(&folio->lru);
@@ -2467,6 +2483,69 @@ static inline void calculate_pressure_balance(struct scan_control *sc,
*denominator = ap + fp;
}
+static unsigned long apply_proportional_protection(struct mem_cgroup *memcg,
+ struct scan_control *sc, unsigned long scan)
+{
+ unsigned long min, low;
+
+ mem_cgroup_protection(sc->target_mem_cgroup, memcg, &min, &low);
+
+ if (min || low) {
+ /*
+ * Scale a cgroup's reclaim pressure by proportioning
+ * its current usage to its memory.low or memory.min
+ * setting.
+ *
+ * This is important, as otherwise scanning aggression
+ * becomes extremely binary -- from nothing as we
+ * approach the memory protection threshold, to totally
+ * nominal as we exceed it. This results in requiring
+ * setting extremely liberal protection thresholds. It
+ * also means we simply get no protection at all if we
+ * set it too low, which is not ideal.
+ *
+ * If there is any protection in place, we reduce scan
+ * pressure by how much of the total memory used is
+ * within protection thresholds.
+ *
+ * There is one special case: in the first reclaim pass,
+ * we skip over all groups that are within their low
+ * protection. If that fails to reclaim enough pages to
+ * satisfy the reclaim goal, we come back and override
+ * the best-effort low protection. However, we still
+ * ideally want to honor how well-behaved groups are in
+ * that case instead of simply punishing them all
+ * equally. As such, we reclaim them based on how much
+ * memory they are using, reducing the scan pressure
+ * again by how much of the total memory used is under
+ * hard protection.
+ */
+ unsigned long cgroup_size = mem_cgroup_size(memcg);
+ unsigned long protection;
+
+ /* memory.low scaling, make sure we retry before OOM */
+ if (!sc->memcg_low_reclaim && low > min) {
+ protection = low;
+ sc->memcg_low_skipped = 1;
+ } else {
+ protection = min;
+ }
+
+ /* Avoid TOCTOU with earlier protection check */
+ cgroup_size = max(cgroup_size, protection);
+
+ scan -= scan * protection / (cgroup_size + 1);
+
+ /*
+ * Minimally target SWAP_CLUSTER_MAX pages to keep
+ * reclaim moving forwards, avoiding decrementing
+ * sc->priority further than desirable.
+ */
+ scan = max(scan, SWAP_CLUSTER_MAX);
+ }
+ return scan;
+}
+
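
apply_proportional_protection() concentrates the memory.min/memory.low scaling that used to be open-coded in get_scan_count(), so the MGLRU path in get_nr_to_scan() further down can reuse it. A minimal standalone sketch of the arithmetic, with hypothetical numbers, a 4 KiB page size assumed, and SWAP_CLUSTER_MAX redefined locally just for the floor:

#include <stdio.h>

#define SWAP_CLUSTER_MAX	32UL

static unsigned long scale_scan(unsigned long scan, unsigned long usage,
				unsigned long protection)
{
	/* mirror the TOCTOU guard: usage may have dropped below protection */
	if (usage < protection)
		usage = protection;

	/* reduce pressure by the protected share of current usage */
	scan -= scan * protection / (usage + 1);

	/* keep reclaim moving forwards */
	if (scan < SWAP_CLUSTER_MAX)
		scan = SWAP_CLUSTER_MAX;

	return scan;
}

int main(void)
{
	/* 800 MiB used, 600 MiB under memory.low, 4096-page scan target */
	printf("%lu\n", scale_scan(4096, 800UL << 8, 600UL << 8));
	return 0;
}

With 600 of the 800 MiB protected, only the unprotected quarter of the target survives (1025 of 4096 pages); if protection covered the whole usage, the SWAP_CLUSTER_MAX floor would still keep reclaim inching forward.
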
/*
* Determine how aggressively the anon and file LRU lists should be
* scanned.
@@ -2503,6 +2582,13 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
goto out;
}
+ /* Proactive reclaim initiated by userspace for anonymous memory only */
+ if (swappiness == SWAPPINESS_ANON_ONLY) {
+ WARN_ON_ONCE(!sc->proactive);
+ scan_balance = SCAN_ANON;
+ goto out;
+ }
+
/*
* Do not apply any pressure balancing cleverness when the
* system is close to OOM, scan both anon and file equally
@@ -2523,7 +2609,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
/*
* If there is enough inactive page cache, we do not reclaim
- * anything from the anonymous working right now.
+ * anything from the anonymous working set right now to make sure
+ * a streaming file access pattern doesn't cause swapping.
*/
if (sc->cache_trim_mode) {
scan_balance = SCAN_FILE;
@@ -2537,70 +2624,10 @@ out:
for_each_evictable_lru(lru) {
bool file = is_file_lru(lru);
unsigned long lruvec_size;
- unsigned long low, min;
unsigned long scan;
lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
- mem_cgroup_protection(sc->target_mem_cgroup, memcg,
- &min, &low);
-
- if (min || low) {
- /*
- * Scale a cgroup's reclaim pressure by proportioning
- * its current usage to its memory.low or memory.min
- * setting.
- *
- * This is important, as otherwise scanning aggression
- * becomes extremely binary -- from nothing as we
- * approach the memory protection threshold, to totally
- * nominal as we exceed it. This results in requiring
- * setting extremely liberal protection thresholds. It
- * also means we simply get no protection at all if we
- * set it too low, which is not ideal.
- *
- * If there is any protection in place, we reduce scan
- * pressure by how much of the total memory used is
- * within protection thresholds.
- *
- * There is one special case: in the first reclaim pass,
- * we skip over all groups that are within their low
- * protection. If that fails to reclaim enough pages to
- * satisfy the reclaim goal, we come back and override
- * the best-effort low protection. However, we still
- * ideally want to honor how well-behaved groups are in
- * that case instead of simply punishing them all
- * equally. As such, we reclaim them based on how much
- * memory they are using, reducing the scan pressure
- * again by how much of the total memory used is under
- * hard protection.
- */
- unsigned long cgroup_size = mem_cgroup_size(memcg);
- unsigned long protection;
-
- /* memory.low scaling, make sure we retry before OOM */
- if (!sc->memcg_low_reclaim && low > min) {
- protection = low;
- sc->memcg_low_skipped = 1;
- } else {
- protection = min;
- }
-
- /* Avoid TOCTOU with earlier protection check */
- cgroup_size = max(cgroup_size, protection);
-
- scan = lruvec_size - lruvec_size * protection /
- (cgroup_size + 1);
-
- /*
- * Minimally target SWAP_CLUSTER_MAX pages to keep
- * reclaim moving forwards, avoiding decrementing
- * sc->priority further than desirable.
- */
- scan = max(scan, SWAP_CLUSTER_MAX);
- } else {
- scan = lruvec_size;
- }
-
+ scan = apply_proportional_protection(memcg, sc, lruvec_size);
scan >>= sc->priority;
/*
@@ -2646,7 +2673,7 @@ out:
* Anonymous LRU management is a waste if there is
* ultimately no way to reclaim the memory.
*/
-static bool can_age_anon_pages(struct pglist_data *pgdat,
+static bool can_age_anon_pages(struct lruvec *lruvec,
struct scan_control *sc)
{
/* Aging the anon LRU is valuable if swap is present: */
@@ -2654,7 +2681,8 @@ static bool can_age_anon_pages(struct pglist_data *pgdat,
return true;
/* Also valuable if anon pages can be demoted: */
- return can_demote(pgdat->node_id, sc);
+ return can_demote(lruvec_pgdat(lruvec)->node_id, sc,
+ lruvec_memcg(lruvec));
}
#ifdef CONFIG_LRU_GEN
@@ -2690,8 +2718,12 @@ static bool should_clear_pmd_young(void)
READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \
}
+/* Get the min/max evictable type based on swappiness */
+#define min_type(swappiness) (!(swappiness))
+#define max_type(swappiness) ((swappiness) < SWAPPINESS_ANON_ONLY)
+
#define evictable_min_seq(min_seq, swappiness) \
- min((min_seq)[!(swappiness)], (min_seq)[(swappiness) <= MAX_SWAPPINESS])
+ min((min_seq)[min_type(swappiness)], (min_seq)[max_type(swappiness)])
#define for_each_gen_type_zone(gen, type, zone) \
for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
@@ -2699,7 +2731,7 @@ static bool should_clear_pmd_young(void)
for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
#define for_each_evictable_type(type, swappiness) \
- for ((type) = !(swappiness); (type) <= ((swappiness) <= MAX_SWAPPINESS); (type)++)
+ for ((type) = min_type(swappiness); (type) <= max_type(swappiness); (type)++)
#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)
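
The old upper bound (swappiness) <= MAX_SWAPPINESS and the new max_type() agree only if SWAPPINESS_ANON_ONLY equals MAX_SWAPPINESS + 1, which is how the swappiness constants are assumed to be defined here. A rough self-check of which LRU types each swappiness value leaves evictable under that assumption (anon type 0, file type 1):

#include <assert.h>

#define MAX_SWAPPINESS		200
#define SWAPPINESS_ANON_ONLY	(MAX_SWAPPINESS + 1)
#define min_type(s)		(!(s))
#define max_type(s)		((s) < SWAPPINESS_ANON_ONLY)

int main(void)
{
	/* swappiness == 0: file only (type 1..1) */
	assert(min_type(0) == 1 && max_type(0) == 1);
	/* 1..200: both anon and file (type 0..1) */
	assert(min_type(100) == 0 && max_type(100) == 1);
	/* "max" (201): anon only (type 0..0) */
	assert(min_type(SWAPPINESS_ANON_ONLY) == 0 &&
	       max_type(SWAPPINESS_ANON_ONLY) == 0);
	return 0;
}

Swappiness 0 keeps reclaim file-only, 1..200 scans both types, and the new "max" value narrows it to anonymous folios, which is what the SWAPPINESS_ANON_ONLY branches elsewhere in this patch rely on.
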
@@ -2732,7 +2764,7 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
if (!sc->may_swap)
return 0;
- if (!can_demote(pgdat->node_id, sc) &&
+ if (!can_demote(pgdat->node_id, sc, memcg) &&
mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
return 0;
@@ -3226,13 +3258,13 @@ static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
/* promote pages accessed through page tables */
static int folio_update_gen(struct folio *folio, int gen)
{
- unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f);
VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
/* see the comment on LRU_REFS_FLAGS */
if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
- set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
+ set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced));
return -1;
}
@@ -3243,7 +3275,7 @@ static int folio_update_gen(struct folio *folio, int gen)
new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS);
new_flags |= ((gen + 1UL) << LRU_GEN_PGOFF) | BIT(PG_workingset);
- } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+ } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags));
return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
}
@@ -3254,7 +3286,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
int type = folio_is_file_lru(folio);
struct lru_gen_folio *lrugen = &lruvec->lrugen;
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
- unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f);
VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio);
@@ -3271,7 +3303,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
/* for folio_end_writeback() */
if (reclaiming)
new_flags |= BIT(PG_reclaim);
- } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+ } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags));
lru_gen_update_size(lruvec, folio, old_gen, new_gen);
@@ -3401,7 +3433,7 @@ static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned
if (!pte_present(pte) || is_zero_pfn(pfn))
return -1;
- if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
+ if (WARN_ON_ONCE(pte_special(pte)))
return -1;
if (!pte_young(pte) && !mm_has_notifiers(vma->vm_mm))
@@ -3426,9 +3458,6 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned
if (!pmd_present(pmd) || is_huge_zero_pmd(pmd))
return -1;
- if (WARN_ON_ONCE(pmd_devmap(pmd)))
- return -1;
-
if (!pmd_young(pmd) && !mm_has_notifiers(vma->vm_mm))
return -1;
@@ -3850,7 +3879,12 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness)
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
- if (type ? swappiness > MAX_SWAPPINESS : !swappiness)
+ /* For file type, skip the check if swappiness is anon only */
+ if (type && (swappiness == SWAPPINESS_ANON_ONLY))
+ goto done;
+
+ /* For anon type, skip the check if swappiness is zero (file only) */
+ if (!type && !swappiness)
goto done;
/* prevent cold/hot inversion if the type is evictable */
@@ -3894,6 +3928,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness)
{
int gen, type, zone;
bool success = false;
+ bool seq_inc_flag = false;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
DEFINE_MIN_SEQ(lruvec);
@@ -3910,11 +3945,20 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness)
}
min_seq[type]++;
+ seq_inc_flag = true;
}
next:
;
}
+ /*
+ * If neither the anonymous nor the file min_seq was increased,
+ * we can return false right away and avoid unnecessary checking
+ * overhead later.
+ */
+ if (!seq_inc_flag)
+ return success;
+
/* see the comment on lru_gen_folio */
if (swappiness && swappiness <= MAX_SWAPPINESS) {
unsigned long seq = lrugen->max_seq - MIN_NR_GENS;
@@ -4464,7 +4508,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
}
/* ineligible */
- if (!folio_test_lru(folio) || zone > sc->reclaim_idx) {
+ if (zone > sc->reclaim_idx) {
gen = folio_inc_gen(lruvec, folio, false);
list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;
@@ -4510,7 +4554,7 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
/* see the comment on LRU_REFS_FLAGS */
if (!folio_test_referenced(folio))
- set_mask_bits(&folio->flags, LRU_REFS_MASK, 0);
+ set_mask_bits(&folio->flags.f, LRU_REFS_MASK, 0);
/* for shrink_folio_list() */
folio_clear_reclaim(folio);
@@ -4521,8 +4565,9 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
return true;
}
-static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
- int type, int tier, struct list_head *list)
+static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
+ struct scan_control *sc, int type, int tier,
+ struct list_head *list)
{
int i;
int gen;
@@ -4531,7 +4576,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
int scanned = 0;
int isolated = 0;
int skipped = 0;
- int remaining = MAX_LRU_BATCH;
+ int remaining = min(nr_to_scan, MAX_LRU_BATCH);
struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
@@ -4588,8 +4633,8 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
__count_vm_events(item, isolated);
__count_vm_events(PGREFILL, sorted);
}
- __count_memcg_events(memcg, item, isolated);
- __count_memcg_events(memcg, PGREFILL, sorted);
+ count_memcg_events(memcg, item, isolated);
+ count_memcg_events(memcg, PGREFILL, sorted);
__count_vm_events(PGSCAN_ANON + type, isolated);
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, MAX_LRU_BATCH,
scanned, skipped, isolated,
@@ -4642,7 +4687,8 @@ static int get_type_to_scan(struct lruvec *lruvec, int swappiness)
return positive_ctrl_err(&sp, &pv);
}
-static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+static int isolate_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
+ struct scan_control *sc, int swappiness,
int *type_scanned, struct list_head *list)
{
int i;
@@ -4654,7 +4700,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
*type_scanned = type;
- scanned = scan_folios(lruvec, sc, type, tier, list);
+ scanned = scan_folios(nr_to_scan, lruvec, sc, type, tier, list);
if (scanned)
return scanned;
@@ -4664,7 +4710,8 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
return 0;
}
-static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
+static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
+ struct scan_control *sc, int swappiness)
{
int type;
int scanned;
@@ -4683,7 +4730,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
spin_lock_irq(&lruvec->lru_lock);
- scanned = isolate_folios(lruvec, sc, swappiness, &type, &list);
+ scanned = isolate_folios(nr_to_scan, lruvec, sc, swappiness, &type, &list);
scanned += try_to_inc_min_seq(lruvec, swappiness);
@@ -4695,7 +4742,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
if (list_empty(&list))
return scanned;
retry:
- reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
+ reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false, memcg);
sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
sc->nr_reclaimed += reclaimed;
trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
@@ -4720,7 +4767,7 @@ retry:
/* don't add rejected folios to the oldest generation */
if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type])
- set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active));
+ set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_active));
}
spin_lock_irq(&lruvec->lru_lock);
@@ -4739,7 +4786,7 @@ retry:
item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc))
__count_vm_events(item, reclaimed);
- __count_memcg_events(memcg, item, reclaimed);
+ count_memcg_events(memcg, item, reclaimed);
__count_vm_events(PGSTEAL_ANON + type, reclaimed);
spin_unlock_irq(&lruvec->lru_lock);
@@ -4804,6 +4851,8 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int s
if (nr_to_scan && !mem_cgroup_online(memcg))
return nr_to_scan;
+ nr_to_scan = apply_proportional_protection(memcg, sc, nr_to_scan);
+
/* try to get away with not aging at the default priority */
if (!success || sc->priority == DEF_PRIORITY)
return nr_to_scan >> sc->priority;
@@ -4856,7 +4905,7 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
if (nr_to_scan <= 0)
break;
- delta = evict_folios(lruvec, sc, swappiness);
+ delta = evict_folios(nr_to_scan, lruvec, sc, swappiness);
if (!delta)
break;
@@ -5052,7 +5101,7 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *
blk_finish_plug(&plug);
done:
if (sc->nr_reclaimed > reclaimed)
- pgdat->kswapd_failures = 0;
+ atomic_set(&pgdat->kswapd_failures, 0);
}
/******************************************************************************
@@ -5387,7 +5436,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
static int lru_gen_seq_show(struct seq_file *m, void *v)
{
unsigned long seq;
- bool full = !debugfs_real_fops(m->file)->write;
+ bool full = debugfs_get_aux_num(m->file);
struct lruvec *lruvec = v;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
int nid = lruvec_pgdat(lruvec)->node_id;
@@ -5477,7 +5526,8 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co
if (sc->nr_reclaimed >= nr_to_reclaim)
return 0;
- if (!evict_folios(lruvec, sc, swappiness))
+ if (!evict_folios(nr_to_reclaim - sc->nr_reclaimed, lruvec, sc,
+ swappiness))
return 0;
cond_resched();
@@ -5512,11 +5562,12 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
if (memcg_id != mem_cgroup_id(memcg))
goto done;
+ sc->target_mem_cgroup = memcg;
lruvec = get_lruvec(memcg, nid);
if (swappiness < MIN_SWAPPINESS)
swappiness = get_swappiness(lruvec, sc);
- else if (swappiness > MAX_SWAPPINESS + 1)
+ else if (swappiness > SWAPPINESS_ANON_ONLY)
goto done;
switch (cmd) {
@@ -5548,6 +5599,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
.may_swap = true,
.reclaim_idx = MAX_NR_ZONES - 1,
.gfp_mask = GFP_KERNEL,
+ .proactive = true,
};
buf = kvmalloc(len + 1, GFP_KERNEL);
@@ -5573,24 +5625,35 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
while ((cur = strsep(&next, ",;\n"))) {
int n;
int end;
- char cmd;
+ char cmd, swap_string[5];
unsigned int memcg_id;
unsigned int nid;
unsigned long seq;
- unsigned int swappiness = -1;
+ unsigned int swappiness;
unsigned long opt = -1;
cur = skip_spaces(cur);
if (!*cur)
continue;
- n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
- &seq, &end, &swappiness, &end, &opt, &end);
+ n = sscanf(cur, "%c %u %u %lu %n %4s %n %lu %n", &cmd, &memcg_id, &nid,
+ &seq, &end, swap_string, &end, &opt, &end);
if (n < 4 || cur[end]) {
err = -EINVAL;
break;
}
+ if (n == 4) {
+ swappiness = -1;
+ } else if (!strcmp("max", swap_string)) {
+ /* set by userspace for anonymous memory only */
+ swappiness = SWAPPINESS_ANON_ONLY;
+ } else {
+ err = kstrtouint(swap_string, 0, &swappiness);
+ if (err)
+ break;
+ }
+
err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt);
if (err)
break;
@@ -5712,8 +5775,10 @@ static int __init init_lru_gen(void)
if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
pr_err("lru_gen: failed to create sysfs group\n");
- debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
- debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
+ debugfs_create_file_aux_num("lru_gen", 0644, NULL, NULL, false,
+ &lru_gen_rw_fops);
+ debugfs_create_file_aux_num("lru_gen_full", 0444, NULL, NULL, true,
+ &lru_gen_ro_fops);
return 0;
};
@@ -5850,7 +5915,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
+ if (can_age_anon_pages(lruvec, sc) &&
inactive_is_low(lruvec, LRU_INACTIVE_ANON))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
@@ -6115,7 +6180,7 @@ again:
* successful direct reclaim run will revive a dormant kswapd.
*/
if (reclaimable)
- pgdat->kswapd_failures = 0;
+ atomic_set(&pgdat->kswapd_failures, 0);
else if (sc->cache_trim_mode)
sc->cache_trim_mode_failed = 1;
}
@@ -6427,11 +6492,11 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
int i;
bool wmark_ok;
- if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+ if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
return true;
for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) {
- if (!zone_reclaimable_pages(zone))
+ if (!zone_reclaimable_pages(zone) && zone_page_state_snapshot(zone, NR_FREE_PAGES))
continue;
pfmemalloc_reserve += min_wmark_pages(zone);
@@ -6669,6 +6734,15 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
return nr_reclaimed;
}
+#else
+unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
+ unsigned long nr_pages,
+ gfp_t gfp_mask,
+ unsigned int reclaim_options,
+ int *swappiness)
+{
+ return 0;
+}
#endif
static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
@@ -6681,10 +6755,10 @@ static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
return;
}
- if (!can_age_anon_pages(pgdat, sc))
+ lruvec = mem_cgroup_lruvec(NULL, pgdat);
+ if (!can_age_anon_pages(lruvec, sc))
return;
- lruvec = mem_cgroup_lruvec(NULL, pgdat);
if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
return;
@@ -6828,7 +6902,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
wake_up_all(&pgdat->pfmemalloc_wait);
/* Hopeless node, leave it to direct reclaim */
- if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+ if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
return true;
if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
@@ -7096,7 +7170,7 @@ restart:
}
if (!sc.nr_reclaimed)
- pgdat->kswapd_failures++;
+ atomic_inc(&pgdat->kswapd_failures);
out:
clear_reclaim_active(pgdat, highest_zoneidx);
@@ -7355,7 +7429,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
return;
/* Hopeless node, leave it to direct reclaim if possible */
- if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
+ if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES ||
(pgdat_balanced(pgdat, order, highest_zoneidx) &&
!pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
/*
@@ -7563,36 +7637,26 @@ static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
/*
* Try to free up some pages from this node through reclaim.
*/
-static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
+static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask,
+ unsigned long nr_pages,
+ struct scan_control *sc)
{
- /* Minimum pages needed in order to stay on node */
- const unsigned long nr_pages = 1 << order;
struct task_struct *p = current;
unsigned int noreclaim_flag;
- struct scan_control sc = {
- .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
- .gfp_mask = current_gfp_context(gfp_mask),
- .order = order,
- .priority = NODE_RECLAIM_PRIORITY,
- .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
- .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
- .may_swap = 1,
- .reclaim_idx = gfp_zone(gfp_mask),
- };
unsigned long pflags;
- trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
- sc.gfp_mask);
+ trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, sc->order,
+ sc->gfp_mask);
cond_resched();
psi_memstall_enter(&pflags);
delayacct_freepages_start();
- fs_reclaim_acquire(sc.gfp_mask);
+ fs_reclaim_acquire(sc->gfp_mask);
/*
* We need to be able to allocate from the reserves for RECLAIM_UNMAP
*/
noreclaim_flag = memalloc_noreclaim_save();
- set_task_reclaim_state(p, &sc.reclaim_state);
+ set_task_reclaim_state(p, &sc->reclaim_state);
if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages ||
node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) {
@@ -7601,24 +7665,36 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
* priorities until we have enough memory freed.
*/
do {
- shrink_node(pgdat, &sc);
- } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
+ shrink_node(pgdat, sc);
+ } while (sc->nr_reclaimed < nr_pages && --sc->priority >= 0);
}
set_task_reclaim_state(p, NULL);
memalloc_noreclaim_restore(noreclaim_flag);
- fs_reclaim_release(sc.gfp_mask);
- psi_memstall_leave(&pflags);
+ fs_reclaim_release(sc->gfp_mask);
delayacct_freepages_end();
+ psi_memstall_leave(&pflags);
- trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
+ trace_mm_vmscan_node_reclaim_end(sc->nr_reclaimed);
- return sc.nr_reclaimed >= nr_pages;
+ return sc->nr_reclaimed;
}
int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
int ret;
+ /* Minimum pages needed in order to stay on node */
+ const unsigned long nr_pages = 1 << order;
+ struct scan_control sc = {
+ .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
+ .gfp_mask = current_gfp_context(gfp_mask),
+ .order = order,
+ .priority = NODE_RECLAIM_PRIORITY,
+ .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
+ .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
+ .may_swap = 1,
+ .reclaim_idx = gfp_zone(gfp_mask),
+ };
/*
* Node reclaim reclaims unmapped file backed pages and
@@ -7653,7 +7729,7 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
return NODE_RECLAIM_NOSCAN;
- ret = __node_reclaim(pgdat, gfp_mask, order);
+ ret = __node_reclaim(pgdat, gfp_mask, nr_pages, &sc) >= nr_pages;
clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
if (ret)
@@ -7663,6 +7739,114 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
return ret;
}
+
+enum {
+ MEMORY_RECLAIM_SWAPPINESS = 0,
+ MEMORY_RECLAIM_SWAPPINESS_MAX,
+ MEMORY_RECLAIM_NULL,
+};
+static const match_table_t tokens = {
+ { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"},
+ { MEMORY_RECLAIM_SWAPPINESS_MAX, "swappiness=max"},
+ { MEMORY_RECLAIM_NULL, NULL },
+};
+
+int user_proactive_reclaim(char *buf,
+ struct mem_cgroup *memcg, pg_data_t *pgdat)
+{
+ unsigned int nr_retries = MAX_RECLAIM_RETRIES;
+ unsigned long nr_to_reclaim, nr_reclaimed = 0;
+ int swappiness = -1;
+ char *old_buf, *start;
+ substring_t args[MAX_OPT_ARGS];
+ gfp_t gfp_mask = GFP_KERNEL;
+
+ if (!buf || (!memcg && !pgdat) || (memcg && pgdat))
+ return -EINVAL;
+
+ buf = strstrip(buf);
+
+ old_buf = buf;
+ nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE;
+ if (buf == old_buf)
+ return -EINVAL;
+
+ buf = strstrip(buf);
+
+ while ((start = strsep(&buf, " ")) != NULL) {
+ if (!strlen(start))
+ continue;
+ switch (match_token(start, tokens, args)) {
+ case MEMORY_RECLAIM_SWAPPINESS:
+ if (match_int(&args[0], &swappiness))
+ return -EINVAL;
+ if (swappiness < MIN_SWAPPINESS ||
+ swappiness > MAX_SWAPPINESS)
+ return -EINVAL;
+ break;
+ case MEMORY_RECLAIM_SWAPPINESS_MAX:
+ swappiness = SWAPPINESS_ANON_ONLY;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ while (nr_reclaimed < nr_to_reclaim) {
+ /* Will converge on zero, but reclaim enforces a minimum */
+ unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4;
+ unsigned long reclaimed;
+
+ if (signal_pending(current))
+ return -EINTR;
+
+ /*
+ * This is the final attempt, drain percpu lru caches in the
+ * hope of introducing more evictable pages.
+ */
+ if (!nr_retries)
+ lru_add_drain_all();
+
+ if (memcg) {
+ unsigned int reclaim_options;
+
+ reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
+ MEMCG_RECLAIM_PROACTIVE;
+ reclaimed = try_to_free_mem_cgroup_pages(memcg,
+ batch_size, gfp_mask,
+ reclaim_options,
+ swappiness == -1 ? NULL : &swappiness);
+ } else {
+ struct scan_control sc = {
+ .gfp_mask = current_gfp_context(gfp_mask),
+ .reclaim_idx = gfp_zone(gfp_mask),
+ .proactive_swappiness = swappiness == -1 ? NULL : &swappiness,
+ .priority = DEF_PRIORITY,
+ .may_writepage = !laptop_mode,
+ .nr_to_reclaim = max(batch_size, SWAP_CLUSTER_MAX),
+ .may_unmap = 1,
+ .may_swap = 1,
+ .proactive = 1,
+ };
+
+ if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED,
+ &pgdat->flags))
+ return -EBUSY;
+
+ reclaimed = __node_reclaim(pgdat, gfp_mask,
+ batch_size, &sc);
+ clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
+ }
+
+ if (!reclaimed && !nr_retries--)
+ return -EAGAIN;
+
+ nr_reclaimed += reclaimed;
+ }
+
+ return 0;
+}
+
#endif
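
The loop in user_proactive_reclaim() requests a quarter of whatever is still outstanding on each pass, so the batch size shrinks geometrically while the node path rounds tiny requests up via max(batch_size, SWAP_CLUSTER_MAX). A toy simulation of that sizing, assuming, purely for illustration, that every request is fully satisfied:

#include <stdio.h>

#define SWAP_CLUSTER_MAX	32UL

int main(void)
{
	unsigned long nr_to_reclaim = 4096, nr_reclaimed = 0;

	while (nr_reclaimed < nr_to_reclaim) {
		unsigned long batch = (nr_to_reclaim - nr_reclaimed) / 4;

		/* the node path bumps tiny batches up to SWAP_CLUSTER_MAX */
		if (batch < SWAP_CLUSTER_MAX)
			batch = SWAP_CLUSTER_MAX;

		/* pretend reclaim fully satisfied the request */
		nr_reclaimed += batch;
		printf("asked %lu, total %lu\n", batch, nr_reclaimed);
	}
	return 0;
}

The real loop additionally gives up with -EAGAIN once MAX_RECLAIM_RETRIES consecutive passes reclaim nothing, and with -EINTR if a signal is pending, so it does not spin forever when memory cannot be freed.
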
/**
@@ -7710,3 +7894,26 @@ void check_move_unevictable_folios(struct folio_batch *fbatch)
}
}
EXPORT_SYMBOL_GPL(check_move_unevictable_folios);
+
+#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
+static ssize_t reclaim_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ int ret, nid = dev->id;
+
+ ret = user_proactive_reclaim((char *)buf, NULL, NODE_DATA(nid));
+ return ret ? -EAGAIN : count;
+}
+
+static DEVICE_ATTR_WO(reclaim);
+int reclaim_register_node(struct node *node)
+{
+ return device_create_file(&node->dev, &dev_attr_reclaim);
+}
+
+void reclaim_unregister_node(struct node *node)
+{
+ return device_remove_file(&node->dev, &dev_attr_reclaim);
+}
+#endif
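
Together with user_proactive_reclaim() above, this write-only attribute gives each NUMA node a proactive reclaim knob similar to memory.reclaim. A hypothetical user-space sketch, with the path derived from the standard node device layout and the request string following the token parsing above:

/* typically requires root */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* ask node 0 to reclaim 512M, swapping anonymous memory only */
	const char *req = "512M swappiness=max";
	int fd = open("/sys/devices/system/node/node0/reclaim", O_WRONLY);

	if (fd < 0 || write(fd, req, strlen(req)) < 0)
		perror("node reclaim");
	if (fd >= 0)
		close(fd);
	return 0;
}

An equivalent shell one-liner writing the same string into the attribute works just as well; reclaim_store() folds any failure into -EAGAIN.
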
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4c268ce39ff2..bb09c032eecf 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -7,7 +7,7 @@
*
* zoned VM statistics
* Copyright (C) 2006 Silicon Graphics, Inc.,
- * Christoph Lameter <christoph@lameter.com>
+ * Christoph Lameter <cl@gentwo.org>
* Copyright (C) 2008-2014 Christoph Lameter
*/
#include <linux/fs.h>
@@ -1163,320 +1163,341 @@ int fragmentation_index(struct zone *zone, unsigned int order)
#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \
defined(CONFIG_NUMA) || defined(CONFIG_MEMCG)
#ifdef CONFIG_ZONE_DMA
-#define TEXT_FOR_DMA(xx) xx "_dma",
+#define TEXT_FOR_DMA(xx, yy) [xx##_DMA] = yy "_dma",
#else
-#define TEXT_FOR_DMA(xx)
+#define TEXT_FOR_DMA(xx, yy)
#endif
#ifdef CONFIG_ZONE_DMA32
-#define TEXT_FOR_DMA32(xx) xx "_dma32",
+#define TEXT_FOR_DMA32(xx, yy) [xx##_DMA32] = yy "_dma32",
#else
-#define TEXT_FOR_DMA32(xx)
+#define TEXT_FOR_DMA32(xx, yy)
#endif
#ifdef CONFIG_HIGHMEM
-#define TEXT_FOR_HIGHMEM(xx) xx "_high",
+#define TEXT_FOR_HIGHMEM(xx, yy) [xx##_HIGH] = yy "_high",
#else
-#define TEXT_FOR_HIGHMEM(xx)
+#define TEXT_FOR_HIGHMEM(xx, yy)
#endif
#ifdef CONFIG_ZONE_DEVICE
-#define TEXT_FOR_DEVICE(xx) xx "_device",
+#define TEXT_FOR_DEVICE(xx, yy) [xx##_DEVICE] = yy "_device",
#else
-#define TEXT_FOR_DEVICE(xx)
+#define TEXT_FOR_DEVICE(xx, yy)
#endif
-#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
- TEXT_FOR_HIGHMEM(xx) xx "_movable", \
- TEXT_FOR_DEVICE(xx)
+#define TEXTS_FOR_ZONES(xx, yy) \
+ TEXT_FOR_DMA(xx, yy) \
+ TEXT_FOR_DMA32(xx, yy) \
+ [xx##_NORMAL] = yy "_normal", \
+ TEXT_FOR_HIGHMEM(xx, yy) \
+ [xx##_MOVABLE] = yy "_movable", \
+ TEXT_FOR_DEVICE(xx, yy)
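
The remainder of this hunk converts vmstat_text from a positional string list into designated initializers keyed by the item enums, with an I() macro adding the per-block offset. A toy analogue, using a made-up enum rather than the kernel's, shows the property this buys: each name is tied to its enum value, so the array can no longer drift out of sync with the enum order:

#include <stdio.h>

enum demo_stat { DEMO_FREE, DEMO_ACTIVE, DEMO_INACTIVE, NR_DEMO_STATS };

#define DEMO_TEXT(x, name)	[x] = name,

static const char *const demo_text[NR_DEMO_STATS] = {
	DEMO_TEXT(DEMO_INACTIVE, "nr_inactive")	/* order no longer matters */
	DEMO_TEXT(DEMO_FREE, "nr_free")
	DEMO_TEXT(DEMO_ACTIVE, "nr_active")
};

int main(void)
{
	for (int i = 0; i < NR_DEMO_STATS; i++)
		printf("%d: %s\n", i, demo_text[i]);
	return 0;
}

Out-of-order initializers are resolved by index, so demo_text[DEMO_FREE] still prints "nr_free"; the same guarantee is what lets the BUILD_BUG_ON in vmstat_start() later insist on an exact size match.
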
const char * const vmstat_text[] = {
/* enum zone_stat_item counters */
- "nr_free_pages",
- "nr_free_pages_blocks",
- "nr_zone_inactive_anon",
- "nr_zone_active_anon",
- "nr_zone_inactive_file",
- "nr_zone_active_file",
- "nr_zone_unevictable",
- "nr_zone_write_pending",
- "nr_mlock",
- "nr_bounce",
+#define I(x) (x)
+ [I(NR_FREE_PAGES)] = "nr_free_pages",
+ [I(NR_FREE_PAGES_BLOCKS)] = "nr_free_pages_blocks",
+ [I(NR_ZONE_INACTIVE_ANON)] = "nr_zone_inactive_anon",
+ [I(NR_ZONE_ACTIVE_ANON)] = "nr_zone_active_anon",
+ [I(NR_ZONE_INACTIVE_FILE)] = "nr_zone_inactive_file",
+ [I(NR_ZONE_ACTIVE_FILE)] = "nr_zone_active_file",
+ [I(NR_ZONE_UNEVICTABLE)] = "nr_zone_unevictable",
+ [I(NR_ZONE_WRITE_PENDING)] = "nr_zone_write_pending",
+ [I(NR_MLOCK)] = "nr_mlock",
#if IS_ENABLED(CONFIG_ZSMALLOC)
- "nr_zspages",
+ [I(NR_ZSPAGES)] = "nr_zspages",
#endif
- "nr_free_cma",
+ [I(NR_FREE_CMA_PAGES)] = "nr_free_cma",
#ifdef CONFIG_UNACCEPTED_MEMORY
- "nr_unaccepted",
+ [I(NR_UNACCEPTED)] = "nr_unaccepted",
#endif
+#undef I
/* enum numa_stat_item counters */
+#define I(x) (NR_VM_ZONE_STAT_ITEMS + x)
#ifdef CONFIG_NUMA
- "numa_hit",
- "numa_miss",
- "numa_foreign",
- "numa_interleave",
- "numa_local",
- "numa_other",
+ [I(NUMA_HIT)] = "numa_hit",
+ [I(NUMA_MISS)] = "numa_miss",
+ [I(NUMA_FOREIGN)] = "numa_foreign",
+ [I(NUMA_INTERLEAVE_HIT)] = "numa_interleave",
+ [I(NUMA_LOCAL)] = "numa_local",
+ [I(NUMA_OTHER)] = "numa_other",
#endif
+#undef I
/* enum node_stat_item counters */
- "nr_inactive_anon",
- "nr_active_anon",
- "nr_inactive_file",
- "nr_active_file",
- "nr_unevictable",
- "nr_slab_reclaimable",
- "nr_slab_unreclaimable",
- "nr_isolated_anon",
- "nr_isolated_file",
- "workingset_nodes",
- "workingset_refault_anon",
- "workingset_refault_file",
- "workingset_activate_anon",
- "workingset_activate_file",
- "workingset_restore_anon",
- "workingset_restore_file",
- "workingset_nodereclaim",
- "nr_anon_pages",
- "nr_mapped",
- "nr_file_pages",
- "nr_dirty",
- "nr_writeback",
- "nr_writeback_temp",
- "nr_shmem",
- "nr_shmem_hugepages",
- "nr_shmem_pmdmapped",
- "nr_file_hugepages",
- "nr_file_pmdmapped",
- "nr_anon_transparent_hugepages",
- "nr_vmscan_write",
- "nr_vmscan_immediate_reclaim",
- "nr_dirtied",
- "nr_written",
- "nr_throttled_written",
- "nr_kernel_misc_reclaimable",
- "nr_foll_pin_acquired",
- "nr_foll_pin_released",
- "nr_kernel_stack",
+#define I(x) (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + x)
+ [I(NR_INACTIVE_ANON)] = "nr_inactive_anon",
+ [I(NR_ACTIVE_ANON)] = "nr_active_anon",
+ [I(NR_INACTIVE_FILE)] = "nr_inactive_file",
+ [I(NR_ACTIVE_FILE)] = "nr_active_file",
+ [I(NR_UNEVICTABLE)] = "nr_unevictable",
+ [I(NR_SLAB_RECLAIMABLE_B)] = "nr_slab_reclaimable",
+ [I(NR_SLAB_UNRECLAIMABLE_B)] = "nr_slab_unreclaimable",
+ [I(NR_ISOLATED_ANON)] = "nr_isolated_anon",
+ [I(NR_ISOLATED_FILE)] = "nr_isolated_file",
+ [I(WORKINGSET_NODES)] = "workingset_nodes",
+ [I(WORKINGSET_REFAULT_ANON)] = "workingset_refault_anon",
+ [I(WORKINGSET_REFAULT_FILE)] = "workingset_refault_file",
+ [I(WORKINGSET_ACTIVATE_ANON)] = "workingset_activate_anon",
+ [I(WORKINGSET_ACTIVATE_FILE)] = "workingset_activate_file",
+ [I(WORKINGSET_RESTORE_ANON)] = "workingset_restore_anon",
+ [I(WORKINGSET_RESTORE_FILE)] = "workingset_restore_file",
+ [I(WORKINGSET_NODERECLAIM)] = "workingset_nodereclaim",
+ [I(NR_ANON_MAPPED)] = "nr_anon_pages",
+ [I(NR_FILE_MAPPED)] = "nr_mapped",
+ [I(NR_FILE_PAGES)] = "nr_file_pages",
+ [I(NR_FILE_DIRTY)] = "nr_dirty",
+ [I(NR_WRITEBACK)] = "nr_writeback",
+ [I(NR_SHMEM)] = "nr_shmem",
+ [I(NR_SHMEM_THPS)] = "nr_shmem_hugepages",
+ [I(NR_SHMEM_PMDMAPPED)] = "nr_shmem_pmdmapped",
+ [I(NR_FILE_THPS)] = "nr_file_hugepages",
+ [I(NR_FILE_PMDMAPPED)] = "nr_file_pmdmapped",
+ [I(NR_ANON_THPS)] = "nr_anon_transparent_hugepages",
+ [I(NR_VMSCAN_WRITE)] = "nr_vmscan_write",
+ [I(NR_VMSCAN_IMMEDIATE)] = "nr_vmscan_immediate_reclaim",
+ [I(NR_DIRTIED)] = "nr_dirtied",
+ [I(NR_WRITTEN)] = "nr_written",
+ [I(NR_THROTTLED_WRITTEN)] = "nr_throttled_written",
+ [I(NR_KERNEL_MISC_RECLAIMABLE)] = "nr_kernel_misc_reclaimable",
+ [I(NR_FOLL_PIN_ACQUIRED)] = "nr_foll_pin_acquired",
+ [I(NR_FOLL_PIN_RELEASED)] = "nr_foll_pin_released",
+ [I(NR_KERNEL_STACK_KB)] = "nr_kernel_stack",
#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
- "nr_shadow_call_stack",
+ [I(NR_KERNEL_SCS_KB)] = "nr_shadow_call_stack",
#endif
- "nr_page_table_pages",
- "nr_sec_page_table_pages",
+ [I(NR_PAGETABLE)] = "nr_page_table_pages",
+ [I(NR_SECONDARY_PAGETABLE)] = "nr_sec_page_table_pages",
#ifdef CONFIG_IOMMU_SUPPORT
- "nr_iommu_pages",
+ [I(NR_IOMMU_PAGES)] = "nr_iommu_pages",
#endif
#ifdef CONFIG_SWAP
- "nr_swapcached",
+ [I(NR_SWAPCACHE)] = "nr_swapcached",
#endif
#ifdef CONFIG_NUMA_BALANCING
- "pgpromote_success",
- "pgpromote_candidate",
+ [I(PGPROMOTE_SUCCESS)] = "pgpromote_success",
+ [I(PGPROMOTE_CANDIDATE)] = "pgpromote_candidate",
+ [I(PGPROMOTE_CANDIDATE_NRL)] = "pgpromote_candidate_nrl",
#endif
- "pgdemote_kswapd",
- "pgdemote_direct",
- "pgdemote_khugepaged",
- "pgdemote_proactive",
+ [I(PGDEMOTE_KSWAPD)] = "pgdemote_kswapd",
+ [I(PGDEMOTE_DIRECT)] = "pgdemote_direct",
+ [I(PGDEMOTE_KHUGEPAGED)] = "pgdemote_khugepaged",
+ [I(PGDEMOTE_PROACTIVE)] = "pgdemote_proactive",
#ifdef CONFIG_HUGETLB_PAGE
- "nr_hugetlb",
+ [I(NR_HUGETLB)] = "nr_hugetlb",
#endif
- "nr_balloon_pages",
- /* system-wide enum vm_stat_item counters */
- "nr_dirty_threshold",
- "nr_dirty_background_threshold",
- "nr_memmap_pages",
- "nr_memmap_boot_pages",
+ [I(NR_BALLOON_PAGES)] = "nr_balloon_pages",
+ [I(NR_KERNEL_FILE_PAGES)] = "nr_kernel_file_pages",
+#undef I
-#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
+ /* system-wide enum vm_stat_item counters */
+#define I(x) (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + \
+ NR_VM_NODE_STAT_ITEMS + x)
+ [I(NR_DIRTY_THRESHOLD)] = "nr_dirty_threshold",
+ [I(NR_DIRTY_BG_THRESHOLD)] = "nr_dirty_background_threshold",
+ [I(NR_MEMMAP_PAGES)] = "nr_memmap_pages",
+ [I(NR_MEMMAP_BOOT_PAGES)] = "nr_memmap_boot_pages",
+#undef I
+
+#if defined(CONFIG_VM_EVENT_COUNTERS)
/* enum vm_event_item counters */
- "pgpgin",
- "pgpgout",
- "pswpin",
- "pswpout",
-
- TEXTS_FOR_ZONES("pgalloc")
- TEXTS_FOR_ZONES("allocstall")
- TEXTS_FOR_ZONES("pgskip")
-
- "pgfree",
- "pgactivate",
- "pgdeactivate",
- "pglazyfree",
-
- "pgfault",
- "pgmajfault",
- "pglazyfreed",
-
- "pgrefill",
- "pgreuse",
- "pgsteal_kswapd",
- "pgsteal_direct",
- "pgsteal_khugepaged",
- "pgsteal_proactive",
- "pgscan_kswapd",
- "pgscan_direct",
- "pgscan_khugepaged",
- "pgscan_proactive",
- "pgscan_direct_throttle",
- "pgscan_anon",
- "pgscan_file",
- "pgsteal_anon",
- "pgsteal_file",
+#define I(x) (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + \
+ NR_VM_NODE_STAT_ITEMS + NR_VM_STAT_ITEMS + x)
+
+ [I(PGPGIN)] = "pgpgin",
+ [I(PGPGOUT)] = "pgpgout",
+ [I(PSWPIN)] = "pswpin",
+ [I(PSWPOUT)] = "pswpout",
+
+#define OFF (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + \
+ NR_VM_NODE_STAT_ITEMS + NR_VM_STAT_ITEMS)
+ TEXTS_FOR_ZONES(OFF+PGALLOC, "pgalloc")
+ TEXTS_FOR_ZONES(OFF+ALLOCSTALL, "allocstall")
+ TEXTS_FOR_ZONES(OFF+PGSCAN_SKIP, "pgskip")
+#undef OFF
+
+ [I(PGFREE)] = "pgfree",
+ [I(PGACTIVATE)] = "pgactivate",
+ [I(PGDEACTIVATE)] = "pgdeactivate",
+ [I(PGLAZYFREE)] = "pglazyfree",
+
+ [I(PGFAULT)] = "pgfault",
+ [I(PGMAJFAULT)] = "pgmajfault",
+ [I(PGLAZYFREED)] = "pglazyfreed",
+
+ [I(PGREFILL)] = "pgrefill",
+ [I(PGREUSE)] = "pgreuse",
+ [I(PGSTEAL_KSWAPD)] = "pgsteal_kswapd",
+ [I(PGSTEAL_DIRECT)] = "pgsteal_direct",
+ [I(PGSTEAL_KHUGEPAGED)] = "pgsteal_khugepaged",
+ [I(PGSTEAL_PROACTIVE)] = "pgsteal_proactive",
+ [I(PGSCAN_KSWAPD)] = "pgscan_kswapd",
+ [I(PGSCAN_DIRECT)] = "pgscan_direct",
+ [I(PGSCAN_KHUGEPAGED)] = "pgscan_khugepaged",
+ [I(PGSCAN_PROACTIVE)] = "pgscan_proactive",
+ [I(PGSCAN_DIRECT_THROTTLE)] = "pgscan_direct_throttle",
+ [I(PGSCAN_ANON)] = "pgscan_anon",
+ [I(PGSCAN_FILE)] = "pgscan_file",
+ [I(PGSTEAL_ANON)] = "pgsteal_anon",
+ [I(PGSTEAL_FILE)] = "pgsteal_file",
#ifdef CONFIG_NUMA
- "zone_reclaim_success",
- "zone_reclaim_failed",
+ [I(PGSCAN_ZONE_RECLAIM_SUCCESS)] = "zone_reclaim_success",
+ [I(PGSCAN_ZONE_RECLAIM_FAILED)] = "zone_reclaim_failed",
#endif
- "pginodesteal",
- "slabs_scanned",
- "kswapd_inodesteal",
- "kswapd_low_wmark_hit_quickly",
- "kswapd_high_wmark_hit_quickly",
- "pageoutrun",
+ [I(PGINODESTEAL)] = "pginodesteal",
+ [I(SLABS_SCANNED)] = "slabs_scanned",
+ [I(KSWAPD_INODESTEAL)] = "kswapd_inodesteal",
+ [I(KSWAPD_LOW_WMARK_HIT_QUICKLY)] = "kswapd_low_wmark_hit_quickly",
+ [I(KSWAPD_HIGH_WMARK_HIT_QUICKLY)] = "kswapd_high_wmark_hit_quickly",
+ [I(PAGEOUTRUN)] = "pageoutrun",
- "pgrotated",
+ [I(PGROTATED)] = "pgrotated",
- "drop_pagecache",
- "drop_slab",
- "oom_kill",
+ [I(DROP_PAGECACHE)] = "drop_pagecache",
+ [I(DROP_SLAB)] = "drop_slab",
+ [I(OOM_KILL)] = "oom_kill",
#ifdef CONFIG_NUMA_BALANCING
- "numa_pte_updates",
- "numa_huge_pte_updates",
- "numa_hint_faults",
- "numa_hint_faults_local",
- "numa_pages_migrated",
+ [I(NUMA_PTE_UPDATES)] = "numa_pte_updates",
+ [I(NUMA_HUGE_PTE_UPDATES)] = "numa_huge_pte_updates",
+ [I(NUMA_HINT_FAULTS)] = "numa_hint_faults",
+ [I(NUMA_HINT_FAULTS_LOCAL)] = "numa_hint_faults_local",
+ [I(NUMA_PAGE_MIGRATE)] = "numa_pages_migrated",
#endif
#ifdef CONFIG_MIGRATION
- "pgmigrate_success",
- "pgmigrate_fail",
- "thp_migration_success",
- "thp_migration_fail",
- "thp_migration_split",
+ [I(PGMIGRATE_SUCCESS)] = "pgmigrate_success",
+ [I(PGMIGRATE_FAIL)] = "pgmigrate_fail",
+ [I(THP_MIGRATION_SUCCESS)] = "thp_migration_success",
+ [I(THP_MIGRATION_FAIL)] = "thp_migration_fail",
+ [I(THP_MIGRATION_SPLIT)] = "thp_migration_split",
#endif
#ifdef CONFIG_COMPACTION
- "compact_migrate_scanned",
- "compact_free_scanned",
- "compact_isolated",
- "compact_stall",
- "compact_fail",
- "compact_success",
- "compact_daemon_wake",
- "compact_daemon_migrate_scanned",
- "compact_daemon_free_scanned",
+ [I(COMPACTMIGRATE_SCANNED)] = "compact_migrate_scanned",
+ [I(COMPACTFREE_SCANNED)] = "compact_free_scanned",
+ [I(COMPACTISOLATED)] = "compact_isolated",
+ [I(COMPACTSTALL)] = "compact_stall",
+ [I(COMPACTFAIL)] = "compact_fail",
+ [I(COMPACTSUCCESS)] = "compact_success",
+ [I(KCOMPACTD_WAKE)] = "compact_daemon_wake",
+ [I(KCOMPACTD_MIGRATE_SCANNED)] = "compact_daemon_migrate_scanned",
+ [I(KCOMPACTD_FREE_SCANNED)] = "compact_daemon_free_scanned",
#endif
#ifdef CONFIG_HUGETLB_PAGE
- "htlb_buddy_alloc_success",
- "htlb_buddy_alloc_fail",
+ [I(HTLB_BUDDY_PGALLOC)] = "htlb_buddy_alloc_success",
+ [I(HTLB_BUDDY_PGALLOC_FAIL)] = "htlb_buddy_alloc_fail",
#endif
#ifdef CONFIG_CMA
- "cma_alloc_success",
- "cma_alloc_fail",
+ [I(CMA_ALLOC_SUCCESS)] = "cma_alloc_success",
+ [I(CMA_ALLOC_FAIL)] = "cma_alloc_fail",
#endif
- "unevictable_pgs_culled",
- "unevictable_pgs_scanned",
- "unevictable_pgs_rescued",
- "unevictable_pgs_mlocked",
- "unevictable_pgs_munlocked",
- "unevictable_pgs_cleared",
- "unevictable_pgs_stranded",
+ [I(UNEVICTABLE_PGCULLED)] = "unevictable_pgs_culled",
+ [I(UNEVICTABLE_PGSCANNED)] = "unevictable_pgs_scanned",
+ [I(UNEVICTABLE_PGRESCUED)] = "unevictable_pgs_rescued",
+ [I(UNEVICTABLE_PGMLOCKED)] = "unevictable_pgs_mlocked",
+ [I(UNEVICTABLE_PGMUNLOCKED)] = "unevictable_pgs_munlocked",
+ [I(UNEVICTABLE_PGCLEARED)] = "unevictable_pgs_cleared",
+ [I(UNEVICTABLE_PGSTRANDED)] = "unevictable_pgs_stranded",
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- "thp_fault_alloc",
- "thp_fault_fallback",
- "thp_fault_fallback_charge",
- "thp_collapse_alloc",
- "thp_collapse_alloc_failed",
- "thp_file_alloc",
- "thp_file_fallback",
- "thp_file_fallback_charge",
- "thp_file_mapped",
- "thp_split_page",
- "thp_split_page_failed",
- "thp_deferred_split_page",
- "thp_underused_split_page",
- "thp_split_pmd",
- "thp_scan_exceed_none_pte",
- "thp_scan_exceed_swap_pte",
- "thp_scan_exceed_share_pte",
+ [I(THP_FAULT_ALLOC)] = "thp_fault_alloc",
+ [I(THP_FAULT_FALLBACK)] = "thp_fault_fallback",
+ [I(THP_FAULT_FALLBACK_CHARGE)] = "thp_fault_fallback_charge",
+ [I(THP_COLLAPSE_ALLOC)] = "thp_collapse_alloc",
+ [I(THP_COLLAPSE_ALLOC_FAILED)] = "thp_collapse_alloc_failed",
+ [I(THP_FILE_ALLOC)] = "thp_file_alloc",
+ [I(THP_FILE_FALLBACK)] = "thp_file_fallback",
+ [I(THP_FILE_FALLBACK_CHARGE)] = "thp_file_fallback_charge",
+ [I(THP_FILE_MAPPED)] = "thp_file_mapped",
+ [I(THP_SPLIT_PAGE)] = "thp_split_page",
+ [I(THP_SPLIT_PAGE_FAILED)] = "thp_split_page_failed",
+ [I(THP_DEFERRED_SPLIT_PAGE)] = "thp_deferred_split_page",
+ [I(THP_UNDERUSED_SPLIT_PAGE)] = "thp_underused_split_page",
+ [I(THP_SPLIT_PMD)] = "thp_split_pmd",
+ [I(THP_SCAN_EXCEED_NONE_PTE)] = "thp_scan_exceed_none_pte",
+ [I(THP_SCAN_EXCEED_SWAP_PTE)] = "thp_scan_exceed_swap_pte",
+ [I(THP_SCAN_EXCEED_SHARED_PTE)] = "thp_scan_exceed_share_pte",
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
- "thp_split_pud",
+ [I(THP_SPLIT_PUD)] = "thp_split_pud",
#endif
- "thp_zero_page_alloc",
- "thp_zero_page_alloc_failed",
- "thp_swpout",
- "thp_swpout_fallback",
+ [I(THP_ZERO_PAGE_ALLOC)] = "thp_zero_page_alloc",
+ [I(THP_ZERO_PAGE_ALLOC_FAILED)] = "thp_zero_page_alloc_failed",
+ [I(THP_SWPOUT)] = "thp_swpout",
+ [I(THP_SWPOUT_FALLBACK)] = "thp_swpout_fallback",
#endif
#ifdef CONFIG_MEMORY_BALLOON
- "balloon_inflate",
- "balloon_deflate",
+ [I(BALLOON_INFLATE)] = "balloon_inflate",
+ [I(BALLOON_DEFLATE)] = "balloon_deflate",
#ifdef CONFIG_BALLOON_COMPACTION
- "balloon_migrate",
+ [I(BALLOON_MIGRATE)] = "balloon_migrate",
#endif
#endif /* CONFIG_MEMORY_BALLOON */
#ifdef CONFIG_DEBUG_TLBFLUSH
- "nr_tlb_remote_flush",
- "nr_tlb_remote_flush_received",
- "nr_tlb_local_flush_all",
- "nr_tlb_local_flush_one",
+ [I(NR_TLB_REMOTE_FLUSH)] = "nr_tlb_remote_flush",
+ [I(NR_TLB_REMOTE_FLUSH_RECEIVED)] = "nr_tlb_remote_flush_received",
+ [I(NR_TLB_LOCAL_FLUSH_ALL)] = "nr_tlb_local_flush_all",
+ [I(NR_TLB_LOCAL_FLUSH_ONE)] = "nr_tlb_local_flush_one",
#endif /* CONFIG_DEBUG_TLBFLUSH */
#ifdef CONFIG_SWAP
- "swap_ra",
- "swap_ra_hit",
- "swpin_zero",
- "swpout_zero",
+ [I(SWAP_RA)] = "swap_ra",
+ [I(SWAP_RA_HIT)] = "swap_ra_hit",
+ [I(SWPIN_ZERO)] = "swpin_zero",
+ [I(SWPOUT_ZERO)] = "swpout_zero",
#ifdef CONFIG_KSM
- "ksm_swpin_copy",
+ [I(KSM_SWPIN_COPY)] = "ksm_swpin_copy",
#endif
#endif
#ifdef CONFIG_KSM
- "cow_ksm",
+ [I(COW_KSM)] = "cow_ksm",
#endif
#ifdef CONFIG_ZSWAP
- "zswpin",
- "zswpout",
- "zswpwb",
+ [I(ZSWPIN)] = "zswpin",
+ [I(ZSWPOUT)] = "zswpout",
+ [I(ZSWPWB)] = "zswpwb",
#endif
#ifdef CONFIG_X86
- "direct_map_level2_splits",
- "direct_map_level3_splits",
- "direct_map_level2_collapses",
- "direct_map_level3_collapses",
+ [I(DIRECT_MAP_LEVEL2_SPLIT)] = "direct_map_level2_splits",
+ [I(DIRECT_MAP_LEVEL3_SPLIT)] = "direct_map_level3_splits",
+ [I(DIRECT_MAP_LEVEL2_COLLAPSE)] = "direct_map_level2_collapses",
+ [I(DIRECT_MAP_LEVEL3_COLLAPSE)] = "direct_map_level3_collapses",
#endif
#ifdef CONFIG_PER_VMA_LOCK_STATS
- "vma_lock_success",
- "vma_lock_abort",
- "vma_lock_retry",
- "vma_lock_miss",
+ [I(VMA_LOCK_SUCCESS)] = "vma_lock_success",
+ [I(VMA_LOCK_ABORT)] = "vma_lock_abort",
+ [I(VMA_LOCK_RETRY)] = "vma_lock_retry",
+ [I(VMA_LOCK_MISS)] = "vma_lock_miss",
#endif
#ifdef CONFIG_DEBUG_STACK_USAGE
- "kstack_1k",
+ [I(KSTACK_1K)] = "kstack_1k",
#if THREAD_SIZE > 1024
- "kstack_2k",
+ [I(KSTACK_2K)] = "kstack_2k",
#endif
#if THREAD_SIZE > 2048
- "kstack_4k",
+ [I(KSTACK_4K)] = "kstack_4k",
#endif
#if THREAD_SIZE > 4096
- "kstack_8k",
+ [I(KSTACK_8K)] = "kstack_8k",
#endif
#if THREAD_SIZE > 8192
- "kstack_16k",
+ [I(KSTACK_16K)] = "kstack_16k",
#endif
#if THREAD_SIZE > 16384
- "kstack_32k",
+ [I(KSTACK_32K)] = "kstack_32k",
#endif
#if THREAD_SIZE > 32768
- "kstack_64k",
+ [I(KSTACK_64K)] = "kstack_64k",
#endif
#if THREAD_SIZE > 65536
- "kstack_rest",
+ [I(KSTACK_REST)] = "kstack_rest",
#endif
#endif
-#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
+#undef I
+#endif /* CONFIG_VM_EVENT_COUNTERS */
};
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
@@ -1827,7 +1848,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
seq_printf(m,
"\n node_unreclaimable: %u"
"\n start_pfn: %lu",
- pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
+ atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES,
zone->zone_start_pfn);
seq_putc(m, '\n');
}
@@ -1868,7 +1889,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
if (*pos >= NR_VMSTAT_ITEMS)
return NULL;
- BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
+ BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) != NR_VMSTAT_ITEMS);
fold_vm_numa_events();
v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
m->private = v;
diff --git a/mm/workingset.c b/mm/workingset.c
index 4841ae8af411..68a76a91111f 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -318,7 +318,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow)
folio_set_workingset(folio);
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
} else
- set_mask_bits(&folio->flags, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF);
+ set_mask_bits(&folio->flags.f, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF);
unlock:
rcu_read_unlock();
}
@@ -612,7 +612,6 @@ struct list_lru shadow_nodes;
void workingset_update_node(struct xa_node *node)
{
- struct address_space *mapping;
struct page *page = virt_to_page(node);
/*
@@ -623,8 +622,7 @@ void workingset_update_node(struct xa_node *node)
* already where they should be. The list_empty() test is safe
* as node->private_list is protected by the i_pages lock.
*/
- mapping = container_of(node->array, struct address_space, i_pages);
- lockdep_assert_held(&mapping->i_pages.xa_lock);
+ lockdep_assert_held(&node->array->xa_lock);
if (node->count && node->count == node->nr_values) {
if (list_empty(&node->private_list)) {
diff --git a/mm/zpdesc.h b/mm/zpdesc.h
index fa47fece2237..b8258dc78548 100644
--- a/mm/zpdesc.h
+++ b/mm/zpdesc.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* zpdesc.h: zswap.zpool memory descriptor
+/* zpdesc.h: zsmalloc pool memory descriptor
*
* Written by Alex Shi <alexs@kernel.org>
* Hyeonggon Yoo <42.hyeyoo@gmail.com>
@@ -7,15 +7,18 @@
#ifndef __MM_ZPDESC_H__
#define __MM_ZPDESC_H__
+#include <linux/migrate.h>
+#include <linux/pagemap.h>
+
/*
- * struct zpdesc - Memory descriptor for zpool memory.
+ * struct zpdesc - Memory descriptor for zsmalloc pool memory.
* @flags: Page flags, mostly unused by zsmalloc.
* @lru: Indirectly used by page migration.
* @movable_ops: Used by page migration.
- * @next: Next zpdesc in a zspage in zsmalloc zpool.
- * @handle: For huge zspage in zsmalloc zpool.
+ * @next: Next zpdesc in a zspage in zsmalloc pool.
+ * @handle: For huge zspage in zsmalloc pool.
* @zspage: Points to the zspage this zpdesc is a part of.
- * @first_obj_offset: First object offset in zsmalloc zpool.
+ * @first_obj_offset: First object offset in zsmalloc pool.
* @_refcount: The number of references to this zpdesc.
*
* This struct overlays struct page for now. Do not modify without a good
@@ -51,8 +54,8 @@ struct zpdesc {
ZPDESC_MATCH(flags, flags);
ZPDESC_MATCH(lru, lru);
ZPDESC_MATCH(mapping, movable_ops);
-ZPDESC_MATCH(index, next);
-ZPDESC_MATCH(index, handle);
+ZPDESC_MATCH(__folio_index, next);
+ZPDESC_MATCH(__folio_index, handle);
ZPDESC_MATCH(private, zspage);
ZPDESC_MATCH(page_type, first_obj_offset);
ZPDESC_MATCH(_refcount, _refcount);
@@ -76,8 +79,8 @@ static_assert(sizeof(struct zpdesc) <= sizeof(struct page));
* zpdesc_folio - The folio allocated for a zpdesc
* @zp: The zpdesc.
*
- * Zpdescs are descriptors for zpool memory. The zpool memory itself is
- * allocated as folios that contain the zpool objects, and zpdesc uses specific
+ * Zpdescs are descriptors for zsmalloc memory. The memory itself is allocated
+ * as folios that contain the zsmalloc objects, and zpdesc uses specific
* fields in the first struct page of the folio - those fields are now accessed
* by struct zpdesc.
*
@@ -149,10 +152,9 @@ static inline struct zpdesc *pfn_zpdesc(unsigned long pfn)
return page_zpdesc(pfn_to_page(pfn));
}
-static inline void __zpdesc_set_movable(struct zpdesc *zpdesc,
- const struct movable_operations *mops)
+static inline void __zpdesc_set_movable(struct zpdesc *zpdesc)
{
- __SetPageMovable(zpdesc_page(zpdesc), mops);
+ SetPageMovableOps(zpdesc_page(zpdesc));
}
static inline void __zpdesc_set_zsmalloc(struct zpdesc *zpdesc)
@@ -160,16 +162,6 @@ static inline void __zpdesc_set_zsmalloc(struct zpdesc *zpdesc)
__SetPageZsmalloc(zpdesc_page(zpdesc));
}
-static inline void __zpdesc_clear_zsmalloc(struct zpdesc *zpdesc)
-{
- __ClearPageZsmalloc(zpdesc_page(zpdesc));
-}
-
-static inline bool zpdesc_is_isolated(struct zpdesc *zpdesc)
-{
- return PageIsolated(zpdesc_page(zpdesc));
-}
-
static inline struct zone *zpdesc_zone(struct zpdesc *zpdesc)
{
return page_zone(zpdesc_page(zpdesc));
diff --git a/mm/zpool.c b/mm/zpool.c
deleted file mode 100644
index 6d6d88930932..000000000000
--- a/mm/zpool.c
+++ /dev/null
@@ -1,326 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * zpool memory storage api
- *
- * Copyright (C) 2014 Dan Streetman
- *
- * This is a common frontend for memory storage pool implementations.
- * Typically, this is used to store compressed memory.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/list.h>
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/module.h>
-#include <linux/zpool.h>
-
-struct zpool {
- struct zpool_driver *driver;
- void *pool;
-};
-
-static LIST_HEAD(drivers_head);
-static DEFINE_SPINLOCK(drivers_lock);
-
-/**
- * zpool_register_driver() - register a zpool implementation.
- * @driver: driver to register
- */
-void zpool_register_driver(struct zpool_driver *driver)
-{
- spin_lock(&drivers_lock);
- atomic_set(&driver->refcount, 0);
- list_add(&driver->list, &drivers_head);
- spin_unlock(&drivers_lock);
-}
-EXPORT_SYMBOL(zpool_register_driver);
-
-/**
- * zpool_unregister_driver() - unregister a zpool implementation.
- * @driver: driver to unregister.
- *
- * Module usage counting is used to prevent using a driver
- * while/after unloading, so if this is called from module
- * exit function, this should never fail; if called from
- * other than the module exit function, and this returns
- * failure, the driver is in use and must remain available.
- */
-int zpool_unregister_driver(struct zpool_driver *driver)
-{
- int ret = 0, refcount;
-
- spin_lock(&drivers_lock);
- refcount = atomic_read(&driver->refcount);
- WARN_ON(refcount < 0);
- if (refcount > 0)
- ret = -EBUSY;
- else
- list_del(&driver->list);
- spin_unlock(&drivers_lock);
-
- return ret;
-}
-EXPORT_SYMBOL(zpool_unregister_driver);
-
-/* this assumes @type is null-terminated. */
-static struct zpool_driver *zpool_get_driver(const char *type)
-{
- struct zpool_driver *driver;
-
- spin_lock(&drivers_lock);
- list_for_each_entry(driver, &drivers_head, list) {
- if (!strcmp(driver->type, type)) {
- bool got = try_module_get(driver->owner);
-
- if (got)
- atomic_inc(&driver->refcount);
- spin_unlock(&drivers_lock);
- return got ? driver : NULL;
- }
- }
-
- spin_unlock(&drivers_lock);
- return NULL;
-}
-
-static void zpool_put_driver(struct zpool_driver *driver)
-{
- atomic_dec(&driver->refcount);
- module_put(driver->owner);
-}
-
-/**
- * zpool_has_pool() - Check if the pool driver is available
- * @type: The type of the zpool to check (e.g. zsmalloc)
- *
- * This checks if the @type pool driver is available. This will try to load
- * the requested module, if needed, but there is no guarantee the module will
- * still be loaded and available immediately after calling. If this returns
- * true, the caller should assume the pool is available, but must be prepared
- * to handle the @zpool_create_pool() returning failure. However if this
- * returns false, the caller should assume the requested pool type is not
- * available; either the requested pool type module does not exist, or could
- * not be loaded, and calling @zpool_create_pool() with the pool type will
- * fail.
- *
- * The @type string must be null-terminated.
- *
- * Returns: true if @type pool is available, false if not
- */
-bool zpool_has_pool(char *type)
-{
- struct zpool_driver *driver = zpool_get_driver(type);
-
- if (!driver) {
- request_module("zpool-%s", type);
- driver = zpool_get_driver(type);
- }
-
- if (!driver)
- return false;
-
- zpool_put_driver(driver);
- return true;
-}
-EXPORT_SYMBOL(zpool_has_pool);
-
-/**
- * zpool_create_pool() - Create a new zpool
- * @type: The type of the zpool to create (e.g. zsmalloc)
- * @name: The name of the zpool (e.g. zram0, zswap)
- * @gfp: The GFP flags to use when allocating the pool.
- *
- * This creates a new zpool of the specified type. The gfp flags will be
- * used when allocating memory, if the implementation supports it. If the
- * ops param is NULL, then the created zpool will not be evictable.
- *
- * Implementations must guarantee this to be thread-safe.
- *
- * The @type and @name strings must be null-terminated.
- *
- * Returns: New zpool on success, NULL on failure.
- */
-struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp)
-{
- struct zpool_driver *driver;
- struct zpool *zpool;
-
- pr_debug("creating pool type %s\n", type);
-
- driver = zpool_get_driver(type);
-
- if (!driver) {
- request_module("zpool-%s", type);
- driver = zpool_get_driver(type);
- }
-
- if (!driver) {
- pr_err("no driver for type %s\n", type);
- return NULL;
- }
-
- zpool = kmalloc(sizeof(*zpool), gfp);
- if (!zpool) {
- pr_err("couldn't create zpool - out of memory\n");
- zpool_put_driver(driver);
- return NULL;
- }
-
- zpool->driver = driver;
- zpool->pool = driver->create(name, gfp);
-
- if (!zpool->pool) {
- pr_err("couldn't create %s pool\n", type);
- zpool_put_driver(driver);
- kfree(zpool);
- return NULL;
- }
-
- pr_debug("created pool type %s\n", type);
-
- return zpool;
-}
-
-/**
- * zpool_destroy_pool() - Destroy a zpool
- * @zpool: The zpool to destroy.
- *
- * Implementations must guarantee this to be thread-safe,
- * however only when destroying different pools. The same
- * pool should only be destroyed once, and should not be used
- * after it is destroyed.
- *
- * This destroys an existing zpool. The zpool should not be in use.
- */
-void zpool_destroy_pool(struct zpool *zpool)
-{
- pr_debug("destroying pool type %s\n", zpool->driver->type);
-
- zpool->driver->destroy(zpool->pool);
- zpool_put_driver(zpool->driver);
- kfree(zpool);
-}
-
-/**
- * zpool_get_type() - Get the type of the zpool
- * @zpool: The zpool to check
- *
- * This returns the type of the pool.
- *
- * Implementations must guarantee this to be thread-safe.
- *
- * Returns: The type of zpool.
- */
-const char *zpool_get_type(struct zpool *zpool)
-{
- return zpool->driver->type;
-}
-
-/**
- * zpool_malloc() - Allocate memory
- * @zpool: The zpool to allocate from.
- * @size: The amount of memory to allocate.
- * @gfp: The GFP flags to use when allocating memory.
- * @handle: Pointer to the handle to set
- *
- * This allocates the requested amount of memory from the pool.
- * The gfp flags will be used when allocating memory, if the
- * implementation supports it. The provided @handle will be
- * set to the allocated object handle.
- *
- * Implementations must guarantee this to be thread-safe.
- *
- * Returns: 0 on success, negative value on error.
- */
-int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp,
- unsigned long *handle)
-{
- return zpool->driver->malloc(zpool->pool, size, gfp, handle);
-}
-
-/**
- * zpool_free() - Free previously allocated memory
- * @zpool: The zpool that allocated the memory.
- * @handle: The handle to the memory to free.
- *
- * This frees previously allocated memory. This does not guarantee
- * that the pool will actually free memory, only that the memory
- * in the pool will become available for use by the pool.
- *
- * Implementations must guarantee this to be thread-safe,
- * however only when freeing different handles. The same
- * handle should only be freed once, and should not be used
- * after freeing.
- */
-void zpool_free(struct zpool *zpool, unsigned long handle)
-{
- zpool->driver->free(zpool->pool, handle);
-}
-
-/**
- * zpool_obj_read_begin() - Start reading from a previously allocated handle.
- * @zpool: The zpool that the handle was allocated from
- * @handle: The handle to read from
- * @local_copy: A local buffer to use if needed.
- *
- * This starts a read operation of a previously allocated handle. The passed
- * @local_copy buffer may be used if needed by copying the memory into.
- * zpool_obj_read_end() MUST be called after the read is completed to undo any
- * actions taken (e.g. release locks).
- *
- * Returns: A pointer to the handle memory to be read, if @local_copy is used,
- * the returned pointer is @local_copy.
- */
-void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle,
- void *local_copy)
-{
- return zpool->driver->obj_read_begin(zpool->pool, handle, local_copy);
-}
-
-/**
- * zpool_obj_read_end() - Finish reading from a previously allocated handle.
- * @zpool: The zpool that the handle was allocated from
- * @handle: The handle to read from
- * @handle_mem: The pointer returned by zpool_obj_read_begin()
- *
- * Finishes a read operation previously started by zpool_obj_read_begin().
- */
-void zpool_obj_read_end(struct zpool *zpool, unsigned long handle,
- void *handle_mem)
-{
- zpool->driver->obj_read_end(zpool->pool, handle, handle_mem);
-}
-
-/**
- * zpool_obj_write() - Write to a previously allocated handle.
- * @zpool: The zpool that the handle was allocated from
- * @handle: The handle to read from
- * @handle_mem: The memory to copy from into the handle.
- * @mem_len: The length of memory to be written.
- *
- */
-void zpool_obj_write(struct zpool *zpool, unsigned long handle,
- void *handle_mem, size_t mem_len)
-{
- zpool->driver->obj_write(zpool->pool, handle, handle_mem, mem_len);
-}
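
Taken together, the removed accessors form a small handle protocol: zpool_malloc() returns an opaque handle, zpool_obj_write() copies data into it, and reads are bracketed by zpool_obj_read_begin()/zpool_obj_read_end() so the backend can take and drop any mapping or lock. A minimal round-trip sketch (hypothetical caller, not part of this patch; the gfp flags are illustrative and bounce_buf is assumed to be at least len bytes):

static int example_roundtrip(struct zpool *zpool, void *src, void *dst,
			     size_t len, void *bounce_buf)
{
	unsigned long handle;
	void *obj;
	int err;

	err = zpool_malloc(zpool, len, GFP_KERNEL, &handle);
	if (err)
		return err;

	zpool_obj_write(zpool, handle, src, len);

	/* May return the backend's own mapping or bounce_buf. */
	obj = zpool_obj_read_begin(zpool, handle, bounce_buf);
	memcpy(dst, obj, len);
	zpool_obj_read_end(zpool, handle, obj);

	zpool_free(zpool, handle);
	return 0;
}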
-
-/**
- * zpool_get_total_pages() - The total size of the pool
- * @zpool: The zpool to check
- *
- * This returns the total size in pages of the pool.
- *
- * Returns: Total size of the zpool in pages.
- */
-u64 zpool_get_total_pages(struct zpool *zpool)
-{
- return zpool->driver->total_pages(zpool->pool);
-}
-
-MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
-MODULE_DESCRIPTION("Common API for compressed memory storage");
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index d14a7e317ac8..5bf832f9c05c 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -26,29 +26,18 @@
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
-#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/string.h>
#include <linux/slab.h>
-#include <linux/pgtable.h>
-#include <asm/tlbflush.h>
-#include <linux/cpumask.h>
-#include <linux/cpu.h>
-#include <linux/vmalloc.h>
-#include <linux/preempt.h>
#include <linux/spinlock.h>
#include <linux/sprintf.h>
#include <linux/shrinker.h>
#include <linux/types.h>
#include <linux/debugfs.h>
#include <linux/zsmalloc.h>
-#include <linux/zpool.h>
-#include <linux/migrate.h>
-#include <linux/wait.h>
-#include <linux/pagemap.h>
#include <linux/fs.h>
-#include <linux/local_lock.h>
+#include <linux/workqueue.h>
#include "zpdesc.h"
#define ZSPAGE_MAGIC 0x58
@@ -243,9 +232,9 @@ static inline void zpdesc_dec_zone_page_state(struct zpdesc *zpdesc)
dec_zone_page_state(zpdesc_page(zpdesc), NR_ZSPAGES);
}
-static inline struct zpdesc *alloc_zpdesc(gfp_t gfp)
+static inline struct zpdesc *alloc_zpdesc(gfp_t gfp, const int nid)
{
- struct page *page = alloc_page(gfp);
+ struct page *page = alloc_pages_node(nid, gfp, 0);
return page_zpdesc(page);
}
@@ -254,6 +243,7 @@ static inline void free_zpdesc(struct zpdesc *zpdesc)
{
struct page *page = zpdesc_page(zpdesc);
+ /* PageZsmalloc is sticky until the page is freed to the buddy. */
__free_page(page);
}
@@ -442,78 +432,6 @@ static void record_obj(unsigned long handle, unsigned long obj)
*(unsigned long *)handle = obj;
}
-/* zpool driver */
-
-#ifdef CONFIG_ZPOOL
-
-static void *zs_zpool_create(const char *name, gfp_t gfp)
-{
- /*
- * Ignore global gfp flags: zs_malloc() may be invoked from
- * different contexts and its caller must provide a valid
- * gfp mask.
- */
- return zs_create_pool(name);
-}
-
-static void zs_zpool_destroy(void *pool)
-{
- zs_destroy_pool(pool);
-}
-
-static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
- unsigned long *handle)
-{
- *handle = zs_malloc(pool, size, gfp);
-
- if (IS_ERR_VALUE(*handle))
- return PTR_ERR((void *)*handle);
- return 0;
-}
-static void zs_zpool_free(void *pool, unsigned long handle)
-{
- zs_free(pool, handle);
-}
-
-static void *zs_zpool_obj_read_begin(void *pool, unsigned long handle,
- void *local_copy)
-{
- return zs_obj_read_begin(pool, handle, local_copy);
-}
-
-static void zs_zpool_obj_read_end(void *pool, unsigned long handle,
- void *handle_mem)
-{
- zs_obj_read_end(pool, handle, handle_mem);
-}
-
-static void zs_zpool_obj_write(void *pool, unsigned long handle,
- void *handle_mem, size_t mem_len)
-{
- zs_obj_write(pool, handle, handle_mem, mem_len);
-}
-
-static u64 zs_zpool_total_pages(void *pool)
-{
- return zs_get_total_pages(pool);
-}
-
-static struct zpool_driver zs_zpool_driver = {
- .type = "zsmalloc",
- .owner = THIS_MODULE,
- .create = zs_zpool_create,
- .destroy = zs_zpool_destroy,
- .malloc = zs_zpool_malloc,
- .free = zs_zpool_free,
- .obj_read_begin = zs_zpool_obj_read_begin,
- .obj_read_end = zs_zpool_obj_read_end,
- .obj_write = zs_zpool_obj_write,
- .total_pages = zs_zpool_total_pages,
-};
-
-MODULE_ALIAS("zpool-zsmalloc");
-#endif /* CONFIG_ZPOOL */
-
static inline bool __maybe_unused is_first_zpdesc(struct zpdesc *zpdesc)
{
return PagePrivate(zpdesc_page(zpdesc));
@@ -886,11 +804,10 @@ static void reset_zpdesc(struct zpdesc *zpdesc)
{
struct page *page = zpdesc_page(zpdesc);
- __ClearPageMovable(page);
ClearPagePrivate(page);
zpdesc->zspage = NULL;
zpdesc->next = NULL;
- __ClearPageZsmalloc(page);
+ /* PageZsmalloc is sticky until the page is freed to the buddy. */
}
static int trylock_zspage(struct zspage *zspage)
@@ -1043,8 +960,8 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage,
* Allocate a zspage for the given size class
*/
static struct zspage *alloc_zspage(struct zs_pool *pool,
- struct size_class *class,
- gfp_t gfp)
+ struct size_class *class,
+ gfp_t gfp, const int nid)
{
int i;
struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE];
@@ -1053,6 +970,9 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
if (!zspage)
return NULL;
+ if (!IS_ENABLED(CONFIG_COMPACTION))
+ gfp &= ~__GFP_MOVABLE;
+
zspage->magic = ZSPAGE_MAGIC;
zspage->pool = pool;
zspage->class = class->index;
@@ -1061,11 +981,10 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
for (i = 0; i < class->pages_per_zspage; i++) {
struct zpdesc *zpdesc;
- zpdesc = alloc_zpdesc(gfp);
+ zpdesc = alloc_zpdesc(gfp, nid);
if (!zpdesc) {
while (--i >= 0) {
zpdesc_dec_zone_page_state(zpdescs[i]);
- __zpdesc_clear_zsmalloc(zpdescs[i]);
free_zpdesc(zpdescs[i]);
}
cache_free_zspage(pool, zspage);
@@ -1336,12 +1255,14 @@ static unsigned long obj_malloc(struct zs_pool *pool,
* @pool: pool to allocate from
* @size: size of block to allocate
* @gfp: gfp flags when allocating object
+ * @nid: The preferred node id from which to allocate a new zspage (if needed)
*
* On success, handle to the allocated object is returned,
* otherwise an ERR_PTR().
* Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
*/
-unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
+unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp,
+ const int nid)
{
unsigned long handle;
struct size_class *class;
@@ -1376,7 +1297,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
spin_unlock(&class->lock);
- zspage = alloc_zspage(pool, class, gfp);
+ zspage = alloc_zspage(pool, class, gfp, nid);
if (!zspage) {
cache_free_handle(pool, handle);
return (unsigned long)ERR_PTR(-ENOMEM);
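
The new @nid parameter and the ERR_PTR-in-an-unsigned-long return convention are meant to be used together; the zswap hunk further down does exactly this. A minimal caller fragment (hypothetical; pool, dlen and page are assumed locals, and the gfp flags are copied from the zswap_compress() change below):

	unsigned long handle;
	gfp_t gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE;

	/* Prefer backing the zspage with memory local to @page. */
	handle = zs_malloc(pool, dlen, gfp, page_to_nid(page));
	if (IS_ERR_VALUE(handle))
		return PTR_ERR((void *)handle);	/* e.g. -ENOMEM */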
@@ -1694,8 +1615,6 @@ static void lock_zspage(struct zspage *zspage)
#ifdef CONFIG_COMPACTION
-static const struct movable_operations zsmalloc_mops;
-
static void replace_sub_page(struct size_class *class, struct zspage *zspage,
struct zpdesc *newzpdesc, struct zpdesc *oldzpdesc)
{
@@ -1718,18 +1637,17 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage,
set_first_obj_offset(newzpdesc, first_obj_offset);
if (unlikely(ZsHugePage(zspage)))
newzpdesc->handle = oldzpdesc->handle;
- __zpdesc_set_movable(newzpdesc, &zsmalloc_mops);
+ __zpdesc_set_movable(newzpdesc);
}
static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
{
/*
- * Page is locked so zspage couldn't be destroyed. For detail, look at
- * lock_zspage in free_zspage.
+ * Page is locked so zspage can't be destroyed concurrently
+ * (see free_zspage()). But if the page was already destroyed
+ * (see reset_zpdesc()), refuse isolation here.
*/
- VM_BUG_ON_PAGE(PageIsolated(page), page);
-
- return true;
+ return page_zpdesc(page)->zspage;
}
static int zs_page_migrate(struct page *newpage, struct page *page,
@@ -1747,7 +1665,15 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
unsigned long old_obj, new_obj;
unsigned int obj_idx;
- VM_BUG_ON_PAGE(!zpdesc_is_isolated(zpdesc), zpdesc_page(zpdesc));
+ /*
+ * TODO: nothing prevents a zspage from getting destroyed while
+ * it is isolated for migration, as the page lock is temporarily
+ * dropped after zs_page_isolate() succeeds; we should rework that
+ * and defer destroying such pages until they are un-isolated (putback)
+ * instead.
+ */
+ if (!zpdesc->zspage)
+ return 0;
/* The page is locked, so this pointer must remain valid */
zspage = get_zspage(zpdesc);
@@ -1814,15 +1740,14 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
reset_zpdesc(zpdesc);
zpdesc_put(zpdesc);
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
static void zs_page_putback(struct page *page)
{
- VM_BUG_ON_PAGE(!PageIsolated(page), page);
}
-static const struct movable_operations zsmalloc_mops = {
+const struct movable_operations zsmalloc_mops = {
.isolate_page = zs_page_isolate,
.migrate_page = zs_page_migrate,
.putback_page = zs_page_putback,
@@ -1885,7 +1810,7 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage)
do {
WARN_ON(!zpdesc_trylock(zpdesc));
- __zpdesc_set_movable(zpdesc, &zsmalloc_mops);
+ __zpdesc_set_movable(zpdesc);
zpdesc_unlock(zpdesc);
} while ((zpdesc = get_next_zpdesc(zpdesc)) != NULL);
}
@@ -2248,8 +2173,12 @@ EXPORT_SYMBOL_GPL(zs_destroy_pool);
static int __init zs_init(void)
{
-#ifdef CONFIG_ZPOOL
- zpool_register_driver(&zs_zpool_driver);
+ int rc __maybe_unused;
+
+#ifdef CONFIG_COMPACTION
+ rc = set_movable_ops(&zsmalloc_mops, PGTY_zsmalloc);
+ if (rc)
+ return rc;
#endif
zs_stat_init();
return 0;
@@ -2257,8 +2186,8 @@ static int __init zs_init(void)
static void __exit zs_exit(void)
{
-#ifdef CONFIG_ZPOOL
- zpool_unregister_driver(&zs_zpool_driver);
+#ifdef CONFIG_COMPACTION
+ set_movable_ops(NULL, PGTY_zsmalloc);
#endif
zs_stat_exit();
}
diff --git a/mm/zswap.c b/mm/zswap.c
index 204fb59da33c..c1af782e54ec 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -25,7 +25,6 @@
#include <linux/scatterlist.h>
#include <linux/mempolicy.h>
#include <linux/mempool.h>
-#include <linux/zpool.h>
#include <crypto/acompress.h>
#include <linux/zswap.h>
#include <linux/mm_types.h>
@@ -35,6 +34,7 @@
#include <linux/pagemap.h>
#include <linux/workqueue.h>
#include <linux/list_lru.h>
+#include <linux/zsmalloc.h>
#include "swap.h"
#include "internal.h"
@@ -42,8 +42,10 @@
/*********************************
* statistics
**********************************/
-/* The number of compressed pages currently stored in zswap */
+/* The number of pages currently stored in zswap */
atomic_long_t zswap_stored_pages = ATOMIC_LONG_INIT(0);
+/* The number of incompressible pages currently stored in zswap */
+static atomic_long_t zswap_stored_incompressible_pages = ATOMIC_LONG_INIT(0);
/*
* The statistics below are not protected from concurrent access for
@@ -105,16 +107,6 @@ static const struct kernel_param_ops zswap_compressor_param_ops = {
module_param_cb(compressor, &zswap_compressor_param_ops,
&zswap_compressor, 0644);
-/* Compressed storage zpool to use */
-static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
-static int zswap_zpool_param_set(const char *, const struct kernel_param *);
-static const struct kernel_param_ops zswap_zpool_param_ops = {
- .set = zswap_zpool_param_set,
- .get = param_get_charp,
- .free = param_free_charp,
-};
-module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);
-
/* The maximum percentage of memory that the compressed pool can occupy */
static unsigned int zswap_max_pool_percent = 20;
module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);
@@ -159,7 +151,7 @@ struct crypto_acomp_ctx {
* needs to be verified that it's still valid in the tree.
*/
struct zswap_pool {
- struct zpool *zpool;
+ struct zs_pool *zs_pool;
struct crypto_acomp_ctx __percpu *acomp_ctx;
struct percpu_ref ref;
struct list_head list;
@@ -191,7 +183,7 @@ static struct shrinker *zswap_shrinker;
* logic if referenced is unset. See comments in the shrinker
* section for context.
* pool - the zswap_pool the entry's data is in
- * handle - zpool allocation handle that stores the compressed page data
+ * handle - zsmalloc allocation handle that stores the compressed page data
* objcg - the obj_cgroup that the compressed memory is charged to
* lru - handle to the pool's lru used to evict pages.
*/
@@ -212,7 +204,7 @@ static unsigned int nr_zswap_trees[MAX_SWAPFILES];
static LIST_HEAD(zswap_pools);
/* protects zswap_pools list modification */
static DEFINE_SPINLOCK(zswap_pools_lock);
-/* pool counter to provide unique names to zpool */
+/* pool counter to provide unique names to zsmalloc */
static atomic_t zswap_pools_count = ATOMIC_INIT(0);
enum zswap_init_type {
@@ -233,38 +225,31 @@ static bool zswap_has_pool;
* helpers and fwd declarations
**********************************/
+/* One swap address space for each 64M swap space */
+#define ZSWAP_ADDRESS_SPACE_SHIFT 14
+#define ZSWAP_ADDRESS_SPACE_PAGES (1 << ZSWAP_ADDRESS_SPACE_SHIFT)
static inline struct xarray *swap_zswap_tree(swp_entry_t swp)
{
return &zswap_trees[swp_type(swp)][swp_offset(swp)
- >> SWAP_ADDRESS_SPACE_SHIFT];
+ >> ZSWAP_ADDRESS_SPACE_SHIFT];
}
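
The "64M" in the comment above follows from the shift, assuming the usual 4 KiB page size; as a back-of-the-envelope check (not part of the patch):

	/*
	 * ZSWAP_ADDRESS_SPACE_PAGES = 1 << 14 = 16384 swap slots per tree.
	 * Assuming 4 KiB pages: 16384 * 4 KiB = 64 MiB of swap per xarray.
	 * zswap_swapon() below sizes the tree array accordingly:
	 *	nr = DIV_ROUND_UP(nr_pages, ZSWAP_ADDRESS_SPACE_PAGES)
	 */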
-#define zswap_pool_debug(msg, p) \
- pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
- zpool_get_type((p)->zpool))
+#define zswap_pool_debug(msg, p) \
+ pr_debug("%s pool %s\n", msg, (p)->tfm_name)
/*********************************
* pool functions
**********************************/
static void __zswap_pool_empty(struct percpu_ref *ref);
-static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
+static struct zswap_pool *zswap_pool_create(char *compressor)
{
struct zswap_pool *pool;
char name[38]; /* 'zswap' + 32 char (max) num + \0 */
- gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
int ret, cpu;
- if (!zswap_has_pool) {
- /* if either are unset, pool initialization failed, and we
- * need both params to be set correctly before trying to
- * create a pool.
- */
- if (!strcmp(type, ZSWAP_PARAM_UNSET))
- return NULL;
- if (!strcmp(compressor, ZSWAP_PARAM_UNSET))
- return NULL;
- }
+ if (!zswap_has_pool && !strcmp(compressor, ZSWAP_PARAM_UNSET))
+ return NULL;
pool = kzalloc(sizeof(*pool), GFP_KERNEL);
if (!pool)
@@ -272,12 +257,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
/* unique name for each pool specifically required by zsmalloc */
snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
- pool->zpool = zpool_create_pool(type, name, gfp);
- if (!pool->zpool) {
- pr_err("%s zpool not available\n", type);
+ pool->zs_pool = zs_create_pool(name);
+ if (!pool->zs_pool)
goto error;
- }
- pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
@@ -313,52 +295,29 @@ ref_fail:
error:
if (pool->acomp_ctx)
free_percpu(pool->acomp_ctx);
- if (pool->zpool)
- zpool_destroy_pool(pool->zpool);
+ if (pool->zs_pool)
+ zs_destroy_pool(pool->zs_pool);
kfree(pool);
return NULL;
}
static struct zswap_pool *__zswap_pool_create_fallback(void)
{
- bool has_comp, has_zpool;
-
- has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
- if (!has_comp && strcmp(zswap_compressor,
- CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) {
+ if (!crypto_has_acomp(zswap_compressor, 0, 0) &&
+ strcmp(zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) {
pr_err("compressor %s not available, using default %s\n",
zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT);
param_free_charp(&zswap_compressor);
zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
- has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
- }
- if (!has_comp) {
- pr_err("default compressor %s not available\n",
- zswap_compressor);
- param_free_charp(&zswap_compressor);
- zswap_compressor = ZSWAP_PARAM_UNSET;
- }
-
- has_zpool = zpool_has_pool(zswap_zpool_type);
- if (!has_zpool && strcmp(zswap_zpool_type,
- CONFIG_ZSWAP_ZPOOL_DEFAULT)) {
- pr_err("zpool %s not available, using default %s\n",
- zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT);
- param_free_charp(&zswap_zpool_type);
- zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
- has_zpool = zpool_has_pool(zswap_zpool_type);
- }
- if (!has_zpool) {
- pr_err("default zpool %s not available\n",
- zswap_zpool_type);
- param_free_charp(&zswap_zpool_type);
- zswap_zpool_type = ZSWAP_PARAM_UNSET;
}
- if (!has_comp || !has_zpool)
+ /* Default compressor should be available. Kconfig bug? */
+ if (WARN_ON_ONCE(!crypto_has_acomp(zswap_compressor, 0, 0))) {
+ zswap_compressor = ZSWAP_PARAM_UNSET;
return NULL;
+ }
- return zswap_pool_create(zswap_zpool_type, zswap_compressor);
+ return zswap_pool_create(zswap_compressor);
}
static void zswap_pool_destroy(struct zswap_pool *pool)
@@ -368,7 +327,7 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
free_percpu(pool->acomp_ctx);
- zpool_destroy_pool(pool->zpool);
+ zs_destroy_pool(pool->zs_pool);
kfree(pool);
}
@@ -460,7 +419,7 @@ static struct zswap_pool *zswap_pool_current_get(void)
}
/* type and compressor must be null-terminated */
-static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
+static struct zswap_pool *zswap_pool_find_get(char *compressor)
{
struct zswap_pool *pool;
@@ -469,8 +428,6 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
list_for_each_entry_rcu(pool, &zswap_pools, list) {
if (strcmp(pool->tfm_name, compressor))
continue;
- if (strcmp(zpool_get_type(pool->zpool), type))
- continue;
/* if we can't get it, it's about to be destroyed */
if (!zswap_pool_tryget(pool))
continue;
@@ -497,7 +454,7 @@ unsigned long zswap_total_pages(void)
rcu_read_lock();
list_for_each_entry_rcu(pool, &zswap_pools, list)
- total += zpool_get_total_pages(pool->zpool);
+ total += zs_get_total_pages(pool->zs_pool);
rcu_read_unlock();
return total;
@@ -522,33 +479,22 @@ static bool zswap_check_limits(void)
* param callbacks
**********************************/
-static bool zswap_pool_changed(const char *s, const struct kernel_param *kp)
-{
- /* no change required */
- if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
- return false;
- return true;
-}
-
-/* val must be a null-terminated string */
-static int __zswap_param_set(const char *val, const struct kernel_param *kp,
- char *type, char *compressor)
+static int zswap_compressor_param_set(const char *val, const struct kernel_param *kp)
{
struct zswap_pool *pool, *put_pool = NULL;
char *s = strstrip((char *)val);
+ bool create_pool = false;
int ret = 0;
- bool new_pool = false;
mutex_lock(&zswap_init_lock);
switch (zswap_init_state) {
case ZSWAP_UNINIT:
- /* if this is load-time (pre-init) param setting,
- * don't create a pool; that's done during init.
- */
+ /* Handled in zswap_setup() */
ret = param_set_charp(s, kp);
break;
case ZSWAP_INIT_SUCCEED:
- new_pool = zswap_pool_changed(s, kp);
+ if (!zswap_has_pool || strcmp(s, *(char **)kp->arg))
+ create_pool = true;
break;
case ZSWAP_INIT_FAILED:
pr_err("can't set param, initialization failed\n");
@@ -556,30 +502,17 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
}
mutex_unlock(&zswap_init_lock);
- /* no need to create a new pool, return directly */
- if (!new_pool)
+ if (!create_pool)
return ret;
- if (!type) {
- if (!zpool_has_pool(s)) {
- pr_err("zpool %s not available\n", s);
- return -ENOENT;
- }
- type = s;
- } else if (!compressor) {
- if (!crypto_has_acomp(s, 0, 0)) {
- pr_err("compressor %s not available\n", s);
- return -ENOENT;
- }
- compressor = s;
- } else {
- WARN_ON(1);
- return -EINVAL;
+ if (!crypto_has_acomp(s, 0, 0)) {
+ pr_err("compressor %s not available\n", s);
+ return -ENOENT;
}
spin_lock_bh(&zswap_pools_lock);
- pool = zswap_pool_find_get(type, compressor);
+ pool = zswap_pool_find_get(s);
if (pool) {
zswap_pool_debug("using existing", pool);
WARN_ON(pool == zswap_pool_current());
@@ -589,7 +522,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
spin_unlock_bh(&zswap_pools_lock);
if (!pool)
- pool = zswap_pool_create(type, compressor);
+ pool = zswap_pool_create(s);
else {
/*
* Restore the initial ref dropped by percpu_ref_kill()
@@ -614,7 +547,8 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
list_add_rcu(&pool->list, &zswap_pools);
zswap_has_pool = true;
} else if (pool) {
- /* add the possibly pre-existing pool to the end of the pools
+ /*
+ * Add the possibly pre-existing pool to the end of the pools
* list; if it's new (and empty) then it'll be removed and
* destroyed by the put after we drop the lock
*/
@@ -624,18 +558,8 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
spin_unlock_bh(&zswap_pools_lock);
- if (!zswap_has_pool && !pool) {
- /* if initial pool creation failed, and this pool creation also
- * failed, maybe both compressor and zpool params were bad.
- * Allow changing this param, so pool creation will succeed
- * when the other param is changed. We already verified this
- * param is ok in the zpool_has_pool() or crypto_has_acomp()
- * checks above.
- */
- ret = param_set_charp(s, kp);
- }
-
- /* drop the ref from either the old current pool,
+ /*
+ * Drop the ref from either the old current pool,
* or the new pool we failed to add
*/
if (put_pool)
@@ -644,18 +568,6 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
return ret;
}
-static int zswap_compressor_param_set(const char *val,
- const struct kernel_param *kp)
-{
- return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
-}
-
-static int zswap_zpool_param_set(const char *val,
- const struct kernel_param *kp)
-{
- return __zswap_param_set(val, kp, NULL, zswap_compressor);
-}
-
static int zswap_enabled_param_set(const char *val,
const struct kernel_param *kp)
{
@@ -799,18 +711,20 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
}
/*
- * Carries out the common pattern of freeing and entry's zpool allocation,
+ * Carries out the common pattern of freeing an entry's zsmalloc allocation,
* freeing the entry itself, and decrementing the number of stored pages.
*/
static void zswap_entry_free(struct zswap_entry *entry)
{
zswap_lru_del(&zswap_list_lru, entry);
- zpool_free(entry->pool->zpool, entry->handle);
+ zs_free(entry->pool->zs_pool, entry->handle);
zswap_pool_put(entry->pool);
if (entry->objcg) {
obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
obj_cgroup_put(entry->objcg);
}
+ if (entry->length == PAGE_SIZE)
+ atomic_long_dec(&zswap_stored_incompressible_pages);
zswap_entry_cache_free(entry);
atomic_long_dec(&zswap_stored_pages);
}
@@ -827,7 +741,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
u8 *buffer = NULL;
int ret;
- buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
+ buffer = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu));
if (!buffer) {
ret = -ENOMEM;
goto fail;
@@ -945,21 +859,16 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
int comp_ret = 0, alloc_ret = 0;
unsigned int dlen = PAGE_SIZE;
unsigned long handle;
- struct zpool *zpool;
gfp_t gfp;
u8 *dst;
+ bool mapped = false;
acomp_ctx = acomp_ctx_get_cpu_lock(pool);
dst = acomp_ctx->buffer;
sg_init_table(&input, 1);
sg_set_page(&input, page, PAGE_SIZE, 0);
- /*
- * We need PAGE_SIZE * 2 here since there maybe over-compression case,
- * and hardware-accelerators may won't check the dst buffer size, so
- * giving the dst buffer with enough length to avoid buffer overflow.
- */
- sg_init_one(&output, dst, PAGE_SIZE * 2);
+ sg_init_one(&output, dst, PAGE_SIZE);
acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
/*
@@ -976,20 +885,41 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
*/
comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
dlen = acomp_ctx->req->dlen;
- if (comp_ret)
- goto unlock;
- zpool = pool->zpool;
+ /*
+ * If a page cannot be compressed into a size smaller than PAGE_SIZE,
+ * save the content as is, without compression, to keep the LRU order
+ * of writebacks. If writeback is disabled, reject the page since it
+ * only adds metadata overhead. swap_writeout() will put the page back
+ * on the active LRU list in that case.
+ */
+ if (comp_ret || !dlen || dlen >= PAGE_SIZE) {
+ dlen = PAGE_SIZE;
+ if (!mem_cgroup_zswap_writeback_enabled(
+ folio_memcg(page_folio(page)))) {
+ comp_ret = comp_ret ? comp_ret : -EINVAL;
+ goto unlock;
+ }
+ comp_ret = 0;
+ dlen = PAGE_SIZE;
+ dst = kmap_local_page(page);
+ mapped = true;
+ }
+
gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE;
- alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle);
- if (alloc_ret)
+ handle = zs_malloc(pool->zs_pool, dlen, gfp, page_to_nid(page));
+ if (IS_ERR_VALUE(handle)) {
+ alloc_ret = PTR_ERR((void *)handle);
goto unlock;
+ }
- zpool_obj_write(zpool, handle, dst, dlen);
+ zs_obj_write(pool->zs_pool, handle, dst, dlen);
entry->handle = handle;
entry->length = dlen;
unlock:
+ if (mapped)
+ kunmap_local(dst);
if (comp_ret == -ENOSPC || alloc_ret == -ENOSPC)
zswap_reject_compress_poor++;
else if (comp_ret)
@@ -1003,17 +933,23 @@ unlock:
static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
{
- struct zpool *zpool = entry->pool->zpool;
+ struct zswap_pool *pool = entry->pool;
struct scatterlist input, output;
struct crypto_acomp_ctx *acomp_ctx;
- int decomp_ret, dlen;
+ int decomp_ret = 0, dlen = PAGE_SIZE;
u8 *src, *obj;
- acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool);
- obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffer);
+ acomp_ctx = acomp_ctx_get_cpu_lock(pool);
+ obj = zs_obj_read_begin(pool->zs_pool, entry->handle, acomp_ctx->buffer);
+
+ /* zswap entries of length PAGE_SIZE are not compressed. */
+ if (entry->length == PAGE_SIZE) {
+ memcpy_to_folio(folio, 0, obj, entry->length);
+ goto read_done;
+ }
/*
- * zpool_obj_read_begin() might return a kmap address of highmem when
+ * zs_obj_read_begin() might return a kmap address of highmem when
* acomp_ctx->buffer is not used. However, sg_init_one() does not
* handle highmem addresses, so copy the object to acomp_ctx->buffer.
*/
@@ -1032,7 +968,8 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
decomp_ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
dlen = acomp_ctx->req->dlen;
- zpool_obj_read_end(zpool, entry->handle, obj);
+read_done:
+ zs_obj_read_end(pool->zs_pool, entry->handle, obj);
acomp_ctx_put_unlock(acomp_ctx);
if (!decomp_ret && dlen == PAGE_SIZE)
@@ -1070,9 +1007,6 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
struct mempolicy *mpol;
bool folio_was_allocated;
struct swap_info_struct *si;
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- };
int ret = 0;
/* try to allocate swap cache folio */
@@ -1134,11 +1068,11 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
folio_set_reclaim(folio);
/* start writeback */
- __swap_writepage(folio, &wbc);
+ __swap_writepage(folio, NULL);
out:
if (ret && ret != -EEXIST) {
- delete_from_swap_cache(folio);
+ swap_cache_del_folio(folio);
folio_unlock(folio);
}
folio_put(folio);
@@ -1527,6 +1461,8 @@ static bool zswap_store_page(struct page *page,
obj_cgroup_charge_zswap(objcg, entry->length);
}
atomic_long_inc(&zswap_stored_pages);
+ if (entry->length == PAGE_SIZE)
+ atomic_long_inc(&zswap_stored_incompressible_pages);
/*
* We finish initializing the entry while it's already in xarray.
@@ -1550,7 +1486,7 @@ static bool zswap_store_page(struct page *page,
return true;
store_failed:
- zpool_free(pool->zpool, entry->handle);
+ zs_free(pool->zs_pool, entry->handle);
compress_failed:
zswap_entry_cache_free(entry);
return false;
@@ -1741,7 +1677,7 @@ int zswap_swapon(int type, unsigned long nr_pages)
struct xarray *trees, *tree;
unsigned int nr, i;
- nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
+ nr = DIV_ROUND_UP(nr_pages, ZSWAP_ADDRESS_SPACE_PAGES);
trees = kvcalloc(nr, sizeof(*tree), GFP_KERNEL);
if (!trees) {
pr_err("alloc failed, zswap disabled for swap type %d\n", type);
@@ -1795,6 +1731,14 @@ static int debugfs_get_stored_pages(void *data, u64 *val)
}
DEFINE_DEBUGFS_ATTRIBUTE(stored_pages_fops, debugfs_get_stored_pages, NULL, "%llu\n");
+static int debugfs_get_stored_incompressible_pages(void *data, u64 *val)
+{
+ *val = atomic_long_read(&zswap_stored_incompressible_pages);
+ return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(stored_incompressible_pages_fops,
+ debugfs_get_stored_incompressible_pages, NULL, "%llu\n");
+
static int zswap_debugfs_init(void)
{
if (!debugfs_initialized())
@@ -1822,6 +1766,9 @@ static int zswap_debugfs_init(void)
zswap_debugfs_root, NULL, &total_size_fops);
debugfs_create_file("stored_pages", 0444,
zswap_debugfs_root, NULL, &stored_pages_fops);
+ debugfs_create_file("stored_incompressible_pages", 0444,
+ zswap_debugfs_root, NULL,
+ &stored_incompressible_pages_fops);
return 0;
}
@@ -1869,8 +1816,7 @@ static int zswap_setup(void)
pool = __zswap_pool_create_fallback();
if (pool) {
- pr_info("loaded using pool %s/%s\n", pool->tfm_name,
- zpool_get_type(pool->zpool));
+ pr_info("loaded using pool %s\n", pool->tfm_name);
list_add(&pool->list, &zswap_pools);
zswap_has_pool = true;
static_branch_enable(&zswap_ever_enabled);