diff options
Diffstat (limited to 'mm')
155 files changed, 19220 insertions, 13509 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 84000b016808..781be3240e21 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -129,7 +129,6 @@ choice prompt "Default allocator" depends on ZSWAP default ZSWAP_ZPOOL_DEFAULT_ZSMALLOC if MMU - default ZSWAP_ZPOOL_DEFAULT_ZBUD help Selects the default allocator for the compressed cache for swap pages. @@ -140,21 +139,6 @@ choice The selection made here can be overridden by using the kernel command line 'zswap.zpool=' option. -config ZSWAP_ZPOOL_DEFAULT_ZBUD - bool "zbud" - select ZBUD - help - Use the zbud allocator as the default allocator. - -config ZSWAP_ZPOOL_DEFAULT_Z3FOLD_DEPRECATED - bool "z3foldi (DEPRECATED)" - select Z3FOLD_DEPRECATED - help - Use the z3fold allocator as the default allocator. - - Deprecated and scheduled for removal in a few cycles, - see CONFIG_Z3FOLD_DEPRECATED. - config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC bool "zsmalloc" select ZSMALLOC @@ -165,40 +149,9 @@ endchoice config ZSWAP_ZPOOL_DEFAULT string depends on ZSWAP - default "zbud" if ZSWAP_ZPOOL_DEFAULT_ZBUD - default "z3fold" if ZSWAP_ZPOOL_DEFAULT_Z3FOLD_DEPRECATED default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC default "" -config ZBUD - tristate "2:1 compression allocator (zbud)" - depends on ZSWAP - help - A special purpose allocator for storing compressed pages. - It is designed to store up to two compressed pages per physical - page. While this design limits storage density, it has simple and - deterministic reclaim properties that make it preferable to a higher - density approach when reclaim will be used. - -config Z3FOLD_DEPRECATED - tristate "3:1 compression allocator (z3fold) (DEPRECATED)" - depends on ZSWAP - help - Deprecated and scheduled for removal in a few cycles. If you have - a good reason for using Z3FOLD over ZSMALLOC, please contact - linux-mm@kvack.org and the zswap maintainers. - - A special purpose allocator for storing compressed pages. - It is designed to store up to three compressed pages per physical - page. It is a ZBUD derivative so the simplicity and determinism are - still there. - -config Z3FOLD - tristate - default y if Z3FOLD_DEPRECATED=y - default m if Z3FOLD_DEPRECATED=m - depends on Z3FOLD_DEPRECATED - config ZSMALLOC tristate prompt "N:1 compression allocator (zsmalloc)" if (ZSWAP || ZRAM) @@ -242,9 +195,13 @@ menu "Slab allocator options" config SLUB def_bool y +config KVFREE_RCU_BATCHED + def_bool y + depends on !SLUB_TINY && !TINY_RCU + config SLUB_TINY bool "Configure for minimal memory footprint" - depends on EXPERT + depends on EXPERT && !COMPILE_TEST select SLAB_MERGE_DEFAULT help Configures the slab allocator in a way to achieve minimal memory @@ -489,6 +446,9 @@ config SPARSEMEM_VMEMMAP SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise pfn_to_page and page_to_pfn operations. This is the most efficient option when sufficient kernel resources are available. + +config SPARSEMEM_VMEMMAP_PREINIT + bool # # Select this config option from the architecture Kconfig, if it is preferred # to enable the feature of HugeTLB/dev_dax vmemmap optimization. @@ -499,6 +459,9 @@ config ARCH_WANT_OPTIMIZE_DAX_VMEMMAP config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP bool +config ARCH_WANT_HUGETLB_VMEMMAP_PREINIT + bool + config HAVE_MEMBLOCK_PHYS_MAP bool @@ -506,6 +469,10 @@ config HAVE_GUP_FAST depends on MMU bool +# Enable memblock support for scratch memory which is needed for kexec handover +config MEMBLOCK_KHO_SCRATCH + bool + # Don't discard allocated memory used to track "memory" and "reserved" memblocks # after early boot, so it can still be used to test for validity of memory. # Also, memblocks are updated with memory hot(un)plug. @@ -550,20 +517,63 @@ menuconfig MEMORY_HOTPLUG if MEMORY_HOTPLUG -config MEMORY_HOTPLUG_DEFAULT_ONLINE - bool "Online the newly added memory blocks by default" - depends on MEMORY_HOTPLUG +choice + prompt "Memory Hotplug Default Online Type" + default MHP_DEFAULT_ONLINE_TYPE_OFFLINE help + Default memory type for hotplugged memory. + This option sets the default policy setting for memory hotplug onlining policy (/sys/devices/system/memory/auto_online_blocks) which determines what happens to newly added memory regions. Policy setting can always be changed at runtime. + + The default is 'offline'. + + Select offline to defer onlining to drivers and user policy. + Select auto to let the kernel choose what zones to utilize. + Select online_kernel to generally allow kernel usage of this memory. + Select online_movable to generally disallow kernel usage of this memory. + + Example kernel usage would be page structs and page tables. + See Documentation/admin-guide/mm/memory-hotplug.rst for more information. - Say Y here if you want all hot-plugged memory blocks to appear in - 'online' state by default. - Say N here if you want the default policy to keep all hot-plugged - memory blocks in 'offline' state. +config MHP_DEFAULT_ONLINE_TYPE_OFFLINE + bool "offline" + help + Hotplugged memory will not be onlined by default. + Choose this for systems with drivers and user policy that + handle onlining of hotplug memory policy. + +config MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO + bool "auto" + help + Select this if you want the kernel to automatically online + hotplugged memory into the zone it thinks is reasonable. + This memory may be utilized for kernel data. + +config MHP_DEFAULT_ONLINE_TYPE_ONLINE_KERNEL + bool "kernel" + help + Select this if you want the kernel to automatically online + hotplugged memory into a zone capable of being used for kernel + data. This typically means ZONE_NORMAL. + +config MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE + bool "movable" + help + Select this if you want the kernel to automatically online + hotplug memory into ZONE_MOVABLE. This memory will generally + not be utilized for kernel data. + + This should only be used when the admin knows sufficient + ZONE_NORMAL memory is available to describe hotplug memory, + otherwise hotplug memory may fail to online. For example, + sufficient kernel-capable memory (ZONE_NORMAL) must be + available to allocate page structs to describe ZONE_MOVABLE. + +endchoice config MEMORY_HOTREMOVE bool "Allow for memory hot remove" @@ -813,11 +823,15 @@ config ARCH_WANT_GENERAL_HUGETLB config ARCH_WANTS_THP_SWAP def_bool n +config MM_ID + def_bool n + menuconfig TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT select COMPACTION select XARRAY_MULTI + select MM_ID help Transparent Hugepages allows the kernel to use huge pages and huge tlb transparently to the applications whenever possible. @@ -872,7 +886,7 @@ config THP_SWAP config READ_ONLY_THP_FOR_FS bool "Read-only THP for filesystems (EXPERIMENTAL)" - depends on TRANSPARENT_HUGEPAGE && SHMEM + depends on TRANSPARENT_HUGEPAGE help Allow khugepaged to put read-only file-backed pages in THP. @@ -881,8 +895,25 @@ config READ_ONLY_THP_FOR_FS support of file THPs will be developed in the next few release cycles. +config NO_PAGE_MAPCOUNT + bool "No per-page mapcount (EXPERIMENTAL)" + help + Do not maintain per-page mapcounts for pages part of larger + allocations, such as transparent huge pages. + + When this config option is enabled, some interfaces that relied on + this information will rely on less-precise per-allocation information + instead: for example, using the average per-page mapcount in such + a large allocation instead of the per-page mapcount. + + EXPERIMENTAL because the impact of some changes is still unclear. + endif # TRANSPARENT_HUGEPAGE +# simple helper to make the code a bit easier to read +config PAGE_MAPCOUNT + def_bool !NO_PAGE_MAPCOUNT + # # The architecture supports pgtable leaves that is larger than PAGE_SIZE # @@ -962,6 +993,40 @@ config CMA_AREAS If unsure, leave the default value "8" in UMA and "20" in NUMA. +# +# Select this config option from the architecture Kconfig, if available, to set +# the max page order for physically contiguous allocations. +# +config ARCH_FORCE_MAX_ORDER + int + +# +# When ARCH_FORCE_MAX_ORDER is not defined, +# the default page block order is MAX_PAGE_ORDER (10) as per +# include/linux/mmzone.h. +# +config PAGE_BLOCK_ORDER + int "Page Block Order" + range 1 10 if ARCH_FORCE_MAX_ORDER = 0 + default 10 if ARCH_FORCE_MAX_ORDER = 0 + range 1 ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0 + default ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0 + help + The page block order refers to the power of two number of pages that + are physically contiguous and can have a migrate type associated to + them. The maximum size of the page block order is limited by + ARCH_FORCE_MAX_ORDER. + + This config allows overriding the default page block order when the + page block order is required to be smaller than ARCH_FORCE_MAX_ORDER + or MAX_PAGE_ORDER. + + Reducing pageblock order can negatively impact THP generation + success rate. If your workloads uses THP heavily, please use this + option with caution. + + Don't change if unsure. + config MEM_SOFT_DIRTY bool "Track memory changes" depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS @@ -1290,6 +1355,7 @@ config NUMA_MEMBLKS config NUMA_EMU bool "NUMA emulation" depends on NUMA_MEMBLKS + depends on X86 || GENERIC_ARCH_NUMA help Enable NUMA emulation. A flat machine will be split into virtual nodes when booted with "numa=fake=N", where N is the @@ -1301,6 +1367,21 @@ config ARCH_HAS_USER_SHADOW_STACK The architecture has hardware support for userspace shadow call stacks (eg, x86 CET, arm64 GCS or RISC-V Zicfiss). +config ARCH_SUPPORTS_PT_RECLAIM + def_bool n + +config PT_RECLAIM + bool "reclaim empty user page table pages" + default y + depends on ARCH_SUPPORTS_PT_RECLAIM && MMU && SMP + select MMU_GATHER_RCU_TABLE_FREE + help + Try to reclaim empty user page table pages in paths other than munmap + and exit_mmap path. + + Note: now only empty user PTE page table pages will be reclaimed. + + source "mm/damon/Kconfig" endmenu diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 41a58536531d..32b65073d0cc 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -186,8 +186,9 @@ config ARCH_HAS_DEBUG_WX config DEBUG_WX bool "Warn on W+X mappings at boot" depends on ARCH_HAS_DEBUG_WX + depends on ARCH_HAS_PTDUMP depends on MMU - select PTDUMP_CORE + select PTDUMP help Generate a warning if any W+X mappings are found at boot. @@ -212,18 +213,18 @@ config DEBUG_WX If in doubt, say "Y". -config GENERIC_PTDUMP +config ARCH_HAS_PTDUMP bool -config PTDUMP_CORE +config PTDUMP bool config PTDUMP_DEBUGFS bool "Export kernel pagetable layout to userspace via debugfs" depends on DEBUG_KERNEL depends on DEBUG_FS - depends on GENERIC_PTDUMP - select PTDUMP_CORE + depends on ARCH_HAS_PTDUMP + select PTDUMP help Say Y here if you want to show the kernel pagetable layout in a debugfs file. This information is only useful for kernel developers diff --git a/mm/Makefile b/mm/Makefile index dba52bb0da8a..1a7a11d4933d 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -37,7 +37,7 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ msync.o page_vma_mapped.o pagewalk.o \ - pgtable-generic.o rmap.o vmalloc.o vma.o + pgtable-generic.o rmap.o vmalloc.o vma.o vma_exec.o ifdef CONFIG_CROSS_MEMORY_ATTACH @@ -55,7 +55,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ mm_init.o percpu.o slab_common.o \ compaction.o show_mem.o \ interval_tree.o list_lru.o workingset.o \ - debug.o gup.o mmap_lock.o $(mmu-y) + debug.o gup.o mmap_lock.o vma_init.o $(mmu-y) # Give 'page_alloc' its own module-parameter namespace page-alloc-y := page_alloc.o @@ -75,10 +75,13 @@ ifdef CONFIG_MMU obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o endif -obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o swap_slots.o +obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o obj-$(CONFIG_ZSWAP) += zswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o +ifdef CONFIG_CMA +obj-$(CONFIG_HUGETLBFS) += hugetlb_cma.o +endif obj-$(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) += hugetlb_vmemmap.o obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o @@ -113,9 +116,7 @@ obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o obj-$(CONFIG_PAGE_OWNER) += page_owner.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o obj-$(CONFIG_ZPOOL) += zpool.o -obj-$(CONFIG_ZBUD) += zbud.o obj-$(CONFIG_ZSMALLOC) += zsmalloc.o -obj-$(CONFIG_Z3FOLD) += z3fold.o obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_NUMA) += numa.o @@ -138,7 +139,7 @@ obj-$(CONFIG_ZONE_DEVICE) += memremap.o obj-$(CONFIG_HMM_MIRROR) += hmm.o obj-$(CONFIG_MEMFD_CREATE) += memfd.o obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o -obj-$(CONFIG_PTDUMP_CORE) += ptdump.o +obj-$(CONFIG_PTDUMP) += ptdump.o obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o obj-$(CONFIG_IO_MAPPING) += io-mapping.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o @@ -146,3 +147,4 @@ obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o obj-$(CONFIG_EXECMEM) += execmem.o obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o +obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index e61bbb1bd622..783904d8c5ef 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1151,7 +1151,7 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) void bdi_unregister(struct backing_dev_info *bdi) { - del_timer_sync(&bdi->laptop_mode_wb_timer); + timer_delete_sync(&bdi->laptop_mode_wb_timer); /* make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 6597ebea8ae2..d3e00731e262 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -24,6 +24,7 @@ static void balloon_page_enqueue_one(struct balloon_dev_info *b_dev_info, balloon_page_insert(b_dev_info, page); unlock_page(page); __count_vm_event(BALLOON_INFLATE); + inc_node_page_state(page, NR_BALLOON_PAGES); } /** @@ -103,6 +104,7 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, __count_vm_event(BALLOON_DEFLATE); list_add(&page->lru, pages); unlock_page(page); + dec_node_page_state(page, NR_BALLOON_PAGES); n_pages++; } spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c index 95f288169a38..b0e2a9fa641f 100644 --- a/mm/bootmem_info.c +++ b/mm/bootmem_info.c @@ -88,7 +88,9 @@ static void __init register_page_bootmem_info_section(unsigned long start_pfn) memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); - register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); + if (!preinited_vmemmap_section(ms)) + register_page_bootmem_memmap(section_nr, memmap, + PAGES_PER_SECTION); usage = ms->usage; page = virt_to_page(usage); @@ -18,6 +18,7 @@ #include <linux/memblock.h> #include <linux/err.h> +#include <linux/list.h> #include <linux/mm.h> #include <linux/sizes.h> #include <linux/slab.h> @@ -33,11 +34,17 @@ struct cma cma_areas[MAX_CMA_AREAS]; unsigned int cma_area_count; -static DEFINE_MUTEX(cma_mutex); + +static int __init __cma_declare_contiguous_nid(phys_addr_t *basep, + phys_addr_t size, phys_addr_t limit, + phys_addr_t alignment, unsigned int order_per_bit, + bool fixed, const char *name, struct cma **res_cma, + int nid); phys_addr_t cma_get_base(const struct cma *cma) { - return PFN_PHYS(cma->base_pfn); + WARN_ON_ONCE(cma->nranges != 1); + return PFN_PHYS(cma->ranges[0].base_pfn); } unsigned long cma_get_size(const struct cma *cma) @@ -63,9 +70,10 @@ static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, * The value returned is represented in order_per_bits. */ static unsigned long cma_bitmap_aligned_offset(const struct cma *cma, + const struct cma_memrange *cmr, unsigned int align_order) { - return (cma->base_pfn & ((1UL << align_order) - 1)) + return (cmr->base_pfn & ((1UL << align_order) - 1)) >> cma->order_per_bit; } @@ -75,65 +83,123 @@ static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma, return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit; } -static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, - unsigned long count) +static void cma_clear_bitmap(struct cma *cma, const struct cma_memrange *cmr, + unsigned long pfn, unsigned long count) { unsigned long bitmap_no, bitmap_count; unsigned long flags; - bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit; + bitmap_no = (pfn - cmr->base_pfn) >> cma->order_per_bit; bitmap_count = cma_bitmap_pages_to_bits(cma, count); spin_lock_irqsave(&cma->lock, flags); - bitmap_clear(cma->bitmap, bitmap_no, bitmap_count); + bitmap_clear(cmr->bitmap, bitmap_no, bitmap_count); + cma->available_count += count; spin_unlock_irqrestore(&cma->lock, flags); } -static void __init cma_activate_area(struct cma *cma) +/* + * Check if a CMA area contains no ranges that intersect with + * multiple zones. Store the result in the flags in case + * this gets called more than once. + */ +bool cma_validate_zones(struct cma *cma) { - unsigned long base_pfn = cma->base_pfn, pfn; - struct zone *zone; - - cma->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma), GFP_KERNEL); - if (!cma->bitmap) - goto out_error; + int r; + unsigned long base_pfn; + struct cma_memrange *cmr; + bool valid_bit_set; /* - * alloc_contig_range() requires the pfn range specified to be in the - * same zone. Simplify by forcing the entire CMA resv range to be in the - * same zone. + * If already validated, return result of previous check. + * Either the valid or invalid bit will be set if this + * check has already been done. If neither is set, the + * check has not been performed yet. */ - WARN_ON_ONCE(!pfn_valid(base_pfn)); - zone = page_zone(pfn_to_page(base_pfn)); - for (pfn = base_pfn + 1; pfn < base_pfn + cma->count; pfn++) { - WARN_ON_ONCE(!pfn_valid(pfn)); - if (page_zone(pfn_to_page(pfn)) != zone) - goto not_in_zone; + valid_bit_set = test_bit(CMA_ZONES_VALID, &cma->flags); + if (valid_bit_set || test_bit(CMA_ZONES_INVALID, &cma->flags)) + return valid_bit_set; + + for (r = 0; r < cma->nranges; r++) { + cmr = &cma->ranges[r]; + base_pfn = cmr->base_pfn; + + /* + * alloc_contig_range() requires the pfn range specified + * to be in the same zone. Simplify by forcing the entire + * CMA resv range to be in the same zone. + */ + WARN_ON_ONCE(!pfn_valid(base_pfn)); + if (pfn_range_intersects_zones(cma->nid, base_pfn, cmr->count)) { + set_bit(CMA_ZONES_INVALID, &cma->flags); + return false; + } } - for (pfn = base_pfn; pfn < base_pfn + cma->count; - pfn += pageblock_nr_pages) - init_cma_reserved_pageblock(pfn_to_page(pfn)); + set_bit(CMA_ZONES_VALID, &cma->flags); + + return true; +} + +static void __init cma_activate_area(struct cma *cma) +{ + unsigned long pfn, end_pfn, early_pfn[CMA_MAX_RANGES]; + int allocrange, r; + struct cma_memrange *cmr; + unsigned long bitmap_count, count; + + for (allocrange = 0; allocrange < cma->nranges; allocrange++) { + cmr = &cma->ranges[allocrange]; + early_pfn[allocrange] = cmr->early_pfn; + cmr->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma, cmr), + GFP_KERNEL); + if (!cmr->bitmap) + goto cleanup; + } + + if (!cma_validate_zones(cma)) + goto cleanup; + + for (r = 0; r < cma->nranges; r++) { + cmr = &cma->ranges[r]; + if (early_pfn[r] != cmr->base_pfn) { + count = early_pfn[r] - cmr->base_pfn; + bitmap_count = cma_bitmap_pages_to_bits(cma, count); + bitmap_set(cmr->bitmap, 0, bitmap_count); + } + + for (pfn = early_pfn[r]; pfn < cmr->base_pfn + cmr->count; + pfn += pageblock_nr_pages) + init_cma_reserved_pageblock(pfn_to_page(pfn)); + } spin_lock_init(&cma->lock); + mutex_init(&cma->alloc_mutex); + #ifdef CONFIG_CMA_DEBUGFS INIT_HLIST_HEAD(&cma->mem_head); spin_lock_init(&cma->mem_head_lock); #endif + set_bit(CMA_ACTIVATED, &cma->flags); return; -not_in_zone: - bitmap_free(cma->bitmap); -out_error: +cleanup: + for (r = 0; r < allocrange; r++) + bitmap_free(cma->ranges[r].bitmap); + /* Expose all pages to the buddy, they are useless for CMA. */ - if (!cma->reserve_pages_on_error) { - for (pfn = base_pfn; pfn < base_pfn + cma->count; pfn++) - free_reserved_page(pfn_to_page(pfn)); + if (!test_bit(CMA_RESERVE_PAGES_ON_ERROR, &cma->flags)) { + for (r = 0; r < allocrange; r++) { + cmr = &cma->ranges[r]; + end_pfn = cmr->base_pfn + cmr->count; + for (pfn = early_pfn[r]; pfn < end_pfn; pfn++) + free_reserved_page(pfn_to_page(pfn)); + } } totalcma_pages -= cma->count; - cma->count = 0; + cma->available_count = cma->count = 0; pr_err("CMA area %s could not be activated\n", cma->name); } @@ -150,7 +216,44 @@ core_initcall(cma_init_reserved_areas); void __init cma_reserve_pages_on_error(struct cma *cma) { - cma->reserve_pages_on_error = true; + set_bit(CMA_RESERVE_PAGES_ON_ERROR, &cma->flags); +} + +static int __init cma_new_area(const char *name, phys_addr_t size, + unsigned int order_per_bit, + struct cma **res_cma) +{ + struct cma *cma; + + if (cma_area_count == ARRAY_SIZE(cma_areas)) { + pr_err("Not enough slots for CMA reserved regions!\n"); + return -ENOSPC; + } + + /* + * Each reserved area must be initialised later, when more kernel + * subsystems (like slab allocator) are available. + */ + cma = &cma_areas[cma_area_count]; + cma_area_count++; + + if (name) + snprintf(cma->name, CMA_MAX_NAME, "%s", name); + else + snprintf(cma->name, CMA_MAX_NAME, "cma%d\n", cma_area_count); + + cma->available_count = cma->count = size >> PAGE_SHIFT; + cma->order_per_bit = order_per_bit; + *res_cma = cma; + totalcma_pages += cma->count; + + return 0; +} + +static void __init cma_drop_area(struct cma *cma) +{ + totalcma_pages -= cma->count; + cma_area_count--; } /** @@ -171,13 +274,9 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, struct cma **res_cma) { struct cma *cma; + int ret; /* Sanity checks */ - if (cma_area_count == ARRAY_SIZE(cma_areas)) { - pr_err("Not enough slots for CMA reserved regions!\n"); - return -ENOSPC; - } - if (!size || !memblock_is_region_reserved(base, size)) return -EINVAL; @@ -194,25 +293,264 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, if (!IS_ALIGNED(base | size, CMA_MIN_ALIGNMENT_BYTES)) return -EINVAL; + ret = cma_new_area(name, size, order_per_bit, &cma); + if (ret != 0) + return ret; + + cma->ranges[0].base_pfn = PFN_DOWN(base); + cma->ranges[0].early_pfn = PFN_DOWN(base); + cma->ranges[0].count = cma->count; + cma->nranges = 1; + cma->nid = NUMA_NO_NODE; + + *res_cma = cma; + + return 0; +} + +/* + * Structure used while walking physical memory ranges and finding out + * which one(s) to use for a CMA area. + */ +struct cma_init_memrange { + phys_addr_t base; + phys_addr_t size; + struct list_head list; +}; + +/* + * Work array used during CMA initialization. + */ +static struct cma_init_memrange memranges[CMA_MAX_RANGES] __initdata; + +static bool __init revsizecmp(struct cma_init_memrange *mlp, + struct cma_init_memrange *mrp) +{ + return mlp->size > mrp->size; +} + +static bool __init basecmp(struct cma_init_memrange *mlp, + struct cma_init_memrange *mrp) +{ + return mlp->base < mrp->base; +} + +/* + * Helper function to create sorted lists. + */ +static void __init list_insert_sorted( + struct list_head *ranges, + struct cma_init_memrange *mrp, + bool (*cmp)(struct cma_init_memrange *lh, struct cma_init_memrange *rh)) +{ + struct list_head *mp; + struct cma_init_memrange *mlp; + + if (list_empty(ranges)) + list_add(&mrp->list, ranges); + else { + list_for_each(mp, ranges) { + mlp = list_entry(mp, struct cma_init_memrange, list); + if (cmp(mlp, mrp)) + break; + } + __list_add(&mrp->list, mlp->list.prev, &mlp->list); + } +} + +/* + * Create CMA areas with a total size of @total_size. A normal allocation + * for one area is tried first. If that fails, the biggest memblock + * ranges above 4G are selected, and allocated bottom up. + * + * The complexity here is not great, but this function will only be + * called during boot, and the lists operated on have fewer than + * CMA_MAX_RANGES elements (default value: 8). + */ +int __init cma_declare_contiguous_multi(phys_addr_t total_size, + phys_addr_t align, unsigned int order_per_bit, + const char *name, struct cma **res_cma, int nid) +{ + phys_addr_t start = 0, end; + phys_addr_t size, sizesum, sizeleft; + struct cma_init_memrange *mrp, *mlp, *failed; + struct cma_memrange *cmrp; + LIST_HEAD(ranges); + LIST_HEAD(final_ranges); + struct list_head *mp, *next; + int ret, nr = 1; + u64 i; + struct cma *cma; + /* - * Each reserved area must be initialised later, when more kernel - * subsystems (like slab allocator) are available. + * First, try it the normal way, producing just one range. */ - cma = &cma_areas[cma_area_count]; + ret = __cma_declare_contiguous_nid(&start, total_size, 0, align, + order_per_bit, false, name, res_cma, nid); + if (ret != -ENOMEM) + goto out; - if (name) - snprintf(cma->name, CMA_MAX_NAME, name); - else - snprintf(cma->name, CMA_MAX_NAME, "cma%d\n", cma_area_count); + /* + * Couldn't find one range that fits our needs, so try multiple + * ranges. + * + * No need to do the alignment checks here, the call to + * cma_declare_contiguous_nid above would have caught + * any issues. With the checks, we know that: + * + * - @align is a power of 2 + * - @align is >= pageblock alignment + * - @size is aligned to @align and to @order_per_bit + * + * So, as long as we create ranges that have a base + * aligned to @align, and a size that is aligned to + * both @align and @order_to_bit, things will work out. + */ + nr = 0; + sizesum = 0; + failed = NULL; - cma->base_pfn = PFN_DOWN(base); - cma->count = size >> PAGE_SHIFT; - cma->order_per_bit = order_per_bit; + ret = cma_new_area(name, total_size, order_per_bit, &cma); + if (ret != 0) + goto out; + + align = max_t(phys_addr_t, align, CMA_MIN_ALIGNMENT_BYTES); + /* + * Create a list of ranges above 4G, largest range first. + */ + for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &start, &end, NULL) { + if (upper_32_bits(start) == 0) + continue; + + start = ALIGN(start, align); + if (start >= end) + continue; + + end = ALIGN_DOWN(end, align); + if (end <= start) + continue; + + size = end - start; + size = ALIGN_DOWN(size, (PAGE_SIZE << order_per_bit)); + if (!size) + continue; + sizesum += size; + + pr_debug("consider %016llx - %016llx\n", (u64)start, (u64)end); + + /* + * If we don't yet have used the maximum number of + * areas, grab a new one. + * + * If we can't use anymore, see if this range is not + * smaller than the smallest one already recorded. If + * not, re-use the smallest element. + */ + if (nr < CMA_MAX_RANGES) + mrp = &memranges[nr++]; + else { + mrp = list_last_entry(&ranges, + struct cma_init_memrange, list); + if (size < mrp->size) + continue; + list_del(&mrp->list); + sizesum -= mrp->size; + pr_debug("deleted %016llx - %016llx from the list\n", + (u64)mrp->base, (u64)mrp->base + size); + } + mrp->base = start; + mrp->size = size; + + /* + * Now do a sorted insert. + */ + list_insert_sorted(&ranges, mrp, revsizecmp); + pr_debug("added %016llx - %016llx to the list\n", + (u64)mrp->base, (u64)mrp->base + size); + pr_debug("total size now %llu\n", (u64)sizesum); + } + + /* + * There is not enough room in the CMA_MAX_RANGES largest + * ranges, so bail out. + */ + if (sizesum < total_size) { + cma_drop_area(cma); + ret = -ENOMEM; + goto out; + } + + /* + * Found ranges that provide enough combined space. + * Now, sorted them by address, smallest first, because we + * want to mimic a bottom-up memblock allocation. + */ + sizesum = 0; + list_for_each_safe(mp, next, &ranges) { + mlp = list_entry(mp, struct cma_init_memrange, list); + list_del(mp); + list_insert_sorted(&final_ranges, mlp, basecmp); + sizesum += mlp->size; + if (sizesum >= total_size) + break; + } + + /* + * Walk the final list, and add a CMA range for + * each range, possibly not using the last one fully. + */ + nr = 0; + sizeleft = total_size; + list_for_each(mp, &final_ranges) { + mlp = list_entry(mp, struct cma_init_memrange, list); + size = min(sizeleft, mlp->size); + if (memblock_reserve(mlp->base, size)) { + /* + * Unexpected error. Could go on to + * the next one, but just abort to + * be safe. + */ + failed = mlp; + break; + } + + pr_debug("created region %d: %016llx - %016llx\n", + nr, (u64)mlp->base, (u64)mlp->base + size); + cmrp = &cma->ranges[nr++]; + cmrp->base_pfn = PHYS_PFN(mlp->base); + cmrp->early_pfn = cmrp->base_pfn; + cmrp->count = size >> PAGE_SHIFT; + + sizeleft -= size; + if (sizeleft == 0) + break; + } + + if (failed) { + list_for_each(mp, &final_ranges) { + mlp = list_entry(mp, struct cma_init_memrange, list); + if (mlp == failed) + break; + memblock_phys_free(mlp->base, mlp->size); + } + cma_drop_area(cma); + ret = -ENOMEM; + goto out; + } + + cma->nranges = nr; + cma->nid = nid; *res_cma = cma; - cma_area_count++; - totalcma_pages += cma->count; - return 0; +out: + if (ret != 0) + pr_err("Failed to reserve %lu MiB\n", + (unsigned long)total_size / SZ_1M); + else + pr_info("Reserved %lu MiB in %d range%s\n", + (unsigned long)total_size / SZ_1M, nr, + nr > 1 ? "s" : ""); + return ret; } /** @@ -241,8 +579,28 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, bool fixed, const char *name, struct cma **res_cma, int nid) { + int ret; + + ret = __cma_declare_contiguous_nid(&base, size, limit, alignment, + order_per_bit, fixed, name, res_cma, nid); + if (ret != 0) + pr_err("Failed to reserve %ld MiB\n", + (unsigned long)size / SZ_1M); + else + pr_info("Reserved %ld MiB at %pa\n", + (unsigned long)size / SZ_1M, &base); + + return ret; +} + +static int __init __cma_declare_contiguous_nid(phys_addr_t *basep, + phys_addr_t size, phys_addr_t limit, + phys_addr_t alignment, unsigned int order_per_bit, + bool fixed, const char *name, struct cma **res_cma, + int nid) +{ phys_addr_t memblock_end = memblock_end_of_DRAM(); - phys_addr_t highmem_start; + phys_addr_t highmem_start, base = *basep; int ret; /* @@ -251,7 +609,10 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, * complain. Find the boundary by adding one to the last valid * address. */ - highmem_start = __pa(high_memory - 1) + 1; + if (IS_ENABLED(CONFIG_HIGHMEM)) + highmem_start = __pa(high_memory - 1) + 1; + else + highmem_start = memblock_end_of_DRAM(); pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n", __func__, &size, &base, &limit, &alignment); @@ -272,10 +633,9 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, /* Sanitise input arguments. */ alignment = max_t(phys_addr_t, alignment, CMA_MIN_ALIGNMENT_BYTES); if (fixed && base & (alignment - 1)) { - ret = -EINVAL; pr_err("Region at %pa must be aligned to %pa bytes\n", &base, &alignment); - goto err; + return -EINVAL; } base = ALIGN(base, alignment); size = ALIGN(size, alignment); @@ -293,10 +653,9 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, * low/high memory boundary. */ if (fixed && base < highmem_start && base + size > highmem_start) { - ret = -EINVAL; pr_err("Region at %pa defined on low/high memory boundary (%pa)\n", &base, &highmem_start); - goto err; + return -EINVAL; } /* @@ -308,18 +667,16 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, limit = memblock_end; if (base + size > limit) { - ret = -EINVAL; pr_err("Size (%pa) of region at %pa exceeds limit (%pa)\n", &size, &base, &limit); - goto err; + return -EINVAL; } /* Reserve memory */ if (fixed) { if (memblock_is_region_reserved(base, size) || memblock_reserve(base, size) < 0) { - ret = -EBUSY; - goto err; + return -EBUSY; } } else { phys_addr_t addr = 0; @@ -356,10 +713,8 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, if (!addr) { addr = memblock_alloc_range_nid(size, alignment, base, limit, nid, true); - if (!addr) { - ret = -ENOMEM; - goto err; - } + if (!addr) + return -ENOMEM; } /* @@ -371,87 +726,93 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, } ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma); - if (ret) - goto free_mem; + if (ret) { + memblock_phys_free(base, size); + return ret; + } - pr_info("Reserved %ld MiB at %pa on node %d\n", (unsigned long)size / SZ_1M, - &base, nid); - return 0; + (*res_cma)->nid = nid; + *basep = base; -free_mem: - memblock_phys_free(base, size); -err: - pr_err("Failed to reserve %ld MiB on node %d\n", (unsigned long)size / SZ_1M, - nid); - return ret; + return 0; } static void cma_debug_show_areas(struct cma *cma) { unsigned long next_zero_bit, next_set_bit, nr_zero; - unsigned long start = 0; - unsigned long nr_part, nr_total = 0; - unsigned long nbits = cma_bitmap_maxno(cma); + unsigned long start; + unsigned long nr_part; + unsigned long nbits; + int r; + struct cma_memrange *cmr; spin_lock_irq(&cma->lock); pr_info("number of available pages: "); - for (;;) { - next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start); - if (next_zero_bit >= nbits) - break; - next_set_bit = find_next_bit(cma->bitmap, nbits, next_zero_bit); - nr_zero = next_set_bit - next_zero_bit; - nr_part = nr_zero << cma->order_per_bit; - pr_cont("%s%lu@%lu", nr_total ? "+" : "", nr_part, - next_zero_bit); - nr_total += nr_part; - start = next_zero_bit + nr_zero; + for (r = 0; r < cma->nranges; r++) { + cmr = &cma->ranges[r]; + + start = 0; + nbits = cma_bitmap_maxno(cma, cmr); + + pr_info("range %d: ", r); + for (;;) { + next_zero_bit = find_next_zero_bit(cmr->bitmap, + nbits, start); + if (next_zero_bit >= nbits) + break; + next_set_bit = find_next_bit(cmr->bitmap, nbits, + next_zero_bit); + nr_zero = next_set_bit - next_zero_bit; + nr_part = nr_zero << cma->order_per_bit; + pr_cont("%s%lu@%lu", start ? "+" : "", nr_part, + next_zero_bit); + start = next_zero_bit + nr_zero; + } + pr_info("\n"); } - pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count); + pr_cont("=> %lu free of %lu total pages\n", cma->available_count, + cma->count); spin_unlock_irq(&cma->lock); } -static struct page *__cma_alloc(struct cma *cma, unsigned long count, - unsigned int align, gfp_t gfp) +static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, + unsigned long count, unsigned int align, + struct page **pagep, gfp_t gfp) { unsigned long mask, offset; unsigned long pfn = -1; unsigned long start = 0; unsigned long bitmap_maxno, bitmap_no, bitmap_count; - unsigned long i; + int ret = -EBUSY; struct page *page = NULL; - int ret = -ENOMEM; - const char *name = cma ? cma->name : NULL; - - trace_cma_alloc_start(name, count, align); - - if (!cma || !cma->count || !cma->bitmap) - return page; - - pr_debug("%s(cma %p, name: %s, count %lu, align %d)\n", __func__, - (void *)cma, cma->name, count, align); - - if (!count) - return page; mask = cma_bitmap_aligned_mask(cma, align); - offset = cma_bitmap_aligned_offset(cma, align); - bitmap_maxno = cma_bitmap_maxno(cma); + offset = cma_bitmap_aligned_offset(cma, cmr, align); + bitmap_maxno = cma_bitmap_maxno(cma, cmr); bitmap_count = cma_bitmap_pages_to_bits(cma, count); if (bitmap_count > bitmap_maxno) - return page; + goto out; for (;;) { spin_lock_irq(&cma->lock); - bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap, + /* + * If the request is larger than the available number + * of pages, stop right away. + */ + if (count > cma->available_count) { + spin_unlock_irq(&cma->lock); + break; + } + bitmap_no = bitmap_find_next_zero_area_off(cmr->bitmap, bitmap_maxno, start, bitmap_count, mask, offset); if (bitmap_no >= bitmap_maxno) { spin_unlock_irq(&cma->lock); break; } - bitmap_set(cma->bitmap, bitmap_no, bitmap_count); + bitmap_set(cmr->bitmap, bitmap_no, bitmap_count); + cma->available_count -= count; /* * It's safe to drop the lock here. We've marked this region for * our exclusive use. If the migration fails we will take the @@ -459,16 +820,16 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count, */ spin_unlock_irq(&cma->lock); - pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); - mutex_lock(&cma_mutex); + pfn = cmr->base_pfn + (bitmap_no << cma->order_per_bit); + mutex_lock(&cma->alloc_mutex); ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, gfp); - mutex_unlock(&cma_mutex); + mutex_unlock(&cma->alloc_mutex); if (ret == 0) { page = pfn_to_page(pfn); break; } - cma_clear_bitmap(cma, pfn, count); + cma_clear_bitmap(cma, cmr, pfn, count); if (ret != -EBUSY) break; @@ -480,6 +841,38 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count, /* try again with a bit different memory target */ start = bitmap_no + mask + 1; } +out: + *pagep = page; + return ret; +} + +static struct page *__cma_alloc(struct cma *cma, unsigned long count, + unsigned int align, gfp_t gfp) +{ + struct page *page = NULL; + int ret = -ENOMEM, r; + unsigned long i; + const char *name = cma ? cma->name : NULL; + + trace_cma_alloc_start(name, count, align); + + if (!cma || !cma->count) + return page; + + pr_debug("%s(cma %p, name: %s, count %lu, align %d)\n", __func__, + (void *)cma, cma->name, count, align); + + if (!count) + return page; + + for (r = 0; r < cma->nranges; r++) { + page = NULL; + + ret = cma_range_alloc(cma, &cma->ranges[r], count, align, + &page, gfp); + if (ret != -EBUSY || page) + break; + } /* * CMA can allocate multiple page blocks, which results in different @@ -498,7 +891,8 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count, } pr_debug("%s(): returned %p\n", __func__, page); - trace_cma_alloc_finish(name, pfn, page, count, align, ret); + trace_cma_alloc_finish(name, page ? page_to_pfn(page) : 0, + page, count, align, ret); if (page) { count_vm_event(CMA_ALLOC_SUCCESS); cma_sysfs_account_success_pages(cma, count); @@ -541,20 +935,31 @@ struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp) bool cma_pages_valid(struct cma *cma, const struct page *pages, unsigned long count) { - unsigned long pfn; + unsigned long pfn, end; + int r; + struct cma_memrange *cmr; + bool ret; - if (!cma || !pages) + if (!cma || !pages || count > cma->count) return false; pfn = page_to_pfn(pages); + ret = false; - if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) { - pr_debug("%s(page %p, count %lu)\n", __func__, - (void *)pages, count); - return false; + for (r = 0; r < cma->nranges; r++) { + cmr = &cma->ranges[r]; + end = cmr->base_pfn + cmr->count; + if (pfn >= cmr->base_pfn && pfn < end) { + ret = pfn + count <= end; + break; + } } - return true; + if (!ret) + pr_debug("%s(page %p, count %lu)\n", + __func__, (void *)pages, count); + + return ret; } /** @@ -570,19 +975,32 @@ bool cma_pages_valid(struct cma *cma, const struct page *pages, bool cma_release(struct cma *cma, const struct page *pages, unsigned long count) { - unsigned long pfn; + struct cma_memrange *cmr; + unsigned long pfn, end_pfn; + int r; + + pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count); if (!cma_pages_valid(cma, pages, count)) return false; - pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count); - pfn = page_to_pfn(pages); + end_pfn = pfn + count; + + for (r = 0; r < cma->nranges; r++) { + cmr = &cma->ranges[r]; + if (pfn >= cmr->base_pfn && + pfn < (cmr->base_pfn + cmr->count)) { + VM_BUG_ON(end_pfn > cmr->base_pfn + cmr->count); + break; + } + } - VM_BUG_ON(pfn + count > cma->base_pfn + cma->count); + if (r == cma->nranges) + return false; free_contig_range(pfn, count); - cma_clear_bitmap(cma, pfn, count); + cma_clear_bitmap(cma, cmr, pfn, count); cma_sysfs_account_release_pages(cma, count); trace_cma_release(cma->name, pfn, pages, count); @@ -610,3 +1028,86 @@ int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data) return 0; } + +bool cma_intersects(struct cma *cma, unsigned long start, unsigned long end) +{ + int r; + struct cma_memrange *cmr; + unsigned long rstart, rend; + + for (r = 0; r < cma->nranges; r++) { + cmr = &cma->ranges[r]; + + rstart = PFN_PHYS(cmr->base_pfn); + rend = PFN_PHYS(cmr->base_pfn + cmr->count); + if (end < rstart) + continue; + if (start >= rend) + continue; + return true; + } + + return false; +} + +/* + * Very basic function to reserve memory from a CMA area that has not + * yet been activated. This is expected to be called early, when the + * system is single-threaded, so there is no locking. The alignment + * checking is restrictive - only pageblock-aligned areas + * (CMA_MIN_ALIGNMENT_BYTES) may be reserved through this function. + * This keeps things simple, and is enough for the current use case. + * + * The CMA bitmaps have not yet been allocated, so just start + * reserving from the bottom up, using a PFN to keep track + * of what has been reserved. Unreserving is not possible. + * + * The caller is responsible for initializing the page structures + * in the area properly, since this just points to memblock-allocated + * memory. The caller should subsequently use init_cma_pageblock to + * set the migrate type and CMA stats the pageblocks that were reserved. + * + * If the CMA area fails to activate later, memory obtained through + * this interface is not handed to the page allocator, this is + * the responsibility of the caller (e.g. like normal memblock-allocated + * memory). + */ +void __init *cma_reserve_early(struct cma *cma, unsigned long size) +{ + int r; + struct cma_memrange *cmr; + unsigned long available; + void *ret = NULL; + + if (!cma || !cma->count) + return NULL; + /* + * Can only be called early in init. + */ + if (test_bit(CMA_ACTIVATED, &cma->flags)) + return NULL; + + if (!IS_ALIGNED(size, CMA_MIN_ALIGNMENT_BYTES)) + return NULL; + + if (!IS_ALIGNED(size, (PAGE_SIZE << cma->order_per_bit))) + return NULL; + + size >>= PAGE_SHIFT; + + if (size > cma->available_count) + return NULL; + + for (r = 0; r < cma->nranges; r++) { + cmr = &cma->ranges[r]; + available = cmr->count - (cmr->early_pfn - cmr->base_pfn); + if (size <= available) { + ret = phys_to_virt(PFN_PHYS(cmr->early_pfn)); + cmr->early_pfn += size; + cma->available_count -= size; + return ret; + } + } + + return ret; +} @@ -10,18 +10,45 @@ struct cma_kobject { struct cma *cma; }; +/* + * Multi-range support. This can be useful if the size of the allocation + * is not expected to be larger than the alignment (like with hugetlb_cma), + * and the total amount of memory requested, while smaller than the total + * amount of memory available, is large enough that it doesn't fit in a + * single physical memory range because of memory holes. + * + * Fields: + * @base_pfn: physical address of range + * @early_pfn: first PFN not reserved through cma_reserve_early + * @count: size of range + * @bitmap: bitmap of allocated (1 << order_per_bit)-sized chunks. + */ +struct cma_memrange { + unsigned long base_pfn; + unsigned long count; + union { + unsigned long early_pfn; + unsigned long *bitmap; + }; +#ifdef CONFIG_CMA_DEBUGFS + struct debugfs_u32_array dfs_bitmap; +#endif +}; +#define CMA_MAX_RANGES 8 + struct cma { - unsigned long base_pfn; unsigned long count; - unsigned long *bitmap; + unsigned long available_count; unsigned int order_per_bit; /* Order of pages represented by one bit */ spinlock_t lock; + struct mutex alloc_mutex; #ifdef CONFIG_CMA_DEBUGFS struct hlist_head mem_head; spinlock_t mem_head_lock; - struct debugfs_u32_array dfs_bitmap; #endif char name[CMA_MAX_NAME]; + int nranges; + struct cma_memrange ranges[CMA_MAX_RANGES]; #ifdef CONFIG_CMA_SYSFS /* the number of CMA page successful allocations */ atomic64_t nr_pages_succeeded; @@ -32,15 +59,25 @@ struct cma { /* kobject requires dynamic object */ struct cma_kobject *cma_kobj; #endif - bool reserve_pages_on_error; + unsigned long flags; + /* NUMA node (NUMA_NO_NODE if unspecified) */ + int nid; +}; + +enum cma_flags { + CMA_RESERVE_PAGES_ON_ERROR, + CMA_ZONES_VALID, + CMA_ZONES_INVALID, + CMA_ACTIVATED, }; extern struct cma cma_areas[MAX_CMA_AREAS]; -extern unsigned cma_area_count; +extern unsigned int cma_area_count; -static inline unsigned long cma_bitmap_maxno(struct cma *cma) +static inline unsigned long cma_bitmap_maxno(struct cma *cma, + struct cma_memrange *cmr) { - return cma->count >> cma->order_per_bit; + return cmr->count >> cma->order_per_bit; } #ifdef CONFIG_CMA_SYSFS diff --git a/mm/cma_debug.c b/mm/cma_debug.c index 602fff89b15f..fdf899532ca0 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -34,13 +34,10 @@ DEFINE_DEBUGFS_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n"); static int cma_used_get(void *data, u64 *val) { struct cma *cma = data; - unsigned long used; spin_lock_irq(&cma->lock); - /* pages counter is smaller than sizeof(int) */ - used = bitmap_weight(cma->bitmap, (int)cma_bitmap_maxno(cma)); + *val = cma->count - cma->available_count; spin_unlock_irq(&cma->lock); - *val = (u64)used << cma->order_per_bit; return 0; } @@ -49,17 +46,26 @@ DEFINE_DEBUGFS_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n"); static int cma_maxchunk_get(void *data, u64 *val) { struct cma *cma = data; + struct cma_memrange *cmr; unsigned long maxchunk = 0; - unsigned long start, end = 0; - unsigned long bitmap_maxno = cma_bitmap_maxno(cma); + unsigned long start, end; + unsigned long bitmap_maxno; + int r; spin_lock_irq(&cma->lock); - for (;;) { - start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end); - if (start >= bitmap_maxno) - break; - end = find_next_bit(cma->bitmap, bitmap_maxno, start); - maxchunk = max(end - start, maxchunk); + for (r = 0; r < cma->nranges; r++) { + cmr = &cma->ranges[r]; + bitmap_maxno = cma_bitmap_maxno(cma, cmr); + end = 0; + for (;;) { + start = find_next_zero_bit(cmr->bitmap, + bitmap_maxno, end); + if (start >= bitmap_maxno) + break; + end = find_next_bit(cmr->bitmap, bitmap_maxno, + start); + maxchunk = max(end - start, maxchunk); + } } spin_unlock_irq(&cma->lock); *val = (u64)maxchunk << cma->order_per_bit; @@ -162,24 +168,41 @@ DEFINE_DEBUGFS_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry) { - struct dentry *tmp; + struct dentry *tmp, *dir, *rangedir; + int r; + char rdirname[12]; + struct cma_memrange *cmr; tmp = debugfs_create_dir(cma->name, root_dentry); debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops); debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops); - debugfs_create_file("base_pfn", 0444, tmp, - &cma->base_pfn, &cma_debugfs_fops); debugfs_create_file("count", 0444, tmp, &cma->count, &cma_debugfs_fops); debugfs_create_file("order_per_bit", 0444, tmp, &cma->order_per_bit, &cma_debugfs_fops); debugfs_create_file("used", 0444, tmp, cma, &cma_used_fops); debugfs_create_file("maxchunk", 0444, tmp, cma, &cma_maxchunk_fops); - cma->dfs_bitmap.array = (u32 *)cma->bitmap; - cma->dfs_bitmap.n_elements = DIV_ROUND_UP(cma_bitmap_maxno(cma), - BITS_PER_BYTE * sizeof(u32)); - debugfs_create_u32_array("bitmap", 0444, tmp, &cma->dfs_bitmap); + rangedir = debugfs_create_dir("ranges", tmp); + for (r = 0; r < cma->nranges; r++) { + cmr = &cma->ranges[r]; + snprintf(rdirname, sizeof(rdirname), "%d", r); + dir = debugfs_create_dir(rdirname, rangedir); + debugfs_create_file("base_pfn", 0444, dir, + &cmr->base_pfn, &cma_debugfs_fops); + cmr->dfs_bitmap.array = (u32 *)cmr->bitmap; + cmr->dfs_bitmap.n_elements = + DIV_ROUND_UP(cma_bitmap_maxno(cma, cmr), + BITS_PER_BYTE * sizeof(u32)); + debugfs_create_u32_array("bitmap", 0444, dir, + &cmr->dfs_bitmap); + } + + /* + * Backward compatible symlinks to range 0 for base_pfn and bitmap. + */ + debugfs_create_symlink("base_pfn", tmp, "ranges/0/base_pfn"); + debugfs_create_symlink("bitmap", tmp, "ranges/0/bitmap"); } static int __init cma_debugfs_init(void) diff --git a/mm/cma_sysfs.c b/mm/cma_sysfs.c index f50db3973171..97acd3e5a6a5 100644 --- a/mm/cma_sysfs.c +++ b/mm/cma_sysfs.c @@ -62,6 +62,24 @@ static ssize_t release_pages_success_show(struct kobject *kobj, } CMA_ATTR_RO(release_pages_success); +static ssize_t total_pages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct cma *cma = cma_from_kobj(kobj); + + return sysfs_emit(buf, "%lu\n", cma->count); +} +CMA_ATTR_RO(total_pages); + +static ssize_t available_pages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct cma *cma = cma_from_kobj(kobj); + + return sysfs_emit(buf, "%lu\n", cma->available_count); +} +CMA_ATTR_RO(available_pages); + static void cma_kobj_release(struct kobject *kobj) { struct cma *cma = cma_from_kobj(kobj); @@ -75,6 +93,8 @@ static struct attribute *cma_attrs[] = { &alloc_pages_success_attr.attr, &alloc_pages_fail_attr.attr, &release_pages_success_attr.attr, + &total_pages_attr.attr, + &available_pages_attr.attr, NULL, }; ATTRIBUTE_GROUPS(cma); diff --git a/mm/compaction.c b/mm/compaction.c index a2b16b08cbbf..3925cb61dbb8 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -83,6 +83,7 @@ static inline bool is_via_compact_memory(int order) { return false; } static struct page *mark_allocated_noprof(struct page *page, unsigned int order, gfp_t gfp_flags) { post_alloc_hook(page, order, __GFP_MOVABLE); + set_page_refcounted(page); return page; } #define mark_allocated(...) alloc_hooks(mark_allocated_noprof(__VA_ARGS__)) @@ -630,7 +631,8 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (PageCompound(page)) { const unsigned int order = compound_order(page); - if (blockpfn + (1UL << order) <= end_pfn) { + if ((order <= MAX_PAGE_ORDER) && + (blockpfn + (1UL << order) <= end_pfn)) { blockpfn += (1UL << order) - 1; page += (1UL << order) - 1; nr_scanned += (1UL << order) - 1; @@ -979,13 +981,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } if (PageHuge(page)) { + const unsigned int order = compound_order(page); /* * skip hugetlbfs if we are not compacting for pages * bigger than its order. THPs and other compound pages * are handled below. */ if (!cc->alloc_contig) { - const unsigned int order = compound_order(page); if (order <= MAX_PAGE_ORDER) { low_pfn += (1UL << order) - 1; @@ -999,27 +1001,27 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, locked = NULL; } - ret = isolate_or_dissolve_huge_page(page, &cc->migratepages); + folio = page_folio(page); + ret = isolate_or_dissolve_huge_folio(folio, &cc->migratepages); /* - * Fail isolation in case isolate_or_dissolve_huge_page() + * Fail isolation in case isolate_or_dissolve_huge_folio() * reports an error. In case of -ENOMEM, abort right away. */ if (ret < 0) { /* Do not report -EBUSY down the chain */ if (ret == -EBUSY) ret = 0; - low_pfn += compound_nr(page) - 1; - nr_scanned += compound_nr(page) - 1; + low_pfn += (1UL << order) - 1; + nr_scanned += (1UL << order) - 1; goto isolate_fail; } - if (PageHuge(page)) { + if (folio_test_hugetlb(folio)) { /* * Hugepage was successfully isolated and placed * on the cc->migratepages list. */ - folio = page_folio(page); low_pfn += folio_nr_pages(folio) - 1; goto isolate_success_no_list; } @@ -1868,6 +1870,7 @@ again: dst = (struct folio *)freepage; post_alloc_hook(&dst->page, order, __GFP_MOVABLE); + set_page_refcounted(&dst->page); if (order) prep_compound_page(&dst->page, order); cc->nr_freepages -= 1 << order; @@ -2246,15 +2249,11 @@ static unsigned int fragmentation_score_node(pg_data_t *pgdat) static unsigned int fragmentation_score_wmark(bool low) { - unsigned int wmark_low; + unsigned int wmark_low, leeway; - /* - * Cap the low watermark to avoid excessive compaction - * activity in case a user sets the proactiveness tunable - * close to 100 (maximum). - */ - wmark_low = max(100U - sysctl_compaction_proactiveness, 5U); - return low ? wmark_low : min(wmark_low + 10, 100U); + wmark_low = 100U - sysctl_compaction_proactiveness; + leeway = min(10U, wmark_low / 2); + return low ? wmark_low : min(wmark_low + leeway, 100U); } static bool should_proactive_compact_node(pg_data_t *pgdat) @@ -2325,11 +2324,26 @@ static enum compact_result __compact_finished(struct compact_control *cc) if (!pageblock_aligned(cc->migrate_pfn)) return COMPACT_CONTINUE; + /* + * When defrag_mode is enabled, make kcompactd target + * watermarks in whole pageblocks. Because they can be stolen + * without polluting, no further fallback checks are needed. + */ + if (defrag_mode && !cc->direct_compaction) { + if (__zone_watermark_ok(cc->zone, cc->order, + high_wmark_pages(cc->zone), + cc->highest_zoneidx, cc->alloc_flags, + zone_page_state(cc->zone, + NR_FREE_PAGES_BLOCKS))) + return COMPACT_SUCCESS; + + return COMPACT_CONTINUE; + } + /* Direct compactor: Is a suitable page free? */ ret = COMPACT_NO_SUITABLE_PAGE; for (order = cc->order; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &cc->zone->free_area[order]; - bool can_steal; /* Job done if page is free of the right migratetype */ if (!free_area_empty(area, migratetype)) @@ -2345,8 +2359,7 @@ static enum compact_result __compact_finished(struct compact_control *cc) * Job done if allocation would steal freepages from * other migratetype buddy lists. */ - if (find_suitable_fallback(area, order, migratetype, - true, &can_steal) != -1) + if (find_suitable_fallback(area, order, migratetype, true) >= 0) /* * Movable pages are OK in any pageblock. If we are * stealing for a non-movable allocation, make sure @@ -2378,40 +2391,42 @@ static enum compact_result compact_finished(struct compact_control *cc) } static bool __compaction_suitable(struct zone *zone, int order, - int highest_zoneidx, - unsigned long wmark_target) + unsigned long watermark, int highest_zoneidx, + unsigned long free_pages) { - unsigned long watermark; /* * Watermarks for order-0 must be met for compaction to be able to * isolate free pages for migration targets. This means that the - * watermark and alloc_flags have to match, or be more pessimistic than - * the check in __isolate_free_page(). We don't use the direct - * compactor's alloc_flags, as they are not relevant for freepage - * isolation. We however do use the direct compactor's highest_zoneidx - * to skip over zones where lowmem reserves would prevent allocation - * even if compaction succeeds. - * For costly orders, we require low watermark instead of min for - * compaction to proceed to increase its chances. + * watermark have to match, or be more pessimistic than the check in + * __isolate_free_page(). + * + * For costly orders, we require a higher watermark for compaction to + * proceed to increase its chances. + * + * We use the direct compactor's highest_zoneidx to skip over zones + * where lowmem reserves would prevent allocation even if compaction + * succeeds. + * * ALLOC_CMA is used, as pages in CMA pageblocks are considered - * suitable migration targets + * suitable migration targets. */ - watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ? - low_wmark_pages(zone) : min_wmark_pages(zone); watermark += compact_gap(order); + if (order > PAGE_ALLOC_COSTLY_ORDER) + watermark += low_wmark_pages(zone) - min_wmark_pages(zone); return __zone_watermark_ok(zone, 0, watermark, highest_zoneidx, - ALLOC_CMA, wmark_target); + ALLOC_CMA, free_pages); } /* * compaction_suitable: Is this suitable to run compaction on this zone now? */ -bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx) +bool compaction_suitable(struct zone *zone, int order, unsigned long watermark, + int highest_zoneidx) { enum compact_result compact_result; bool suitable; - suitable = __compaction_suitable(zone, order, highest_zoneidx, + suitable = __compaction_suitable(zone, order, watermark, highest_zoneidx, zone_page_state(zone, NR_FREE_PAGES)); /* * fragmentation index determines if allocation failures are due to @@ -2449,6 +2464,7 @@ bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx) return suitable; } +/* Used by direct reclaimers */ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, int alloc_flags) { @@ -2471,8 +2487,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, */ available = zone_reclaimable_pages(zone) / order; available += zone_page_state_snapshot(zone, NR_FREE_PAGES); - if (__compaction_suitable(zone, order, ac->highest_zoneidx, - available)) + if (__compaction_suitable(zone, order, min_wmark_pages(zone), + ac->highest_zoneidx, available)) return true; } @@ -2488,16 +2504,40 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, */ static enum compact_result compaction_suit_allocation_order(struct zone *zone, unsigned int order, - int highest_zoneidx, unsigned int alloc_flags) + int highest_zoneidx, unsigned int alloc_flags, + bool async, bool kcompactd) { + unsigned long free_pages; unsigned long watermark; + if (kcompactd && defrag_mode) + free_pages = zone_page_state(zone, NR_FREE_PAGES_BLOCKS); + else + free_pages = zone_page_state(zone, NR_FREE_PAGES); + watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); - if (zone_watermark_ok(zone, order, watermark, highest_zoneidx, - alloc_flags)) + if (__zone_watermark_ok(zone, order, watermark, highest_zoneidx, + alloc_flags, free_pages)) return COMPACT_SUCCESS; - if (!compaction_suitable(zone, order, highest_zoneidx)) + /* + * For unmovable allocations (without ALLOC_CMA), check if there is enough + * free memory in the non-CMA pageblocks. Otherwise compaction could form + * the high-order page in CMA pageblocks, which would not help the + * allocation to succeed. However, limit the check to costly order async + * compaction (such as opportunistic THP attempts) because there is the + * possibility that compaction would migrate pages from non-CMA to CMA + * pageblock. + */ + if (order > PAGE_ALLOC_COSTLY_ORDER && async && + !(alloc_flags & ALLOC_CMA)) { + if (!__zone_watermark_ok(zone, 0, watermark + compact_gap(order), + highest_zoneidx, 0, + zone_page_state(zone, NR_FREE_PAGES))) + return COMPACT_SKIPPED; + } + + if (!compaction_suitable(zone, order, watermark, highest_zoneidx)) return COMPACT_SKIPPED; return COMPACT_CONTINUE; @@ -2532,7 +2572,9 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) if (!is_via_compact_memory(cc->order)) { ret = compaction_suit_allocation_order(cc->zone, cc->order, cc->highest_zoneidx, - cc->alloc_flags); + cc->alloc_flags, + cc->mode == MIGRATE_ASYNC, + !cc->direct_compaction); if (ret != COMPACT_CONTINUE) return ret; } @@ -3026,6 +3068,8 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat) struct zone *zone; enum zone_type highest_zoneidx = pgdat->kcompactd_highest_zoneidx; enum compact_result ret; + unsigned int alloc_flags = defrag_mode ? + ALLOC_WMARK_HIGH : ALLOC_WMARK_MIN; for (zoneid = 0; zoneid <= highest_zoneidx; zoneid++) { zone = &pgdat->node_zones[zoneid]; @@ -3035,7 +3079,8 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat) ret = compaction_suit_allocation_order(zone, pgdat->kcompactd_max_order, - highest_zoneidx, ALLOC_WMARK_MIN); + highest_zoneidx, alloc_flags, + false, true); if (ret == COMPACT_CONTINUE) return true; } @@ -3058,6 +3103,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) .mode = MIGRATE_SYNC_LIGHT, .ignore_skip_hint = false, .gfp_mask = GFP_KERNEL, + .alloc_flags = defrag_mode ? ALLOC_WMARK_HIGH : ALLOC_WMARK_MIN, }; enum compact_result ret; @@ -3076,7 +3122,8 @@ static void kcompactd_do_work(pg_data_t *pgdat) continue; ret = compaction_suit_allocation_order(zone, - cc.order, zoneid, ALLOC_WMARK_MIN); + cc.order, zoneid, cc.alloc_flags, + false, true); if (ret != COMPACT_CONTINUE) continue; @@ -3154,15 +3201,10 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx) static int kcompactd(void *p) { pg_data_t *pgdat = (pg_data_t *)p; - struct task_struct *tsk = current; long default_timeout = msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC); long timeout = default_timeout; - const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); - - if (!cpumask_empty(cpumask)) - set_cpus_allowed_ptr(tsk, cpumask); - + current->flags |= PF_KCOMPACTD; set_freezable(); pgdat->kcompactd_max_order = 0; @@ -3219,6 +3261,8 @@ static int kcompactd(void *p) pgdat->proactive_compact_trigger = false; } + current->flags &= ~PF_KCOMPACTD; + return 0; } @@ -3233,10 +3277,12 @@ void __meminit kcompactd_run(int nid) if (pgdat->kcompactd) return; - pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid); + pgdat->kcompactd = kthread_create_on_node(kcompactd, pgdat, nid, "kcompactd%d", nid); if (IS_ERR(pgdat->kcompactd)) { pr_err("Failed to start kcompactd on node %d\n", nid); pgdat->kcompactd = NULL; + } else { + wake_up_process(pgdat->kcompactd); } } @@ -3254,30 +3300,6 @@ void __meminit kcompactd_stop(int nid) } } -/* - * It's optimal to keep kcompactd on the same CPUs as their memory, but - * not required for correctness. So if the last cpu in a node goes - * away, we get changed to run anywhere: as the first one comes back, - * restore their cpu bindings. - */ -static int kcompactd_cpu_online(unsigned int cpu) -{ - int nid; - - for_each_node_state(nid, N_MEMORY) { - pg_data_t *pgdat = NODE_DATA(nid); - const struct cpumask *mask; - - mask = cpumask_of_node(pgdat->node_id); - - if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) - /* One of our CPUs online: restore mask */ - if (pgdat->kcompactd) - set_cpus_allowed_ptr(pgdat->kcompactd, mask); - } - return 0; -} - static int proc_dointvec_minmax_warn_RT_change(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -3297,7 +3319,7 @@ static int proc_dointvec_minmax_warn_RT_change(const struct ctl_table *table, return ret; } -static struct ctl_table vm_compaction[] = { +static const struct ctl_table vm_compaction[] = { { .procname = "compact_memory", .data = &sysctl_compact_memory, @@ -3337,15 +3359,6 @@ static struct ctl_table vm_compaction[] = { static int __init kcompactd_init(void) { int nid; - int ret; - - ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, - "mm/compaction:online", - kcompactd_cpu_online, NULL); - if (ret < 0) { - pr_err("kcompactd: failed to register hotplug callbacks.\n"); - return ret; - } for_each_node_state(nid, N_MEMORY) kcompactd_run(nid); diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index d0357f3e9372..c93d0c56b963 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -28,6 +28,7 @@ config DAMON_VADDR bool "Data access monitoring operations for virtual address spaces" depends on DAMON && MMU select PAGE_IDLE_FLAG + default DAMON help This builds the default data access monitoring operations for DAMON that work for virtual address spaces. @@ -36,6 +37,7 @@ config DAMON_PADDR bool "Data access monitoring operations for the physical address space" depends on DAMON && MMU select PAGE_IDLE_FLAG + default DAMON help This builds the default data access monitoring operations for DAMON that works for the physical address space. @@ -55,6 +57,7 @@ config DAMON_VADDR_KUNIT_TEST config DAMON_SYSFS bool "DAMON sysfs interface" depends on DAMON && SYSFS + default DAMON help This builds the sysfs interface for DAMON. The user space can use the interface for arbitrary data access monitoring. @@ -71,36 +74,6 @@ config DAMON_SYSFS_KUNIT_TEST If unsure, say N. -config DAMON_DBGFS_DEPRECATED - bool "DAMON debugfs interface (DEPRECATED!)" - depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS - help - This builds the debugfs interface for DAMON. The user space admins - can use the interface for arbitrary data access monitoring. - - If unsure, say N. - - This is deprecated, so users should move to the sysfs interface - (DAMON_SYSFS). If you depend on this and cannot move, please report - your usecase to damon@lists.linux.dev and linux-mm@kvack.org. - -config DAMON_DBGFS - bool - default y - depends on DAMON_DBGFS_DEPRECATED - -config DAMON_DBGFS_KUNIT_TEST - bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS - depends on DAMON_DBGFS && KUNIT=y - default KUNIT_ALL_TESTS - help - This builds the DAMON debugfs interface Kunit test suite. - - For more information on KUnit and unit tests in general, please refer - to the KUnit documentation. - - If unsure, say N. - config DAMON_RECLAIM bool "Build DAMON-based reclaim (DAMON_RECLAIM)" depends on DAMON_PADDR diff --git a/mm/damon/Makefile b/mm/damon/Makefile index f7add3f4aa79..8b49012ba8c3 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -4,6 +4,5 @@ obj-y := core.o obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs-schemes.o sysfs.o -obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o obj-$(CONFIG_DAMON_RECLAIM) += modules-common.o reclaim.o obj-$(CONFIG_DAMON_LRU_SORT) += modules-common.o lru_sort.o diff --git a/mm/damon/core.c b/mm/damon/core.c index 8b8e2933dcd4..b217e0120e09 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -14,6 +14,7 @@ #include <linux/psi.h> #include <linux/slab.h> #include <linux/string.h> +#include <linux/string_choices.h> #define CREATE_TRACE_POINTS #include <trace/events/damon.h> @@ -75,14 +76,13 @@ int damon_register_ops(struct damon_operations *ops) if (ops->id >= NR_DAMON_OPS) return -EINVAL; + mutex_lock(&damon_ops_lock); /* Fail for already registered ops */ - if (__damon_is_registered_ops(ops->id)) { + if (__damon_is_registered_ops(ops->id)) err = -EINVAL; - goto out; - } - damon_registered_ops[ops->id] = *ops; -out: + else + damon_registered_ops[ops->id] = *ops; mutex_unlock(&damon_ops_lock); return err; } @@ -266,7 +266,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, } struct damos_filter *damos_new_filter(enum damos_filter_type type, - bool matching) + bool matching, bool allow) { struct damos_filter *filter; @@ -275,13 +275,36 @@ struct damos_filter *damos_new_filter(enum damos_filter_type type, return NULL; filter->type = type; filter->matching = matching; + filter->allow = allow; INIT_LIST_HEAD(&filter->list); return filter; } +/** + * damos_filter_for_ops() - Return if the filter is ops-hndled one. + * @type: type of the filter. + * + * Return: true if the filter of @type needs to be handled by ops layer, false + * otherwise. + */ +bool damos_filter_for_ops(enum damos_filter_type type) +{ + switch (type) { + case DAMOS_FILTER_TYPE_ADDR: + case DAMOS_FILTER_TYPE_TARGET: + return false; + default: + break; + } + return true; +} + void damos_add_filter(struct damos *s, struct damos_filter *f) { - list_add_tail(&f->list, &s->filters); + if (damos_filter_for_ops(f->type)) + list_add_tail(&f->list, &s->ops_filters); + else + list_add_tail(&f->list, &s->filters); } static void damos_del_filter(struct damos_filter *f) @@ -371,7 +394,9 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, * or damon_attrs are updated. */ scheme->next_apply_sis = 0; + scheme->walk_completed = false; INIT_LIST_HEAD(&scheme->filters); + INIT_LIST_HEAD(&scheme->ops_filters); scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); @@ -499,11 +524,13 @@ struct damon_ctx *damon_new_ctx(void) ctx->attrs.ops_update_interval = 60 * 1000 * 1000; ctx->passed_sample_intervals = 0; - /* These will be set from kdamond_init_intervals_sis() */ + /* These will be set from kdamond_init_ctx() */ ctx->next_aggregation_sis = 0; ctx->next_ops_update_sis = 0; mutex_init(&ctx->kdamond_lock); + mutex_init(&ctx->call_control_lock); + mutex_init(&ctx->walk_control_lock); ctx->attrs.min_nr_regions = 10; ctx->attrs.max_nr_regions = 1000; @@ -575,11 +602,25 @@ static unsigned int damon_nr_accesses_for_new_attrs(unsigned int nr_accesses, } static void damon_update_monitoring_result(struct damon_region *r, - struct damon_attrs *old_attrs, struct damon_attrs *new_attrs) -{ - r->nr_accesses = damon_nr_accesses_for_new_attrs(r->nr_accesses, - old_attrs, new_attrs); - r->nr_accesses_bp = r->nr_accesses * 10000; + struct damon_attrs *old_attrs, struct damon_attrs *new_attrs, + bool aggregating) +{ + if (!aggregating) { + r->nr_accesses = damon_nr_accesses_for_new_attrs( + r->nr_accesses, old_attrs, new_attrs); + r->nr_accesses_bp = r->nr_accesses * 10000; + } else { + /* + * if this is called in the middle of the aggregation, reset + * the aggregations we made so far for this aggregation + * interval. In other words, make the status like + * kdamond_reset_aggregated() is called. + */ + r->last_nr_accesses = damon_nr_accesses_for_new_attrs( + r->last_nr_accesses, old_attrs, new_attrs); + r->nr_accesses_bp = r->last_nr_accesses * 10000; + r->nr_accesses = 0; + } r->age = damon_age_for_new_attrs(r->age, old_attrs, new_attrs); } @@ -592,7 +633,7 @@ static void damon_update_monitoring_result(struct damon_region *r, * ->nr_accesses and ->age of given damon_ctx's regions for new damon_attrs. */ static void damon_update_monitoring_results(struct damon_ctx *ctx, - struct damon_attrs *new_attrs) + struct damon_attrs *new_attrs, bool aggregating) { struct damon_attrs *old_attrs = &ctx->attrs; struct damon_target *t; @@ -607,7 +648,26 @@ static void damon_update_monitoring_results(struct damon_ctx *ctx, damon_for_each_target(t, ctx) damon_for_each_region(r, t) damon_update_monitoring_result( - r, old_attrs, new_attrs); + r, old_attrs, new_attrs, aggregating); +} + +/* + * damon_valid_intervals_goal() - return if the intervals goal of @attrs is + * valid. + */ +static bool damon_valid_intervals_goal(struct damon_attrs *attrs) +{ + struct damon_intervals_goal *goal = &attrs->intervals_goal; + + /* tuning is disabled */ + if (!goal->aggrs) + return true; + if (goal->min_sample_us > goal->max_sample_us) + return false; + if (attrs->sample_interval < goal->min_sample_us || + goal->max_sample_us < attrs->sample_interval) + return false; + return true; } /** @@ -615,10 +675,10 @@ static void damon_update_monitoring_results(struct damon_ctx *ctx, * @ctx: monitoring context * @attrs: monitoring attributes * - * This function should be called while the kdamond is not running, or an - * access check results aggregation is not ongoing (e.g., from - * &struct damon_callback->after_aggregation or - * &struct damon_callback->after_wmarks_check callbacks). + * This function should be called while the kdamond is not running, an access + * check results aggregation is not ongoing (e.g., from &struct + * damon_callback->after_aggregation or &struct + * damon_callback->after_wmarks_check callbacks), or from damon_call(). * * Every time interval is in micro-seconds. * @@ -629,6 +689,11 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) unsigned long sample_interval = attrs->sample_interval ? attrs->sample_interval : 1; struct damos *s; + bool aggregating = ctx->passed_sample_intervals < + ctx->next_aggregation_sis; + + if (!damon_valid_intervals_goal(attrs)) + return -EINVAL; if (attrs->min_nr_regions < 3) return -EINVAL; @@ -637,12 +702,16 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) if (attrs->sample_interval > attrs->aggr_interval) return -EINVAL; + /* calls from core-external doesn't set this. */ + if (!attrs->aggr_samples) + attrs->aggr_samples = attrs->aggr_interval / sample_interval; + ctx->next_aggregation_sis = ctx->passed_sample_intervals + attrs->aggr_interval / sample_interval; ctx->next_ops_update_sis = ctx->passed_sample_intervals + attrs->ops_update_interval / sample_interval; - damon_update_monitoring_results(ctx, attrs); + damon_update_monitoring_results(ctx, attrs, aggregating); ctx->attrs = *attrs; damon_for_each_scheme(s, ctx) @@ -772,6 +841,9 @@ static void damos_commit_filter_arg( case DAMOS_FILTER_TYPE_TARGET: dst->target_idx = src->target_idx; break; + case DAMOS_FILTER_TYPE_HUGEPAGE_SIZE: + dst->sz_range = src->sz_range; + break; default: break; } @@ -785,7 +857,7 @@ static void damos_commit_filter( damos_commit_filter_arg(dst, src); } -static int damos_commit_filters(struct damos *dst, struct damos *src) +static int damos_commit_core_filters(struct damos *dst, struct damos *src) { struct damos_filter *dst_filter, *next, *src_filter, *new_filter; int i = 0, j = 0; @@ -803,7 +875,36 @@ static int damos_commit_filters(struct damos *dst, struct damos *src) continue; new_filter = damos_new_filter( - src_filter->type, src_filter->matching); + src_filter->type, src_filter->matching, + src_filter->allow); + if (!new_filter) + return -ENOMEM; + damos_commit_filter_arg(new_filter, src_filter); + damos_add_filter(dst, new_filter); + } + return 0; +} + +static int damos_commit_ops_filters(struct damos *dst, struct damos *src) +{ + struct damos_filter *dst_filter, *next, *src_filter, *new_filter; + int i = 0, j = 0; + + damos_for_each_ops_filter_safe(dst_filter, next, dst) { + src_filter = damos_nth_filter(i++, src); + if (src_filter) + damos_commit_filter(dst_filter, src_filter); + else + damos_destroy_filter(dst_filter); + } + + damos_for_each_ops_filter_safe(src_filter, next, src) { + if (j++ < i) + continue; + + new_filter = damos_new_filter( + src_filter->type, src_filter->matching, + src_filter->allow); if (!new_filter) return -ENOMEM; damos_commit_filter_arg(new_filter, src_filter); @@ -812,6 +913,46 @@ static int damos_commit_filters(struct damos *dst, struct damos *src) return 0; } +/** + * damos_filters_default_reject() - decide whether to reject memory that didn't + * match with any given filter. + * @filters: Given DAMOS filters of a group. + */ +static bool damos_filters_default_reject(struct list_head *filters) +{ + struct damos_filter *last_filter; + + if (list_empty(filters)) + return false; + last_filter = list_last_entry(filters, struct damos_filter, list); + return last_filter->allow; +} + +static void damos_set_filters_default_reject(struct damos *s) +{ + if (!list_empty(&s->ops_filters)) + s->core_filters_default_reject = false; + else + s->core_filters_default_reject = + damos_filters_default_reject(&s->filters); + s->ops_filters_default_reject = + damos_filters_default_reject(&s->ops_filters); +} + +static int damos_commit_filters(struct damos *dst, struct damos *src) +{ + int err; + + err = damos_commit_core_filters(dst, src); + if (err) + return err; + err = damos_commit_ops_filters(dst, src); + if (err) + return err; + damos_set_filters_default_reject(dst); + return 0; +} + static struct damos *damon_nth_scheme(int n, struct damon_ctx *ctx) { struct damos *s; @@ -868,6 +1009,11 @@ static int damon_commit_schemes(struct damon_ctx *dst, struct damon_ctx *src) NUMA_NO_NODE); if (!new_scheme) return -ENOMEM; + err = damos_commit(new_scheme, src_scheme); + if (err) { + damon_destroy_scheme(new_scheme); + return err; + } damon_add_scheme(dst, new_scheme); } return 0; @@ -947,9 +1093,17 @@ static int damon_commit_targets( if (err) return err; } else { + struct damos *s; + if (damon_target_has_pid(dst)) put_pid(dst_target->pid); damon_destroy_target(dst_target); + damon_for_each_scheme(s, dst) { + if (s->quota.charge_target_from == dst_target) { + s->quota.charge_target_from = NULL; + s->quota.charge_addr_from = 0; + } + } } } @@ -961,8 +1115,11 @@ static int damon_commit_targets( return -ENOMEM; err = damon_commit_target(new_target, false, src_target, damon_target_has_pid(src)); - if (err) + if (err) { + damon_destroy_target(new_target); return err; + } + damon_add_target(dst, new_target); } return 0; } @@ -1154,6 +1311,107 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs) return err; } +static bool damon_is_running(struct damon_ctx *ctx) +{ + bool running; + + mutex_lock(&ctx->kdamond_lock); + running = ctx->kdamond != NULL; + mutex_unlock(&ctx->kdamond_lock); + return running; +} + +/** + * damon_call() - Invoke a given function on DAMON worker thread (kdamond). + * @ctx: DAMON context to call the function for. + * @control: Control variable of the call request. + * + * Ask DAMON worker thread (kdamond) of @ctx to call a function with an + * argument data that respectively passed via &damon_call_control->fn and + * &damon_call_control->data of @control, and wait until the kdamond finishes + * handling of the request. + * + * The kdamond executes the function with the argument in the main loop, just + * after a sampling of the iteration is finished. The function can hence + * safely access the internal data of the &struct damon_ctx without additional + * synchronization. The return value of the function will be saved in + * &damon_call_control->return_code. + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_call(struct damon_ctx *ctx, struct damon_call_control *control) +{ + init_completion(&control->completion); + control->canceled = false; + + mutex_lock(&ctx->call_control_lock); + if (ctx->call_control) { + mutex_unlock(&ctx->call_control_lock); + return -EBUSY; + } + ctx->call_control = control; + mutex_unlock(&ctx->call_control_lock); + if (!damon_is_running(ctx)) + return -EINVAL; + wait_for_completion(&control->completion); + if (control->canceled) + return -ECANCELED; + return 0; +} + +/** + * damos_walk() - Invoke a given functions while DAMOS walk regions. + * @ctx: DAMON context to call the functions for. + * @control: Control variable of the walk request. + * + * Ask DAMON worker thread (kdamond) of @ctx to call a function for each region + * that the kdamond will apply DAMOS action to, and wait until the kdamond + * finishes handling of the request. + * + * The kdamond executes the given function in the main loop, for each region + * just after it applied any DAMOS actions of @ctx to it. The invocation is + * made only within one &damos->apply_interval_us since damos_walk() + * invocation, for each scheme. The given callback function can hence safely + * access the internal data of &struct damon_ctx and &struct damon_region that + * each of the scheme will apply the action for next interval, without + * additional synchronizations against the kdamond. If every scheme of @ctx + * passed at least one &damos->apply_interval_us, kdamond marks the request as + * completed so that damos_walk() can wakeup and return. + * + * Return: 0 on success, negative error code otherwise. + */ +int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control) +{ + init_completion(&control->completion); + control->canceled = false; + mutex_lock(&ctx->walk_control_lock); + if (ctx->walk_control) { + mutex_unlock(&ctx->walk_control_lock); + return -EBUSY; + } + ctx->walk_control = control; + mutex_unlock(&ctx->walk_control_lock); + if (!damon_is_running(ctx)) + return -EINVAL; + wait_for_completion(&control->completion); + if (control->canceled) + return -ECANCELED; + return 0; +} + +/* + * Warn and fix corrupted ->nr_accesses[_bp] for investigations and preventing + * the problem being propagated. + */ +static void damon_warn_fix_nr_accesses_corruption(struct damon_region *r) +{ + if (r->nr_accesses_bp == r->nr_accesses * 10000) + return; + WARN_ONCE(true, "invalid nr_accesses_bp at reset: %u %u\n", + r->nr_accesses_bp, r->nr_accesses); + r->nr_accesses_bp = r->nr_accesses * 10000; +} + /* * Reset the aggregated monitoring results ('nr_accesses' of each region). */ @@ -1167,6 +1425,7 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) damon_for_each_region(r, t) { trace_damon_aggregated(ti, r, damon_nr_regions(t)); + damon_warn_fix_nr_accesses_corruption(r); r->last_nr_accesses = r->nr_accesses; r->nr_accesses = 0; } @@ -1174,6 +1433,65 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) } } +static unsigned long damon_get_intervals_score(struct damon_ctx *c) +{ + struct damon_target *t; + struct damon_region *r; + unsigned long sz_region, max_access_events = 0, access_events = 0; + unsigned long target_access_events; + unsigned long goal_bp = c->attrs.intervals_goal.access_bp; + + damon_for_each_target(t, c) { + damon_for_each_region(r, t) { + sz_region = damon_sz_region(r); + max_access_events += sz_region * c->attrs.aggr_samples; + access_events += sz_region * r->nr_accesses; + } + } + target_access_events = max_access_events * goal_bp / 10000; + return access_events * 10000 / target_access_events; +} + +static unsigned long damon_feed_loop_next_input(unsigned long last_input, + unsigned long score); + +static unsigned long damon_get_intervals_adaptation_bp(struct damon_ctx *c) +{ + unsigned long score_bp, adaptation_bp; + + score_bp = damon_get_intervals_score(c); + adaptation_bp = damon_feed_loop_next_input(100000000, score_bp) / + 10000; + /* + * adaptaion_bp ranges from 1 to 20,000. Avoid too rapid reduction of + * the intervals by rescaling [1,10,000] to [5000, 10,000]. + */ + if (adaptation_bp <= 10000) + adaptation_bp = 5000 + adaptation_bp / 2; + return adaptation_bp; +} + +static void kdamond_tune_intervals(struct damon_ctx *c) +{ + unsigned long adaptation_bp; + struct damon_attrs new_attrs; + struct damon_intervals_goal *goal; + + adaptation_bp = damon_get_intervals_adaptation_bp(c); + if (adaptation_bp == 10000) + return; + + new_attrs = c->attrs; + goal = &c->attrs.intervals_goal; + new_attrs.sample_interval = min(goal->max_sample_us, + c->attrs.sample_interval * adaptation_bp / 10000); + new_attrs.sample_interval = max(goal->min_sample_us, + new_attrs.sample_interval); + new_attrs.aggr_interval = new_attrs.sample_interval * + c->attrs.aggr_samples; + damon_set_attrs(c, &new_attrs); +} + static void damon_split_region_at(struct damon_target *t, struct damon_region *r, unsigned long sz_r); @@ -1264,16 +1582,18 @@ static bool damos_skip_charged_region(struct damon_target *t, } static void damos_update_stat(struct damos *s, - unsigned long sz_tried, unsigned long sz_applied) + unsigned long sz_tried, unsigned long sz_applied, + unsigned long sz_ops_filter_passed) { s->stat.nr_tried++; s->stat.sz_tried += sz_tried; if (sz_applied) s->stat.nr_applied++; s->stat.sz_applied += sz_applied; + s->stat.sz_ops_filter_passed += sz_ops_filter_passed; } -static bool __damos_filter_out(struct damon_ctx *ctx, struct damon_target *t, +static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos_filter *filter) { bool matched = false; @@ -1326,11 +1646,102 @@ static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t, { struct damos_filter *filter; + s->core_filters_allowed = false; damos_for_each_filter(filter, s) { - if (__damos_filter_out(ctx, t, r, filter)) - return true; + if (damos_filter_match(ctx, t, r, filter)) { + if (filter->allow) + s->core_filters_allowed = true; + return !filter->allow; + } } - return false; + return s->core_filters_default_reject; +} + +/* + * damos_walk_call_walk() - Call &damos_walk_control->walk_fn. + * @ctx: The context of &damon_ctx->walk_control. + * @t: The monitoring target of @r that @s will be applied. + * @r: The region of @t that @s will be applied. + * @s: The scheme of @ctx that will be applied to @r. + * + * This function is called from kdamond whenever it asked the operation set to + * apply a DAMOS scheme action to a region. If a DAMOS walk request is + * installed by damos_walk() and not yet uninstalled, invoke it. + */ +static void damos_walk_call_walk(struct damon_ctx *ctx, struct damon_target *t, + struct damon_region *r, struct damos *s, + unsigned long sz_filter_passed) +{ + struct damos_walk_control *control; + + if (s->walk_completed) + return; + + control = ctx->walk_control; + if (!control) + return; + + control->walk_fn(control->data, ctx, t, r, s, sz_filter_passed); +} + +/* + * damos_walk_complete() - Complete DAMOS walk request if all walks are done. + * @ctx: The context of &damon_ctx->walk_control. + * @s: A scheme of @ctx that all walks are now done. + * + * This function is called when kdamond finished applying the action of a DAMOS + * scheme to all regions that eligible for the given &damos->apply_interval_us. + * If every scheme of @ctx including @s now finished walking for at least one + * &damos->apply_interval_us, this function makrs the handling of the given + * DAMOS walk request is done, so that damos_walk() can wake up and return. + */ +static void damos_walk_complete(struct damon_ctx *ctx, struct damos *s) +{ + struct damos *siter; + struct damos_walk_control *control; + + control = ctx->walk_control; + if (!control) + return; + + s->walk_completed = true; + /* if all schemes completed, signal completion to walker */ + damon_for_each_scheme(siter, ctx) { + if (!siter->walk_completed) + return; + } + damon_for_each_scheme(siter, ctx) + siter->walk_completed = false; + + complete(&control->completion); + ctx->walk_control = NULL; +} + +/* + * damos_walk_cancel() - Cancel the current DAMOS walk request. + * @ctx: The context of &damon_ctx->walk_control. + * + * This function is called when @ctx is deactivated by DAMOS watermarks, DAMOS + * walk is requested but there is no DAMOS scheme to walk for, or the kdamond + * is already out of the main loop and therefore gonna be terminated, and hence + * cannot continue the walks. This function therefore marks the walk request + * as canceled, so that damos_walk() can wake up and return. + */ +static void damos_walk_cancel(struct damon_ctx *ctx) +{ + struct damos_walk_control *control; + + mutex_lock(&ctx->walk_control_lock); + control = ctx->walk_control; + mutex_unlock(&ctx->walk_control_lock); + + if (!control) + return; + control->canceled = true; + complete(&control->completion); + mutex_lock(&ctx->walk_control_lock); + ctx->walk_control = NULL; + mutex_unlock(&ctx->walk_control_lock); } static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, @@ -1340,7 +1751,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, unsigned long sz = damon_sz_region(r); struct timespec64 begin, end; unsigned long sz_applied = 0; - int err = 0; + unsigned long sz_ops_filter_passed = 0; /* * We plan to support multiple context per kdamond, as DAMON sysfs * implies with 'nr_contexts' file. Nevertheless, only single context @@ -1380,13 +1791,11 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, if (damos_filter_out(c, t, r, s)) return; ktime_get_coarse_ts64(&begin); - if (c->callback.before_damos_apply) - err = c->callback.before_damos_apply(c, t, r, s); - if (!err) { - trace_damos_before_apply(cidx, sidx, tidx, r, - damon_nr_regions(t), do_trace); - sz_applied = c->ops.apply_scheme(c, t, r, s); - } + trace_damos_before_apply(cidx, sidx, tidx, r, + damon_nr_regions(t), do_trace); + sz_applied = c->ops.apply_scheme(c, t, r, s, + &sz_ops_filter_passed); + damos_walk_call_walk(c, t, r, s, sz_ops_filter_passed); ktime_get_coarse_ts64(&end); quota->total_charged_ns += timespec64_to_ns(&end) - timespec64_to_ns(&begin); @@ -1400,7 +1809,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, r->age = 0; update_stat: - damos_update_stat(s, sz, sz_applied); + damos_update_stat(s, sz, sz_applied, sz_ops_filter_passed); } static void damon_do_apply_schemes(struct damon_ctx *c, @@ -1502,6 +1911,29 @@ static inline u64 damos_get_some_mem_psi_total(void) #endif /* CONFIG_PSI */ +#ifdef CONFIG_NUMA +static __kernel_ulong_t damos_get_node_mem_bp( + struct damos_quota_goal *goal) +{ + struct sysinfo i; + __kernel_ulong_t numerator; + + si_meminfo_node(&i, goal->nid); + if (goal->metric == DAMOS_QUOTA_NODE_MEM_USED_BP) + numerator = i.totalram - i.freeram; + else /* DAMOS_QUOTA_NODE_MEM_FREE_BP */ + numerator = i.freeram; + return numerator * 10000 / i.totalram; +} +#else +static __kernel_ulong_t damos_get_node_mem_bp( + struct damos_quota_goal *goal) +{ + return 0; +} +#endif + + static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) { u64 now_psi_total; @@ -1515,6 +1947,10 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) goal->current_value = now_psi_total - goal->last_psi_total; goal->last_psi_total = now_psi_total; break; + case DAMOS_QUOTA_NODE_MEM_USED_BP: + case DAMOS_QUOTA_NODE_MEM_FREE_BP: + goal->current_value = damos_get_node_mem_bp(goal); + break; default: break; } @@ -1542,7 +1978,7 @@ static unsigned long damos_quota_score(struct damos_quota *quota) static void damos_set_effective_quota(struct damos_quota *quota) { unsigned long throughput; - unsigned long esz; + unsigned long esz = ULONG_MAX; if (!quota->ms && list_empty("a->goals)) { quota->esz = quota->sz; @@ -1564,10 +2000,7 @@ static void damos_set_effective_quota(struct damos_quota *quota) quota->total_charged_ns; else throughput = PAGE_SIZE * 1024; - if (!list_empty("a->goals)) - esz = min(throughput * quota->ms, esz); - else - esz = throughput * quota->ms; + esz = min(throughput * quota->ms, esz); } if (quota->sz && quota->sz < esz) @@ -1650,6 +2083,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c) if (!has_schemes_to_apply) return; + mutex_lock(&c->walk_control_lock); damon_for_each_target(t, c) { damon_for_each_region_safe(r, next_r, t) damon_do_apply_schemes(c, t, r); @@ -1658,10 +2092,13 @@ static void kdamond_apply_schemes(struct damon_ctx *c) damon_for_each_scheme(s, c) { if (c->passed_sample_intervals < s->next_apply_sis) continue; + damos_walk_complete(c, s); s->next_apply_sis = c->passed_sample_intervals + (s->apply_interval_us ? s->apply_interval_us : c->attrs.aggr_interval) / sample_interval; + s->last_applied = NULL; } + mutex_unlock(&c->walk_control_lock); } /* @@ -1886,9 +2323,8 @@ static unsigned long damos_wmark_wait_us(struct damos *scheme) if (metric > scheme->wmarks.high || scheme->wmarks.low > metric) { if (scheme->wmarks.activated) pr_debug("deactivate a scheme (%d) for %s wmark\n", - scheme->action, - metric > scheme->wmarks.high ? - "high" : "low"); + scheme->action, + str_high_low(metric > scheme->wmarks.high)); scheme->wmarks.activated = false; return scheme->wmarks.interval; } @@ -1912,6 +2348,39 @@ static void kdamond_usleep(unsigned long usecs) usleep_range_idle(usecs, usecs + 1); } +/* + * kdamond_call() - handle damon_call_control. + * @ctx: The &struct damon_ctx of the kdamond. + * @cancel: Whether to cancel the invocation of the function. + * + * If there is a &struct damon_call_control request that registered via + * &damon_call() on @ctx, do or cancel the invocation of the function depending + * on @cancel. @cancel is set when the kdamond is deactivated by DAMOS + * watermarks, or the kdamond is already out of the main loop and therefore + * will be terminated. + */ +static void kdamond_call(struct damon_ctx *ctx, bool cancel) +{ + struct damon_call_control *control; + int ret = 0; + + mutex_lock(&ctx->call_control_lock); + control = ctx->call_control; + mutex_unlock(&ctx->call_control_lock); + if (!control) + return; + if (cancel) { + control->canceled = true; + } else { + ret = control->fn(control->data); + control->return_code = ret; + } + complete(&control->completion); + mutex_lock(&ctx->call_control_lock); + ctx->call_control = NULL; + mutex_unlock(&ctx->call_control_lock); +} + /* Returns negative error code if it's not activated but should return */ static int kdamond_wait_activation(struct damon_ctx *ctx) { @@ -1936,11 +2405,13 @@ static int kdamond_wait_activation(struct damon_ctx *ctx) if (ctx->callback.after_wmarks_check && ctx->callback.after_wmarks_check(ctx)) break; + kdamond_call(ctx, true); + damos_walk_cancel(ctx); } return -EBUSY; } -static void kdamond_init_intervals_sis(struct damon_ctx *ctx) +static void kdamond_init_ctx(struct damon_ctx *ctx) { unsigned long sample_interval = ctx->attrs.sample_interval ? ctx->attrs.sample_interval : 1; @@ -1951,11 +2422,14 @@ static void kdamond_init_intervals_sis(struct damon_ctx *ctx) ctx->next_aggregation_sis = ctx->attrs.aggr_interval / sample_interval; ctx->next_ops_update_sis = ctx->attrs.ops_update_interval / sample_interval; + ctx->next_intervals_tune_sis = ctx->next_aggregation_sis * + ctx->attrs.intervals_goal.aggrs; damon_for_each_scheme(scheme, ctx) { apply_interval = scheme->apply_interval_us ? scheme->apply_interval_us : ctx->attrs.aggr_interval; scheme->next_apply_sis = apply_interval / sample_interval; + damos_set_filters_default_reject(scheme); } } @@ -1973,12 +2447,10 @@ static int kdamond_fn(void *data) pr_debug("kdamond (%d) starts\n", current->pid); complete(&ctx->kdamond_started); - kdamond_init_intervals_sis(ctx); + kdamond_init_ctx(ctx); if (ctx->ops.init) ctx->ops.init(ctx); - if (ctx->callback.before_start && ctx->callback.before_start(ctx)) - goto done; ctx->regions_score_histogram = kmalloc_array(DAMOS_MAX_SCORE + 1, sizeof(*ctx->regions_score_histogram), GFP_KERNEL); if (!ctx->regions_score_histogram) @@ -2003,9 +2475,6 @@ static int kdamond_fn(void *data) if (ctx->ops.prepare_access_checks) ctx->ops.prepare_access_checks(ctx); - if (ctx->callback.after_sampling && - ctx->callback.after_sampling(ctx)) - break; kdamond_usleep(sample_interval); ctx->passed_sample_intervals++; @@ -2023,22 +2492,52 @@ static int kdamond_fn(void *data) } /* - * do kdamond_apply_schemes() after kdamond_merge_regions() if - * possible, to reduce overhead + * do kdamond_call() and kdamond_apply_schemes() after + * kdamond_merge_regions() if possible, to reduce overhead */ + kdamond_call(ctx, false); if (!list_empty(&ctx->schemes)) kdamond_apply_schemes(ctx); + else + damos_walk_cancel(ctx); sample_interval = ctx->attrs.sample_interval ? ctx->attrs.sample_interval : 1; if (ctx->passed_sample_intervals >= next_aggregation_sis) { + if (ctx->attrs.intervals_goal.aggrs && + ctx->passed_sample_intervals >= + ctx->next_intervals_tune_sis) { + /* + * ctx->next_aggregation_sis might be updated + * from kdamond_call(). In the case, + * damon_set_attrs() which will be called from + * kdamond_tune_interval() may wrongly think + * this is in the middle of the current + * aggregation, and make aggregation + * information reset for all regions. Then, + * following kdamond_reset_aggregated() call + * will make the region information invalid, + * particularly for ->nr_accesses_bp. + * + * Reset ->next_aggregation_sis to avoid that. + * It will anyway correctly updated after this + * if caluse. + */ + ctx->next_aggregation_sis = + next_aggregation_sis; + ctx->next_intervals_tune_sis += + ctx->attrs.aggr_samples * + ctx->attrs.intervals_goal.aggrs; + kdamond_tune_intervals(ctx); + sample_interval = ctx->attrs.sample_interval ? + ctx->attrs.sample_interval : 1; + + } ctx->next_aggregation_sis = next_aggregation_sis + ctx->attrs.aggr_interval / sample_interval; kdamond_reset_aggregated(ctx); kdamond_split_regions(ctx); - if (ctx->ops.reset_aggregated) - ctx->ops.reset_aggregated(ctx); } if (ctx->passed_sample_intervals >= next_ops_update_sis) { @@ -2067,6 +2566,9 @@ done: ctx->kdamond = NULL; mutex_unlock(&ctx->kdamond_lock); + kdamond_call(ctx, true); + damos_walk_cancel(ctx); + mutex_lock(&damon_lock); nr_running_ctxs--; if (!nr_running_ctxs && running_exclusive_ctxs) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c deleted file mode 100644 index b4213bc47e44..000000000000 --- a/mm/damon/dbgfs.c +++ /dev/null @@ -1,1148 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * DAMON Debugfs Interface - * - * Author: SeongJae Park <sj@kernel.org> - */ - -#define pr_fmt(fmt) "damon-dbgfs: " fmt - -#include <linux/damon.h> -#include <linux/debugfs.h> -#include <linux/file.h> -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/page_idle.h> -#include <linux/slab.h> - -#define DAMON_DBGFS_DEPRECATION_NOTICE \ - "DAMON debugfs interface is deprecated, so users should move " \ - "to DAMON_SYSFS. If you cannot, please report your usecase to " \ - "damon@lists.linux.dev and linux-mm@kvack.org.\n" - -static struct damon_ctx **dbgfs_ctxs; -static int dbgfs_nr_ctxs; -static struct dentry **dbgfs_dirs; -static DEFINE_MUTEX(damon_dbgfs_lock); - -static void damon_dbgfs_warn_deprecation(void) -{ - pr_warn_once(DAMON_DBGFS_DEPRECATION_NOTICE); -} - -/* - * Returns non-empty string on success, negative error code otherwise. - */ -static char *user_input_str(const char __user *buf, size_t count, loff_t *ppos) -{ - char *kbuf; - ssize_t ret; - - /* We do not accept continuous write */ - if (*ppos) - return ERR_PTR(-EINVAL); - - kbuf = kmalloc(count + 1, GFP_KERNEL | __GFP_NOWARN); - if (!kbuf) - return ERR_PTR(-ENOMEM); - - ret = simple_write_to_buffer(kbuf, count + 1, ppos, buf, count); - if (ret != count) { - kfree(kbuf); - return ERR_PTR(-EIO); - } - kbuf[ret] = '\0'; - - return kbuf; -} - -static ssize_t dbgfs_attrs_read(struct file *file, - char __user *buf, size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - char kbuf[128]; - int ret; - - mutex_lock(&ctx->kdamond_lock); - ret = scnprintf(kbuf, ARRAY_SIZE(kbuf), "%lu %lu %lu %lu %lu\n", - ctx->attrs.sample_interval, ctx->attrs.aggr_interval, - ctx->attrs.ops_update_interval, - ctx->attrs.min_nr_regions, ctx->attrs.max_nr_regions); - mutex_unlock(&ctx->kdamond_lock); - - return simple_read_from_buffer(buf, count, ppos, kbuf, ret); -} - -static ssize_t dbgfs_attrs_write(struct file *file, - const char __user *buf, size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - struct damon_attrs attrs; - char *kbuf; - ssize_t ret; - - kbuf = user_input_str(buf, count, ppos); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - - if (sscanf(kbuf, "%lu %lu %lu %lu %lu", - &attrs.sample_interval, &attrs.aggr_interval, - &attrs.ops_update_interval, - &attrs.min_nr_regions, - &attrs.max_nr_regions) != 5) { - ret = -EINVAL; - goto out; - } - - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) { - ret = -EBUSY; - goto unlock_out; - } - - ret = damon_set_attrs(ctx, &attrs); - if (!ret) - ret = count; -unlock_out: - mutex_unlock(&ctx->kdamond_lock); -out: - kfree(kbuf); - return ret; -} - -/* - * Return corresponding dbgfs' scheme action value (int) for the given - * damos_action if the given damos_action value is valid and supported by - * dbgfs, negative error code otherwise. - */ -static int damos_action_to_dbgfs_scheme_action(enum damos_action action) -{ - switch (action) { - case DAMOS_WILLNEED: - return 0; - case DAMOS_COLD: - return 1; - case DAMOS_PAGEOUT: - return 2; - case DAMOS_HUGEPAGE: - return 3; - case DAMOS_NOHUGEPAGE: - return 4; - case DAMOS_STAT: - return 5; - default: - return -EINVAL; - } -} - -static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) -{ - struct damos *s; - int written = 0; - int rc; - - damon_for_each_scheme(s, c) { - rc = scnprintf(&buf[written], len - written, - "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", - s->pattern.min_sz_region, - s->pattern.max_sz_region, - s->pattern.min_nr_accesses, - s->pattern.max_nr_accesses, - s->pattern.min_age_region, - s->pattern.max_age_region, - damos_action_to_dbgfs_scheme_action(s->action), - s->quota.ms, s->quota.sz, - s->quota.reset_interval, - s->quota.weight_sz, - s->quota.weight_nr_accesses, - s->quota.weight_age, - s->wmarks.metric, s->wmarks.interval, - s->wmarks.high, s->wmarks.mid, s->wmarks.low, - s->stat.nr_tried, s->stat.sz_tried, - s->stat.nr_applied, s->stat.sz_applied, - s->stat.qt_exceeds); - if (!rc) - return -ENOMEM; - - written += rc; - } - return written; -} - -static ssize_t dbgfs_schemes_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - char *kbuf; - ssize_t len; - - kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN); - if (!kbuf) - return -ENOMEM; - - mutex_lock(&ctx->kdamond_lock); - len = sprint_schemes(ctx, kbuf, count); - mutex_unlock(&ctx->kdamond_lock); - if (len < 0) - goto out; - len = simple_read_from_buffer(buf, count, ppos, kbuf, len); - -out: - kfree(kbuf); - return len; -} - -static void free_schemes_arr(struct damos **schemes, ssize_t nr_schemes) -{ - ssize_t i; - - for (i = 0; i < nr_schemes; i++) - kfree(schemes[i]); - kfree(schemes); -} - -/* - * Return corresponding damos_action for the given dbgfs input for a scheme - * action if the input is valid, negative error code otherwise. - */ -static enum damos_action dbgfs_scheme_action_to_damos_action(int dbgfs_action) -{ - switch (dbgfs_action) { - case 0: - return DAMOS_WILLNEED; - case 1: - return DAMOS_COLD; - case 2: - return DAMOS_PAGEOUT; - case 3: - return DAMOS_HUGEPAGE; - case 4: - return DAMOS_NOHUGEPAGE; - case 5: - return DAMOS_STAT; - default: - return -EINVAL; - } -} - -/* - * Converts a string into an array of struct damos pointers - * - * Returns an array of struct damos pointers that converted if the conversion - * success, or NULL otherwise. - */ -static struct damos **str_to_schemes(const char *str, ssize_t len, - ssize_t *nr_schemes) -{ - struct damos *scheme, **schemes; - const int max_nr_schemes = 256; - int pos = 0, parsed, ret; - unsigned int action_input; - enum damos_action action; - - schemes = kmalloc_array(max_nr_schemes, sizeof(scheme), - GFP_KERNEL); - if (!schemes) - return NULL; - - *nr_schemes = 0; - while (pos < len && *nr_schemes < max_nr_schemes) { - struct damos_access_pattern pattern = {}; - struct damos_quota quota = {}; - struct damos_watermarks wmarks; - - ret = sscanf(&str[pos], - "%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n", - &pattern.min_sz_region, &pattern.max_sz_region, - &pattern.min_nr_accesses, - &pattern.max_nr_accesses, - &pattern.min_age_region, - &pattern.max_age_region, - &action_input, "a.ms, - "a.sz, "a.reset_interval, - "a.weight_sz, "a.weight_nr_accesses, - "a.weight_age, &wmarks.metric, - &wmarks.interval, &wmarks.high, &wmarks.mid, - &wmarks.low, &parsed); - if (ret != 18) - break; - action = dbgfs_scheme_action_to_damos_action(action_input); - if ((int)action < 0) - goto fail; - - if (pattern.min_sz_region > pattern.max_sz_region || - pattern.min_nr_accesses > pattern.max_nr_accesses || - pattern.min_age_region > pattern.max_age_region) - goto fail; - - if (wmarks.high < wmarks.mid || wmarks.high < wmarks.low || - wmarks.mid < wmarks.low) - goto fail; - - pos += parsed; - scheme = damon_new_scheme(&pattern, action, 0, "a, - &wmarks, NUMA_NO_NODE); - if (!scheme) - goto fail; - - schemes[*nr_schemes] = scheme; - *nr_schemes += 1; - } - return schemes; -fail: - free_schemes_arr(schemes, *nr_schemes); - return NULL; -} - -static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - char *kbuf; - struct damos **schemes; - ssize_t nr_schemes = 0, ret; - - kbuf = user_input_str(buf, count, ppos); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - - schemes = str_to_schemes(kbuf, count, &nr_schemes); - if (!schemes) { - ret = -EINVAL; - goto out; - } - - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) { - ret = -EBUSY; - goto unlock_out; - } - - damon_set_schemes(ctx, schemes, nr_schemes); - ret = count; - nr_schemes = 0; - -unlock_out: - mutex_unlock(&ctx->kdamond_lock); - free_schemes_arr(schemes, nr_schemes); -out: - kfree(kbuf); - return ret; -} - -static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len) -{ - struct damon_target *t; - int id; - int written = 0; - int rc; - - damon_for_each_target(t, ctx) { - if (damon_target_has_pid(ctx)) - /* Show pid numbers to debugfs users */ - id = pid_vnr(t->pid); - else - /* Show 42 for physical address space, just for fun */ - id = 42; - - rc = scnprintf(&buf[written], len - written, "%d ", id); - if (!rc) - return -ENOMEM; - written += rc; - } - if (written) - written -= 1; - written += scnprintf(&buf[written], len - written, "\n"); - return written; -} - -static ssize_t dbgfs_target_ids_read(struct file *file, - char __user *buf, size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - ssize_t len; - char ids_buf[320]; - - mutex_lock(&ctx->kdamond_lock); - len = sprint_target_ids(ctx, ids_buf, 320); - mutex_unlock(&ctx->kdamond_lock); - if (len < 0) - return len; - - return simple_read_from_buffer(buf, count, ppos, ids_buf, len); -} - -/* - * Converts a string into an integers array - * - * Returns an array of integers array if the conversion success, or NULL - * otherwise. - */ -static int *str_to_ints(const char *str, ssize_t len, ssize_t *nr_ints) -{ - int *array; - const int max_nr_ints = 32; - int nr; - int pos = 0, parsed, ret; - - *nr_ints = 0; - array = kmalloc_array(max_nr_ints, sizeof(*array), GFP_KERNEL); - if (!array) - return NULL; - while (*nr_ints < max_nr_ints && pos < len) { - ret = sscanf(&str[pos], "%d%n", &nr, &parsed); - pos += parsed; - if (ret != 1) - break; - array[*nr_ints] = nr; - *nr_ints += 1; - } - - return array; -} - -static void dbgfs_put_pids(struct pid **pids, int nr_pids) -{ - int i; - - for (i = 0; i < nr_pids; i++) - put_pid(pids[i]); -} - -/* - * Converts a string into an struct pid pointers array - * - * Returns an array of struct pid pointers if the conversion success, or NULL - * otherwise. - */ -static struct pid **str_to_pids(const char *str, ssize_t len, ssize_t *nr_pids) -{ - int *ints; - ssize_t nr_ints; - struct pid **pids; - - *nr_pids = 0; - - ints = str_to_ints(str, len, &nr_ints); - if (!ints) - return NULL; - - pids = kmalloc_array(nr_ints, sizeof(*pids), GFP_KERNEL); - if (!pids) - goto out; - - for (; *nr_pids < nr_ints; (*nr_pids)++) { - pids[*nr_pids] = find_get_pid(ints[*nr_pids]); - if (!pids[*nr_pids]) { - dbgfs_put_pids(pids, *nr_pids); - kfree(ints); - kfree(pids); - return NULL; - } - } - -out: - kfree(ints); - return pids; -} - -/* - * dbgfs_set_targets() - Set monitoring targets. - * @ctx: monitoring context - * @nr_targets: number of targets - * @pids: array of target pids (size is same to @nr_targets) - * - * This function should not be called while the kdamond is running. @pids is - * ignored if the context is not configured to have pid in each target. On - * failure, reference counts of all pids in @pids are decremented. - * - * Return: 0 on success, negative error code otherwise. - */ -static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets, - struct pid **pids) -{ - ssize_t i; - struct damon_target *t, *next; - - damon_for_each_target_safe(t, next, ctx) { - if (damon_target_has_pid(ctx)) - put_pid(t->pid); - damon_destroy_target(t); - } - - for (i = 0; i < nr_targets; i++) { - t = damon_new_target(); - if (!t) { - damon_for_each_target_safe(t, next, ctx) - damon_destroy_target(t); - if (damon_target_has_pid(ctx)) - dbgfs_put_pids(pids, nr_targets); - return -ENOMEM; - } - if (damon_target_has_pid(ctx)) - t->pid = pids[i]; - damon_add_target(ctx, t); - } - - return 0; -} - -static ssize_t dbgfs_target_ids_write(struct file *file, - const char __user *buf, size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - bool id_is_pid = true; - char *kbuf; - struct pid **target_pids = NULL; - ssize_t nr_targets; - ssize_t ret; - - kbuf = user_input_str(buf, count, ppos); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - - if (!strncmp(kbuf, "paddr\n", count)) { - id_is_pid = false; - nr_targets = 1; - } - - if (id_is_pid) { - target_pids = str_to_pids(kbuf, count, &nr_targets); - if (!target_pids) { - ret = -ENOMEM; - goto out; - } - } - - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) { - if (id_is_pid) - dbgfs_put_pids(target_pids, nr_targets); - ret = -EBUSY; - goto unlock_out; - } - - /* remove previously set targets */ - dbgfs_set_targets(ctx, 0, NULL); - if (!nr_targets) { - ret = count; - goto unlock_out; - } - - /* Configure the context for the address space type */ - if (id_is_pid) - ret = damon_select_ops(ctx, DAMON_OPS_VADDR); - else - ret = damon_select_ops(ctx, DAMON_OPS_PADDR); - if (ret) - goto unlock_out; - - ret = dbgfs_set_targets(ctx, nr_targets, target_pids); - if (!ret) - ret = count; - -unlock_out: - mutex_unlock(&ctx->kdamond_lock); - kfree(target_pids); -out: - kfree(kbuf); - return ret; -} - -static ssize_t sprint_init_regions(struct damon_ctx *c, char *buf, ssize_t len) -{ - struct damon_target *t; - struct damon_region *r; - int target_idx = 0; - int written = 0; - int rc; - - damon_for_each_target(t, c) { - damon_for_each_region(r, t) { - rc = scnprintf(&buf[written], len - written, - "%d %lu %lu\n", - target_idx, r->ar.start, r->ar.end); - if (!rc) - return -ENOMEM; - written += rc; - } - target_idx++; - } - return written; -} - -static ssize_t dbgfs_init_regions_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - char *kbuf; - ssize_t len; - - kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN); - if (!kbuf) - return -ENOMEM; - - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) { - mutex_unlock(&ctx->kdamond_lock); - len = -EBUSY; - goto out; - } - - len = sprint_init_regions(ctx, kbuf, count); - mutex_unlock(&ctx->kdamond_lock); - if (len < 0) - goto out; - len = simple_read_from_buffer(buf, count, ppos, kbuf, len); - -out: - kfree(kbuf); - return len; -} - -static int add_init_region(struct damon_ctx *c, int target_idx, - struct damon_addr_range *ar) -{ - struct damon_target *t; - struct damon_region *r, *prev; - unsigned long idx = 0; - int rc = -EINVAL; - - if (ar->start >= ar->end) - return -EINVAL; - - damon_for_each_target(t, c) { - if (idx++ == target_idx) { - r = damon_new_region(ar->start, ar->end); - if (!r) - return -ENOMEM; - damon_add_region(r, t); - if (damon_nr_regions(t) > 1) { - prev = damon_prev_region(r); - if (prev->ar.end > r->ar.start) { - damon_destroy_region(r, t); - return -EINVAL; - } - } - rc = 0; - } - } - return rc; -} - -static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len) -{ - struct damon_target *t; - struct damon_region *r, *next; - int pos = 0, parsed, ret; - int target_idx; - struct damon_addr_range ar; - int err; - - damon_for_each_target(t, c) { - damon_for_each_region_safe(r, next, t) - damon_destroy_region(r, t); - } - - while (pos < len) { - ret = sscanf(&str[pos], "%d %lu %lu%n", - &target_idx, &ar.start, &ar.end, &parsed); - if (ret != 3) - break; - err = add_init_region(c, target_idx, &ar); - if (err) - goto fail; - pos += parsed; - } - - return 0; - -fail: - damon_for_each_target(t, c) { - damon_for_each_region_safe(r, next, t) - damon_destroy_region(r, t); - } - return err; -} - -static ssize_t dbgfs_init_regions_write(struct file *file, - const char __user *buf, size_t count, - loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - char *kbuf; - ssize_t ret = count; - int err; - - kbuf = user_input_str(buf, count, ppos); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) { - ret = -EBUSY; - goto unlock_out; - } - - err = set_init_regions(ctx, kbuf, ret); - if (err) - ret = err; - -unlock_out: - mutex_unlock(&ctx->kdamond_lock); - kfree(kbuf); - return ret; -} - -static ssize_t dbgfs_kdamond_pid_read(struct file *file, - char __user *buf, size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - char *kbuf; - ssize_t len; - - kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN); - if (!kbuf) - return -ENOMEM; - - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) - len = scnprintf(kbuf, count, "%d\n", ctx->kdamond->pid); - else - len = scnprintf(kbuf, count, "none\n"); - mutex_unlock(&ctx->kdamond_lock); - if (!len) - goto out; - len = simple_read_from_buffer(buf, count, ppos, kbuf, len); - -out: - kfree(kbuf); - return len; -} - -static int damon_dbgfs_open(struct inode *inode, struct file *file) -{ - damon_dbgfs_warn_deprecation(); - - file->private_data = inode->i_private; - - return nonseekable_open(inode, file); -} - -static const struct file_operations attrs_fops = { - .open = damon_dbgfs_open, - .read = dbgfs_attrs_read, - .write = dbgfs_attrs_write, -}; - -static const struct file_operations schemes_fops = { - .open = damon_dbgfs_open, - .read = dbgfs_schemes_read, - .write = dbgfs_schemes_write, -}; - -static const struct file_operations target_ids_fops = { - .open = damon_dbgfs_open, - .read = dbgfs_target_ids_read, - .write = dbgfs_target_ids_write, -}; - -static const struct file_operations init_regions_fops = { - .open = damon_dbgfs_open, - .read = dbgfs_init_regions_read, - .write = dbgfs_init_regions_write, -}; - -static const struct file_operations kdamond_pid_fops = { - .open = damon_dbgfs_open, - .read = dbgfs_kdamond_pid_read, -}; - -static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx) -{ - const char * const file_names[] = {"attrs", "schemes", "target_ids", - "init_regions", "kdamond_pid"}; - const struct file_operations *fops[] = {&attrs_fops, &schemes_fops, - &target_ids_fops, &init_regions_fops, &kdamond_pid_fops}; - int i; - - for (i = 0; i < ARRAY_SIZE(file_names); i++) - debugfs_create_file(file_names[i], 0600, dir, ctx, fops[i]); -} - -static void dbgfs_before_terminate(struct damon_ctx *ctx) -{ - struct damon_target *t, *next; - - if (!damon_target_has_pid(ctx)) - return; - - mutex_lock(&ctx->kdamond_lock); - damon_for_each_target_safe(t, next, ctx) { - put_pid(t->pid); - damon_destroy_target(t); - } - mutex_unlock(&ctx->kdamond_lock); -} - -static struct damon_ctx *dbgfs_new_ctx(void) -{ - struct damon_ctx *ctx; - - ctx = damon_new_ctx(); - if (!ctx) - return NULL; - - if (damon_select_ops(ctx, DAMON_OPS_VADDR) && - damon_select_ops(ctx, DAMON_OPS_PADDR)) { - damon_destroy_ctx(ctx); - return NULL; - } - ctx->callback.before_terminate = dbgfs_before_terminate; - return ctx; -} - -static void dbgfs_destroy_ctx(struct damon_ctx *ctx) -{ - damon_destroy_ctx(ctx); -} - -static ssize_t damon_dbgfs_deprecated_read(struct file *file, - char __user *buf, size_t count, loff_t *ppos) -{ - static const char kbuf[512] = DAMON_DBGFS_DEPRECATION_NOTICE; - - return simple_read_from_buffer(buf, count, ppos, kbuf, strlen(kbuf)); -} - -/* - * Make a context of @name and create a debugfs directory for it. - * - * This function should be called while holding damon_dbgfs_lock. - * - * Returns 0 on success, negative error code otherwise. - */ -static int dbgfs_mk_context(char *name) -{ - struct dentry *root, **new_dirs, *new_dir; - struct damon_ctx **new_ctxs, *new_ctx; - - if (damon_nr_running_ctxs()) - return -EBUSY; - - new_ctxs = krealloc(dbgfs_ctxs, sizeof(*dbgfs_ctxs) * - (dbgfs_nr_ctxs + 1), GFP_KERNEL); - if (!new_ctxs) - return -ENOMEM; - dbgfs_ctxs = new_ctxs; - - new_dirs = krealloc(dbgfs_dirs, sizeof(*dbgfs_dirs) * - (dbgfs_nr_ctxs + 1), GFP_KERNEL); - if (!new_dirs) - return -ENOMEM; - dbgfs_dirs = new_dirs; - - root = dbgfs_dirs[0]; - if (!root) - return -ENOENT; - - new_dir = debugfs_create_dir(name, root); - /* Below check is required for a potential duplicated name case */ - if (IS_ERR(new_dir)) - return PTR_ERR(new_dir); - dbgfs_dirs[dbgfs_nr_ctxs] = new_dir; - - new_ctx = dbgfs_new_ctx(); - if (!new_ctx) { - debugfs_remove(new_dir); - dbgfs_dirs[dbgfs_nr_ctxs] = NULL; - return -ENOMEM; - } - - dbgfs_ctxs[dbgfs_nr_ctxs] = new_ctx; - dbgfs_fill_ctx_dir(dbgfs_dirs[dbgfs_nr_ctxs], - dbgfs_ctxs[dbgfs_nr_ctxs]); - dbgfs_nr_ctxs++; - - return 0; -} - -static ssize_t dbgfs_mk_context_write(struct file *file, - const char __user *buf, size_t count, loff_t *ppos) -{ - char *kbuf; - char *ctx_name; - ssize_t ret; - - kbuf = user_input_str(buf, count, ppos); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - ctx_name = kmalloc(count + 1, GFP_KERNEL); - if (!ctx_name) { - kfree(kbuf); - return -ENOMEM; - } - - /* Trim white space */ - if (sscanf(kbuf, "%s", ctx_name) != 1) { - ret = -EINVAL; - goto out; - } - - mutex_lock(&damon_dbgfs_lock); - ret = dbgfs_mk_context(ctx_name); - if (!ret) - ret = count; - mutex_unlock(&damon_dbgfs_lock); - -out: - kfree(kbuf); - kfree(ctx_name); - return ret; -} - -/* - * Remove a context of @name and its debugfs directory. - * - * This function should be called while holding damon_dbgfs_lock. - * - * Return 0 on success, negative error code otherwise. - */ -static int dbgfs_rm_context(char *name) -{ - struct dentry *root, *dir, **new_dirs; - struct inode *inode; - struct damon_ctx **new_ctxs; - int i, j; - int ret = 0; - - if (damon_nr_running_ctxs()) - return -EBUSY; - - root = dbgfs_dirs[0]; - if (!root) - return -ENOENT; - - dir = debugfs_lookup(name, root); - if (!dir) - return -ENOENT; - - inode = d_inode(dir); - if (!S_ISDIR(inode->i_mode)) { - ret = -EINVAL; - goto out_dput; - } - - new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs), - GFP_KERNEL); - if (!new_dirs) { - ret = -ENOMEM; - goto out_dput; - } - - new_ctxs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_ctxs), - GFP_KERNEL); - if (!new_ctxs) { - ret = -ENOMEM; - goto out_new_dirs; - } - - for (i = 0, j = 0; i < dbgfs_nr_ctxs; i++) { - if (dbgfs_dirs[i] == dir) { - debugfs_remove(dbgfs_dirs[i]); - dbgfs_destroy_ctx(dbgfs_ctxs[i]); - continue; - } - new_dirs[j] = dbgfs_dirs[i]; - new_ctxs[j++] = dbgfs_ctxs[i]; - } - - kfree(dbgfs_dirs); - kfree(dbgfs_ctxs); - - dbgfs_dirs = new_dirs; - dbgfs_ctxs = new_ctxs; - dbgfs_nr_ctxs--; - - goto out_dput; - -out_new_dirs: - kfree(new_dirs); -out_dput: - dput(dir); - return ret; -} - -static ssize_t dbgfs_rm_context_write(struct file *file, - const char __user *buf, size_t count, loff_t *ppos) -{ - char *kbuf; - ssize_t ret; - char *ctx_name; - - kbuf = user_input_str(buf, count, ppos); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - ctx_name = kmalloc(count + 1, GFP_KERNEL); - if (!ctx_name) { - kfree(kbuf); - return -ENOMEM; - } - - /* Trim white space */ - if (sscanf(kbuf, "%s", ctx_name) != 1) { - ret = -EINVAL; - goto out; - } - - mutex_lock(&damon_dbgfs_lock); - ret = dbgfs_rm_context(ctx_name); - if (!ret) - ret = count; - mutex_unlock(&damon_dbgfs_lock); - -out: - kfree(kbuf); - kfree(ctx_name); - return ret; -} - -static ssize_t dbgfs_monitor_on_read(struct file *file, - char __user *buf, size_t count, loff_t *ppos) -{ - char monitor_on_buf[5]; - bool monitor_on = damon_nr_running_ctxs() != 0; - int len; - - len = scnprintf(monitor_on_buf, 5, monitor_on ? "on\n" : "off\n"); - - return simple_read_from_buffer(buf, count, ppos, monitor_on_buf, len); -} - -static ssize_t dbgfs_monitor_on_write(struct file *file, - const char __user *buf, size_t count, loff_t *ppos) -{ - ssize_t ret; - char *kbuf; - - kbuf = user_input_str(buf, count, ppos); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - - /* Remove white space */ - if (sscanf(kbuf, "%s", kbuf) != 1) { - kfree(kbuf); - return -EINVAL; - } - - mutex_lock(&damon_dbgfs_lock); - if (!strncmp(kbuf, "on", count)) { - int i; - - for (i = 0; i < dbgfs_nr_ctxs; i++) { - if (damon_targets_empty(dbgfs_ctxs[i])) { - kfree(kbuf); - mutex_unlock(&damon_dbgfs_lock); - return -EINVAL; - } - } - ret = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs, true); - } else if (!strncmp(kbuf, "off", count)) { - ret = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs); - } else { - ret = -EINVAL; - } - mutex_unlock(&damon_dbgfs_lock); - - if (!ret) - ret = count; - kfree(kbuf); - return ret; -} - -static int damon_dbgfs_static_file_open(struct inode *inode, struct file *file) -{ - damon_dbgfs_warn_deprecation(); - return nonseekable_open(inode, file); -} - -static const struct file_operations deprecated_fops = { - .read = damon_dbgfs_deprecated_read, -}; - -static const struct file_operations mk_contexts_fops = { - .open = damon_dbgfs_static_file_open, - .write = dbgfs_mk_context_write, -}; - -static const struct file_operations rm_contexts_fops = { - .open = damon_dbgfs_static_file_open, - .write = dbgfs_rm_context_write, -}; - -static const struct file_operations monitor_on_fops = { - .open = damon_dbgfs_static_file_open, - .read = dbgfs_monitor_on_read, - .write = dbgfs_monitor_on_write, -}; - -static int __init __damon_dbgfs_init(void) -{ - struct dentry *dbgfs_root; - const char * const file_names[] = {"mk_contexts", "rm_contexts", - "monitor_on_DEPRECATED", "DEPRECATED"}; - const struct file_operations *fops[] = {&mk_contexts_fops, - &rm_contexts_fops, &monitor_on_fops, &deprecated_fops}; - int i; - - dbgfs_root = debugfs_create_dir("damon", NULL); - - for (i = 0; i < ARRAY_SIZE(file_names); i++) - debugfs_create_file(file_names[i], 0600, dbgfs_root, NULL, - fops[i]); - dbgfs_fill_ctx_dir(dbgfs_root, dbgfs_ctxs[0]); - - dbgfs_dirs = kmalloc(sizeof(dbgfs_root), GFP_KERNEL); - if (!dbgfs_dirs) { - debugfs_remove(dbgfs_root); - return -ENOMEM; - } - dbgfs_dirs[0] = dbgfs_root; - - return 0; -} - -/* - * Functions for the initialization - */ - -static int __init damon_dbgfs_init(void) -{ - int rc = -ENOMEM; - - mutex_lock(&damon_dbgfs_lock); - dbgfs_ctxs = kmalloc(sizeof(*dbgfs_ctxs), GFP_KERNEL); - if (!dbgfs_ctxs) - goto out; - dbgfs_ctxs[0] = dbgfs_new_ctx(); - if (!dbgfs_ctxs[0]) { - kfree(dbgfs_ctxs); - goto out; - } - dbgfs_nr_ctxs = 1; - - rc = __damon_dbgfs_init(); - if (rc) { - kfree(dbgfs_ctxs[0]); - kfree(dbgfs_ctxs); - pr_err("%s: dbgfs init failed\n", __func__); - } - -out: - mutex_unlock(&damon_dbgfs_lock); - return rc; -} - -module_init(damon_dbgfs_init); - -#include "tests/dbgfs-kunit.h" diff --git a/mm/damon/modules-common.c b/mm/damon/modules-common.c index 7cf96574cde7..86d58f8c4f63 100644 --- a/mm/damon/modules-common.c +++ b/mm/damon/modules-common.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Common Primitives for DAMON Modules + * Common Code for DAMON Modules * * Author: SeongJae Park <sj@kernel.org> */ diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h index f49cdb417005..f103ad556368 100644 --- a/mm/damon/modules-common.h +++ b/mm/damon/modules-common.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Common Primitives for DAMON Modules + * Common Code for DAMON Modules * * Author: SeongJae Park <sj@kernel.org> */ diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index d25d99cb5f2b..b43620fee6bb 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Common Primitives for Data Access Monitoring + * Common Code for Data Access Monitoring * * Author: SeongJae Park <sj@kernel.org> */ @@ -9,6 +9,8 @@ #include <linux/page_idle.h> #include <linux/pagemap.h> #include <linux/rmap.h> +#include <linux/swap.h> +#include <linux/swapops.h> #include "ops-common.h" @@ -24,7 +26,7 @@ struct folio *damon_get_folio(unsigned long pfn) struct page *page = pfn_to_online_page(pfn); struct folio *folio; - if (!page || PageTail(page)) + if (!page) return NULL; folio = page_folio(page); @@ -39,12 +41,29 @@ struct folio *damon_get_folio(unsigned long pfn) void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr) { - struct folio *folio = damon_get_folio(pte_pfn(ptep_get(pte))); + pte_t pteval = ptep_get(pte); + struct folio *folio; + bool young = false; + unsigned long pfn; + + if (likely(pte_present(pteval))) + pfn = pte_pfn(pteval); + else + pfn = swp_offset_pfn(pte_to_swp_entry(pteval)); + folio = damon_get_folio(pfn); if (!folio) return; - if (ptep_clear_young_notify(vma, addr, pte)) + /* + * PFN swap PTEs, such as device-exclusive ones, that actually map pages + * are "old" from a CPU perspective. The MMU notifier takes care of any + * device aspects. + */ + if (likely(pte_present(pteval))) + young |= ptep_test_and_clear_young(vma, addr, pte); + young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE); + if (young) folio_set_young(folio); folio_set_idle(folio); diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h index 18d837d11bce..cc9f5da9c012 100644 --- a/mm/damon/ops-common.h +++ b/mm/damon/ops-common.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Common Primitives for Data Access Monitoring + * Common Code for Data Access Monitoring * * Author: SeongJae Park <sj@kernel.org> */ diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index a9ff35341d65..4102a8c5f992 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * DAMON Primitives for The Physical Address Space + * DAMON Code for The Physical Address Space * * Author: SeongJae Park <sj@kernel.org> */ @@ -92,12 +92,20 @@ static bool damon_folio_young_one(struct folio *folio, { bool *accessed = arg; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); + pte_t pte; *accessed = false; while (page_vma_mapped_walk(&pvmw)) { addr = pvmw.address; if (pvmw.pte) { - *accessed = pte_young(ptep_get(pvmw.pte)) || + pte = ptep_get(pvmw.pte); + + /* + * PFN swap PTEs, such as device-exclusive ones, that + * actually map pages are "old" from a CPU perspective. + * The MMU notifier takes care of any device aspects. + */ + *accessed = (pte_present(pte) && pte_young(pte)) || !folio_test_idle(folio) || mmu_notifier_test_young(vma->vm_mm, addr); } else { @@ -198,16 +206,20 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) return max_nr_accesses; } -static bool __damos_pa_filter_out(struct damos_filter *filter, +static bool damos_pa_filter_match(struct damos_filter *filter, struct folio *folio) { bool matched = false; struct mem_cgroup *memcg; + size_t folio_sz; switch (filter->type) { case DAMOS_FILTER_TYPE_ANON: matched = folio_test_anon(folio); break; + case DAMOS_FILTER_TYPE_ACTIVE: + matched = folio_test_active(folio); + break; case DAMOS_FILTER_TYPE_MEMCG: rcu_read_lock(); memcg = folio_memcg_check(folio); @@ -222,6 +234,14 @@ static bool __damos_pa_filter_out(struct damos_filter *filter, if (matched) damon_folio_mkold(folio); break; + case DAMOS_FILTER_TYPE_HUGEPAGE_SIZE: + folio_sz = folio_size(folio); + matched = filter->sz_range.min <= folio_sz && + folio_sz <= filter->sz_range.max; + break; + case DAMOS_FILTER_TYPE_UNMAPPED: + matched = !folio_mapped(folio) || !folio_raw_mapping(folio); + break; default: break; } @@ -236,42 +256,63 @@ static bool damos_pa_filter_out(struct damos *scheme, struct folio *folio) { struct damos_filter *filter; - damos_for_each_filter(filter, scheme) { - if (__damos_pa_filter_out(filter, folio)) - return true; + if (scheme->core_filters_allowed) + return false; + + damos_for_each_ops_filter(filter, scheme) { + if (damos_pa_filter_match(filter, folio)) + return !filter->allow; + } + return scheme->ops_filters_default_reject; +} + +static bool damon_pa_invalid_damos_folio(struct folio *folio, struct damos *s) +{ + if (!folio) + return true; + if (folio == s->last_applied) { + folio_put(folio); + return true; } return false; } -static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s) +static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s, + unsigned long *sz_filter_passed) { unsigned long addr, applied; LIST_HEAD(folio_list); bool install_young_filter = true; struct damos_filter *filter; + struct folio *folio; /* check access in page level again by default */ - damos_for_each_filter(filter, s) { + damos_for_each_ops_filter(filter, s) { if (filter->type == DAMOS_FILTER_TYPE_YOUNG) { install_young_filter = false; break; } } if (install_young_filter) { - filter = damos_new_filter(DAMOS_FILTER_TYPE_YOUNG, true); + filter = damos_new_filter( + DAMOS_FILTER_TYPE_YOUNG, true, false); if (!filter) return 0; damos_add_filter(s, filter); } - for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { - struct folio *folio = damon_get_folio(PHYS_PFN(addr)); - - if (!folio) + addr = r->ar.start; + while (addr < r->ar.end) { + folio = damon_get_folio(PHYS_PFN(addr)); + if (damon_pa_invalid_damos_folio(folio, s)) { + addr += PAGE_SIZE; continue; + } if (damos_pa_filter_out(s, folio)) goto put_folio; + else + *sz_filter_passed += folio_size(folio); folio_clear_referenced(folio); folio_test_clear_young(folio); @@ -282,28 +323,36 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s) else list_add(&folio->lru, &folio_list); put_folio: + addr += folio_size(folio); folio_put(folio); } if (install_young_filter) damos_destroy_filter(filter); applied = reclaim_pages(&folio_list); cond_resched(); + s->last_applied = folio; return applied * PAGE_SIZE; } static inline unsigned long damon_pa_mark_accessed_or_deactivate( - struct damon_region *r, struct damos *s, bool mark_accessed) + struct damon_region *r, struct damos *s, bool mark_accessed, + unsigned long *sz_filter_passed) { unsigned long addr, applied = 0; + struct folio *folio; - for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { - struct folio *folio = damon_get_folio(PHYS_PFN(addr)); - - if (!folio) + addr = r->ar.start; + while (addr < r->ar.end) { + folio = damon_get_folio(PHYS_PFN(addr)); + if (damon_pa_invalid_damos_folio(folio, s)) { + addr += PAGE_SIZE; continue; + } if (damos_pa_filter_out(s, folio)) goto put_folio; + else + *sz_filter_passed += folio_size(folio); if (mark_accessed) folio_mark_accessed(folio); @@ -311,21 +360,25 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate( folio_deactivate(folio); applied += folio_nr_pages(folio); put_folio: + addr += folio_size(folio); folio_put(folio); } + s->last_applied = folio; return applied * PAGE_SIZE; } static unsigned long damon_pa_mark_accessed(struct damon_region *r, - struct damos *s) + struct damos *s, unsigned long *sz_filter_passed) { - return damon_pa_mark_accessed_or_deactivate(r, s, true); + return damon_pa_mark_accessed_or_deactivate(r, s, true, + sz_filter_passed); } static unsigned long damon_pa_deactivate_pages(struct damon_region *r, - struct damos *s) + struct damos *s, unsigned long *sz_filter_passed) { - return damon_pa_mark_accessed_or_deactivate(r, s, false); + return damon_pa_mark_accessed_or_deactivate(r, s, false, + sz_filter_passed); } static unsigned int __damon_pa_migrate_folio_list( @@ -449,48 +502,90 @@ static unsigned long damon_pa_migrate_pages(struct list_head *folio_list, return nr_migrated; } -static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s) +static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s, + unsigned long *sz_filter_passed) { unsigned long addr, applied; LIST_HEAD(folio_list); + struct folio *folio; - for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { - struct folio *folio = damon_get_folio(PHYS_PFN(addr)); - - if (!folio) + addr = r->ar.start; + while (addr < r->ar.end) { + folio = damon_get_folio(PHYS_PFN(addr)); + if (damon_pa_invalid_damos_folio(folio, s)) { + addr += PAGE_SIZE; continue; + } if (damos_pa_filter_out(s, folio)) goto put_folio; + else + *sz_filter_passed += folio_size(folio); if (!folio_isolate_lru(folio)) goto put_folio; list_add(&folio->lru, &folio_list); put_folio: + addr += folio_size(folio); folio_put(folio); } applied = damon_pa_migrate_pages(&folio_list, s->target_nid); cond_resched(); + s->last_applied = folio; return applied * PAGE_SIZE; } +static bool damon_pa_scheme_has_filter(struct damos *s) +{ + struct damos_filter *f; + + damos_for_each_ops_filter(f, s) + return true; + return false; +} + +static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s, + unsigned long *sz_filter_passed) +{ + unsigned long addr; + struct folio *folio; + + if (!damon_pa_scheme_has_filter(s)) + return 0; + + addr = r->ar.start; + while (addr < r->ar.end) { + folio = damon_get_folio(PHYS_PFN(addr)); + if (damon_pa_invalid_damos_folio(folio, s)) { + addr += PAGE_SIZE; + continue; + } + + if (!damos_pa_filter_out(s, folio)) + *sz_filter_passed += folio_size(folio); + addr += folio_size(folio); + folio_put(folio); + } + s->last_applied = folio; + return 0; +} static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, - struct damos *scheme) + struct damos *scheme, unsigned long *sz_filter_passed) { switch (scheme->action) { case DAMOS_PAGEOUT: - return damon_pa_pageout(r, scheme); + return damon_pa_pageout(r, scheme, sz_filter_passed); case DAMOS_LRU_PRIO: - return damon_pa_mark_accessed(r, scheme); + return damon_pa_mark_accessed(r, scheme, sz_filter_passed); case DAMOS_LRU_DEPRIO: - return damon_pa_deactivate_pages(r, scheme); + return damon_pa_deactivate_pages(r, scheme, sz_filter_passed); case DAMOS_MIGRATE_HOT: case DAMOS_MIGRATE_COLD: - return damon_pa_migrate(r, scheme); + return damon_pa_migrate(r, scheme, sz_filter_passed); case DAMOS_STAT: - break; + return damon_pa_stat(r, scheme, sz_filter_passed); default: /* DAMOS actions that not yet supported by 'paddr'. */ break; @@ -528,7 +623,6 @@ static int __init damon_pa_initcall(void) .update = NULL, .prepare_access_checks = damon_pa_prepare_access_checks, .check_accesses = damon_pa_check_accesses, - .reset_aggregated = NULL, .target_valid = NULL, .cleanup = NULL, .apply_scheme = damon_pa_apply_scheme, diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 9e0077a9404e..a675150965e0 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -221,7 +221,7 @@ static int damon_reclaim_apply_parameters(void) } if (skip_anon) { - filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true); + filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, false); if (!filter) goto out; damos_add_filter(scheme, filter); diff --git a/mm/damon/sysfs-common.c b/mm/damon/sysfs-common.c index 70edf45c2174..ffaf285e241a 100644 --- a/mm/damon/sysfs-common.c +++ b/mm/damon/sysfs-common.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Common Primitives for DAMON Sysfs Interface + * Common Code for DAMON Sysfs Interface * * Author: SeongJae Park <sj@kernel.org> */ diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 9a18f3c535d3..2099adee11d0 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Common Primitives for DAMON Sysfs Interface + * Common Code for DAMON Sysfs Interface * * Author: SeongJae Park <sj@kernel.org> */ @@ -45,19 +45,13 @@ void damon_sysfs_schemes_update_stats( struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx); -int damon_sysfs_schemes_update_regions_start( - struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx, bool total_bytes_only); - -void damos_sysfs_mark_finished_regions_updates(struct damon_ctx *ctx); - -bool damos_sysfs_regions_upd_done(void); - -int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx); +void damos_sysfs_populate_region_dir(struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx, struct damon_target *t, + struct damon_region *r, struct damos *s, + bool total_bytes_only, unsigned long sz_filter_passed); int damon_sysfs_schemes_clear_regions( - struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx); + struct damon_sysfs_schemes *sysfs_schemes); int damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index b095457380b5..30ae7518ffbf 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -19,6 +19,7 @@ struct damon_sysfs_scheme_region { struct damon_addr_range ar; unsigned int nr_accesses; unsigned int age; + unsigned long sz_filter_passed; struct list_head list; }; @@ -74,6 +75,15 @@ static ssize_t age_show(struct kobject *kobj, struct kobj_attribute *attr, return sysfs_emit(buf, "%u\n", region->age); } +static ssize_t sz_filter_passed_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%lu\n", region->sz_filter_passed); +} + static void damon_sysfs_scheme_region_release(struct kobject *kobj) { struct damon_sysfs_scheme_region *region = container_of(kobj, @@ -95,11 +105,15 @@ static struct kobj_attribute damon_sysfs_scheme_region_nr_accesses_attr = static struct kobj_attribute damon_sysfs_scheme_region_age_attr = __ATTR_RO_MODE(age, 0400); +static struct kobj_attribute damon_sysfs_scheme_region_sz_filter_passed_attr = + __ATTR_RO_MODE(sz_filter_passed, 0400); + static struct attribute *damon_sysfs_scheme_region_attrs[] = { &damon_sysfs_scheme_region_start_attr.attr, &damon_sysfs_scheme_region_end_attr.attr, &damon_sysfs_scheme_region_nr_accesses_attr.attr, &damon_sysfs_scheme_region_age_attr.attr, + &damon_sysfs_scheme_region_sz_filter_passed_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_scheme_region); @@ -114,55 +128,11 @@ static const struct kobj_type damon_sysfs_scheme_region_ktype = { * scheme regions directory */ -/* - * enum damos_sysfs_regions_upd_status - Represent DAMOS tried regions update - * status - * @DAMOS_TRIED_REGIONS_UPD_IDLE: Waiting for next request. - * @DAMOS_TRIED_REGIONS_UPD_STARTED: Update started. - * @DAMOS_TRIED_REGIONS_UPD_FINISHED: Update finished. - * - * Each DAMON-based operation scheme (&struct damos) has its own apply - * interval, and we need to expose the scheme tried regions based on only - * single snapshot. For this, we keep the tried regions update status for each - * scheme. The status becomes 'idle' at the beginning. - * - * Once the tried regions update request is received, the request handling - * start function (damon_sysfs_scheme_update_regions_start()) sets the status - * of all schemes as 'idle' again, and register ->before_damos_apply() - * callback. - * - * Then, the first followup ->before_damos_apply() callback - * (damon_sysfs_before_damos_apply()) sets the status 'started'. The first - * ->after_sampling() or ->after_aggregation() callback - * (damon_sysfs_cmd_request_callback()) after the call is called only after - * the scheme is completely applied to the given snapshot. Hence the callback - * knows the situation by showing 'started' status, and sets the status as - * 'finished'. Then, damon_sysfs_before_damos_apply() understands the - * situation by showing the 'finished' status and do nothing. - * - * If DAMOS is not applied to any region due to any reasons including the - * access pattern, the watermarks, the quotas, and the filters, - * ->before_damos_apply() will not be called back. Until the situation is - * changed, the update will not be finished. To avoid this, - * damon_sysfs_after_sampling() set the status as 'finished' if more than two - * apply intervals of the scheme is passed while the state is 'idle'. - * - * Finally, the tried regions request handling finisher function - * (damon_sysfs_schemes_update_regions_stop()) unregisters the callbacks. - */ -enum damos_sysfs_regions_upd_status { - DAMOS_TRIED_REGIONS_UPD_IDLE, - DAMOS_TRIED_REGIONS_UPD_STARTED, - DAMOS_TRIED_REGIONS_UPD_FINISHED, -}; - struct damon_sysfs_scheme_regions { struct kobject kobj; struct list_head regions_list; int nr_regions; unsigned long total_bytes; - enum damos_sysfs_regions_upd_status upd_status; - unsigned long upd_timeout_jiffies; }; static struct damon_sysfs_scheme_regions * @@ -178,7 +148,6 @@ damon_sysfs_scheme_regions_alloc(void) INIT_LIST_HEAD(®ions->regions_list); regions->nr_regions = 0; regions->total_bytes = 0; - regions->upd_status = DAMOS_TRIED_REGIONS_UPD_IDLE; return regions; } @@ -233,6 +202,7 @@ struct damon_sysfs_stats { unsigned long sz_tried; unsigned long nr_applied; unsigned long sz_applied; + unsigned long sz_ops_filter_passed; unsigned long qt_exceeds; }; @@ -277,6 +247,15 @@ static ssize_t sz_applied_show(struct kobject *kobj, return sysfs_emit(buf, "%lu\n", stats->sz_applied); } +static ssize_t sz_ops_filter_passed_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->sz_ops_filter_passed); +} + static ssize_t qt_exceeds_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -303,6 +282,9 @@ static struct kobj_attribute damon_sysfs_stats_nr_applied_attr = static struct kobj_attribute damon_sysfs_stats_sz_applied_attr = __ATTR_RO_MODE(sz_applied, 0400); +static struct kobj_attribute damon_sysfs_stats_sz_ops_filter_passed_attr = + __ATTR_RO_MODE(sz_ops_filter_passed, 0400); + static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr = __ATTR_RO_MODE(qt_exceeds, 0400); @@ -311,6 +293,7 @@ static struct attribute *damon_sysfs_stats_attrs[] = { &damon_sysfs_stats_sz_tried_attr.attr, &damon_sysfs_stats_nr_applied_attr.attr, &damon_sysfs_stats_sz_applied_attr.attr, + &damon_sysfs_stats_sz_ops_filter_passed_attr.attr, &damon_sysfs_stats_qt_exceeds_attr.attr, NULL, }; @@ -326,25 +309,46 @@ static const struct kobj_type damon_sysfs_stats_ktype = { * filter directory */ +/* + * enum damos_sysfs_filter_handle_layer - Layers handling filters of a dir. + */ +enum damos_sysfs_filter_handle_layer { + DAMOS_SYSFS_FILTER_HANDLE_LAYER_CORE, + DAMOS_SYSFS_FILTER_HANDLE_LAYER_OPS, + DAMOS_SYSFS_FILTER_HANDLE_LAYER_BOTH, +}; + struct damon_sysfs_scheme_filter { struct kobject kobj; + enum damos_sysfs_filter_handle_layer handle_layer; enum damos_filter_type type; bool matching; + bool allow; char *memcg_path; struct damon_addr_range addr_range; + struct damon_size_range sz_range; int target_idx; }; -static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(void) +static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc( + enum damos_sysfs_filter_handle_layer layer) { - return kzalloc(sizeof(struct damon_sysfs_scheme_filter), GFP_KERNEL); + struct damon_sysfs_scheme_filter *filter; + + filter = kzalloc(sizeof(struct damon_sysfs_scheme_filter), GFP_KERNEL); + if (filter) + filter->handle_layer = layer; + return filter; } /* Should match with enum damos_filter_type */ static const char * const damon_sysfs_scheme_filter_type_strs[] = { "anon", + "active", "memcg", "young", + "hugepage_size", + "unmapped", "addr", "target", }; @@ -359,6 +363,23 @@ static ssize_t type_show(struct kobject *kobj, damon_sysfs_scheme_filter_type_strs[filter->type]); } +static bool damos_sysfs_scheme_filter_valid_type( + enum damos_sysfs_filter_handle_layer layer, + enum damos_filter_type type) +{ + switch (layer) { + case DAMOS_SYSFS_FILTER_HANDLE_LAYER_BOTH: + return true; + case DAMOS_SYSFS_FILTER_HANDLE_LAYER_CORE: + return !damos_filter_for_ops(type); + case DAMOS_SYSFS_FILTER_HANDLE_LAYER_OPS: + return damos_filter_for_ops(type); + default: + break; + } + return false; +} + static ssize_t type_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { @@ -370,6 +391,9 @@ static ssize_t type_store(struct kobject *kobj, for (type = 0; type < NR_DAMOS_FILTER_TYPES; type++) { if (sysfs_streq(buf, damon_sysfs_scheme_filter_type_strs[ type])) { + if (!damos_sysfs_scheme_filter_valid_type( + filter->handle_layer, type)) + break; filter->type = type; ret = count; break; @@ -402,6 +426,30 @@ static ssize_t matching_store(struct kobject *kobj, return count; } +static ssize_t allow_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + + return sysfs_emit(buf, "%c\n", filter->allow ? 'Y' : 'N'); +} + +static ssize_t allow_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + bool allow; + int err = kstrtobool(buf, &allow); + + if (err) + return err; + + filter->allow = allow; + return count; +} + static ssize_t memcg_path_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -417,12 +465,14 @@ static ssize_t memcg_path_store(struct kobject *kobj, { struct damon_sysfs_scheme_filter *filter = container_of(kobj, struct damon_sysfs_scheme_filter, kobj); - char *path = kmalloc(sizeof(*path) * (count + 1), GFP_KERNEL); + char *path = kmalloc_array(size_add(count, 1), sizeof(*path), + GFP_KERNEL); if (!path) return -ENOMEM; strscpy(path, buf, count + 1); + kfree(filter->memcg_path); filter->memcg_path = path; return count; } @@ -465,6 +515,44 @@ static ssize_t addr_end_store(struct kobject *kobj, return err ? err : count; } +static ssize_t min_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + + return sysfs_emit(buf, "%lu\n", filter->sz_range.min); +} + +static ssize_t min_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + int err = kstrtoul(buf, 0, &filter->sz_range.min); + + return err ? err : count; +} + +static ssize_t max_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + + return sysfs_emit(buf, "%lu\n", filter->sz_range.max); +} + +static ssize_t max_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + int err = kstrtoul(buf, 0, &filter->sz_range.max); + + return err ? err : count; +} + static ssize_t damon_target_idx_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -499,6 +587,9 @@ static struct kobj_attribute damon_sysfs_scheme_filter_type_attr = static struct kobj_attribute damon_sysfs_scheme_filter_matching_attr = __ATTR_RW_MODE(matching, 0600); +static struct kobj_attribute damon_sysfs_scheme_filter_allow_attr = + __ATTR_RW_MODE(allow, 0600); + static struct kobj_attribute damon_sysfs_scheme_filter_memcg_path_attr = __ATTR_RW_MODE(memcg_path, 0600); @@ -508,15 +599,24 @@ static struct kobj_attribute damon_sysfs_scheme_filter_addr_start_attr = static struct kobj_attribute damon_sysfs_scheme_filter_addr_end_attr = __ATTR_RW_MODE(addr_end, 0600); +static struct kobj_attribute damon_sysfs_scheme_filter_min_attr = + __ATTR_RW_MODE(min, 0600); + +static struct kobj_attribute damon_sysfs_scheme_filter_max_attr = + __ATTR_RW_MODE(max, 0600); + static struct kobj_attribute damon_sysfs_scheme_filter_damon_target_idx_attr = __ATTR_RW_MODE(damon_target_idx, 0600); static struct attribute *damon_sysfs_scheme_filter_attrs[] = { &damon_sysfs_scheme_filter_type_attr.attr, &damon_sysfs_scheme_filter_matching_attr.attr, + &damon_sysfs_scheme_filter_allow_attr.attr, &damon_sysfs_scheme_filter_memcg_path_attr.attr, &damon_sysfs_scheme_filter_addr_start_attr.attr, &damon_sysfs_scheme_filter_addr_end_attr.attr, + &damon_sysfs_scheme_filter_min_attr.attr, + &damon_sysfs_scheme_filter_max_attr.attr, &damon_sysfs_scheme_filter_damon_target_idx_attr.attr, NULL, }; @@ -534,14 +634,20 @@ static const struct kobj_type damon_sysfs_scheme_filter_ktype = { struct damon_sysfs_scheme_filters { struct kobject kobj; + enum damos_sysfs_filter_handle_layer handle_layer; struct damon_sysfs_scheme_filter **filters_arr; int nr; }; static struct damon_sysfs_scheme_filters * -damon_sysfs_scheme_filters_alloc(void) +damon_sysfs_scheme_filters_alloc(enum damos_sysfs_filter_handle_layer layer) { - return kzalloc(sizeof(struct damon_sysfs_scheme_filters), GFP_KERNEL); + struct damon_sysfs_scheme_filters *filters; + + filters = kzalloc(sizeof(struct damon_sysfs_scheme_filters), GFP_KERNEL); + if (filters) + filters->handle_layer = layer; + return filters; } static void damon_sysfs_scheme_filters_rm_dirs( @@ -574,7 +680,8 @@ static int damon_sysfs_scheme_filters_add_dirs( filters->filters_arr = filters_arr; for (i = 0; i < nr_filters; i++) { - filter = damon_sysfs_scheme_filter_alloc(); + filter = damon_sysfs_scheme_filter_alloc( + filters->handle_layer); if (!filter) { damon_sysfs_scheme_filters_rm_dirs(filters); return -ENOMEM; @@ -831,12 +938,15 @@ struct damos_sysfs_quota_goal { enum damos_quota_goal_metric metric; unsigned long target_value; unsigned long current_value; + int nid; }; -/* This should match with enum damos_action */ +/* This should match with enum damos_quota_goal_metric */ static const char * const damos_sysfs_quota_goal_metric_strs[] = { "user_input", "some_mem_psi_us", + "node_mem_used_bp", + "node_mem_free_bp", }; static struct damos_sysfs_quota_goal *damos_sysfs_quota_goal_alloc(void) @@ -909,6 +1019,28 @@ static ssize_t current_value_store(struct kobject *kobj, return err ? err : count; } +static ssize_t nid_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damos_sysfs_quota_goal *goal = container_of(kobj, struct + damos_sysfs_quota_goal, kobj); + + /* todo: return error if the goal is not using nid */ + + return sysfs_emit(buf, "%d\n", goal->nid); +} + +static ssize_t nid_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damos_sysfs_quota_goal *goal = container_of(kobj, struct + damos_sysfs_quota_goal, kobj); + int err = kstrtoint(buf, 0, &goal->nid); + + /* feed callback should check existence of this file and read value */ + return err ? err : count; +} + static void damos_sysfs_quota_goal_release(struct kobject *kobj) { /* or, notify this release to the feed callback */ @@ -924,10 +1056,14 @@ static struct kobj_attribute damos_sysfs_quota_goal_target_value_attr = static struct kobj_attribute damos_sysfs_quota_goal_current_value_attr = __ATTR_RW_MODE(current_value, 0600); +static struct kobj_attribute damos_sysfs_quota_goal_nid_attr = + __ATTR_RW_MODE(nid, 0600); + static struct attribute *damos_sysfs_quota_goal_attrs[] = { &damos_sysfs_quota_goal_target_metric_attr.attr, &damos_sysfs_quota_goal_target_value_attr.attr, &damos_sysfs_quota_goal_current_value_attr.attr, + &damos_sysfs_quota_goal_nid_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damos_sysfs_quota_goal); @@ -1367,7 +1503,7 @@ static int damon_sysfs_access_pattern_add_range_dir( if (!range) return -ENOMEM; err = kobject_init_and_add(&range->kobj, &damon_sysfs_ul_range_ktype, - &access_pattern->kobj, name); + &access_pattern->kobj, "%s", name); if (err) kobject_put(&range->kobj); else @@ -1443,6 +1579,8 @@ struct damon_sysfs_scheme { unsigned long apply_interval_us; struct damon_sysfs_quotas *quotas; struct damon_sysfs_watermarks *watermarks; + struct damon_sysfs_scheme_filters *core_filters; + struct damon_sysfs_scheme_filters *ops_filters; struct damon_sysfs_scheme_filters *filters; struct damon_sysfs_stats *stats; struct damon_sysfs_scheme_regions *tried_regions; @@ -1543,21 +1681,53 @@ static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme) return err; } -static int damon_sysfs_scheme_set_filters(struct damon_sysfs_scheme *scheme) +static int damon_sysfs_scheme_set_filters(struct damon_sysfs_scheme *scheme, + enum damos_sysfs_filter_handle_layer layer, const char *name, + struct damon_sysfs_scheme_filters **filters_ptr) { struct damon_sysfs_scheme_filters *filters = - damon_sysfs_scheme_filters_alloc(); + damon_sysfs_scheme_filters_alloc(layer); int err; if (!filters) return -ENOMEM; err = kobject_init_and_add(&filters->kobj, &damon_sysfs_scheme_filters_ktype, &scheme->kobj, - "filters"); + "%s", name); if (err) kobject_put(&filters->kobj); else - scheme->filters = filters; + *filters_ptr = filters; + return err; +} + +static int damos_sysfs_set_filter_dirs(struct damon_sysfs_scheme *scheme) +{ + int err; + + err = damon_sysfs_scheme_set_filters(scheme, + DAMOS_SYSFS_FILTER_HANDLE_LAYER_BOTH, "filters", + &scheme->filters); + if (err) + return err; + err = damon_sysfs_scheme_set_filters(scheme, + DAMOS_SYSFS_FILTER_HANDLE_LAYER_CORE, "core_filters", + &scheme->core_filters); + if (err) + goto put_filters_out; + err = damon_sysfs_scheme_set_filters(scheme, + DAMOS_SYSFS_FILTER_HANDLE_LAYER_OPS, "ops_filters", + &scheme->ops_filters); + if (err) + goto put_core_filters_out; + return 0; + +put_core_filters_out: + kobject_put(&scheme->core_filters->kobj); + scheme->core_filters = NULL; +put_filters_out: + kobject_put(&scheme->filters->kobj); + scheme->filters = NULL; return err; } @@ -1609,7 +1779,7 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) err = damon_sysfs_scheme_set_watermarks(scheme); if (err) goto put_quotas_access_pattern_out; - err = damon_sysfs_scheme_set_filters(scheme); + err = damos_sysfs_set_filter_dirs(scheme); if (err) goto put_watermarks_quotas_access_pattern_out; err = damon_sysfs_scheme_set_stats(scheme); @@ -1624,6 +1794,10 @@ put_tried_regions_out: kobject_put(&scheme->tried_regions->kobj); scheme->tried_regions = NULL; put_filters_watermarks_quotas_access_pattern_out: + kobject_put(&scheme->ops_filters->kobj); + scheme->ops_filters = NULL; + kobject_put(&scheme->core_filters->kobj); + scheme->core_filters = NULL; kobject_put(&scheme->filters->kobj); scheme->filters = NULL; put_watermarks_quotas_access_pattern_out: @@ -1647,6 +1821,10 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) kobject_put(&scheme->watermarks->kobj); damon_sysfs_scheme_filters_rm_dirs(scheme->filters); kobject_put(&scheme->filters->kobj); + damon_sysfs_scheme_filters_rm_dirs(scheme->core_filters); + kobject_put(&scheme->core_filters->kobj); + damon_sysfs_scheme_filters_rm_dirs(scheme->ops_filters); + kobject_put(&scheme->ops_filters->kobj); kobject_put(&scheme->stats->kobj); damon_sysfs_scheme_regions_rm_dirs(scheme->tried_regions); kobject_put(&scheme->tried_regions->kobj); @@ -1888,7 +2066,7 @@ static int damon_sysfs_memcg_path_to_id(char *memcg_path, unsigned short *id) if (!memcg_path) return -EINVAL; - path = kmalloc(sizeof(*path) * PATH_MAX, GFP_KERNEL); + path = kmalloc_array(PATH_MAX, sizeof(*path), GFP_KERNEL); if (!path) return -ENOMEM; @@ -1918,7 +2096,8 @@ static int damon_sysfs_add_scheme_filters(struct damos *scheme, sysfs_filters->filters_arr[i]; struct damos_filter *filter = damos_new_filter(sysfs_filter->type, - sysfs_filter->matching); + sysfs_filter->matching, + sysfs_filter->allow); int err; if (!filter) @@ -1940,6 +2119,13 @@ static int damon_sysfs_add_scheme_filters(struct damos *scheme, filter->addr_range = sysfs_filter->addr_range; } else if (filter->type == DAMOS_FILTER_TYPE_TARGET) { filter->target_idx = sysfs_filter->target_idx; + } else if (filter->type == DAMOS_FILTER_TYPE_HUGEPAGE_SIZE) { + if (sysfs_filter->sz_range.min > + sysfs_filter->sz_range.max) { + damos_destroy_filter(filter); + return -EINVAL; + } + filter->sz_range = sysfs_filter->sz_range; } damos_add_filter(scheme, filter); @@ -1965,8 +2151,17 @@ static int damos_sysfs_add_quota_score( sysfs_goal->target_value); if (!goal) return -ENOMEM; - if (sysfs_goal->metric == DAMOS_QUOTA_USER_INPUT) + switch (sysfs_goal->metric) { + case DAMOS_QUOTA_USER_INPUT: goal->current_value = sysfs_goal->current_value; + break; + case DAMOS_QUOTA_NODE_MEM_USED_BP: + case DAMOS_QUOTA_NODE_MEM_FREE_BP: + goal->nid = sysfs_goal->nid; + break; + default: + break; + } damos_add_quota_goal(quota, goal); } return 0; @@ -2035,8 +2230,6 @@ static struct damos *damon_sysfs_mk_scheme( struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; - struct damon_sysfs_scheme_filters *sysfs_filters = - sysfs_scheme->filters; struct damos *scheme; int err; @@ -2076,7 +2269,17 @@ static struct damos *damon_sysfs_mk_scheme( return NULL; } - err = damon_sysfs_add_scheme_filters(scheme, sysfs_filters); + err = damon_sysfs_add_scheme_filters(scheme, sysfs_scheme->core_filters); + if (err) { + damon_destroy_scheme(scheme); + return NULL; + } + err = damon_sysfs_add_scheme_filters(scheme, sysfs_scheme->ops_filters); + if (err) { + damon_destroy_scheme(scheme); + return NULL; + } + err = damon_sysfs_add_scheme_filters(scheme, sysfs_scheme->filters); if (err) { damon_destroy_scheme(scheme); return NULL; @@ -2122,32 +2325,32 @@ void damon_sysfs_schemes_update_stats( sysfs_stats->sz_tried = scheme->stat.sz_tried; sysfs_stats->nr_applied = scheme->stat.nr_applied; sysfs_stats->sz_applied = scheme->stat.sz_applied; + sysfs_stats->sz_ops_filter_passed = + scheme->stat.sz_ops_filter_passed; sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; } } -/* - * damon_sysfs_schemes that need to update its schemes regions dir. Protected - * by damon_sysfs_lock - */ -static struct damon_sysfs_schemes *damon_sysfs_schemes_for_damos_callback; -static int damon_sysfs_schemes_region_idx; -static bool damos_regions_upd_total_bytes_only; - -/* - * DAMON callback that called before damos apply. While this callback is - * registered, damon_sysfs_lock should be held to ensure the regions - * directories exist. +/** + * damos_sysfs_populate_region_dir() - Populate a schemes tried region dir. + * @sysfs_schemes: Schemes directory to populate regions directory. + * @ctx: Corresponding DAMON context. + * @t: DAMON target of @r. + * @r: DAMON region to populate the directory for. + * @s: Corresponding scheme. + * @total_bytes_only: Whether the request is for bytes update only. + * @sz_filter_passed: Bytes of @r that passed filters of @s. + * + * Called from DAMOS walk callback while holding damon_sysfs_lock. */ -static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, - struct damon_target *t, struct damon_region *r, - struct damos *s) +void damos_sysfs_populate_region_dir(struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx, struct damon_target *t, + struct damon_region *r, struct damos *s, bool total_bytes_only, + unsigned long sz_filter_passed) { struct damos *scheme; struct damon_sysfs_scheme_regions *sysfs_regions; struct damon_sysfs_scheme_region *region; - struct damon_sysfs_schemes *sysfs_schemes = - damon_sysfs_schemes_for_damos_callback; int schemes_idx = 0; damon_for_each_scheme(scheme, ctx) { @@ -2158,152 +2361,39 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, /* user could have removed the scheme sysfs dir */ if (schemes_idx >= sysfs_schemes->nr) - return 0; + return; sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions; - if (sysfs_regions->upd_status == DAMOS_TRIED_REGIONS_UPD_FINISHED) - return 0; - if (sysfs_regions->upd_status == DAMOS_TRIED_REGIONS_UPD_IDLE) - sysfs_regions->upd_status = DAMOS_TRIED_REGIONS_UPD_STARTED; sysfs_regions->total_bytes += r->ar.end - r->ar.start; - if (damos_regions_upd_total_bytes_only) - return 0; + if (total_bytes_only) + return; region = damon_sysfs_scheme_region_alloc(r); if (!region) - return 0; + return; + region->sz_filter_passed = sz_filter_passed; list_add_tail(®ion->list, &sysfs_regions->regions_list); sysfs_regions->nr_regions++; if (kobject_init_and_add(®ion->kobj, &damon_sysfs_scheme_region_ktype, &sysfs_regions->kobj, "%d", - damon_sysfs_schemes_region_idx++)) { + sysfs_regions->nr_regions++)) { kobject_put(®ion->kobj); } - return 0; } -/* - * DAMON callback that called after each accesses sampling. While this - * callback is registered, damon_sysfs_lock should be held to ensure the - * regions directories exist. - */ -void damos_sysfs_mark_finished_regions_updates(struct damon_ctx *ctx) +int damon_sysfs_schemes_clear_regions( + struct damon_sysfs_schemes *sysfs_schemes) { - struct damon_sysfs_schemes *sysfs_schemes = - damon_sysfs_schemes_for_damos_callback; - struct damon_sysfs_scheme_regions *sysfs_regions; int i; for (i = 0; i < sysfs_schemes->nr; i++) { - sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions; - if (sysfs_regions->upd_status == - DAMOS_TRIED_REGIONS_UPD_STARTED || - time_after(jiffies, - sysfs_regions->upd_timeout_jiffies)) - sysfs_regions->upd_status = - DAMOS_TRIED_REGIONS_UPD_FINISHED; - } -} - -/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ -int damon_sysfs_schemes_clear_regions( - struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx) -{ - struct damos *scheme; - int schemes_idx = 0; - - damon_for_each_scheme(scheme, ctx) { struct damon_sysfs_scheme *sysfs_scheme; - /* user could have removed the scheme sysfs dir */ - if (schemes_idx >= sysfs_schemes->nr) - break; - - sysfs_scheme = sysfs_schemes->schemes_arr[schemes_idx++]; + sysfs_scheme = sysfs_schemes->schemes_arr[i]; damon_sysfs_scheme_regions_rm_dirs( sysfs_scheme->tried_regions); sysfs_scheme->tried_regions->total_bytes = 0; } return 0; } - -static struct damos *damos_sysfs_nth_scheme(int n, struct damon_ctx *ctx) -{ - struct damos *scheme; - int i = 0; - - damon_for_each_scheme(scheme, ctx) { - if (i == n) - return scheme; - i++; - } - return NULL; -} - -static void damos_tried_regions_init_upd_status( - struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx) -{ - int i; - struct damos *scheme; - struct damon_sysfs_scheme_regions *sysfs_regions; - - for (i = 0; i < sysfs_schemes->nr; i++) { - sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions; - scheme = damos_sysfs_nth_scheme(i, ctx); - if (!scheme) { - sysfs_regions->upd_status = - DAMOS_TRIED_REGIONS_UPD_FINISHED; - continue; - } - sysfs_regions->upd_status = DAMOS_TRIED_REGIONS_UPD_IDLE; - sysfs_regions->upd_timeout_jiffies = jiffies + - 2 * usecs_to_jiffies(scheme->apply_interval_us ? - scheme->apply_interval_us : - ctx->attrs.aggr_interval); - } -} - -/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ -int damon_sysfs_schemes_update_regions_start( - struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx, bool total_bytes_only) -{ - damon_sysfs_schemes_clear_regions(sysfs_schemes, ctx); - damon_sysfs_schemes_for_damos_callback = sysfs_schemes; - damos_tried_regions_init_upd_status(sysfs_schemes, ctx); - damos_regions_upd_total_bytes_only = total_bytes_only; - ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply; - return 0; -} - -bool damos_sysfs_regions_upd_done(void) -{ - struct damon_sysfs_schemes *sysfs_schemes = - damon_sysfs_schemes_for_damos_callback; - struct damon_sysfs_scheme_regions *sysfs_regions; - int i; - - for (i = 0; i < sysfs_schemes->nr; i++) { - sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions; - if (sysfs_regions->upd_status != - DAMOS_TRIED_REGIONS_UPD_FINISHED) - return false; - } - return true; -} - -/* - * Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock. Caller - * should unlock damon_sysfs_lock which held before - * damon_sysfs_schemes_update_regions_start() - */ -int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx) -{ - damon_sysfs_schemes_for_damos_callback = NULL; - ctx->callback.before_damos_apply = NULL; - damon_sysfs_schemes_region_idx = 0; - return 0; -} diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 58145d59881d..1af6aff35d84 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -409,6 +409,164 @@ static const struct kobj_type damon_sysfs_targets_ktype = { }; /* + * intervals goal directory + */ + +struct damon_sysfs_intervals_goal { + struct kobject kobj; + unsigned long access_bp; + unsigned long aggrs; + unsigned long min_sample_us; + unsigned long max_sample_us; +}; + +static struct damon_sysfs_intervals_goal *damon_sysfs_intervals_goal_alloc( + unsigned long access_bp, unsigned long aggrs, + unsigned long min_sample_us, unsigned long max_sample_us) +{ + struct damon_sysfs_intervals_goal *goal = kmalloc(sizeof(*goal), + GFP_KERNEL); + + if (!goal) + return NULL; + + goal->kobj = (struct kobject){}; + goal->access_bp = access_bp; + goal->aggrs = aggrs; + goal->min_sample_us = min_sample_us; + goal->max_sample_us = max_sample_us; + return goal; +} + +static ssize_t access_bp_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_intervals_goal *goal = container_of(kobj, + struct damon_sysfs_intervals_goal, kobj); + + return sysfs_emit(buf, "%lu\n", goal->access_bp); +} + +static ssize_t access_bp_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_intervals_goal *goal = container_of(kobj, + struct damon_sysfs_intervals_goal, kobj); + unsigned long nr; + int err = kstrtoul(buf, 0, &nr); + + if (err) + return err; + + goal->access_bp = nr; + return count; +} + +static ssize_t aggrs_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_intervals_goal *goal = container_of(kobj, + struct damon_sysfs_intervals_goal, kobj); + + return sysfs_emit(buf, "%lu\n", goal->aggrs); +} + +static ssize_t aggrs_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_intervals_goal *goal = container_of(kobj, + struct damon_sysfs_intervals_goal, kobj); + unsigned long nr; + int err = kstrtoul(buf, 0, &nr); + + if (err) + return err; + + goal->aggrs = nr; + return count; +} + +static ssize_t min_sample_us_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_intervals_goal *goal = container_of(kobj, + struct damon_sysfs_intervals_goal, kobj); + + return sysfs_emit(buf, "%lu\n", goal->min_sample_us); +} + +static ssize_t min_sample_us_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_intervals_goal *goal = container_of(kobj, + struct damon_sysfs_intervals_goal, kobj); + unsigned long nr; + int err = kstrtoul(buf, 0, &nr); + + if (err) + return err; + + goal->min_sample_us = nr; + return count; +} + +static ssize_t max_sample_us_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_intervals_goal *goal = container_of(kobj, + struct damon_sysfs_intervals_goal, kobj); + + return sysfs_emit(buf, "%lu\n", goal->max_sample_us); +} + +static ssize_t max_sample_us_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_intervals_goal *goal = container_of(kobj, + struct damon_sysfs_intervals_goal, kobj); + unsigned long nr; + int err = kstrtoul(buf, 0, &nr); + + if (err) + return err; + + goal->max_sample_us = nr; + return count; +} + +static void damon_sysfs_intervals_goal_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_intervals_goal, kobj)); +} + +static struct kobj_attribute damon_sysfs_intervals_goal_access_bp_attr = + __ATTR_RW_MODE(access_bp, 0600); + +static struct kobj_attribute damon_sysfs_intervals_goal_aggrs_attr = + __ATTR_RW_MODE(aggrs, 0600); + +static struct kobj_attribute damon_sysfs_intervals_goal_min_sample_us_attr = + __ATTR_RW_MODE(min_sample_us, 0600); + +static struct kobj_attribute damon_sysfs_intervals_goal_max_sample_us_attr = + __ATTR_RW_MODE(max_sample_us, 0600); + +static struct attribute *damon_sysfs_intervals_goal_attrs[] = { + &damon_sysfs_intervals_goal_access_bp_attr.attr, + &damon_sysfs_intervals_goal_aggrs_attr.attr, + &damon_sysfs_intervals_goal_min_sample_us_attr.attr, + &damon_sysfs_intervals_goal_max_sample_us_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_intervals_goal); + +static const struct kobj_type damon_sysfs_intervals_goal_ktype = { + .release = damon_sysfs_intervals_goal_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_intervals_goal_groups, +}; + +/* * intervals directory */ @@ -417,6 +575,7 @@ struct damon_sysfs_intervals { unsigned long sample_us; unsigned long aggr_us; unsigned long update_us; + struct damon_sysfs_intervals_goal *intervals_goal; }; static struct damon_sysfs_intervals *damon_sysfs_intervals_alloc( @@ -436,6 +595,32 @@ static struct damon_sysfs_intervals *damon_sysfs_intervals_alloc( return intervals; } +static int damon_sysfs_intervals_add_dirs(struct damon_sysfs_intervals *intervals) +{ + struct damon_sysfs_intervals_goal *goal; + int err; + + goal = damon_sysfs_intervals_goal_alloc(0, 0, 0, 0); + if (!goal) + return -ENOMEM; + + err = kobject_init_and_add(&goal->kobj, + &damon_sysfs_intervals_goal_ktype, &intervals->kobj, + "intervals_goal"); + if (err) { + kobject_put(&goal->kobj); + intervals->intervals_goal = NULL; + return err; + } + intervals->intervals_goal = goal; + return 0; +} + +static void damon_sysfs_intervals_rm_dirs(struct damon_sysfs_intervals *intervals) +{ + kobject_put(&intervals->intervals_goal->kobj); +} + static ssize_t sample_us_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -571,6 +756,9 @@ static int damon_sysfs_attrs_add_dirs(struct damon_sysfs_attrs *attrs) "intervals"); if (err) goto put_intervals_out; + err = damon_sysfs_intervals_add_dirs(intervals); + if (err) + goto put_intervals_out; attrs->intervals = intervals; nr_regions_range = damon_sysfs_ul_range_alloc(10, 1000); @@ -599,6 +787,7 @@ put_intervals_out: static void damon_sysfs_attrs_rm_dirs(struct damon_sysfs_attrs *attrs) { kobject_put(&attrs->nr_regions_range->kobj); + damon_sysfs_intervals_rm_dirs(attrs->intervals); kobject_put(&attrs->intervals->kobj); } @@ -1025,6 +1214,11 @@ enum damon_sysfs_cmd { */ DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS, /* + * @DAMON_SYSFS_CMD_UPDATE_TUNED_INTERVALS: Update the tuned monitoring + * intevals. + */ + DAMON_SYSFS_CMD_UPDATE_TUNED_INTERVALS, + /* * @NR_DAMON_SYSFS_CMDS: Total number of DAMON sysfs commands. */ NR_DAMON_SYSFS_CMDS, @@ -1041,27 +1235,9 @@ static const char * const damon_sysfs_cmd_strs[] = { "update_schemes_tried_regions", "clear_schemes_tried_regions", "update_schemes_effective_quotas", + "update_tuned_intervals", }; -/* - * struct damon_sysfs_cmd_request - A request to the DAMON callback. - * @cmd: The command that needs to be handled by the callback. - * @kdamond: The kobject wrapper that associated to the kdamond thread. - * - * This structure represents a sysfs command request that need to access some - * DAMON context-internal data. Because DAMON context-internal data can be - * safely accessed from DAMON callbacks without additional synchronization, the - * request will be handled by the DAMON callback. None-``NULL`` @kdamond means - * the request is valid. - */ -struct damon_sysfs_cmd_request { - enum damon_sysfs_cmd cmd; - struct damon_sysfs_kdamond *kdamond; -}; - -/* Current DAMON callback request. Protected by damon_sysfs_lock. */ -static struct damon_sysfs_cmd_request damon_sysfs_cmd_request; - static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -1084,11 +1260,18 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx, struct damon_sysfs_attrs *sys_attrs) { struct damon_sysfs_intervals *sys_intervals = sys_attrs->intervals; + struct damon_sysfs_intervals_goal *sys_goal = + sys_intervals->intervals_goal; struct damon_sysfs_ul_range *sys_nr_regions = sys_attrs->nr_regions_range; struct damon_attrs attrs = { .sample_interval = sys_intervals->sample_us, .aggr_interval = sys_intervals->aggr_us, + .intervals_goal = { + .access_bp = sys_goal->access_bp, + .aggrs = sys_goal->aggrs, + .min_sample_us = sys_goal->min_sample_us, + .max_sample_us = sys_goal->max_sample_us}, .ops_update_interval = sys_intervals->update_us, .min_nr_regions = sys_nr_regions->min, .max_nr_regions = sys_nr_regions->max, @@ -1181,25 +1364,9 @@ static int damon_sysfs_add_targets(struct damon_ctx *ctx, return 0; } -static bool damon_sysfs_schemes_regions_updating; - static void damon_sysfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; - struct damon_sysfs_kdamond *kdamond; - enum damon_sysfs_cmd cmd; - - /* damon_sysfs_schemes_update_regions_stop() might not yet called */ - kdamond = damon_sysfs_cmd_request.kdamond; - cmd = damon_sysfs_cmd_request.cmd; - if (kdamond && ctx == kdamond->damon_ctx && - (cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS || - cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES) && - damon_sysfs_schemes_regions_updating) { - damon_sysfs_schemes_update_regions_stop(ctx); - damon_sysfs_schemes_regions_updating = false; - mutex_unlock(&damon_sysfs_lock); - } if (!damon_target_has_pid(ctx)) return; @@ -1214,57 +1381,24 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx) /* * damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files. - * @kdamond: The kobject wrapper that associated to the kdamond thread. + * @data: The kobject wrapper that associated to the kdamond thread. * * This function reads the schemes stats of specific kdamond and update the * related values for sysfs files. This function should be called from DAMON - * callbacks while holding ``damon_syfs_lock``, to safely access the DAMON - * contexts-internal data and DAMON sysfs variables. + * worker thread,to safely access the DAMON contexts-internal data. Caller + * should also ensure holding ``damon_syfs_lock``, and ->damon_ctx of @data is + * not NULL but a valid pointer, to safely access DAMON sysfs variables. */ -static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) +static int damon_sysfs_upd_schemes_stats(void *data) { + struct damon_sysfs_kdamond *kdamond = data; struct damon_ctx *ctx = kdamond->damon_ctx; - if (!ctx) - return -EINVAL; damon_sysfs_schemes_update_stats( kdamond->contexts->contexts_arr[0]->schemes, ctx); return 0; } -static int damon_sysfs_upd_schemes_regions_start( - struct damon_sysfs_kdamond *kdamond, bool total_bytes_only) -{ - struct damon_ctx *ctx = kdamond->damon_ctx; - - if (!ctx) - return -EINVAL; - return damon_sysfs_schemes_update_regions_start( - kdamond->contexts->contexts_arr[0]->schemes, ctx, - total_bytes_only); -} - -static int damon_sysfs_upd_schemes_regions_stop( - struct damon_sysfs_kdamond *kdamond) -{ - struct damon_ctx *ctx = kdamond->damon_ctx; - - if (!ctx) - return -EINVAL; - return damon_sysfs_schemes_update_regions_stop(ctx); -} - -static int damon_sysfs_clear_schemes_regions( - struct damon_sysfs_kdamond *kdamond) -{ - struct damon_ctx *ctx = kdamond->damon_ctx; - - if (!ctx) - return -EINVAL; - return damon_sysfs_schemes_clear_regions( - kdamond->contexts->contexts_arr[0]->schemes, ctx); -} - static inline bool damon_sysfs_kdamond_running( struct damon_sysfs_kdamond *kdamond) { @@ -1296,11 +1430,12 @@ static struct damon_ctx *damon_sysfs_build_ctx( * damon_sysfs_commit_input() - Commit user inputs to a running kdamond. * @kdamond: The kobject wrapper for the associated kdamond. * - * If the sysfs input is wrong, the kdamond will be terminated. + * Returns error if the sysfs input is wrong. */ -static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) +static int damon_sysfs_commit_input(void *data) { - struct damon_ctx *param_ctx; + struct damon_sysfs_kdamond *kdamond = data; + struct damon_ctx *param_ctx, *test_ctx; int err; if (!damon_sysfs_kdamond_running(kdamond)) @@ -1312,15 +1447,23 @@ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) param_ctx = damon_sysfs_build_ctx(kdamond->contexts->contexts_arr[0]); if (IS_ERR(param_ctx)) return PTR_ERR(param_ctx); + test_ctx = damon_new_ctx(); + err = damon_commit_ctx(test_ctx, param_ctx); + if (err) { + damon_sysfs_destroy_targets(test_ctx); + damon_destroy_ctx(test_ctx); + goto out; + } err = damon_commit_ctx(kdamond->damon_ctx, param_ctx); +out: damon_sysfs_destroy_targets(param_ctx); damon_destroy_ctx(param_ctx); return err; } -static int damon_sysfs_commit_schemes_quota_goals( - struct damon_sysfs_kdamond *sysfs_kdamond) +static int damon_sysfs_commit_schemes_quota_goals(void *data) { + struct damon_sysfs_kdamond *sysfs_kdamond = data; struct damon_ctx *ctx; struct damon_sysfs_context *sysfs_ctx; @@ -1338,128 +1481,33 @@ static int damon_sysfs_commit_schemes_quota_goals( /* * damon_sysfs_upd_schemes_effective_quotas() - Update schemes effective quotas * sysfs files. - * @kdamond: The kobject wrapper that associated to the kdamond thread. + * @data: The kobject wrapper that associated to the kdamond thread. * * This function reads the schemes' effective quotas of specific kdamond and * update the related values for sysfs files. This function should be called * from DAMON callbacks while holding ``damon_syfs_lock``, to safely access the * DAMON contexts-internal data and DAMON sysfs variables. */ -static int damon_sysfs_upd_schemes_effective_quotas( - struct damon_sysfs_kdamond *kdamond) +static int damon_sysfs_upd_schemes_effective_quotas(void *data) { + struct damon_sysfs_kdamond *kdamond = data; struct damon_ctx *ctx = kdamond->damon_ctx; - if (!ctx) - return -EINVAL; damos_sysfs_update_effective_quotas( kdamond->contexts->contexts_arr[0]->schemes, ctx); return 0; } - -/* - * damon_sysfs_cmd_request_callback() - DAMON callback for handling requests. - * @c: The DAMON context of the callback. - * @active: Whether @c is not deactivated due to watermarks. - * @after_aggr: Whether this is called from after_aggregation() callback. - * - * This function is periodically called back from the kdamond thread for @c. - * Then, it checks if there is a waiting DAMON sysfs request and handles it. - */ -static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active, - bool after_aggregation) -{ - struct damon_sysfs_kdamond *kdamond; - bool total_bytes_only = false; - int err = 0; - - /* avoid deadlock due to concurrent state_store('off') */ - if (!damon_sysfs_schemes_regions_updating && - !mutex_trylock(&damon_sysfs_lock)) - return 0; - kdamond = damon_sysfs_cmd_request.kdamond; - if (!kdamond || kdamond->damon_ctx != c) - goto out; - switch (damon_sysfs_cmd_request.cmd) { - case DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS: - err = damon_sysfs_upd_schemes_stats(kdamond); - break; - case DAMON_SYSFS_CMD_COMMIT: - if (!after_aggregation) - goto out; - err = damon_sysfs_commit_input(kdamond); - break; - case DAMON_SYSFS_CMD_COMMIT_SCHEMES_QUOTA_GOALS: - err = damon_sysfs_commit_schemes_quota_goals(kdamond); - break; - case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES: - total_bytes_only = true; - fallthrough; - case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: - if (!damon_sysfs_schemes_regions_updating) { - err = damon_sysfs_upd_schemes_regions_start(kdamond, - total_bytes_only); - if (!err) { - damon_sysfs_schemes_regions_updating = true; - goto keep_lock_out; - } - } else { - damos_sysfs_mark_finished_regions_updates(c); - /* - * Continue regions updating if DAMON is till - * active and the update for all schemes is not - * finished. - */ - if (active && !damos_sysfs_regions_upd_done()) - goto keep_lock_out; - err = damon_sysfs_upd_schemes_regions_stop(kdamond); - damon_sysfs_schemes_regions_updating = false; - } - break; - case DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS: - err = damon_sysfs_clear_schemes_regions(kdamond); - break; - case DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS: - err = damon_sysfs_upd_schemes_effective_quotas(kdamond); - break; - default: - break; - } - /* Mark the request as invalid now. */ - damon_sysfs_cmd_request.kdamond = NULL; -out: - if (!damon_sysfs_schemes_regions_updating) - mutex_unlock(&damon_sysfs_lock); -keep_lock_out: - return err; -} - -static int damon_sysfs_after_wmarks_check(struct damon_ctx *c) +static int damon_sysfs_upd_tuned_intervals(void *data) { - /* - * after_wmarks_check() is called back while the context is deactivated - * by watermarks. - */ - return damon_sysfs_cmd_request_callback(c, false, false); -} - -static int damon_sysfs_after_sampling(struct damon_ctx *c) -{ - /* - * after_sampling() is called back only while the context is not - * deactivated by watermarks. - */ - return damon_sysfs_cmd_request_callback(c, true, false); -} + struct damon_sysfs_kdamond *kdamond = data; + struct damon_ctx *ctx = kdamond->damon_ctx; -static int damon_sysfs_after_aggregation(struct damon_ctx *c) -{ - /* - * after_aggregation() is called back only while the context is not - * deactivated by watermarks. - */ - return damon_sysfs_cmd_request_callback(c, true, true); + kdamond->contexts->contexts_arr[0]->attrs->intervals->sample_us = + ctx->attrs.sample_interval; + kdamond->contexts->contexts_arr[0]->attrs->intervals->aggr_us = + ctx->attrs.aggr_interval; + return 0; } static struct damon_ctx *damon_sysfs_build_ctx( @@ -1477,9 +1525,6 @@ static struct damon_ctx *damon_sysfs_build_ctx( return ERR_PTR(err); } - ctx->callback.after_wmarks_check = damon_sysfs_after_wmarks_check; - ctx->callback.after_sampling = damon_sysfs_after_sampling; - ctx->callback.after_aggregation = damon_sysfs_after_aggregation; ctx->callback.before_terminate = damon_sysfs_before_terminate; return ctx; } @@ -1491,8 +1536,6 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) if (damon_sysfs_kdamond_running(kdamond)) return -EBUSY; - if (damon_sysfs_cmd_request.kdamond == kdamond) - return -EBUSY; /* TODO: support multiple contexts per kdamond */ if (kdamond->contexts->nr != 1) return -EINVAL; @@ -1525,63 +1568,102 @@ static int damon_sysfs_turn_damon_off(struct damon_sysfs_kdamond *kdamond) */ } +static int damon_sysfs_damon_call(int (*fn)(void *data), + struct damon_sysfs_kdamond *kdamond) +{ + struct damon_call_control call_control = {}; + + if (!kdamond->damon_ctx) + return -EINVAL; + call_control.fn = fn; + call_control.data = kdamond; + return damon_call(kdamond->damon_ctx, &call_control); +} + +struct damon_sysfs_schemes_walk_data { + struct damon_sysfs_kdamond *sysfs_kdamond; + bool total_bytes_only; +}; + +/* populate the region directory */ +static void damon_sysfs_schemes_tried_regions_upd_one(void *data, struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *s, unsigned long sz_filter_passed) +{ + struct damon_sysfs_schemes_walk_data *walk_data = data; + struct damon_sysfs_kdamond *sysfs_kdamond = walk_data->sysfs_kdamond; + + damos_sysfs_populate_region_dir( + sysfs_kdamond->contexts->contexts_arr[0]->schemes, + ctx, t, r, s, walk_data->total_bytes_only, + sz_filter_passed); +} + +static int damon_sysfs_update_schemes_tried_regions( + struct damon_sysfs_kdamond *sysfs_kdamond, bool total_bytes_only) +{ + struct damon_sysfs_schemes_walk_data walk_data = { + .sysfs_kdamond = sysfs_kdamond, + .total_bytes_only = total_bytes_only, + }; + struct damos_walk_control control = { + .walk_fn = damon_sysfs_schemes_tried_regions_upd_one, + .data = &walk_data, + }; + struct damon_ctx *ctx = sysfs_kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + + damon_sysfs_schemes_clear_regions( + sysfs_kdamond->contexts->contexts_arr[0]->schemes); + return damos_walk(ctx, &control); +} + /* * damon_sysfs_handle_cmd() - Handle a command for a specific kdamond. * @cmd: The command to handle. * @kdamond: The kobject wrapper for the associated kdamond. * - * This function handles a DAMON sysfs command for a kdamond. For commands - * that need to access running DAMON context-internal data, it requests - * handling of the command to the DAMON callback - * (@damon_sysfs_cmd_request_callback()) and wait until it is properly handled, - * or the context is completed. + * This function handles a DAMON sysfs command for a kdamond. * * Return: 0 on success, negative error code otherwise. */ static int damon_sysfs_handle_cmd(enum damon_sysfs_cmd cmd, struct damon_sysfs_kdamond *kdamond) { - bool need_wait = true; - - /* Handle commands that doesn't access DAMON context-internal data */ switch (cmd) { case DAMON_SYSFS_CMD_ON: return damon_sysfs_turn_damon_on(kdamond); case DAMON_SYSFS_CMD_OFF: return damon_sysfs_turn_damon_off(kdamond); + case DAMON_SYSFS_CMD_COMMIT: + return damon_sysfs_damon_call( + damon_sysfs_commit_input, kdamond); + case DAMON_SYSFS_CMD_COMMIT_SCHEMES_QUOTA_GOALS: + return damon_sysfs_damon_call( + damon_sysfs_commit_schemes_quota_goals, + kdamond); + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS: + return damon_sysfs_damon_call( + damon_sysfs_upd_schemes_stats, kdamond); + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES: + return damon_sysfs_update_schemes_tried_regions(kdamond, true); + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: + return damon_sysfs_update_schemes_tried_regions(kdamond, false); + case DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS: + return damon_sysfs_schemes_clear_regions( + kdamond->contexts->contexts_arr[0]->schemes); + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS: + return damon_sysfs_damon_call( + damon_sysfs_upd_schemes_effective_quotas, + kdamond); + case DAMON_SYSFS_CMD_UPDATE_TUNED_INTERVALS: + return damon_sysfs_damon_call( + damon_sysfs_upd_tuned_intervals, kdamond); default: - break; - } - - /* Pass the command to DAMON callback for safe DAMON context access */ - if (damon_sysfs_cmd_request.kdamond) - return -EBUSY; - if (!damon_sysfs_kdamond_running(kdamond)) return -EINVAL; - damon_sysfs_cmd_request.cmd = cmd; - damon_sysfs_cmd_request.kdamond = kdamond; - - /* - * wait until damon_sysfs_cmd_request_callback() handles the request - * from kdamond context - */ - mutex_unlock(&damon_sysfs_lock); - while (need_wait) { - schedule_timeout_idle(msecs_to_jiffies(100)); - if (!mutex_trylock(&damon_sysfs_lock)) - continue; - if (!damon_sysfs_cmd_request.kdamond) { - /* damon_sysfs_cmd_request_callback() handled */ - need_wait = false; - } else if (!damon_sysfs_kdamond_running(kdamond)) { - /* kdamond has already finished */ - need_wait = false; - damon_sysfs_cmd_request.kdamond = NULL; - } - mutex_unlock(&damon_sysfs_lock); } - mutex_lock(&damon_sysfs_lock); - return 0; } static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -1693,8 +1775,7 @@ static bool damon_sysfs_kdamonds_busy(struct damon_sysfs_kdamond **kdamonds, int i; for (i = 0; i < nr_kdamonds; i++) { - if (damon_sysfs_kdamond_running(kdamonds[i]) || - damon_sysfs_cmd_request.kdamond == kdamonds[i]) + if (damon_sysfs_kdamond_running(kdamonds[i])) return true; } diff --git a/mm/damon/tests/.kunitconfig b/mm/damon/tests/.kunitconfig index a73be044fc9b..36a450f57b58 100644 --- a/mm/damon/tests/.kunitconfig +++ b/mm/damon/tests/.kunitconfig @@ -13,10 +13,3 @@ CONFIG_DAMON_VADDR_KUNIT_TEST=y CONFIG_SYSFS=y CONFIG_DAMON_SYSFS=y CONFIG_DAMON_SYSFS_KUNIT_TEST=y - -# for DAMON debugfs interface -CONFIG_DEBUG_FS=y -CONFIG_DAMON_PADDR=y -CONFIG_DAMON_DBGFS_DEPRECATED=y -CONFIG_DAMON_DBGFS=y -CONFIG_DAMON_DBGFS_KUNIT_TEST=y diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index cf22e09a3507..298c67557fae 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -348,19 +348,19 @@ static void damon_test_update_monitoring_result(struct kunit *test) new_attrs = (struct damon_attrs){ .sample_interval = 100, .aggr_interval = 10000,}; - damon_update_monitoring_result(r, &old_attrs, &new_attrs); + damon_update_monitoring_result(r, &old_attrs, &new_attrs, false); KUNIT_EXPECT_EQ(test, r->nr_accesses, 15); KUNIT_EXPECT_EQ(test, r->age, 2); new_attrs = (struct damon_attrs){ .sample_interval = 1, .aggr_interval = 1000}; - damon_update_monitoring_result(r, &old_attrs, &new_attrs); + damon_update_monitoring_result(r, &old_attrs, &new_attrs, false); KUNIT_EXPECT_EQ(test, r->nr_accesses, 150); KUNIT_EXPECT_EQ(test, r->age, 2); new_attrs = (struct damon_attrs){ .sample_interval = 1, .aggr_interval = 100}; - damon_update_monitoring_result(r, &old_attrs, &new_attrs); + damon_update_monitoring_result(r, &old_attrs, &new_attrs, false); KUNIT_EXPECT_EQ(test, r->nr_accesses, 150); KUNIT_EXPECT_EQ(test, r->age, 20); @@ -411,7 +411,7 @@ static void damos_test_new_filter(struct kunit *test) { struct damos_filter *filter; - filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true); + filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, false); KUNIT_EXPECT_EQ(test, filter->type, DAMOS_FILTER_TYPE_ANON); KUNIT_EXPECT_EQ(test, filter->matching, true); KUNIT_EXPECT_PTR_EQ(test, filter->list.prev, &filter->list); @@ -425,7 +425,7 @@ static void damos_test_filter_out(struct kunit *test) struct damon_region *r, *r2; struct damos_filter *f; - f = damos_new_filter(DAMOS_FILTER_TYPE_ADDR, true); + f = damos_new_filter(DAMOS_FILTER_TYPE_ADDR, true, false); f->addr_range = (struct damon_addr_range){ .start = DAMON_MIN_REGION * 2, .end = DAMON_MIN_REGION * 6}; @@ -434,25 +434,25 @@ static void damos_test_filter_out(struct kunit *test) damon_add_region(r, t); /* region in the range */ - KUNIT_EXPECT_TRUE(test, __damos_filter_out(NULL, t, r, f)); + KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region before the range */ r->ar.start = DAMON_MIN_REGION * 1; r->ar.end = DAMON_MIN_REGION * 2; - KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f)); + KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region after the range */ r->ar.start = DAMON_MIN_REGION * 6; r->ar.end = DAMON_MIN_REGION * 8; - KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f)); + KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region started before the range */ r->ar.start = DAMON_MIN_REGION * 1; r->ar.end = DAMON_MIN_REGION * 4; - KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f)); + KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f)); /* filter should have split the region */ KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 1); KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 2); @@ -465,7 +465,7 @@ static void damos_test_filter_out(struct kunit *test) /* region started in the range */ r->ar.start = DAMON_MIN_REGION * 2; r->ar.end = DAMON_MIN_REGION * 8; - KUNIT_EXPECT_TRUE(test, __damos_filter_out(NULL, t, r, f)); + KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f)); /* filter should have split the region */ KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 2); KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 6); @@ -510,6 +510,75 @@ static void damon_test_feed_loop_next_input(struct kunit *test) damon_feed_loop_next_input(last_input, 2000)); } +static void damon_test_set_filters_default_reject(struct kunit *test) +{ + struct damos scheme; + struct damos_filter *target_filter, *anon_filter; + + INIT_LIST_HEAD(&scheme.filters); + INIT_LIST_HEAD(&scheme.ops_filters); + + damos_set_filters_default_reject(&scheme); + /* + * No filter is installed. Allow by default on both core and ops layer + * filtering stages, since there are no filters at all. + */ + KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false); + KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false); + + target_filter = damos_new_filter(DAMOS_FILTER_TYPE_TARGET, true, true); + damos_add_filter(&scheme, target_filter); + damos_set_filters_default_reject(&scheme); + /* + * A core-handled allow-filter is installed. + * Rejct by default on core layer filtering stage due to the last + * core-layer-filter's behavior. + * Allow by default on ops layer filtering stage due to the absence of + * ops layer filters. + */ + KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, true); + KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false); + + target_filter->allow = false; + damos_set_filters_default_reject(&scheme); + /* + * A core-handled reject-filter is installed. + * Allow by default on core layer filtering stage due to the last + * core-layer-filter's behavior. + * Allow by default on ops layer filtering stage due to the absence of + * ops layer filters. + */ + KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false); + KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false); + + anon_filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, true); + damos_add_filter(&scheme, anon_filter); + + damos_set_filters_default_reject(&scheme); + /* + * A core-handled reject-filter and ops-handled allow-filter are installed. + * Allow by default on core layer filtering stage due to the existence + * of the ops-handled filter. + * Reject by default on ops layer filtering stage due to the last + * ops-layer-filter's behavior. + */ + KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false); + KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, true); + + target_filter->allow = true; + damos_set_filters_default_reject(&scheme); + /* + * A core-handled allow-filter and ops-handled allow-filter are + * installed. + * Allow by default on core layer filtering stage due to the existence + * of the ops-handled filter. + * Reject by default on ops layer filtering stage due to the last + * ops-layer-filter's behavior. + */ + KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false); + KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, true); +} + static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_target), KUNIT_CASE(damon_test_regions), @@ -527,6 +596,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damos_test_new_filter), KUNIT_CASE(damos_test_filter_out), KUNIT_CASE(damon_test_feed_loop_next_input), + KUNIT_CASE(damon_test_set_filters_default_reject), {}, }; diff --git a/mm/damon/tests/dbgfs-kunit.h b/mm/damon/tests/dbgfs-kunit.h deleted file mode 100644 index 087e53f641a8..000000000000 --- a/mm/damon/tests/dbgfs-kunit.h +++ /dev/null @@ -1,173 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * DAMON Debugfs Interface Unit Tests - * - * Author: SeongJae Park <sj@kernel.org> - */ - -#ifdef CONFIG_DAMON_DBGFS_KUNIT_TEST - -#ifndef _DAMON_DBGFS_TEST_H -#define _DAMON_DBGFS_TEST_H - -#include <kunit/test.h> - -static void damon_dbgfs_test_str_to_ints(struct kunit *test) -{ - char *question; - int *answers; - int expected[] = {12, 35, 46}; - ssize_t nr_integers = 0, i; - - question = "123"; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); - KUNIT_EXPECT_EQ(test, 123, answers[0]); - kfree(answers); - - question = "123abc"; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); - KUNIT_EXPECT_EQ(test, 123, answers[0]); - kfree(answers); - - question = "a123"; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); - kfree(answers); - - question = "12 35"; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); - for (i = 0; i < nr_integers; i++) - KUNIT_EXPECT_EQ(test, expected[i], answers[i]); - kfree(answers); - - question = "12 35 46"; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)3, nr_integers); - for (i = 0; i < nr_integers; i++) - KUNIT_EXPECT_EQ(test, expected[i], answers[i]); - kfree(answers); - - question = "12 35 abc 46"; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); - for (i = 0; i < 2; i++) - KUNIT_EXPECT_EQ(test, expected[i], answers[i]); - kfree(answers); - - question = ""; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); - kfree(answers); - - question = "\n"; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); - kfree(answers); -} - -static void damon_dbgfs_test_set_targets(struct kunit *test) -{ - struct damon_ctx *ctx = dbgfs_new_ctx(); - char buf[64]; - - if (!damon_is_registered_ops(DAMON_OPS_PADDR)) { - dbgfs_destroy_ctx(ctx); - kunit_skip(test, "PADDR not registered"); - } - - /* Make DAMON consider target has no pid */ - damon_select_ops(ctx, DAMON_OPS_PADDR); - - dbgfs_set_targets(ctx, 0, NULL); - sprint_target_ids(ctx, buf, 64); - KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); - - dbgfs_set_targets(ctx, 1, NULL); - sprint_target_ids(ctx, buf, 64); - KUNIT_EXPECT_STREQ(test, (char *)buf, "42\n"); - - dbgfs_set_targets(ctx, 0, NULL); - sprint_target_ids(ctx, buf, 64); - KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); - - dbgfs_destroy_ctx(ctx); -} - -static void damon_dbgfs_test_set_init_regions(struct kunit *test) -{ - struct damon_ctx *ctx = damon_new_ctx(); - /* Each line represents one region in ``<target idx> <start> <end>`` */ - char * const valid_inputs[] = {"1 10 20\n 1 20 30\n1 35 45", - "1 10 20\n", - "1 10 20\n0 39 59\n0 70 134\n 1 20 25\n", - ""}; - /* Reading the file again will show sorted, clean output */ - char * const valid_expects[] = {"1 10 20\n1 20 30\n1 35 45\n", - "1 10 20\n", - "0 39 59\n0 70 134\n1 10 20\n1 20 25\n", - ""}; - char * const invalid_inputs[] = {"3 10 20\n", /* target not exists */ - "1 10 20\n 1 14 26\n", /* regions overlap */ - "0 10 20\n1 30 40\n 0 5 8"}; /* not sorted by address */ - char *input, *expect; - int i, rc; - char buf[256]; - - if (!damon_is_registered_ops(DAMON_OPS_PADDR)) { - damon_destroy_ctx(ctx); - kunit_skip(test, "PADDR not registered"); - } - - damon_select_ops(ctx, DAMON_OPS_PADDR); - - dbgfs_set_targets(ctx, 3, NULL); - - /* Put valid inputs and check the results */ - for (i = 0; i < ARRAY_SIZE(valid_inputs); i++) { - input = valid_inputs[i]; - expect = valid_expects[i]; - - rc = set_init_regions(ctx, input, strnlen(input, 256)); - KUNIT_EXPECT_EQ(test, rc, 0); - - memset(buf, 0, 256); - sprint_init_regions(ctx, buf, 256); - - KUNIT_EXPECT_STREQ(test, (char *)buf, expect); - } - /* Put invalid inputs and check the return error code */ - for (i = 0; i < ARRAY_SIZE(invalid_inputs); i++) { - input = invalid_inputs[i]; - pr_info("input: %s\n", input); - rc = set_init_regions(ctx, input, strnlen(input, 256)); - KUNIT_EXPECT_EQ(test, rc, -EINVAL); - - memset(buf, 0, 256); - sprint_init_regions(ctx, buf, 256); - - KUNIT_EXPECT_STREQ(test, (char *)buf, ""); - } - - dbgfs_set_targets(ctx, 0, NULL); - damon_destroy_ctx(ctx); -} - -static struct kunit_case damon_test_cases[] = { - KUNIT_CASE(damon_dbgfs_test_str_to_ints), - KUNIT_CASE(damon_dbgfs_test_set_targets), - KUNIT_CASE(damon_dbgfs_test_set_init_regions), - {}, -}; - -static struct kunit_suite damon_test_suite = { - .name = "damon-dbgfs", - .test_cases = damon_test_cases, -}; -kunit_test_suite(damon_test_suite); - -#endif /* _DAMON_DBGFS_TEST_H */ - -#endif /* CONFIG_DAMON_KUNIT_TEST */ diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h index b9fe3bc8472b..7cd944266a92 100644 --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -68,7 +68,7 @@ static void damon_test_three_regions_in_vmas(struct kunit *test) static struct mm_struct mm; struct damon_addr_range regions[3] = {0}; /* 10-20-25, 200-210-220, 300-305, 307-330 */ - struct vm_area_struct vmas[] = { + static struct vm_area_struct vmas[] = { (struct vm_area_struct) {.vm_start = 10, .vm_end = 20}, (struct vm_area_struct) {.vm_start = 20, .vm_end = 25}, (struct vm_area_struct) {.vm_start = 200, .vm_end = 210}, diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index b9eaa20b73b9..46554e49a478 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * DAMON Primitives for Virtual Address Spaces + * DAMON Code for Virtual Address Spaces * * Author: SeongJae Park <sj@kernel.org> */ @@ -655,7 +655,7 @@ static unsigned long damos_madvise(struct damon_target *target, static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, - struct damos *scheme) + struct damos *scheme, unsigned long *sz_filter_passed) { int madv_action; @@ -710,7 +710,6 @@ static int __init damon_va_initcall(void) .update = damon_va_update, .prepare_access_checks = damon_va_prepare_access_checks, .check_accesses = damon_va_check_accesses, - .reset_aggregated = NULL, .target_valid = damon_va_target_valid, .cleanup = NULL, .apply_scheme = damon_va_apply_scheme, diff --git a/mm/debug.c b/mm/debug.c index aa57d3ffd4ed..907382257062 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -71,20 +71,27 @@ static void __dump_folio(struct folio *folio, struct page *page, unsigned long pfn, unsigned long idx) { struct address_space *mapping = folio_mapping(folio); - int mapcount = atomic_read(&page->_mapcount); + int mapcount = atomic_read(&page->_mapcount) + 1; char *type = ""; - mapcount = page_mapcount_is_type(mapcount) ? 0 : mapcount + 1; + if (page_mapcount_is_type(mapcount)) + mapcount = 0; + pr_warn("page: refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n", folio_ref_count(folio), mapcount, mapping, folio->index + idx, pfn); if (folio_test_large(folio)) { + int pincount = 0; + + if (folio_has_pincount(folio)) + pincount = atomic_read(&folio->_pincount); + pr_warn("head: order:%u mapcount:%d entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n", folio_order(folio), folio_mapcount(folio), folio_entire_mapcount(folio), folio_nr_pages_mapped(folio), - atomic_read(&folio->_pincount)); + pincount); } #ifdef CONFIG_MEMCG @@ -124,25 +131,31 @@ static void __dump_page(const struct page *page) { struct folio *foliop, folio; struct page precise; + unsigned long head; unsigned long pfn = page_to_pfn(page); unsigned long idx, nr_pages = 1; int loops = 5; again: memcpy(&precise, page, sizeof(*page)); - foliop = page_folio(&precise); - if (foliop == (struct folio *)&precise) { + head = precise.compound_head; + if ((head & 1) == 0) { + foliop = (struct folio *)&precise; idx = 0; if (!folio_test_large(foliop)) goto dump; foliop = (struct folio *)page; } else { + foliop = (struct folio *)(head - 1); idx = folio_page_idx(foliop, page); } if (idx < MAX_FOLIO_NR_PAGES) { memcpy(&folio, foliop, 2 * sizeof(struct page)); nr_pages = folio_nr_pages(&folio); + if (nr_pages > 1) + memcpy(&folio.__page_2, &foliop->__page_2, + sizeof(struct page)); foliop = &folio; } @@ -162,7 +175,7 @@ dump: void dump_page(const struct page *page, const char *reason) { if (PagePoisoned(page)) - pr_warn("page:%p is uninitialized and poisoned", page); + pr_warn("page:%p is uninitialized and poisoned\n", page); else __dump_page(page); if (reason) @@ -178,11 +191,17 @@ void dump_vma(const struct vm_area_struct *vma) pr_emerg("vma %px start %px end %px mm %px\n" "prot %lx anon_vma %px vm_ops %px\n" "pgoff %lx file %px private_data %px\n" +#ifdef CONFIG_PER_VMA_LOCK + "refcnt %x\n" +#endif "flags: %#lx(%pGv)\n", vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_mm, (unsigned long)pgprot_val(vma->vm_page_prot), vma->anon_vma, vma->vm_ops, vma->vm_pgoff, vma->vm_file, vma->vm_private_data, +#ifdef CONFIG_PER_VMA_LOCK + refcount_read(&vma->vm_refcnt), +#endif vma->vm_flags, &vma->vm_flags); } EXPORT_SYMBOL(dump_vma); @@ -246,6 +265,83 @@ void dump_mm(const struct mm_struct *mm) } EXPORT_SYMBOL(dump_mm); +void dump_vmg(const struct vma_merge_struct *vmg, const char *reason) +{ + if (reason) + pr_warn("vmg %px dumped because: %s\n", vmg, reason); + + if (!vmg) { + pr_warn("vmg %px state: (NULL)\n", vmg); + return; + } + + pr_warn("vmg %px state: mm %px pgoff %lx\n" + "vmi %px [%lx,%lx)\n" + "prev %px middle %px next %px target %px\n" + "start %lx end %lx flags %lx\n" + "file %px anon_vma %px policy %px\n" + "uffd_ctx %px\n" + "anon_name %px\n" + "state %x\n" + "just_expand %d\n" + "__adjust_middle_start %d __adjust_next_start %d\n" + "__remove_middle %d __remove_next %d\n", + vmg, vmg->mm, vmg->pgoff, + vmg->vmi, vmg->vmi ? vma_iter_addr(vmg->vmi) : 0, + vmg->vmi ? vma_iter_end(vmg->vmi) : 0, + vmg->prev, vmg->middle, vmg->next, vmg->target, + vmg->start, vmg->end, vmg->flags, + vmg->file, vmg->anon_vma, vmg->policy, +#ifdef CONFIG_USERFAULTFD + vmg->uffd_ctx.ctx, +#else + (void *)0, +#endif + vmg->anon_name, + (int)vmg->state, + vmg->just_expand, + vmg->__adjust_middle_start, vmg->__adjust_next_start, + vmg->__remove_middle, vmg->__remove_next); + + if (vmg->mm) { + pr_warn("vmg %px mm:\n", vmg); + dump_mm(vmg->mm); + } else { + pr_warn("vmg %px mm: (NULL)\n", vmg); + } + + if (vmg->prev) { + pr_warn("vmg %px prev:\n", vmg); + dump_vma(vmg->prev); + } else { + pr_warn("vmg %px prev: (NULL)\n", vmg); + } + + if (vmg->middle) { + pr_warn("vmg %px middle:\n", vmg); + dump_vma(vmg->middle); + } else { + pr_warn("vmg %px middle: (NULL)\n", vmg); + } + + if (vmg->next) { + pr_warn("vmg %px next:\n", vmg); + dump_vma(vmg->next); + } else { + pr_warn("vmg %px next: (NULL)\n", vmg); + } + +#ifdef CONFIG_DEBUG_VM_MAPLE_TREE + if (vmg->vmi) { + pr_warn("vmg %px vmi:\n", vmg); + vma_iter_dump_tree(vmg->vmi); + } else { + pr_warn("vmg %px vmi: (NULL)\n", vmg); + } +#endif +} +EXPORT_SYMBOL(dump_vmg); + static bool page_init_poisoning __read_mostly = true; static int __init setup_vm_debug(char *str) diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c index d46acf989dde..6a26eca546c3 100644 --- a/mm/debug_page_alloc.c +++ b/mm/debug_page_alloc.c @@ -23,7 +23,7 @@ static int __init debug_guardpage_minorder_setup(char *buf) unsigned long res; if (kstrtoul(buf, 10, &res) < 0 || res > MAX_PAGE_ORDER / 2) { - pr_err("Bad debug_guardpage_minorder value\n"); + pr_err("Bad debug_guardpage_minorder value: %s\n", buf); return 0; } _debug_guardpage_minorder = res; diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index bc748f700a9e..7731b238b534 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -910,26 +910,18 @@ static void __init swap_migration_tests(struct pgtable_debug_args *args) #ifdef CONFIG_HUGETLB_PAGE static void __init hugetlb_basic_tests(struct pgtable_debug_args *args) { - struct page *page; pte_t pte; pr_debug("Validating HugeTLB basic\n"); - /* - * Accessing the page associated with the pfn is safe here, - * as it was previously derived from a real kernel symbol. - */ - page = pfn_to_page(args->fixed_pmd_pfn); - pte = mk_huge_pte(page, args->page_prot); + pte = pfn_pte(args->fixed_pmd_pfn, args->page_prot); + pte = arch_make_huge_pte(pte, PMD_SHIFT, VM_ACCESS_FLAGS); +#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB + WARN_ON(!pte_huge(pte)); +#endif WARN_ON(!huge_pte_dirty(huge_pte_mkdirty(pte))); WARN_ON(!huge_pte_write(huge_pte_mkwrite(huge_pte_wrprotect(pte)))); WARN_ON(huge_pte_write(huge_pte_wrprotect(huge_pte_mkwrite(pte)))); - -#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB - pte = pfn_pte(args->fixed_pmd_pfn, args->page_prot); - - WARN_ON(!pte_huge(arch_make_huge_pte(pte, PMD_SHIFT, VM_ACCESS_FLAGS))); -#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ } #else /* !CONFIG_HUGETLB_PAGE */ static void __init hugetlb_basic_tests(struct pgtable_debug_args *args) { } diff --git a/mm/dmapool.c b/mm/dmapool.c index f0bfc6c490f4..5be8cc1c6529 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -56,6 +56,7 @@ struct dma_pool { /* the pool */ unsigned int size; unsigned int allocation; unsigned int boundary; + int node; char name[32]; struct list_head pools; }; @@ -199,12 +200,13 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block, /** - * dma_pool_create - Creates a pool of consistent memory blocks, for dma. + * dma_pool_create_node - Creates a pool of consistent memory blocks, for dma. * @name: name of pool, for diagnostics * @dev: device that will be doing the DMA * @size: size of the blocks in this pool. * @align: alignment requirement for blocks; must be a power of two * @boundary: returned blocks won't cross this power of two boundary + * @node: optional NUMA node to allocate structs 'dma_pool' and 'dma_page' on * Context: not in_interrupt() * * Given one of these pools, dma_pool_alloc() @@ -221,8 +223,8 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block, * Return: a dma allocation pool with the requested characteristics, or * %NULL if one can't be created. */ -struct dma_pool *dma_pool_create(const char *name, struct device *dev, - size_t size, size_t align, size_t boundary) +struct dma_pool *dma_pool_create_node(const char *name, struct device *dev, + size_t size, size_t align, size_t boundary, int node) { struct dma_pool *retval; size_t allocation; @@ -251,7 +253,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, boundary = min(boundary, allocation); - retval = kzalloc(sizeof(*retval), GFP_KERNEL); + retval = kzalloc_node(sizeof(*retval), GFP_KERNEL, node); if (!retval) return retval; @@ -264,6 +266,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, retval->size = size; retval->boundary = boundary; retval->allocation = allocation; + retval->node = node; INIT_LIST_HEAD(&retval->pools); /* @@ -295,7 +298,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, mutex_unlock(&pools_reg_lock); return retval; } -EXPORT_SYMBOL(dma_pool_create); +EXPORT_SYMBOL(dma_pool_create_node); static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page) { @@ -335,7 +338,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) { struct dma_page *page; - page = kmalloc(sizeof(*page), mem_flags); + page = kmalloc_node(sizeof(*page), mem_flags, pool->node); if (!page) return NULL; diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index ce06b2884789..ff35b84a7b50 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -245,7 +245,10 @@ early_memremap_prot(resource_size_t phys_addr, unsigned long size, #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) -void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size) +/* + * If no empty slot, handle that and return -ENOMEM. + */ +int __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size) { unsigned long slop, clen; char *p; @@ -256,12 +259,15 @@ void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size) if (clen > MAX_MAP_CHUNK - slop) clen = MAX_MAP_CHUNK - slop; p = early_memremap(src & PAGE_MASK, clen + slop); + if (!p) + return -ENOMEM; memcpy(dest, p + slop, clen); early_memunmap(p, clen + slop); dest += clen; src += clen; size -= clen; } + return 0; } #else /* CONFIG_MMU */ diff --git a/mm/execmem.c b/mm/execmem.c index 317b6a8d35be..2b683e7d864d 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -257,7 +257,6 @@ out_unlock: static int execmem_cache_populate(struct execmem_range *range, size_t size) { unsigned long vm_flags = VM_ALLOW_HUGE_VMAP; - unsigned long start, end; struct vm_struct *vm; size_t alloc_size; int err = -ENOMEM; @@ -275,26 +274,18 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size) /* fill memory with instructions that will trap */ execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true); - start = (unsigned long)p; - end = start + alloc_size; - - vunmap_range(start, end); - - err = execmem_set_direct_map_valid(vm, false); - if (err) - goto err_free_mem; - - err = vmap_pages_range_noflush(start, end, range->pgprot, vm->pages, - PMD_SHIFT); + err = set_memory_rox((unsigned long)p, vm->nr_pages); if (err) goto err_free_mem; err = execmem_cache_add(p, alloc_size); if (err) - goto err_free_mem; + goto err_reset_direct_map; return 0; +err_reset_direct_map: + execmem_set_direct_map_valid(vm, true); err_free_mem: vfree(p); return err; @@ -344,6 +335,28 @@ static bool execmem_cache_free(void *ptr) return true; } + +int execmem_make_temp_rw(void *ptr, size_t size) +{ + unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long addr = (unsigned long)ptr; + int ret; + + ret = set_memory_nx(addr, nr); + if (ret) + return ret; + + return set_memory_rw(addr, nr); +} + +int execmem_restore_rox(void *ptr, size_t size) +{ + unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long addr = (unsigned long)ptr; + + return set_memory_rox(addr, nr); +} + #else /* CONFIG_ARCH_HAS_EXECMEM_ROX */ static void *execmem_cache_alloc(struct execmem_range *range, size_t size) { @@ -364,6 +377,8 @@ void *execmem_alloc(enum execmem_type type, size_t size) pgprot_t pgprot = range->pgprot; void *p; + size = PAGE_ALIGN(size); + if (use_cache) p = execmem_cache_alloc(range, size); else diff --git a/mm/filemap.c b/mm/filemap.c index 7c76a123ba18..bada249b9fb7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -47,6 +47,7 @@ #include <linux/splice.h> #include <linux/rcupdate_wait.h> #include <linux/sched/mm.h> +#include <linux/sysctl.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include "internal.h" @@ -124,15 +125,6 @@ * ->private_lock (zap_pte_range->block_dirty_folio) */ -static void mapping_set_update(struct xa_state *xas, - struct address_space *mapping) -{ - if (dax_mapping(mapping) || shmem_mapping(mapping)) - return; - xas_set_update(xas, workingset_update_node); - xas_set_lru(xas, &shadow_nodes); -} - static void page_cache_delete(struct address_space *mapping, struct folio *folio, void *shadow) { @@ -150,7 +142,7 @@ static void page_cache_delete(struct address_space *mapping, xas_init_marks(&xas); folio->mapping = NULL; - /* Leave page->index set: truncation lookup relies upon it */ + /* Leave folio->index set: truncation lookup relies upon it */ mapping->nrpages -= nr; } @@ -235,15 +227,12 @@ void __filemap_remove_folio(struct folio *folio, void *shadow) void filemap_free_folio(struct address_space *mapping, struct folio *folio) { void (*free_folio)(struct folio *); - int refs = 1; free_folio = mapping->a_ops->free_folio; if (free_folio) free_folio(folio); - if (folio_test_large(folio)) - refs = folio_nr_pages(folio); - folio_put_refs(folio, refs); + folio_put_refs(folio, folio_nr_pages(folio)); } /** @@ -450,6 +439,24 @@ int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, EXPORT_SYMBOL(filemap_fdatawrite_range); /** + * filemap_fdatawrite_range_kick - start writeback on a range + * @mapping: target address_space + * @start: index to start writeback on + * @end: last (inclusive) index for writeback + * + * This is a non-integrity writeback helper, to start writing back folios + * for the indicated range. + * + * Return: %0 on success, negative error code otherwise. + */ +int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, + loff_t end) +{ + return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_NONE); +} +EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick); + +/** * filemap_flush - mostly a non-blocking flush * @mapping: target address_space * @@ -850,11 +857,10 @@ EXPORT_SYMBOL_GPL(replace_page_cache_folio); noinline int __filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp) { - XA_STATE(xas, &mapping->i_pages, index); - void *alloced_shadow = NULL; - int alloced_order = 0; + XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); bool huge; long nr; + unsigned int forder = folio_order(folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio); @@ -863,7 +869,6 @@ noinline int __filemap_add_folio(struct address_space *mapping, mapping_set_update(&xas, mapping); VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); - xas_set_order(&xas, index, folio_order(folio)); huge = folio_test_hugetlb(folio); nr = folio_nr_pages(folio); @@ -873,7 +878,7 @@ noinline int __filemap_add_folio(struct address_space *mapping, folio->index = xas.xa_index; for (;;) { - int order = -1, split_order = 0; + int order = -1; void *entry, *old = NULL; xas_lock_irq(&xas); @@ -891,21 +896,25 @@ noinline int __filemap_add_folio(struct address_space *mapping, order = xas_get_order(&xas); } - /* entry may have changed before we re-acquire the lock */ - if (alloced_order && (old != alloced_shadow || order != alloced_order)) { - xas_destroy(&xas); - alloced_order = 0; - } - if (old) { - if (order > 0 && order > folio_order(folio)) { + if (order > 0 && order > forder) { + unsigned int split_order = max(forder, + xas_try_split_min_order(order)); + /* How to handle large swap entries? */ BUG_ON(shmem_mapping(mapping)); - if (!alloced_order) { - split_order = order; - goto unlock; + + while (order > forder) { + xas_set_order(&xas, index, split_order); + xas_try_split(&xas, old, order); + if (xas_error(&xas)) + goto unlock; + order = split_order; + split_order = + max(xas_try_split_min_order( + split_order), + forder); } - xas_split(&xas, old, order); xas_reset(&xas); } if (shadowp) @@ -929,17 +938,6 @@ noinline int __filemap_add_folio(struct address_space *mapping, unlock: xas_unlock_irq(&xas); - /* split needed, alloc here and retry. */ - if (split_order) { - xas_split_alloc(&xas, old, split_order, gfp); - if (xas_error(&xas)) - goto error; - alloced_shadow = old; - alloced_order = split_order; - xas_reset(&xas); - continue; - } - if (!xas_nomem(&xas, gfp)) break; } @@ -951,7 +949,7 @@ unlock: return 0; error: folio->mapping = NULL; - /* Leave page->index set: truncation relies upon it */ + /* Leave folio->index set: truncation relies upon it */ folio_put_refs(folio, nr); return xas_error(&xas); } @@ -1068,6 +1066,19 @@ static wait_queue_head_t *folio_waitqueue(struct folio *folio) return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)]; } +/* How many times do we accept lock stealing from under a waiter? */ +static int sysctl_page_lock_unfairness = 5; +static const struct ctl_table filemap_sysctl_table[] = { + { + .procname = "page_lock_unfairness", + .data = &sysctl_page_lock_unfairness, + .maxlen = sizeof(sysctl_page_lock_unfairness), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + } +}; + void __init pagecache_init(void) { int i; @@ -1076,6 +1087,7 @@ void __init pagecache_init(void) init_waitqueue_head(&folio_wait_table[i]); page_writeback_init(); + register_sysctl_init("vm", filemap_sysctl_table); } /* @@ -1223,9 +1235,6 @@ static inline bool folio_trylock_flag(struct folio *folio, int bit_nr, return true; } -/* How many times do we accept lock stealing from under a waiter? */ -int sysctl_page_lock_unfairness = 5; - static inline int folio_wait_bit_common(struct folio *folio, int bit_nr, int state, enum behavior behavior) { @@ -1369,7 +1378,7 @@ repeat: * @ptl: already locked ptl. This function will drop the lock. * * Wait for a migration entry referencing the given page to be removed. This is - * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except + * equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except * this can be called without taking a reference on the page. Instead this * should be called while holding the ptl for the migration entry referencing * the page. @@ -1473,25 +1482,6 @@ static int folio_put_wait_locked(struct folio *folio, int state) } /** - * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue - * @folio: Folio defining the wait queue of interest - * @waiter: Waiter to add to the queue - * - * Add an arbitrary @waiter to the wait queue for the nominated @folio. - */ -void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter) -{ - wait_queue_head_t *q = folio_waitqueue(folio); - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue_entry_tail(q, waiter); - folio_set_waiters(folio); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL_GPL(folio_add_wait_queue); - -/** * folio_unlock - Unlock a locked folio. * @folio: The folio. * @@ -1532,7 +1522,7 @@ void folio_end_read(struct folio *folio, bool success) /* Must be in bottom byte for x86 to work */ BUILD_BUG_ON(PG_uptodate > 7); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio); + VM_BUG_ON_FOLIO(success && folio_test_uptodate(folio), folio); if (likely(success)) mask |= 1 << PG_uptodate; @@ -1599,6 +1589,43 @@ int folio_wait_private_2_killable(struct folio *folio) } EXPORT_SYMBOL(folio_wait_private_2_killable); +static void filemap_end_dropbehind(struct folio *folio) +{ + struct address_space *mapping = folio->mapping; + + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + + if (folio_test_writeback(folio) || folio_test_dirty(folio)) + return; + if (!folio_test_clear_dropbehind(folio)) + return; + if (mapping) + folio_unmap_invalidate(mapping, folio, 0); +} + +/* + * If folio was marked as dropbehind, then pages should be dropped when writeback + * completes. Do that now. If we fail, it's likely because of a big folio - + * just reset dropbehind for that case and latter completions should invalidate. + */ +static void filemap_end_dropbehind_write(struct folio *folio) +{ + if (!folio_test_dropbehind(folio)) + return; + + /* + * Hitting !in_task() should not happen off RWF_DONTCACHE writeback, + * but can happen if normal writeback just happens to find dirty folios + * that were created as part of uncached writeback, and that writeback + * would otherwise not need non-IRQ handling. Just skip the + * invalidation in that case. + */ + if (in_task() && folio_trylock(folio)) { + filemap_end_dropbehind(folio); + folio_unlock(folio); + } +} + /** * folio_end_writeback - End writeback against a folio. * @folio: The folio. @@ -1632,6 +1659,8 @@ void folio_end_writeback(struct folio *folio) folio_get(folio); if (__folio_end_writeback(folio)) folio_wake_bit(folio, PG_writeback); + + filemap_end_dropbehind_write(folio); acct_reclaim_writeback(folio); folio_put(folio); } @@ -1955,6 +1984,8 @@ no_page: /* Init accessed so avoid atomic mark_page_accessed later */ if (fgp_flags & FGP_ACCESSED) __folio_set_referenced(folio); + if (fgp_flags & FGP_DONTCACHE) + __folio_set_dropbehind(folio); err = filemap_add_folio(mapping, folio, index, gfp); if (!err) @@ -1965,8 +1996,19 @@ no_page: if (err == -EEXIST) goto repeat; - if (err) + if (err) { + /* + * When NOWAIT I/O fails to allocate folios this could + * be due to a nonblocking memory allocation and not + * because the system actually is out of memory. + * Return -EAGAIN so that there caller retries in a + * blocking fashion instead of propagating -ENOMEM + * to the application. + */ + if ((fgp_flags & FGP_NOWAIT) && err == -ENOMEM) + err = -EAGAIN; return ERR_PTR(err); + } /* * filemap_add_folio locks the page, and for mmap * we expect an unlocked page. @@ -1977,6 +2019,9 @@ no_page: if (!folio) return ERR_PTR(-ENOENT); + /* not an uncached lookup, clear uncached if set */ + if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE)) + folio_clear_dropbehind(folio); return folio; } EXPORT_SYMBOL(__filemap_get_folio); @@ -2210,6 +2255,7 @@ unsigned filemap_get_folios_contig(struct address_space *mapping, *start = folio->index + nr; goto out; } + xas_advance(&xas, folio_next_index(folio) - 1); continue; put_folio: folio_put(folio); @@ -2459,18 +2505,22 @@ unlock_mapping: return error; } -static int filemap_create_folio(struct file *file, - struct address_space *mapping, loff_t pos, - struct folio_batch *fbatch) +static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch) { + struct address_space *mapping = iocb->ki_filp->f_mapping; struct folio *folio; int error; unsigned int min_order = mapping_min_folio_order(mapping); pgoff_t index; + if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) + return -EAGAIN; + folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order); if (!folio) return -ENOMEM; + if (iocb->ki_flags & IOCB_DONTCACHE) + __folio_set_dropbehind(folio); /* * Protect against truncate / hole punch. Grabbing invalidate_lock @@ -2486,7 +2536,7 @@ static int filemap_create_folio(struct file *file, * well to keep locking rules simple. */ filemap_invalidate_lock_shared(mapping); - index = (pos >> (PAGE_SHIFT + min_order)) << min_order; + index = (iocb->ki_pos >> (PAGE_SHIFT + min_order)) << min_order; error = filemap_add_folio(mapping, folio, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error == -EEXIST) @@ -2494,7 +2544,8 @@ static int filemap_create_folio(struct file *file, if (error) goto error; - error = filemap_read_folio(file, mapping->a_ops->read_folio, folio); + error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio, + folio); if (error) goto error; @@ -2515,6 +2566,8 @@ static int filemap_readahead(struct kiocb *iocb, struct file *file, if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; + if (iocb->ki_flags & IOCB_DONTCACHE) + ractl.dropbehind = 1; page_cache_async_ra(&ractl, folio, last_index - folio->index); return 0; } @@ -2524,7 +2577,6 @@ static int filemap_get_pages(struct kiocb *iocb, size_t count, { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; - struct file_ra_state *ra = &filp->f_ra; pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; pgoff_t last_index; struct folio *folio; @@ -2539,20 +2591,21 @@ retry: filemap_get_read_batch(mapping, index, last_index - 1, fbatch); if (!folio_batch_count(fbatch)) { + DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index); + if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; if (iocb->ki_flags & IOCB_NOWAIT) flags = memalloc_noio_save(); - page_cache_sync_readahead(mapping, ra, filp, index, - last_index - index); + if (iocb->ki_flags & IOCB_DONTCACHE) + ractl.dropbehind = 1; + page_cache_sync_ra(&ractl, last_index - index); if (iocb->ki_flags & IOCB_NOWAIT) memalloc_noio_restore(flags); filemap_get_read_batch(mapping, index, last_index - 1, fbatch); } if (!folio_batch_count(fbatch)) { - if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) - return -EAGAIN; - err = filemap_create_folio(filp, mapping, iocb->ki_pos, fbatch); + err = filemap_create_folio(iocb, fbatch); if (err == AOP_TRUNCATED_PAGE) goto retry; return err; @@ -2593,6 +2646,18 @@ static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio) return (pos1 >> shift == pos2 >> shift); } +static void filemap_end_dropbehind_read(struct folio *folio) +{ + if (!folio_test_dropbehind(folio)) + return; + if (folio_test_writeback(folio) || folio_test_dirty(folio)) + return; + if (folio_trylock(folio)) { + filemap_end_dropbehind(folio); + folio_unlock(folio); + } +} + /** * filemap_read - Read data from the page cache. * @iocb: The iocb to read. @@ -2706,8 +2771,12 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, } } put_folios: - for (i = 0; i < folio_batch_count(&fbatch); i++) - folio_put(fbatch.folios[i]); + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + + filemap_end_dropbehind_read(folio); + folio_put(folio); + } folio_batch_init(&fbatch); } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); @@ -2848,8 +2917,7 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, size = min(size, folio_size(folio) - offset); offset %= PAGE_SIZE; - while (spliced < size && - !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { + while (spliced < size && !pipe_is_full(pipe)) { struct pipe_buffer *buf = pipe_head_buf(pipe); size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced); @@ -2906,7 +2974,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, iocb.ki_pos = *ppos; /* Work out how much data we can actually add into the pipe */ - used = pipe_occupancy(pipe->head, pipe->tail); + used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); @@ -2966,7 +3034,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, total_spliced += n; *ppos += n; in->f_ra.prev_pos = *ppos; - if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + if (pipe_is_full(pipe)) goto out; } @@ -3005,7 +3073,7 @@ static inline loff_t folio_seek_hole_data(struct xa_state *xas, if (ops->is_partially_uptodate(folio, offset, bsz) == seek_data) break; - start = (start + bsz) & ~(bsz - 1); + start = (start + bsz) & ~((u64)bsz - 1); offset += bsz; } while (offset < folio_size(folio)); unlock: @@ -3474,7 +3542,7 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio, if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) { struct page *page = folio_file_page(folio, start); - vm_fault_t ret = do_set_pmd(vmf, page); + vm_fault_t ret = do_set_pmd(vmf, folio, page); if (!ret) { /* The page is mapped successfully, reference consumed. */ folio_unlock(folio); @@ -3501,10 +3569,10 @@ static struct folio *next_uptodate_folio(struct xa_state *xas, continue; if (xa_is_value(folio)) continue; - if (folio_test_locked(folio)) - continue; if (!folio_try_get(folio)) continue; + if (folio_test_locked(folio)) + goto skip; /* Has the page moved or been split? */ if (unlikely(folio != xas_reload(xas))) goto skip; @@ -4036,17 +4104,6 @@ retry: bytes = min(chunk - offset, bytes); balance_dirty_pages_ratelimited(mapping); - /* - * Bring in the user page that we will copy from _first_. - * Otherwise there's a nasty deadlock on copying from the - * same page as we're writing to, without it being marked - * up-to-date. - */ - if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) { - status = -EFAULT; - break; - } - if (fatal_signal_pending(current)) { status = -EINTR; break; @@ -4064,6 +4121,12 @@ retry: if (mapping_writably_mapped(mapping)) flush_dcache_folio(folio); + /* + * Faults here on mmap()s can recurse into arbitrary + * filesystem code. Lots of locks are held that can + * deadlock. Use an atomic copy to avoid deadlocking + * in page fault handling. + */ copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); flush_dcache_folio(folio); @@ -4089,6 +4152,16 @@ retry: bytes = copied; goto retry; } + + /* + * 'folio' is now unlocked and faults on it can be + * handled. Ensure forward progress by trying to + * fault it in now. + */ + if (fault_in_iov_iter_readable(i, bytes) == bytes) { + status = -EFAULT; + break; + } } else { pos += status; written += status; @@ -4385,6 +4458,20 @@ resched: } /* + * See mincore: reveal pagecache information only for files + * that the calling process has write access to, or could (if + * tried) open for writing. + */ +static inline bool can_do_cachestat(struct file *f) +{ + if (f->f_mode & FMODE_WRITE) + return true; + if (inode_owner_or_capable(file_mnt_idmap(f), file_inode(f))) + return true; + return file_permission(f, MAY_WRITE) == 0; +} + +/* * The cachestat(2) system call. * * cachestat() returns the page cache statistics of a file in the @@ -4439,6 +4526,9 @@ SYSCALL_DEFINE4(cachestat, unsigned int, fd, if (is_file_hugepages(fd_file(f))) return -EOPNOTSUPP; + if (!can_do_cachestat(fd_file(f))) + return -EPERM; + if (flags != 0) return -EINVAL; diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 1d1832e2a599..45540942d148 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -28,12 +28,6 @@ void wait_on_page_writeback(struct page *page) } EXPORT_SYMBOL_GPL(wait_on_page_writeback); -void wait_for_stable_page(struct page *page) -{ - return folio_wait_stable(page_folio(page)); -} -EXPORT_SYMBOL_GPL(wait_for_stable_page); - void mark_page_accessed(struct page *page) { folio_mark_accessed(page_folio(page)); @@ -90,11 +84,3 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, return folio_file_page(folio, index); } EXPORT_SYMBOL(pagecache_get_page); - -struct page *grab_cache_page_write_begin(struct address_space *mapping, - pgoff_t index) -{ - return pagecache_get_page(mapping, index, FGP_WRITEBEGIN, - mapping_gfp_mask(mapping)); -} -EXPORT_SYMBOL(grab_cache_page_write_begin); @@ -26,6 +26,7 @@ #include <asm/tlbflush.h> #include "internal.h" +#include "swap.h" struct follow_page_context { struct dev_pagemap *pgmap; @@ -52,7 +53,12 @@ static inline void sanity_check_pinned_pages(struct page **pages, */ for (; npages; npages--, pages++) { struct page *page = *pages; - struct folio *folio = page_folio(page); + struct folio *folio; + + if (!page) + continue; + + folio = page_folio(page); if (is_zero_page(page) || !folio_test_anon(folio)) @@ -91,8 +97,7 @@ retry: * belongs to this folio. */ if (unlikely(page_folio(page) != folio)) { - if (!put_devmap_managed_folio_refs(folio, refs)) - folio_put_refs(folio, refs); + folio_put_refs(folio, refs); goto retry; } @@ -105,14 +110,13 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) if (is_zero_folio(folio)) return; node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs); - if (folio_test_large(folio)) + if (folio_has_pincount(folio)) atomic_sub(refs, &folio->_pincount); else refs *= GUP_PIN_COUNTING_BIAS; } - if (!put_devmap_managed_folio_refs(folio, refs)) - folio_put_refs(folio, refs); + folio_put_refs(folio, refs); } /** @@ -161,7 +165,7 @@ int __must_check try_grab_folio(struct folio *folio, int refs, * Increment the normal page refcount field at least once, * so that the page really is pinned. */ - if (folio_test_large(folio)) { + if (folio_has_pincount(folio)) { folio_ref_add(folio, refs); atomic_add(refs, &folio->_pincount); } else { @@ -220,7 +224,7 @@ void folio_add_pin(struct folio *folio) * page refcount field at least once, so that the page really is * pinned. */ - if (folio_test_large(folio)) { + if (folio_has_pincount(folio)) { WARN_ON_ONCE(atomic_read(&folio->_pincount) < 1); folio_ref_inc(folio); atomic_inc(&folio->_pincount); @@ -409,6 +413,10 @@ void unpin_user_pages(struct page **pages, unsigned long npages) sanity_check_pinned_pages(pages, npages); for (i = 0; i < npages; i += nr) { + if (!pages[i]) { + nr = 1; + continue; + } folio = gup_folio_next(pages, npages, i, &nr); gup_put_folio(folio, nr, FOLL_PIN); } @@ -556,8 +564,7 @@ static struct folio *try_grab_folio_fast(struct page *page, int refs, */ if (unlikely((flags & FOLL_LONGTERM) && !folio_is_longterm_pinnable(folio))) { - if (!put_devmap_managed_folio_refs(folio, refs)) - folio_put_refs(folio, refs); + folio_put_refs(folio, refs); return NULL; } @@ -569,7 +576,7 @@ static struct folio *try_grab_folio_fast(struct page *page, int refs, * is pinned. That's why the refcount from the earlier * try_get_folio() is left intact. */ - if (folio_test_large(folio)) + if (folio_has_pincount(folio)) atomic_add(refs, &folio->_pincount); else folio_ref_add(folio, @@ -587,6 +594,33 @@ static struct folio *try_grab_folio_fast(struct page *page, int refs, } #endif /* CONFIG_HAVE_GUP_FAST */ +/* Common code for can_follow_write_* */ +static inline bool can_follow_write_common(struct page *page, + struct vm_area_struct *vma, unsigned int flags) +{ + /* Maybe FOLL_FORCE is set to override it? */ + if (!(flags & FOLL_FORCE)) + return false; + + /* But FOLL_FORCE has no effect on shared mappings */ + if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) + return false; + + /* ... or read-only private ones */ + if (!(vma->vm_flags & VM_MAYWRITE)) + return false; + + /* ... or already writable ones that just need to take a write fault */ + if (vma->vm_flags & VM_WRITE) + return false; + + /* + * See can_change_pte_writable(): we broke COW and could map the page + * writable if we have an exclusive anonymous page ... + */ + return page && PageAnon(page) && PageAnonExclusive(page); +} + static struct page *no_page_table(struct vm_area_struct *vma, unsigned int flags, unsigned long address) { @@ -613,6 +647,18 @@ static struct page *no_page_table(struct vm_area_struct *vma, } #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES +/* FOLL_FORCE can write to even unwritable PUDs in COW mappings. */ +static inline bool can_follow_write_pud(pud_t pud, struct page *page, + struct vm_area_struct *vma, + unsigned int flags) +{ + /* If the pud is writable, we can write to the page. */ + if (pud_write(pud)) + return true; + + return can_follow_write_common(page, vma, flags); +} + static struct page *follow_huge_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp, int flags, struct follow_page_context *ctx) @@ -625,10 +671,11 @@ static struct page *follow_huge_pud(struct vm_area_struct *vma, assert_spin_locked(pud_lockptr(mm, pudp)); - if ((flags & FOLL_WRITE) && !pud_write(pud)) + if (!pud_present(pud)) return NULL; - if (!pud_present(pud)) + if ((flags & FOLL_WRITE) && + !can_follow_write_pud(pud, pfn_to_page(pfn), vma, flags)) return NULL; pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; @@ -677,27 +724,7 @@ static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, if (pmd_write(pmd)) return true; - /* Maybe FOLL_FORCE is set to override it? */ - if (!(flags & FOLL_FORCE)) - return false; - - /* But FOLL_FORCE has no effect on shared mappings */ - if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) - return false; - - /* ... or read-only private ones */ - if (!(vma->vm_flags & VM_MAYWRITE)) - return false; - - /* ... or already writable ones that just need to take a write fault */ - if (vma->vm_flags & VM_WRITE) - return false; - - /* - * See can_change_pte_writable(): we broke COW and could map the page - * writable if we have an exclusive anonymous page ... - */ - if (!page || !PageAnon(page) || !PageAnonExclusive(page)) + if (!can_follow_write_common(page, vma, flags)) return false; /* ... and a write-fault isn't required for other reasons. */ @@ -798,27 +825,7 @@ static inline bool can_follow_write_pte(pte_t pte, struct page *page, if (pte_write(pte)) return true; - /* Maybe FOLL_FORCE is set to override it? */ - if (!(flags & FOLL_FORCE)) - return false; - - /* But FOLL_FORCE has no effect on shared mappings */ - if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) - return false; - - /* ... or read-only private ones */ - if (!(vma->vm_flags & VM_MAYWRITE)) - return false; - - /* ... or already writable ones that just need to take a write fault */ - if (vma->vm_flags & VM_WRITE) - return false; - - /* - * See can_change_pte_writable(): we broke COW and could map the page - * writable if we have an exclusive anonymous page ... - */ - if (!page || !PageAnon(page) || !PageAnonExclusive(page)) + if (!can_follow_write_common(page, vma, flags)) return false; /* ... and a write-fault isn't required for other reasons. */ @@ -838,11 +845,6 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, pte_t *ptep, pte; int ret; - /* FOLL_GET and FOLL_PIN are mutually exclusive. */ - if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == - (FOLL_PIN | FOLL_GET))) - return ERR_PTR(-EINVAL); - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) return no_page_table(vma, flags, address); @@ -1100,10 +1102,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, /* user gate pages are read-only */ if (gup_flags & FOLL_WRITE) return -EFAULT; - if (address > TASK_SIZE) - pgd = pgd_offset_k(address); - else - pgd = pgd_offset_gate(mm, address); + pgd = pgd_offset(mm, address); if (pgd_none(*pgd)) return -EFAULT; p4d = p4d_offset(pgd, address); @@ -1274,6 +1273,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma)) return -EOPNOTSUPP; + if ((gup_flags & FOLL_SPLIT_PMD) && is_vm_hugetlb_page(vma)) + return -EOPNOTSUPP; + if (vma_is_secretmem(vma)) return -EFAULT; @@ -1285,9 +1287,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; - /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */ - if (is_vm_hugetlb_page(vma)) - return -EFAULT; /* * We used to let the write,force case do COW in a * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could @@ -1426,7 +1425,11 @@ static long __get_user_pages(struct mm_struct *mm, start = untagged_addr_remote(mm, start); - VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN))); + VM_WARN_ON_ONCE(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN))); + + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + VM_WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) == + (FOLL_PIN | FOLL_GET)); do { struct page *page; @@ -2108,28 +2111,22 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, */ size_t fault_in_writeable(char __user *uaddr, size_t size) { - char __user *start = uaddr, *end; + const unsigned long start = (unsigned long)uaddr; + const unsigned long end = start + size; + unsigned long cur; if (unlikely(size == 0)) return 0; if (!user_write_access_begin(uaddr, size)) return size; - if (!PAGE_ALIGNED(uaddr)) { - unsafe_put_user(0, uaddr, out); - uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr); - } - end = (char __user *)PAGE_ALIGN((unsigned long)start + size); - if (unlikely(end < start)) - end = NULL; - while (uaddr != end) { - unsafe_put_user(0, uaddr, out); - uaddr += PAGE_SIZE; - } + /* Stop once we overflow to 0. */ + for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE)) + unsafe_put_user(0, (char __user *)cur, out); out: user_write_access_end(); - if (size > uaddr - start) - return size - (uaddr - start); + if (size > cur - start) + return size - (cur - start); return 0; } EXPORT_SYMBOL(fault_in_writeable); @@ -2183,26 +2180,24 @@ EXPORT_SYMBOL(fault_in_subpage_writeable); */ size_t fault_in_safe_writeable(const char __user *uaddr, size_t size) { - unsigned long start = (unsigned long)uaddr, end; + const unsigned long start = (unsigned long)uaddr; + const unsigned long end = start + size; + unsigned long cur; struct mm_struct *mm = current->mm; bool unlocked = false; if (unlikely(size == 0)) return 0; - end = PAGE_ALIGN(start + size); - if (end < start) - end = 0; mmap_read_lock(mm); - do { - if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked)) + /* Stop once we overflow to 0. */ + for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE)) + if (fixup_user_fault(mm, cur, FAULT_FLAG_WRITE, &unlocked)) break; - start = (start + PAGE_SIZE) & PAGE_MASK; - } while (start != end); mmap_read_unlock(mm); - if (size > (unsigned long)uaddr - start) - return size - ((unsigned long)uaddr - start); + if (size > cur - start) + return size - (cur - start); return 0; } EXPORT_SYMBOL(fault_in_safe_writeable); @@ -2217,30 +2212,24 @@ EXPORT_SYMBOL(fault_in_safe_writeable); */ size_t fault_in_readable(const char __user *uaddr, size_t size) { - const char __user *start = uaddr, *end; + const unsigned long start = (unsigned long)uaddr; + const unsigned long end = start + size; + unsigned long cur; volatile char c; if (unlikely(size == 0)) return 0; if (!user_read_access_begin(uaddr, size)) return size; - if (!PAGE_ALIGNED(uaddr)) { - unsafe_get_user(c, uaddr, out); - uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr); - } - end = (const char __user *)PAGE_ALIGN((unsigned long)start + size); - if (unlikely(end < start)) - end = NULL; - while (uaddr != end) { - unsafe_get_user(c, uaddr, out); - uaddr += PAGE_SIZE; - } + /* Stop once we overflow to 0. */ + for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE)) + unsafe_get_user(c, (const char __user *)cur, out); out: user_read_access_end(); (void)c; - if (size > uaddr - start) - return size - (uaddr - start); + if (size > cur - start) + return size - (cur - start); return 0; } EXPORT_SYMBOL(fault_in_readable); @@ -2248,6 +2237,7 @@ EXPORT_SYMBOL(fault_in_readable); /** * get_dump_page() - pin user page in memory while writing it to core dump * @addr: user address + * @locked: a pointer to an int denoting whether the mmap sem is held * * Returns struct page pointer of user page pinned for dump, * to be freed afterwards by put_page(). @@ -2260,13 +2250,12 @@ EXPORT_SYMBOL(fault_in_readable); * Called without mmap_lock (takes and releases the mmap_lock by itself). */ #ifdef CONFIG_ELF_CORE -struct page *get_dump_page(unsigned long addr) +struct page *get_dump_page(unsigned long addr, int *locked) { struct page *page; - int locked = 0; int ret; - ret = __get_user_pages_locked(current->mm, addr, 1, &page, &locked, + ret = __get_user_pages_locked(current->mm, addr, 1, &page, locked, FOLL_FORCE | FOLL_DUMP | FOLL_GET); return (ret == 1) ? page : NULL; } @@ -2338,7 +2327,7 @@ static unsigned long collect_longterm_unpinnable_folios( continue; if (folio_test_hugetlb(folio)) { - isolate_hugetlb(folio, movable_folio_list); + folio_isolate_hugetlb(folio, movable_folio_list); continue; } @@ -2757,7 +2746,7 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * * *) ptes can be read atomically by the architecture. * - * *) access_ok is sufficient to validate userspace address ranges. + * *) valid user addesses are below TASK_MAX_SIZE * * The last two assumptions can be relaxed by the addition of helper functions. * @@ -3010,11 +2999,6 @@ static int gup_fast_devmap_leaf(unsigned long pfn, unsigned long addr, break; } - if (!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)) { - gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); - break; - } - folio = try_grab_folio_fast(page, 1, flags); if (!folio) { gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); @@ -3178,46 +3162,6 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, return 1; } -static int gup_fast_pgd_leaf(pgd_t orig, pgd_t *pgdp, unsigned long addr, - unsigned long end, unsigned int flags, struct page **pages, - int *nr) -{ - int refs; - struct page *page; - struct folio *folio; - - if (!pgd_access_permitted(orig, flags & FOLL_WRITE)) - return 0; - - BUILD_BUG_ON(pgd_devmap(orig)); - - page = pgd_page(orig); - refs = record_subpages(page, PGDIR_SIZE, addr, end, pages + *nr); - - folio = try_grab_folio_fast(page, refs, flags); - if (!folio) - return 0; - - if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) { - gup_put_folio(folio, refs, flags); - return 0; - } - - if (!pgd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { - gup_put_folio(folio, refs, flags); - return 0; - } - - if (!gup_fast_folio_allowed(folio, flags)) { - gup_put_folio(folio, refs, flags); - return 0; - } - - *nr += refs; - folio_set_referenced(folio); - return 1; -} - static int gup_fast_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) @@ -3312,12 +3256,9 @@ static void gup_fast_pgd_range(unsigned long addr, unsigned long end, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) return; - if (unlikely(pgd_leaf(pgd))) { - if (!gup_fast_pgd_leaf(pgd, pgdp, addr, next, flags, - pages, nr)) - return; - } else if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags, - pages, nr)) + BUILD_BUG_ON(pgd_leaf(pgd)); + if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags, + pages, nr)) return; } while (pgdp++, addr = next, addr != end); } @@ -3351,8 +3292,7 @@ static unsigned long gup_fast(unsigned long start, unsigned long end, return 0; if (gup_flags & FOLL_PIN) { - seq = raw_read_seqcount(¤t->mm->write_protect_seq); - if (seq & 1) + if (!raw_seqcount_try_begin(¤t->mm->write_protect_seq, seq)) return 0; } @@ -3365,7 +3305,7 @@ static unsigned long gup_fast(unsigned long start, unsigned long end, * include/asm-generic/tlb.h for more details. * * We do not adopt an rcu_read_lock() here as we also want to block IPIs - * that come from THPs splitting. + * that come from callers of tlb_remove_table_sync_one(). */ local_irq_save(flags); gup_fast_pgd_range(start, end, gup_flags, pages, &nr_pinned); @@ -3412,8 +3352,6 @@ static int gup_fast_fallback(unsigned long start, unsigned long nr_pages, return -EOVERFLOW; if (end > TASK_SIZE_MAX) return -EFAULT; - if (unlikely(!access_ok((void __user *)start, len))) - return -EFAULT; nr_pinned = gup_fast(start, end, gup_flags, pages); if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY) @@ -3655,7 +3593,7 @@ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, { unsigned int flags, nr_folios, nr_found; unsigned int i, pgshift = PAGE_SHIFT; - pgoff_t start_idx, end_idx, next_idx; + pgoff_t start_idx, end_idx; struct folio *folio = NULL; struct folio_batch fbatch; struct hstate *h; @@ -3705,20 +3643,8 @@ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, folio = NULL; } - next_idx = 0; for (i = 0; i < nr_found; i++) { - /* - * As there can be multiple entries for a - * given folio in the batch returned by - * filemap_get_folios_contig(), the below - * check is to ensure that we pin and return a - * unique set of folios between start and end. - */ - if (next_idx && - next_idx != folio_index(fbatch.folios[i])) - continue; - - folio = page_folio(&fbatch.folios[i]->page); + folio = fbatch.folios[i]; if (try_grab_folio(folio, 1, FOLL_PIN)) { folio_batch_release(&fbatch); @@ -3730,7 +3656,6 @@ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, *offset = offset_in_folio(folio, start); folios[nr_folios] = folio; - next_idx = folio_next_index(folio); if (++nr_folios == max_folios) break; } @@ -10,6 +10,7 @@ */ #include <linux/pagewalk.h> #include <linux/hmm.h> +#include <linux/hmm-dma.h> #include <linux/init.h> #include <linux/rmap.h> #include <linux/swap.h> @@ -23,6 +24,7 @@ #include <linux/sched/mm.h> #include <linux/jump_label.h> #include <linux/dma-mapping.h> +#include <linux/pci-p2pdma.h> #include <linux/mmu_notifier.h> #include <linux/memory_hotplug.h> @@ -39,13 +41,21 @@ enum { HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT, }; +enum { + /* These flags are carried from input-to-output */ + HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA | + HMM_PFN_P2PDMA_BUS, +}; + static int hmm_pfns_fill(unsigned long addr, unsigned long end, struct hmm_range *range, unsigned long cpu_flags) { unsigned long i = (addr - range->start) >> PAGE_SHIFT; - for (; addr < end; addr += PAGE_SIZE, i++) - range->hmm_pfns[i] = cpu_flags; + for (; addr < end; addr += PAGE_SIZE, i++) { + range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS; + range->hmm_pfns[i] |= cpu_flags; + } return 0; } @@ -202,8 +212,10 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, return hmm_vma_fault(addr, end, required_fault, walk); pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) - hmm_pfns[i] = pfn | cpu_flags; + for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) { + hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS; + hmm_pfns[i] |= pfn | cpu_flags; + } return 0; } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -230,14 +242,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, unsigned long cpu_flags; pte_t pte = ptep_get(ptep); uint64_t pfn_req_flags = *hmm_pfn; + uint64_t new_pfn_flags = 0; if (pte_none_mostly(pte)) { required_fault = hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); if (required_fault) goto fault; - *hmm_pfn = 0; - return 0; + goto out; } if (!pte_present(pte)) { @@ -248,21 +260,19 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, * just report the PFN. */ if (is_device_private_entry(entry) && - pfn_swap_entry_to_page(entry)->pgmap->owner == + page_pgmap(pfn_swap_entry_to_page(entry))->owner == range->dev_private_owner) { cpu_flags = HMM_PFN_VALID; if (is_writable_device_private_entry(entry)) cpu_flags |= HMM_PFN_WRITE; - *hmm_pfn = swp_offset_pfn(entry) | cpu_flags; - return 0; + new_pfn_flags = swp_offset_pfn(entry) | cpu_flags; + goto out; } required_fault = hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); - if (!required_fault) { - *hmm_pfn = 0; - return 0; - } + if (!required_fault) + goto out; if (!non_swap_entry(entry)) goto fault; @@ -304,11 +314,13 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, pte_unmap(ptep); return -EFAULT; } - *hmm_pfn = HMM_PFN_ERROR; - return 0; + new_pfn_flags = HMM_PFN_ERROR; + goto out; } - *hmm_pfn = pte_pfn(pte) | cpu_flags; + new_pfn_flags = pte_pfn(pte) | cpu_flags; +out: + *hmm_pfn = (*hmm_pfn & HMM_PFN_INOUT_FLAGS) | new_pfn_flags; return 0; fault: @@ -448,8 +460,10 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, } pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - for (i = 0; i < npages; ++i, ++pfn) - hmm_pfns[i] = pfn | cpu_flags; + for (i = 0; i < npages; ++i, ++pfn) { + hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS; + hmm_pfns[i] |= pfn | cpu_flags; + } goto out_unlock; } @@ -507,8 +521,10 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, } pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT); - for (; addr < end; addr += PAGE_SIZE, i++, pfn++) - range->hmm_pfns[i] = pfn | cpu_flags; + for (; addr < end; addr += PAGE_SIZE, i++, pfn++) { + range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS; + range->hmm_pfns[i] |= pfn | cpu_flags; + } spin_unlock(ptl); return 0; @@ -607,3 +623,211 @@ int hmm_range_fault(struct hmm_range *range) return ret; } EXPORT_SYMBOL(hmm_range_fault); + +/** + * hmm_dma_map_alloc - Allocate HMM map structure + * @dev: device to allocate structure for + * @map: HMM map to allocate + * @nr_entries: number of entries in the map + * @dma_entry_size: size of the DMA entry in the map + * + * Allocate the HMM map structure and all the lists it contains. + * Return 0 on success, -ENOMEM on failure. + */ +int hmm_dma_map_alloc(struct device *dev, struct hmm_dma_map *map, + size_t nr_entries, size_t dma_entry_size) +{ + bool dma_need_sync = false; + bool use_iova; + + WARN_ON_ONCE(!(nr_entries * PAGE_SIZE / dma_entry_size)); + + /* + * The HMM API violates our normal DMA buffer ownership rules and can't + * transfer buffer ownership. The dma_addressing_limited() check is a + * best approximation to ensure no swiotlb buffering happens. + */ +#ifdef CONFIG_DMA_NEED_SYNC + dma_need_sync = !dev->dma_skip_sync; +#endif /* CONFIG_DMA_NEED_SYNC */ + if (dma_need_sync || dma_addressing_limited(dev)) + return -EOPNOTSUPP; + + map->dma_entry_size = dma_entry_size; + map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list), + GFP_KERNEL | __GFP_NOWARN); + if (!map->pfn_list) + return -ENOMEM; + + use_iova = dma_iova_try_alloc(dev, &map->state, 0, + nr_entries * PAGE_SIZE); + if (!use_iova && dma_need_unmap(dev)) { + map->dma_list = kvcalloc(nr_entries, sizeof(*map->dma_list), + GFP_KERNEL | __GFP_NOWARN); + if (!map->dma_list) + goto err_dma; + } + return 0; + +err_dma: + kvfree(map->pfn_list); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(hmm_dma_map_alloc); + +/** + * hmm_dma_map_free - iFree HMM map structure + * @dev: device to free structure from + * @map: HMM map containing the various lists and state + * + * Free the HMM map structure and all the lists it contains. + */ +void hmm_dma_map_free(struct device *dev, struct hmm_dma_map *map) +{ + if (dma_use_iova(&map->state)) + dma_iova_free(dev, &map->state); + kvfree(map->pfn_list); + kvfree(map->dma_list); +} +EXPORT_SYMBOL_GPL(hmm_dma_map_free); + +/** + * hmm_dma_map_pfn - Map a physical HMM page to DMA address + * @dev: Device to map the page for + * @map: HMM map + * @idx: Index into the PFN and dma address arrays + * @p2pdma_state: PCI P2P state. + * + * dma_alloc_iova() allocates IOVA based on the size specified by their use in + * iova->size. Call this function after IOVA allocation to link whole @page + * to get the DMA address. Note that very first call to this function + * will have @offset set to 0 in the IOVA space allocated from + * dma_alloc_iova(). For subsequent calls to this function on same @iova, + * @offset needs to be advanced by the caller with the size of previous + * page that was linked + DMA address returned for the previous page that was + * linked by this function. + */ +dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map, + size_t idx, + struct pci_p2pdma_map_state *p2pdma_state) +{ + struct dma_iova_state *state = &map->state; + dma_addr_t *dma_addrs = map->dma_list; + unsigned long *pfns = map->pfn_list; + struct page *page = hmm_pfn_to_page(pfns[idx]); + phys_addr_t paddr = hmm_pfn_to_phys(pfns[idx]); + size_t offset = idx * map->dma_entry_size; + unsigned long attrs = 0; + dma_addr_t dma_addr; + int ret; + + if ((pfns[idx] & HMM_PFN_DMA_MAPPED) && + !(pfns[idx] & HMM_PFN_P2PDMA_BUS)) { + /* + * We are in this flow when there is a need to resync flags, + * for example when page was already linked in prefetch call + * with READ flag and now we need to add WRITE flag + * + * This page was already programmed to HW and we don't want/need + * to unlink and link it again just to resync flags. + */ + if (dma_use_iova(state)) + return state->addr + offset; + + /* + * Without dma_need_unmap, the dma_addrs array is NULL, thus we + * need to regenerate the address below even if there already + * was a mapping. But !dma_need_unmap implies that the + * mapping stateless, so this is fine. + */ + if (dma_need_unmap(dev)) + return dma_addrs[idx]; + + /* Continue to remapping */ + } + + switch (pci_p2pdma_state(p2pdma_state, dev, page)) { + case PCI_P2PDMA_MAP_NONE: + break; + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + attrs |= DMA_ATTR_SKIP_CPU_SYNC; + pfns[idx] |= HMM_PFN_P2PDMA; + break; + case PCI_P2PDMA_MAP_BUS_ADDR: + pfns[idx] |= HMM_PFN_P2PDMA_BUS | HMM_PFN_DMA_MAPPED; + return pci_p2pdma_bus_addr_map(p2pdma_state, paddr); + default: + return DMA_MAPPING_ERROR; + } + + if (dma_use_iova(state)) { + ret = dma_iova_link(dev, state, paddr, offset, + map->dma_entry_size, DMA_BIDIRECTIONAL, + attrs); + if (ret) + goto error; + + ret = dma_iova_sync(dev, state, offset, map->dma_entry_size); + if (ret) { + dma_iova_unlink(dev, state, offset, map->dma_entry_size, + DMA_BIDIRECTIONAL, attrs); + goto error; + } + + dma_addr = state->addr + offset; + } else { + if (WARN_ON_ONCE(dma_need_unmap(dev) && !dma_addrs)) + goto error; + + dma_addr = dma_map_page(dev, page, 0, map->dma_entry_size, + DMA_BIDIRECTIONAL); + if (dma_mapping_error(dev, dma_addr)) + goto error; + + if (dma_need_unmap(dev)) + dma_addrs[idx] = dma_addr; + } + pfns[idx] |= HMM_PFN_DMA_MAPPED; + return dma_addr; +error: + pfns[idx] &= ~HMM_PFN_P2PDMA; + return DMA_MAPPING_ERROR; + +} +EXPORT_SYMBOL_GPL(hmm_dma_map_pfn); + +/** + * hmm_dma_unmap_pfn - Unmap a physical HMM page from DMA address + * @dev: Device to unmap the page from + * @map: HMM map + * @idx: Index of the PFN to unmap + * + * Returns true if the PFN was mapped and has been unmapped, false otherwise. + */ +bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx) +{ + const unsigned long valid_dma = HMM_PFN_VALID | HMM_PFN_DMA_MAPPED; + struct dma_iova_state *state = &map->state; + dma_addr_t *dma_addrs = map->dma_list; + unsigned long *pfns = map->pfn_list; + unsigned long attrs = 0; + + if ((pfns[idx] & valid_dma) != valid_dma) + return false; + + if (pfns[idx] & HMM_PFN_P2PDMA_BUS) + ; /* no need to unmap bus address P2P mappings */ + else if (dma_use_iova(state)) { + if (pfns[idx] & HMM_PFN_P2PDMA) + attrs |= DMA_ATTR_SKIP_CPU_SYNC; + dma_iova_unlink(dev, state, idx * map->dma_entry_size, + map->dma_entry_size, DMA_BIDIRECTIONAL, attrs); + } else if (dma_need_unmap(dev)) + dma_unmap_page(dev, dma_addrs[idx], map->dma_entry_size, + DMA_BIDIRECTIONAL); + + pfns[idx] &= + ~(HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA | HMM_PFN_P2PDMA_BUS); + return true; +} +EXPORT_SYMBOL_GPL(hmm_dma_unmap_pfn); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ee335d96fc39..d3e66136e41a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -617,6 +617,8 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT); DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN); +DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK); +DEFINE_MTHP_STAT_ATTR(swpin_fallback_charge, MTHP_STAT_SWPIN_FALLBACK_CHARGE); DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); #ifdef CONFIG_SHMEM @@ -637,6 +639,8 @@ static struct attribute *anon_stats_attrs[] = { #ifndef CONFIG_SHMEM &zswpout_attr.attr, &swpin_attr.attr, + &swpin_fallback_attr.attr, + &swpin_fallback_charge_attr.attr, &swpout_attr.attr, &swpout_fallback_attr.attr, #endif @@ -669,6 +673,8 @@ static struct attribute *any_stats_attrs[] = { #ifdef CONFIG_SHMEM &zswpout_attr.attr, &swpin_attr.attr, + &swpin_fallback_attr.attr, + &swpin_fallback_charge_attr.attr, &swpout_attr.attr, &swpout_fallback_attr.attr, #endif @@ -1176,11 +1182,12 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma, folio_throttle_swaprate(folio, gfp); /* - * When a folio is not zeroed during allocation (__GFP_ZERO not used), - * folio_zero_user() is used to make sure that the page corresponding - * to the faulting address will be hot in the cache after zeroing. + * When a folio is not zeroed during allocation (__GFP_ZERO not used) + * or user folios require special handling, folio_zero_user() is used to + * make sure that the page corresponding to the faulting address will be + * hot in the cache after zeroing. */ - if (!alloc_zeroed()) + if (user_alloc_needs_zeroing()) folio_zero_user(folio, addr); /* * The memory barrier inside __folio_mark_uptodate makes sure that @@ -1196,7 +1203,7 @@ static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd, { pmd_t entry; - entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); + entry = folio_mk_pmd(folio, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); @@ -1302,10 +1309,7 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm, struct folio *zero_folio) { pmd_t entry; - if (!pmd_none(*pmd)) - return; - entry = mk_pmd(&zero_folio->page, vma->vm_page_prot); - entry = pmd_mkhuge(entry); + entry = folio_mk_pmd(zero_folio, vma->vm_page_prot); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); mm_inc_nr_ptes(mm); @@ -1368,20 +1372,20 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) return __do_huge_pmd_anonymous_page(vmf); } -static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, +static int insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write, pgtable_t pgtable) { struct mm_struct *mm = vma->vm_mm; pmd_t entry; - spinlock_t *ptl; - ptl = pmd_lock(mm, pmd); + lockdep_assert_held(pmd_lockptr(mm, pmd)); + if (!pmd_none(*pmd)) { if (write) { if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) { WARN_ON_ONCE(!is_huge_zero_pmd(*pmd)); - goto out_unlock; + return -EEXIST; } entry = pmd_mkyoung(*pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); @@ -1389,7 +1393,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, update_mmu_cache_pmd(vma, addr, pmd); } - goto out_unlock; + return -EEXIST; } entry = pmd_mkhuge(pfn_t_pmd(pfn, prot)); @@ -1405,16 +1409,11 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, if (pgtable) { pgtable_trans_huge_deposit(mm, pmd, pgtable); mm_inc_nr_ptes(mm); - pgtable = NULL; } set_pmd_at(mm, addr, pmd, entry); update_mmu_cache_pmd(vma, addr, pmd); - -out_unlock: - spin_unlock(ptl); - if (pgtable) - pte_free(mm, pgtable); + return 0; } /** @@ -1433,6 +1432,8 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write) struct vm_area_struct *vma = vmf->vma; pgprot_t pgprot = vma->vm_page_prot; pgtable_t pgtable = NULL; + spinlock_t *ptl; + int error; /* * If we had pmd_special, we could avoid all these restrictions, @@ -1454,13 +1455,58 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write) return VM_FAULT_OOM; } - track_pfn_insert(vma, &pgprot, pfn); + pfnmap_setup_cachemode_pfn(pfn_t_to_pfn(pfn), &pgprot); + + ptl = pmd_lock(vma->vm_mm, vmf->pmd); + error = insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, + pgtable); + spin_unlock(ptl); + if (error && pgtable) + pte_free(vma->vm_mm, pgtable); - insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); +vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio, + bool write) +{ + struct vm_area_struct *vma = vmf->vma; + unsigned long addr = vmf->address & PMD_MASK; + struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; + pgtable_t pgtable = NULL; + int error; + + if (addr < vma->vm_start || addr >= vma->vm_end) + return VM_FAULT_SIGBUS; + + if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER)) + return VM_FAULT_SIGBUS; + + if (arch_needs_pgtable_deposit()) { + pgtable = pte_alloc_one(vma->vm_mm); + if (!pgtable) + return VM_FAULT_OOM; + } + + ptl = pmd_lock(mm, vmf->pmd); + if (pmd_none(*vmf->pmd)) { + folio_get(folio); + folio_add_file_rmap_pmd(folio, &folio->page, vma); + add_mm_counter(mm, mm_counter_file(folio), HPAGE_PMD_NR); + } + error = insert_pfn_pmd(vma, addr, vmf->pmd, + pfn_to_pfn_t(folio_pfn(folio)), vma->vm_page_prot, + write, pgtable); + spin_unlock(ptl); + if (error && pgtable) + pte_free(mm, pgtable); + + return VM_FAULT_NOPAGE; +} +EXPORT_SYMBOL_GPL(vmf_insert_folio_pmd); + #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma) { @@ -1475,19 +1521,17 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, struct mm_struct *mm = vma->vm_mm; pgprot_t prot = vma->vm_page_prot; pud_t entry; - spinlock_t *ptl; - ptl = pud_lock(mm, pud); if (!pud_none(*pud)) { if (write) { if (WARN_ON_ONCE(pud_pfn(*pud) != pfn_t_to_pfn(pfn))) - goto out_unlock; + return; entry = pud_mkyoung(*pud); entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma); if (pudp_set_access_flags(vma, addr, pud, entry, 1)) update_mmu_cache_pud(vma, addr, pud); } - goto out_unlock; + return; } entry = pud_mkhuge(pfn_t_pud(pfn, prot)); @@ -1501,9 +1545,6 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, } set_pud_at(mm, addr, pud, entry); update_mmu_cache_pud(vma, addr, pud); - -out_unlock: - spin_unlock(ptl); } /** @@ -1521,6 +1562,7 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) unsigned long addr = vmf->address & PUD_MASK; struct vm_area_struct *vma = vmf->vma; pgprot_t pgprot = vma->vm_page_prot; + spinlock_t *ptl; /* * If we had pud_special, we could avoid all these restrictions, @@ -1536,12 +1578,59 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; - track_pfn_insert(vma, &pgprot, pfn); + pfnmap_setup_cachemode_pfn(pfn_t_to_pfn(pfn), &pgprot); + ptl = pud_lock(vma->vm_mm, vmf->pud); insert_pfn_pud(vma, addr, vmf->pud, pfn, write); + spin_unlock(ptl); + return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud); + +/** + * vmf_insert_folio_pud - insert a pud size folio mapped by a pud entry + * @vmf: Structure describing the fault + * @folio: folio to insert + * @write: whether it's a write fault + * + * Return: vm_fault_t value. + */ +vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, + bool write) +{ + struct vm_area_struct *vma = vmf->vma; + unsigned long addr = vmf->address & PUD_MASK; + pud_t *pud = vmf->pud; + struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; + + if (addr < vma->vm_start || addr >= vma->vm_end) + return VM_FAULT_SIGBUS; + + if (WARN_ON_ONCE(folio_order(folio) != PUD_ORDER)) + return VM_FAULT_SIGBUS; + + ptl = pud_lock(mm, pud); + + /* + * If there is already an entry present we assume the folio is + * already mapped, hence no need to take another reference. We + * still call insert_pfn_pud() though in case the mapping needs + * upgrading to writeable. + */ + if (pud_none(*vmf->pud)) { + folio_get(folio); + folio_add_file_rmap_pud(folio, &folio->page, vma); + add_mm_counter(mm, mm_counter_file(folio), HPAGE_PUD_NR); + } + insert_pfn_pud(vma, addr, vmf->pud, pfn_to_pfn_t(folio_pfn(folio)), + write); + spin_unlock(ptl); + + return VM_FAULT_NOPAGE; +} +EXPORT_SYMBOL_GPL(vmf_insert_folio_pud); #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ void touch_pmd(struct vm_area_struct *vma, unsigned long addr, @@ -1691,13 +1780,13 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, src_folio = page_folio(src_page); folio_get(src_folio); - if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, src_vma))) { + if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, dst_vma, src_vma))) { /* Page maybe pinned: split and retry the fault on PTEs. */ folio_put(src_folio); pte_free(dst_mm, pgtable); spin_unlock(src_ptl); spin_unlock(dst_ptl); - __split_huge_pmd(src_vma, src_pmd, addr, false, NULL); + __split_huge_pmd(src_vma, src_pmd, addr, false); return -EAGAIN; } add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); @@ -1919,7 +2008,7 @@ unlock_fallback: folio_unlock(folio); spin_unlock(vmf->ptl); fallback: - __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); + __split_huge_pmd(vma, vmf->pmd, vmf->address, false); return VM_FAULT_FALLBACK; } @@ -2002,7 +2091,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) spin_unlock(vmf->ptl); writable = false; - if (!migrate_misplaced_folio(folio, vma, target_nid)) { + if (!migrate_misplaced_folio(folio, target_nid)) { flags |= TNF_MIGRATED; nid = target_nid; task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags); @@ -2064,7 +2153,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, * If other processes are mapping this folio, we couldn't discard * the folio unless they all do MADV_FREE so let's skip the folio. */ - if (folio_likely_mapped_shared(folio)) + if (folio_maybe_mapped_shared(folio)) goto out; if (!folio_trylock(folio)) @@ -2134,12 +2223,13 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb->fullmm); arch_check_zapped_pmd(vma, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); - if (vma_is_special_huge(vma)) { + if (!vma_is_dax(vma) && vma_is_special_huge(vma)) { if (arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); spin_unlock(ptl); } else if (is_huge_zero_pmd(orig_pmd)) { - zap_deposited_table(tlb->mm, pmd); + if (!vma_is_dax(vma) || arch_needs_pgtable_deposit()) + zap_deposited_table(tlb->mm, pmd); spin_unlock(ptl); } else { struct folio *folio = NULL; @@ -2170,6 +2260,14 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PMD_NR); + + /* + * Use flush_needed to indicate whether the PMD entry + * is present, instead of checking pmd_present() again. + */ + if (flush_needed && pmd_young(orig_pmd) && + likely(vma_has_recency(vma))) + folio_mark_accessed(folio); } spin_unlock(ptl); @@ -2205,6 +2303,16 @@ static pmd_t move_soft_dirty_pmd(pmd_t pmd) return pmd; } +static pmd_t clear_uffd_wp_pmd(pmd_t pmd) +{ + if (pmd_present(pmd)) + pmd = pmd_clear_uffd_wp(pmd); + else if (is_swap_pmd(pmd)) + pmd = pmd_swp_clear_uffd_wp(pmd); + + return pmd; +} + bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd) { @@ -2243,6 +2351,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, pgtable_trans_huge_deposit(mm, new_pmd, pgtable); } pmd = move_soft_dirty_pmd(pmd); + if (vma_has_uffd_without_event_remap(vma)) + pmd = clear_uffd_wp_pmd(pmd); set_pmd_at(mm, new_addr, new_pmd, pmd); if (force_flush) flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE); @@ -2551,12 +2661,12 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm folio_move_anon_rmap(src_folio, dst_vma); src_folio->index = linear_page_index(dst_vma, dst_addr); - _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot); + _dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot); /* Follow mremap() behavior and treat the entry dirty after the move */ _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma); } else { src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); - _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot); + _dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot); } set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd); @@ -2627,12 +2737,24 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm); arch_check_zapped_pud(vma, orig_pud); tlb_remove_pud_tlb_entry(tlb, pud, addr); - if (vma_is_special_huge(vma)) { + if (!vma_is_dax(vma) && vma_is_special_huge(vma)) { spin_unlock(ptl); /* No zero page support yet */ } else { - /* No support for anonymous PUD pages yet */ - BUG(); + struct page *page = NULL; + struct folio *folio; + + /* No support for anonymous PUD pages or migration yet */ + VM_WARN_ON_ONCE(vma_is_anonymous(vma) || + !pud_present(orig_pud)); + + page = pud_page(orig_pud); + folio = page_folio(page); + folio_remove_rmap_pud(folio, page, vma); + add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PUD_NR); + + spin_unlock(ptl); + tlb_remove_page_size(tlb, page, HPAGE_PUD_SIZE); } return 1; } @@ -2640,6 +2762,10 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud, unsigned long haddr) { + struct folio *folio; + struct page *page; + pud_t old_pud; + VM_BUG_ON(haddr & ~HPAGE_PUD_MASK); VM_BUG_ON_VMA(vma->vm_start > haddr, vma); VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma); @@ -2647,7 +2773,22 @@ static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud, count_vm_event(THP_SPLIT_PUD); - pudp_huge_clear_flush(vma, haddr, pud); + old_pud = pudp_huge_clear_flush(vma, haddr, pud); + + if (!vma_is_dax(vma)) + return; + + page = pud_page(old_pud); + folio = page_folio(page); + + if (!folio_test_dirty(folio) && pud_dirty(old_pud)) + folio_mark_dirty(folio); + if (!folio_test_referenced(folio) && pud_young(old_pud)) + folio_set_referenced(folio); + folio_remove_rmap_pud(folio, page, vma); + folio_put(folio); + add_mm_counter(vma->vm_mm, mm_counter_file(folio), + -HPAGE_PUD_NR); } void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, @@ -2747,13 +2888,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, */ if (arch_needs_pgtable_deposit()) zap_deposited_table(mm, pmd); - if (vma_is_special_huge(vma)) + if (!vma_is_dax(vma) && vma_is_special_huge(vma)) return; if (unlikely(is_pmd_migration_entry(old_pmd))) { swp_entry_t entry; entry = pmd_to_swp_entry(old_pmd); folio = pfn_swap_entry_folio(entry); + } else if (is_huge_zero_pmd(old_pmd)) { + return; } else { page = pmd_page(old_pmd); folio = page_folio(page); @@ -2938,28 +3081,16 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, bool freeze, struct folio *folio) + pmd_t *pmd, bool freeze) { - VM_WARN_ON_ONCE(folio && !folio_test_pmd_mappable(folio)); VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE)); - VM_WARN_ON_ONCE(folio && !folio_test_locked(folio)); - VM_BUG_ON(freeze && !folio); - - /* - * When the caller requests to set up a migration entry, we - * require a folio to check the PMD against. Otherwise, there - * is a risk of replacing the wrong folio. - */ if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) || - is_pmd_migration_entry(*pmd)) { - if (folio && folio != pmd_folio(*pmd)) - return; + is_pmd_migration_entry(*pmd)) __split_huge_pmd_locked(vma, pmd, address, freeze); - } } void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long address, bool freeze, struct folio *folio) + unsigned long address, bool freeze) { spinlock_t *ptl; struct mmu_notifier_range range; @@ -2969,20 +3100,20 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); ptl = pmd_lock(vma->vm_mm, pmd); - split_huge_pmd_locked(vma, range.start, pmd, freeze, folio); + split_huge_pmd_locked(vma, range.start, pmd, freeze); spin_unlock(ptl); mmu_notifier_invalidate_range_end(&range); } void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, - bool freeze, struct folio *folio) + bool freeze) { pmd_t *pmd = mm_find_pmd(vma->vm_mm, address); if (!pmd) return; - __split_huge_pmd(vma, pmd, address, freeze, folio); + __split_huge_pmd(vma, pmd, address, freeze); } static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address) @@ -2994,13 +3125,13 @@ static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) && range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE), ALIGN(address, HPAGE_PMD_SIZE))) - split_huge_pmd_address(vma, address, false, NULL); + split_huge_pmd_address(vma, address, false); } void vma_adjust_trans_huge(struct vm_area_struct *vma, - unsigned long start, - unsigned long end, - long adjust_next) + unsigned long start, + unsigned long end, + struct vm_area_struct *next) { /* Check if we need to split start first. */ split_huge_pmd_if_needed(vma, start); @@ -3008,16 +3139,9 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, /* Check if we need to split end next. */ split_huge_pmd_if_needed(vma, end); - /* - * If we're also updating the next vma vm_start, - * check if we need to split it. - */ - if (adjust_next > 0) { - struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end); - unsigned long nstart = next->vm_start; - nstart += adjust_next; - split_huge_pmd_if_needed(next, nstart); - } + /* If we're incrementing next->vm_start, we might need to split it. */ + if (next) + split_huge_pmd_if_needed(next, end); } static void unmap_folio(struct folio *folio) @@ -3051,8 +3175,12 @@ static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma, int ref_count, map_count; pmd_t orig_pmd = *pmdp; - if (folio_test_dirty(folio) || pmd_dirty(orig_pmd)) + if (pmd_dirty(orig_pmd)) + folio_set_dirty(folio); + if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) { + folio_set_swapbacked(folio); return false; + } orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp); @@ -3079,8 +3207,15 @@ static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma, * * The only folio refs must be one from isolation plus the rmap(s). */ - if (folio_test_dirty(folio) || pmd_dirty(orig_pmd) || - ref_count != map_count + 1) { + if (pmd_dirty(orig_pmd)) + folio_set_dirty(folio); + if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) { + folio_set_swapbacked(folio); + set_pmd_at(mm, addr, pmdp, orig_pmd); + return false; + } + + if (ref_count != map_count + 1) { set_pmd_at(mm, addr, pmdp, orig_pmd); return false; } @@ -3100,12 +3235,11 @@ bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr, { VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio); VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); + VM_WARN_ON_FOLIO(folio_test_swapbacked(folio), folio); VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE)); - if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) - return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio); - - return false; + return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio); } static void remap_page(struct folio *folio, unsigned long nr, int flags) @@ -3124,225 +3258,378 @@ static void remap_page(struct folio *folio, unsigned long nr, int flags) } } -static void lru_add_page_tail(struct folio *folio, struct page *tail, +static void lru_add_split_folio(struct folio *folio, struct folio *new_folio, struct lruvec *lruvec, struct list_head *list) { - VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); - VM_BUG_ON_FOLIO(PageLRU(tail), folio); + VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio); lockdep_assert_held(&lruvec->lru_lock); if (list) { /* page reclaim is reclaiming a huge page */ VM_WARN_ON(folio_test_lru(folio)); - get_page(tail); - list_add_tail(&tail->lru, list); + folio_get(new_folio); + list_add_tail(&new_folio->lru, list); } else { /* head is still on lru (and we have it frozen) */ VM_WARN_ON(!folio_test_lru(folio)); if (folio_test_unevictable(folio)) - tail->mlock_count = 0; + new_folio->mlock_count = 0; else - list_add_tail(&tail->lru, &folio->lru); - SetPageLRU(tail); + list_add_tail(&new_folio->lru, &folio->lru); + folio_set_lru(new_folio); } } -static void __split_huge_page_tail(struct folio *folio, int tail, - struct lruvec *lruvec, struct list_head *list, - unsigned int new_order) +/* Racy check whether the huge page can be split */ +bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins) { - struct page *head = &folio->page; - struct page *page_tail = head + tail; - /* - * Careful: new_folio is not a "real" folio before we cleared PageTail. - * Don't pass it around before clear_compound_head(). - */ - struct folio *new_folio = (struct folio *)page_tail; + int extra_pins; - VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); + /* Additional pins from page cache */ + if (folio_test_anon(folio)) + extra_pins = folio_test_swapcache(folio) ? + folio_nr_pages(folio) : 0; + else + extra_pins = folio_nr_pages(folio); + if (pextra_pins) + *pextra_pins = extra_pins; + return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - + caller_pins; +} + +/* + * It splits @folio into @new_order folios and copies the @folio metadata to + * all the resulting folios. + */ +static void __split_folio_to_order(struct folio *folio, int old_order, + int new_order) +{ + long new_nr_pages = 1 << new_order; + long nr_pages = 1 << old_order; + long i; /* - * Clone page flags before unfreezing refcount. - * - * After successful get_page_unless_zero() might follow flags change, - * for example lock_page() which set PG_waiters. - * - * Note that for mapped sub-pages of an anonymous THP, - * PG_anon_exclusive has been cleared in unmap_folio() and is stored in - * the migration entry instead from where remap_page() will restore it. - * We can still have PG_anon_exclusive set on effectively unmapped and - * unreferenced sub-pages of an anonymous THP: we can simply drop - * PG_anon_exclusive (-> PG_mappedtodisk) for these here. + * Skip the first new_nr_pages, since the new folio from them have all + * the flags from the original folio. */ - page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; - page_tail->flags |= (head->flags & - ((1L << PG_referenced) | - (1L << PG_swapbacked) | - (1L << PG_swapcache) | - (1L << PG_mlocked) | - (1L << PG_uptodate) | - (1L << PG_active) | - (1L << PG_workingset) | - (1L << PG_locked) | - (1L << PG_unevictable) | + for (i = new_nr_pages; i < nr_pages; i += new_nr_pages) { + struct page *new_head = &folio->page + i; + + /* + * Careful: new_folio is not a "real" folio before we cleared PageTail. + * Don't pass it around before clear_compound_head(). + */ + struct folio *new_folio = (struct folio *)new_head; + + VM_BUG_ON_PAGE(atomic_read(&new_folio->_mapcount) != -1, new_head); + + /* + * Clone page flags before unfreezing refcount. + * + * After successful get_page_unless_zero() might follow flags change, + * for example lock_page() which set PG_waiters. + * + * Note that for mapped sub-pages of an anonymous THP, + * PG_anon_exclusive has been cleared in unmap_folio() and is stored in + * the migration entry instead from where remap_page() will restore it. + * We can still have PG_anon_exclusive set on effectively unmapped and + * unreferenced sub-pages of an anonymous THP: we can simply drop + * PG_anon_exclusive (-> PG_mappedtodisk) for these here. + */ + new_folio->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + new_folio->flags |= (folio->flags & + ((1L << PG_referenced) | + (1L << PG_swapbacked) | + (1L << PG_swapcache) | + (1L << PG_mlocked) | + (1L << PG_uptodate) | + (1L << PG_active) | + (1L << PG_workingset) | + (1L << PG_locked) | + (1L << PG_unevictable) | #ifdef CONFIG_ARCH_USES_PG_ARCH_2 - (1L << PG_arch_2) | + (1L << PG_arch_2) | #endif #ifdef CONFIG_ARCH_USES_PG_ARCH_3 - (1L << PG_arch_3) | + (1L << PG_arch_3) | #endif - (1L << PG_dirty) | - LRU_GEN_MASK | LRU_REFS_MASK)); + (1L << PG_dirty) | + LRU_GEN_MASK | LRU_REFS_MASK)); - /* ->mapping in first and second tail page is replaced by other uses */ - VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, - page_tail); - new_folio->mapping = folio->mapping; - new_folio->index = folio->index + tail; + new_folio->mapping = folio->mapping; + new_folio->index = folio->index + i; - /* - * page->private should not be set in tail pages. Fix up and warn once - * if private is unexpectedly set. - */ - if (unlikely(page_tail->private)) { - VM_WARN_ON_ONCE_PAGE(true, page_tail); - page_tail->private = 0; - } - if (folio_test_swapcache(folio)) - new_folio->swap.val = folio->swap.val + tail; + /* + * page->private should not be set in tail pages. Fix up and warn once + * if private is unexpectedly set. + */ + if (unlikely(new_folio->private)) { + VM_WARN_ON_ONCE_PAGE(true, new_head); + new_folio->private = NULL; + } - /* Page flags must be visible before we make the page non-compound. */ - smp_wmb(); + if (folio_test_swapcache(folio)) + new_folio->swap.val = folio->swap.val + i; - /* - * Clear PageTail before unfreezing page refcount. - * - * After successful get_page_unless_zero() might follow put_page() - * which needs correct compound_head(). - */ - clear_compound_head(page_tail); - if (new_order) { - prep_compound_page(page_tail, new_order); - folio_set_large_rmappable(new_folio); - } + /* Page flags must be visible before we make the page non-compound. */ + smp_wmb(); - /* Finally unfreeze refcount. Additional reference from page cache. */ - page_ref_unfreeze(page_tail, - 1 + ((!folio_test_anon(folio) || folio_test_swapcache(folio)) ? - folio_nr_pages(new_folio) : 0)); + /* + * Clear PageTail before unfreezing page refcount. + * + * After successful get_page_unless_zero() might follow put_page() + * which needs correct compound_head(). + */ + clear_compound_head(new_head); + if (new_order) { + prep_compound_page(new_head, new_order); + folio_set_large_rmappable(new_folio); + } - if (folio_test_young(folio)) - folio_set_young(new_folio); - if (folio_test_idle(folio)) - folio_set_idle(new_folio); + if (folio_test_young(folio)) + folio_set_young(new_folio); + if (folio_test_idle(folio)) + folio_set_idle(new_folio); +#ifdef CONFIG_MEMCG + new_folio->memcg_data = folio->memcg_data; +#endif - folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio)); + folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio)); + } - /* - * always add to the tail because some iterators expect new - * pages to show after the currently processed elements - e.g. - * migrate_pages - */ - lru_add_page_tail(folio, page_tail, lruvec, list); + if (new_order) + folio_set_order(folio, new_order); + else + ClearPageCompound(&folio->page); } -static void __split_huge_page(struct page *page, struct list_head *list, - pgoff_t end, unsigned int new_order) +/* + * It splits an unmapped @folio to lower order smaller folios in two ways. + * @folio: the to-be-split folio + * @new_order: the smallest order of the after split folios (since buddy + * allocator like split generates folios with orders from @folio's + * order - 1 to new_order). + * @split_at: in buddy allocator like split, the folio containing @split_at + * will be split until its order becomes @new_order. + * @lock_at: the folio containing @lock_at is left locked for caller. + * @list: the after split folios will be added to @list if it is not NULL, + * otherwise to LRU lists. + * @end: the end of the file @folio maps to. -1 if @folio is anonymous memory. + * @xas: xa_state pointing to folio->mapping->i_pages and locked by caller + * @mapping: @folio->mapping + * @uniform_split: if the split is uniform or not (buddy allocator like split) + * + * + * 1. uniform split: the given @folio into multiple @new_order small folios, + * where all small folios have the same order. This is done when + * uniform_split is true. + * 2. buddy allocator like (non-uniform) split: the given @folio is split into + * half and one of the half (containing the given page) is split into half + * until the given @page's order becomes @new_order. This is done when + * uniform_split is false. + * + * The high level flow for these two methods are: + * 1. uniform split: a single __split_folio_to_order() is called to split the + * @folio into @new_order, then we traverse all the resulting folios one by + * one in PFN ascending order and perform stats, unfreeze, adding to list, + * and file mapping index operations. + * 2. non-uniform split: in general, folio_order - @new_order calls to + * __split_folio_to_order() are made in a for loop to split the @folio + * to one lower order at a time. The resulting small folios are processed + * like what is done during the traversal in 1, except the one containing + * @page, which is split in next for loop. + * + * After splitting, the caller's folio reference will be transferred to the + * folio containing @page. The other folios may be freed if they are not mapped. + * + * In terms of locking, after splitting, + * 1. uniform split leaves @page (or the folio contains it) locked; + * 2. buddy allocator like (non-uniform) split leaves @folio locked. + * + * + * For !uniform_split, when -ENOMEM is returned, the original folio might be + * split. The caller needs to check the input folio. + */ +static int __split_unmapped_folio(struct folio *folio, int new_order, + struct page *split_at, struct page *lock_at, + struct list_head *list, pgoff_t end, + struct xa_state *xas, struct address_space *mapping, + bool uniform_split) { - struct folio *folio = page_folio(page); - struct page *head = &folio->page; struct lruvec *lruvec; struct address_space *swap_cache = NULL; - unsigned long offset = 0; - int i, nr_dropped = 0; - unsigned int new_nr = 1 << new_order; + struct folio *origin_folio = folio; + struct folio *next_folio = folio_next(folio); + struct folio *new_folio; + struct folio *next; int order = folio_order(folio); - unsigned int nr = 1 << order; + int split_order; + int start_order = uniform_split ? new_order : order - 1; + int nr_dropped = 0; + int ret = 0; + bool stop_split = false; - /* complete memcg works before add pages to LRU */ - split_page_memcg(head, order, new_order); + if (folio_test_swapcache(folio)) { + VM_BUG_ON(mapping); + + /* a swapcache folio can only be uniformly split to order-0 */ + if (!uniform_split || new_order != 0) + return -EINVAL; - if (folio_test_anon(folio) && folio_test_swapcache(folio)) { - offset = swap_cache_index(folio->swap); swap_cache = swap_address_space(folio->swap); xa_lock(&swap_cache->i_pages); } + if (folio_test_anon(folio)) + mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); + /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ lruvec = folio_lruvec_lock(folio); - ClearPageHasHWPoisoned(head); - - for (i = nr - new_nr; i >= new_nr; i -= new_nr) { - struct folio *tail; - __split_huge_page_tail(folio, i, lruvec, list, new_order); - tail = page_folio(head + i); - /* Some pages can be beyond EOF: drop them from page cache */ - if (tail->index >= end) { - if (shmem_mapping(folio->mapping)) - nr_dropped++; - else if (folio_test_clear_dirty(tail)) - folio_account_cleaned(tail, - inode_to_wb(folio->mapping->host)); - __filemap_remove_folio(tail, NULL); - folio_put(tail); - } else if (!folio_test_anon(folio)) { - __xa_store(&folio->mapping->i_pages, tail->index, - tail, 0); - } else if (swap_cache) { - __xa_store(&swap_cache->i_pages, offset + i, - tail, 0); + folio_clear_has_hwpoisoned(folio); + + /* + * split to new_order one order at a time. For uniform split, + * folio is split to new_order directly. + */ + for (split_order = start_order; + split_order >= new_order && !stop_split; + split_order--) { + int old_order = folio_order(folio); + struct folio *release; + struct folio *end_folio = folio_next(folio); + + /* order-1 anonymous folio is not supported */ + if (folio_test_anon(folio) && split_order == 1) + continue; + if (uniform_split && split_order != new_order) + continue; + + if (mapping) { + /* + * uniform split has xas_split_alloc() called before + * irq is disabled to allocate enough memory, whereas + * non-uniform split can handle ENOMEM. + */ + if (uniform_split) + xas_split(xas, folio, old_order); + else { + xas_set_order(xas, folio->index, split_order); + xas_try_split(xas, folio, old_order); + if (xas_error(xas)) { + ret = xas_error(xas); + stop_split = true; + goto after_split; + } + } } - } - if (!new_order) - ClearPageCompound(head); - else { - struct folio *new_folio = (struct folio *)head; + folio_split_memcg_refs(folio, old_order, split_order); + split_page_owner(&folio->page, old_order, split_order); + pgalloc_tag_split(folio, old_order, split_order); - folio_set_order(new_folio, new_order); - } - unlock_page_lruvec(lruvec); - /* Caller disabled irqs, so they are still disabled here */ + __split_folio_to_order(folio, old_order, split_order); - split_page_owner(head, order, new_order); - pgalloc_tag_split(folio, order, new_order); +after_split: + /* + * Iterate through after-split folios and perform related + * operations. But in buddy allocator like split, the folio + * containing the specified page is skipped until its order + * is new_order, since the folio will be worked on in next + * iteration. + */ + for (release = folio; release != end_folio; release = next) { + next = folio_next(release); + /* + * for buddy allocator like split, the folio containing + * page will be split next and should not be released, + * until the folio's order is new_order or stop_split + * is set to true by the above xas_split() failure. + */ + if (release == page_folio(split_at)) { + folio = release; + if (split_order != new_order && !stop_split) + continue; + } + if (folio_test_anon(release)) { + mod_mthp_stat(folio_order(release), + MTHP_STAT_NR_ANON, 1); + } - /* See comment in __split_huge_page_tail() */ - if (folio_test_anon(folio)) { - /* Additional pin to swap cache */ - if (folio_test_swapcache(folio)) { - folio_ref_add(folio, 1 + new_nr); - xa_unlock(&swap_cache->i_pages); - } else { - folio_ref_inc(folio); + /* + * origin_folio should be kept frozon until page cache + * entries are updated with all the other after-split + * folios to prevent others seeing stale page cache + * entries. + */ + if (release == origin_folio) + continue; + + folio_ref_unfreeze(release, 1 + + ((mapping || swap_cache) ? + folio_nr_pages(release) : 0)); + + lru_add_split_folio(origin_folio, release, lruvec, + list); + + /* Some pages can be beyond EOF: drop them from cache */ + if (release->index >= end) { + if (shmem_mapping(mapping)) + nr_dropped += folio_nr_pages(release); + else if (folio_test_clear_dirty(release)) + folio_account_cleaned(release, + inode_to_wb(mapping->host)); + __filemap_remove_folio(release, NULL); + folio_put_refs(release, folio_nr_pages(release)); + } else if (mapping) { + __xa_store(&mapping->i_pages, + release->index, release, 0); + } else if (swap_cache) { + __xa_store(&swap_cache->i_pages, + swap_cache_index(release->swap), + release, 0); + } } - } else { - /* Additional pin to page cache */ - folio_ref_add(folio, 1 + new_nr); - xa_unlock(&folio->mapping->i_pages); } + + /* + * Unfreeze origin_folio only after all page cache entries, which used + * to point to it, have been updated with new folios. Otherwise, + * a parallel folio_try_get() can grab origin_folio and its caller can + * see stale page cache entries. + */ + folio_ref_unfreeze(origin_folio, 1 + + ((mapping || swap_cache) ? folio_nr_pages(origin_folio) : 0)); + + unlock_page_lruvec(lruvec); + + if (swap_cache) + xa_unlock(&swap_cache->i_pages); + if (mapping) + xa_unlock(&mapping->i_pages); + + /* Caller disabled irqs, so they are still disabled here */ local_irq_enable(); if (nr_dropped) - shmem_uncharge(folio->mapping->host, nr_dropped); - remap_page(folio, nr, PageAnon(head) ? RMP_USE_SHARED_ZEROPAGE : 0); + shmem_uncharge(mapping->host, nr_dropped); + + remap_page(origin_folio, 1 << order, + folio_test_anon(origin_folio) ? + RMP_USE_SHARED_ZEROPAGE : 0); /* - * set page to its compound_head when split to non order-0 pages, so - * we can skip unlocking it below, since PG_locked is transferred to - * the compound_head of the page and the caller will unlock it. + * At this point, folio should contain the specified page. + * For uniform split, it is left for caller to unlock. + * For buddy allocator like split, the first after-split folio is left + * for caller to unlock. */ - if (new_order) - page = compound_head(page); - - for (i = 0; i < nr; i += new_nr) { - struct page *subpage = head + i; - struct folio *new_folio = page_folio(subpage); - if (subpage == page) + for (new_folio = origin_folio; new_folio != next_folio; new_folio = next) { + next = folio_next(new_folio); + if (new_folio == page_folio(lock_at)) continue; - folio_unlock(new_folio); + folio_unlock(new_folio); /* * Subpages may be freed if there wasn't any mapping * like if add_to_swap() is running on a lru page that @@ -3350,81 +3637,90 @@ static void __split_huge_page(struct page *page, struct list_head *list, * requires taking the lru_lock so we do the put_page * of the tail pages after the split is complete. */ - free_page_and_swap_cache(subpage); + free_folio_and_swap_cache(new_folio); } + return ret; } -/* Racy check whether the huge page can be split */ -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins) +bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, + bool warns) { - int extra_pins; + if (folio_test_anon(folio)) { + /* order-1 is not supported for anonymous THP. */ + VM_WARN_ONCE(warns && new_order == 1, + "Cannot split to order-1 folio"); + return new_order != 1; + } else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && + !mapping_large_folio_support(folio->mapping)) { + /* + * No split if the file system does not support large folio. + * Note that we might still have THPs in such mappings due to + * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping + * does not actually support large folios properly. + */ + VM_WARN_ONCE(warns, + "Cannot split file folio to non-0 order"); + return false; + } - /* Additional pins from page cache */ - if (folio_test_anon(folio)) - extra_pins = folio_test_swapcache(folio) ? - folio_nr_pages(folio) : 0; - else - extra_pins = folio_nr_pages(folio); - if (pextra_pins) - *pextra_pins = extra_pins; - return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - - caller_pins; + /* Only swapping a whole PMD-mapped folio is supported */ + if (folio_test_swapcache(folio)) { + VM_WARN_ONCE(warns, + "Cannot split swapcache folio to non-0 order"); + return false; + } + + return true; +} + +/* See comments in non_uniform_split_supported() */ +bool uniform_split_supported(struct folio *folio, unsigned int new_order, + bool warns) +{ + if (folio_test_anon(folio)) { + VM_WARN_ONCE(warns && new_order == 1, + "Cannot split to order-1 folio"); + return new_order != 1; + } else if (new_order) { + if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && + !mapping_large_folio_support(folio->mapping)) { + VM_WARN_ONCE(warns, + "Cannot split file folio to non-0 order"); + return false; + } + } + + if (new_order && folio_test_swapcache(folio)) { + VM_WARN_ONCE(warns, + "Cannot split swapcache folio to non-0 order"); + return false; + } + + return true; } /* - * This function splits a large folio into smaller folios of order @new_order. - * @page can point to any page of the large folio to split. The split operation - * does not change the position of @page. + * __folio_split: split a folio at @split_at to a @new_order folio + * @folio: folio to split + * @new_order: the order of the new folio + * @split_at: a page within the new folio + * @lock_at: a page within @folio to be left locked to caller + * @list: after-split folios will be put on it if non NULL + * @uniform_split: perform uniform split or not (non-uniform split) * - * Prerequisites: - * - * 1) The caller must hold a reference on the @page's owning folio, also known - * as the large folio. + * It calls __split_unmapped_folio() to perform uniform and non-uniform split. + * It is in charge of checking whether the split is supported or not and + * preparing @folio for __split_unmapped_folio(). * - * 2) The large folio must be locked. - * - * 3) The folio must not be pinned. Any unexpected folio references, including - * GUP pins, will result in the folio not getting split; instead, the caller - * will receive an -EAGAIN. - * - * 4) @new_order > 1, usually. Splitting to order-1 anonymous folios is not - * supported for non-file-backed folios, because folio->_deferred_list, which - * is used by partially mapped folios, is stored in subpage 2, but an order-1 - * folio only has subpages 0 and 1. File-backed order-1 folios are supported, - * since they do not use _deferred_list. - * - * After splitting, the caller's folio reference will be transferred to @page, - * resulting in a raised refcount of @page after this call. The other pages may - * be freed if they are not mapped. - * - * If @list is null, tail pages will be added to LRU list, otherwise, to @list. - * - * Pages in @new_order will inherit the mapping, flags, and so on from the - * huge page. - * - * Returns 0 if the huge page was split successfully. - * - * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if - * the folio was concurrently removed from the page cache. - * - * Returns -EBUSY when trying to split the huge zeropage, if the folio is - * under writeback, if fs-specific folio metadata cannot currently be - * released, or if some unexpected race happened (e.g., anon VMA disappeared, - * truncation). - * - * Callers should ensure that the order respects the address space mapping - * min-order if one is set for non-anonymous folios. - * - * Returns -EINVAL when trying to split to an order that is incompatible - * with the folio. Splitting to order 0 is compatible with all folios. + * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be + * split but not to @new_order, the caller needs to check) */ -int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, - unsigned int new_order) +static int __folio_split(struct folio *folio, unsigned int new_order, + struct page *split_at, struct page *lock_at, + struct list_head *list, bool uniform_split) { - struct folio *folio = page_folio(page); struct deferred_split *ds_queue = get_deferred_split_queue(folio); - /* reset xarray order to new order after split */ - XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order); + XA_STATE(xas, &folio->mapping->i_pages, folio->index); bool is_anon = folio_test_anon(folio); struct address_space *mapping = NULL; struct anon_vma *anon_vma = NULL; @@ -3436,38 +3732,17 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + if (folio != page_folio(split_at) || folio != page_folio(lock_at)) + return -EINVAL; + if (new_order >= folio_order(folio)) return -EINVAL; - if (is_anon) { - /* order-1 is not supported for anonymous THP. */ - if (new_order == 1) { - VM_WARN_ONCE(1, "Cannot split to order-1 folio"); - return -EINVAL; - } - } else if (new_order) { - /* Split shmem folio to non-zero order not supported */ - if (shmem_mapping(folio->mapping)) { - VM_WARN_ONCE(1, - "Cannot split shmem folio to non-0 order"); - return -EINVAL; - } - /* - * No split if the file system does not support large folio. - * Note that we might still have THPs in such mappings due to - * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping - * does not actually support large folios properly. - */ - if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && - !mapping_large_folio_support(folio->mapping)) { - VM_WARN_ONCE(1, - "Cannot split file folio to non-0 order"); - return -EINVAL; - } - } + if (uniform_split && !uniform_split_supported(folio, new_order, true)) + return -EINVAL; - /* Only swapping a whole PMD-mapped folio is supported */ - if (folio_test_swapcache(folio) && new_order) + if (!uniform_split && + !non_uniform_split_supported(folio, new_order, true)) return -EINVAL; is_hzp = is_huge_zero_folio(folio); @@ -3503,6 +3778,11 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, mapping = folio->mapping; /* Truncated ? */ + /* + * TODO: add support for large shmem folio in swap cache. + * When shmem is in swap cache, mapping is NULL and + * folio_test_swapcache() is true. + */ if (!mapping) { ret = -EBUSY; goto out; @@ -3524,21 +3804,24 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, goto out; } - xas_split_alloc(&xas, folio, folio_order(folio), gfp); - if (xas_error(&xas)) { - ret = xas_error(&xas); - goto out; + if (uniform_split) { + xas_set_order(&xas, folio->index, new_order); + xas_split_alloc(&xas, folio, folio_order(folio), gfp); + if (xas_error(&xas)) { + ret = xas_error(&xas); + goto out; + } } anon_vma = NULL; i_mmap_lock_read(mapping); /* - *__split_huge_page() may need to trim off pages beyond EOF: - * but on 32-bit, i_size_read() takes an irq-unsafe seqlock, - * which cannot be nested inside the page tree lock. So note - * end now: i_size itself may be changed at any moment, but - * folio lock is good enough to serialize the trimming. + *__split_unmapped_folio() may need to trim off pages beyond + * EOF: but on 32-bit, i_size_read() takes an irq-unsafe + * seqlock, which cannot be nested inside the page tree lock. + * So note end now: i_size itself may be changed at any moment, + * but folio lock is good enough to serialize the trimming. */ end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); if (shmem_mapping(mapping)) @@ -3576,7 +3859,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, !list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; if (folio_test_partially_mapped(folio)) { - __folio_clear_partially_mapped(folio); + folio_clear_partially_mapped(folio); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } @@ -3592,7 +3875,6 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, if (mapping) { int nr = folio_nr_pages(folio); - xas_split(&xas, folio, folio_order(folio)); if (folio_test_pmd_mappable(folio) && new_order < HPAGE_PMD_ORDER) { if (folio_test_swapbacked(folio)) { @@ -3606,12 +3888,9 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, } } - if (is_anon) { - mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); - mod_mthp_stat(new_order, MTHP_STAT_NR_ANON, 1 << (order - new_order)); - } - __split_huge_page(page, list, end, new_order); - ret = 0; + ret = __split_unmapped_folio(folio, new_order, + split_at, lock_at, list, end, &xas, mapping, + uniform_split); } else { spin_unlock(&ds_queue->split_queue_lock); fail: @@ -3637,6 +3916,90 @@ out: return ret; } +/* + * This function splits a large folio into smaller folios of order @new_order. + * @page can point to any page of the large folio to split. The split operation + * does not change the position of @page. + * + * Prerequisites: + * + * 1) The caller must hold a reference on the @page's owning folio, also known + * as the large folio. + * + * 2) The large folio must be locked. + * + * 3) The folio must not be pinned. Any unexpected folio references, including + * GUP pins, will result in the folio not getting split; instead, the caller + * will receive an -EAGAIN. + * + * 4) @new_order > 1, usually. Splitting to order-1 anonymous folios is not + * supported for non-file-backed folios, because folio->_deferred_list, which + * is used by partially mapped folios, is stored in subpage 2, but an order-1 + * folio only has subpages 0 and 1. File-backed order-1 folios are supported, + * since they do not use _deferred_list. + * + * After splitting, the caller's folio reference will be transferred to @page, + * resulting in a raised refcount of @page after this call. The other pages may + * be freed if they are not mapped. + * + * If @list is null, tail pages will be added to LRU list, otherwise, to @list. + * + * Pages in @new_order will inherit the mapping, flags, and so on from the + * huge page. + * + * Returns 0 if the huge page was split successfully. + * + * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if + * the folio was concurrently removed from the page cache. + * + * Returns -EBUSY when trying to split the huge zeropage, if the folio is + * under writeback, if fs-specific folio metadata cannot currently be + * released, or if some unexpected race happened (e.g., anon VMA disappeared, + * truncation). + * + * Callers should ensure that the order respects the address space mapping + * min-order if one is set for non-anonymous folios. + * + * Returns -EINVAL when trying to split to an order that is incompatible + * with the folio. Splitting to order 0 is compatible with all folios. + */ +int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, + unsigned int new_order) +{ + struct folio *folio = page_folio(page); + + return __folio_split(folio, new_order, &folio->page, page, list, true); +} + +/* + * folio_split: split a folio at @split_at to a @new_order folio + * @folio: folio to split + * @new_order: the order of the new folio + * @split_at: a page within the new folio + * + * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be + * split but not to @new_order, the caller needs to check) + * + * It has the same prerequisites and returns as + * split_huge_page_to_list_to_order(). + * + * Split a folio at @split_at to a new_order folio, leave the + * remaining subpages of the original folio as large as possible. For example, + * in the case of splitting an order-9 folio at its third order-3 subpages to + * an order-3 folio, there are 2^(9-3)=64 order-3 subpages in the order-9 folio. + * After the split, there will be a group of folios with different orders and + * the new folio containing @split_at is marked in bracket: + * [order-4, {order-3}, order-3, order-5, order-6, order-7, order-8]. + * + * After split, folio is left locked for caller. + */ +int folio_split(struct folio *folio, unsigned int new_order, + struct page *split_at, struct list_head *list) +{ + return __folio_split(folio, new_order, split_at, &folio->page, list, + false); +} + int min_order_for_split(struct folio *folio) { if (folio_test_anon(folio)) @@ -3688,7 +4051,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio) if (!list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; if (folio_test_partially_mapped(folio)) { - __folio_clear_partially_mapped(folio); + folio_clear_partially_mapped(folio); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } @@ -3721,7 +4084,7 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped) /* * Exclude swapcache: originally to avoid a corrupt deferred split - * queue. Nowadays that is fully prevented by mem_cgroup_swapout(); + * queue. Nowadays that is fully prevented by memcg1_swapout(); * but if page reclaim is already handling the same folio, it is * unnecessary to handle it again in the shrinker, so excluding * swapcache here may still be a useful optimization. @@ -3732,7 +4095,7 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped) spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (partially_mapped) { if (!folio_test_partially_mapped(folio)) { - __folio_set_partially_mapped(folio); + folio_set_partially_mapped(folio); if (folio_test_pmd_mappable(folio)) count_vm_event(THP_DEFERRED_SPLIT_PAGE); count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED); @@ -3825,7 +4188,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, } else { /* We lost race with folio_put() */ if (folio_test_partially_mapped(folio)) { - __folio_clear_partially_mapped(folio); + folio_clear_partially_mapped(folio); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } @@ -3956,7 +4319,8 @@ static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma) } static int split_huge_pages_pid(int pid, unsigned long vaddr_start, - unsigned long vaddr_end, unsigned int new_order) + unsigned long vaddr_end, unsigned int new_order, + long in_folio_offset) { int ret = 0; struct task_struct *task; @@ -4040,8 +4404,16 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, if (!folio_test_anon(folio) && folio->mapping != mapping) goto unlock; - if (!split_folio_to_order(folio, target_order)) - split++; + if (in_folio_offset < 0 || + in_folio_offset >= folio_nr_pages(folio)) { + if (!split_folio_to_order(folio, target_order)) + split++; + } else { + struct page *split_at = folio_page(folio, + in_folio_offset); + if (!folio_split(folio, target_order, split_at, NULL)) + split++; + } unlock: @@ -4064,7 +4436,8 @@ out: } static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, - pgoff_t off_end, unsigned int new_order) + pgoff_t off_end, unsigned int new_order, + long in_folio_offset) { struct filename *file; struct file *candidate; @@ -4113,8 +4486,15 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, if (folio->mapping != mapping) goto unlock; - if (!split_folio_to_order(folio, target_order)) - split++; + if (in_folio_offset < 0 || in_folio_offset >= nr_pages) { + if (!split_folio_to_order(folio, target_order)) + split++; + } else { + struct page *split_at = folio_page(folio, + in_folio_offset); + if (!folio_split(folio, target_order, split_at, NULL)) + split++; + } unlock: folio_unlock(folio); @@ -4147,6 +4527,7 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, int pid; unsigned long vaddr_start, vaddr_end; unsigned int new_order = 0; + long in_folio_offset = -1; ret = mutex_lock_interruptible(&split_debug_mutex); if (ret) @@ -4162,42 +4543,46 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, if (input_buf[0] == '/') { char *tok; - char *buf = input_buf; + char *tok_buf = input_buf; char file_path[MAX_INPUT_BUF_SZ]; pgoff_t off_start = 0, off_end = 0; size_t input_len = strlen(input_buf); - tok = strsep(&buf, ","); - if (tok) { + tok = strsep(&tok_buf, ","); + if (tok && tok_buf) { strscpy(file_path, tok); } else { ret = -EINVAL; goto out; } - ret = sscanf(buf, "0x%lx,0x%lx,%d", &off_start, &off_end, &new_order); - if (ret != 2 && ret != 3) { + ret = sscanf(tok_buf, "0x%lx,0x%lx,%d,%ld", &off_start, &off_end, + &new_order, &in_folio_offset); + if (ret != 2 && ret != 3 && ret != 4) { ret = -EINVAL; goto out; } - ret = split_huge_pages_in_file(file_path, off_start, off_end, new_order); + ret = split_huge_pages_in_file(file_path, off_start, off_end, + new_order, in_folio_offset); if (!ret) ret = input_len; goto out; } - ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d", &pid, &vaddr_start, &vaddr_end, &new_order); + ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d,%ld", &pid, &vaddr_start, + &vaddr_end, &new_order, &in_folio_offset); if (ret == 1 && pid == 1) { split_huge_pages_all(); ret = strlen(input_buf); goto out; - } else if (ret != 3 && ret != 4) { + } else if (ret != 3 && ret != 4 && ret != 5) { ret = -EINVAL; goto out; } - ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order); + ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order, + in_folio_offset); if (!ret) ret = strlen(input_buf); out: @@ -4286,7 +4671,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) entry = pmd_to_swp_entry(*pvmw->pmd); folio_get(folio); - pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot)); + pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot)); if (pmd_swp_soft_dirty(*pvmw->pmd)) pmde = pmd_mksoft_dirty(pmde); if (is_writable_migration_entry(entry)) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ea2ed8e301ef..9dc95eac558c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -14,9 +14,11 @@ #include <linux/pagemap.h> #include <linux/mempolicy.h> #include <linux/compiler.h> +#include <linux/cpumask.h> #include <linux/cpuset.h> #include <linux/mutex.h> #include <linux/memblock.h> +#include <linux/minmax.h> #include <linux/sysfs.h> #include <linux/slab.h> #include <linux/sched/mm.h> @@ -40,6 +42,7 @@ #include <asm/page.h> #include <asm/pgalloc.h> #include <asm/tlb.h> +#include <asm/setup.h> #include <linux/io.h> #include <linux/hugetlb.h> @@ -48,18 +51,34 @@ #include <linux/page_owner.h> #include "internal.h" #include "hugetlb_vmemmap.h" +#include "hugetlb_cma.h" +#include <linux/page-isolation.h> int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; -#ifdef CONFIG_CMA -static struct cma *hugetlb_cma[MAX_NUMNODES]; -static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata; -#endif -static unsigned long hugetlb_cma_size __initdata; - +__initdata nodemask_t hugetlb_bootmem_nodes; __initdata struct list_head huge_boot_pages[MAX_NUMNODES]; +static unsigned long hstate_boot_nrinvalid[HUGE_MAX_HSTATE] __initdata; + +/* + * Due to ordering constraints across the init code for various + * architectures, hugetlb hstate cmdline parameters can't simply + * be early_param. early_param might call the setup function + * before valid hugetlb page sizes are determined, leading to + * incorrect rejection of valid hugepagesz= options. + * + * So, record the parameters early and consume them whenever the + * init code is ready for them, by calling hugetlb_parse_params(). + */ + +/* one (hugepagesz=,hugepages=) pair per hstate, one default_hugepagesz */ +#define HUGE_MAX_CMDLINE_ARGS (2 * HUGE_MAX_HSTATE + 1) +struct hugetlb_cmdline { + char *val; + int (*setup)(char *val); +}; /* for command line parsing */ static struct hstate * __initdata parsed_hstate; @@ -67,6 +86,21 @@ static unsigned long __initdata default_hstate_max_huge_pages; static bool __initdata parsed_valid_hugepagesz = true; static bool __initdata parsed_default_hugepagesz; static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata; +static unsigned long hugepage_allocation_threads __initdata; + +static char hstate_cmdline_buf[COMMAND_LINE_SIZE] __initdata; +static int hstate_cmdline_index __initdata; +static struct hugetlb_cmdline hugetlb_params[HUGE_MAX_CMDLINE_ARGS] __initdata; +static int hugetlb_param_index __initdata; +static __init int hugetlb_add_param(char *s, int (*setup)(char *val)); +static __init void hugetlb_parse_params(void); + +#define hugetlb_early_param(str, func) \ +static __init int func##args(char *s) \ +{ \ + return hugetlb_add_param(s, func); \ +} \ +early_param(str, func##args) /* * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, @@ -87,17 +121,16 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma); static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); static void hugetlb_unshare_pmds(struct vm_area_struct *vma, - unsigned long start, unsigned long end); + unsigned long start, unsigned long end, bool take_locks); static struct resv_map *vma_resv_map(struct vm_area_struct *vma); static void hugetlb_free_folio(struct folio *folio) { -#ifdef CONFIG_CMA - int nid = folio_nid(folio); - - if (cma_free_folio(hugetlb_cma[nid], folio)) + if (folio_test_hugetlb_cma(folio)) { + hugetlb_cma_free_folio(folio); return; -#endif + } + folio_put(folio); } @@ -1218,7 +1251,7 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma) /* * Reset and decrement one ref on hugepage private reservation. * Called with mm->mmap_lock writer semaphore held. - * This function should be only used by move_vma() and operate on + * This function should be only used by mremap and operate on * same sized vma. It should never come here with last ref on the * reservation. */ @@ -1246,69 +1279,6 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma) hugetlb_dup_vma_private(vma); } -/* Returns true if the VMA has associated reserve pages */ -static bool vma_has_reserves(struct vm_area_struct *vma, long chg) -{ - if (vma->vm_flags & VM_NORESERVE) { - /* - * This address is already reserved by other process(chg == 0), - * so, we should decrement reserved count. Without decrementing, - * reserve count remains after releasing inode, because this - * allocated page will go into page cache and is regarded as - * coming from reserved pool in releasing step. Currently, we - * don't have any other solution to deal with this situation - * properly, so add work-around here. - */ - if (vma->vm_flags & VM_MAYSHARE && chg == 0) - return true; - else - return false; - } - - /* Shared mappings always use reserves */ - if (vma->vm_flags & VM_MAYSHARE) { - /* - * We know VM_NORESERVE is not set. Therefore, there SHOULD - * be a region map for all pages. The only situation where - * there is no region map is if a hole was punched via - * fallocate. In this case, there really are no reserves to - * use. This situation is indicated if chg != 0. - */ - if (chg) - return false; - else - return true; - } - - /* - * Only the process that called mmap() has reserves for - * private mappings. - */ - if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { - /* - * Like the shared case above, a hole punch or truncate - * could have been performed on the private mapping. - * Examine the value of chg to determine if reserves - * actually exist or were previously consumed. - * Very Subtle - The value of chg comes from a previous - * call to vma_needs_reserves(). The reserve map for - * private mappings has different (opposite) semantics - * than that of shared mappings. vma_needs_reserves() - * has already taken this difference in semantics into - * account. Therefore, the meaning of chg is the same - * as in the shared case above. Code could easily be - * combined, but keeping it separate draws attention to - * subtle differences. - */ - if (chg) - return false; - else - return true; - } - - return false; -} - static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio) { int nid = folio_nid(folio); @@ -1336,6 +1306,9 @@ static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h, if (folio_test_hwpoison(folio)) continue; + if (is_migrate_isolate_page(&folio->page)) + continue; + list_move(&folio->lru, &h->hugepage_activelist); folio_ref_unfreeze(folio, 1); folio_clear_hugetlb_freed(folio); @@ -1394,8 +1367,7 @@ static unsigned long available_huge_pages(struct hstate *h) static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, - unsigned long address, int avoid_reserve, - long chg) + unsigned long address, long gbl_chg) { struct folio *folio = NULL; struct mempolicy *mpol; @@ -1404,15 +1376,10 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, int nid; /* - * A child process with MAP_PRIVATE mappings created by their parent - * have no page reserves. This check ensures that reservations are - * not "stolen". The child may still get SIGKILLed + * gbl_chg==1 means the allocation requires a new page that was not + * reserved before. Making sure there's at least one free page. */ - if (!vma_has_reserves(vma, chg) && !available_huge_pages(h)) - goto err; - - /* If reserves cannot be used, ensure enough pages are in the pool */ - if (avoid_reserve && !available_huge_pages(h)) + if (gbl_chg && !available_huge_pages(h)) goto err; gfp_mask = htlb_alloc_mask(h); @@ -1430,11 +1397,6 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, nid, nodemask); - if (folio && !avoid_reserve && vma_has_reserves(vma, chg)) { - folio_set_hugetlb_restore_reserve(folio); - h->resv_huge_pages--; - } - mpol_cond_put(mpol); return folio; @@ -1525,27 +1487,11 @@ static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, if (nid == NUMA_NO_NODE) nid = numa_mem_id(); retry: - folio = NULL; -#ifdef CONFIG_CMA - { - int node; - - if (hugetlb_cma[nid]) - folio = cma_alloc_folio(hugetlb_cma[nid], order, gfp_mask); - - if (!folio && !(gfp_mask & __GFP_THISNODE)) { - for_each_node_mask(node, *nodemask) { - if (node == nid || !hugetlb_cma[node]) - continue; - - folio = cma_alloc_folio(hugetlb_cma[node], order, gfp_mask); - if (folio) - break; - } - } - } -#endif + folio = hugetlb_cma_alloc_folio(h, gfp_mask, nid, nodemask); if (!folio) { + if (hugetlb_cma_exclusive_alloc()) + return NULL; + folio = folio_alloc_gigantic(order, gfp_mask, nid, nodemask); if (!folio) return NULL; @@ -1704,7 +1650,6 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, folio_ref_unfreeze(folio, 1); - INIT_LIST_HEAD(&folio->_deferred_list); hugetlb_free_folio(folio); } @@ -2006,7 +1951,6 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, int order = huge_page_order(h); struct folio *folio; bool alloc_try_hard = true; - bool retry = true; /* * By default we always try hard to allocate the folio with @@ -2021,22 +1965,8 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, gfp_mask |= __GFP_RETRY_MAYFAIL; if (nid == NUMA_NO_NODE) nid = numa_mem_id(); -retry: - folio = __folio_alloc(gfp_mask, order, nid, nmask); - /* Ensure hugetlb folio won't have large_rmappable flag set. */ - if (folio) - folio_clear_large_rmappable(folio); - if (folio && !folio_ref_freeze(folio, 1)) { - folio_put(folio); - if (retry) { /* retry once */ - retry = false; - goto retry; - } - /* WOW! twice in a row. */ - pr_warn("HugeTLB unexpected inflated folio ref count\n"); - folio = NULL; - } + folio = (struct folio *)__alloc_frozen_pages(gfp_mask, order, nid, nmask); /* * If we did not specify __GFP_RETRY_MAYFAIL, but still got a @@ -2205,6 +2135,8 @@ retry: if (!folio_ref_count(folio)) { struct hstate *h = folio_hstate(folio); + bool adjust_surplus = false; + if (!available_huge_pages(h)) goto out; @@ -2227,7 +2159,9 @@ retry: goto retry; } - remove_hugetlb_folio(h, folio, false); + if (h->surplus_huge_pages_node[folio_nid(folio)]) + adjust_surplus = true; + remove_hugetlb_folio(h, folio, adjust_surplus); h->max_huge_pages--; spin_unlock_irq(&hugetlb_lock); @@ -2247,7 +2181,7 @@ retry: rc = hugetlb_vmemmap_restore_folio(h, folio); if (rc) { spin_lock_irq(&hugetlb_lock); - add_hugetlb_folio(h, folio, false); + add_hugetlb_folio(h, folio, adjust_surplus); h->max_huge_pages++; goto out; } @@ -2311,12 +2245,21 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h, goto out_unlock; spin_unlock_irq(&hugetlb_lock); - folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask); + folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); if (!folio) return NULL; + hugetlb_vmemmap_optimize_folio(h, folio); + spin_lock_irq(&hugetlb_lock); /* + * nr_huge_pages needs to be adjusted within the same lock cycle + * as surplus_pages, otherwise it might confuse + * persistent_huge_pages() momentarily. + */ + __prep_account_new_huge_page(h, folio_nid(folio)); + + /* * We could have raced with the pool size change. * Double check that and simply deallocate the new page * if we would end up overcommiting the surpluses. Abuse @@ -2462,8 +2405,13 @@ static int gather_surplus_pages(struct hstate *h, long delta) long i; long needed, allocated; bool alloc_ok = true; - int node; - nodemask_t *mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h)); + nodemask_t *mbind_nodemask, alloc_nodemask; + + mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h)); + if (mbind_nodemask) + nodes_and(alloc_nodemask, *mbind_nodemask, cpuset_current_mems_allowed); + else + alloc_nodemask = cpuset_current_mems_allowed; lockdep_assert_held(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - h->free_huge_pages; @@ -2479,14 +2427,13 @@ retry: spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { folio = NULL; - for_each_node_mask(node, cpuset_current_mems_allowed) { - if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) { - folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), - node, NULL); - if (folio) - break; - } - } + + /* + * It is okay to use NUMA_NO_NODE because we use numa_mem_id() + * down the road to pick the current node if that is the case. + */ + folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), + NUMA_NO_NODE, &alloc_nodemask); if (!folio) { alloc_ok = false; break; @@ -2840,20 +2787,24 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, /* * alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve * the old one - * @h: struct hstate old page belongs to * @old_folio: Old folio to dissolve * @list: List to isolate the page in case we need to * Returns 0 on success, otherwise negated error. */ -static int alloc_and_dissolve_hugetlb_folio(struct hstate *h, - struct folio *old_folio, struct list_head *list) +static int alloc_and_dissolve_hugetlb_folio(struct folio *old_folio, + struct list_head *list) { - gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; + gfp_t gfp_mask; + struct hstate *h; int nid = folio_nid(old_folio); struct folio *new_folio = NULL; int ret = 0; retry: + /* + * The old_folio might have been dissolved from under our feet, so make sure + * to carefully check the state under the lock. + */ spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(old_folio)) { /* @@ -2868,7 +2819,7 @@ retry: * Fail with -EBUSY if not possible. */ spin_unlock_irq(&hugetlb_lock); - isolated = isolate_hugetlb(old_folio, list); + isolated = folio_isolate_hugetlb(old_folio, list); ret = isolated ? 0 : -EBUSY; spin_lock_irq(&hugetlb_lock); goto free_new; @@ -2882,8 +2833,10 @@ retry: cond_resched(); goto retry; } else { + h = folio_hstate(old_folio); if (!new_folio) { spin_unlock_irq(&hugetlb_lock); + gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL); if (!new_folio) @@ -2925,105 +2878,154 @@ free_new: return ret; } -int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) +int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list) { - struct hstate *h; - struct folio *folio = page_folio(page); int ret = -EBUSY; - /* - * The page might have been dissolved from under our feet, so make sure - * to carefully check the state under the lock. - * Return success when racing as if we dissolved the page ourselves. - */ - spin_lock_irq(&hugetlb_lock); - if (folio_test_hugetlb(folio)) { - h = folio_hstate(folio); - } else { - spin_unlock_irq(&hugetlb_lock); + /* Not to disrupt normal path by vainly holding hugetlb_lock */ + if (!folio_test_hugetlb(folio)) return 0; - } - spin_unlock_irq(&hugetlb_lock); /* * Fence off gigantic pages as there is a cyclic dependency between * alloc_contig_range and them. Return -ENOMEM as this has the effect * of bailing out right away without further retrying. */ - if (hstate_is_gigantic(h)) + if (folio_order(folio) > MAX_PAGE_ORDER) return -ENOMEM; - if (folio_ref_count(folio) && isolate_hugetlb(folio, list)) + if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list)) ret = 0; else if (!folio_ref_count(folio)) - ret = alloc_and_dissolve_hugetlb_folio(h, folio, list); + ret = alloc_and_dissolve_hugetlb_folio(folio, list); return ret; } +/* + * replace_free_hugepage_folios - Replace free hugepage folios in a given pfn + * range with new folios. + * @start_pfn: start pfn of the given pfn range + * @end_pfn: end pfn of the given pfn range + * Returns 0 on success, otherwise negated error. + */ +int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn) +{ + struct folio *folio; + int ret = 0; + + LIST_HEAD(isolate_list); + + while (start_pfn < end_pfn) { + folio = pfn_folio(start_pfn); + + /* Not to disrupt normal path by vainly holding hugetlb_lock */ + if (folio_test_hugetlb(folio) && !folio_ref_count(folio)) { + ret = alloc_and_dissolve_hugetlb_folio(folio, &isolate_list); + if (ret) + break; + + putback_movable_pages(&isolate_list); + } + start_pfn++; + } + + return ret; +} + +void wait_for_freed_hugetlb_folios(void) +{ + if (llist_empty(&hpage_freelist)) + return; + + flush_work(&free_hpage_work); +} + +typedef enum { + /* + * For either 0/1: we checked the per-vma resv map, and one resv + * count either can be reused (0), or an extra needed (1). + */ + MAP_CHG_REUSE = 0, + MAP_CHG_NEEDED = 1, + /* + * Cannot use per-vma resv count can be used, hence a new resv + * count is enforced. + * + * NOTE: This is mostly identical to MAP_CHG_NEEDED, except + * that currently vma_needs_reservation() has an unwanted side + * effect to either use end() or commit() to complete the + * transaction. Hence it needs to differenciate from NEEDED. + */ + MAP_CHG_ENFORCED = 2, +} map_chg_state; + +/* + * NOTE! "cow_from_owner" represents a very hacky usage only used in CoW + * faults of hugetlb private mappings on top of a non-page-cache folio (in + * which case even if there's a private vma resv map it won't cover such + * allocation). New call sites should (probably) never set it to true!! + * When it's set, the allocation will bypass all vma level reservations. + */ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve) + unsigned long addr, bool cow_from_owner) { struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct folio *folio; - long map_chg, map_commit, nr_pages = pages_per_huge_page(h); - long gbl_chg; - int memcg_charge_ret, ret, idx; + long retval, gbl_chg, gbl_reserve; + map_chg_state map_chg; + int ret, idx; struct hugetlb_cgroup *h_cg = NULL; - struct mem_cgroup *memcg; - bool deferred_reserve; gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL; - memcg = get_mem_cgroup_from_current(); - memcg_charge_ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages); - if (memcg_charge_ret == -ENOMEM) { - mem_cgroup_put(memcg); - return ERR_PTR(-ENOMEM); - } - idx = hstate_index(h); - /* - * Examine the region/reserve map to determine if the process - * has a reservation for the page to be allocated. A return - * code of zero indicates a reservation exists (no change). - */ - map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); - if (map_chg < 0) { - if (!memcg_charge_ret) - mem_cgroup_cancel_charge(memcg, nr_pages); - mem_cgroup_put(memcg); - return ERR_PTR(-ENOMEM); + + /* Whether we need a separate per-vma reservation? */ + if (cow_from_owner) { + /* + * Special case! Since it's a CoW on top of a reserved + * page, the private resv map doesn't count. So it cannot + * consume the per-vma resv map even if it's reserved. + */ + map_chg = MAP_CHG_ENFORCED; + } else { + /* + * Examine the region/reserve map to determine if the process + * has a reservation for the page to be allocated. A return + * code of zero indicates a reservation exists (no change). + */ + retval = vma_needs_reservation(h, vma, addr); + if (retval < 0) + return ERR_PTR(-ENOMEM); + map_chg = retval ? MAP_CHG_NEEDED : MAP_CHG_REUSE; } /* + * Whether we need a separate global reservation? + * * Processes that did not create the mapping will have no * reserves as indicated by the region/reserve map. Check * that the allocation will not exceed the subpool limit. - * Allocations for MAP_NORESERVE mappings also need to be - * checked against any subpool limit. + * Or if it can get one from the pool reservation directly. */ - if (map_chg || avoid_reserve) { + if (map_chg) { gbl_chg = hugepage_subpool_get_pages(spool, 1); if (gbl_chg < 0) goto out_end_reservation; - + } else { /* - * Even though there was no reservation in the region/reserve - * map, there could be reservations associated with the - * subpool that can be used. This would be indicated if the - * return value of hugepage_subpool_get_pages() is zero. - * However, if avoid_reserve is specified we still avoid even - * the subpool reservations. + * If we have the vma reservation ready, no need for extra + * global reservation. */ - if (avoid_reserve) - gbl_chg = 1; + gbl_chg = 0; } - /* If this allocation is not consuming a reservation, charge it now. + /* + * If this allocation is not consuming a per-vma reservation, + * charge the hugetlb cgroup now. */ - deferred_reserve = map_chg || avoid_reserve; - if (deferred_reserve) { + if (map_chg) { ret = hugetlb_cgroup_charge_cgroup_rsvd( idx, pages_per_huge_page(h), &h_cg); if (ret) @@ -3040,27 +3042,32 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, * from the global free pool (global change). gbl_chg == 0 indicates * a reservation exists for the allocation. */ - folio = dequeue_hugetlb_folio_vma(h, vma, addr, avoid_reserve, gbl_chg); + folio = dequeue_hugetlb_folio_vma(h, vma, addr, gbl_chg); if (!folio) { spin_unlock_irq(&hugetlb_lock); folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr); if (!folio) goto out_uncharge_cgroup; spin_lock_irq(&hugetlb_lock); - if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { - folio_set_hugetlb_restore_reserve(folio); - h->resv_huge_pages--; - } list_add(&folio->lru, &h->hugepage_activelist); folio_ref_unfreeze(folio, 1); /* Fall through */ } + /* + * Either dequeued or buddy-allocated folio needs to add special + * mark to the folio when it consumes a global reservation. + */ + if (!gbl_chg) { + folio_set_hugetlb_restore_reserve(folio); + h->resv_huge_pages--; + } + hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio); /* If allocation is not consuming a reservation, also store the * hugetlb_cgroup pointer on the page. */ - if (deferred_reserve) { + if (map_chg) { hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), h_cg, folio); } @@ -3069,53 +3076,122 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, hugetlb_set_folio_subpool(folio, spool); - map_commit = vma_commit_reservation(h, vma, addr); - if (unlikely(map_chg > map_commit)) { + if (map_chg != MAP_CHG_ENFORCED) { + /* commit() is only needed if the map_chg is not enforced */ + retval = vma_commit_reservation(h, vma, addr); /* + * Check for possible race conditions. When it happens.. * The page was added to the reservation map between * vma_needs_reservation and vma_commit_reservation. * This indicates a race with hugetlb_reserve_pages. * Adjust for the subpool count incremented above AND - * in hugetlb_reserve_pages for the same page. Also, + * in hugetlb_reserve_pages for the same page. Also, * the reservation count added in hugetlb_reserve_pages * no longer applies. */ - long rsv_adjust; + if (unlikely(map_chg == MAP_CHG_NEEDED && retval == 0)) { + long rsv_adjust; - rsv_adjust = hugepage_subpool_put_pages(spool, 1); - hugetlb_acct_memory(h, -rsv_adjust); - if (deferred_reserve) { - spin_lock_irq(&hugetlb_lock); - hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), - pages_per_huge_page(h), folio); - spin_unlock_irq(&hugetlb_lock); + rsv_adjust = hugepage_subpool_put_pages(spool, 1); + hugetlb_acct_memory(h, -rsv_adjust); + if (map_chg) { + spin_lock_irq(&hugetlb_lock); + hugetlb_cgroup_uncharge_folio_rsvd( + hstate_index(h), pages_per_huge_page(h), + folio); + spin_unlock_irq(&hugetlb_lock); + } } } - if (!memcg_charge_ret) - mem_cgroup_commit_charge(folio, memcg); + ret = mem_cgroup_charge_hugetlb(folio, gfp); + /* + * Unconditionally increment NR_HUGETLB here. If it turns out that + * mem_cgroup_charge_hugetlb failed, then immediately free the page and + * decrement NR_HUGETLB. + */ lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h)); - mem_cgroup_put(memcg); + + if (ret == -ENOMEM) { + free_huge_folio(folio); + return ERR_PTR(-ENOMEM); + } return folio; out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); out_uncharge_cgroup_reservation: - if (deferred_reserve) + if (map_chg) hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), h_cg); out_subpool_put: - if (map_chg || avoid_reserve) - hugepage_subpool_put_pages(spool, 1); + /* + * put page to subpool iff the quota of subpool's rsv_hpages is used + * during hugepage_subpool_get_pages. + */ + if (map_chg && !gbl_chg) { + gbl_reserve = hugepage_subpool_put_pages(spool, 1); + hugetlb_acct_memory(h, -gbl_reserve); + } + + out_end_reservation: - vma_end_reservation(h, vma, addr); - if (!memcg_charge_ret) - mem_cgroup_cancel_charge(memcg, nr_pages); - mem_cgroup_put(memcg); + if (map_chg != MAP_CHG_ENFORCED) + vma_end_reservation(h, vma, addr); return ERR_PTR(-ENOSPC); } +static __init void *alloc_bootmem(struct hstate *h, int nid, bool node_exact) +{ + struct huge_bootmem_page *m; + int listnode = nid; + + if (hugetlb_early_cma(h)) + m = hugetlb_cma_alloc_bootmem(h, &listnode, node_exact); + else { + if (node_exact) + m = memblock_alloc_exact_nid_raw(huge_page_size(h), + huge_page_size(h), 0, + MEMBLOCK_ALLOC_ACCESSIBLE, nid); + else { + m = memblock_alloc_try_nid_raw(huge_page_size(h), + huge_page_size(h), 0, + MEMBLOCK_ALLOC_ACCESSIBLE, nid); + /* + * For pre-HVO to work correctly, pages need to be on + * the list for the node they were actually allocated + * from. That node may be different in the case of + * fallback by memblock_alloc_try_nid_raw. So, + * extract the actual node first. + */ + if (m) + listnode = early_pfn_to_nid(PHYS_PFN(virt_to_phys(m))); + } + + if (m) { + m->flags = 0; + m->cma = NULL; + } + } + + if (m) { + /* + * Use the beginning of the huge page to store the + * huge_bootmem_page struct (until gather_bootmem + * puts them into the mem_map). + * + * Put them into a private list first because mem_map + * is not up yet. + */ + INIT_LIST_HEAD(&m->list); + list_add(&m->list, &huge_boot_pages[listnode]); + m->hstate = h; + } + + return m; +} + int alloc_bootmem_huge_page(struct hstate *h, int nid) __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); int __alloc_bootmem_huge_page(struct hstate *h, int nid) @@ -3125,22 +3201,16 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) /* do node specific alloc */ if (nid != NUMA_NO_NODE) { - m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h), - 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid); + m = alloc_bootmem(h, node, true); if (!m) return 0; goto found; } + /* allocate from next node when distributing huge pages */ - for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_MEMORY]) { - m = memblock_alloc_try_nid_raw( - huge_page_size(h), huge_page_size(h), - 0, MEMBLOCK_ALLOC_ACCESSIBLE, node); - /* - * Use the beginning of the huge page to store the - * huge_bootmem_page struct (until gather_bootmem - * puts them into the mem_map). - */ + for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, + &hugetlb_bootmem_nodes) { + m = alloc_bootmem(h, node, false); if (!m) return 0; goto found; @@ -3157,10 +3227,7 @@ found: */ memblock_reserved_mark_noinit(virt_to_phys((void *)m + PAGE_SIZE), huge_page_size(h) - PAGE_SIZE); - /* Put them into a private list first because mem_map is not up yet */ - INIT_LIST_HEAD(&m->list); - list_add(&m->list, &huge_boot_pages[node]); - m->hstate = h; + return 1; } @@ -3178,7 +3245,6 @@ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) { struct page *page = pfn_to_page(pfn); - __ClearPageReserved(folio_page(folio, pfn - head_pfn)); __init_single_page(page, pfn, zone, nid); prep_compound_tail((struct page *)folio, pfn - head_pfn); ret = page_ref_freeze(page, 1); @@ -3202,6 +3268,42 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio, prep_compound_head((struct page *)folio, huge_page_order(h)); } +static bool __init hugetlb_bootmem_page_prehvo(struct huge_bootmem_page *m) +{ + return m->flags & HUGE_BOOTMEM_HVO; +} + +static bool __init hugetlb_bootmem_page_earlycma(struct huge_bootmem_page *m) +{ + return m->flags & HUGE_BOOTMEM_CMA; +} + +/* + * memblock-allocated pageblocks might not have the migrate type set + * if marked with the 'noinit' flag. Set it to the default (MIGRATE_MOVABLE) + * here, or MIGRATE_CMA if this was a page allocated through an early CMA + * reservation. + * + * In case of vmemmap optimized folios, the tail vmemmap pages are mapped + * read-only, but that's ok - for sparse vmemmap this does not write to + * the page structure. + */ +static void __init hugetlb_bootmem_init_migratetype(struct folio *folio, + struct hstate *h) +{ + unsigned long nr_pages = pages_per_huge_page(h), i; + + WARN_ON_ONCE(!pageblock_aligned(folio_pfn(folio))); + + for (i = 0; i < nr_pages; i += pageblock_nr_pages) { + if (folio_test_hugetlb_cma(folio)) + init_cma_pageblock(folio_page(folio, i)); + else + set_pageblock_migratetype(folio_page(folio, i), + MIGRATE_MOVABLE); + } +} + static void __init prep_and_add_bootmem_folios(struct hstate *h, struct list_head *folio_list) { @@ -3209,7 +3311,7 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h, struct folio *folio, *tmp_f; /* Send list for bulk vmemmap optimization processing */ - hugetlb_vmemmap_optimize_folios(h, folio_list); + hugetlb_vmemmap_optimize_bootmem_folios(h, folio_list); list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { if (!folio_test_hugetlb_vmemmap_optimized(folio)) { @@ -3223,6 +3325,7 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h, HUGETLB_VMEMMAP_RESERVE_PAGES, pages_per_huge_page(h)); } + hugetlb_bootmem_init_migratetype(folio, h); /* Subdivide locks to achieve better parallel performance */ spin_lock_irqsave(&hugetlb_lock, flags); __prep_account_new_huge_page(h, folio_nid(folio)); @@ -3231,6 +3334,57 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h, } } +bool __init hugetlb_bootmem_page_zones_valid(int nid, + struct huge_bootmem_page *m) +{ + unsigned long start_pfn; + bool valid; + + if (m->flags & HUGE_BOOTMEM_ZONES_VALID) { + /* + * Already validated, skip check. + */ + return true; + } + + if (hugetlb_bootmem_page_earlycma(m)) { + valid = cma_validate_zones(m->cma); + goto out; + } + + start_pfn = virt_to_phys(m) >> PAGE_SHIFT; + + valid = !pfn_range_intersects_zones(nid, start_pfn, + pages_per_huge_page(m->hstate)); +out: + if (!valid) + hstate_boot_nrinvalid[hstate_index(m->hstate)]++; + + return valid; +} + +/* + * Free a bootmem page that was found to be invalid (intersecting with + * multiple zones). + * + * Since it intersects with multiple zones, we can't just do a free + * operation on all pages at once, but instead have to walk all + * pages, freeing them one by one. + */ +static void __init hugetlb_bootmem_free_invalid_page(int nid, struct page *page, + struct hstate *h) +{ + unsigned long npages = pages_per_huge_page(h); + unsigned long pfn; + + while (npages--) { + pfn = page_to_pfn(page); + __init_page_from_nid(pfn, nid); + free_reserved_page(page); + page++; + } +} + /* * Put bootmem huge pages into the standard lists after mem_map is up. * Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages. @@ -3238,14 +3392,25 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h, static void __init gather_bootmem_prealloc_node(unsigned long nid) { LIST_HEAD(folio_list); - struct huge_bootmem_page *m; + struct huge_bootmem_page *m, *tm; struct hstate *h = NULL, *prev_h = NULL; - list_for_each_entry(m, &huge_boot_pages[nid], list) { + list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) { struct page *page = virt_to_page(m); struct folio *folio = (void *)page; h = m->hstate; + if (!hugetlb_bootmem_page_zones_valid(nid, m)) { + /* + * Can't use this page. Initialize the + * page structures if that hasn't already + * been done, and give them to the page + * allocator. + */ + hugetlb_bootmem_free_invalid_page(nid, page, h); + continue; + } + /* * It is possible to have multiple huge page sizes (hstates) * in this list. If so, process each size separately. @@ -3260,14 +3425,30 @@ static void __init gather_bootmem_prealloc_node(unsigned long nid) hugetlb_folio_init_vmemmap(folio, h, HUGETLB_VMEMMAP_RESERVE_PAGES); init_new_hugetlb_folio(h, folio); + + if (hugetlb_bootmem_page_prehvo(m)) + /* + * If pre-HVO was done, just set the + * flag, the HVO code will then skip + * this folio. + */ + folio_set_hugetlb_vmemmap_optimized(folio); + + if (hugetlb_bootmem_page_earlycma(m)) + folio_set_hugetlb_cma(folio); + list_add(&folio->lru, &folio_list); /* * We need to restore the 'stolen' pages to totalram_pages * in order to fix confusing memory reports from free(1) and * other side-effects, like CommitLimit going negative. + * + * For CMA pages, this is done in init_cma_pageblock + * (via hugetlb_bootmem_init_migratetype), so skip it here. */ - adjust_managed_page_count(page, pages_per_huge_page(h)); + if (!folio_test_hugetlb_cma(folio)) + adjust_managed_page_count(page, pages_per_huge_page(h)); cond_resched(); } @@ -3289,7 +3470,7 @@ static void __init gather_bootmem_prealloc(void) .thread_fn = gather_bootmem_prealloc_parallel, .fn_arg = NULL, .start = 0, - .size = num_node_state(N_MEMORY), + .size = nr_node_ids, .align = 1, .min_chunk = 1, .max_threads = num_node_state(N_MEMORY), @@ -3407,32 +3588,44 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h) .numa_aware = true }; + unsigned long jiffies_start; + unsigned long jiffies_end; + job.thread_fn = hugetlb_pages_alloc_boot_node; job.start = 0; job.size = h->max_huge_pages; /* - * job.max_threads is twice the num_node_state(N_MEMORY), + * job.max_threads is 25% of the available cpu threads by default. * - * Tests below indicate that a multiplier of 2 significantly improves - * performance, and although larger values also provide improvements, - * the gains are marginal. + * On large servers with terabytes of memory, huge page allocation + * can consume a considerably amount of time. * - * Therefore, choosing 2 as the multiplier strikes a good balance between - * enhancing parallel processing capabilities and maintaining efficient - * resource management. + * Tests below show how long it takes to allocate 1 TiB of memory with 2MiB huge pages. + * 2MiB huge pages. Using more threads can significantly improve allocation time. * - * +------------+-------+-------+-------+-------+-------+ - * | multiplier | 1 | 2 | 3 | 4 | 5 | - * +------------+-------+-------+-------+-------+-------+ - * | 256G 2node | 358ms | 215ms | 157ms | 134ms | 126ms | - * | 2T 4node | 979ms | 679ms | 543ms | 489ms | 481ms | - * | 50G 2node | 71ms | 44ms | 37ms | 30ms | 31ms | - * +------------+-------+-------+-------+-------+-------+ + * +-----------------------+-------+-------+-------+-------+-------+ + * | threads | 8 | 16 | 32 | 64 | 128 | + * +-----------------------+-------+-------+-------+-------+-------+ + * | skylake 144 cpus | 44s | 22s | 16s | 19s | 20s | + * | cascade lake 192 cpus | 39s | 20s | 11s | 10s | 9s | + * +-----------------------+-------+-------+-------+-------+-------+ */ - job.max_threads = num_node_state(N_MEMORY) * 2; - job.min_chunk = h->max_huge_pages / num_node_state(N_MEMORY) / 2; + if (hugepage_allocation_threads == 0) { + hugepage_allocation_threads = num_online_cpus() / 4; + hugepage_allocation_threads = max(hugepage_allocation_threads, 1); + } + + job.max_threads = hugepage_allocation_threads; + job.min_chunk = h->max_huge_pages / hugepage_allocation_threads; + + jiffies_start = jiffies; padata_do_multithreaded(&job); + jiffies_end = jiffies; + + pr_info("HugeTLB: allocation took %dms with hugepage_allocation_threads=%ld\n", + jiffies_to_msecs(jiffies_end - jiffies_start), + hugepage_allocation_threads); return h->nr_huge_pages; } @@ -3451,23 +3644,17 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h) static void __init hugetlb_hstate_alloc_pages(struct hstate *h) { unsigned long allocated; - static bool initialized __initdata; - /* skip gigantic hugepages allocation if hugetlb_cma enabled */ - if (hstate_is_gigantic(h) && hugetlb_cma_size) { + /* + * Skip gigantic hugepages allocation if early CMA + * reservations are not available. + */ + if (hstate_is_gigantic(h) && hugetlb_cma_total_size() && + !hugetlb_early_cma(h)) { pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n"); return; } - /* hugetlb_hstate_alloc_pages will be called many times, initialize huge_boot_pages once */ - if (!initialized) { - int i = 0; - - for (i = 0; i < MAX_NUMNODES; i++) - INIT_LIST_HEAD(&huge_boot_pages[i]); - initialized = true; - } - /* do node specific alloc */ if (hugetlb_hstate_alloc_pages_specific_nodes(h)) return; @@ -3486,6 +3673,15 @@ static void __init hugetlb_init_hstates(void) struct hstate *h, *h2; for_each_hstate(h) { + /* + * Always reset to first_memory_node here, even if + * next_nid_to_alloc was set before - we can't + * reference hugetlb_bootmem_nodes after init, and + * first_memory_node is right for all further allocations. + */ + h->next_nid_to_alloc = first_memory_node; + h->next_nid_to_free = first_memory_node; + /* oversize hugepages were init'ed in early boot */ if (!hstate_is_gigantic(h)) hugetlb_hstate_alloc_pages(h); @@ -3500,7 +3696,7 @@ static void __init hugetlb_init_hstates(void) */ if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) continue; - if (hugetlb_cma_size && h->order <= HUGETLB_PAGE_ORDER) + if (hugetlb_cma_total_size() && h->order <= HUGETLB_PAGE_ORDER) continue; for_each_hstate(h2) { if (h2 == h) @@ -3515,13 +3711,20 @@ static void __init hugetlb_init_hstates(void) static void __init report_hugepages(void) { struct hstate *h; + unsigned long nrinvalid; for_each_hstate(h) { char buf[32]; + nrinvalid = hstate_boot_nrinvalid[hstate_index(h)]; + h->max_huge_pages -= nrinvalid; + string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n", - buf, h->free_huge_pages); + buf, h->nr_huge_pages); + if (nrinvalid) + pr_info("HugeTLB: %s page size: %lu invalid page%s discarded\n", + buf, nrinvalid, nrinvalid > 1 ? "s" : ""); pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n", hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf); } @@ -3603,6 +3806,7 @@ found: static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, nodemask_t *nodes_allowed) { + unsigned long persistent_free_count; unsigned long min_count; unsigned long allocated; struct folio *folio; @@ -3737,8 +3941,24 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * though, we'll note that we're not allowed to exceed surplus * and won't grow the pool anywhere else. Not until one of the * sysctls are changed, or the surplus pages go out of use. + * + * min_count is the expected number of persistent pages, we + * shouldn't calculate min_count by using + * resv_huge_pages + persistent_huge_pages() - free_huge_pages, + * because there may exist free surplus huge pages, and this will + * lead to subtracting twice. Free surplus huge pages come from HVO + * failing to restore vmemmap, see comments in the callers of + * hugetlb_vmemmap_restore_folio(). Thus, we should calculate + * persistent free count first. */ - min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; + persistent_free_count = h->free_huge_pages; + if (h->free_huge_pages > persistent_huge_pages(h)) { + if (h->free_huge_pages > h->surplus_huge_pages) + persistent_free_count -= h->surplus_huge_pages; + else + persistent_free_count = 0; + } + min_count = h->resv_huge_pages + persistent_huge_pages(h) - persistent_free_count; min_count = max(count, min_count); try_to_free_low(h, min_count, nodes_allowed); @@ -3795,10 +4015,13 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst, list_for_each_entry_safe(folio, next, src_list, lru) { int i; + bool cma; if (folio_test_hugetlb_vmemmap_optimized(folio)) continue; + cma = folio_test_hugetlb_cma(folio); + list_del(&folio->lru); split_page_owner(&folio->page, huge_page_order(src), huge_page_order(dst)); @@ -3806,13 +4029,18 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst, for (i = 0; i < pages_per_huge_page(src); i += pages_per_huge_page(dst)) { struct page *page = folio_page(folio, i); + /* Careful: see __split_huge_page_tail() */ + struct folio *new_folio = (struct folio *)page; - page->mapping = NULL; clear_compound_head(page); prep_compound_page(page, dst->order); - init_new_hugetlb_folio(dst, page_folio(page)); - list_add(&page->lru, &dst_list); + new_folio->mapping = NULL; + init_new_hugetlb_folio(dst, new_folio); + /* Copy the CMA flag so that it is freed correctly */ + if (cma) + folio_set_hugetlb_cma(new_folio); + list_add(&new_folio->lru, &dst_list); } } @@ -4393,14 +4621,6 @@ static void hugetlb_register_all_nodes(void) { } #endif -#ifdef CONFIG_CMA -static void __init hugetlb_cma_check(void); -#else -static inline __init void hugetlb_cma_check(void) -{ -} -#endif - static void __init hugetlb_sysfs_init(void) { struct hstate *h; @@ -4414,7 +4634,7 @@ static void __init hugetlb_sysfs_init(void) err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, hstate_kobjs, &hstate_attr_group); if (err) - pr_err("HugeTLB: Unable to add hstate %s", h->name); + pr_err("HugeTLB: Unable to add hstate %s\n", h->name); } #ifdef CONFIG_NUMA @@ -4525,8 +4745,6 @@ void __init hugetlb_add_hstate(unsigned int order) for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); INIT_LIST_HEAD(&h->hugepage_activelist); - h->next_nid_to_alloc = first_memory_node; - h->next_nid_to_free = first_memory_node; snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/SZ_1K); @@ -4551,6 +4769,44 @@ static void __init hugepages_clear_pages_in_node(void) } } +static __init int hugetlb_add_param(char *s, int (*setup)(char *)) +{ + size_t len; + char *p; + + if (hugetlb_param_index >= HUGE_MAX_CMDLINE_ARGS) + return -EINVAL; + + len = strlen(s) + 1; + if (len + hstate_cmdline_index > sizeof(hstate_cmdline_buf)) + return -EINVAL; + + p = &hstate_cmdline_buf[hstate_cmdline_index]; + memcpy(p, s, len); + hstate_cmdline_index += len; + + hugetlb_params[hugetlb_param_index].val = p; + hugetlb_params[hugetlb_param_index].setup = setup; + + hugetlb_param_index++; + + return 0; +} + +static __init void hugetlb_parse_params(void) +{ + int i; + struct hugetlb_cmdline *hcp; + + for (i = 0; i < hugetlb_param_index; i++) { + hcp = &hugetlb_params[i]; + + hcp->setup(hcp->val); + } + + hugetlb_cma_validate_params(); +} + /* * hugepages command line processing * hugepages normally follows a valid hugepagsz or default_hugepagsz @@ -4570,7 +4826,7 @@ static int __init hugepages_setup(char *s) if (!parsed_valid_hugepagesz) { pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s); parsed_valid_hugepagesz = true; - return 1; + return -EINVAL; } /* @@ -4624,24 +4880,16 @@ static int __init hugepages_setup(char *s) } } - /* - * Global state is always initialized later in hugetlb_init. - * But we need to allocate gigantic hstates here early to still - * use the bootmem allocator. - */ - if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate)) - hugetlb_hstate_alloc_pages(parsed_hstate); - last_mhp = mhp; - return 1; + return 0; invalid: pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p); hugepages_clear_pages_in_node(); - return 1; + return -EINVAL; } -__setup("hugepages=", hugepages_setup); +hugetlb_early_param("hugepages", hugepages_setup); /* * hugepagesz command line processing @@ -4660,7 +4908,7 @@ static int __init hugepagesz_setup(char *s) if (!arch_hugetlb_valid_size(size)) { pr_err("HugeTLB: unsupported hugepagesz=%s\n", s); - return 1; + return -EINVAL; } h = size_to_hstate(size); @@ -4675,7 +4923,7 @@ static int __init hugepagesz_setup(char *s) if (!parsed_default_hugepagesz || h != &default_hstate || default_hstate.max_huge_pages) { pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s); - return 1; + return -EINVAL; } /* @@ -4685,14 +4933,14 @@ static int __init hugepagesz_setup(char *s) */ parsed_hstate = h; parsed_valid_hugepagesz = true; - return 1; + return 0; } hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); parsed_valid_hugepagesz = true; - return 1; + return 0; } -__setup("hugepagesz=", hugepagesz_setup); +hugetlb_early_param("hugepagesz", hugepagesz_setup); /* * default_hugepagesz command line input @@ -4706,14 +4954,14 @@ static int __init default_hugepagesz_setup(char *s) parsed_valid_hugepagesz = false; if (parsed_default_hugepagesz) { pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s); - return 1; + return -EINVAL; } size = (unsigned long)memparse(s, NULL); if (!arch_hugetlb_valid_size(size)) { pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s); - return 1; + return -EINVAL; } hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); @@ -4730,17 +4978,89 @@ static int __init default_hugepagesz_setup(char *s) */ if (default_hstate_max_huge_pages) { default_hstate.max_huge_pages = default_hstate_max_huge_pages; - for_each_online_node(i) - default_hstate.max_huge_pages_node[i] = - default_hugepages_in_node[i]; - if (hstate_is_gigantic(&default_hstate)) - hugetlb_hstate_alloc_pages(&default_hstate); + /* + * Since this is an early parameter, we can't check + * NUMA node state yet, so loop through MAX_NUMNODES. + */ + for (i = 0; i < MAX_NUMNODES; i++) { + if (default_hugepages_in_node[i] != 0) + default_hstate.max_huge_pages_node[i] = + default_hugepages_in_node[i]; + } default_hstate_max_huge_pages = 0; } + return 0; +} +hugetlb_early_param("default_hugepagesz", default_hugepagesz_setup); + +void __init hugetlb_bootmem_set_nodes(void) +{ + int i, nid; + unsigned long start_pfn, end_pfn; + + if (!nodes_empty(hugetlb_bootmem_nodes)) + return; + + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { + if (end_pfn > start_pfn) + node_set(nid, hugetlb_bootmem_nodes); + } +} + +static bool __hugetlb_bootmem_allocated __initdata; + +bool __init hugetlb_bootmem_allocated(void) +{ + return __hugetlb_bootmem_allocated; +} + +void __init hugetlb_bootmem_alloc(void) +{ + struct hstate *h; + int i; + + if (__hugetlb_bootmem_allocated) + return; + + hugetlb_bootmem_set_nodes(); + + for (i = 0; i < MAX_NUMNODES; i++) + INIT_LIST_HEAD(&huge_boot_pages[i]); + + hugetlb_parse_params(); + + for_each_hstate(h) { + h->next_nid_to_alloc = first_online_node; + + if (hstate_is_gigantic(h)) + hugetlb_hstate_alloc_pages(h); + } + + __hugetlb_bootmem_allocated = true; +} + +/* + * hugepage_alloc_threads command line parsing. + * + * When set, use this specific number of threads for the boot + * allocation of hugepages. + */ +static int __init hugepage_alloc_threads_setup(char *s) +{ + unsigned long allocation_threads; + + if (kstrtoul(s, 0, &allocation_threads) != 0) + return 1; + + if (allocation_threads == 0) + return 1; + + hugepage_allocation_threads = allocation_threads; + return 1; } -__setup("default_hugepagesz=", default_hugepagesz_setup); +__setup("hugepage_alloc_threads=", hugepage_alloc_threads_setup); static unsigned int allowed_mems_nr(struct hstate *h) { @@ -4845,7 +5165,7 @@ out: return ret; } -static struct ctl_table hugetlb_table[] = { +static const struct ctl_table hugetlb_table[] = { { .procname = "nr_hugepages", .data = NULL, @@ -4878,7 +5198,7 @@ static struct ctl_table hugetlb_table[] = { }, }; -static void hugetlb_sysctl_init(void) +static void __init hugetlb_sysctl_init(void) { register_sysctl_init("vm", hugetlb_table); } @@ -5086,26 +5406,40 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) { if (addr & ~(huge_page_mask(hstate_vma(vma)))) return -EINVAL; + return 0; +} +void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) +{ /* * PMD sharing is only possible for PUD_SIZE-aligned address ranges * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this * split, unshare PMDs in the PUD_SIZE interval surrounding addr now. + * This function is called in the middle of a VMA split operation, with + * MM, VMA and rmap all write-locked to prevent concurrent page table + * walks (except hardware and gup_fast()). */ + vma_assert_write_locked(vma); + i_mmap_assert_write_locked(vma->vm_file->f_mapping); + if (addr & ~PUD_MASK) { - /* - * hugetlb_vm_op_split is called right before we attempt to - * split the VMA. We will need to unshare PMDs in the old and - * new VMAs, so let's unshare before we split. - */ unsigned long floor = addr & PUD_MASK; unsigned long ceil = floor + PUD_SIZE; - if (floor >= vma->vm_start && ceil <= vma->vm_end) - hugetlb_unshare_pmds(vma, floor, ceil); + if (floor >= vma->vm_start && ceil <= vma->vm_end) { + /* + * Locking: + * Use take_locks=false here. + * The file rmap lock is already held. + * The hugetlb VMA lock can't be taken when we already + * hold the file rmap lock, and we don't need it because + * its purpose is to synchronize against concurrent page + * table walks, which are not possible thanks to the + * locks held by our caller. + */ + hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false); + } } - - return 0; } static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) @@ -5140,18 +5474,16 @@ const struct vm_operations_struct hugetlb_vm_ops = { .pagesize = hugetlb_vm_op_pagesize, }; -static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, - int writable) +static pte_t make_huge_pte(struct vm_area_struct *vma, struct folio *folio, + bool try_mkwrite) { - pte_t entry; + pte_t entry = folio_mk_pte(folio, vma->vm_page_prot); unsigned int shift = huge_page_shift(hstate_vma(vma)); - if (writable) { - entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, - vma->vm_page_prot))); + if (try_mkwrite && (vma->vm_flags & VM_WRITE)) { + entry = pte_mkwrite_novma(pte_mkdirty(entry)); } else { - entry = huge_pte_wrprotect(mk_huge_pte(page, - vma->vm_page_prot)); + entry = pte_wrprotect(entry); } entry = pte_mkyoung(entry); entry = arch_make_huge_pte(entry, shift, vma->vm_flags); @@ -5169,6 +5501,13 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, update_mmu_cache(vma, address, ptep); } +static void set_huge_ptep_maybe_writable(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + if (vma->vm_flags & VM_WRITE) + set_huge_ptep_writable(vma, address, ptep); +} + bool is_hugetlb_entry_migration(pte_t pte) { swp_entry_t swp; @@ -5199,7 +5538,7 @@ static void hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, struct folio *new_folio, pte_t old, unsigned long sz) { - pte_t newpte = make_huge_pte(vma, &new_folio->page, 1); + pte_t newpte = make_huge_pte(vma, new_folio, true); __folio_mark_uptodate(new_folio); hugetlb_add_new_anon_rmap(new_folio, vma, addr); @@ -5333,14 +5672,14 @@ again: spin_unlock(src_ptl); spin_unlock(dst_ptl); /* Do not use reserve as it's private owned */ - new_folio = alloc_hugetlb_folio(dst_vma, addr, 1); + new_folio = alloc_hugetlb_folio(dst_vma, addr, false); if (IS_ERR(new_folio)) { folio_put(pte_folio); ret = PTR_ERR(new_folio); break; } ret = copy_user_large_folio(new_folio, pte_folio, - ALIGN_DOWN(addr, sz), dst_vma); + addr, dst_vma); folio_put(pte_folio); if (ret) { folio_put(new_folio); @@ -5402,6 +5741,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte, unsigned long sz) { + bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma); struct hstate *h = hstate_vma(vma); struct mm_struct *mm = vma->vm_mm; spinlock_t *src_ptl, *dst_ptl; @@ -5417,8 +5757,19 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, if (src_ptl != dst_ptl) spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); - pte = huge_ptep_get_and_clear(mm, old_addr, src_pte); - set_huge_pte_at(mm, new_addr, dst_pte, pte, sz); + pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz); + + if (need_clear_uffd_wp && pte_marker_uffd_wp(pte)) + huge_pte_clear(mm, new_addr, dst_pte, sz); + else { + if (need_clear_uffd_wp) { + if (pte_present(pte)) + pte = huge_pte_clear_uffd_wp(pte); + else if (is_swap_pte(pte)) + pte = pte_swp_clear_uffd_wp(pte); + } + set_huge_pte_at(mm, new_addr, dst_pte, pte, sz); + } if (src_ptl != dst_ptl) spin_unlock(src_ptl); @@ -5491,14 +5842,14 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, - struct page *ref_page, zap_flags_t zap_flags) + struct folio *folio, zap_flags_t zap_flags) { struct mm_struct *mm = vma->vm_mm; + const bool folio_provided = !!folio; unsigned long address; pte_t *ptep; pte_t pte; spinlock_t *ptl; - struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); bool adjust_reservation = false; @@ -5562,14 +5913,13 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, continue; } - page = pte_page(pte); /* - * If a reference page is supplied, it is because a specific - * page is being unmapped, not a range. Ensure the page we - * are about to unmap is the actual page of interest. + * If a folio is supplied, it is because a specific + * folio is being unmapped, not a range. Ensure the folio we + * are about to unmap is the actual folio of interest. */ - if (ref_page) { - if (page != ref_page) { + if (folio_provided) { + if (folio != page_folio(pte_page(pte))) { spin_unlock(ptl); continue; } @@ -5579,12 +5929,14 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * looking like data was lost */ set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); + } else { + folio = page_folio(pte_page(pte)); } - pte = huge_ptep_get_and_clear(mm, address, ptep); + pte = huge_ptep_get_and_clear(mm, address, ptep, sz); tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) - set_page_dirty(page); + folio_mark_dirty(folio); /* Leave a uffd-wp pte marker if needed */ if (huge_pte_uffd_wp(pte) && !(zap_flags & ZAP_FLAG_DROP_MARKER)) @@ -5592,7 +5944,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, make_pte_marker(PTE_MARKER_UFFD_WP), sz); hugetlb_count_sub(pages_per_huge_page(h), mm); - hugetlb_remove_rmap(page_folio(page)); + hugetlb_remove_rmap(folio); /* * Restore the reservation for anonymous page, otherwise the @@ -5601,8 +5953,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * reservation bit. */ if (!h->surplus_huge_pages && __vma_private_lock(vma) && - folio_test_anon(page_folio(page))) { - folio_set_hugetlb_restore_reserve(page_folio(page)); + folio_test_anon(folio)) { + folio_set_hugetlb_restore_reserve(folio); /* Reservation to be adjusted after the spin lock */ adjust_reservation = true; } @@ -5626,16 +5978,17 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * count will not be incremented by free_huge_folio. * Act as if we consumed the reservation. */ - folio_clear_hugetlb_restore_reserve(page_folio(page)); + folio_clear_hugetlb_restore_reserve(folio); else if (rc) vma_add_reservation(h, vma, address); } - tlb_remove_page_size(tlb, page, huge_page_size(h)); + tlb_remove_page_size(tlb, folio_page(folio, 0), + folio_size(folio)); /* - * Bail out after unmapping reference page if supplied + * If we were instructed to unmap a specific folio, we're done. */ - if (ref_page) + if (folio_provided) break; } tlb_end_vma(tlb, vma); @@ -5697,7 +6050,7 @@ void __hugetlb_zap_end(struct vm_area_struct *vma, } void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end, struct page *ref_page, + unsigned long end, struct folio *folio, zap_flags_t zap_flags) { struct mmu_notifier_range range; @@ -5709,7 +6062,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, mmu_notifier_invalidate_range_start(&range); tlb_gather_mmu(&tlb, vma->vm_mm); - __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags); + __unmap_hugepage_range(&tlb, vma, start, end, + folio, zap_flags); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); @@ -5722,7 +6076,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, * same region. */ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, - struct page *page, unsigned long address) + struct folio *folio, unsigned long address) { struct hstate *h = hstate_vma(vma); struct vm_area_struct *iter_vma; @@ -5766,7 +6120,8 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, */ if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) unmap_hugepage_range(iter_vma, address, - address + huge_page_size(h), page, 0); + address + huge_page_size(h), + folio, 0); } i_mmap_unlock_write(mapping); } @@ -5787,7 +6142,7 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio, struct hstate *h = hstate_vma(vma); struct folio *old_folio; struct folio *new_folio; - int outside_reserve = 0; + bool cow_from_owner = 0; vm_fault_t ret = 0; struct mmu_notifier_range range; @@ -5802,13 +6157,6 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio, if (!unshare && huge_pte_uffd_wp(pte)) return 0; - /* - * hugetlb does not support FOLL_FORCE-style write faults that keep the - * PTE mapped R/O such as maybe_mkwrite() would do. - */ - if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE))) - return VM_FAULT_SIGSEGV; - /* Let's take out MAP_SHARED mappings first. */ if (vma->vm_flags & VM_MAYSHARE) { set_huge_ptep_writable(vma, vmf->address, vmf->pte); @@ -5837,7 +6185,8 @@ retry_avoidcopy: SetPageAnonExclusive(&old_folio->page); } if (likely(!unshare)) - set_huge_ptep_writable(vma, vmf->address, vmf->pte); + set_huge_ptep_maybe_writable(vma, vmf->address, + vmf->pte); delayacct_wpcopy_end(); return 0; @@ -5856,7 +6205,7 @@ retry_avoidcopy: */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && old_folio != pagecache_folio) - outside_reserve = 1; + cow_from_owner = true; folio_get(old_folio); @@ -5865,7 +6214,7 @@ retry_avoidcopy: * be acquired again before returning to the caller, as expected. */ spin_unlock(vmf->ptl); - new_folio = alloc_hugetlb_folio(vma, vmf->address, outside_reserve); + new_folio = alloc_hugetlb_folio(vma, vmf->address, cow_from_owner); if (IS_ERR(new_folio)) { /* @@ -5875,7 +6224,7 @@ retry_avoidcopy: * reliability, unmap the page from child processes. The child * may get SIGKILLed if it later faults. */ - if (outside_reserve) { + if (cow_from_owner) { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx; u32 hash; @@ -5895,8 +6244,7 @@ retry_avoidcopy: hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - unmap_ref_private(mm, vma, &old_folio->page, - vmf->address); + unmap_ref_private(mm, vma, old_folio, vmf->address); mutex_lock(&hugetlb_fault_mutex_table[hash]); hugetlb_vma_lock_read(vma); @@ -5943,7 +6291,7 @@ retry_avoidcopy: spin_lock(vmf->ptl); vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h)); if (likely(vmf->pte && pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte))) { - pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare); + pte_t newpte = make_huge_pte(vma, new_folio, !unshare); /* Break COW or unshare */ huge_ptep_clear_flush(vma, vmf->address, vmf->pte); @@ -6126,7 +6474,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, goto out; } - folio = alloc_hugetlb_folio(vma, vmf->address, 0); + folio = alloc_hugetlb_folio(vma, vmf->address, false); if (IS_ERR(folio)) { /* * Returning error will result in faulting task being @@ -6223,8 +6571,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, hugetlb_add_new_anon_rmap(folio, vma, vmf->address); else hugetlb_add_file_rmap(folio); - new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE) - && (vma->vm_flags & VM_SHARED))); + new_pte = make_huge_pte(vma, folio, vma->vm_flags & VM_SHARED); /* * If this pte was previously wr-protected, keep it wr-protected even * if populated. @@ -6556,7 +6903,6 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, spinlock_t *ptl; int ret = -ENOMEM; struct folio *folio; - int writable; bool folio_in_pagecache = false; if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { @@ -6594,7 +6940,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, goto out; } - folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); + folio = alloc_hugetlb_folio(dst_vma, dst_addr, false); if (IS_ERR(folio)) { ret = -ENOMEM; goto out; @@ -6636,15 +6982,14 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, goto out; } - folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); + folio = alloc_hugetlb_folio(dst_vma, dst_addr, false); if (IS_ERR(folio)) { folio_put(*foliop); ret = -ENOMEM; *foliop = NULL; goto out; } - ret = copy_user_large_folio(folio, *foliop, - ALIGN_DOWN(dst_addr, size), dst_vma); + ret = copy_user_large_folio(folio, *foliop, dst_addr, dst_vma); folio_put(*foliop); *foliop = NULL; if (ret) { @@ -6711,12 +7056,8 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY * with wp flag set, don't set pte write bit. */ - if (wp_enabled || (is_continue && !vm_shared)) - writable = 0; - else - writable = dst_vma->vm_flags & VM_WRITE; - - _dst_pte = make_huge_pte(dst_vma, &folio->page, writable); + _dst_pte = make_huge_pte(dst_vma, folio, + !wp_enabled && !(is_continue && !vm_shared)); /* * Always mark UFFDIO_COPY page dirty; note that this may not be * extremely important for hugetlbfs for now since swapping is not @@ -6909,7 +7250,7 @@ bool hugetlb_reserve_pages(struct inode *inode, struct vm_area_struct *vma, vm_flags_t vm_flags) { - long chg = -1, add = -1; + long chg = -1, add = -1, spool_resv, gbl_resv; struct hstate *h = hstate_inode(inode); struct hugepage_subpool *spool = subpool_inode(inode); struct resv_map *resv_map; @@ -7044,8 +7385,16 @@ bool hugetlb_reserve_pages(struct inode *inode, return true; out_put_pages: - /* put back original number of pages, chg */ - (void)hugepage_subpool_put_pages(spool, chg); + spool_resv = chg - gbl_reserve; + if (spool_resv) { + /* put sub pool's reservation back, chg - gbl_reserve */ + gbl_resv = hugepage_subpool_put_pages(spool, spool_resv); + /* + * subpool's reserved pages can not be put back due to race, + * return to hstate. + */ + hugetlb_acct_memory(h, -gbl_resv); + } out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); @@ -7212,7 +7561,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, spte = hugetlb_walk(svma, saddr, vma_mmu_pagesize(svma)); if (spte) { - get_page(virt_to_page(spte)); + ptdesc_pmd_pts_inc(virt_to_ptdesc(spte)); break; } } @@ -7227,7 +7576,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, (pmd_t *)((unsigned long)spte & PAGE_MASK)); mm_inc_nr_pmds(mm); } else { - put_page(virt_to_page(spte)); + ptdesc_pmd_pts_dec(virt_to_ptdesc(spte)); } spin_unlock(&mm->page_table_lock); out: @@ -7239,10 +7588,6 @@ out: /* * unmap huge page backed by shared pte. * - * Hugetlb pte page is ref counted at the time of mapping. If pte is shared - * indicated by page_count > 1, unmap is achieved by clearing pud and - * decrementing the ref count. If count == 1, the pte page is not shared. - * * Called with page table lock held. * * returns: 1 successfully unmapped a shared pte page @@ -7251,18 +7596,27 @@ out: int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { + unsigned long sz = huge_page_size(hstate_vma(vma)); pgd_t *pgd = pgd_offset(mm, addr); p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud = pud_offset(p4d, addr); i_mmap_assert_write_locked(vma->vm_file->f_mapping); hugetlb_vma_assert_locked(vma); - BUG_ON(page_count(virt_to_page(ptep)) == 0); - if (page_count(virt_to_page(ptep)) == 1) + if (sz != PMD_SIZE) + return 0; + if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep))) return 0; pud_clear(pud); - put_page(virt_to_page(ptep)); + /* + * Once our caller drops the rmap lock, some other process might be + * using this page table as a normal, non-hugetlb page table. + * Wait for pending gup_fast() in other threads to finish before letting + * that happen. + */ + tlb_remove_table_sync_one(); + ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep)); mm_dec_nr_pmds(mm); return 1; } @@ -7397,7 +7751,24 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ -bool isolate_hugetlb(struct folio *folio, struct list_head *list) +/** + * folio_isolate_hugetlb - try to isolate an allocated hugetlb folio + * @folio: the folio to isolate + * @list: the list to add the folio to on success + * + * Isolate an allocated (refcount > 0) hugetlb folio, marking it as + * isolated/non-migratable, and moving it from the active list to the + * given list. + * + * Isolation will fail if @folio is not an allocated hugetlb folio, or if + * it is already isolated/non-migratable. + * + * On success, an additional folio reference is taken that must be dropped + * using folio_putback_hugetlb() to undo the isolation. + * + * Return: True if isolation worked, otherwise False. + */ +bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list) { bool ret = true; @@ -7445,7 +7816,18 @@ int get_huge_page_for_hwpoison(unsigned long pfn, int flags, return ret; } -void folio_putback_active_hugetlb(struct folio *folio) +/** + * folio_putback_hugetlb - unisolate a hugetlb folio + * @folio: the isolated hugetlb folio + * + * Putback/un-isolate the hugetlb folio that was previous isolated using + * folio_isolate_hugetlb(): marking it non-isolated/migratable and putting it + * back onto the active list. + * + * Will drop the additional folio reference obtained through + * folio_isolate_hugetlb(). + */ +void folio_putback_hugetlb(struct folio *folio) { spin_lock_irq(&hugetlb_lock); folio_set_hugetlb_migratable(folio); @@ -7492,11 +7874,28 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re } spin_unlock_irq(&hugetlb_lock); } + + /* + * Our old folio is isolated and has "migratable" cleared until it + * is putback. As migration succeeded, set the new folio "migratable" + * and add it to the active list. + */ + spin_lock_irq(&hugetlb_lock); + folio_set_hugetlb_migratable(new_folio); + list_move_tail(&new_folio->lru, &(folio_hstate(new_folio))->hugepage_activelist); + spin_unlock_irq(&hugetlb_lock); } +/* + * If @take_locks is false, the caller must ensure that no concurrent page table + * access can happen (except for gup_fast() and hardware page walks). + * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like + * concurrent page fault handling) and the file rmap lock. + */ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, - unsigned long end) + unsigned long end, + bool take_locks) { struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); @@ -7520,8 +7919,12 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start, end); mmu_notifier_invalidate_range_start(&range); - hugetlb_vma_lock_write(vma); - i_mmap_lock_write(vma->vm_file->f_mapping); + if (take_locks) { + hugetlb_vma_lock_write(vma); + i_mmap_lock_write(vma->vm_file->f_mapping); + } else { + i_mmap_assert_write_locked(vma->vm_file->f_mapping); + } for (address = start; address < end; address += PUD_SIZE) { ptep = hugetlb_walk(vma, address, sz); if (!ptep) @@ -7531,8 +7934,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, spin_unlock(ptl); } flush_hugetlb_tlb_range(vma, start, end); - i_mmap_unlock_write(vma->vm_file->f_mapping); - hugetlb_vma_unlock_write(vma); + if (take_locks) { + i_mmap_unlock_write(vma->vm_file->f_mapping); + hugetlb_vma_unlock_write(vma); + } /* * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see * Documentation/mm/mmu_notifier.rst. @@ -7547,165 +7952,20 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE), - ALIGN_DOWN(vma->vm_end, PUD_SIZE)); + ALIGN_DOWN(vma->vm_end, PUD_SIZE), + /* take_locks = */ true); } -#ifdef CONFIG_CMA -static bool cma_reserve_called __initdata; - -static int __init cmdline_parse_hugetlb_cma(char *p) -{ - int nid, count = 0; - unsigned long tmp; - char *s = p; - - while (*s) { - if (sscanf(s, "%lu%n", &tmp, &count) != 1) - break; - - if (s[count] == ':') { - if (tmp >= MAX_NUMNODES) - break; - nid = array_index_nospec(tmp, MAX_NUMNODES); - - s += count + 1; - tmp = memparse(s, &s); - hugetlb_cma_size_in_node[nid] = tmp; - hugetlb_cma_size += tmp; - - /* - * Skip the separator if have one, otherwise - * break the parsing. - */ - if (*s == ',') - s++; - else - break; - } else { - hugetlb_cma_size = memparse(p, &p); - break; - } - } - - return 0; -} - -early_param("hugetlb_cma", cmdline_parse_hugetlb_cma); - -void __init hugetlb_cma_reserve(int order) -{ - unsigned long size, reserved, per_node; - bool node_specific_cma_alloc = false; - int nid; - - /* - * HugeTLB CMA reservation is required for gigantic - * huge pages which could not be allocated via the - * page allocator. Just warn if there is any change - * breaking this assumption. - */ - VM_WARN_ON(order <= MAX_PAGE_ORDER); - cma_reserve_called = true; - - if (!hugetlb_cma_size) - return; - - for (nid = 0; nid < MAX_NUMNODES; nid++) { - if (hugetlb_cma_size_in_node[nid] == 0) - continue; - - if (!node_online(nid)) { - pr_warn("hugetlb_cma: invalid node %d specified\n", nid); - hugetlb_cma_size -= hugetlb_cma_size_in_node[nid]; - hugetlb_cma_size_in_node[nid] = 0; - continue; - } - - if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) { - pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n", - nid, (PAGE_SIZE << order) / SZ_1M); - hugetlb_cma_size -= hugetlb_cma_size_in_node[nid]; - hugetlb_cma_size_in_node[nid] = 0; - } else { - node_specific_cma_alloc = true; - } - } - - /* Validate the CMA size again in case some invalid nodes specified. */ - if (!hugetlb_cma_size) - return; - - if (hugetlb_cma_size < (PAGE_SIZE << order)) { - pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n", - (PAGE_SIZE << order) / SZ_1M); - hugetlb_cma_size = 0; - return; - } - - if (!node_specific_cma_alloc) { - /* - * If 3 GB area is requested on a machine with 4 numa nodes, - * let's allocate 1 GB on first three nodes and ignore the last one. - */ - per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes); - pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n", - hugetlb_cma_size / SZ_1M, per_node / SZ_1M); - } - - reserved = 0; - for_each_online_node(nid) { - int res; - char name[CMA_MAX_NAME]; - - if (node_specific_cma_alloc) { - if (hugetlb_cma_size_in_node[nid] == 0) - continue; - - size = hugetlb_cma_size_in_node[nid]; - } else { - size = min(per_node, hugetlb_cma_size - reserved); - } - - size = round_up(size, PAGE_SIZE << order); - - snprintf(name, sizeof(name), "hugetlb%d", nid); - /* - * Note that 'order per bit' is based on smallest size that - * may be returned to CMA allocator in the case of - * huge page demotion. - */ - res = cma_declare_contiguous_nid(0, size, 0, - PAGE_SIZE << order, - HUGETLB_PAGE_ORDER, false, name, - &hugetlb_cma[nid], nid); - if (res) { - pr_warn("hugetlb_cma: reservation failed: err %d, node %d", - res, nid); - continue; - } - - reserved += size; - pr_info("hugetlb_cma: reserved %lu MiB on node %d\n", - size / SZ_1M, nid); - - if (reserved >= hugetlb_cma_size) - break; - } - - if (!reserved) - /* - * hugetlb_cma_size is used to determine if allocations from - * cma are possible. Set to zero if no cma regions are set up. - */ - hugetlb_cma_size = 0; -} - -static void __init hugetlb_cma_check(void) +/* + * For hugetlb, mremap() is an odd edge case - while the VMA copying is + * performed, we permit both the old and new VMAs to reference the same + * reservation. + * + * We fix this up after the operation succeeds, or if a newly allocated VMA + * is closed as a result of a failure to allocate memory. + */ +void fixup_hugetlb_reservations(struct vm_area_struct *vma) { - if (!hugetlb_cma_size || cma_reserve_called) - return; - - pr_warn("hugetlb_cma: the option isn't supported by current arch\n"); + if (is_vm_hugetlb_page(vma)) + clear_vma_resv_huge_pages(vma); } - -#endif /* CONFIG_CMA */ diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index e716c4671a15..58e895f3899a 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -101,10 +101,9 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, int idx; for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) { - struct page_counter *fault_parent = NULL; - struct page_counter *rsvd_parent = NULL; + struct page_counter *fault, *fault_parent = NULL; + struct page_counter *rsvd, *rsvd_parent = NULL; unsigned long limit; - int ret; if (parent_h_cgroup) { fault_parent = hugetlb_cgroup_counter_from_cgroup( @@ -112,24 +111,22 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd( parent_h_cgroup, idx); } - page_counter_init(hugetlb_cgroup_counter_from_cgroup(h_cgroup, - idx), - fault_parent, false); - page_counter_init( - hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx), - rsvd_parent, false); + fault = hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx); + rsvd = hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx); + + page_counter_init(fault, fault_parent, false); + page_counter_init(rsvd, rsvd_parent, false); + + if (!cgroup_subsys_on_dfl(hugetlb_cgrp_subsys)) { + fault->track_failcnt = true; + rsvd->track_failcnt = true; + } limit = round_down(PAGE_COUNTER_MAX, pages_per_huge_page(&hstates[idx])); - ret = page_counter_set_max( - hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx), - limit); - VM_BUG_ON(ret); - ret = page_counter_set_max( - hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx), - limit); - VM_BUG_ON(ret); + VM_BUG_ON(page_counter_set_max(fault, limit)); + VM_BUG_ON(page_counter_set_max(rsvd, limit)); } } @@ -195,24 +192,23 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) * cannot fail. */ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, - struct page *page) + struct folio *folio) { unsigned int nr_pages; struct page_counter *counter; - struct hugetlb_cgroup *page_hcg; + struct hugetlb_cgroup *hcg; struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); - struct folio *folio = page_folio(page); - page_hcg = hugetlb_cgroup_from_folio(folio); + hcg = hugetlb_cgroup_from_folio(folio); /* * We can have pages in active list without any cgroup * ie, hugepage with less than 3 pages. We can safely * ignore those pages. */ - if (!page_hcg || page_hcg != h_cg) + if (!hcg || hcg != h_cg) goto out; - nr_pages = compound_nr(page); + nr_pages = folio_nr_pages(folio); if (!parent) { parent = root_h_cgroup; /* root has no limit */ @@ -235,13 +231,13 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) { struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); struct hstate *h; - struct page *page; + struct folio *folio; do { for_each_hstate(h) { spin_lock_irq(&hugetlb_lock); - list_for_each_entry(page, &h->hugepage_activelist, lru) - hugetlb_cgroup_move_parent(hstate_index(h), h_cg, page); + list_for_each_entry(folio, &h->hugepage_activelist, lru) + hugetlb_cgroup_move_parent(hstate_index(h), h_cg, folio); spin_unlock_irq(&hugetlb_lock); } @@ -917,7 +913,6 @@ void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio) set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd); list_move(&new_folio->lru, &h->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); - return; } static struct cftype hugetlb_files[] = { diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c new file mode 100644 index 000000000000..f58ef4969e7a --- /dev/null +++ b/mm/hugetlb_cma.c @@ -0,0 +1,278 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/mm.h> +#include <linux/cma.h> +#include <linux/compiler.h> +#include <linux/mm_inline.h> + +#include <asm/page.h> +#include <asm/setup.h> + +#include <linux/hugetlb.h> +#include "internal.h" +#include "hugetlb_cma.h" + + +static struct cma *hugetlb_cma[MAX_NUMNODES]; +static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata; +static bool hugetlb_cma_only; +static unsigned long hugetlb_cma_size __initdata; + +void hugetlb_cma_free_folio(struct folio *folio) +{ + int nid = folio_nid(folio); + + WARN_ON_ONCE(!cma_free_folio(hugetlb_cma[nid], folio)); +} + + +struct folio *hugetlb_cma_alloc_folio(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) +{ + int node; + int order = huge_page_order(h); + struct folio *folio = NULL; + + if (hugetlb_cma[nid]) + folio = cma_alloc_folio(hugetlb_cma[nid], order, gfp_mask); + + if (!folio && !(gfp_mask & __GFP_THISNODE)) { + for_each_node_mask(node, *nodemask) { + if (node == nid || !hugetlb_cma[node]) + continue; + + folio = cma_alloc_folio(hugetlb_cma[node], order, gfp_mask); + if (folio) + break; + } + } + + if (folio) + folio_set_hugetlb_cma(folio); + + return folio; +} + +struct huge_bootmem_page * __init +hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid, bool node_exact) +{ + struct cma *cma; + struct huge_bootmem_page *m; + int node = *nid; + + cma = hugetlb_cma[*nid]; + m = cma_reserve_early(cma, huge_page_size(h)); + if (!m) { + if (node_exact) + return NULL; + + for_each_node_mask(node, hugetlb_bootmem_nodes) { + cma = hugetlb_cma[node]; + if (!cma || node == *nid) + continue; + m = cma_reserve_early(cma, huge_page_size(h)); + if (m) { + *nid = node; + break; + } + } + } + + if (m) { + m->flags = HUGE_BOOTMEM_CMA; + m->cma = cma; + } + + return m; +} + + +static bool cma_reserve_called __initdata; + +static int __init cmdline_parse_hugetlb_cma(char *p) +{ + int nid, count = 0; + unsigned long tmp; + char *s = p; + + while (*s) { + if (sscanf(s, "%lu%n", &tmp, &count) != 1) + break; + + if (s[count] == ':') { + if (tmp >= MAX_NUMNODES) + break; + nid = array_index_nospec(tmp, MAX_NUMNODES); + + s += count + 1; + tmp = memparse(s, &s); + hugetlb_cma_size_in_node[nid] = tmp; + hugetlb_cma_size += tmp; + + /* + * Skip the separator if have one, otherwise + * break the parsing. + */ + if (*s == ',') + s++; + else + break; + } else { + hugetlb_cma_size = memparse(p, &p); + break; + } + } + + return 0; +} + +early_param("hugetlb_cma", cmdline_parse_hugetlb_cma); + +static int __init cmdline_parse_hugetlb_cma_only(char *p) +{ + return kstrtobool(p, &hugetlb_cma_only); +} + +early_param("hugetlb_cma_only", cmdline_parse_hugetlb_cma_only); + +void __init hugetlb_cma_reserve(int order) +{ + unsigned long size, reserved, per_node; + bool node_specific_cma_alloc = false; + int nid; + + /* + * HugeTLB CMA reservation is required for gigantic + * huge pages which could not be allocated via the + * page allocator. Just warn if there is any change + * breaking this assumption. + */ + VM_WARN_ON(order <= MAX_PAGE_ORDER); + cma_reserve_called = true; + + if (!hugetlb_cma_size) + return; + + hugetlb_bootmem_set_nodes(); + + for (nid = 0; nid < MAX_NUMNODES; nid++) { + if (hugetlb_cma_size_in_node[nid] == 0) + continue; + + if (!node_isset(nid, hugetlb_bootmem_nodes)) { + pr_warn("hugetlb_cma: invalid node %d specified\n", nid); + hugetlb_cma_size -= hugetlb_cma_size_in_node[nid]; + hugetlb_cma_size_in_node[nid] = 0; + continue; + } + + if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) { + pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n", + nid, (PAGE_SIZE << order) / SZ_1M); + hugetlb_cma_size -= hugetlb_cma_size_in_node[nid]; + hugetlb_cma_size_in_node[nid] = 0; + } else { + node_specific_cma_alloc = true; + } + } + + /* Validate the CMA size again in case some invalid nodes specified. */ + if (!hugetlb_cma_size) + return; + + if (hugetlb_cma_size < (PAGE_SIZE << order)) { + pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n", + (PAGE_SIZE << order) / SZ_1M); + hugetlb_cma_size = 0; + return; + } + + if (!node_specific_cma_alloc) { + /* + * If 3 GB area is requested on a machine with 4 numa nodes, + * let's allocate 1 GB on first three nodes and ignore the last one. + */ + per_node = DIV_ROUND_UP(hugetlb_cma_size, + nodes_weight(hugetlb_bootmem_nodes)); + pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n", + hugetlb_cma_size / SZ_1M, per_node / SZ_1M); + } + + reserved = 0; + for_each_node_mask(nid, hugetlb_bootmem_nodes) { + int res; + char name[CMA_MAX_NAME]; + + if (node_specific_cma_alloc) { + if (hugetlb_cma_size_in_node[nid] == 0) + continue; + + size = hugetlb_cma_size_in_node[nid]; + } else { + size = min(per_node, hugetlb_cma_size - reserved); + } + + size = round_up(size, PAGE_SIZE << order); + + snprintf(name, sizeof(name), "hugetlb%d", nid); + /* + * Note that 'order per bit' is based on smallest size that + * may be returned to CMA allocator in the case of + * huge page demotion. + */ + res = cma_declare_contiguous_multi(size, PAGE_SIZE << order, + HUGETLB_PAGE_ORDER, name, + &hugetlb_cma[nid], nid); + if (res) { + pr_warn("hugetlb_cma: reservation failed: err %d, node %d", + res, nid); + continue; + } + + reserved += size; + pr_info("hugetlb_cma: reserved %lu MiB on node %d\n", + size / SZ_1M, nid); + + if (reserved >= hugetlb_cma_size) + break; + } + + if (!reserved) + /* + * hugetlb_cma_size is used to determine if allocations from + * cma are possible. Set to zero if no cma regions are set up. + */ + hugetlb_cma_size = 0; +} + +void __init hugetlb_cma_check(void) +{ + if (!hugetlb_cma_size || cma_reserve_called) + return; + + pr_warn("hugetlb_cma: the option isn't supported by current arch\n"); +} + +bool hugetlb_cma_exclusive_alloc(void) +{ + return hugetlb_cma_only; +} + +unsigned long __init hugetlb_cma_total_size(void) +{ + return hugetlb_cma_size; +} + +void __init hugetlb_cma_validate_params(void) +{ + if (!hugetlb_cma_size) + hugetlb_cma_only = false; +} + +bool __init hugetlb_early_cma(struct hstate *h) +{ + if (arch_has_huge_bootmem_alloc()) + return false; + + return hstate_is_gigantic(h) && hugetlb_cma_only; +} diff --git a/mm/hugetlb_cma.h b/mm/hugetlb_cma.h new file mode 100644 index 000000000000..f7d7fb9880a2 --- /dev/null +++ b/mm/hugetlb_cma.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_HUGETLB_CMA_H +#define _LINUX_HUGETLB_CMA_H + +#ifdef CONFIG_CMA +void hugetlb_cma_free_folio(struct folio *folio); +struct folio *hugetlb_cma_alloc_folio(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nodemask); +struct huge_bootmem_page *hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid, + bool node_exact); +void hugetlb_cma_check(void); +bool hugetlb_cma_exclusive_alloc(void); +unsigned long hugetlb_cma_total_size(void); +void hugetlb_cma_validate_params(void); +bool hugetlb_early_cma(struct hstate *h); +#else +static inline void hugetlb_cma_free_folio(struct folio *folio) +{ +} + +static inline struct folio *hugetlb_cma_alloc_folio(struct hstate *h, + gfp_t gfp_mask, int nid, nodemask_t *nodemask) +{ + return NULL; +} + +static inline +struct huge_bootmem_page *hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid, + bool node_exact) +{ + return NULL; +} + +static inline void hugetlb_cma_check(void) +{ +} + +static inline bool hugetlb_cma_exclusive_alloc(void) +{ + return false; +} + +static inline unsigned long hugetlb_cma_total_size(void) +{ + return 0; +} + +static inline void hugetlb_cma_validate_params(void) +{ +} + +static inline bool hugetlb_early_cma(struct hstate *h) +{ + return false; +} +#endif +#endif diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 57b7f591eee8..27245e86df25 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -238,11 +238,11 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, * struct page, the special metadata (e.g. page->flags or page->mapping) * cannot copy to the tail struct page structs. The invalid value will be * checked in the free_tail_page_prepare(). In order to avoid the message - * of "corrupted mapping in tail page". We need to reset at least 3 (one - * head struct page struct and two tail struct page structs) struct page + * of "corrupted mapping in tail page". We need to reset at least 4 (one + * head struct page struct and three tail struct page structs) struct page * structs. */ -#define NR_RESET_STRUCT_PAGE 3 +#define NR_RESET_STRUCT_PAGE 4 static inline void reset_struct_pages(struct page *start) { @@ -444,7 +444,11 @@ DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); -core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0); +static int __init hugetlb_vmemmap_optimize_param(char *buf) +{ + return kstrtobool(buf, &vmemmap_optimize_enabled); +} +early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_optimize_param); static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio, unsigned long flags) @@ -645,14 +649,39 @@ static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *fol return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse); } -void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list) +static void __hugetlb_vmemmap_optimize_folios(struct hstate *h, + struct list_head *folio_list, + bool boot) { struct folio *folio; + int nr_to_optimize; LIST_HEAD(vmemmap_pages); unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU; + nr_to_optimize = 0; list_for_each_entry(folio, folio_list, lru) { - int ret = hugetlb_vmemmap_split_folio(h, folio); + int ret; + unsigned long spfn, epfn; + + if (boot && folio_test_hugetlb_vmemmap_optimized(folio)) { + /* + * Already optimized by pre-HVO, just map the + * mirrored tail page structs RO. + */ + spfn = (unsigned long)&folio->page; + epfn = spfn + pages_per_huge_page(h); + vmemmap_wrprotect_hvo(spfn, epfn, folio_nid(folio), + HUGETLB_VMEMMAP_RESERVE_SIZE); + register_page_bootmem_memmap(pfn_to_section_nr(spfn), + &folio->page, + HUGETLB_VMEMMAP_RESERVE_SIZE); + static_branch_inc(&hugetlb_optimize_vmemmap_key); + continue; + } + + nr_to_optimize++; + + ret = hugetlb_vmemmap_split_folio(h, folio); /* * Spliting the PMD requires allocating a page, thus lets fail @@ -664,6 +693,16 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l break; } + if (!nr_to_optimize) + /* + * All pre-HVO folios, nothing left to do. It's ok if + * there is a mix of pre-HVO and not yet HVO-ed folios + * here, as __hugetlb_vmemmap_optimize_folio() will + * skip any folios that already have the optimized flag + * set, see vmemmap_should_optimize_folio(). + */ + goto out; + flush_tlb_all(); list_for_each_entry(folio, folio_list, lru) { @@ -689,11 +728,165 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l } } +out: flush_tlb_all(); free_vmemmap_page_list(&vmemmap_pages); } -static struct ctl_table hugetlb_vmemmap_sysctls[] = { +void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list) +{ + __hugetlb_vmemmap_optimize_folios(h, folio_list, false); +} + +void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list) +{ + __hugetlb_vmemmap_optimize_folios(h, folio_list, true); +} + +#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT + +/* Return true of a bootmem allocated HugeTLB page should be pre-HVO-ed */ +static bool vmemmap_should_optimize_bootmem_page(struct huge_bootmem_page *m) +{ + unsigned long section_size, psize, pmd_vmemmap_size; + phys_addr_t paddr; + + if (!READ_ONCE(vmemmap_optimize_enabled)) + return false; + + if (!hugetlb_vmemmap_optimizable(m->hstate)) + return false; + + psize = huge_page_size(m->hstate); + paddr = virt_to_phys(m); + + /* + * Pre-HVO only works if the bootmem huge page + * is aligned to the section size. + */ + section_size = (1UL << PA_SECTION_SHIFT); + if (!IS_ALIGNED(paddr, section_size) || + !IS_ALIGNED(psize, section_size)) + return false; + + /* + * The pre-HVO code does not deal with splitting PMDS, + * so the bootmem page must be aligned to the number + * of base pages that can be mapped with one vmemmap PMD. + */ + pmd_vmemmap_size = (PMD_SIZE / (sizeof(struct page))) << PAGE_SHIFT; + if (!IS_ALIGNED(paddr, pmd_vmemmap_size) || + !IS_ALIGNED(psize, pmd_vmemmap_size)) + return false; + + return true; +} + +/* + * Initialize memmap section for a gigantic page, HVO-style. + */ +void __init hugetlb_vmemmap_init_early(int nid) +{ + unsigned long psize, paddr, section_size; + unsigned long ns, i, pnum, pfn, nr_pages; + unsigned long start, end; + struct huge_bootmem_page *m = NULL; + void *map; + + /* + * Noting to do if bootmem pages were not allocated + * early in boot, or if HVO wasn't enabled in the + * first place. + */ + if (!hugetlb_bootmem_allocated()) + return; + + if (!READ_ONCE(vmemmap_optimize_enabled)) + return; + + section_size = (1UL << PA_SECTION_SHIFT); + + list_for_each_entry(m, &huge_boot_pages[nid], list) { + if (!vmemmap_should_optimize_bootmem_page(m)) + continue; + + nr_pages = pages_per_huge_page(m->hstate); + psize = nr_pages << PAGE_SHIFT; + paddr = virt_to_phys(m); + pfn = PHYS_PFN(paddr); + map = pfn_to_page(pfn); + start = (unsigned long)map; + end = start + nr_pages * sizeof(struct page); + + if (vmemmap_populate_hvo(start, end, nid, + HUGETLB_VMEMMAP_RESERVE_SIZE) < 0) + continue; + + memmap_boot_pages_add(HUGETLB_VMEMMAP_RESERVE_SIZE / PAGE_SIZE); + + pnum = pfn_to_section_nr(pfn); + ns = psize / section_size; + + for (i = 0; i < ns; i++) { + sparse_init_early_section(nid, map, pnum, + SECTION_IS_VMEMMAP_PREINIT); + map += section_map_size(); + pnum++; + } + + m->flags |= HUGE_BOOTMEM_HVO; + } +} + +void __init hugetlb_vmemmap_init_late(int nid) +{ + struct huge_bootmem_page *m, *tm; + unsigned long phys, nr_pages, start, end; + unsigned long pfn, nr_mmap; + struct hstate *h; + void *map; + + if (!hugetlb_bootmem_allocated()) + return; + + if (!READ_ONCE(vmemmap_optimize_enabled)) + return; + + list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) { + if (!(m->flags & HUGE_BOOTMEM_HVO)) + continue; + + phys = virt_to_phys(m); + h = m->hstate; + pfn = PHYS_PFN(phys); + nr_pages = pages_per_huge_page(h); + + if (!hugetlb_bootmem_page_zones_valid(nid, m)) { + /* + * Oops, the hugetlb page spans multiple zones. + * Remove it from the list, and undo HVO. + */ + list_del(&m->list); + + map = pfn_to_page(pfn); + + start = (unsigned long)map; + end = start + nr_pages * sizeof(struct page); + + vmemmap_undo_hvo(start, end, nid, + HUGETLB_VMEMMAP_RESERVE_SIZE); + nr_mmap = end - start - HUGETLB_VMEMMAP_RESERVE_SIZE; + memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE)); + + memblock_phys_free(phys, huge_page_size(h)); + continue; + } else + m->flags |= HUGE_BOOTMEM_ZONES_VALID; + } +} +#endif + +static const struct ctl_table hugetlb_vmemmap_sysctls[] = { { .procname = "hugetlb_optimize_vmemmap", .data = &vmemmap_optimize_enabled, diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index 2fcae92d3359..18b490825215 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -9,6 +9,8 @@ #ifndef _LINUX_HUGETLB_VMEMMAP_H #define _LINUX_HUGETLB_VMEMMAP_H #include <linux/hugetlb.h> +#include <linux/io.h> +#include <linux/memblock.h> /* * Reserve one vmemmap page, all vmemmap addresses are mapped to it. See @@ -24,6 +26,12 @@ long hugetlb_vmemmap_restore_folios(const struct hstate *h, struct list_head *non_hvo_folios); void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio); void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list); +void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list); +#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT +void hugetlb_vmemmap_init_early(int nid); +void hugetlb_vmemmap_init_late(int nid); +#endif + static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h) { @@ -48,7 +56,7 @@ static inline int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct f return 0; } -static long hugetlb_vmemmap_restore_folios(const struct hstate *h, +static inline long hugetlb_vmemmap_restore_folios(const struct hstate *h, struct list_head *folio_list, struct list_head *non_hvo_folios) { @@ -64,6 +72,19 @@ static inline void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list { } +static inline void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, + struct list_head *folio_list) +{ +} + +static inline void hugetlb_vmemmap_init_early(int nid) +{ +} + +static inline void hugetlb_vmemmap_init_late(int nid) +{ +} + static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h) { return 0; diff --git a/mm/init-mm.c b/mm/init-mm.c index 24c809379274..4600e7605cab 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -40,7 +40,8 @@ struct mm_struct init_mm = { .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), #ifdef CONFIG_PER_VMA_LOCK - .mm_lock_seq = 0, + .vma_writer_wait = __RCUWAIT_INITIALIZER(init_mm.vma_writer_wait), + .mm_lock_seq = SEQCNT_ZERO(init_mm.mm_lock_seq), #endif .user_ns = &init_user_ns, .cpu_bitmap = CPU_BITS_NONE, diff --git a/mm/internal.h b/mm/internal.h index cb8d8e8e3ffa..6b8ed2017743 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -25,6 +25,47 @@ struct folio_batch; /* + * Maintains state across a page table move. The operation assumes both source + * and destination VMAs already exist and are specified by the user. + * + * Partial moves are permitted, but the old and new ranges must both reside + * within a VMA. + * + * mmap lock must be held in write and VMA write locks must be held on any VMA + * that is visible. + * + * Use the PAGETABLE_MOVE() macro to initialise this struct. + * + * The old_addr and new_addr fields are updated as the page table move is + * executed. + * + * NOTE: The page table move is affected by reading from [old_addr, old_end), + * and old_addr may be updated for better page table alignment, so len_in + * represents the length of the range being copied as specified by the user. + */ +struct pagetable_move_control { + struct vm_area_struct *old; /* Source VMA. */ + struct vm_area_struct *new; /* Destination VMA. */ + unsigned long old_addr; /* Address from which the move begins. */ + unsigned long old_end; /* Exclusive address at which old range ends. */ + unsigned long new_addr; /* Address to move page tables to. */ + unsigned long len_in; /* Bytes to remap specified by user. */ + + bool need_rmap_locks; /* Do rmap locks need to be taken? */ + bool for_stack; /* Is this an early temp stack being moved? */ +}; + +#define PAGETABLE_MOVE(name, old_, new_, old_addr_, new_addr_, len_) \ + struct pagetable_move_control name = { \ + .old = old_, \ + .new = new_, \ + .old_addr = old_addr_, \ + .old_end = (old_addr_) + (len_), \ + .new_addr = new_addr_, \ + .len_in = len_, \ + } + +/* * The set of flags that only affect watermark checking and reclaim * behaviour. This is used by the MM to obey the caller constraints * about IO, FS and watermark checking while ignoring placement @@ -84,6 +125,8 @@ void page_writeback_init(void); */ static inline int folio_nr_pages_mapped(const struct folio *folio) { + if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) + return -1; return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED; } @@ -205,11 +248,9 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags, bool *any_writable, bool *any_young, bool *any_dirty) { - unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); - const pte_t *end_ptep = start_ptep + max_nr; pte_t expected_pte, *ptep; bool writable, young, dirty; - int nr; + int nr, cur_nr; if (any_writable) *any_writable = false; @@ -222,11 +263,15 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio); VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio); + /* Limit max_nr to the actual remaining PFNs in the folio we could batch. */ + max_nr = min_t(unsigned long, max_nr, + folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte)); + nr = pte_batch_hint(start_ptep, pte); expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags); ptep = start_ptep + nr; - while (ptep < end_ptep) { + while (nr < max_nr) { pte = ptep_get(ptep); if (any_writable) writable = !!pte_write(pte); @@ -239,14 +284,6 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, if (!pte_same(pte, expected_pte)) break; - /* - * Stop immediately once we reached the end of the folio. In - * corner cases the next PFN might fall into a different - * folio. - */ - if (pte_pfn(pte) >= folio_end_pfn) - break; - if (any_writable) *any_writable |= writable; if (any_young) @@ -254,12 +291,13 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, if (any_dirty) *any_dirty |= dirty; - nr = pte_batch_hint(ptep, pte); - expected_pte = pte_advance_pfn(expected_pte, nr); - ptep += nr; + cur_nr = pte_batch_hint(ptep, pte); + expected_pte = pte_advance_pfn(expected_pte, cur_nr); + ptep += cur_nr; + nr += cur_nr; } - return min(ptep - start_ptep, max_nr); + return min(nr, max_nr); } /** @@ -392,6 +430,11 @@ void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details); +void zap_page_range_single_batched(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long addr, + unsigned long size, struct zap_details *details); +int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, + gfp_t gfp); void page_cache_ra_order(struct readahead_control *, struct file_ra_state *, unsigned int order); @@ -491,6 +534,7 @@ extern char * const zone_names[MAX_NR_ZONES]; DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled); extern int min_free_kbytes; +extern int defrag_mode; void setup_per_zone_wmarks(void); void calculate_min_free_kbytes(void); @@ -656,6 +700,8 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, } void set_zone_contiguous(struct zone *zone); +bool pfn_range_intersects_zones(int nid, unsigned long start_pfn, + unsigned long nr_pages); static inline void clear_zone_contiguous(struct zone *zone) { @@ -680,8 +726,8 @@ static inline void folio_set_order(struct folio *folio, unsigned int order) return; folio->_flags_1 = (folio->_flags_1 & ~0xffUL) | order; -#ifdef CONFIG_64BIT - folio->_folio_nr_pages = 1U << order; +#ifdef NR_PAGES_IN_LARGE_FOLIO + folio->_nr_pages = 1U << order; #endif } @@ -717,9 +763,17 @@ static inline void prep_compound_head(struct page *page, unsigned int order) folio_set_order(folio, order); atomic_set(&folio->_large_mapcount, -1); - atomic_set(&folio->_entire_mapcount, -1); - atomic_set(&folio->_nr_pages_mapped, 0); - atomic_set(&folio->_pincount, 0); + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) + atomic_set(&folio->_nr_pages_mapped, 0); + if (IS_ENABLED(CONFIG_MM_ID)) { + folio->_mm_ids = 0; + folio->_mm_id_mapcount[0] = -1; + folio->_mm_id_mapcount[1] = -1; + } + if (IS_ENABLED(CONFIG_64BIT) || order > 1) { + atomic_set(&folio->_pincount, 0); + atomic_set(&folio->_entire_mapcount, -1); + } if (order > 1) INIT_LIST_HEAD(&folio->_deferred_list); } @@ -733,17 +787,30 @@ static inline void prep_compound_tail(struct page *head, int tail_idx) set_page_private(p, 0); } -extern void prep_compound_page(struct page *page, unsigned int order); - -extern void post_alloc_hook(struct page *page, unsigned int order, - gfp_t gfp_flags); +void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern bool free_pages_prepare(struct page *page, unsigned int order); extern int user_min_free_kbytes; -void free_unref_page(struct page *page, unsigned int order); +struct page *__alloc_frozen_pages_noprof(gfp_t, unsigned int order, int nid, + nodemask_t *); +#define __alloc_frozen_pages(...) \ + alloc_hooks(__alloc_frozen_pages_noprof(__VA_ARGS__)) +void free_frozen_pages(struct page *page, unsigned int order); void free_unref_folios(struct folio_batch *fbatch); +#ifdef CONFIG_NUMA +struct page *alloc_frozen_pages_noprof(gfp_t, unsigned int order); +#else +static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order) +{ + return __alloc_frozen_pages_noprof(gfp, order, numa_node_id(), NULL); +} +#endif + +#define alloc_frozen_pages(...) \ + alloc_hooks(alloc_frozen_pages_noprof(__VA_ARGS__)) + extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_enable(struct zone *zone); @@ -824,17 +891,29 @@ int isolate_migratepages_range(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); -int __alloc_contig_migrate_range(struct compact_control *cc, - unsigned long start, unsigned long end, - int migratetype); - /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ void init_cma_reserved_pageblock(struct page *page); #endif /* CONFIG_COMPACTION || CONFIG_CMA */ +struct cma; + +#ifdef CONFIG_CMA +void *cma_reserve_early(struct cma *cma, unsigned long size); +void init_cma_pageblock(struct page *page); +#else +static inline void *cma_reserve_early(struct cma *cma, unsigned long size) +{ + return NULL; +} +static inline void init_cma_pageblock(struct page *page) +{ +} +#endif + + int find_suitable_fallback(struct free_area *area, unsigned int order, - int migratetype, bool only_stealable, bool *can_steal); + int migratetype, bool claimable); static inline bool free_area_empty(struct free_area *area, int migratetype) { @@ -1040,6 +1119,8 @@ DECLARE_STATIC_KEY_TRUE(deferred_pages); bool __init deferred_grow_zone(struct zone *zone, unsigned int order); #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ +void init_deferred_page(unsigned long pfn, int nid); + enum mminit_level { MMINIT_WARNING, MMINIT_VERIFY, @@ -1084,9 +1165,13 @@ static inline void mminit_verify_zonelist(void) #define NODE_RECLAIM_SUCCESS 1 #ifdef CONFIG_NUMA +extern int node_reclaim_mode; + extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int); extern int find_next_best_node(int node, nodemask_t *used_node_mask); #else +#define node_reclaim_mode 0 + static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask, unsigned int order) { @@ -1098,11 +1183,17 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask) } #endif +static inline bool node_reclaim_enabled(void) +{ + /* Is any node_reclaim_mode bit set? */ + return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP); +} + /* * mm/memory-failure.c */ #ifdef CONFIG_MEMORY_FAILURE -void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu); +int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill); void shake_folio(struct folio *folio); extern int hwpoison_filter(struct page *p); @@ -1125,8 +1216,9 @@ unsigned long page_mapped_in_vma(const struct page *page, struct vm_area_struct *vma); #else -static inline void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) +static inline int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill) { + return -EBUSY; } #endif @@ -1174,6 +1266,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_NOFRAGMENT 0x0 #endif #define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */ +#define ALLOC_TRYLOCK 0x400 /* Only use spin_trylock in allocation path */ #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ /* Flags that allow allocations below the min watermark. */ @@ -1285,12 +1378,6 @@ void touch_pud(struct vm_area_struct *vma, unsigned long addr, void touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, bool write); -static inline bool alloc_zeroed(void) -{ - return static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, - &init_on_alloc); -} - /* * Parses a string with mem suffixes into its order. Useful to parse kernel * parameters. @@ -1400,7 +1487,8 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma, } extern bool mirrored_kernelcore; -extern bool memblock_has_mirror(void); +bool memblock_has_mirror(void); +void memblock_free_all(void); static __always_inline void vma_set_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, @@ -1441,27 +1529,12 @@ static inline bool pte_needs_soft_dirty_wp(struct vm_area_struct *vma, pte_t pte void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long zone, int nid); +void __meminit __init_page_from_nid(unsigned long pfn, int nid); /* shrinker related functions */ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority); -#ifdef CONFIG_64BIT -static inline int can_do_mseal(unsigned long flags) -{ - if (flags) - return -EINVAL; - - return 0; -} - -#else -static inline int can_do_mseal(unsigned long flags) -{ - return -EPERM; -} -#endif - #ifdef CONFIG_SHRINKER_DEBUG static inline __printf(2, 0) int shrinker_debugfs_name_alloc( struct shrinker *shrinker, const char *fmt, va_list ap) @@ -1510,12 +1583,15 @@ static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry, /* Only track the nodes of mappings with shadow entries */ void workingset_update_node(struct xa_node *node); extern struct list_lru shadow_nodes; +#define mapping_set_update(xas, mapping) do { \ + if (!dax_mapping(mapping) && !shmem_mapping(mapping)) { \ + xas_set_update(xas, workingset_update_node); \ + xas_set_lru(xas, &shadow_nodes); \ + } \ +} while (0) /* mremap.c */ -unsigned long move_page_tables(struct vm_area_struct *vma, - unsigned long old_addr, struct vm_area_struct *new_vma, - unsigned long new_addr, unsigned long len, - bool need_rmap_locks, bool for_stack); +unsigned long move_page_tables(struct pagetable_move_control *pmc); #ifdef CONFIG_UNACCEPTED_MEMORY void accept_page(struct page *page); @@ -1530,4 +1606,25 @@ int walk_page_range_mm(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); +/* pt_reclaim.c */ +bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval); +void free_pte(struct mm_struct *mm, unsigned long addr, struct mmu_gather *tlb, + pmd_t pmdval); +void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, + struct mmu_gather *tlb); + +#ifdef CONFIG_PT_RECLAIM +bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, + struct zap_details *details); +#else +static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, + struct zap_details *details) +{ + return false; +} +#endif /* CONFIG_PT_RECLAIM */ + +void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm); +int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm); + #endif /* __MM_INTERNAL_H */ diff --git a/mm/io-mapping.c b/mm/io-mapping.c index 01b362799930..d3586e95c12c 100644 --- a/mm/io-mapping.c +++ b/mm/io-mapping.c @@ -21,9 +21,10 @@ int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma, if (WARN_ON_ONCE((vma->vm_flags & expected_flags) != expected_flags)) return -EINVAL; - /* We rely on prevalidation of the io-mapping to skip track_pfn(). */ - return remap_pfn_range_notrack(vma, addr, pfn, size, - __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) | - (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK))); + pgprot_t remap_prot = __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) | + (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK)); + + /* We rely on prevalidation of the io-mapping to skip pfnmap tracking. */ + return remap_pfn_range_notrack(vma, addr, pfn, size, remap_prot); } EXPORT_SYMBOL_GPL(io_mapping_map_user); diff --git a/mm/ioremap.c b/mm/ioremap.c index 3e049dfb28bd..c36dd9f62fd5 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -50,9 +50,9 @@ void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size, #ifndef ioremap_prot void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, - unsigned long prot) + pgprot_t prot) { - return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); + return generic_ioremap_prot(phys_addr, size, prot); } EXPORT_SYMBOL(ioremap_prot); #endif diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 1a958e7c8a46..dd93ae8a6beb 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -35,7 +35,7 @@ CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) -CFLAGS_KASAN_TEST := $(CFLAGS_KASAN) $(call cc-disable-warning, vla) +CFLAGS_KASAN_TEST := $(CFLAGS_KASAN) ifndef CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX # If compiler instruments memintrinsics by prefixing them with __asan/__hwasan, # we need to treat them normally (as builtins), otherwise the compiler won't @@ -44,6 +44,7 @@ ifndef CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX CFLAGS_KASAN_TEST += -fno-builtin endif +CFLAGS_REMOVE_kasan_test_c.o += $(call cc-option, -Wvla-larger-than=1) CFLAGS_kasan_test_c.o := $(CFLAGS_KASAN_TEST) RUSTFLAGS_kasan_test_rust.o := $(RUSTFLAGS_KASAN) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 8b9e348113b1..d54e89f8c3e7 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -524,7 +524,11 @@ size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object) sizeof(struct kasan_free_meta) : 0); } -static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags) +/* + * This function avoids dynamic memory allocations and thus can be called from + * contexts that do not allow allocating memory. + */ +void kasan_record_aux_stack(void *addr) { struct slab *slab = kasan_addr_to_slab(addr); struct kmem_cache *cache; @@ -541,17 +545,7 @@ static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags) return; alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0]; - alloc_meta->aux_stack[0] = kasan_save_stack(0, depot_flags); -} - -void kasan_record_aux_stack(void *addr) -{ - return __kasan_record_aux_stack(addr, STACK_DEPOT_FLAG_CAN_ALLOC); -} - -void kasan_record_aux_stack_noalloc(void *addr) -{ - return __kasan_record_aux_stack(addr, 0); + alloc_meta->aux_stack[0] = kasan_save_stack(0, 0); } void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index ccd66c7a4081..9a6927394b54 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -16,6 +16,7 @@ #include <linux/mm.h> #include <linux/static_key.h> #include <linux/string.h> +#include <linux/string_choices.h> #include <linux/types.h> #include <linux/vmalloc.h> @@ -263,8 +264,8 @@ void __init kasan_init_hw_tags(void) pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, vmalloc=%s, stacktrace=%s)\n", kasan_mode_info(), - kasan_vmalloc_enabled() ? "on" : "off", - kasan_stack_collection_enabled() ? "on" : "off"); + str_on_off(kasan_vmalloc_enabled()), + str_on_off(kasan_stack_collection_enabled())); } #ifdef CONFIG_KASAN_VMALLOC diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index b7e4b81421b3..129178be5e64 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -501,18 +501,18 @@ static inline bool kasan_byte_accessible(const void *addr) /** * kasan_poison - mark the memory range as inaccessible - * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE - * @size - range size, must be aligned to KASAN_GRANULE_SIZE - * @value - value that's written to metadata for the range - * @init - whether to initialize the memory range (only for hardware tag-based) + * @addr: range start address, must be aligned to KASAN_GRANULE_SIZE + * @size: range size, must be aligned to KASAN_GRANULE_SIZE + * @value: value that's written to metadata for the range + * @init: whether to initialize the memory range (only for hardware tag-based) */ void kasan_poison(const void *addr, size_t size, u8 value, bool init); /** * kasan_unpoison - mark the memory range as accessible - * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE - * @size - range size, can be unaligned - * @init - whether to initialize the memory range (only for hardware tag-based) + * @addr: range start address, must be aligned to KASAN_GRANULE_SIZE + * @size: range size, can be unaligned + * @init: whether to initialize the memory range (only for hardware tag-based) * * For the tag-based modes, the @size gets aligned to KASAN_GRANULE_SIZE before * marking the range. @@ -530,8 +530,8 @@ bool kasan_byte_accessible(const void *addr); /** * kasan_poison_last_granule - mark the last granule of the memory range as * inaccessible - * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE - * @size - range size + * @address: range start address, must be aligned to KASAN_GRANULE_SIZE + * @size: range size * * This function is only available for the generic mode, as it's the only mode * that has partially poisoned memory granules. diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c index e0ec5a6d15be..5f922dd38ffa 100644 --- a/mm/kasan/kasan_test_c.c +++ b/mm/kasan/kasan_test_c.c @@ -33,7 +33,7 @@ #define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_GRANULE_SIZE) -MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING); +MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING"); static bool multishot; @@ -47,8 +47,8 @@ static struct { * Some tests use these global variables to store return values from function * calls that could otherwise be eliminated by the compiler as dead code. */ -void *kasan_ptr_result; -int kasan_int_result; +static volatile void *kasan_ptr_result; +static volatile int kasan_int_result; /* Probe for console output: obtains test_status lines of interest. */ static void probe_console(void *ignore, const char *buf, size_t len) @@ -1073,14 +1073,11 @@ static void kmem_cache_rcu_uaf(struct kunit *test) kmem_cache_destroy(cache); } -static void empty_cache_ctor(void *object) { } - static void kmem_cache_double_destroy(struct kunit *test) { struct kmem_cache *cache; - /* Provide a constructor to prevent cache merging. */ - cache = kmem_cache_create("test_cache", 200, 0, 0, empty_cache_ctor); + cache = kmem_cache_create("test_cache", 200, 0, SLAB_NO_MERGE, NULL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); kmem_cache_destroy(cache); KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_destroy(cache)); @@ -1570,6 +1567,7 @@ static void kasan_memcmp(struct kunit *test) static void kasan_strings(struct kunit *test) { char *ptr; + char *src; size_t size = 24; /* @@ -1581,6 +1579,25 @@ static void kasan_strings(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + src = kmalloc(KASAN_GRANULE_SIZE, GFP_KERNEL | __GFP_ZERO); + strscpy(src, "f0cacc1a0000000", KASAN_GRANULE_SIZE); + + /* + * Make sure that strscpy() does not trigger KASAN if it overreads into + * poisoned memory. + * + * The expected size does not include the terminator '\0' + * so it is (KASAN_GRANULE_SIZE - 2) == + * KASAN_GRANULE_SIZE - ("initial removed character" + "\0"). + */ + KUNIT_EXPECT_EQ(test, KASAN_GRANULE_SIZE - 2, + strscpy(ptr, src + 1, KASAN_GRANULE_SIZE)); + + /* strscpy should fail if the first byte is unreadable. */ + KUNIT_EXPECT_KASAN_FAIL(test, strscpy(ptr, src + KASAN_GRANULE_SIZE, + KASAN_GRANULE_SIZE)); + + kfree(src); kfree(ptr); /* @@ -2130,4 +2147,5 @@ static struct kunit_suite kasan_kunit_test_suite = { kunit_test_suite(kasan_kunit_test_suite); +MODULE_DESCRIPTION("KUnit tests for checking KASAN bug-detection capabilities"); MODULE_LICENSE("GPL"); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 50fb19ad4388..8357e1a33699 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -201,7 +201,7 @@ static inline void fail_non_kasan_kunit_test(void) { } #endif /* CONFIG_KUNIT */ -static DEFINE_SPINLOCK(report_lock); +static DEFINE_RAW_SPINLOCK(report_lock); static void start_report(unsigned long *flags, bool sync) { @@ -212,7 +212,7 @@ static void start_report(unsigned long *flags, bool sync) lockdep_off(); /* Make sure we don't end up in loop. */ report_suppress_start(); - spin_lock_irqsave(&report_lock, *flags); + raw_spin_lock_irqsave(&report_lock, *flags); pr_err("==================================================================\n"); } @@ -222,7 +222,7 @@ static void end_report(unsigned long *flags, const void *addr, bool is_write) trace_error_report_end(ERROR_DETECTOR_KASAN, (unsigned long)addr); pr_err("==================================================================\n"); - spin_unlock_irqrestore(&report_lock, *flags); + raw_spin_unlock_irqrestore(&report_lock, *flags); if (!test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) check_panic_on_warn("KASAN"); switch (kasan_arg_fault) { @@ -370,6 +370,36 @@ static inline bool init_task_stack_addr(const void *addr) sizeof(init_thread_union.stack)); } +/* + * This function is invoked with report_lock (a raw_spinlock) held. A + * PREEMPT_RT kernel cannot call find_vm_area() as it will acquire a sleeping + * rt_spinlock. + * + * For !RT kernel, the PROVE_RAW_LOCK_NESTING config option will print a + * lockdep warning for this raw_spinlock -> spinlock dependency. This config + * option is enabled by default to ensure better test coverage to expose this + * kind of RT kernel problem. This lockdep splat, however, can be suppressed + * by using DEFINE_WAIT_OVERRIDE_MAP() if it serves a useful purpose and the + * invalid PREEMPT_RT case has been taken care of. + */ +static inline struct vm_struct *kasan_find_vm_area(void *addr) +{ + static DEFINE_WAIT_OVERRIDE_MAP(vmalloc_map, LD_WAIT_SLEEP); + struct vm_struct *va; + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return NULL; + + /* + * Suppress lockdep warning and fetch vmalloc area of the + * offending address. + */ + lock_map_acquire_try(&vmalloc_map); + va = find_vm_area(addr); + lock_map_release(&vmalloc_map); + return va; +} + static void print_address_description(void *addr, u8 tag, struct kasan_report_info *info) { @@ -399,7 +429,7 @@ static void print_address_description(void *addr, u8 tag, } if (is_vmalloc_addr(addr)) { - struct vm_struct *va = find_vm_area(addr); + struct vm_struct *va = kasan_find_vm_area(addr); if (va) { pr_err("The buggy address belongs to the virtual mapping at\n" @@ -409,6 +439,8 @@ static void print_address_description(void *addr, u8 tag, pr_err("\n"); page = vmalloc_to_page(addr); + } else { + pr_err("The buggy address %px belongs to a vmalloc virtual mapping\n", addr); } } diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 88d1c9dcb507..d2c70cd2afb1 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -292,33 +292,99 @@ void __init __weak kasan_populate_early_vm_area_shadow(void *start, { } +struct vmalloc_populate_data { + unsigned long start; + struct page **pages; +}; + static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, - void *unused) + void *_data) { - unsigned long page; + struct vmalloc_populate_data *data = _data; + struct page *page; pte_t pte; + int index; if (likely(!pte_none(ptep_get(ptep)))) return 0; - page = __get_free_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - - __memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE); - pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL); + index = PFN_DOWN(addr - data->start); + page = data->pages[index]; + __memset(page_to_virt(page), KASAN_VMALLOC_INVALID, PAGE_SIZE); + pte = pfn_pte(page_to_pfn(page), PAGE_KERNEL); spin_lock(&init_mm.page_table_lock); if (likely(pte_none(ptep_get(ptep)))) { set_pte_at(&init_mm, addr, ptep, pte); - page = 0; + data->pages[index] = NULL; } spin_unlock(&init_mm.page_table_lock); - if (page) - free_page(page); + + return 0; +} + +static void ___free_pages_bulk(struct page **pages, int nr_pages) +{ + int i; + + for (i = 0; i < nr_pages; i++) { + if (pages[i]) { + __free_pages(pages[i], 0); + pages[i] = NULL; + } + } +} + +static int ___alloc_pages_bulk(struct page **pages, int nr_pages) +{ + unsigned long nr_populated, nr_total = nr_pages; + struct page **page_array = pages; + + while (nr_pages) { + nr_populated = alloc_pages_bulk(GFP_KERNEL, nr_pages, pages); + if (!nr_populated) { + ___free_pages_bulk(page_array, nr_total - nr_pages); + return -ENOMEM; + } + pages += nr_populated; + nr_pages -= nr_populated; + } + return 0; } +static int __kasan_populate_vmalloc(unsigned long start, unsigned long end) +{ + unsigned long nr_pages, nr_total = PFN_UP(end - start); + struct vmalloc_populate_data data; + int ret = 0; + + data.pages = (struct page **)__get_free_page(GFP_KERNEL | __GFP_ZERO); + if (!data.pages) + return -ENOMEM; + + while (nr_total) { + nr_pages = min(nr_total, PAGE_SIZE / sizeof(data.pages[0])); + ret = ___alloc_pages_bulk(data.pages, nr_pages); + if (ret) + break; + + data.start = start; + ret = apply_to_page_range(&init_mm, start, nr_pages * PAGE_SIZE, + kasan_populate_vmalloc_pte, &data); + ___free_pages_bulk(data.pages, nr_pages); + if (ret) + break; + + start += nr_pages * PAGE_SIZE; + nr_total -= nr_pages; + } + + free_page((unsigned long)data.pages); + + return ret; +} + int kasan_populate_vmalloc(unsigned long addr, unsigned long size) { unsigned long shadow_start, shadow_end; @@ -348,9 +414,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) shadow_start = PAGE_ALIGN_DOWN(shadow_start); shadow_end = PAGE_ALIGN(shadow_end); - ret = apply_to_page_range(&init_mm, shadow_start, - shadow_end - shadow_start, - kasan_populate_vmalloc_pte, NULL); + ret = __kasan_populate_vmalloc(shadow_start, shadow_end); if (ret) return ret; diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index 220b5d4c6876..b9382b5b6a37 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -26,6 +26,7 @@ #include <linux/slab.h> #include <linux/stacktrace.h> #include <linux/string.h> +#include <linux/string_choices.h> #include <linux/types.h> #include <linux/vmalloc.h> #include <linux/bug.h> @@ -45,7 +46,7 @@ void __init kasan_init_sw_tags(void) kasan_init_tags(); pr_info("KernelAddressSanitizer initialized (sw-tags, stacktrace=%s)\n", - kasan_stack_collection_enabled() ? "on" : "off"); + str_on_off(kasan_stack_collection_enabled())); } /* diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 67fc321db79b..102048821c22 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -21,6 +21,7 @@ #include <linux/log2.h> #include <linux/memblock.h> #include <linux/moduleparam.h> +#include <linux/nodemask.h> #include <linux/notifier.h> #include <linux/panic_notifier.h> #include <linux/random.h> @@ -1084,6 +1085,7 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) * properties (e.g. reside in DMAable memory). */ if ((flags & GFP_ZONEMASK) || + ((flags & __GFP_THISNODE) && num_online_nodes() > 1) || (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) { atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]); return NULL; diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c index f65fb182466d..00034e37bc9f 100644 --- a/mm/kfence/kfence_test.c +++ b/mm/kfence/kfence_test.c @@ -20,6 +20,7 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/string.h> +#include <linux/string_choices.h> #include <linux/tracepoint.h> #include <trace/events/printk.h> @@ -88,7 +89,7 @@ struct expect_report { static const char *get_access_type(const struct expect_report *r) { - return r->is_write ? "write" : "read"; + return str_write_read(r->is_write); } /* Check observed report matches information in @r. */ diff --git a/mm/kfence/report.c b/mm/kfence/report.c index 6370c5207d1a..10e6802a2edf 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -16,6 +16,7 @@ #include <linux/sprintf.h> #include <linux/stacktrace.h> #include <linux/string.h> +#include <linux/string_choices.h> #include <linux/sched/clock.h> #include <trace/events/error_report.h> @@ -184,7 +185,7 @@ static void print_diff_canary(unsigned long address, size_t bytes_to_show, static const char *get_access_type(bool is_write) { - return is_write ? "write" : "read"; + return str_write_read(is_write); } void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs, diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6f8d46d107b4..15203ea7d007 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -19,6 +19,7 @@ #include <linux/rcupdate_wait.h> #include <linux/swapops.h> #include <linux/shmem_fs.h> +#include <linux/dax.h> #include <linux/ksm.h> #include <asm/tlb.h> @@ -547,19 +548,6 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte, } } -static bool is_refcount_suitable(struct folio *folio) -{ - int expected_refcount = folio_mapcount(folio); - - if (!folio_test_anon(folio) || folio_test_swapcache(folio)) - expected_refcount += folio_nr_pages(folio); - - if (folio_test_private(folio)) - expected_refcount++; - - return folio_ref_count(folio) == expected_refcount; -} - static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long address, pte_t *pte, @@ -606,7 +594,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio); /* See hpage_collapse_scan_pmd(). */ - if (folio_likely_mapped_shared(folio)) { + if (folio_maybe_mapped_shared(folio)) { ++shared; if (cc->is_khugepaged && shared > khugepaged_max_ptes_shared) { @@ -651,7 +639,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, * but not from this process. The other process cannot write to * the page, only trigger CoW. */ - if (!is_refcount_suitable(folio)) { + if (folio_expected_ref_count(folio) != folio_ref_count(folio)) { folio_unlock(folio); result = SCAN_PAGE_COUNT; goto out; @@ -695,13 +683,13 @@ next: result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; - trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero, + trace_mm_collapse_huge_page_isolate(folio, none_or_zero, referenced, writable, result); return result; } out: release_pte_pages(pte, _pte, compound_pagelist); - trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero, + trace_mm_collapse_huge_page_isolate(folio, none_or_zero, referenced, writable, result); return result; } @@ -745,7 +733,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, ptep_clear(vma->vm_mm, address, _pte); folio_remove_rmap_pte(src, src_page, vma); spin_unlock(ptl); - free_page_and_swap_cache(src_page); + free_folio_and_swap_cache(src); } } @@ -947,17 +935,10 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, return SCAN_SUCCEED; } -static int find_pmd_or_thp_or_none(struct mm_struct *mm, - unsigned long address, - pmd_t **pmd) +static inline int check_pmd_state(pmd_t *pmd) { - pmd_t pmde; + pmd_t pmde = pmdp_get_lockless(pmd); - *pmd = mm_find_pmd(mm, address); - if (!*pmd) - return SCAN_PMD_NULL; - - pmde = pmdp_get_lockless(*pmd); if (pmd_none(pmde)) return SCAN_PMD_NONE; if (!pmd_present(pmde)) @@ -971,6 +952,17 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm, return SCAN_SUCCEED; } +static int find_pmd_or_thp_or_none(struct mm_struct *mm, + unsigned long address, + pmd_t **pmd) +{ + *pmd = mm_find_pmd(mm, address); + if (!*pmd) + return SCAN_PMD_NULL; + + return check_pmd_state(*pmd); +} + static int check_pmd_still_valid(struct mm_struct *mm, unsigned long address, pmd_t *pmd) @@ -1234,7 +1226,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, __folio_mark_uptodate(folio); pgtable = pmd_pgtable(_pmd); - _pmd = mk_huge_pmd(&folio->page, vma->vm_page_prot); + _pmd = folio_mk_pmd(folio, vma->vm_page_prot); _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); spin_lock(pmd_ptl); @@ -1354,11 +1346,9 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, /* * We treat a single page as shared if any part of the THP - * is shared. "False negatives" from - * folio_likely_mapped_shared() are not expected to matter - * much in practice. + * is shared. */ - if (folio_likely_mapped_shared(folio)) { + if (folio_maybe_mapped_shared(folio)) { ++shared; if (cc->is_khugepaged && shared > khugepaged_max_ptes_shared) { @@ -1399,7 +1389,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, * has excessive GUP pins (i.e. 512). Anyway the same check * will be done again later the risk seems low. */ - if (!is_refcount_suitable(folio)) { + if (folio_expected_ref_count(folio) != folio_ref_count(folio)) { result = SCAN_PAGE_COUNT; goto out_unmap; } @@ -1432,7 +1422,7 @@ out_unmap: *mmap_locked = false; } out: - trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced, + trace_mm_khugepaged_scan_pmd(mm, folio, writable, referenced, none_or_zero, result, unmapped); return result; } @@ -1461,10 +1451,9 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) } } -#ifdef CONFIG_SHMEM -/* hpage must be locked, and mmap_lock must be held */ +/* folio must be locked, and mmap_lock must be held */ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmdp, struct page *hpage) + pmd_t *pmdp, struct folio *folio, struct page *page) { struct vm_fault vmf = { .vma = vma, @@ -1473,13 +1462,12 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, .pmd = pmdp, }; - VM_BUG_ON(!PageTransHuge(hpage)); mmap_assert_locked(vma->vm_mm); - if (do_set_pmd(&vmf, hpage)) + if (do_set_pmd(&vmf, folio, page)) return SCAN_FAIL; - get_page(hpage); + folio_get(folio); return SCAN_SUCCEED; } @@ -1686,7 +1674,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, maybe_install_pmd: /* step 5: install pmd entry */ result = install_pmd - ? set_huge_pmd(vma, haddr, pmd, &folio->page) + ? set_huge_pmd(vma, haddr, pmd, folio, &folio->page) : SCAN_SUCCEED; goto drop_folio; abort: @@ -1720,7 +1708,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) pmd_t *pmd, pgt_pmd; spinlock_t *pml; spinlock_t *ptl; - bool skipped_uffd = false; + bool success = false; /* * Check vma->anon_vma to exclude MAP_PRIVATE mappings that @@ -1757,6 +1745,19 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) mmu_notifier_invalidate_range_start(&range); pml = pmd_lock(mm, pmd); + /* + * The lock of new_folio is still held, we will be blocked in + * the page fault path, which prevents the pte entries from + * being set again. So even though the old empty PTE page may be + * concurrently freed and a new PTE page is filled into the pmd + * entry, it is still empty and can be removed. + * + * So here we only need to recheck if the state of pmd entry + * still meets our requirements, rather than checking pmd_same() + * like elsewhere. + */ + if (check_pmd_state(pmd) != SCAN_SUCCEED) + goto drop_pml; ptl = pte_lockptr(mm, pmd); if (ptl != pml) spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); @@ -1770,20 +1771,20 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * repeating the anon_vma check protects from one category, * and repeating the userfaultfd_wp() check from another. */ - if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) { - skipped_uffd = true; - } else { + if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) { pgt_pmd = pmdp_collapse_flush(vma, addr, pmd); pmdp_get_lockless_sync(); + success = true; } if (ptl != pml) spin_unlock(ptl); +drop_pml: spin_unlock(pml); mmu_notifier_invalidate_range_end(&range); - if (!skipped_uffd) { + if (success) { mm_dec_nr_ptes(mm); page_table_check_pte_clear_range(mm, addr, pgt_pmd); pte_free_defer(mm, pmd_pgtable(pgt_pmd)); @@ -1837,6 +1838,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, if (result != SCAN_SUCCEED) goto out; + mapping_set_update(&xas, mapping); + __folio_set_locked(new_folio); if (is_shmem) __folio_set_swapbacked(new_folio); @@ -2277,6 +2280,17 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, continue; } + if (!folio_try_get(folio)) { + xas_reset(&xas); + continue; + } + + if (unlikely(folio != xas_reload(&xas))) { + folio_put(folio); + xas_reset(&xas); + continue; + } + if (folio_order(folio) == HPAGE_PMD_ORDER && folio->index == start) { /* Maybe PMD-mapped */ @@ -2287,23 +2301,27 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, * it's safe to skip LRU and refcount checks before * returning. */ + folio_put(folio); break; } node = folio_nid(folio); if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; + folio_put(folio); break; } cc->node_load[node]++; if (!folio_test_lru(folio)) { result = SCAN_PAGE_LRU; + folio_put(folio); break; } - if (!is_refcount_suitable(folio)) { + if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) { result = SCAN_PAGE_COUNT; + folio_put(folio); break; } @@ -2315,6 +2333,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, */ present += folio_nr_pages(folio); + folio_put(folio); if (need_resched()) { xas_pause(&xas); @@ -2336,14 +2355,6 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result); return result; } -#else -static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, - struct file *file, pgoff_t start, - struct collapse_control *cc) -{ - BUILD_BUG(); -} -#endif static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, struct collapse_control *cc) @@ -2419,7 +2430,7 @@ skip: VM_BUG_ON(khugepaged_scan.address < hstart || khugepaged_scan.address + HPAGE_PMD_SIZE > hend); - if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { + if (!vma_is_anonymous(vma)) { struct file *file = get_file(vma->vm_file); pgoff_t pgoff = linear_page_index(vma, khugepaged_scan.address); @@ -2765,7 +2776,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, mmap_assert_locked(mm); memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); - if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { + if (!vma_is_anonymous(vma)) { struct file *file = get_file(vma->vm_file); pgoff_t pgoff = linear_page_index(vma, addr); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 2a945c07ae99..8d588e685311 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -210,13 +210,11 @@ static struct kmem_cache *object_cache; static struct kmem_cache *scan_area_cache; /* set if tracing memory operations is enabled */ -static int kmemleak_enabled = 1; +static int kmemleak_enabled __read_mostly = 1; /* same as above but only for the kmemleak_free() callback */ -static int kmemleak_free_enabled = 1; +static int kmemleak_free_enabled __read_mostly = 1; /* set in the late_initcall if there were no errors */ static int kmemleak_late_initialized; -/* set if a kmemleak warning was issued */ -static int kmemleak_warning; /* set if a fatal kmemleak error has occurred */ static int kmemleak_error; @@ -254,7 +252,6 @@ static void kmemleak_disable(void); #define kmemleak_warn(x...) do { \ pr_warn(x); \ dump_stack(); \ - kmemleak_warning = 1; \ } while (0) /* @@ -325,8 +322,6 @@ static void hex_dump_object(struct seq_file *seq, * sufficient references to it (count >= min_count) * - black - ignore, it doesn't contain references (e.g. text section) * (min_count == -1). No function defined for this color. - * Newly created objects don't have any color assigned (object->count == -1) - * before the next memory scan when they become white. */ static bool color_white(const struct kmemleak_object *object) { @@ -352,6 +347,15 @@ static bool unreferenced_object(struct kmemleak_object *object) jiffies_last_scan); } +static const char *__object_type_str(struct kmemleak_object *object) +{ + if (object->flags & OBJECT_PHYS) + return " (phys)"; + if (object->flags & OBJECT_PERCPU) + return " (percpu)"; + return ""; +} + /* * Printing of the unreferenced objects information to the seq file. The * print_unreferenced function must be called with the object->lock held. @@ -364,8 +368,9 @@ static void print_unreferenced(struct seq_file *seq, unsigned int nr_entries; nr_entries = stack_depot_fetch(object->trace_handle, &entries); - warn_or_seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", - object->pointer, object->size); + warn_or_seq_printf(seq, "unreferenced object%s 0x%08lx (size %zu):\n", + __object_type_str(object), + object->pointer, object->size); warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", object->comm, object->pid, object->jiffies); hex_dump_object(seq, object); @@ -373,7 +378,7 @@ static void print_unreferenced(struct seq_file *seq, for (i = 0; i < nr_entries; i++) { void *ptr = (void *)entries[i]; - warn_or_seq_printf(seq, " [<%pK>] %pS\n", ptr, ptr); + warn_or_seq_printf(seq, " %pS\n", ptr); } } @@ -384,10 +389,10 @@ static void print_unreferenced(struct seq_file *seq, */ static void dump_object_info(struct kmemleak_object *object) { - pr_notice("Object 0x%08lx (size %zu):\n", - object->pointer, object->size); + pr_notice("Object%s 0x%08lx (size %zu):\n", + __object_type_str(object), object->pointer, object->size); pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", - object->comm, object->pid, object->jiffies); + object->comm, object->pid, object->jiffies); pr_notice(" min_count = %d\n", object->min_count); pr_notice(" count = %d\n", object->count); pr_notice(" flags = 0x%x\n", object->flags); @@ -1093,7 +1098,7 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, pr_debug("%s(0x%px, %zu)\n", __func__, ptr, size); if (kmemleak_enabled && ptr && !IS_ERR_PCPU(ptr)) - create_object_percpu((__force unsigned long)ptr, size, 0, gfp); + create_object_percpu((__force unsigned long)ptr, size, 1, gfp); } EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu); @@ -1242,6 +1247,20 @@ void __ref kmemleak_transient_leak(const void *ptr) EXPORT_SYMBOL(kmemleak_transient_leak); /** + * kmemleak_ignore_percpu - similar to kmemleak_ignore but taking a percpu + * address argument + * @ptr: percpu address of the object + */ +void __ref kmemleak_ignore_percpu(const void __percpu *ptr) +{ + pr_debug("%s(0x%px)\n", __func__, ptr); + + if (kmemleak_enabled && ptr && !IS_ERR_PCPU(ptr)) + make_black_object((unsigned long)ptr, OBJECT_PERCPU); +} +EXPORT_SYMBOL_GPL(kmemleak_ignore_percpu); + +/** * kmemleak_ignore - ignore an allocated object * @ptr: pointer to beginning of the object * @@ -1689,7 +1708,7 @@ static void kmemleak_scan(void) unsigned long phys = object->pointer; if (PHYS_PFN(phys) < min_low_pfn || - PHYS_PFN(phys + object->size) >= max_low_pfn) + PHYS_PFN(phys + object->size) > max_low_pfn) __paint_it(object, KMEMLEAK_BLACK); } @@ -1855,7 +1874,7 @@ static int kmemleak_scan_thread(void *arg) * Wait before the first scan to allow the system to fully initialize. */ if (first_run) { - signed long timeout = msecs_to_jiffies(SECS_FIRST_SCAN * 1000); + signed long timeout = secs_to_jiffies(SECS_FIRST_SCAN); first_run = 0; while (timeout && !kthread_should_stop()) timeout = schedule_timeout_interruptible(timeout); @@ -1998,25 +2017,41 @@ static int kmemleak_open(struct inode *inode, struct file *file) return seq_open(file, &kmemleak_seq_ops); } -static int dump_str_object_info(const char *str) +static bool __dump_str_object_info(unsigned long addr, unsigned int objflags) { unsigned long flags; struct kmemleak_object *object; + + object = __find_and_get_object(addr, 1, objflags); + if (!object) + return false; + + raw_spin_lock_irqsave(&object->lock, flags); + dump_object_info(object); + raw_spin_unlock_irqrestore(&object->lock, flags); + + put_object(object); + + return true; +} + +static int dump_str_object_info(const char *str) +{ unsigned long addr; + bool found = false; if (kstrtoul(str, 0, &addr)) return -EINVAL; - object = find_and_get_object(addr, 0); - if (!object) { + + found |= __dump_str_object_info(addr, 0); + found |= __dump_str_object_info(addr, OBJECT_PHYS); + found |= __dump_str_object_info(addr, OBJECT_PERCPU); + + if (!found) { pr_info("Unknown object at 0x%08lx\n", addr); return -EINVAL; } - raw_spin_lock_irqsave(&object->lock, flags); - dump_object_info(object); - raw_spin_unlock_irqrestore(&object->lock, flags); - - put_object(object); return 0; } @@ -2241,7 +2276,7 @@ void __init kmemleak_init(void) return; jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); - jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); + jiffies_scan_wait = secs_to_jiffies(SECS_SCAN_WAIT); object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c index a495debf1436..1ea711786c52 100644 --- a/mm/kmsan/core.c +++ b/mm/kmsan/core.c @@ -159,8 +159,8 @@ depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id) * Make sure we have enough spare bits in @id to hold the UAF bit and * the chain depth. */ - BUILD_BUG_ON( - (1 << STACK_DEPOT_EXTRA_BITS) <= (KMSAN_MAX_ORIGIN_DEPTH << 1)); + BUILD_BUG_ON((1 << STACK_DEPOT_EXTRA_BITS) <= + (KMSAN_MAX_ORIGIN_DEPTH << 1)); extra_bits = stack_depot_get_extra_bits(id); depth = kmsan_depth_from_eb(extra_bits); @@ -274,11 +274,9 @@ void kmsan_internal_check_memory(void *addr, size_t size, * bytes before, report them. */ if (cur_origin) { - kmsan_enter_runtime(); kmsan_report(cur_origin, addr, size, cur_off_start, pos - 1, user_addr, reason); - kmsan_leave_runtime(); } cur_origin = 0; cur_off_start = -1; @@ -292,11 +290,9 @@ void kmsan_internal_check_memory(void *addr, size_t size, * poisoned bytes before, report them. */ if (cur_origin) { - kmsan_enter_runtime(); kmsan_report(cur_origin, addr, size, cur_off_start, pos + i - 1, user_addr, reason); - kmsan_leave_runtime(); } cur_origin = 0; cur_off_start = -1; @@ -312,11 +308,9 @@ void kmsan_internal_check_memory(void *addr, size_t size, */ if (cur_origin != new_origin) { if (cur_origin) { - kmsan_enter_runtime(); kmsan_report(cur_origin, addr, size, cur_off_start, pos + i - 1, user_addr, reason); - kmsan_leave_runtime(); } cur_origin = new_origin; cur_off_start = pos + i; @@ -326,10 +320,8 @@ void kmsan_internal_check_memory(void *addr, size_t size, } KMSAN_WARN_ON(pos != size); if (cur_origin) { - kmsan_enter_runtime(); kmsan_report(cur_origin, addr, size, cur_off_start, pos - 1, user_addr, reason); - kmsan_leave_runtime(); } } diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 3ea50f09311f..97de3d6194f0 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -114,9 +114,7 @@ void kmsan_kfree_large(const void *ptr) kmsan_enter_runtime(); page = virt_to_head_page((void *)ptr); KMSAN_WARN_ON(ptr != page_address(page)); - kmsan_internal_poison_memory((void *)ptr, - page_size(page), - GFP_KERNEL, + kmsan_internal_poison_memory((void *)ptr, page_size(page), GFP_KERNEL, KMSAN_POISON_CHECK | KMSAN_POISON_FREE); kmsan_leave_runtime(); } @@ -277,8 +275,10 @@ void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy, * Don't check anything, just copy the shadow of the copied * bytes. */ + kmsan_enter_runtime(); kmsan_internal_memmove_metadata((void *)to, (void *)from, to_copy - left); + kmsan_leave_runtime(); } user_access_restore(ua_flags); } @@ -357,6 +357,7 @@ void kmsan_handle_dma(struct page *page, size_t offset, size_t size, size -= to_go; } } +EXPORT_SYMBOL_GPL(kmsan_handle_dma); void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, enum dma_data_direction dir) diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c index 10f52c085e6c..b14ce3417e65 100644 --- a/mm/kmsan/init.c +++ b/mm/kmsan/init.c @@ -35,8 +35,7 @@ static void __init kmsan_record_future_shadow_range(void *start, void *end) KMSAN_WARN_ON(future_index == NUM_FUTURE_RANGES); KMSAN_WARN_ON((nstart >= nend) || /* Virtual address 0 is valid on s390. */ - (!IS_ENABLED(CONFIG_S390) && !nstart) || - !nend); + (!IS_ENABLED(CONFIG_S390) && !nstart) || !nend); nstart = ALIGN_DOWN(nstart, PAGE_SIZE); nend = ALIGN(nend, PAGE_SIZE); diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c index 02a405e55d6c..69f0a57a401c 100644 --- a/mm/kmsan/instrumentation.c +++ b/mm/kmsan/instrumentation.c @@ -312,13 +312,9 @@ EXPORT_SYMBOL(__msan_unpoison_alloca); void __msan_warning(u32 origin); void __msan_warning(u32 origin) { - if (!kmsan_enabled || kmsan_in_runtime()) - return; - kmsan_enter_runtime(); kmsan_report(origin, /*address*/ NULL, /*size*/ 0, /*off_first*/ 0, /*off_last*/ 0, /*user_addr*/ NULL, REASON_ANY); - kmsan_leave_runtime(); } EXPORT_SYMBOL(__msan_warning); diff --git a/mm/kmsan/kmsan.h b/mm/kmsan/kmsan.h index 29555a8bc315..bc3d1810f352 100644 --- a/mm/kmsan/kmsan.h +++ b/mm/kmsan/kmsan.h @@ -121,7 +121,6 @@ static __always_inline void kmsan_leave_runtime(void) KMSAN_WARN_ON(--ctx->kmsan_in_runtime); } -depot_stack_handle_t kmsan_save_stack(void); depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags, unsigned int extra_bits); diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c index 9733a22c46c1..c6c5b2bbede0 100644 --- a/mm/kmsan/kmsan_test.c +++ b/mm/kmsan/kmsan_test.c @@ -732,3 +732,4 @@ kunit_test_suites(&kmsan_test_suite); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Alexander Potapenko <glider@google.com>"); +MODULE_DESCRIPTION("Test cases for KMSAN"); diff --git a/mm/kmsan/report.c b/mm/kmsan/report.c index 94a3303fb65e..d6853ce08954 100644 --- a/mm/kmsan/report.c +++ b/mm/kmsan/report.c @@ -157,14 +157,14 @@ void kmsan_report(depot_stack_handle_t origin, void *address, int size, unsigned long ua_flags; bool is_uaf; - if (!kmsan_enabled) + if (!kmsan_enabled || kmsan_in_runtime()) return; if (current->kmsan_ctx.depth) return; if (!origin) return; - kmsan_disable_current(); + kmsan_enter_runtime(); ua_flags = user_access_save(); raw_spin_lock(&kmsan_report_lock); pr_err("=====================================================\n"); @@ -217,5 +217,5 @@ void kmsan_report(depot_stack_handle_t origin, void *address, int size, if (panic_on_kmsan) panic("kmsan.panic set ...\n"); user_access_restore(ua_flags); - kmsan_enable_current(); + kmsan_leave_runtime(); } diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 9c58f081d84f..54f3c3c962f0 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -207,8 +207,7 @@ void kmsan_free_page(struct page *page, unsigned int order) if (!kmsan_enabled || kmsan_in_runtime()) return; kmsan_enter_runtime(); - kmsan_internal_poison_memory(page_address(page), - page_size(page), + kmsan_internal_poison_memory(page_address(page), page_size(page), GFP_KERNEL, KMSAN_POISON_CHECK | KMSAN_POISON_FREE); kmsan_leave_runtime(); @@ -248,17 +247,19 @@ int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, kmsan_enter_runtime(); mapped = __vmap_pages_range_noflush(shadow_start, shadow_end, prot, s_pages, page_shift); + kmsan_leave_runtime(); if (mapped) { err = mapped; goto ret; } + kmsan_enter_runtime(); mapped = __vmap_pages_range_noflush(origin_start, origin_end, prot, o_pages, page_shift); + kmsan_leave_runtime(); if (mapped) { err = mapped; goto ret; } - kmsan_leave_runtime(); flush_tlb_kernel_range(shadow_start, shadow_end); flush_tlb_kernel_range(origin_start, origin_end); flush_cache_vmap(shadow_start, shadow_end); @@ -280,12 +281,8 @@ void __init kmsan_init_alloc_meta_for_range(void *start, void *end) start = (void *)PAGE_ALIGN_DOWN((u64)start); size = PAGE_ALIGN((u64)end - (u64)start); - shadow = memblock_alloc(size, PAGE_SIZE); - origin = memblock_alloc(size, PAGE_SIZE); - - if (!shadow || !origin) - panic("%s: Failed to allocate metadata memory for early boot range of size %llu", - __func__, size); + shadow = memblock_alloc_or_panic(size, PAGE_SIZE); + origin = memblock_alloc_or_panic(size, PAGE_SIZE); for (u64 addr = 0; addr < size; addr += PAGE_SIZE) { page = virt_to_page_or_null((char *)start + addr); @@ -1270,8 +1270,15 @@ static int write_protect_page(struct vm_area_struct *vma, struct folio *folio, if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?")) goto out_unlock; - anon_exclusive = PageAnonExclusive(&folio->page); entry = ptep_get(pvmw.pte); + /* + * Handle PFN swap PTEs, such as device-exclusive ones, that actually + * map pages: give up just like the next folio_walk would. + */ + if (unlikely(!pte_present(entry))) + goto out_unlock; + + anon_exclusive = PageAnonExclusive(&folio->page); if (pte_write(entry) || pte_dirty(entry) || anon_exclusive || mm_tlb_flush_pending(mm)) { swapped = folio_test_swapcache(folio); @@ -3262,6 +3269,25 @@ static void wait_while_offlining(void) #endif /* CONFIG_MEMORY_HOTREMOVE */ #ifdef CONFIG_PROC_FS +/* + * The process is mergeable only if any VMA is currently + * applicable to KSM. + * + * The mmap lock must be held in read mode. + */ +bool ksm_process_mergeable(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + mmap_assert_locked(mm); + VMA_ITERATOR(vmi, mm, 0); + for_each_vma(vmi, vma) + if (vma->vm_flags & VM_MERGEABLE) + return true; + + return false; +} + long ksm_process_profit(struct mm_struct *mm) { return (long)(mm->ksm_merging_pages + mm_ksm_zero_pages(mm)) * PAGE_SIZE - diff --git a/mm/list_lru.c b/mm/list_lru.c index f93ada6a207b..490473af3122 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -77,7 +77,6 @@ again: spin_lock(&l->lock); nr_items = READ_ONCE(l->nr_items); if (likely(nr_items != LONG_MIN)) { - WARN_ON(nr_items < 0); rcu_read_unlock(); return l; } @@ -450,6 +449,7 @@ static void memcg_reparent_list_lru_one(struct list_lru *lru, int nid, list_splice_init(&src->list, &dst->list); if (src->nr_items) { + WARN_ON(src->nr_items < 0); dst->nr_items += src->nr_items; set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); } @@ -510,7 +510,7 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, gfp_t gfp) { unsigned long flags; - struct list_lru_memcg *mlru; + struct list_lru_memcg *mlru = NULL; struct mem_cgroup *pos, *parent; XA_STATE(xas, &lru->xa, 0); @@ -535,9 +535,11 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, parent = parent_mem_cgroup(pos); } - mlru = memcg_init_list_lru_one(lru, gfp); - if (!mlru) - return -ENOMEM; + if (!mlru) { + mlru = memcg_init_list_lru_one(lru, gfp); + if (!mlru) + return -ENOMEM; + } xas_set(&xas, pos->kmemcg_id); do { xas_lock_irqsave(&xas, flags); @@ -548,10 +550,11 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, } xas_unlock_irqrestore(&xas, flags); } while (xas_nomem(&xas, gfp)); - if (mlru) - kfree(mlru); } while (pos != memcg && !css_is_dying(&pos->css)); + if (unlikely(mlru)) + kfree(mlru); + return xas_error(&xas); } #else diff --git a/mm/maccess.c b/mm/maccess.c index 8f0906180a94..831b4dd7296c 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -196,7 +196,7 @@ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, if (ret >= count) { ret = count; dst[ret - 1] = '\0'; - } else if (ret > 0) { + } else if (ret >= 0) { ret++; } diff --git a/mm/madvise.c b/mm/madvise.c index 0ceae57da7da..1d44a35ae85c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -48,6 +48,11 @@ struct madvise_walk_private { bool pageout; }; +struct madvise_behavior { + int behavior; + struct mmu_gather *tlb; +}; + /* * Any behaviour which results in changes to the vma->vm_flags needs to * take mmap_lock for writing. Others, which simply traverse vmas, need @@ -387,7 +392,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, folio = pmd_folio(orig_pmd); /* Do not interfere with other mappings of this folio */ - if (folio_likely_mapped_shared(folio)) + if (folio_maybe_mapped_shared(folio)) goto huge_unlock; if (pageout_anon_only_filter && !folio_test_anon(folio)) @@ -486,7 +491,7 @@ restart: if (nr < folio_nr_pages(folio)) { int err; - if (folio_likely_mapped_shared(folio)) + if (folio_maybe_mapped_shared(folio)) continue; if (pageout_anon_only_filter && !folio_test_anon(folio)) continue; @@ -503,6 +508,7 @@ restart: pte_offset_map_lock(mm, pmd, addr, &ptl); if (!start_pte) break; + flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); if (!err) nr = 0; @@ -721,7 +727,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (nr < folio_nr_pages(folio)) { int err; - if (folio_likely_mapped_shared(folio)) + if (folio_maybe_mapped_shared(folio)) continue; if (!folio_trylock(folio)) continue; @@ -736,6 +742,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, start_pte = pte; if (!start_pte) break; + flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); if (!err) nr = 0; @@ -794,12 +801,13 @@ static const struct mm_walk_ops madvise_free_walk_ops = { .walk_lock = PGWALK_RDLOCK, }; -static int madvise_free_single_vma(struct vm_area_struct *vma, +static int madvise_free_single_vma(struct madvise_behavior *madv_behavior, + struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr) { struct mm_struct *mm = vma->vm_mm; struct mmu_notifier_range range; - struct mmu_gather tlb; + struct mmu_gather *tlb = madv_behavior->tlb; /* MADV_FREE works for only anon vma at the moment */ if (!vma_is_anonymous(vma)) @@ -815,17 +823,14 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, range.start, range.end); lru_add_drain(); - tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(&range); - tlb_start_vma(&tlb, vma); + tlb_start_vma(tlb, vma); walk_page_range(vma->vm_mm, range.start, range.end, - &madvise_free_walk_ops, &tlb); - tlb_end_vma(&tlb, vma); + &madvise_free_walk_ops, tlb); + tlb_end_vma(tlb, vma); mmu_notifier_invalidate_range_end(&range); - tlb_finish_mmu(&tlb); - return 0; } @@ -848,10 +853,17 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, * An interface that causes the system to free clean pages and flush * dirty pages is already available as msync(MS_INVALIDATE). */ -static long madvise_dontneed_single_vma(struct vm_area_struct *vma, +static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior, + struct vm_area_struct *vma, unsigned long start, unsigned long end) { - zap_page_range_single(vma, start, end - start, NULL); + struct zap_details details = { + .reclaim_pt = true, + .even_cows = true, + }; + + zap_page_range_single_batched( + madv_behavior->tlb, vma, start, end - start, &details); return 0; } @@ -888,8 +900,9 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, static long madvise_dontneed_free(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, - int behavior) + struct madvise_behavior *madv_behavior) { + int behavior = madv_behavior->behavior; struct mm_struct *mm = vma->vm_mm; *prev = vma; @@ -928,13 +941,23 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, */ end = vma->vm_end; } - VM_WARN_ON(start >= end); + /* + * If the memory region between start and end was + * originally backed by 4kB pages and then remapped to + * be backed by hugepages while mmap_lock was dropped, + * the adjustment for hugetlb vma above may have rounded + * end down to the start address. + */ + if (start == end) + return 0; + VM_WARN_ON(start > end); } if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED) - return madvise_dontneed_single_vma(vma, start, end); + return madvise_dontneed_single_vma( + madv_behavior, vma, start, end); else if (behavior == MADV_FREE) - return madvise_free_single_vma(vma, start, end); + return madvise_free_single_vma(madv_behavior, vma, start, end); else return -EINVAL; } @@ -1037,13 +1060,7 @@ static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked) if (!allow_locked) disallowed |= VM_LOCKED; - if (!vma_is_anonymous(vma)) - return false; - - if ((vma->vm_flags & (VM_MAYWRITE | disallowed)) != VM_MAYWRITE) - return false; - - return true; + return !(vma->vm_flags & disallowed); } static bool is_guard_pte_marker(pte_t ptent) @@ -1241,8 +1258,10 @@ static long madvise_guard_remove(struct vm_area_struct *vma, static int madvise_vma_behavior(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, - unsigned long behavior) + void *behavior_arg) { + struct madvise_behavior *arg = behavior_arg; + int behavior = arg->behavior; int error; struct anon_vma_name *anon_name; unsigned long new_flags = vma->vm_flags; @@ -1262,7 +1281,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, case MADV_FREE: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: - return madvise_dontneed_free(vma, prev, start, end, behavior); + return madvise_dontneed_free(vma, prev, start, end, arg); case MADV_NORMAL: new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; break; @@ -1384,7 +1403,32 @@ static int madvise_inject_error(int behavior, return 0; } -#endif + +static bool is_memory_failure(int behavior) +{ + switch (behavior) { + case MADV_HWPOISON: + case MADV_SOFT_OFFLINE: + return true; + default: + return false; + } +} + +#else + +static int madvise_inject_error(int behavior, + unsigned long start, unsigned long end) +{ + return 0; +} + +static bool is_memory_failure(int behavior) +{ + return false; +} + +#endif /* CONFIG_MEMORY_FAILURE */ static bool madvise_behavior_valid(int behavior) @@ -1454,10 +1498,10 @@ static bool process_madvise_remote_valid(int behavior) */ static int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, - unsigned long end, unsigned long arg, + unsigned long end, void *arg, int (*visit)(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, - unsigned long end, unsigned long arg)) + unsigned long end, void *arg)) { struct vm_area_struct *vma; struct vm_area_struct *prev; @@ -1515,7 +1559,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, static int madvise_vma_anon_name(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, - unsigned long anon_name) + void *anon_name) { int error; @@ -1524,7 +1568,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma, return -EBADF; error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, - (struct anon_vma_name *)anon_name); + anon_name); /* * madvise() returns EAGAIN if kernel resources, such as @@ -1556,10 +1600,142 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, if (end == start) return 0; - return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name, + return madvise_walk_vmas(mm, start, end, anon_name, madvise_vma_anon_name); } #endif /* CONFIG_ANON_VMA_NAME */ + +static int madvise_lock(struct mm_struct *mm, int behavior) +{ + if (is_memory_failure(behavior)) + return 0; + + if (madvise_need_mmap_write(behavior)) { + if (mmap_write_lock_killable(mm)) + return -EINTR; + } else { + mmap_read_lock(mm); + } + return 0; +} + +static void madvise_unlock(struct mm_struct *mm, int behavior) +{ + if (is_memory_failure(behavior)) + return; + + if (madvise_need_mmap_write(behavior)) + mmap_write_unlock(mm); + else + mmap_read_unlock(mm); +} + +static bool madvise_batch_tlb_flush(int behavior) +{ + switch (behavior) { + case MADV_DONTNEED: + case MADV_DONTNEED_LOCKED: + case MADV_FREE: + return true; + default: + return false; + } +} + +static void madvise_init_tlb(struct madvise_behavior *madv_behavior, + struct mm_struct *mm) +{ + if (madvise_batch_tlb_flush(madv_behavior->behavior)) + tlb_gather_mmu(madv_behavior->tlb, mm); +} + +static void madvise_finish_tlb(struct madvise_behavior *madv_behavior) +{ + if (madvise_batch_tlb_flush(madv_behavior->behavior)) + tlb_finish_mmu(madv_behavior->tlb); +} + +static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior) +{ + size_t len; + + if (!madvise_behavior_valid(behavior)) + return false; + + if (!PAGE_ALIGNED(start)) + return false; + len = PAGE_ALIGN(len_in); + + /* Check to see whether len was rounded up from small -ve to zero */ + if (len_in && !len) + return false; + + if (start + len < start) + return false; + + return true; +} + +/* + * madvise_should_skip() - Return if the request is invalid or nothing. + * @start: Start address of madvise-requested address range. + * @len_in: Length of madvise-requested address range. + * @behavior: Requested madvise behavor. + * @err: Pointer to store an error code from the check. + * + * If the specified behaviour is invalid or nothing would occur, we skip the + * operation. This function returns true in the cases, otherwise false. In + * the former case we store an error on @err. + */ +static bool madvise_should_skip(unsigned long start, size_t len_in, + int behavior, int *err) +{ + if (!is_valid_madvise(start, len_in, behavior)) { + *err = -EINVAL; + return true; + } + if (start + PAGE_ALIGN(len_in) == start) { + *err = 0; + return true; + } + return false; +} + +static bool is_madvise_populate(int behavior) +{ + switch (behavior) { + case MADV_POPULATE_READ: + case MADV_POPULATE_WRITE: + return true; + default: + return false; + } +} + +static int madvise_do_behavior(struct mm_struct *mm, + unsigned long start, size_t len_in, + struct madvise_behavior *madv_behavior) +{ + int behavior = madv_behavior->behavior; + struct blk_plug plug; + unsigned long end; + int error; + + if (is_memory_failure(behavior)) + return madvise_inject_error(behavior, start, start + len_in); + start = untagged_addr_remote(mm, start); + end = start + PAGE_ALIGN(len_in); + + blk_start_plug(&plug); + if (is_madvise_populate(behavior)) + error = madvise_populate(mm, start, end, behavior); + else + error = madvise_walk_vmas(mm, start, end, madv_behavior, + madvise_vma_behavior); + blk_finish_plug(&plug); + return error; +} + /* * The madvise(2) system call. * @@ -1634,63 +1810,22 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, */ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) { - unsigned long end; int error; - int write; - size_t len; - struct blk_plug plug; - - if (!madvise_behavior_valid(behavior)) - return -EINVAL; - - if (!PAGE_ALIGNED(start)) - return -EINVAL; - len = PAGE_ALIGN(len_in); - - /* Check to see whether len was rounded up from small -ve to zero */ - if (len_in && !len) - return -EINVAL; - - end = start + len; - if (end < start) - return -EINVAL; - - if (end == start) - return 0; - -#ifdef CONFIG_MEMORY_FAILURE - if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) - return madvise_inject_error(behavior, start, start + len_in); -#endif - - write = madvise_need_mmap_write(behavior); - if (write) { - if (mmap_write_lock_killable(mm)) - return -EINTR; - } else { - mmap_read_lock(mm); - } - - start = untagged_addr_remote(mm, start); - end = start + len; - - blk_start_plug(&plug); - switch (behavior) { - case MADV_POPULATE_READ: - case MADV_POPULATE_WRITE: - error = madvise_populate(mm, start, end, behavior); - break; - default: - error = madvise_walk_vmas(mm, start, end, behavior, - madvise_vma_behavior); - break; - } - blk_finish_plug(&plug); + struct mmu_gather tlb; + struct madvise_behavior madv_behavior = { + .behavior = behavior, + .tlb = &tlb, + }; - if (write) - mmap_write_unlock(mm); - else - mmap_read_unlock(mm); + if (madvise_should_skip(start, len_in, behavior, &error)) + return error; + error = madvise_lock(mm, behavior); + if (error) + return error; + madvise_init_tlb(&madv_behavior, mm); + error = madvise_do_behavior(mm, start, len_in, &madv_behavior); + madvise_finish_tlb(&madv_behavior); + madvise_unlock(mm, behavior); return error; } @@ -1706,19 +1841,36 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, { ssize_t ret = 0; size_t total_len; + struct mmu_gather tlb; + struct madvise_behavior madv_behavior = { + .behavior = behavior, + .tlb = &tlb, + }; total_len = iov_iter_count(iter); + ret = madvise_lock(mm, behavior); + if (ret) + return ret; + madvise_init_tlb(&madv_behavior, mm); + while (iov_iter_count(iter)) { - ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter), - iter_iov_len(iter), behavior); + unsigned long start = (unsigned long)iter_iov_addr(iter); + size_t len_in = iter_iov_len(iter); + int error; + + if (madvise_should_skip(start, len_in, behavior, &error)) + ret = error; + else + ret = madvise_do_behavior(mm, start, len_in, + &madv_behavior); /* * An madvise operation is attempting to restart the syscall, * but we cannot proceed as it would not be correct to repeat * the operation in aggregate, and would be surprising to the * user. * - * As we have already dropped locks, it is safe to just loop and + * We drop and reacquire locks so it is safe to just loop and * try again. We check for fatal signals in case we need exit * early anyway. */ @@ -1727,13 +1879,24 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, ret = -EINTR; break; } + + /* Drop and reacquire lock to unwind race. */ + madvise_finish_tlb(&madv_behavior); + madvise_unlock(mm, behavior); + ret = madvise_lock(mm, behavior); + if (ret) + goto out; + madvise_init_tlb(&madv_behavior, mm); continue; } if (ret < 0) break; iov_iter_advance(iter, iter_iov_len(iter)); } + madvise_finish_tlb(&madv_behavior); + madvise_unlock(mm, behavior); +out: ret = (total_len - iov_iter_count(iter)) ? : ret; return ret; diff --git a/mm/memblock.c b/mm/memblock.c index 0389ce5cd281..154f1d73b61f 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -16,6 +16,12 @@ #include <linux/kmemleak.h> #include <linux/seq_file.h> #include <linux/memblock.h> +#include <linux/mutex.h> + +#ifdef CONFIG_KEXEC_HANDOVER +#include <linux/libfdt.h> +#include <linux/kexec_handover.h> +#endif /* CONFIG_KEXEC_HANDOVER */ #include <asm/sections.h> #include <linux/io.h> @@ -106,6 +112,13 @@ unsigned long min_low_pfn; unsigned long max_pfn; unsigned long long max_possible_pfn; +#ifdef CONFIG_MEMBLOCK_KHO_SCRATCH +/* When set to true, only allocate from MEMBLOCK_KHO_SCRATCH ranges */ +static bool kho_scratch_only; +#else +#define kho_scratch_only false +#endif + static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_MEMORY_REGIONS] __initdata_memblock; static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock; #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP @@ -165,6 +178,10 @@ bool __init_memblock memblock_has_mirror(void) static enum memblock_flags __init_memblock choose_memblock_flags(void) { + /* skip non-scratch memory for kho early boot allocations */ + if (kho_scratch_only) + return MEMBLOCK_KHO_SCRATCH; + return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE; } @@ -456,7 +473,14 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, min(new_area_start, memblock.current_limit), new_alloc_size, PAGE_SIZE); - new_array = addr ? __va(addr) : NULL; + if (addr) { + /* The memory may not have been accepted, yet. */ + accept_memory(addr, new_alloc_size); + + new_array = __va(addr); + } else { + new_array = NULL; + } } if (!addr) { pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", @@ -491,7 +515,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, * needn't do it */ if (!use_slab) - BUG_ON(memblock_reserve(addr, new_alloc_size)); + BUG_ON(memblock_reserve_kern(addr, new_alloc_size)); /* Update slab flag */ *in_slab = use_slab; @@ -641,7 +665,7 @@ repeat: #ifdef CONFIG_NUMA WARN_ON(nid != memblock_get_region_node(rgn)); #endif - WARN_ON(flags != rgn->flags); + WARN_ON(flags != MEMBLOCK_NONE && flags != rgn->flags); nr_new++; if (insert) { if (start_rgn == -1) @@ -735,7 +759,7 @@ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) /** * memblock_validate_numa_coverage - check if amount of memory with * no node ID assigned is less than a threshold - * @threshold_bytes: maximal number of pages that can have unassigned node + * @threshold_bytes: maximal memory size that can have unassigned node * ID (in bytes). * * A buggy firmware may report memory that does not belong to any node. @@ -755,7 +779,7 @@ bool __init_memblock memblock_validate_numa_coverage(unsigned long threshold_byt nr_pages += end_pfn - start_pfn; } - if ((nr_pages << PAGE_SHIFT) >= threshold_bytes) { + if ((nr_pages << PAGE_SHIFT) > threshold_bytes) { mem_size_mb = memblock_phys_mem_size() >> 20; pr_err("NUMA: no nodes coverage for %luMB of %luMB RAM\n", (nr_pages << PAGE_SHIFT) >> 20, mem_size_mb); @@ -901,14 +925,15 @@ int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size) return memblock_remove_range(&memblock.reserved, base, size); } -int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) +int __init_memblock __memblock_reserve(phys_addr_t base, phys_addr_t size, + int nid, enum memblock_flags flags) { phys_addr_t end = base + size - 1; - memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, - &base, &end, (void *)_RET_IP_); + memblock_dbg("%s: [%pa-%pa] nid=%d flags=%x %pS\n", __func__, + &base, &end, nid, flags, (void *)_RET_IP_); - return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0); + return memblock_add_range(&memblock.reserved, base, size, nid, flags); } #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP @@ -923,6 +948,40 @@ int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size) } #endif +#ifdef CONFIG_MEMBLOCK_KHO_SCRATCH +__init void memblock_set_kho_scratch_only(void) +{ + kho_scratch_only = true; +} + +__init void memblock_clear_kho_scratch_only(void) +{ + kho_scratch_only = false; +} + +__init void memmap_init_kho_scratch_pages(void) +{ + phys_addr_t start, end; + unsigned long pfn; + int nid; + u64 i; + + if (!IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) + return; + + /* + * Initialize struct pages for free scratch memory. + * The struct pages for reserved scratch memory will be set up in + * reserve_bootmem_region() + */ + __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, + MEMBLOCK_KHO_SCRATCH, &start, &end, &nid) { + for (pfn = PFN_UP(start); pfn < PFN_DOWN(end); pfn++) + init_deferred_page(pfn, nid); + } +} +#endif + /** * memblock_setclr_flag - set or clear flag for a memory region * @type: memblock type to set/clear flag for @@ -1048,6 +1107,36 @@ int __init_memblock memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t MEMBLOCK_RSRV_NOINIT); } +/** + * memblock_mark_kho_scratch - Mark a memory region as MEMBLOCK_KHO_SCRATCH. + * @base: the base phys addr of the region + * @size: the size of the region + * + * Only memory regions marked with %MEMBLOCK_KHO_SCRATCH will be considered + * for allocations during early boot with kexec handover. + * + * Return: 0 on success, -errno on failure. + */ +__init int memblock_mark_kho_scratch(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(&memblock.memory, base, size, 1, + MEMBLOCK_KHO_SCRATCH); +} + +/** + * memblock_clear_kho_scratch - Clear MEMBLOCK_KHO_SCRATCH flag for a + * specified region. + * @base: the base phys addr of the region + * @size: the size of the region + * + * Return: 0 on success, -errno on failure. + */ +__init int memblock_clear_kho_scratch(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(&memblock.memory, base, size, 0, + MEMBLOCK_KHO_SCRATCH); +} + static bool should_skip_region(struct memblock_type *type, struct memblock_region *m, int nid, int flags) @@ -1079,6 +1168,13 @@ static bool should_skip_region(struct memblock_type *type, if (!(flags & MEMBLOCK_DRIVER_MANAGED) && memblock_is_driver_managed(m)) return true; + /* + * In early alloc during kexec handover, we can only consider + * MEMBLOCK_KHO_SCRATCH regions for the allocations + */ + if ((flags & MEMBLOCK_KHO_SCRATCH) && !memblock_is_kho_scratch(m)) + return true; + return false; } @@ -1459,14 +1555,14 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, again: found = memblock_find_in_range_node(size, align, start, end, nid, flags); - if (found && !memblock_reserve(found, size)) + if (found && !__memblock_reserve(found, size, nid, MEMBLOCK_RSRV_KERN)) goto done; if (numa_valid_node(nid) && !exact_nid) { found = memblock_find_in_range_node(size, align, start, end, NUMA_NO_NODE, flags); - if (found && !memblock_reserve(found, size)) + if (found && !memblock_reserve_kern(found, size)) goto done; } @@ -1692,6 +1788,26 @@ void * __init memblock_alloc_try_nid( } /** + * __memblock_alloc_or_panic - Try to allocate memory and panic on failure + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @func: caller func name + * + * This function attempts to allocate memory using memblock_alloc, + * and in case of failure, it calls panic with the formatted message. + * This function should not be used directly, please use the macro memblock_alloc_or_panic. + */ +void *__init __memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align, + const char *func) +{ + void *addr = memblock_alloc(size, align); + + if (unlikely(!addr)) + panic("%s: Failed to allocate %pap bytes\n", func, &size); + return addr; +} + +/** * memblock_free_late - free pages directly to buddy allocator * @base: phys starting address of the boot memory block * @size: size of the boot memory block in bytes @@ -1731,6 +1847,28 @@ phys_addr_t __init_memblock memblock_reserved_size(void) return memblock.reserved.total_size; } +phys_addr_t __init_memblock memblock_reserved_kern_size(phys_addr_t limit, int nid) +{ + struct memblock_region *r; + phys_addr_t total = 0; + + for_each_reserved_mem_region(r) { + phys_addr_t size = r->size; + + if (r->base > limit) + break; + + if (r->base + r->size > limit) + size = limit - r->base; + + if (nid == memblock_get_region_node(r) || !numa_valid_node(nid)) + if (r->flags & MEMBLOCK_RSRV_KERN) + total += size; + } + + return total; +} + /** * memblock_estimated_nr_free_pages - return estimated number of free pages * from memblock point of view @@ -2144,8 +2282,10 @@ static unsigned long __init __free_memory_core(phys_addr_t start, phys_addr_t end) { unsigned long start_pfn = PFN_UP(start); - unsigned long end_pfn = min_t(unsigned long, - PFN_DOWN(end), max_low_pfn); + unsigned long end_pfn = PFN_DOWN(end); + + if (!IS_ENABLED(CONFIG_HIGHMEM) && end_pfn > max_low_pfn) + end_pfn = max_low_pfn; if (start_pfn >= end_pfn) return 0; @@ -2160,11 +2300,14 @@ static void __init memmap_init_reserved_pages(void) struct memblock_region *region; phys_addr_t start, end; int nid; + unsigned long max_reserved; /* * set nid on all reserved pages and also treat struct * pages for the NOMAP regions as PageReserved */ +repeat: + max_reserved = memblock.reserved.max; for_each_mem_region(region) { nid = memblock_get_region_node(region); start = region->base; @@ -2173,8 +2316,15 @@ static void __init memmap_init_reserved_pages(void) if (memblock_is_nomap(region)) reserve_bootmem_region(start, end, nid); - memblock_set_node(start, end, &memblock.reserved, nid); + memblock_set_node(start, region->size, &memblock.reserved, nid); } + /* + * 'max' is changed means memblock.reserved has been doubled its + * array, which may result a new reserved region before current + * 'start'. Now we should repeat the procedure to set its node id. + */ + if (max_reserved != memblock.reserved.max) + goto repeat; /* * initialize struct pages for reserved regions that don't have @@ -2249,6 +2399,7 @@ void __init memblock_free_all(void) free_unused_memmap(); reset_all_zones_managed_pages(); + memblock_clear_kho_scratch_only(); pages = free_low_memory_core_early(); totalram_pages_add(pages); } @@ -2263,6 +2414,7 @@ struct reserve_mem_table { }; static struct reserve_mem_table reserved_mem_table[RESERVE_MEM_MAX_ENTRIES]; static int reserved_mem_count; +static DEFINE_MUTEX(reserve_mem_lock); /* Add wildcard region with a lookup name */ static void __init reserved_mem_add(phys_addr_t start, phys_addr_t size, @@ -2276,6 +2428,21 @@ static void __init reserved_mem_add(phys_addr_t start, phys_addr_t size, strscpy(map->name, name); } +static struct reserve_mem_table *reserve_mem_find_by_name_nolock(const char *name) +{ + struct reserve_mem_table *map; + int i; + + for (i = 0; i < reserved_mem_count; i++) { + map = &reserved_mem_table[i]; + if (!map->size) + continue; + if (strcmp(name, map->name) == 0) + return map; + } + return NULL; +} + /** * reserve_mem_find_by_name - Find reserved memory region with a given name * @name: The name that is attached to a reserved memory region @@ -2289,21 +2456,229 @@ static void __init reserved_mem_add(phys_addr_t start, phys_addr_t size, int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size) { struct reserve_mem_table *map; - int i; + + guard(mutex)(&reserve_mem_lock); + map = reserve_mem_find_by_name_nolock(name); + if (!map) + return 0; + + *start = map->start; + *size = map->size; + return 1; +} +EXPORT_SYMBOL_GPL(reserve_mem_find_by_name); + +/** + * reserve_mem_release_by_name - Release reserved memory region with a given name + * @name: The name that is attatched to a reserved memory region + * + * Forcibly release the pages in the reserved memory region so that those memory + * can be used as free memory. After released the reserved region size becomes 0. + * + * Returns: 1 if released or 0 if not found. + */ +int reserve_mem_release_by_name(const char *name) +{ + char buf[RESERVE_MEM_NAME_SIZE + 12]; + struct reserve_mem_table *map; + void *start, *end; + + guard(mutex)(&reserve_mem_lock); + map = reserve_mem_find_by_name_nolock(name); + if (!map) + return 0; + + start = phys_to_virt(map->start); + end = start + map->size - 1; + snprintf(buf, sizeof(buf), "reserve_mem:%s", name); + free_reserved_area(start, end, 0, buf); + map->size = 0; + + return 1; +} + +#ifdef CONFIG_KEXEC_HANDOVER +#define MEMBLOCK_KHO_FDT "memblock" +#define MEMBLOCK_KHO_NODE_COMPATIBLE "memblock-v1" +#define RESERVE_MEM_KHO_NODE_COMPATIBLE "reserve-mem-v1" +static struct page *kho_fdt; + +static int reserve_mem_kho_finalize(struct kho_serialization *ser) +{ + int err = 0, i; for (i = 0; i < reserved_mem_count; i++) { - map = &reserved_mem_table[i]; - if (!map->size) - continue; - if (strcmp(name, map->name) == 0) { - *start = map->start; - *size = map->size; - return 1; - } + struct reserve_mem_table *map = &reserved_mem_table[i]; + + err |= kho_preserve_phys(map->start, map->size); } - return 0; + + err |= kho_preserve_folio(page_folio(kho_fdt)); + err |= kho_add_subtree(ser, MEMBLOCK_KHO_FDT, page_to_virt(kho_fdt)); + + return notifier_from_errno(err); } -EXPORT_SYMBOL_GPL(reserve_mem_find_by_name); + +static int reserve_mem_kho_notifier(struct notifier_block *self, + unsigned long cmd, void *v) +{ + switch (cmd) { + case KEXEC_KHO_FINALIZE: + return reserve_mem_kho_finalize((struct kho_serialization *)v); + case KEXEC_KHO_ABORT: + return NOTIFY_DONE; + default: + return NOTIFY_BAD; + } +} + +static struct notifier_block reserve_mem_kho_nb = { + .notifier_call = reserve_mem_kho_notifier, +}; + +static int __init prepare_kho_fdt(void) +{ + int err = 0, i; + void *fdt; + + kho_fdt = alloc_page(GFP_KERNEL); + if (!kho_fdt) + return -ENOMEM; + + fdt = page_to_virt(kho_fdt); + + err |= fdt_create(fdt, PAGE_SIZE); + err |= fdt_finish_reservemap(fdt); + + err |= fdt_begin_node(fdt, ""); + err |= fdt_property_string(fdt, "compatible", MEMBLOCK_KHO_NODE_COMPATIBLE); + for (i = 0; i < reserved_mem_count; i++) { + struct reserve_mem_table *map = &reserved_mem_table[i]; + + err |= fdt_begin_node(fdt, map->name); + err |= fdt_property_string(fdt, "compatible", RESERVE_MEM_KHO_NODE_COMPATIBLE); + err |= fdt_property(fdt, "start", &map->start, sizeof(map->start)); + err |= fdt_property(fdt, "size", &map->size, sizeof(map->size)); + err |= fdt_end_node(fdt); + } + err |= fdt_end_node(fdt); + + err |= fdt_finish(fdt); + + if (err) { + pr_err("failed to prepare memblock FDT for KHO: %d\n", err); + put_page(kho_fdt); + kho_fdt = NULL; + } + + return err; +} + +static int __init reserve_mem_init(void) +{ + int err; + + if (!kho_is_enabled() || !reserved_mem_count) + return 0; + + err = prepare_kho_fdt(); + if (err) + return err; + + err = register_kho_notifier(&reserve_mem_kho_nb); + if (err) { + put_page(kho_fdt); + kho_fdt = NULL; + } + + return err; +} +late_initcall(reserve_mem_init); + +static void *__init reserve_mem_kho_retrieve_fdt(void) +{ + phys_addr_t fdt_phys; + static void *fdt; + int err; + + if (fdt) + return fdt; + + err = kho_retrieve_subtree(MEMBLOCK_KHO_FDT, &fdt_phys); + if (err) { + if (err != -ENOENT) + pr_warn("failed to retrieve FDT '%s' from KHO: %d\n", + MEMBLOCK_KHO_FDT, err); + return NULL; + } + + fdt = phys_to_virt(fdt_phys); + + err = fdt_node_check_compatible(fdt, 0, MEMBLOCK_KHO_NODE_COMPATIBLE); + if (err) { + pr_warn("FDT '%s' is incompatible with '%s': %d\n", + MEMBLOCK_KHO_FDT, MEMBLOCK_KHO_NODE_COMPATIBLE, err); + fdt = NULL; + } + + return fdt; +} + +static bool __init reserve_mem_kho_revive(const char *name, phys_addr_t size, + phys_addr_t align) +{ + int err, len_start, len_size, offset; + const phys_addr_t *p_start, *p_size; + const void *fdt; + + fdt = reserve_mem_kho_retrieve_fdt(); + if (!fdt) + return false; + + offset = fdt_subnode_offset(fdt, 0, name); + if (offset < 0) { + pr_warn("FDT '%s' has no child '%s': %d\n", + MEMBLOCK_KHO_FDT, name, offset); + return false; + } + err = fdt_node_check_compatible(fdt, offset, RESERVE_MEM_KHO_NODE_COMPATIBLE); + if (err) { + pr_warn("Node '%s' is incompatible with '%s': %d\n", + name, RESERVE_MEM_KHO_NODE_COMPATIBLE, err); + return false; + } + + p_start = fdt_getprop(fdt, offset, "start", &len_start); + p_size = fdt_getprop(fdt, offset, "size", &len_size); + if (!p_start || len_start != sizeof(*p_start) || !p_size || + len_size != sizeof(*p_size)) { + return false; + } + + if (*p_start & (align - 1)) { + pr_warn("KHO reserve-mem '%s' has wrong alignment (0x%lx, 0x%lx)\n", + name, (long)align, (long)*p_start); + return false; + } + + if (*p_size != size) { + pr_warn("KHO reserve-mem '%s' has wrong size (0x%lx != 0x%lx)\n", + name, (long)*p_size, (long)size); + return false; + } + + reserved_mem_add(*p_start, size, name); + pr_info("Revived memory reservation '%s' from KHO\n", name); + + return true; +} +#else +static bool __init reserve_mem_kho_revive(const char *name, phys_addr_t size, + phys_addr_t align) +{ + return false; +} +#endif /* CONFIG_KEXEC_HANDOVER */ /* * Parse reserve_mem=nn:align:name @@ -2360,6 +2735,11 @@ static int __init reserve_mem(char *p) if (reserve_mem_find_by_name(name, &start, &tmp)) return -EBUSY; + /* Pick previous allocations up from KHO if available */ + if (reserve_mem_kho_revive(name, size, align)) + return 1; + + /* TODO: Allocation must be outside of scratch region */ start = memblock_phys_alloc(size, align); if (!start) return -ENOMEM; @@ -2377,6 +2757,8 @@ static const char * const flagname[] = { [ilog2(MEMBLOCK_NOMAP)] = "NOMAP", [ilog2(MEMBLOCK_DRIVER_MANAGED)] = "DRV_MNG", [ilog2(MEMBLOCK_RSRV_NOINIT)] = "RSV_NIT", + [ilog2(MEMBLOCK_RSRV_KERN)] = "RSV_KERN", + [ilog2(MEMBLOCK_KHO_SCRATCH)] = "KHO_SCRATCH", }; static int memblock_debug_show(struct seq_file *m, void *private) diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index a071fa43d479..4b94731305b9 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -490,6 +490,19 @@ static void mem_cgroup_threshold(struct mem_cgroup *memcg) } /* Cgroup1: threshold notifications & softlimit tree updates */ + +/* + * Per memcg event counter is incremented at every pagein/pageout. With THP, + * it will be incremented by the number of pages. This counter is used + * to trigger some periodic events. This is straightforward and better + * than using jiffies etc. to handle periodic memcg event. + */ +enum mem_cgroup_events_target { + MEM_CGROUP_TARGET_THRESH, + MEM_CGROUP_TARGET_SOFTLIMIT, + MEM_CGROUP_NTARGETS, +}; + struct memcg1_events_percpu { unsigned long nr_page_events; unsigned long targets[MEM_CGROUP_NTARGETS]; @@ -499,9 +512,9 @@ static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages) { /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) - __count_memcg_events(memcg, PGPGIN, 1); + count_memcg_events(memcg, PGPGIN, 1); else { - __count_memcg_events(memcg, PGPGOUT, 1); + count_memcg_events(memcg, PGPGOUT, 1); nr_pages = -nr_pages; /* for event */ } @@ -568,8 +581,59 @@ void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg) local_irq_restore(flags); } -void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg) +/** + * memcg1_swapout - transfer a memsw charge to swap + * @folio: folio whose memsw charge to transfer + * @entry: swap entry to move the charge to + * + * Transfer the memsw charge of @folio to @entry. + */ +void memcg1_swapout(struct folio *folio, swp_entry_t entry) { + struct mem_cgroup *memcg, *swap_memcg; + unsigned int nr_entries; + + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); + + if (mem_cgroup_disabled()) + return; + + if (!do_memsw_account()) + return; + + memcg = folio_memcg(folio); + + VM_WARN_ON_ONCE_FOLIO(!memcg, folio); + if (!memcg) + return; + + /* + * In case the memcg owning these pages has been offlined and doesn't + * have an ID allocated to it anymore, charge the closest online + * ancestor for the swap instead and transfer the memory+swap charge. + */ + swap_memcg = mem_cgroup_id_get_online(memcg); + nr_entries = folio_nr_pages(folio); + /* Get references for the tail pages, too */ + if (nr_entries > 1) + mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); + mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); + + swap_cgroup_record(folio, mem_cgroup_id(swap_memcg), entry); + + folio_unqueue_deferred_split(folio); + folio->memcg_data = 0; + + if (!mem_cgroup_is_root(memcg)) + page_counter_uncharge(&memcg->memory, nr_entries); + + if (memcg != swap_memcg) { + if (!mem_cgroup_is_root(swap_memcg)) + page_counter_charge(&swap_memcg->memsw, nr_entries); + page_counter_uncharge(&memcg->memsw, nr_entries); + } + /* * Interrupts should be disabled here because the caller holds the * i_pages lock which is taken with interrupts-off. It is @@ -581,6 +645,42 @@ void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg) memcg1_charge_statistics(memcg, -folio_nr_pages(folio)); preempt_enable_nested(); memcg1_check_events(memcg, folio_nid(folio)); + + css_put(&memcg->css); +} + +/* + * memcg1_swapin - uncharge swap slot + * @entry: the first swap entry for which the pages are charged + * @nr_pages: number of pages which will be uncharged + * + * Call this function after successfully adding the charged page to swapcache. + * + * Note: This function assumes the page for which swap slot is being uncharged + * is order 0 page. + */ +void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages) +{ + /* + * Cgroup1's unified memory+swap counter has been charged with the + * new swapcache page, finish the transfer by uncharging the swap + * slot. The swap slot would also get uncharged when it dies, but + * it can stick around indefinitely and we'd count the page twice + * the entire time. + * + * Cgroup2 has separate resource counters for memory and swap, + * so this is a non-issue here. Memory and swap charge lifetimes + * correspond 1:1 to page and swap slot lifetimes: we charge the + * page to memory here, and uncharge swap when the slot is freed. + */ + if (do_memsw_account()) { + /* + * The swap entry might not get freed for a long time, + * let's not wait for it. The page already received a + * memory+swap charge, drop the swap entry duplicate. + */ + mem_cgroup_uncharge_swap(entry, nr_pages); + } } void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, @@ -589,7 +689,7 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, unsigned long flags; local_irq_save(flags); - __count_memcg_events(memcg, PGPGOUT, pgpgout); + count_memcg_events(memcg, PGPGOUT, pgpgout); __this_cpu_add(memcg->events_percpu->nr_page_events, nr_memory); memcg1_check_events(memcg, nid); local_irq_restore(flags); @@ -899,7 +999,7 @@ static void memcg_event_remove(struct work_struct *work) * * Called with wqh->lock held and interrupts disabled. */ -static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, +static int memcg_event_wake(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { struct mem_cgroup_event *event = @@ -1040,13 +1140,13 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, } else if (!strcmp(name, "memory.oom_control")) { pr_warn_once("oom_control is deprecated and will be removed. " "Please report your usecase to linux-mm-@kvack.org" - " if you depend on this functionality. \n"); + " if you depend on this functionality.\n"); event->register_event = mem_cgroup_oom_register_event; event->unregister_event = mem_cgroup_oom_unregister_event; } else if (!strcmp(name, "memory.pressure_level")) { pr_warn_once("pressure_level is deprecated and will be removed. " "Please report your usecase to linux-mm-@kvack.org " - "if you depend on this functionality. \n"); + "if you depend on this functionality.\n"); event->register_event = vmpressure_register_event; event->unregister_event = vmpressure_unregister_event; } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { @@ -1134,8 +1234,8 @@ static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) failed = iter; mem_cgroup_iter_break(memcg, iter); break; - } else - iter->oom_lock = true; + } + iter->oom_lock = true; } if (failed) { @@ -1202,7 +1302,7 @@ struct oom_wait_info { }; static int memcg_oom_wake_function(wait_queue_entry_t *wait, - unsigned mode, int sync, void *arg) + unsigned int mode, int sync, void *arg) { struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; struct mem_cgroup *oom_wait_memcg; @@ -1644,7 +1744,7 @@ static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, unsigned long nr = 0; enum lru_list lru; - VM_BUG_ON((unsigned)nid >= nr_node_ids); + VM_BUG_ON((unsigned int)nid >= nr_node_ids); for_each_lru(lru) { if (!(BIT(lru) & lru_mask)) @@ -1855,9 +1955,11 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, if (val > MAX_SWAPPINESS) return -EINVAL; - if (!mem_cgroup_is_root(memcg)) + if (!mem_cgroup_is_root(memcg)) { + pr_info_once("Per memcg swappiness does not exist in cgroup v2. " + "See memory.reclaim or memory.swap.max there\n "); WRITE_ONCE(memcg->swappiness, val); - else + } else WRITE_ONCE(vm_swappiness, val); return 0; @@ -1881,7 +1983,7 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, pr_warn_once("oom_control is deprecated and will be removed. " "Please report your usecase to linux-mm-@kvack.org if you " - "depend on this functionality. \n"); + "depend on this functionality.\n"); /* cannot set to root cgroup and only 0 and 1 are allowed */ if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1))) @@ -2096,8 +2198,7 @@ bool memcg1_alloc_events(struct mem_cgroup *memcg) void memcg1_free_events(struct mem_cgroup *memcg) { - if (memcg->events_percpu) - free_percpu(memcg->events_percpu); + free_percpu(memcg->events_percpu); } static int __init memcg1_init(void) diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h index 0e3b82951d91..6358464bb416 100644 --- a/mm/memcontrol-v1.h +++ b/mm/memcontrol-v1.h @@ -7,21 +7,6 @@ /* Cgroup v1 and v2 common declarations */ -int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, - unsigned int nr_pages); - -static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, - unsigned int nr_pages) -{ - if (mem_cgroup_is_root(memcg)) - return 0; - - return try_charge_memcg(memcg, gfp_mask, nr_pages); -} - -void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n); -void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n); - /* * Iteration constructs for visiting all cgroups (under a tree). If * loops are exited prematurely (break), mem_cgroup_iter_break() must @@ -37,38 +22,29 @@ void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n); iter != NULL; \ iter = mem_cgroup_iter(NULL, iter, NULL)) -/* Whether legacy memory+swap accounting is active */ -static bool do_memsw_account(void) -{ - return !cgroup_subsys_on_dfl(memory_cgrp_subsys); -} - -/* - * Per memcg event counter is incremented at every pagein/pageout. With THP, - * it will be incremented by the number of pages. This counter is used - * to trigger some periodic events. This is straightforward and better - * than using jiffies etc. to handle periodic memcg event. - */ -enum mem_cgroup_events_target { - MEM_CGROUP_TARGET_THRESH, - MEM_CGROUP_TARGET_SOFTLIMIT, - MEM_CGROUP_NTARGETS, -}; - unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap); void drain_all_stock(struct mem_cgroup *root_memcg); unsigned long memcg_events(struct mem_cgroup *memcg, int event); -unsigned long memcg_events_local(struct mem_cgroup *memcg, int event); -unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx); unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item); -unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item); int memory_stat_show(struct seq_file *m, void *v); +void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n); +struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg); + /* Cgroup v1-specific declarations */ #ifdef CONFIG_MEMCG_V1 +/* Whether legacy memory+swap accounting is active */ +static inline bool do_memsw_account(void) +{ + return !cgroup_subsys_on_dfl(memory_cgrp_subsys); +} + +unsigned long memcg_events_local(struct mem_cgroup *memcg, int event); +unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx); +unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item); bool memcg1_alloc_events(struct mem_cgroup *memcg); void memcg1_free_events(struct mem_cgroup *memcg); @@ -96,7 +72,6 @@ void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked); void memcg1_oom_recover(struct mem_cgroup *memcg); void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg); -void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg); void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, unsigned long nr_memory, int nid); @@ -119,6 +94,7 @@ extern struct cftype mem_cgroup_legacy_files[]; #else /* CONFIG_MEMCG_V1 */ +static inline bool do_memsw_account(void) { return false; } static inline bool memcg1_alloc_events(struct mem_cgroup *memcg) { return true; } static inline void memcg1_free_events(struct mem_cgroup *memcg) {} @@ -134,8 +110,6 @@ static inline void memcg1_oom_recover(struct mem_cgroup *memcg) {} static inline void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg) {} -static inline void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg) {} - static inline void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, unsigned long nr_memory, int nid) {} diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7b3503d12aaf..902da8a9c643 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -29,6 +29,7 @@ #include <linux/page_counter.h> #include <linux/memcontrol.h> #include <linux/cgroup.h> +#include <linux/cpuset.h> #include <linux/sched/mm.h> #include <linux/shmem_fs.h> #include <linux/hugetlb.h> @@ -95,6 +96,9 @@ static bool cgroup_memory_nokmem __ro_after_init; /* BPF memory accounting disabled? */ static bool cgroup_memory_nobpf __ro_after_init; +static struct kmem_cache *memcg_cachep; +static struct kmem_cache *memcg_pn_cachep; + #ifdef CONFIG_CGROUP_WRITEBACK static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); #endif @@ -129,8 +133,7 @@ bool mem_cgroup_kmem_disabled(void) return cgroup_memory_nokmem; } -static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, - unsigned int nr_pages); +static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages); static void obj_cgroup_release(struct percpu_ref *ref) { @@ -163,8 +166,16 @@ static void obj_cgroup_release(struct percpu_ref *ref) WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1)); nr_pages = nr_bytes >> PAGE_SHIFT; - if (nr_pages) - obj_cgroup_uncharge_pages(objcg, nr_pages); + if (nr_pages) { + struct mem_cgroup *memcg; + + memcg = get_mem_cgroup_from_objcg(objcg); + mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages); + memcg1_account_kmem(memcg, -nr_pages); + if (!mem_cgroup_is_root(memcg)) + memcg_uncharge(memcg, nr_pages); + mem_cgroup_put(memcg); + } spin_lock_irqsave(&objcg_lock, flags); list_del(&objcg->list); @@ -315,6 +326,7 @@ static const unsigned int memcg_node_stat_items[] = { PGDEMOTE_KSWAPD, PGDEMOTE_DIRECT, PGDEMOTE_KHUGEPAGED, + PGDEMOTE_PROACTIVE, #ifdef CONFIG_HUGETLB_PAGE NR_HUGETLB, #endif @@ -431,9 +443,11 @@ static const unsigned int memcg_vm_event_stat[] = { PGSCAN_KSWAPD, PGSCAN_DIRECT, PGSCAN_KHUGEPAGED, + PGSCAN_PROACTIVE, PGSTEAL_KSWAPD, PGSTEAL_DIRECT, PGSTEAL_KHUGEPAGED, + PGSTEAL_PROACTIVE, PGFAULT, PGMAJFAULT, PGREFILL, @@ -460,6 +474,8 @@ static const unsigned int memcg_vm_event_stat[] = { NUMA_PAGE_MIGRATE, NUMA_PTE_UPDATES, NUMA_HINT_FAULTS, + NUMA_TASK_MIGRATE, + NUMA_TASK_SWAP, #endif }; @@ -489,8 +505,8 @@ struct memcg_vmstats_percpu { unsigned int stats_updates; /* Cached pointers for fast iteration in memcg_rstat_updated() */ - struct memcg_vmstats_percpu *parent; - struct memcg_vmstats *vmstats; + struct memcg_vmstats_percpu __percpu *parent_pcpu; + struct memcg_vmstats *vmstats; /* The above should fit a single cacheline for memcg_rstat_updated() */ @@ -517,7 +533,7 @@ struct memcg_vmstats { unsigned long events_pending[NR_MEMCG_EVENTS]; /* Stats updates since the last flush */ - atomic64_t stats_updates; + atomic_t stats_updates; }; /* @@ -541,60 +557,43 @@ static u64 flush_last_time; #define FLUSH_TIME (2UL*HZ) -/* - * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can - * not rely on this as part of an acquired spinlock_t lock. These functions are - * never used in hardirq context on PREEMPT_RT and therefore disabling preemtion - * is sufficient. - */ -static void memcg_stats_lock(void) -{ - preempt_disable_nested(); - VM_WARN_ON_IRQS_ENABLED(); -} - -static void __memcg_stats_lock(void) -{ - preempt_disable_nested(); -} - -static void memcg_stats_unlock(void) -{ - preempt_enable_nested(); -} - - static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats) { - return atomic64_read(&vmstats->stats_updates) > + return atomic_read(&vmstats->stats_updates) > MEMCG_CHARGE_BATCH * num_online_cpus(); } -static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) +static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val, + int cpu) { + struct memcg_vmstats_percpu __percpu *statc_pcpu; struct memcg_vmstats_percpu *statc; - int cpu = smp_processor_id(); unsigned int stats_updates; if (!val) return; - cgroup_rstat_updated(memcg->css.cgroup, cpu); - statc = this_cpu_ptr(memcg->vmstats_percpu); - for (; statc; statc = statc->parent) { - stats_updates = READ_ONCE(statc->stats_updates) + abs(val); - WRITE_ONCE(statc->stats_updates, stats_updates); + /* TODO: add to cgroup update tree once it is nmi-safe. */ + if (!in_nmi()) + css_rstat_updated(&memcg->css, cpu); + statc_pcpu = memcg->vmstats_percpu; + for (; statc_pcpu; statc_pcpu = statc->parent_pcpu) { + statc = this_cpu_ptr(statc_pcpu); + /* + * If @memcg is already flushable then all its ancestors are + * flushable as well and also there is no need to increase + * stats_updates. + */ + if (memcg_vmstats_needs_flush(statc->vmstats)) + break; + + stats_updates = this_cpu_add_return(statc_pcpu->stats_updates, + abs(val)); if (stats_updates < MEMCG_CHARGE_BATCH) continue; - /* - * If @memcg is already flush-able, increasing stats_updates is - * redundant. Avoid the overhead of the atomic update. - */ - if (!memcg_vmstats_needs_flush(statc->vmstats)) - atomic64_add(stats_updates, - &statc->vmstats->stats_updates); - WRITE_ONCE(statc->stats_updates, 0); + stats_updates = this_cpu_xchg(statc_pcpu->stats_updates, 0); + atomic_add(stats_updates, &statc->vmstats->stats_updates); } } @@ -602,7 +601,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force) { bool needs_flush = memcg_vmstats_needs_flush(memcg->vmstats); - trace_memcg_flush_stats(memcg, atomic64_read(&memcg->vmstats->stats_updates), + trace_memcg_flush_stats(memcg, atomic_read(&memcg->vmstats->stats_updates), force, needs_flush); if (!force && !needs_flush) @@ -611,7 +610,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force) if (mem_cgroup_is_root(memcg)) WRITE_ONCE(flush_last_time, jiffies_64); - cgroup_rstat_flush(memcg->css.cgroup); + css_rstat_flush(&memcg->css); } /* @@ -684,15 +683,16 @@ static int memcg_state_val_in_pages(int idx, int val) } /** - * __mod_memcg_state - update cgroup memory statistics + * mod_memcg_state - update cgroup memory statistics * @memcg: the memory cgroup * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item * @val: delta to add to the counter, can be negative */ -void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, +void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int val) { int i = memcg_stats_index(idx); + int cpu; if (mem_cgroup_disabled()) return; @@ -700,12 +700,17 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; - __this_cpu_add(memcg->vmstats_percpu->state[i], val); + cpu = get_cpu(); + + this_cpu_add(memcg->vmstats_percpu->state[i], val); val = memcg_state_val_in_pages(idx, val); - memcg_rstat_updated(memcg, val); + memcg_rstat_updated(memcg, val, cpu); trace_mod_memcg_state(memcg, idx, val); + + put_cpu(); } +#ifdef CONFIG_MEMCG_V1 /* idx can be of type enum memcg_stat_item or node_stat_item. */ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) { @@ -722,14 +727,16 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) #endif return x; } +#endif -static void __mod_memcg_lruvec_state(struct lruvec *lruvec, +static void mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { struct mem_cgroup_per_node *pn; struct mem_cgroup *memcg; int i = memcg_stats_index(idx); + int cpu; if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; @@ -737,35 +744,19 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec, pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); memcg = pn->memcg; - /* - * The caller from rmap relies on disabled preemption because they never - * update their counter from in-interrupt context. For these two - * counters we check that the update is never performed from an - * interrupt context while other caller need to have disabled interrupt. - */ - __memcg_stats_lock(); - if (IS_ENABLED(CONFIG_DEBUG_VM)) { - switch (idx) { - case NR_ANON_MAPPED: - case NR_FILE_MAPPED: - case NR_ANON_THPS: - WARN_ON_ONCE(!in_task()); - break; - default: - VM_WARN_ON_IRQS_ENABLED(); - } - } + cpu = get_cpu(); /* Update memcg */ - __this_cpu_add(memcg->vmstats_percpu->state[i], val); + this_cpu_add(memcg->vmstats_percpu->state[i], val); /* Update lruvec */ - __this_cpu_add(pn->lruvec_stats_percpu->state[i], val); + this_cpu_add(pn->lruvec_stats_percpu->state[i], val); val = memcg_state_val_in_pages(idx, val); - memcg_rstat_updated(memcg, val); + memcg_rstat_updated(memcg, val, cpu); trace_mod_memcg_lruvec_state(memcg, idx, val); - memcg_stats_unlock(); + + put_cpu(); } /** @@ -786,7 +777,7 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, /* Update memcg and lruvec */ if (!mem_cgroup_disabled()) - __mod_memcg_lruvec_state(lruvec, idx, val); + mod_memcg_lruvec_state(lruvec, idx, val); } void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, @@ -836,15 +827,16 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) } /** - * __count_memcg_events - account VM events in a cgroup + * count_memcg_events - account VM events in a cgroup * @memcg: the memory cgroup * @idx: the event item * @count: the number of events that occurred */ -void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, +void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { int i = memcg_events_index(idx); + int cpu; if (mem_cgroup_disabled()) return; @@ -852,11 +844,13 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; - memcg_stats_lock(); - __this_cpu_add(memcg->vmstats_percpu->events[i], count); - memcg_rstat_updated(memcg, count); + cpu = get_cpu(); + + this_cpu_add(memcg->vmstats_percpu->events[i], count); + memcg_rstat_updated(memcg, count, cpu); trace_count_memcg_events(memcg, idx, count); - memcg_stats_unlock(); + + put_cpu(); } unsigned long memcg_events(struct mem_cgroup *memcg, int event) @@ -869,6 +863,7 @@ unsigned long memcg_events(struct mem_cgroup *memcg, int event) return READ_ONCE(memcg->vmstats->events[i]); } +#ifdef CONFIG_MEMCG_V1 unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) { int i = memcg_events_index(event); @@ -878,6 +873,7 @@ unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) return READ_ONCE(memcg->vmstats->events_local[i]); } +#endif struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) { @@ -1169,8 +1165,11 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, struct task_struct *task; css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it); - while (!ret && (task = css_task_iter_next(&it))) + while (!ret && (task = css_task_iter_next(&it))) { ret = fn(task, arg); + /* Avoid potential softlockup warning */ + cond_resched(); + } css_task_iter_end(&it); if (ret) { mem_cgroup_iter_break(memcg, iter); @@ -1385,6 +1384,7 @@ static const struct memory_stat memory_stats[] = { { "pgdemote_kswapd", PGDEMOTE_KSWAPD }, { "pgdemote_direct", PGDEMOTE_DIRECT }, { "pgdemote_khugepaged", PGDEMOTE_KHUGEPAGED }, + { "pgdemote_proactive", PGDEMOTE_PROACTIVE }, #ifdef CONFIG_NUMA_BALANCING { "pgpromote_success", PGPROMOTE_SUCCESS }, #endif @@ -1427,6 +1427,7 @@ static int memcg_page_state_output_unit(int item) case PGDEMOTE_KSWAPD: case PGDEMOTE_DIRECT: case PGDEMOTE_KHUGEPAGED: + case PGDEMOTE_PROACTIVE: #ifdef CONFIG_NUMA_BALANCING case PGPROMOTE_SUCCESS: #endif @@ -1442,11 +1443,25 @@ unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item) memcg_page_state_output_unit(item); } +#ifdef CONFIG_MEMCG_V1 unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item) { return memcg_page_state_local(memcg, item) * memcg_page_state_output_unit(item); } +#endif + +#ifdef CONFIG_HUGETLB_PAGE +static bool memcg_accounts_hugetlb(void) +{ + return cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; +} +#else /* CONFIG_HUGETLB_PAGE */ +static bool memcg_accounts_hugetlb(void) +{ + return false; +} +#endif /* CONFIG_HUGETLB_PAGE */ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) { @@ -1469,7 +1484,7 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) #ifdef CONFIG_HUGETLB_PAGE if (unlikely(memory_stats[i].idx == NR_HUGETLB) && - !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)) + !memcg_accounts_hugetlb()) continue; #endif size = memcg_page_state_output(memcg, memory_stats[i].idx); @@ -1486,10 +1501,12 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) seq_buf_printf(s, "pgscan %lu\n", memcg_events(memcg, PGSCAN_KSWAPD) + memcg_events(memcg, PGSCAN_DIRECT) + + memcg_events(memcg, PGSCAN_PROACTIVE) + memcg_events(memcg, PGSCAN_KHUGEPAGED)); seq_buf_printf(s, "pgsteal %lu\n", memcg_events(memcg, PGSTEAL_KSWAPD) + memcg_events(memcg, PGSTEAL_DIRECT) + + memcg_events(memcg, PGSTEAL_PROACTIVE) + memcg_events(memcg, PGSTEAL_KHUGEPAGED)); for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) { @@ -1549,16 +1566,23 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) /* Use static buffer, for the caller is holding oom_lock. */ static char buf[SEQ_BUF_SIZE]; struct seq_buf s; + unsigned long memory_failcnt; lockdep_assert_held(&oom_lock); + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + memory_failcnt = atomic_long_read(&memcg->memory_events[MEMCG_MAX]); + else + memory_failcnt = memcg->memory.failcnt; + pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->memory)), - K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt); + K((u64)READ_ONCE(memcg->memory.max)), memory_failcnt); if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->swap)), - K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt); + K((u64)READ_ONCE(memcg->swap.max)), + atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); #ifdef CONFIG_MEMCG_V1 else { pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", @@ -1627,7 +1651,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, * A few threads which were not waiting at mutex_lock_killable() can * fail to bail out. Therefore, check again after holding oom_lock. */ - ret = task_is_dying() || out_of_memory(&oc); + ret = out_of_memory(&oc); unlock: mutex_unlock(&oom_lock); @@ -1721,28 +1745,45 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) pr_cont(" are going to be killed due to memory.oom.group set\n"); } +/* + * The value of NR_MEMCG_STOCK is selected to keep the cached memcgs and their + * nr_pages in a single cacheline. This may change in future. + */ +#define NR_MEMCG_STOCK 7 +#define FLUSHING_CACHED_CHARGE 0 struct memcg_stock_pcp { - local_lock_t stock_lock; - struct mem_cgroup *cached; /* this never be root cgroup */ - unsigned int nr_pages; + local_trylock_t lock; + uint8_t nr_pages[NR_MEMCG_STOCK]; + struct mem_cgroup *cached[NR_MEMCG_STOCK]; + struct work_struct work; + unsigned long flags; +}; + +static DEFINE_PER_CPU_ALIGNED(struct memcg_stock_pcp, memcg_stock) = { + .lock = INIT_LOCAL_TRYLOCK(lock), +}; + +struct obj_stock_pcp { + local_trylock_t lock; + unsigned int nr_bytes; struct obj_cgroup *cached_objcg; struct pglist_data *cached_pgdat; - unsigned int nr_bytes; int nr_slab_reclaimable_b; int nr_slab_unreclaimable_b; struct work_struct work; unsigned long flags; -#define FLUSHING_CACHED_CHARGE 0 }; -static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = { - .stock_lock = INIT_LOCAL_LOCK(stock_lock), + +static DEFINE_PER_CPU_ALIGNED(struct obj_stock_pcp, obj_stock) = { + .lock = INIT_LOCAL_TRYLOCK(lock), }; + static DEFINE_MUTEX(percpu_charge_mutex); -static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock); -static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, +static void drain_obj_stock(struct obj_stock_pcp *stock); +static bool obj_stock_flush_required(struct obj_stock_pcp *stock, struct mem_cgroup *root_memcg); /** @@ -1750,110 +1791,188 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, * @memcg: memcg to consume from. * @nr_pages: how many pages to charge. * - * The charges will only happen if @memcg matches the current cpu's memcg - * stock, and at least @nr_pages are available in that stock. Failure to - * service an allocation will refill the stock. + * Consume the cached charge if enough nr_pages are present otherwise return + * failure. Also return failure for charge request larger than + * MEMCG_CHARGE_BATCH or if the local lock is already taken. * * returns true if successful, false otherwise. */ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; - unsigned int stock_pages; - unsigned long flags; + uint8_t stock_pages; bool ret = false; + int i; - if (nr_pages > MEMCG_CHARGE_BATCH) + if (nr_pages > MEMCG_CHARGE_BATCH || + !local_trylock(&memcg_stock.lock)) return ret; - local_lock_irqsave(&memcg_stock.stock_lock, flags); - stock = this_cpu_ptr(&memcg_stock); - stock_pages = READ_ONCE(stock->nr_pages); - if (memcg == READ_ONCE(stock->cached) && stock_pages >= nr_pages) { - WRITE_ONCE(stock->nr_pages, stock_pages - nr_pages); - ret = true; + + for (i = 0; i < NR_MEMCG_STOCK; ++i) { + if (memcg != READ_ONCE(stock->cached[i])) + continue; + + stock_pages = READ_ONCE(stock->nr_pages[i]); + if (stock_pages >= nr_pages) { + WRITE_ONCE(stock->nr_pages[i], stock_pages - nr_pages); + ret = true; + } + break; } - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + local_unlock(&memcg_stock.lock); return ret; } +static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages) +{ + page_counter_uncharge(&memcg->memory, nr_pages); + if (do_memsw_account()) + page_counter_uncharge(&memcg->memsw, nr_pages); +} + /* * Returns stocks cached in percpu and reset cached information. */ -static void drain_stock(struct memcg_stock_pcp *stock) +static void drain_stock(struct memcg_stock_pcp *stock, int i) { - unsigned int stock_pages = READ_ONCE(stock->nr_pages); - struct mem_cgroup *old = READ_ONCE(stock->cached); + struct mem_cgroup *old = READ_ONCE(stock->cached[i]); + uint8_t stock_pages; if (!old) return; + stock_pages = READ_ONCE(stock->nr_pages[i]); if (stock_pages) { - page_counter_uncharge(&old->memory, stock_pages); - if (do_memsw_account()) - page_counter_uncharge(&old->memsw, stock_pages); - - WRITE_ONCE(stock->nr_pages, 0); + memcg_uncharge(old, stock_pages); + WRITE_ONCE(stock->nr_pages[i], 0); } css_put(&old->css); - WRITE_ONCE(stock->cached, NULL); + WRITE_ONCE(stock->cached[i], NULL); +} + +static void drain_stock_fully(struct memcg_stock_pcp *stock) +{ + int i; + + for (i = 0; i < NR_MEMCG_STOCK; ++i) + drain_stock(stock, i); } -static void drain_local_stock(struct work_struct *dummy) +static void drain_local_memcg_stock(struct work_struct *dummy) { struct memcg_stock_pcp *stock; - struct obj_cgroup *old = NULL; - unsigned long flags; - /* - * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs. - * drain_stock races is that we always operate on local CPU stock - * here with IRQ disabled - */ - local_lock_irqsave(&memcg_stock.stock_lock, flags); + if (WARN_ONCE(!in_task(), "drain in non-task context")) + return; + + local_lock(&memcg_stock.lock); stock = this_cpu_ptr(&memcg_stock); - old = drain_obj_stock(stock); - drain_stock(stock); + drain_stock_fully(stock); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); - obj_cgroup_put(old); + local_unlock(&memcg_stock.lock); } -/* - * Cache charges(val) to local per_cpu area. - * This will be consumed by consume_stock() function, later. - */ -static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) +static void drain_local_obj_stock(struct work_struct *dummy) +{ + struct obj_stock_pcp *stock; + + if (WARN_ONCE(!in_task(), "drain in non-task context")) + return; + + local_lock(&obj_stock.lock); + + stock = this_cpu_ptr(&obj_stock); + drain_obj_stock(stock); + clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); + + local_unlock(&obj_stock.lock); +} + +static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; - unsigned int stock_pages; + struct mem_cgroup *cached; + uint8_t stock_pages; + bool success = false; + int empty_slot = -1; + int i; + + /* + * For now limit MEMCG_CHARGE_BATCH to 127 and less. In future if we + * decide to increase it more than 127 then we will need more careful + * handling of nr_pages[] in struct memcg_stock_pcp. + */ + BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S8_MAX); + + VM_WARN_ON_ONCE(mem_cgroup_is_root(memcg)); + + if (nr_pages > MEMCG_CHARGE_BATCH || + !local_trylock(&memcg_stock.lock)) { + /* + * In case of larger than batch refill or unlikely failure to + * lock the percpu memcg_stock.lock, uncharge memcg directly. + */ + memcg_uncharge(memcg, nr_pages); + return; + } stock = this_cpu_ptr(&memcg_stock); - if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */ - drain_stock(stock); + for (i = 0; i < NR_MEMCG_STOCK; ++i) { + cached = READ_ONCE(stock->cached[i]); + if (!cached && empty_slot == -1) + empty_slot = i; + if (memcg == READ_ONCE(stock->cached[i])) { + stock_pages = READ_ONCE(stock->nr_pages[i]) + nr_pages; + WRITE_ONCE(stock->nr_pages[i], stock_pages); + if (stock_pages > MEMCG_CHARGE_BATCH) + drain_stock(stock, i); + success = true; + break; + } + } + + if (!success) { + i = empty_slot; + if (i == -1) { + i = get_random_u32_below(NR_MEMCG_STOCK); + drain_stock(stock, i); + } css_get(&memcg->css); - WRITE_ONCE(stock->cached, memcg); + WRITE_ONCE(stock->cached[i], memcg); + WRITE_ONCE(stock->nr_pages[i], nr_pages); } - stock_pages = READ_ONCE(stock->nr_pages) + nr_pages; - WRITE_ONCE(stock->nr_pages, stock_pages); - if (stock_pages > MEMCG_CHARGE_BATCH) - drain_stock(stock); + local_unlock(&memcg_stock.lock); } -static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) +static bool is_memcg_drain_needed(struct memcg_stock_pcp *stock, + struct mem_cgroup *root_memcg) { - unsigned long flags; + struct mem_cgroup *memcg; + bool flush = false; + int i; - local_lock_irqsave(&memcg_stock.stock_lock, flags); - __refill_stock(memcg, nr_pages); - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + rcu_read_lock(); + for (i = 0; i < NR_MEMCG_STOCK; ++i) { + memcg = READ_ONCE(stock->cached[i]); + if (!memcg) + continue; + + if (READ_ONCE(stock->nr_pages[i]) && + mem_cgroup_is_descendant(memcg, root_memcg)) { + flush = true; + break; + } + } + rcu_read_unlock(); + return flush; } /* @@ -1876,25 +1995,27 @@ void drain_all_stock(struct mem_cgroup *root_memcg) migrate_disable(); curcpu = smp_processor_id(); for_each_online_cpu(cpu) { - struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); - struct mem_cgroup *memcg; - bool flush = false; + struct memcg_stock_pcp *memcg_st = &per_cpu(memcg_stock, cpu); + struct obj_stock_pcp *obj_st = &per_cpu(obj_stock, cpu); - rcu_read_lock(); - memcg = READ_ONCE(stock->cached); - if (memcg && READ_ONCE(stock->nr_pages) && - mem_cgroup_is_descendant(memcg, root_memcg)) - flush = true; - else if (obj_stock_flush_required(stock, root_memcg)) - flush = true; - rcu_read_unlock(); + if (!test_bit(FLUSHING_CACHED_CHARGE, &memcg_st->flags) && + is_memcg_drain_needed(memcg_st, root_memcg) && + !test_and_set_bit(FLUSHING_CACHED_CHARGE, + &memcg_st->flags)) { + if (cpu == curcpu) + drain_local_memcg_stock(&memcg_st->work); + else if (!cpu_is_isolated(cpu)) + schedule_work_on(cpu, &memcg_st->work); + } - if (flush && - !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { + if (!test_bit(FLUSHING_CACHED_CHARGE, &obj_st->flags) && + obj_stock_flush_required(obj_st, root_memcg) && + !test_and_set_bit(FLUSHING_CACHED_CHARGE, + &obj_st->flags)) { if (cpu == curcpu) - drain_local_stock(&stock->work); + drain_local_obj_stock(&obj_st->work); else if (!cpu_is_isolated(cpu)) - schedule_work_on(cpu, &stock->work); + schedule_work_on(cpu, &obj_st->work); } } migrate_enable(); @@ -1903,10 +2024,9 @@ void drain_all_stock(struct mem_cgroup *root_memcg) static int memcg_hotplug_cpu_dead(unsigned int cpu) { - struct memcg_stock_pcp *stock; - - stock = &per_cpu(memcg_stock, cpu); - drain_stock(stock); + /* no need for the local lock */ + drain_obj_stock(&per_cpu(obj_stock, cpu)); + drain_stock_fully(&per_cpu(memcg_stock, cpu)); return 0; } @@ -2181,8 +2301,8 @@ out: css_put(&memcg->css); } -int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, - unsigned int nr_pages) +static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, + unsigned int nr_pages) { unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages); int nr_retries = MAX_RECLAIM_RETRIES; @@ -2199,6 +2319,10 @@ retry: if (consume_stock(memcg, nr_pages)) return 0; + if (!gfpflags_allow_spinning(gfp_mask)) + /* Avoid the refill and flush of the older stock */ + batch = nr_pages; + if (!do_memsw_account() || page_counter_try_charge(&memcg->memsw, batch, &counter)) { if (page_counter_try_charge(&memcg->memory, batch, &counter)) @@ -2371,19 +2495,13 @@ done_restock: return 0; } -/** - * mem_cgroup_cancel_charge() - cancel an uncommitted try_charge() call. - * @memcg: memcg previously charged. - * @nr_pages: number of pages previously charged. - */ -void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) +static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, + unsigned int nr_pages) { if (mem_cgroup_is_root(memcg)) - return; + return 0; - page_counter_uncharge(&memcg->memory, nr_pages); - if (do_memsw_account()) - page_counter_uncharge(&memcg->memsw, nr_pages); + return try_charge_memcg(memcg, gfp_mask, nr_pages); } static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) @@ -2399,29 +2517,47 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) folio->memcg_data = (unsigned long)memcg; } -/** - * mem_cgroup_commit_charge - commit a previously successful try_charge(). - * @folio: folio to commit the charge to. - * @memcg: memcg previously charged. - */ -void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg) +#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC +static inline void account_slab_nmi_safe(struct mem_cgroup *memcg, + struct pglist_data *pgdat, + enum node_stat_item idx, int nr) { - css_get(&memcg->css); - commit_charge(folio, memcg); - memcg1_commit_charge(folio, memcg); + struct lruvec *lruvec; + + if (likely(!in_nmi())) { + lruvec = mem_cgroup_lruvec(memcg, pgdat); + mod_memcg_lruvec_state(lruvec, idx, nr); + } else { + struct mem_cgroup_per_node *pn = memcg->nodeinfo[pgdat->node_id]; + + /* TODO: add to cgroup update tree once it is nmi-safe. */ + if (idx == NR_SLAB_RECLAIMABLE_B) + atomic_add(nr, &pn->slab_reclaimable); + else + atomic_add(nr, &pn->slab_unreclaimable); + } } +#else +static inline void account_slab_nmi_safe(struct mem_cgroup *memcg, + struct pglist_data *pgdat, + enum node_stat_item idx, int nr) +{ + struct lruvec *lruvec; + + lruvec = mem_cgroup_lruvec(memcg, pgdat); + mod_memcg_lruvec_state(lruvec, idx, nr); +} +#endif -static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg, +static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr) { struct mem_cgroup *memcg; - struct lruvec *lruvec; rcu_read_lock(); memcg = obj_cgroup_memcg(objcg); - lruvec = mem_cgroup_lruvec(memcg, pgdat); - __mod_memcg_lruvec_state(lruvec, idx, nr); + account_slab_nmi_safe(memcg, pgdat, idx, nr); rcu_read_unlock(); } @@ -2546,6 +2682,9 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void) struct mem_cgroup *memcg; struct obj_cgroup *objcg; + if (IS_ENABLED(CONFIG_MEMCG_NMI_UNSAFE) && in_nmi()) + return NULL; + if (in_task()) { memcg = current->active_memcg; if (unlikely(memcg)) @@ -2608,6 +2747,23 @@ struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio) return objcg; } +#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC +static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val) +{ + if (likely(!in_nmi())) { + mod_memcg_state(memcg, MEMCG_KMEM, val); + } else { + /* TODO: add to cgroup update tree once it is nmi-safe. */ + atomic_add(val, &memcg->kmem_stat); + } +} +#else +static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val) +{ + mod_memcg_state(memcg, MEMCG_KMEM, val); +} +#endif + /* * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg * @objcg: object cgroup to uncharge @@ -2620,9 +2776,10 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, memcg = get_mem_cgroup_from_objcg(objcg); - mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages); + account_kmem_nmi_safe(memcg, -nr_pages); memcg1_account_kmem(memcg, -nr_pages); - refill_stock(memcg, nr_pages); + if (!mem_cgroup_is_root(memcg)) + refill_stock(memcg, nr_pages); css_put(&memcg->css); } @@ -2647,7 +2804,7 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, if (ret) goto out; - mod_memcg_state(memcg, MEMCG_KMEM, nr_pages); + account_kmem_nmi_safe(memcg, nr_pages); memcg1_account_kmem(memcg, nr_pages); out: css_put(&memcg->css); @@ -2655,6 +2812,23 @@ out: return ret; } +static struct obj_cgroup *page_objcg(const struct page *page) +{ + unsigned long memcg_data = page->memcg_data; + + if (mem_cgroup_disabled() || !memcg_data) + return NULL; + + VM_BUG_ON_PAGE((memcg_data & OBJEXTS_FLAGS_MASK) != MEMCG_DATA_KMEM, + page); + return (struct obj_cgroup *)(memcg_data - MEMCG_DATA_KMEM); +} + +static void page_set_objcg(struct page *page, const struct obj_cgroup *objcg) +{ + page->memcg_data = (unsigned long)objcg | MEMCG_DATA_KMEM; +} + /** * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup * @page: page to charge @@ -2673,8 +2847,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order); if (!ret) { obj_cgroup_get(objcg); - page->memcg_data = (unsigned long)objcg | - MEMCG_DATA_KMEM; + page_set_objcg(page, objcg); return 0; } } @@ -2688,53 +2861,38 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) */ void __memcg_kmem_uncharge_page(struct page *page, int order) { - struct folio *folio = page_folio(page); - struct obj_cgroup *objcg; + struct obj_cgroup *objcg = page_objcg(page); unsigned int nr_pages = 1 << order; - if (!folio_memcg_kmem(folio)) + if (!objcg) return; - objcg = __folio_objcg(folio); obj_cgroup_uncharge_pages(objcg, nr_pages); - folio->memcg_data = 0; + page->memcg_data = 0; obj_cgroup_put(objcg); } -static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, - enum node_stat_item idx, int nr) +static void __account_obj_stock(struct obj_cgroup *objcg, + struct obj_stock_pcp *stock, int nr, + struct pglist_data *pgdat, enum node_stat_item idx) { - struct memcg_stock_pcp *stock; - struct obj_cgroup *old = NULL; - unsigned long flags; int *bytes; - local_lock_irqsave(&memcg_stock.stock_lock, flags); - stock = this_cpu_ptr(&memcg_stock); - /* * Save vmstat data in stock and skip vmstat array update unless - * accumulating over a page of vmstat data or when pgdat or idx - * changes. + * accumulating over a page of vmstat data or when pgdat changes. */ - if (READ_ONCE(stock->cached_objcg) != objcg) { - old = drain_obj_stock(stock); - obj_cgroup_get(objcg); - stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) - ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; - WRITE_ONCE(stock->cached_objcg, objcg); - stock->cached_pgdat = pgdat; - } else if (stock->cached_pgdat != pgdat) { + if (stock->cached_pgdat != pgdat) { /* Flush the existing cached vmstat data */ struct pglist_data *oldpg = stock->cached_pgdat; if (stock->nr_slab_reclaimable_b) { - __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B, + mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B, stock->nr_slab_reclaimable_b); stock->nr_slab_reclaimable_b = 0; } if (stock->nr_slab_unreclaimable_b) { - __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B, + mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B, stock->nr_slab_unreclaimable_b); stock->nr_slab_unreclaimable_b = 0; } @@ -2760,37 +2918,38 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, } } if (nr) - __mod_objcg_mlstate(objcg, pgdat, idx, nr); - - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); - obj_cgroup_put(old); + mod_objcg_mlstate(objcg, pgdat, idx, nr); } -static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) +static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, + struct pglist_data *pgdat, enum node_stat_item idx) { - struct memcg_stock_pcp *stock; - unsigned long flags; + struct obj_stock_pcp *stock; bool ret = false; - local_lock_irqsave(&memcg_stock.stock_lock, flags); + if (!local_trylock(&obj_stock.lock)) + return ret; - stock = this_cpu_ptr(&memcg_stock); + stock = this_cpu_ptr(&obj_stock); if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) { stock->nr_bytes -= nr_bytes; ret = true; + + if (pgdat) + __account_obj_stock(objcg, stock, nr_bytes, pgdat, idx); } - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + local_unlock(&obj_stock.lock); return ret; } -static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) +static void drain_obj_stock(struct obj_stock_pcp *stock) { struct obj_cgroup *old = READ_ONCE(stock->cached_objcg); if (!old) - return NULL; + return; if (stock->nr_bytes) { unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; @@ -2803,7 +2962,8 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages); memcg1_account_kmem(memcg, -nr_pages); - __refill_stock(memcg, nr_pages); + if (!mem_cgroup_is_root(memcg)) + memcg_uncharge(memcg, nr_pages); css_put(&memcg->css); } @@ -2827,13 +2987,13 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) */ if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) { if (stock->nr_slab_reclaimable_b) { - __mod_objcg_mlstate(old, stock->cached_pgdat, + mod_objcg_mlstate(old, stock->cached_pgdat, NR_SLAB_RECLAIMABLE_B, stock->nr_slab_reclaimable_b); stock->nr_slab_reclaimable_b = 0; } if (stock->nr_slab_unreclaimable_b) { - __mod_objcg_mlstate(old, stock->cached_pgdat, + mod_objcg_mlstate(old, stock->cached_pgdat, NR_SLAB_UNRECLAIMABLE_B, stock->nr_slab_unreclaimable_b); stock->nr_slab_unreclaimable_b = 0; @@ -2842,67 +3002,76 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) } WRITE_ONCE(stock->cached_objcg, NULL); - /* - * The `old' objects needs to be released by the caller via - * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock. - */ - return old; + obj_cgroup_put(old); } -static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, +static bool obj_stock_flush_required(struct obj_stock_pcp *stock, struct mem_cgroup *root_memcg) { struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg); struct mem_cgroup *memcg; + bool flush = false; + rcu_read_lock(); if (objcg) { memcg = obj_cgroup_memcg(objcg); if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) - return true; + flush = true; } + rcu_read_unlock(); - return false; + return flush; } static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, - bool allow_uncharge) + bool allow_uncharge, int nr_acct, struct pglist_data *pgdat, + enum node_stat_item idx) { - struct memcg_stock_pcp *stock; - struct obj_cgroup *old = NULL; - unsigned long flags; + struct obj_stock_pcp *stock; unsigned int nr_pages = 0; - local_lock_irqsave(&memcg_stock.stock_lock, flags); + if (!local_trylock(&obj_stock.lock)) { + if (pgdat) + mod_objcg_mlstate(objcg, pgdat, idx, nr_bytes); + nr_pages = nr_bytes >> PAGE_SHIFT; + nr_bytes = nr_bytes & (PAGE_SIZE - 1); + atomic_add(nr_bytes, &objcg->nr_charged_bytes); + goto out; + } - stock = this_cpu_ptr(&memcg_stock); + stock = this_cpu_ptr(&obj_stock); if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */ - old = drain_obj_stock(stock); + drain_obj_stock(stock); obj_cgroup_get(objcg); - WRITE_ONCE(stock->cached_objcg, objcg); stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; + WRITE_ONCE(stock->cached_objcg, objcg); + allow_uncharge = true; /* Allow uncharge when objcg changes */ } stock->nr_bytes += nr_bytes; + if (pgdat) + __account_obj_stock(objcg, stock, nr_acct, pgdat, idx); + if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) { nr_pages = stock->nr_bytes >> PAGE_SHIFT; stock->nr_bytes &= (PAGE_SIZE - 1); } - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); - obj_cgroup_put(old); - + local_unlock(&obj_stock.lock); +out: if (nr_pages) obj_cgroup_uncharge_pages(objcg, nr_pages); } -int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) +static int obj_cgroup_charge_account(struct obj_cgroup *objcg, gfp_t gfp, size_t size, + struct pglist_data *pgdat, enum node_stat_item idx) { unsigned int nr_pages, nr_bytes; int ret; - if (consume_obj_stock(objcg, size)) + if (likely(consume_obj_stock(objcg, size, pgdat, idx))) return 0; /* @@ -2935,15 +3104,21 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) nr_pages += 1; ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages); - if (!ret && nr_bytes) - refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false); + if (!ret && (nr_bytes || pgdat)) + refill_obj_stock(objcg, nr_bytes ? PAGE_SIZE - nr_bytes : 0, + false, size, pgdat, idx); return ret; } +int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) +{ + return obj_cgroup_charge_account(objcg, gfp, size, NULL, 0); +} + void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) { - refill_obj_stock(objcg, size, true); + refill_obj_stock(objcg, size, true, 0, NULL, 0); } static inline size_t obj_full_size(struct kmem_cache *s) @@ -2995,23 +3170,32 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, return false; } - if (obj_cgroup_charge(objcg, flags, size * obj_full_size(s))) - return false; - for (i = 0; i < size; i++) { slab = virt_to_slab(p[i]); if (!slab_obj_exts(slab) && alloc_slab_obj_exts(slab, s, flags, false)) { - obj_cgroup_uncharge(objcg, obj_full_size(s)); continue; } + /* + * if we fail and size is 1, memcg_alloc_abort_single() will + * just free the object, which is ok as we have not assigned + * objcg to its obj_ext yet + * + * for larger sizes, kmem_cache_free_bulk() will uncharge + * any objects that were already charged and obj_ext assigned + * + * TODO: we could batch this until slab_pgdat(slab) changes + * between iterations, with a more complicated undo + */ + if (obj_cgroup_charge_account(objcg, flags, obj_full_size(s), + slab_pgdat(slab), cache_vmstat_idx(s))) + return false; + off = obj_to_index(s, slab, p[i]); obj_cgroup_get(objcg); slab_obj_exts(slab)[off].objcg = objcg; - mod_objcg_state(objcg, slab_pgdat(slab), - cache_vmstat_idx(s), obj_full_size(s)); } return true; @@ -3020,6 +3204,8 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects, struct slabobj_ext *obj_exts) { + size_t obj_size = obj_full_size(s); + for (int i = 0; i < objects; i++) { struct obj_cgroup *objcg; unsigned int off; @@ -3030,33 +3216,40 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, continue; obj_exts[off].objcg = NULL; - obj_cgroup_uncharge(objcg, obj_full_size(s)); - mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), - -obj_full_size(s)); + refill_obj_stock(objcg, obj_size, true, -obj_size, + slab_pgdat(slab), cache_vmstat_idx(s)); obj_cgroup_put(objcg); } } /* - * Because folio_memcg(head) is not set on tails, set it now. + * The objcg is only set on the first page, so transfer it to all the + * other pages. */ -void split_page_memcg(struct page *head, int old_order, int new_order) +void split_page_memcg(struct page *page, unsigned order) { - struct folio *folio = page_folio(head); - int i; - unsigned int old_nr = 1 << old_order; - unsigned int new_nr = 1 << new_order; + struct obj_cgroup *objcg = page_objcg(page); + unsigned int i, nr = 1 << order; - if (mem_cgroup_disabled() || !folio_memcg_charged(folio)) + if (!objcg) return; - for (i = new_nr; i < old_nr; i += new_nr) - folio_page(folio, i)->memcg_data = folio->memcg_data; + for (i = 1; i < nr; i++) + page_set_objcg(&page[i], objcg); - if (folio_memcg_kmem(folio)) - obj_cgroup_get_many(__folio_objcg(folio), old_nr / new_nr - 1); - else - css_get_many(&folio_memcg(folio)->css, old_nr / new_nr - 1); + obj_cgroup_get_many(objcg, nr - 1); +} + +void folio_split_memcg_refs(struct folio *folio, unsigned old_order, + unsigned new_order) +{ + unsigned new_refs; + + if (mem_cgroup_disabled() || !folio_memcg_charged(folio)) + return; + + new_refs = (1 << (old_order - new_order)) - 1; + css_get_many(&__folio_memcg(folio)->css, new_refs); } unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) @@ -3384,7 +3577,7 @@ void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg, refcount_add(n, &memcg->id.ref); } -void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) +static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) { if (refcount_sub_and_test(n, &memcg->id.ref)) { mem_cgroup_id_remove(memcg); @@ -3399,6 +3592,24 @@ static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) mem_cgroup_id_put_many(memcg, 1); } +struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) +{ + while (!refcount_inc_not_zero(&memcg->id.ref)) { + /* + * The root cgroup cannot be destroyed, so it's refcount must + * always be >= 1. + */ + if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) { + VM_BUG_ON(1); + break; + } + memcg = parent_mem_cgroup(memcg); + if (!memcg) + memcg = root_mem_cgroup; + } + return memcg; +} + /** * mem_cgroup_from_id - look up a memcg from a memcg id * @id: the memcg id to look up @@ -3434,11 +3645,22 @@ struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) } #endif +static void free_mem_cgroup_per_node_info(struct mem_cgroup_per_node *pn) +{ + if (!pn) + return; + + free_percpu(pn->lruvec_stats_percpu); + kfree(pn->lruvec_stats); + kfree(pn); +} + static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; - pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node); + pn = kmem_cache_alloc_node(memcg_pn_cachep, GFP_KERNEL | __GFP_ZERO, + node); if (!pn) return false; @@ -3458,23 +3680,10 @@ static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) memcg->nodeinfo[node] = pn; return true; fail: - kfree(pn->lruvec_stats); - kfree(pn); + free_mem_cgroup_per_node_info(pn); return false; } -static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) -{ - struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; - - if (!pn) - return; - - free_percpu(pn->lruvec_stats_percpu); - kfree(pn->lruvec_stats); - kfree(pn); -} - static void __mem_cgroup_free(struct mem_cgroup *memcg) { int node; @@ -3482,7 +3691,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) obj_cgroup_put(memcg->orig_objcg); for_each_node(node) - free_mem_cgroup_per_node_info(memcg, node); + free_mem_cgroup_per_node_info(memcg->nodeinfo[node]); memcg1_free_events(memcg); kfree(memcg->vmstats); free_percpu(memcg->vmstats_percpu); @@ -3498,13 +3707,14 @@ static void mem_cgroup_free(struct mem_cgroup *memcg) static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) { - struct memcg_vmstats_percpu *statc, *pstatc; + struct memcg_vmstats_percpu *statc; + struct memcg_vmstats_percpu __percpu *pstatc_pcpu; struct mem_cgroup *memcg; int node, cpu; int __maybe_unused i; long error; - memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL); + memcg = kmem_cache_zalloc(memcg_cachep, GFP_KERNEL); if (!memcg) return ERR_PTR(-ENOMEM); @@ -3529,9 +3739,9 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) for_each_possible_cpu(cpu) { if (parent) - pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu); + pstatc_pcpu = parent->vmstats_percpu; statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); - statc->parent = parent ? pstatc : NULL; + statc->parent_pcpu = parent ? pstatc_pcpu : NULL; statc->vmstats = memcg->vmstats; } @@ -3575,6 +3785,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct mem_cgroup *parent = mem_cgroup_from_css(parent_css); struct mem_cgroup *memcg, *old_memcg; + bool memcg_on_dfl = cgroup_subsys_on_dfl(memory_cgrp_subsys); old_memcg = set_active_memcg(parent); memcg = mem_cgroup_alloc(parent); @@ -3592,9 +3803,10 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (parent) { WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent)); - page_counter_init(&memcg->memory, &parent->memory, true); + page_counter_init(&memcg->memory, &parent->memory, memcg_on_dfl); page_counter_init(&memcg->swap, &parent->swap, false); #ifdef CONFIG_MEMCG_V1 + memcg->memory.track_failcnt = !memcg_on_dfl; WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable)); page_counter_init(&memcg->kmem, &parent->kmem, false); page_counter_init(&memcg->tcpmem, &parent->tcpmem, false); @@ -3612,7 +3824,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) return &memcg->css; } - if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) + if (memcg_on_dfl && !cgroup_memory_nosocket) static_branch_inc(&memcg_sockets_enabled_key); if (!cgroup_memory_nobpf) @@ -3803,6 +4015,53 @@ static void mem_cgroup_stat_aggregate(struct aggregate_control *ac) } } +#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC +static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent, + int cpu) +{ + int nid; + + if (atomic_read(&memcg->kmem_stat)) { + int kmem = atomic_xchg(&memcg->kmem_stat, 0); + int index = memcg_stats_index(MEMCG_KMEM); + + memcg->vmstats->state[index] += kmem; + if (parent) + parent->vmstats->state_pending[index] += kmem; + } + + for_each_node_state(nid, N_MEMORY) { + struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; + struct lruvec_stats *lstats = pn->lruvec_stats; + struct lruvec_stats *plstats = NULL; + + if (parent) + plstats = parent->nodeinfo[nid]->lruvec_stats; + + if (atomic_read(&pn->slab_reclaimable)) { + int slab = atomic_xchg(&pn->slab_reclaimable, 0); + int index = memcg_stats_index(NR_SLAB_RECLAIMABLE_B); + + lstats->state[index] += slab; + if (plstats) + plstats->state_pending[index] += slab; + } + if (atomic_read(&pn->slab_unreclaimable)) { + int slab = atomic_xchg(&pn->slab_unreclaimable, 0); + int index = memcg_stats_index(NR_SLAB_UNRECLAIMABLE_B); + + lstats->state[index] += slab; + if (plstats) + plstats->state_pending[index] += slab; + } + } +} +#else +static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent, + int cpu) +{} +#endif + static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); @@ -3811,6 +4070,8 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) struct aggregate_control ac; int nid; + flush_nmi_stats(memcg, parent, cpu); + statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); ac = (struct aggregate_control) { @@ -3860,8 +4121,8 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) } WRITE_ONCE(statc->stats_updates, 0); /* We are in a per-cpu loop here, only do the atomic write once */ - if (atomic64_read(&memcg->vmstats->stats_updates)) - atomic64_set(&memcg->vmstats->stats_updates, 0); + if (atomic_read(&memcg->vmstats->stats_updates)) + atomic_set(&memcg->vmstats->stats_updates, 0); } static void mem_cgroup_fork(struct task_struct *task) @@ -4014,7 +4275,7 @@ static ssize_t peak_write(struct kernfs_open_file *of, char *buf, size_t nbytes, WRITE_ONCE(peer_ctx->value, usage); /* initial write, register watcher */ - if (ofp->value == -1) + if (ofp->value == OFP_PEAK_UNSET) list_add(&ofp->list, watchers); WRITE_ONCE(ofp->value, usage); @@ -4102,6 +4363,9 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, page_counter_set_high(&memcg->memory, high); + if (of->file->f_flags & O_NONBLOCK) + goto out; + for (;;) { unsigned long nr_pages = page_counter_read(&memcg->memory); unsigned long reclaimed; @@ -4124,7 +4388,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, if (!reclaimed && !nr_retries--) break; } - +out: memcg_wb_domain_size_changed(memcg); return nbytes; } @@ -4151,6 +4415,9 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, xchg(&memcg->memory.max, max); + if (of->file->f_flags & O_NONBLOCK) + goto out; + for (;;) { unsigned long nr_pages = page_counter_read(&memcg->memory); @@ -4176,8 +4443,9 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, memcg_memory_event(memcg, MEMCG_OOM); if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) break; + cond_resched(); } - +out: memcg_wb_domain_size_changed(memcg); return nbytes; } @@ -4300,11 +4568,13 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of, enum { MEMORY_RECLAIM_SWAPPINESS = 0, + MEMORY_RECLAIM_SWAPPINESS_MAX, MEMORY_RECLAIM_NULL, }; static const match_table_t tokens = { { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"}, + { MEMORY_RECLAIM_SWAPPINESS_MAX, "swappiness=max"}, { MEMORY_RECLAIM_NULL, NULL }, }; @@ -4338,6 +4608,9 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, if (swappiness < MIN_SWAPPINESS || swappiness > MAX_SWAPPINESS) return -EINVAL; break; + case MEMORY_RECLAIM_SWAPPINESS_MAX: + swappiness = SWAPPINESS_ANON_ONLY; + break; default: return -EINVAL; } @@ -4498,7 +4771,9 @@ static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg, if (ret) goto out; - mem_cgroup_commit_charge(folio, memcg); + css_get(&memcg->css); + commit_charge(folio, memcg); + memcg1_commit_charge(folio, memcg); out: return ret; } @@ -4516,38 +4791,37 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) } /** - * mem_cgroup_hugetlb_try_charge - try to charge the memcg for a hugetlb folio - * @memcg: memcg to charge. - * @gfp: reclaim mode. - * @nr_pages: number of pages to charge. - * - * This function is called when allocating a huge page folio to determine if - * the memcg has the capacity for it. It does not commit the charge yet, - * as the hugetlb folio itself has not been obtained from the hugetlb pool. + * mem_cgroup_charge_hugetlb - charge the memcg for a hugetlb folio + * @folio: folio being charged + * @gfp: reclaim mode * - * Once we have obtained the hugetlb folio, we can call - * mem_cgroup_commit_charge() to commit the charge. If we fail to obtain the - * folio, we should instead call mem_cgroup_cancel_charge() to undo the effect - * of try_charge(). + * This function is called when allocating a huge page folio, after the page has + * already been obtained and charged to the appropriate hugetlb cgroup + * controller (if it is enabled). * - * Returns 0 on success. Otherwise, an error code is returned. + * Returns ENOMEM if the memcg is already full. + * Returns 0 if either the charge was successful, or if we skip the charging. */ -int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, - long nr_pages) +int mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp) { + struct mem_cgroup *memcg = get_mem_cgroup_from_current(); + int ret = 0; + /* - * If hugetlb memcg charging is not enabled, do not fail hugetlb allocation, - * but do not attempt to commit charge later (or cancel on error) either. + * Even memcg does not account for hugetlb, we still want to update + * system-level stats via lruvec_stat_mod_folio. Return 0, and skip + * charging the memcg. */ - if (mem_cgroup_disabled() || !memcg || - !cgroup_subsys_on_dfl(memory_cgrp_subsys) || - !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)) - return -EOPNOTSUPP; + if (mem_cgroup_disabled() || !memcg_accounts_hugetlb() || + !memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + goto out; - if (try_charge(memcg, gfp, nr_pages)) - return -ENOMEM; + if (charge_memcg(folio, memcg, gfp)) + ret = -ENOMEM; - return 0; +out: + mem_cgroup_put(memcg); + return ret; } /** @@ -4585,40 +4859,6 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, return ret; } -/* - * mem_cgroup_swapin_uncharge_swap - uncharge swap slot - * @entry: the first swap entry for which the pages are charged - * @nr_pages: number of pages which will be uncharged - * - * Call this function after successfully adding the charged page to swapcache. - * - * Note: This function assumes the page for which swap slot is being uncharged - * is order 0 page. - */ -void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) -{ - /* - * Cgroup1's unified memory+swap counter has been charged with the - * new swapcache page, finish the transfer by uncharging the swap - * slot. The swap slot would also get uncharged when it dies, but - * it can stick around indefinitely and we'd count the page twice - * the entire time. - * - * Cgroup2 has separate resource counters for memory and swap, - * so this is a non-issue here. Memory and swap charge lifetimes - * correspond 1:1 to page and swap slot lifetimes: we charge the - * page to memory here, and uncharge swap when the slot is freed. - */ - if (!mem_cgroup_disabled() && do_memsw_account()) { - /* - * The swap entry might not get freed for a long time, - * let's not wait for it. The page already received a - * memory+swap charge, drop the swap entry duplicate. - */ - mem_cgroup_uncharge_swap(entry, nr_pages); - } -} - struct uncharge_gather { struct mem_cgroup *memcg; unsigned long nr_memory; @@ -4635,9 +4875,7 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug) static void uncharge_batch(const struct uncharge_gather *ug) { if (ug->nr_memory) { - page_counter_uncharge(&ug->memcg->memory, ug->nr_memory); - if (do_memsw_account()) - page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory); + memcg_uncharge(ug->memcg, ug->nr_memory); if (ug->nr_kmem) { mod_memcg_state(ug->memcg, MEMCG_KMEM, -ug->nr_kmem); memcg1_account_kmem(ug->memcg, -ug->nr_kmem); @@ -4869,7 +5107,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return memcg1_charge_skmem(memcg, nr_pages, gfp_mask); - if (try_charge(memcg, gfp_mask, nr_pages) == 0) { + if (try_charge_memcg(memcg, gfp_mask, nr_pages) == 0) { mod_memcg_state(memcg, MEMCG_SOCK, nr_pages); return true; } @@ -4913,15 +5151,16 @@ static int __init cgroup_memory(char *s) __setup("cgroup.memory=", cgroup_memory); /* - * subsys_initcall() for memory controller. + * Memory controller init before cgroup_init() initialize root_mem_cgroup. * * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this * context because of lock dependencies (cgroup_lock -> cpu hotplug) but * basically everything that doesn't depend on a specific mem_cgroup structure * should be initialized from here. */ -static int __init mem_cgroup_init(void) +int __init mem_cgroup_init(void) { + unsigned int memcg_size; int cpu; /* @@ -4935,92 +5174,24 @@ static int __init mem_cgroup_init(void) cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, memcg_hotplug_cpu_dead); - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, - drain_local_stock); - - return 0; -} -subsys_initcall(mem_cgroup_init); - -#ifdef CONFIG_SWAP -static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) -{ - while (!refcount_inc_not_zero(&memcg->id.ref)) { - /* - * The root cgroup cannot be destroyed, so it's refcount must - * always be >= 1. - */ - if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) { - VM_BUG_ON(1); - break; - } - memcg = parent_mem_cgroup(memcg); - if (!memcg) - memcg = root_mem_cgroup; + drain_local_memcg_stock); + INIT_WORK(&per_cpu_ptr(&obj_stock, cpu)->work, + drain_local_obj_stock); } - return memcg; -} - -/** - * mem_cgroup_swapout - transfer a memsw charge to swap - * @folio: folio whose memsw charge to transfer - * @entry: swap entry to move the charge to - * - * Transfer the memsw charge of @folio to @entry. - */ -void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) -{ - struct mem_cgroup *memcg, *swap_memcg; - unsigned int nr_entries; - unsigned short oldid; - - VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); - VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); - - if (mem_cgroup_disabled()) - return; - - if (!do_memsw_account()) - return; - - memcg = folio_memcg(folio); - - VM_WARN_ON_ONCE_FOLIO(!memcg, folio); - if (!memcg) - return; - - /* - * In case the memcg owning these pages has been offlined and doesn't - * have an ID allocated to it anymore, charge the closest online - * ancestor for the swap instead and transfer the memory+swap charge. - */ - swap_memcg = mem_cgroup_id_get_online(memcg); - nr_entries = folio_nr_pages(folio); - /* Get references for the tail pages, too */ - if (nr_entries > 1) - mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); - oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), - nr_entries); - VM_BUG_ON_FOLIO(oldid, folio); - mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); - folio_unqueue_deferred_split(folio); - folio->memcg_data = 0; + memcg_size = struct_size_t(struct mem_cgroup, nodeinfo, nr_node_ids); + memcg_cachep = kmem_cache_create("mem_cgroup", memcg_size, 0, + SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL); - if (!mem_cgroup_is_root(memcg)) - page_counter_uncharge(&memcg->memory, nr_entries); + memcg_pn_cachep = KMEM_CACHE(mem_cgroup_per_node, + SLAB_PANIC | SLAB_HWCACHE_ALIGN); - if (memcg != swap_memcg) { - if (!mem_cgroup_is_root(swap_memcg)) - page_counter_charge(&swap_memcg->memsw, nr_entries); - page_counter_uncharge(&memcg->memsw, nr_entries); - } - - memcg1_swapout(folio, memcg); - css_put(&memcg->css); + return 0; } +#ifdef CONFIG_SWAP /** * __mem_cgroup_try_charge_swap - try charging swap space for a folio * @folio: folio being added to swap @@ -5035,7 +5206,6 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) unsigned int nr_pages = folio_nr_pages(folio); struct page_counter *counter; struct mem_cgroup *memcg; - unsigned short oldid; if (do_memsw_account()) return 0; @@ -5064,10 +5234,10 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) /* Get references for the tail pages, too */ if (nr_pages > 1) mem_cgroup_id_get_many(memcg, nr_pages - 1); - oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); - VM_BUG_ON_FOLIO(oldid, folio); mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); + swap_cgroup_record(folio, mem_cgroup_id(memcg), entry); + return 0; } @@ -5081,7 +5251,7 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) struct mem_cgroup *memcg; unsigned short id; - id = swap_cgroup_record(entry, 0, nr_pages); + id = swap_cgroup_clear(entry, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_id(id); if (memcg) { @@ -5473,3 +5643,8 @@ static int __init mem_cgroup_swap_init(void) subsys_initcall(mem_cgroup_swap_init); #endif /* CONFIG_SWAP */ + +bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid) +{ + return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true; +} diff --git a/mm/memfd.c b/mm/memfd.c index c17c3ea701a1..ab367e61553d 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -20,6 +20,7 @@ #include <linux/memfd.h> #include <linux/pid_namespace.h> #include <uapi/linux/memfd.h> +#include "swap.h" /* * We need a tag: a new tag would expand every xa_node by 8 bytes, @@ -259,7 +260,7 @@ static int memfd_add_seals(struct file *file, unsigned int seals) } /* - * SEAL_EXEC implys SEAL_WRITE, making W^X from the start. + * SEAL_EXEC implies SEAL_WRITE, making W^X from the start. */ if (seals & F_SEAL_EXEC && inode->i_mode & 0111) seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE; @@ -327,15 +328,51 @@ static int check_sysctl_memfd_noexec(unsigned int *flags) return 0; } -SYSCALL_DEFINE2(memfd_create, - const char __user *, uname, - unsigned int, flags) +static inline bool is_write_sealed(unsigned int seals) { - unsigned int *file_seals; - struct file *file; - int fd, error; - char *name; - long len; + return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); +} + +static int check_write_seal(unsigned long *vm_flags_ptr) +{ + unsigned long vm_flags = *vm_flags_ptr; + unsigned long mask = vm_flags & (VM_SHARED | VM_WRITE); + + /* If a private mapping then writability is irrelevant. */ + if (!(mask & VM_SHARED)) + return 0; + + /* + * New PROT_WRITE and MAP_SHARED mmaps are not allowed when + * write seals are active. + */ + if (mask & VM_WRITE) + return -EPERM; + + /* + * This is a read-only mapping, disallow mprotect() from making a + * write-sealed mapping writable in future. + */ + *vm_flags_ptr &= ~VM_MAYWRITE; + + return 0; +} + +int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr) +{ + int err = 0; + unsigned int *seals_ptr = memfd_file_seals_ptr(file); + unsigned int seals = seals_ptr ? *seals_ptr : 0; + + if (is_write_sealed(seals)) + err = check_write_seal(vm_flags_ptr); + + return err; +} + +static int sanitize_flags(unsigned int *flags_ptr) +{ + unsigned int flags = *flags_ptr; if (!(flags & MFD_HUGETLB)) { if (flags & ~(unsigned int)MFD_ALL_FLAGS) @@ -351,50 +388,52 @@ SYSCALL_DEFINE2(memfd_create, if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL)) return -EINVAL; - error = check_sysctl_memfd_noexec(&flags); - if (error < 0) - return error; + return check_sysctl_memfd_noexec(flags_ptr); +} - /* length includes terminating zero */ - len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); - if (len <= 0) - return -EFAULT; - if (len > MFD_NAME_MAX_LEN + 1) - return -EINVAL; +static char *alloc_name(const char __user *uname) +{ + int error; + char *name; + long len; - name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL); + name = kmalloc(NAME_MAX + 1, GFP_KERNEL); if (!name) - return -ENOMEM; + return ERR_PTR(-ENOMEM); strcpy(name, MFD_NAME_PREFIX); - if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { + /* returned length does not include terminating zero */ + len = strncpy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, MFD_NAME_MAX_LEN + 1); + if (len < 0) { error = -EFAULT; goto err_name; - } - - /* terminating-zero may have changed after strnlen_user() returned */ - if (name[len + MFD_NAME_PREFIX_LEN - 1]) { - error = -EFAULT; + } else if (len > MFD_NAME_MAX_LEN) { + error = -EINVAL; goto err_name; } - fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); - if (fd < 0) { - error = fd; - goto err_name; - } + return name; + +err_name: + kfree(name); + return ERR_PTR(error); +} + +static struct file *alloc_file(const char *name, unsigned int flags) +{ + unsigned int *file_seals; + struct file *file; if (flags & MFD_HUGETLB) { file = hugetlb_file_setup(name, 0, VM_NORESERVE, HUGETLB_ANONHUGE_INODE, (flags >> MFD_HUGE_SHIFT) & MFD_HUGE_MASK); - } else + } else { file = shmem_file_setup(name, 0, VM_NORESERVE); - if (IS_ERR(file)) { - error = PTR_ERR(file); - goto err_fd; } + if (IS_ERR(file)) + return file; file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; file->f_flags |= O_LARGEFILE; @@ -414,6 +453,37 @@ SYSCALL_DEFINE2(memfd_create, *file_seals &= ~F_SEAL_SEAL; } + return file; +} + +SYSCALL_DEFINE2(memfd_create, + const char __user *, uname, + unsigned int, flags) +{ + struct file *file; + int fd, error; + char *name; + + error = sanitize_flags(&flags); + if (error < 0) + return error; + + name = alloc_name(uname); + if (IS_ERR(name)) + return PTR_ERR(name); + + fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); + if (fd < 0) { + error = fd; + goto err_name; + } + + file = alloc_file(name, flags); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_fd; + } + fd_install(fd, file); kfree(name); return fd; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a7b8ccd29b6f..b91a33fb6c69 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -124,7 +124,7 @@ const struct attribute_group memory_failure_attr_group = { .attrs = memory_failure_attr, }; -static struct ctl_table memory_failure_table[] = { +static const struct ctl_table memory_failure_table[] = { { .procname = "memory_failure_early_kill", .data = &sysctl_memory_failure_early_kill, @@ -419,18 +419,18 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, pud = pud_offset(p4d, address); if (!pud_present(*pud)) return 0; - if (pud_devmap(*pud)) + if (pud_trans_huge(*pud)) return PUD_SHIFT; pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return 0; - if (pmd_devmap(*pmd)) + if (pmd_trans_huge(*pmd)) return PMD_SHIFT; pte = pte_offset_map(pmd, address); if (!pte) return 0; ptent = ptep_get(pte); - if (pte_present(ptent) && pte_devmap(ptent)) + if (pte_present(ptent)) ret = PAGE_SHIFT; pte_unmap(pte); return ret; @@ -881,12 +881,17 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn, mmap_read_lock(p->mm); ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwpoison_walk_ops, (void *)&priv); + /* + * ret = 1 when CMCI wins, regardless of whether try_to_unmap() + * succeeds or fails, then kill the process with SIGBUS. + * ret = 0 when poison page is a clean page and it's dropped, no + * SIGBUS is needed. + */ if (ret == 1 && priv.tk.addr) kill_proc(&priv.tk, pfn, flags); - else - ret = 0; mmap_read_unlock(p->mm); - return ret > 0 ? -EHWPOISON : -EFAULT; + + return ret > 0 ? -EHWPOISON : 0; } /* @@ -1556,11 +1561,35 @@ static int get_hwpoison_page(struct page *p, unsigned long flags) return ret; } -void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) +int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill) { - if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { - struct address_space *mapping; + enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON; + struct address_space *mapping; + + if (folio_test_swapcache(folio)) { + pr_err("%#lx: keeping poisoned page in swap cache\n", pfn); + ttu &= ~TTU_HWPOISON; + } + + /* + * Propagate the dirty bit from PTEs to struct page first, because we + * need this to decide if we should kill or just drop the page. + * XXX: the dirty test could be racy: set_page_dirty() may not always + * be called inside page lock (it's recommended but not enforced). + */ + mapping = folio_mapping(folio); + if (!must_kill && !folio_test_dirty(folio) && mapping && + mapping_can_writeback(mapping)) { + if (folio_mkclean(folio)) { + folio_set_dirty(folio); + } else { + ttu &= ~TTU_HWPOISON; + pr_info("%#lx: corrupted page was clean: dropped without side effects\n", + pfn); + } + } + if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { /* * For hugetlb folios in shared mappings, try_to_unmap * could potentially call huge_pmd_unshare. Because of @@ -1572,7 +1601,7 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) if (!mapping) { pr_info("%#lx: could not lock mapping for mapped hugetlb folio\n", folio_pfn(folio)); - return; + return -EBUSY; } try_to_unmap(folio, ttu|TTU_RMAP_LOCKED); @@ -1580,6 +1609,8 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) } else { try_to_unmap(folio, ttu); } + + return folio_mapped(folio) ? -EBUSY : 0; } /* @@ -1589,8 +1620,6 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) static bool hwpoison_user_mappings(struct folio *folio, struct page *p, unsigned long pfn, int flags) { - enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON; - struct address_space *mapping; LIST_HEAD(tokill); bool unmap_success; int forcekill; @@ -1613,29 +1642,6 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p, if (!folio_mapped(folio)) return true; - if (folio_test_swapcache(folio)) { - pr_err("%#lx: keeping poisoned page in swap cache\n", pfn); - ttu &= ~TTU_HWPOISON; - } - - /* - * Propagate the dirty bit from PTEs to struct page first, because we - * need this to decide if we should kill or just drop the page. - * XXX: the dirty test could be racy: set_page_dirty() may not always - * be called inside page lock (it's recommended but not enforced). - */ - mapping = folio_mapping(folio); - if (!(flags & MF_MUST_KILL) && !folio_test_dirty(folio) && mapping && - mapping_can_writeback(mapping)) { - if (folio_mkclean(folio)) { - folio_set_dirty(folio); - } else { - ttu &= ~TTU_HWPOISON; - pr_info("%#lx: corrupted page was clean: dropped without side effects\n", - pfn); - } - } - /* * First collect all the processes that have the page * mapped in dirty form. This has to be done before try_to_unmap, @@ -1643,9 +1649,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p, */ collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED); - unmap_poisoned_folio(folio, ttu); - - unmap_success = !folio_mapped(folio); + unmap_success = !unmap_poisoned_folio(folio, pfn, flags & MF_MUST_KILL); if (!unmap_success) pr_err("%#lx: failed to unmap page (folio mapcount=%d)\n", pfn, folio_mapcount(folio)); @@ -2211,9 +2215,13 @@ static void kill_procs_now(struct page *p, unsigned long pfn, int flags, * Must run in process context (e.g. a work queue) with interrupts * enabled and no spinlocks held. * - * Return: 0 for successfully handled the memory error, - * -EOPNOTSUPP for hwpoison_filter() filtered the error event, - * < 0(except -EOPNOTSUPP) on failure. + * Return: + * 0 - success, + * -ENXIO - memory not managed by the kernel + * -EOPNOTSUPP - hwpoison_filter() filtered the error event, + * -EHWPOISON - the page was already poisoned, potentially + * kill process, + * other negative values - failure. */ int memory_failure(unsigned long pfn, int flags) { diff --git a/mm/memory.c b/mm/memory.c index 75c2dfd04f72..b0cda5aab398 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -94,14 +94,6 @@ #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. #endif -#ifndef CONFIG_NUMA -unsigned long max_mapnr; -EXPORT_SYMBOL(max_mapnr); - -struct page *mem_map; -EXPORT_SYMBOL(mem_map); -#endif - static vm_fault_t do_fault(struct vm_fault *vmf); static vm_fault_t do_anonymous_page(struct vm_fault *vmf); static bool vmf_pte_changed(struct vm_fault *vmf); @@ -121,14 +113,6 @@ static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf) } /* - * A number of key systems in x86 including ioremap() rely on the assumption - * that high_memory defines the upper bound on direct map memory, then end - * of ZONE_NORMAL. - */ -void *high_memory; -EXPORT_SYMBOL(high_memory); - -/* * Randomize the address space (stacks, mmaps, brk, etc.). * * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization, @@ -294,8 +278,17 @@ static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, p4d_free_tlb(tlb, p4d, start); } -/* - * This function frees user-level page tables of a process. +/** + * free_pgd_range - Unmap and free page tables in the range + * @tlb: the mmu_gather containing pending TLB flush info + * @addr: virtual address start + * @end: virtual address end + * @floor: lowest address boundary + * @ceiling: highest address boundary + * + * This function tears down all user-level page tables in the + * specified virtual address range [@addr..@end). It is part of + * the memory unmap flow. */ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, @@ -365,6 +358,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, { struct unlink_vma_file_batch vb; + tlb_free_vmas(tlb); + do { unsigned long addr = vma->vm_start; struct vm_area_struct *next; @@ -534,10 +529,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, dump_page(page, "bad pte"); pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); - pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n", + pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n", vma->vm_file, vma->vm_ops ? vma->vm_ops->fault : NULL, vma->vm_file ? vma->vm_file->f_op->mmap : NULL, + vma->vm_file ? vma->vm_file->f_op->mmap_prepare : NULL, mapping ? mapping->a_ops->read_folio : NULL); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); @@ -715,42 +711,53 @@ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, } #endif +/** + * restore_exclusive_pte - Restore a device-exclusive entry + * @vma: VMA covering @address + * @folio: the mapped folio + * @page: the mapped folio page + * @address: the virtual address + * @ptep: pte pointer into the locked page table mapping the folio page + * @orig_pte: pte value at @ptep + * + * Restore a device-exclusive non-swap entry to an ordinary present pte. + * + * The folio and the page table must be locked, and MMU notifiers must have + * been called to invalidate any (exclusive) device mappings. + * + * Locking the folio makes sure that anybody who just converted the pte to + * a device-exclusive entry can map it into the device to make forward + * progress without others converting it back until the folio was unlocked. + * + * If the folio lock ever becomes an issue, we can stop relying on the folio + * lock; it might make some scenarios with heavy thrashing less likely to + * make forward progress, but these scenarios might not be valid use cases. + * + * Note that the folio lock does not protect against all cases of concurrent + * page table modifications (e.g., MADV_DONTNEED, mprotect), so device drivers + * must use MMU notifiers to sync against any concurrent changes. + */ static void restore_exclusive_pte(struct vm_area_struct *vma, - struct page *page, unsigned long address, - pte_t *ptep) + struct folio *folio, struct page *page, unsigned long address, + pte_t *ptep, pte_t orig_pte) { - struct folio *folio = page_folio(page); - pte_t orig_pte; pte_t pte; - swp_entry_t entry; - orig_pte = ptep_get(ptep); + VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); if (pte_swp_soft_dirty(orig_pte)) pte = pte_mksoft_dirty(pte); - entry = pte_to_swp_entry(orig_pte); if (pte_swp_uffd_wp(orig_pte)) pte = pte_mkuffd_wp(pte); - else if (is_writable_device_exclusive_entry(entry)) - pte = maybe_mkwrite(pte_mkdirty(pte), vma); - - VM_BUG_ON_FOLIO(pte_write(pte) && (!folio_test_anon(folio) && - PageAnonExclusive(page)), folio); - - /* - * No need to take a page reference as one was already - * created when the swap entry was made. - */ - if (folio_test_anon(folio)) - folio_add_anon_rmap_pte(folio, page, vma, address, RMAP_NONE); - else - /* - * Currently device exclusive access only supports anonymous - * memory so the entry shouldn't point to a filebacked page. - */ - WARN_ON_ONCE(1); + if ((vma->vm_flags & VM_WRITE) && + can_change_pte_writable(vma, address, pte)) { + if (folio_test_dirty(folio)) + pte = pte_mkdirty(pte); + pte = pte_mkwrite(pte, vma); + } set_pte_at(vma->vm_mm, address, ptep, pte); /* @@ -764,16 +771,15 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, * Tries to restore an exclusive pte if the page lock can be acquired without * sleeping. */ -static int -try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma, - unsigned long addr) +static int try_restore_exclusive_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, pte_t orig_pte) { - swp_entry_t entry = pte_to_swp_entry(ptep_get(src_pte)); - struct page *page = pfn_swap_entry_to_page(entry); + struct page *page = pfn_swap_entry_to_page(pte_to_swp_entry(orig_pte)); + struct folio *folio = page_folio(page); - if (trylock_page(page)) { - restore_exclusive_pte(vma, page, addr, src_pte); - unlock_page(page); + if (folio_trylock(folio)) { + restore_exclusive_pte(vma, folio, page, addr, ptep, orig_pte); + folio_unlock(folio); return 0; } @@ -853,7 +859,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, folio_get(folio); rss[mm_counter(folio)]++; /* Cannot fail as these pages cannot get pinned. */ - folio_try_dup_anon_rmap_pte(folio, page, src_vma); + folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma); /* * We do not preserve soft-dirty information, because so @@ -879,7 +885,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, * (ie. COW) mappings. */ VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags)); - if (try_restore_exclusive_pte(src_pte, src_vma, addr)) + if (try_restore_exclusive_pte(src_vma, addr, src_pte, orig_pte)) return -EBUSY; return -ENOENT; } else if (is_pte_marker_entry(entry)) { @@ -935,7 +941,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma rss[MM_ANONPAGES]++; /* All done, just insert the new page copy in the child */ - pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot); + pte = folio_mk_pte(new_folio, dst_vma->vm_page_prot); pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte))) /* Uffd-wp needs to be delivered to dest pte as well */ @@ -1007,14 +1013,14 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma folio_ref_add(folio, nr); if (folio_test_anon(folio)) { if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, - nr, src_vma))) { + nr, dst_vma, src_vma))) { folio_ref_sub(folio, nr); return -EAGAIN; } rss[MM_ANONPAGES] += nr; VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); } else { - folio_dup_file_rmap_ptes(folio, page, nr); + folio_dup_file_rmap_ptes(folio, page, nr, dst_vma); rss[mm_counter_file(folio)] += nr; } if (any_writable) @@ -1032,7 +1038,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma * guarantee the pinned page won't be randomly replaced in the * future. */ - if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) { + if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma))) { /* Page may be pinned, we have to copy. */ folio_put(folio); err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte, @@ -1042,7 +1048,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma rss[MM_ANONPAGES]++; VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); } else { - folio_dup_file_rmap_pte(folio, page); + folio_dup_file_rmap_pte(folio, page, dst_vma); rss[mm_counter_file(folio)]++; } @@ -1362,12 +1368,12 @@ int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { pgd_t *src_pgd, *dst_pgd; - unsigned long next; unsigned long addr = src_vma->vm_start; unsigned long end = src_vma->vm_end; struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; struct mmu_notifier_range range; + unsigned long next; bool is_cow; int ret; @@ -1377,16 +1383,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) if (is_vm_hugetlb_page(src_vma)) return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma); - if (unlikely(src_vma->vm_flags & VM_PFNMAP)) { - /* - * We do not free on error cases below as remove_vma - * gets called on error from higher level routine - */ - ret = track_pfn_copy(src_vma); - if (ret) - return ret; - } - /* * We need to invalidate the secondary MMU mappings only when * there could be a permission downgrade on the ptes of the @@ -1419,7 +1415,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) continue; if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd, addr, next))) { - untrack_pfn_clear(dst_vma); ret = -ENOMEM; break; } @@ -1436,7 +1431,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) static inline bool should_zap_cows(struct zap_details *details) { /* By default, zap all pages */ - if (!details) + if (!details || details->reclaim_pt) return true; /* Or, we zap COWed pages only if the caller wants to */ @@ -1466,34 +1461,42 @@ static inline bool zap_drop_markers(struct zap_details *details) /* * This function makes sure that we'll replace the none pte with an uffd-wp * swap special pte marker when necessary. Must be with the pgtable lock held. + * + * Returns true if uffd-wp ptes was installed, false otherwise. */ -static inline void +static inline bool zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, int nr, struct zap_details *details, pte_t pteval) { + bool was_installed = false; + +#ifdef CONFIG_PTE_MARKER_UFFD_WP /* Zap on anonymous always means dropping everything */ if (vma_is_anonymous(vma)) - return; + return false; if (zap_drop_markers(details)) - return; + return false; for (;;) { /* the PFN in the PTE is irrelevant. */ - pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); + if (pte_install_uffd_wp_if_needed(vma, addr, pte, pteval)) + was_installed = true; if (--nr == 0) break; pte++; addr += PAGE_SIZE; } +#endif + return was_installed; } static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, struct folio *folio, struct page *page, pte_t *pte, pte_t ptent, unsigned int nr, unsigned long addr, struct zap_details *details, int *rss, - bool *force_flush, bool *force_break) + bool *force_flush, bool *force_break, bool *any_skipped) { struct mm_struct *mm = tlb->mm; bool delay_rmap = false; @@ -1519,8 +1522,8 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entries(tlb, pte, nr, addr); if (unlikely(userfaultfd_pte_wp(vma, ptent))) - zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, - ptent); + *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, + nr, details, ptent); if (!delay_rmap) { folio_remove_rmap_ptes(folio, page, nr, vma); @@ -1544,7 +1547,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, pte_t *pte, pte_t ptent, unsigned int max_nr, unsigned long addr, struct zap_details *details, int *rss, bool *force_flush, - bool *force_break) + bool *force_break, bool *any_skipped) { const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct mm_struct *mm = tlb->mm; @@ -1559,15 +1562,17 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entry(tlb, pte, addr); if (userfaultfd_pte_wp(vma, ptent)) - zap_install_uffd_wp_if_needed(vma, addr, pte, 1, - details, ptent); + *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, + pte, 1, details, ptent); ksm_might_unmap_zero_page(mm, ptent); return 1; } folio = page_folio(page); - if (unlikely(!should_zap_folio(details, folio))) + if (unlikely(!should_zap_folio(details, folio))) { + *any_skipped = true; return 1; + } /* * Make sure that the common "small folio" case is as fast as possible @@ -1579,14 +1584,120 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, addr, details, rss, force_flush, - force_break); + force_break, any_skipped); return nr; } zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr, - details, rss, force_flush, force_break); + details, rss, force_flush, force_break, any_skipped); return 1; } +static inline int zap_nonpresent_ptes(struct mmu_gather *tlb, + struct vm_area_struct *vma, pte_t *pte, pte_t ptent, + unsigned int max_nr, unsigned long addr, + struct zap_details *details, int *rss, bool *any_skipped) +{ + swp_entry_t entry; + int nr = 1; + + *any_skipped = true; + entry = pte_to_swp_entry(ptent); + if (is_device_private_entry(entry) || + is_device_exclusive_entry(entry)) { + struct page *page = pfn_swap_entry_to_page(entry); + struct folio *folio = page_folio(page); + + if (unlikely(!should_zap_folio(details, folio))) + return 1; + /* + * Both device private/exclusive mappings should only + * work with anonymous page so far, so we don't need to + * consider uffd-wp bit when zap. For more information, + * see zap_install_uffd_wp_if_needed(). + */ + WARN_ON_ONCE(!vma_is_anonymous(vma)); + rss[mm_counter(folio)]--; + folio_remove_rmap_pte(folio, page, vma); + folio_put(folio); + } else if (!non_swap_entry(entry)) { + /* Genuine swap entries, hence a private anon pages */ + if (!should_zap_cows(details)) + return 1; + + nr = swap_pte_batch(pte, max_nr, ptent); + rss[MM_SWAPENTS] -= nr; + free_swap_and_cache_nr(entry, nr); + } else if (is_migration_entry(entry)) { + struct folio *folio = pfn_swap_entry_folio(entry); + + if (!should_zap_folio(details, folio)) + return 1; + rss[mm_counter(folio)]--; + } else if (pte_marker_entry_uffd_wp(entry)) { + /* + * For anon: always drop the marker; for file: only + * drop the marker if explicitly requested. + */ + if (!vma_is_anonymous(vma) && !zap_drop_markers(details)) + return 1; + } else if (is_guard_swp_entry(entry)) { + /* + * Ordinary zapping should not remove guard PTE + * markers. Only do so if we should remove PTE markers + * in general. + */ + if (!zap_drop_markers(details)) + return 1; + } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) { + if (!should_zap_cows(details)) + return 1; + } else { + /* We should have covered all the swap entry types */ + pr_alert("unrecognized swap entry 0x%lx\n", entry.val); + WARN_ON_ONCE(1); + } + clear_not_present_full_ptes(vma->vm_mm, addr, pte, nr, tlb->fullmm); + *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent); + + return nr; +} + +static inline int do_zap_pte_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pte_t *pte, + unsigned long addr, unsigned long end, + struct zap_details *details, int *rss, + bool *force_flush, bool *force_break, + bool *any_skipped) +{ + pte_t ptent = ptep_get(pte); + int max_nr = (end - addr) / PAGE_SIZE; + int nr = 0; + + /* Skip all consecutive none ptes */ + if (pte_none(ptent)) { + for (nr = 1; nr < max_nr; nr++) { + ptent = ptep_get(pte + nr); + if (!pte_none(ptent)) + break; + } + max_nr -= nr; + if (!max_nr) + return nr; + pte += nr; + addr += nr * PAGE_SIZE; + } + + if (pte_present(ptent)) + nr += zap_present_ptes(tlb, vma, pte, ptent, max_nr, addr, + details, rss, force_flush, force_break, + any_skipped); + else + nr += zap_nonpresent_ptes(tlb, vma, pte, ptent, max_nr, addr, + details, rss, any_skipped); + + return nr; +} + static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, @@ -1598,9 +1709,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, spinlock_t *ptl; pte_t *start_pte; pte_t *pte; - swp_entry_t entry; + pmd_t pmdval; + unsigned long start = addr; + bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details); + bool direct_reclaim = true; int nr; +retry: tlb_change_page_size(tlb, PAGE_SIZE); init_rss_vec(rss); start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); @@ -1610,90 +1725,35 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); do { - pte_t ptent = ptep_get(pte); - struct folio *folio; - struct page *page; - int max_nr; + bool any_skipped = false; - nr = 1; - if (pte_none(ptent)) - continue; - - if (need_resched()) + if (need_resched()) { + direct_reclaim = false; break; - - if (pte_present(ptent)) { - max_nr = (end - addr) / PAGE_SIZE; - nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr, - addr, details, rss, &force_flush, - &force_break); - if (unlikely(force_break)) { - addr += nr * PAGE_SIZE; - break; - } - continue; } - entry = pte_to_swp_entry(ptent); - if (is_device_private_entry(entry) || - is_device_exclusive_entry(entry)) { - page = pfn_swap_entry_to_page(entry); - folio = page_folio(page); - if (unlikely(!should_zap_folio(details, folio))) - continue; - /* - * Both device private/exclusive mappings should only - * work with anonymous page so far, so we don't need to - * consider uffd-wp bit when zap. For more information, - * see zap_install_uffd_wp_if_needed(). - */ - WARN_ON_ONCE(!vma_is_anonymous(vma)); - rss[mm_counter(folio)]--; - if (is_device_private_entry(entry)) - folio_remove_rmap_pte(folio, page, vma); - folio_put(folio); - } else if (!non_swap_entry(entry)) { - max_nr = (end - addr) / PAGE_SIZE; - nr = swap_pte_batch(pte, max_nr, ptent); - /* Genuine swap entries, hence a private anon pages */ - if (!should_zap_cows(details)) - continue; - rss[MM_SWAPENTS] -= nr; - free_swap_and_cache_nr(entry, nr); - } else if (is_migration_entry(entry)) { - folio = pfn_swap_entry_folio(entry); - if (!should_zap_folio(details, folio)) - continue; - rss[mm_counter(folio)]--; - } else if (pte_marker_entry_uffd_wp(entry)) { - /* - * For anon: always drop the marker; for file: only - * drop the marker if explicitly requested. - */ - if (!vma_is_anonymous(vma) && - !zap_drop_markers(details)) - continue; - } else if (is_guard_swp_entry(entry)) { - /* - * Ordinary zapping should not remove guard PTE - * markers. Only do so if we should remove PTE markers - * in general. - */ - if (!zap_drop_markers(details)) - continue; - } else if (is_hwpoison_entry(entry) || - is_poisoned_swp_entry(entry)) { - if (!should_zap_cows(details)) - continue; - } else { - /* We should have covered all the swap entry types */ - pr_alert("unrecognized swap entry 0x%lx\n", entry.val); - WARN_ON_ONCE(1); + nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, rss, + &force_flush, &force_break, &any_skipped); + if (any_skipped) + can_reclaim_pt = false; + if (unlikely(force_break)) { + addr += nr * PAGE_SIZE; + direct_reclaim = false; + break; } - clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); - zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent); } while (pte += nr, addr += PAGE_SIZE * nr, addr != end); + /* + * Fast path: try to hold the pmd lock and unmap the PTE page. + * + * If the pte lock was released midway (retry case), or if the attempt + * to hold the pmd lock failed, then we need to recheck all pte entries + * to ensure they are still none, thereby preventing the pte entries + * from being repopulated by another thread. + */ + if (can_reclaim_pt && direct_reclaim && addr == end) + direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval); + add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); @@ -1713,6 +1773,20 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (force_flush) tlb_flush_mmu(tlb); + if (addr != end) { + cond_resched(); + force_flush = false; + force_break = false; + goto retry; + } + + if (can_reclaim_pt) { + if (direct_reclaim) + free_pte(mm, start, tlb, pmdval); + else + try_to_free_pte(mm, pmd, start, tlb); + } + return addr; } @@ -1729,7 +1803,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, next = pmd_addr_end(addr, end); if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) - __split_huge_pmd(vma, pmd, addr, false, NULL); + __split_huge_pmd(vma, pmd, addr, false); else if (zap_huge_pmd(tlb, vma, pmd, addr)) { addr = next; continue; @@ -1844,9 +1918,6 @@ static void unmap_single_vma(struct mmu_gather *tlb, if (vma->vm_file) uprobe_munmap(vma, start, end); - if (unlikely(vma->vm_flags & VM_PFNMAP)) - untrack_pfn(vma, 0, 0, mm_wr_locked); - if (start != end) { if (unlikely(is_vm_hugetlb_page(vma))) { /* @@ -1920,36 +1991,64 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, } /** - * zap_page_range_single - remove user pages in a given range + * zap_page_range_single_batched - remove user pages in a given range + * @tlb: pointer to the caller's struct mmu_gather * @vma: vm_area_struct holding the applicable pages - * @address: starting address of pages to zap - * @size: number of bytes to zap + * @address: starting address of pages to remove + * @size: number of bytes to remove * @details: details of shared cache invalidation * - * The range must fit into one VMA. + * @tlb shouldn't be NULL. The range must fit into one VMA. If @vma is for + * hugetlb, @tlb is flushed and re-initialized by this function. */ -void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, +void zap_page_range_single_batched(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { const unsigned long end = address + size; struct mmu_notifier_range range; - struct mmu_gather tlb; - lru_add_drain(); + VM_WARN_ON_ONCE(!tlb || tlb->mm != vma->vm_mm); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, end); hugetlb_zap_begin(vma, &range.start, &range.end); - tlb_gather_mmu(&tlb, vma->vm_mm); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); /* * unmap 'address-end' not 'range.start-range.end' as range * could have been expanded for hugetlb pmd sharing. */ - unmap_single_vma(&tlb, vma, address, end, details, false); + unmap_single_vma(tlb, vma, address, end, details, false); mmu_notifier_invalidate_range_end(&range); + if (is_vm_hugetlb_page(vma)) { + /* + * flush tlb and free resources before hugetlb_zap_end(), to + * avoid concurrent page faults' allocation failure. + */ + tlb_finish_mmu(tlb); + hugetlb_zap_end(vma, details); + tlb_gather_mmu(tlb, vma->vm_mm); + } +} + +/** + * zap_page_range_single - remove user pages in a given range + * @vma: vm_area_struct holding the applicable pages + * @address: starting address of pages to zap + * @size: number of bytes to zap + * @details: details of shared cache invalidation + * + * The range must fit into one VMA. + */ +void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, + unsigned long size, struct zap_details *details) +{ + struct mmu_gather tlb; + + tlb_gather_mmu(&tlb, vma->vm_mm); + zap_page_range_single_batched(&tlb, vma, address, size, details); tlb_finish_mmu(&tlb); - hugetlb_zap_end(vma, details); } /** @@ -2056,19 +2155,39 @@ static int validate_page_before_insert(struct vm_area_struct *vma, } static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, - unsigned long addr, struct page *page, pgprot_t prot) + unsigned long addr, struct page *page, + pgprot_t prot, bool mkwrite) { struct folio *folio = page_folio(page); - pte_t pteval; + pte_t pteval = ptep_get(pte); + + if (!pte_none(pteval)) { + if (!mkwrite) + return -EBUSY; + + /* see insert_pfn(). */ + if (pte_pfn(pteval) != page_to_pfn(page)) { + WARN_ON_ONCE(!is_zero_pfn(pte_pfn(pteval))); + return -EFAULT; + } + pteval = maybe_mkwrite(pteval, vma); + pteval = pte_mkyoung(pteval); + if (ptep_set_access_flags(vma, addr, pte, pteval, 1)) + update_mmu_cache(vma, addr, pte); + return 0; + } - if (!pte_none(ptep_get(pte))) - return -EBUSY; /* Ok, finally just insert the thing.. */ pteval = mk_pte(page, prot); if (unlikely(is_zero_folio(folio))) { pteval = pte_mkspecial(pteval); } else { folio_get(folio); + pteval = mk_pte(page, prot); + if (mkwrite) { + pteval = pte_mkyoung(pteval); + pteval = maybe_mkwrite(pte_mkdirty(pteval), vma); + } inc_mm_counter(vma->vm_mm, mm_counter_file(folio)); folio_add_file_rmap_pte(folio, page, vma); } @@ -2077,7 +2196,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, } static int insert_page(struct vm_area_struct *vma, unsigned long addr, - struct page *page, pgprot_t prot) + struct page *page, pgprot_t prot, bool mkwrite) { int retval; pte_t *pte; @@ -2090,7 +2209,8 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, pte = get_locked_pte(vma->vm_mm, addr, &ptl); if (!pte) goto out; - retval = insert_page_into_pte_locked(vma, pte, addr, page, prot); + retval = insert_page_into_pte_locked(vma, pte, addr, page, prot, + mkwrite); pte_unmap_unlock(pte, ptl); out: return retval; @@ -2104,7 +2224,7 @@ static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte, err = validate_page_before_insert(vma, page); if (err) return err; - return insert_page_into_pte_locked(vma, pte, addr, page, prot); + return insert_page_into_pte_locked(vma, pte, addr, page, prot, false); } /* insert_pages() amortizes the cost of spinlock operations @@ -2240,7 +2360,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, BUG_ON(vma->vm_flags & VM_PFNMAP); vm_flags_set(vma, VM_MIXEDMAP); } - return insert_page(vma, addr, page, vma->vm_page_prot); + return insert_page(vma, addr, page, vma->vm_page_prot, false); } EXPORT_SYMBOL(vm_insert_page); @@ -2435,7 +2555,7 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, if (!pfn_modify_allowed(pfn, pgprot)) return VM_FAULT_SIGBUS; - track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); + pfnmap_setup_cachemode_pfn(pfn, &pgprot); return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, false); @@ -2498,7 +2618,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; - track_pfn_insert(vma, &pgprot, pfn); + pfnmap_setup_cachemode_pfn(pfn_t_to_pfn(pfn), &pgprot); if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot)) return VM_FAULT_SIGBUS; @@ -2520,7 +2640,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, * result in pfn_t_has_page() == false. */ page = pfn_to_page(pfn_t_to_pfn(pfn)); - err = insert_page(vma, addr, page, pgprot); + err = insert_page(vma, addr, page, pgprot, mkwrite); } else { return insert_pfn(vma, addr, pfn, pgprot, mkwrite); } @@ -2533,6 +2653,26 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, return VM_FAULT_NOPAGE; } +vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page, + bool write) +{ + pgprot_t pgprot = vmf->vma->vm_page_prot; + unsigned long addr = vmf->address; + int err; + + if (addr < vmf->vma->vm_start || addr >= vmf->vma->vm_end) + return VM_FAULT_SIGBUS; + + err = insert_page(vmf->vma, addr, page, pgprot, write); + if (err == -ENOMEM) + return VM_FAULT_OOM; + if (err < 0 && err != -EBUSY) + return VM_FAULT_SIGBUS; + + return VM_FAULT_NOPAGE; +} +EXPORT_SYMBOL_GPL(vmf_insert_page_mkwrite); + vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn) { @@ -2723,6 +2863,36 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, return error; } +#ifdef __HAVE_PFNMAP_TRACKING +static inline struct pfnmap_track_ctx *pfnmap_track_ctx_alloc(unsigned long pfn, + unsigned long size, pgprot_t *prot) +{ + struct pfnmap_track_ctx *ctx; + + if (pfnmap_track(pfn, size, prot)) + return ERR_PTR(-EINVAL); + + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (unlikely(!ctx)) { + pfnmap_untrack(pfn, size); + return ERR_PTR(-ENOMEM); + } + + ctx->pfn = pfn; + ctx->size = size; + kref_init(&ctx->kref); + return ctx; +} + +void pfnmap_track_ctx_release(struct kref *ref) +{ + struct pfnmap_track_ctx *ctx = container_of(ref, struct pfnmap_track_ctx, kref); + + pfnmap_untrack(ctx->pfn, ctx->size); + kfree(ctx); +} +#endif /* __HAVE_PFNMAP_TRACKING */ + /** * remap_pfn_range - remap kernel memory to userspace * @vma: user vma to map to @@ -2735,20 +2905,51 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, * * Return: %0 on success, negative error code otherwise. */ +#ifdef __HAVE_PFNMAP_TRACKING int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { + struct pfnmap_track_ctx *ctx = NULL; int err; - err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); - if (err) + size = PAGE_ALIGN(size); + + /* + * If we cover the full VMA, we'll perform actual tracking, and + * remember to untrack when the last reference to our tracking + * context from a VMA goes away. We'll keep tracking the whole pfn + * range even during VMA splits and partial unmapping. + * + * If we only cover parts of the VMA, we'll only setup the cachemode + * in the pgprot for the pfn range. + */ + if (addr == vma->vm_start && addr + size == vma->vm_end) { + if (vma->pfnmap_track_ctx) + return -EINVAL; + ctx = pfnmap_track_ctx_alloc(pfn, size, &prot); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + } else if (pfnmap_setup_cachemode(pfn, size, &prot)) { return -EINVAL; + } err = remap_pfn_range_notrack(vma, addr, pfn, size, prot); - if (err) - untrack_pfn(vma, pfn, PAGE_ALIGN(size), true); + if (ctx) { + if (err) + kref_put(&ctx->kref, pfnmap_track_ctx_release); + else + vma->pfnmap_track_ctx = ctx; + } return err; } + +#else +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + return remap_pfn_range_notrack(vma, addr, pfn, size, prot); +} +#endif EXPORT_SYMBOL(remap_pfn_range); /** @@ -2828,11 +3029,11 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, if (fn) { do { if (create || !pte_none(ptep_get(pte))) { - err = fn(pte++, addr, data); + err = fn(pte, addr, data); if (err) break; } - } while (addr += PAGE_SIZE, addr != end); + } while (pte++, addr += PAGE_SIZE, addr != end); } *mask |= PGTBL_PTE_MODIFIED; @@ -2971,8 +3172,10 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr, next = pgd_addr_end(addr, end); if (pgd_none(*pgd) && !create) continue; - if (WARN_ON_ONCE(pgd_leaf(*pgd))) - return -EINVAL; + if (WARN_ON_ONCE(pgd_leaf(*pgd))) { + err = -EINVAL; + break; + } if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) { if (!create) continue; @@ -3013,7 +3216,6 @@ int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr, { return __apply_to_page_range(mm, addr, size, fn, data, false); } -EXPORT_SYMBOL_GPL(apply_to_existing_page_range); /* * handle_pte_fault chooses page fault handler according to an entry which was @@ -3412,7 +3614,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); - entry = mk_pte(&new_folio->page, vma->vm_page_prot); + entry = folio_mk_pte(new_folio, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (unlikely(unshare)) { if (pte_soft_dirty(vmf->orig_pte)) @@ -3596,19 +3798,86 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio) return ret; } -static bool wp_can_reuse_anon_folio(struct folio *folio, - struct vm_area_struct *vma) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static bool __wp_can_reuse_large_anon_folio(struct folio *folio, + struct vm_area_struct *vma) { + bool exclusive = false; + + /* Let's just free up a large folio if only a single page is mapped. */ + if (folio_large_mapcount(folio) <= 1) + return false; + /* - * We could currently only reuse a subpage of a large folio if no - * other subpages of the large folios are still mapped. However, - * let's just consistently not reuse subpages even if we could - * reuse in that scenario, and give back a large folio a bit - * sooner. + * The assumption for anonymous folios is that each page can only get + * mapped once into each MM. The only exception are KSM folios, which + * are always small. + * + * Each taken mapcount must be paired with exactly one taken reference, + * whereby the refcount must be incremented before the mapcount when + * mapping a page, and the refcount must be decremented after the + * mapcount when unmapping a page. + * + * If all folio references are from mappings, and all mappings are in + * the page tables of this MM, then this folio is exclusive to this MM. */ - if (folio_test_large(folio)) + if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids)) + return false; + + VM_WARN_ON_ONCE(folio_test_ksm(folio)); + + if (unlikely(folio_test_swapcache(folio))) { + /* + * Note: freeing up the swapcache will fail if some PTEs are + * still swap entries. + */ + if (!folio_trylock(folio)) + return false; + folio_free_swap(folio); + folio_unlock(folio); + } + + if (folio_large_mapcount(folio) != folio_ref_count(folio)) return false; + /* Stabilize the mapcount vs. refcount and recheck. */ + folio_lock_large_mapcount(folio); + VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_ref_count(folio), folio); + + if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids)) + goto unlock; + if (folio_large_mapcount(folio) != folio_ref_count(folio)) + goto unlock; + + VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_nr_pages(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_entire_mapcount(folio), folio); + VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != vma->vm_mm->mm_id && + folio_mm_id(folio, 1) != vma->vm_mm->mm_id); + + /* + * Do we need the folio lock? Likely not. If there would have been + * references from page migration/swapout, we would have detected + * an additional folio reference and never ended up here. + */ + exclusive = true; +unlock: + folio_unlock_large_mapcount(folio); + return exclusive; +} +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ +static bool __wp_can_reuse_large_anon_folio(struct folio *folio, + struct vm_area_struct *vma) +{ + BUILD_BUG(); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static bool wp_can_reuse_anon_folio(struct folio *folio, + struct vm_area_struct *vma) +{ + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && folio_test_large(folio)) + return __wp_can_reuse_large_anon_folio(folio, vma); + /* * We have to verify under folio lock: these early checks are * just an optimization to avoid locking the folio and freeing @@ -3717,13 +3986,15 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { /* * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a - * VM_PFNMAP VMA. + * VM_PFNMAP VMA. FS DAX also wants ops->pfn_mkwrite called. * * We should not cow pages in a shared writeable mapping. * Just mark the pages writable and/or call ops->pfn_mkwrite. */ - if (!vmf->page) + if (!vmf->page || is_fsdax_page(vmf->page)) { + vmf->page = NULL; return wp_pfn_shared(vmf); + } return wp_page_shared(vmf, folio); } @@ -3913,7 +4184,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) folio_put(folio); return ret; } - mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, + mmu_notifier_range_init_owner(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL); mmu_notifier_invalidate_range_start(&range); @@ -3921,7 +4192,8 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) - restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte); + restore_exclusive_pte(vma, folio, vmf->page, vmf->address, + vmf->pte, vmf->orig_pte); if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -4043,26 +4315,6 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) -{ - struct swap_info_struct *si = swp_swap_info(entry); - pgoff_t offset = swp_offset(entry); - int i; - - /* - * While allocating a large folio and doing swap_read_folio, which is - * the case the being faulted pte doesn't have swapcache. We need to - * ensure all PTEs have no cache as well, otherwise, we might go to - * swap devices while the content is in swapcache. - */ - for (i = 0; i < max_nr; i++) { - if ((si->swap_map[offset + i] & SWAP_HAS_CACHE)) - return i; - } - - return i; -} - /* * Check if the PTEs within a range are contiguous swap entries * and have consistent swapcache, zeromap. @@ -4189,8 +4441,10 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) return folio; + count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE); folio_put(folio); } + count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK); order = next_order(&orders, order); } @@ -4267,10 +4521,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * Get a page reference while we know the page can't be * freed. */ - get_page(vmf->page); - pte_unmap_unlock(vmf->pte, vmf->ptl); - ret = vmf->page->pgmap->ops->migrate_to_ram(vmf); - put_page(vmf->page); + if (trylock_page(vmf->page)) { + struct dev_pagemap *pgmap; + + get_page(vmf->page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + pgmap = page_pgmap(vmf->page); + ret = pgmap->ops->migrate_to_ram(vmf); + unlock_page(vmf->page); + put_page(vmf->page); + } else { + pte_unmap_unlock(vmf->pte, vmf->ptl); + } } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else if (is_pte_marker_entry(entry)) { @@ -4324,7 +4586,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) } need_clear_cache = true; - mem_cgroup_swapin_uncharge_swap(entry, nr_pages); + memcg1_swapin(entry, nr_pages); shadow = get_shadow_from_swap_cache(entry); if (shadow) @@ -4388,8 +4650,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* * KSM sometimes has to copy on read faults, for example, if - * page->index of !PageKSM() pages would be nonlinear inside the - * anon VMA -- PageKSM() is lost on actual swapout. + * folio->index of non-ksm folios would be nonlinear inside the + * anon VMA -- the ksm flag is lost on actual swapout. */ folio = ksm_might_need_to_copy(folio, vma, vmf->address); if (unlikely(!folio)) { @@ -4733,12 +4995,12 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) folio_throttle_swaprate(folio, gfp); /* * When a folio is not zeroed during allocation - * (__GFP_ZERO not used), folio_zero_user() is used - * to make sure that the page corresponding to the - * faulting address will be hot in the cache after - * zeroing. + * (__GFP_ZERO not used) or user folios require special + * handling, folio_zero_user() is used to make sure + * that the page corresponding to the faulting address + * will be hot in the cache after zeroing. */ - if (!alloc_zeroed()) + if (user_alloc_needs_zeroing()) folio_zero_user(folio, vmf->address); return folio; } @@ -4822,7 +5084,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) */ __folio_mark_uptodate(folio); - entry = mk_pte(&folio->page, vma->vm_page_prot); + entry = folio_mk_pte(folio, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry), vma); @@ -4947,9 +5209,8 @@ static void deposit_prealloc_pte(struct vm_fault *vmf) vmf->prealloc_pte = NULL; } -vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page) { - struct folio *folio = page_folio(page); struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; @@ -4997,7 +5258,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) flush_icache_pages(vma, page, HPAGE_PMD_NR); - entry = mk_huge_pmd(page, vma->vm_page_prot); + entry = folio_mk_pmd(folio, vma->vm_page_prot); if (write) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); @@ -5022,7 +5283,7 @@ out: return ret; } #else -vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page) { return VM_FAULT_FALLBACK; } @@ -5054,6 +5315,8 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio, if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); + else if (pte_write(entry) && folio_test_dirty(folio)) + entry = pte_mkdirty(entry); if (unlikely(vmf_orig_pte_uffd_wp(vmf))) entry = pte_mkuffd_wp(entry); /* copy-on-write page */ @@ -5102,7 +5365,11 @@ vm_fault_t finish_fault(struct vm_fault *vmf) bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED); int type, nr_pages; - unsigned long addr = vmf->address; + unsigned long addr; + bool needs_fallback = false; + +fallback: + addr = vmf->address; /* Did we COW the page? */ if (is_cow) @@ -5110,6 +5377,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf) else page = vmf->page; + folio = page_folio(page); /* * check even for read faults because we might have lost our CoWed * page @@ -5121,8 +5389,8 @@ vm_fault_t finish_fault(struct vm_fault *vmf) } if (pmd_none(*vmf->pmd)) { - if (PageTransCompound(page)) { - ret = do_set_pmd(vmf, page); + if (folio_test_pmd_mappable(folio)) { + ret = do_set_pmd(vmf, folio, page); if (ret != VM_FAULT_FALLBACK) return ret; } @@ -5133,7 +5401,6 @@ vm_fault_t finish_fault(struct vm_fault *vmf) return VM_FAULT_OOM; } - folio = page_folio(page); nr_pages = folio_nr_pages(folio); /* @@ -5141,7 +5408,8 @@ vm_fault_t finish_fault(struct vm_fault *vmf) * approach also applies to non-anonymous-shmem faults to avoid * inflating the RSS of the process. */ - if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma))) { + if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma)) || + unlikely(needs_fallback)) { nr_pages = 1; } else if (nr_pages > 1) { pgoff_t idx = folio_page_idx(folio, page); @@ -5177,9 +5445,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf) ret = VM_FAULT_NOPAGE; goto unlock; } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { - update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages); - ret = VM_FAULT_NOPAGE; - goto unlock; + needs_fallback = true; + pte_unmap_unlock(vmf->pte, vmf->ptl); + goto fallback; } folio_ref_add(folio, nr_pages - 1); @@ -5488,7 +5756,7 @@ int numa_migrate_check(struct folio *folio, struct vm_fault *vmf, * Flag if the folio is shared between multiple address spaces. This * is later used when determining whether to group tasks together */ - if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED)) + if (folio_maybe_mapped_shared(folio) && (vma->vm_flags & VM_SHARED)) *flags |= TNF_SHARED; /* * For memory tiering mode, cpupid of slow memory page is used @@ -5625,7 +5893,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) ignore_writable = true; /* Migrate to the requested node */ - if (!migrate_misplaced_folio(folio, vma, target_nid)) { + if (!migrate_misplaced_folio(folio, target_nid)) { nid = target_nid; flags |= TNF_MIGRATED; task_numa_fault(last_cpupid, nid, nr_pages, flags); @@ -5696,7 +5964,7 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) split: /* COW or write-notify handled on pte level: split pmd. */ - __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); + __split_huge_pmd(vma, vmf->pmd, vmf->address, false); return VM_FAULT_FALLBACK; } @@ -6069,7 +6337,8 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma, } /* - * By the time we get here, we already hold the mm semaphore + * By the time we get here, we already hold either the VMA lock or the + * mmap_lock (FAULT_FLAG_VMA_LOCK tells you which). * * The mmap_lock may have been released depending on flags and our * return value. See filemap_fault() and __folio_lock_or_retry(). @@ -6141,173 +6410,6 @@ out: } EXPORT_SYMBOL_GPL(handle_mm_fault); -#ifdef CONFIG_LOCK_MM_AND_FIND_VMA -#include <linux/extable.h> - -static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) -{ - if (likely(mmap_read_trylock(mm))) - return true; - - if (regs && !user_mode(regs)) { - unsigned long ip = exception_ip(regs); - if (!search_exception_tables(ip)) - return false; - } - - return !mmap_read_lock_killable(mm); -} - -static inline bool mmap_upgrade_trylock(struct mm_struct *mm) -{ - /* - * We don't have this operation yet. - * - * It should be easy enough to do: it's basically a - * atomic_long_try_cmpxchg_acquire() - * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but - * it also needs the proper lockdep magic etc. - */ - return false; -} - -static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) -{ - mmap_read_unlock(mm); - if (regs && !user_mode(regs)) { - unsigned long ip = exception_ip(regs); - if (!search_exception_tables(ip)) - return false; - } - return !mmap_write_lock_killable(mm); -} - -/* - * Helper for page fault handling. - * - * This is kind of equivalend to "mmap_read_lock()" followed - * by "find_extend_vma()", except it's a lot more careful about - * the locking (and will drop the lock on failure). - * - * For example, if we have a kernel bug that causes a page - * fault, we don't want to just use mmap_read_lock() to get - * the mm lock, because that would deadlock if the bug were - * to happen while we're holding the mm lock for writing. - * - * So this checks the exception tables on kernel faults in - * order to only do this all for instructions that are actually - * expected to fault. - * - * We can also actually take the mm lock for writing if we - * need to extend the vma, which helps the VM layer a lot. - */ -struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, - unsigned long addr, struct pt_regs *regs) -{ - struct vm_area_struct *vma; - - if (!get_mmap_lock_carefully(mm, regs)) - return NULL; - - vma = find_vma(mm, addr); - if (likely(vma && (vma->vm_start <= addr))) - return vma; - - /* - * Well, dang. We might still be successful, but only - * if we can extend a vma to do so. - */ - if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { - mmap_read_unlock(mm); - return NULL; - } - - /* - * We can try to upgrade the mmap lock atomically, - * in which case we can continue to use the vma - * we already looked up. - * - * Otherwise we'll have to drop the mmap lock and - * re-take it, and also look up the vma again, - * re-checking it. - */ - if (!mmap_upgrade_trylock(mm)) { - if (!upgrade_mmap_lock_carefully(mm, regs)) - return NULL; - - vma = find_vma(mm, addr); - if (!vma) - goto fail; - if (vma->vm_start <= addr) - goto success; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto fail; - } - - if (expand_stack_locked(vma, addr)) - goto fail; - -success: - mmap_write_downgrade(mm); - return vma; - -fail: - mmap_write_unlock(mm); - return NULL; -} -#endif - -#ifdef CONFIG_PER_VMA_LOCK -/* - * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be - * stable and not isolated. If the VMA is not found or is being modified the - * function returns NULL. - */ -struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, - unsigned long address) -{ - MA_STATE(mas, &mm->mm_mt, address, address); - struct vm_area_struct *vma; - - rcu_read_lock(); -retry: - vma = mas_walk(&mas); - if (!vma) - goto inval; - - if (!vma_start_read(vma)) - goto inval; - - /* Check if the VMA got isolated after we found it */ - if (vma->detached) { - vma_end_read(vma); - count_vm_vma_lock_event(VMA_LOCK_MISS); - /* The area was replaced with another one */ - goto retry; - } - /* - * At this point, we have a stable reference to a VMA: The VMA is - * locked and we know it hasn't already been isolated. - * From here on, we can access the VMA without worrying about which - * fields are accessible for RCU readers. - */ - - /* Check since vm_start/vm_end might change before we lock the VMA */ - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) - goto inval_end_read; - - rcu_read_unlock(); - return vma; - -inval_end_read: - vma_end_read(vma); -inval: - rcu_read_unlock(); - count_vm_vma_lock_event(VMA_LOCK_ABORT); - return NULL; -} -#endif /* CONFIG_PER_VMA_LOCK */ - #ifndef __PAGETABLE_P4D_FOLDED /* * Allocate p4d page table. @@ -6388,6 +6490,7 @@ static inline void pfnmap_args_setup(struct follow_pfnmap_args *args, args->lock = lock; args->ptep = ptep; args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT); + args->addr_mask = addr_mask; args->pgprot = pgprot; args->writable = writable; args->special = special; @@ -6547,7 +6650,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write) { resource_size_t phys_addr; - unsigned long prot = 0; + pgprot_t prot = __pgprot(0); void __iomem *maddr; int offset = offset_in_page(addr); int ret = -EINVAL; @@ -6557,7 +6660,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, retry: if (follow_pfnmap_start(&args)) return -EINVAL; - prot = pgprot_val(args.pgprot); + prot = args.pgprot; phys_addr = (resource_size_t)args.pfn << PAGE_SHIFT; writable = args.writable; follow_pfnmap_end(&args); @@ -6572,7 +6675,7 @@ retry: if (follow_pfnmap_start(&args)) goto out_unmap; - if ((prot != pgprot_val(args.pgprot)) || + if ((pgprot_val(prot) != pgprot_val(args.pgprot)) || (phys_addr != (args.pfn << PAGE_SHIFT)) || (writable != args.writable)) { follow_pfnmap_end(&args); @@ -6714,6 +6817,124 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, } EXPORT_SYMBOL_GPL(access_process_vm); +#ifdef CONFIG_BPF_SYSCALL +/* + * Copy a string from another process's address space as given in mm. + * If there is any error return -EFAULT. + */ +static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr, + void *buf, int len, unsigned int gup_flags) +{ + void *old_buf = buf; + int err = 0; + + *(char *)buf = '\0'; + + if (mmap_read_lock_killable(mm)) + return -EFAULT; + + addr = untagged_addr_remote(mm, addr); + + /* Avoid triggering the temporary warning in __get_user_pages */ + if (!vma_lookup(mm, addr)) { + err = -EFAULT; + goto out; + } + + while (len) { + int bytes, offset, retval; + void *maddr; + struct page *page; + struct vm_area_struct *vma = NULL; + + page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); + if (IS_ERR(page)) { + /* + * Treat as a total failure for now until we decide how + * to handle the CONFIG_HAVE_IOREMAP_PROT case and + * stack expansion. + */ + *(char *)buf = '\0'; + err = -EFAULT; + goto out; + } + + bytes = len; + offset = addr & (PAGE_SIZE - 1); + if (bytes > PAGE_SIZE - offset) + bytes = PAGE_SIZE - offset; + + maddr = kmap_local_page(page); + retval = strscpy(buf, maddr + offset, bytes); + if (retval >= 0) { + /* Found the end of the string */ + buf += retval; + unmap_and_put_page(page, maddr); + break; + } + + buf += bytes - 1; + /* + * Because strscpy always NUL terminates we need to + * copy the last byte in the page if we are going to + * load more pages + */ + if (bytes != len) { + addr += bytes - 1; + copy_from_user_page(vma, page, addr, buf, maddr + (PAGE_SIZE - 1), 1); + buf += 1; + addr += 1; + } + len -= bytes; + + unmap_and_put_page(page, maddr); + } + +out: + mmap_read_unlock(mm); + if (err) + return err; + return buf - old_buf; +} + +/** + * copy_remote_vm_str - copy a string from another process's address space. + * @tsk: the task of the target address space + * @addr: start address to read from + * @buf: destination buffer + * @len: number of bytes to copy + * @gup_flags: flags modifying lookup behaviour + * + * The caller must hold a reference on @mm. + * + * Return: number of bytes copied from @addr (source) to @buf (destination); + * not including the trailing NUL. Always guaranteed to leave NUL-terminated + * buffer. On any error, return -EFAULT. + */ +int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr, + void *buf, int len, unsigned int gup_flags) +{ + struct mm_struct *mm; + int ret; + + if (unlikely(len == 0)) + return 0; + + mm = get_task_mm(tsk); + if (!mm) { + *(char *)buf = '\0'; + return -EFAULT; + } + + ret = __copy_remote_vm_str(mm, addr, buf, len, gup_flags); + + mmput(mm); + + return ret; +} +EXPORT_SYMBOL_GPL(copy_remote_vm_str); +#endif /* CONFIG_BPF_SYSCALL */ + /* * Print the name of a VMA. */ @@ -6746,10 +6967,8 @@ void __might_fault(const char *file, int line) if (pagefault_disabled()) return; __might_sleep(file, line); -#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) if (current->mm) might_lock_read(¤t->mm->mmap_lock); -#endif } EXPORT_SYMBOL(__might_fault); #endif @@ -6815,9 +7034,10 @@ static inline int process_huge_page( return 0; } -static void clear_gigantic_page(struct folio *folio, unsigned long addr, +static void clear_gigantic_page(struct folio *folio, unsigned long addr_hint, unsigned int nr_pages) { + unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(folio)); int i; might_sleep(); @@ -6851,13 +7071,14 @@ void folio_zero_user(struct folio *folio, unsigned long addr_hint) } static int copy_user_gigantic_page(struct folio *dst, struct folio *src, - unsigned long addr, + unsigned long addr_hint, struct vm_area_struct *vma, unsigned int nr_pages) { - int i; + unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(dst)); struct page *dst_page; struct page *src_page; + int i; for (i = 0; i < nr_pages; i++) { dst_page = folio_page(dst, i); @@ -6959,7 +7180,8 @@ bool ptlock_alloc(struct ptdesc *ptdesc) void ptlock_free(struct ptdesc *ptdesc) { - kmem_cache_free(page_ptl_cachep, ptdesc->ptl); + if (ptdesc->ptl) + kmem_cache_free(page_ptl_cachep, ptdesc->ptl); } #endif diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c43b4e7fb298..b1caedbade5b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -219,11 +219,30 @@ void put_online_mems(void) bool movable_node_enabled = false; -#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE -int mhp_default_online_type = MMOP_OFFLINE; -#else -int mhp_default_online_type = MMOP_ONLINE; -#endif +static int mhp_default_online_type = -1; +int mhp_get_default_online_type(void) +{ + if (mhp_default_online_type >= 0) + return mhp_default_online_type; + + if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_OFFLINE)) + mhp_default_online_type = MMOP_OFFLINE; + else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO)) + mhp_default_online_type = MMOP_ONLINE; + else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_KERNEL)) + mhp_default_online_type = MMOP_ONLINE_KERNEL; + else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE)) + mhp_default_online_type = MMOP_ONLINE_MOVABLE; + else + mhp_default_online_type = MMOP_OFFLINE; + + return mhp_default_online_type; +} + +void mhp_set_default_online_type(int online_type) +{ + mhp_default_online_type = online_type; +} static int __init setup_memhp_default_state(char *str) { @@ -650,6 +669,7 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) * this and the first chunk to online will be pageblock_nr_pages. */ for (pfn = start_pfn; pfn < end_pfn;) { + struct page *page = pfn_to_page(pfn); int order; /* @@ -664,7 +684,14 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) else order = MAX_PAGE_ORDER; - (*online_page_callback)(pfn_to_page(pfn), order); + /* + * Exposing the page to the buddy by freeing can cause + * issues with debug_pagealloc enabled: some archs don't + * like double-unmappings. So treat them like any pages that + * were allocated from the buddy. + */ + debug_pagealloc_map_pages(page, 1 << order); + (*online_page_callback)(page, order); pfn += (1UL << order); } @@ -1320,7 +1347,7 @@ static int check_hotplug_memory_range(u64 start, u64 size) static int online_memory_block(struct memory_block *mem, void *arg) { - mem->online_type = mhp_default_online_type; + mem->online_type = mhp_get_default_online_type(); return device_online(&mem->dev); } @@ -1567,7 +1594,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) merge_system_ram_resource(res); /* online pages if requested */ - if (mhp_default_online_type != MMOP_OFFLINE) + if (mhp_get_default_online_type() != MMOP_OFFLINE) walk_memory_blocks(start, size, NULL, online_memory_block); return ret; @@ -1729,12 +1756,10 @@ static int scan_movable_pages(unsigned long start, unsigned long end, { unsigned long pfn; - for (pfn = start; pfn < end; pfn++) { + for_each_valid_pfn(pfn, start, end) { struct page *page; struct folio *folio; - if (!pfn_valid(pfn)) - continue; page = pfn_to_page(pfn); if (PageLRU(page)) goto found; @@ -1778,43 +1803,32 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - for (pfn = start_pfn; pfn < end_pfn; pfn++) { + for_each_valid_pfn(pfn, start_pfn, end_pfn) { struct page *page; - if (!pfn_valid(pfn)) - continue; page = pfn_to_page(pfn); folio = page_folio(page); - /* - * No reference or lock is held on the folio, so it might - * be modified concurrently (e.g. split). As such, - * folio_nr_pages() may read garbage. This is fine as the outer - * loop will revisit the split folio later. - */ + if (!folio_try_get(folio)) + continue; + + if (unlikely(page_folio(page) != folio)) + goto put_folio; + if (folio_test_large(folio)) pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1; - /* - * HWPoison pages have elevated reference counts so the migration would - * fail on them. It also doesn't make any sense to migrate them in the - * first place. Still try to unmap such a page in case it is still mapped - * (keep the unmap as the catch all safety net). - */ - if (folio_test_hwpoison(folio) || - (folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) { + if (folio_contain_hwpoisoned_page(folio)) { if (WARN_ON(folio_test_lru(folio))) folio_isolate_lru(folio); - if (folio_mapped(folio)) - unmap_poisoned_folio(folio, TTU_IGNORE_MLOCK); - continue; - } - - if (!folio_try_get(folio)) - continue; + if (folio_mapped(folio)) { + folio_lock(folio); + unmap_poisoned_folio(folio, pfn, false); + folio_unlock(folio); + } - if (unlikely(page_folio(page) != folio)) goto put_folio; + } if (!isolate_folio_to_list(folio, &source)) { if (__ratelimit(&migrate_rs)) { @@ -1830,7 +1844,7 @@ put_folio: nodemask_t nmask = node_states[N_MEMORY]; struct migration_target_control mtc = { .nmask = &nmask, - .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + .gfp_mask = GFP_KERNEL | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, .reason = MR_MEMORY_HOTPLUG, }; int ret; @@ -1992,8 +2006,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages, /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, - MEMORY_OFFLINE | REPORT_FAILURE, - GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL); + MEMORY_OFFLINE | REPORT_FAILURE); if (ret) { reason = "failure to isolate range"; goto failed_removal_pcplists_disabled; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index bb37cd1a51d8..3b1dfd08338b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -109,10 +109,12 @@ #include <linux/mmu_notifier.h> #include <linux/printk.h> #include <linux/swapops.h> +#include <linux/gcd.h> #include <asm/tlbflush.h> #include <asm/tlb.h> #include <linux/uaccess.h> +#include <linux/memory.h> #include "internal.h" @@ -139,31 +141,138 @@ static struct mempolicy default_policy = { static struct mempolicy preferred_node_policy[MAX_NUMNODES]; /* - * iw_table is the sysfs-set interleave weight table, a value of 0 denotes - * system-default value should be used. A NULL iw_table also denotes that - * system-default values should be used. Until the system-default table - * is implemented, the system-default is always 1. - * - * iw_table is RCU protected + * weightiness balances the tradeoff between small weights (cycles through nodes + * faster, more fair/even distribution) and large weights (smaller errors + * between actual bandwidth ratios and weight ratios). 32 is a number that has + * been found to perform at a reasonable compromise between the two goals. + */ +static const int weightiness = 32; + +/* + * A null weighted_interleave_state is interpreted as having .mode="auto", + * and .iw_table is interpreted as an array of 1s with length nr_node_ids. + */ +struct weighted_interleave_state { + bool mode_auto; + u8 iw_table[]; +}; +static struct weighted_interleave_state __rcu *wi_state; +static unsigned int *node_bw_table; + +/* + * wi_state_lock protects both wi_state and node_bw_table. + * node_bw_table is only used by writers to update wi_state. */ -static u8 __rcu *iw_table; -static DEFINE_MUTEX(iw_table_lock); +static DEFINE_MUTEX(wi_state_lock); static u8 get_il_weight(int node) { - u8 *table; - u8 weight; + struct weighted_interleave_state *state; + u8 weight = 1; rcu_read_lock(); - table = rcu_dereference(iw_table); - /* if no iw_table, use system default */ - weight = table ? table[node] : 1; - /* if value in iw_table is 0, use system default */ - weight = weight ? weight : 1; + state = rcu_dereference(wi_state); + if (state) + weight = state->iw_table[node]; rcu_read_unlock(); return weight; } +/* + * Convert bandwidth values into weighted interleave weights. + * Call with wi_state_lock. + */ +static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw) +{ + u64 sum_bw = 0; + unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0; + int nid; + + for_each_node_state(nid, N_MEMORY) + sum_bw += bw[nid]; + + /* Scale bandwidths to whole numbers in the range [1, weightiness] */ + for_each_node_state(nid, N_MEMORY) { + /* + * Try not to perform 64-bit division. + * If sum_bw < scaling_factor, then sum_bw < U32_MAX. + * If sum_bw > scaling_factor, then round the weight up to 1. + */ + scaling_factor = weightiness * bw[nid]; + if (bw[nid] && sum_bw < scaling_factor) { + cast_sum_bw = (unsigned int)sum_bw; + new_iw[nid] = scaling_factor / cast_sum_bw; + } else { + new_iw[nid] = 1; + } + if (!iw_gcd) + iw_gcd = new_iw[nid]; + iw_gcd = gcd(iw_gcd, new_iw[nid]); + } + + /* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */ + for_each_node_state(nid, N_MEMORY) + new_iw[nid] /= iw_gcd; +} + +int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords) +{ + struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; + unsigned int *old_bw, *new_bw; + unsigned int bw_val; + int i; + + bw_val = min(coords->read_bandwidth, coords->write_bandwidth); + new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL); + if (!new_bw) + return -ENOMEM; + + new_wi_state = kmalloc(struct_size(new_wi_state, iw_table, nr_node_ids), + GFP_KERNEL); + if (!new_wi_state) { + kfree(new_bw); + return -ENOMEM; + } + new_wi_state->mode_auto = true; + for (i = 0; i < nr_node_ids; i++) + new_wi_state->iw_table[i] = 1; + + /* + * Update bandwidth info, even in manual mode. That way, when switching + * to auto mode in the future, iw_table can be overwritten using + * accurate bw data. + */ + mutex_lock(&wi_state_lock); + + old_bw = node_bw_table; + if (old_bw) + memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw)); + new_bw[node] = bw_val; + node_bw_table = new_bw; + + old_wi_state = rcu_dereference_protected(wi_state, + lockdep_is_held(&wi_state_lock)); + if (old_wi_state && !old_wi_state->mode_auto) { + /* Manual mode; skip reducing weights and updating wi_state */ + mutex_unlock(&wi_state_lock); + kfree(new_wi_state); + goto out; + } + + /* NULL wi_state assumes auto=true; reduce weights and update wi_state*/ + reduce_interleave_weights(new_bw, new_wi_state->iw_table); + rcu_assign_pointer(wi_state, new_wi_state); + + mutex_unlock(&wi_state_lock); + if (old_wi_state) { + synchronize_rcu(); + kfree(old_wi_state); + } +out: + kfree(old_bw); + return 0; +} + /** * numa_nearest_node - Find nearest node by state * @node: Node id to start the search @@ -196,6 +305,37 @@ int numa_nearest_node(int node, unsigned int state) } EXPORT_SYMBOL_GPL(numa_nearest_node); +/** + * nearest_node_nodemask - Find the node in @mask at the nearest distance + * from @node. + * + * @node: a valid node ID to start the search from. + * @mask: a pointer to a nodemask representing the allowed nodes. + * + * This function iterates over all nodes in @mask and calculates the + * distance from the starting @node, then it returns the node ID that is + * the closest to @node, or MAX_NUMNODES if no node is found. + * + * Note that @node must be a valid node ID usable with node_distance(), + * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes + * or unexpected behavior. + */ +int nearest_node_nodemask(int node, nodemask_t *mask) +{ + int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES; + + for_each_node_mask(n, *mask) { + dist = node_distance(node, n); + if (dist < min_dist) { + min_dist = dist; + min_node = n; + } + } + + return min_node; +} +EXPORT_SYMBOL_GPL(nearest_node_nodemask); + struct mempolicy *get_task_policy(struct task_struct *p) { struct mempolicy *pol = p->mempolicy; @@ -535,6 +675,7 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { + const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct vm_area_struct *vma = walk->vma; struct folio *folio; struct queue_pages *qp = walk->private; @@ -542,6 +683,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, pte_t *pte, *mapped_pte; pte_t ptent; spinlock_t *ptl; + int max_nr, nr; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { @@ -555,7 +697,9 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, walk->action = ACTION_AGAIN; return 0; } - for (; addr != end; pte++, addr += PAGE_SIZE) { + for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) { + max_nr = (end - addr) >> PAGE_SHIFT; + nr = 1; ptent = ptep_get(pte); if (pte_none(ptent)) continue; @@ -567,6 +711,10 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; + if (folio_test_large(folio) && max_nr != 1) + nr = folio_pte_batch(folio, addr, pte, ptent, + max_nr, fpb_flags, + NULL, NULL, NULL); /* * vm_normal_folio() filters out zero pages, but there might * still be reserved folios to skip, perhaps in a VDSO. @@ -599,7 +747,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || !vma_migratable(vma) || !migrate_folio_add(folio, qp->pagelist, flags)) { - qp->nr_failed++; + qp->nr_failed += nr; if (strictly_unmovable(flags)) break; } @@ -642,12 +790,12 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. * Choosing not to migrate a shared folio is not counted as a failure. * - * See folio_likely_mapped_shared() on possible imprecision when we + * See folio_maybe_mapped_shared() on possible imprecision when we * cannot easily detect if a folio is shared. */ if ((flags & MPOL_MF_MOVE_ALL) || - (!folio_likely_mapped_shared(folio) && !hugetlb_pmd_shared(pte))) - if (!isolate_hugetlb(folio, qp->pagelist)) + (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte))) + if (!folio_isolate_hugetlb(folio, qp->pagelist)) qp->nr_failed++; unlock: spin_unlock(ptl); @@ -1033,10 +1181,10 @@ static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. * Choosing not to migrate a shared folio is not counted as a failure. * - * See folio_likely_mapped_shared() on possible imprecision when we + * See folio_maybe_mapped_shared() on possible imprecision when we * cannot easily detect if a folio is shared. */ - if ((flags & MPOL_MF_MOVE_ALL) || !folio_likely_mapped_shared(folio)) { + if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) { if (folio_isolate_lru(folio)) { list_add_tail(&folio->lru, foliolist); node_stat_mod_folio(folio, @@ -1080,6 +1228,10 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest, mmap_read_lock(mm); vma = find_vma(mm, 0); + if (unlikely(!vma)) { + mmap_read_unlock(mm); + return 0; + } /* * This does not migrate the range, but isolates all pages that @@ -1979,26 +2131,28 @@ static unsigned int read_once_policy_nodemask(struct mempolicy *pol, static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx) { + struct weighted_interleave_state *state; nodemask_t nodemask; unsigned int target, nr_nodes; - u8 *table; + u8 *table = NULL; unsigned int weight_total = 0; u8 weight; - int nid; + int nid = 0; nr_nodes = read_once_policy_nodemask(pol, &nodemask); if (!nr_nodes) return numa_node_id(); rcu_read_lock(); - table = rcu_dereference(iw_table); + + state = rcu_dereference(wi_state); + /* Uninitialized wi_state means we should assume all weights are 1 */ + if (state) + table = state->iw_table; + /* calculate the total weight */ - for_each_node_mask(nid, nodemask) { - /* detect system default usage */ - weight = table ? table[nid] : 1; - weight = weight ? weight : 1; - weight_total += weight; - } + for_each_node_mask(nid, nodemask) + weight_total += table ? table[nid] : 1; /* Calculate the node offset based on totals */ target = ilx % weight_total; @@ -2006,7 +2160,6 @@ static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx) while (target) { /* detect system default usage */ weight = table ? table[nid] : 1; - weight = weight ? weight : 1; if (target < weight) break; target -= weight; @@ -2201,9 +2354,9 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, */ preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - page = __alloc_pages_noprof(preferred_gfp, order, nid, nodemask); + page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask); if (!page) - page = __alloc_pages_noprof(gfp, order, nid, NULL); + page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL); return page; } @@ -2218,7 +2371,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, * * Return: The page on success or NULL if allocation fails. */ -struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, +static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, struct mempolicy *pol, pgoff_t ilx, int nid) { nodemask_t *nodemask; @@ -2249,8 +2402,9 @@ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, * First, try to allocate THP only on local node, but * don't reclaim unnecessarily, just compact. */ - page = __alloc_pages_node_noprof(nid, - gfp | __GFP_THISNODE | __GFP_NORETRY, order); + page = __alloc_frozen_pages_noprof( + gfp | __GFP_THISNODE | __GFP_NORETRY, order, + nid, NULL); if (page || !(gfp & __GFP_DIRECT_RECLAIM)) return page; /* @@ -2262,9 +2416,10 @@ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, } } - page = __alloc_pages_noprof(gfp, order, nid, nodemask); + page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask); - if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) { + if (unlikely(pol->mode == MPOL_INTERLEAVE || + pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) { /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */ if (static_branch_likely(&vm_numa_stat_key) && page_to_nid(page) == nid) { @@ -2280,8 +2435,13 @@ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *pol, pgoff_t ilx, int nid) { - return page_rmappable_folio(alloc_pages_mpol_noprof(gfp | __GFP_COMP, - order, pol, ilx, nid)); + struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol, + ilx, nid); + if (!page) + return NULL; + + set_page_refcounted(page); + return page_rmappable_folio(page); } /** @@ -2295,7 +2455,7 @@ struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the * VMA to prevent it from going away. Should be used for all allocations * for folios that will be mapped into user space, excepting hugetlbfs, and - * excepting where direct use of alloc_pages_mpol() is more appropriate. + * excepting where direct use of folio_alloc_mpol() is more appropriate. * * Return: The folio on success or NULL if allocation fails. */ @@ -2316,6 +2476,21 @@ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct } EXPORT_SYMBOL(vma_alloc_folio_noprof); +struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order) +{ + struct mempolicy *pol = &default_policy; + + /* + * No reference counting needed for current->mempolicy + * nor system default_policy + */ + if (!in_interrupt() && !(gfp & __GFP_THISNODE)) + pol = get_task_policy(current); + + return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX, + numa_node_id()); +} + /** * alloc_pages - Allocate pages. * @gfp: GFP flags. @@ -2332,17 +2507,11 @@ EXPORT_SYMBOL(vma_alloc_folio_noprof); */ struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order) { - struct mempolicy *pol = &default_policy; - - /* - * No reference counting needed for current->mempolicy - * nor system default_policy - */ - if (!in_interrupt() && !(gfp & __GFP_THISNODE)) - pol = get_task_policy(current); + struct page *page = alloc_frozen_pages_noprof(gfp, order); - return alloc_pages_mpol_noprof(gfp, order, pol, NO_INTERLEAVE_INDEX, - numa_node_id()); + if (page) + set_page_refcounted(page); + return page; } EXPORT_SYMBOL(alloc_pages_noprof); @@ -2352,7 +2521,7 @@ struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) } EXPORT_SYMBOL(folio_alloc_noprof); -static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, +static unsigned long alloc_pages_bulk_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { @@ -2371,13 +2540,13 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, if (delta) { nr_allocated = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL, - nr_pages_per_node + 1, NULL, + nr_pages_per_node + 1, page_array); delta--; } else { nr_allocated = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL, - nr_pages_per_node, NULL, page_array); + nr_pages_per_node, page_array); } page_array += nr_allocated; @@ -2387,17 +2556,18 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, return total_allocated; } -static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, +static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { + struct weighted_interleave_state *state; struct task_struct *me = current; unsigned int cpuset_mems_cookie; unsigned long total_allocated = 0; unsigned long nr_allocated = 0; unsigned long rounds; unsigned long node_pages, delta; - u8 *table, *weights, weight; + u8 *weights, weight; unsigned int weight_total = 0; unsigned long rem_pages = nr_pages; nodemask_t nodes; @@ -2426,7 +2596,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, if (weight && node_isset(node, nodes)) { node_pages = min(rem_pages, weight); nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, - NULL, page_array); + page_array); page_array += nr_allocated; total_allocated += nr_allocated; /* if that's all the pages, no need to interleave */ @@ -2447,17 +2617,19 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, return total_allocated; rcu_read_lock(); - table = rcu_dereference(iw_table); - if (table) - memcpy(weights, table, nr_node_ids); - rcu_read_unlock(); + state = rcu_dereference(wi_state); + if (state) { + memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8)); + rcu_read_unlock(); + } else { + rcu_read_unlock(); + for (i = 0; i < nr_node_ids; i++) + weights[i] = 1; + } /* calculate total, detect system default usage */ - for_each_node_mask(node, nodes) { - if (!weights[node]) - weights[node] = 1; + for_each_node_mask(node, nodes) weight_total += weights[node]; - } /* * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls. @@ -2489,7 +2661,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, if (!node_pages) break; nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, - NULL, page_array); + page_array); page_array += nr_allocated; total_allocated += nr_allocated; if (total_allocated == nr_pages) @@ -2502,7 +2674,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, return total_allocated; } -static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, +static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { @@ -2513,11 +2685,11 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes, - nr_pages, NULL, page_array); + nr_pages, page_array); if (nr_allocated < nr_pages) nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL, - nr_pages - nr_allocated, NULL, + nr_pages - nr_allocated, page_array + nr_allocated); return nr_allocated; } @@ -2528,7 +2700,7 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, * It can accelerate memory allocation especially interleaving * allocate memory. */ -unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, +unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array) { struct mempolicy *pol = &default_policy; @@ -2539,21 +2711,21 @@ unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, pol = get_task_policy(current); if (pol->mode == MPOL_INTERLEAVE) - return alloc_pages_bulk_array_interleave(gfp, pol, + return alloc_pages_bulk_interleave(gfp, pol, nr_pages, page_array); if (pol->mode == MPOL_WEIGHTED_INTERLEAVE) - return alloc_pages_bulk_array_weighted_interleave( + return alloc_pages_bulk_weighted_interleave( gfp, pol, nr_pages, page_array); if (pol->mode == MPOL_PREFERRED_MANY) - return alloc_pages_bulk_array_preferred_many(gfp, + return alloc_pages_bulk_preferred_many(gfp, numa_node_id(), pol, nr_pages, page_array); nid = numa_node_id(); nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); return alloc_pages_bulk_noprof(gfp, nid, nodemask, - nr_pages, NULL, page_array); + nr_pages, page_array); } int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) @@ -3368,6 +3540,14 @@ struct iw_node_attr { int nid; }; +struct sysfs_wi_group { + struct kobject wi_kobj; + struct mutex kobj_lock; + struct iw_node_attr *nattrs[]; +}; + +static struct sysfs_wi_group *wi_group; + static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -3382,177 +3562,316 @@ static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr, static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { + struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; struct iw_node_attr *node_attr; - u8 *new; - u8 *old; u8 weight = 0; + int i; node_attr = container_of(attr, struct iw_node_attr, kobj_attr); - if (count == 0 || sysfs_streq(buf, "")) - weight = 0; - else if (kstrtou8(buf, 0, &weight)) + if (count == 0 || sysfs_streq(buf, "") || + kstrtou8(buf, 0, &weight) || weight == 0) return -EINVAL; - new = kzalloc(nr_node_ids, GFP_KERNEL); - if (!new) + new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids), + GFP_KERNEL); + if (!new_wi_state) return -ENOMEM; - mutex_lock(&iw_table_lock); - old = rcu_dereference_protected(iw_table, - lockdep_is_held(&iw_table_lock)); - if (old) - memcpy(new, old, nr_node_ids); - new[node_attr->nid] = weight; - rcu_assign_pointer(iw_table, new); - mutex_unlock(&iw_table_lock); - synchronize_rcu(); - kfree(old); + mutex_lock(&wi_state_lock); + old_wi_state = rcu_dereference_protected(wi_state, + lockdep_is_held(&wi_state_lock)); + if (old_wi_state) { + memcpy(new_wi_state->iw_table, old_wi_state->iw_table, + nr_node_ids * sizeof(u8)); + } else { + for (i = 0; i < nr_node_ids; i++) + new_wi_state->iw_table[i] = 1; + } + new_wi_state->iw_table[node_attr->nid] = weight; + new_wi_state->mode_auto = false; + + rcu_assign_pointer(wi_state, new_wi_state); + mutex_unlock(&wi_state_lock); + if (old_wi_state) { + synchronize_rcu(); + kfree(old_wi_state); + } return count; } -static struct iw_node_attr **node_attrs; - -static void sysfs_wi_node_release(struct iw_node_attr *node_attr, - struct kobject *parent) +static ssize_t weighted_interleave_auto_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) { - if (!node_attr) - return; - sysfs_remove_file(parent, &node_attr->kobj_attr.attr); - kfree(node_attr->kobj_attr.attr.name); - kfree(node_attr); + struct weighted_interleave_state *state; + bool wi_auto = true; + + rcu_read_lock(); + state = rcu_dereference(wi_state); + if (state) + wi_auto = state->mode_auto; + rcu_read_unlock(); + + return sysfs_emit(buf, "%s\n", str_true_false(wi_auto)); } -static void sysfs_wi_release(struct kobject *wi_kobj) +static ssize_t weighted_interleave_auto_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) { + struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; + unsigned int *bw; + bool input; int i; + if (kstrtobool(buf, &input)) + return -EINVAL; + + new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids), + GFP_KERNEL); + if (!new_wi_state) + return -ENOMEM; for (i = 0; i < nr_node_ids; i++) - sysfs_wi_node_release(node_attrs[i], wi_kobj); - kobject_put(wi_kobj); + new_wi_state->iw_table[i] = 1; + + mutex_lock(&wi_state_lock); + if (!input) { + old_wi_state = rcu_dereference_protected(wi_state, + lockdep_is_held(&wi_state_lock)); + if (!old_wi_state) + goto update_wi_state; + if (input == old_wi_state->mode_auto) { + mutex_unlock(&wi_state_lock); + return count; + } + + memcpy(new_wi_state->iw_table, old_wi_state->iw_table, + nr_node_ids * sizeof(u8)); + goto update_wi_state; + } + + bw = node_bw_table; + if (!bw) { + mutex_unlock(&wi_state_lock); + kfree(new_wi_state); + return -ENODEV; + } + + new_wi_state->mode_auto = true; + reduce_interleave_weights(bw, new_wi_state->iw_table); + +update_wi_state: + rcu_assign_pointer(wi_state, new_wi_state); + mutex_unlock(&wi_state_lock); + if (old_wi_state) { + synchronize_rcu(); + kfree(old_wi_state); + } + return count; +} + +static void sysfs_wi_node_delete(int nid) +{ + struct iw_node_attr *attr; + + if (nid < 0 || nid >= nr_node_ids) + return; + + mutex_lock(&wi_group->kobj_lock); + attr = wi_group->nattrs[nid]; + if (!attr) { + mutex_unlock(&wi_group->kobj_lock); + return; + } + + wi_group->nattrs[nid] = NULL; + mutex_unlock(&wi_group->kobj_lock); + + sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr); + kfree(attr->kobj_attr.attr.name); + kfree(attr); +} + +static void sysfs_wi_node_delete_all(void) +{ + int nid; + + for (nid = 0; nid < nr_node_ids; nid++) + sysfs_wi_node_delete(nid); +} + +static void wi_state_free(void) +{ + struct weighted_interleave_state *old_wi_state; + + mutex_lock(&wi_state_lock); + + old_wi_state = rcu_dereference_protected(wi_state, + lockdep_is_held(&wi_state_lock)); + if (!old_wi_state) { + mutex_unlock(&wi_state_lock); + return; + } + + rcu_assign_pointer(wi_state, NULL); + mutex_unlock(&wi_state_lock); + synchronize_rcu(); + kfree(old_wi_state); +} + +static struct kobj_attribute wi_auto_attr = + __ATTR(auto, 0664, weighted_interleave_auto_show, + weighted_interleave_auto_store); + +static void wi_cleanup(void) { + sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr); + sysfs_wi_node_delete_all(); + wi_state_free(); +} + +static void wi_kobj_release(struct kobject *wi_kobj) +{ + kfree(wi_group); } static const struct kobj_type wi_ktype = { .sysfs_ops = &kobj_sysfs_ops, - .release = sysfs_wi_release, + .release = wi_kobj_release, }; -static int add_weight_node(int nid, struct kobject *wi_kobj) +static int sysfs_wi_node_add(int nid) { - struct iw_node_attr *node_attr; + int ret; char *name; + struct iw_node_attr *new_attr; - node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL); - if (!node_attr) + if (nid < 0 || nid >= nr_node_ids) { + pr_err("invalid node id: %d\n", nid); + return -EINVAL; + } + + new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL); + if (!new_attr) return -ENOMEM; name = kasprintf(GFP_KERNEL, "node%d", nid); if (!name) { - kfree(node_attr); + kfree(new_attr); return -ENOMEM; } - sysfs_attr_init(&node_attr->kobj_attr.attr); - node_attr->kobj_attr.attr.name = name; - node_attr->kobj_attr.attr.mode = 0644; - node_attr->kobj_attr.show = node_show; - node_attr->kobj_attr.store = node_store; - node_attr->nid = nid; + sysfs_attr_init(&new_attr->kobj_attr.attr); + new_attr->kobj_attr.attr.name = name; + new_attr->kobj_attr.attr.mode = 0644; + new_attr->kobj_attr.show = node_show; + new_attr->kobj_attr.store = node_store; + new_attr->nid = nid; - if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) { - kfree(node_attr->kobj_attr.attr.name); - kfree(node_attr); - pr_err("failed to add attribute to weighted_interleave\n"); - return -ENOMEM; + mutex_lock(&wi_group->kobj_lock); + if (wi_group->nattrs[nid]) { + mutex_unlock(&wi_group->kobj_lock); + ret = -EEXIST; + goto out; } - node_attrs[nid] = node_attr; + ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr); + if (ret) { + mutex_unlock(&wi_group->kobj_lock); + goto out; + } + wi_group->nattrs[nid] = new_attr; + mutex_unlock(&wi_group->kobj_lock); return 0; + +out: + kfree(new_attr->kobj_attr.attr.name); + kfree(new_attr); + return ret; } -static int add_weighted_interleave_group(struct kobject *root_kobj) +static int wi_node_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + int err; + struct memory_notify *arg = data; + int nid = arg->status_change_nid; + + if (nid < 0) + return NOTIFY_OK; + + switch (action) { + case MEM_ONLINE: + err = sysfs_wi_node_add(nid); + if (err) + pr_err("failed to add sysfs for node%d during hotplug: %d\n", + nid, err); + break; + case MEM_OFFLINE: + sysfs_wi_node_delete(nid); + break; + } + + return NOTIFY_OK; +} + +static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj) { - struct kobject *wi_kobj; int nid, err; - wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); - if (!wi_kobj) + wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids), + GFP_KERNEL); + if (!wi_group) return -ENOMEM; + mutex_init(&wi_group->kobj_lock); - err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj, + err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj, "weighted_interleave"); - if (err) { - kfree(wi_kobj); - return err; - } + if (err) + goto err_put_kobj; - for_each_node_state(nid, N_POSSIBLE) { - err = add_weight_node(nid, wi_kobj); + err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr); + if (err) + goto err_put_kobj; + + for_each_online_node(nid) { + if (!node_state(nid, N_MEMORY)) + continue; + + err = sysfs_wi_node_add(nid); if (err) { - pr_err("failed to add sysfs [node%d]\n", nid); - break; + pr_err("failed to add sysfs for node%d during init: %d\n", + nid, err); + goto err_cleanup_kobj; } } - if (err) - kobject_put(wi_kobj); - return 0; -} -static void mempolicy_kobj_release(struct kobject *kobj) -{ - u8 *old; + hotplug_memory_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI); + return 0; - mutex_lock(&iw_table_lock); - old = rcu_dereference_protected(iw_table, - lockdep_is_held(&iw_table_lock)); - rcu_assign_pointer(iw_table, NULL); - mutex_unlock(&iw_table_lock); - synchronize_rcu(); - kfree(old); - kfree(node_attrs); - kfree(kobj); +err_cleanup_kobj: + wi_cleanup(); + kobject_del(&wi_group->wi_kobj); +err_put_kobj: + kobject_put(&wi_group->wi_kobj); + return err; } -static const struct kobj_type mempolicy_ktype = { - .release = mempolicy_kobj_release -}; - static int __init mempolicy_sysfs_init(void) { int err; static struct kobject *mempolicy_kobj; - mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL); - if (!mempolicy_kobj) { - err = -ENOMEM; - goto err_out; - } - - node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *), - GFP_KERNEL); - if (!node_attrs) { - err = -ENOMEM; - goto mempol_out; - } + mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj); + if (!mempolicy_kobj) + return -ENOMEM; - err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj, - "mempolicy"); + err = add_weighted_interleave_group(mempolicy_kobj); if (err) - goto node_out; + goto err_kobj; - err = add_weighted_interleave_group(mempolicy_kobj); - if (err) { - pr_err("mempolicy sysfs structure failed to initialize\n"); - kobject_put(mempolicy_kobj); - return err; - } + return 0; - return err; -node_out: - kfree(node_attrs); -mempol_out: - kfree(mempolicy_kobj); -err_out: - pr_err("failed to add mempolicy kobject to the system\n"); +err_kobj: + kobject_del(mempolicy_kobj); + kobject_put(mempolicy_kobj); return err; } diff --git a/mm/memremap.c b/mm/memremap.c index 40d4547ce514..c417c843e9b1 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -130,7 +130,7 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) } mem_hotplug_done(); - untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true); + pfnmap_untrack(PHYS_PFN(range->start), range_len(range)); pgmap_array_delete(range); } @@ -211,8 +211,8 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, if (nid < 0) nid = numa_mem_id(); - error = track_pfn_remap(NULL, ¶ms->pgprot, PHYS_PFN(range->start), 0, - range_len(range)); + error = pfnmap_track(PHYS_PFN(range->start), range_len(range), + ¶ms->pgprot); if (error) goto err_pfn_remap; @@ -277,7 +277,7 @@ err_add_memory: if (!is_private) kasan_remove_zero_shadow(__va(range->start), range_len(range)); err_kasan: - untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true); + pfnmap_untrack(PHYS_PFN(range->start), range_len(range)); err_pfn_remap: pgmap_array_delete(range); return error; @@ -458,8 +458,9 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap); void free_zone_device_folio(struct folio *folio) { - if (WARN_ON_ONCE(!folio->page.pgmap->ops || - !folio->page.pgmap->ops->page_free)) + struct dev_pagemap *pgmap = folio->pgmap; + + if (WARN_ON_ONCE(!pgmap)) return; mem_cgroup_uncharge(folio); @@ -484,19 +485,42 @@ void free_zone_device_folio(struct folio *folio) * For other types of ZONE_DEVICE pages, migration is either * handled differently or not done at all, so there is no need * to clear folio->mapping. + * + * FS DAX pages clear the mapping when the folio->share count hits + * zero which indicating the page has been removed from the file + * system mapping. */ - folio->mapping = NULL; - folio->page.pgmap->ops->page_free(folio_page(folio, 0)); + if (pgmap->type != MEMORY_DEVICE_FS_DAX && + pgmap->type != MEMORY_DEVICE_GENERIC) + folio->mapping = NULL; + + switch (pgmap->type) { + case MEMORY_DEVICE_PRIVATE: + case MEMORY_DEVICE_COHERENT: + if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free)) + break; + pgmap->ops->page_free(folio_page(folio, 0)); + put_dev_pagemap(pgmap); + break; - if (folio->page.pgmap->type != MEMORY_DEVICE_PRIVATE && - folio->page.pgmap->type != MEMORY_DEVICE_COHERENT) + case MEMORY_DEVICE_GENERIC: /* * Reset the refcount to 1 to prepare for handing out the page * again. */ folio_set_count(folio, 1); - else - put_dev_pagemap(folio->page.pgmap); + break; + + case MEMORY_DEVICE_FS_DAX: + wake_up_var(&folio->page); + break; + + case MEMORY_DEVICE_PCI_P2PDMA: + if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free)) + break; + pgmap->ops->page_free(folio_page(folio, 0)); + break; + } } void zone_device_page_init(struct page *page) @@ -505,26 +529,8 @@ void zone_device_page_init(struct page *page) * Drivers shouldn't be allocating pages after calling * memunmap_pages(). */ - WARN_ON_ONCE(!percpu_ref_tryget_live(&page->pgmap->ref)); + WARN_ON_ONCE(!percpu_ref_tryget_live(&page_pgmap(page)->ref)); set_page_count(page, 1); lock_page(page); } EXPORT_SYMBOL_GPL(zone_device_page_init); - -#ifdef CONFIG_FS_DAX -bool __put_devmap_managed_folio_refs(struct folio *folio, int refs) -{ - if (folio->page.pgmap->type != MEMORY_DEVICE_FS_DAX) - return false; - - /* - * fsdax page refcounts are 1-based, rather than 0-based: if - * refcount is 1, then the page is free and the refcount is - * stable because nobody holds a reference on the page. - */ - if (folio_ref_sub_return(folio, refs) == 1) - wake_up_var(&folio->_refcount); - return true; -} -EXPORT_SYMBOL(__put_devmap_managed_folio_refs); -#endif /* CONFIG_FS_DAX */ diff --git a/mm/migrate.c b/mm/migrate.c index 2ce6b4b814df..8cf0f9c9599d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -50,6 +50,7 @@ #include <trace/events/migrate.h> #include "internal.h" +#include "swap.h" bool isolate_movable_page(struct page *page, isolate_mode_t mode) { @@ -68,10 +69,6 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode) if (!folio) goto out; - if (unlikely(folio_test_slab(folio))) - goto out_putfolio; - /* Pairs with smp_wmb() in slab freeing, e.g. SLUB's __free_slab() */ - smp_rmb(); /* * Check movable flag before taking the page lock because * we use non-atomic bitops on newly allocated page flags so @@ -79,10 +76,6 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode) */ if (unlikely(!__folio_test_movable(folio))) goto out_putfolio; - /* Pairs with smp_wmb() in slab allocation, e.g. SLUB's alloc_slab_page() */ - smp_rmb(); - if (unlikely(folio_test_slab(folio))) - goto out_putfolio; /* * As movable pages are not isolated from LRU lists, concurrent @@ -136,7 +129,7 @@ static void putback_movable_folio(struct folio *folio) * * This function shall be used whenever the isolated pageset has been * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() - * and isolate_hugetlb(). + * and folio_isolate_hugetlb(). */ void putback_movable_pages(struct list_head *l) { @@ -145,7 +138,7 @@ void putback_movable_pages(struct list_head *l) list_for_each_entry_safe(folio, folio2, l, lru) { if (unlikely(folio_test_hugetlb(folio))) { - folio_putback_active_hugetlb(folio); + folio_putback_hugetlb(folio); continue; } list_del(&folio->lru); @@ -177,7 +170,7 @@ bool isolate_folio_to_list(struct folio *folio, struct list_head *list) bool isolated, lru; if (folio_test_hugetlb(folio)) - return isolate_hugetlb(folio, list); + return folio_isolate_hugetlb(folio, list); lru = !__folio_test_movable(folio); if (lru) @@ -210,7 +203,7 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw, return false; VM_BUG_ON_PAGE(!PageAnon(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page); + VM_BUG_ON_PAGE(pte_present(ptep_get(pvmw->pte)), page); if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) || mm_forbids_zeropage(pvmw->vma->vm_mm)) @@ -336,7 +329,7 @@ static bool remove_migration_pte(struct folio *folio, folio_add_file_rmap_pte(folio, new, vma); set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } - if (vma->vm_flags & VM_LOCKED) + if (READ_ONCE(vma->vm_flags) & VM_LOCKED) mlock_drain_local(); trace_remove_migration_pte(pvmw.address, pte_val(pte), @@ -453,20 +446,6 @@ unlock: } #endif -static int folio_expected_refs(struct address_space *mapping, - struct folio *folio) -{ - int refs = 1; - if (!mapping) - return refs; - - refs += folio_nr_pages(folio); - if (folio_test_private(folio)) - refs++; - - return refs; -} - /* * Replace the folio in the mapping. * @@ -526,15 +505,13 @@ static int __folio_migrate_mapping(struct address_space *mapping, if (folio_test_anon(folio) && folio_test_large(folio)) mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1); folio_ref_add(newfolio, nr); /* add cache reference */ - if (folio_test_swapbacked(folio)) { + if (folio_test_swapbacked(folio)) __folio_set_swapbacked(newfolio); - if (folio_test_swapcache(folio)) { - folio_set_swapcache(newfolio); - newfolio->private = folio_get_private(folio); - } + if (folio_test_swapcache(folio)) { + folio_set_swapcache(newfolio); + newfolio->private = folio_get_private(folio); entries = nr; } else { - VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); entries = 1; } @@ -611,7 +588,7 @@ static int __folio_migrate_mapping(struct address_space *mapping, int folio_migrate_mapping(struct address_space *mapping, struct folio *newfolio, struct folio *folio, int extra_count) { - int expected_count = folio_expected_refs(mapping, folio) + extra_count; + int expected_count = folio_expected_ref_count(folio) + extra_count + 1; if (folio_ref_count(folio) != expected_count) return -EAGAIN; @@ -628,7 +605,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src) { XA_STATE(xas, &mapping->i_pages, folio_index(src)); - int rc, expected_count = folio_expected_refs(mapping, src); + int rc, expected_count = folio_expected_ref_count(src) + 1; if (folio_ref_count(src) != expected_count) return -EAGAIN; @@ -745,7 +722,7 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio) folio_set_readahead(newfolio); folio_copy_owner(newfolio, folio); - pgalloc_tag_copy(newfolio, folio); + pgalloc_tag_swap(newfolio, folio); mem_cgroup_migrate(folio, newfolio); } @@ -759,7 +736,7 @@ static int __migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, void *src_private, enum migrate_mode mode) { - int rc, expected_count = folio_expected_refs(mapping, src); + int rc, expected_count = folio_expected_ref_count(src) + 1; /* Check whether src does not have extra refs before we do more work */ if (folio_ref_count(src) != expected_count) @@ -847,7 +824,7 @@ static int __buffer_migrate_folio(struct address_space *mapping, return migrate_folio(mapping, dst, src, mode); /* Check whether page does not have extra refs before we do more work */ - expected_count = folio_expected_refs(mapping, src); + expected_count = folio_expected_ref_count(src) + 1; if (folio_ref_count(src) != expected_count) return -EAGAIN; @@ -855,9 +832,11 @@ static int __buffer_migrate_folio(struct address_space *mapping, return -EAGAIN; if (check_refs) { - bool busy; + bool busy, migrating; bool invalidated = false; + migrating = test_and_set_bit_lock(BH_Migrate, &head->b_state); + VM_WARN_ON_ONCE(migrating); recheck_buffers: busy = false; spin_lock(&mapping->i_private_lock); @@ -869,12 +848,12 @@ recheck_buffers: } bh = bh->b_this_page; } while (bh != head); + spin_unlock(&mapping->i_private_lock); if (busy) { if (invalidated) { rc = -EAGAIN; goto unlock_buffers; } - spin_unlock(&mapping->i_private_lock); invalidate_bh_lrus(); invalidated = true; goto recheck_buffers; @@ -893,7 +872,7 @@ recheck_buffers: unlock_buffers: if (check_refs) - spin_unlock(&mapping->i_private_lock); + clear_bit_unlock(BH_Migrate, &head->b_state); bh = head; do { unlock_buffer(bh); @@ -955,66 +934,20 @@ int filemap_migrate_folio(struct address_space *mapping, EXPORT_SYMBOL_GPL(filemap_migrate_folio); /* - * Writeback a folio to clean the dirty state - */ -static int writeout(struct address_space *mapping, struct folio *folio) -{ - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .nr_to_write = 1, - .range_start = 0, - .range_end = LLONG_MAX, - .for_reclaim = 1 - }; - int rc; - - if (!mapping->a_ops->writepage) - /* No write method for the address space */ - return -EINVAL; - - if (!folio_clear_dirty_for_io(folio)) - /* Someone else already triggered a write */ - return -EAGAIN; - - /* - * A dirty folio may imply that the underlying filesystem has - * the folio on some queue. So the folio must be clean for - * migration. Writeout may mean we lose the lock and the - * folio state is no longer what we checked for earlier. - * At this point we know that the migration attempt cannot - * be successful. - */ - remove_migration_ptes(folio, folio, 0); - - rc = mapping->a_ops->writepage(&folio->page, &wbc); - - if (rc != AOP_WRITEPAGE_ACTIVATE) - /* unlocked. Relock */ - folio_lock(folio); - - return (rc < 0) ? -EIO : -EAGAIN; -} - -/* * Default handling if a filesystem does not provide a migration function. */ static int fallback_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { - if (folio_test_dirty(src)) { - /* Only writeback folios in full synchronous migration */ - switch (mode) { - case MIGRATE_SYNC: - break; - default: - return -EBUSY; - } - return writeout(mapping, src); - } + WARN_ONCE(mapping->a_ops->writepages, + "%ps does not implement migrate_folio\n", + mapping->a_ops); + if (folio_test_dirty(src)) + return -EBUSY; /* - * Buffers may be managed in a filesystem specific way. - * We must have no buffers or drop them. + * Filesystem may have private data at folio->private that we + * can't migrate automatically. */ if (!filemap_release_folio(src, GFP_KERNEL)) return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY; @@ -1459,7 +1392,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, if (folio_ref_count(src) == 1) { /* page was freed from under us. So we are done. */ - folio_putback_active_hugetlb(src); + folio_putback_hugetlb(src); return MIGRATEPAGE_SUCCESS; } @@ -1542,19 +1475,19 @@ out_unlock: folio_unlock(src); out: if (rc == MIGRATEPAGE_SUCCESS) - folio_putback_active_hugetlb(src); + folio_putback_hugetlb(src); else if (rc != -EAGAIN) list_move_tail(&src->lru, ret); /* - * If migration was not successful and there's a freeing callback, use - * it. Otherwise, put_page() will drop the reference grabbed during - * isolation. + * If migration was not successful and there's a freeing callback, + * return the folio to that special allocator. Otherwise, simply drop + * our additional reference. */ if (put_new_folio) put_new_folio(dst, private); else - folio_putback_active_hugetlb(dst); + folio_put(dst); return rc; } @@ -1695,6 +1628,81 @@ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio, return nr_failed; } +static void migrate_folios_move(struct list_head *src_folios, + struct list_head *dst_folios, + free_folio_t put_new_folio, unsigned long private, + enum migrate_mode mode, int reason, + struct list_head *ret_folios, + struct migrate_pages_stats *stats, + int *retry, int *thp_retry, int *nr_failed, + int *nr_retry_pages) +{ + struct folio *folio, *folio2, *dst, *dst2; + bool is_thp; + int nr_pages; + int rc; + + dst = list_first_entry(dst_folios, struct folio, lru); + dst2 = list_next_entry(dst, lru); + list_for_each_entry_safe(folio, folio2, src_folios, lru) { + is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio); + nr_pages = folio_nr_pages(folio); + + cond_resched(); + + rc = migrate_folio_move(put_new_folio, private, + folio, dst, mode, + reason, ret_folios); + /* + * The rules are: + * Success: folio will be freed + * -EAGAIN: stay on the unmap_folios list + * Other errno: put on ret_folios list + */ + switch (rc) { + case -EAGAIN: + *retry += 1; + *thp_retry += is_thp; + *nr_retry_pages += nr_pages; + break; + case MIGRATEPAGE_SUCCESS: + stats->nr_succeeded += nr_pages; + stats->nr_thp_succeeded += is_thp; + break; + default: + *nr_failed += 1; + stats->nr_thp_failed += is_thp; + stats->nr_failed_pages += nr_pages; + break; + } + dst = dst2; + dst2 = list_next_entry(dst, lru); + } +} + +static void migrate_folios_undo(struct list_head *src_folios, + struct list_head *dst_folios, + free_folio_t put_new_folio, unsigned long private, + struct list_head *ret_folios) +{ + struct folio *folio, *folio2, *dst, *dst2; + + dst = list_first_entry(dst_folios, struct folio, lru); + dst2 = list_next_entry(dst, lru); + list_for_each_entry_safe(folio, folio2, src_folios, lru) { + int old_page_state = 0; + struct anon_vma *anon_vma = NULL; + + __migrate_folio_extract(dst, &old_page_state, &anon_vma); + migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED, + anon_vma, true, ret_folios); + list_del(&dst->lru); + migrate_folio_undo_dst(dst, true, put_new_folio, private); + dst = dst2; + dst2 = list_next_entry(dst, lru); + } +} + /* * migrate_pages_batch() first unmaps folios in the from list as many as * possible, then move the unmapped folios. @@ -1717,7 +1725,7 @@ static int migrate_pages_batch(struct list_head *from, int pass = 0; bool is_thp = false; bool is_large = false; - struct folio *folio, *folio2, *dst = NULL, *dst2; + struct folio *folio, *folio2, *dst = NULL; int rc, rc_saved = 0, nr_pages; LIST_HEAD(unmap_folios); LIST_HEAD(dst_folios); @@ -1888,42 +1896,11 @@ move: thp_retry = 0; nr_retry_pages = 0; - dst = list_first_entry(&dst_folios, struct folio, lru); - dst2 = list_next_entry(dst, lru); - list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) { - is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio); - nr_pages = folio_nr_pages(folio); - - cond_resched(); - - rc = migrate_folio_move(put_new_folio, private, - folio, dst, mode, - reason, ret_folios); - /* - * The rules are: - * Success: folio will be freed - * -EAGAIN: stay on the unmap_folios list - * Other errno: put on ret_folios list - */ - switch(rc) { - case -EAGAIN: - retry++; - thp_retry += is_thp; - nr_retry_pages += nr_pages; - break; - case MIGRATEPAGE_SUCCESS: - stats->nr_succeeded += nr_pages; - stats->nr_thp_succeeded += is_thp; - break; - default: - nr_failed++; - stats->nr_thp_failed += is_thp; - stats->nr_failed_pages += nr_pages; - break; - } - dst = dst2; - dst2 = list_next_entry(dst, lru); - } + /* Move the unmapped folios */ + migrate_folios_move(&unmap_folios, &dst_folios, + put_new_folio, private, mode, reason, + ret_folios, stats, &retry, &thp_retry, + &nr_failed, &nr_retry_pages); } nr_failed += retry; stats->nr_thp_failed += thp_retry; @@ -1932,20 +1909,8 @@ move: rc = rc_saved ? : nr_failed; out: /* Cleanup remaining folios */ - dst = list_first_entry(&dst_folios, struct folio, lru); - dst2 = list_next_entry(dst, lru); - list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) { - int old_page_state = 0; - struct anon_vma *anon_vma = NULL; - - __migrate_folio_extract(dst, &old_page_state, &anon_vma); - migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED, - anon_vma, true, ret_folios); - list_del(&dst->lru); - migrate_folio_undo_dst(dst, true, put_new_folio, private); - dst = dst2; - dst2 = list_next_entry(dst, lru); - } + migrate_folios_undo(&unmap_folios, &dst_folios, + put_new_folio, private, ret_folios); return rc; } @@ -2204,11 +2169,11 @@ static int __add_folio_for_migration(struct folio *folio, int node, if (folio_nid(folio) == node) return 0; - if (folio_likely_mapped_shared(folio) && !migrate_all) + if (folio_maybe_mapped_shared(folio) && !migrate_all) return -EACCES; if (folio_test_hugetlb(folio)) { - if (isolate_hugetlb(folio, pagelist)) + if (folio_isolate_hugetlb(folio, pagelist)) return 1; } else if (folio_isolate_lru(folio)) { list_add_tail(&folio->lru, pagelist); @@ -2629,11 +2594,10 @@ int migrate_misplaced_folio_prepare(struct folio *folio, * processes with execute permissions as they are probably * shared libraries. * - * See folio_likely_mapped_shared() on possible imprecision + * See folio_maybe_mapped_shared() on possible imprecision * when we cannot easily detect if a folio is shared. */ - if ((vma->vm_flags & VM_EXEC) && - folio_likely_mapped_shared(folio)) + if ((vma->vm_flags & VM_EXEC) && folio_maybe_mapped_shared(folio)) return -EACCES; /* @@ -2683,8 +2647,7 @@ int migrate_misplaced_folio_prepare(struct folio *folio, * elevated reference count on the folio. This function will un-isolate the * folio, dereferencing the folio before returning. */ -int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, - int node) +int migrate_misplaced_folio(struct folio *folio, int node) { pg_data_t *pgdat = NODE_DATA(node); int nr_remaining; diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 9cf26592ac93..3158afe7eb23 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -60,6 +60,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, struct mm_walk *walk) { struct migrate_vma *migrate = walk->private; + struct folio *fault_folio = migrate->fault_page ? + page_folio(migrate->fault_page) : NULL; struct vm_area_struct *vma = walk->vma; struct mm_struct *mm = vma->vm_mm; unsigned long addr = start, unmapped = 0; @@ -88,11 +90,16 @@ again: folio_get(folio); spin_unlock(ptl); + /* FIXME: we don't expect THP for fault_folio */ + if (WARN_ON_ONCE(fault_folio == folio)) + return migrate_vma_collect_skip(start, end, + walk); if (unlikely(!folio_trylock(folio))) return migrate_vma_collect_skip(start, end, walk); ret = split_folio(folio); - folio_unlock(folio); + if (fault_folio != folio) + folio_unlock(folio); folio_put(folio); if (ret) return migrate_vma_collect_skip(start, end, @@ -106,6 +113,7 @@ again: arch_enter_lazy_mmu_mode(); for (; addr < end; addr += PAGE_SIZE, ptep++) { + struct dev_pagemap *pgmap; unsigned long mpfn = 0, pfn; struct folio *folio; struct page *page; @@ -133,9 +141,10 @@ again: goto next; page = pfn_swap_entry_to_page(entry); + pgmap = page_pgmap(page); if (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_PRIVATE) || - page->pgmap->owner != migrate->pgmap_owner) + pgmap->owner != migrate->pgmap_owner) goto next; mpfn = migrate_pfn(page_to_pfn(page)) | @@ -152,12 +161,16 @@ again: } page = vm_normal_page(migrate->vma, addr, pte); if (page && !is_zone_device_page(page) && - !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) - goto next; - else if (page && is_device_coherent_page(page) && - (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_COHERENT) || - page->pgmap->owner != migrate->pgmap_owner)) + !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) { goto next; + } else if (page && is_device_coherent_page(page)) { + pgmap = page_pgmap(page); + + if (!(migrate->flags & + MIGRATE_VMA_SELECT_DEVICE_COHERENT) || + pgmap->owner != migrate->pgmap_owner) + goto next; + } mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; } @@ -192,7 +205,7 @@ again: * optimisation to avoid walking the rmap later with * try_to_migrate(). */ - if (folio_trylock(folio)) { + if (fault_folio == folio || folio_trylock(folio)) { bool anon_exclusive; pte_t swp_pte; @@ -204,7 +217,8 @@ again: if (folio_try_share_anon_rmap_pte(folio, page)) { set_pte_at(mm, addr, ptep, pte); - folio_unlock(folio); + if (fault_folio != folio) + folio_unlock(folio); folio_put(folio); mpfn = 0; goto next; @@ -363,6 +377,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, unsigned long npages, struct page *fault_page) { + struct folio *fault_folio = fault_page ? + page_folio(fault_page) : NULL; unsigned long i, restore = 0; bool allow_drain = true; unsigned long unmapped = 0; @@ -427,7 +443,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, remove_migration_ptes(folio, folio, 0); src_pfns[i] = 0; - folio_unlock(folio); + if (fault_folio != folio) + folio_unlock(folio); folio_put(folio); restore--; } @@ -536,6 +553,8 @@ int migrate_vma_setup(struct migrate_vma *args) return -EINVAL; if (args->fault_page && !is_device_private_page(args->fault_page)) return -EINVAL; + if (args->fault_page && !PageLocked(args->fault_page)) + return -EINVAL; memset(args->src, 0, sizeof(*args->src) * nr_pages); args->cpages = 0; @@ -799,19 +818,13 @@ void migrate_vma_pages(struct migrate_vma *migrate) } EXPORT_SYMBOL(migrate_vma_pages); -/* - * migrate_device_finalize() - complete page migration - * @src_pfns: src_pfns returned from migrate_device_range() - * @dst_pfns: array of pfns allocated by the driver to migrate memory to - * @npages: number of pages in the range - * - * Completes migration of the page by removing special migration entries. - * Drivers must ensure copying of page data is complete and visible to the CPU - * before calling this. - */ -void migrate_device_finalize(unsigned long *src_pfns, - unsigned long *dst_pfns, unsigned long npages) +static void __migrate_device_finalize(unsigned long *src_pfns, + unsigned long *dst_pfns, + unsigned long npages, + struct page *fault_page) { + struct folio *fault_folio = fault_page ? + page_folio(fault_page) : NULL; unsigned long i; for (i = 0; i < npages; i++) { @@ -824,6 +837,7 @@ void migrate_device_finalize(unsigned long *src_pfns, if (!page) { if (dst) { + WARN_ON_ONCE(fault_folio == dst); folio_unlock(dst); folio_put(dst); } @@ -834,29 +848,43 @@ void migrate_device_finalize(unsigned long *src_pfns, if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !dst) { if (dst) { + WARN_ON_ONCE(fault_folio == dst); folio_unlock(dst); folio_put(dst); } dst = src; } + if (!folio_is_zone_device(dst)) + folio_add_lru(dst); remove_migration_ptes(src, dst, 0); - folio_unlock(src); - - if (folio_is_zone_device(src)) - folio_put(src); - else - folio_putback_lru(src); + if (fault_folio != src) + folio_unlock(src); + folio_put(src); if (dst != src) { + WARN_ON_ONCE(fault_folio == dst); folio_unlock(dst); - if (folio_is_zone_device(dst)) - folio_put(dst); - else - folio_putback_lru(dst); + folio_put(dst); } } } + +/* + * migrate_device_finalize() - complete page migration + * @src_pfns: src_pfns returned from migrate_device_range() + * @dst_pfns: array of pfns allocated by the driver to migrate memory to + * @npages: number of pages in the range + * + * Completes migration of the page by removing special migration entries. + * Drivers must ensure copying of page data is complete and visible to the CPU + * before calling this. + */ +void migrate_device_finalize(unsigned long *src_pfns, + unsigned long *dst_pfns, unsigned long npages) +{ + return __migrate_device_finalize(src_pfns, dst_pfns, npages, NULL); +} EXPORT_SYMBOL(migrate_device_finalize); /** @@ -872,10 +900,27 @@ EXPORT_SYMBOL(migrate_device_finalize); */ void migrate_vma_finalize(struct migrate_vma *migrate) { - migrate_device_finalize(migrate->src, migrate->dst, migrate->npages); + __migrate_device_finalize(migrate->src, migrate->dst, migrate->npages, + migrate->fault_page); } EXPORT_SYMBOL(migrate_vma_finalize); +static unsigned long migrate_device_pfn_lock(unsigned long pfn) +{ + struct folio *folio; + + folio = folio_get_nontail_page(pfn_to_page(pfn)); + if (!folio) + return 0; + + if (!folio_trylock(folio)) { + folio_put(folio); + return 0; + } + + return migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; +} + /** * migrate_device_range() - migrate device private pfns to normal memory. * @src_pfns: array large enough to hold migrating source device private pfns. @@ -900,29 +945,35 @@ int migrate_device_range(unsigned long *src_pfns, unsigned long start, { unsigned long i, pfn; - for (pfn = start, i = 0; i < npages; pfn++, i++) { - struct folio *folio; + for (pfn = start, i = 0; i < npages; pfn++, i++) + src_pfns[i] = migrate_device_pfn_lock(pfn); - folio = folio_get_nontail_page(pfn_to_page(pfn)); - if (!folio) { - src_pfns[i] = 0; - continue; - } + migrate_device_unmap(src_pfns, npages, NULL); - if (!folio_trylock(folio)) { - src_pfns[i] = 0; - folio_put(folio); - continue; - } + return 0; +} +EXPORT_SYMBOL(migrate_device_range); - src_pfns[i] = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; - } +/** + * migrate_device_pfns() - migrate device private pfns to normal memory. + * @src_pfns: pre-popluated array of source device private pfns to migrate. + * @npages: number of pages to migrate. + * + * Similar to migrate_device_range() but supports non-contiguous pre-popluated + * array of device pages to migrate. + */ +int migrate_device_pfns(unsigned long *src_pfns, unsigned long npages) +{ + unsigned long i; + + for (i = 0; i < npages; i++) + src_pfns[i] = migrate_device_pfn_lock(src_pfns[i]); migrate_device_unmap(src_pfns, npages, NULL); return 0; } -EXPORT_SYMBOL(migrate_device_range); +EXPORT_SYMBOL(migrate_device_pfns); /* * Migrate a device coherent folio back to normal memory. The caller should have diff --git a/mm/mincore.c b/mm/mincore.c index d6bd19e520fc..42d6c9c8da86 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -21,6 +21,7 @@ #include <linux/uaccess.h> #include "swap.h" +#include "internal.h" static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -105,6 +106,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t *ptep; unsigned char *vec = walk->private; int nr = (end - addr) >> PAGE_SHIFT; + int step, i; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { @@ -118,16 +120,26 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, walk->action = ACTION_AGAIN; return 0; } - for (; addr != end; ptep++, addr += PAGE_SIZE) { + for (; addr != end; ptep += step, addr += step * PAGE_SIZE) { pte_t pte = ptep_get(ptep); + step = 1; /* We need to do cache lookup too for pte markers */ if (pte_none_mostly(pte)) __mincore_unmapped_range(addr, addr + PAGE_SIZE, vma, vec); - else if (pte_present(pte)) - *vec = 1; - else { /* pte is a swap entry */ + else if (pte_present(pte)) { + unsigned int batch = pte_batch_hint(ptep, pte); + + if (batch > 1) { + unsigned int max_nr = (end - addr) >> PAGE_SHIFT; + + step = min_t(unsigned int, batch, max_nr); + } + + for (i = 0; i < step; i++) + vec[i] = 1; + } else { /* pte is a swap entry */ swp_entry_t entry = pte_to_swp_entry(pte); if (non_swap_entry(entry)) { @@ -146,7 +158,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, #endif } } - vec++; + vec += step; } pte_unmap_unlock(ptep - 1, ptl); out: @@ -239,7 +251,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, start = untagged_addr(start); /* Check the start address: needs to be page-aligned.. */ - if (start & ~PAGE_MASK) + if (unlikely(start & ~PAGE_MASK)) return -EINVAL; /* ..and we need to be passed a valid user-space range */ diff --git a/mm/mlock.c b/mm/mlock.c index cde076fa7d5e..3cb72b579ffd 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -368,6 +368,8 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, if (is_huge_zero_pmd(*pmd)) goto out; folio = pmd_folio(*pmd); + if (folio_is_zone_device(folio)) + goto out; if (vma->vm_flags & VM_LOCKED) mlock_folio(folio); else diff --git a/mm/mm_init.c b/mm/mm_init.c index 24b68b425afb..f2944748f526 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -30,12 +30,29 @@ #include <linux/crash_dump.h> #include <linux/execmem.h> #include <linux/vmstat.h> +#include <linux/kexec_handover.h> +#include <linux/hugetlb.h> #include "internal.h" #include "slab.h" #include "shuffle.h" #include <asm/setup.h> +#ifndef CONFIG_NUMA +unsigned long max_mapnr; +EXPORT_SYMBOL(max_mapnr); + +struct page *mem_map; +EXPORT_SYMBOL(mem_map); +#endif + +/* + * high_memory defines the upper bound on direct map memory, then end + * of ZONE_NORMAL. + */ +void *high_memory; +EXPORT_SYMBOL(high_memory); + #ifdef CONFIG_DEBUG_MEMORY_INIT int __meminitdata mminit_loglevel; @@ -438,7 +455,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) * was requested by the user */ required_movablecore = - roundup(required_movablecore, MAX_ORDER_NR_PAGES); + round_up(required_movablecore, MAX_ORDER_NR_PAGES); required_movablecore = min(totalpages, required_movablecore); corepages = totalpages - required_movablecore; @@ -545,11 +562,11 @@ restart: out2: /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ - for (nid = 0; nid < MAX_NUMNODES; nid++) { + for_each_node_state(nid, N_MEMORY) { unsigned long start_pfn, end_pfn; zone_movable_pfn[nid] = - roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); + round_up(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); if (zone_movable_pfn[nid] >= end_pfn) @@ -649,6 +666,28 @@ static inline void fixup_hashdist(void) static inline void fixup_hashdist(void) {} #endif /* CONFIG_NUMA */ +/* + * Initialize a reserved page unconditionally, finding its zone first. + */ +void __meminit __init_page_from_nid(unsigned long pfn, int nid) +{ + pg_data_t *pgdat; + int zid; + + pgdat = NODE_DATA(nid); + + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + struct zone *zone = &pgdat->node_zones[zid]; + + if (zone_spans_pfn(zone, pfn)) + break; + } + __init_single_page(pfn_to_page(pfn), pfn, zid, nid); + + if (pageblock_aligned(pfn)) + set_pageblock_migratetype(pfn_to_page(pfn), MIGRATE_MOVABLE); +} + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT static inline void pgdat_set_deferred_range(pg_data_t *pgdat) { @@ -705,26 +744,12 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } -static void __meminit init_reserved_page(unsigned long pfn, int nid) +static void __meminit __init_deferred_page(unsigned long pfn, int nid) { - pg_data_t *pgdat; - int zid; - if (early_page_initialised(pfn, nid)) return; - pgdat = NODE_DATA(nid); - - for (zid = 0; zid < MAX_NR_ZONES; zid++) { - struct zone *zone = &pgdat->node_zones[zid]; - - if (zone_spans_pfn(zone, pfn)) - break; - } - __init_single_page(pfn_to_page(pfn), pfn, zid, nid); - - if (pageblock_aligned(pfn)) - set_pageblock_migratetype(pfn_to_page(pfn), MIGRATE_MOVABLE); + __init_page_from_nid(pfn, nid); } #else static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {} @@ -739,11 +764,16 @@ static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } -static inline void init_reserved_page(unsigned long pfn, int nid) +static inline void __init_deferred_page(unsigned long pfn, int nid) { } #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ +void __meminit init_deferred_page(unsigned long pfn, int nid) +{ + __init_deferred_page(pfn, nid); +} + /* * Initialised pages do not have PageReserved set. This function is * called for each range allocated by the bootmem allocator and @@ -753,22 +783,19 @@ static inline void init_reserved_page(unsigned long pfn, int nid) void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end, int nid) { - unsigned long start_pfn = PFN_DOWN(start); - unsigned long end_pfn = PFN_UP(end); + unsigned long pfn; - for (; start_pfn < end_pfn; start_pfn++) { - if (pfn_valid(start_pfn)) { - struct page *page = pfn_to_page(start_pfn); + for_each_valid_pfn(pfn, PFN_DOWN(start), PFN_UP(end)) { + struct page *page = pfn_to_page(pfn); - init_reserved_page(start_pfn, nid); + __init_deferred_page(pfn, nid); - /* - * no need for atomic set_bit because the struct - * page is not visible yet so nobody should - * access it yet. - */ - __SetPageReserved(page); - } + /* + * no need for atomic set_bit because the struct + * page is not visible yet so nobody should + * access it yet. + */ + __SetPageReserved(page); } } @@ -804,7 +831,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) * - physical memory bank size is not necessarily the exact multiple of the * arbitrary section size * - early reserved memory may not be listed in memblock.memory - * - non-memory regions covered by the contigious flatmem mapping + * - non-memory regions covered by the contiguous flatmem mapping * - memory layouts defined with memmap= kernel parameter may not align * nicely with memmap sections * @@ -824,11 +851,7 @@ static void __init init_unavailable_range(unsigned long spfn, unsigned long pfn; u64 pgcnt = 0; - for (pfn = spfn; pfn < epfn; pfn++) { - if (!pfn_valid(pageblock_start_pfn(pfn))) { - pfn = pageblock_end_pfn(pfn) - 1; - continue; - } + for_each_valid_pfn(pfn, spfn, epfn) { __init_single_page(pfn_to_page(pfn), pfn, zone, node); __SetPageReserved(pfn_to_page(pfn)); pgcnt++; @@ -960,19 +983,19 @@ static void __init memmap_init(void) } } -#ifdef CONFIG_SPARSEMEM /* * Initialize the memory map for hole in the range [memory_end, - * section_end]. + * section_end] for SPARSEMEM and in the range [memory_end, memmap_end] + * for FLATMEM. * Append the pages in this hole to the highest zone in the last * node. - * The call to init_unavailable_range() is outside the ifdef to - * silence the compiler warining about zone_id set but not used; - * for FLATMEM it is a nop anyway */ +#ifdef CONFIG_SPARSEMEM end_pfn = round_up(end_pfn, PAGES_PER_SECTION); - if (hole_pfn < end_pfn) +#else + end_pfn = round_up(end_pfn, MAX_ORDER_NR_PAGES); #endif + if (hole_pfn < end_pfn) init_unavailable_range(hole_pfn, end_pfn, zone_id, nid); } @@ -998,7 +1021,7 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, * and zone_device_data. It is a bug if a ZONE_DEVICE page is * ever freed or placed on a driver-private list. */ - page->pgmap = pgmap; + page_folio(page)->pgmap = pgmap; page->zone_device_data = NULL; /* @@ -1017,12 +1040,25 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, } /* - * ZONE_DEVICE pages are released directly to the driver page allocator - * which will set the page count to 1 when allocating the page. + * ZONE_DEVICE pages other than MEMORY_TYPE_GENERIC are released + * directly to the driver page allocator which will set the page count + * to 1 when allocating the page. + * + * MEMORY_TYPE_GENERIC and MEMORY_TYPE_FS_DAX pages automatically have + * their refcount reset to one whenever they are freed (ie. after + * their refcount drops to 0). */ - if (pgmap->type == MEMORY_DEVICE_PRIVATE || - pgmap->type == MEMORY_DEVICE_COHERENT) + switch (pgmap->type) { + case MEMORY_DEVICE_FS_DAX: + case MEMORY_DEVICE_PRIVATE: + case MEMORY_DEVICE_COHERENT: + case MEMORY_DEVICE_PCI_P2PDMA: set_page_count(page, 0); + break; + + case MEMORY_DEVICE_GENERIC: + break; + } } /* @@ -1431,7 +1467,7 @@ void __meminit init_currently_empty_zone(struct zone *zone, #ifndef CONFIG_SPARSEMEM /* - * Calculate the size of the zone->blockflags rounded to an unsigned long + * Calculate the size of the zone->pageblock_flags rounded to an unsigned long * Start by making sure zonesize is a multiple of pageblock_order by rounding * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally * round what is now in bits to nearest long in bits, then return it in @@ -1442,10 +1478,10 @@ static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned l unsigned long usemapsize; zonesize += zone_start_pfn & (pageblock_nr_pages-1); - usemapsize = roundup(zonesize, pageblock_nr_pages); + usemapsize = round_up(zonesize, pageblock_nr_pages); usemapsize = usemapsize >> pageblock_order; usemapsize *= NR_PAGEBLOCK_BITS; - usemapsize = roundup(usemapsize, BITS_PER_LONG); + usemapsize = round_up(usemapsize, BITS_PER_LONG); return usemapsize / BITS_PER_BYTE; } @@ -1473,7 +1509,7 @@ static inline void setup_usemap(struct zone *zone) {} /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ void __init set_pageblock_order(void) { - unsigned int order = MAX_PAGE_ORDER; + unsigned int order = PAGE_BLOCK_ORDER; /* Check that pageblock_nr_pages has not already been setup */ if (pageblock_order) @@ -1585,13 +1621,17 @@ void __init *memmap_alloc(phys_addr_t size, phys_addr_t align, { void *ptr; + /* + * Kmemleak will explicitly scan mem_map by traversing all valid + * `struct *page`,so memblock does not need to be added to the scan list. + */ if (exact_nid) ptr = memblock_alloc_exact_nid_raw(size, align, min_addr, - MEMBLOCK_ALLOC_ACCESSIBLE, + MEMBLOCK_ALLOC_NOLEAKTRACE, nid); else ptr = memblock_alloc_try_nid_raw(size, align, min_addr, - MEMBLOCK_ALLOC_ACCESSIBLE, + MEMBLOCK_ALLOC_NOLEAKTRACE, nid); if (ptr && size > 0) @@ -1613,7 +1653,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); offset = pgdat->node_start_pfn - start; /* - * The zone's endpoints aren't required to be MAX_PAGE_ORDER + * The zone's endpoints aren't required to be MAX_PAGE_ORDER * aligned but the node_mem_map endpoints must be in order * for the buddy allocator to function correctly. */ @@ -1629,14 +1669,15 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", __func__, pgdat->node_id, (unsigned long)pgdat, (unsigned long)pgdat->node_mem_map); -#ifndef CONFIG_NUMA + /* the global mem_map is just set as node 0's */ - if (pgdat == NODE_DATA(0)) { - mem_map = NODE_DATA(0)->node_mem_map; - if (page_to_pfn(mem_map) != pgdat->node_start_pfn) - mem_map -= offset; - } -#endif + WARN_ON(pgdat != NODE_DATA(0)); + + mem_map = pgdat->node_mem_map; + if (page_to_pfn(mem_map) != pgdat->node_start_pfn) + mem_map -= offset; + + max_mapnr = end - start; } #else static inline void alloc_node_mem_map(struct pglist_data *pgdat) { } @@ -1743,6 +1784,27 @@ static bool arch_has_descending_max_zone_pfns(void) return IS_ENABLED(CONFIG_ARC) && !IS_ENABLED(CONFIG_ARC_HAS_PAE40); } +static void __init set_high_memory(void) +{ + phys_addr_t highmem = memblock_end_of_DRAM(); + + /* + * Some architectures (e.g. ARM) set high_memory very early and + * use it in arch setup code. + * If an architecture already set high_memory don't overwrite it + */ + if (high_memory) + return; + +#ifdef CONFIG_HIGHMEM + if (arch_has_descending_max_zone_pfns() || + highmem > PFN_PHYS(arch_zone_lowest_possible_pfn[ZONE_HIGHMEM])) + highmem = PFN_PHYS(arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]); +#endif + + high_memory = phys_to_virt(highmem - 1) + 1; +} + /** * free_area_init - Initialise all pg_data_t and zone data * @max_zone_pfn: an array of max PFNs for each zone @@ -1844,7 +1906,7 @@ void __init free_area_init(unsigned long *max_zone_pfn) free_area_init_node(nid); /* - * No sysfs hierarcy will be created via register_one_node() + * No sysfs hierarchy will be created via register_one_node() *for memory-less node because here it's not marked as N_MEMORY *and won't be set online later. The benefit is userspace *program won't be confused by sysfs files/directories of @@ -1857,11 +1919,16 @@ void __init free_area_init(unsigned long *max_zone_pfn) } } + for_each_node_state(nid, N_MEMORY) + sparse_vmemmap_init_nid_late(nid); + calc_nr_kernel_pages(); memmap_init(); /* disable hash distribution for systems with a single node */ fixup_hashdist(); + + set_high_memory(); } /** @@ -2247,6 +2314,15 @@ void __init init_cma_reserved_pageblock(struct page *page) adjust_managed_page_count(page, pageblock_nr_pages); page_zone(page)->cma_pages += pageblock_nr_pages; } +/* + * Similar to above, but only set the migrate type and stats. + */ +void __init init_cma_pageblock(struct page *page) +{ + set_pageblock_migratetype(page, MIGRATE_CMA); + adjust_managed_page_count(page, pageblock_nr_pages); + page_zone(page)->cma_pages += pageblock_nr_pages; +} #endif void set_zone_contiguous(struct zone *zone) @@ -2271,6 +2347,31 @@ void set_zone_contiguous(struct zone *zone) zone->contiguous = true; } +/* + * Check if a PFN range intersects multiple zones on one or more + * NUMA nodes. Specify the @nid argument if it is known that this + * PFN range is on one node, NUMA_NO_NODE otherwise. + */ +bool pfn_range_intersects_zones(int nid, unsigned long start_pfn, + unsigned long nr_pages) +{ + struct zone *zone, *izone = NULL; + + for_each_zone(zone) { + if (nid != NUMA_NO_NODE && zone_to_nid(zone) != nid) + continue; + + if (zone_intersects(zone, start_pfn, nr_pages)) { + if (izone != NULL) + return true; + izone = zone; + } + + } + + return false; +} + static void __init mem_init_print_info(void); void __init page_alloc_init_late(void) { @@ -2565,12 +2666,6 @@ static void __init report_meminit(void) stack = "all(pattern)"; else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO)) stack = "all(zero)"; - else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL)) - stack = "byref_all(zero)"; - else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF)) - stack = "byref(zero)"; - else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER)) - stack = "__user(zero)"; else stack = "off"; @@ -2632,11 +2727,22 @@ static void __init mem_init_print_info(void) ); } +void __init __weak arch_mm_preinit(void) +{ +} + +void __init __weak mem_init(void) +{ +} + /* * Set up kernel memory allocators */ void __init mm_core_init(void) { + arch_mm_preinit(); + hugetlb_bootmem_alloc(); + /* Initializations relying on SMP setup */ BUILD_BUG_ON(MAX_ZONELISTS > 2); build_all_zonelists(NULL); @@ -2652,6 +2758,14 @@ void __init mm_core_init(void) report_meminit(); kmsan_init_shadow(); stack_depot_early_init(); + + /* + * KHO memory setup must happen while memblock is still active, but + * as close as possible to buddy initialization + */ + kho_memory_init(); + + memblock_free_all(); mem_init(); kmem_cache_init(); /* diff --git a/mm/mmap.c b/mm/mmap.c index 386429f7db5a..09c563c95112 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -47,6 +47,7 @@ #include <linux/oom.h> #include <linux/sched/mm.h> #include <linux/ksm.h> +#include <linux/memfd.h> #include <linux/uaccess.h> #include <asm/cacheflush.h> @@ -110,8 +111,7 @@ static int check_brk_limits(unsigned long addr, unsigned long len) return mlock_future_ok(current->mm, current->mm->def_flags, len) ? 0 : -EAGAIN; } -static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, - unsigned long addr, unsigned long request, unsigned long flags); + SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long newbrk, oldbrk, origbrk; @@ -277,8 +277,62 @@ static inline bool file_mmap_ok(struct file *file, struct inode *inode, return true; } -/* +/** + * do_mmap() - Perform a userland memory mapping into the current process + * address space of length @len with protection bits @prot, mmap flags @flags + * (from which VMA flags will be inferred), and any additional VMA flags to + * apply @vm_flags. If this is a file-backed mapping then the file is specified + * in @file and page offset into the file via @pgoff. + * + * This function does not perform security checks on the file and assumes, if + * @uf is non-NULL, the caller has provided a list head to track unmap events + * for userfaultfd @uf. + * + * It also simply indicates whether memory population is required by setting + * @populate, which must be non-NULL, expecting the caller to actually perform + * this task itself if appropriate. + * + * This function will invoke architecture-specific (and if provided and + * relevant, file system-specific) logic to determine the most appropriate + * unmapped area in which to place the mapping if not MAP_FIXED. + * + * Callers which require userland mmap() behaviour should invoke vm_mmap(), + * which is also exported for module use. + * + * Those which require this behaviour less security checks, userfaultfd and + * populate behaviour, and who handle the mmap write lock themselves, should + * call this function. + * + * Note that the returned address may reside within a merged VMA if an + * appropriate merge were to take place, so it doesn't necessarily specify the + * start of a VMA, rather only the start of a valid mapped range of length + * @len bytes, rounded down to the nearest page size. + * * The caller must write-lock current->mm->mmap_lock. + * + * @file: An optional struct file pointer describing the file which is to be + * mapped, if a file-backed mapping. + * @addr: If non-zero, hints at (or if @flags has MAP_FIXED set, specifies) the + * address at which to perform this mapping. See mmap (2) for details. Must be + * page-aligned. + * @len: The length of the mapping. Will be page-aligned and must be at least 1 + * page in size. + * @prot: Protection bits describing access required to the mapping. See mmap + * (2) for details. + * @flags: Flags specifying how the mapping should be performed, see mmap (2) + * for details. + * @vm_flags: VMA flags which should be set by default, or 0 otherwise. + * @pgoff: Page offset into the @file if file-backed, should be 0 otherwise. + * @populate: A pointer to a value which will be set to 0 if no population of + * the range is required, or the number of bytes to populate if it is. Must be + * non-NULL. See mmap (2) for details as to under what circumstances population + * of the range occurs. + * @uf: An optional pointer to a list head to track userfaultfd unmap events + * should unmapping events arise. If provided, it is up to the caller to manage + * this. + * + * Returns: Either an error, or the address at which the requested mapping has + * been performed. */ unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, @@ -291,6 +345,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr, *populate = 0; + mmap_assert_write_locked(mm); + if (!len) return -EINVAL; @@ -369,6 +425,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (file) { struct inode *inode = file_inode(file); unsigned long flags_mask; + int err; if (!file_mmap_ok(file, inode, pgoff, len)) return -EOVERFLOW; @@ -418,7 +475,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags &= ~VM_MAYEXEC; } - if (!file->f_op->mmap) + if (!file_has_valid_mmap_hooks(file)) return -ENODEV; if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) return -EINVAL; @@ -427,6 +484,14 @@ unsigned long do_mmap(struct file *file, unsigned long addr, default: return -EINVAL; } + + /* + * Check to see if we are violating any seals and update VMA + * flags if necessary to avoid future seal violations. + */ + err = memfd_check_seals_mmap(file, &vm_flags); + if (err) + return (unsigned long)err; } else { switch (flags & MAP_TYPE) { case MAP_SHARED: @@ -577,115 +642,6 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) } #endif /* __ARCH_WANT_SYS_OLD_MMAP */ -/** - * unmapped_area() - Find an area between the low_limit and the high_limit with - * the correct alignment and offset, all from @info. Note: current->mm is used - * for the search. - * - * @info: The unmapped area information including the range [low_limit - - * high_limit), the alignment offset and mask. - * - * Return: A memory address or -ENOMEM. - */ -static unsigned long unmapped_area(struct vm_unmapped_area_info *info) -{ - unsigned long length, gap; - unsigned long low_limit, high_limit; - struct vm_area_struct *tmp; - VMA_ITERATOR(vmi, current->mm, 0); - - /* Adjust search length to account for worst case alignment overhead */ - length = info->length + info->align_mask + info->start_gap; - if (length < info->length) - return -ENOMEM; - - low_limit = info->low_limit; - if (low_limit < mmap_min_addr) - low_limit = mmap_min_addr; - high_limit = info->high_limit; -retry: - if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length)) - return -ENOMEM; - - /* - * Adjust for the gap first so it doesn't interfere with the - * later alignment. The first step is the minimum needed to - * fulill the start gap, the next steps is the minimum to align - * that. It is the minimum needed to fulill both. - */ - gap = vma_iter_addr(&vmi) + info->start_gap; - gap += (info->align_offset - gap) & info->align_mask; - tmp = vma_next(&vmi); - if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ - if (vm_start_gap(tmp) < gap + length - 1) { - low_limit = tmp->vm_end; - vma_iter_reset(&vmi); - goto retry; - } - } else { - tmp = vma_prev(&vmi); - if (tmp && vm_end_gap(tmp) > gap) { - low_limit = vm_end_gap(tmp); - vma_iter_reset(&vmi); - goto retry; - } - } - - return gap; -} - -/** - * unmapped_area_topdown() - Find an area between the low_limit and the - * high_limit with the correct alignment and offset at the highest available - * address, all from @info. Note: current->mm is used for the search. - * - * @info: The unmapped area information including the range [low_limit - - * high_limit), the alignment offset and mask. - * - * Return: A memory address or -ENOMEM. - */ -static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) -{ - unsigned long length, gap, gap_end; - unsigned long low_limit, high_limit; - struct vm_area_struct *tmp; - VMA_ITERATOR(vmi, current->mm, 0); - - /* Adjust search length to account for worst case alignment overhead */ - length = info->length + info->align_mask + info->start_gap; - if (length < info->length) - return -ENOMEM; - - low_limit = info->low_limit; - if (low_limit < mmap_min_addr) - low_limit = mmap_min_addr; - high_limit = info->high_limit; -retry: - if (vma_iter_area_highest(&vmi, low_limit, high_limit, length)) - return -ENOMEM; - - gap = vma_iter_end(&vmi) - info->length; - gap -= (gap - info->align_offset) & info->align_mask; - gap_end = vma_iter_end(&vmi); - tmp = vma_next(&vmi); - if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ - if (vm_start_gap(tmp) < gap_end) { - high_limit = vm_start_gap(tmp); - vma_iter_reset(&vmi); - goto retry; - } - } else { - tmp = vma_prev(&vmi); - if (tmp && vm_end_gap(tmp) > gap) { - high_limit = tmp->vm_start; - vma_iter_reset(&vmi); - goto retry; - } - } - - return gap; -} - /* * Determine if the allocation needs to ensure that there is no * existing mapping within it's guard gaps, for use as start_gap. @@ -888,7 +844,8 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (get_area) { addr = get_area(file, addr, len, pgoff, flags); - } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) + } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && !file + && !addr /* no hint */ && IS_ALIGNED(len, PMD_SIZE)) { /* Ensures that larger anonymous mappings are THP aligned. */ addr = thp_get_unmapped_area_vmflags(file, addr, len, @@ -984,211 +941,6 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr, return vma; } -/* - * Verify that the stack growth is acceptable and - * update accounting. This is shared with both the - * grow-up and grow-down cases. - */ -static int acct_stack_growth(struct vm_area_struct *vma, - unsigned long size, unsigned long grow) -{ - struct mm_struct *mm = vma->vm_mm; - unsigned long new_start; - - /* address space limit tests */ - if (!may_expand_vm(mm, vma->vm_flags, grow)) - return -ENOMEM; - - /* Stack limit test */ - if (size > rlimit(RLIMIT_STACK)) - return -ENOMEM; - - /* mlock limit tests */ - if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT)) - return -ENOMEM; - - /* Check to ensure the stack will not grow into a hugetlb-only region */ - new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : - vma->vm_end - size; - if (is_hugepage_only_range(vma->vm_mm, new_start, size)) - return -EFAULT; - - /* - * Overcommit.. This must be the final test, as it will - * update security statistics. - */ - if (security_vm_enough_memory_mm(mm, grow)) - return -ENOMEM; - - return 0; -} - -#if defined(CONFIG_STACK_GROWSUP) -/* - * PA-RISC uses this for its stack. - * vma is the last one with address > vma->vm_end. Have to extend vma. - */ -static int expand_upwards(struct vm_area_struct *vma, unsigned long address) -{ - struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *next; - unsigned long gap_addr; - int error = 0; - VMA_ITERATOR(vmi, mm, vma->vm_start); - - if (!(vma->vm_flags & VM_GROWSUP)) - return -EFAULT; - - mmap_assert_write_locked(mm); - - /* Guard against exceeding limits of the address space. */ - address &= PAGE_MASK; - if (address >= (TASK_SIZE & PAGE_MASK)) - return -ENOMEM; - address += PAGE_SIZE; - - /* Enforce stack_guard_gap */ - gap_addr = address + stack_guard_gap; - - /* Guard against overflow */ - if (gap_addr < address || gap_addr > TASK_SIZE) - gap_addr = TASK_SIZE; - - next = find_vma_intersection(mm, vma->vm_end, gap_addr); - if (next && vma_is_accessible(next)) { - if (!(next->vm_flags & VM_GROWSUP)) - return -ENOMEM; - /* Check that both stack segments have the same anon_vma? */ - } - - if (next) - vma_iter_prev_range_limit(&vmi, address); - - vma_iter_config(&vmi, vma->vm_start, address); - if (vma_iter_prealloc(&vmi, vma)) - return -ENOMEM; - - /* We must make sure the anon_vma is allocated. */ - if (unlikely(anon_vma_prepare(vma))) { - vma_iter_free(&vmi); - return -ENOMEM; - } - - /* Lock the VMA before expanding to prevent concurrent page faults */ - vma_start_write(vma); - /* We update the anon VMA tree. */ - anon_vma_lock_write(vma->anon_vma); - - /* Somebody else might have raced and expanded it already */ - if (address > vma->vm_end) { - unsigned long size, grow; - - size = address - vma->vm_start; - grow = (address - vma->vm_end) >> PAGE_SHIFT; - - error = -ENOMEM; - if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { - error = acct_stack_growth(vma, size, grow); - if (!error) { - if (vma->vm_flags & VM_LOCKED) - mm->locked_vm += grow; - vm_stat_account(mm, vma->vm_flags, grow); - anon_vma_interval_tree_pre_update_vma(vma); - vma->vm_end = address; - /* Overwrite old entry in mtree. */ - vma_iter_store(&vmi, vma); - anon_vma_interval_tree_post_update_vma(vma); - - perf_event_mmap(vma); - } - } - } - anon_vma_unlock_write(vma->anon_vma); - vma_iter_free(&vmi); - validate_mm(mm); - return error; -} -#endif /* CONFIG_STACK_GROWSUP */ - -/* - * vma is the first one with address < vma->vm_start. Have to extend vma. - * mmap_lock held for writing. - */ -int expand_downwards(struct vm_area_struct *vma, unsigned long address) -{ - struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *prev; - int error = 0; - VMA_ITERATOR(vmi, mm, vma->vm_start); - - if (!(vma->vm_flags & VM_GROWSDOWN)) - return -EFAULT; - - mmap_assert_write_locked(mm); - - address &= PAGE_MASK; - if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) - return -EPERM; - - /* Enforce stack_guard_gap */ - prev = vma_prev(&vmi); - /* Check that both stack segments have the same anon_vma? */ - if (prev) { - if (!(prev->vm_flags & VM_GROWSDOWN) && - vma_is_accessible(prev) && - (address - prev->vm_end < stack_guard_gap)) - return -ENOMEM; - } - - if (prev) - vma_iter_next_range_limit(&vmi, vma->vm_start); - - vma_iter_config(&vmi, address, vma->vm_end); - if (vma_iter_prealloc(&vmi, vma)) - return -ENOMEM; - - /* We must make sure the anon_vma is allocated. */ - if (unlikely(anon_vma_prepare(vma))) { - vma_iter_free(&vmi); - return -ENOMEM; - } - - /* Lock the VMA before expanding to prevent concurrent page faults */ - vma_start_write(vma); - /* We update the anon VMA tree. */ - anon_vma_lock_write(vma->anon_vma); - - /* Somebody else might have raced and expanded it already */ - if (address < vma->vm_start) { - unsigned long size, grow; - - size = vma->vm_end - address; - grow = (vma->vm_start - address) >> PAGE_SHIFT; - - error = -ENOMEM; - if (grow <= vma->vm_pgoff) { - error = acct_stack_growth(vma, size, grow); - if (!error) { - if (vma->vm_flags & VM_LOCKED) - mm->locked_vm += grow; - vm_stat_account(mm, vma->vm_flags, grow); - anon_vma_interval_tree_pre_update_vma(vma); - vma->vm_start = address; - vma->vm_pgoff -= grow; - /* Overwrite old entry in mtree. */ - vma_iter_store(&vmi, vma); - anon_vma_interval_tree_post_update_vma(vma); - - perf_event_mmap(vma); - } - } - } - anon_vma_unlock_write(vma->anon_vma); - vma_iter_free(&vmi); - validate_mm(mm); - return error; -} - /* enforced gap between the expanding stack and other mappings. */ unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT; @@ -1320,58 +1072,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, return do_vmi_munmap(&vmi, mm, start, len, uf, false); } -unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf) -{ - unsigned long ret; - bool writable_file_mapping = false; - - /* Check to see if MDWE is applicable. */ - if (map_deny_write_exec(vm_flags, vm_flags)) - return -EACCES; - - /* Allow architectures to sanity-check the vm_flags. */ - if (!arch_validate_flags(vm_flags)) - return -EINVAL; - - /* Map writable and ensure this isn't a sealed memfd. */ - if (file && is_shared_maywrite(vm_flags)) { - int error = mapping_map_writable(file->f_mapping); - - if (error) - return error; - writable_file_mapping = true; - } - - ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf); - - /* Clear our write mapping regardless of error. */ - if (writable_file_mapping) - mapping_unmap_writable(file->f_mapping); - - validate_mm(current->mm); - return ret; -} - -static int __vm_munmap(unsigned long start, size_t len, bool unlock) -{ - int ret; - struct mm_struct *mm = current->mm; - LIST_HEAD(uf); - VMA_ITERATOR(vmi, mm, start); - - if (mmap_write_lock_killable(mm)) - return -EINTR; - - ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock); - if (ret || !unlock) - mmap_write_unlock(mm); - - userfaultfd_unmap_complete(mm, &uf); - return ret; -} - int vm_munmap(unsigned long start, size_t len) { return __vm_munmap(start, len, false); @@ -1507,88 +1207,6 @@ out: return ret; } -/* - * do_brk_flags() - Increase the brk vma if the flags match. - * @vmi: The vma iterator - * @addr: The start address - * @len: The length of the increase - * @vma: The vma, - * @flags: The VMA Flags - * - * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags - * do not match then create a new anonymous VMA. Eventually we may be able to - * do some brk-specific accounting here. - */ -static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, unsigned long len, unsigned long flags) -{ - struct mm_struct *mm = current->mm; - - /* - * Check against address space limits by the changed size - * Note: This happens *after* clearing old mappings in some code paths. - */ - flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) - return -ENOMEM; - - if (mm->map_count > sysctl_max_map_count) - return -ENOMEM; - - if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) - return -ENOMEM; - - /* - * Expand the existing vma if possible; Note that singular lists do not - * occur after forking, so the expand will only happen on new VMAs. - */ - if (vma && vma->vm_end == addr) { - VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr)); - - vmg.prev = vma; - /* vmi is positioned at prev, which this mode expects. */ - vmg.merge_flags = VMG_FLAG_JUST_EXPAND; - - if (vma_merge_new_range(&vmg)) - goto out; - else if (vmg_nomem(&vmg)) - goto unacct_fail; - } - - if (vma) - vma_iter_next_range(vmi); - /* create a vma struct for an anonymous mapping */ - vma = vm_area_alloc(mm); - if (!vma) - goto unacct_fail; - - vma_set_anonymous(vma); - vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT); - vm_flags_init(vma, flags); - vma->vm_page_prot = vm_get_page_prot(flags); - vma_start_write(vma); - if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) - goto mas_store_fail; - - mm->map_count++; - validate_mm(mm); - ksm_add_vma(vma); -out: - perf_event_mmap(vma); - mm->total_vm += len >> PAGE_SHIFT; - mm->data_vm += len >> PAGE_SHIFT; - if (flags & VM_LOCKED) - mm->locked_vm += (len >> PAGE_SHIFT); - vm_flags_set(vma, VM_SOFTDIRTY); - return 0; - -mas_store_fail: - vm_area_free(vma); -unacct_fail: - vm_unacct_memory(len >> PAGE_SHIFT); - return -ENOMEM; -} - int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) { struct mm_struct *mm = current->mm; @@ -1659,7 +1277,6 @@ void exit_mmap(struct mm_struct *mm) goto destroy; } - lru_add_drain(); flush_cache_mm(mm); tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here? but nobody should be looking */ @@ -1688,7 +1305,8 @@ void exit_mmap(struct mm_struct *mm) do { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); - remove_vma(vma, /* unreachable = */ true); + vma_mark_detached(vma); + remove_vma(vma); count++; cond_resched(); vma = vma_next(&vmi); @@ -1703,48 +1321,6 @@ destroy: vm_unacct_memory(nr_accounted); } -/* Insert vm structure into process list sorted by address - * and into the inode's i_mmap tree. If vm_file is non-NULL - * then i_mmap_rwsem is taken here. - */ -int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) -{ - unsigned long charged = vma_pages(vma); - - - if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) - return -ENOMEM; - - if ((vma->vm_flags & VM_ACCOUNT) && - security_vm_enough_memory_mm(mm, charged)) - return -ENOMEM; - - /* - * The vm_pgoff of a purely anonymous vma should be irrelevant - * until its first write fault, when page's anon_vma and index - * are set. But now set the vm_pgoff it will almost certainly - * end up with (unless mremap moves it elsewhere before that - * first wfault), so /proc/pid/maps tells a consistent story. - * - * By setting it to reflect the virtual start address of the - * vma, merges and splits can happen in a seamless way, just - * using the existing file pgoff checks and manipulations. - * Similarly in do_mmap and in do_brk_flags. - */ - if (vma_is_anonymous(vma)) { - BUG_ON(vma->anon_vma); - vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; - } - - if (vma_link(mm, vma)) { - if (vma->vm_flags & VM_ACCOUNT) - vm_unacct_memory(charged); - return -ENOMEM; - } - - return 0; -} - /* * Return true if the calling process may expand its vm space by the passed * number of pages @@ -1926,8 +1502,59 @@ struct vm_area_struct *_install_special_mapping( &special_mapping_vmops); } +#ifdef CONFIG_SYSCTL +#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ + defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) +int sysctl_legacy_va_layout; +#endif + +static const struct ctl_table mmap_table[] = { + { + .procname = "max_map_count", + .data = &sysctl_max_map_count, + .maxlen = sizeof(sysctl_max_map_count), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ + defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) + { + .procname = "legacy_va_layout", + .data = &sysctl_legacy_va_layout, + .maxlen = sizeof(sysctl_legacy_va_layout), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +#endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS + { + .procname = "mmap_rnd_bits", + .data = &mmap_rnd_bits, + .maxlen = sizeof(mmap_rnd_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_bits_min, + .extra2 = (void *)&mmap_rnd_bits_max, + }, +#endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS + { + .procname = "mmap_rnd_compat_bits", + .data = &mmap_rnd_compat_bits, + .maxlen = sizeof(mmap_rnd_compat_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_compat_bits_min, + .extra2 = (void *)&mmap_rnd_compat_bits_max, + }, +#endif +}; +#endif /* CONFIG_SYSCTL */ + /* - * initialise the percpu counter for VM + * initialise the percpu counter for VM, initialise VMA state. */ void __init mmap_init(void) { @@ -1935,6 +1562,10 @@ void __init mmap_init(void) ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); +#ifdef CONFIG_SYSCTL + register_sysctl_init("vm", mmap_table); +#endif + vma_state_init(); } /* @@ -2046,84 +1677,217 @@ static int __meminit init_reserve_notifier(void) subsys_initcall(init_reserve_notifier); /* - * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between - * this VMA and its relocated range, which will now reside at [vma->vm_start - - * shift, vma->vm_end - shift). + * Obtain a read lock on mm->mmap_lock, if the specified address is below the + * start of the VMA, the intent is to perform a write, and it is a + * downward-growing stack, then attempt to expand the stack to contain it. + * + * This function is intended only for obtaining an argument page from an ELF + * image, and is almost certainly NOT what you want to use for any other + * purpose. + * + * IMPORTANT - VMA fields are accessed without an mmap lock being held, so the + * VMA referenced must not be linked in any user-visible tree, i.e. it must be a + * new VMA being mapped. + * + * The function assumes that addr is either contained within the VMA or below + * it, and makes no attempt to validate this value beyond that. + * + * Returns true if the read lock was obtained and a stack was perhaps expanded, + * false if the stack expansion failed. * - * This function is almost certainly NOT what you want for anything other than - * early executable temporary stack relocation. + * On stack expansion the function temporarily acquires an mmap write lock + * before downgrading it. */ -int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) +bool mmap_read_lock_maybe_expand(struct mm_struct *mm, + struct vm_area_struct *new_vma, + unsigned long addr, bool write) { - /* - * The process proceeds as follows: - * - * 1) Use shift to calculate the new vma endpoints. - * 2) Extend vma to cover both the old and new ranges. This ensures the - * arguments passed to subsequent functions are consistent. - * 3) Move vma's page tables to the new range. - * 4) Free up any cleared pgd range. - * 5) Shrink the vma to cover only the new range. - */ + if (!write || addr >= new_vma->vm_start) { + mmap_read_lock(mm); + return true; + } - struct mm_struct *mm = vma->vm_mm; - unsigned long old_start = vma->vm_start; - unsigned long old_end = vma->vm_end; - unsigned long length = old_end - old_start; - unsigned long new_start = old_start - shift; - unsigned long new_end = old_end - shift; - VMA_ITERATOR(vmi, mm, new_start); - VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff); - struct vm_area_struct *next; - struct mmu_gather tlb; + if (!(new_vma->vm_flags & VM_GROWSDOWN)) + return false; - BUG_ON(new_start > new_end); + mmap_write_lock(mm); + if (expand_downwards(new_vma, addr)) { + mmap_write_unlock(mm); + return false; + } - /* - * ensure there are no vmas between where we want to go - * and where we are - */ - if (vma != vma_next(&vmi)) - return -EFAULT; + mmap_write_downgrade(mm); + return true; +} - vma_iter_prev_range(&vmi); - /* - * cover the whole range: [new_start, old_end) - */ - vmg.vma = vma; - if (vma_expand(&vmg)) - return -ENOMEM; +__latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) +{ + struct vm_area_struct *mpnt, *tmp; + int retval; + unsigned long charge = 0; + LIST_HEAD(uf); + VMA_ITERATOR(vmi, mm, 0); + if (mmap_write_lock_killable(oldmm)) + return -EINTR; + flush_cache_dup_mm(oldmm); + uprobe_dup_mmap(oldmm, mm); /* - * move the page tables downwards, on failure we rely on - * process cleanup to remove whatever mess we made. + * Not linked in yet - no deadlock potential: */ - if (length != move_page_tables(vma, old_start, - vma, new_start, length, false, true)) - return -ENOMEM; + mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); - lru_add_drain(); - tlb_gather_mmu(&tlb, mm); - next = vma_next(&vmi); - if (new_end > old_start) { + /* No ordering required: file already has been exposed. */ + dup_mm_exe_file(mm, oldmm); + + mm->total_vm = oldmm->total_vm; + mm->data_vm = oldmm->data_vm; + mm->exec_vm = oldmm->exec_vm; + mm->stack_vm = oldmm->stack_vm; + + /* Use __mt_dup() to efficiently build an identical maple tree. */ + retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); + if (unlikely(retval)) + goto out; + + mt_clear_in_rcu(vmi.mas.tree); + for_each_vma(vmi, mpnt) { + struct file *file; + + vma_start_write(mpnt); + if (mpnt->vm_flags & VM_DONTCOPY) { + retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start, + mpnt->vm_end, GFP_KERNEL); + if (retval) + goto loop_out; + + vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); + continue; + } + charge = 0; /* - * when the old and new regions overlap clear from new_end. + * Don't duplicate many vmas if we've been oom-killed (for + * example) */ - free_pgd_range(&tlb, new_end, old_end, new_end, - next ? next->vm_start : USER_PGTABLES_CEILING); + if (fatal_signal_pending(current)) { + retval = -EINTR; + goto loop_out; + } + if (mpnt->vm_flags & VM_ACCOUNT) { + unsigned long len = vma_pages(mpnt); + + if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ + goto fail_nomem; + charge = len; + } + + tmp = vm_area_dup(mpnt); + if (!tmp) + goto fail_nomem; + retval = vma_dup_policy(mpnt, tmp); + if (retval) + goto fail_nomem_policy; + tmp->vm_mm = mm; + retval = dup_userfaultfd(tmp, &uf); + if (retval) + goto fail_nomem_anon_vma_fork; + if (tmp->vm_flags & VM_WIPEONFORK) { + /* + * VM_WIPEONFORK gets a clean slate in the child. + * Don't prepare anon_vma until fault since we don't + * copy page for current vma. + */ + tmp->anon_vma = NULL; + } else if (anon_vma_fork(tmp, mpnt)) + goto fail_nomem_anon_vma_fork; + vm_flags_clear(tmp, VM_LOCKED_MASK); + /* + * Copy/update hugetlb private vma information. + */ + if (is_vm_hugetlb_page(tmp)) + hugetlb_dup_vma_private(tmp); + + /* + * Link the vma into the MT. After using __mt_dup(), memory + * allocation is not necessary here, so it cannot fail. + */ + vma_iter_bulk_store(&vmi, tmp); + + mm->map_count++; + + if (tmp->vm_ops && tmp->vm_ops->open) + tmp->vm_ops->open(tmp); + + file = tmp->vm_file; + if (file) { + struct address_space *mapping = file->f_mapping; + + get_file(file); + i_mmap_lock_write(mapping); + if (vma_is_shared_maywrite(tmp)) + mapping_allow_writable(mapping); + flush_dcache_mmap_lock(mapping); + /* insert tmp into the share list, just after mpnt */ + vma_interval_tree_insert_after(tmp, mpnt, + &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); + i_mmap_unlock_write(mapping); + } + + if (!(tmp->vm_flags & VM_WIPEONFORK)) + retval = copy_page_range(tmp, mpnt); + + if (retval) { + mpnt = vma_next(&vmi); + goto loop_out; + } + } + /* a new mm has just been created */ + retval = arch_dup_mmap(oldmm, mm); +loop_out: + vma_iter_free(&vmi); + if (!retval) { + mt_set_in_rcu(vmi.mas.tree); + ksm_fork(mm, oldmm); + khugepaged_fork(mm, oldmm); } else { + /* - * otherwise, clean from old_start; this is done to not touch - * the address space in [new_end, old_start) some architectures - * have constraints on va-space that make this illegal (IA64) - - * for the others its just a little faster. + * The entire maple tree has already been duplicated. If the + * mmap duplication fails, mark the failure point with + * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, + * stop releasing VMAs that have not been duplicated after this + * point. */ - free_pgd_range(&tlb, old_start, old_end, new_end, - next ? next->vm_start : USER_PGTABLES_CEILING); + if (mpnt) { + mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); + mas_store(&vmi.mas, XA_ZERO_ENTRY); + /* Avoid OOM iterating a broken tree */ + set_bit(MMF_OOM_SKIP, &mm->flags); + } + /* + * The mm_struct is going to exit, but the locks will be dropped + * first. Set the mm_struct as unstable is advisable as it is + * not fully initialised. + */ + set_bit(MMF_UNSTABLE, &mm->flags); } - tlb_finish_mmu(&tlb); +out: + mmap_write_unlock(mm); + flush_tlb_mm(oldmm); + mmap_write_unlock(oldmm); + if (!retval) + dup_userfaultfd_complete(&uf); + else + dup_userfaultfd_fail(&uf); + return retval; - vma_prev(&vmi); - /* Shrink the vma to just the new range */ - return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); +fail_nomem_anon_vma_fork: + mpol_put(vma_policy(tmp)); +fail_nomem_policy: + vm_area_free(tmp); +fail_nomem: + retval = -ENOMEM; + vm_unacct_memory(charge); + goto loop_out; } diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index f186d57df2c6..5f725cc67334 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -17,51 +17,7 @@ EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking); EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned); EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released); -#ifdef CONFIG_MEMCG - -/* - * Size of the buffer for memcg path names. Ignoring stack trace support, - * trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it. - */ -#define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL - -#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ - do { \ - if (trace_mmap_lock_##type##_enabled()) { \ - char buf[MEMCG_PATH_BUF_SIZE]; \ - get_mm_memcg_path(mm, buf, sizeof(buf)); \ - trace_mmap_lock_##type(mm, buf, ##__VA_ARGS__); \ - } \ - } while (0) - -#else /* !CONFIG_MEMCG */ - -#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ - trace_mmap_lock_##type(mm, "", ##__VA_ARGS__) - -#endif /* CONFIG_MEMCG */ - #ifdef CONFIG_TRACING -#ifdef CONFIG_MEMCG -/* - * Write the given mm_struct's memcg path to a buffer. If the path cannot be - * determined, empty string is written. - */ -static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen) -{ - struct mem_cgroup *memcg; - - buf[0] = '\0'; - memcg = get_mem_cgroup_from_mm(mm); - if (memcg == NULL) - return; - if (memcg->css.cgroup) - cgroup_path(memcg->css.cgroup, buf, buflen); - css_put(&memcg->css); -} - -#endif /* CONFIG_MEMCG */ - /* * Trace calls must be in a separate file, as otherwise there's a circular * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h. @@ -69,20 +25,293 @@ static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen) void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write) { - TRACE_MMAP_LOCK_EVENT(start_locking, mm, write); + trace_mmap_lock_start_locking(mm, write); } EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking); void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write, bool success) { - TRACE_MMAP_LOCK_EVENT(acquire_returned, mm, write, success); + trace_mmap_lock_acquire_returned(mm, write, success); } EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned); void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write) { - TRACE_MMAP_LOCK_EVENT(released, mm, write); + trace_mmap_lock_released(mm, write); } EXPORT_SYMBOL(__mmap_lock_do_trace_released); #endif /* CONFIG_TRACING */ + +#ifdef CONFIG_MMU +#ifdef CONFIG_PER_VMA_LOCK +static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching) +{ + unsigned int tgt_refcnt = VMA_LOCK_OFFSET; + + /* Additional refcnt if the vma is attached. */ + if (!detaching) + tgt_refcnt++; + + /* + * If vma is detached then only vma_mark_attached() can raise the + * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached(). + */ + if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt)) + return false; + + rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_); + rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, + refcount_read(&vma->vm_refcnt) == tgt_refcnt, + TASK_UNINTERRUPTIBLE); + lock_acquired(&vma->vmlock_dep_map, _RET_IP_); + + return true; +} + +static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) +{ + *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt); + rwsem_release(&vma->vmlock_dep_map, _RET_IP_); +} + +void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq) +{ + bool locked; + + /* + * __vma_enter_locked() returns false immediately if the vma is not + * attached, otherwise it waits until refcnt is indicating that vma + * is attached with no readers. + */ + locked = __vma_enter_locked(vma, false); + + /* + * We should use WRITE_ONCE() here because we can have concurrent reads + * from the early lockless pessimistic check in vma_start_read(). + * We don't really care about the correctness of that early check, but + * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. + */ + WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); + + if (locked) { + bool detached; + + __vma_exit_locked(vma, &detached); + WARN_ON_ONCE(detached); /* vma should remain attached */ + } +} +EXPORT_SYMBOL_GPL(__vma_start_write); + +void vma_mark_detached(struct vm_area_struct *vma) +{ + vma_assert_write_locked(vma); + vma_assert_attached(vma); + + /* + * We are the only writer, so no need to use vma_refcount_put(). + * The condition below is unlikely because the vma has been already + * write-locked and readers can increment vm_refcnt only temporarily + * before they check vm_lock_seq, realize the vma is locked and drop + * back the vm_refcnt. That is a narrow window for observing a raised + * vm_refcnt. + */ + if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { + /* Wait until vma is detached with no readers. */ + if (__vma_enter_locked(vma, true)) { + bool detached; + + __vma_exit_locked(vma, &detached); + WARN_ON_ONCE(!detached); + } + } +} + +/* + * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be + * stable and not isolated. If the VMA is not found or is being modified the + * function returns NULL. + */ +struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, + unsigned long address) +{ + MA_STATE(mas, &mm->mm_mt, address, address); + struct vm_area_struct *vma; + + rcu_read_lock(); +retry: + vma = mas_walk(&mas); + if (!vma) + goto inval; + + vma = vma_start_read(mm, vma); + if (IS_ERR_OR_NULL(vma)) { + /* Check if the VMA got isolated after we found it */ + if (PTR_ERR(vma) == -EAGAIN) { + count_vm_vma_lock_event(VMA_LOCK_MISS); + /* The area was replaced with another one */ + goto retry; + } + + /* Failed to lock the VMA */ + goto inval; + } + /* + * At this point, we have a stable reference to a VMA: The VMA is + * locked and we know it hasn't already been isolated. + * From here on, we can access the VMA without worrying about which + * fields are accessible for RCU readers. + */ + + /* Check if the vma we locked is the right one. */ + if (unlikely(vma->vm_mm != mm || + address < vma->vm_start || address >= vma->vm_end)) + goto inval_end_read; + + rcu_read_unlock(); + return vma; + +inval_end_read: + vma_end_read(vma); +inval: + rcu_read_unlock(); + count_vm_vma_lock_event(VMA_LOCK_ABORT); + return NULL; +} +#endif /* CONFIG_PER_VMA_LOCK */ + +#ifdef CONFIG_LOCK_MM_AND_FIND_VMA +#include <linux/extable.h> + +static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) +{ + if (likely(mmap_read_trylock(mm))) + return true; + + if (regs && !user_mode(regs)) { + unsigned long ip = exception_ip(regs); + if (!search_exception_tables(ip)) + return false; + } + + return !mmap_read_lock_killable(mm); +} + +static inline bool mmap_upgrade_trylock(struct mm_struct *mm) +{ + /* + * We don't have this operation yet. + * + * It should be easy enough to do: it's basically a + * atomic_long_try_cmpxchg_acquire() + * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but + * it also needs the proper lockdep magic etc. + */ + return false; +} + +static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) +{ + mmap_read_unlock(mm); + if (regs && !user_mode(regs)) { + unsigned long ip = exception_ip(regs); + if (!search_exception_tables(ip)) + return false; + } + return !mmap_write_lock_killable(mm); +} + +/* + * Helper for page fault handling. + * + * This is kind of equivalent to "mmap_read_lock()" followed + * by "find_extend_vma()", except it's a lot more careful about + * the locking (and will drop the lock on failure). + * + * For example, if we have a kernel bug that causes a page + * fault, we don't want to just use mmap_read_lock() to get + * the mm lock, because that would deadlock if the bug were + * to happen while we're holding the mm lock for writing. + * + * So this checks the exception tables on kernel faults in + * order to only do this all for instructions that are actually + * expected to fault. + * + * We can also actually take the mm lock for writing if we + * need to extend the vma, which helps the VM layer a lot. + */ +struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, + unsigned long addr, struct pt_regs *regs) +{ + struct vm_area_struct *vma; + + if (!get_mmap_lock_carefully(mm, regs)) + return NULL; + + vma = find_vma(mm, addr); + if (likely(vma && (vma->vm_start <= addr))) + return vma; + + /* + * Well, dang. We might still be successful, but only + * if we can extend a vma to do so. + */ + if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { + mmap_read_unlock(mm); + return NULL; + } + + /* + * We can try to upgrade the mmap lock atomically, + * in which case we can continue to use the vma + * we already looked up. + * + * Otherwise we'll have to drop the mmap lock and + * re-take it, and also look up the vma again, + * re-checking it. + */ + if (!mmap_upgrade_trylock(mm)) { + if (!upgrade_mmap_lock_carefully(mm, regs)) + return NULL; + + vma = find_vma(mm, addr); + if (!vma) + goto fail; + if (vma->vm_start <= addr) + goto success; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto fail; + } + + if (expand_stack_locked(vma, addr)) + goto fail; + +success: + mmap_write_downgrade(mm); + return vma; + +fail: + mmap_write_unlock(mm); + return NULL; +} +#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */ + +#else /* CONFIG_MMU */ + +/* + * At least xtensa ends up having protection faults even with no + * MMU.. No stack expansion, at least. + */ +struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, + unsigned long addr, struct pt_regs *regs) +{ + struct vm_area_struct *vma; + + mmap_read_lock(mm); + vma = vma_lookup(mm, addr); + if (!vma) + mmap_read_unlock(mm); + return vma; +} + +#endif /* CONFIG_MMU */ diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 99b3e9408aa0..b49cc6385f1f 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -246,8 +246,16 @@ static void __tlb_remove_table_free(struct mmu_table_batch *batch) * IRQs delays the completion of the TLB flush we can never observe an already * freed page. * - * Architectures that do not have this (PPC) need to delay the freeing by some - * other means, this is that means. + * Not all systems IPI every CPU for this purpose: + * + * - Some architectures have HW support for cross-CPU synchronisation of TLB + * flushes, so there's no IPI at all. + * + * - Paravirt guests can do this TLB flushing in the hypervisor, or coordinate + * with the hypervisor to defer flushing on preempted vCPUs. + * + * Such systems need to delay the freeing by some other means, this is that + * means. * * What we do is batch the freed directory pages (tables) and RCU free them. * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling @@ -311,11 +319,34 @@ static inline void tlb_table_invalidate(struct mmu_gather *tlb) } } -static void tlb_remove_table_one(void *table) +#ifdef CONFIG_PT_RECLAIM +static inline void __tlb_remove_table_one_rcu(struct rcu_head *head) +{ + struct ptdesc *ptdesc; + + ptdesc = container_of(head, struct ptdesc, pt_rcu_head); + __tlb_remove_table(ptdesc); +} + +static inline void __tlb_remove_table_one(void *table) +{ + struct ptdesc *ptdesc; + + ptdesc = table; + call_rcu(&ptdesc->pt_rcu_head, __tlb_remove_table_one_rcu); +} +#else +static inline void __tlb_remove_table_one(void *table) { tlb_remove_table_sync_one(); __tlb_remove_table(table); } +#endif /* CONFIG_PT_RECLAIM */ + +static void tlb_remove_table_one(void *table) +{ + __tlb_remove_table_one(table); +} static void tlb_table_flush(struct mmu_gather *tlb) { @@ -393,6 +424,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, #ifdef CONFIG_MMU_GATHER_PAGE_SIZE tlb->page_size = 0; #endif + tlb->vma_pfn = 0; __tlb_reset_range(tlb); inc_tlb_flush_pending(tlb->mm); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index fc18fe274505..8e0125dc0522 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -4,7 +4,7 @@ * * Copyright (C) 2008 Qumranet, Inc. * Copyright (C) 2008 SGI - * Christoph Lameter <cl@linux.com> + * Christoph Lameter <cl@gentwo.org> */ #include <linux/rculist.h> diff --git a/mm/mprotect.c b/mm/mprotect.c index 516b1d847e2c..88608d0dc2c2 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -133,7 +133,7 @@ static long change_pte_range(struct mmu_gather *tlb, /* Also skip shared copy-on-write pages */ if (is_cow_mapping(vma->vm_flags) && (folio_maybe_dma_pinned(folio) || - folio_likely_mapped_shared(folio))) + folio_maybe_mapped_shared(folio))) continue; /* @@ -225,14 +225,6 @@ static long change_pte_range(struct mmu_gather *tlb, newpte = swp_entry_to_pte(entry); if (pte_swp_uffd_wp(oldpte)) newpte = pte_swp_mkuffd_wp(newpte); - } else if (is_writable_device_exclusive_entry(entry)) { - entry = make_readable_device_exclusive_entry( - swp_offset(entry)); - newpte = swp_entry_to_pte(entry); - if (pte_swp_soft_dirty(oldpte)) - newpte = pte_swp_mksoft_dirty(newpte); - if (pte_swp_uffd_wp(oldpte)) - newpte = pte_swp_mkuffd_wp(newpte); } else if (is_pte_marker_entry(entry)) { /* * Ignore error swap entries unconditionally, @@ -387,7 +379,7 @@ again: if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd) || pmd_devmap(_pmd)) { if ((next - addr != HPAGE_PMD_SIZE) || pgtable_split_needed(vma, cp_flags)) { - __split_huge_pmd(vma, pmd, addr, false, NULL); + __split_huge_pmd(vma, pmd, addr, false); /* * For file-backed, the pmd could have been * cleared; make sure pmd populated if @@ -607,7 +599,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, unsigned long start, unsigned long end, unsigned long newflags) { struct mm_struct *mm = vma->vm_mm; - unsigned long oldflags = vma->vm_flags; + unsigned long oldflags = READ_ONCE(vma->vm_flags); long nrpages = (end - start) >> PAGE_SHIFT; unsigned int mm_cp_flags = 0; unsigned long charged = 0; @@ -627,7 +619,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, * uncommon case, so doesn't need to be very optimized. */ if (arch_has_pfn_modify_check() && - (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && + (oldflags & (VM_PFNMAP|VM_MIXEDMAP)) && (newflags & VM_ACCESS_FLAGS) == 0) { pgprot_t new_pgprot = vm_get_page_prot(newflags); @@ -676,7 +668,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, * held in write mode. */ vma_start_write(vma); - vm_flags_reset(vma, newflags); + vm_flags_reset_once(vma, newflags); if (vma_wants_manual_pte_write_upgrade(vma)) mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; vma_set_page_prot(vma); diff --git a/mm/mremap.c b/mm/mremap.c index 60473413836b..60f6b8d0d5f0 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -32,6 +32,45 @@ #include "internal.h" +/* Classify the kind of remap operation being performed. */ +enum mremap_type { + MREMAP_INVALID, /* Initial state. */ + MREMAP_NO_RESIZE, /* old_len == new_len, if not moved, do nothing. */ + MREMAP_SHRINK, /* old_len > new_len. */ + MREMAP_EXPAND, /* old_len < new_len. */ +}; + +/* + * Describes a VMA mremap() operation and is threaded throughout it. + * + * Any of the fields may be mutated by the operation, however these values will + * always accurately reflect the remap (for instance, we may adjust lengths and + * delta to account for hugetlb alignment). + */ +struct vma_remap_struct { + /* User-provided state. */ + unsigned long addr; /* User-specified address from which we remap. */ + unsigned long old_len; /* Length of range being remapped. */ + unsigned long new_len; /* Desired new length of mapping. */ + unsigned long flags; /* user-specified MREMAP_* flags. */ + unsigned long new_addr; /* Optionally, desired new address. */ + + /* uffd state. */ + struct vm_userfaultfd_ctx *uf; + struct list_head *uf_unmap_early; + struct list_head *uf_unmap; + + /* VMA state, determined in do_mremap(). */ + struct vm_area_struct *vma; + + /* Internal state, determined in do_mremap(). */ + unsigned long delta; /* Absolute delta of old_len,new_len. */ + bool mlocked; /* Was the VMA mlock()'d? */ + enum mremap_type remap_type; /* expand, shrink, etc. */ + bool mmap_locked; /* Is mm currently write-locked? */ + unsigned long charged; /* If VM_ACCOUNT, # pages to account. */ +}; + static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; @@ -69,8 +108,7 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) return pmd; } -static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr) +static pud_t *alloc_new_pud(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; p4d_t *p4d; @@ -83,13 +121,12 @@ static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma, return pud_alloc(mm, p4d, addr); } -static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr) +static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) { pud_t *pud; pmd_t *pmd; - pud = alloc_new_pud(mm, vma, addr); + pud = alloc_new_pud(mm, addr); if (!pud) return NULL; @@ -133,16 +170,19 @@ static pte_t move_soft_dirty_pte(pte_t pte) return pte; } -static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, - unsigned long old_addr, unsigned long old_end, - struct vm_area_struct *new_vma, pmd_t *new_pmd, - unsigned long new_addr, bool need_rmap_locks) +static int move_ptes(struct pagetable_move_control *pmc, + unsigned long extent, pmd_t *old_pmd, pmd_t *new_pmd) { + struct vm_area_struct *vma = pmc->old; + bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma); struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; pmd_t dummy_pmdval; spinlock_t *old_ptl, *new_ptl; bool force_flush = false; + unsigned long old_addr = pmc->old_addr; + unsigned long new_addr = pmc->new_addr; + unsigned long old_end = old_addr + extent; unsigned long len = old_end - old_addr; int err = 0; @@ -164,7 +204,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, * serialize access to individual ptes, but only rmap traversal * order guarantees that we won't miss both the old and new ptes). */ - if (need_rmap_locks) + if (pmc->need_rmap_locks) take_rmap_locks(vma); /* @@ -197,6 +237,8 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, new_pte++, new_addr += PAGE_SIZE) { + VM_WARN_ON_ONCE(!pte_none(*new_pte)); + if (pte_none(ptep_get(old_pte))) continue; @@ -216,7 +258,18 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, force_flush = true; pte = move_pte(pte, old_addr, new_addr); pte = move_soft_dirty_pte(pte); - set_pte_at(mm, new_addr, new_pte, pte); + + if (need_clear_uffd_wp && pte_marker_uffd_wp(pte)) + pte_clear(mm, new_addr, new_pte); + else { + if (need_clear_uffd_wp) { + if (pte_present(pte)) + pte = pte_clear_uffd_wp(pte); + else if (is_swap_pte(pte)) + pte = pte_swp_clear_uffd_wp(pte); + } + set_pte_at(mm, new_addr, new_pte, pte); + } } arch_leave_lazy_mmu_mode(); @@ -227,7 +280,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, pte_unmap(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); out: - if (need_rmap_locks) + if (pmc->need_rmap_locks) drop_rmap_locks(vma); return err; } @@ -242,10 +295,11 @@ static inline bool arch_supports_page_table_move(void) #endif #ifdef CONFIG_HAVE_MOVE_PMD -static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, - unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd) +static bool move_normal_pmd(struct pagetable_move_control *pmc, + pmd_t *old_pmd, pmd_t *new_pmd) { spinlock_t *old_ptl, *new_ptl; + struct vm_area_struct *vma = pmc->old; struct mm_struct *mm = vma->vm_mm; bool res = false; pmd_t pmd; @@ -278,11 +332,20 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, if (WARN_ON_ONCE(!pmd_none(*new_pmd))) return false; + /* If this pmd belongs to a uffd vma with remap events disabled, we need + * to ensure that the uffd-wp state is cleared from all pgtables. This + * means recursing into lower page tables in move_page_tables(), and we + * can reuse the existing code if we simply treat the entry as "not + * moved". + */ + if (vma_has_uffd_without_event_remap(vma)) + return false; + /* * We don't have to worry about the ordering of src and dst * ptlocks because exclusive mmap_lock prevents deadlock. */ - old_ptl = pmd_lock(vma->vm_mm, old_pmd); + old_ptl = pmd_lock(mm, old_pmd); new_ptl = pmd_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); @@ -299,7 +362,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, VM_BUG_ON(!pmd_none(*new_pmd)); pmd_populate(mm, new_pmd, pmd_pgtable(pmd)); - flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE); + flush_tlb_range(vma, pmc->old_addr, pmc->old_addr + PMD_SIZE); out_unlock: if (new_ptl != old_ptl) spin_unlock(new_ptl); @@ -308,19 +371,19 @@ out_unlock: return res; } #else -static inline bool move_normal_pmd(struct vm_area_struct *vma, - unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd, - pmd_t *new_pmd) +static inline bool move_normal_pmd(struct pagetable_move_control *pmc, + pmd_t *old_pmd, pmd_t *new_pmd) { return false; } #endif #if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD) -static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr, - unsigned long new_addr, pud_t *old_pud, pud_t *new_pud) +static bool move_normal_pud(struct pagetable_move_control *pmc, + pud_t *old_pud, pud_t *new_pud) { spinlock_t *old_ptl, *new_ptl; + struct vm_area_struct *vma = pmc->old; struct mm_struct *mm = vma->vm_mm; pud_t pud; @@ -333,11 +396,20 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr, if (WARN_ON_ONCE(!pud_none(*new_pud))) return false; + /* If this pud belongs to a uffd vma with remap events disabled, we need + * to ensure that the uffd-wp state is cleared from all pgtables. This + * means recursing into lower page tables in move_page_tables(), and we + * can reuse the existing code if we simply treat the entry as "not + * moved". + */ + if (vma_has_uffd_without_event_remap(vma)) + return false; + /* * We don't have to worry about the ordering of src and dst * ptlocks because exclusive mmap_lock prevents deadlock. */ - old_ptl = pud_lock(vma->vm_mm, old_pud); + old_ptl = pud_lock(mm, old_pud); new_ptl = pud_lockptr(mm, new_pud); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); @@ -349,7 +421,7 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr, VM_BUG_ON(!pud_none(*new_pud)); pud_populate(mm, new_pud, pud_pgtable(pud)); - flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE); + flush_tlb_range(vma, pmc->old_addr, pmc->old_addr + PUD_SIZE); if (new_ptl != old_ptl) spin_unlock(new_ptl); spin_unlock(old_ptl); @@ -357,19 +429,19 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr, return true; } #else -static inline bool move_normal_pud(struct vm_area_struct *vma, - unsigned long old_addr, unsigned long new_addr, pud_t *old_pud, - pud_t *new_pud) +static inline bool move_normal_pud(struct pagetable_move_control *pmc, + pud_t *old_pud, pud_t *new_pud) { return false; } #endif #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) -static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr, - unsigned long new_addr, pud_t *old_pud, pud_t *new_pud) +static bool move_huge_pud(struct pagetable_move_control *pmc, + pud_t *old_pud, pud_t *new_pud) { spinlock_t *old_ptl, *new_ptl; + struct vm_area_struct *vma = pmc->old; struct mm_struct *mm = vma->vm_mm; pud_t pud; @@ -384,7 +456,7 @@ static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr, * We don't have to worry about the ordering of src and dst * ptlocks because exclusive mmap_lock prevents deadlock. */ - old_ptl = pud_lock(vma->vm_mm, old_pud); + old_ptl = pud_lock(mm, old_pud); new_ptl = pud_lockptr(mm, new_pud); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); @@ -397,8 +469,8 @@ static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr, /* Set the new pud */ /* mark soft_ditry when we add pud level soft dirty support */ - set_pud_at(mm, new_addr, new_pud, pud); - flush_pud_tlb_range(vma, old_addr, old_addr + HPAGE_PUD_SIZE); + set_pud_at(mm, pmc->new_addr, new_pud, pud); + flush_pud_tlb_range(vma, pmc->old_addr, pmc->old_addr + HPAGE_PUD_SIZE); if (new_ptl != old_ptl) spin_unlock(new_ptl); spin_unlock(old_ptl); @@ -406,8 +478,9 @@ static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr, return true; } #else -static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr, - unsigned long new_addr, pud_t *old_pud, pud_t *new_pud) +static bool move_huge_pud(struct pagetable_move_control *pmc, + pud_t *old_pud, pud_t *new_pud) + { WARN_ON_ONCE(1); return false; @@ -428,10 +501,12 @@ enum pgt_entry { * destination pgt_entry. */ static __always_inline unsigned long get_extent(enum pgt_entry entry, - unsigned long old_addr, unsigned long old_end, - unsigned long new_addr) + struct pagetable_move_control *pmc) { unsigned long next, extent, mask, size; + unsigned long old_addr = pmc->old_addr; + unsigned long old_end = pmc->old_end; + unsigned long new_addr = pmc->new_addr; switch (entry) { case HPAGE_PMD: @@ -461,37 +536,50 @@ static __always_inline unsigned long get_extent(enum pgt_entry entry, } /* + * Should move_pgt_entry() acquire the rmap locks? This is either expressed in + * the PMC, or overridden in the case of normal, larger page tables. + */ +static bool should_take_rmap_locks(struct pagetable_move_control *pmc, + enum pgt_entry entry) +{ + switch (entry) { + case NORMAL_PMD: + case NORMAL_PUD: + return true; + default: + return pmc->need_rmap_locks; + } +} + +/* * Attempts to speedup the move by moving entry at the level corresponding to * pgt_entry. Returns true if the move was successful, else false. */ -static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma, - unsigned long old_addr, unsigned long new_addr, - void *old_entry, void *new_entry, bool need_rmap_locks) +static bool move_pgt_entry(struct pagetable_move_control *pmc, + enum pgt_entry entry, void *old_entry, void *new_entry) { bool moved = false; + bool need_rmap_locks = should_take_rmap_locks(pmc, entry); /* See comment in move_ptes() */ if (need_rmap_locks) - take_rmap_locks(vma); + take_rmap_locks(pmc->old); switch (entry) { case NORMAL_PMD: - moved = move_normal_pmd(vma, old_addr, new_addr, old_entry, - new_entry); + moved = move_normal_pmd(pmc, old_entry, new_entry); break; case NORMAL_PUD: - moved = move_normal_pud(vma, old_addr, new_addr, old_entry, - new_entry); + moved = move_normal_pud(pmc, old_entry, new_entry); break; case HPAGE_PMD: moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && - move_huge_pmd(vma, old_addr, new_addr, old_entry, + move_huge_pmd(pmc->old, pmc->old_addr, pmc->new_addr, old_entry, new_entry); break; case HPAGE_PUD: moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && - move_huge_pud(vma, old_addr, new_addr, old_entry, - new_entry); + move_huge_pud(pmc, old_entry, new_entry); break; default: @@ -500,7 +588,7 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma, } if (need_rmap_locks) - drop_rmap_locks(vma); + drop_rmap_locks(pmc->old); return moved; } @@ -511,8 +599,9 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma, * the VMA that is created to span the source and destination of the move, * so we make an exception for it. */ -static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align, - unsigned long mask, bool for_stack) +static bool can_align_down(struct pagetable_move_control *pmc, + struct vm_area_struct *vma, unsigned long addr_to_align, + unsigned long mask) { unsigned long addr_masked = addr_to_align & mask; @@ -521,11 +610,11 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_ali * of the corresponding VMA, we can't align down or we will destroy part * of the current mapping. */ - if (!for_stack && vma->vm_start != addr_to_align) + if (!pmc->for_stack && vma->vm_start != addr_to_align) return false; /* In the stack case we explicitly permit in-VMA alignment. */ - if (for_stack && addr_masked >= vma->vm_start) + if (pmc->for_stack && addr_masked >= vma->vm_start) return true; /* @@ -535,163 +624,390 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_ali return find_vma_intersection(vma->vm_mm, addr_masked, vma->vm_start) == NULL; } -/* Opportunistically realign to specified boundary for faster copy. */ -static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old_vma, - unsigned long *new_addr, struct vm_area_struct *new_vma, - unsigned long mask, bool for_stack) +/* + * Determine if are in fact able to realign for efficiency to a higher page + * table boundary. + */ +static bool can_realign_addr(struct pagetable_move_control *pmc, + unsigned long pagetable_mask) { + unsigned long align_mask = ~pagetable_mask; + unsigned long old_align = pmc->old_addr & align_mask; + unsigned long new_align = pmc->new_addr & align_mask; + unsigned long pagetable_size = align_mask + 1; + unsigned long old_align_next = pagetable_size - old_align; + + /* + * We don't want to have to go hunting for VMAs from the end of the old + * VMA to the next page table boundary, also we want to make sure the + * operation is wortwhile. + * + * So ensure that we only perform this realignment if the end of the + * range being copied reaches or crosses the page table boundary. + * + * boundary boundary + * .<- old_align -> . + * . |----------------.-----------| + * . | vma . | + * . |----------------.-----------| + * . <----------------.-----------> + * . len_in + * <-------------------------------> + * . pagetable_size . + * . <----------------> + * . old_align_next . + */ + if (pmc->len_in < old_align_next) + return false; + /* Skip if the addresses are already aligned. */ - if ((*old_addr & ~mask) == 0) - return; + if (old_align == 0) + return false; /* Only realign if the new and old addresses are mutually aligned. */ - if ((*old_addr & ~mask) != (*new_addr & ~mask)) - return; + if (old_align != new_align) + return false; /* Ensure realignment doesn't cause overlap with existing mappings. */ - if (!can_align_down(old_vma, *old_addr, mask, for_stack) || - !can_align_down(new_vma, *new_addr, mask, for_stack)) + if (!can_align_down(pmc, pmc->old, pmc->old_addr, pagetable_mask) || + !can_align_down(pmc, pmc->new, pmc->new_addr, pagetable_mask)) + return false; + + return true; +} + +/* + * Opportunistically realign to specified boundary for faster copy. + * + * Consider an mremap() of a VMA with page table boundaries as below, and no + * preceding VMAs from the lower page table boundary to the start of the VMA, + * with the end of the range reaching or crossing the page table boundary. + * + * boundary boundary + * . |----------------.-----------| + * . | vma . | + * . |----------------.-----------| + * . pmc->old_addr . pmc->old_end + * . <----------------------------> + * . move these page tables + * + * If we proceed with moving page tables in this scenario, we will have a lot of + * work to do traversing old page tables and establishing new ones in the + * destination across multiple lower level page tables. + * + * The idea here is simply to align pmc->old_addr, pmc->new_addr down to the + * page table boundary, so we can simply copy a single page table entry for the + * aligned portion of the VMA instead: + * + * boundary boundary + * . |----------------.-----------| + * . | vma . | + * . |----------------.-----------| + * pmc->old_addr . pmc->old_end + * <-------------------------------------------> + * . move these page tables + */ +static void try_realign_addr(struct pagetable_move_control *pmc, + unsigned long pagetable_mask) +{ + + if (!can_realign_addr(pmc, pagetable_mask)) return; - *old_addr = *old_addr & mask; - *new_addr = *new_addr & mask; + /* + * Simply align to page table boundaries. Note that we do NOT update the + * pmc->old_end value, and since the move_page_tables() operation spans + * from [old_addr, old_end) (offsetting new_addr as it is performed), + * this simply changes the start of the copy, not the end. + */ + pmc->old_addr &= pagetable_mask; + pmc->new_addr &= pagetable_mask; } -unsigned long move_page_tables(struct vm_area_struct *vma, - unsigned long old_addr, struct vm_area_struct *new_vma, - unsigned long new_addr, unsigned long len, - bool need_rmap_locks, bool for_stack) +/* Is the page table move operation done? */ +static bool pmc_done(struct pagetable_move_control *pmc) { - unsigned long extent, old_end; + return pmc->old_addr >= pmc->old_end; +} + +/* Advance to the next page table, offset by extent bytes. */ +static void pmc_next(struct pagetable_move_control *pmc, unsigned long extent) +{ + pmc->old_addr += extent; + pmc->new_addr += extent; +} + +/* + * Determine how many bytes in the specified input range have had their page + * tables moved so far. + */ +static unsigned long pmc_progress(struct pagetable_move_control *pmc) +{ + unsigned long orig_old_addr = pmc->old_end - pmc->len_in; + unsigned long old_addr = pmc->old_addr; + + /* + * Prevent negative return values when {old,new}_addr was realigned but + * we broke out of the loop in move_page_tables() for the first PMD + * itself. + */ + return old_addr < orig_old_addr ? 0 : old_addr - orig_old_addr; +} + +unsigned long move_page_tables(struct pagetable_move_control *pmc) +{ + unsigned long extent; struct mmu_notifier_range range; pmd_t *old_pmd, *new_pmd; pud_t *old_pud, *new_pud; + struct mm_struct *mm = pmc->old->vm_mm; - if (!len) + if (!pmc->len_in) return 0; - old_end = old_addr + len; - - if (is_vm_hugetlb_page(vma)) - return move_hugetlb_page_tables(vma, new_vma, old_addr, - new_addr, len); + if (is_vm_hugetlb_page(pmc->old)) + return move_hugetlb_page_tables(pmc->old, pmc->new, pmc->old_addr, + pmc->new_addr, pmc->len_in); /* * If possible, realign addresses to PMD boundary for faster copy. * Only realign if the mremap copying hits a PMD boundary. */ - if (len >= PMD_SIZE - (old_addr & ~PMD_MASK)) - try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK, - for_stack); + try_realign_addr(pmc, PMD_MASK); - flush_cache_range(vma, old_addr, old_end); - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, - old_addr, old_end); + flush_cache_range(pmc->old, pmc->old_addr, pmc->old_end); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, mm, + pmc->old_addr, pmc->old_end); mmu_notifier_invalidate_range_start(&range); - for (; old_addr < old_end; old_addr += extent, new_addr += extent) { + for (; !pmc_done(pmc); pmc_next(pmc, extent)) { cond_resched(); /* * If extent is PUD-sized try to speed up the move by moving at the * PUD level if possible. */ - extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr); + extent = get_extent(NORMAL_PUD, pmc); - old_pud = get_old_pud(vma->vm_mm, old_addr); + old_pud = get_old_pud(mm, pmc->old_addr); if (!old_pud) continue; - new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr); + new_pud = alloc_new_pud(mm, pmc->new_addr); if (!new_pud) break; if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) { if (extent == HPAGE_PUD_SIZE) { - move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr, - old_pud, new_pud, need_rmap_locks); + move_pgt_entry(pmc, HPAGE_PUD, old_pud, new_pud); /* We ignore and continue on error? */ continue; } } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) { - - if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr, - old_pud, new_pud, true)) + if (move_pgt_entry(pmc, NORMAL_PUD, old_pud, new_pud)) continue; } - extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr); - old_pmd = get_old_pmd(vma->vm_mm, old_addr); + extent = get_extent(NORMAL_PMD, pmc); + old_pmd = get_old_pmd(mm, pmc->old_addr); if (!old_pmd) continue; - new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); + new_pmd = alloc_new_pmd(mm, pmc->new_addr); if (!new_pmd) break; again: if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) || pmd_devmap(*old_pmd)) { if (extent == HPAGE_PMD_SIZE && - move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr, - old_pmd, new_pmd, need_rmap_locks)) + move_pgt_entry(pmc, HPAGE_PMD, old_pmd, new_pmd)) continue; - split_huge_pmd(vma, old_pmd, old_addr); + split_huge_pmd(pmc->old, old_pmd, pmc->old_addr); } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) && extent == PMD_SIZE) { /* * If the extent is PMD-sized, try to speed the move by * moving at the PMD level if possible. */ - if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr, - old_pmd, new_pmd, true)) + if (move_pgt_entry(pmc, NORMAL_PMD, old_pmd, new_pmd)) continue; } if (pmd_none(*old_pmd)) continue; - if (pte_alloc(new_vma->vm_mm, new_pmd)) + if (pte_alloc(pmc->new->vm_mm, new_pmd)) break; - if (move_ptes(vma, old_pmd, old_addr, old_addr + extent, - new_vma, new_pmd, new_addr, need_rmap_locks) < 0) + if (move_ptes(pmc, extent, old_pmd, new_pmd) < 0) goto again; } mmu_notifier_invalidate_range_end(&range); + return pmc_progress(pmc); +} + +/* Set vrm->delta to the difference in VMA size specified by user. */ +static void vrm_set_delta(struct vma_remap_struct *vrm) +{ + vrm->delta = abs_diff(vrm->old_len, vrm->new_len); +} + +/* Determine what kind of remap this is - shrink, expand or no resize at all. */ +static enum mremap_type vrm_remap_type(struct vma_remap_struct *vrm) +{ + if (vrm->delta == 0) + return MREMAP_NO_RESIZE; + + if (vrm->old_len > vrm->new_len) + return MREMAP_SHRINK; + + return MREMAP_EXPAND; +} + +/* + * When moving a VMA to vrm->new_adr, does this result in the new and old VMAs + * overlapping? + */ +static bool vrm_overlaps(struct vma_remap_struct *vrm) +{ + unsigned long start_old = vrm->addr; + unsigned long start_new = vrm->new_addr; + unsigned long end_old = vrm->addr + vrm->old_len; + unsigned long end_new = vrm->new_addr + vrm->new_len; + /* - * Prevent negative return values when {old,new}_addr was realigned - * but we broke out of the above loop for the first PMD itself. + * start_old end_old + * |-----------| + * | | + * |-----------| + * |-------------| + * | | + * |-------------| + * start_new end_new */ - if (old_addr < old_end - len) - return 0; + if (end_old > start_new && end_new > start_old) + return true; - return len + old_addr - old_end; /* how much done */ + return false; } -static unsigned long move_vma(struct vm_area_struct *vma, - unsigned long old_addr, unsigned long old_len, - unsigned long new_len, unsigned long new_addr, - bool *locked, unsigned long flags, - struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap) +/* Do the mremap() flags require that the new_addr parameter be specified? */ +static bool vrm_implies_new_addr(struct vma_remap_struct *vrm) { - long to_account = new_len - old_len; - struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *new_vma; - unsigned long vm_flags = vma->vm_flags; - unsigned long new_pgoff; - unsigned long moved_len; - unsigned long account_start = 0; - unsigned long account_end = 0; - unsigned long hiwater_vm; - int err = 0; - bool need_rmap_locks; - struct vma_iterator vmi; + return vrm->flags & (MREMAP_FIXED | MREMAP_DONTUNMAP); +} + +/* + * Find an unmapped area for the requested vrm->new_addr. + * + * If MREMAP_FIXED then this is equivalent to a MAP_FIXED mmap() call. If only + * MREMAP_DONTUNMAP is set, then this is equivalent to providing a hint to + * mmap(), otherwise this is equivalent to mmap() specifying a NULL address. + * + * Returns 0 on success (with vrm->new_addr updated), or an error code upon + * failure. + */ +static unsigned long vrm_set_new_addr(struct vma_remap_struct *vrm) +{ + struct vm_area_struct *vma = vrm->vma; + unsigned long map_flags = 0; + /* Page Offset _into_ the VMA. */ + pgoff_t internal_pgoff = (vrm->addr - vma->vm_start) >> PAGE_SHIFT; + pgoff_t pgoff = vma->vm_pgoff + internal_pgoff; + unsigned long new_addr = vrm_implies_new_addr(vrm) ? vrm->new_addr : 0; + unsigned long res; + + if (vrm->flags & MREMAP_FIXED) + map_flags |= MAP_FIXED; + if (vma->vm_flags & VM_MAYSHARE) + map_flags |= MAP_SHARED; + + res = get_unmapped_area(vma->vm_file, new_addr, vrm->new_len, pgoff, + map_flags); + if (IS_ERR_VALUE(res)) + return res; + + vrm->new_addr = res; + return 0; +} + +/* + * Keep track of pages which have been added to the memory mapping. If the VMA + * is accounted, also check to see if there is sufficient memory. + * + * Returns true on success, false if insufficient memory to charge. + */ +static bool vrm_charge(struct vma_remap_struct *vrm) +{ + unsigned long charged; + + if (!(vrm->vma->vm_flags & VM_ACCOUNT)) + return true; + + /* + * If we don't unmap the old mapping, then we account the entirety of + * the length of the new one. Otherwise it's just the delta in size. + */ + if (vrm->flags & MREMAP_DONTUNMAP) + charged = vrm->new_len >> PAGE_SHIFT; + else + charged = vrm->delta >> PAGE_SHIFT; + + + /* This accounts 'charged' pages of memory. */ + if (security_vm_enough_memory_mm(current->mm, charged)) + return false; + + vrm->charged = charged; + return true; +} + +/* + * an error has occurred so we will not be using vrm->charged memory. Unaccount + * this memory if the VMA is accounted. + */ +static void vrm_uncharge(struct vma_remap_struct *vrm) +{ + if (!(vrm->vma->vm_flags & VM_ACCOUNT)) + return; + + vm_unacct_memory(vrm->charged); + vrm->charged = 0; +} + +/* + * Update mm exec_vm, stack_vm, data_vm, and locked_vm fields as needed to + * account for 'bytes' memory used, and if locked, indicate this in the VRM so + * we can handle this correctly later. + */ +static void vrm_stat_account(struct vma_remap_struct *vrm, + unsigned long bytes) +{ + unsigned long pages = bytes >> PAGE_SHIFT; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = vrm->vma; + + vm_stat_account(mm, vma->vm_flags, pages); + if (vma->vm_flags & VM_LOCKED) { + mm->locked_vm += pages; + vrm->mlocked = true; + } +} + +/* + * Perform checks before attempting to write a VMA prior to it being + * moved. + */ +static unsigned long prep_move_vma(struct vma_remap_struct *vrm) +{ + unsigned long err = 0; + struct vm_area_struct *vma = vrm->vma; + unsigned long old_addr = vrm->addr; + unsigned long old_len = vrm->old_len; + unsigned long dummy = vma->vm_flags; /* * We'd prefer to avoid failure later on in do_munmap: * which may split one vma into three before unmapping. */ - if (mm->map_count >= sysctl_max_map_count - 3) + if (current->mm->map_count >= sysctl_max_map_count - 3) return -ENOMEM; - if (unlikely(flags & MREMAP_DONTUNMAP)) - to_account = new_len; - if (vma->vm_ops && vma->vm_ops->may_split) { if (vma->vm_start != old_addr) err = vma->vm_ops->may_split(vma, old_addr); @@ -709,61 +1025,234 @@ static unsigned long move_vma(struct vm_area_struct *vma, * so KSM can come around to merge on vma and new_vma afterwards. */ err = ksm_madvise(vma, old_addr, old_addr + old_len, - MADV_UNMERGEABLE, &vm_flags); + MADV_UNMERGEABLE, &dummy); if (err) return err; - if (vm_flags & VM_ACCOUNT) { - if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT)) - return -ENOMEM; + return 0; +} + +/* + * Unmap source VMA for VMA move, turning it from a copy to a move, being + * careful to ensure we do not underflow memory account while doing so if an + * accountable move. + * + * This is best effort, if we fail to unmap then we simply try to correct + * accounting and exit. + */ +static void unmap_source_vma(struct vma_remap_struct *vrm) +{ + struct mm_struct *mm = current->mm; + unsigned long addr = vrm->addr; + unsigned long len = vrm->old_len; + struct vm_area_struct *vma = vrm->vma; + VMA_ITERATOR(vmi, mm, addr); + int err; + unsigned long vm_start; + unsigned long vm_end; + /* + * It might seem odd that we check for MREMAP_DONTUNMAP here, given this + * function implies that we unmap the original VMA, which seems + * contradictory. + * + * However, this occurs when this operation was attempted and an error + * arose, in which case we _do_ wish to unmap the _new_ VMA, which means + * we actually _do_ want it be unaccounted. + */ + bool accountable_move = (vma->vm_flags & VM_ACCOUNT) && + !(vrm->flags & MREMAP_DONTUNMAP); + + /* + * So we perform a trick here to prevent incorrect accounting. Any merge + * or new VMA allocation performed in copy_vma() does not adjust + * accounting, it is expected that callers handle this. + * + * And indeed we already have, accounting appropriately in the case of + * both in vrm_charge(). + * + * However, when we unmap the existing VMA (to effect the move), this + * code will, if the VMA has VM_ACCOUNT set, attempt to unaccount + * removed pages. + * + * To avoid this we temporarily clear this flag, reinstating on any + * portions of the original VMA that remain. + */ + if (accountable_move) { + vm_flags_clear(vma, VM_ACCOUNT); + /* We are about to split vma, so store the start/end. */ + vm_start = vma->vm_start; + vm_end = vma->vm_end; + } + + err = do_vmi_munmap(&vmi, mm, addr, len, vrm->uf_unmap, /* unlock= */false); + vrm->vma = NULL; /* Invalidated. */ + if (err) { + /* OOM: unable to split vma, just get accounts right */ + vm_acct_memory(len >> PAGE_SHIFT); + return; } - vma_start_write(vma); - new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); - new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, - &need_rmap_locks); + /* + * If we mremap() from a VMA like this: + * + * addr end + * | | + * v v + * |-------------| + * | | + * |-------------| + * + * Having cleared VM_ACCOUNT from the whole VMA, after we unmap above + * we'll end up with: + * + * addr end + * | | + * v v + * |---| |---| + * | A | | B | + * |---| |---| + * + * The VMI is still pointing at addr, so vma_prev() will give us A, and + * a subsequent or lone vma_next() will give as B. + * + * do_vmi_munmap() will have restored the VMI back to addr. + */ + if (accountable_move) { + unsigned long end = addr + len; + + if (vm_start < addr) { + struct vm_area_struct *prev = vma_prev(&vmi); + + vm_flags_set(prev, VM_ACCOUNT); /* Acquires VMA lock. */ + } + + if (vm_end > end) { + struct vm_area_struct *next = vma_next(&vmi); + + vm_flags_set(next, VM_ACCOUNT); /* Acquires VMA lock. */ + } + } +} + +/* + * Copy vrm->vma over to vrm->new_addr possibly adjusting size as part of the + * process. Additionally handle an error occurring on moving of page tables, + * where we reset vrm state to cause unmapping of the new VMA. + * + * Outputs the newly installed VMA to new_vma_ptr. Returns 0 on success or an + * error code. + */ +static int copy_vma_and_data(struct vma_remap_struct *vrm, + struct vm_area_struct **new_vma_ptr) +{ + unsigned long internal_offset = vrm->addr - vrm->vma->vm_start; + unsigned long internal_pgoff = internal_offset >> PAGE_SHIFT; + unsigned long new_pgoff = vrm->vma->vm_pgoff + internal_pgoff; + unsigned long moved_len; + struct vm_area_struct *vma = vrm->vma; + struct vm_area_struct *new_vma; + int err = 0; + PAGETABLE_MOVE(pmc, NULL, NULL, vrm->addr, vrm->new_addr, vrm->old_len); + + new_vma = copy_vma(&vma, vrm->new_addr, vrm->new_len, new_pgoff, + &pmc.need_rmap_locks); if (!new_vma) { - if (vm_flags & VM_ACCOUNT) - vm_unacct_memory(to_account >> PAGE_SHIFT); + vrm_uncharge(vrm); + *new_vma_ptr = NULL; return -ENOMEM; } + vrm->vma = vma; + pmc.old = vma; + pmc.new = new_vma; - moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, - need_rmap_locks, false); - if (moved_len < old_len) { + moved_len = move_page_tables(&pmc); + if (moved_len < vrm->old_len) err = -ENOMEM; - } else if (vma->vm_ops && vma->vm_ops->mremap) { + else if (vma->vm_ops && vma->vm_ops->mremap) err = vma->vm_ops->mremap(new_vma); - } if (unlikely(err)) { + PAGETABLE_MOVE(pmc_revert, new_vma, vma, vrm->new_addr, + vrm->addr, moved_len); + /* * On error, move entries back from new area to old, * which will succeed since page tables still there, * and then proceed to unmap new area instead of old. */ - move_page_tables(new_vma, new_addr, vma, old_addr, moved_len, - true, false); - vma = new_vma; - old_len = new_len; - old_addr = new_addr; - new_addr = err; + pmc_revert.need_rmap_locks = true; + move_page_tables(&pmc_revert); + + vrm->vma = new_vma; + vrm->old_len = vrm->new_len; + vrm->addr = vrm->new_addr; } else { - mremap_userfaultfd_prep(new_vma, uf); + mremap_userfaultfd_prep(new_vma, vrm->uf); } - if (is_vm_hugetlb_page(vma)) { - clear_vma_resv_huge_pages(vma); - } + fixup_hugetlb_reservations(vma); - /* Conceal VM_ACCOUNT so old reservation is not undone */ - if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) { - vm_flags_clear(vma, VM_ACCOUNT); - if (vma->vm_start < old_addr) - account_start = vma->vm_start; - if (vma->vm_end > old_addr + old_len) - account_end = vma->vm_end; - } + *new_vma_ptr = new_vma; + return err; +} + +/* + * Perform final tasks for MADV_DONTUNMAP operation, clearing mlock() and + * account flags on remaining VMA by convention (it cannot be mlock()'d any + * longer, as pages in range are no longer mapped), and removing anon_vma_chain + * links from it (if the entire VMA was copied over). + */ +static void dontunmap_complete(struct vma_remap_struct *vrm, + struct vm_area_struct *new_vma) +{ + unsigned long start = vrm->addr; + unsigned long end = vrm->addr + vrm->old_len; + unsigned long old_start = vrm->vma->vm_start; + unsigned long old_end = vrm->vma->vm_end; + + /* + * We always clear VM_LOCKED[ONFAULT] | VM_ACCOUNT on the old + * vma. + */ + vm_flags_clear(vrm->vma, VM_LOCKED_MASK | VM_ACCOUNT); + + /* + * anon_vma links of the old vma is no longer needed after its page + * table has been moved. + */ + if (new_vma != vrm->vma && start == old_start && end == old_end) + unlink_anon_vmas(vrm->vma); + + /* Because we won't unmap we don't need to touch locked_vm. */ +} + +static unsigned long move_vma(struct vma_remap_struct *vrm) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *new_vma; + unsigned long hiwater_vm; + int err; + + err = prep_move_vma(vrm); + if (err) + return err; + + /* If accounted, charge the number of bytes the operation will use. */ + if (!vrm_charge(vrm)) + return -ENOMEM; + + /* We don't want racing faults. */ + vma_start_write(vrm->vma); + + /* Perform copy step. */ + err = copy_vma_and_data(vrm, &new_vma); + /* + * If we established the copied-to VMA, we attempt to recover from the + * error by setting the destination VMA to the source VMA and unmapping + * it below. + */ + if (err && !new_vma) + return err; /* * If we failed to move page tables we still do total_vm increment @@ -775,73 +1264,31 @@ static unsigned long move_vma(struct vm_area_struct *vma, * If this were a serious issue, we'd add a flag to do_munmap(). */ hiwater_vm = mm->hiwater_vm; - vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT); - - /* Tell pfnmap has moved from this vma */ - if (unlikely(vma->vm_flags & VM_PFNMAP)) - untrack_pfn_clear(vma); - - if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) { - /* We always clear VM_LOCKED[ONFAULT] on the old vma */ - vm_flags_clear(vma, VM_LOCKED_MASK); - - /* - * anon_vma links of the old vma is no longer needed after its page - * table has been moved. - */ - if (new_vma != vma && vma->vm_start == old_addr && - vma->vm_end == (old_addr + old_len)) - unlink_anon_vmas(vma); - - /* Because we won't unmap we don't need to touch locked_vm */ - return new_addr; - } - - vma_iter_init(&vmi, mm, old_addr); - if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false) < 0) { - /* OOM: unable to split vma, just get accounts right */ - if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) - vm_acct_memory(old_len >> PAGE_SHIFT); - account_start = account_end = 0; - } - if (vm_flags & VM_LOCKED) { - mm->locked_vm += new_len >> PAGE_SHIFT; - *locked = true; - } + vrm_stat_account(vrm, vrm->new_len); + if (unlikely(!err && (vrm->flags & MREMAP_DONTUNMAP))) + dontunmap_complete(vrm, new_vma); + else + unmap_source_vma(vrm); mm->hiwater_vm = hiwater_vm; - /* Restore VM_ACCOUNT if one or two pieces of vma left */ - if (account_start) { - vma = vma_prev(&vmi); - vm_flags_set(vma, VM_ACCOUNT); - } - - if (account_end) { - vma = vma_next(&vmi); - vm_flags_set(vma, VM_ACCOUNT); - } - - return new_addr; + return err ? (unsigned long)err : vrm->new_addr; } /* * resize_is_valid() - Ensure the vma can be resized to the new length at the give * address. * - * @vma: The vma to resize - * @addr: The old address - * @old_len: The current size - * @new_len: The desired size - * @flags: The vma flags - * * Return 0 on success, error otherwise. */ -static int resize_is_valid(struct vm_area_struct *vma, unsigned long addr, - unsigned long old_len, unsigned long new_len, unsigned long flags) +static int resize_is_valid(struct vma_remap_struct *vrm) { struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = vrm->vma; + unsigned long addr = vrm->addr; + unsigned long old_len = vrm->old_len; + unsigned long new_len = vrm->new_len; unsigned long pgoff; /* @@ -853,11 +1300,12 @@ static int resize_is_valid(struct vm_area_struct *vma, unsigned long addr, * behavior. As a result, fail such attempts. */ if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) { - pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", current->comm, current->pid); + pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", + current->comm, current->pid); return -EINVAL; } - if ((flags & MREMAP_DONTUNMAP) && + if ((vrm->flags & MREMAP_DONTUNMAP) && (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))) return -EINVAL; @@ -877,118 +1325,120 @@ static int resize_is_valid(struct vm_area_struct *vma, unsigned long addr, if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) return -EFAULT; - if (!mlock_future_ok(mm, vma->vm_flags, new_len - old_len)) + if (!mlock_future_ok(mm, vma->vm_flags, vrm->delta)) return -EAGAIN; - if (!may_expand_vm(mm, vma->vm_flags, - (new_len - old_len) >> PAGE_SHIFT)) + if (!may_expand_vm(mm, vma->vm_flags, vrm->delta >> PAGE_SHIFT)) return -ENOMEM; return 0; } /* - * mremap_to() - remap a vma to a new location - * @addr: The old address - * @old_len: The old size - * @new_addr: The target address - * @new_len: The new size - * @locked: If the returned vma is locked (VM_LOCKED) - * @flags: the mremap flags - * @uf: The mremap userfaultfd context - * @uf_unmap_early: The userfaultfd unmap early context - * @uf_unmap: The userfaultfd unmap context + * The user has requested that the VMA be shrunk (i.e., old_len > new_len), so + * execute this, optionally dropping the mmap lock when we do so. * - * Returns: The new address of the vma or an error. + * In both cases this invalidates the VMA, however if we don't drop the lock, + * then load the correct VMA into vrm->vma afterwards. */ -static unsigned long mremap_to(unsigned long addr, unsigned long old_len, - unsigned long new_addr, unsigned long new_len, bool *locked, - unsigned long flags, struct vm_userfaultfd_ctx *uf, - struct list_head *uf_unmap_early, - struct list_head *uf_unmap) +static unsigned long shrink_vma(struct vma_remap_struct *vrm, + bool drop_lock) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long ret; - unsigned long map_flags = 0; + unsigned long unmap_start = vrm->addr + vrm->new_len; + unsigned long unmap_bytes = vrm->delta; + unsigned long res; + VMA_ITERATOR(vmi, mm, unmap_start); - if (offset_in_page(new_addr)) - return -EINVAL; + VM_BUG_ON(vrm->remap_type != MREMAP_SHRINK); - if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) - return -EINVAL; - - /* Ensure the old/new locations do not overlap */ - if (addr + old_len > new_addr && new_addr + new_len > addr) - return -EINVAL; + res = do_vmi_munmap(&vmi, mm, unmap_start, unmap_bytes, + vrm->uf_unmap, drop_lock); + vrm->vma = NULL; /* Invalidated. */ + if (res) + return res; /* - * move_vma() need us to stay 4 maps below the threshold, otherwise - * it will bail out at the very beginning. - * That is a problem if we have already unmaped the regions here - * (new_addr, and old_addr), because userspace will not know the - * state of the vma's after it gets -ENOMEM. - * So, to avoid such scenario we can pre-compute if the whole - * operation has high chances to success map-wise. - * Worst-scenario case is when both vma's (new_addr and old_addr) get - * split in 3 before unmapping it. - * That means 2 more maps (1 for each) to the ones we already hold. - * Check whether current map count plus 2 still leads us to 4 maps below - * the threshold, otherwise return -ENOMEM here to be more safe. + * If we've not dropped the lock, then we should reload the VMA to + * replace the invalidated VMA with the one that may have now been + * split. */ - if ((mm->map_count + 2) >= sysctl_max_map_count - 3) - return -ENOMEM; + if (drop_lock) { + vrm->mmap_locked = false; + } else { + vrm->vma = vma_lookup(mm, vrm->addr); + if (!vrm->vma) + return -EFAULT; + } + + return 0; +} + +/* + * mremap_to() - remap a vma to a new location. + * Returns: The new address of the vma or an error. + */ +static unsigned long mremap_to(struct vma_remap_struct *vrm) +{ + struct mm_struct *mm = current->mm; + unsigned long err; + + /* Is the new length or address silly? */ + if (vrm->new_len > TASK_SIZE || + vrm->new_addr > TASK_SIZE - vrm->new_len) + return -EINVAL; + + if (vrm_overlaps(vrm)) + return -EINVAL; - if (flags & MREMAP_FIXED) { + if (vrm->flags & MREMAP_FIXED) { /* * In mremap_to(). * VMA is moved to dst address, and munmap dst first. * do_munmap will check if dst is sealed. */ - ret = do_munmap(mm, new_addr, new_len, uf_unmap_early); - if (ret) - return ret; - } + err = do_munmap(mm, vrm->new_addr, vrm->new_len, + vrm->uf_unmap_early); + vrm->vma = NULL; /* Invalidated. */ + if (err) + return err; - if (old_len > new_len) { - ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap); - if (ret) - return ret; - old_len = new_len; + /* + * If we remap a portion of a VMA elsewhere in the same VMA, + * this can invalidate the old VMA. Reset. + */ + vrm->vma = vma_lookup(mm, vrm->addr); + if (!vrm->vma) + return -EFAULT; } - vma = vma_lookup(mm, addr); - if (!vma) - return -EFAULT; - - ret = resize_is_valid(vma, addr, old_len, new_len, flags); - if (ret) - return ret; + if (vrm->remap_type == MREMAP_SHRINK) { + err = shrink_vma(vrm, /* drop_lock= */false); + if (err) + return err; - /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */ - if (flags & MREMAP_DONTUNMAP && - !may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) { - return -ENOMEM; + /* Set up for the move now shrink has been executed. */ + vrm->old_len = vrm->new_len; } - if (flags & MREMAP_FIXED) - map_flags |= MAP_FIXED; + err = resize_is_valid(vrm); + if (err) + return err; - if (vma->vm_flags & VM_MAYSHARE) - map_flags |= MAP_SHARED; + /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */ + if (vrm->flags & MREMAP_DONTUNMAP) { + vm_flags_t vm_flags = vrm->vma->vm_flags; + unsigned long pages = vrm->old_len >> PAGE_SHIFT; - ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff + - ((addr - vma->vm_start) >> PAGE_SHIFT), - map_flags); - if (IS_ERR_VALUE(ret)) - return ret; + if (!may_expand_vm(mm, vm_flags, pages)) + return -ENOMEM; + } - /* We got a new mapping */ - if (!(flags & MREMAP_FIXED)) - new_addr = ret; + err = vrm_set_new_addr(vrm); + if (err) + return err; - return move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, - uf, uf_unmap); + return move_vma(vrm); } static int vma_expandable(struct vm_area_struct *vma, unsigned long delta) @@ -1005,215 +1455,329 @@ static int vma_expandable(struct vm_area_struct *vma, unsigned long delta) return 1; } +/* Determine whether we are actually able to execute an in-place expansion. */ +static bool vrm_can_expand_in_place(struct vma_remap_struct *vrm) +{ + /* Number of bytes from vrm->addr to end of VMA. */ + unsigned long suffix_bytes = vrm->vma->vm_end - vrm->addr; + + /* If end of range aligns to end of VMA, we can just expand in-place. */ + if (suffix_bytes != vrm->old_len) + return false; + + /* Check whether this is feasible. */ + if (!vma_expandable(vrm->vma, vrm->delta)) + return false; + + return true; +} + /* - * Expand (or shrink) an existing mapping, potentially moving it at the - * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) - * - * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise - * This option implies MREMAP_MAYMOVE. + * Are the parameters passed to mremap() valid? If so return 0, otherwise return + * error. */ -SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, - unsigned long, new_len, unsigned long, flags, - unsigned long, new_addr) +static unsigned long check_mremap_params(struct vma_remap_struct *vrm) + { - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long ret = -EINVAL; - bool locked = false; - struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; - LIST_HEAD(uf_unmap_early); - LIST_HEAD(uf_unmap); + unsigned long addr = vrm->addr; + unsigned long flags = vrm->flags; + + /* Ensure no unexpected flag values. */ + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP)) + return -EINVAL; + + /* Start address must be page-aligned. */ + if (offset_in_page(addr)) + return -EINVAL; /* - * There is a deliberate asymmetry here: we strip the pointer tag - * from the old address but leave the new address alone. This is - * for consistency with mmap(), where we prevent the creation of - * aliasing mappings in userspace by leaving the tag bits of the - * mapping address intact. A non-zero tag will cause the subsequent - * range checks to reject the address as invalid. - * - * See Documentation/arch/arm64/tagged-address-abi.rst for more - * information. + * We allow a zero old-len as a special case + * for DOS-emu "duplicate shm area" thing. But + * a zero new-len is nonsensical. */ - addr = untagged_addr(addr); + if (!PAGE_ALIGN(vrm->new_len)) + return -EINVAL; - if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP)) - return ret; + /* Remainder of checks are for cases with specific new_addr. */ + if (!vrm_implies_new_addr(vrm)) + return 0; - if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE)) - return ret; + /* The new address must be page-aligned. */ + if (offset_in_page(vrm->new_addr)) + return -EINVAL; + + /* A fixed address implies a move. */ + if (!(flags & MREMAP_MAYMOVE)) + return -EINVAL; + + /* MREMAP_DONTUNMAP does not allow resizing in the process. */ + if (flags & MREMAP_DONTUNMAP && vrm->old_len != vrm->new_len) + return -EINVAL; /* - * MREMAP_DONTUNMAP is always a move and it does not allow resizing - * in the process. + * move_vma() need us to stay 4 maps below the threshold, otherwise + * it will bail out at the very beginning. + * That is a problem if we have already unmaped the regions here + * (new_addr, and old_addr), because userspace will not know the + * state of the vma's after it gets -ENOMEM. + * So, to avoid such scenario we can pre-compute if the whole + * operation has high chances to success map-wise. + * Worst-scenario case is when both vma's (new_addr and old_addr) get + * split in 3 before unmapping it. + * That means 2 more maps (1 for each) to the ones we already hold. + * Check whether current map count plus 2 still leads us to 4 maps below + * the threshold, otherwise return -ENOMEM here to be more safe. */ - if (flags & MREMAP_DONTUNMAP && - (!(flags & MREMAP_MAYMOVE) || old_len != new_len)) - return ret; + if ((current->mm->map_count + 2) >= sysctl_max_map_count - 3) + return -ENOMEM; + return 0; +} - if (offset_in_page(addr)) - return ret; +/* + * We know we can expand the VMA in-place by delta pages, so do so. + * + * If we discover the VMA is locked, update mm_struct statistics accordingly and + * indicate so to the caller. + */ +static unsigned long expand_vma_in_place(struct vma_remap_struct *vrm) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = vrm->vma; + VMA_ITERATOR(vmi, mm, vma->vm_end); - old_len = PAGE_ALIGN(old_len); - new_len = PAGE_ALIGN(new_len); + if (!vrm_charge(vrm)) + return -ENOMEM; /* - * We allow a zero old-len as a special case - * for DOS-emu "duplicate shm area" thing. But - * a zero new-len is nonsensical. + * Function vma_merge_extend() is called on the + * extension we are adding to the already existing vma, + * vma_merge_extend() will merge this extension with the + * already existing vma (expand operation itself) and + * possibly also with the next vma if it becomes + * adjacent to the expanded vma and otherwise + * compatible. */ - if (!new_len) - return ret; - - if (mmap_write_lock_killable(current->mm)) - return -EINTR; - vma = vma_lookup(mm, addr); + vma = vma_merge_extend(&vmi, vma, vrm->delta); if (!vma) { - ret = -EFAULT; - goto out; + vrm_uncharge(vrm); + return -ENOMEM; } + vrm->vma = vma; - /* Don't allow remapping vmas when they have already been sealed */ - if (!can_modify_vma(vma)) { - ret = -EPERM; - goto out; - } + vrm_stat_account(vrm, vrm->delta); - if (is_vm_hugetlb_page(vma)) { - struct hstate *h __maybe_unused = hstate_vma(vma); + return 0; +} + +static bool align_hugetlb(struct vma_remap_struct *vrm) +{ + struct hstate *h __maybe_unused = hstate_vma(vrm->vma); + + vrm->old_len = ALIGN(vrm->old_len, huge_page_size(h)); + vrm->new_len = ALIGN(vrm->new_len, huge_page_size(h)); + + /* addrs must be huge page aligned */ + if (vrm->addr & ~huge_page_mask(h)) + return false; + if (vrm->new_addr & ~huge_page_mask(h)) + return false; + + /* + * Don't allow remap expansion, because the underlying hugetlb + * reservation is not yet capable to handle split reservation. + */ + if (vrm->new_len > vrm->old_len) + return false; - old_len = ALIGN(old_len, huge_page_size(h)); - new_len = ALIGN(new_len, huge_page_size(h)); + vrm_set_delta(vrm); - /* addrs must be huge page aligned */ - if (addr & ~huge_page_mask(h)) - goto out; - if (new_addr & ~huge_page_mask(h)) - goto out; + return true; +} + +/* + * We are mremap()'ing without specifying a fixed address to move to, but are + * requesting that the VMA's size be increased. + * + * Try to do so in-place, if this fails, then move the VMA to a new location to + * action the change. + */ +static unsigned long expand_vma(struct vma_remap_struct *vrm) +{ + unsigned long err; + unsigned long addr = vrm->addr; + + err = resize_is_valid(vrm); + if (err) + return err; + + /* + * [addr, old_len) spans precisely to the end of the VMA, so try to + * expand it in-place. + */ + if (vrm_can_expand_in_place(vrm)) { + err = expand_vma_in_place(vrm); + if (err) + return err; /* - * Don't allow remap expansion, because the underlying hugetlb - * reservation is not yet capable to handle split reservation. + * We want to populate the newly expanded portion of the VMA to + * satisfy the expectation that mlock()'ing a VMA maintains all + * of its pages in memory. */ - if (new_len > old_len) - goto out; - } + if (vrm->mlocked) + vrm->new_addr = addr; - if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) { - ret = mremap_to(addr, old_len, new_addr, new_len, - &locked, flags, &uf, &uf_unmap_early, - &uf_unmap); - goto out; + /* OK we're done! */ + return addr; } /* - * Always allow a shrinking remap: that just unmaps - * the unnecessary pages.. - * do_vmi_munmap does all the needed commit accounting, and - * unlocks the mmap_lock if so directed. + * We weren't able to just expand or shrink the area, + * we need to create a new one and move it. */ - if (old_len >= new_len) { - VMA_ITERATOR(vmi, mm, addr + new_len); - if (old_len == new_len) { - ret = addr; - goto out; - } + /* We're not allowed to move the VMA, so error out. */ + if (!(vrm->flags & MREMAP_MAYMOVE)) + return -ENOMEM; + + /* Find a new location to move the VMA to. */ + err = vrm_set_new_addr(vrm); + if (err) + return err; + + return move_vma(vrm); +} + +/* + * Attempt to resize the VMA in-place, if we cannot, then move the VMA to the + * first available address to perform the operation. + */ +static unsigned long mremap_at(struct vma_remap_struct *vrm) +{ + unsigned long res; - ret = do_vmi_munmap(&vmi, mm, addr + new_len, old_len - new_len, - &uf_unmap, true); - if (ret) - goto out; + switch (vrm->remap_type) { + case MREMAP_INVALID: + break; + case MREMAP_NO_RESIZE: + /* NO-OP CASE - resizing to the same size. */ + return vrm->addr; + case MREMAP_SHRINK: + /* + * SHRINK CASE. Can always be done in-place. + * + * Simply unmap the shrunken portion of the VMA. This does all + * the needed commit accounting, and we indicate that the mmap + * lock should be dropped. + */ + res = shrink_vma(vrm, /* drop_lock= */true); + if (res) + return res; - ret = addr; - goto out_unlocked; + return vrm->addr; + case MREMAP_EXPAND: + return expand_vma(vrm); } - /* - * Ok, we need to grow.. - */ - ret = resize_is_valid(vma, addr, old_len, new_len, flags); + BUG(); +} + +static unsigned long do_mremap(struct vma_remap_struct *vrm) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long ret; + + ret = check_mremap_params(vrm); if (ret) - goto out; + return ret; - /* old_len exactly to the end of the area.. - */ - if (old_len == vma->vm_end - addr) { - unsigned long delta = new_len - old_len; - - /* can we just expand the current mapping? */ - if (vma_expandable(vma, delta)) { - long pages = delta >> PAGE_SHIFT; - VMA_ITERATOR(vmi, mm, vma->vm_end); - long charged = 0; - - if (vma->vm_flags & VM_ACCOUNT) { - if (security_vm_enough_memory_mm(mm, pages)) { - ret = -ENOMEM; - goto out; - } - charged = pages; - } + vrm->old_len = PAGE_ALIGN(vrm->old_len); + vrm->new_len = PAGE_ALIGN(vrm->new_len); + vrm_set_delta(vrm); - /* - * Function vma_merge_extend() is called on the - * extension we are adding to the already existing vma, - * vma_merge_extend() will merge this extension with the - * already existing vma (expand operation itself) and - * possibly also with the next vma if it becomes - * adjacent to the expanded vma and otherwise - * compatible. - */ - vma = vma_merge_extend(&vmi, vma, delta); - if (!vma) { - vm_unacct_memory(charged); - ret = -ENOMEM; - goto out; - } + if (mmap_write_lock_killable(mm)) + return -EINTR; + vrm->mmap_locked = true; - vm_stat_account(mm, vma->vm_flags, pages); - if (vma->vm_flags & VM_LOCKED) { - mm->locked_vm += pages; - locked = true; - new_addr = addr; - } - ret = addr; - goto out; - } + vma = vrm->vma = vma_lookup(mm, vrm->addr); + if (!vma) { + ret = -EFAULT; + goto out; } - /* - * We weren't able to just expand or shrink the area, - * we need to create a new one and move it.. - */ - ret = -ENOMEM; - if (flags & MREMAP_MAYMOVE) { - unsigned long map_flags = 0; - if (vma->vm_flags & VM_MAYSHARE) - map_flags |= MAP_SHARED; - - new_addr = get_unmapped_area(vma->vm_file, 0, new_len, - vma->vm_pgoff + - ((addr - vma->vm_start) >> PAGE_SHIFT), - map_flags); - if (IS_ERR_VALUE(new_addr)) { - ret = new_addr; - goto out; - } + /* If mseal()'d, mremap() is prohibited. */ + if (!can_modify_vma(vma)) { + ret = -EPERM; + goto out; + } - ret = move_vma(vma, addr, old_len, new_len, new_addr, - &locked, flags, &uf, &uf_unmap); + /* Align to hugetlb page size, if required. */ + if (is_vm_hugetlb_page(vma) && !align_hugetlb(vrm)) { + ret = -EINVAL; + goto out; } + + vrm->remap_type = vrm_remap_type(vrm); + + /* Actually execute mremap. */ + ret = vrm_implies_new_addr(vrm) ? mremap_to(vrm) : mremap_at(vrm); + out: - if (offset_in_page(ret)) - locked = false; - mmap_write_unlock(current->mm); - if (locked && new_len > old_len) - mm_populate(new_addr + old_len, new_len - old_len); -out_unlocked: - userfaultfd_unmap_complete(mm, &uf_unmap_early); - mremap_userfaultfd_complete(&uf, addr, ret, old_len); - userfaultfd_unmap_complete(mm, &uf_unmap); + if (vrm->mmap_locked) { + mmap_write_unlock(mm); + vrm->mmap_locked = false; + + if (!offset_in_page(ret) && vrm->mlocked && vrm->new_len > vrm->old_len) + mm_populate(vrm->new_addr + vrm->old_len, vrm->delta); + } + + userfaultfd_unmap_complete(mm, vrm->uf_unmap_early); + mremap_userfaultfd_complete(vrm->uf, vrm->addr, ret, vrm->old_len); + userfaultfd_unmap_complete(mm, vrm->uf_unmap); + return ret; } + +/* + * Expand (or shrink) an existing mapping, potentially moving it at the + * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) + * + * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise + * This option implies MREMAP_MAYMOVE. + */ +SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, + unsigned long, new_len, unsigned long, flags, + unsigned long, new_addr) +{ + struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; + LIST_HEAD(uf_unmap_early); + LIST_HEAD(uf_unmap); + /* + * There is a deliberate asymmetry here: we strip the pointer tag + * from the old address but leave the new address alone. This is + * for consistency with mmap(), where we prevent the creation of + * aliasing mappings in userspace by leaving the tag bits of the + * mapping address intact. A non-zero tag will cause the subsequent + * range checks to reject the address as invalid. + * + * See Documentation/arch/arm64/tagged-address-abi.rst for more + * information. + */ + struct vma_remap_struct vrm = { + .addr = untagged_addr(addr), + .old_len = old_len, + .new_len = new_len, + .flags = flags, + .new_addr = new_addr, + + .uf = &uf, + .uf_unmap_early = &uf_unmap_early, + .uf_unmap = &uf_unmap, + + .remap_type = MREMAP_INVALID, /* We set later. */ + }; + + return do_mremap(&vrm); +} diff --git a/mm/mseal.c b/mm/mseal.c index 81d6e980e8a9..c27197ac04e8 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -217,9 +217,9 @@ int do_mseal(unsigned long start, size_t len_in, unsigned long flags) unsigned long end; struct mm_struct *mm = current->mm; - ret = can_do_mseal(flags); - if (ret) - return ret; + /* Verify flags not set. */ + if (flags) + return -EINVAL; start = untagged_addr(start); if (!PAGE_ALIGNED(start)) diff --git a/mm/nommu.c b/mm/nommu.c index 9cb6e99215e2..b624acec6d2e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -42,18 +42,11 @@ #include <asm/mmu_context.h> #include "internal.h" -void *high_memory; -EXPORT_SYMBOL(high_memory); -struct page *mem_map; -unsigned long max_mapnr; -EXPORT_SYMBOL(max_mapnr); unsigned long highest_memmap_pfn; -int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; int heap_stack_gap = 0; atomic_long_t mmap_pages_allocated; -EXPORT_SYMBOL(mem_map); /* list of mapped, potentially shareable regions */ static struct kmem_cache *vm_region_jar; @@ -207,7 +200,23 @@ void *vmalloc_noprof(unsigned long size) } EXPORT_SYMBOL(vmalloc_noprof); -void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc_noprof); +/* + * vmalloc_huge_node - allocate virtually contiguous memory, on a node + * + * @size: allocation size + * @gfp_mask: flags for the page level allocator + * @node: node to use for allocation or NUMA_NO_NODE + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * + * Due to NOMMU implications the node argument and HUGE page attribute is + * ignored. + */ +void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node) +{ + return __vmalloc_noprof(size, gfp_mask); +} /* * vzalloc - allocate virtually contiguous memory with zero fill @@ -392,8 +401,22 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) return mm->brk = brk; } +static int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; + +static const struct ctl_table nommu_table[] = { + { + .procname = "nr_trim_pages", + .data = &sysctl_nr_trim_pages, + .maxlen = sizeof(sysctl_nr_trim_pages), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +}; + /* - * initialise the percpu counter for VM and region record slabs + * initialise the percpu counter for VM and region record slabs, initialise VMA + * state. */ void __init mmap_init(void) { @@ -402,6 +425,8 @@ void __init mmap_init(void) ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT); + register_sysctl_init("vm", nommu_table); + vma_state_init(); } /* @@ -620,22 +645,6 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) EXPORT_SYMBOL(find_vma); /* - * At least xtensa ends up having protection faults even with no - * MMU.. No stack expansion, at least. - */ -struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, - unsigned long addr, struct pt_regs *regs) -{ - struct vm_area_struct *vma; - - mmap_read_lock(mm); - vma = vma_lookup(mm, addr); - if (!vma) - mmap_read_unlock(mm); - return vma; -} - -/* * expand a stack to a given address * - not supported under NOMMU conditions */ @@ -1191,7 +1200,7 @@ share: setup_vma_to_mm(vma, current->mm); current->mm->map_count++; /* add the VMA to the tree */ - vma_iter_store(&vmi, vma); + vma_iter_store_new(&vmi, vma); /* we flush the region from the icache only when the first executable * mapping of it is made */ @@ -1356,7 +1365,7 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, setup_vma_to_mm(vma, mm); setup_vma_to_mm(new, mm); - vma_iter_store(vmi, new); + vma_iter_store_new(vmi, new); mm->map_count++; return 0; @@ -1701,6 +1710,85 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in } EXPORT_SYMBOL_GPL(access_process_vm); +#ifdef CONFIG_BPF_SYSCALL +/* + * Copy a string from another process's address space as given in mm. + * If there is any error return -EFAULT. + */ +static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr, + void *buf, int len) +{ + unsigned long addr_end; + struct vm_area_struct *vma; + int ret = -EFAULT; + + *(char *)buf = '\0'; + + if (mmap_read_lock_killable(mm)) + return ret; + + /* the access must start within one of the target process's mappings */ + vma = find_vma(mm, addr); + if (!vma) + goto out; + + if (check_add_overflow(addr, len, &addr_end)) + goto out; + + /* don't overrun this mapping */ + if (addr_end > vma->vm_end) + len = vma->vm_end - addr; + + /* only read mappings where it is permitted */ + if (vma->vm_flags & VM_MAYREAD) { + ret = strscpy(buf, (char *)addr, len); + if (ret < 0) + ret = len - 1; + } + +out: + mmap_read_unlock(mm); + return ret; +} + +/** + * copy_remote_vm_str - copy a string from another process's address space. + * @tsk: the task of the target address space + * @addr: start address to read from + * @buf: destination buffer + * @len: number of bytes to copy + * @gup_flags: flags modifying lookup behaviour (unused) + * + * The caller must hold a reference on @mm. + * + * Return: number of bytes copied from @addr (source) to @buf (destination); + * not including the trailing NUL. Always guaranteed to leave NUL-terminated + * buffer. On any error, return -EFAULT. + */ +int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr, + void *buf, int len, unsigned int gup_flags) +{ + struct mm_struct *mm; + int ret; + + if (unlikely(len == 0)) + return 0; + + mm = get_task_mm(tsk); + if (!mm) { + *(char *)buf = '\0'; + return -EFAULT; + } + + ret = __copy_remote_vm_str(mm, addr, buf, len); + + mmput(mm); + + return ret; +} +EXPORT_SYMBOL_GPL(copy_remote_vm_str); +#endif /* CONFIG_BPF_SYSCALL */ + /** * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode * @inode: The inode to check @@ -1804,3 +1892,11 @@ static int __meminit init_admin_reserve(void) return 0; } subsys_initcall(init_admin_reserve); + +int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) +{ + mmap_write_lock(oldmm); + dup_mm_exe_file(mm, oldmm); + mmap_write_unlock(oldmm); + return 0; +} diff --git a/mm/numa.c b/mm/numa.c index e2eec07707d1..7d5e06fe5bd4 100644 --- a/mm/numa.c +++ b/mm/numa.c @@ -13,7 +13,6 @@ void __init alloc_node_data(int nid) { const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES); u64 nd_pa; - void *nd; int tnid; /* Allocate node data. Try node-local memory and then any node. */ @@ -21,7 +20,6 @@ void __init alloc_node_data(int nid) if (!nd_pa) panic("Cannot allocate %zu bytes for node %d data\n", nd_size, nid); - nd = __va(nd_pa); /* report and initialize */ pr_info("NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid, @@ -30,20 +28,14 @@ void __init alloc_node_data(int nid) if (tnid != nid) pr_info(" NODE_DATA(%d) on node %d\n", nid, tnid); - node_data[nid] = nd; + node_data[nid] = __va(nd_pa); memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); } void __init alloc_offline_node_data(int nid) { pg_data_t *pgdat; - - pgdat = memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); - if (!pgdat) - panic("Cannot allocate %zuB for node %d.\n", - sizeof(*pgdat), nid); - - node_data[nid] = pgdat; + node_data[nid] = memblock_alloc_or_panic(sizeof(*pgdat), SMP_CACHE_BYTES); } /* Stub functions: */ diff --git a/mm/numa_emulation.c b/mm/numa_emulation.c index 031fb9961bf7..9d55679d99ce 100644 --- a/mm/numa_emulation.c +++ b/mm/numa_emulation.c @@ -8,11 +8,12 @@ #include <linux/memblock.h> #include <linux/numa_memblks.h> #include <asm/numa.h> +#include <acpi/acpi_numa.h> #define FAKE_NODE_MIN_SIZE ((u64)32 << 20) #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) -static int emu_nid_to_phys[MAX_NUMNODES]; +int emu_nid_to_phys[MAX_NUMNODES]; static char *emu_cmdline __initdata; int __init numa_emu_cmdline(char *str) @@ -379,6 +380,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); int max_emu_nid, dfl_phys_nid; int i, j, ret; + nodemask_t physnode_mask = numa_nodes_parsed; if (!emu_cmdline) goto no_emu; @@ -395,7 +397,6 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) * split the system RAM into N fake nodes. */ if (strchr(emu_cmdline, 'U')) { - nodemask_t physnode_mask = numa_nodes_parsed; unsigned long n; int nid = 0; @@ -465,9 +466,6 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) */ max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid); - /* commit */ - *numa_meminfo = ei; - /* Make sure numa_nodes_parsed only contains emulated nodes */ nodes_clear(numa_nodes_parsed); for (i = 0; i < ARRAY_SIZE(ei.blk); i++) @@ -475,10 +473,21 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) ei.blk[i].nid != NUMA_NO_NODE) node_set(ei.blk[i].nid, numa_nodes_parsed); - numa_emu_update_cpu_to_node(emu_nid_to_phys, ARRAY_SIZE(emu_nid_to_phys)); + /* fix pxm_to_node_map[] and node_to_pxm_map[] to avoid collision + * with faked numa nodes, particularly during later memory hotplug + * handling, and also update numa_nodes_parsed accordingly. + */ + ret = fix_pxm_node_maps(max_emu_nid); + if (ret < 0) + goto no_emu; + + /* commit */ + *numa_meminfo = ei; + + numa_emu_update_cpu_to_node(emu_nid_to_phys, max_emu_nid + 1); /* make sure all emulated nodes are mapped to a physical node */ - for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) + for (i = 0; i < max_emu_nid + 1; i++) if (emu_nid_to_phys[i] == NUMA_NO_NODE) emu_nid_to_phys[i] = dfl_phys_nid; @@ -501,12 +510,34 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) numa_set_distance(i, j, dist); } } + for (i = 0; i < numa_distance_cnt; i++) { + for (j = 0; j < numa_distance_cnt; j++) { + int physi, physj; + u8 dist; + + /* distance between fake nodes is already ok */ + if (emu_nid_to_phys[i] != NUMA_NO_NODE && + emu_nid_to_phys[j] != NUMA_NO_NODE) + continue; + if (emu_nid_to_phys[i] != NUMA_NO_NODE) + physi = emu_nid_to_phys[i]; + else + physi = i - max_emu_nid; + if (emu_nid_to_phys[j] != NUMA_NO_NODE) + physj = emu_nid_to_phys[j]; + else + physj = j - max_emu_nid; + dist = phys_dist[physi * numa_dist_cnt + physj]; + numa_set_distance(i, j, dist); + } + } /* free the copied physical distance table */ memblock_free(phys_dist, phys_size); return; no_emu: + numa_nodes_parsed = physnode_mask; /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) emu_nid_to_phys[i] = i; diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c index a3877e9bc878..541a99c4071a 100644 --- a/mm/numa_memblks.c +++ b/mm/numa_memblks.c @@ -7,7 +7,7 @@ #include <linux/numa.h> #include <linux/numa_memblks.h> -static int numa_distance_cnt; +int numa_distance_cnt; static u8 *numa_distance; nodemask_t numa_nodes_parsed __initdata; @@ -201,6 +201,28 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) } /** + * numa_add_reserved_memblk - Add one numa_memblk to numa_reserved_meminfo + * @nid: NUMA node ID of the new memblk + * @start: Start address of the new memblk + * @end: End address of the new memblk + * + * Add a new memblk to the numa_reserved_meminfo. + * + * Usage Case: numa_cleanup_meminfo() reconciles all numa_memblk instances + * against memblock_type information and moves any that intersect reserved + * ranges to numa_reserved_meminfo. However, when that information is known + * ahead of time, we use numa_add_reserved_memblk() to add the numa_memblk + * to numa_reserved_meminfo directly. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init numa_add_reserved_memblk(int nid, u64 start, u64 end) +{ + return numa_add_memblk_to(nid, start, end, &numa_reserved_meminfo); +} + +/** * numa_cleanup_meminfo - Cleanup a numa_meminfo * @mi: numa_meminfo to clean up * diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1c485beb0b93..25923cfec9c6 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -44,6 +44,7 @@ #include <linux/init.h> #include <linux/mmu_notifier.h> #include <linux/cred.h> +#include <linux/nmi.h> #include <asm/tlb.h> #include "internal.h" @@ -430,10 +431,15 @@ static void dump_tasks(struct oom_control *oc) mem_cgroup_scan_tasks(oc->memcg, dump_task, oc); else { struct task_struct *p; + int i = 0; rcu_read_lock(); - for_each_process(p) + for_each_process(p) { + /* Avoid potential softlockup warning */ + if ((++i & 1023) == 0) + touch_softlockup_watchdog(); dump_task(p, oc); + } rcu_read_unlock(); } } @@ -557,7 +563,7 @@ static bool __oom_reap_task_mm(struct mm_struct *mm) } /* - * Reaps the address space of the give task. + * Reaps the address space of the given task. * * Returns true on success and false if none or part of the address space * has been reclaimed and the caller should retry later. @@ -699,7 +705,7 @@ static void queue_oom_reaper(struct task_struct *tsk) } #ifdef CONFIG_SYSCTL -static struct ctl_table vm_oom_kill_table[] = { +static const struct ctl_table vm_oom_kill_table[] = { { .procname = "panic_on_oom", .data = &sysctl_panic_on_oom, diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d213ead95675..72b0ff0d4bae 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -41,6 +41,7 @@ #include <trace/events/writeback.h> #include "internal.h" +#include "swap.h" /* * Sleep at most 200ms at a time in balance_dirty_pages(). @@ -120,29 +121,6 @@ EXPORT_SYMBOL(laptop_mode); struct wb_domain global_wb_domain; -/* consolidated parameters for balance_dirty_pages() and its subroutines */ -struct dirty_throttle_control { -#ifdef CONFIG_CGROUP_WRITEBACK - struct wb_domain *dom; - struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */ -#endif - struct bdi_writeback *wb; - struct fprop_local_percpu *wb_completions; - - unsigned long avail; /* dirtyable */ - unsigned long dirty; /* file_dirty + write + nfs */ - unsigned long thresh; /* dirty threshold */ - unsigned long bg_thresh; /* dirty background threshold */ - - unsigned long wb_dirty; /* per-wb counterparts */ - unsigned long wb_thresh; - unsigned long wb_bg_thresh; - - unsigned long pos_ratio; - bool freerun; - bool dirty_exceeded; -}; - /* * Length of period for aging writeout fractions of bdis. This is an * arbitrarily chosen number. The longer the period, the slower fractions will @@ -543,8 +521,8 @@ static int dirty_ratio_handler(const struct ctl_table *table, int write, void *b ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { - writeback_set_ratelimit(); vm_dirty_bytes = 0; + writeback_set_ratelimit(); } return ret; } @@ -630,7 +608,7 @@ EXPORT_SYMBOL_GPL(wb_writeout_inc); */ static void writeout_period(struct timer_list *t) { - struct wb_domain *dom = from_timer(dom, t, period_timer); + struct wb_domain *dom = timer_container_of(dom, t, period_timer); int miss_periods = (jiffies - dom->period_time) / VM_COMPLETIONS_PERIOD_LEN; @@ -663,7 +641,7 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp) #ifdef CONFIG_CGROUP_WRITEBACK void wb_domain_exit(struct wb_domain *dom) { - del_timer_sync(&dom->period_timer); + timer_delete_sync(&dom->period_timer); fprop_global_destroy(&dom->completions); } #endif @@ -692,6 +670,8 @@ static unsigned long bdi_ratio_from_pages(unsigned long pages) unsigned long ratio; global_dirty_limits(&background_thresh, &dirty_thresh); + if (!dirty_thresh) + return -EINVAL; ratio = div64_u64(pages * 100ULL * BDI_RATIO_SCALE, dirty_thresh); return ratio; @@ -790,13 +770,15 @@ int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes) { int ret; unsigned long pages = min_bytes >> PAGE_SHIFT; - unsigned long min_ratio; + long min_ratio; ret = bdi_check_pages_limit(pages); if (ret) return ret; min_ratio = bdi_ratio_from_pages(pages); + if (min_ratio < 0) + return min_ratio; return __bdi_set_min_ratio(bdi, min_ratio); } @@ -809,13 +791,15 @@ int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes) { int ret; unsigned long pages = max_bytes >> PAGE_SHIFT; - unsigned long max_ratio; + long max_ratio; ret = bdi_check_pages_limit(pages); if (ret) return ret; max_ratio = bdi_ratio_from_pages(pages); + if (max_ratio < 0) + return max_ratio; return __bdi_set_max_ratio(bdi, max_ratio); } @@ -936,26 +920,25 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc, wb_min_max_ratio(wb, &wb_min_ratio, &wb_max_ratio); wb_thresh += (thresh * wb_min_ratio) / (100 * BDI_RATIO_SCALE); - wb_max_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE); - if (wb_thresh > wb_max_thresh) - wb_thresh = wb_max_thresh; /* - * With strictlimit flag, the wb_thresh is treated as - * a hard limit in balance_dirty_pages() and wb_position_ratio(). - * It's possible that wb_thresh is close to zero, not because - * the device is slow, but because it has been inactive. - * To prevent occasional writes from being blocked, we raise wb_thresh. + * It's very possible that wb_thresh is close to 0 not because the + * device is slow, but that it has remained inactive for long time. + * Honour such devices a reasonable good (hopefully IO efficient) + * threshold, so that the occasional writes won't be blocked and active + * writes can rampup the threshold quickly. */ - if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { - unsigned long limit = hard_dirty_limit(dom, dtc->thresh); - u64 wb_scale_thresh = 0; - - if (limit > dtc->dirty) - wb_scale_thresh = (limit - dtc->dirty) / 100; - wb_thresh = max(wb_thresh, min(wb_scale_thresh, wb_max_thresh / 4)); + if (thresh > dtc->dirty) { + if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) + wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 100); + else + wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 8); } + wb_max_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE); + if (wb_thresh > wb_max_thresh) + wb_thresh = wb_max_thresh; + return wb_thresh; } @@ -963,6 +946,7 @@ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh) { struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; + domain_dirty_avail(&gdtc, true); return __wb_calc_thresh(&gdtc, thresh); } @@ -1089,7 +1073,7 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc) struct bdi_writeback *wb = dtc->wb; unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); - unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); + unsigned long limit = dtc->limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); unsigned long wb_thresh = dtc->wb_thresh; unsigned long x_intercept; unsigned long setpoint; /* dirty pages' target balance point */ @@ -1139,12 +1123,6 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc) if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { long long wb_pos_ratio; - if (dtc->wb_dirty < 8) { - dtc->pos_ratio = min_t(long long, pos_ratio * 2, - 2 << RATELIMIT_CALC_SHIFT); - return; - } - if (dtc->wb_dirty >= wb_thresh) return; @@ -1216,14 +1194,6 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc) if (unlikely(wb_thresh > dtc->thresh)) wb_thresh = dtc->thresh; /* - * It's very possible that wb_thresh is close to 0 not because the - * device is slow, but that it has remained inactive for long time. - * Honour such devices a reasonable good (hopefully IO efficient) - * threshold, so that the occasional writes won't be blocked and active - * writes can rampup the threshold quickly. - */ - wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8); - /* * scale global setpoint to wb's: * wb_setpoint = setpoint * wb_thresh / thresh */ @@ -1478,17 +1448,10 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate). * Hence, to calculate "step" properly, we have to use wb_dirty as * "dirty" and wb_setpoint as "setpoint". - * - * We rampup dirty_ratelimit forcibly if wb_dirty is low because - * it's possible that wb_thresh is close to zero due to inactivity - * of backing device. */ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { dirty = dtc->wb_dirty; - if (dtc->wb_dirty < 8) - setpoint = dtc->wb_dirty + 1; - else - setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2; + setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2; } if (dirty < setpoint) { @@ -1977,11 +1940,7 @@ free_running: */ if (pause < min_pause) { trace_balance_dirty_pages(wb, - sdtc->thresh, - sdtc->bg_thresh, - sdtc->dirty, - sdtc->wb_thresh, - sdtc->wb_dirty, + sdtc, dirty_ratelimit, task_ratelimit, pages_dirtied, @@ -2006,11 +1965,7 @@ free_running: pause: trace_balance_dirty_pages(wb, - sdtc->thresh, - sdtc->bg_thresh, - sdtc->dirty, - sdtc->wb_thresh, - sdtc->wb_dirty, + sdtc, dirty_ratelimit, task_ratelimit, pages_dirtied, @@ -2248,7 +2203,7 @@ static int dirty_writeback_centisecs_handler(const struct ctl_table *table, int void laptop_mode_timer_fn(struct timer_list *t) { struct backing_dev_info *backing_dev_info = - from_timer(backing_dev_info, t, laptop_mode_wb_timer); + timer_container_of(backing_dev_info, t, laptop_mode_wb_timer); wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER); } @@ -2275,7 +2230,7 @@ void laptop_sync_completion(void) rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) - del_timer(&bdi->laptop_mode_wb_timer); + timer_delete(&bdi->laptop_mode_wb_timer); rcu_read_unlock(); } @@ -2313,7 +2268,7 @@ static int page_writeback_cpu_online(unsigned int cpu) /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; -static struct ctl_table vm_page_writeback_sysctls[] = { +static const struct ctl_table vm_page_writeback_sysctls[] = { { .procname = "dirty_background_ratio", .data = &dirty_background_ratio, @@ -2610,11 +2565,11 @@ struct folio *writeback_iter(struct address_space *mapping, if (!folio) { /* * To avoid deadlocks between range_cyclic writeback and callers - * that hold pages in PageWriteback to aggregate I/O until + * that hold folios in writeback to aggregate I/O until * the writeback iteration finishes, we do not loop back to the - * start of the file. Doing so causes a page lock/page + * start of the file. Doing so causes a folio lock/folio * writeback access order inversion - we should only ever lock - * multiple pages in ascending page->index order, and looping + * multiple folios in ascending folio->index order, and looping * back to the start of the file violates that rule and causes * deadlocks. */ @@ -2667,27 +2622,6 @@ int write_cache_pages(struct address_space *mapping, } EXPORT_SYMBOL(write_cache_pages); -static int writeback_use_writepage(struct address_space *mapping, - struct writeback_control *wbc) -{ - struct folio *folio = NULL; - struct blk_plug plug; - int err; - - blk_start_plug(&plug); - while ((folio = writeback_iter(mapping, wbc, folio, &err))) { - err = mapping->a_ops->writepage(&folio->page, wbc); - if (err == AOP_WRITEPAGE_ACTIVATE) { - folio_unlock(folio); - err = 0; - } - mapping_set_error(mapping, err); - } - blk_finish_plug(&plug); - - return err; -} - int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; @@ -2698,14 +2632,11 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) wb = inode_to_wb_wbc(mapping->host, wbc); wb_bandwidth_estimate_start(wb); while (1) { - if (mapping->a_ops->writepages) { + if (mapping->a_ops->writepages) ret = mapping->a_ops->writepages(mapping, wbc); - } else if (mapping->a_ops->writepage) { - ret = writeback_use_writepage(mapping, wbc); - } else { + else /* deal with chardevs and other special files */ ret = 0; - } if (ret != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL) break; @@ -3124,6 +3055,7 @@ void __folio_start_writeback(struct folio *folio, bool keep_write) int access_ret; VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (mapping && mapping_use_writeback_tags(mapping)) { XA_STATE(xas, &mapping->i_pages, folio_index(folio)); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1cb4b8c8886d..2ef3c07266b3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -88,6 +88,9 @@ typedef int __bitwise fpi_t; */ #define FPI_TO_TAIL ((__force fpi_t)BIT(1)) +/* Free the page without taking locks. Rely on trylock only. */ +#define FPI_TRYLOCK ((__force fpi_t)BIT(2)) + /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) @@ -273,6 +276,7 @@ int min_free_kbytes = 1024; int user_min_free_kbytes = -1; static int watermark_boost_factor __read_mostly = 15000; static int watermark_scale_factor = 10; +int defrag_mode; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; @@ -286,7 +290,8 @@ EXPORT_SYMBOL(nr_online_nodes); #endif static bool page_contains_unaccepted(struct page *page, unsigned int order); -static bool cond_accept_memory(struct zone *zone, unsigned int order); +static bool cond_accept_memory(struct zone *zone, unsigned int order, + int alloc_flags); static bool __free_unaccepted(struct page *page); int page_group_by_mobility_disabled __read_mostly; @@ -508,9 +513,9 @@ out: static inline unsigned int order_to_pindex(int migratetype, int order) { - bool __maybe_unused movable; #ifdef CONFIG_TRANSPARENT_HUGEPAGE + bool movable; if (order > PAGE_ALLOC_COSTLY_ORDER) { VM_BUG_ON(order != HPAGE_PMD_ORDER); @@ -614,6 +619,10 @@ compaction_capture(struct capture_control *capc, struct page *page, capc->cc->migratetype != MIGRATE_MOVABLE) return false; + if (migratetype != capc->cc->migratetype) + trace_mm_page_alloc_extfrag(page, capc->cc->order, order, + capc->cc->migratetype, migratetype); + capc->page = page; return true; } @@ -655,16 +664,20 @@ static inline void __add_to_free_list(struct page *page, struct zone *zone, bool tail) { struct free_area *area = &zone->free_area[order]; + int nr_pages = 1 << order; VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, "page type is %lu, passed migratetype is %d (nr=%d)\n", - get_pageblock_migratetype(page), migratetype, 1 << order); + get_pageblock_migratetype(page), migratetype, nr_pages); if (tail) list_add_tail(&page->buddy_list, &area->free_list[migratetype]); else list_add(&page->buddy_list, &area->free_list[migratetype]); area->nr_free++; + + if (order >= pageblock_order && !is_migrate_isolate(migratetype)) + __mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, nr_pages); } /* @@ -676,24 +689,34 @@ static inline void move_to_free_list(struct page *page, struct zone *zone, unsigned int order, int old_mt, int new_mt) { struct free_area *area = &zone->free_area[order]; + int nr_pages = 1 << order; /* Free page moving can fail, so it happens before the type update */ VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt, "page type is %lu, passed migratetype is %d (nr=%d)\n", - get_pageblock_migratetype(page), old_mt, 1 << order); + get_pageblock_migratetype(page), old_mt, nr_pages); list_move_tail(&page->buddy_list, &area->free_list[new_mt]); - account_freepages(zone, -(1 << order), old_mt); - account_freepages(zone, 1 << order, new_mt); + account_freepages(zone, -nr_pages, old_mt); + account_freepages(zone, nr_pages, new_mt); + + if (order >= pageblock_order && + is_migrate_isolate(old_mt) != is_migrate_isolate(new_mt)) { + if (!is_migrate_isolate(old_mt)) + nr_pages = -nr_pages; + __mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, nr_pages); + } } static inline void __del_page_from_free_list(struct page *page, struct zone *zone, unsigned int order, int migratetype) { + int nr_pages = 1 << order; + VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, "page type is %lu, passed migratetype is %d (nr=%d)\n", - get_pageblock_migratetype(page), migratetype, 1 << order); + get_pageblock_migratetype(page), migratetype, nr_pages); /* clear reported state and update reported page count */ if (page_reported(page)) @@ -703,6 +726,9 @@ static inline void __del_page_from_free_list(struct page *page, struct zone *zon __ClearPageBuddy(page); set_page_private(page, 0); zone->free_area[order].nr_free--; + + if (order >= pageblock_order && !is_migrate_isolate(migratetype)) + __mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, -nr_pages); } static inline void del_page_from_free_list(struct page *page, struct zone *zone, @@ -872,9 +898,7 @@ static inline bool page_expected_state(struct page *page, #ifdef CONFIG_MEMCG page->memcg_data | #endif -#ifdef CONFIG_PAGE_POOL - ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) | -#endif + page_pool_page_is_pp(page) | (page->flags & check_flags))) return false; @@ -901,26 +925,18 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) if (unlikely(page->memcg_data)) bad_reason = "page still charged to cgroup"; #endif -#ifdef CONFIG_PAGE_POOL - if (unlikely((page->pp_magic & ~0x3UL) == PP_SIGNATURE)) + if (unlikely(page_pool_page_is_pp(page))) bad_reason = "page_pool leak"; -#endif return bad_reason; } -static void free_page_is_bad_report(struct page *page) -{ - bad_page(page, - page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE)); -} - static inline bool free_page_is_bad(struct page *page) { if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) return false; /* Something has gone sideways, find it */ - free_page_is_bad_report(page); + bad_page(page, page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE)); return true; } @@ -947,21 +963,34 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page) switch (page - head_page) { case 1: /* the first tail page: these may be in place of ->mapping */ - if (unlikely(folio_entire_mapcount(folio))) { - bad_page(page, "nonzero entire_mapcount"); - goto out; - } if (unlikely(folio_large_mapcount(folio))) { bad_page(page, "nonzero large_mapcount"); goto out; } - if (unlikely(atomic_read(&folio->_nr_pages_mapped))) { + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT) && + unlikely(atomic_read(&folio->_nr_pages_mapped))) { bad_page(page, "nonzero nr_pages_mapped"); goto out; } - if (unlikely(atomic_read(&folio->_pincount))) { - bad_page(page, "nonzero pincount"); - goto out; + if (IS_ENABLED(CONFIG_MM_ID)) { + if (unlikely(folio->_mm_id_mapcount[0] != -1)) { + bad_page(page, "nonzero mm mapcount 0"); + goto out; + } + if (unlikely(folio->_mm_id_mapcount[1] != -1)) { + bad_page(page, "nonzero mm mapcount 1"); + goto out; + } + } + if (IS_ENABLED(CONFIG_64BIT)) { + if (unlikely(atomic_read(&folio->_entire_mapcount) + 1)) { + bad_page(page, "nonzero entire_mapcount"); + goto out; + } + if (unlikely(atomic_read(&folio->_pincount))) { + bad_page(page, "nonzero pincount"); + goto out; + } } break; case 2: @@ -970,7 +999,22 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page) bad_page(page, "on deferred list"); goto out; } + if (!IS_ENABLED(CONFIG_64BIT)) { + if (unlikely(atomic_read(&folio->_entire_mapcount) + 1)) { + bad_page(page, "nonzero entire_mapcount"); + goto out; + } + if (unlikely(atomic_read(&folio->_pincount))) { + bad_page(page, "nonzero pincount"); + goto out; + } + } break; + case 3: + /* the third tail page: hugetlb specifics overlap ->mappings */ + if (IS_ENABLED(CONFIG_HUGETLB_PAGE)) + break; + fallthrough; default: if (page->mapping != TAIL_MAPPING) { bad_page(page, "corrupted mapping in tail page"); @@ -1041,6 +1085,79 @@ static void kernel_init_pages(struct page *page, int numpages) kasan_enable_current(); } +#ifdef CONFIG_MEM_ALLOC_PROFILING + +/* Should be called only if mem_alloc_profiling_enabled() */ +void __clear_page_tag_ref(struct page *page) +{ + union pgtag_ref_handle handle; + union codetag_ref ref; + + if (get_page_tag_ref(page, &ref, &handle)) { + set_codetag_empty(&ref); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); + } +} + +/* Should be called only if mem_alloc_profiling_enabled() */ +static noinline +void __pgalloc_tag_add(struct page *page, struct task_struct *task, + unsigned int nr) +{ + union pgtag_ref_handle handle; + union codetag_ref ref; + + if (get_page_tag_ref(page, &ref, &handle)) { + alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); + } +} + +static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, + unsigned int nr) +{ + if (mem_alloc_profiling_enabled()) + __pgalloc_tag_add(page, task, nr); +} + +/* Should be called only if mem_alloc_profiling_enabled() */ +static noinline +void __pgalloc_tag_sub(struct page *page, unsigned int nr) +{ + union pgtag_ref_handle handle; + union codetag_ref ref; + + if (get_page_tag_ref(page, &ref, &handle)) { + alloc_tag_sub(&ref, PAGE_SIZE * nr); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); + } +} + +static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) +{ + if (mem_alloc_profiling_enabled()) + __pgalloc_tag_sub(page, nr); +} + +/* When tag is not NULL, assuming mem_alloc_profiling_enabled */ +static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) +{ + if (tag) + this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr); +} + +#else /* CONFIG_MEM_ALLOC_PROFILING */ + +static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, + unsigned int nr) {} +static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} +static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {} + +#endif /* CONFIG_MEM_ALLOC_PROFILING */ + __always_inline bool free_pages_prepare(struct page *page, unsigned int order) { @@ -1096,8 +1213,12 @@ __always_inline bool free_pages_prepare(struct page *page, if (unlikely(order)) { int i; - if (compound) + if (compound) { page[1].flags &= ~PAGE_FLAGS_SECOND; +#ifdef NR_PAGES_IN_LARGE_FOLIO + folio->_nr_pages = 0; +#endif + } for (i = 1; i < (1 << order); i++) { if (compound) bad += free_tail_page_prepare(page, page + i); @@ -1238,22 +1359,56 @@ static void split_large_buddy(struct zone *zone, struct page *page, if (order > pageblock_order) order = pageblock_order; - while (pfn != end) { + do { int mt = get_pfnblock_migratetype(page, pfn); __free_one_page(page, pfn, zone, order, mt, fpi); pfn += 1 << order; + if (pfn == end) + break; page = pfn_to_page(pfn); - } + } while (1); +} + +static void add_page_to_zone_llist(struct zone *zone, struct page *page, + unsigned int order) +{ + /* Remember the order */ + page->order = order; + /* Add the page to the free list */ + llist_add(&page->pcp_llist, &zone->trylock_free_pages); } static void free_one_page(struct zone *zone, struct page *page, unsigned long pfn, unsigned int order, fpi_t fpi_flags) { + struct llist_head *llhead; unsigned long flags; - spin_lock_irqsave(&zone->lock, flags); + if (unlikely(fpi_flags & FPI_TRYLOCK)) { + if (!spin_trylock_irqsave(&zone->lock, flags)) { + add_page_to_zone_llist(zone, page, order); + return; + } + } else { + spin_lock_irqsave(&zone->lock, flags); + } + + /* The lock succeeded. Process deferred pages. */ + llhead = &zone->trylock_free_pages; + if (unlikely(!llist_empty(llhead) && !(fpi_flags & FPI_TRYLOCK))) { + struct llist_node *llnode; + struct page *p, *tmp; + + llnode = llist_del_all(llhead); + llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) { + unsigned int p_order = p->order; + + split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags); + __count_vm_events(PGFREE, 1 << p_order); + } + } split_large_buddy(zone, page, pfn, order, fpi_flags); spin_unlock_irqrestore(&zone->lock, flags); @@ -1293,12 +1448,6 @@ void __meminit __free_pages_core(struct page *page, unsigned int order, set_page_count(p, 0); } - /* - * Freeing the page with debug_pagealloc enabled will try to - * unmap it; some archs don't like double-unmappings, so - * map it first. - */ - debug_pagealloc_map_pages(page, nr_pages); adjust_managed_page_count(page, nr_pages); } else { for (loop = 0; loop < nr_pages; loop++, p++) { @@ -1431,7 +1580,7 @@ static __always_inline void page_del_and_expand(struct zone *zone, static void check_new_page_bad(struct page *page) { - if (unlikely(page->flags & __PG_HWPOISON)) { + if (unlikely(PageHWPoison(page))) { /* Don't complain about hwpoisoned pages */ if (PageBuddy(page)) __ClearPageBuddy(page); @@ -1506,7 +1655,6 @@ inline void post_alloc_hook(struct page *page, unsigned int order, int i; set_page_private(page, 0); - set_page_refcounted(page); arch_alloc_page(page, order); debug_pagealloc_map_pages(page, 1 << order); @@ -1830,39 +1978,6 @@ static void change_pageblock_range(struct page *pageblock_page, } } -/* - * When we are falling back to another migratetype during allocation, try to - * steal extra free pages from the same pageblocks to satisfy further - * allocations, instead of polluting multiple pageblocks. - * - * If we are stealing a relatively large buddy page, it is likely there will - * be more free pages in the pageblock, so try to steal them all. For - * reclaimable and unmovable allocations, we steal regardless of page size, - * as fragmentation caused by those allocations polluting movable pageblocks - * is worse than movable allocations stealing from unmovable and reclaimable - * pageblocks. - */ -static bool can_steal_fallback(unsigned int order, int start_mt) -{ - /* - * Leaving this order check is intended, although there is - * relaxed order check in next check. The reason is that - * we can actually steal whole pageblock if this condition met, - * but, below check doesn't guarantee it and that is just heuristic - * so could be changed anytime. - */ - if (order >= pageblock_order) - return true; - - if (order >= pageblock_order / 2 || - start_mt == MIGRATE_RECLAIMABLE || - start_mt == MIGRATE_UNMOVABLE || - page_group_by_mobility_disabled) - return true; - - return false; -} - static inline bool boost_watermark(struct zone *zone) { unsigned long max_boost; @@ -1901,30 +2016,93 @@ static inline bool boost_watermark(struct zone *zone) } /* - * This function implements actual steal behaviour. If order is large enough, we - * can claim the whole pageblock for the requested migratetype. If not, we check - * the pageblock for constituent pages; if at least half of the pages are free - * or compatible, we can still claim the whole block, so pages freed in the - * future will be put on the correct free list. Otherwise, we isolate exactly - * the order we need from the fallback block and leave its migratetype alone. + * When we are falling back to another migratetype during allocation, should we + * try to claim an entire block to satisfy further allocations, instead of + * polluting multiple pageblocks? */ -static struct page * -steal_suitable_fallback(struct zone *zone, struct page *page, - int current_order, int order, int start_type, - unsigned int alloc_flags, bool whole_block) +static bool should_try_claim_block(unsigned int order, int start_mt) { - int free_pages, movable_pages, alike_pages; - unsigned long start_pfn; - int block_type; + /* + * Leaving this order check is intended, although there is + * relaxed order check in next check. The reason is that + * we can actually claim the whole pageblock if this condition met, + * but, below check doesn't guarantee it and that is just heuristic + * so could be changed anytime. + */ + if (order >= pageblock_order) + return true; + + /* + * Above a certain threshold, always try to claim, as it's likely there + * will be more free pages in the pageblock. + */ + if (order >= pageblock_order / 2) + return true; + + /* + * Unmovable/reclaimable allocations would cause permanent + * fragmentations if they fell back to allocating from a movable block + * (polluting it), so we try to claim the whole block regardless of the + * allocation size. Later movable allocations can always steal from this + * block, which is less problematic. + */ + if (start_mt == MIGRATE_RECLAIMABLE || start_mt == MIGRATE_UNMOVABLE) + return true; - block_type = get_pageblock_migratetype(page); + if (page_group_by_mobility_disabled) + return true; /* - * This can happen due to races and we want to prevent broken - * highatomic accounting. + * Movable pages won't cause permanent fragmentation, so when you alloc + * small pages, we just need to temporarily steal unmovable or + * reclaimable pages that are closest to the request size. After a + * while, memory compaction may occur to form large contiguous pages, + * and the next movable allocation may not need to steal. */ - if (is_migrate_highatomic(block_type)) - goto single_page; + return false; +} + +/* + * Check whether there is a suitable fallback freepage with requested order. + * If claimable is true, this function returns fallback_mt only if + * we would do this whole-block claiming. This would help to reduce + * fragmentation due to mixed migratetype pages in one pageblock. + */ +int find_suitable_fallback(struct free_area *area, unsigned int order, + int migratetype, bool claimable) +{ + int i; + + if (claimable && !should_try_claim_block(order, migratetype)) + return -2; + + if (area->nr_free == 0) + return -1; + + for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) { + int fallback_mt = fallbacks[migratetype][i]; + + if (!free_area_empty(area, fallback_mt)) + return fallback_mt; + } + + return -1; +} + +/* + * This function implements actual block claiming behaviour. If order is large + * enough, we can claim the whole pageblock for the requested migratetype. If + * not, we check the pageblock for constituent pages; if at least half of the + * pages are free or compatible, we can still claim the whole block, so pages + * freed in the future will be put on the correct free list. + */ +static struct page * +try_to_claim_block(struct zone *zone, struct page *page, + int current_order, int order, int start_type, + int block_type, unsigned int alloc_flags) +{ + int free_pages, movable_pages, alike_pages; + unsigned long start_pfn; /* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { @@ -1945,14 +2123,10 @@ steal_suitable_fallback(struct zone *zone, struct page *page, if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD)) set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); - /* We are not allowed to try stealing from the whole block */ - if (!whole_block) - goto single_page; - /* moving whole block can fail due to zone boundary conditions */ if (!prep_move_freepages_block(zone, page, &start_pfn, &free_pages, &movable_pages)) - goto single_page; + return NULL; /* * Determine how many pages are compatible with our allocation. @@ -1985,201 +2159,19 @@ steal_suitable_fallback(struct zone *zone, struct page *page, return __rmqueue_smallest(zone, order, start_type); } -single_page: - page_del_and_expand(zone, page, order, current_order, block_type); - return page; -} - -/* - * Check whether there is a suitable fallback freepage with requested order. - * If only_stealable is true, this function returns fallback_mt only if - * we can steal other freepages all together. This would help to reduce - * fragmentation due to mixed migratetype pages in one pageblock. - */ -int find_suitable_fallback(struct free_area *area, unsigned int order, - int migratetype, bool only_stealable, bool *can_steal) -{ - int i; - int fallback_mt; - - if (area->nr_free == 0) - return -1; - - *can_steal = false; - for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) { - fallback_mt = fallbacks[migratetype][i]; - if (free_area_empty(area, fallback_mt)) - continue; - - if (can_steal_fallback(order, migratetype)) - *can_steal = true; - - if (!only_stealable) - return fallback_mt; - - if (*can_steal) - return fallback_mt; - } - - return -1; -} - -/* - * Reserve the pageblock(s) surrounding an allocation request for - * exclusive use of high-order atomic allocations if there are no - * empty page blocks that contain a page with a suitable order - */ -static void reserve_highatomic_pageblock(struct page *page, int order, - struct zone *zone) -{ - int mt; - unsigned long max_managed, flags; - - /* - * The number reserved as: minimum is 1 pageblock, maximum is - * roughly 1% of a zone. But if 1% of a zone falls below a - * pageblock size, then don't reserve any pageblocks. - * Check is race-prone but harmless. - */ - if ((zone_managed_pages(zone) / 100) < pageblock_nr_pages) - return; - max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages); - if (zone->nr_reserved_highatomic >= max_managed) - return; - - spin_lock_irqsave(&zone->lock, flags); - - /* Recheck the nr_reserved_highatomic limit under the lock */ - if (zone->nr_reserved_highatomic >= max_managed) - goto out_unlock; - - /* Yoink! */ - mt = get_pageblock_migratetype(page); - /* Only reserve normal pageblocks (i.e., they can merge with others) */ - if (!migratetype_is_mergeable(mt)) - goto out_unlock; - - if (order < pageblock_order) { - if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1) - goto out_unlock; - zone->nr_reserved_highatomic += pageblock_nr_pages; - } else { - change_pageblock_range(page, order, MIGRATE_HIGHATOMIC); - zone->nr_reserved_highatomic += 1 << order; - } - -out_unlock: - spin_unlock_irqrestore(&zone->lock, flags); -} - -/* - * Used when an allocation is about to fail under memory pressure. This - * potentially hurts the reliability of high-order allocations when under - * intense memory pressure but failed atomic allocations should be easier - * to recover from than an OOM. - * - * If @force is true, try to unreserve pageblocks even though highatomic - * pageblock is exhausted. - */ -static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, - bool force) -{ - struct zonelist *zonelist = ac->zonelist; - unsigned long flags; - struct zoneref *z; - struct zone *zone; - struct page *page; - int order; - int ret; - - for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, - ac->nodemask) { - /* - * Preserve at least one pageblock unless memory pressure - * is really high. - */ - if (!force && zone->nr_reserved_highatomic <= - pageblock_nr_pages) - continue; - - spin_lock_irqsave(&zone->lock, flags); - for (order = 0; order < NR_PAGE_ORDERS; order++) { - struct free_area *area = &(zone->free_area[order]); - int mt; - - page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); - if (!page) - continue; - - mt = get_pageblock_migratetype(page); - /* - * In page freeing path, migratetype change is racy so - * we can counter several free pages in a pageblock - * in this loop although we changed the pageblock type - * from highatomic to ac->migratetype. So we should - * adjust the count once. - */ - if (is_migrate_highatomic(mt)) { - unsigned long size; - /* - * It should never happen but changes to - * locking could inadvertently allow a per-cpu - * drain to add pages to MIGRATE_HIGHATOMIC - * while unreserving so be safe and watch for - * underflows. - */ - size = max(pageblock_nr_pages, 1UL << order); - size = min(size, zone->nr_reserved_highatomic); - zone->nr_reserved_highatomic -= size; - } - - /* - * Convert to ac->migratetype and avoid the normal - * pageblock stealing heuristics. Minimally, the caller - * is doing the work and needs the pages. More - * importantly, if the block was always converted to - * MIGRATE_UNMOVABLE or another type then the number - * of pageblocks that cannot be completely freed - * may increase. - */ - if (order < pageblock_order) - ret = move_freepages_block(zone, page, mt, - ac->migratetype); - else { - move_to_free_list(page, zone, order, mt, - ac->migratetype); - change_pageblock_range(page, order, - ac->migratetype); - ret = 1; - } - /* - * Reserving the block(s) already succeeded, - * so this should not fail on zone boundaries. - */ - WARN_ON_ONCE(ret == -1); - if (ret > 0) { - spin_unlock_irqrestore(&zone->lock, flags); - return ret; - } - } - spin_unlock_irqrestore(&zone->lock, flags); - } - - return false; + return NULL; } /* - * Try finding a free buddy page on the fallback list and put it on the free - * list of requested migratetype, possibly along with other pages from the same - * block, depending on fragmentation avoidance heuristics. Returns true if - * fallback was found so that __rmqueue_smallest() can grab it. + * Try to allocate from some fallback migratetype by claiming the entire block, + * i.e. converting it to the allocation's start migratetype. * * The use of signed ints for order and current_order is a deliberate * deviation from the rest of this file, to make the for loop * condition simpler. */ static __always_inline struct page * -__rmqueue_fallback(struct zone *zone, int order, int start_migratetype, +__rmqueue_claim(struct zone *zone, int order, int start_migratetype, unsigned int alloc_flags) { struct free_area *area; @@ -2187,7 +2179,6 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, int min_order = order; struct page *page; int fallback_mt; - bool can_steal; /* * Do not steal pages from freelists belonging to other pageblocks @@ -2206,62 +2197,73 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, --current_order) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, - start_migratetype, false, &can_steal); + start_migratetype, true); + + /* No block in that order */ if (fallback_mt == -1) continue; - /* - * We cannot steal all free pages from the pageblock and the - * requested migratetype is movable. In that case it's better to - * steal and split the smallest available page instead of the - * largest available page, because even if the next movable - * allocation falls back into a different pageblock than this - * one, it won't cause permanent fragmentation. - */ - if (!can_steal && start_migratetype == MIGRATE_MOVABLE - && current_order > order) - goto find_smallest; + /* Advanced into orders too low to claim, abort */ + if (fallback_mt == -2) + break; - goto do_steal; + page = get_page_from_free_area(area, fallback_mt); + page = try_to_claim_block(zone, page, current_order, order, + start_migratetype, fallback_mt, + alloc_flags); + if (page) { + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, fallback_mt); + return page; + } } return NULL; +} + +/* + * Try to steal a single page from some fallback migratetype. Leave the rest of + * the block as its current migratetype, potentially causing fragmentation. + */ +static __always_inline struct page * +__rmqueue_steal(struct zone *zone, int order, int start_migratetype) +{ + struct free_area *area; + int current_order; + struct page *page; + int fallback_mt; -find_smallest: for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, - start_migratetype, false, &can_steal); - if (fallback_mt != -1) - break; - } - - /* - * This should not happen - we already found a suitable fallback - * when looking for the largest page. - */ - VM_BUG_ON(current_order > MAX_PAGE_ORDER); - -do_steal: - page = get_page_from_free_area(area, fallback_mt); - - /* take off list, maybe claim block, expand remainder */ - page = steal_suitable_fallback(zone, page, current_order, order, - start_migratetype, alloc_flags, can_steal); + start_migratetype, false); + if (fallback_mt == -1) + continue; - trace_mm_page_alloc_extfrag(page, order, current_order, - start_migratetype, fallback_mt); + page = get_page_from_free_area(area, fallback_mt); + page_del_and_expand(zone, page, order, current_order, fallback_mt); + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, fallback_mt); + return page; + } - return page; + return NULL; } +enum rmqueue_mode { + RMQUEUE_NORMAL, + RMQUEUE_CMA, + RMQUEUE_CLAIM, + RMQUEUE_STEAL, +}; + /* * Do the hard work of removing an element from the buddy allocator. * Call me with the zone->lock already held. */ static __always_inline struct page * __rmqueue(struct zone *zone, unsigned int order, int migratetype, - unsigned int alloc_flags) + unsigned int alloc_flags, enum rmqueue_mode *mode) { struct page *page; @@ -2280,16 +2282,48 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, } } - page = __rmqueue_smallest(zone, order, migratetype); - if (unlikely(!page)) { - if (alloc_flags & ALLOC_CMA) + /* + * First try the freelists of the requested migratetype, then try + * fallbacks modes with increasing levels of fragmentation risk. + * + * The fallback logic is expensive and rmqueue_bulk() calls in + * a loop with the zone->lock held, meaning the freelists are + * not subject to any outside changes. Remember in *mode where + * we found pay dirt, to save us the search on the next call. + */ + switch (*mode) { + case RMQUEUE_NORMAL: + page = __rmqueue_smallest(zone, order, migratetype); + if (page) + return page; + fallthrough; + case RMQUEUE_CMA: + if (alloc_flags & ALLOC_CMA) { page = __rmqueue_cma_fallback(zone, order); - - if (!page) - page = __rmqueue_fallback(zone, order, migratetype, - alloc_flags); + if (page) { + *mode = RMQUEUE_CMA; + return page; + } + } + fallthrough; + case RMQUEUE_CLAIM: + page = __rmqueue_claim(zone, order, migratetype, alloc_flags); + if (page) { + /* Replenished preferred freelist, back to normal mode. */ + *mode = RMQUEUE_NORMAL; + return page; + } + fallthrough; + case RMQUEUE_STEAL: + if (!(alloc_flags & ALLOC_NOFRAGMENT)) { + page = __rmqueue_steal(zone, order, migratetype); + if (page) { + *mode = RMQUEUE_STEAL; + return page; + } + } } - return page; + return NULL; } /* @@ -2301,13 +2335,19 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, unsigned int alloc_flags) { + enum rmqueue_mode rmqm = RMQUEUE_NORMAL; unsigned long flags; int i; - spin_lock_irqsave(&zone->lock, flags); + if (unlikely(alloc_flags & ALLOC_TRYLOCK)) { + if (!spin_trylock_irqsave(&zone->lock, flags)) + return 0; + } else { + spin_lock_irqsave(&zone->lock, flags); + } for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype, - alloc_flags); + alloc_flags, &rmqm); if (unlikely(page == NULL)) break; @@ -2590,9 +2630,9 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, return high; } -static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, - struct page *page, int migratetype, - unsigned int order) +static void free_frozen_page_commit(struct zone *zone, + struct per_cpu_pages *pcp, struct page *page, int migratetype, + unsigned int order, fpi_t fpi_flags) { int high, batch; int pindex; @@ -2617,16 +2657,24 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, * stops will be drained from vmstat refresh context. */ if (order && order <= PAGE_ALLOC_COSTLY_ORDER) { - free_high = (pcp->free_count >= batch && + free_high = (pcp->free_count >= (batch + pcp->high_min / 2) && (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) && (!(pcp->flags & PCPF_FREE_HIGH_BATCH) || - pcp->count >= READ_ONCE(batch))); + pcp->count >= batch)); pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER; } else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) { pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER; } if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX)) pcp->free_count += (1 << order); + + if (unlikely(fpi_flags & FPI_TRYLOCK)) { + /* + * Do not attempt to take a zone lock. Let pcp->count get + * over high mark temporarily. + */ + return; + } high = nr_pcp_high(pcp, zone, batch, free_high); if (pcp->count >= high) { free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high), @@ -2641,7 +2689,8 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, /* * Free a pcp page */ -void free_unref_page(struct page *page, unsigned int order) +static void __free_frozen_pages(struct page *page, unsigned int order, + fpi_t fpi_flags) { unsigned long __maybe_unused UP_flags; struct per_cpu_pages *pcp; @@ -2650,7 +2699,7 @@ void free_unref_page(struct page *page, unsigned int order) int migratetype; if (!pcp_allowed_order(order)) { - __free_pages_ok(page, order, FPI_NONE); + __free_pages_ok(page, order, fpi_flags); return; } @@ -2664,27 +2713,37 @@ void free_unref_page(struct page *page, unsigned int order) * get those areas back if necessary. Otherwise, we may have to free * excessively into the page allocator */ + zone = page_zone(page); migratetype = get_pfnblock_migratetype(page, pfn); if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(page_zone(page), page, pfn, order, FPI_NONE); + free_one_page(zone, page, pfn, order, fpi_flags); return; } migratetype = MIGRATE_MOVABLE; } - zone = page_zone(page); + if (unlikely((fpi_flags & FPI_TRYLOCK) && IS_ENABLED(CONFIG_PREEMPT_RT) + && (in_nmi() || in_hardirq()))) { + add_page_to_zone_llist(zone, page, order); + return; + } pcp_trylock_prepare(UP_flags); pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (pcp) { - free_unref_page_commit(zone, pcp, page, migratetype, order); + free_frozen_page_commit(zone, pcp, page, migratetype, order, fpi_flags); pcp_spin_unlock(pcp); } else { - free_one_page(zone, page, pfn, order, FPI_NONE); + free_one_page(zone, page, pfn, order, fpi_flags); } pcp_trylock_finish(UP_flags); } +void free_frozen_pages(struct page *page, unsigned int order) +{ + __free_frozen_pages(page, order, FPI_NONE); +} + /* * Free a batch of folios */ @@ -2741,7 +2800,7 @@ void free_unref_folios(struct folio_batch *folios) /* * Free isolated pages directly to the - * allocator, see comment in free_unref_page. + * allocator, see comment in free_frozen_pages. */ if (is_migrate_isolate(migratetype)) { free_one_page(zone, &folio->page, pfn, @@ -2772,8 +2831,8 @@ void free_unref_folios(struct folio_batch *folios) migratetype = MIGRATE_MOVABLE; trace_mm_page_free_batched(&folio->page); - free_unref_page_commit(zone, pcp, &folio->page, migratetype, - order); + free_frozen_page_commit(zone, pcp, &folio->page, migratetype, + order, FPI_NONE); } if (pcp) { @@ -2802,7 +2861,7 @@ void split_page(struct page *page, unsigned int order) set_page_refcounted(page + i); split_page_owner(page, order, 0); pgalloc_tag_split(page_folio(page), order, 0); - split_page_memcg(page, order, 0); + split_page_memcg(page, order); } EXPORT_SYMBOL_GPL(split_page); @@ -2904,11 +2963,18 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, do { page = NULL; - spin_lock_irqsave(&zone->lock, flags); + if (unlikely(alloc_flags & ALLOC_TRYLOCK)) { + if (!spin_trylock_irqsave(&zone->lock, flags)) + return NULL; + } else { + spin_lock_irqsave(&zone->lock, flags); + } if (alloc_flags & ALLOC_HIGHATOMIC) page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); if (!page) { - page = __rmqueue(zone, order, migratetype, alloc_flags); + enum rmqueue_mode rmqm = RMQUEUE_NORMAL; + + page = __rmqueue(zone, order, migratetype, alloc_flags, &rmqm); /* * If the allocation fails, allow OOM handling and @@ -3092,6 +3158,142 @@ out: return page; } +/* + * Reserve the pageblock(s) surrounding an allocation request for + * exclusive use of high-order atomic allocations if there are no + * empty page blocks that contain a page with a suitable order + */ +static void reserve_highatomic_pageblock(struct page *page, int order, + struct zone *zone) +{ + int mt; + unsigned long max_managed, flags; + + /* + * The number reserved as: minimum is 1 pageblock, maximum is + * roughly 1% of a zone. But if 1% of a zone falls below a + * pageblock size, then don't reserve any pageblocks. + * Check is race-prone but harmless. + */ + if ((zone_managed_pages(zone) / 100) < pageblock_nr_pages) + return; + max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages); + if (zone->nr_reserved_highatomic >= max_managed) + return; + + spin_lock_irqsave(&zone->lock, flags); + + /* Recheck the nr_reserved_highatomic limit under the lock */ + if (zone->nr_reserved_highatomic >= max_managed) + goto out_unlock; + + /* Yoink! */ + mt = get_pageblock_migratetype(page); + /* Only reserve normal pageblocks (i.e., they can merge with others) */ + if (!migratetype_is_mergeable(mt)) + goto out_unlock; + + if (order < pageblock_order) { + if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1) + goto out_unlock; + zone->nr_reserved_highatomic += pageblock_nr_pages; + } else { + change_pageblock_range(page, order, MIGRATE_HIGHATOMIC); + zone->nr_reserved_highatomic += 1 << order; + } + +out_unlock: + spin_unlock_irqrestore(&zone->lock, flags); +} + +/* + * Used when an allocation is about to fail under memory pressure. This + * potentially hurts the reliability of high-order allocations when under + * intense memory pressure but failed atomic allocations should be easier + * to recover from than an OOM. + * + * If @force is true, try to unreserve pageblocks even though highatomic + * pageblock is exhausted. + */ +static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + bool force) +{ + struct zonelist *zonelist = ac->zonelist; + unsigned long flags; + struct zoneref *z; + struct zone *zone; + struct page *page; + int order; + int ret; + + for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, + ac->nodemask) { + /* + * Preserve at least one pageblock unless memory pressure + * is really high. + */ + if (!force && zone->nr_reserved_highatomic <= + pageblock_nr_pages) + continue; + + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < NR_PAGE_ORDERS; order++) { + struct free_area *area = &(zone->free_area[order]); + unsigned long size; + + page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); + if (!page) + continue; + + size = max(pageblock_nr_pages, 1UL << order); + /* + * It should never happen but changes to + * locking could inadvertently allow a per-cpu + * drain to add pages to MIGRATE_HIGHATOMIC + * while unreserving so be safe and watch for + * underflows. + */ + if (WARN_ON_ONCE(size > zone->nr_reserved_highatomic)) + size = zone->nr_reserved_highatomic; + zone->nr_reserved_highatomic -= size; + + /* + * Convert to ac->migratetype and avoid the normal + * pageblock stealing heuristics. Minimally, the caller + * is doing the work and needs the pages. More + * importantly, if the block was always converted to + * MIGRATE_UNMOVABLE or another type then the number + * of pageblocks that cannot be completely freed + * may increase. + */ + if (order < pageblock_order) + ret = move_freepages_block(zone, page, + MIGRATE_HIGHATOMIC, + ac->migratetype); + else { + move_to_free_list(page, zone, order, + MIGRATE_HIGHATOMIC, + ac->migratetype); + change_pageblock_range(page, order, + ac->migratetype); + ret = 1; + } + /* + * Reserving the block(s) already succeeded, + * so this should not fail on zone boundaries. + */ + WARN_ON_ONCE(ret == -1); + if (ret > 0) { + spin_unlock_irqrestore(&zone->lock, flags); + return ret; + } + } + spin_unlock_irqrestore(&zone->lock, flags); + } + + return false; +} + static inline long __zone_watermark_unusable_free(struct zone *z, unsigned int order, unsigned int alloc_flags) { @@ -3249,18 +3451,6 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, return false; } -bool zone_watermark_ok_safe(struct zone *z, unsigned int order, - unsigned long mark, int highest_zoneidx) -{ - long free_pages = zone_page_state(z, NR_FREE_PAGES); - - if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) - free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); - - return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0, - free_pages); -} - #ifdef CONFIG_NUMA int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; @@ -3295,6 +3485,11 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) */ alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM); + if (defrag_mode) { + alloc_flags |= ALLOC_NOFRAGMENT; + return alloc_flags; + } + #ifdef CONFIG_ZONE_DMA32 if (!zone) return alloc_flags; @@ -3344,7 +3539,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, retry: /* * Scan zonelist, looking for a zone with enough free. - * See also cpuset_node_allowed() comment in kernel/cgroup/cpuset.c. + * See also cpuset_current_node_allowed() comment in kernel/cgroup/cpuset.c. */ no_fallback = alloc_flags & ALLOC_NOFRAGMENT; z = ac->preferred_zoneref; @@ -3386,7 +3581,7 @@ retry: continue; } - if (no_fallback && nr_online_nodes > 1 && + if (no_fallback && !defrag_mode && nr_online_nodes > 1 && zone != zonelist_zone(ac->preferred_zoneref)) { int local_nid; @@ -3402,7 +3597,7 @@ retry: } } - cond_accept_memory(zone, order); + cond_accept_memory(zone, order, alloc_flags); /* * Detect whether the number of free pages is below high @@ -3429,7 +3624,7 @@ check_alloc_wmark: gfp_mask)) { int ret; - if (cond_accept_memory(zone, order)) + if (cond_accept_memory(zone, order, alloc_flags)) goto try_this_zone; /* @@ -3482,7 +3677,7 @@ try_this_zone: return page; } else { - if (cond_accept_memory(zone, order)) + if (cond_accept_memory(zone, order, alloc_flags)) goto try_this_zone; /* Try again if zone has deferred pages */ @@ -3497,7 +3692,7 @@ try_this_zone: * It's possible on a UMA machine to get through all zones that are * fragmented. If avoiding fragmentation, reset and try again. */ - if (no_fallback) { + if (no_fallback && !defrag_mode) { alloc_flags &= ~ALLOC_NOFRAGMENT; goto retry; } @@ -3565,7 +3760,6 @@ __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, if (!page) page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); - return page; } @@ -3977,15 +4171,21 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, struct zone *zone; pg_data_t *last_pgdat = NULL; enum zone_type highest_zoneidx = ac->highest_zoneidx; + unsigned int reclaim_order; + + if (defrag_mode) + reclaim_order = max(order, pageblock_order); + else + reclaim_order = order; for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx, ac->nodemask) { if (!managed_zone(zone)) continue; - if (last_pgdat != zone->zone_pgdat) { - wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx); - last_pgdat = zone->zone_pgdat; - } + if (last_pgdat == zone->zone_pgdat) + continue; + wakeup_kswapd(zone, gfp_mask, reclaim_order, highest_zoneidx); + last_pgdat = zone->zone_pgdat; } } @@ -4026,7 +4226,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order) /* * Ignore cpuset mems for non-blocking __GFP_HIGH (probably * GFP_ATOMIC) rather than fail, see the comment for - * cpuset_node_allowed(). + * cpuset_current_node_allowed(). */ if (alloc_flags & ALLOC_MIN_RESERVE) alloc_flags &= ~ALLOC_CPUSET; @@ -4035,6 +4235,9 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order) alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); + if (defrag_mode) + alloc_flags |= ALLOC_NOFRAGMENT; + return alloc_flags; } @@ -4241,6 +4444,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, restart: compaction_retries = 0; no_progress_loops = 0; + compact_result = COMPACT_SKIPPED; compact_priority = DEF_COMPACT_PRIORITY; cpuset_mems_cookie = read_mems_allowed_begin(); zonelist_iter_cookie = zonelist_iter_begin(); @@ -4343,6 +4547,14 @@ restart: } retry: + /* + * Deal with possible cpuset update races or zonelist updates to avoid + * infinite retries. + */ + if (check_retry_cpuset(cpuset_mems_cookie, ac) || + check_retry_zonelist(zonelist_iter_cookie)) + goto restart; + /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ if (alloc_flags & ALLOC_KSWAPD) wake_all_kswapds(order, gfp_mask, ac); @@ -4416,6 +4628,11 @@ retry: &compaction_retries)) goto retry; + /* Reclaim/compaction failed to prevent the fallback */ + if (defrag_mode && (alloc_flags & ALLOC_NOFRAGMENT)) { + alloc_flags &= ~ALLOC_NOFRAGMENT; + goto retry; + } /* * Deal with possible cpuset update races or zonelist updates to avoid @@ -4509,7 +4726,12 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, might_alloc(gfp_mask); - if (should_fail_alloc_page(gfp_mask, order)) + /* + * Don't invoke should_fail logic, since it may call + * get_random_u32() and printk() which need to spin_lock. + */ + if (!(*alloc_flags & ALLOC_TRYLOCK) && + should_fail_alloc_page(gfp_mask, order)) return false; *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags); @@ -4529,28 +4751,23 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, } /* - * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array + * __alloc_pages_bulk - Allocate a number of order-0 pages to an array * @gfp: GFP flags for the allocation * @preferred_nid: The preferred NUMA node ID to allocate from * @nodemask: Set of nodes to allocate from, may be NULL - * @nr_pages: The number of pages desired on the list or array - * @page_list: Optional list to store the allocated pages - * @page_array: Optional array to store the pages + * @nr_pages: The number of pages desired in the array + * @page_array: Array to store the pages * * This is a batched version of the page allocator that attempts to - * allocate nr_pages quickly. Pages are added to page_list if page_list - * is not NULL, otherwise it is assumed that the page_array is valid. - * - * For lists, nr_pages is the number of pages that should be allocated. + * allocate nr_pages quickly. Pages are added to the page_array. * - * For arrays, only NULL elements are populated with pages and nr_pages + * Note that only NULL elements are populated with pages and nr_pages * is the maximum number of pages that will be stored in the array. * - * Returns the number of pages on the list or array. + * Returns the number of pages in the array. */ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, - struct list_head *page_list, struct page **page_array) { struct page *page; @@ -4568,7 +4785,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, * Skip populated array elements to determine if any pages need * to be allocated before disabling IRQs. */ - while (page_array && nr_populated < nr_pages && page_array[nr_populated]) + while (nr_populated < nr_pages && page_array[nr_populated]) nr_populated++; /* No pages requested? */ @@ -4576,7 +4793,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, goto out; /* Already populated array? */ - if (unlikely(page_array && nr_pages - nr_populated == 0)) + if (unlikely(nr_pages - nr_populated == 0)) goto out; /* Bulk allocator does not support memcg accounting. */ @@ -4621,7 +4838,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, goto failed; } - cond_accept_memory(zone, 0); + cond_accept_memory(zone, 0, alloc_flags); retry_this_zone: mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages; if (zone_watermark_fast(zone, 0, mark, @@ -4630,7 +4847,7 @@ retry_this_zone: break; } - if (cond_accept_memory(zone, 0)) + if (cond_accept_memory(zone, 0, alloc_flags)) goto retry_this_zone; /* Try again if zone has deferred pages */ @@ -4658,7 +4875,7 @@ retry_this_zone: while (nr_populated < nr_pages) { /* Skip existing pages */ - if (page_array && page_array[nr_populated]) { + if (page_array[nr_populated]) { nr_populated++; continue; } @@ -4676,11 +4893,8 @@ retry_this_zone: nr_account++; prep_new_page(page, 0, gfp, 0); - if (page_list) - list_add(&page->lru, page_list); - else - page_array[nr_populated] = page; - nr_populated++; + set_page_refcounted(page); + page_array[nr_populated++] = page; } pcp_spin_unlock(pcp); @@ -4697,14 +4911,8 @@ failed_irq: failed: page = __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask); - if (page) { - if (page_list) - list_add(&page->lru, page_list); - else - page_array[nr_populated] = page; - nr_populated++; - } - + if (page) + page_array[nr_populated++] = page; goto out; } EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof); @@ -4712,8 +4920,8 @@ EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof); /* * This is the 'heart' of the zoned buddy allocator. */ -struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, - int preferred_nid, nodemask_t *nodemask) +struct page *__alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order, + int preferred_nid, nodemask_t *nodemask) { struct page *page; unsigned int alloc_flags = ALLOC_WMARK_LOW; @@ -4766,7 +4974,7 @@ struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, out: if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page && unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) { - __free_pages(page, order); + free_frozen_pages(page, order); page = NULL; } @@ -4775,6 +4983,18 @@ out: return page; } +EXPORT_SYMBOL(__alloc_frozen_pages_noprof); + +struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, + int preferred_nid, nodemask_t *nodemask) +{ + struct page *page; + + page = __alloc_frozen_pages_noprof(gfp, order, preferred_nid, nodemask); + if (page) + set_page_refcounted(page); + return page; +} EXPORT_SYMBOL(__alloc_pages_noprof); struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid, @@ -4809,9 +5029,10 @@ unsigned long get_zeroed_page_noprof(gfp_t gfp_mask) EXPORT_SYMBOL(get_zeroed_page_noprof); /** - * __free_pages - Free pages allocated with alloc_pages(). + * ___free_pages - Free pages allocated with alloc_pages(). * @page: The page pointer returned from alloc_pages(). * @order: The order of the allocation. + * @fpi_flags: Free Page Internal flags. * * This function can free multi-page allocations that are not compound * pages. It does not check that the @order passed in matches that of @@ -4828,22 +5049,38 @@ EXPORT_SYMBOL(get_zeroed_page_noprof); * Context: May be called in interrupt context or while holding a normal * spinlock, but not in NMI context or while holding a raw spinlock. */ -void __free_pages(struct page *page, unsigned int order) +static void ___free_pages(struct page *page, unsigned int order, + fpi_t fpi_flags) { /* get PageHead before we drop reference */ int head = PageHead(page); + /* get alloc tag in case the page is released by others */ struct alloc_tag *tag = pgalloc_tag_get(page); if (put_page_testzero(page)) - free_unref_page(page, order); + __free_frozen_pages(page, order, fpi_flags); else if (!head) { pgalloc_tag_sub_pages(tag, (1 << order) - 1); while (order-- > 0) - free_unref_page(page + (1 << order), order); + __free_frozen_pages(page + (1 << order), order, + fpi_flags); } } +void __free_pages(struct page *page, unsigned int order) +{ + ___free_pages(page, order, FPI_NONE); +} EXPORT_SYMBOL(__free_pages); +/* + * Can be called while holding raw_spin_lock or from IRQ and NMI for any + * page type (not only those that came from alloc_pages_nolock) + */ +void free_pages_nolock(struct page *page, unsigned int order) +{ + ___free_pages(page, order, FPI_TRYLOCK); +} + void free_pages(unsigned long addr, unsigned int order) { if (addr != 0) { @@ -4864,7 +5101,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, split_page_owner(page, order, 0); pgalloc_tag_split(page_folio(page), order, 0); - split_page_memcg(page, order, 0); + split_page_memcg(page, order); while (page < --last) set_page_refcounted(last); @@ -5159,13 +5396,6 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) zonerefs->zone_idx = 0; } -/* - * Build zonelists ordered by zone and nodes within zones. - * This results in conserving DMA zone[s] until all Normal memory is - * exhausted, but results in overflowing to remote node while memory - * may still exist in local DMA zone. - */ - static void build_zonelists(pg_data_t *pgdat) { static int node_order[MAX_NUMNODES]; @@ -5690,10 +5920,13 @@ __meminit void zone_pcp_init(struct zone *zone) zone->present_pages, zone_batchsize(zone)); } +static void setup_per_zone_lowmem_reserve(void); + void adjust_managed_page_count(struct page *page, long count) { atomic_long_add(count, &page_zone(page)->managed_pages); totalram_pages_add(count); + setup_per_zone_lowmem_reserve(); } EXPORT_SYMBOL(adjust_managed_page_count); @@ -5831,6 +6064,7 @@ static void calculate_totalreserve_pages(void) } } totalreserve_pages = reserve_pages; + trace_mm_calculate_totalreserve_pages(totalreserve_pages); } /* @@ -5853,14 +6087,15 @@ static void setup_per_zone_lowmem_reserve(void) for (j = i + 1; j < MAX_NR_ZONES; j++) { struct zone *upper_zone = &pgdat->node_zones[j]; - bool empty = !zone_managed_pages(upper_zone); managed_pages += zone_managed_pages(upper_zone); - if (clear || empty) + if (clear) zone->lowmem_reserve[j] = 0; else zone->lowmem_reserve[j] = managed_pages / ratio; + trace_mm_setup_per_zone_lowmem_reserve(zone, upper_zone, + zone->lowmem_reserve[j]); } } } @@ -5924,6 +6159,7 @@ static void __setup_per_zone_wmarks(void) zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp; zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp; + trace_mm_setup_per_zone_wmarks(zone); spin_unlock_irqrestore(&zone->lock, flags); } @@ -6170,7 +6406,7 @@ out: return ret; } -static struct ctl_table page_alloc_sysctl_table[] = { +static const struct ctl_table page_alloc_sysctl_table[] = { { .procname = "min_free_kbytes", .data = &min_free_kbytes, @@ -6197,6 +6433,15 @@ static struct ctl_table page_alloc_sysctl_table[] = { .extra2 = SYSCTL_THREE_THOUSAND, }, { + .procname = "defrag_mode", + .data = &defrag_mode, + .maxlen = sizeof(defrag_mode), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { .procname = "percpu_pagelist_high_fraction", .data = &percpu_pagelist_high_fraction, .maxlen = sizeof(percpu_pagelist_high_fraction), @@ -6265,9 +6510,8 @@ static void alloc_contig_dump_pages(struct list_head *page_list) * @migratetype: using migratetype to filter the type of migration in * trace_mm_alloc_contig_migrate_range_info. */ -int __alloc_contig_migrate_range(struct compact_control *cc, - unsigned long start, unsigned long end, - int migratetype) +static int __alloc_contig_migrate_range(struct compact_control *cc, + unsigned long start, unsigned long end, int migratetype) { /* This function is based on compact_zone() from compaction.c. */ unsigned int nr_reclaimed; @@ -6276,7 +6520,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc, int ret = 0; struct migration_target_control mtc = { .nid = zone_to_nid(cc->zone), - .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + .gfp_mask = cc->gfp_mask, .reason = MR_CONTIG_RANGE, }; struct page *page; @@ -6346,7 +6590,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc, return (ret < 0) ? ret : 0; } -static void split_free_pages(struct list_head *list) +static void split_free_pages(struct list_head *list, gfp_t gfp_mask) { int order; @@ -6357,7 +6601,8 @@ static void split_free_pages(struct list_head *list) list_for_each_entry_safe(page, next, &list[order], lru) { int i; - post_alloc_hook(page, order, __GFP_MOVABLE); + post_alloc_hook(page, order, gfp_mask); + set_page_refcounted(page); if (!order) continue; @@ -6371,6 +6616,40 @@ static void split_free_pages(struct list_head *list) } } +static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) +{ + const gfp_t reclaim_mask = __GFP_IO | __GFP_FS | __GFP_RECLAIM; + const gfp_t action_mask = __GFP_COMP | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | + __GFP_ZERO | __GFP_ZEROTAGS | __GFP_SKIP_ZERO; + const gfp_t cc_action_mask = __GFP_RETRY_MAYFAIL | __GFP_NOWARN; + + /* + * We are given the range to allocate; node, mobility and placement + * hints are irrelevant at this point. We'll simply ignore them. + */ + gfp_mask &= ~(GFP_ZONEMASK | __GFP_RECLAIMABLE | __GFP_WRITE | + __GFP_HARDWALL | __GFP_THISNODE | __GFP_MOVABLE); + + /* + * We only support most reclaim flags (but not NOFAIL/NORETRY), and + * selected action flags. + */ + if (gfp_mask & ~(reclaim_mask | action_mask)) + return -EINVAL; + + /* + * Flags to control page compaction/migration/reclaim, to free up our + * page range. Migratable pages are movable, __GFP_MOVABLE is implied + * for them. + * + * Traditionally we always had __GFP_RETRY_MAYFAIL set, keep doing that + * to not degrade callers. + */ + *gfp_cc_mask = (gfp_mask & (reclaim_mask | cc_action_mask)) | + __GFP_MOVABLE | __GFP_RETRY_MAYFAIL; + return 0; +} + /** * alloc_contig_range() -- tries to allocate given range of pages * @start: start PFN to allocate @@ -6379,7 +6658,9 @@ static void split_free_pages(struct list_head *list) * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks * in range must have the same migratetype and it must * be either of the two. - * @gfp_mask: GFP mask to use during compaction + * @gfp_mask: GFP mask. Node/zone/placement hints are ignored; only some + * action and reclaim modifiers are supported. Reclaim modifiers + * control allocation behavior during compaction/migration/reclaim. * * The PFN range does not have to be pageblock aligned. The PFN range must * belong to a single zone. @@ -6405,11 +6686,14 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, .mode = MIGRATE_SYNC, .ignore_skip_hint = true, .no_set_skip_hint = true, - .gfp_mask = current_gfp_context(gfp_mask), .alloc_contig = true, }; INIT_LIST_HEAD(&cc.migratepages); + gfp_mask = current_gfp_context(gfp_mask); + if (__alloc_contig_verify_gfp_mask(gfp_mask, (gfp_t *)&cc.gfp_mask)) + return -EINVAL; + /* * What we do here is we mark all pageblocks in range as * MIGRATE_ISOLATE. Because pageblock and max order pages may @@ -6431,7 +6715,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, * put back to page allocator so that buddy can use them. */ - ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask); + ret = start_isolate_page_range(start, end, migratetype, 0); if (ret) goto done; @@ -6450,7 +6734,17 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, ret = __alloc_contig_migrate_range(&cc, start, end, migratetype); if (ret && ret != -EBUSY) goto done; - ret = 0; + + /* + * When in-use hugetlb pages are migrated, they may simply be released + * back into the free hugepage pool instead of being returned to the + * buddy system. After the migration of in-use huge pages is completed, + * we will invoke replace_free_hugepage_folios() to ensure that these + * hugepages are properly released to the buddy system. + */ + ret = replace_free_hugepage_folios(start, end); + if (ret) + goto done; /* * Pages from [start, end) are within a pageblock_nr_pages @@ -6484,7 +6778,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, } if (!(gfp_mask & __GFP_COMP)) { - split_free_pages(cc.freepages); + split_free_pages(cc.freepages, gfp_mask); /* Free head and tail (if any) */ if (start != outer_start) @@ -6497,6 +6791,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, check_new_pages(head, order); prep_new_page(head, order, gfp_mask, 0); + set_page_refcounted(head); } else { ret = -EINVAL; WARN(true, "PFN range: requested [%lu, %lu), allocated [%lu, %lu)\n", @@ -6551,7 +6846,9 @@ static bool zone_spans_last_pfn(const struct zone *zone, /** * alloc_contig_pages() -- tries to find and allocate contiguous range of pages * @nr_pages: Number of contiguous pages to allocate - * @gfp_mask: GFP mask to limit search and used during compaction + * @gfp_mask: GFP mask. Node/zone/placement hints limit the search; only some + * action and reclaim modifiers are supported. Reclaim modifiers + * control allocation behavior during compaction/migration/reclaim. * @nid: Target node * @nodemask: Mask for other possible nodes * @@ -6868,9 +7165,6 @@ bool has_managed_dma(void) #ifdef CONFIG_UNACCEPTED_MEMORY -/* Counts number of zones with unaccepted pages. */ -static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages); - static bool lazy_accept = true; static int __init accept_memory_parse(char *p) @@ -6897,11 +7191,7 @@ static bool page_contains_unaccepted(struct page *page, unsigned int order) static void __accept_page(struct zone *zone, unsigned long *flags, struct page *page) { - bool last; - list_del(&page->lru); - last = list_empty(&zone->unaccepted_pages); - account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); __ClearPageUnaccepted(page); @@ -6910,9 +7200,6 @@ static void __accept_page(struct zone *zone, unsigned long *flags, accept_memory(page_to_phys(page), PAGE_SIZE << MAX_PAGE_ORDER); __free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL); - - if (last) - static_branch_dec(&zones_with_unaccepted_pages); } void accept_page(struct page *page) @@ -6949,24 +7236,31 @@ static bool try_to_accept_memory_one(struct zone *zone) return true; } -static inline bool has_unaccepted_memory(void) +static bool cond_accept_memory(struct zone *zone, unsigned int order, + int alloc_flags) { - return static_branch_unlikely(&zones_with_unaccepted_pages); -} - -static bool cond_accept_memory(struct zone *zone, unsigned int order) -{ - long to_accept; + long to_accept, wmark; bool ret = false; - if (!has_unaccepted_memory()) + if (list_empty(&zone->unaccepted_pages)) return false; - if (list_empty(&zone->unaccepted_pages)) + /* Bailout, since try_to_accept_memory_one() needs to take a lock */ + if (alloc_flags & ALLOC_TRYLOCK) return false; + wmark = promo_wmark_pages(zone); + + /* + * Watermarks have not been initialized yet. + * + * Accepting one MAX_ORDER page to ensure progress. + */ + if (!wmark) + return try_to_accept_memory_one(zone); + /* How much to accept to get to promo watermark? */ - to_accept = promo_wmark_pages(zone) - + to_accept = wmark - (zone_page_state(zone, NR_FREE_PAGES) - __zone_watermark_unusable_free(zone, order, 0) - zone_page_state(zone, NR_UNACCEPTED)); @@ -6985,22 +7279,17 @@ static bool __free_unaccepted(struct page *page) { struct zone *zone = page_zone(page); unsigned long flags; - bool first = false; if (!lazy_accept) return false; spin_lock_irqsave(&zone->lock, flags); - first = list_empty(&zone->unaccepted_pages); list_add_tail(&page->lru, &zone->unaccepted_pages); account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES); __SetPageUnaccepted(page); spin_unlock_irqrestore(&zone->lock, flags); - if (first) - static_branch_inc(&zones_with_unaccepted_pages); - return true; } @@ -7011,7 +7300,8 @@ static bool page_contains_unaccepted(struct page *page, unsigned int order) return false; } -static bool cond_accept_memory(struct zone *zone, unsigned int order) +static bool cond_accept_memory(struct zone *zone, unsigned int order, + int alloc_flags) { return false; } @@ -7023,3 +7313,93 @@ static bool __free_unaccepted(struct page *page) } #endif /* CONFIG_UNACCEPTED_MEMORY */ + +/** + * alloc_pages_nolock - opportunistic reentrant allocation from any context + * @nid: node to allocate from + * @order: allocation order size + * + * Allocates pages of a given order from the given node. This is safe to + * call from any context (from atomic, NMI, and also reentrant + * allocator -> tracepoint -> alloc_pages_nolock_noprof). + * Allocation is best effort and to be expected to fail easily so nobody should + * rely on the success. Failures are not reported via warn_alloc(). + * See always fail conditions below. + * + * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN. + * It means ENOMEM. There is no reason to call it again and expect !NULL. + */ +struct page *alloc_pages_nolock_noprof(int nid, unsigned int order) +{ + /* + * Do not specify __GFP_DIRECT_RECLAIM, since direct claim is not allowed. + * Do not specify __GFP_KSWAPD_RECLAIM either, since wake up of kswapd + * is not safe in arbitrary context. + * + * These two are the conditions for gfpflags_allow_spinning() being true. + * + * Specify __GFP_NOWARN since failing alloc_pages_nolock() is not a reason + * to warn. Also warn would trigger printk() which is unsafe from + * various contexts. We cannot use printk_deferred_enter() to mitigate, + * since the running context is unknown. + * + * Specify __GFP_ZERO to make sure that call to kmsan_alloc_page() below + * is safe in any context. Also zeroing the page is mandatory for + * BPF use cases. + * + * Though __GFP_NOMEMALLOC is not checked in the code path below, + * specify it here to highlight that alloc_pages_nolock() + * doesn't want to deplete reserves. + */ + gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC + | __GFP_ACCOUNT; + unsigned int alloc_flags = ALLOC_TRYLOCK; + struct alloc_context ac = { }; + struct page *page; + + /* + * In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is + * unsafe in NMI. If spin_trylock() is called from hard IRQ the current + * task may be waiting for one rt_spin_lock, but rt_spin_trylock() will + * mark the task as the owner of another rt_spin_lock which will + * confuse PI logic, so return immediately if called form hard IRQ or + * NMI. + * + * Note, irqs_disabled() case is ok. This function can be called + * from raw_spin_lock_irqsave region. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) + return NULL; + if (!pcp_allowed_order(order)) + return NULL; + + /* Bailout, since _deferred_grow_zone() needs to take a lock */ + if (deferred_pages_enabled()) + return NULL; + + if (nid == NUMA_NO_NODE) + nid = numa_node_id(); + + prepare_alloc_pages(alloc_gfp, order, nid, NULL, &ac, + &alloc_gfp, &alloc_flags); + + /* + * Best effort allocation from percpu free list. + * If it's empty attempt to spin_trylock zone->lock. + */ + page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac); + + /* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */ + + if (page) + set_page_refcounted(page); + + if (memcg_kmem_online() && page && + unlikely(__memcg_kmem_charge_page(page, alloc_gfp, order) != 0)) { + free_pages_nolock(page, order); + page = NULL; + } + trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); + kmsan_alloc_page(page, order, alloc_gfp); + return page; +} diff --git a/mm/page_counter.c b/mm/page_counter.c index b249d15af9dd..661e0f2a5127 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -121,6 +121,7 @@ bool page_counter_try_charge(struct page_counter *counter, { struct page_counter *c; bool protection = track_protection(counter); + bool track_failcnt = counter->track_failcnt; for (c = counter; c; c = c->parent) { long new; @@ -146,7 +147,8 @@ bool page_counter_try_charge(struct page_counter *counter, * inaccuracy in the failcnt which is only used * to report stats. */ - data_race(c->failcnt++); + if (track_failcnt) + data_race(c->failcnt++); *fail = c; goto failed; } @@ -288,7 +290,7 @@ int page_counter_memparse(const char *buf, const char *max, } -#ifdef CONFIG_MEMCG +#if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM) /* * This function calculates an individual page counter's effective * protection which is derived from its own memory.min/low, its @@ -460,4 +462,4 @@ void page_counter_calculate_protection(struct page_counter *root, atomic_long_read(&parent->children_low_usage), recursive_protection)); } -#endif /* CONFIG_MEMCG */ +#endif /* CONFIG_MEMCG || CONFIG_CGROUP_DMEM */ diff --git a/mm/page_ext.c b/mm/page_ext.c index 641d93f6af4c..c351fdfe9e9a 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -508,6 +508,19 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) #endif /** + * page_ext_lookup() - Lookup a page extension for a PFN. + * @pfn: PFN of the page we're interested in. + * + * Must be called with RCU read lock taken and @pfn must be valid. + * + * Return: NULL if no page_ext exists for this page. + */ +struct page_ext *page_ext_lookup(unsigned long pfn) +{ + return lookup_page_ext(pfn_to_page(pfn)); +} + +/** * page_ext_get() - Get the extended information for a page. * @page: The page we're interested in. * diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c index 3f7a203d35c6..d2423f30577e 100644 --- a/mm/page_frag_cache.c +++ b/mm/page_frag_cache.c @@ -86,7 +86,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count) VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); if (page_ref_sub_and_test(page, count)) - free_unref_page(page, compound_order(page)); + free_frozen_pages(page, compound_order(page)); } EXPORT_SYMBOL(__page_frag_cache_drain); @@ -138,7 +138,7 @@ refill: goto refill; if (unlikely(encoded_page_decode_pfmemalloc(encoded_page))) { - free_unref_page(page, + free_frozen_pages(page, encoded_page_decode_order(encoded_page)); goto refill; } @@ -166,6 +166,6 @@ void page_frag_free(void *addr) struct page *page = virt_to_head_page(addr); if (unlikely(put_page_testzero(page))) - free_unref_page(page, compound_order(page)); + free_frozen_pages(page, compound_order(page)); } EXPORT_SYMBOL(page_frag_free); diff --git a/mm/page_idle.c b/mm/page_idle.c index 41ea77f22011..408aaf29a3ea 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -62,9 +62,14 @@ static bool page_idle_clear_pte_refs_one(struct folio *folio, /* * For PTE-mapped THP, one sub page is referenced, * the whole THP is referenced. + * + * PFN swap PTEs, such as device-exclusive ones, that + * actually map pages are "old" from a CPU perspective. + * The MMU notifier takes care of any device aspects. */ - if (ptep_clear_young_notify(vma, addr, pvmw.pte)) - referenced = true; + if (likely(pte_present(ptep_get(pvmw.pte)))) + referenced |= ptep_test_and_clear_young(vma, addr, pvmw.pte); + referenced |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE); } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (pmdp_clear_young_notify(vma, addr, pvmw.pmd)) referenced = true; @@ -112,7 +117,7 @@ static void page_idle_clear_pte_refs(struct folio *folio) } static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, char *buf, + const struct bin_attribute *attr, char *buf, loff_t pos, size_t count) { u64 *out = (u64 *)buf; @@ -157,7 +162,7 @@ static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj, } static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, char *buf, + const struct bin_attribute *attr, char *buf, loff_t pos, size_t count) { const u64 *in = (u64 *)buf; @@ -193,17 +198,17 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj, return (char *)in - buf; } -static struct bin_attribute page_idle_bitmap_attr = +static const struct bin_attribute page_idle_bitmap_attr = __BIN_ATTR(bitmap, 0600, page_idle_bitmap_read, page_idle_bitmap_write, 0); -static struct bin_attribute *page_idle_bin_attrs[] = { +static const struct bin_attribute *const page_idle_bin_attrs[] = { &page_idle_bitmap_attr, NULL, }; static const struct attribute_group page_idle_attr_group = { - .bin_attrs = page_idle_bin_attrs, + .bin_attrs_new = page_idle_bin_attrs, .name = "page_idle", }; diff --git a/mm/page_io.c b/mm/page_io.c index 4b4ea8e49cf6..f7716b6569fa 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -163,7 +163,6 @@ reprobe: page_no = 1; /* force Empty message */ sis->max = page_no; sis->pages = page_no - 1; - sis->highest_bit = page_no - 1; out: return ret; bad_bmap: @@ -238,9 +237,8 @@ static void swap_zeromap_folio_clear(struct folio *folio) * We may have stale swap cache pages in memory: notice * them here and get rid of the unnecessary final write. */ -int swap_writepage(struct page *page, struct writeback_control *wbc) +int swap_writeout(struct folio *folio, struct writeback_control *wbc) { - struct folio *folio = page_folio(page); int ret; if (folio_free_swap(folio)) { @@ -639,11 +637,11 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug) if (swap_read_folio_zeromap(folio)) { folio_unlock(folio); goto finish; - } else if (zswap_load(folio)) { - folio_unlock(folio); - goto finish; } + if (zswap_load(folio) != -ENOENT) + goto finish; + /* We have to read from slower devices. Increase zswap protection. */ zswap_folio_swapin(folio); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 7e04047977cf..b2fc5266e3d2 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -83,7 +83,14 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e unsigned int skip_pages; if (PageHuge(page)) { - if (!hugepage_migration_supported(folio_hstate(folio))) + struct hstate *h; + + /* + * The huge page may be freed so can not + * use folio_hstate() directly. + */ + h = size_to_hstate(folio_size(folio)); + if (h && !hugepage_migration_supported(h)) return page; } else if (!folio_test_lru(folio) && !__folio_test_movable(folio)) { return page; @@ -286,7 +293,6 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * within a free or in-use page. * @boundary_pfn: pageblock-aligned pfn that a page might cross * @flags: isolation flags - * @gfp_flags: GFP flags used for migrating pages * @isolate_before: isolate the pageblock before the boundary_pfn * @skip_isolation: the flag to skip the pageblock isolation in second * isolate_single_pageblock() @@ -306,8 +312,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * the in-use page then splitting the free page. */ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, - gfp_t gfp_flags, bool isolate_before, bool skip_isolation, - int migratetype) + bool isolate_before, bool skip_isolation, int migratetype) { unsigned long start_pfn; unsigned long isolate_pageblock; @@ -444,8 +449,6 @@ failed: * and PageOffline() pages. * REPORT_FAILURE - report details about the failure to * isolate the range - * @gfp_flags: GFP flags used for migrating pages that sit across the - * range boundaries. * * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in * the range will never be allocated. Any free pages and pages freed in the @@ -478,7 +481,7 @@ failed: * Return: 0 on success and -EBUSY if any part of range cannot be isolated. */ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, - int migratetype, int flags, gfp_t gfp_flags) + int migratetype, int flags) { unsigned long pfn; struct page *page; @@ -489,7 +492,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, bool skip_isolation = false; /* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */ - ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false, + ret = isolate_single_pageblock(isolate_start, flags, false, skip_isolation, migratetype); if (ret) return ret; @@ -498,7 +501,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, skip_isolation = true; /* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */ - ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true, + ret = isolate_single_pageblock(isolate_end, flags, true, skip_isolation, migratetype); if (ret) { unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype); @@ -612,6 +615,16 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, int ret; /* + * Due to the deferred freeing of hugetlb folios, the hugepage folios may + * not immediately release to the buddy system. This can cause PageBuddy() + * to fail in __test_page_isolated_in_pageblock(). To ensure that the + * hugetlb folios are properly released back to the buddy system, we + * invoke the wait_for_freed_hugetlb_folios() function to wait for the + * release to complete. + */ + wait_for_freed_hugetlb_folios(); + + /* * Note: pageblock_nr_pages != MAX_PAGE_ORDER. Then, chunks of free * pages are not aligned to pageblock_nr_pages. * Then we just check migratetype first. diff --git a/mm/page_owner.c b/mm/page_owner.c index 2d6360eaccbb..9928c9ac8c31 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -229,17 +229,19 @@ static void dec_stack_record_count(depot_stack_handle_t handle, handle); } -static inline void __update_page_owner_handle(struct page_ext *page_ext, +static inline void __update_page_owner_handle(struct page *page, depot_stack_handle_t handle, unsigned short order, gfp_t gfp_mask, short last_migrate_reason, u64 ts_nsec, pid_t pid, pid_t tgid, char *comm) { - int i; + struct page_ext_iter iter; + struct page_ext *page_ext; struct page_owner *page_owner; - for (i = 0; i < (1 << order); i++) { + rcu_read_lock(); + for_each_page_ext(page, 1 << order, page_ext, iter) { page_owner = get_page_owner(page_ext); page_owner->handle = handle; page_owner->order = order; @@ -252,20 +254,22 @@ static inline void __update_page_owner_handle(struct page_ext *page_ext, sizeof(page_owner->comm)); __set_bit(PAGE_EXT_OWNER, &page_ext->flags); __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); - page_ext = page_ext_next(page_ext); } + rcu_read_unlock(); } -static inline void __update_page_owner_free_handle(struct page_ext *page_ext, +static inline void __update_page_owner_free_handle(struct page *page, depot_stack_handle_t handle, unsigned short order, pid_t pid, pid_t tgid, u64 free_ts_nsec) { - int i; + struct page_ext_iter iter; + struct page_ext *page_ext; struct page_owner *page_owner; - for (i = 0; i < (1 << order); i++) { + rcu_read_lock(); + for_each_page_ext(page, 1 << order, page_ext, iter) { page_owner = get_page_owner(page_ext); /* Only __reset_page_owner() wants to clear the bit */ if (handle) { @@ -275,8 +279,8 @@ static inline void __update_page_owner_free_handle(struct page_ext *page_ext, page_owner->free_ts_nsec = free_ts_nsec; page_owner->free_pid = current->pid; page_owner->free_tgid = current->tgid; - page_ext = page_ext_next(page_ext); } + rcu_read_unlock(); } void __reset_page_owner(struct page *page, unsigned short order) @@ -293,11 +297,17 @@ void __reset_page_owner(struct page *page, unsigned short order) page_owner = get_page_owner(page_ext); alloc_handle = page_owner->handle; + page_ext_put(page_ext); - handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); - __update_page_owner_free_handle(page_ext, handle, order, current->pid, + /* + * Do not specify GFP_NOWAIT to make gfpflags_allow_spinning() == false + * to prevent issues in stack_depot_save(). + * This is similar to alloc_pages_nolock() gfp flags, but only used + * to signal stack_depot to avoid spin_locks. + */ + handle = save_stack(__GFP_NOWARN); + __update_page_owner_free_handle(page, handle, order, current->pid, current->tgid, free_ts_nsec); - page_ext_put(page_ext); if (alloc_handle != early_handle) /* @@ -313,19 +323,13 @@ void __reset_page_owner(struct page *page, unsigned short order) noinline void __set_page_owner(struct page *page, unsigned short order, gfp_t gfp_mask) { - struct page_ext *page_ext; u64 ts_nsec = local_clock(); depot_stack_handle_t handle; handle = save_stack(gfp_mask); - - page_ext = page_ext_get(page); - if (unlikely(!page_ext)) - return; - __update_page_owner_handle(page_ext, handle, order, gfp_mask, -1, + __update_page_owner_handle(page, handle, order, gfp_mask, -1, ts_nsec, current->pid, current->tgid, current->comm); - page_ext_put(page_ext); inc_stack_record_count(handle, gfp_mask, 1 << order); } @@ -344,44 +348,42 @@ void __set_page_owner_migrate_reason(struct page *page, int reason) void __split_page_owner(struct page *page, int old_order, int new_order) { - int i; - struct page_ext *page_ext = page_ext_get(page); + struct page_ext_iter iter; + struct page_ext *page_ext; struct page_owner *page_owner; - if (unlikely(!page_ext)) - return; - - for (i = 0; i < (1 << old_order); i++) { + rcu_read_lock(); + for_each_page_ext(page, 1 << old_order, page_ext, iter) { page_owner = get_page_owner(page_ext); page_owner->order = new_order; - page_ext = page_ext_next(page_ext); } - page_ext_put(page_ext); + rcu_read_unlock(); } void __folio_copy_owner(struct folio *newfolio, struct folio *old) { - int i; - struct page_ext *old_ext; - struct page_ext *new_ext; + struct page_ext *page_ext; + struct page_ext_iter iter; struct page_owner *old_page_owner; struct page_owner *new_page_owner; depot_stack_handle_t migrate_handle; - old_ext = page_ext_get(&old->page); - if (unlikely(!old_ext)) + page_ext = page_ext_get(&old->page); + if (unlikely(!page_ext)) return; - new_ext = page_ext_get(&newfolio->page); - if (unlikely(!new_ext)) { - page_ext_put(old_ext); + old_page_owner = get_page_owner(page_ext); + page_ext_put(page_ext); + + page_ext = page_ext_get(&newfolio->page); + if (unlikely(!page_ext)) return; - } - old_page_owner = get_page_owner(old_ext); - new_page_owner = get_page_owner(new_ext); + new_page_owner = get_page_owner(page_ext); + page_ext_put(page_ext); + migrate_handle = new_page_owner->handle; - __update_page_owner_handle(new_ext, old_page_owner->handle, + __update_page_owner_handle(&newfolio->page, old_page_owner->handle, old_page_owner->order, old_page_owner->gfp_mask, old_page_owner->last_migrate_reason, old_page_owner->ts_nsec, old_page_owner->pid, @@ -391,7 +393,7 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old) * will be freed after migration. Keep them until then as they may be * useful. */ - __update_page_owner_free_handle(new_ext, 0, old_page_owner->order, + __update_page_owner_free_handle(&newfolio->page, 0, old_page_owner->order, old_page_owner->free_pid, old_page_owner->free_tgid, old_page_owner->free_ts_nsec); @@ -400,14 +402,12 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old) * for the new one and the old folio otherwise there will be an imbalance * when subtracting those pages from the stack. */ - for (i = 0; i < (1 << new_page_owner->order); i++) { + rcu_read_lock(); + for_each_page_ext(&old->page, 1 << new_page_owner->order, page_ext, iter) { + old_page_owner = get_page_owner(page_ext); old_page_owner->handle = migrate_handle; - old_ext = page_ext_next(old_ext); - old_page_owner = get_page_owner(old_ext); } - - page_ext_put(new_ext); - page_ext_put(old_ext); + rcu_read_unlock(); } void pagetypeinfo_showmixedcount_print(struct seq_file *m, @@ -507,7 +507,7 @@ static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret, rcu_read_lock(); memcg_data = READ_ONCE(page->memcg_data); - if (!memcg_data) + if (!memcg_data || PageTail(page)) goto out_unlock; if (memcg_data & MEMCG_DATA_OBJEXTS) @@ -813,7 +813,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) goto ext_put_continue; /* Found early allocated page */ - __update_page_owner_handle(page_ext, early_handle, 0, 0, + __update_page_owner_handle(page, early_handle, 0, 0, -1, local_clock(), current->pid, current->tgid, current->comm); count++; diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 509c6ef8de40..4eeca782b888 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -62,24 +62,20 @@ static struct page_table_check *get_page_table_check(struct page_ext *page_ext) */ static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt) { + struct page_ext_iter iter; struct page_ext *page_ext; struct page *page; - unsigned long i; bool anon; if (!pfn_valid(pfn)) return; page = pfn_to_page(pfn); - page_ext = page_ext_get(page); - - if (!page_ext) - return; - BUG_ON(PageSlab(page)); anon = PageAnon(page); - for (i = 0; i < pgcnt; i++) { + rcu_read_lock(); + for_each_page_ext(page, pgcnt, page_ext, iter) { struct page_table_check *ptc = get_page_table_check(page_ext); if (anon) { @@ -89,9 +85,8 @@ static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt) BUG_ON(atomic_read(&ptc->anon_map_count)); BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0); } - page_ext = page_ext_next(page_ext); } - page_ext_put(page_ext); + rcu_read_unlock(); } /* @@ -102,24 +97,20 @@ static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt) static void page_table_check_set(unsigned long pfn, unsigned long pgcnt, bool rw) { + struct page_ext_iter iter; struct page_ext *page_ext; struct page *page; - unsigned long i; bool anon; if (!pfn_valid(pfn)) return; page = pfn_to_page(pfn); - page_ext = page_ext_get(page); - - if (!page_ext) - return; - BUG_ON(PageSlab(page)); anon = PageAnon(page); - for (i = 0; i < pgcnt; i++) { + rcu_read_lock(); + for_each_page_ext(page, pgcnt, page_ext, iter) { struct page_table_check *ptc = get_page_table_check(page_ext); if (anon) { @@ -129,9 +120,8 @@ static void page_table_check_set(unsigned long pfn, unsigned long pgcnt, BUG_ON(atomic_read(&ptc->anon_map_count)); BUG_ON(atomic_inc_return(&ptc->file_map_count) < 0); } - page_ext = page_ext_next(page_ext); } - page_ext_put(page_ext); + rcu_read_unlock(); } /* @@ -140,24 +130,19 @@ static void page_table_check_set(unsigned long pfn, unsigned long pgcnt, */ void __page_table_check_zero(struct page *page, unsigned int order) { + struct page_ext_iter iter; struct page_ext *page_ext; - unsigned long i; BUG_ON(PageSlab(page)); - page_ext = page_ext_get(page); - - if (!page_ext) - return; - - for (i = 0; i < (1ul << order); i++) { + rcu_read_lock(); + for_each_page_ext(page, 1 << order, page_ext, iter) { struct page_table_check *ptc = get_page_table_check(page_ext); BUG_ON(atomic_read(&ptc->anon_map_count)); BUG_ON(atomic_read(&ptc->file_map_count)); - page_ext = page_ext_next(page_ext); } - page_ext_put(page_ext); + rcu_read_unlock(); } void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) @@ -196,9 +181,8 @@ EXPORT_SYMBOL(__page_table_check_pud_clear); /* Whether the swap entry cached writable information */ static inline bool swap_cached_writable(swp_entry_t entry) { - return is_writable_device_exclusive_entry(entry) || - is_writable_device_private_entry(entry) || - is_writable_migration_entry(entry); + return is_writable_device_private_entry(entry) || + is_writable_migration_entry(entry); } static inline void page_table_check_pte_flags(pte_t pte) @@ -234,33 +218,39 @@ static inline void page_table_check_pmd_flags(pmd_t pmd) WARN_ON_ONCE(swap_cached_writable(pmd_to_swp_entry(pmd))); } -void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) +void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd, + unsigned int nr) { + unsigned long stride = PMD_SIZE >> PAGE_SHIFT; + unsigned int i; + if (&init_mm == mm) return; page_table_check_pmd_flags(pmd); - __page_table_check_pmd_clear(mm, *pmdp); - if (pmd_user_accessible_page(pmd)) { - page_table_check_set(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT, - pmd_write(pmd)); - } + for (i = 0; i < nr; i++) + __page_table_check_pmd_clear(mm, *(pmdp + i)); + if (pmd_user_accessible_page(pmd)) + page_table_check_set(pmd_pfn(pmd), stride * nr, pmd_write(pmd)); } -EXPORT_SYMBOL(__page_table_check_pmd_set); +EXPORT_SYMBOL(__page_table_check_pmds_set); -void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud) +void __page_table_check_puds_set(struct mm_struct *mm, pud_t *pudp, pud_t pud, + unsigned int nr) { + unsigned long stride = PUD_SIZE >> PAGE_SHIFT; + unsigned int i; + if (&init_mm == mm) return; - __page_table_check_pud_clear(mm, *pudp); - if (pud_user_accessible_page(pud)) { - page_table_check_set(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT, - pud_write(pud)); - } + for (i = 0; i < nr; i++) + __page_table_check_pud_clear(mm, *(pudp + i)); + if (pud_user_accessible_page(pud)) + page_table_check_set(pud_pfn(pud), stride * nr, pud_write(pud)); } -EXPORT_SYMBOL(__page_table_check_pud_set); +EXPORT_SYMBOL(__page_table_check_puds_set); void __page_table_check_pte_clear_range(struct mm_struct *mm, unsigned long addr, diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 81839a9e74f1..e463c3be934a 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -84,6 +84,7 @@ again: * mapped at the @pvmw->pte * @pvmw: page_vma_mapped_walk struct, includes a pair pte and pfn range * for checking + * @pte_nr: the number of small pages described by @pvmw->pte. * * page_vma_mapped_walk() found a place where pfn range is *potentially* * mapped. check_pte() has to validate this. @@ -100,7 +101,7 @@ again: * Otherwise, return false. * */ -static bool check_pte(struct page_vma_mapped_walk *pvmw) +static bool check_pte(struct page_vma_mapped_walk *pvmw, unsigned long pte_nr) { unsigned long pfn; pte_t ptent = ptep_get(pvmw->pte); @@ -111,8 +112,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw) return false; entry = pte_to_swp_entry(ptent); - if (!is_migration_entry(entry) && - !is_device_exclusive_entry(entry)) + if (!is_migration_entry(entry)) return false; pfn = swp_offset_pfn(entry); @@ -133,7 +133,11 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw) pfn = pte_pfn(ptent); } - return (pfn - pvmw->pfn) < pvmw->nr_pages; + if ((pfn + pte_nr - 1) < pvmw->pfn) + return false; + if (pfn > (pvmw->pfn + pvmw->nr_pages - 1)) + return false; + return true; } /* Returns true if the two ranges overlap. Careful to not overflow. */ @@ -208,7 +212,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) return false; pvmw->ptl = huge_pte_lock(hstate, mm, pvmw->pte); - if (!check_pte(pvmw)) + if (!check_pte(pvmw, pages_per_huge_page(hstate))) return not_found(pvmw); return true; } @@ -291,7 +295,7 @@ restart: goto next_pte; } this_pte: - if (check_pte(pvmw)) + if (check_pte(pvmw, 1)) return true; next_pte: do { diff --git a/mm/percpu.c b/mm/percpu.c index d8dd31a2e407..b35494c8ede2 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1359,10 +1359,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, /* allocate chunk */ alloc_size = struct_size(chunk, populated, BITS_TO_LONGS(region_size >> PAGE_SHIFT)); - chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!chunk) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + chunk = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); INIT_LIST_HEAD(&chunk->list); @@ -1374,24 +1371,14 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, region_bits = pcpu_chunk_map_bits(chunk); alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]); - chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!chunk->alloc_map) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + chunk->alloc_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]); - chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!chunk->bound_map) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + chunk->bound_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]); - chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!chunk->md_blocks) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); - + chunk->md_blocks = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); #ifdef NEED_PCPUOBJ_EXT /* first chunk is free to use */ chunk->obj_exts = NULL; @@ -1758,7 +1745,7 @@ void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, gfp = current_gfp_context(gfp); /* whitelisted flags that can be passed to the backing allocators */ pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); - is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; + is_atomic = !gfpflags_allow_blocking(gfp); do_warn = !(gfp & __GFP_NOWARN); /* @@ -2204,7 +2191,12 @@ static void pcpu_balance_workfn(struct work_struct *work) * to grow other chunks. This then gives pcpu_reclaim_populated() time * to move fully free chunks to the active list to be freed if * appropriate. + * + * Enforce GFP_NOIO allocations because we have pcpu_alloc users + * constrained to GFP_NOIO/NOFS contexts and they could form lock + * dependency through pcpu_alloc_mutex */ + unsigned int flags = memalloc_noio_save(); mutex_lock(&pcpu_alloc_mutex); spin_lock_irq(&pcpu_lock); @@ -2215,6 +2207,7 @@ static void pcpu_balance_workfn(struct work_struct *work) spin_unlock_irq(&pcpu_lock); mutex_unlock(&pcpu_alloc_mutex); + memalloc_noio_restore(flags); } /** @@ -2595,28 +2588,16 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* process group information and build config tables accordingly */ alloc_size = ai->nr_groups * sizeof(group_offsets[0]); - group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!group_offsets) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + group_offsets = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = ai->nr_groups * sizeof(group_sizes[0]); - group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!group_sizes) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + group_sizes = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = nr_cpu_ids * sizeof(unit_map[0]); - unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!unit_map) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + unit_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = nr_cpu_ids * sizeof(unit_off[0]); - unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!unit_off) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + unit_off = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = UINT_MAX; @@ -2685,12 +2666,9 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_free_slot = pcpu_sidelined_slot + 1; pcpu_to_depopulate_slot = pcpu_free_slot + 1; pcpu_nr_slots = pcpu_to_depopulate_slot + 1; - pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots * + pcpu_chunk_lists = memblock_alloc_or_panic(pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]), SMP_CACHE_BYTES); - if (!pcpu_chunk_lists) - panic("%s: Failed to allocate %zu bytes\n", __func__, - pcpu_nr_slots * sizeof(pcpu_chunk_lists[0])); for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_chunk_lists[i]); @@ -3099,7 +3077,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, continue; } /* copy and return the unused part */ - memcpy(ptr, __per_cpu_load, ai->static_size); + memcpy(ptr, __per_cpu_start, ai->static_size); pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum); } } @@ -3155,25 +3133,19 @@ void __init __weak pcpu_populate_pte(unsigned long addr) pmd_t *pmd; if (pgd_none(*pgd)) { - p4d = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE); - if (!p4d) - goto err_alloc; + p4d = memblock_alloc_or_panic(P4D_TABLE_SIZE, P4D_TABLE_SIZE); pgd_populate(&init_mm, pgd, p4d); } p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) { - pud = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE); - if (!pud) - goto err_alloc; + pud = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE); p4d_populate(&init_mm, p4d, pud); } pud = pud_offset(p4d, addr); if (pud_none(*pud)) { - pmd = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE); - if (!pmd) - goto err_alloc; + pmd = memblock_alloc_or_panic(PMD_TABLE_SIZE, PMD_TABLE_SIZE); pud_populate(&init_mm, pud, pmd); } @@ -3181,16 +3153,11 @@ void __init __weak pcpu_populate_pte(unsigned long addr) if (!pmd_present(*pmd)) { pte_t *new; - new = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE); - if (!new) - goto err_alloc; + new = memblock_alloc_or_panic(PTE_TABLE_SIZE, PTE_TABLE_SIZE); pmd_populate_kernel(&init_mm, pmd, new); } return; - -err_alloc: - panic("%s: Failed to allocate memory\n", __func__); } /** @@ -3237,10 +3204,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t /* unaligned allocations can't be freed, round up to page size */ pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * sizeof(pages[0])); - pages = memblock_alloc(pages_size, SMP_CACHE_BYTES); - if (!pages) - panic("%s: Failed to allocate %zu bytes\n", __func__, - pages_size); + pages = memblock_alloc_or_panic(pages_size, SMP_CACHE_BYTES); /* allocate pages */ j = 0; @@ -3282,7 +3246,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t flush_cache_vmap_early(unit_addr, unit_addr + ai->unit_size); /* copy static data */ - memcpy((void *)unit_addr, __per_cpu_load, ai->static_size); + memcpy((void *)unit_addr, __per_cpu_start, ai->static_size); } /* we're ready, commit */ diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 5297dcc38c37..5a882f2b10f9 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -279,7 +279,7 @@ static unsigned long pmdp_get_lockless_start(void) { return 0; } static void pmdp_get_lockless_end(unsigned long irqflags) { } #endif -pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) +pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) { unsigned long irqflags; pmd_t pmdval; diff --git a/mm/pt_reclaim.c b/mm/pt_reclaim.c new file mode 100644 index 000000000000..7e9455a18aae --- /dev/null +++ b/mm/pt_reclaim.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/hugetlb.h> +#include <asm-generic/tlb.h> +#include <asm/pgalloc.h> + +#include "internal.h" + +bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, + struct zap_details *details) +{ + return details && details->reclaim_pt && (end - start >= PMD_SIZE); +} + +bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval) +{ + spinlock_t *pml = pmd_lockptr(mm, pmd); + + if (!spin_trylock(pml)) + return false; + + *pmdval = pmdp_get_lockless(pmd); + pmd_clear(pmd); + spin_unlock(pml); + + return true; +} + +void free_pte(struct mm_struct *mm, unsigned long addr, struct mmu_gather *tlb, + pmd_t pmdval) +{ + pte_free_tlb(tlb, pmd_pgtable(pmdval), addr); + mm_dec_nr_ptes(mm); +} + +void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, + struct mmu_gather *tlb) +{ + pmd_t pmdval; + spinlock_t *pml, *ptl = NULL; + pte_t *start_pte, *pte; + int i; + + pml = pmd_lock(mm, pmd); + start_pte = pte_offset_map_rw_nolock(mm, pmd, addr, &pmdval, &ptl); + if (!start_pte) + goto out_ptl; + if (ptl != pml) + spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); + + /* Check if it is empty PTE page */ + for (i = 0, pte = start_pte; i < PTRS_PER_PTE; i++, pte++) { + if (!pte_none(ptep_get(pte))) + goto out_ptl; + } + pte_unmap(start_pte); + + pmd_clear(pmd); + + if (ptl != pml) + spin_unlock(ptl); + spin_unlock(pml); + + free_pte(mm, addr, tlb, pmdval); + + return; +out_ptl: + if (start_pte) + pte_unmap_unlock(start_pte, ptl); + if (ptl != pml) + spin_unlock(pml); +} diff --git a/mm/ptdump.c b/mm/ptdump.c index 106e1d66e9f9..9374f29cdc6f 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -18,7 +18,7 @@ static inline int note_kasan_page_table(struct mm_walk *walk, { struct ptdump_state *st = walk->private; - st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0])); + st->note_page_pte(st, addr, kasan_early_shadow_pte[0]); walk->action = ACTION_CONTINUE; @@ -38,11 +38,11 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, return note_kasan_page_table(walk, addr); #endif - if (st->effective_prot) - st->effective_prot(st, 0, pgd_val(val)); + if (st->effective_prot_pgd) + st->effective_prot_pgd(st, val); if (pgd_leaf(val)) { - st->note_page(st, addr, 0, pgd_val(val)); + st->note_page_pgd(st, addr, val); walk->action = ACTION_CONTINUE; } @@ -61,11 +61,11 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, return note_kasan_page_table(walk, addr); #endif - if (st->effective_prot) - st->effective_prot(st, 1, p4d_val(val)); + if (st->effective_prot_p4d) + st->effective_prot_p4d(st, val); if (p4d_leaf(val)) { - st->note_page(st, addr, 1, p4d_val(val)); + st->note_page_p4d(st, addr, val); walk->action = ACTION_CONTINUE; } @@ -84,11 +84,11 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr, return note_kasan_page_table(walk, addr); #endif - if (st->effective_prot) - st->effective_prot(st, 2, pud_val(val)); + if (st->effective_prot_pud) + st->effective_prot_pud(st, val); if (pud_leaf(val)) { - st->note_page(st, addr, 2, pud_val(val)); + st->note_page_pud(st, addr, val); walk->action = ACTION_CONTINUE; } @@ -106,10 +106,10 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr, return note_kasan_page_table(walk, addr); #endif - if (st->effective_prot) - st->effective_prot(st, 3, pmd_val(val)); + if (st->effective_prot_pmd) + st->effective_prot_pmd(st, val); if (pmd_leaf(val)) { - st->note_page(st, addr, 3, pmd_val(val)); + st->note_page_pmd(st, addr, val); walk->action = ACTION_CONTINUE; } @@ -122,10 +122,10 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr, struct ptdump_state *st = walk->private; pte_t val = ptep_get_lockless(pte); - if (st->effective_prot) - st->effective_prot(st, 4, pte_val(val)); + if (st->effective_prot_pte) + st->effective_prot_pte(st, val); - st->note_page(st, addr, 4, pte_val(val)); + st->note_page_pte(st, addr, val); return 0; } @@ -134,9 +134,31 @@ static int ptdump_hole(unsigned long addr, unsigned long next, int depth, struct mm_walk *walk) { struct ptdump_state *st = walk->private; - - st->note_page(st, addr, depth, 0); - + pte_t pte_zero = {0}; + pmd_t pmd_zero = {0}; + pud_t pud_zero = {0}; + p4d_t p4d_zero = {0}; + pgd_t pgd_zero = {0}; + + switch (depth) { + case 4: + st->note_page_pte(st, addr, pte_zero); + break; + case 3: + st->note_page_pmd(st, addr, pmd_zero); + break; + case 2: + st->note_page_pud(st, addr, pud_zero); + break; + case 1: + st->note_page_p4d(st, addr, p4d_zero); + break; + case 0: + st->note_page_pgd(st, addr, pgd_zero); + break; + default: + break; + } return 0; } @@ -162,7 +184,7 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd) mmap_write_unlock(mm); /* Flush out the last page */ - st->note_page(st, 0, -1, 0); + st->note_page_flush(st); } static int check_wx_show(struct seq_file *m, void *v) diff --git a/mm/readahead.c b/mm/readahead.c index 8f1cf599b572..20d36d6b055e 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -158,20 +158,10 @@ static void read_pages(struct readahead_control *rac) if (aops->readahead) { aops->readahead(rac); - /* - * Clean up the remaining folios. The sizes in ->ra - * may be used to size the next readahead, so make sure - * they accurately reflect what happened. - */ + /* Clean up the remaining folios. */ while ((folio = readahead_folio(rac)) != NULL) { - unsigned long nr = folio_nr_pages(folio); - folio_get(folio); - rac->ra->size -= nr; - if (rac->ra->async_size >= nr) { - rac->ra->async_size -= nr; - filemap_remove_folio(folio); - } + filemap_remove_folio(folio); folio_unlock(folio); folio_put(folio); } @@ -188,6 +178,18 @@ static void read_pages(struct readahead_control *rac) BUG_ON(readahead_count(rac)); } +static struct folio *ractl_alloc_folio(struct readahead_control *ractl, + gfp_t gfp_mask, unsigned int order) +{ + struct folio *folio; + + folio = filemap_alloc_folio(gfp_mask, order); + if (folio && ractl->dropbehind) + __folio_set_dropbehind(folio); + + return folio; +} + /** * page_cache_ra_unbounded - Start unchecked readahead. * @ractl: Readahead control. @@ -265,8 +267,8 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, continue; } - folio = filemap_alloc_folio(gfp_mask, - mapping_min_folio_order(mapping)); + folio = ractl_alloc_folio(ractl, gfp_mask, + mapping_min_folio_order(mapping)); if (!folio) break; @@ -436,7 +438,7 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, pgoff_t mark, unsigned int order, gfp_t gfp) { int err; - struct folio *folio = filemap_alloc_folio(gfp, order); + struct folio *folio = ractl_alloc_folio(ractl, gfp, order); if (!folio) return -ENOMEM; @@ -517,12 +519,18 @@ void page_cache_ra_order(struct readahead_control *ractl, /* * If there were already pages in the page cache, then we may have * left some gaps. Let the regular readahead code take care of this - * situation. + * situation below. */ if (!err) return; fallback: - do_page_cache_ra(ractl, ra->size - (index - start), ra->async_size); + /* + * ->readahead() may have updated readahead window size so we have to + * check there's still something to read. + */ + if (ra->size > index - start) + do_page_cache_ra(ractl, ra->size - (index - start), + ra->async_size); } static unsigned long ractl_max_pages(struct readahead_control *ractl, @@ -647,7 +655,11 @@ void page_cache_async_ra(struct readahead_control *ractl, 1UL << order); if (index == expected) { ra->start += ra->size; - ra->size = get_next_ra_size(ra, max_pages); + /* + * In the case of MADV_HUGEPAGE, the actual size might exceed + * the readahead window. + */ + ra->size = max(ra->size, get_next_ra_size(ra, max_pages)); ra->async_size = ra->size; goto readit; } @@ -678,9 +690,15 @@ EXPORT_SYMBOL_GPL(page_cache_async_ra); ssize_t ksys_readahead(int fd, loff_t offset, size_t count) { + struct file *file; + const struct inode *inode; + CLASS(fd, f)(fd); + if (fd_empty(f)) + return -EBADF; - if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ)) + file = fd_file(f); + if (!(file->f_mode & FMODE_READ)) return -EBADF; /* @@ -688,9 +706,15 @@ ssize_t ksys_readahead(int fd, loff_t offset, size_t count) * that can execute readahead. If readahead is not possible * on this file, then we must return -EINVAL. */ - if (!fd_file(f)->f_mapping || !fd_file(f)->f_mapping->a_ops || - (!S_ISREG(file_inode(fd_file(f))->i_mode) && - !S_ISBLK(file_inode(fd_file(f))->i_mode))) + if (!file->f_mapping) + return -EINVAL; + if (!file->f_mapping->a_ops) + return -EINVAL; + + inode = file_inode(file); + if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) + return -EINVAL; + if (IS_ANON_FILE(inode)) return -EINVAL; return vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED); @@ -751,7 +775,7 @@ void readahead_expand(struct readahead_control *ractl, if (folio && !xa_is_value(folio)) return; /* Folio apparently present */ - folio = filemap_alloc_folio(gfp_mask, min_order); + folio = ractl_alloc_folio(ractl, gfp_mask, min_order); if (!folio) return; @@ -780,7 +804,7 @@ void readahead_expand(struct readahead_control *ractl, if (folio && !xa_is_value(folio)) return; /* Folio apparently present */ - folio = filemap_alloc_folio(gfp_mask, min_order); + folio = ractl_alloc_folio(ractl, gfp_mask, min_order); if (!folio) return; diff --git a/mm/rmap.c b/mm/rmap.c index c6c4d4ea29a7..fb63d9256f09 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -672,7 +672,7 @@ void try_to_unmap_flush_dirty(void) (TLB_FLUSH_BATCH_PENDING_MASK / 2) static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval, - unsigned long uaddr) + unsigned long start, unsigned long end) { struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; int batch; @@ -681,7 +681,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval, if (!pte_accessible(mm, pteval)) return; - arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, uaddr); + arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, start, end); tlb_ubc->flush_required = true; /* @@ -757,7 +757,7 @@ void flush_tlb_batched_pending(struct mm_struct *mm) } #else static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval, - unsigned long uaddr) + unsigned long start, unsigned long end) { } @@ -774,7 +774,7 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) * @vma: The VMA we need to know the address in. * * Calculates the user virtual address of this page in the specified VMA. - * It is the caller's responsibililty to check the page is actually + * It is the caller's responsibility to check the page is actually * within the VMA. There may not currently be a PTE pointing at this * page, but if a page fault occurs at this address, this is the page * which will be accessed. @@ -789,13 +789,13 @@ unsigned long page_address_in_vma(const struct folio *folio, const struct page *page, const struct vm_area_struct *vma) { if (folio_test_anon(folio)) { - struct anon_vma *page__anon_vma = folio_anon_vma(folio); + struct anon_vma *anon_vma = folio_anon_vma(folio); /* * Note: swapoff's unuse_vma() is more efficient with this * check, and needs it to match anon_vma when KSM is active. */ - if (!vma->anon_vma || !page__anon_vma || - vma->anon_vma->root != page__anon_vma->root) + if (!vma->anon_vma || !anon_vma || + vma->anon_vma->root != anon_vma->root) return -EFAULT; } else if (!vma->vm_file) { return -EFAULT; @@ -803,7 +803,7 @@ unsigned long page_address_in_vma(const struct folio *folio, return -EFAULT; } - /* KSM folios don't reach here because of the !page__anon_vma check */ + /* KSM folios don't reach here because of the !anon_vma check */ return vma_address(vma, page_pgoff(folio, page), 1); } @@ -889,7 +889,7 @@ static bool folio_referenced_one(struct folio *folio, if ((!atomic_read(&vma->vm_mm->mm_users) || check_stable_address_space(vma->vm_mm)) && folio_test_anon(folio) && folio_test_swapbacked(folio) && - !folio_likely_mapped_shared(folio)) { + !folio_maybe_mapped_shared(folio)) { pra->referenced = -1; page_vma_mapped_walk_done(&pvmw); return false; @@ -1044,6 +1044,14 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) pte_t *pte = pvmw->pte; pte_t entry = ptep_get(pte); + /* + * PFN swap PTEs, such as device-exclusive ones, that + * actually map pages are clean and not writable from a + * CPU perspective. The MMU notifier takes care of any + * device aspects. + */ + if (!pte_present(entry)) + continue; if (!pte_dirty(entry) && !pte_write(entry)) continue; @@ -1127,6 +1135,80 @@ int folio_mkclean(struct folio *folio) } EXPORT_SYMBOL_GPL(folio_mkclean); +struct wrprotect_file_state { + int cleaned; + pgoff_t pgoff; + unsigned long pfn; + unsigned long nr_pages; +}; + +static bool mapping_wrprotect_range_one(struct folio *folio, + struct vm_area_struct *vma, unsigned long address, void *arg) +{ + struct wrprotect_file_state *state = (struct wrprotect_file_state *)arg; + struct page_vma_mapped_walk pvmw = { + .pfn = state->pfn, + .nr_pages = state->nr_pages, + .pgoff = state->pgoff, + .vma = vma, + .address = address, + .flags = PVMW_SYNC, + }; + + state->cleaned += page_vma_mkclean_one(&pvmw); + + return true; +} + +static void __rmap_walk_file(struct folio *folio, struct address_space *mapping, + pgoff_t pgoff_start, unsigned long nr_pages, + struct rmap_walk_control *rwc, bool locked); + +/** + * mapping_wrprotect_range() - Write-protect all mappings in a specified range. + * + * @mapping: The mapping whose reverse mapping should be traversed. + * @pgoff: The page offset at which @pfn is mapped within @mapping. + * @pfn: The PFN of the page mapped in @mapping at @pgoff. + * @nr_pages: The number of physically contiguous base pages spanned. + * + * Traverses the reverse mapping, finding all VMAs which contain a shared + * mapping of the pages in the specified range in @mapping, and write-protects + * them (that is, updates the page tables to mark the mappings read-only such + * that a write protection fault arises when the mappings are written to). + * + * The @pfn value need not refer to a folio, but rather can reference a kernel + * allocation which is mapped into userland. We therefore do not require that + * the page maps to a folio with a valid mapping or index field, rather the + * caller specifies these in @mapping and @pgoff. + * + * Return: the number of write-protected PTEs, or an error. + */ +int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff, + unsigned long pfn, unsigned long nr_pages) +{ + struct wrprotect_file_state state = { + .cleaned = 0, + .pgoff = pgoff, + .pfn = pfn, + .nr_pages = nr_pages, + }; + struct rmap_walk_control rwc = { + .arg = (void *)&state, + .rmap_one = mapping_wrprotect_range_one, + .invalid_vma = invalid_mkclean_vma, + }; + + if (!mapping) + return 0; + + __rmap_walk_file(/* folio = */NULL, mapping, pgoff, nr_pages, &rwc, + /* locked = */false); + + return state.cleaned; +} +EXPORT_SYMBOL_GPL(mapping_wrprotect_range); + /** * pfn_mkclean_range - Cleans the PTEs (including PMDs) mapped with range of * [@pfn, @pfn + @nr_pages) at the specific offset (@pgoff) @@ -1160,8 +1242,8 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, } static __always_inline unsigned int __folio_add_rmap(struct folio *folio, - struct page *page, int nr_pages, enum rmap_level level, - int *nr_pmdmapped) + struct page *page, int nr_pages, struct vm_area_struct *vma, + enum rmap_level level, int *nr_pmdmapped) { atomic_t *mapped = &folio->_nr_pages_mapped; const int orig_nr_pages = nr_pages; @@ -1176,6 +1258,16 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio, break; } + if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { + nr = folio_add_return_large_mapcount(folio, orig_nr_pages, vma); + if (nr == orig_nr_pages) + /* Was completely unmapped. */ + nr = folio_large_nr_pages(folio); + else + nr = 0; + break; + } + do { first += atomic_inc_and_test(&page->_mapcount); } while (page++, --nr_pages > 0); @@ -1184,15 +1276,34 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio, atomic_add_return_relaxed(first, mapped) < ENTIRELY_MAPPED) nr = first; - atomic_add(orig_nr_pages, &folio->_large_mapcount); + folio_add_large_mapcount(folio, orig_nr_pages, vma); break; case RMAP_LEVEL_PMD: + case RMAP_LEVEL_PUD: first = atomic_inc_and_test(&folio->_entire_mapcount); + if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { + if (level == RMAP_LEVEL_PMD && first) + *nr_pmdmapped = folio_large_nr_pages(folio); + nr = folio_inc_return_large_mapcount(folio, vma); + if (nr == 1) + /* Was completely unmapped. */ + nr = folio_large_nr_pages(folio); + else + nr = 0; + break; + } + if (first) { nr = atomic_add_return_relaxed(ENTIRELY_MAPPED, mapped); if (likely(nr < ENTIRELY_MAPPED + ENTIRELY_MAPPED)) { - *nr_pmdmapped = folio_nr_pages(folio); - nr = *nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); + nr_pages = folio_large_nr_pages(folio); + /* + * We only track PMD mappings of PMD-sized + * folios separately. + */ + if (level == RMAP_LEVEL_PMD) + *nr_pmdmapped = nr_pages; + nr = nr_pages - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of a remove and another add? */ if (unlikely(nr < 0)) nr = 0; @@ -1201,7 +1312,7 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio, nr = 0; } } - atomic_inc(&folio->_large_mapcount); + folio_inc_large_mapcount(folio, vma); break; } return nr; @@ -1322,7 +1433,7 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio, VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); - nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped); + nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped); if (likely(!folio_test_ksm(folio))) __page_check_anon_rmap(folio, page, vma, address); @@ -1338,15 +1449,32 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio, case RMAP_LEVEL_PMD: SetPageAnonExclusive(page); break; + case RMAP_LEVEL_PUD: + /* + * Keep the compiler happy, we don't support anonymous + * PUD mappings. + */ + WARN_ON_ONCE(1); + break; } } + + VM_WARN_ON_FOLIO(!folio_test_large(folio) && PageAnonExclusive(page) && + atomic_read(&folio->_mapcount) > 0, folio); for (i = 0; i < nr_pages; i++) { struct page *cur_page = page + i; - /* While PTE-mapping a THP we have a PMD and a PTE mapping. */ - VM_WARN_ON_FOLIO((atomic_read(&cur_page->_mapcount) > 0 || - (folio_test_large(folio) && - folio_entire_mapcount(folio) > 1)) && + VM_WARN_ON_FOLIO(folio_test_large(folio) && + folio_entire_mapcount(folio) > 1 && + PageAnonExclusive(cur_page), folio); + if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) + continue; + + /* + * While PTE-mapping a THP we have a PMD and a PTE + * mapping. + */ + VM_WARN_ON_FOLIO(atomic_read(&cur_page->_mapcount) > 0 && PageAnonExclusive(cur_page), folio); } @@ -1426,14 +1554,11 @@ void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page, void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { - const int nr = folio_nr_pages(folio); const bool exclusive = flags & RMAP_EXCLUSIVE; - int nr_pmdmapped = 0; + int nr = 1, nr_pmdmapped = 0; VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); VM_WARN_ON_FOLIO(!exclusive && !folio_test_locked(folio), folio); - VM_BUG_ON_VMA(address < vma->vm_start || - address + (nr << PAGE_SHIFT) > vma->vm_end, vma); /* * VM_DROPPABLE mappings don't swap; instead they're just dropped when @@ -1451,29 +1576,35 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, } else if (!folio_test_pmd_mappable(folio)) { int i; + nr = folio_large_nr_pages(folio); for (i = 0; i < nr; i++) { struct page *page = folio_page(folio, i); - /* increment count (starts at -1) */ - atomic_set(&page->_mapcount, 0); + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) + /* increment count (starts at -1) */ + atomic_set(&page->_mapcount, 0); if (exclusive) SetPageAnonExclusive(page); } - /* increment count (starts at -1) */ - atomic_set(&folio->_large_mapcount, nr - 1); - atomic_set(&folio->_nr_pages_mapped, nr); + folio_set_large_mapcount(folio, nr, vma); + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) + atomic_set(&folio->_nr_pages_mapped, nr); } else { + nr = folio_large_nr_pages(folio); /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); - /* increment count (starts at -1) */ - atomic_set(&folio->_large_mapcount, 0); - atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED); + folio_set_large_mapcount(folio, 1, vma); + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) + atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED); if (exclusive) SetPageAnonExclusive(&folio->page); nr_pmdmapped = nr; } + VM_WARN_ON_ONCE(address < vma->vm_start || + address + (nr << PAGE_SHIFT) > vma->vm_end); + __folio_mod_stat(folio, nr, nr_pmdmapped); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1); } @@ -1486,7 +1617,7 @@ static __always_inline void __folio_add_file_rmap(struct folio *folio, VM_WARN_ON_FOLIO(folio_test_anon(folio), folio); - nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped); + nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped); __folio_mod_stat(folio, nr, nr_pmdmapped); /* See comments in folio_add_anon_rmap_*() */ @@ -1531,6 +1662,27 @@ void folio_add_file_rmap_pmd(struct folio *folio, struct page *page, #endif } +/** + * folio_add_file_rmap_pud - add a PUD mapping to a page range of a folio + * @folio: The folio to add the mapping to + * @page: The first page to add + * @vma: The vm area in which the mapping is added + * + * The page range of the folio is defined by [page, page + HPAGE_PUD_NR) + * + * The caller needs to hold the page table lock. + */ +void folio_add_file_rmap_pud(struct folio *folio, struct page *page, + struct vm_area_struct *vma) +{ +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ + defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) + __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD); +#else + WARN_ON_ONCE(true); +#endif +} + static __always_inline void __folio_remove_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, enum rmap_level level) @@ -1548,7 +1700,20 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, break; } - atomic_sub(nr_pages, &folio->_large_mapcount); + if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { + nr = folio_sub_return_large_mapcount(folio, nr_pages, vma); + if (!nr) { + /* Now completely unmapped. */ + nr = folio_nr_pages(folio); + } else { + partially_mapped = nr < folio_large_nr_pages(folio) && + !folio_entire_mapcount(folio); + nr = 0; + } + break; + } + + folio_sub_large_mapcount(folio, nr_pages, vma); do { last += atomic_add_negative(-1, &page->_mapcount); } while (page++, --nr_pages > 0); @@ -1560,13 +1725,32 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, partially_mapped = nr && atomic_read(mapped); break; case RMAP_LEVEL_PMD: - atomic_dec(&folio->_large_mapcount); + case RMAP_LEVEL_PUD: + if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { + last = atomic_add_negative(-1, &folio->_entire_mapcount); + if (level == RMAP_LEVEL_PMD && last) + nr_pmdmapped = folio_large_nr_pages(folio); + nr = folio_dec_return_large_mapcount(folio, vma); + if (!nr) { + /* Now completely unmapped. */ + nr = folio_large_nr_pages(folio); + } else { + partially_mapped = last && + nr < folio_large_nr_pages(folio); + nr = 0; + } + break; + } + + folio_dec_large_mapcount(folio, vma); last = atomic_add_negative(-1, &folio->_entire_mapcount); if (last) { nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped); if (likely(nr < ENTIRELY_MAPPED)) { - nr_pmdmapped = folio_nr_pages(folio); - nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); + nr_pages = folio_large_nr_pages(folio); + if (level == RMAP_LEVEL_PMD) + nr_pmdmapped = nr_pages; + nr = nr_pages - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of another remove and an add? */ if (unlikely(nr < 0)) nr = 0; @@ -1640,6 +1824,46 @@ void folio_remove_rmap_pmd(struct folio *folio, struct page *page, #endif } +/** + * folio_remove_rmap_pud - remove a PUD mapping from a page range of a folio + * @folio: The folio to remove the mapping from + * @page: The first page to remove + * @vma: The vm area from which the mapping is removed + * + * The page range of the folio is defined by [page, page + HPAGE_PUD_NR) + * + * The caller needs to hold the page table lock. + */ +void folio_remove_rmap_pud(struct folio *folio, struct page *page, + struct vm_area_struct *vma) +{ +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ + defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) + __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD); +#else + WARN_ON_ONCE(true); +#endif +} + +/* We support batch unmapping of PTEs for lazyfree large folios */ +static inline bool can_batch_unmap_folio_ptes(unsigned long addr, + struct folio *folio, pte_t *ptep) +{ + const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; + int max_nr = folio_nr_pages(folio); + pte_t pte = ptep_get(ptep); + + if (!folio_test_anon(folio) || folio_test_swapbacked(folio)) + return false; + if (pte_unused(pte)) + return false; + if (pte_pfn(pte) != folio_pfn(folio)) + return false; + + return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL, + NULL, NULL) == max_nr; +} + /* * @arg: enum ttu_flags will be passed to this argument */ @@ -1648,11 +1872,12 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); + bool anon_exclusive, ret = true; pte_t pteval; struct page *subpage; - bool anon_exclusive, ret = true; struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; + unsigned long nr_pages = 1, end_addr; unsigned long pfn; unsigned long hsz = 0; @@ -1702,9 +1927,16 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, } if (!pvmw.pte) { - if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, - folio)) - goto walk_done; + if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) { + if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, folio)) + goto walk_done; + /* + * unmap_huge_pmd_locked has either already marked + * the folio as swap-backed or decided to retain it + * due to GUP or speculative references. + */ + goto walk_abort; + } if (flags & TTU_SPLIT_HUGE_PMD) { /* @@ -1712,7 +1944,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * restart so we can process the PTE-mapped THP. */ split_huge_pmd_locked(vma, pvmw.address, - pvmw.pmd, false, folio); + pvmw.pmd, false); flags &= ~TTU_SPLIT_HUGE_PMD; page_vma_mapped_walk_restart(&pvmw); continue; @@ -1722,7 +1954,18 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, /* Unexpected PMD-mapped THP? */ VM_BUG_ON_FOLIO(!pvmw.pte, folio); - pfn = pte_pfn(ptep_get(pvmw.pte)); + /* + * Handle PFN swap PTEs, such as device-exclusive ones, that + * actually map pages. + */ + pteval = ptep_get(pvmw.pte); + if (likely(pte_present(pteval))) { + pfn = pte_pfn(pteval); + } else { + pfn = swp_offset_pfn(pte_to_swp_entry(pteval)); + VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); + } + subpage = folio_page(folio, pfn - folio_pfn(folio)); address = pvmw.address; anon_exclusive = folio_test_anon(folio) && @@ -1778,24 +2021,33 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, hugetlb_vma_unlock_write(vma); } pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); - } else { - flush_cache_page(vma, address, pfn); - /* Nuke the page table entry. */ - if (should_defer_flush(mm, flags)) { - /* - * We clear the PTE but do not flush so potentially - * a remote CPU could still be writing to the folio. - * If the entry was previously clean then the - * architecture must guarantee that a clear->dirty - * transition on a cached TLB entry is written through - * and traps if the PTE is unmapped. - */ - pteval = ptep_get_and_clear(mm, address, pvmw.pte); + if (pte_dirty(pteval)) + folio_mark_dirty(folio); + } else if (likely(pte_present(pteval))) { + if (folio_test_large(folio) && !(flags & TTU_HWPOISON) && + can_batch_unmap_folio_ptes(address, folio, pvmw.pte)) + nr_pages = folio_nr_pages(folio); + end_addr = address + nr_pages * PAGE_SIZE; + flush_cache_range(vma, address, end_addr); - set_tlb_ubc_flush_pending(mm, pteval, address); - } else { - pteval = ptep_clear_flush(vma, address, pvmw.pte); - } + /* Nuke the page table entry. */ + pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0); + /* + * We clear the PTE but do not flush so potentially + * a remote CPU could still be writing to the folio. + * If the entry was previously clean then the + * architecture must guarantee that a clear->dirty + * transition on a cached TLB entry is written through + * and traps if the PTE is unmapped. + */ + if (should_defer_flush(mm, flags)) + set_tlb_ubc_flush_pending(mm, pteval, address, end_addr); + else + flush_tlb_range(vma, address, end_addr); + if (pte_dirty(pteval)) + folio_mark_dirty(folio); + } else { + pte_clear(mm, address, pvmw.pte); } /* @@ -1805,10 +2057,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, */ pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval); - /* Set the dirty flag on the folio now the pte is gone. */ - if (pte_dirty(pteval)) - folio_mark_dirty(folio); - /* Update high watermark before we lower rss */ update_hiwater_rss(mm); @@ -1822,8 +2070,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, dec_mm_counter(mm, mm_counter(folio)); set_pte_at(mm, address, pvmw.pte, pteval); } - - } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) { + } else if (likely(pte_present(pteval)) && pte_unused(pteval) && + !userfaultfd_armed(vma)) { /* * The guest indicated that the page content is of no * interest anymore. Simply discard the pte, vmscan @@ -1868,40 +2116,41 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, */ smp_rmb(); - /* - * The only page refs must be one from isolation - * plus the rmap(s) (dropped by discard:). - */ - if (ref_count == 1 + map_count && - (!folio_test_dirty(folio) || - /* - * Unlike MADV_FREE mappings, VM_DROPPABLE - * ones can be dropped even if they've - * been dirtied. - */ - (vma->vm_flags & VM_DROPPABLE))) { - dec_mm_counter(mm, MM_ANONPAGES); - goto discard; - } - - /* - * If the folio was redirtied, it cannot be - * discarded. Remap the page to page table. - */ - set_pte_at(mm, address, pvmw.pte, pteval); - /* - * Unlike MADV_FREE mappings, VM_DROPPABLE ones - * never get swap backed on failure to drop. - */ - if (!(vma->vm_flags & VM_DROPPABLE)) + if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) { + /* + * redirtied either using the page table or a previously + * obtained GUP reference. + */ + set_ptes(mm, address, pvmw.pte, pteval, nr_pages); folio_set_swapbacked(folio); - goto walk_abort; + goto walk_abort; + } else if (ref_count != 1 + map_count) { + /* + * Additional reference. Could be a GUP reference or any + * speculative reference. GUP users must mark the folio + * dirty if there was a modification. This folio cannot be + * reclaimed right now either way, so act just like nothing + * happened. + * We'll come back here later and detect if the folio was + * dirtied when the additional reference is gone. + */ + set_ptes(mm, address, pvmw.pte, pteval, nr_pages); + goto walk_abort; + } + add_mm_counter(mm, MM_ANONPAGES, -nr_pages); + goto discard; } if (swap_duplicate(entry) < 0) { set_pte_at(mm, address, pvmw.pte, pteval); goto walk_abort; } + + /* + * arch_unmap_one() is expected to be a NOP on + * architectures where we could have PFN swap PTEs, + * so we'll not check/care. + */ if (arch_unmap_one(mm, vma, address, pteval) < 0) { swap_free(entry); set_pte_at(mm, address, pvmw.pte, pteval); @@ -1926,10 +2175,17 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, swp_pte = swp_entry_to_pte(entry); if (anon_exclusive) swp_pte = pte_swp_mkexclusive(swp_pte); - if (pte_soft_dirty(pteval)) - swp_pte = pte_swp_mksoft_dirty(swp_pte); - if (pte_uffd_wp(pteval)) - swp_pte = pte_swp_mkuffd_wp(swp_pte); + if (likely(pte_present(pteval))) { + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_uffd_wp(pteval)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + } else { + if (pte_swp_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_swp_uffd_wp(pteval)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + } set_pte_at(mm, address, pvmw.pte, swp_pte); } else { /* @@ -1946,13 +2202,18 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, dec_mm_counter(mm, mm_counter_file(folio)); } discard: - if (unlikely(folio_test_hugetlb(folio))) + if (unlikely(folio_test_hugetlb(folio))) { hugetlb_remove_rmap(folio); - else - folio_remove_rmap_pte(folio, subpage, vma); + } else { + folio_remove_rmap_ptes(folio, subpage, nr_pages, vma); + folio_ref_sub(folio, nr_pages - 1); + } if (vma->vm_flags & VM_LOCKED) mlock_drain_local(); folio_put(folio); + /* We have already batched the entire folio */ + if (nr_pages > 1) + goto walk_done; continue; walk_abort: ret = false; @@ -2013,9 +2274,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); + bool anon_exclusive, writable, ret = true; pte_t pteval; struct page *subpage; - bool anon_exclusive, ret = true; struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; unsigned long pfn; @@ -2031,13 +2292,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, pvmw.flags = PVMW_SYNC; /* - * unmap_page() in mm/huge_memory.c is the only user of migration with - * TTU_SPLIT_HUGE_PMD and it wants to freeze. - */ - if (flags & TTU_SPLIT_HUGE_PMD) - split_huge_pmd_address(vma, address, true, folio); - - /* * For THP, we have to assume the worse case ie pmd for invalidation. * For hugetlb, it could be much worse if we need to do pud * invalidation in the case of pmd sharing. @@ -2062,9 +2316,16 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { -#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION /* PMD-mapped THP migration entry */ if (!pvmw.pte) { + if (flags & TTU_SPLIT_HUGE_PMD) { + split_huge_pmd_locked(vma, pvmw.address, + pvmw.pmd, true); + ret = false; + page_vma_mapped_walk_done(&pvmw); + break; + } +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION subpage = folio_page(folio, pmd_pfn(*pvmw.pmd) - folio_pfn(folio)); VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) || @@ -2076,30 +2337,25 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, break; } continue; - } #endif + } /* Unexpected PMD-mapped THP? */ VM_BUG_ON_FOLIO(!pvmw.pte, folio); - pfn = pte_pfn(ptep_get(pvmw.pte)); - - if (folio_is_zone_device(folio)) { - /* - * Our PTE is a non-present device exclusive entry and - * calculating the subpage as for the common case would - * result in an invalid pointer. - * - * Since only PAGE_SIZE pages can currently be - * migrated, just set it to page. This will need to be - * changed when hugepage migrations to device private - * memory are supported. - */ - VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio); - subpage = &folio->page; + /* + * Handle PFN swap PTEs, such as device-exclusive ones, that + * actually map pages. + */ + pteval = ptep_get(pvmw.pte); + if (likely(pte_present(pteval))) { + pfn = pte_pfn(pteval); } else { - subpage = folio_page(folio, pfn - folio_pfn(folio)); + pfn = swp_offset_pfn(pte_to_swp_entry(pteval)); + VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); } + + subpage = folio_page(folio, pfn - folio_pfn(folio)); address = pvmw.address; anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(subpage); @@ -2155,7 +2411,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, } /* Nuke the hugetlb page table entry */ pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); - } else { + if (pte_dirty(pteval)) + folio_mark_dirty(folio); + writable = pte_write(pteval); + } else if (likely(pte_present(pteval))) { flush_cache_page(vma, address, pfn); /* Nuke the page table entry. */ if (should_defer_flush(mm, flags)) { @@ -2169,58 +2428,27 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, */ pteval = ptep_get_and_clear(mm, address, pvmw.pte); - set_tlb_ubc_flush_pending(mm, pteval, address); + set_tlb_ubc_flush_pending(mm, pteval, address, address + PAGE_SIZE); } else { pteval = ptep_clear_flush(vma, address, pvmw.pte); } + if (pte_dirty(pteval)) + folio_mark_dirty(folio); + writable = pte_write(pteval); + } else { + pte_clear(mm, address, pvmw.pte); + writable = is_writable_device_private_entry(pte_to_swp_entry(pteval)); } - /* Set the dirty flag on the folio now the pte is gone. */ - if (pte_dirty(pteval)) - folio_mark_dirty(folio); + VM_WARN_ON_FOLIO(writable && folio_test_anon(folio) && + !anon_exclusive, folio); /* Update high watermark before we lower rss */ update_hiwater_rss(mm); - if (folio_is_device_private(folio)) { - unsigned long pfn = folio_pfn(folio); - swp_entry_t entry; - pte_t swp_pte; - - if (anon_exclusive) - WARN_ON_ONCE(folio_try_share_anon_rmap_pte(folio, - subpage)); - - /* - * Store the pfn of the page in a special migration - * pte. do_swap_page() will wait until the migration - * pte is removed and then restart fault handling. - */ - entry = pte_to_swp_entry(pteval); - if (is_writable_device_private_entry(entry)) - entry = make_writable_migration_entry(pfn); - else if (anon_exclusive) - entry = make_readable_exclusive_migration_entry(pfn); - else - entry = make_readable_migration_entry(pfn); - swp_pte = swp_entry_to_pte(entry); + if (PageHWPoison(subpage)) { + VM_WARN_ON_FOLIO(folio_is_device_private(folio), folio); - /* - * pteval maps a zone device page and is therefore - * a swap pte. - */ - if (pte_swp_soft_dirty(pteval)) - swp_pte = pte_swp_mksoft_dirty(swp_pte); - if (pte_swp_uffd_wp(pteval)) - swp_pte = pte_swp_mkuffd_wp(swp_pte); - set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); - trace_set_migration_pte(pvmw.address, pte_val(swp_pte), - folio_order(folio)); - /* - * No need to invalidate here it will synchronize on - * against the special swap migration pte. - */ - } else if (PageHWPoison(subpage)) { pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (folio_test_hugetlb(folio)) { hugetlb_count_sub(folio_nr_pages(folio), mm); @@ -2230,8 +2458,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, dec_mm_counter(mm, mm_counter(folio)); set_pte_at(mm, address, pvmw.pte, pteval); } - - } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) { + } else if (likely(pte_present(pteval)) && pte_unused(pteval) && + !userfaultfd_armed(vma)) { /* * The guest indicated that the page content is of no * interest anymore. Simply discard the pte, vmscan @@ -2247,6 +2475,11 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, swp_entry_t entry; pte_t swp_pte; + /* + * arch_unmap_one() is expected to be a NOP on + * architectures where we could have PFN swap PTEs, + * so we'll not check/care. + */ if (arch_unmap_one(mm, vma, address, pteval) < 0) { if (folio_test_hugetlb(folio)) set_huge_pte_at(mm, address, pvmw.pte, @@ -2257,8 +2490,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, page_vma_mapped_walk_done(&pvmw); break; } - VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) && - !anon_exclusive, subpage); /* See folio_try_share_anon_rmap_pte(): clear PTE first. */ if (folio_test_hugetlb(folio)) { @@ -2283,7 +2514,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * pte. do_swap_page() will wait until the migration * pte is removed and then restart fault handling. */ - if (pte_write(pteval)) + if (writable) entry = make_writable_migration_entry( page_to_pfn(subpage)); else if (anon_exclusive) @@ -2292,15 +2523,23 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, else entry = make_readable_migration_entry( page_to_pfn(subpage)); - if (pte_young(pteval)) - entry = make_migration_entry_young(entry); - if (pte_dirty(pteval)) - entry = make_migration_entry_dirty(entry); - swp_pte = swp_entry_to_pte(entry); - if (pte_soft_dirty(pteval)) - swp_pte = pte_swp_mksoft_dirty(swp_pte); - if (pte_uffd_wp(pteval)) - swp_pte = pte_swp_mkuffd_wp(swp_pte); + if (likely(pte_present(pteval))) { + if (pte_young(pteval)) + entry = make_migration_entry_young(entry); + if (pte_dirty(pteval)) + entry = make_migration_entry_dirty(entry); + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_uffd_wp(pteval)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + } else { + swp_pte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_swp_uffd_wp(pteval)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + } if (folio_test_hugetlb(folio)) set_huge_pte_at(mm, address, pvmw.pte, swp_pte, hsz); @@ -2375,190 +2614,139 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags) } #ifdef CONFIG_DEVICE_PRIVATE -struct make_exclusive_args { - struct mm_struct *mm; - unsigned long address; - void *owner; - bool valid; -}; - -static bool page_make_device_exclusive_one(struct folio *folio, - struct vm_area_struct *vma, unsigned long address, void *priv) +/** + * make_device_exclusive() - Mark a page for exclusive use by a device + * @mm: mm_struct of associated target process + * @addr: the virtual address to mark for exclusive device access + * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering + * @foliop: folio pointer will be stored here on success. + * + * This function looks up the page mapped at the given address, grabs a + * folio reference, locks the folio and replaces the PTE with special + * device-exclusive PFN swap entry, preventing access through the process + * page tables. The function will return with the folio locked and referenced. + * + * On fault, the device-exclusive entries are replaced with the original PTE + * under folio lock, after calling MMU notifiers. + * + * Only anonymous non-hugetlb folios are supported and the VMA must have + * write permissions such that we can fault in the anonymous page writable + * in order to mark it exclusive. The caller must hold the mmap_lock in read + * mode. + * + * A driver using this to program access from a device must use a mmu notifier + * critical section to hold a device specific lock during programming. Once + * programming is complete it should drop the folio lock and reference after + * which point CPU access to the page will revoke the exclusive access. + * + * Notes: + * #. This function always operates on individual PTEs mapping individual + * pages. PMD-sized THPs are first remapped to be mapped by PTEs before + * the conversion happens on a single PTE corresponding to @addr. + * #. While concurrent access through the process page tables is prevented, + * concurrent access through other page references (e.g., earlier GUP + * invocation) is not handled and not supported. + * #. device-exclusive entries are considered "clean" and "old" by core-mm. + * Device drivers must update the folio state when informed by MMU + * notifiers. + * + * Returns: pointer to mapped page on success, otherwise a negative error. + */ +struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr, + void *owner, struct folio **foliop) { - struct mm_struct *mm = vma->vm_mm; - DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); - struct make_exclusive_args *args = priv; - pte_t pteval; - struct page *subpage; - bool ret = true; struct mmu_notifier_range range; + struct folio *folio, *fw_folio; + struct vm_area_struct *vma; + struct folio_walk fw; + struct page *page; swp_entry_t entry; pte_t swp_pte; - pte_t ptent; - - mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, - vma->vm_mm, address, min(vma->vm_end, - address + folio_size(folio)), - args->owner); - mmu_notifier_invalidate_range_start(&range); - - while (page_vma_mapped_walk(&pvmw)) { - /* Unexpected PMD-mapped THP? */ - VM_BUG_ON_FOLIO(!pvmw.pte, folio); - - ptent = ptep_get(pvmw.pte); - if (!pte_present(ptent)) { - ret = false; - page_vma_mapped_walk_done(&pvmw); - break; - } + int ret; - subpage = folio_page(folio, - pte_pfn(ptent) - folio_pfn(folio)); - address = pvmw.address; - - /* Nuke the page table entry. */ - flush_cache_page(vma, address, pte_pfn(ptent)); - pteval = ptep_clear_flush(vma, address, pvmw.pte); - - /* Set the dirty flag on the folio now the pte is gone. */ - if (pte_dirty(pteval)) - folio_mark_dirty(folio); - - /* - * Check that our target page is still mapped at the expected - * address. - */ - if (args->mm == mm && args->address == address && - pte_write(pteval)) - args->valid = true; - - /* - * Store the pfn of the page in a special migration - * pte. do_swap_page() will wait until the migration - * pte is removed and then restart fault handling. - */ - if (pte_write(pteval)) - entry = make_writable_device_exclusive_entry( - page_to_pfn(subpage)); - else - entry = make_readable_device_exclusive_entry( - page_to_pfn(subpage)); - swp_pte = swp_entry_to_pte(entry); - if (pte_soft_dirty(pteval)) - swp_pte = pte_swp_mksoft_dirty(swp_pte); - if (pte_uffd_wp(pteval)) - swp_pte = pte_swp_mkuffd_wp(swp_pte); - - set_pte_at(mm, address, pvmw.pte, swp_pte); + mmap_assert_locked(mm); + addr = PAGE_ALIGN_DOWN(addr); - /* - * There is a reference on the page for the swap entry which has - * been removed, so shouldn't take another. - */ - folio_remove_rmap_pte(folio, subpage, vma); + /* + * Fault in the page writable and try to lock it; note that if the + * address would already be marked for exclusive use by a device, + * the GUP call would undo that first by triggering a fault. + * + * If any other device would already map this page exclusively, the + * fault will trigger a conversion to an ordinary + * (non-device-exclusive) PTE and issue a MMU_NOTIFY_EXCLUSIVE. + */ +retry: + page = get_user_page_vma_remote(mm, addr, + FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD, + &vma); + if (IS_ERR(page)) + return page; + folio = page_folio(page); + + if (!folio_test_anon(folio) || folio_test_hugetlb(folio)) { + folio_put(folio); + return ERR_PTR(-EOPNOTSUPP); } - mmu_notifier_invalidate_range_end(&range); - - return ret; -} - -/** - * folio_make_device_exclusive - Mark the folio exclusively owned by a device. - * @folio: The folio to replace page table entries for. - * @mm: The mm_struct where the folio is expected to be mapped. - * @address: Address where the folio is expected to be mapped. - * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks - * - * Tries to remove all the page table entries which are mapping this - * folio and replace them with special device exclusive swap entries to - * grant a device exclusive access to the folio. - * - * Context: Caller must hold the folio lock. - * Return: false if the page is still mapped, or if it could not be unmapped - * from the expected address. Otherwise returns true (success). - */ -static bool folio_make_device_exclusive(struct folio *folio, - struct mm_struct *mm, unsigned long address, void *owner) -{ - struct make_exclusive_args args = { - .mm = mm, - .address = address, - .owner = owner, - .valid = false, - }; - struct rmap_walk_control rwc = { - .rmap_one = page_make_device_exclusive_one, - .done = folio_not_mapped, - .anon_lock = folio_lock_anon_vma_read, - .arg = &args, - }; + ret = folio_lock_killable(folio); + if (ret) { + folio_put(folio); + return ERR_PTR(ret); + } /* - * Restrict to anonymous folios for now to avoid potential writeback - * issues. + * Inform secondary MMUs that we are going to convert this PTE to + * device-exclusive, such that they unmap it now. Note that the + * caller must filter this event out to prevent livelocks. */ - if (!folio_test_anon(folio)) - return false; - - rmap_walk(folio, &rwc); + mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, + mm, addr, addr + PAGE_SIZE, owner); + mmu_notifier_invalidate_range_start(&range); - return args.valid && !folio_mapcount(folio); -} + /* + * Let's do a second walk and make sure we still find the same page + * mapped writable. Note that any page of an anonymous folio can + * only be mapped writable using exactly one PTE ("exclusive"), so + * there cannot be other mappings. + */ + fw_folio = folio_walk_start(&fw, vma, addr, 0); + if (fw_folio != folio || fw.page != page || + fw.level != FW_LEVEL_PTE || !pte_write(fw.pte)) { + if (fw_folio) + folio_walk_end(&fw, vma); + mmu_notifier_invalidate_range_end(&range); + folio_unlock(folio); + folio_put(folio); + goto retry; + } -/** - * make_device_exclusive_range() - Mark a range for exclusive use by a device - * @mm: mm_struct of associated target process - * @start: start of the region to mark for exclusive device access - * @end: end address of region - * @pages: returns the pages which were successfully marked for exclusive access - * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering - * - * Returns: number of pages found in the range by GUP. A page is marked for - * exclusive access only if the page pointer is non-NULL. - * - * This function finds ptes mapping page(s) to the given address range, locks - * them and replaces mappings with special swap entries preventing userspace CPU - * access. On fault these entries are replaced with the original mapping after - * calling MMU notifiers. - * - * A driver using this to program access from a device must use a mmu notifier - * critical section to hold a device specific lock during programming. Once - * programming is complete it should drop the page lock and reference after - * which point CPU access to the page will revoke the exclusive access. - */ -int make_device_exclusive_range(struct mm_struct *mm, unsigned long start, - unsigned long end, struct page **pages, - void *owner) -{ - long npages = (end - start) >> PAGE_SHIFT; - long i; - - npages = get_user_pages_remote(mm, start, npages, - FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD, - pages, NULL); - if (npages < 0) - return npages; - - for (i = 0; i < npages; i++, start += PAGE_SIZE) { - struct folio *folio = page_folio(pages[i]); - if (PageTail(pages[i]) || !folio_trylock(folio)) { - folio_put(folio); - pages[i] = NULL; - continue; - } + /* Nuke the page table entry so we get the uptodate dirty bit. */ + flush_cache_page(vma, addr, page_to_pfn(page)); + fw.pte = ptep_clear_flush(vma, addr, fw.ptep); - if (!folio_make_device_exclusive(folio, mm, start, owner)) { - folio_unlock(folio); - folio_put(folio); - pages[i] = NULL; - } - } + /* Set the dirty flag on the folio now the PTE is gone. */ + if (pte_dirty(fw.pte)) + folio_mark_dirty(folio); - return npages; + /* + * Store the pfn of the page in a special device-exclusive PFN swap PTE. + * do_swap_page() will trigger the conversion back while holding the + * folio lock. + */ + entry = make_device_exclusive_entry(page_to_pfn(page)); + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(fw.pte)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + /* The pte is writable, uffd-wp does not apply. */ + set_pte_at(mm, addr, fw.ptep, swp_pte); + + folio_walk_end(&fw, vma); + mmu_notifier_invalidate_range_end(&range); + *foliop = folio; + return page; } -EXPORT_SYMBOL_GPL(make_device_exclusive_range); +EXPORT_SYMBOL_GPL(make_device_exclusive); #endif void __put_anon_vma(struct anon_vma *anon_vma) @@ -2653,35 +2841,37 @@ static void rmap_walk_anon(struct folio *folio, anon_vma_unlock_read(anon_vma); } -/* - * rmap_walk_file - do something to file page using the object-based rmap method - * @folio: the folio to be handled - * @rwc: control variable according to each walk type - * @locked: caller holds relevant rmap lock +/** + * __rmap_walk_file() - Traverse the reverse mapping for a file-backed mapping + * of a page mapped within a specified page cache object at a specified offset. * - * Find all the mappings of a folio using the mapping pointer and the vma chains - * contained in the address_space struct it points to. + * @folio: Either the folio whose mappings to traverse, or if NULL, + * the callbacks specified in @rwc will be configured such + * as to be able to look up mappings correctly. + * @mapping: The page cache object whose mapping VMAs we intend to + * traverse. If @folio is non-NULL, this should be equal to + * folio_mapping(folio). + * @pgoff_start: The offset within @mapping of the page which we are + * looking up. If @folio is non-NULL, this should be equal + * to folio_pgoff(folio). + * @nr_pages: The number of pages mapped by the mapping. If @folio is + * non-NULL, this should be equal to folio_nr_pages(folio). + * @rwc: The reverse mapping walk control object describing how + * the traversal should proceed. + * @locked: Is the @mapping already locked? If not, we acquire the + * lock. */ -static void rmap_walk_file(struct folio *folio, - struct rmap_walk_control *rwc, bool locked) +static void __rmap_walk_file(struct folio *folio, struct address_space *mapping, + pgoff_t pgoff_start, unsigned long nr_pages, + struct rmap_walk_control *rwc, bool locked) { - struct address_space *mapping = folio_mapping(folio); - pgoff_t pgoff_start, pgoff_end; + pgoff_t pgoff_end = pgoff_start + nr_pages - 1; struct vm_area_struct *vma; - /* - * The page lock not only makes sure that page->mapping cannot - * suddenly be NULLified by truncation, it makes sure that the - * structure at mapping cannot be freed and reused yet, - * so we can safely take mapping->i_mmap_rwsem. - */ - VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_FOLIO(folio && mapping != folio_mapping(folio), folio); + VM_WARN_ON_FOLIO(folio && pgoff_start != folio_pgoff(folio), folio); + VM_WARN_ON_FOLIO(folio && nr_pages != folio_nr_pages(folio), folio); - if (!mapping) - return; - - pgoff_start = folio_pgoff(folio); - pgoff_end = pgoff_start + folio_nr_pages(folio) - 1; if (!locked) { if (i_mmap_trylock_read(mapping)) goto lookup; @@ -2696,8 +2886,7 @@ static void rmap_walk_file(struct folio *folio, lookup: vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff_start, pgoff_end) { - unsigned long address = vma_address(vma, pgoff_start, - folio_nr_pages(folio)); + unsigned long address = vma_address(vma, pgoff_start, nr_pages); VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); @@ -2710,12 +2899,38 @@ lookup: if (rwc->done && rwc->done(folio)) goto done; } - done: if (!locked) i_mmap_unlock_read(mapping); } +/* + * rmap_walk_file - do something to file page using the object-based rmap method + * @folio: the folio to be handled + * @rwc: control variable according to each walk type + * @locked: caller holds relevant rmap lock + * + * Find all the mappings of a folio using the mapping pointer and the vma chains + * contained in the address_space struct it points to. + */ +static void rmap_walk_file(struct folio *folio, + struct rmap_walk_control *rwc, bool locked) +{ + /* + * The folio lock not only makes sure that folio->mapping cannot + * suddenly be NULLified by truncation, it makes sure that the structure + * at mapping cannot be freed and reused yet, so we can safely take + * mapping->i_mmap_rwsem. + */ + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + + if (!folio->mapping) + return; + + __rmap_walk_file(folio, folio->mapping, folio->index, + folio_nr_pages(folio), rwc, locked); +} + void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc) { if (unlikely(folio_test_ksm(folio))) diff --git a/mm/rodata_test.c b/mm/rodata_test.c index 6d783436951f..e7173fcd210c 100644 --- a/mm/rodata_test.c +++ b/mm/rodata_test.c @@ -12,7 +12,8 @@ #include <linux/mm.h> #include <asm/sections.h> -static const int rodata_test_data = 0xC3; +#define TEST_VALUE 0xC3 +static const int rodata_test_data = TEST_VALUE; void rodata_test(void) { @@ -20,7 +21,7 @@ void rodata_test(void) /* test 1: read the value */ /* If this test fails, some previous testrun has clobbered the state */ - if (!rodata_test_data) { + if (unlikely(READ_ONCE(rodata_test_data) != TEST_VALUE)) { pr_err("test 1 fails (start data)\n"); return; } @@ -33,7 +34,7 @@ void rodata_test(void) } /* test 3: check the value hasn't changed */ - if (rodata_test_data == zero) { + if (unlikely(READ_ONCE(rodata_test_data) != TEST_VALUE)) { pr_err("test data was changed\n"); return; } diff --git a/mm/secretmem.c b/mm/secretmem.c index 399552814fd0..9a11a38a6770 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -120,18 +120,18 @@ static int secretmem_release(struct inode *inode, struct file *file) return 0; } -static int secretmem_mmap(struct file *file, struct vm_area_struct *vma) +static int secretmem_mmap_prepare(struct vm_area_desc *desc) { - unsigned long len = vma->vm_end - vma->vm_start; + const unsigned long len = desc->end - desc->start; - if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) + if ((desc->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) return -EINVAL; - if (!mlock_future_ok(vma->vm_mm, vma->vm_flags | VM_LOCKED, len)) + if (!mlock_future_ok(desc->mm, desc->vm_flags | VM_LOCKED, len)) return -EAGAIN; - vm_flags_set(vma, VM_LOCKED | VM_DONTDUMP); - vma->vm_ops = &secretmem_vm_ops; + desc->vm_flags |= VM_LOCKED | VM_DONTDUMP; + desc->vm_ops = &secretmem_vm_ops; return 0; } @@ -143,7 +143,7 @@ bool vma_is_secretmem(struct vm_area_struct *vma) static const struct file_operations secretmem_fops = { .release = secretmem_release, - .mmap = secretmem_mmap, + .mmap_prepare = secretmem_mmap_prepare, }; static int secretmem_migrate_folio(struct address_space *mapping, @@ -195,19 +195,11 @@ static struct file *secretmem_file_create(unsigned long flags) struct file *file; struct inode *inode; const char *anon_name = "[secretmem]"; - const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name)); - int err; - inode = alloc_anon_inode(secretmem_mnt->mnt_sb); + inode = anon_inode_make_secure_inode(secretmem_mnt->mnt_sb, anon_name, NULL); if (IS_ERR(inode)) return ERR_CAST(inode); - err = security_inode_init_security_anon(inode, &qname, NULL); - if (err) { - file = ERR_PTR(err); - goto err_free_inode; - } - file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem", O_RDWR, &secretmem_fops); if (IS_ERR(file)) diff --git a/mm/shmem.c b/mm/shmem.c index ccb9629a0f70..3a5a65b1f41a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -86,7 +86,6 @@ static struct vfsmount *shm_mnt __ro_after_init; #include "internal.h" -#define BLOCKS_PER_PAGE (PAGE_SIZE/512) #define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT) /* Pretend that each entry is of this size in directory's i_size */ @@ -99,7 +98,7 @@ static struct vfsmount *shm_mnt __ro_after_init; #define SHORT_SYMLINK_LEN 128 /* - * shmem_fallocate communicates with shmem_fault or shmem_writepage via + * shmem_fallocate communicates with shmem_fault or shmem_writeout via * inode->i_private (with i_rwsem making sure that it has only one user at * a time): we would prefer not to enlarge the shmem inode just for that. */ @@ -108,7 +107,7 @@ struct shmem_falloc { pgoff_t start; /* start of range currently being fallocated */ pgoff_t next; /* the next page offset to be fallocated */ pgoff_t nr_falloced; /* how many new pages have been fallocated */ - pgoff_t nr_unswapped; /* how often writepage refused to swap out */ + pgoff_t nr_unswapped; /* how often writeout refused to swap out */ }; struct shmem_options { @@ -447,7 +446,7 @@ static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped) /* * Special case: whereas normally shmem_recalc_inode() is called * after i_mapping->nrpages has already been adjusted (up or down), - * shmem_writepage() has to raise swapped before nrpages is lowered - + * shmem_writeout() has to raise swapped before nrpages is lowered - * to stop a racing shmem_recalc_inode() from thinking that a page has * been freed. Compensate here, to avoid the need for a followup call. */ @@ -526,9 +525,9 @@ static bool shmem_confirm_swap(struct address_space *mapping, * enables huge pages for the mount; * SHMEM_HUGE_WITHIN_SIZE: * only allocate huge pages if the page will be fully within i_size, - * also respect fadvise()/madvise() hints; + * also respect madvise() hints; * SHMEM_HUGE_ADVISE: - * only allocate huge pages if requested with fadvise()/madvise(); + * only allocate huge pages if requested with madvise(); */ #define SHMEM_HUGE_NEVER 0 @@ -553,38 +552,119 @@ static bool shmem_confirm_swap(struct address_space *mapping, /* ifdef here to avoid bloating shmem.o when not necessary */ static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; +static int tmpfs_huge __read_mostly = SHMEM_HUGE_NEVER; -static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, - loff_t write_end, bool shmem_huge_force, - unsigned long vm_flags) +/** + * shmem_mapping_size_orders - Get allowable folio orders for the given file size. + * @mapping: Target address_space. + * @index: The page index. + * @write_end: end of a write, could extend inode size. + * + * This returns huge orders for folios (when supported) based on the file size + * which the mapping currently allows at the given index. The index is relevant + * due to alignment considerations the mapping might have. The returned order + * may be less than the size passed. + * + * Return: The orders. + */ +static inline unsigned int +shmem_mapping_size_orders(struct address_space *mapping, pgoff_t index, loff_t write_end) +{ + unsigned int order; + size_t size; + + if (!mapping_large_folio_support(mapping) || !write_end) + return 0; + + /* Calculate the write size based on the write_end */ + size = write_end - (index << PAGE_SHIFT); + order = filemap_get_order(size); + if (!order) + return 0; + + /* If we're not aligned, allocate a smaller folio */ + if (index & ((1UL << order) - 1)) + order = __ffs(index); + + order = min_t(size_t, order, MAX_PAGECACHE_ORDER); + return order > 0 ? BIT(order + 1) - 1 : 0; +} + +static unsigned int shmem_get_orders_within_size(struct inode *inode, + unsigned long within_size_orders, pgoff_t index, + loff_t write_end) { + pgoff_t aligned_index; + unsigned long order; loff_t i_size; - if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER) - return false; + order = highest_order(within_size_orders); + while (within_size_orders) { + aligned_index = round_up(index + 1, 1 << order); + i_size = max(write_end, i_size_read(inode)); + i_size = round_up(i_size, PAGE_SIZE); + if (i_size >> PAGE_SHIFT >= aligned_index) + return within_size_orders; + + order = next_order(&within_size_orders, order); + } + + return 0; +} + +static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, + loff_t write_end, bool shmem_huge_force, + struct vm_area_struct *vma, + unsigned long vm_flags) +{ + unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ? + 0 : BIT(HPAGE_PMD_ORDER); + unsigned long within_size_orders; + if (!S_ISREG(inode->i_mode)) - return false; + return 0; if (shmem_huge == SHMEM_HUGE_DENY) - return false; + return 0; if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE) - return true; + return maybe_pmd_order; + /* + * The huge order allocation for anon shmem is controlled through + * the mTHP interface, so we still use PMD-sized huge order to + * check whether global control is enabled. + * + * For tmpfs mmap()'s huge order, we still use PMD-sized order to + * allocate huge pages due to lack of a write size hint. + * + * Otherwise, tmpfs will allow getting a highest order hint based on + * the size of write and fallocate paths, then will try each allowable + * huge orders. + */ switch (SHMEM_SB(inode->i_sb)->huge) { case SHMEM_HUGE_ALWAYS: - return true; + if (vma) + return maybe_pmd_order; + + return shmem_mapping_size_orders(inode->i_mapping, index, write_end); case SHMEM_HUGE_WITHIN_SIZE: - index = round_up(index + 1, HPAGE_PMD_NR); - i_size = max(write_end, i_size_read(inode)); - i_size = round_up(i_size, PAGE_SIZE); - if (i_size >> PAGE_SHIFT >= index) - return true; + if (vma) + within_size_orders = maybe_pmd_order; + else + within_size_orders = shmem_mapping_size_orders(inode->i_mapping, + index, write_end); + + within_size_orders = shmem_get_orders_within_size(inode, within_size_orders, + index, write_end); + if (within_size_orders > 0) + return within_size_orders; + fallthrough; case SHMEM_HUGE_ADVISE: if (vm_flags & VM_HUGEPAGE) - return true; + return maybe_pmd_order; fallthrough; default: - return false; + return 0; } } @@ -779,14 +859,23 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, return 0; } -static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, - loff_t write_end, bool shmem_huge_force, - unsigned long vm_flags) +static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, + loff_t write_end, bool shmem_huge_force, + struct vm_area_struct *vma, + unsigned long vm_flags) { - return false; + return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +static void shmem_update_stats(struct folio *folio, int nr_pages) +{ + if (folio_test_pmd_mappable(folio)) + __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages); + __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); + __lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages); +} + /* * Somewhat like filemap_add_folio, but error if expected item has gone. */ @@ -821,10 +910,7 @@ static int shmem_add_to_page_cache(struct folio *folio, xas_store(&xas, folio); if (xas_error(&xas)) goto unlock; - if (folio_test_pmd_mappable(folio)) - __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr); - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); - __lruvec_stat_mod_folio(folio, NR_SHMEM, nr); + shmem_update_stats(folio, nr); mapping->nrpages += nr; unlock: xas_unlock_irq(&xas); @@ -852,8 +938,7 @@ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) error = shmem_replace_entry(mapping, folio->index, folio, radswap); folio->mapping = NULL; mapping->nrpages -= nr; - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); - __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); + shmem_update_stats(folio, -nr); xa_unlock_irq(&mapping->i_pages); folio_put_refs(folio, nr); BUG_ON(error); @@ -1176,7 +1261,7 @@ static int shmem_getattr(struct mnt_idmap *idmap, STATX_ATTR_NODUMP); generic_fillattr(idmap, request_mask, inode, stat); - if (shmem_huge_global_enabled(inode, 0, 0, false, 0)) + if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0)) stat->blksize = HPAGE_PMD_SIZE; if (request_mask & STATX_BTIME) { @@ -1308,9 +1393,9 @@ static void shmem_evict_inode(struct inode *inode) #endif } -static int shmem_find_swap_entries(struct address_space *mapping, - pgoff_t start, struct folio_batch *fbatch, - pgoff_t *indices, unsigned int type) +static unsigned int shmem_find_swap_entries(struct address_space *mapping, + pgoff_t start, struct folio_batch *fbatch, + pgoff_t *indices, unsigned int type) { XA_STATE(xas, &mapping->i_pages, start); struct folio *folio; @@ -1343,7 +1428,7 @@ static int shmem_find_swap_entries(struct address_space *mapping, } rcu_read_unlock(); - return xas.xa_index; + return folio_batch_count(fbatch); } /* @@ -1361,8 +1446,6 @@ static int shmem_unuse_swap_entries(struct inode *inode, for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; - if (!xa_is_value(folio)) - continue; error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE, mapping_gfp_mask(mapping), NULL, NULL); if (error == 0) { @@ -1390,8 +1473,8 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type) do { folio_batch_init(&fbatch); - shmem_find_swap_entries(mapping, start, &fbatch, indices, type); - if (folio_batch_count(&fbatch) == 0) { + if (!shmem_find_swap_entries(mapping, start, &fbatch, + indices, type)) { ret = 0; break; } @@ -1420,6 +1503,7 @@ int shmem_unuse(unsigned int type) return 0; mutex_lock(&shmem_swaplist_mutex); +start_over: list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { if (!info->swapped) { list_del_init(&info->swaplist); @@ -1438,45 +1522,42 @@ int shmem_unuse(unsigned int type) cond_resched(); mutex_lock(&shmem_swaplist_mutex); - next = list_next_entry(info, swaplist); - if (!info->swapped) - list_del_init(&info->swaplist); if (atomic_dec_and_test(&info->stop_eviction)) wake_up_var(&info->stop_eviction); if (error) break; + if (list_empty(&info->swaplist)) + goto start_over; + next = list_next_entry(info, swaplist); + if (!info->swapped) + list_del_init(&info->swaplist); } mutex_unlock(&shmem_swaplist_mutex); return error; } -/* - * Move the page from the page cache to the swap cache. +/** + * shmem_writeout - Write the folio to swap + * @folio: The folio to write + * @wbc: How writeback is to be done + * + * Move the folio from the page cache to the swap cache. */ -static int shmem_writepage(struct page *page, struct writeback_control *wbc) +int shmem_writeout(struct folio *folio, struct writeback_control *wbc) { - struct folio *folio = page_folio(page); struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - swp_entry_t swap; pgoff_t index; int nr_pages; bool split = false; - /* - * Our capabilities prevent regular writeback or sync from ever calling - * shmem_writepage; but a stacking filesystem might use ->writepage of - * its underlying filesystem, in which case tmpfs should write out to - * swap only in response to memory pressure, and not for the writeback - * threads or sync. - */ if (WARN_ON_ONCE(!wbc->for_reclaim)) goto redirty; - if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap)) + if ((info->flags & VM_LOCKED) || sbinfo->noswap) goto redirty; if (!total_swap_pages) @@ -1502,9 +1583,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) try_split: /* Ensure the subpages are still dirty */ folio_test_set_dirty(folio); - if (split_huge_page_to_list_to_order(page, wbc->list, 0)) + if (split_folio_to_list(folio, wbc->list)) goto redirty; - folio = page_folio(page); folio_clear_dirty(folio); } @@ -1531,7 +1611,7 @@ try_split: !shmem_falloc->waitq && index >= shmem_falloc->start && index < shmem_falloc->next) - shmem_falloc->nr_unswapped++; + shmem_falloc->nr_unswapped += nr_pages; else shmem_falloc = NULL; spin_unlock(&inode->i_lock); @@ -1543,14 +1623,6 @@ try_split: folio_mark_uptodate(folio); } - swap = folio_alloc_swap(folio); - if (!swap.val) { - if (nr_pages > 1) - goto try_split; - - goto redirty; - } - /* * Add inode to shmem_unuse()'s list of swapped-out inodes, * if it's not already there. Do it now before the folio is @@ -1563,20 +1635,20 @@ try_split: if (list_empty(&info->swaplist)) list_add(&info->swaplist, &shmem_swaplist); - if (add_to_swap_cache(folio, swap, - __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN, - NULL) == 0) { + if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) { shmem_recalc_inode(inode, 0, nr_pages); - swap_shmem_alloc(swap, nr_pages); - shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap)); + swap_shmem_alloc(folio->swap, nr_pages); + shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap)); mutex_unlock(&shmem_swaplist_mutex); BUG_ON(folio_mapped(folio)); - return swap_writepage(&folio->page, wbc); + return swap_writeout(folio, wbc); } - + if (!info->swapped) + list_del_init(&info->swaplist); mutex_unlock(&shmem_swaplist_mutex); - put_swap_folio(folio, swap); + if (nr_pages > 1) + goto try_split; redirty: folio_mark_dirty(folio); if (wbc->for_reclaim) @@ -1584,6 +1656,7 @@ redirty: folio_unlock(folio); return 0; } +EXPORT_SYMBOL_GPL(shmem_writeout); #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) @@ -1685,22 +1758,16 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, unsigned long mask = READ_ONCE(huge_shmem_orders_always); unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); unsigned long vm_flags = vma ? vma->vm_flags : 0; - bool global_huge; - loff_t i_size; - int order; + unsigned int global_orders; if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags))) return 0; - global_huge = shmem_huge_global_enabled(inode, index, write_end, - shmem_huge_force, vm_flags); - if (!vma || !vma_is_anon_shmem(vma)) { - /* - * For tmpfs, we now only support PMD sized THP if huge page - * is enabled, otherwise fallback to order 0. - */ - return global_huge ? BIT(HPAGE_PMD_ORDER) : 0; - } + global_orders = shmem_huge_global_enabled(inode, index, write_end, + shmem_huge_force, vma, vm_flags); + /* Tmpfs huge pages allocation */ + if (!vma || !vma_is_anon_shmem(vma)) + return global_orders; /* * Following the 'deny' semantics of the top level, force the huge @@ -1717,22 +1784,12 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, return READ_ONCE(huge_shmem_orders_inherit); /* Allow mTHP that will be fully within i_size. */ - order = highest_order(within_size_orders); - while (within_size_orders) { - index = round_up(index + 1, order); - i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >> PAGE_SHIFT >= index) { - mask |= within_size_orders; - break; - } - - order = next_order(&within_size_orders, order); - } + mask |= shmem_get_orders_within_size(inode, within_size_orders, index, 0); if (vm_flags & VM_HUGEPAGE) mask |= READ_ONCE(huge_shmem_orders_madvise); - if (global_huge) + if (global_orders > 0) mask |= READ_ONCE(huge_shmem_orders_inherit); return THP_ORDERS_ALL_FILE_DEFAULT & mask; @@ -1898,6 +1955,65 @@ unlock: return ERR_PTR(error); } +static struct folio *shmem_swap_alloc_folio(struct inode *inode, + struct vm_area_struct *vma, pgoff_t index, + swp_entry_t entry, int order, gfp_t gfp) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + struct folio *new; + void *shadow; + int nr_pages; + + /* + * We have arrived here because our zones are constrained, so don't + * limit chance of success with further cpuset and node constraints. + */ + gfp &= ~GFP_CONSTRAINT_MASK; + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && order > 0) { + gfp_t huge_gfp = vma_thp_gfp_mask(vma); + + gfp = limit_gfp_mask(huge_gfp, gfp); + } + + new = shmem_alloc_folio(gfp, order, info, index); + if (!new) + return ERR_PTR(-ENOMEM); + + nr_pages = folio_nr_pages(new); + if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL, + gfp, entry)) { + folio_put(new); + return ERR_PTR(-ENOMEM); + } + + /* + * Prevent parallel swapin from proceeding with the swap cache flag. + * + * Of course there is another possible concurrent scenario as well, + * that is to say, the swap cache flag of a large folio has already + * been set by swapcache_prepare(), while another thread may have + * already split the large swap entry stored in the shmem mapping. + * In this case, shmem_add_to_page_cache() will help identify the + * concurrent swapin and return -EEXIST. + */ + if (swapcache_prepare(entry, nr_pages)) { + folio_put(new); + return ERR_PTR(-EEXIST); + } + + __folio_set_locked(new); + __folio_set_swapbacked(new); + new->swap = entry; + + memcg1_swapin(entry, nr_pages); + shadow = get_shadow_from_swap_cache(entry); + if (shadow) + workingset_refault(new, shadow); + folio_add_lru(new); + swap_read_folio(new, NULL); + return new; +} + /* * When a page is moved from swapcache to shmem filecache (either by the * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of @@ -1969,10 +2085,8 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, } if (!error) { mem_cgroup_replace_folio(old, new); - __lruvec_stat_mod_folio(new, NR_FILE_PAGES, nr_pages); - __lruvec_stat_mod_folio(new, NR_SHMEM, nr_pages); - __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -nr_pages); - __lruvec_stat_mod_folio(old, NR_SHMEM, -nr_pages); + shmem_update_stats(new, nr_pages); + shmem_update_stats(old, -nr_pages); } xa_unlock_irq(&swap_mapping->i_pages); @@ -2003,7 +2117,8 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, } static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, - struct folio *folio, swp_entry_t swap) + struct folio *folio, swp_entry_t swap, + bool skip_swapcache) { struct address_space *mapping = inode->i_mapping; swp_entry_t swapin_error; @@ -2019,7 +2134,8 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, nr_pages = folio_nr_pages(folio); folio_wait_writeback(folio); - delete_from_swap_cache(folio); + if (!skip_swapcache) + delete_from_swap_cache(folio); /* * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) @@ -2034,15 +2150,16 @@ static int shmem_split_large_entry(struct inode *inode, pgoff_t index, { struct address_space *mapping = inode->i_mapping; XA_STATE_ORDER(xas, &mapping->i_pages, index, 0); - void *alloced_shadow = NULL; - int alloced_order = 0, i; + int split_order = 0, entry_order; + int i; /* Convert user data gfp flags to xarray node gfp flags */ gfp &= GFP_RECLAIM_MASK; for (;;) { - int order = -1, split_order = 0; void *old = NULL; + int cur_order; + pgoff_t swap_index; xas_lock_irq(&xas); old = xas_load(&xas); @@ -2051,60 +2168,56 @@ static int shmem_split_large_entry(struct inode *inode, pgoff_t index, goto unlock; } - order = xas_get_order(&xas); + entry_order = xas_get_order(&xas); - /* Swap entry may have changed before we re-acquire the lock */ - if (alloced_order && - (old != alloced_shadow || order != alloced_order)) { - xas_destroy(&xas); - alloced_order = 0; - } + if (!entry_order) + goto unlock; /* Try to split large swap entry in pagecache */ - if (order > 0) { - if (!alloced_order) { - split_order = order; + cur_order = entry_order; + swap_index = round_down(index, 1 << entry_order); + + split_order = xas_try_split_min_order(cur_order); + + while (cur_order > 0) { + pgoff_t aligned_index = + round_down(index, 1 << cur_order); + pgoff_t swap_offset = aligned_index - swap_index; + + xas_set_order(&xas, index, split_order); + xas_try_split(&xas, old, cur_order); + if (xas_error(&xas)) goto unlock; - } - xas_split(&xas, old, order); /* * Re-set the swap entry after splitting, and the swap * offset of the original large entry must be continuous. */ - for (i = 0; i < 1 << order; i++) { - pgoff_t aligned_index = round_down(index, 1 << order); + for (i = 0; i < 1 << cur_order; + i += (1 << split_order)) { swp_entry_t tmp; - tmp = swp_entry(swp_type(swap), swp_offset(swap) + i); + tmp = swp_entry(swp_type(swap), + swp_offset(swap) + swap_offset + + i); __xa_store(&mapping->i_pages, aligned_index + i, swp_to_radix_entry(tmp), 0); } + cur_order = split_order; + split_order = xas_try_split_min_order(split_order); } unlock: xas_unlock_irq(&xas); - /* split needed, alloc here and retry. */ - if (split_order) { - xas_split_alloc(&xas, old, split_order, gfp); - if (xas_error(&xas)) - goto error; - alloced_shadow = old; - alloced_order = split_order; - xas_reset(&xas); - continue; - } - if (!xas_nomem(&xas, gfp)) break; } -error: if (xas_error(&xas)) return xas_error(&xas); - return alloced_order; + return entry_order; } /* @@ -2123,8 +2236,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct shmem_inode_info *info = SHMEM_I(inode); struct swap_info_struct *si; struct folio *folio = NULL; + bool skip_swapcache = false; swp_entry_t swap; - int error, nr_pages; + int error, nr_pages, order, split_order; VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); swap = radix_to_swp_entry(*foliop); @@ -2143,8 +2257,10 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, /* Look it up and read it in.. */ folio = swap_cache_get_folio(swap, NULL, 0); + order = xa_get_order(&mapping->i_pages, index); if (!folio) { - int split_order; + int nr_pages = 1 << order; + bool fallback_order0 = false; /* Or update major stats only when swapin succeeds?? */ if (fault_type) { @@ -2154,6 +2270,36 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, } /* + * If uffd is active for the vma, we need per-page fault + * fidelity to maintain the uffd semantics, then fallback + * to swapin order-0 folio, as well as for zswap case. + * Any existing sub folio in the swap cache also blocks + * mTHP swapin. + */ + if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) || + !zswap_never_enabled() || + non_swapcache_batch(swap, nr_pages) != nr_pages)) + fallback_order0 = true; + + /* Skip swapcache for synchronous device. */ + if (!fallback_order0 && data_race(si->flags & SWP_SYNCHRONOUS_IO)) { + folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp); + if (!IS_ERR(folio)) { + skip_swapcache = true; + goto alloced; + } + + /* + * Fallback to swapin order-0 folio unless the swap entry + * already exists. + */ + error = PTR_ERR(folio); + folio = NULL; + if (error == -EEXIST) + goto failed; + } + + /* * Now swap device can only swap in order 0 folio, then we * should split the large swap entry stored in the pagecache * if necessary. @@ -2181,13 +2327,40 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, error = -ENOMEM; goto failed; } + } else if (order != folio_order(folio)) { + /* + * Swap readahead may swap in order 0 folios into swapcache + * asynchronously, while the shmem mapping can still stores + * large swap entries. In such cases, we should split the + * large swap entry to prevent possible data corruption. + */ + split_order = shmem_split_large_entry(inode, index, swap, gfp); + if (split_order < 0) { + folio_put(folio); + folio = NULL; + error = split_order; + goto failed; + } + + /* + * If the large swap entry has already been split, it is + * necessary to recalculate the new swap entry based on + * the old order alignment. + */ + if (split_order > 0) { + pgoff_t offset = index - round_down(index, 1 << split_order); + + swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); + } } +alloced: /* We have to do this with folio locked to prevent races */ folio_lock(folio); - if (!folio_test_swapcache(folio) || + if ((!skip_swapcache && !folio_test_swapcache(folio)) || folio->swap.val != swap.val || - !shmem_confirm_swap(mapping, index, swap)) { + !shmem_confirm_swap(mapping, index, swap) || + xa_get_order(&mapping->i_pages, index) != folio_order(folio)) { error = -EEXIST; goto unlock; } @@ -2221,7 +2394,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, if (sgp == SGP_WRITE) folio_mark_accessed(folio); - delete_from_swap_cache(folio); + if (skip_swapcache) { + folio->swap.val = 0; + swapcache_clear(si, swap, nr_pages); + } else { + delete_from_swap_cache(folio); + } folio_mark_dirty(folio); swap_free_nr(swap, nr_pages); put_swap_device(si); @@ -2232,8 +2410,11 @@ failed: if (!shmem_confirm_swap(mapping, index, swap)) error = -EEXIST; if (error == -EIO) - shmem_set_folio_swapin_error(inode, index, folio, swap); + shmem_set_folio_swapin_error(inode, index, folio, swap, + skip_swapcache); unlock: + if (skip_swapcache) + swapcache_clear(si, swap, folio_nr_pages(folio)); if (folio) { folio_unlock(folio); folio_put(folio); @@ -2749,12 +2930,6 @@ out_nomem: static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); - struct shmem_inode_info *info = SHMEM_I(inode); - int ret; - - ret = seal_check_write(info->seals, vma); - if (ret) - return ret; file_accessed(file); /* This is anonymous shared memory if it is unlinked at the time of mmap */ @@ -3118,8 +3293,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping, if (ret) return ret; - if (folio_test_hwpoison(folio) || - (folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) { + if (folio_contain_hwpoisoned_page(folio)) { folio_unlock(folio); folio_put(folio); return -EIO; @@ -3326,7 +3500,7 @@ static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe, size = min_t(size_t, size, PAGE_SIZE - offset); - if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { + if (!pipe_is_full(pipe)) { struct pipe_buffer *buf = pipe_head_buf(pipe); *buf = (struct pipe_buffer) { @@ -3353,7 +3527,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, int error = 0; /* Work out how much data we can actually add into the pipe */ - used = pipe_occupancy(pipe->head, pipe->tail); + used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); @@ -3440,7 +3614,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, total_spliced += n; *ppos += n; in->f_ra.prev_pos = *ppos; - if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + if (pipe_is_full(pipe)) break; cond_resched(); @@ -3597,7 +3771,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, index--; /* - * Inform shmem_writepage() how far we have reached. + * Inform shmem_writeout() how far we have reached. * No need for lock or barrier: we have the page lock. */ if (!folio_test_uptodate(folio)) @@ -3728,16 +3902,16 @@ out_iput: return error; } -static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { int error; error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); if (error) - return error; + return ERR_PTR(error); inc_nlink(dir); - return 0; + return NULL; } static int shmem_create(struct mnt_idmap *idmap, struct inode *dir, @@ -3818,7 +3992,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) static int shmem_rmdir(struct inode *dir, struct dentry *dentry) { - if (!simple_offset_empty(dentry)) + if (!simple_empty(dentry)) return -ENOTEMPTY; drop_nlink(d_inode(dentry)); @@ -3875,7 +4049,7 @@ static int shmem_rename2(struct mnt_idmap *idmap, return simple_offset_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); - if (!simple_offset_empty(new_dentry)) + if (!simple_empty(new_dentry)) return -ENOTEMPTY; if (flags & RENAME_WHITEOUT) { @@ -3914,6 +4088,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, int len; struct inode *inode; struct folio *folio; + char *link; len = strlen(symname) + 1; if (len > PAGE_SIZE) @@ -3935,12 +4110,13 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, inode->i_size = len-1; if (len <= SHORT_SYMLINK_LEN) { - inode->i_link = kmemdup(symname, len, GFP_KERNEL); - if (!inode->i_link) { + link = kmemdup(symname, len, GFP_KERNEL); + if (!link) { error = -ENOMEM; goto out_remove_offset; } inode->i_op = &shmem_short_symlink_operations; + inode_set_cached_link(inode, link, len - 1); } else { inode_nohighmem(inode); inode->i_mapping->a_ops = &shmem_aops; @@ -4365,7 +4541,7 @@ static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter * bool latest_version) { struct shmem_options *ctx = fc->fs_private; - unsigned int version = UTF8_LATEST; + int version = UTF8_LATEST; struct unicode_map *encoding; char *version_str = param->string + 5; @@ -4580,48 +4756,37 @@ bad_value: return invalfc(fc, "Bad value for '%s'", param->key); } -static int shmem_parse_options(struct fs_context *fc, void *data) +static char *shmem_next_opt(char **s) { - char *options = data; + char *sbegin = *s; + char *p; - if (options) { - int err = security_sb_eat_lsm_opts(options, &fc->security); - if (err) - return err; - } + if (sbegin == NULL) + return NULL; - while (options != NULL) { - char *this_char = options; - for (;;) { - /* - * NUL-terminate this option: unfortunately, - * mount options form a comma-separated list, - * but mpol's nodelist may also contain commas. - */ - options = strchr(options, ','); - if (options == NULL) - break; - options++; - if (!isdigit(*options)) { - options[-1] = '\0'; - break; - } - } - if (*this_char) { - char *value = strchr(this_char, '='); - size_t len = 0; - int err; - - if (value) { - *value++ = '\0'; - len = strlen(value); - } - err = vfs_parse_fs_string(fc, this_char, value, len); - if (err < 0) - return err; + /* + * NUL-terminate this option: unfortunately, + * mount options form a comma-separated list, + * but mpol's nodelist may also contain commas. + */ + for (;;) { + p = strchr(*s, ','); + if (p == NULL) + break; + *s = p + 1; + if (!isdigit(*(p+1))) { + *p = '\0'; + return sbegin; } } - return 0; + + *s = NULL; + return sbegin; +} + +static int shmem_parse_monolithic(struct fs_context *fc, void *data) +{ + return vfs_parse_monolithic_sep(fc, data, shmem_next_opt); } /* @@ -4888,7 +5053,12 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) sbinfo->gid = ctx->gid; sbinfo->full_inums = ctx->full_inums; sbinfo->mode = ctx->mode; - sbinfo->huge = ctx->huge; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (ctx->seen & SHMEM_SEEN_HUGE) + sbinfo->huge = ctx->huge; + else + sbinfo->huge = tmpfs_huge; +#endif sbinfo->mpol = ctx->mpol; ctx->mpol = NULL; @@ -4966,7 +5136,7 @@ static const struct fs_context_operations shmem_fs_context_ops = { .free = shmem_free_fc, .get_tree = shmem_get_tree, #ifdef CONFIG_TMPFS - .parse_monolithic = shmem_parse_options, + .parse_monolithic = shmem_parse_monolithic, .parse_param = shmem_parse_one, .reconfigure = shmem_reconfigure, #endif @@ -5024,7 +5194,6 @@ static int shmem_error_remove_folio(struct address_space *mapping, } static const struct address_space_operations shmem_aops = { - .writepage = shmem_writepage, .dirty_folio = noop_dirty_folio, #ifdef CONFIG_TMPFS .write_begin = shmem_write_begin, @@ -5439,6 +5608,21 @@ static int __init setup_transparent_hugepage_shmem(char *str) } __setup("transparent_hugepage_shmem=", setup_transparent_hugepage_shmem); +static int __init setup_transparent_hugepage_tmpfs(char *str) +{ + int huge; + + huge = shmem_parse_huge(str); + if (huge < 0) { + pr_warn("transparent_hugepage_tmpfs= cannot parse, ignored\n"); + return huge; + } + + tmpfs_huge = huge; + return 1; +} +__setup("transparent_hugepage_tmpfs=", setup_transparent_hugepage_tmpfs); + static char str_dup[PAGE_SIZE] __initdata; static int __init setup_thp_shmem(char *str) { @@ -5479,19 +5663,19 @@ static int __init setup_thp_shmem(char *str) THP_ORDERS_ALL_FILE_DEFAULT); } - if (start == -EINVAL) { + if (start < 0) { pr_err("invalid size %s in thp_shmem boot parameter\n", start_size); goto err; } - if (end == -EINVAL) { + if (end < 0) { pr_err("invalid size %s in thp_shmem boot parameter\n", end_size); goto err; } - if (start < 0 || end < 0 || start > end) + if (start > end) goto err; nr = end - start + 1; @@ -5628,12 +5812,12 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, if (size < 0 || size > MAX_LFS_FILESIZE) return ERR_PTR(-EINVAL); - if (shmem_acct_size(flags, size)) - return ERR_PTR(-ENOMEM); - if (is_idmapped_mnt(mnt)) return ERR_PTR(-EINVAL); + if (shmem_acct_size(flags, size)) + return ERR_PTR(-ENOMEM); + inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); if (IS_ERR(inode)) { @@ -5658,7 +5842,7 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, * underlying inode. So users of this interface must do LSM checks at a * higher layer. The users are the big_key and shm implementations. LSM * checks are provided at the key or shm level rather than the inode. - * @name: name for dentry (to be seen in /proc/<pid>/maps + * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ @@ -5670,7 +5854,7 @@ EXPORT_SYMBOL_GPL(shmem_kernel_file_setup); /** * shmem_file_setup - get an unlinked file living in tmpfs - * @name: name for dentry (to be seen in /proc/<pid>/maps + * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ @@ -5683,7 +5867,7 @@ EXPORT_SYMBOL_GPL(shmem_file_setup); /** * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs * @mnt: the tmpfs mount where the file will be created - * @name: name for dentry (to be seen in /proc/<pid>/maps + * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ diff --git a/mm/show_mem.c b/mm/show_mem.c index 43afb56abbd3..0cf8bf5d832d 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -94,26 +94,20 @@ void si_meminfo_node(struct sysinfo *val, int nid) unsigned long free_highpages = 0; pg_data_t *pgdat = NODE_DATA(nid); - for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) - managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); - val->totalram = managed_pages; - val->sharedram = node_page_state(pgdat, NR_SHMEM); - val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); -#ifdef CONFIG_HIGHMEM for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; - + managed_pages += zone_managed_pages(zone); if (is_highmem(zone)) { managed_highpages += zone_managed_pages(zone); free_highpages += zone_page_state(zone, NR_FREE_PAGES); } } + + val->totalram = managed_pages; + val->sharedram = node_page_state(pgdat, NR_SHMEM); + val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); val->totalhigh = managed_highpages; val->freehigh = free_highpages; -#else - val->totalhigh = managed_highpages; - val->freehigh = free_highpages; -#endif val->mem_unit = PAGE_SIZE; } #endif @@ -223,7 +217,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z global_node_page_state(NR_SHMEM), global_node_page_state(NR_PAGETABLE), global_node_page_state(NR_SECONDARY_PAGETABLE), - global_zone_page_state(NR_BOUNCE), + 0UL, global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE), global_zone_page_state(NR_FREE_PAGES), free_pcp, @@ -260,6 +254,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z " pagetables:%lukB" " sec_pagetables:%lukB" " all_unreclaimable? %s" + " Balloon:%lukB" "\n", pgdat->node_id, K(node_page_state(pgdat, NR_ACTIVE_ANON)), @@ -285,7 +280,8 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z #endif K(node_page_state(pgdat, NR_PAGETABLE)), K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)), - str_yes_no(pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)); + str_yes_no(pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES), + K(node_page_state(pgdat, NR_BALLOON_PAGES))); } for_each_populated_zone(zone) { @@ -309,6 +305,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z " low:%lukB" " high:%lukB" " reserved_highatomic:%luKB" + " free_highatomic:%luKB" " active_anon:%lukB" " inactive_anon:%lukB" " active_file:%lukB" @@ -330,6 +327,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), K(zone->nr_reserved_highatomic), + K(zone->nr_free_highatomic), K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), @@ -339,7 +337,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z K(zone->present_pages), K(zone_managed_pages(zone)), K(zone_page_state(zone, NR_MLOCK)), - K(zone_page_state(zone, NR_BOUNCE)), + 0UL, K(free_pcp), K(this_cpu_read(zone->per_cpu_pageset->count)), K(zone_page_state(zone, NR_FREE_CMA_PAGES))); diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index 4a85b94d12ce..20eaee3e97f7 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -195,8 +195,6 @@ int shrinker_debugfs_add(struct shrinker *shrinker) int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) { - struct dentry *entry; - char buf[128]; const char *new, *old; va_list ap; int ret = 0; @@ -213,23 +211,17 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) old = shrinker->name; shrinker->name = new; - if (shrinker->debugfs_entry) { - snprintf(buf, sizeof(buf), "%s-%d", shrinker->name, - shrinker->debugfs_id); - - entry = debugfs_rename(shrinker_debugfs_root, - shrinker->debugfs_entry, - shrinker_debugfs_root, buf); - if (IS_ERR(entry)) - ret = PTR_ERR(entry); - else - shrinker->debugfs_entry = entry; - } + ret = debugfs_change_name(shrinker->debugfs_entry, "%s-%d", + shrinker->name, shrinker->debugfs_id); + if (ret) { + shrinker->name = old; + kfree_const(new); + } else { + kfree_const(old); + } mutex_unlock(&shrinker_mutex); - kfree_const(old); - return ret; } EXPORT_SYMBOL(shrinker_debugfs_rename); diff --git a/mm/slab.h b/mm/slab.h index 632fedd71fea..05a21dc796e0 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -128,7 +128,7 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t) /** * slab_folio - The folio allocated for a slab - * @slab: The slab. + * @s: The slab. * * Slabs are allocated as folios that contain the individual objects and are * using some fields in the first struct page of the folio - those fields are @@ -159,7 +159,7 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t) /** * slab_page - The first struct page allocated for a slab - * @slab: The slab. + * @s: The slab. * * A convenience wrapper for converting slab to the first struct page of the * underlying folio, to communicate with code not yet converted to folio or @@ -457,39 +457,17 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s) return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT)); } -/* Legal flag mask for kmem_cache_create(), for various configurations */ #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \ SLAB_CACHE_DMA32 | SLAB_PANIC | \ - SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS ) + SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \ + SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | SLAB_ACCOUNT | \ + SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE) -#ifdef CONFIG_SLUB_DEBUG #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ SLAB_TRACE | SLAB_CONSISTENCY_CHECKS) -#else -#define SLAB_DEBUG_FLAGS (0) -#endif -#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_TEMPORARY | SLAB_ACCOUNT | \ - SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE) - -/* Common flags available with current configuration */ -#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) - -/* Common flags permitted for kmem_cache_create */ -#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \ - SLAB_RED_ZONE | \ - SLAB_POISON | \ - SLAB_STORE_USER | \ - SLAB_TRACE | \ - SLAB_CONSISTENCY_CHECKS | \ - SLAB_NOLEAKTRACE | \ - SLAB_RECLAIM_ACCOUNT | \ - SLAB_TEMPORARY | \ - SLAB_ACCOUNT | \ - SLAB_KMALLOC | \ - SLAB_NO_MERGE | \ - SLAB_NO_USER_FLAGS) +#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS) bool __kmem_cache_empty(struct kmem_cache *); int __kmem_cache_shutdown(struct kmem_cache *); @@ -604,6 +582,8 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects, struct slabobj_ext *obj_exts); #endif +void kvfree_rcu_cb(struct rcu_head *head); + size_t __ksize(const void *objp); static inline size_t slab_ksize(const struct kmem_cache *s) diff --git a/mm/slab_common.c b/mm/slab_common.c index a29457bef626..bfe7c40eeee1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -2,7 +2,7 @@ /* * Slab allocator functions that are independent of the allocator strategy * - * (C) 2012 Christoph Lameter <cl@linux.com> + * (C) 2012 Christoph Lameter <cl@gentwo.org> */ #include <linux/slab.h> @@ -28,7 +28,9 @@ #include <asm/page.h> #include <linux/memcontrol.h> #include <linux/stackdepot.h> +#include <trace/events/rcu.h> +#include "../kernel/rcu/rcu.h" #include "internal.h" #include "slab.h" @@ -296,6 +298,8 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, static_branch_enable(&slub_debug_enabled); if (flags & SLAB_STORE_USER) stack_depot_init(); +#else + flags &= ~SLAB_DEBUG_FLAGS; #endif mutex_lock(&slab_mutex); @@ -305,20 +309,11 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, goto out_unlock; } - /* Refuse requests with allocator specific flags */ if (flags & ~SLAB_FLAGS_PERMITTED) { err = -EINVAL; goto out_unlock; } - /* - * Some allocators will constraint the set of valid flags to a subset - * of all flags. We expect them to define CACHE_CREATE_MASK in this - * case, and we'll just provide them with a sanitized version of the - * passed flags. - */ - flags &= CACHE_CREATE_MASK; - /* Fail closed on bad usersize of useroffset values. */ if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) || WARN_ON(!args->usersize && args->useroffset) || @@ -1282,3 +1277,908 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); EXPORT_TRACEPOINT_SYMBOL(kfree); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); +#ifndef CONFIG_KVFREE_RCU_BATCHED + +void kvfree_call_rcu(struct rcu_head *head, void *ptr) +{ + if (head) { + kasan_record_aux_stack(ptr); + call_rcu(head, kvfree_rcu_cb); + return; + } + + // kvfree_rcu(one_arg) call. + might_sleep(); + synchronize_rcu(); + kvfree(ptr); +} +EXPORT_SYMBOL_GPL(kvfree_call_rcu); + +void __init kvfree_rcu_init(void) +{ +} + +#else /* CONFIG_KVFREE_RCU_BATCHED */ + +/* + * This rcu parameter is runtime-read-only. It reflects + * a minimum allowed number of objects which can be cached + * per-CPU. Object size is equal to one page. This value + * can be changed at boot time. + */ +static int rcu_min_cached_objs = 5; +module_param(rcu_min_cached_objs, int, 0444); + +// A page shrinker can ask for pages to be freed to make them +// available for other parts of the system. This usually happens +// under low memory conditions, and in that case we should also +// defer page-cache filling for a short time period. +// +// The default value is 5 seconds, which is long enough to reduce +// interference with the shrinker while it asks other systems to +// drain their caches. +static int rcu_delay_page_cache_fill_msec = 5000; +module_param(rcu_delay_page_cache_fill_msec, int, 0444); + +static struct workqueue_struct *rcu_reclaim_wq; + +/* Maximum number of jiffies to wait before draining a batch. */ +#define KFREE_DRAIN_JIFFIES (5 * HZ) +#define KFREE_N_BATCHES 2 +#define FREE_N_CHANNELS 2 + +/** + * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers + * @list: List node. All blocks are linked between each other + * @gp_snap: Snapshot of RCU state for objects placed to this bulk + * @nr_records: Number of active pointers in the array + * @records: Array of the kvfree_rcu() pointers + */ +struct kvfree_rcu_bulk_data { + struct list_head list; + struct rcu_gp_oldstate gp_snap; + unsigned long nr_records; + void *records[] __counted_by(nr_records); +}; + +/* + * This macro defines how many entries the "records" array + * will contain. It is based on the fact that the size of + * kvfree_rcu_bulk_data structure becomes exactly one page. + */ +#define KVFREE_BULK_MAX_ENTR \ + ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *)) + +/** + * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests + * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period + * @head_free: List of kfree_rcu() objects waiting for a grace period + * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees. + * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period + * @krcp: Pointer to @kfree_rcu_cpu structure + */ + +struct kfree_rcu_cpu_work { + struct rcu_work rcu_work; + struct rcu_head *head_free; + struct rcu_gp_oldstate head_free_gp_snap; + struct list_head bulk_head_free[FREE_N_CHANNELS]; + struct kfree_rcu_cpu *krcp; +}; + +/** + * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period + * @head: List of kfree_rcu() objects not yet waiting for a grace period + * @head_gp_snap: Snapshot of RCU state for objects placed to "@head" + * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period + * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period + * @lock: Synchronize access to this structure + * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES + * @initialized: The @rcu_work fields have been initialized + * @head_count: Number of objects in rcu_head singular list + * @bulk_count: Number of objects in bulk-list + * @bkvcache: + * A simple cache list that contains objects for reuse purpose. + * In order to save some per-cpu space the list is singular. + * Even though it is lockless an access has to be protected by the + * per-cpu lock. + * @page_cache_work: A work to refill the cache when it is empty + * @backoff_page_cache_fill: Delay cache refills + * @work_in_progress: Indicates that page_cache_work is running + * @hrtimer: A hrtimer for scheduling a page_cache_work + * @nr_bkv_objs: number of allocated objects at @bkvcache. + * + * This is a per-CPU structure. The reason that it is not included in + * the rcu_data structure is to permit this code to be extracted from + * the RCU files. Such extraction could allow further optimization of + * the interactions with the slab allocators. + */ +struct kfree_rcu_cpu { + // Objects queued on a linked list + // through their rcu_head structures. + struct rcu_head *head; + unsigned long head_gp_snap; + atomic_t head_count; + + // Objects queued on a bulk-list. + struct list_head bulk_head[FREE_N_CHANNELS]; + atomic_t bulk_count[FREE_N_CHANNELS]; + + struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; + raw_spinlock_t lock; + struct delayed_work monitor_work; + bool initialized; + + struct delayed_work page_cache_work; + atomic_t backoff_page_cache_fill; + atomic_t work_in_progress; + struct hrtimer hrtimer; + + struct llist_head bkvcache; + int nr_bkv_objs; +}; + +static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = { + .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock), +}; + +static __always_inline void +debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead) +{ +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD + int i; + + for (i = 0; i < bhead->nr_records; i++) + debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i])); +#endif +} + +static inline struct kfree_rcu_cpu * +krc_this_cpu_lock(unsigned long *flags) +{ + struct kfree_rcu_cpu *krcp; + + local_irq_save(*flags); // For safely calling this_cpu_ptr(). + krcp = this_cpu_ptr(&krc); + raw_spin_lock(&krcp->lock); + + return krcp; +} + +static inline void +krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) +{ + raw_spin_unlock_irqrestore(&krcp->lock, flags); +} + +static inline struct kvfree_rcu_bulk_data * +get_cached_bnode(struct kfree_rcu_cpu *krcp) +{ + if (!krcp->nr_bkv_objs) + return NULL; + + WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1); + return (struct kvfree_rcu_bulk_data *) + llist_del_first(&krcp->bkvcache); +} + +static inline bool +put_cached_bnode(struct kfree_rcu_cpu *krcp, + struct kvfree_rcu_bulk_data *bnode) +{ + // Check the limit. + if (krcp->nr_bkv_objs >= rcu_min_cached_objs) + return false; + + llist_add((struct llist_node *) bnode, &krcp->bkvcache); + WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1); + return true; +} + +static int +drain_page_cache(struct kfree_rcu_cpu *krcp) +{ + unsigned long flags; + struct llist_node *page_list, *pos, *n; + int freed = 0; + + if (!rcu_min_cached_objs) + return 0; + + raw_spin_lock_irqsave(&krcp->lock, flags); + page_list = llist_del_all(&krcp->bkvcache); + WRITE_ONCE(krcp->nr_bkv_objs, 0); + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + llist_for_each_safe(pos, n, page_list) { + free_page((unsigned long)pos); + freed++; + } + + return freed; +} + +static void +kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp, + struct kvfree_rcu_bulk_data *bnode, int idx) +{ + unsigned long flags; + int i; + + if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) { + debug_rcu_bhead_unqueue(bnode); + rcu_lock_acquire(&rcu_callback_map); + if (idx == 0) { // kmalloc() / kfree(). + trace_rcu_invoke_kfree_bulk_callback( + "slab", bnode->nr_records, + bnode->records); + + kfree_bulk(bnode->nr_records, bnode->records); + } else { // vmalloc() / vfree(). + for (i = 0; i < bnode->nr_records; i++) { + trace_rcu_invoke_kvfree_callback( + "slab", bnode->records[i], 0); + + vfree(bnode->records[i]); + } + } + rcu_lock_release(&rcu_callback_map); + } + + raw_spin_lock_irqsave(&krcp->lock, flags); + if (put_cached_bnode(krcp, bnode)) + bnode = NULL; + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + if (bnode) + free_page((unsigned long) bnode); + + cond_resched_tasks_rcu_qs(); +} + +static void +kvfree_rcu_list(struct rcu_head *head) +{ + struct rcu_head *next; + + for (; head; head = next) { + void *ptr = (void *) head->func; + unsigned long offset = (void *) head - ptr; + + next = head->next; + debug_rcu_head_unqueue((struct rcu_head *)ptr); + rcu_lock_acquire(&rcu_callback_map); + trace_rcu_invoke_kvfree_callback("slab", head, offset); + + kvfree(ptr); + + rcu_lock_release(&rcu_callback_map); + cond_resched_tasks_rcu_qs(); + } +} + +/* + * This function is invoked in workqueue context after a grace period. + * It frees all the objects queued on ->bulk_head_free or ->head_free. + */ +static void kfree_rcu_work(struct work_struct *work) +{ + unsigned long flags; + struct kvfree_rcu_bulk_data *bnode, *n; + struct list_head bulk_head[FREE_N_CHANNELS]; + struct rcu_head *head; + struct kfree_rcu_cpu *krcp; + struct kfree_rcu_cpu_work *krwp; + struct rcu_gp_oldstate head_gp_snap; + int i; + + krwp = container_of(to_rcu_work(work), + struct kfree_rcu_cpu_work, rcu_work); + krcp = krwp->krcp; + + raw_spin_lock_irqsave(&krcp->lock, flags); + // Channels 1 and 2. + for (i = 0; i < FREE_N_CHANNELS; i++) + list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]); + + // Channel 3. + head = krwp->head_free; + krwp->head_free = NULL; + head_gp_snap = krwp->head_free_gp_snap; + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + // Handle the first two channels. + for (i = 0; i < FREE_N_CHANNELS; i++) { + // Start from the tail page, so a GP is likely passed for it. + list_for_each_entry_safe(bnode, n, &bulk_head[i], list) + kvfree_rcu_bulk(krcp, bnode, i); + } + + /* + * This is used when the "bulk" path can not be used for the + * double-argument of kvfree_rcu(). This happens when the + * page-cache is empty, which means that objects are instead + * queued on a linked list through their rcu_head structures. + * This list is named "Channel 3". + */ + if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap))) + kvfree_rcu_list(head); +} + +static bool +need_offload_krc(struct kfree_rcu_cpu *krcp) +{ + int i; + + for (i = 0; i < FREE_N_CHANNELS; i++) + if (!list_empty(&krcp->bulk_head[i])) + return true; + + return !!READ_ONCE(krcp->head); +} + +static bool +need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp) +{ + int i; + + for (i = 0; i < FREE_N_CHANNELS; i++) + if (!list_empty(&krwp->bulk_head_free[i])) + return true; + + return !!krwp->head_free; +} + +static int krc_count(struct kfree_rcu_cpu *krcp) +{ + int sum = atomic_read(&krcp->head_count); + int i; + + for (i = 0; i < FREE_N_CHANNELS; i++) + sum += atomic_read(&krcp->bulk_count[i]); + + return sum; +} + +static void +__schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) +{ + long delay, delay_left; + + delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES; + if (delayed_work_pending(&krcp->monitor_work)) { + delay_left = krcp->monitor_work.timer.expires - jiffies; + if (delay < delay_left) + mod_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay); + return; + } + queue_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay); +} + +static void +schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&krcp->lock, flags); + __schedule_delayed_monitor_work(krcp); + raw_spin_unlock_irqrestore(&krcp->lock, flags); +} + +static void +kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp) +{ + struct list_head bulk_ready[FREE_N_CHANNELS]; + struct kvfree_rcu_bulk_data *bnode, *n; + struct rcu_head *head_ready = NULL; + unsigned long flags; + int i; + + raw_spin_lock_irqsave(&krcp->lock, flags); + for (i = 0; i < FREE_N_CHANNELS; i++) { + INIT_LIST_HEAD(&bulk_ready[i]); + + list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) { + if (!poll_state_synchronize_rcu_full(&bnode->gp_snap)) + break; + + atomic_sub(bnode->nr_records, &krcp->bulk_count[i]); + list_move(&bnode->list, &bulk_ready[i]); + } + } + + if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) { + head_ready = krcp->head; + atomic_set(&krcp->head_count, 0); + WRITE_ONCE(krcp->head, NULL); + } + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + for (i = 0; i < FREE_N_CHANNELS; i++) { + list_for_each_entry_safe(bnode, n, &bulk_ready[i], list) + kvfree_rcu_bulk(krcp, bnode, i); + } + + if (head_ready) + kvfree_rcu_list(head_ready); +} + +/* + * Return: %true if a work is queued, %false otherwise. + */ +static bool +kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp) +{ + unsigned long flags; + bool queued = false; + int i, j; + + raw_spin_lock_irqsave(&krcp->lock, flags); + + // Attempt to start a new batch. + for (i = 0; i < KFREE_N_BATCHES; i++) { + struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]); + + // Try to detach bulk_head or head and attach it, only when + // all channels are free. Any channel is not free means at krwp + // there is on-going rcu work to handle krwp's free business. + if (need_wait_for_krwp_work(krwp)) + continue; + + // kvfree_rcu_drain_ready() might handle this krcp, if so give up. + if (need_offload_krc(krcp)) { + // Channel 1 corresponds to the SLAB-pointer bulk path. + // Channel 2 corresponds to vmalloc-pointer bulk path. + for (j = 0; j < FREE_N_CHANNELS; j++) { + if (list_empty(&krwp->bulk_head_free[j])) { + atomic_set(&krcp->bulk_count[j], 0); + list_replace_init(&krcp->bulk_head[j], + &krwp->bulk_head_free[j]); + } + } + + // Channel 3 corresponds to both SLAB and vmalloc + // objects queued on the linked list. + if (!krwp->head_free) { + krwp->head_free = krcp->head; + get_state_synchronize_rcu_full(&krwp->head_free_gp_snap); + atomic_set(&krcp->head_count, 0); + WRITE_ONCE(krcp->head, NULL); + } + + // One work is per one batch, so there are three + // "free channels", the batch can handle. Break + // the loop since it is done with this CPU thus + // queuing an RCU work is _always_ success here. + queued = queue_rcu_work(rcu_reclaim_wq, &krwp->rcu_work); + WARN_ON_ONCE(!queued); + break; + } + } + + raw_spin_unlock_irqrestore(&krcp->lock, flags); + return queued; +} + +/* + * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. + */ +static void kfree_rcu_monitor(struct work_struct *work) +{ + struct kfree_rcu_cpu *krcp = container_of(work, + struct kfree_rcu_cpu, monitor_work.work); + + // Drain ready for reclaim. + kvfree_rcu_drain_ready(krcp); + + // Queue a batch for a rest. + kvfree_rcu_queue_batch(krcp); + + // If there is nothing to detach, it means that our job is + // successfully done here. In case of having at least one + // of the channels that is still busy we should rearm the + // work to repeat an attempt. Because previous batches are + // still in progress. + if (need_offload_krc(krcp)) + schedule_delayed_monitor_work(krcp); +} + +static void fill_page_cache_func(struct work_struct *work) +{ + struct kvfree_rcu_bulk_data *bnode; + struct kfree_rcu_cpu *krcp = + container_of(work, struct kfree_rcu_cpu, + page_cache_work.work); + unsigned long flags; + int nr_pages; + bool pushed; + int i; + + nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ? + 1 : rcu_min_cached_objs; + + for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) { + bnode = (struct kvfree_rcu_bulk_data *) + __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + + if (!bnode) + break; + + raw_spin_lock_irqsave(&krcp->lock, flags); + pushed = put_cached_bnode(krcp, bnode); + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + if (!pushed) { + free_page((unsigned long) bnode); + break; + } + } + + atomic_set(&krcp->work_in_progress, 0); + atomic_set(&krcp->backoff_page_cache_fill, 0); +} + +// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock() +// state specified by flags. If can_alloc is true, the caller must +// be schedulable and not be holding any locks or mutexes that might be +// acquired by the memory allocator or anything that it might invoke. +// Returns true if ptr was successfully recorded, else the caller must +// use a fallback. +static inline bool +add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, + unsigned long *flags, void *ptr, bool can_alloc) +{ + struct kvfree_rcu_bulk_data *bnode; + int idx; + + *krcp = krc_this_cpu_lock(flags); + if (unlikely(!(*krcp)->initialized)) + return false; + + idx = !!is_vmalloc_addr(ptr); + bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx], + struct kvfree_rcu_bulk_data, list); + + /* Check if a new block is required. */ + if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) { + bnode = get_cached_bnode(*krcp); + if (!bnode && can_alloc) { + krc_this_cpu_unlock(*krcp, *flags); + + // __GFP_NORETRY - allows a light-weight direct reclaim + // what is OK from minimizing of fallback hitting point of + // view. Apart of that it forbids any OOM invoking what is + // also beneficial since we are about to release memory soon. + // + // __GFP_NOMEMALLOC - prevents from consuming of all the + // memory reserves. Please note we have a fallback path. + // + // __GFP_NOWARN - it is supposed that an allocation can + // be failed under low memory or high memory pressure + // scenarios. + bnode = (struct kvfree_rcu_bulk_data *) + __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + raw_spin_lock_irqsave(&(*krcp)->lock, *flags); + } + + if (!bnode) + return false; + + // Initialize the new block and attach it. + bnode->nr_records = 0; + list_add(&bnode->list, &(*krcp)->bulk_head[idx]); + } + + // Finally insert and update the GP for this page. + bnode->nr_records++; + bnode->records[bnode->nr_records - 1] = ptr; + get_state_synchronize_rcu_full(&bnode->gp_snap); + atomic_inc(&(*krcp)->bulk_count[idx]); + + return true; +} + +static enum hrtimer_restart +schedule_page_work_fn(struct hrtimer *t) +{ + struct kfree_rcu_cpu *krcp = + container_of(t, struct kfree_rcu_cpu, hrtimer); + + queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0); + return HRTIMER_NORESTART; +} + +static void +run_page_cache_worker(struct kfree_rcu_cpu *krcp) +{ + // If cache disabled, bail out. + if (!rcu_min_cached_objs) + return; + + if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && + !atomic_xchg(&krcp->work_in_progress, 1)) { + if (atomic_read(&krcp->backoff_page_cache_fill)) { + queue_delayed_work(rcu_reclaim_wq, + &krcp->page_cache_work, + msecs_to_jiffies(rcu_delay_page_cache_fill_msec)); + } else { + hrtimer_setup(&krcp->hrtimer, schedule_page_work_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL); + } + } +} + +void __init kfree_rcu_scheduler_running(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + if (need_offload_krc(krcp)) + schedule_delayed_monitor_work(krcp); + } +} + +/* + * Queue a request for lazy invocation of the appropriate free routine + * after a grace period. Please note that three paths are maintained, + * two for the common case using arrays of pointers and a third one that + * is used only when the main paths cannot be used, for example, due to + * memory pressure. + * + * Each kvfree_call_rcu() request is added to a batch. The batch will be drained + * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will + * be free'd in workqueue context. This allows us to: batch requests together to + * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load. + */ +void kvfree_call_rcu(struct rcu_head *head, void *ptr) +{ + unsigned long flags; + struct kfree_rcu_cpu *krcp; + bool success; + + /* + * Please note there is a limitation for the head-less + * variant, that is why there is a clear rule for such + * objects: it can be used from might_sleep() context + * only. For other places please embed an rcu_head to + * your data. + */ + if (!head) + might_sleep(); + + // Queue the object but don't yet schedule the batch. + if (debug_rcu_head_queue(ptr)) { + // Probable double kfree_rcu(), just leak. + WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n", + __func__, head); + + // Mark as success and leave. + return; + } + + kasan_record_aux_stack(ptr); + success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head); + if (!success) { + run_page_cache_worker(krcp); + + if (head == NULL) + // Inline if kvfree_rcu(one_arg) call. + goto unlock_return; + + head->func = ptr; + head->next = krcp->head; + WRITE_ONCE(krcp->head, head); + atomic_inc(&krcp->head_count); + + // Take a snapshot for this krcp. + krcp->head_gp_snap = get_state_synchronize_rcu(); + success = true; + } + + /* + * The kvfree_rcu() caller considers the pointer freed at this point + * and likely removes any references to it. Since the actual slab + * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore + * this object (no scanning or false positives reporting). + */ + kmemleak_ignore(ptr); + + // Set timer to drain after KFREE_DRAIN_JIFFIES. + if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING) + __schedule_delayed_monitor_work(krcp); + +unlock_return: + krc_this_cpu_unlock(krcp, flags); + + /* + * Inline kvfree() after synchronize_rcu(). We can do + * it from might_sleep() context only, so the current + * CPU can pass the QS state. + */ + if (!success) { + debug_rcu_head_unqueue((struct rcu_head *) ptr); + synchronize_rcu(); + kvfree(ptr); + } +} +EXPORT_SYMBOL_GPL(kvfree_call_rcu); + +/** + * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete. + * + * Note that a single argument of kvfree_rcu() call has a slow path that + * triggers synchronize_rcu() following by freeing a pointer. It is done + * before the return from the function. Therefore for any single-argument + * call that will result in a kfree() to a cache that is to be destroyed + * during module exit, it is developer's responsibility to ensure that all + * such calls have returned before the call to kmem_cache_destroy(). + */ +void kvfree_rcu_barrier(void) +{ + struct kfree_rcu_cpu_work *krwp; + struct kfree_rcu_cpu *krcp; + bool queued; + int i, cpu; + + /* + * Firstly we detach objects and queue them over an RCU-batch + * for all CPUs. Finally queued works are flushed for each CPU. + * + * Please note. If there are outstanding batches for a particular + * CPU, those have to be finished first following by queuing a new. + */ + for_each_possible_cpu(cpu) { + krcp = per_cpu_ptr(&krc, cpu); + + /* + * Check if this CPU has any objects which have been queued for a + * new GP completion. If not(means nothing to detach), we are done + * with it. If any batch is pending/running for this "krcp", below + * per-cpu flush_rcu_work() waits its completion(see last step). + */ + if (!need_offload_krc(krcp)) + continue; + + while (1) { + /* + * If we are not able to queue a new RCU work it means: + * - batches for this CPU are still in flight which should + * be flushed first and then repeat; + * - no objects to detach, because of concurrency. + */ + queued = kvfree_rcu_queue_batch(krcp); + + /* + * Bail out, if there is no need to offload this "krcp" + * anymore. As noted earlier it can run concurrently. + */ + if (queued || !need_offload_krc(krcp)) + break; + + /* There are ongoing batches. */ + for (i = 0; i < KFREE_N_BATCHES; i++) { + krwp = &(krcp->krw_arr[i]); + flush_rcu_work(&krwp->rcu_work); + } + } + } + + /* + * Now we guarantee that all objects are flushed. + */ + for_each_possible_cpu(cpu) { + krcp = per_cpu_ptr(&krc, cpu); + + /* + * A monitor work can drain ready to reclaim objects + * directly. Wait its completion if running or pending. + */ + cancel_delayed_work_sync(&krcp->monitor_work); + + for (i = 0; i < KFREE_N_BATCHES; i++) { + krwp = &(krcp->krw_arr[i]); + flush_rcu_work(&krwp->rcu_work); + } + } +} +EXPORT_SYMBOL_GPL(kvfree_rcu_barrier); + +static unsigned long +kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +{ + int cpu; + unsigned long count = 0; + + /* Snapshot count of all CPUs */ + for_each_possible_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + count += krc_count(krcp); + count += READ_ONCE(krcp->nr_bkv_objs); + atomic_set(&krcp->backoff_page_cache_fill, 1); + } + + return count == 0 ? SHRINK_EMPTY : count; +} + +static unsigned long +kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + int cpu, freed = 0; + + for_each_possible_cpu(cpu) { + int count; + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + count = krc_count(krcp); + count += drain_page_cache(krcp); + kfree_rcu_monitor(&krcp->monitor_work.work); + + sc->nr_to_scan -= count; + freed += count; + + if (sc->nr_to_scan <= 0) + break; + } + + return freed == 0 ? SHRINK_STOP : freed; +} + +void __init kvfree_rcu_init(void) +{ + int cpu; + int i, j; + struct shrinker *kfree_rcu_shrinker; + + rcu_reclaim_wq = alloc_workqueue("kvfree_rcu_reclaim", + WQ_UNBOUND | WQ_MEM_RECLAIM, 0); + WARN_ON(!rcu_reclaim_wq); + + /* Clamp it to [0:100] seconds interval. */ + if (rcu_delay_page_cache_fill_msec < 0 || + rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) { + + rcu_delay_page_cache_fill_msec = + clamp(rcu_delay_page_cache_fill_msec, 0, + (int) (100 * MSEC_PER_SEC)); + + pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n", + rcu_delay_page_cache_fill_msec); + } + + for_each_possible_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + for (i = 0; i < KFREE_N_BATCHES; i++) { + INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); + krcp->krw_arr[i].krcp = krcp; + + for (j = 0; j < FREE_N_CHANNELS; j++) + INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]); + } + + for (i = 0; i < FREE_N_CHANNELS; i++) + INIT_LIST_HEAD(&krcp->bulk_head[i]); + + INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); + INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func); + krcp->initialized = true; + } + + kfree_rcu_shrinker = shrinker_alloc(0, "slab-kvfree-rcu"); + if (!kfree_rcu_shrinker) { + pr_err("Failed to allocate kfree_rcu() shrinker!\n"); + return; + } + + kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count; + kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan; + + shrinker_register(kfree_rcu_shrinker); +} + +#endif /* CONFIG_KVFREE_RCU_BATCHED */ + diff --git a/mm/slub.c b/mm/slub.c index 19980419b176..31e11ef256f9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -19,6 +19,7 @@ #include <linux/bitops.h> #include <linux/slab.h> #include "slab.h" +#include <linux/vmalloc.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/kasan.h> @@ -1017,22 +1018,31 @@ void skip_orig_size_check(struct kmem_cache *s, const void *object) set_orig_size(s, (void *)object, s->object_size); } -static void slab_bug(struct kmem_cache *s, char *fmt, ...) +static void __slab_bug(struct kmem_cache *s, const char *fmt, va_list argsp) { struct va_format vaf; va_list args; - va_start(args, fmt); + va_copy(args, argsp); vaf.fmt = fmt; vaf.va = &args; pr_err("=============================================================================\n"); - pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf); + pr_err("BUG %s (%s): %pV\n", s ? s->name : "<unknown>", print_tainted(), &vaf); pr_err("-----------------------------------------------------------------------------\n\n"); va_end(args); } +static void slab_bug(struct kmem_cache *s, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + __slab_bug(s, fmt, args); + va_end(args); +} + __printf(2, 3) -static void slab_fix(struct kmem_cache *s, char *fmt, ...) +static void slab_fix(struct kmem_cache *s, const char *fmt, ...) { struct va_format vaf; va_list args; @@ -1085,19 +1095,19 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p) /* Beginning of the filler is the free pointer */ print_section(KERN_ERR, "Padding ", p + off, size_from_object(s) - off); - - dump_stack(); } static void object_err(struct kmem_cache *s, struct slab *slab, - u8 *object, char *reason) + u8 *object, const char *reason) { if (slab_add_kunit_errors()) return; - slab_bug(s, "%s", reason); + slab_bug(s, reason); print_trailer(s, slab, object); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + WARN_ON(1); } static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, @@ -1114,22 +1124,30 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, return false; } +static void __slab_err(struct slab *slab) +{ + if (slab_in_kunit_test()) + return; + + print_slab_info(slab); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + WARN_ON(1); +} + static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab, const char *fmt, ...) { va_list args; - char buf[100]; if (slab_add_kunit_errors()) return; va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); + __slab_bug(s, fmt, args); va_end(args); - slab_bug(s, "%s", buf); - print_slab_info(slab); - dump_stack(); - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + __slab_err(slab); } static void init_object(struct kmem_cache *s, void *object, u8 val) @@ -1166,7 +1184,7 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) s->inuse - poison_size); } -static void restore_bytes(struct kmem_cache *s, char *message, u8 data, +static void restore_bytes(struct kmem_cache *s, const char *message, u8 data, void *from, void *to) { slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data); @@ -1181,8 +1199,8 @@ static void restore_bytes(struct kmem_cache *s, char *message, u8 data, static pad_check_attributes int check_bytes_and_report(struct kmem_cache *s, struct slab *slab, - u8 *object, char *what, - u8 *start, unsigned int value, unsigned int bytes) + u8 *object, const char *what, u8 *start, unsigned int value, + unsigned int bytes, bool slab_obj_print) { u8 *fault; u8 *end; @@ -1201,10 +1219,11 @@ check_bytes_and_report(struct kmem_cache *s, struct slab *slab, if (slab_add_kunit_errors()) goto skip_bug_print; - slab_bug(s, "%s overwritten", what); - pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", - fault, end - 1, fault - addr, - fault[0], value); + pr_err("[%s overwritten] 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", + what, fault, end - 1, fault - addr, fault[0], value); + + if (slab_obj_print) + object_err(s, slab, object, "Object corrupt"); skip_bug_print: restore_bytes(s, what, value, fault, end); @@ -1268,7 +1287,7 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p) return 1; return check_bytes_and_report(s, slab, p, "Object padding", - p + off, POISON_INUSE, size_from_object(s) - off); + p + off, POISON_INUSE, size_from_object(s) - off, true); } /* Check the pad bytes at the end of a slab page */ @@ -1301,9 +1320,10 @@ slab_pad_check(struct kmem_cache *s, struct slab *slab) while (end > fault && end[-1] == POISON_INUSE) end--; - slab_err(s, slab, "Padding overwritten. 0x%p-0x%p @offset=%tu", - fault, end - 1, fault - start); + slab_bug(s, "Padding overwritten. 0x%p-0x%p @offset=%tu", + fault, end - 1, fault - start); print_section(KERN_ERR, "Padding ", pad, remainder); + __slab_err(slab); restore_bytes(s, "slab padding", POISON_INUSE, fault, end); } @@ -1318,11 +1338,11 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if (s->flags & SLAB_RED_ZONE) { if (!check_bytes_and_report(s, slab, object, "Left Redzone", - object - s->red_left_pad, val, s->red_left_pad)) + object - s->red_left_pad, val, s->red_left_pad, ret)) ret = 0; if (!check_bytes_and_report(s, slab, object, "Right Redzone", - endobject, val, s->inuse - s->object_size)) + endobject, val, s->inuse - s->object_size, ret)) ret = 0; if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) { @@ -1331,7 +1351,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if (s->object_size > orig_size && !check_bytes_and_report(s, slab, object, "kmalloc Redzone", p + orig_size, - val, s->object_size - orig_size)) { + val, s->object_size - orig_size, ret)) { ret = 0; } } @@ -1339,7 +1359,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { if (!check_bytes_and_report(s, slab, p, "Alignment padding", endobject, POISON_INUSE, - s->inuse - s->object_size)) + s->inuse - s->object_size, ret)) ret = 0; } } @@ -1355,11 +1375,11 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if (kasan_meta_size < s->object_size - 1 && !check_bytes_and_report(s, slab, p, "Poison", p + kasan_meta_size, POISON_FREE, - s->object_size - kasan_meta_size - 1)) + s->object_size - kasan_meta_size - 1, ret)) ret = 0; if (kasan_meta_size < s->object_size && !check_bytes_and_report(s, slab, p, "End Poison", - p + s->object_size - 1, POISON_END, 1)) + p + s->object_size - 1, POISON_END, 1, ret)) ret = 0; } /* @@ -1385,11 +1405,6 @@ static int check_object(struct kmem_cache *s, struct slab *slab, ret = 0; } - if (!ret && !slab_in_kunit_test()) { - print_trailer(s, slab, object); - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); - } - return ret; } @@ -1427,7 +1442,7 @@ static int check_slab(struct kmem_cache *s, struct slab *slab) * Determine if a certain object in a slab is on the freelist. Must hold the * slab lock to guarantee that the chains are in a consistent state. */ -static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search) +static bool on_freelist(struct kmem_cache *s, struct slab *slab, void *search) { int nr = 0; void *fp; @@ -1437,26 +1452,34 @@ static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search) fp = slab->freelist; while (fp && nr <= slab->objects) { if (fp == search) - return 1; + return true; if (!check_valid_pointer(s, slab, fp)) { if (object) { object_err(s, slab, object, "Freechain corrupt"); set_freepointer(s, object, NULL); + break; } else { slab_err(s, slab, "Freepointer corrupt"); slab->freelist = NULL; slab->inuse = slab->objects; slab_fix(s, "Freelist cleared"); - return 0; + return false; } - break; } object = fp; fp = get_freepointer(s, object); nr++; } + if (nr > slab->objects) { + slab_err(s, slab, "Freelist cycle detected"); + slab->freelist = NULL; + slab->inuse = slab->objects; + slab_fix(s, "Freelist cleared"); + return false; + } + max_objects = order_objects(slab_order(slab), s->size); if (max_objects > MAX_OBJS_PER_PAGE) max_objects = MAX_OBJS_PER_PAGE; @@ -1624,12 +1647,12 @@ static inline int free_consistency_checks(struct kmem_cache *s, slab_err(s, slab, "Attempt to free object(0x%p) outside of slab", object); } else if (!slab->slab_cache) { - pr_err("SLUB <none>: no slab for object 0x%p.\n", - object); - dump_stack(); - } else + slab_err(NULL, slab, "No slab cache for object 0x%p", + object); + } else { object_err(s, slab, object, - "page slab pointer corrupt."); + "page slab pointer corrupt."); + } return 0; } return 1; @@ -1950,6 +1973,11 @@ static inline void handle_failed_objexts_alloc(unsigned long obj_exts, #define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \ __GFP_ACCOUNT | __GFP_NOFAIL) +static inline void init_slab_obj_exts(struct slab *slab) +{ + slab->obj_exts = 0; +} + int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, gfp_t gfp, bool new_slab) { @@ -2020,20 +2048,12 @@ static inline void free_slab_obj_exts(struct slab *slab) slab->obj_exts = 0; } -static inline bool need_slab_obj_ext(void) -{ - if (mem_alloc_profiling_enabled()) - return true; +#else /* CONFIG_SLAB_OBJ_EXT */ - /* - * CONFIG_MEMCG creates vector of obj_cgroup objects conditionally - * inside memcg_slab_post_alloc_hook. No other users for now. - */ - return false; +static inline void init_slab_obj_exts(struct slab *slab) +{ } -#else /* CONFIG_SLAB_OBJ_EXT */ - static int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, gfp_t gfp, bool new_slab) { @@ -2044,11 +2064,6 @@ static inline void free_slab_obj_exts(struct slab *slab) { } -static inline bool need_slab_obj_ext(void) -{ - return false; -} - #endif /* CONFIG_SLAB_OBJ_EXT */ #ifdef CONFIG_MEM_ALLOC_PROFILING @@ -2069,41 +2084,46 @@ prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p) slab = virt_to_slab(p); if (!slab_obj_exts(slab) && - WARN(alloc_slab_obj_exts(slab, s, flags, false), - "%s, %s: Failed to create slab extension vector!\n", - __func__, s->name)) + alloc_slab_obj_exts(slab, s, flags, false)) { + pr_warn_once("%s, %s: Failed to create slab extension vector!\n", + __func__, s->name); return NULL; + } return slab_obj_exts(slab) + obj_to_index(s, slab, p); } -static inline void -alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) +/* Should be called only if mem_alloc_profiling_enabled() */ +static noinline void +__alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) { - if (need_slab_obj_ext()) { - struct slabobj_ext *obj_exts; + struct slabobj_ext *obj_exts; - obj_exts = prepare_slab_obj_exts_hook(s, flags, object); - /* - * Currently obj_exts is used only for allocation profiling. - * If other users appear then mem_alloc_profiling_enabled() - * check should be added before alloc_tag_add(). - */ - if (likely(obj_exts)) - alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size); - } + obj_exts = prepare_slab_obj_exts_hook(s, flags, object); + /* + * Currently obj_exts is used only for allocation profiling. + * If other users appear then mem_alloc_profiling_enabled() + * check should be added before alloc_tag_add(). + */ + if (likely(obj_exts)) + alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size); } static inline void -alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, - int objects) +alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) +{ + if (mem_alloc_profiling_enabled()) + __alloc_tagging_slab_alloc_hook(s, object, flags); +} + +/* Should be called only if mem_alloc_profiling_enabled() */ +static noinline void +__alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, + int objects) { struct slabobj_ext *obj_exts; int i; - if (!mem_alloc_profiling_enabled()) - return; - /* slab->obj_exts might not be NULL if it was created for MEMCG accounting. */ if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE)) return; @@ -2119,6 +2139,14 @@ alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, } } +static inline void +alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, + int objects) +{ + if (mem_alloc_profiling_enabled()) + __alloc_tagging_slab_free_hook(s, slab, p, objects); +} + #else /* CONFIG_MEM_ALLOC_PROFILING */ static inline void @@ -2189,9 +2217,24 @@ bool memcg_slab_post_charge(void *p, gfp_t flags) folio = virt_to_folio(p); if (!folio_test_slab(folio)) { - return folio_memcg_kmem(folio) || - (__memcg_kmem_charge_page(folio_page(folio, 0), flags, - folio_order(folio)) == 0); + int size; + + if (folio_memcg_kmem(folio)) + return true; + + if (__memcg_kmem_charge_page(folio_page(folio, 0), flags, + folio_order(folio))) + return false; + + /* + * This folio has already been accounted in the global stats but + * not in the memcg stats. So, subtract from the global and use + * the interface which adds to both global and memcg stats. + */ + size = folio_size(folio); + node_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, -size); + lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, size); + return true; } slab = folio_slab(folio); @@ -2296,7 +2339,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init, * We have to do this manually because the rcu_head is * not located inside the object. */ - kasan_record_aux_stack_noalloc(x); + kasan_record_aux_stack(x); delayed_free->object = x; call_rcu(&delayed_free->head, slab_free_after_rcu_debug); @@ -2405,17 +2448,15 @@ static inline struct slab *alloc_slab_page(gfp_t flags, int node, unsigned int order = oo_order(oo); if (node == NUMA_NO_NODE) - folio = (struct folio *)alloc_pages(flags, order); + folio = (struct folio *)alloc_frozen_pages(flags, order); else - folio = (struct folio *)__alloc_pages_node(node, flags, order); + folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL); if (!folio) return NULL; slab = folio_slab(folio); __folio_set_slab(folio); - /* Make the flag visible before any changes to folio->mapping */ - smp_wmb(); if (folio_is_pfmemalloc(folio)) slab_set_pfmemalloc(slab); @@ -2543,8 +2584,12 @@ static __always_inline void account_slab(struct slab *slab, int order, static __always_inline void unaccount_slab(struct slab *slab, int order, struct kmem_cache *s) { - if (memcg_kmem_online() || need_slab_obj_ext()) - free_slab_obj_exts(slab); + /* + * The slab object extensions should now be freed regardless of + * whether mem_alloc_profiling_enabled() or not because profiling + * might have been disabled after slab->obj_exts got allocated. + */ + free_slab_obj_exts(slab); mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), -(PAGE_SIZE << order)); @@ -2588,6 +2633,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) slab->objects = oo_objects(oo); slab->inuse = 0; slab->frozen = 0; + init_slab_obj_exts(slab); account_slab(slab, oo_order(oo), s, flags); @@ -2636,12 +2682,10 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab) __slab_clear_pfmemalloc(slab); folio->mapping = NULL; - /* Make the mapping reset visible before clearing the flag */ - smp_wmb(); __folio_clear_slab(folio); mm_account_reclaimed_pages(pages); unaccount_slab(slab, order, s); - __free_pages(&folio->page, order); + free_frozen_pages(&folio->page, order); } static void rcu_free_slab(struct rcu_head *h) @@ -4230,6 +4274,7 @@ static void *___kmalloc_large_node(size_t size, gfp_t flags, int node) ptr = folio_address(folio); lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, PAGE_SIZE << order); + __folio_set_large_kmalloc(folio); } ptr = kasan_kmalloc_large(ptr, size, flags); @@ -4705,6 +4750,11 @@ static void free_large_kmalloc(struct folio *folio, void *object) { unsigned int order = folio_order(folio); + if (WARN_ON_ONCE(!folio_test_large_kmalloc(folio))) { + dump_page(&folio->page, "Not a kmalloc allocation"); + return; + } + if (WARN_ON_ONCE(order == 0)) pr_warn_once("object pointer: 0x%p\n", object); @@ -4714,9 +4764,55 @@ static void free_large_kmalloc(struct folio *folio, void *object) lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order)); + __folio_clear_large_kmalloc(folio); folio_put(folio); } +/* + * Given an rcu_head embedded within an object obtained from kvmalloc at an + * offset < 4k, free the object in question. + */ +void kvfree_rcu_cb(struct rcu_head *head) +{ + void *obj = head; + struct folio *folio; + struct slab *slab; + struct kmem_cache *s; + void *slab_addr; + + if (is_vmalloc_addr(obj)) { + obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj); + vfree(obj); + return; + } + + folio = virt_to_folio(obj); + if (!folio_test_slab(folio)) { + /* + * rcu_head offset can be only less than page size so no need to + * consider folio order + */ + obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj); + free_large_kmalloc(folio, obj); + return; + } + + slab = folio_slab(folio); + s = slab->slab_cache; + slab_addr = folio_address(folio); + + if (is_kfence_address(obj)) { + obj = kfence_object_start(obj); + } else { + unsigned int idx = __obj_to_index(s, slab_addr, obj); + + obj = slab_addr + s->size * idx; + obj = fixup_red_left(s, obj); + } + + slab_free(s, slab, obj, _RET_IP_); +} + /** * kfree - free previously allocated memory * @object: pointer returned by kmalloc() or kmem_cache_alloc() @@ -4867,6 +4963,170 @@ void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) } EXPORT_SYMBOL(krealloc_noprof); +static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) +{ + /* + * We want to attempt a large physically contiguous block first because + * it is less likely to fragment multiple larger blocks and therefore + * contribute to a long term fragmentation less than vmalloc fallback. + * However make sure that larger requests are not too disruptive - i.e. + * do not direct reclaim unless physically continuous memory is preferred + * (__GFP_RETRY_MAYFAIL mode). We still kick in kswapd/kcompactd to + * start working in the background + */ + if (size > PAGE_SIZE) { + flags |= __GFP_NOWARN; + + if (!(flags & __GFP_RETRY_MAYFAIL)) + flags &= ~__GFP_DIRECT_RECLAIM; + + /* nofail semantic is implemented by the vmalloc fallback */ + flags &= ~__GFP_NOFAIL; + } + + return flags; +} + +/** + * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon + * failure, fall back to non-contiguous (vmalloc) allocation. + * @size: size of the request. + * @b: which set of kmalloc buckets to allocate from. + * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. + * @node: numa node to allocate from + * + * Uses kmalloc to get the memory but if the allocation fails then falls back + * to the vmalloc allocator. Use kvfree for freeing the memory. + * + * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier. + * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is + * preferable to the vmalloc fallback, due to visible performance drawbacks. + * + * Return: pointer to the allocated memory of %NULL in case of failure + */ +void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) +{ + void *ret; + + /* + * It doesn't really make sense to fallback to vmalloc for sub page + * requests + */ + ret = __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), + kmalloc_gfp_adjust(flags, size), + node, _RET_IP_); + if (ret || size <= PAGE_SIZE) + return ret; + + /* non-sleeping allocations are not supported by vmalloc */ + if (!gfpflags_allow_blocking(flags)) + return NULL; + + /* Don't even allow crazy sizes */ + if (unlikely(size > INT_MAX)) { + WARN_ON_ONCE(!(flags & __GFP_NOWARN)); + return NULL; + } + + /* + * kvmalloc() can always use VM_ALLOW_HUGE_VMAP, + * since the callers already cannot assume anything + * about the resulting pointer, and cannot play + * protection games. + */ + return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, + flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, + node, __builtin_return_address(0)); +} +EXPORT_SYMBOL(__kvmalloc_node_noprof); + +/** + * kvfree() - Free memory. + * @addr: Pointer to allocated memory. + * + * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc(). + * It is slightly more efficient to use kfree() or vfree() if you are certain + * that you know which one to use. + * + * Context: Either preemptible task context or not-NMI interrupt. + */ +void kvfree(const void *addr) +{ + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); +} +EXPORT_SYMBOL(kvfree); + +/** + * kvfree_sensitive - Free a data object containing sensitive information. + * @addr: address of the data object to be freed. + * @len: length of the data object. + * + * Use the special memzero_explicit() function to clear the content of a + * kvmalloc'ed object containing sensitive data to make sure that the + * compiler won't optimize out the data clearing. + */ +void kvfree_sensitive(const void *addr, size_t len) +{ + if (likely(!ZERO_OR_NULL_PTR(addr))) { + memzero_explicit((void *)addr, len); + kvfree(addr); + } +} +EXPORT_SYMBOL(kvfree_sensitive); + +/** + * kvrealloc - reallocate memory; contents remain unchanged + * @p: object to reallocate memory for + * @size: the size to reallocate + * @flags: the flags for the page level allocator + * + * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0 + * and @p is not a %NULL pointer, the object pointed to is freed. + * + * If __GFP_ZERO logic is requested, callers must ensure that, starting with the + * initial memory allocation, every subsequent call to this API for the same + * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that + * __GFP_ZERO is not fully honored by this API. + * + * In any case, the contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. + * + * This function must not be called concurrently with itself or kvfree() for the + * same memory allocation. + * + * Return: pointer to the allocated memory or %NULL in case of error + */ +void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) +{ + void *n; + + if (is_vmalloc_addr(p)) + return vrealloc_noprof(p, size, flags); + + n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size)); + if (!n) { + /* We failed to krealloc(), fall back to kvmalloc(). */ + n = kvmalloc_noprof(size, flags); + if (!n) + return NULL; + + if (p) { + /* We already know that `p` is not a vmalloc address. */ + kasan_disable_current(); + memcpy(n, kasan_reset_tag(p), ksize(p)); + kasan_enable_current(); + + kfree(p); + } + } + + return n; +} +EXPORT_SYMBOL(kvrealloc_noprof); + struct detached_freelist { struct slab *slab; void *tail; @@ -5559,14 +5819,14 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) return !!oo_objects(s->oo); } -static void list_slab_objects(struct kmem_cache *s, struct slab *slab, - const char *text) +static void list_slab_objects(struct kmem_cache *s, struct slab *slab) { #ifdef CONFIG_SLUB_DEBUG void *addr = slab_address(slab); void *p; - slab_err(s, slab, text, s->name); + if (!slab_add_kunit_errors()) + slab_bug(s, "Objects remaining on __kmem_cache_shutdown()"); spin_lock(&object_map_lock); __fill_map(object_map, s, slab); @@ -5581,6 +5841,8 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab, } } spin_unlock(&object_map_lock); + + __slab_err(slab); #endif } @@ -5601,8 +5863,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) remove_partial(n, slab); list_add(&slab->slab_list, &discard); } else { - list_slab_objects(s, slab, - "Objects remaining in %s on __kmem_cache_shutdown()"); + list_slab_objects(s, slab); } } spin_unlock_irq(&n->list_lock); @@ -7498,10 +7759,7 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep) return -ENOMEM; } - if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0) - alloc = TRACK_ALLOC; - else - alloc = TRACK_FREE; + alloc = debugfs_get_aux_num(filep); if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) { bitmap_free(obj_map); @@ -7557,11 +7815,11 @@ static void debugfs_slab_add(struct kmem_cache *s) slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root); - debugfs_create_file("alloc_traces", 0400, - slab_cache_dir, s, &slab_debugfs_fops); + debugfs_create_file_aux_num("alloc_traces", 0400, slab_cache_dir, s, + TRACK_ALLOC, &slab_debugfs_fops); - debugfs_create_file("free_traces", 0400, - slab_cache_dir, s, &slab_debugfs_fops); + debugfs_create_file_aux_num("free_traces", 0400, slab_cache_dir, s, + TRACK_FREE, &slab_debugfs_fops); } void debugfs_slab_release(struct kmem_cache *s) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index cec67c5f37d8..fd2ab5118e13 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -30,6 +30,17 @@ #include <asm/dma.h> #include <asm/pgalloc.h> +#include <asm/tlbflush.h> + +#include "hugetlb_vmemmap.h" + +/* + * Flags for vmemmap_populate_range and friends. + */ +/* Get a ref on the head page struct page, for ZONE_DEVICE compound pages */ +#define VMEMMAP_POPULATE_PAGEREF 0x0001 + +#include "internal.h" /* * Allocate a block of memory to be used to back the virtual memory map @@ -42,8 +53,7 @@ static void * __ref __earlyonly_bootmem_alloc(int node, unsigned long align, unsigned long goal) { - return memblock_alloc_try_nid_raw(size, align, goal, - MEMBLOCK_ALLOC_ACCESSIBLE, node); + return memmap_alloc(size, align, goal, node, false); } void * __meminit vmemmap_alloc_block(unsigned long size, int node) @@ -143,17 +153,18 @@ void __meminit vmemmap_verify(pte_t *pte, int node, pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, struct vmem_altmap *altmap, - struct page *reuse) + unsigned long ptpfn, unsigned long flags) { pte_t *pte = pte_offset_kernel(pmd, addr); if (pte_none(ptep_get(pte))) { pte_t entry; void *p; - if (!reuse) { + if (ptpfn == (unsigned long)-1) { p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap); if (!p) return NULL; + ptpfn = PHYS_PFN(__pa(p)); } else { /* * When a PTE/PMD entry is freed from the init_mm @@ -164,10 +175,10 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, * and through vmemmap_populate_compound_pages() when * slab is available. */ - get_page(reuse); - p = page_to_virt(reuse); + if (flags & VMEMMAP_POPULATE_PAGEREF) + get_page(pfn_to_page(ptpfn)); } - entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); + entry = pfn_pte(ptpfn, PAGE_KERNEL); set_pte_at(&init_mm, addr, pte, entry); } return pte; @@ -237,7 +248,8 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node, struct vmem_altmap *altmap, - struct page *reuse) + unsigned long ptpfn, + unsigned long flags) { pgd_t *pgd; p4d_t *p4d; @@ -257,7 +269,7 @@ static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node, pmd = vmemmap_pmd_populate(pud, addr, node); if (!pmd) return NULL; - pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse); + pte = vmemmap_pte_populate(pmd, addr, node, altmap, ptpfn, flags); if (!pte) return NULL; vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); @@ -268,13 +280,15 @@ static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node, static int __meminit vmemmap_populate_range(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap, - struct page *reuse) + unsigned long ptpfn, + unsigned long flags) { unsigned long addr = start; pte_t *pte; for (; addr < end; addr += PAGE_SIZE) { - pte = vmemmap_populate_address(addr, node, altmap, reuse); + pte = vmemmap_populate_address(addr, node, altmap, + ptpfn, flags); if (!pte) return -ENOMEM; } @@ -285,7 +299,107 @@ static int __meminit vmemmap_populate_range(unsigned long start, int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap) { - return vmemmap_populate_range(start, end, node, altmap, NULL); + return vmemmap_populate_range(start, end, node, altmap, -1, 0); +} + +/* + * Undo populate_hvo, and replace it with a normal base page mapping. + * Used in memory init in case a HVO mapping needs to be undone. + * + * This can happen when it is discovered that a memblock allocated + * hugetlb page spans multiple zones, which can only be verified + * after zones have been initialized. + * + * We know that: + * 1) The first @headsize / PAGE_SIZE vmemmap pages were individually + * allocated through memblock, and mapped. + * + * 2) The rest of the vmemmap pages are mirrors of the last head page. + */ +int __meminit vmemmap_undo_hvo(unsigned long addr, unsigned long end, + int node, unsigned long headsize) +{ + unsigned long maddr, pfn; + pte_t *pte; + int headpages; + + /* + * Should only be called early in boot, so nothing will + * be accessing these page structures. + */ + WARN_ON(!early_boot_irqs_disabled); + + headpages = headsize >> PAGE_SHIFT; + + /* + * Clear mirrored mappings for tail page structs. + */ + for (maddr = addr + headsize; maddr < end; maddr += PAGE_SIZE) { + pte = virt_to_kpte(maddr); + pte_clear(&init_mm, maddr, pte); + } + + /* + * Clear and free mappings for head page and first tail page + * structs. + */ + for (maddr = addr; headpages-- > 0; maddr += PAGE_SIZE) { + pte = virt_to_kpte(maddr); + pfn = pte_pfn(ptep_get(pte)); + pte_clear(&init_mm, maddr, pte); + memblock_phys_free(PFN_PHYS(pfn), PAGE_SIZE); + } + + flush_tlb_kernel_range(addr, end); + + return vmemmap_populate(addr, end, node, NULL); +} + +/* + * Write protect the mirrored tail page structs for HVO. This will be + * called from the hugetlb code when gathering and initializing the + * memblock allocated gigantic pages. The write protect can't be + * done earlier, since it can't be guaranteed that the reserved + * page structures will not be written to during initialization, + * even if CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled. + * + * The PTEs are known to exist, and nothing else should be touching + * these pages. The caller is responsible for any TLB flushing. + */ +void vmemmap_wrprotect_hvo(unsigned long addr, unsigned long end, + int node, unsigned long headsize) +{ + unsigned long maddr; + pte_t *pte; + + for (maddr = addr + headsize; maddr < end; maddr += PAGE_SIZE) { + pte = virt_to_kpte(maddr); + ptep_set_wrprotect(&init_mm, maddr, pte); + } +} + +/* + * Populate vmemmap pages HVO-style. The first page contains the head + * page and needed tail pages, the other ones are mirrors of the first + * page. + */ +int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end, + int node, unsigned long headsize) +{ + pte_t *pte; + unsigned long maddr; + + for (maddr = addr; maddr < addr + headsize; maddr += PAGE_SIZE) { + pte = vmemmap_populate_address(maddr, node, NULL, -1, 0); + if (!pte) + return -ENOMEM; + } + + /* + * Reuse the last page struct page mapped above for the rest. + */ + return vmemmap_populate_range(maddr, end, node, NULL, + pte_pfn(ptep_get(pte)), 0); } void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, @@ -408,7 +522,8 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, * with just tail struct pages. */ return vmemmap_populate_range(start, end, node, NULL, - pte_page(ptep_get(pte))); + pte_pfn(ptep_get(pte)), + VMEMMAP_POPULATE_PAGEREF); } size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page)); @@ -416,13 +531,13 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, unsigned long next, last = addr + size; /* Populate the head page vmemmap page */ - pte = vmemmap_populate_address(addr, node, NULL, NULL); + pte = vmemmap_populate_address(addr, node, NULL, -1, 0); if (!pte) return -ENOMEM; /* Populate the tail pages vmemmap page */ next = addr + PAGE_SIZE; - pte = vmemmap_populate_address(next, node, NULL, NULL); + pte = vmemmap_populate_address(next, node, NULL, -1, 0); if (!pte) return -ENOMEM; @@ -432,7 +547,8 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, */ next += PAGE_SIZE; rc = vmemmap_populate_range(next, last, node, NULL, - pte_page(ptep_get(pte))); + pte_pfn(ptep_get(pte)), + VMEMMAP_POPULATE_PAGEREF); if (rc) return -ENOMEM; } @@ -469,3 +585,28 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn, return pfn_to_page(pfn); } + +#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT +/* + * This is called just before initializing sections for a NUMA node. + * Any special initialization that needs to be done before the + * generic initialization can be done from here. Sections that + * are initialized in hooks called from here will be skipped by + * the generic initialization. + */ +void __init sparse_vmemmap_init_nid_early(int nid) +{ + hugetlb_vmemmap_init_early(nid); +} + +/* + * This is called just before the initialization of page structures + * through memmap_init. Zones are now initialized, so any work that + * needs to be done that needs zone information can be done from + * here. + */ +void __init sparse_vmemmap_init_nid_late(int nid) +{ + hugetlb_vmemmap_init_late(nid); +} +#endif diff --git a/mm/sparse.c b/mm/sparse.c index 13b6624d3562..3c012cf83cc2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -170,11 +170,6 @@ static void __section_mark_present(struct mem_section *ms, ms->section_mem_map |= SECTION_MARKED_PRESENT; } -#define for_each_present_section_nr(start, section_nr) \ - for (section_nr = next_present_section_nr(start-1); \ - section_nr != -1; \ - section_nr = next_present_section_nr(section_nr)) - static inline unsigned long first_present_section_nr(void) { return next_present_section_nr(-1); @@ -257,10 +252,7 @@ static void __init memblocks_present(void) size = sizeof(struct mem_section *) * NR_SECTION_ROOTS; align = 1 << (INTERNODE_CACHE_SHIFT); - mem_section = memblock_alloc(size, align); - if (!mem_section) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, size, align); + mem_section = memblock_alloc_or_panic(size, align); } #endif @@ -411,13 +403,13 @@ static void __init check_usemap_section_nr(int nid, #endif /* CONFIG_MEMORY_HOTREMOVE */ #ifdef CONFIG_SPARSEMEM_VMEMMAP -static unsigned long __init section_map_size(void) +unsigned long __init section_map_size(void) { return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE); } #else -static unsigned long __init section_map_size(void) +unsigned long __init section_map_size(void) { return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); } @@ -498,6 +490,44 @@ void __weak __meminit vmemmap_populate_print_last(void) { } +static void *sparse_usagebuf __meminitdata; +static void *sparse_usagebuf_end __meminitdata; + +/* + * Helper function that is used for generic section initialization, and + * can also be used by any hooks added above. + */ +void __init sparse_init_early_section(int nid, struct page *map, + unsigned long pnum, unsigned long flags) +{ + BUG_ON(!sparse_usagebuf || sparse_usagebuf >= sparse_usagebuf_end); + check_usemap_section_nr(nid, sparse_usagebuf); + sparse_init_one_section(__nr_to_section(pnum), pnum, map, + sparse_usagebuf, SECTION_IS_EARLY | flags); + sparse_usagebuf = (void *)sparse_usagebuf + mem_section_usage_size(); +} + +static int __init sparse_usage_init(int nid, unsigned long map_count) +{ + unsigned long size; + + size = mem_section_usage_size() * map_count; + sparse_usagebuf = sparse_early_usemaps_alloc_pgdat_section( + NODE_DATA(nid), size); + if (!sparse_usagebuf) { + sparse_usagebuf_end = NULL; + return -ENOMEM; + } + + sparse_usagebuf_end = sparse_usagebuf + size; + return 0; +} + +static void __init sparse_usage_fini(void) +{ + sparse_usagebuf = sparse_usagebuf_end = NULL; +} + /* * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end) * And number of present sections in this node is map_count. @@ -506,47 +536,54 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin, unsigned long pnum_end, unsigned long map_count) { - struct mem_section_usage *usage; unsigned long pnum; struct page *map; + struct mem_section *ms; - usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid), - mem_section_usage_size() * map_count); - if (!usage) { + if (sparse_usage_init(nid, map_count)) { pr_err("%s: node[%d] usemap allocation failed", __func__, nid); goto failed; } + sparse_buffer_init(map_count * section_map_size(), nid); + + sparse_vmemmap_init_nid_early(nid); + for_each_present_section_nr(pnum_begin, pnum) { unsigned long pfn = section_nr_to_pfn(pnum); if (pnum >= pnum_end) break; - map = __populate_section_memmap(pfn, PAGES_PER_SECTION, - nid, NULL, NULL); - if (!map) { - pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.", - __func__, nid); - pnum_begin = pnum; - sparse_buffer_fini(); - goto failed; + ms = __nr_to_section(pnum); + if (!preinited_vmemmap_section(ms)) { + map = __populate_section_memmap(pfn, PAGES_PER_SECTION, + nid, NULL, NULL); + if (!map) { + pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.", + __func__, nid); + pnum_begin = pnum; + sparse_usage_fini(); + sparse_buffer_fini(); + goto failed; + } + sparse_init_early_section(nid, map, pnum, 0); } - check_usemap_section_nr(nid, usage); - sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage, - SECTION_IS_EARLY); - usage = (void *) usage + mem_section_usage_size(); } + sparse_usage_fini(); sparse_buffer_fini(); return; failed: - /* We failed to allocate, mark all the following pnums as not present */ + /* + * We failed to allocate, mark all the following pnums as not present, + * except the ones already initialized earlier. + */ for_each_present_section_nr(pnum_begin, pnum) { - struct mem_section *ms; - if (pnum >= pnum_end) break; ms = __nr_to_section(pnum); + if (!preinited_vmemmap_section(ms)) + ms->section_mem_map = 0; ms->section_mem_map = 0; } } diff --git a/mm/swap.c b/mm/swap.c index 10decd9dffa1..4fc322f7111a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -45,7 +45,7 @@ /* How many pages do we try to swap or page in/out together? As a power of 2 */ int page_cluster; -const int page_cluster_max = 31; +static const int page_cluster_max = 31; struct cpu_fbatches { /* @@ -109,7 +109,7 @@ void __folio_put(struct folio *folio) page_cache_release(folio); folio_unqueue_deferred_split(folio); mem_cgroup_uncharge(folio); - free_unref_page(&folio->page, folio_order(folio)); + free_frozen_pages(&folio->page, folio_order(folio)); } EXPORT_SYMBOL(__folio_put); @@ -309,7 +309,7 @@ static void lru_activate(struct lruvec *lruvec, struct folio *folio) trace_mm_lru_activate(folio); __count_vm_events(PGACTIVATE, nr_pages); - __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages); + count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages); } #ifdef CONFIG_SMP @@ -379,37 +379,58 @@ static void __lru_cache_activate_folio(struct folio *folio) } #ifdef CONFIG_LRU_GEN -static void folio_inc_refs(struct folio *folio) + +static void lru_gen_inc_refs(struct folio *folio) { unsigned long new_flags, old_flags = READ_ONCE(folio->flags); if (folio_test_unevictable(folio)) return; + /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) { - folio_set_referenced(folio); - return; - } - - if (!folio_test_workingset(folio)) { - folio_set_workingset(folio); + set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); return; } - /* see the comment on MAX_NR_TIERS */ do { - new_flags = old_flags & LRU_REFS_MASK; - if (new_flags == LRU_REFS_MASK) - break; + if ((old_flags & LRU_REFS_MASK) == LRU_REFS_MASK) { + if (!folio_test_workingset(folio)) + folio_set_workingset(folio); + return; + } - new_flags += BIT(LRU_REFS_PGOFF); - new_flags |= old_flags & ~LRU_REFS_MASK; + new_flags = old_flags + BIT(LRU_REFS_PGOFF); } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); } -#else -static void folio_inc_refs(struct folio *folio) + +static bool lru_gen_clear_refs(struct folio *folio) +{ + struct lru_gen_folio *lrugen; + int gen = folio_lru_gen(folio); + int type = folio_is_file_lru(folio); + + if (gen < 0) + return true; + + set_mask_bits(&folio->flags, LRU_REFS_FLAGS | BIT(PG_workingset), 0); + + lrugen = &folio_lruvec(folio)->lrugen; + /* whether can do without shuffling under the LRU lock */ + return gen == lru_gen_from_seq(READ_ONCE(lrugen->min_seq[type])); +} + +#else /* !CONFIG_LRU_GEN */ + +static void lru_gen_inc_refs(struct folio *folio) +{ +} + +static bool lru_gen_clear_refs(struct folio *folio) { + return false; } + #endif /* CONFIG_LRU_GEN */ /** @@ -427,8 +448,10 @@ static void folio_inc_refs(struct folio *folio) */ void folio_mark_accessed(struct folio *folio) { + if (folio_test_dropbehind(folio)) + return; if (lru_gen_enabled()) { - folio_inc_refs(folio); + lru_gen_inc_refs(folio); return; } @@ -474,7 +497,7 @@ void folio_add_lru(struct folio *folio) folio_test_unevictable(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); - /* see the comment in lru_gen_add_folio() */ + /* see the comment in lru_gen_folio_seq() */ if (lru_gen_enabled() && !folio_test_unevictable(folio) && lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) folio_set_active(folio); @@ -524,7 +547,7 @@ void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma) */ static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio) { - bool active = folio_test_active(folio); + bool active = folio_test_active(folio) || lru_gen_enabled(); long nr_pages = folio_nr_pages(folio); if (folio_test_unevictable(folio)) @@ -558,7 +581,7 @@ static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio) if (active) { __count_vm_events(PGDEACTIVATE, nr_pages); - __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, + count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } } @@ -576,7 +599,7 @@ static void lru_deactivate(struct lruvec *lruvec, struct folio *folio) lruvec_add_folio(lruvec, folio); __count_vm_events(PGDEACTIVATE, nr_pages); - __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); + count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio) @@ -589,7 +612,10 @@ static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio) lruvec_del_folio(lruvec, folio); folio_clear_active(folio); - folio_clear_referenced(folio); + if (lru_gen_enabled()) + lru_gen_clear_refs(folio); + else + folio_clear_referenced(folio); /* * Lazyfree folios are clean anonymous folios. They have * the swapbacked flag cleared, to distinguish them from normal @@ -599,7 +625,7 @@ static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio) lruvec_add_folio(lruvec, folio); __count_vm_events(PGLAZYFREE, nr_pages); - __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages); + count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages); } /* @@ -657,6 +683,9 @@ void deactivate_file_folio(struct folio *folio) if (folio_test_unevictable(folio)) return; + if (lru_gen_enabled() && lru_gen_clear_refs(folio)) + return; + folio_batch_add_and_move(folio, lru_deactivate_file, true); } @@ -670,7 +699,10 @@ void deactivate_file_folio(struct folio *folio) */ void folio_deactivate(struct folio *folio) { - if (folio_test_unevictable(folio) || !(folio_test_active(folio) || lru_gen_enabled())) + if (folio_test_unevictable(folio)) + return; + + if (lru_gen_enabled() ? lru_gen_clear_refs(folio) : !folio_test_active(folio)) return; folio_batch_add_and_move(folio, lru_deactivate, true); @@ -924,8 +956,6 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } - if (put_devmap_managed_folio_refs(folio, nr_refs)) - continue; if (folio_ref_sub_and_test(folio, nr_refs)) free_zone_device_folio(folio); continue; @@ -1044,6 +1074,18 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch) fbatch->nr = j; } +static const struct ctl_table swap_sysctl_table[] = { + { + .procname = "page-cluster", + .data = &page_cluster, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = (void *)&page_cluster_max, + } +}; + /* * Perform any setup for the swap system */ @@ -1060,4 +1102,6 @@ void __init swap_setup(void) * Right now other parts of the system means that we * _really_ don't want to cluster much more */ + + register_sysctl_init("vm", swap_sysctl_table); } diff --git a/mm/swap.h b/mm/swap.h index ad2f121de970..9096082a915e 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -3,6 +3,7 @@ #define _MM_SWAP_H struct mempolicy; +extern int page_cluster; #ifdef CONFIG_SWAP #include <linux/swapops.h> /* for swp_offset */ @@ -19,7 +20,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug) __swap_read_unplug(plug); } void swap_write_unplug(struct swap_iocb *sio); -int swap_writepage(struct page *page, struct writeback_control *wbc); +int swap_writeout(struct folio *folio, struct writeback_control *wbc); void __swap_writepage(struct folio *folio, struct writeback_control *wbc); /* linux/mm/swap_state.c */ @@ -50,7 +51,6 @@ static inline pgoff_t swap_cache_index(swp_entry_t entry) } void show_swap_cache_info(void); -bool add_to_swap(struct folio *folio); void *get_shadow_from_swap_cache(swp_entry_t entry); int add_to_swap_cache(struct folio *folio, swp_entry_t entry, gfp_t gfp, void **shadowp); @@ -106,6 +106,25 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, return find_next_bit(sis->zeromap, end, start) - start; } +static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) +{ + struct swap_info_struct *si = swp_swap_info(entry); + pgoff_t offset = swp_offset(entry); + int i; + + /* + * While allocating a large folio and doing mTHP swapin, we need to + * ensure all entries are not cached, otherwise, the mTHP folio will + * be in conflict with the folio in swap cache. + */ + for (i = 0; i < max_nr; i++) { + if ((si->swap_map[offset + i] & SWAP_HAS_CACHE)) + return i; + } + + return i; +} + #else /* CONFIG_SWAP */ struct swap_iocb; static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) @@ -141,7 +160,7 @@ static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, return NULL; } -static inline int swap_writepage(struct page *p, struct writeback_control *wbc) +static inline int swap_writeout(struct folio *f, struct writeback_control *wbc) { return 0; } @@ -163,11 +182,6 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping, return filemap_get_folio(mapping, index); } -static inline bool add_to_swap(struct folio *folio) -{ - return false; -} - static inline void *get_shadow_from_swap_cache(swp_entry_t entry) { return NULL; @@ -204,6 +218,28 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, return 0; } +static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) +{ + return 0; +} #endif /* CONFIG_SWAP */ +/** + * folio_index - File index of a folio. + * @folio: The folio. + * + * For a folio which is either in the page cache or the swap cache, + * return its index within the address_space it belongs to. If you know + * the folio is definitely in the page cache, you can look at the folio's + * index directly. + * + * Return: The index (offset in units of pages) of a folio in its file. + */ +static inline pgoff_t folio_index(struct folio *folio) +{ + if (unlikely(folio_test_swapcache(folio))) + return swap_cache_index(folio->swap); + return folio->index; +} + #endif /* _MM_SWAP_H */ diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index da1278f0563b..de779fed8c21 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -6,149 +6,106 @@ #include <linux/swapops.h> /* depends on mm.h include */ static DEFINE_MUTEX(swap_cgroup_mutex); -struct swap_cgroup_ctrl { - struct page **map; - unsigned long length; - spinlock_t lock; -}; - -static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; +/* Pack two cgroup id (short) of two entries in one swap_cgroup (atomic_t) */ +#define ID_PER_SC (sizeof(struct swap_cgroup) / sizeof(unsigned short)) +#define ID_SHIFT (BITS_PER_TYPE(unsigned short)) +#define ID_MASK (BIT(ID_SHIFT) - 1) struct swap_cgroup { - unsigned short id; + atomic_t ids; }; -#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) - -/* - * SwapCgroup implements "lookup" and "exchange" operations. - * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge - * against SwapCache. At swap_free(), this is accessed directly from swap. - * - * This means, - * - we have no race in "exchange" when we're accessed via SwapCache because - * SwapCache(and its swp_entry) is under lock. - * - When called via swap_free(), there is no user of this entry and no race. - * Then, we don't need lock around "exchange". - * - * TODO: we can push these buffers out to HIGHMEM. - */ - -/* - * allocate buffer for swap_cgroup. - */ -static int swap_cgroup_prepare(int type) -{ - struct page *page; - struct swap_cgroup_ctrl *ctrl; - unsigned long idx, max; - - ctrl = &swap_cgroup_ctrl[type]; - - for (idx = 0; idx < ctrl->length; idx++) { - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) - goto not_enough_page; - ctrl->map[idx] = page; - if (!(idx % SWAP_CLUSTER_MAX)) - cond_resched(); - } - return 0; -not_enough_page: - max = idx; - for (idx = 0; idx < max; idx++) - __free_page(ctrl->map[idx]); +struct swap_cgroup_ctrl { + struct swap_cgroup *map; +}; - return -ENOMEM; -} +static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; -static struct swap_cgroup *__lookup_swap_cgroup(struct swap_cgroup_ctrl *ctrl, - pgoff_t offset) +static unsigned short __swap_cgroup_id_lookup(struct swap_cgroup *map, + pgoff_t offset) { - struct page *mappage; - struct swap_cgroup *sc; + unsigned int shift = (offset % ID_PER_SC) * ID_SHIFT; + unsigned int old_ids = atomic_read(&map[offset / ID_PER_SC].ids); + + BUILD_BUG_ON(!is_power_of_2(ID_PER_SC)); + BUILD_BUG_ON(sizeof(struct swap_cgroup) != sizeof(atomic_t)); - mappage = ctrl->map[offset / SC_PER_PAGE]; - sc = page_address(mappage); - return sc + offset % SC_PER_PAGE; + return (old_ids >> shift) & ID_MASK; } -static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, - struct swap_cgroup_ctrl **ctrlp) +static unsigned short __swap_cgroup_id_xchg(struct swap_cgroup *map, + pgoff_t offset, + unsigned short new_id) { - pgoff_t offset = swp_offset(ent); - struct swap_cgroup_ctrl *ctrl; - - ctrl = &swap_cgroup_ctrl[swp_type(ent)]; - if (ctrlp) - *ctrlp = ctrl; - return __lookup_swap_cgroup(ctrl, offset); + unsigned short old_id; + struct swap_cgroup *sc = &map[offset / ID_PER_SC]; + unsigned int shift = (offset % ID_PER_SC) * ID_SHIFT; + unsigned int new_ids, old_ids = atomic_read(&sc->ids); + + do { + old_id = (old_ids >> shift) & ID_MASK; + new_ids = (old_ids & ~(ID_MASK << shift)); + new_ids |= ((unsigned int)new_id) << shift; + } while (!atomic_try_cmpxchg(&sc->ids, &old_ids, new_ids)); + + return old_id; } /** - * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. - * @ent: swap entry to be cmpxchged - * @old: old id - * @new: new id + * swap_cgroup_record - record mem_cgroup for a set of swap entries. + * These entries must belong to one single folio, and that folio + * must be being charged for swap space (swap out), and these + * entries must not have been charged * - * Returns old id at success, 0 at failure. - * (There is no mem_cgroup using 0 as its id) + * @folio: the folio that the swap entry belongs to + * @id: mem_cgroup ID to be recorded + * @ent: the first swap entry to be recorded */ -unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, - unsigned short old, unsigned short new) +void swap_cgroup_record(struct folio *folio, unsigned short id, + swp_entry_t ent) { - struct swap_cgroup_ctrl *ctrl; - struct swap_cgroup *sc; - unsigned long flags; - unsigned short retval; - - sc = lookup_swap_cgroup(ent, &ctrl); - - spin_lock_irqsave(&ctrl->lock, flags); - retval = sc->id; - if (retval == old) - sc->id = new; - else - retval = 0; - spin_unlock_irqrestore(&ctrl->lock, flags); - return retval; + unsigned int nr_ents = folio_nr_pages(folio); + struct swap_cgroup *map; + pgoff_t offset, end; + unsigned short old; + + offset = swp_offset(ent); + end = offset + nr_ents; + map = swap_cgroup_ctrl[swp_type(ent)].map; + + do { + old = __swap_cgroup_id_xchg(map, offset, id); + VM_BUG_ON(old); + } while (++offset != end); } /** - * swap_cgroup_record - record mem_cgroup for a set of swap entries + * swap_cgroup_clear - clear mem_cgroup for a set of swap entries. + * These entries must be being uncharged from swap. They either + * belongs to one single folio in the swap cache (swap in for + * cgroup v1), or no longer have any users (slot freeing). + * * @ent: the first swap entry to be recorded into - * @id: mem_cgroup to be recorded * @nr_ents: number of swap entries to be recorded * - * Returns old value at success, 0 at failure. - * (Of course, old value can be 0.) + * Returns the existing old value. */ -unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, - unsigned int nr_ents) +unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents) { - struct swap_cgroup_ctrl *ctrl; - struct swap_cgroup *sc; - unsigned short old; - unsigned long flags; - pgoff_t offset = swp_offset(ent); - pgoff_t end = offset + nr_ents; - - sc = lookup_swap_cgroup(ent, &ctrl); - - spin_lock_irqsave(&ctrl->lock, flags); - old = sc->id; - for (;;) { - VM_BUG_ON(sc->id != old); - sc->id = id; - offset++; - if (offset == end) - break; - if (offset % SC_PER_PAGE) - sc++; - else - sc = __lookup_swap_cgroup(ctrl, offset); - } - spin_unlock_irqrestore(&ctrl->lock, flags); + pgoff_t offset, end; + struct swap_cgroup *map; + unsigned short old, iter = 0; + + offset = swp_offset(ent); + end = offset + nr_ents; + map = swap_cgroup_ctrl[swp_type(ent)].map; + + do { + old = __swap_cgroup_id_xchg(map, offset, 0); + if (!iter) + iter = old; + VM_BUG_ON(iter != old); + } while (++offset != end); return old; } @@ -161,39 +118,33 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, */ unsigned short lookup_swap_cgroup_id(swp_entry_t ent) { + struct swap_cgroup_ctrl *ctrl; + if (mem_cgroup_disabled()) return 0; - return lookup_swap_cgroup(ent, NULL)->id; + + ctrl = &swap_cgroup_ctrl[swp_type(ent)]; + return __swap_cgroup_id_lookup(ctrl->map, swp_offset(ent)); } int swap_cgroup_swapon(int type, unsigned long max_pages) { - void *array; - unsigned long length; + struct swap_cgroup *map; struct swap_cgroup_ctrl *ctrl; if (mem_cgroup_disabled()) return 0; - length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); - - array = vcalloc(length, sizeof(void *)); - if (!array) + BUILD_BUG_ON(sizeof(unsigned short) * ID_PER_SC != + sizeof(struct swap_cgroup)); + map = vzalloc(DIV_ROUND_UP(max_pages, ID_PER_SC) * + sizeof(struct swap_cgroup)); + if (!map) goto nomem; ctrl = &swap_cgroup_ctrl[type]; mutex_lock(&swap_cgroup_mutex); - ctrl->length = length; - ctrl->map = array; - spin_lock_init(&ctrl->lock); - if (swap_cgroup_prepare(type)) { - /* memory shortage */ - ctrl->map = NULL; - ctrl->length = 0; - mutex_unlock(&swap_cgroup_mutex); - vfree(array); - goto nomem; - } + ctrl->map = map; mutex_unlock(&swap_cgroup_mutex); return 0; @@ -205,8 +156,7 @@ nomem: void swap_cgroup_swapoff(int type) { - struct page **map; - unsigned long i, length; + struct swap_cgroup *map; struct swap_cgroup_ctrl *ctrl; if (mem_cgroup_disabled()) @@ -215,19 +165,8 @@ void swap_cgroup_swapoff(int type) mutex_lock(&swap_cgroup_mutex); ctrl = &swap_cgroup_ctrl[type]; map = ctrl->map; - length = ctrl->length; ctrl->map = NULL; - ctrl->length = 0; mutex_unlock(&swap_cgroup_mutex); - if (map) { - for (i = 0; i < length; i++) { - struct page *page = map[i]; - if (page) - __free_page(page); - if (!(i % SWAP_CLUSTER_MAX)) - cond_resched(); - } - vfree(map); - } + vfree(map); } diff --git a/mm/swap_slots.c b/mm/swap_slots.c deleted file mode 100644 index 13ab3b771409..000000000000 --- a/mm/swap_slots.c +++ /dev/null @@ -1,353 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Manage cache of swap slots to be used for and returned from - * swap. - * - * Copyright(c) 2016 Intel Corporation. - * - * Author: Tim Chen <tim.c.chen@linux.intel.com> - * - * We allocate the swap slots from the global pool and put - * it into local per cpu caches. This has the advantage - * of no needing to acquire the swap_info lock every time - * we need a new slot. - * - * There is also opportunity to simply return the slot - * to local caches without needing to acquire swap_info - * lock. We do not reuse the returned slots directly but - * move them back to the global pool in a batch. This - * allows the slots to coalesce and reduce fragmentation. - * - * The swap entry allocated is marked with SWAP_HAS_CACHE - * flag in map_count that prevents it from being allocated - * again from the global pool. - * - * The swap slots cache is protected by a mutex instead of - * a spin lock as when we search for slots with scan_swap_map, - * we can possibly sleep. - */ - -#include <linux/swap_slots.h> -#include <linux/cpu.h> -#include <linux/cpumask.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/mutex.h> -#include <linux/mm.h> - -static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots); -static bool swap_slot_cache_active; -bool swap_slot_cache_enabled; -static bool swap_slot_cache_initialized; -static DEFINE_MUTEX(swap_slots_cache_mutex); -/* Serialize swap slots cache enable/disable operations */ -static DEFINE_MUTEX(swap_slots_cache_enable_mutex); - -static void __drain_swap_slots_cache(unsigned int type); - -#define use_swap_slot_cache (swap_slot_cache_active && swap_slot_cache_enabled) -#define SLOTS_CACHE 0x1 -#define SLOTS_CACHE_RET 0x2 - -static void deactivate_swap_slots_cache(void) -{ - mutex_lock(&swap_slots_cache_mutex); - swap_slot_cache_active = false; - __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET); - mutex_unlock(&swap_slots_cache_mutex); -} - -static void reactivate_swap_slots_cache(void) -{ - mutex_lock(&swap_slots_cache_mutex); - swap_slot_cache_active = true; - mutex_unlock(&swap_slots_cache_mutex); -} - -/* Must not be called with cpu hot plug lock */ -void disable_swap_slots_cache_lock(void) -{ - mutex_lock(&swap_slots_cache_enable_mutex); - swap_slot_cache_enabled = false; - if (swap_slot_cache_initialized) { - /* serialize with cpu hotplug operations */ - cpus_read_lock(); - __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET); - cpus_read_unlock(); - } -} - -static void __reenable_swap_slots_cache(void) -{ - swap_slot_cache_enabled = has_usable_swap(); -} - -void reenable_swap_slots_cache_unlock(void) -{ - __reenable_swap_slots_cache(); - mutex_unlock(&swap_slots_cache_enable_mutex); -} - -static bool check_cache_active(void) -{ - long pages; - - if (!swap_slot_cache_enabled) - return false; - - pages = get_nr_swap_pages(); - if (!swap_slot_cache_active) { - if (pages > num_online_cpus() * - THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE) - reactivate_swap_slots_cache(); - goto out; - } - - /* if global pool of slot caches too low, deactivate cache */ - if (pages < num_online_cpus() * THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE) - deactivate_swap_slots_cache(); -out: - return swap_slot_cache_active; -} - -static int alloc_swap_slot_cache(unsigned int cpu) -{ - struct swap_slots_cache *cache; - swp_entry_t *slots, *slots_ret; - - /* - * Do allocation outside swap_slots_cache_mutex - * as kvzalloc could trigger reclaim and folio_alloc_swap, - * which can lock swap_slots_cache_mutex. - */ - slots = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t), - GFP_KERNEL); - if (!slots) - return -ENOMEM; - - slots_ret = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t), - GFP_KERNEL); - if (!slots_ret) { - kvfree(slots); - return -ENOMEM; - } - - mutex_lock(&swap_slots_cache_mutex); - cache = &per_cpu(swp_slots, cpu); - if (cache->slots || cache->slots_ret) { - /* cache already allocated */ - mutex_unlock(&swap_slots_cache_mutex); - - kvfree(slots); - kvfree(slots_ret); - - return 0; - } - - if (!cache->lock_initialized) { - mutex_init(&cache->alloc_lock); - spin_lock_init(&cache->free_lock); - cache->lock_initialized = true; - } - cache->nr = 0; - cache->cur = 0; - cache->n_ret = 0; - /* - * We initialized alloc_lock and free_lock earlier. We use - * !cache->slots or !cache->slots_ret to know if it is safe to acquire - * the corresponding lock and use the cache. Memory barrier below - * ensures the assumption. - */ - mb(); - cache->slots = slots; - cache->slots_ret = slots_ret; - mutex_unlock(&swap_slots_cache_mutex); - return 0; -} - -static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, - bool free_slots) -{ - struct swap_slots_cache *cache; - swp_entry_t *slots = NULL; - - cache = &per_cpu(swp_slots, cpu); - if ((type & SLOTS_CACHE) && cache->slots) { - mutex_lock(&cache->alloc_lock); - swapcache_free_entries(cache->slots + cache->cur, cache->nr); - cache->cur = 0; - cache->nr = 0; - if (free_slots && cache->slots) { - kvfree(cache->slots); - cache->slots = NULL; - } - mutex_unlock(&cache->alloc_lock); - } - if ((type & SLOTS_CACHE_RET) && cache->slots_ret) { - spin_lock_irq(&cache->free_lock); - swapcache_free_entries(cache->slots_ret, cache->n_ret); - cache->n_ret = 0; - if (free_slots && cache->slots_ret) { - slots = cache->slots_ret; - cache->slots_ret = NULL; - } - spin_unlock_irq(&cache->free_lock); - kvfree(slots); - } -} - -static void __drain_swap_slots_cache(unsigned int type) -{ - unsigned int cpu; - - /* - * This function is called during - * 1) swapoff, when we have to make sure no - * left over slots are in cache when we remove - * a swap device; - * 2) disabling of swap slot cache, when we run low - * on swap slots when allocating memory and need - * to return swap slots to global pool. - * - * We cannot acquire cpu hot plug lock here as - * this function can be invoked in the cpu - * hot plug path: - * cpu_up -> lock cpu_hotplug -> cpu hotplug state callback - * -> memory allocation -> direct reclaim -> folio_alloc_swap - * -> drain_swap_slots_cache - * - * Hence the loop over current online cpu below could miss cpu that - * is being brought online but not yet marked as online. - * That is okay as we do not schedule and run anything on a - * cpu before it has been marked online. Hence, we will not - * fill any swap slots in slots cache of such cpu. - * There are no slots on such cpu that need to be drained. - */ - for_each_online_cpu(cpu) - drain_slots_cache_cpu(cpu, type, false); -} - -static int free_slot_cache(unsigned int cpu) -{ - mutex_lock(&swap_slots_cache_mutex); - drain_slots_cache_cpu(cpu, SLOTS_CACHE | SLOTS_CACHE_RET, true); - mutex_unlock(&swap_slots_cache_mutex); - return 0; -} - -void enable_swap_slots_cache(void) -{ - mutex_lock(&swap_slots_cache_enable_mutex); - if (!swap_slot_cache_initialized) { - int ret; - - ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache", - alloc_swap_slot_cache, free_slot_cache); - if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating " - "without swap slots cache.\n", __func__)) - goto out_unlock; - - swap_slot_cache_initialized = true; - } - - __reenable_swap_slots_cache(); -out_unlock: - mutex_unlock(&swap_slots_cache_enable_mutex); -} - -/* called with swap slot cache's alloc lock held */ -static int refill_swap_slots_cache(struct swap_slots_cache *cache) -{ - if (!use_swap_slot_cache) - return 0; - - cache->cur = 0; - if (swap_slot_cache_active) - cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, - cache->slots, 0); - - return cache->nr; -} - -void free_swap_slot(swp_entry_t entry) -{ - struct swap_slots_cache *cache; - - /* Large folio swap slot is not covered. */ - zswap_invalidate(entry); - - cache = raw_cpu_ptr(&swp_slots); - if (likely(use_swap_slot_cache && cache->slots_ret)) { - spin_lock_irq(&cache->free_lock); - /* Swap slots cache may be deactivated before acquiring lock */ - if (!use_swap_slot_cache || !cache->slots_ret) { - spin_unlock_irq(&cache->free_lock); - goto direct_free; - } - if (cache->n_ret >= SWAP_SLOTS_CACHE_SIZE) { - /* - * Return slots to global pool. - * The current swap_map value is SWAP_HAS_CACHE. - * Set it to 0 to indicate it is available for - * allocation in global pool - */ - swapcache_free_entries(cache->slots_ret, cache->n_ret); - cache->n_ret = 0; - } - cache->slots_ret[cache->n_ret++] = entry; - spin_unlock_irq(&cache->free_lock); - } else { -direct_free: - swapcache_free_entries(&entry, 1); - } -} - -swp_entry_t folio_alloc_swap(struct folio *folio) -{ - swp_entry_t entry; - struct swap_slots_cache *cache; - - entry.val = 0; - - if (folio_test_large(folio)) { - if (IS_ENABLED(CONFIG_THP_SWAP)) - get_swap_pages(1, &entry, folio_order(folio)); - goto out; - } - - /* - * Preemption is allowed here, because we may sleep - * in refill_swap_slots_cache(). But it is safe, because - * accesses to the per-CPU data structure are protected by the - * mutex cache->alloc_lock. - * - * The alloc path here does not touch cache->slots_ret - * so cache->free_lock is not taken. - */ - cache = raw_cpu_ptr(&swp_slots); - - if (likely(check_cache_active() && cache->slots)) { - mutex_lock(&cache->alloc_lock); - if (cache->slots) { -repeat: - if (cache->nr) { - entry = cache->slots[cache->cur]; - cache->slots[cache->cur++].val = 0; - cache->nr--; - } else if (refill_swap_slots_cache(cache)) { - goto repeat; - } - } - mutex_unlock(&cache->alloc_lock); - if (entry.val) - goto out; - } - - get_swap_pages(1, &entry, 0); -out: - if (mem_cgroup_try_charge_swap(folio, entry)) { - put_swap_folio(folio, entry); - entry.val = 0; - } - return entry; -} diff --git a/mm/swap_state.c b/mm/swap_state.c index e0c0321b8ff7..c354435a0923 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -20,7 +20,6 @@ #include <linux/blkdev.h> #include <linux/migrate.h> #include <linux/vmalloc.h> -#include <linux/swap_slots.h> #include <linux/huge_mm.h> #include <linux/shmem_fs.h> #include "internal.h" @@ -31,7 +30,6 @@ * vmscan's shrink_folio_list. */ static const struct address_space_operations swap_aops = { - .writepage = swap_writepage, .dirty_folio = noop_dirty_folio, #ifdef CONFIG_MIGRATION .migrate_folio = migrate_folio, @@ -85,7 +83,7 @@ void *get_shadow_from_swap_cache(swp_entry_t entry) /* * add_to_swap_cache resembles filemap_add_folio on swapper_space, - * but sets SwapCache flag and private instead of mapping and index. + * but sets SwapCache flag and 'swap' instead of mapping and index. */ int add_to_swap_cache(struct folio *folio, swp_entry_t entry, gfp_t gfp, void **shadowp) @@ -167,67 +165,6 @@ void __delete_from_swap_cache(struct folio *folio, __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr); } -/** - * add_to_swap - allocate swap space for a folio - * @folio: folio we want to move to swap - * - * Allocate swap space for the folio and add the folio to the - * swap cache. - * - * Context: Caller needs to hold the folio lock. - * Return: Whether the folio was added to the swap cache. - */ -bool add_to_swap(struct folio *folio) -{ - swp_entry_t entry; - int err; - - VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio); - - entry = folio_alloc_swap(folio); - if (!entry.val) - return false; - - /* - * XArray node allocations from PF_MEMALLOC contexts could - * completely exhaust the page allocator. __GFP_NOMEMALLOC - * stops emergency reserves from being allocated. - * - * TODO: this could cause a theoretical memory reclaim - * deadlock in the swap out path. - */ - /* - * Add it to the swap cache. - */ - err = add_to_swap_cache(folio, entry, - __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL); - if (err) - /* - * add_to_swap_cache() doesn't return -EEXIST, so we can safely - * clear SWAP_HAS_CACHE flag. - */ - goto fail; - /* - * Normally the folio will be dirtied in unmap because its - * pte should be dirty. A special case is MADV_FREE page. The - * page's pte could have dirty bit cleared but the folio's - * SwapBacked flag is still set because clearing the dirty bit - * and SwapBacked flag has no lock protected. For such folio, - * unmap will not set dirty bit for it, so folio reclaim will - * not write the folio out. This can cause data corruption when - * the folio is swapped in later. Always setting the dirty flag - * for the folio solves the problem. - */ - folio_mark_dirty(folio); - - return true; - -fail: - put_swap_folio(folio, entry); - return false; -} - /* * This must be called only on folios that have * been verified to be in the swap cache and locked. @@ -270,9 +207,7 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, xa_unlock_irq(&address_space->i_pages); /* search the next swapcache until we meet end */ - curr >>= SWAP_ADDRESS_SPACE_SHIFT; - curr++; - curr <<= SWAP_ADDRESS_SPACE_SHIFT; + curr = ALIGN((curr + 1), SWAP_ADDRESS_SPACE_PAGES); if (curr > end) break; } @@ -296,13 +231,11 @@ void free_swap_cache(struct folio *folio) } /* - * Perform a free_page(), also freeing any swap cache associated with - * this page if it is the last user of the page. + * Freeing a folio and also freeing any swap cache associated with + * this folio if it is the last user. */ -void free_page_and_swap_cache(struct page *page) +void free_folio_and_swap_cache(struct folio *folio) { - struct folio *folio = page_folio(page); - free_swap_cache(folio); if (!is_huge_zero_folio(folio)) folio_put(folio); @@ -317,7 +250,6 @@ void free_pages_and_swap_cache(struct encoded_page **pages, int nr) struct folio_batch folios; unsigned int refs[PAGEVEC_SIZE]; - lru_add_drain(); folio_batch_init(&folios); for (int i = 0; i < nr; i++) { struct folio *folio = page_folio(encoded_page_ptr(pages[i])); @@ -433,17 +365,13 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, bool skip_if_exists) { - struct swap_info_struct *si; + struct swap_info_struct *si = swp_swap_info(entry); struct folio *folio; struct folio *new_folio = NULL; struct folio *result = NULL; void *shadow = NULL; *new_page_allocated = false; - si = get_swap_device(entry); - if (!si) - return NULL; - for (;;) { int err; /* @@ -458,13 +386,8 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * Just skip read ahead for unused swap slot. - * During swap_off when swap_slot_cache is disabled, - * we have to handle the race between putting - * swap entry in swap cache and marking swap slot - * as SWAP_HAS_CACHE. That's done in later part of code or - * else swap_off will be aborted if we return NULL. */ - if (!swap_swapcount(si, entry) && swap_slot_cache_enabled) + if (!swap_entry_swapped(si, entry)) goto put_and_return; /* @@ -522,7 +445,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, if (add_to_swap_cache(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) goto fail_unlock; - mem_cgroup_swapin_uncharge_swap(entry, 1); + memcg1_swapin(entry, 1); if (shadow) workingset_refault(new_folio, shadow); @@ -539,7 +462,6 @@ fail_unlock: put_swap_folio(new_folio, entry); folio_unlock(new_folio); put_and_return: - put_swap_device(si); if (!(*new_page_allocated) && new_folio) folio_put(new_folio); return result; @@ -559,11 +481,16 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, struct swap_iocb **plug) { + struct swap_info_struct *si; bool page_allocated; struct mempolicy *mpol; pgoff_t ilx; struct folio *folio; + si = get_swap_device(entry); + if (!si) + return NULL; + mpol = get_vma_policy(vma, addr, 0, &ilx); folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); @@ -571,6 +498,8 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, if (page_allocated) swap_read_folio(folio, plug); + + put_swap_device(si); return folio; } diff --git a/mm/swapfile.c b/mm/swapfile.c index b0a9071cfe1d..68ce283e84be 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -37,7 +37,6 @@ #include <linux/oom.h> #include <linux/swapfile.h> #include <linux/export.h> -#include <linux/swap_slots.h> #include <linux/sort.h> #include <linux/completion.h> #include <linux/suspend.h> @@ -53,15 +52,15 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); -static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, - unsigned int nr_pages); -static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, +static void swap_entries_free(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned int nr_pages); +static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); static bool folio_swapcache_freeable(struct folio *folio); -static struct swap_cluster_info *lock_cluster_or_swap_info( - struct swap_info_struct *si, unsigned long offset); -static void unlock_cluster_or_swap_info(struct swap_info_struct *si, - struct swap_cluster_info *ci); +static struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, + unsigned long offset); +static inline void unlock_cluster(struct swap_cluster_info *ci); static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; @@ -116,6 +115,18 @@ static atomic_t proc_poll_event = ATOMIC_INIT(0); atomic_t nr_rotate_swap = ATOMIC_INIT(0); +struct percpu_swap_cluster { + struct swap_info_struct *si[SWAP_NR_ORDERS]; + unsigned long offset[SWAP_NR_ORDERS]; + local_lock_t lock; +}; + +static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = { + .si = { NULL }, + .offset = { SWAP_ENTRY_INVALID }, + .lock = INIT_LOCAL_LOCK(), +}; + static struct swap_info_struct *swap_type_to_swap_info(int type) { if (type >= MAX_SWAPFILES) @@ -129,6 +140,26 @@ static inline unsigned char swap_count(unsigned char ent) return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ } +/* + * Use the second highest bit of inuse_pages counter as the indicator + * if one swap device is on the available plist, so the atomic can + * still be updated arithmetically while having special data embedded. + * + * inuse_pages counter is the only thing indicating if a device should + * be on avail_lists or not (except swapon / swapoff). By embedding the + * off-list bit in the atomic counter, updates no longer need any lock + * to check the list status. + * + * This bit will be set if the device is not on the plist and not + * usable, will be cleared if the device is on the plist. + */ +#define SWAP_USAGE_OFFLIST_BIT (1UL << (BITS_PER_TYPE(atomic_t) - 2)) +#define SWAP_USAGE_COUNTER_MASK (~SWAP_USAGE_OFFLIST_BIT) +static long swap_usage_in_pages(struct swap_info_struct *si) +{ + return atomic_long_read(&si->inuse_pages) & SWAP_USAGE_COUNTER_MASK; +} + /* Reclaim the swap entry anyway if possible */ #define TTRS_ANYWAY 0x1 /* @@ -138,10 +169,8 @@ static inline unsigned char swap_count(unsigned char ent) #define TTRS_UNMAPPED 0x2 /* Reclaim the swap entry if swap is getting full */ #define TTRS_FULL 0x4 -/* Reclaim directly, bypass the slot cache and don't touch device lock */ -#define TTRS_DIRECT 0x8 -static bool swap_is_has_cache(struct swap_info_struct *si, +static bool swap_only_has_cache(struct swap_info_struct *si, unsigned long offset, int nr_pages) { unsigned char *map = si->swap_map + offset; @@ -163,7 +192,7 @@ static bool swap_is_last_map(struct swap_info_struct *si, unsigned char *map_end = map + nr_pages; unsigned char count = *map; - if (swap_count(count) != 1) + if (swap_count(count) != 1 && swap_count(count) != SWAP_MAP_SHMEM) return false; while (++map < map_end) { @@ -190,6 +219,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, int ret, nr_pages; bool need_reclaim; +again: folio = filemap_get_folio(address_space, swap_cache_index(entry)); if (IS_ERR(folio)) return 0; @@ -207,8 +237,16 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, if (!folio_trylock(folio)) goto out; - /* offset could point to the middle of a large folio */ + /* + * Offset could point to the middle of a large folio, or folio + * may no longer point to the expected offset before it's locked. + */ entry = folio->swap; + if (offset < swp_offset(entry) || offset >= swp_offset(entry) + nr_pages) { + folio_unlock(folio); + folio_put(folio); + goto again; + } offset = swp_offset(entry); need_reclaim = ((flags & TTRS_ANYWAY) || @@ -222,32 +260,14 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, * swap_map is HAS_CACHE only, which means the slots have no page table * reference or pending writeback, and can't be allocated to others. */ - ci = lock_cluster_or_swap_info(si, offset); - need_reclaim = swap_is_has_cache(si, offset, nr_pages); - unlock_cluster_or_swap_info(si, ci); + ci = lock_cluster(si, offset); + need_reclaim = swap_only_has_cache(si, offset, nr_pages); + unlock_cluster(ci); if (!need_reclaim) goto out_unlock; - if (!(flags & TTRS_DIRECT)) { - /* Free through slot cache */ - delete_from_swap_cache(folio); - folio_set_dirty(folio); - ret = nr_pages; - goto out_unlock; - } - - xa_lock_irq(&address_space->i_pages); - __delete_from_swap_cache(folio, entry, NULL); - xa_unlock_irq(&address_space->i_pages); - folio_ref_sub(folio, nr_pages); + delete_from_swap_cache(folio); folio_set_dirty(folio); - - spin_lock(&si->lock); - /* Only sinple page folio can be backed by zswap */ - if (nr_pages == 1) - zswap_invalidate(entry); - swap_entry_range_free(si, entry, nr_pages); - spin_unlock(&si->lock); ret = nr_pages; out_unlock: folio_unlock(folio); @@ -382,9 +402,23 @@ static void discard_swap_cluster(struct swap_info_struct *si, #endif #define LATENCY_LIMIT 256 -static inline bool cluster_is_free(struct swap_cluster_info *info) +static inline bool cluster_is_empty(struct swap_cluster_info *info) +{ + return info->count == 0; +} + +static inline bool cluster_is_discard(struct swap_cluster_info *info) +{ + return info->flags == CLUSTER_FLAG_DISCARD; +} + +static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order) { - return info->flags & CLUSTER_FLAG_FREE; + if (unlikely(ci->flags > CLUSTER_FLAG_USABLE)) + return false; + if (!order) + return true; + return cluster_is_empty(ci) || order == ci->order; } static inline unsigned int cluster_index(struct swap_info_struct *si, @@ -393,6 +427,12 @@ static inline unsigned int cluster_index(struct swap_info_struct *si, return ci - si->cluster_info; } +static inline struct swap_cluster_info *offset_to_cluster(struct swap_info_struct *si, + unsigned long offset) +{ + return &si->cluster_info[offset / SWAPFILE_CLUSTER]; +} + static inline unsigned int cluster_offset(struct swap_info_struct *si, struct swap_cluster_info *ci) { @@ -404,105 +444,134 @@ static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si { struct swap_cluster_info *ci; - ci = si->cluster_info; - if (ci) { - ci += offset / SWAPFILE_CLUSTER; - spin_lock(&ci->lock); - } + ci = offset_to_cluster(si, offset); + spin_lock(&ci->lock); + return ci; } static inline void unlock_cluster(struct swap_cluster_info *ci) { - if (ci) - spin_unlock(&ci->lock); + spin_unlock(&ci->lock); } -/* - * Determine the locking method in use for this device. Return - * swap_cluster_info if SSD-style cluster-based locking is in place. - */ -static inline struct swap_cluster_info *lock_cluster_or_swap_info( - struct swap_info_struct *si, unsigned long offset) +static void move_cluster(struct swap_info_struct *si, + struct swap_cluster_info *ci, struct list_head *list, + enum swap_cluster_flags new_flags) { - struct swap_cluster_info *ci; + VM_WARN_ON(ci->flags == new_flags); - /* Try to use fine-grained SSD-style locking if available: */ - ci = lock_cluster(si, offset); - /* Otherwise, fall back to traditional, coarse locking: */ - if (!ci) - spin_lock(&si->lock); - - return ci; -} + BUILD_BUG_ON(1 << sizeof(ci->flags) * BITS_PER_BYTE < CLUSTER_FLAG_MAX); + lockdep_assert_held(&ci->lock); -static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si, - struct swap_cluster_info *ci) -{ - if (ci) - unlock_cluster(ci); + spin_lock(&si->lock); + if (ci->flags == CLUSTER_FLAG_NONE) + list_add_tail(&ci->list, list); else - spin_unlock(&si->lock); + list_move_tail(&ci->list, list); + spin_unlock(&si->lock); + + if (ci->flags == CLUSTER_FLAG_FRAG) + atomic_long_dec(&si->frag_cluster_nr[ci->order]); + else if (new_flags == CLUSTER_FLAG_FRAG) + atomic_long_inc(&si->frag_cluster_nr[ci->order]); + ci->flags = new_flags; } /* Add a cluster to discard list and schedule it to do discard */ static void swap_cluster_schedule_discard(struct swap_info_struct *si, struct swap_cluster_info *ci) { - unsigned int idx = cluster_index(si, ci); - /* - * If scan_swap_map_slots() can't find a free cluster, it will check - * si->swap_map directly. To make sure the discarding cluster isn't - * taken by scan_swap_map_slots(), mark the swap entries bad (occupied). - * It will be cleared after discard - */ - memset(si->swap_map + idx * SWAPFILE_CLUSTER, - SWAP_MAP_BAD, SWAPFILE_CLUSTER); - - VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); - list_move_tail(&ci->list, &si->discard_clusters); - ci->flags = 0; + VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE); + move_cluster(si, ci, &si->discard_clusters, CLUSTER_FLAG_DISCARD); schedule_work(&si->discard_work); } static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { - lockdep_assert_held(&si->lock); lockdep_assert_held(&ci->lock); - - if (ci->flags) - list_move_tail(&ci->list, &si->free_clusters); - else - list_add_tail(&ci->list, &si->free_clusters); - ci->flags = CLUSTER_FLAG_FREE; + move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE); ci->order = 0; } /* + * Isolate and lock the first cluster that is not contented on a list, + * clean its flag before taken off-list. Cluster flag must be in sync + * with list status, so cluster updaters can always know the cluster + * list status without touching si lock. + * + * Note it's possible that all clusters on a list are contented so + * this returns NULL for an non-empty list. + */ +static struct swap_cluster_info *isolate_lock_cluster( + struct swap_info_struct *si, struct list_head *list) +{ + struct swap_cluster_info *ci, *ret = NULL; + + spin_lock(&si->lock); + + if (unlikely(!(si->flags & SWP_WRITEOK))) + goto out; + + list_for_each_entry(ci, list, list) { + if (!spin_trylock(&ci->lock)) + continue; + + /* We may only isolate and clear flags of following lists */ + VM_BUG_ON(!ci->flags); + VM_BUG_ON(ci->flags > CLUSTER_FLAG_USABLE && + ci->flags != CLUSTER_FLAG_FULL); + + list_del(&ci->list); + ci->flags = CLUSTER_FLAG_NONE; + ret = ci; + break; + } +out: + spin_unlock(&si->lock); + + return ret; +} + +/* * Doing discard actually. After a cluster discard is finished, the cluster - * will be added to free cluster list. caller should hold si->lock. -*/ -static void swap_do_scheduled_discard(struct swap_info_struct *si) + * will be added to free cluster list. Discard cluster is a bit special as + * they don't participate in allocation or reclaim, so clusters marked as + * CLUSTER_FLAG_DISCARD must remain off-list or on discard list. + */ +static bool swap_do_scheduled_discard(struct swap_info_struct *si) { struct swap_cluster_info *ci; + bool ret = false; unsigned int idx; + spin_lock(&si->lock); while (!list_empty(&si->discard_clusters)) { ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list); + /* + * Delete the cluster from list to prepare for discard, but keep + * the CLUSTER_FLAG_DISCARD flag, percpu_swap_cluster could be + * pointing to it, or ran into by relocate_cluster. + */ list_del(&ci->list); idx = cluster_index(si, ci); spin_unlock(&si->lock); - discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, SWAPFILE_CLUSTER); - spin_lock(&si->lock); spin_lock(&ci->lock); + /* + * Discard is done, clear its flags as it's off-list, then + * return the cluster to allocation list. + */ + ci->flags = CLUSTER_FLAG_NONE; __free_cluster(si, ci); - memset(si->swap_map + idx * SWAPFILE_CLUSTER, - 0, SWAPFILE_CLUSTER); spin_unlock(&ci->lock); + ret = true; + spin_lock(&si->lock); } + spin_unlock(&si->lock); + return ret; } static void swap_discard_work(struct work_struct *work) @@ -511,9 +580,7 @@ static void swap_discard_work(struct work_struct *work) si = container_of(work, struct swap_info_struct, discard_work); - spin_lock(&si->lock); swap_do_scheduled_discard(si); - spin_unlock(&si->lock); } static void swap_users_ref_free(struct percpu_ref *ref) @@ -524,15 +591,16 @@ static void swap_users_ref_free(struct percpu_ref *ref) complete(&si->comp); } +/* + * Must be called after freeing if ci->count == 0, moves the cluster to free + * or discard list. + */ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { VM_BUG_ON(ci->count != 0); - lockdep_assert_held(&si->lock); + VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE); lockdep_assert_held(&ci->lock); - if (ci->flags & CLUSTER_FLAG_FRAG) - si->frag_cluster_nr[ci->order]--; - /* * If the swap is discardable, prepare discard the cluster * instead of free it immediately. The cluster will be freed @@ -548,6 +616,49 @@ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info * } /* + * Must be called after freeing if ci->count != 0, moves the cluster to + * nonfull list. + */ +static void partial_free_cluster(struct swap_info_struct *si, + struct swap_cluster_info *ci) +{ + VM_BUG_ON(!ci->count || ci->count == SWAPFILE_CLUSTER); + lockdep_assert_held(&ci->lock); + + if (ci->flags != CLUSTER_FLAG_NONFULL) + move_cluster(si, ci, &si->nonfull_clusters[ci->order], + CLUSTER_FLAG_NONFULL); +} + +/* + * Must be called after allocation, moves the cluster to full or frag list. + * Note: allocation doesn't acquire si lock, and may drop the ci lock for + * reclaim, so the cluster could be any where when called. + */ +static void relocate_cluster(struct swap_info_struct *si, + struct swap_cluster_info *ci) +{ + lockdep_assert_held(&ci->lock); + + /* Discard cluster must remain off-list or on discard list */ + if (cluster_is_discard(ci)) + return; + + if (!ci->count) { + if (ci->flags != CLUSTER_FLAG_FREE) + free_cluster(si, ci); + } else if (ci->count != SWAPFILE_CLUSTER) { + if (ci->flags != CLUSTER_FLAG_FRAG) + move_cluster(si, ci, &si->frag_clusters[ci->order], + CLUSTER_FLAG_FRAG); + } else { + if (ci->flags != CLUSTER_FLAG_FULL) + move_cluster(si, ci, &si->full_clusters, + CLUSTER_FLAG_FULL); + } +} + +/* * The cluster corresponding to page_nr will be used. The cluster will not be * added to free cluster list and its usage counter will be increased by 1. * Only used for initialization. @@ -558,9 +669,6 @@ static void inc_cluster_info_page(struct swap_info_struct *si, unsigned long idx = page_nr / SWAPFILE_CLUSTER; struct swap_cluster_info *ci; - if (!cluster_info) - return; - ci = cluster_info + idx; ci->count++; @@ -568,63 +676,33 @@ static void inc_cluster_info_page(struct swap_info_struct *si, VM_BUG_ON(ci->flags); } -/* - * The cluster ci decreases @nr_pages usage. If the usage counter becomes 0, - * which means no page in the cluster is in use, we can optionally discard - * the cluster and add it to free cluster list. - */ -static void dec_cluster_info_page(struct swap_info_struct *si, - struct swap_cluster_info *ci, int nr_pages) -{ - if (!si->cluster_info) - return; - - VM_BUG_ON(ci->count < nr_pages); - VM_BUG_ON(cluster_is_free(ci)); - lockdep_assert_held(&si->lock); - lockdep_assert_held(&ci->lock); - ci->count -= nr_pages; - - if (!ci->count) { - free_cluster(si, ci); - return; - } - - if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { - VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); - if (ci->flags & CLUSTER_FLAG_FRAG) - si->frag_cluster_nr[ci->order]--; - list_move_tail(&ci->list, &si->nonfull_clusters[ci->order]); - ci->flags = CLUSTER_FLAG_NONFULL; - } -} - static bool cluster_reclaim_range(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned long start, unsigned long end) { unsigned char *map = si->swap_map; - unsigned long offset; + unsigned long offset = start; + int nr_reclaim; spin_unlock(&ci->lock); - spin_unlock(&si->lock); - - for (offset = start; offset < end; offset++) { + do { switch (READ_ONCE(map[offset])) { case 0: - continue; + offset++; + break; case SWAP_HAS_CACHE: - if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0) - continue; - goto out; + nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); + if (nr_reclaim > 0) + offset += nr_reclaim; + else + goto out; + break; default: goto out; } - } + } while (offset < end); out: - spin_lock(&si->lock); spin_lock(&ci->lock); - /* * Recheck the range no matter reclaim succeeded or not, the slot * could have been be freed while we are not holding the lock. @@ -638,11 +716,14 @@ out: static bool cluster_scan_range(struct swap_info_struct *si, struct swap_cluster_info *ci, - unsigned long start, unsigned int nr_pages) + unsigned long start, unsigned int nr_pages, + bool *need_reclaim) { unsigned long offset, end = start + nr_pages; unsigned char *map = si->swap_map; - bool need_reclaim = false; + + if (cluster_is_empty(ci)) + return true; for (offset = start; offset < end; offset++) { switch (READ_ONCE(map[offset])) { @@ -651,16 +732,13 @@ static bool cluster_scan_range(struct swap_info_struct *si, case SWAP_HAS_CACHE: if (!vm_swap_full()) return false; - need_reclaim = true; + *need_reclaim = true; continue; default: return false; } } - if (need_reclaim) - return cluster_reclaim_range(si, ci, start, end); - return true; } @@ -670,76 +748,83 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster { unsigned int nr_pages = 1 << order; + lockdep_assert_held(&ci->lock); + if (!(si->flags & SWP_WRITEOK)) return false; - if (cluster_is_free(ci)) { - if (nr_pages < SWAPFILE_CLUSTER) { - list_move_tail(&ci->list, &si->nonfull_clusters[order]); - ci->flags = CLUSTER_FLAG_NONFULL; - } + /* + * The first allocation in a cluster makes the + * cluster exclusive to this order + */ + if (cluster_is_empty(ci)) ci->order = order; - } memset(si->swap_map + start, usage, nr_pages); - swap_range_alloc(si, start, nr_pages); + swap_range_alloc(si, nr_pages); ci->count += nr_pages; - if (ci->count == SWAPFILE_CLUSTER) { - VM_BUG_ON(!(ci->flags & - (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG))); - if (ci->flags & CLUSTER_FLAG_FRAG) - si->frag_cluster_nr[ci->order]--; - list_move_tail(&ci->list, &si->full_clusters); - ci->flags = CLUSTER_FLAG_FULL; - } - return true; } -static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset, - unsigned int *foundp, unsigned int order, +/* Try use a new cluster for current CPU and allocate from it. */ +static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned long offset, + unsigned int order, unsigned char usage) { - unsigned long start = offset & ~(SWAPFILE_CLUSTER - 1); + unsigned int next = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID; + unsigned long start = ALIGN_DOWN(offset, SWAPFILE_CLUSTER); unsigned long end = min(start + SWAPFILE_CLUSTER, si->max); unsigned int nr_pages = 1 << order; - struct swap_cluster_info *ci; + bool need_reclaim, ret; - if (end < nr_pages) - return SWAP_NEXT_INVALID; - end -= nr_pages; + lockdep_assert_held(&ci->lock); - ci = lock_cluster(si, offset); - if (ci->count + nr_pages > SWAPFILE_CLUSTER) { - offset = SWAP_NEXT_INVALID; - goto done; - } + if (end < nr_pages || ci->count + nr_pages > SWAPFILE_CLUSTER) + goto out; - while (offset <= end) { - if (cluster_scan_range(si, ci, offset, nr_pages)) { - if (!cluster_alloc_range(si, ci, offset, usage, order)) { - offset = SWAP_NEXT_INVALID; - goto done; - } - *foundp = offset; - if (ci->count == SWAPFILE_CLUSTER) { - offset = SWAP_NEXT_INVALID; - goto done; - } - offset += nr_pages; - break; + for (end -= nr_pages; offset <= end; offset += nr_pages) { + need_reclaim = false; + if (!cluster_scan_range(si, ci, offset, nr_pages, &need_reclaim)) + continue; + if (need_reclaim) { + ret = cluster_reclaim_range(si, ci, offset, offset + nr_pages); + /* + * Reclaim drops ci->lock and cluster could be used + * by another order. Not checking flag as off-list + * cluster has no flag set, and change of list + * won't cause fragmentation. + */ + if (!cluster_is_usable(ci, order)) + goto out; + if (cluster_is_empty(ci)) + offset = start; + /* Reclaim failed but cluster is usable, try next */ + if (!ret) + continue; } + if (!cluster_alloc_range(si, ci, offset, usage, order)) + break; + found = offset; offset += nr_pages; + if (ci->count < SWAPFILE_CLUSTER && offset <= end) + next = offset; + break; } - if (offset > end) - offset = SWAP_NEXT_INVALID; -done: +out: + relocate_cluster(si, ci); unlock_cluster(ci); - return offset; + if (si->flags & SWP_SOLIDSTATE) { + this_cpu_write(percpu_swap_cluster.offset[order], next); + this_cpu_write(percpu_swap_cluster.si[order], si); + } else { + si->global_cluster->next[order] = next; + } + return found; } -/* Return true if reclaimed a whole cluster */ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) { long to_scan = 1; @@ -749,20 +834,19 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) int nr_reclaim; if (force) - to_scan = si->inuse_pages / SWAPFILE_CLUSTER; + to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER; - while (!list_empty(&si->full_clusters)) { - ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list); - list_move_tail(&ci->list, &si->full_clusters); + while ((ci = isolate_lock_cluster(si, &si->full_clusters))) { offset = cluster_offset(si, ci); end = min(si->max, offset + SWAPFILE_CLUSTER); to_scan--; - spin_unlock(&si->lock); while (offset < end) { if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { + spin_unlock(&ci->lock); nr_reclaim = __try_to_reclaim_swap(si, offset, - TTRS_ANYWAY | TTRS_DIRECT); + TTRS_ANYWAY); + spin_lock(&ci->lock); if (nr_reclaim) { offset += abs(nr_reclaim); continue; @@ -770,8 +854,12 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) } offset++; } - spin_lock(&si->lock); + /* in case no swap cache is reclaimed */ + if (ci->flags == CLUSTER_FLAG_NONE) + relocate_cluster(si, ci); + + unlock_cluster(ci); if (to_scan <= 0) break; } @@ -783,42 +871,54 @@ static void swap_reclaim_work(struct work_struct *work) si = container_of(work, struct swap_info_struct, reclaim_work); - spin_lock(&si->lock); swap_reclaim_full_clusters(si, true); - spin_unlock(&si->lock); } /* - * Try to get swap entries with specified order from current cpu's swap entry - * pool (a cluster). This might involve allocating a new cluster for current CPU - * too. + * Try to allocate swap entries with specified order and try set a new + * cluster for current CPU too. */ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order, unsigned char usage) { - struct percpu_cluster *cluster; struct swap_cluster_info *ci; - unsigned int offset, found = 0; + unsigned int offset = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID; -new_cluster: - lockdep_assert_held(&si->lock); - cluster = this_cpu_ptr(si->percpu_cluster); - offset = cluster->next[order]; - if (offset) { - offset = alloc_swap_scan_cluster(si, offset, &found, order, usage); + /* + * Swapfile is not block device so unable + * to allocate large entries. + */ + if (order && !(si->flags & SWP_BLKDEV)) + return 0; + + if (!(si->flags & SWP_SOLIDSTATE)) { + /* Serialize HDD SWAP allocation for each device. */ + spin_lock(&si->global_cluster_lock); + offset = si->global_cluster->next[order]; + if (offset == SWAP_ENTRY_INVALID) + goto new_cluster; + + ci = lock_cluster(si, offset); + /* Cluster could have been used by another order */ + if (cluster_is_usable(ci, order)) { + if (cluster_is_empty(ci)) + offset = cluster_offset(si, ci); + found = alloc_swap_scan_cluster(si, ci, offset, + order, usage); + } else { + unlock_cluster(ci); + } if (found) goto done; } - if (!list_empty(&si->free_clusters)) { - ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); - /* - * Either we didn't touch the cluster due to swapoff, - * or the allocation must success. - */ - VM_BUG_ON((si->flags & SWP_WRITEOK) && !found); - goto done; +new_cluster: + ci = isolate_lock_cluster(si, &si->free_clusters); + if (ci) { + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), + order, usage); + if (found) + goto done; } /* Try reclaim from full clusters if free clusters list is drained */ @@ -826,56 +926,42 @@ new_cluster: swap_reclaim_full_clusters(si, false); if (order < PMD_ORDER) { - unsigned int frags = 0; - - while (!list_empty(&si->nonfull_clusters[order])) { - ci = list_first_entry(&si->nonfull_clusters[order], - struct swap_cluster_info, list); - list_move_tail(&ci->list, &si->frag_clusters[order]); - ci->flags = CLUSTER_FLAG_FRAG; - si->frag_cluster_nr[order]++; - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), - &found, order, usage); - frags++; + unsigned int frags = 0, frags_existing; + + while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) { + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), + order, usage); if (found) - break; + goto done; + /* Clusters failed to allocate are moved to frag_clusters */ + frags++; } - if (!found) { + frags_existing = atomic_long_read(&si->frag_cluster_nr[order]); + while (frags < frags_existing && + (ci = isolate_lock_cluster(si, &si->frag_clusters[order]))) { + atomic_long_dec(&si->frag_cluster_nr[order]); /* - * Nonfull clusters are moved to frag tail if we reached - * here, count them too, don't over scan the frag list. + * Rotate the frag list to iterate, they were all + * failing high order allocation or moved here due to + * per-CPU usage, but they could contain newly released + * reclaimable (eg. lazy-freed swap cache) slots. */ - while (frags < si->frag_cluster_nr[order]) { - ci = list_first_entry(&si->frag_clusters[order], - struct swap_cluster_info, list); - /* - * Rotate the frag list to iterate, they were all failing - * high order allocation or moved here due to per-CPU usage, - * this help keeping usable cluster ahead. - */ - list_move_tail(&ci->list, &si->frag_clusters[order]); - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), - &found, order, usage); - frags++; - if (found) - break; - } + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), + order, usage); + if (found) + goto done; + frags++; } } - if (found) - goto done; - - if (!list_empty(&si->discard_clusters)) { - /* - * we don't have free cluster but have some clusters in - * discarding, do discard now and reclaim them, then - * reread cluster_next_cpu since we dropped si->lock - */ - swap_do_scheduled_discard(si); + /* + * We don't have free cluster but have some clusters in + * discarding, do discard now and reclaim them, then + * reread cluster_next_cpu since we dropped si->lock + */ + if ((si->flags & SWP_PAGE_DISCARD) && swap_do_scheduled_discard(si)) goto new_cluster; - } if (order) goto done; @@ -886,74 +972,149 @@ new_cluster: * Clusters here have at least one usable slots and can't fail order 0 * allocation, but reclaim may drop si->lock and race with another user. */ - while (!list_empty(&si->frag_clusters[o])) { - ci = list_first_entry(&si->frag_clusters[o], - struct swap_cluster_info, list); - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), - &found, 0, usage); + while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) { + atomic_long_dec(&si->frag_cluster_nr[o]); + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), + 0, usage); if (found) goto done; } - while (!list_empty(&si->nonfull_clusters[o])) { - ci = list_first_entry(&si->nonfull_clusters[o], - struct swap_cluster_info, list); - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), - &found, 0, usage); + while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[o]))) { + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), + 0, usage); if (found) goto done; } } - done: - cluster->next[order] = offset; + if (!(si->flags & SWP_SOLIDSTATE)) + spin_unlock(&si->global_cluster_lock); return found; } -static void __del_from_avail_list(struct swap_info_struct *si) +/* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */ +static void del_from_avail_list(struct swap_info_struct *si, bool swapoff) { int nid; + unsigned long pages; + + spin_lock(&swap_avail_lock); + + if (swapoff) { + /* + * Forcefully remove it. Clear the SWP_WRITEOK flags for + * swapoff here so it's synchronized by both si->lock and + * swap_avail_lock, to ensure the result can be seen by + * add_to_avail_list. + */ + lockdep_assert_held(&si->lock); + si->flags &= ~SWP_WRITEOK; + atomic_long_or(SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages); + } else { + /* + * If not called by swapoff, take it off-list only if it's + * full and SWAP_USAGE_OFFLIST_BIT is not set (strictly + * si->inuse_pages == pages), any concurrent slot freeing, + * or device already removed from plist by someone else + * will make this return false. + */ + pages = si->pages; + if (!atomic_long_try_cmpxchg(&si->inuse_pages, &pages, + pages | SWAP_USAGE_OFFLIST_BIT)) + goto skip; + } - assert_spin_locked(&si->lock); for_each_node(nid) plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]); + +skip: + spin_unlock(&swap_avail_lock); } -static void del_from_avail_list(struct swap_info_struct *si) +/* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */ +static void add_to_avail_list(struct swap_info_struct *si, bool swapon) { + int nid; + long val; + unsigned long pages; + spin_lock(&swap_avail_lock); - __del_from_avail_list(si); + + /* Corresponding to SWP_WRITEOK clearing in del_from_avail_list */ + if (swapon) { + lockdep_assert_held(&si->lock); + si->flags |= SWP_WRITEOK; + } else { + if (!(READ_ONCE(si->flags) & SWP_WRITEOK)) + goto skip; + } + + if (!(atomic_long_read(&si->inuse_pages) & SWAP_USAGE_OFFLIST_BIT)) + goto skip; + + val = atomic_long_fetch_and_relaxed(~SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages); + + /* + * When device is full and device is on the plist, only one updater will + * see (inuse_pages == si->pages) and will call del_from_avail_list. If + * that updater happen to be here, just skip adding. + */ + pages = si->pages; + if (val == pages) { + /* Just like the cmpxchg in del_from_avail_list */ + if (atomic_long_try_cmpxchg(&si->inuse_pages, &pages, + pages | SWAP_USAGE_OFFLIST_BIT)) + goto skip; + } + + for_each_node(nid) + plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]); + +skip: spin_unlock(&swap_avail_lock); } -static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, - unsigned int nr_entries) +/* + * swap_usage_add / swap_usage_sub of each slot are serialized by ci->lock + * within each cluster, so the total contribution to the global counter should + * always be positive and cannot exceed the total number of usable slots. + */ +static bool swap_usage_add(struct swap_info_struct *si, unsigned int nr_entries) { - unsigned int end = offset + nr_entries - 1; - - if (offset == si->lowest_bit) - si->lowest_bit += nr_entries; - if (end == si->highest_bit) - WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries); - WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries); - if (si->inuse_pages == si->pages) { - si->lowest_bit = si->max; - si->highest_bit = 0; - del_from_avail_list(si); + long val = atomic_long_add_return_relaxed(nr_entries, &si->inuse_pages); - if (si->cluster_info && vm_swap_full()) - schedule_work(&si->reclaim_work); + /* + * If device is full, and SWAP_USAGE_OFFLIST_BIT is not set, + * remove it from the plist. + */ + if (unlikely(val == si->pages)) { + del_from_avail_list(si, false); + return true; } + + return false; } -static void add_to_avail_list(struct swap_info_struct *si) +static void swap_usage_sub(struct swap_info_struct *si, unsigned int nr_entries) { - int nid; + long val = atomic_long_sub_return_relaxed(nr_entries, &si->inuse_pages); - spin_lock(&swap_avail_lock); - for_each_node(nid) - plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]); - spin_unlock(&swap_avail_lock); + /* + * If device is not full, and SWAP_USAGE_OFFLIST_BIT is set, + * add it to the plist. + */ + if (unlikely(val & SWAP_USAGE_OFFLIST_BIT)) + add_to_avail_list(si, false); +} + +static void swap_range_alloc(struct swap_info_struct *si, + unsigned int nr_entries) +{ + if (swap_usage_add(si, nr_entries)) { + if (vm_swap_full()) + schedule_work(&si->reclaim_work); + } } static void swap_range_free(struct swap_info_struct *si, unsigned long offset, @@ -968,18 +1129,11 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, * Use atomic clear_bit operations only on zeromap instead of non-atomic * bitmap_clear to prevent adjacent bits corruption due to simultaneous writes. */ - for (i = 0; i < nr_entries; i++) + for (i = 0; i < nr_entries; i++) { clear_bit(offset + i, si->zeromap); - - if (offset < si->lowest_bit) - si->lowest_bit = offset; - if (end > si->highest_bit) { - bool was_full = !si->highest_bit; - - WRITE_ONCE(si->highest_bit, end); - if (was_full && (si->flags & SWP_WRITEOK)) - add_to_avail_list(si); + zswap_invalidate(swp_entry(si->type, offset + i)); } + if (si->flags & SWP_BLKDEV) swap_slot_free_notify = si->bdev->bd_disk->fops->swap_slot_free_notify; @@ -999,324 +1153,87 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, */ smp_wmb(); atomic_long_add(nr_entries, &nr_swap_pages); - WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); + swap_usage_sub(si, nr_entries); } -static void set_cluster_next(struct swap_info_struct *si, unsigned long next) +static bool get_swap_device_info(struct swap_info_struct *si) { - unsigned long prev; - - if (!(si->flags & SWP_SOLIDSTATE)) { - si->cluster_next = next; - return; - } - - prev = this_cpu_read(*si->cluster_next_cpu); + if (!percpu_ref_tryget_live(&si->users)) + return false; /* - * Cross the swap address space size aligned trunk, choose - * another trunk randomly to avoid lock contention on swap - * address space if possible. + * Guarantee the si->users are checked before accessing other + * fields of swap_info_struct, and si->flags (SWP_WRITEOK) is + * up to dated. + * + * Paired with the spin_unlock() after setup_swap_info() in + * enable_swap_info(), and smp_wmb() in swapoff. */ - if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) != - (next >> SWAP_ADDRESS_SPACE_SHIFT)) { - /* No free swap slots available */ - if (si->highest_bit <= si->lowest_bit) - return; - next = get_random_u32_inclusive(si->lowest_bit, si->highest_bit); - next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES); - next = max_t(unsigned int, next, si->lowest_bit); - } - this_cpu_write(*si->cluster_next_cpu, next); -} - -static bool swap_offset_available_and_locked(struct swap_info_struct *si, - unsigned long offset) -{ - if (data_race(!si->swap_map[offset])) { - spin_lock(&si->lock); - return true; - } - - if (vm_swap_full() && READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { - spin_lock(&si->lock); - return true; - } - - return false; -} - -static int cluster_alloc_swap(struct swap_info_struct *si, - unsigned char usage, int nr, - swp_entry_t slots[], int order) -{ - int n_ret = 0; - - VM_BUG_ON(!si->cluster_info); - - si->flags += SWP_SCANNING; - - while (n_ret < nr) { - unsigned long offset = cluster_alloc_swap_entry(si, order, usage); - - if (!offset) - break; - slots[n_ret++] = swp_entry(si->type, offset); - } - - si->flags -= SWP_SCANNING; - - return n_ret; + smp_rmb(); + return true; } -static int scan_swap_map_slots(struct swap_info_struct *si, - unsigned char usage, int nr, - swp_entry_t slots[], int order) +/* + * Fast path try to get swap entries with specified order from current + * CPU's swap entry pool (a cluster). + */ +static bool swap_alloc_fast(swp_entry_t *entry, + int order) { - unsigned long offset; - unsigned long scan_base; - unsigned long last_in_cluster = 0; - int latency_ration = LATENCY_LIMIT; - unsigned int nr_pages = 1 << order; - int n_ret = 0; - bool scanned_many = false; - - /* - * We try to cluster swap pages by allocating them sequentially - * in swap. Once we've allocated SWAPFILE_CLUSTER pages this - * way, however, we resort to first-free allocation, starting - * a new cluster. This prevents us from scattering swap pages - * all over the entire swap partition, so that we reduce - * overall disk seek times between swap pages. -- sct - * But we do now try to find an empty cluster. -Andrea - * And we let swap pages go all over an SSD partition. Hugh - */ - - if (order > 0) { - /* - * Should not even be attempting large allocations when huge - * page swap is disabled. Warn and fail the allocation. - */ - if (!IS_ENABLED(CONFIG_THP_SWAP) || - nr_pages > SWAPFILE_CLUSTER) { - VM_WARN_ON_ONCE(1); - return 0; - } - - /* - * Swapfile is not block device or not using clusters so unable - * to allocate large entries. - */ - if (!(si->flags & SWP_BLKDEV) || !si->cluster_info) - return 0; - } - - if (si->cluster_info) - return cluster_alloc_swap(si, usage, nr, slots, order); - - si->flags += SWP_SCANNING; - - /* For HDD, sequential access is more important. */ - scan_base = si->cluster_next; - offset = scan_base; - - if (unlikely(!si->cluster_nr--)) { - if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { - si->cluster_nr = SWAPFILE_CLUSTER - 1; - goto checks; - } - - spin_unlock(&si->lock); - - /* - * If seek is expensive, start searching for new cluster from - * start of partition, to minimize the span of allocated swap. - */ - scan_base = offset = si->lowest_bit; - last_in_cluster = offset + SWAPFILE_CLUSTER - 1; - - /* Locate the first empty (unaligned) cluster */ - for (; last_in_cluster <= READ_ONCE(si->highest_bit); offset++) { - if (si->swap_map[offset]) - last_in_cluster = offset + SWAPFILE_CLUSTER; - else if (offset == last_in_cluster) { - spin_lock(&si->lock); - offset -= SWAPFILE_CLUSTER - 1; - si->cluster_next = offset; - si->cluster_nr = SWAPFILE_CLUSTER - 1; - goto checks; - } - if (unlikely(--latency_ration < 0)) { - cond_resched(); - latency_ration = LATENCY_LIMIT; - } - } - - offset = scan_base; - spin_lock(&si->lock); - si->cluster_nr = SWAPFILE_CLUSTER - 1; - } - -checks: - if (!(si->flags & SWP_WRITEOK)) - goto no_page; - if (!si->highest_bit) - goto no_page; - if (offset > si->highest_bit) - scan_base = offset = si->lowest_bit; - - /* reuse swap entry of cache-only swap if not busy. */ - if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { - int swap_was_freed; - spin_unlock(&si->lock); - swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); - spin_lock(&si->lock); - /* entry was freed successfully, try to use this again */ - if (swap_was_freed > 0) - goto checks; - goto scan; /* check next one */ - } - - if (si->swap_map[offset]) { - if (!n_ret) - goto scan; - else - goto done; - } - memset(si->swap_map + offset, usage, nr_pages); - - swap_range_alloc(si, offset, nr_pages); - slots[n_ret++] = swp_entry(si->type, offset); - - /* got enough slots or reach max slots? */ - if ((n_ret == nr) || (offset >= si->highest_bit)) - goto done; - - /* search for next available slot */ - - /* time to take a break? */ - if (unlikely(--latency_ration < 0)) { - if (n_ret) - goto done; - spin_unlock(&si->lock); - cond_resched(); - spin_lock(&si->lock); - latency_ration = LATENCY_LIMIT; - } - - if (si->cluster_nr && !si->swap_map[++offset]) { - /* non-ssd case, still more slots in cluster? */ - --si->cluster_nr; - goto checks; - } + struct swap_cluster_info *ci; + struct swap_info_struct *si; + unsigned int offset, found = SWAP_ENTRY_INVALID; /* - * Even if there's no free clusters available (fragmented), - * try to scan a little more quickly with lock held unless we - * have scanned too many slots already. + * Once allocated, swap_info_struct will never be completely freed, + * so checking it's liveness by get_swap_device_info is enough. */ - if (!scanned_many) { - unsigned long scan_limit; - - if (offset < scan_base) - scan_limit = scan_base; - else - scan_limit = si->highest_bit; - for (; offset <= scan_limit && --latency_ration > 0; - offset++) { - if (!si->swap_map[offset]) - goto checks; - } - } - -done: - if (order == 0) - set_cluster_next(si, offset + 1); - si->flags -= SWP_SCANNING; - return n_ret; + si = this_cpu_read(percpu_swap_cluster.si[order]); + offset = this_cpu_read(percpu_swap_cluster.offset[order]); + if (!si || !offset || !get_swap_device_info(si)) + return false; -scan: - VM_WARN_ON(order > 0); - spin_unlock(&si->lock); - while (++offset <= READ_ONCE(si->highest_bit)) { - if (unlikely(--latency_ration < 0)) { - cond_resched(); - latency_ration = LATENCY_LIMIT; - scanned_many = true; - } - if (swap_offset_available_and_locked(si, offset)) - goto checks; - } - offset = si->lowest_bit; - while (offset < scan_base) { - if (unlikely(--latency_ration < 0)) { - cond_resched(); - latency_ration = LATENCY_LIMIT; - scanned_many = true; - } - if (swap_offset_available_and_locked(si, offset)) - goto checks; - offset++; + ci = lock_cluster(si, offset); + if (cluster_is_usable(ci, order)) { + if (cluster_is_empty(ci)) + offset = cluster_offset(si, ci); + found = alloc_swap_scan_cluster(si, ci, offset, order, SWAP_HAS_CACHE); + if (found) + *entry = swp_entry(si->type, found); + } else { + unlock_cluster(ci); } - spin_lock(&si->lock); -no_page: - si->flags -= SWP_SCANNING; - return n_ret; + put_swap_device(si); + return !!found; } -int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order) +/* Rotate the device and switch to a new cluster */ +static bool swap_alloc_slow(swp_entry_t *entry, + int order) { - int order = swap_entry_order(entry_order); - unsigned long size = 1 << order; - struct swap_info_struct *si, *next; - long avail_pgs; - int n_ret = 0; int node; + unsigned long offset; + struct swap_info_struct *si, *next; + node = numa_node_id(); spin_lock(&swap_avail_lock); - - avail_pgs = atomic_long_read(&nr_swap_pages) / size; - if (avail_pgs <= 0) { - spin_unlock(&swap_avail_lock); - goto noswap; - } - - n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs); - - atomic_long_sub(n_goal * size, &nr_swap_pages); - start_over: - node = numa_node_id(); plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { - /* requeue si to after same-priority siblings */ + /* Rotate the device and switch to a new cluster */ plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); spin_unlock(&swap_avail_lock); - spin_lock(&si->lock); - if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { - spin_lock(&swap_avail_lock); - if (plist_node_empty(&si->avail_lists[node])) { - spin_unlock(&si->lock); - goto nextsi; + if (get_swap_device_info(si)) { + offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE); + put_swap_device(si); + if (offset) { + *entry = swp_entry(si->type, offset); + return true; } - WARN(!si->highest_bit, - "swap_info %d in list but !highest_bit\n", - si->type); - WARN(!(si->flags & SWP_WRITEOK), - "swap_info %d in list but !SWP_WRITEOK\n", - si->type); - __del_from_avail_list(si); - spin_unlock(&si->lock); - goto nextsi; + if (order) + return false; } - n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, - n_goal, swp_entries, order); - spin_unlock(&si->lock); - if (n_ret || size > 1) - goto check_out; - cond_resched(); spin_lock(&swap_avail_lock); -nextsi: /* * if we got here, it's likely that si was almost full before, * and since scan_swap_map_slots() can drop the si->lock, @@ -1331,15 +1248,77 @@ nextsi: if (plist_node_empty(&next->avail_lists[node])) goto start_over; } - spin_unlock(&swap_avail_lock); + return false; +} + +/** + * folio_alloc_swap - allocate swap space for a folio + * @folio: folio we want to move to swap + * @gfp: gfp mask for shadow nodes + * + * Allocate swap space for the folio and add the folio to the + * swap cache. + * + * Context: Caller needs to hold the folio lock. + * Return: Whether the folio was added to the swap cache. + */ +int folio_alloc_swap(struct folio *folio, gfp_t gfp) +{ + unsigned int order = folio_order(folio); + unsigned int size = 1 << order; + swp_entry_t entry = {}; + + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio); + + if (order) { + /* + * Reject large allocation when THP_SWAP is disabled, + * the caller should split the folio and try again. + */ + if (!IS_ENABLED(CONFIG_THP_SWAP)) + return -EAGAIN; + + /* + * Allocation size should never exceed cluster size + * (HPAGE_PMD_SIZE). + */ + if (size > SWAPFILE_CLUSTER) { + VM_WARN_ON_ONCE(1); + return -EINVAL; + } + } + + local_lock(&percpu_swap_cluster.lock); + if (!swap_alloc_fast(&entry, order)) + swap_alloc_slow(&entry, order); + local_unlock(&percpu_swap_cluster.lock); + + /* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */ + if (mem_cgroup_try_charge_swap(folio, entry)) + goto out_free; -check_out: - if (n_ret < n_goal) - atomic_long_add((long)(n_goal - n_ret) * size, - &nr_swap_pages); -noswap: - return n_ret; + if (!entry.val) + return -ENOMEM; + + /* + * XArray node allocations from PF_MEMALLOC contexts could + * completely exhaust the page allocator. __GFP_NOMEMALLOC + * stops emergency reserves from being allocated. + * + * TODO: this could cause a theoretical memory reclaim + * deadlock in the swap out path. + */ + if (add_to_swap_cache(folio, entry, gfp | __GFP_NOMEMALLOC, NULL)) + goto out_free; + + atomic_long_sub(size, &nr_swap_pages); + return 0; + +out_free: + put_swap_folio(folio, entry); + return -ENOMEM; } static struct swap_info_struct *_swap_info_get(swp_entry_t entry) @@ -1376,26 +1355,12 @@ out: return NULL; } -static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, - struct swap_info_struct *q) -{ - struct swap_info_struct *p; - - p = _swap_info_get(entry); - - if (p != q) { - if (q != NULL) - spin_unlock(&q->lock); - if (p != NULL) - spin_lock(&p->lock); - } - return p; -} - -static unsigned char __swap_entry_free_locked(struct swap_info_struct *si, - unsigned long offset, - unsigned char usage) +static unsigned char swap_entry_put_locked(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, + unsigned char usage) { + unsigned long offset = swp_offset(entry); unsigned char count; unsigned char has_cache; @@ -1427,7 +1392,7 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *si, if (usage) WRITE_ONCE(si->swap_map[offset], usage); else - WRITE_ONCE(si->swap_map[offset], SWAP_HAS_CACHE); + swap_entries_free(si, ci, entry, 1); return usage; } @@ -1481,16 +1446,8 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry) si = swp_swap_info(entry); if (!si) goto bad_nofile; - if (!percpu_ref_tryget_live(&si->users)) + if (!get_swap_device_info(si)) goto out; - /* - * Guarantee the si->users are checked before accessing other - * fields of swap_info_struct. - * - * Paired with the spin_unlock() after setup_swap_info() in - * enable_swap_info(). - */ - smp_rmb(); offset = swp_offset(entry); if (offset >= si->max) goto put_out; @@ -1506,121 +1463,127 @@ put_out: return NULL; } -static unsigned char __swap_entry_free(struct swap_info_struct *si, - swp_entry_t entry) +static void swap_entries_put_cache(struct swap_info_struct *si, + swp_entry_t entry, int nr) { - struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); - unsigned char usage; - - ci = lock_cluster_or_swap_info(si, offset); - usage = __swap_entry_free_locked(si, offset, 1); - unlock_cluster_or_swap_info(si, ci); - if (!usage) - free_swap_slot(entry); + struct swap_cluster_info *ci; - return usage; + ci = lock_cluster(si, offset); + if (swap_only_has_cache(si, offset, nr)) + swap_entries_free(si, ci, entry, nr); + else { + for (int i = 0; i < nr; i++, entry.val++) + swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE); + } + unlock_cluster(ci); } -static bool __swap_entries_free(struct swap_info_struct *si, - swp_entry_t entry, int nr) +static bool swap_entries_put_map(struct swap_info_struct *si, + swp_entry_t entry, int nr) { unsigned long offset = swp_offset(entry); - unsigned int type = swp_type(entry); struct swap_cluster_info *ci; bool has_cache = false; unsigned char count; int i; - if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1) + if (nr <= 1) goto fallback; - /* cross into another cluster */ - if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER) + count = swap_count(data_race(si->swap_map[offset])); + if (count != 1 && count != SWAP_MAP_SHMEM) goto fallback; - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); if (!swap_is_last_map(si, offset, nr, &has_cache)) { - unlock_cluster_or_swap_info(si, ci); - goto fallback; + goto locked_fallback; } - for (i = 0; i < nr; i++) - WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); - unlock_cluster_or_swap_info(si, ci); - - if (!has_cache) { + if (!has_cache) + swap_entries_free(si, ci, entry, nr); + else for (i = 0; i < nr; i++) - zswap_invalidate(swp_entry(si->type, offset + i)); - spin_lock(&si->lock); - swap_entry_range_free(si, entry, nr); - spin_unlock(&si->lock); - } + WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); + unlock_cluster(ci); + return has_cache; fallback: - for (i = 0; i < nr; i++) { - if (data_race(si->swap_map[offset + i])) { - count = __swap_entry_free(si, swp_entry(type, offset + i)); - if (count == SWAP_HAS_CACHE) - has_cache = true; - } else { - WARN_ON_ONCE(1); - } + ci = lock_cluster(si, offset); +locked_fallback: + for (i = 0; i < nr; i++, entry.val++) { + count = swap_entry_put_locked(si, ci, entry, 1); + if (count == SWAP_HAS_CACHE) + has_cache = true; + } + unlock_cluster(ci); + return has_cache; + +} + +/* + * Only functions with "_nr" suffix are able to free entries spanning + * cross multi clusters, so ensure the range is within a single cluster + * when freeing entries with functions without "_nr" suffix. + */ +static bool swap_entries_put_map_nr(struct swap_info_struct *si, + swp_entry_t entry, int nr) +{ + int cluster_nr, cluster_rest; + unsigned long offset = swp_offset(entry); + bool has_cache = false; + + cluster_rest = SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER; + while (nr) { + cluster_nr = min(nr, cluster_rest); + has_cache |= swap_entries_put_map(si, entry, cluster_nr); + cluster_rest = SWAPFILE_CLUSTER; + nr -= cluster_nr; + entry.val += cluster_nr; } + return has_cache; } /* - * Drop the last HAS_CACHE flag of swap entries, caller have to - * ensure all entries belong to the same cgroup. + * Check if it's the last ref of swap entry in the freeing path. + * Qualified vlaue includes 1, SWAP_HAS_CACHE or SWAP_MAP_SHMEM. + */ +static inline bool __maybe_unused swap_is_last_ref(unsigned char count) +{ + return (count == SWAP_HAS_CACHE) || (count == 1) || + (count == SWAP_MAP_SHMEM); +} + +/* + * Drop the last ref of swap entries, caller have to ensure all entries + * belong to the same cgroup and cluster. */ -static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, - unsigned int nr_pages) +static void swap_entries_free(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned int nr_pages) { unsigned long offset = swp_offset(entry); unsigned char *map = si->swap_map + offset; unsigned char *map_end = map + nr_pages; - struct swap_cluster_info *ci; - ci = lock_cluster(si, offset); + /* It should never free entries across different clusters */ + VM_BUG_ON(ci != offset_to_cluster(si, offset + nr_pages - 1)); + VM_BUG_ON(cluster_is_empty(ci)); + VM_BUG_ON(ci->count < nr_pages); + + ci->count -= nr_pages; do { - VM_BUG_ON(*map != SWAP_HAS_CACHE); + VM_BUG_ON(!swap_is_last_ref(*map)); *map = 0; } while (++map < map_end); - dec_cluster_info_page(si, ci, nr_pages); - unlock_cluster(ci); mem_cgroup_uncharge_swap(entry, nr_pages); swap_range_free(si, offset, nr_pages); -} - -static void cluster_swap_free_nr(struct swap_info_struct *si, - unsigned long offset, int nr_pages, - unsigned char usage) -{ - struct swap_cluster_info *ci; - DECLARE_BITMAP(to_free, BITS_PER_LONG) = { 0 }; - int i, nr; - ci = lock_cluster_or_swap_info(si, offset); - while (nr_pages) { - nr = min(BITS_PER_LONG, nr_pages); - for (i = 0; i < nr; i++) { - if (!__swap_entry_free_locked(si, offset + i, usage)) - bitmap_set(to_free, i, 1); - } - if (!bitmap_empty(to_free, BITS_PER_LONG)) { - unlock_cluster_or_swap_info(si, ci); - for_each_set_bit(i, to_free, BITS_PER_LONG) - free_swap_slot(swp_entry(si->type, offset + i)); - if (nr == nr_pages) - return; - bitmap_clear(to_free, 0, BITS_PER_LONG); - ci = lock_cluster_or_swap_info(si, offset); - } - offset += nr; - nr_pages -= nr; - } - unlock_cluster_or_swap_info(si, ci); + if (!ci->count) + free_cluster(si, ci); + else + partial_free_cluster(si, ci); } /* @@ -1639,7 +1602,7 @@ void swap_free_nr(swp_entry_t entry, int nr_pages) while (nr_pages) { nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); - cluster_swap_free_nr(sis, offset, nr, 1); + swap_entries_put_map(sis, swp_entry(sis->type, offset), nr); offset += nr; nr_pages -= nr; } @@ -1650,8 +1613,6 @@ void swap_free_nr(swp_entry_t entry, int nr_pages) */ void put_swap_folio(struct folio *folio, swp_entry_t entry) { - unsigned long offset = swp_offset(entry); - struct swap_cluster_info *ci; struct swap_info_struct *si; int size = 1 << swap_entry_order(folio_order(folio)); @@ -1659,59 +1620,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) if (!si) return; - ci = lock_cluster_or_swap_info(si, offset); - if (size > 1 && swap_is_has_cache(si, offset, size)) { - unlock_cluster_or_swap_info(si, ci); - spin_lock(&si->lock); - swap_entry_range_free(si, entry, size); - spin_unlock(&si->lock); - return; - } - for (int i = 0; i < size; i++, entry.val++) { - if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { - unlock_cluster_or_swap_info(si, ci); - free_swap_slot(entry); - if (i == size - 1) - return; - lock_cluster_or_swap_info(si, offset); - } - } - unlock_cluster_or_swap_info(si, ci); -} - -static int swp_entry_cmp(const void *ent1, const void *ent2) -{ - const swp_entry_t *e1 = ent1, *e2 = ent2; - - return (int)swp_type(*e1) - (int)swp_type(*e2); -} - -void swapcache_free_entries(swp_entry_t *entries, int n) -{ - struct swap_info_struct *p, *prev; - int i; - - if (n <= 0) - return; - - prev = NULL; - p = NULL; - - /* - * Sort swap entries by swap device, so each lock is only taken once. - * nr_swapfiles isn't absolutely correct, but the overhead of sort() is - * so low that it isn't necessary to optimize further. - */ - if (nr_swapfiles > 1) - sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL); - for (i = 0; i < n; ++i) { - p = swap_info_get_cont(entries[i], prev); - if (p) - swap_entry_range_free(p, entries[i], 1); - prev = p; - } - if (p) - spin_unlock(&p->lock); + swap_entries_put_cache(si, entry, size); } int __swap_count(swp_entry_t entry) @@ -1727,16 +1636,16 @@ int __swap_count(swp_entry_t entry) * This does not give an exact answer when swap count is continued, * but does include the high COUNT_CONTINUED flag to allow for that. */ -int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) +bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) { pgoff_t offset = swp_offset(entry); struct swap_cluster_info *ci; int count; - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); count = swap_count(si->swap_map[offset]); - unlock_cluster_or_swap_info(si, ci); - return count; + unlock_cluster(ci); + return !!count; } /* @@ -1758,7 +1667,7 @@ int swp_swapcount(swp_entry_t entry) offset = swp_offset(entry); - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); count = swap_count(si->swap_map[offset]); if (!(count & COUNT_CONTINUED)) @@ -1781,7 +1690,7 @@ int swp_swapcount(swp_entry_t entry) n *= (SWAP_CONT_MAX + 1); } while (tmp_count & COUNT_CONTINUED); out: - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); return count; } @@ -1796,8 +1705,8 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, int i; bool ret = false; - ci = lock_cluster_or_swap_info(si, offset); - if (!ci || nr_pages == 1) { + ci = lock_cluster(si, offset); + if (nr_pages == 1) { if (swap_count(map[roffset])) ret = true; goto unlock_out; @@ -1809,7 +1718,7 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, } } unlock_out: - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); return ret; } @@ -1822,7 +1731,7 @@ static bool folio_swapped(struct folio *folio) return false; if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio))) - return swap_swapcount(si, entry) != 0; + return swap_entry_swapped(si, entry); return swap_page_trans_huge_swapped(si, entry, folio_order(folio)); } @@ -1896,9 +1805,6 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) bool any_only_cache = false; unsigned long offset; - if (non_swap_entry(entry)) - return; - si = get_swap_device(entry); if (!si) return; @@ -1909,7 +1815,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) /* * First free all entries in the range. */ - any_only_cache = __swap_entries_free(si, entry, nr); + any_only_cache = swap_entries_put_map_nr(si, entry, nr); /* * Short-circuit the below loop if none of the entries had their @@ -1919,13 +1825,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) goto out; /* - * Now go back over the range trying to reclaim the swap cache. This is - * more efficient for large folios because we will only try to reclaim - * the swap once per folio in the common case. If we do - * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the - * latter will get a reference and lock the folio for every individual - * page but will only succeed once the swap slot for every subpage is - * zero. + * Now go back over the range trying to reclaim the swap cache. */ for (offset = start_offset; offset < end_offset; offset += nr) { nr = 1; @@ -1957,16 +1857,23 @@ out: swp_entry_t get_swap_page_of_type(int type) { struct swap_info_struct *si = swap_type_to_swap_info(type); + unsigned long offset; swp_entry_t entry = {0}; if (!si) goto fail; /* This is called for allocating swap entry, not cache */ - spin_lock(&si->lock); - if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0)) - atomic_long_dec(&nr_swap_pages); - spin_unlock(&si->lock); + if (get_swap_device_info(si)) { + if (si->flags & SWP_WRITEOK) { + offset = cluster_alloc_swap_entry(si, 0, 1); + if (offset) { + entry = swp_entry(si->type, offset); + atomic_long_dec(&nr_swap_pages); + } + } + put_swap_device(si); + } fail: return entry; } @@ -2057,7 +1964,7 @@ unsigned int count_swap_pages(int type, int free) if (sis->flags & SWP_WRITEOK) { n = sis->pages; if (free) - n -= sis->inuse_pages; + n -= swap_usage_in_pages(sis); } spin_unlock(&sis->lock); } @@ -2392,7 +2299,7 @@ static int try_to_unuse(unsigned int type) swp_entry_t entry; unsigned int i; - if (!READ_ONCE(si->inuse_pages)) + if (!swap_usage_in_pages(si)) goto success; retry: @@ -2405,7 +2312,7 @@ retry: spin_lock(&mmlist_lock); p = &init_mm.mmlist; - while (READ_ONCE(si->inuse_pages) && + while (swap_usage_in_pages(si) && !signal_pending(current) && (p = p->next) != &init_mm.mmlist) { @@ -2433,7 +2340,7 @@ retry: mmput(prev_mm); i = 0; - while (READ_ONCE(si->inuse_pages) && + while (swap_usage_in_pages(si) && !signal_pending(current) && (i = find_next_to_unuse(si, i)) != 0) { @@ -2464,11 +2371,11 @@ retry: * Limit the number of retries? No: when mmget_not_zero() * above fails, that mm is likely to be freeing swap from * exit_mmap(), which proceeds at its own independent pace; - * and even shmem_writepage() could have been preempted after + * and even shmem_writeout() could have been preempted after * folio_alloc_swap(), temporarily hiding that swap. It's easy * and robust (though cpu-intensive) just to keep retrying. */ - if (READ_ONCE(si->inuse_pages)) { + if (swap_usage_in_pages(si)) { if (!signal_pending(current)) goto retry; return -EINTR; @@ -2495,7 +2402,7 @@ static void drain_mmlist(void) unsigned int type; for (type = 0; type < nr_swapfiles; type++) - if (swap_info[type]->inuse_pages) + if (swap_usage_in_pages(swap_info[type])) return; spin_lock(&mmlist_lock); list_for_each_safe(p, next, &init_mm.mmlist) @@ -2674,7 +2581,6 @@ static void setup_swap_info(struct swap_info_struct *si, int prio, static void _enable_swap_info(struct swap_info_struct *si) { - si->flags |= SWP_WRITEOK; atomic_long_add(si->pages, &nr_swap_pages); total_swap_pages += si->pages; @@ -2691,9 +2597,8 @@ static void _enable_swap_info(struct swap_info_struct *si) */ plist_add(&si->list, &swap_active_head); - /* add to available list iff swap device is not full */ - if (si->highest_bit) - add_to_avail_list(si); + /* Add back to available list */ + add_to_avail_list(si, true); } static void enable_swap_info(struct swap_info_struct *si, int prio, @@ -2727,21 +2632,46 @@ static void reinsert_swap_info(struct swap_info_struct *si) spin_unlock(&swap_lock); } -static bool __has_usable_swap(void) +/* + * Called after clearing SWP_WRITEOK, ensures cluster_alloc_range + * see the updated flags, so there will be no more allocations. + */ +static void wait_for_allocation(struct swap_info_struct *si) { - return !plist_head_empty(&swap_active_head); + unsigned long offset; + unsigned long end = ALIGN(si->max, SWAPFILE_CLUSTER); + struct swap_cluster_info *ci; + + BUG_ON(si->flags & SWP_WRITEOK); + + for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) { + ci = lock_cluster(si, offset); + unlock_cluster(ci); + } } -bool has_usable_swap(void) +/* + * Called after swap device's reference count is dead, so + * neither scan nor allocation will use it. + */ +static void flush_percpu_swap_cluster(struct swap_info_struct *si) { - bool ret; + int cpu, i; + struct swap_info_struct **pcp_si; - spin_lock(&swap_lock); - ret = __has_usable_swap(); - spin_unlock(&swap_lock); - return ret; + for_each_possible_cpu(cpu) { + pcp_si = per_cpu_ptr(percpu_swap_cluster.si, cpu); + /* + * Invalidate the percpu swap cluster cache, si->users + * is dead, so no new user will point to it, just flush + * any existing user. + */ + for (i = 0; i < SWAP_NR_ORDERS; i++) + cmpxchg(&pcp_si[i], si, NULL); + } } + SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; @@ -2791,7 +2721,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) goto out_dput; } spin_lock(&p->lock); - del_from_avail_list(p); + del_from_avail_list(p, true); if (p->prio < 0) { struct swap_info_struct *si = p; int nid; @@ -2809,11 +2739,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) plist_del(&p->list, &swap_active_head); atomic_long_sub(p->pages, &nr_swap_pages); total_swap_pages -= p->pages; - p->flags &= ~SWP_WRITEOK; spin_unlock(&p->lock); spin_unlock(&swap_lock); - disable_swap_slots_cache_lock(); + wait_for_allocation(p); set_current_oom_origin(); err = try_to_unuse(p->type); @@ -2822,12 +2751,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) if (err) { /* re-insert swap space back into swap_list */ reinsert_swap_info(p); - reenable_swap_slots_cache_unlock(); goto out_dput; } - reenable_swap_slots_cache_unlock(); - /* * Wait for swap operations protected by get/put_swap_device() * to complete. Because of synchronize_rcu() here, all swap @@ -2842,6 +2768,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) flush_work(&p->discard_work); flush_work(&p->reclaim_work); + flush_percpu_swap_cluster(p); destroy_swap_extents(p); if (p->flags & SWP_CONTINUED) @@ -2855,16 +2782,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_lock(&p->lock); drain_mmlist(); - /* wait for anyone still in scan_swap_map_slots */ - p->highest_bit = 0; /* cuts scans short */ - while (p->flags >= SWP_SCANNING) { - spin_unlock(&p->lock); - spin_unlock(&swap_lock); - schedule_timeout_uninterruptible(1); - spin_lock(&swap_lock); - spin_lock(&p->lock); - } - swap_file = p->swap_file; p->swap_file = NULL; p->max = 0; @@ -2879,10 +2796,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) arch_swap_invalidate_area(p->type); zswap_swapoff(p->type); mutex_unlock(&swapon_mutex); - free_percpu(p->percpu_cluster); - p->percpu_cluster = NULL; - free_percpu(p->cluster_next_cpu); - p->cluster_next_cpu = NULL; + kfree(p->global_cluster); + p->global_cluster = NULL; vfree(swap_map); kvfree(zeromap); kvfree(cluster_info); @@ -2992,7 +2907,7 @@ static int swap_show(struct seq_file *swap, void *v) } bytes = K(si->pages); - inuse = K(READ_ONCE(si->inuse_pages)); + inuse = K(swap_usage_in_pages(si)); file = si->swap_file; len = seq_file_path(swap, file, " \t\n\\"); @@ -3109,6 +3024,7 @@ static struct swap_info_struct *alloc_swap_info(void) } spin_lock_init(&p->lock); spin_lock_init(&p->cont_lock); + atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT); init_completion(&p->comp); return p; @@ -3193,10 +3109,6 @@ static unsigned long read_swap_header(struct swap_info_struct *si, return 0; } - si->lowest_bit = 1; - si->cluster_next = 1; - si->cluster_nr = 0; - maxpages = swapfile_maximum_size; last_page = swap_header->info.last_page; if (!last_page) { @@ -3213,7 +3125,6 @@ static unsigned long read_swap_header(struct swap_info_struct *si, if ((unsigned int)maxpages == 0) maxpages = UINT_MAX; } - si->highest_bit = maxpages - 1; if (!maxpages) return 0; @@ -3230,13 +3141,6 @@ static unsigned long read_swap_header(struct swap_info_struct *si, return maxpages; } -#define SWAP_CLUSTER_INFO_COLS \ - DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info)) -#define SWAP_CLUSTER_SPACE_COLS \ - DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER) -#define SWAP_CLUSTER_COLS \ - max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS) - static int setup_swap_map_and_extents(struct swap_info_struct *si, union swap_header *swap_header, unsigned char *swap_map, @@ -3276,15 +3180,21 @@ static int setup_swap_map_and_extents(struct swap_info_struct *si, return nr_extents; } +#define SWAP_CLUSTER_INFO_COLS \ + DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info)) +#define SWAP_CLUSTER_SPACE_COLS \ + DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER) +#define SWAP_CLUSTER_COLS \ + max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS) + static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, union swap_header *swap_header, unsigned long maxpages) { unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); - unsigned long col = si->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS; struct swap_cluster_info *cluster_info; - unsigned long i, j, k, idx; - int cpu, err = -ENOMEM; + unsigned long i, j, idx; + int err = -ENOMEM; cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL); if (!cluster_info) @@ -3293,25 +3203,14 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, for (i = 0; i < nr_clusters; i++) spin_lock_init(&cluster_info[i].lock); - si->cluster_next_cpu = alloc_percpu(unsigned int); - if (!si->cluster_next_cpu) - goto err_free; - - /* Random start position to help with wear leveling */ - for_each_possible_cpu(cpu) - per_cpu(*si->cluster_next_cpu, cpu) = - get_random_u32_inclusive(1, si->highest_bit); - - si->percpu_cluster = alloc_percpu(struct percpu_cluster); - if (!si->percpu_cluster) - goto err_free; - - for_each_possible_cpu(cpu) { - struct percpu_cluster *cluster; - - cluster = per_cpu_ptr(si->percpu_cluster, cpu); + if (!(si->flags & SWP_SOLIDSTATE)) { + si->global_cluster = kmalloc(sizeof(*si->global_cluster), + GFP_KERNEL); + if (!si->global_cluster) + goto err_free; for (i = 0; i < SWAP_NR_ORDERS; i++) - cluster->next[i] = SWAP_NEXT_INVALID; + si->global_cluster->next[i] = SWAP_ENTRY_INVALID; + spin_lock_init(&si->global_cluster_lock); } /* @@ -3335,15 +3234,14 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, for (i = 0; i < SWAP_NR_ORDERS; i++) { INIT_LIST_HEAD(&si->nonfull_clusters[i]); INIT_LIST_HEAD(&si->frag_clusters[i]); - si->frag_cluster_nr[i] = 0; + atomic_long_set(&si->frag_cluster_nr[i], 0); } /* * Reduce false cache line sharing between cluster_info and * sharing same address space. */ - for (k = 0; k < SWAP_CLUSTER_COLS; k++) { - j = (k + col) % SWAP_CLUSTER_COLS; + for (j = 0; j < SWAP_CLUSTER_COLS; j++) { for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { struct swap_cluster_info *ci; idx = i * SWAP_CLUSTER_COLS + j; @@ -3437,6 +3335,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } /* + * The swap subsystem needs a major overhaul to support this. + * It doesn't work yet so just disable it for now. + */ + if (mapping_min_folio_order(mapping) > 0) { + error = -EINVAL; + goto bad_swap_unlock_inode; + } + + /* * Read the swap header. */ if (!mapping->a_ops->read_folio) { @@ -3493,18 +3400,18 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (si->bdev && bdev_nonrot(si->bdev)) { si->flags |= SWP_SOLIDSTATE; - - cluster_info = setup_clusters(si, swap_header, maxpages); - if (IS_ERR(cluster_info)) { - error = PTR_ERR(cluster_info); - cluster_info = NULL; - goto bad_swap_unlock_inode; - } } else { atomic_inc(&nr_rotate_swap); inced_nr_rotate_swap = true; } + cluster_info = setup_clusters(si, swap_header, maxpages); + if (IS_ERR(cluster_info)) { + error = PTR_ERR(cluster_info); + cluster_info = NULL; + goto bad_swap_unlock_inode; + } + if ((swap_flags & SWAP_FLAG_DISCARD) && si->bdev && bdev_max_discard_sectors(si->bdev)) { /* @@ -3558,8 +3465,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) mutex_lock(&swapon_mutex); prio = -1; if (swap_flags & SWAP_FLAG_PREFER) - prio = - (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; + prio = swap_flags & SWAP_FLAG_PRIO_MASK; enable_swap_info(si, prio, swap_map, cluster_info, zeromap); pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n", @@ -3583,10 +3489,8 @@ free_swap_address_space: bad_swap_unlock_inode: inode_unlock(inode); bad_swap: - free_percpu(si->percpu_cluster); - si->percpu_cluster = NULL; - free_percpu(si->cluster_next_cpu); - si->cluster_next_cpu = NULL; + kfree(si->global_cluster); + si->global_cluster = NULL; inode = NULL; destroy_swap_extents(si); swap_cgroup_swapoff(si->type); @@ -3608,8 +3512,6 @@ out: putname(name); if (inode) inode_unlock(inode); - if (!error) - enable_swap_slots_cache(); return error; } @@ -3623,7 +3525,7 @@ void si_swapinfo(struct sysinfo *val) struct swap_info_struct *si = swap_info[type]; if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) - nr_to_be_unused += READ_ONCE(si->inuse_pages); + nr_to_be_unused += swap_usage_in_pages(si); } val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; val->totalswap = total_swap_pages + nr_to_be_unused; @@ -3636,7 +3538,6 @@ void si_swapinfo(struct sysinfo *val) * Returns error code in following case. * - success -> 0 * - swp_entry is invalid -> EINVAL - * - swp_entry is migration entry -> EINVAL * - swap-cache reference is requested but there is already one. -> EEXIST * - swap-cache reference is requested but the entry is not used. -> ENOENT * - swap-mapped reference requested but needs continued swap count. -> ENOMEM @@ -3651,11 +3552,15 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) int err, i; si = swp_swap_info(entry); + if (WARN_ON_ONCE(!si)) { + pr_err("%s%08lx\n", Bad_file, entry.val); + return -EINVAL; + } offset = swp_offset(entry); VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); VM_WARN_ON(usage == 1 && nr > 1); - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); err = 0; for (i = 0; i < nr; i++) { @@ -3710,7 +3615,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) } unlock_out: - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); return err; } @@ -3752,11 +3657,13 @@ int swapcache_prepare(swp_entry_t entry, int nr) return __swap_duplicate(entry, SWAP_HAS_CACHE, nr); } +/* + * Caller should ensure entries belong to the same folio so + * the entries won't span cross cluster boundary. + */ void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) { - unsigned long offset = swp_offset(entry); - - cluster_swap_free_nr(si, offset, nr, SWAP_HAS_CACHE); + swap_entries_put_cache(si, entry, nr); } struct swap_info_struct *swp_swap_info(swp_entry_t entry) @@ -3765,21 +3672,6 @@ struct swap_info_struct *swp_swap_info(swp_entry_t entry) } /* - * out-of-line methods to avoid include hell. - */ -struct address_space *swapcache_mapping(struct folio *folio) -{ - return swp_swap_info(folio->swap)->swap_file->f_mapping; -} -EXPORT_SYMBOL_GPL(swapcache_mapping); - -pgoff_t __folio_swap_cache_index(struct folio *folio) -{ - return swap_cache_index(folio->swap); -} -EXPORT_SYMBOL_GPL(__folio_swap_cache_index); - -/* * add_swap_count_continuation - called when a swap count is duplicated * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's * page of the original vmalloc'ed swap_map, to hold the continuation count @@ -3819,7 +3711,6 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) */ goto outer; } - spin_lock(&si->lock); offset = swp_offset(entry); @@ -3884,7 +3775,6 @@ out_unlock_cont: spin_unlock(&si->cont_lock); out: unlock_cluster(ci); - spin_unlock(&si->lock); put_swap_device(si); outer: if (page) @@ -3898,8 +3788,8 @@ outer: * into, carry if so, or else fail until a new continuation page is allocated; * when the original swap_map count is decremented from 0 with continuation, * borrow from the continuation and report whether it still holds more. - * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster - * lock. + * Called while __swap_duplicate() or caller of swap_entry_put_locked() + * holds cluster lock. */ static bool swap_count_continued(struct swap_info_struct *si, pgoff_t offset, unsigned char count) @@ -4004,6 +3894,11 @@ static void free_swap_count_continuations(struct swap_info_struct *si) } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) +static bool __has_usable_swap(void) +{ + return !plist_head_empty(&swap_active_head); +} + void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { struct swap_info_struct *si, *next; diff --git a/mm/truncate.c b/mm/truncate.c index 7c304d2f0052..91eb92a5ce4f 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -78,8 +78,22 @@ static void truncate_folio_batch_exceptionals(struct address_space *mapping, if (dax_mapping(mapping)) { for (i = j; i < nr; i++) { - if (xa_is_value(fbatch->folios[i])) + if (xa_is_value(fbatch->folios[i])) { + /* + * File systems should already have called + * dax_break_layout_entry() to remove all DAX + * entries while holding a lock to prevent + * establishing new entries. Therefore we + * shouldn't find any here. + */ + WARN_ON_ONCE(1); + + /* + * Delete the mapping so truncate_pagecache() + * doesn't loop forever. + */ dax_delete_mapping_entry(mapping, indices[i]); + } } goto out; } @@ -177,20 +191,21 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio) bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) { loff_t pos = folio_pos(folio); + size_t size = folio_size(folio); unsigned int offset, length; + struct page *split_at, *split_at2; if (pos < start) offset = start - pos; else offset = 0; - length = folio_size(folio); - if (pos + length <= (u64)end) - length = length - offset; + if (pos + size <= (u64)end) + length = size - offset; else length = end + 1 - pos - offset; folio_wait_writeback(folio); - if (length == folio_size(folio)) { + if (length == size) { truncate_inode_folio(folio->mapping, folio); return true; } @@ -207,8 +222,46 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) folio_invalidate(folio, offset, length); if (!folio_test_large(folio)) return true; - if (split_folio(folio) == 0) + + split_at = folio_page(folio, PAGE_ALIGN_DOWN(offset) / PAGE_SIZE); + if (!try_folio_split(folio, split_at, NULL)) { + /* + * try to split at offset + length to make sure folios within + * the range can be dropped, especially to avoid memory waste + * for shmem truncate + */ + struct folio *folio2; + + if (offset + length == size) + goto no_split; + + split_at2 = folio_page(folio, + PAGE_ALIGN_DOWN(offset + length) / PAGE_SIZE); + folio2 = page_folio(split_at2); + + if (!folio_try_get(folio2)) + goto no_split; + + if (!folio_test_large(folio2)) + goto out; + + if (!folio_trylock(folio2)) + goto out; + + /* + * make sure folio2 is large and does not change its mapping. + * Its split result does not matter here. + */ + if (folio_test_large(folio2) && + folio2->mapping == folio->mapping) + try_folio_split(folio2, split_at2, NULL); + + folio_unlock(folio2); +out: + folio_put(folio2); +no_split: return true; + } if (folio_test_dirty(folio)) return false; truncate_inode_folio(folio->mapping, folio); @@ -372,7 +425,7 @@ void truncate_inode_pages_range(struct address_space *mapping, for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; - /* We rely upon deletion not changing page->index */ + /* We rely upon deletion not changing folio->index */ if (xa_is_value(folio)) continue; @@ -525,6 +578,15 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, } EXPORT_SYMBOL(invalidate_mapping_pages); +static int folio_launder(struct address_space *mapping, struct folio *folio) +{ + if (!folio_test_dirty(folio)) + return 0; + if (folio->mapping != mapping || mapping->a_ops->launder_folio == NULL) + return 0; + return mapping->a_ops->launder_folio(folio); +} + /* * This is like mapping_evict_folio(), except it ignores the folio's * refcount. We do this because invalidate_inode_pages2() needs stronger @@ -532,14 +594,24 @@ EXPORT_SYMBOL(invalidate_mapping_pages); * shrink_folio_list() has a temp ref on them, or because they're transiently * sitting in the folio_add_lru() caches. */ -static int invalidate_complete_folio2(struct address_space *mapping, - struct folio *folio) +int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, + gfp_t gfp) { - if (folio->mapping != mapping) - return 0; + int ret; - if (!filemap_release_folio(folio, GFP_KERNEL)) - return 0; + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + + if (folio_mapped(folio)) + unmap_mapping_folio(folio); + BUG_ON(folio_mapped(folio)); + + ret = folio_launder(mapping, folio); + if (ret) + return ret; + if (folio->mapping != mapping) + return -EBUSY; + if (!filemap_release_folio(folio, gfp)) + return -EBUSY; spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); @@ -558,16 +630,7 @@ static int invalidate_complete_folio2(struct address_space *mapping, failed: xa_unlock_irq(&mapping->i_pages); spin_unlock(&mapping->host->i_lock); - return 0; -} - -static int folio_launder(struct address_space *mapping, struct folio *folio) -{ - if (!folio_test_dirty(folio)) - return 0; - if (folio->mapping != mapping || mapping->a_ops->launder_folio == NULL) - return 0; - return mapping->a_ops->launder_folio(folio); + return -EBUSY; } /** @@ -631,16 +694,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, } VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio); folio_wait_writeback(folio); - - if (folio_mapped(folio)) - unmap_mapping_folio(folio); - BUG_ON(folio_mapped(folio)); - - ret2 = folio_launder(mapping, folio); - if (ret2 == 0) { - if (!invalidate_complete_folio2(mapping, folio)) - ret2 = -EBUSY; - } + ret2 = folio_unmap_invalidate(mapping, folio, GFP_KERNEL); if (ret2 < 0) ret = ret2; folio_unlock(folio); diff --git a/mm/usercopy.c b/mm/usercopy.c index 83c164aba6e0..dbdcc43964fb 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -17,7 +17,7 @@ #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> -#include <linux/thread_info.h> +#include <linux/ucopysize.h> #include <linux/vmalloc.h> #include <linux/atomic.h> #include <linux/jump_label.h> @@ -201,7 +201,9 @@ static inline void check_heap_object(const void *ptr, unsigned long n, } } -static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks); +DEFINE_STATIC_KEY_MAYBE_RO(CONFIG_HARDENED_USERCOPY_DEFAULT_ON, + validate_usercopy_range); +EXPORT_SYMBOL(validate_usercopy_range); /* * Validates that the given object is: @@ -212,9 +214,6 @@ static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks); */ void __check_object_size(const void *ptr, unsigned long n, bool to_user) { - if (static_branch_unlikely(&bypass_usercopy_checks)) - return; - /* Skip all tests if size is zero. */ if (!n) return; @@ -255,7 +254,8 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user) } EXPORT_SYMBOL(__check_object_size); -static bool enable_checks __initdata = true; +static bool enable_checks __initdata = + IS_ENABLED(CONFIG_HARDENED_USERCOPY_DEFAULT_ON); static int __init parse_hardened_usercopy(char *str) { @@ -269,8 +269,10 @@ __setup("hardened_usercopy=", parse_hardened_usercopy); static int __init set_hardened_usercopy(void) { - if (enable_checks == false) - static_branch_enable(&bypass_usercopy_checks); + if (enable_checks) + static_branch_enable(&validate_usercopy_range); + else + static_branch_disable(&validate_usercopy_range); return 1; } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 60a0be33766f..8253978ee0fb 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -18,6 +18,7 @@ #include <asm/tlbflush.h> #include <asm/tlb.h> #include "internal.h" +#include "swap.h" static __always_inline bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end) @@ -85,14 +86,10 @@ static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm, mmap_read_lock(mm); vma = find_vma_and_prepare_anon(mm, address); if (!IS_ERR(vma)) { - /* - * We cannot use vma_start_read() as it may fail due to - * false locked (see comment in vma_start_read()). We - * can avoid that by directly locking vm_lock under - * mmap_lock, which guarantees that nobody can lock the - * vma for write (vma_start_write()) under us. - */ - down_read(&vma->vm_lock->lock); + bool locked = vma_start_read_locked(vma); + + if (!locked) + vma = ERR_PTR(-EAGAIN); } mmap_read_unlock(mm); @@ -1020,6 +1017,14 @@ void double_pt_unlock(spinlock_t *ptl1, __release(ptl2); } +static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte, + pte_t orig_dst_pte, pte_t orig_src_pte, + pmd_t *dst_pmd, pmd_t dst_pmdval) +{ + return pte_same(ptep_get(src_pte), orig_src_pte) && + pte_same(ptep_get(dst_pte), orig_dst_pte) && + pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd)); +} static int move_present_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, @@ -1027,6 +1032,7 @@ static int move_present_pte(struct mm_struct *mm, unsigned long dst_addr, unsigned long src_addr, pte_t *dst_pte, pte_t *src_pte, pte_t orig_dst_pte, pte_t orig_src_pte, + pmd_t *dst_pmd, pmd_t dst_pmdval, spinlock_t *dst_ptl, spinlock_t *src_ptl, struct folio *src_folio) { @@ -1034,8 +1040,8 @@ static int move_present_pte(struct mm_struct *mm, double_pt_lock(dst_ptl, src_ptl); - if (!pte_same(ptep_get(src_pte), orig_src_pte) || - !pte_same(ptep_get(dst_pte), orig_dst_pte)) { + if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, + dst_pmd, dst_pmdval)) { err = -EAGAIN; goto out; } @@ -1057,9 +1063,14 @@ static int move_present_pte(struct mm_struct *mm, folio_move_anon_rmap(src_folio, dst_vma); src_folio->index = linear_page_index(dst_vma, dst_addr); - orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot); - /* Follow mremap() behavior and treat the entry dirty after the move */ - orig_dst_pte = pte_mkwrite(pte_mkdirty(orig_dst_pte), dst_vma); + orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot); + /* Set soft dirty bit so userspace can notice the pte was moved */ +#ifdef CONFIG_MEM_SOFT_DIRTY + orig_dst_pte = pte_mksoft_dirty(orig_dst_pte); +#endif + if (pte_dirty(orig_src_pte)) + orig_dst_pte = pte_mkdirty(orig_dst_pte); + orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma); set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte); out: @@ -1067,24 +1078,65 @@ out: return err; } -static int move_swap_pte(struct mm_struct *mm, +static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, pte_t *dst_pte, pte_t *src_pte, pte_t orig_dst_pte, pte_t orig_src_pte, - spinlock_t *dst_ptl, spinlock_t *src_ptl) + pmd_t *dst_pmd, pmd_t dst_pmdval, + spinlock_t *dst_ptl, spinlock_t *src_ptl, + struct folio *src_folio, + struct swap_info_struct *si, swp_entry_t entry) { - if (!pte_swp_exclusive(orig_src_pte)) - return -EBUSY; + /* + * Check if the folio still belongs to the target swap entry after + * acquiring the lock. Folio can be freed in the swap cache while + * not locked. + */ + if (src_folio && unlikely(!folio_test_swapcache(src_folio) || + entry.val != src_folio->swap.val)) + return -EAGAIN; double_pt_lock(dst_ptl, src_ptl); - if (!pte_same(ptep_get(src_pte), orig_src_pte) || - !pte_same(ptep_get(dst_pte), orig_dst_pte)) { + if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, + dst_pmd, dst_pmdval)) { double_pt_unlock(dst_ptl, src_ptl); return -EAGAIN; } + /* + * The src_folio resides in the swapcache, requiring an update to its + * index and mapping to align with the dst_vma, where a swap-in may + * occur and hit the swapcache after moving the PTE. + */ + if (src_folio) { + folio_move_anon_rmap(src_folio, dst_vma); + src_folio->index = linear_page_index(dst_vma, dst_addr); + } else { + /* + * Check if the swap entry is cached after acquiring the src_pte + * lock. Otherwise, we might miss a newly loaded swap cache folio. + * + * Check swap_map directly to minimize overhead, READ_ONCE is sufficient. + * We are trying to catch newly added swap cache, the only possible case is + * when a folio is swapped in and out again staying in swap cache, using the + * same entry before the PTE check above. The PTL is acquired and released + * twice, each time after updating the swap_map's flag. So holding + * the PTL here ensures we see the updated value. False positive is possible, + * e.g. SWP_SYNCHRONOUS_IO swapin may set the flag without touching the + * cache, or during the tiny synchronization window between swap cache and + * swap_map, but it will be gone very quickly, worst result is retry jitters. + */ + if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) { + double_pt_unlock(dst_ptl, src_ptl); + return -EAGAIN; + } + } + orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); +#ifdef CONFIG_MEM_SOFT_DIRTY + orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte); +#endif set_pte_at(mm, dst_addr, dst_pte, orig_src_pte); double_pt_unlock(dst_ptl, src_ptl); @@ -1097,13 +1149,14 @@ static int move_zeropage_pte(struct mm_struct *mm, unsigned long dst_addr, unsigned long src_addr, pte_t *dst_pte, pte_t *src_pte, pte_t orig_dst_pte, pte_t orig_src_pte, + pmd_t *dst_pmd, pmd_t dst_pmdval, spinlock_t *dst_ptl, spinlock_t *src_ptl) { pte_t zero_pte; double_pt_lock(dst_ptl, src_ptl); - if (!pte_same(ptep_get(src_pte), orig_src_pte) || - !pte_same(ptep_get(dst_pte), orig_dst_pte)) { + if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, + dst_pmd, dst_pmdval)) { double_pt_unlock(dst_ptl, src_ptl); return -EAGAIN; } @@ -1130,12 +1183,14 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, __u64 mode) { swp_entry_t entry; + struct swap_info_struct *si = NULL; pte_t orig_src_pte, orig_dst_pte; pte_t src_folio_pte; spinlock_t *src_ptl, *dst_ptl; pte_t *src_pte = NULL; pte_t *dst_pte = NULL; pmd_t dummy_pmdval; + pmd_t dst_pmdval; struct folio *src_folio = NULL; struct anon_vma *src_anon_vma = NULL; struct mmu_notifier_range range; @@ -1148,11 +1203,11 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, retry: /* * Use the maywrite version to indicate that dst_pte will be modified, - * but since we will use pte_same() to detect the change of the pte - * entry, there is no need to get pmdval, so just pass a dummy variable - * to it. + * since dst_pte needs to be none, the subsequent pte_same() check + * cannot prevent the dst_pte page from being freed concurrently, so we + * also need to abtain dst_pmdval and recheck pmd_same() later. */ - dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dummy_pmdval, + dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dst_pmdval, &dst_ptl); /* Retry if a huge pmd materialized from under us */ @@ -1161,7 +1216,11 @@ retry: goto out; } - /* same as dst_pte */ + /* + * Unlike dst_pte, the subsequent pte_same() check can ensure the + * stability of the src_pte page, so there is no need to get pmdval, + * just pass a dummy variable to it. + */ src_pte = pte_offset_map_rw_nolock(mm, src_pmd, src_addr, &dummy_pmdval, &src_ptl); @@ -1177,8 +1236,8 @@ retry: } /* Sanity checks before the operation */ - if (WARN_ON_ONCE(pmd_none(*dst_pmd)) || WARN_ON_ONCE(pmd_none(*src_pmd)) || - WARN_ON_ONCE(pmd_trans_huge(*dst_pmd)) || WARN_ON_ONCE(pmd_trans_huge(*src_pmd))) { + if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) || + pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) { err = -EINVAL; goto out; } @@ -1213,7 +1272,7 @@ retry: err = move_zeropage_pte(mm, dst_vma, src_vma, dst_addr, src_addr, dst_pte, src_pte, orig_dst_pte, orig_src_pte, - dst_ptl, src_ptl); + dst_pmd, dst_pmdval, dst_ptl, src_ptl); goto out; } @@ -1224,6 +1283,7 @@ retry: */ if (!src_folio) { struct folio *folio; + bool locked; /* * Pin the page while holding the lock to be sure the @@ -1243,14 +1303,28 @@ retry: goto out; } + locked = folio_trylock(folio); + /* + * We avoid waiting for folio lock with a raised + * refcount for large folios because extra refcounts + * will result in split_folio() failing later and + * retrying. If multiple tasks are trying to move a + * large folio we can end up livelocking. + */ + if (!locked && folio_test_large(folio)) { + spin_unlock(src_ptl); + err = -EAGAIN; + goto out; + } + folio_get(folio); src_folio = folio; src_folio_pte = orig_src_pte; spin_unlock(src_ptl); - if (!folio_trylock(src_folio)) { - pte_unmap(&orig_src_pte); - pte_unmap(&orig_dst_pte); + if (!locked) { + pte_unmap(src_pte); + pte_unmap(dst_pte); src_pte = dst_pte = NULL; /* now we can block and wait */ folio_lock(src_folio); @@ -1266,8 +1340,8 @@ retry: /* at this point we have src_folio locked */ if (folio_test_large(src_folio)) { /* split_folio() can block */ - pte_unmap(&orig_src_pte); - pte_unmap(&orig_dst_pte); + pte_unmap(src_pte); + pte_unmap(dst_pte); src_pte = dst_pte = NULL; err = split_folio(src_folio); if (err) @@ -1292,8 +1366,8 @@ retry: goto out; } if (!anon_vma_trylock_write(src_anon_vma)) { - pte_unmap(&orig_src_pte); - pte_unmap(&orig_dst_pte); + pte_unmap(src_pte); + pte_unmap(dst_pte); src_pte = dst_pte = NULL; /* now we can block and wait */ anon_vma_lock_write(src_anon_vma); @@ -1303,14 +1377,16 @@ retry: err = move_present_pte(mm, dst_vma, src_vma, dst_addr, src_addr, dst_pte, src_pte, - orig_dst_pte, orig_src_pte, - dst_ptl, src_ptl, src_folio); + orig_dst_pte, orig_src_pte, dst_pmd, + dst_pmdval, dst_ptl, src_ptl, src_folio); } else { + struct folio *folio = NULL; + entry = pte_to_swp_entry(orig_src_pte); if (non_swap_entry(entry)) { if (is_migration_entry(entry)) { - pte_unmap(&orig_src_pte); - pte_unmap(&orig_dst_pte); + pte_unmap(src_pte); + pte_unmap(dst_pte); src_pte = dst_pte = NULL; migration_entry_wait(mm, src_pmd, src_addr); err = -EAGAIN; @@ -1319,10 +1395,53 @@ retry: goto out; } - err = move_swap_pte(mm, dst_addr, src_addr, - dst_pte, src_pte, - orig_dst_pte, orig_src_pte, - dst_ptl, src_ptl); + if (!pte_swp_exclusive(orig_src_pte)) { + err = -EBUSY; + goto out; + } + + si = get_swap_device(entry); + if (unlikely(!si)) { + err = -EAGAIN; + goto out; + } + /* + * Verify the existence of the swapcache. If present, the folio's + * index and mapping must be updated even when the PTE is a swap + * entry. The anon_vma lock is not taken during this process since + * the folio has already been unmapped, and the swap entry is + * exclusive, preventing rmap walks. + * + * For large folios, return -EBUSY immediately, as split_folio() + * also returns -EBUSY when attempting to split unmapped large + * folios in the swapcache. This issue needs to be resolved + * separately to allow proper handling. + */ + if (!src_folio) + folio = filemap_get_folio(swap_address_space(entry), + swap_cache_index(entry)); + if (!IS_ERR_OR_NULL(folio)) { + if (folio_test_large(folio)) { + err = -EBUSY; + folio_put(folio); + goto out; + } + src_folio = folio; + src_folio_pte = orig_src_pte; + if (!folio_trylock(src_folio)) { + pte_unmap(src_pte); + pte_unmap(dst_pte); + src_pte = dst_pte = NULL; + put_swap_device(si); + si = NULL; + /* now we can block and wait */ + folio_lock(src_folio); + goto retry; + } + } + err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte, + orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, + dst_ptl, src_ptl, src_folio, si, entry); } out: @@ -1339,6 +1458,8 @@ out: if (src_pte) pte_unmap(src_pte); mmu_notifier_invalidate_range_end(&range); + if (si) + put_swap_device(si); return err; } @@ -1475,16 +1596,24 @@ static int uffd_move_lock(struct mm_struct *mm, mmap_read_lock(mm); err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); - if (!err) { - /* - * See comment in uffd_lock_vma() as to why not using - * vma_start_read() here. - */ - down_read(&(*dst_vmap)->vm_lock->lock); - if (*dst_vmap != *src_vmap) - down_read_nested(&(*src_vmap)->vm_lock->lock, - SINGLE_DEPTH_NESTING); + if (err) + goto out; + + if (!vma_start_read_locked(*dst_vmap)) { + err = -EAGAIN; + goto out; + } + + /* Nothing further to do if both vmas are locked. */ + if (*dst_vmap == *src_vmap) + goto out; + + if (!vma_start_read_locked_nested(*src_vmap, SINGLE_DEPTH_NESTING)) { + /* Undo dst_vmap locking if src_vmap failed to lock */ + vma_end_read(*dst_vmap); + err = -EAGAIN; } +out: mmap_read_unlock(mm); return err; } @@ -1810,6 +1939,14 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, unsigned long end) { struct vm_area_struct *ret; + bool give_up_on_oom = false; + + /* + * If we are modifying only and not splitting, just give up on the merge + * if OOM prevents us from merging successfully. + */ + if (start == vma->vm_start && end == vma->vm_end) + give_up_on_oom = true; /* Reset ptes for the whole vma range if wr-protected */ if (userfaultfd_wp(vma)) @@ -1817,7 +1954,7 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, ret = vma_modify_flags_uffd(vmi, prev, vma, start, end, vma->vm_flags & ~__VM_UFFD_FLAGS, - NULL_VM_UFFD_CTX); + NULL_VM_UFFD_CTX, give_up_on_oom); /* * In the vma_merge() successful mprotect-like case 8: @@ -1868,7 +2005,8 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx, new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, new_flags, - (struct vm_userfaultfd_ctx){ctx}); + (struct vm_userfaultfd_ctx){ctx}, + /* give_up_on_oom = */false); if (IS_ERR(vma)) return PTR_ERR(vma); diff --git a/mm/util.c b/mm/util.c index c1c3b06ab4f9..0b270c43d7d1 100644 --- a/mm/util.c +++ b/mm/util.c @@ -12,6 +12,7 @@ #include <linux/security.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/sysctl.h> #include <linux/mman.h> #include <linux/hugetlb.h> #include <linux/vmalloc.h> @@ -23,6 +24,7 @@ #include <linux/processor.h> #include <linux/sizes.h> #include <linux/compat.h> +#include <linux/fsnotify.h> #include <linux/uaccess.h> @@ -297,12 +299,7 @@ void *memdup_user_nul(const void __user *src, size_t len) { char *p; - /* - * Always use GFP_KERNEL, since copy_from_user() can sleep and - * cause pagefault, which makes it pointless to use GFP_NOFS - * or GFP_ATOMIC. - */ - p = kmalloc_track_caller(len + 1, GFP_KERNEL); + p = kmem_buckets_alloc_track_caller(user_buckets, len + 1, GFP_USER | __GFP_NOWARN); if (!p) return ERR_PTR(-ENOMEM); @@ -574,6 +571,8 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, LIST_HEAD(uf); ret = security_mmap_file(file, prot, flag); + if (!ret) + ret = fsnotify_mmap_perm(file, prot, pgoff >> PAGE_SHIFT, len); if (!ret) { if (mmap_write_lock_killable(mm)) return -EINTR; @@ -587,6 +586,23 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, return ret; } +/* + * Perform a userland memory mapping into the current process address space. See + * the comment for do_mmap() for more details on this operation in general. + * + * This differs from do_mmap() in that: + * + * a. An offset parameter is provided rather than pgoff, which is both checked + * for overflow and page alignment. + * b. mmap locking is performed on the caller's behalf. + * c. Userfaultfd unmap events and memory population are handled. + * + * This means that this function performs essentially the same work as if + * userland were invoking mmap (2). + * + * Returns either an error, or the address at which the requested mapping has + * been performed. + */ unsigned long vm_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long offset) @@ -600,168 +616,6 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, } EXPORT_SYMBOL(vm_mmap); -static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) -{ - /* - * We want to attempt a large physically contiguous block first because - * it is less likely to fragment multiple larger blocks and therefore - * contribute to a long term fragmentation less than vmalloc fallback. - * However make sure that larger requests are not too disruptive - no - * OOM killer and no allocation failure warnings as we have a fallback. - */ - if (size > PAGE_SIZE) { - flags |= __GFP_NOWARN; - - if (!(flags & __GFP_RETRY_MAYFAIL)) - flags |= __GFP_NORETRY; - - /* nofail semantic is implemented by the vmalloc fallback */ - flags &= ~__GFP_NOFAIL; - } - - return flags; -} - -/** - * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon - * failure, fall back to non-contiguous (vmalloc) allocation. - * @size: size of the request. - * @b: which set of kmalloc buckets to allocate from. - * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. - * @node: numa node to allocate from - * - * Uses kmalloc to get the memory but if the allocation fails then falls back - * to the vmalloc allocator. Use kvfree for freeing the memory. - * - * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier. - * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is - * preferable to the vmalloc fallback, due to visible performance drawbacks. - * - * Return: pointer to the allocated memory of %NULL in case of failure - */ -void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) -{ - void *ret; - - /* - * It doesn't really make sense to fallback to vmalloc for sub page - * requests - */ - ret = __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, b), - kmalloc_gfp_adjust(flags, size), - node); - if (ret || size <= PAGE_SIZE) - return ret; - - /* non-sleeping allocations are not supported by vmalloc */ - if (!gfpflags_allow_blocking(flags)) - return NULL; - - /* Don't even allow crazy sizes */ - if (unlikely(size > INT_MAX)) { - WARN_ON_ONCE(!(flags & __GFP_NOWARN)); - return NULL; - } - - /* - * kvmalloc() can always use VM_ALLOW_HUGE_VMAP, - * since the callers already cannot assume anything - * about the resulting pointer, and cannot play - * protection games. - */ - return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, - flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, - node, __builtin_return_address(0)); -} -EXPORT_SYMBOL(__kvmalloc_node_noprof); - -/** - * kvfree() - Free memory. - * @addr: Pointer to allocated memory. - * - * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc(). - * It is slightly more efficient to use kfree() or vfree() if you are certain - * that you know which one to use. - * - * Context: Either preemptible task context or not-NMI interrupt. - */ -void kvfree(const void *addr) -{ - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); -} -EXPORT_SYMBOL(kvfree); - -/** - * kvfree_sensitive - Free a data object containing sensitive information. - * @addr: address of the data object to be freed. - * @len: length of the data object. - * - * Use the special memzero_explicit() function to clear the content of a - * kvmalloc'ed object containing sensitive data to make sure that the - * compiler won't optimize out the data clearing. - */ -void kvfree_sensitive(const void *addr, size_t len) -{ - if (likely(!ZERO_OR_NULL_PTR(addr))) { - memzero_explicit((void *)addr, len); - kvfree(addr); - } -} -EXPORT_SYMBOL(kvfree_sensitive); - -/** - * kvrealloc - reallocate memory; contents remain unchanged - * @p: object to reallocate memory for - * @size: the size to reallocate - * @flags: the flags for the page level allocator - * - * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0 - * and @p is not a %NULL pointer, the object pointed to is freed. - * - * If __GFP_ZERO logic is requested, callers must ensure that, starting with the - * initial memory allocation, every subsequent call to this API for the same - * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that - * __GFP_ZERO is not fully honored by this API. - * - * In any case, the contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. - * - * This function must not be called concurrently with itself or kvfree() for the - * same memory allocation. - * - * Return: pointer to the allocated memory or %NULL in case of error - */ -void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) -{ - void *n; - - if (is_vmalloc_addr(p)) - return vrealloc_noprof(p, size, flags); - - n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size)); - if (!n) { - /* We failed to krealloc(), fall back to kvmalloc(). */ - n = kvmalloc_noprof(size, flags); - if (!n) - return NULL; - - if (p) { - /* We already know that `p` is not a vmalloc address. */ - kasan_disable_current(); - memcpy(n, kasan_reset_tag(p), ksize(p)); - kasan_enable_current(); - - kfree(p); - } - } - - return n; -} -EXPORT_SYMBOL(kvrealloc_noprof); - /** * __vmalloc_array - allocate memory for a virtually contiguous array. * @n: number of elements. @@ -894,14 +748,16 @@ int folio_mc_copy(struct folio *dst, struct folio *src) EXPORT_SYMBOL(folio_mc_copy); int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; -int sysctl_overcommit_ratio __read_mostly = 50; -unsigned long sysctl_overcommit_kbytes __read_mostly; +static int sysctl_overcommit_ratio __read_mostly = 50; +static unsigned long sysctl_overcommit_kbytes __read_mostly; int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ -int overcommit_ratio_handler(const struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos) +#ifdef CONFIG_SYSCTL + +static int overcommit_ratio_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -916,8 +772,8 @@ static void sync_overcommit_as(struct work_struct *dummy) percpu_counter_sync(&vm_committed_as); } -int overcommit_policy_handler(const struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos) +static int overcommit_policy_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int new_policy = -1; @@ -952,8 +808,8 @@ int overcommit_policy_handler(const struct ctl_table *table, int write, void *bu return ret; } -int overcommit_kbytes_handler(const struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos) +static int overcommit_kbytes_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -963,6 +819,54 @@ int overcommit_kbytes_handler(const struct ctl_table *table, int write, void *bu return ret; } +static const struct ctl_table util_sysctl_table[] = { + { + .procname = "overcommit_memory", + .data = &sysctl_overcommit_memory, + .maxlen = sizeof(sysctl_overcommit_memory), + .mode = 0644, + .proc_handler = overcommit_policy_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "overcommit_ratio", + .data = &sysctl_overcommit_ratio, + .maxlen = sizeof(sysctl_overcommit_ratio), + .mode = 0644, + .proc_handler = overcommit_ratio_handler, + }, + { + .procname = "overcommit_kbytes", + .data = &sysctl_overcommit_kbytes, + .maxlen = sizeof(sysctl_overcommit_kbytes), + .mode = 0644, + .proc_handler = overcommit_kbytes_handler, + }, + { + .procname = "user_reserve_kbytes", + .data = &sysctl_user_reserve_kbytes, + .maxlen = sizeof(sysctl_user_reserve_kbytes), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "admin_reserve_kbytes", + .data = &sysctl_admin_reserve_kbytes, + .maxlen = sizeof(sysctl_admin_reserve_kbytes), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, +}; + +static int __init init_vm_util_sysctls(void) +{ + register_sysctl_init("vm", util_sysctl_table); + return 0; +} +subsys_initcall(init_vm_util_sysctls); +#endif /* CONFIG_SYSCTL */ + /* * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used */ @@ -1227,3 +1131,43 @@ void flush_dcache_folio(struct folio *folio) } EXPORT_SYMBOL(flush_dcache_folio); #endif + +/** + * compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an + * existing VMA + * @file: The file which possesss an f_op->mmap_prepare() hook + * @vma: The VMA to apply the .mmap_prepare() hook to. + * + * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain + * 'wrapper' file systems invoke a nested mmap hook of an underlying file. + * + * Until all filesystems are converted to use .mmap_prepare(), we must be + * conservative and continue to invoke these 'wrapper' filesystems using the + * deprecated .mmap() hook. + * + * However we have a problem if the underlying file system possesses an + * .mmap_prepare() hook, as we are in a different context when we invoke the + * .mmap() hook, already having a VMA to deal with. + * + * compat_vma_mmap_prepare() is a compatibility function that takes VMA state, + * establishes a struct vm_area_desc descriptor, passes to the underlying + * .mmap_prepare() hook and applies any changes performed by it. + * + * Once the conversion of filesystems is complete this function will no longer + * be required and will be removed. + * + * Returns: 0 on success or error. + */ +int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma) +{ + struct vm_area_desc desc; + int err; + + err = file->f_op->mmap_prepare(vma_to_desc(vma, &desc)); + if (err) + return err; + set_vma_from_desc(vma, &desc); + + return 0; +} +EXPORT_SYMBOL(compat_vma_mmap_prepare); @@ -17,9 +17,13 @@ struct mmap_state { unsigned long pglen; unsigned long flags; struct file *file; + pgprot_t page_prot; + + /* User-defined fields, perhaps updated by .mmap_prepare(). */ + const struct vm_operations_struct *vm_ops; + void *vm_private_data; unsigned long charged; - bool retry_merge; struct vm_area_struct *prev; struct vm_area_struct *next; @@ -35,11 +39,12 @@ struct mmap_state { .mm = mm_, \ .vmi = vmi_, \ .addr = addr_, \ - .end = (addr_) + len, \ + .end = (addr_) + (len_), \ .pgoff = pgoff_, \ .pglen = PHYS_PFN(len_), \ .flags = flags_, \ .file = file_, \ + .page_prot = vm_get_page_prot(flags_), \ } #define VMG_MMAP_STATE(name, map_, vma_) \ @@ -52,12 +57,27 @@ struct mmap_state { .pgoff = (map_)->pgoff, \ .file = (map_)->file, \ .prev = (map_)->prev, \ - .vma = vma_, \ + .middle = vma_, \ .next = (vma_) ? NULL : (map_)->next, \ .state = VMA_MERGE_START, \ - .merge_flags = VMG_FLAG_DEFAULT, \ } +/* + * If, at any point, the VMA had unCoW'd mappings from parents, it will maintain + * more than one anon_vma_chain connecting it to more than one anon_vma. A merge + * would mean a wider range of folios sharing the root anon_vma lock, and thus + * potential lock contention, we do not wish to encourage merging such that this + * scales to a problem. + */ +static bool vma_had_uncowed_parents(struct vm_area_struct *vma) +{ + /* + * The list_is_singular() test is to avoid merging VMA cloned from + * parents. This can improve scalability caused by anon_vma lock. + */ + return vma && vma->anon_vma && !list_is_singular(&vma->anon_vma_chain); +} + static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next) { struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev; @@ -83,53 +103,75 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex return true; } -static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1, - struct anon_vma *anon_vma2, struct vm_area_struct *vma) +static bool is_mergeable_anon_vma(struct vma_merge_struct *vmg, bool merge_next) { + struct vm_area_struct *tgt = merge_next ? vmg->next : vmg->prev; + struct vm_area_struct *src = vmg->middle; /* exisitng merge case. */ + struct anon_vma *tgt_anon = tgt->anon_vma; + struct anon_vma *src_anon = vmg->anon_vma; + /* - * The list_is_singular() test is to avoid merging VMA cloned from - * parents. This can improve scalability caused by anon_vma lock. + * We _can_ have !src, vmg->anon_vma via copy_vma(). In this instance we + * will remove the existing VMA's anon_vma's so there's no scalability + * concerns. */ - if ((!anon_vma1 || !anon_vma2) && (!vma || - list_is_singular(&vma->anon_vma_chain))) - return true; - return anon_vma1 == anon_vma2; -} + VM_WARN_ON(src && src_anon != src->anon_vma); -/* Are the anon_vma's belonging to each VMA compatible with one another? */ -static inline bool are_anon_vmas_compatible(struct vm_area_struct *vma1, - struct vm_area_struct *vma2) -{ - return is_mergeable_anon_vma(vma1->anon_vma, vma2->anon_vma, NULL); + /* Case 1 - we will dup_anon_vma() from src into tgt. */ + if (!tgt_anon && src_anon) + return !vma_had_uncowed_parents(src); + /* Case 2 - we will simply use tgt's anon_vma. */ + if (tgt_anon && !src_anon) + return !vma_had_uncowed_parents(tgt); + /* Case 3 - the anon_vma's are already shared. */ + return src_anon == tgt_anon; } /* * init_multi_vma_prep() - Initializer for struct vma_prepare * @vp: The vma_prepare struct * @vma: The vma that will be altered once locked - * @next: The next vma if it is to be adjusted - * @remove: The first vma to be removed - * @remove2: The second vma to be removed + * @vmg: The merge state that will be used to determine adjustment and VMA + * removal. */ static void init_multi_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma, - struct vm_area_struct *next, - struct vm_area_struct *remove, - struct vm_area_struct *remove2) + struct vma_merge_struct *vmg) { + struct vm_area_struct *adjust; + struct vm_area_struct **remove = &vp->remove; + memset(vp, 0, sizeof(struct vma_prepare)); vp->vma = vma; vp->anon_vma = vma->anon_vma; - vp->remove = remove; - vp->remove2 = remove2; - vp->adj_next = next; - if (!vp->anon_vma && next) - vp->anon_vma = next->anon_vma; + + if (vmg && vmg->__remove_middle) { + *remove = vmg->middle; + remove = &vp->remove2; + } + if (vmg && vmg->__remove_next) + *remove = vmg->next; + + if (vmg && vmg->__adjust_middle_start) + adjust = vmg->middle; + else if (vmg && vmg->__adjust_next_start) + adjust = vmg->next; + else + adjust = NULL; + + vp->adj_next = adjust; + if (!vp->anon_vma && adjust) + vp->anon_vma = adjust->anon_vma; + + VM_WARN_ON(vp->anon_vma && adjust && adjust->anon_vma && + vp->anon_vma != adjust->anon_vma); vp->file = vma->vm_file; if (vp->file) vp->mapping = vma->vm_file->f_mapping; + if (vmg && vmg->skip_vma_uprobe) + vp->skip_vma_uprobe = true; } /* @@ -150,7 +192,7 @@ static bool can_vma_merge_before(struct vma_merge_struct *vmg) pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); if (is_mergeable_vma(vmg, /* merge_next = */ true) && - is_mergeable_anon_vma(vmg->anon_vma, vmg->next->anon_vma, vmg->next)) { + is_mergeable_anon_vma(vmg, /* merge_next = */ true)) { if (vmg->next->vm_pgoff == vmg->pgoff + pglen) return true; } @@ -170,7 +212,7 @@ static bool can_vma_merge_before(struct vma_merge_struct *vmg) static bool can_vma_merge_after(struct vma_merge_struct *vmg) { if (is_mergeable_vma(vmg, /* merge_next = */ false) && - is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) { + is_mergeable_anon_vma(vmg, /* merge_next = */ false)) { if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff) return true; } @@ -203,6 +245,38 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, } /* + * vma has some anon_vma assigned, and is already inserted on that + * anon_vma's interval trees. + * + * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the + * vma must be removed from the anon_vma's interval trees using + * anon_vma_interval_tree_pre_update_vma(). + * + * After the update, the vma will be reinserted using + * anon_vma_interval_tree_post_update_vma(). + * + * The entire update must be protected by exclusive mmap_lock and by + * the root anon_vma's mutex. + */ +static void +anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc; + + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); +} + +static void +anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc; + + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); +} + +/* * vma_prepare() - Helper function for handling locking VMAs prior to altering * @vp: The initialized vma_prepare struct */ @@ -274,7 +348,7 @@ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi, * us to insert it before dropping the locks * (it may either follow vma or precede it). */ - vma_iter_store(vmi, vp->insert); + vma_iter_store_new(vmi, vp->insert); mm->map_count++; } @@ -287,15 +361,18 @@ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi, if (vp->file) { i_mmap_unlock_write(vp->mapping); - uprobe_mmap(vp->vma); - if (vp->adj_next) - uprobe_mmap(vp->adj_next); + if (!vp->skip_vma_uprobe) { + uprobe_mmap(vp->vma); + + if (vp->adj_next) + uprobe_mmap(vp->adj_next); + } } if (vp->remove) { again: - vma_mark_detached(vp->remove, true); + vma_mark_detached(vp->remove); if (vp->file) { uprobe_munmap(vp->remove, vp->remove->vm_start, vp->remove->vm_end); @@ -330,7 +407,7 @@ again: */ static void init_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma) { - init_multi_vma_prep(vp, vma, NULL, NULL, NULL); + init_multi_vma_prep(vp, vma, NULL); } /* @@ -354,8 +431,10 @@ static bool can_vma_merge_left(struct vma_merge_struct *vmg) static bool can_vma_merge_right(struct vma_merge_struct *vmg, bool can_merge_left) { - if (!vmg->next || vmg->end != vmg->next->vm_start || - !can_vma_merge_before(vmg)) + struct vm_area_struct *next = vmg->next; + struct vm_area_struct *prev; + + if (!next || vmg->end != next->vm_start || !can_vma_merge_before(vmg)) return false; if (!can_merge_left) @@ -368,23 +447,22 @@ static bool can_vma_merge_right(struct vma_merge_struct *vmg, * * We therefore check this in addition to mergeability to either side. */ - return are_anon_vmas_compatible(vmg->prev, vmg->next); + prev = vmg->prev; + return !prev->anon_vma || !next->anon_vma || + prev->anon_vma == next->anon_vma; } /* * Close a vm structure and free it. */ -void remove_vma(struct vm_area_struct *vma, bool unreachable) +void remove_vma(struct vm_area_struct *vma) { might_sleep(); vma_close(vma); if (vma->vm_file) fput(vma->vm_file); mpol_put(vma_policy(vma)); - if (unreachable) - __vm_area_free(vma); - else - vm_area_free(vma); + vm_area_free(vma); } /* @@ -398,7 +476,6 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; - lru_add_drain(); tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end, @@ -415,8 +492,9 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, * has already been checked or doesn't make sense to fail. * VMA Iterator will point to the original VMA. */ -static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, int new_below) +static __must_check int +__split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long addr, int new_below) { struct vma_prepare vp; struct vm_area_struct *new; @@ -467,7 +545,14 @@ static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, init_vma_prep(&vp, vma); vp.insert = new; vma_prepare(&vp); - vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); + + /* + * Get rid of huge pages and shared page tables straddling the split + * boundary. + */ + vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL); + if (is_vm_hugetlb_page(vma)) + hugetlb_split(vma, addr); if (new_below) { vma->vm_start = addr; @@ -511,39 +596,9 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, } /* - * vma has some anon_vma assigned, and is already inserted on that - * anon_vma's interval trees. - * - * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the - * vma must be removed from the anon_vma's interval trees using - * anon_vma_interval_tree_pre_update_vma(). - * - * After the update, the vma will be reinserted using - * anon_vma_interval_tree_post_update_vma(). + * dup_anon_vma() - Helper function to duplicate anon_vma on VMA merge in the + * instance that the destination VMA has no anon_vma but the source does. * - * The entire update must be protected by exclusive mmap_lock and by - * the root anon_vma's mutex. - */ -void -anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) -{ - struct anon_vma_chain *avc; - - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) - anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); -} - -void -anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) -{ - struct anon_vma_chain *avc; - - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) - anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); -} - -/* - * dup_anon_vma() - Helper function to duplicate anon_vma * @dst: The destination VMA * @src: The source VMA * @dup: Pointer to the destination VMA when successful. @@ -554,9 +609,18 @@ static int dup_anon_vma(struct vm_area_struct *dst, struct vm_area_struct *src, struct vm_area_struct **dup) { /* - * Easily overlooked: when mprotect shifts the boundary, make sure the - * expanding vma has anon_vma set if the shrinking vma had, to cover any - * anon pages imported. + * There are three cases to consider for correctly propagating + * anon_vma's on merge. + * + * The first is trivial - neither VMA has anon_vma, we need not do + * anything. + * + * The second where both have anon_vma is also a no-op, as they must + * then be the same, so there is simply nothing to copy. + * + * Here we cover the third - if the destination VMA has no anon_vma, + * that is it is unfaulted, we need to ensure that the newly merged + * range is referenced by the anon_vma's of the source. */ if (src->anon_vma && !dst->anon_vma) { int ret; @@ -629,49 +693,75 @@ void validate_mm(struct mm_struct *mm) } #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ -/* Actually perform the VMA merge operation. */ -static int commit_merge(struct vma_merge_struct *vmg, - struct vm_area_struct *adjust, - struct vm_area_struct *remove, - struct vm_area_struct *remove2, - long adj_start, - bool expanded) +/* + * Based on the vmg flag indicating whether we need to adjust the vm_start field + * for the middle or next VMA, we calculate what the range of the newly adjusted + * VMA ought to be, and set the VMA's range accordingly. + */ +static void vmg_adjust_set_range(struct vma_merge_struct *vmg) { - struct vma_prepare vp; + struct vm_area_struct *adjust; + pgoff_t pgoff; - init_multi_vma_prep(&vp, vmg->vma, adjust, remove, remove2); + if (vmg->__adjust_middle_start) { + adjust = vmg->middle; + pgoff = adjust->vm_pgoff + PHYS_PFN(vmg->end - adjust->vm_start); + } else if (vmg->__adjust_next_start) { + adjust = vmg->next; + pgoff = adjust->vm_pgoff - PHYS_PFN(adjust->vm_start - vmg->end); + } else { + return; + } - VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && - vp.anon_vma != adjust->anon_vma); + vma_set_range(adjust, vmg->end, adjust->vm_end, pgoff); +} - if (expanded) { - /* Note: vma iterator must be pointing to 'start'. */ - vma_iter_config(vmg->vmi, vmg->start, vmg->end); +/* + * Actually perform the VMA merge operation. + * + * IMPORTANT: We guarantee that, should vmg->give_up_on_oom is set, to not + * modify any VMAs or cause inconsistent state should an OOM condition arise. + * + * Returns 0 on success, or an error value on failure. + */ +static int commit_merge(struct vma_merge_struct *vmg) +{ + struct vm_area_struct *vma; + struct vma_prepare vp; + + if (vmg->__adjust_next_start) { + /* We manipulate middle and adjust next, which is the target. */ + vma = vmg->middle; + vma_iter_config(vmg->vmi, vmg->end, vmg->next->vm_end); } else { - vma_iter_config(vmg->vmi, adjust->vm_start + adj_start, - adjust->vm_end); + vma = vmg->target; + /* Note: vma iterator must be pointing to 'start'. */ + vma_iter_config(vmg->vmi, vmg->start, vmg->end); } - if (vma_iter_prealloc(vmg->vmi, vmg->vma)) + init_multi_vma_prep(&vp, vma, vmg); + + /* + * If vmg->give_up_on_oom is set, we're safe, because we don't actually + * manipulate any VMAs until we succeed at preallocation. + * + * Past this point, we will not return an error. + */ + if (vma_iter_prealloc(vmg->vmi, vma)) return -ENOMEM; vma_prepare(&vp); - vma_adjust_trans_huge(vmg->vma, vmg->start, vmg->end, adj_start); - vma_set_range(vmg->vma, vmg->start, vmg->end, vmg->pgoff); - - if (expanded) - vma_iter_store(vmg->vmi, vmg->vma); - - if (adj_start) { - adjust->vm_start += adj_start; - adjust->vm_pgoff += PHYS_PFN(adj_start); - if (adj_start < 0) { - WARN_ON(expanded); - vma_iter_store(vmg->vmi, adjust); - } - } + /* + * THP pages may need to do additional splits if we increase + * middle->vm_start. + */ + vma_adjust_trans_huge(vma, vmg->start, vmg->end, + vmg->__adjust_middle_start ? vmg->middle : NULL); + vma_set_range(vma, vmg->start, vmg->end, vmg->pgoff); + vmg_adjust_set_range(vmg); + vma_iter_store_overwrite(vmg->vmi, vmg->target); - vma_complete(&vp, vmg->vmi, vmg->vma->vm_mm); + vma_complete(&vp, vmg->vmi, vma->vm_mm); return 0; } @@ -694,8 +784,9 @@ static bool can_merge_remove_vma(struct vm_area_struct *vma) * identical properties. * * This function checks for the existence of any such mergeable VMAs and updates - * the maple tree describing the @vmg->vma->vm_mm address space to account for - * this, as well as any VMAs shrunk/expanded/deleted as a result of this merge. + * the maple tree describing the @vmg->middle->vm_mm address space to account + * for this, as well as any VMAs shrunk/expanded/deleted as a result of this + * merge. * * As part of this operation, if a merge occurs, the @vmg object will have its * vma, start, end, and pgoff fields modified to execute the merge. Subsequent @@ -704,43 +795,43 @@ static bool can_merge_remove_vma(struct vm_area_struct *vma) * Returns: The merged VMA if merge succeeds, or NULL otherwise. * * ASSUMPTIONS: - * - The caller must assign the VMA to be modifed to @vmg->vma. + * - The caller must assign the VMA to be modifed to @vmg->middle. * - The caller must have set @vmg->prev to the previous VMA, if there is one. * - The caller must not set @vmg->next, as we determine this. * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. - * - vmi must be positioned within [@vmg->vma->vm_start, @vmg->vma->vm_end). + * - vmi must be positioned within [@vmg->middle->vm_start, @vmg->middle->vm_end). */ -static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct *vmg) +static __must_check struct vm_area_struct *vma_merge_existing_range( + struct vma_merge_struct *vmg) { - struct vm_area_struct *vma = vmg->vma; + struct vm_area_struct *middle = vmg->middle; struct vm_area_struct *prev = vmg->prev; - struct vm_area_struct *next, *res; + struct vm_area_struct *next; struct vm_area_struct *anon_dup = NULL; - struct vm_area_struct *adjust = NULL; unsigned long start = vmg->start; unsigned long end = vmg->end; - bool left_side = vma && start == vma->vm_start; - bool right_side = vma && end == vma->vm_end; + bool left_side = middle && start == middle->vm_start; + bool right_side = middle && end == middle->vm_end; int err = 0; - long adj_start = 0; - bool merge_will_delete_vma, merge_will_delete_next; bool merge_left, merge_right, merge_both; - bool expanded; mmap_assert_write_locked(vmg->mm); - VM_WARN_ON(!vma); /* We are modifying a VMA, so caller must specify. */ - VM_WARN_ON(vmg->next); /* We set this. */ - VM_WARN_ON(prev && start <= prev->vm_start); - VM_WARN_ON(start >= end); + VM_WARN_ON_VMG(!middle, vmg); /* We are modifying a VMA, so caller must specify. */ + VM_WARN_ON_VMG(vmg->next, vmg); /* We set this. */ + VM_WARN_ON_VMG(prev && start <= prev->vm_start, vmg); + VM_WARN_ON_VMG(start >= end, vmg); + /* - * If vma == prev, then we are offset into a VMA. Otherwise, if we are + * If middle == prev, then we are offset into a VMA. Otherwise, if we are * not, we must span a portion of the VMA. */ - VM_WARN_ON(vma && ((vma != prev && vmg->start != vma->vm_start) || - vmg->end > vma->vm_end)); - /* The vmi must be positioned within vmg->vma. */ - VM_WARN_ON(vma && !(vma_iter_addr(vmg->vmi) >= vma->vm_start && - vma_iter_addr(vmg->vmi) < vma->vm_end)); + VM_WARN_ON_VMG(middle && + ((middle != prev && vmg->start != middle->vm_start) || + vmg->end > middle->vm_end), vmg); + /* The vmi must be positioned within vmg->middle. */ + VM_WARN_ON_VMG(middle && + !(vma_iter_addr(vmg->vmi) >= middle->vm_start && + vma_iter_addr(vmg->vmi) < middle->vm_end), vmg); vmg->state = VMA_MERGE_NOMERGE; @@ -774,49 +865,52 @@ static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct * merge_both = merge_left && merge_right; /* If we span the entire VMA, a merge implies it will be deleted. */ - merge_will_delete_vma = left_side && right_side; + vmg->__remove_middle = left_side && right_side; /* - * If we need to remove vma in its entirety but are unable to do so, + * If we need to remove middle in its entirety but are unable to do so, * we have no sensible recourse but to abort the merge. */ - if (merge_will_delete_vma && !can_merge_remove_vma(vma)) + if (vmg->__remove_middle && !can_merge_remove_vma(middle)) return NULL; /* * If we merge both VMAs, then next is also deleted. This implies * merge_will_delete_vma also. */ - merge_will_delete_next = merge_both; + vmg->__remove_next = merge_both; /* * If we cannot delete next, then we can reduce the operation to merging - * prev and vma (thereby deleting vma). + * prev and middle (thereby deleting middle). */ - if (merge_will_delete_next && !can_merge_remove_vma(next)) { - merge_will_delete_next = false; + if (vmg->__remove_next && !can_merge_remove_vma(next)) { + vmg->__remove_next = false; merge_right = false; merge_both = false; } - /* No matter what happens, we will be adjusting vma. */ - vma_start_write(vma); - - if (merge_left) - vma_start_write(prev); + /* No matter what happens, we will be adjusting middle. */ + vma_start_write(middle); - if (merge_right) + if (merge_right) { vma_start_write(next); + vmg->target = next; + } + + if (merge_left) { + vma_start_write(prev); + vmg->target = prev; + } if (merge_both) { /* - * |<----->| - * |-------*********-------| - * prev vma next - * extend delete delete + * |<-------------------->| + * |-------********-------| + * prev middle next + * extend delete delete */ - vmg->vma = prev; vmg->start = prev->vm_start; vmg->end = next->vm_end; vmg->pgoff = prev->vm_pgoff; @@ -824,97 +918,77 @@ static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct * /* * We already ensured anon_vma compatibility above, so now it's * simply a case of, if prev has no anon_vma object, which of - * next or vma contains the anon_vma we must duplicate. + * next or middle contains the anon_vma we must duplicate. */ - err = dup_anon_vma(prev, next->anon_vma ? next : vma, &anon_dup); + err = dup_anon_vma(prev, next->anon_vma ? next : middle, + &anon_dup); } else if (merge_left) { /* - * |<----->| OR - * |<--------->| + * |<------------>| OR + * |<----------------->| * |-------************* - * prev vma + * prev middle * extend shrink/delete */ - vmg->vma = prev; vmg->start = prev->vm_start; vmg->pgoff = prev->vm_pgoff; - if (!merge_will_delete_vma) { - adjust = vma; - adj_start = vmg->end - vma->vm_start; - } + if (!vmg->__remove_middle) + vmg->__adjust_middle_start = true; - err = dup_anon_vma(prev, vma, &anon_dup); + err = dup_anon_vma(prev, middle, &anon_dup); } else { /* merge_right */ /* - * |<----->| OR - * |<--------->| + * |<------------->| OR + * |<----------------->| * *************-------| - * vma next + * middle next * shrink/delete extend */ pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); - VM_WARN_ON(!merge_right); - /* If we are offset into a VMA, then prev must be vma. */ - VM_WARN_ON(vmg->start > vma->vm_start && prev && vma != prev); + VM_WARN_ON_VMG(!merge_right, vmg); + /* If we are offset into a VMA, then prev must be middle. */ + VM_WARN_ON_VMG(vmg->start > middle->vm_start && prev && middle != prev, vmg); - if (merge_will_delete_vma) { - vmg->vma = next; + if (vmg->__remove_middle) { vmg->end = next->vm_end; vmg->pgoff = next->vm_pgoff - pglen; } else { - /* - * We shrink vma and expand next. - * - * IMPORTANT: This is the ONLY case where the final - * merged VMA is NOT vmg->vma, but rather vmg->next. - */ - - vmg->start = vma->vm_start; + /* We shrink middle and expand next. */ + vmg->__adjust_next_start = true; + vmg->start = middle->vm_start; vmg->end = start; - vmg->pgoff = vma->vm_pgoff; - - adjust = next; - adj_start = -(vma->vm_end - start); + vmg->pgoff = middle->vm_pgoff; } - err = dup_anon_vma(next, vma, &anon_dup); + err = dup_anon_vma(next, middle, &anon_dup); } - if (err) + if (err || commit_merge(vmg)) goto abort; - /* - * In nearly all cases, we expand vmg->vma. There is one exception - - * merge_right where we partially span the VMA. In this case we shrink - * the end of vmg->vma and adjust the start of vmg->next accordingly. - */ - expanded = !merge_right || merge_will_delete_vma; - - if (commit_merge(vmg, adjust, - merge_will_delete_vma ? vma : NULL, - merge_will_delete_next ? next : NULL, - adj_start, expanded)) { - if (anon_dup) - unlink_anon_vmas(anon_dup); - - vmg->state = VMA_MERGE_ERROR_NOMEM; - return NULL; - } - - res = merge_left ? prev : next; - khugepaged_enter_vma(res, vmg->flags); - + khugepaged_enter_vma(vmg->target, vmg->flags); vmg->state = VMA_MERGE_SUCCESS; - return res; + return vmg->target; abort: vma_iter_set(vmg->vmi, start); vma_iter_load(vmg->vmi); - vmg->state = VMA_MERGE_ERROR_NOMEM; + + if (anon_dup) + unlink_anon_vmas(anon_dup); + + /* + * This means we have failed to clone anon_vma's correctly, but no + * actual changes to VMAs have occurred, so no harm no foul - if the + * user doesn't want this reported and instead just wants to give up on + * the merge, allow it. + */ + if (!vmg->give_up_on_oom) + vmg->state = VMA_MERGE_ERROR_NOMEM; return NULL; } @@ -968,12 +1042,11 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) struct vm_area_struct *next = vmg->next; unsigned long end = vmg->end; bool can_merge_left, can_merge_right; - bool just_expand = vmg->merge_flags & VMG_FLAG_JUST_EXPAND; mmap_assert_write_locked(vmg->mm); - VM_WARN_ON(vmg->vma); + VM_WARN_ON_VMG(vmg->middle, vmg); /* vmi must point at or before the gap. */ - VM_WARN_ON(vma_iter_addr(vmg->vmi) > end); + VM_WARN_ON_VMG(vma_iter_addr(vmg->vmi) > end, vmg); vmg->state = VMA_MERGE_NOMERGE; @@ -982,18 +1055,18 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) return NULL; can_merge_left = can_vma_merge_left(vmg); - can_merge_right = !just_expand && can_vma_merge_right(vmg, can_merge_left); + can_merge_right = !vmg->just_expand && can_vma_merge_right(vmg, can_merge_left); /* If we can merge with the next VMA, adjust vmg accordingly. */ if (can_merge_right) { vmg->end = next->vm_end; - vmg->vma = next; + vmg->middle = next; } /* If we can merge with the previous VMA, adjust vmg accordingly. */ if (can_merge_left) { vmg->start = prev->vm_start; - vmg->vma = prev; + vmg->middle = prev; vmg->pgoff = prev->vm_pgoff; /* @@ -1005,7 +1078,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) vmg->end = end; /* In expand-only case we are already positioned at prev. */ - if (!just_expand) { + if (!vmg->just_expand) { /* Equivalent to going to the previous range. */ vma_prev(vmg->vmi); } @@ -1015,10 +1088,10 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) * Now try to expand adjacent VMA(s). This takes care of removing the * following VMA if we have VMAs on both sides. */ - if (vmg->vma && !vma_expand(vmg)) { - khugepaged_enter_vma(vmg->vma, vmg->flags); + if (vmg->middle && !vma_expand(vmg)) { + khugepaged_enter_vma(vmg->middle, vmg->flags); vmg->state = VMA_MERGE_SUCCESS; - return vmg->vma; + return vmg->middle; } return NULL; @@ -1030,53 +1103,68 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) * @vmg: Describes a VMA expansion operation. * * Expand @vma to vmg->start and vmg->end. Can expand off the start and end. - * Will expand over vmg->next if it's different from vmg->vma and vmg->end == - * vmg->next->vm_end. Checking if the vmg->vma can expand and merge with + * Will expand over vmg->next if it's different from vmg->middle and vmg->end == + * vmg->next->vm_end. Checking if the vmg->middle can expand and merge with * vmg->next needs to be handled by the caller. * * Returns: 0 on success. * * ASSUMPTIONS: - * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock. - * - The caller must have set @vmg->vma and @vmg->next. + * - The caller must hold a WRITE lock on vmg->middle->mm->mmap_lock. + * - The caller must have set @vmg->middle and @vmg->next. */ int vma_expand(struct vma_merge_struct *vmg) { struct vm_area_struct *anon_dup = NULL; bool remove_next = false; - struct vm_area_struct *vma = vmg->vma; + struct vm_area_struct *middle = vmg->middle; struct vm_area_struct *next = vmg->next; mmap_assert_write_locked(vmg->mm); - vma_start_write(vma); - if (next && (vma != next) && (vmg->end == next->vm_end)) { + vma_start_write(middle); + if (next && (middle != next) && (vmg->end == next->vm_end)) { int ret; remove_next = true; /* This should already have been checked by this point. */ - VM_WARN_ON(!can_merge_remove_vma(next)); + VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg); vma_start_write(next); - ret = dup_anon_vma(vma, next, &anon_dup); + /* + * In this case we don't report OOM, so vmg->give_up_on_mm is + * safe. + */ + ret = dup_anon_vma(middle, next, &anon_dup); if (ret) return ret; } /* Not merging but overwriting any part of next is not handled. */ - VM_WARN_ON(next && !remove_next && - next != vma && vmg->end > next->vm_start); + VM_WARN_ON_VMG(next && !remove_next && + next != middle && vmg->end > next->vm_start, vmg); /* Only handles expanding */ - VM_WARN_ON(vma->vm_start < vmg->start || vma->vm_end > vmg->end); + VM_WARN_ON_VMG(middle->vm_start < vmg->start || + middle->vm_end > vmg->end, vmg); + + vmg->target = middle; + if (remove_next) + vmg->__remove_next = true; - if (commit_merge(vmg, NULL, remove_next ? next : NULL, NULL, 0, true)) + if (commit_merge(vmg)) goto nomem; return 0; nomem: - vmg->state = VMA_MERGE_ERROR_NOMEM; if (anon_dup) unlink_anon_vmas(anon_dup); + /* + * If the user requests that we just give upon OOM, we are safe to do so + * here, as commit merge provides this contract to us. Nothing has been + * changed - no harm no foul, just don't report it. + */ + if (!vmg->give_up_on_oom) + vmg->state = VMA_MERGE_ERROR_NOMEM; return -ENOMEM; } @@ -1108,7 +1196,7 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, init_vma_prep(&vp, vma); vma_prepare(&vp); - vma_adjust_trans_huge(vma, start, end, 0); + vma_adjust_trans_huge(vma, start, end, NULL); vma_iter_clear(vmi); vma_set_range(vma, start, end, pgoff); @@ -1130,7 +1218,6 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms, * were isolated before we downgraded mmap_lock. */ mas_set(mas_detach, 1); - lru_add_drain(); tlb_gather_mmu(&tlb, vms->vma->vm_mm); update_hiwater_rss(vms->vma->vm_mm); unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end, @@ -1198,7 +1285,7 @@ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, /* Remove and clean up vmas */ mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) - remove_vma(vma, /* unreachable = */ false); + remove_vma(vma); vm_unacct_memory(vms->nr_accounted); validate_mm(mm); @@ -1220,7 +1307,7 @@ static void reattach_vmas(struct ma_state *mas_detach) mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) - vma_mark_detached(vma, false); + vma_mark_attached(vma); __mt_destroy(mas_detach->tree); } @@ -1295,7 +1382,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, if (error) goto munmap_gather_failed; - vma_mark_detached(next, true); + vma_mark_detached(next); nrpages = vma_pages(next); vms->nr_pages += nrpages; @@ -1507,25 +1594,36 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, */ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg) { - struct vm_area_struct *vma = vmg->vma; + struct vm_area_struct *vma = vmg->middle; + unsigned long start = vmg->start; + unsigned long end = vmg->end; struct vm_area_struct *merged; /* First, try to merge. */ merged = vma_merge_existing_range(vmg); if (merged) return merged; + if (vmg_nomem(vmg)) + return ERR_PTR(-ENOMEM); + + /* + * Split can fail for reasons other than OOM, so if the user requests + * this it's probably a mistake. + */ + VM_WARN_ON(vmg->give_up_on_oom && + (vma->vm_start != start || vma->vm_end != end)); /* Split any preceding portion of the VMA. */ - if (vma->vm_start < vmg->start) { - int err = split_vma(vmg->vmi, vma, vmg->start, 1); + if (vma->vm_start < start) { + int err = split_vma(vmg->vmi, vma, start, 1); if (err) return ERR_PTR(err); } /* Split any trailing portion of the VMA. */ - if (vma->vm_end > vmg->end) { - int err = split_vma(vmg->vmi, vma, vmg->end, 0); + if (vma->vm_end > end) { + int err = split_vma(vmg->vmi, vma, end, 0); if (err) return ERR_PTR(err); @@ -1583,12 +1681,15 @@ struct vm_area_struct struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags, - struct vm_userfaultfd_ctx new_ctx) + struct vm_userfaultfd_ctx new_ctx, + bool give_up_on_oom) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); vmg.flags = new_flags; vmg.uffd_ctx = new_ctx; + if (give_up_on_oom) + vmg.give_up_on_oom = true; return vma_modify(&vmg); } @@ -1604,7 +1705,7 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, VMG_VMA_STATE(vmg, vmi, vma, vma, vma->vm_end, vma->vm_end + delta); vmg.next = vma_iter_next_rewind(vmi, NULL); - vmg.vma = NULL; /* We use the VMA to populate VMG fields only. */ + vmg.middle = NULL; /* We use the VMA to populate VMG fields only. */ return vma_merge_new_range(&vmg); } @@ -1689,7 +1790,7 @@ int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) return -ENOMEM; vma_start_write(vma); - vma_iter_store(&vmi, vma); + vma_iter_store_new(&vmi, vma); vma_link_file(vma); mm->map_count++; validate_mm(mm); @@ -1721,11 +1822,19 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, faulted_in_anon_vma = false; } + /* + * If the VMA we are copying might contain a uprobe PTE, ensure + * that we do not establish one upon merge. Otherwise, when mremap() + * moves page tables, it will orphan the newly created PTE. + */ + if (vma->vm_file) + vmg.skip_vma_uprobe = true; + new_vma = find_vma_prev(mm, addr, &vmg.prev); if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ - vmg.vma = NULL; /* New VMA range. */ + vmg.middle = NULL; /* New VMA range. */ vmg.pgoff = pgoff; vmg.next = vma_iter_next_rewind(&vmi, NULL); new_vma = vma_merge_new_range(&vmg); @@ -1772,6 +1881,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, return new_vma; out_vma_link: + fixup_hugetlb_reservations(new_vma); vma_close(new_vma); if (new_vma->vm_file) @@ -2288,6 +2398,10 @@ static int __mmap_new_file_vma(struct mmap_state *map, int error; vma->vm_file = get_file(map->file); + + if (!map->file->f_op->mmap) + return 0; + error = mmap_file(vma->vm_file, vma); if (error) { fput(vma->vm_file); @@ -2310,8 +2424,6 @@ static int __mmap_new_file_vma(struct mmap_state *map, !(map->flags & VM_MAYWRITE) && (vma->vm_flags & VM_MAYWRITE)); - /* If the flags change (and are mergeable), let's retry later. */ - map->retry_merge = vma->vm_flags != map->flags && !(vma->vm_flags & VM_SPECIAL); map->flags = vma->vm_flags; return 0; @@ -2344,7 +2456,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) vma_iter_config(vmi, map->addr, map->end); vma_set_range(vma, map->addr, map->end, map->pgoff); vm_flags_init(vma, map->flags); - vma->vm_page_prot = vm_get_page_prot(map->flags); + vma->vm_page_prot = map->page_prot; if (vma_iter_prealloc(vmi, vma)) { error = -ENOMEM; @@ -2368,7 +2480,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) /* Lock the VMA since it is modified after insertion into VMA tree */ vma_start_write(vma); - vma_iter_store(vmi, vma); + vma_iter_store_new(vmi, vma); map->mm->map_count++; vma_link_file(vma); @@ -2376,7 +2488,8 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) * vma_merge_new_range() calls khugepaged_enter_vma() too, the below * call covers the non-merge case. */ - khugepaged_enter_vma(vma, map->flags); + if (!vma_is_anonymous(vma)) + khugepaged_enter_vma(vma, map->flags); ksm_add_vma(vma); *vmap = vma; return 0; @@ -2430,17 +2543,70 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) vma_set_page_prot(vma); } -unsigned long __mmap_region(struct file *file, unsigned long addr, +/* + * Invoke the f_op->mmap_prepare() callback for a file-backed mapping that + * specifies it. + * + * This is called prior to any merge attempt, and updates whitelisted fields + * that are permitted to be updated by the caller. + * + * All but user-defined fields will be pre-populated with original values. + * + * Returns 0 on success, or an error code otherwise. + */ +static int call_mmap_prepare(struct mmap_state *map) +{ + int err; + struct vm_area_desc desc = { + .mm = map->mm, + .start = map->addr, + .end = map->end, + + .pgoff = map->pgoff, + .file = map->file, + .vm_flags = map->flags, + .page_prot = map->page_prot, + }; + + /* Invoke the hook. */ + err = __call_mmap_prepare(map->file, &desc); + if (err) + return err; + + /* Update fields permitted to be changed. */ + map->pgoff = desc.pgoff; + map->file = desc.file; + map->flags = desc.vm_flags; + map->page_prot = desc.page_prot; + /* User-defined fields. */ + map->vm_ops = desc.vm_ops; + map->vm_private_data = desc.private_data; + + return 0; +} + +static void set_vma_user_defined_fields(struct vm_area_struct *vma, + struct mmap_state *map) +{ + if (map->vm_ops) + vma->vm_ops = map->vm_ops; + vma->vm_private_data = map->vm_private_data; +} + +static unsigned long __mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; int error; + bool have_mmap_prepare = file && file->f_op->mmap_prepare; VMA_ITERATOR(vmi, mm, addr); MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file); error = __mmap_prepare(&map, uf); + if (!error && have_mmap_prepare) + error = call_mmap_prepare(&map); if (error) goto abort_munmap; @@ -2458,13 +2624,8 @@ unsigned long __mmap_region(struct file *file, unsigned long addr, goto unacct_error; } - /* If flags changed, we might be able to merge, so try again. */ - if (map.retry_merge) { - VMG_MMAP_STATE(vmg, &map, vma); - - vma_iter_config(map.vmi, map.addr, map.end); - vma_merge_existing_range(&vmg); - } + if (have_mmap_prepare) + set_vma_user_defined_fields(vma, &map); __mmap_complete(&map, vma); @@ -2478,3 +2639,518 @@ abort_munmap: vms_abort_munmap_vmas(&map.vms, &map.mas_detach); return error; } + +/** + * mmap_region() - Actually perform the userland mapping of a VMA into + * current->mm with known, aligned and overflow-checked @addr and @len, and + * correctly determined VMA flags @vm_flags and page offset @pgoff. + * + * This is an internal memory management function, and should not be used + * directly. + * + * The caller must write-lock current->mm->mmap_lock. + * + * @file: If a file-backed mapping, a pointer to the struct file describing the + * file to be mapped, otherwise NULL. + * @addr: The page-aligned address at which to perform the mapping. + * @len: The page-aligned, non-zero, length of the mapping. + * @vm_flags: The VMA flags which should be applied to the mapping. + * @pgoff: If @file is specified, the page offset into the file, if not then + * the virtual page offset in memory of the anonymous mapping. + * @uf: Optionally, a pointer to a list head used for tracking userfaultfd unmap + * events. + * + * Returns: Either an error, or the address at which the requested mapping has + * been performed. + */ +unsigned long mmap_region(struct file *file, unsigned long addr, + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf) +{ + unsigned long ret; + bool writable_file_mapping = false; + + mmap_assert_write_locked(current->mm); + + /* Check to see if MDWE is applicable. */ + if (map_deny_write_exec(vm_flags, vm_flags)) + return -EACCES; + + /* Allow architectures to sanity-check the vm_flags. */ + if (!arch_validate_flags(vm_flags)) + return -EINVAL; + + /* Map writable and ensure this isn't a sealed memfd. */ + if (file && is_shared_maywrite(vm_flags)) { + int error = mapping_map_writable(file->f_mapping); + + if (error) + return error; + writable_file_mapping = true; + } + + ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf); + + /* Clear our write mapping regardless of error. */ + if (writable_file_mapping) + mapping_unmap_writable(file->f_mapping); + + validate_mm(current->mm); + return ret; +} + +/* + * do_brk_flags() - Increase the brk vma if the flags match. + * @vmi: The vma iterator + * @addr: The start address + * @len: The length of the increase + * @vma: The vma, + * @flags: The VMA Flags + * + * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags + * do not match then create a new anonymous VMA. Eventually we may be able to + * do some brk-specific accounting here. + */ +int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long addr, unsigned long len, unsigned long flags) +{ + struct mm_struct *mm = current->mm; + + /* + * Check against address space limits by the changed size + * Note: This happens *after* clearing old mappings in some code paths. + */ + flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) + return -ENOMEM; + + if (mm->map_count > sysctl_max_map_count) + return -ENOMEM; + + if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) + return -ENOMEM; + + /* + * Expand the existing vma if possible; Note that singular lists do not + * occur after forking, so the expand will only happen on new VMAs. + */ + if (vma && vma->vm_end == addr) { + VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr)); + + vmg.prev = vma; + /* vmi is positioned at prev, which this mode expects. */ + vmg.just_expand = true; + + if (vma_merge_new_range(&vmg)) + goto out; + else if (vmg_nomem(&vmg)) + goto unacct_fail; + } + + if (vma) + vma_iter_next_range(vmi); + /* create a vma struct for an anonymous mapping */ + vma = vm_area_alloc(mm); + if (!vma) + goto unacct_fail; + + vma_set_anonymous(vma); + vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT); + vm_flags_init(vma, flags); + vma->vm_page_prot = vm_get_page_prot(flags); + vma_start_write(vma); + if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) + goto mas_store_fail; + + mm->map_count++; + validate_mm(mm); + ksm_add_vma(vma); +out: + perf_event_mmap(vma); + mm->total_vm += len >> PAGE_SHIFT; + mm->data_vm += len >> PAGE_SHIFT; + if (flags & VM_LOCKED) + mm->locked_vm += (len >> PAGE_SHIFT); + vm_flags_set(vma, VM_SOFTDIRTY); + return 0; + +mas_store_fail: + vm_area_free(vma); +unacct_fail: + vm_unacct_memory(len >> PAGE_SHIFT); + return -ENOMEM; +} + +/** + * unmapped_area() - Find an area between the low_limit and the high_limit with + * the correct alignment and offset, all from @info. Note: current->mm is used + * for the search. + * + * @info: The unmapped area information including the range [low_limit - + * high_limit), the alignment offset and mask. + * + * Return: A memory address or -ENOMEM. + */ +unsigned long unmapped_area(struct vm_unmapped_area_info *info) +{ + unsigned long length, gap; + unsigned long low_limit, high_limit; + struct vm_area_struct *tmp; + VMA_ITERATOR(vmi, current->mm, 0); + + /* Adjust search length to account for worst case alignment overhead */ + length = info->length + info->align_mask + info->start_gap; + if (length < info->length) + return -ENOMEM; + + low_limit = info->low_limit; + if (low_limit < mmap_min_addr) + low_limit = mmap_min_addr; + high_limit = info->high_limit; +retry: + if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length)) + return -ENOMEM; + + /* + * Adjust for the gap first so it doesn't interfere with the + * later alignment. The first step is the minimum needed to + * fulill the start gap, the next steps is the minimum to align + * that. It is the minimum needed to fulill both. + */ + gap = vma_iter_addr(&vmi) + info->start_gap; + gap += (info->align_offset - gap) & info->align_mask; + tmp = vma_next(&vmi); + if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ + if (vm_start_gap(tmp) < gap + length - 1) { + low_limit = tmp->vm_end; + vma_iter_reset(&vmi); + goto retry; + } + } else { + tmp = vma_prev(&vmi); + if (tmp && vm_end_gap(tmp) > gap) { + low_limit = vm_end_gap(tmp); + vma_iter_reset(&vmi); + goto retry; + } + } + + return gap; +} + +/** + * unmapped_area_topdown() - Find an area between the low_limit and the + * high_limit with the correct alignment and offset at the highest available + * address, all from @info. Note: current->mm is used for the search. + * + * @info: The unmapped area information including the range [low_limit - + * high_limit), the alignment offset and mask. + * + * Return: A memory address or -ENOMEM. + */ +unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) +{ + unsigned long length, gap, gap_end; + unsigned long low_limit, high_limit; + struct vm_area_struct *tmp; + VMA_ITERATOR(vmi, current->mm, 0); + + /* Adjust search length to account for worst case alignment overhead */ + length = info->length + info->align_mask + info->start_gap; + if (length < info->length) + return -ENOMEM; + + low_limit = info->low_limit; + if (low_limit < mmap_min_addr) + low_limit = mmap_min_addr; + high_limit = info->high_limit; +retry: + if (vma_iter_area_highest(&vmi, low_limit, high_limit, length)) + return -ENOMEM; + + gap = vma_iter_end(&vmi) - info->length; + gap -= (gap - info->align_offset) & info->align_mask; + gap_end = vma_iter_end(&vmi); + tmp = vma_next(&vmi); + if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ + if (vm_start_gap(tmp) < gap_end) { + high_limit = vm_start_gap(tmp); + vma_iter_reset(&vmi); + goto retry; + } + } else { + tmp = vma_prev(&vmi); + if (tmp && vm_end_gap(tmp) > gap) { + high_limit = tmp->vm_start; + vma_iter_reset(&vmi); + goto retry; + } + } + + return gap; +} + +/* + * Verify that the stack growth is acceptable and + * update accounting. This is shared with both the + * grow-up and grow-down cases. + */ +static int acct_stack_growth(struct vm_area_struct *vma, + unsigned long size, unsigned long grow) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long new_start; + + /* address space limit tests */ + if (!may_expand_vm(mm, vma->vm_flags, grow)) + return -ENOMEM; + + /* Stack limit test */ + if (size > rlimit(RLIMIT_STACK)) + return -ENOMEM; + + /* mlock limit tests */ + if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT)) + return -ENOMEM; + + /* Check to ensure the stack will not grow into a hugetlb-only region */ + new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : + vma->vm_end - size; + if (is_hugepage_only_range(vma->vm_mm, new_start, size)) + return -EFAULT; + + /* + * Overcommit.. This must be the final test, as it will + * update security statistics. + */ + if (security_vm_enough_memory_mm(mm, grow)) + return -ENOMEM; + + return 0; +} + +#if defined(CONFIG_STACK_GROWSUP) +/* + * PA-RISC uses this for its stack. + * vma is the last one with address > vma->vm_end. Have to extend vma. + */ +int expand_upwards(struct vm_area_struct *vma, unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *next; + unsigned long gap_addr; + int error = 0; + VMA_ITERATOR(vmi, mm, vma->vm_start); + + if (!(vma->vm_flags & VM_GROWSUP)) + return -EFAULT; + + mmap_assert_write_locked(mm); + + /* Guard against exceeding limits of the address space. */ + address &= PAGE_MASK; + if (address >= (TASK_SIZE & PAGE_MASK)) + return -ENOMEM; + address += PAGE_SIZE; + + /* Enforce stack_guard_gap */ + gap_addr = address + stack_guard_gap; + + /* Guard against overflow */ + if (gap_addr < address || gap_addr > TASK_SIZE) + gap_addr = TASK_SIZE; + + next = find_vma_intersection(mm, vma->vm_end, gap_addr); + if (next && vma_is_accessible(next)) { + if (!(next->vm_flags & VM_GROWSUP)) + return -ENOMEM; + /* Check that both stack segments have the same anon_vma? */ + } + + if (next) + vma_iter_prev_range_limit(&vmi, address); + + vma_iter_config(&vmi, vma->vm_start, address); + if (vma_iter_prealloc(&vmi, vma)) + return -ENOMEM; + + /* We must make sure the anon_vma is allocated. */ + if (unlikely(anon_vma_prepare(vma))) { + vma_iter_free(&vmi); + return -ENOMEM; + } + + /* Lock the VMA before expanding to prevent concurrent page faults */ + vma_start_write(vma); + /* We update the anon VMA tree. */ + anon_vma_lock_write(vma->anon_vma); + + /* Somebody else might have raced and expanded it already */ + if (address > vma->vm_end) { + unsigned long size, grow; + + size = address - vma->vm_start; + grow = (address - vma->vm_end) >> PAGE_SHIFT; + + error = -ENOMEM; + if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { + error = acct_stack_growth(vma, size, grow); + if (!error) { + if (vma->vm_flags & VM_LOCKED) + mm->locked_vm += grow; + vm_stat_account(mm, vma->vm_flags, grow); + anon_vma_interval_tree_pre_update_vma(vma); + vma->vm_end = address; + /* Overwrite old entry in mtree. */ + vma_iter_store_overwrite(&vmi, vma); + anon_vma_interval_tree_post_update_vma(vma); + + perf_event_mmap(vma); + } + } + } + anon_vma_unlock_write(vma->anon_vma); + vma_iter_free(&vmi); + validate_mm(mm); + return error; +} +#endif /* CONFIG_STACK_GROWSUP */ + +/* + * vma is the first one with address < vma->vm_start. Have to extend vma. + * mmap_lock held for writing. + */ +int expand_downwards(struct vm_area_struct *vma, unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *prev; + int error = 0; + VMA_ITERATOR(vmi, mm, vma->vm_start); + + if (!(vma->vm_flags & VM_GROWSDOWN)) + return -EFAULT; + + mmap_assert_write_locked(mm); + + address &= PAGE_MASK; + if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) + return -EPERM; + + /* Enforce stack_guard_gap */ + prev = vma_prev(&vmi); + /* Check that both stack segments have the same anon_vma? */ + if (prev) { + if (!(prev->vm_flags & VM_GROWSDOWN) && + vma_is_accessible(prev) && + (address - prev->vm_end < stack_guard_gap)) + return -ENOMEM; + } + + if (prev) + vma_iter_next_range_limit(&vmi, vma->vm_start); + + vma_iter_config(&vmi, address, vma->vm_end); + if (vma_iter_prealloc(&vmi, vma)) + return -ENOMEM; + + /* We must make sure the anon_vma is allocated. */ + if (unlikely(anon_vma_prepare(vma))) { + vma_iter_free(&vmi); + return -ENOMEM; + } + + /* Lock the VMA before expanding to prevent concurrent page faults */ + vma_start_write(vma); + /* We update the anon VMA tree. */ + anon_vma_lock_write(vma->anon_vma); + + /* Somebody else might have raced and expanded it already */ + if (address < vma->vm_start) { + unsigned long size, grow; + + size = vma->vm_end - address; + grow = (vma->vm_start - address) >> PAGE_SHIFT; + + error = -ENOMEM; + if (grow <= vma->vm_pgoff) { + error = acct_stack_growth(vma, size, grow); + if (!error) { + if (vma->vm_flags & VM_LOCKED) + mm->locked_vm += grow; + vm_stat_account(mm, vma->vm_flags, grow); + anon_vma_interval_tree_pre_update_vma(vma); + vma->vm_start = address; + vma->vm_pgoff -= grow; + /* Overwrite old entry in mtree. */ + vma_iter_store_overwrite(&vmi, vma); + anon_vma_interval_tree_post_update_vma(vma); + + perf_event_mmap(vma); + } + } + } + anon_vma_unlock_write(vma->anon_vma); + vma_iter_free(&vmi); + validate_mm(mm); + return error; +} + +int __vm_munmap(unsigned long start, size_t len, bool unlock) +{ + int ret; + struct mm_struct *mm = current->mm; + LIST_HEAD(uf); + VMA_ITERATOR(vmi, mm, start); + + if (mmap_write_lock_killable(mm)) + return -EINTR; + + ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock); + if (ret || !unlock) + mmap_write_unlock(mm); + + userfaultfd_unmap_complete(mm, &uf); + return ret; +} + +/* Insert vm structure into process list sorted by address + * and into the inode's i_mmap tree. If vm_file is non-NULL + * then i_mmap_rwsem is taken here. + */ +int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) +{ + unsigned long charged = vma_pages(vma); + + + if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) + return -ENOMEM; + + if ((vma->vm_flags & VM_ACCOUNT) && + security_vm_enough_memory_mm(mm, charged)) + return -ENOMEM; + + /* + * The vm_pgoff of a purely anonymous vma should be irrelevant + * until its first write fault, when page's anon_vma and index + * are set. But now set the vm_pgoff it will almost certainly + * end up with (unless mremap moves it elsewhere before that + * first wfault), so /proc/pid/maps tells a consistent story. + * + * By setting it to reflect the virtual start address of the + * vma, merges and splits can happen in a seamless way, just + * using the existing file pgoff checks and manipulations. + * Similarly in do_mmap and in do_brk_flags. + */ + if (vma_is_anonymous(vma)) { + BUG_ON(vma->anon_vma); + vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; + } + + if (vma_link(mm, vma)) { + if (vma->vm_flags & VM_ACCOUNT) + vm_unacct_memory(charged); + return -ENOMEM; + } + + return 0; +} @@ -19,6 +19,8 @@ struct vma_prepare { struct vm_area_struct *insert; struct vm_area_struct *remove; struct vm_area_struct *remove2; + + bool skip_vma_uprobe :1; }; struct unlink_vma_file_batch { @@ -58,35 +60,96 @@ enum vma_merge_state { VMA_MERGE_SUCCESS, }; -enum vma_merge_flags { - VMG_FLAG_DEFAULT = 0, - /* - * If we can expand, simply do so. We know there is nothing to merge to - * the right. Does not reset state upon failure to merge. The VMA - * iterator is assumed to be positioned at the previous VMA, rather than - * at the gap. - */ - VMG_FLAG_JUST_EXPAND = 1 << 0, -}; - -/* Represents a VMA merge operation. */ +/* + * Describes a VMA merge operation and is threaded throughout it. + * + * Any of the fields may be mutated by the merge operation, so no guarantees are + * made to the contents of this structure after a merge operation has completed. + */ struct vma_merge_struct { struct mm_struct *mm; struct vma_iterator *vmi; - pgoff_t pgoff; + /* + * Adjacent VMAs, any of which may be NULL if not present: + * + * |------|--------|------| + * | prev | middle | next | + * |------|--------|------| + * + * middle may not yet exist in the case of a proposed new VMA being + * merged, or it may be an existing VMA. + * + * next may be assigned by the caller. + */ struct vm_area_struct *prev; - struct vm_area_struct *next; /* Modified by vma_merge(). */ - struct vm_area_struct *vma; /* Either a new VMA or the one being modified. */ + struct vm_area_struct *middle; + struct vm_area_struct *next; + /* This is the VMA we ultimately target to become the merged VMA. */ + struct vm_area_struct *target; + /* + * Initially, the start, end, pgoff fields are provided by the caller + * and describe the proposed new VMA range, whether modifying an + * existing VMA (which will be 'middle'), or adding a new one. + * + * During the merge process these fields are updated to describe the new + * range _including those VMAs which will be merged_. + */ unsigned long start; unsigned long end; + pgoff_t pgoff; + unsigned long flags; struct file *file; struct anon_vma *anon_vma; struct mempolicy *policy; struct vm_userfaultfd_ctx uffd_ctx; struct anon_vma_name *anon_name; - enum vma_merge_flags merge_flags; enum vma_merge_state state; + + /* Flags which callers can use to modify merge behaviour: */ + + /* + * If we can expand, simply do so. We know there is nothing to merge to + * the right. Does not reset state upon failure to merge. The VMA + * iterator is assumed to be positioned at the previous VMA, rather than + * at the gap. + */ + bool just_expand :1; + + /* + * If a merge is possible, but an OOM error occurs, give up and don't + * execute the merge, returning NULL. + */ + bool give_up_on_oom :1; + + /* + * If set, skip uprobe_mmap upon merged vma. + */ + bool skip_vma_uprobe :1; + + /* Internal flags set during merge process: */ + + /* + * Internal flag indicating the merge increases vmg->middle->vm_start + * (and thereby, vmg->prev->vm_end). + */ + bool __adjust_middle_start :1; + /* + * Internal flag indicating the merge decreases vmg->next->vm_start + * (and thereby, vmg->middle->vm_end). + */ + bool __adjust_next_start :1; + /* + * Internal flag used during the merge operation to indicate we will + * remove vmg->middle. + */ + bool __remove_middle :1; + /* + * Internal flag used during the merge operationr to indicate we will + * remove vmg->next. + */ + bool __remove_next :1; + }; static inline bool vmg_nomem(struct vma_merge_struct *vmg) @@ -110,7 +173,6 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, .flags = flags_, \ .pgoff = pgoff_, \ .state = VMA_MERGE_START, \ - .merge_flags = VMG_FLAG_DEFAULT, \ } #define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_) \ @@ -118,8 +180,8 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, .mm = vma_->vm_mm, \ .vmi = vmi_, \ .prev = prev_, \ + .middle = vma_, \ .next = NULL, \ - .vma = vma_, \ .start = start_, \ .end = end_, \ .flags = vma_->vm_flags, \ @@ -130,7 +192,6 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, .uffd_ctx = vma_->vm_userfaultfd_ctx, \ .anon_name = anon_vma_name(vma_), \ .state = VMA_MERGE_START, \ - .merge_flags = VMG_FLAG_DEFAULT, \ } #ifdef CONFIG_DEBUG_VM_MAPLE_TREE @@ -139,15 +200,10 @@ void validate_mm(struct mm_struct *mm); #define validate_mm(mm) do { } while (0) #endif -/* Required for expand_downwards(). */ -void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma); - -/* Required for expand_downwards(). */ -void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma); - -int vma_expand(struct vma_merge_struct *vmg); -int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff); +__must_check int vma_expand(struct vma_merge_struct *vmg); +__must_check int vma_shrink(struct vma_iterator *vmi, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff); static inline int vma_iter_store_gfp(struct vma_iterator *vmi, struct vm_area_struct *vma, gfp_t gfp) @@ -162,9 +218,57 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi, if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; + vma_mark_attached(vma); return 0; } + +/* + * Temporary helper functions for file systems which wrap an invocation of + * f_op->mmap() but which might have an underlying file system which implements + * f_op->mmap_prepare(). + */ + +static inline struct vm_area_desc *vma_to_desc(struct vm_area_struct *vma, + struct vm_area_desc *desc) +{ + desc->mm = vma->vm_mm; + desc->start = vma->vm_start; + desc->end = vma->vm_end; + + desc->pgoff = vma->vm_pgoff; + desc->file = vma->vm_file; + desc->vm_flags = vma->vm_flags; + desc->page_prot = vma->vm_page_prot; + + desc->vm_ops = NULL; + desc->private_data = NULL; + + return desc; +} + +static inline void set_vma_from_desc(struct vm_area_struct *vma, + struct vm_area_desc *desc) +{ + /* + * Since we're invoking .mmap_prepare() despite having a partially + * established VMA, we must take care to handle setting fields + * correctly. + */ + + /* Mutable fields. Populated with initial state. */ + vma->vm_pgoff = desc->pgoff; + if (vma->vm_file != desc->file) + vma_set_file(vma, desc->file); + if (vma->vm_flags != desc->vm_flags) + vm_flags_set(vma, desc->vm_flags); + vma->vm_page_prot = desc->page_prot; + + /* User-defined fields. */ + vma->vm_ops = desc->vm_ops; + vma->vm_private_data = desc->private_data; +} + int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, @@ -174,19 +278,20 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock); -void remove_vma(struct vm_area_struct *vma, bool unreachable); +void remove_vma(struct vm_area_struct *vma); void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next); /* We are about to modify the VMA's flags. */ -struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, +__must_check struct vm_area_struct +*vma_modify_flags(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags); /* We are about to modify the VMA's flags and/or anon_name. */ -struct vm_area_struct +__must_check struct vm_area_struct *vma_modify_flags_name(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, @@ -196,7 +301,7 @@ struct vm_area_struct struct anon_vma_name *new_name); /* We are about to modify the VMA's memory policy. */ -struct vm_area_struct +__must_check struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, @@ -204,19 +309,22 @@ struct vm_area_struct struct mempolicy *new_pol); /* We are about to modify the VMA's flags and/or uffd context. */ -struct vm_area_struct +__must_check struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags, - struct vm_userfaultfd_ctx new_ctx); + struct vm_userfaultfd_ctx new_ctx, + bool give_up_on_oom); -struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg); +__must_check struct vm_area_struct +*vma_merge_new_range(struct vma_merge_struct *vmg); -struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, - struct vm_area_struct *vma, - unsigned long delta); +__must_check struct vm_area_struct +*vma_merge_extend(struct vma_iterator *vmi, + struct vm_area_struct *vma, + unsigned long delta); void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb); @@ -243,10 +351,16 @@ bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); int mm_take_all_locks(struct mm_struct *mm); void mm_drop_all_locks(struct mm_struct *mm); -unsigned long __mmap_region(struct file *file, unsigned long addr, +unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf); +int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, + unsigned long addr, unsigned long request, unsigned long flags); + +unsigned long unmapped_area(struct vm_unmapped_area_info *info); +unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info); + static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) { /* @@ -360,9 +474,10 @@ static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) } /* Store a VMA with preallocated memory */ -static inline void vma_iter_store(struct vma_iterator *vmi, - struct vm_area_struct *vma) +static inline void vma_iter_store_overwrite(struct vma_iterator *vmi, + struct vm_area_struct *vma) { + vma_assert_attached(vma); #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && @@ -387,6 +502,13 @@ static inline void vma_iter_store(struct vma_iterator *vmi, mas_store_prealloc(&vmi->mas, vma); } +static inline void vma_iter_store_new(struct vma_iterator *vmi, + struct vm_area_struct *vma) +{ + vma_mark_attached(vma); + vma_iter_store_overwrite(vmi, vma); +} + static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) { return vmi->mas.index; @@ -472,4 +594,27 @@ static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) #endif +#if defined(CONFIG_STACK_GROWSUP) +int expand_upwards(struct vm_area_struct *vma, unsigned long address); +#endif + +int expand_downwards(struct vm_area_struct *vma, unsigned long address); + +int __vm_munmap(unsigned long start, size_t len, bool unlock); + +int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma); + +/* vma_init.h, shared between CONFIG_MMU and nommu. */ +void __init vma_state_init(void); +struct vm_area_struct *vm_area_alloc(struct mm_struct *mm); +struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig); +void vm_area_free(struct vm_area_struct *vma); + +/* vma_exec.c */ +#ifdef CONFIG_MMU +int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, + unsigned long *top_mem_p); +int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); +#endif + #endif /* __MM_VMA_H */ diff --git a/mm/vma_exec.c b/mm/vma_exec.c new file mode 100644 index 000000000000..2dffb02ed6a2 --- /dev/null +++ b/mm/vma_exec.c @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Functions explicitly implemented for exec functionality which however are + * explicitly VMA-only logic. + */ + +#include "vma_internal.h" +#include "vma.h" + +/* + * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between + * this VMA and its relocated range, which will now reside at [vma->vm_start - + * shift, vma->vm_end - shift). + * + * This function is almost certainly NOT what you want for anything other than + * early executable temporary stack relocation. + */ +int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) +{ + /* + * The process proceeds as follows: + * + * 1) Use shift to calculate the new vma endpoints. + * 2) Extend vma to cover both the old and new ranges. This ensures the + * arguments passed to subsequent functions are consistent. + * 3) Move vma's page tables to the new range. + * 4) Free up any cleared pgd range. + * 5) Shrink the vma to cover only the new range. + */ + + struct mm_struct *mm = vma->vm_mm; + unsigned long old_start = vma->vm_start; + unsigned long old_end = vma->vm_end; + unsigned long length = old_end - old_start; + unsigned long new_start = old_start - shift; + unsigned long new_end = old_end - shift; + VMA_ITERATOR(vmi, mm, new_start); + VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff); + struct vm_area_struct *next; + struct mmu_gather tlb; + PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length); + + BUG_ON(new_start > new_end); + + /* + * ensure there are no vmas between where we want to go + * and where we are + */ + if (vma != vma_next(&vmi)) + return -EFAULT; + + vma_iter_prev_range(&vmi); + /* + * cover the whole range: [new_start, old_end) + */ + vmg.middle = vma; + if (vma_expand(&vmg)) + return -ENOMEM; + + /* + * move the page tables downwards, on failure we rely on + * process cleanup to remove whatever mess we made. + */ + pmc.for_stack = true; + if (length != move_page_tables(&pmc)) + return -ENOMEM; + + tlb_gather_mmu(&tlb, mm); + next = vma_next(&vmi); + if (new_end > old_start) { + /* + * when the old and new regions overlap clear from new_end. + */ + free_pgd_range(&tlb, new_end, old_end, new_end, + next ? next->vm_start : USER_PGTABLES_CEILING); + } else { + /* + * otherwise, clean from old_start; this is done to not touch + * the address space in [new_end, old_start) some architectures + * have constraints on va-space that make this illegal (IA64) - + * for the others its just a little faster. + */ + free_pgd_range(&tlb, old_start, old_end, new_end, + next ? next->vm_start : USER_PGTABLES_CEILING); + } + tlb_finish_mmu(&tlb); + + vma_prev(&vmi); + /* Shrink the vma to just the new range */ + return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); +} + +/* + * Establish the stack VMA in an execve'd process, located temporarily at the + * maximum stack address provided by the architecture. + * + * We later relocate this downwards in relocate_vma_down(). + * + * This function is almost certainly NOT what you want for anything other than + * early executable initialisation. + * + * On success, returns 0 and sets *vmap to the stack VMA and *top_mem_p to the + * maximum addressable location in the stack (that is capable of storing a + * system word of data). + */ +int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, + unsigned long *top_mem_p) +{ + int err; + struct vm_area_struct *vma = vm_area_alloc(mm); + + if (!vma) + return -ENOMEM; + + vma_set_anonymous(vma); + + if (mmap_write_lock_killable(mm)) { + err = -EINTR; + goto err_free; + } + + /* + * Need to be called with mmap write lock + * held, to avoid race with ksmd. + */ + err = ksm_execve(mm); + if (err) + goto err_ksm; + + /* + * Place the stack at the largest stack address the architecture + * supports. Later, we'll move this to an appropriate place. We don't + * use STACK_TOP because that can depend on attributes which aren't + * configured yet. + */ + BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); + vma->vm_end = STACK_TOP_MAX; + vma->vm_start = vma->vm_end - PAGE_SIZE; + vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP); + vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); + + err = insert_vm_struct(mm, vma); + if (err) + goto err; + + mm->stack_vm = mm->total_vm = 1; + mmap_write_unlock(mm); + *vmap = vma; + *top_mem_p = vma->vm_end - sizeof(void *); + return 0; + +err: + ksm_exit(mm); +err_ksm: + mmap_write_unlock(mm); +err_free: + *vmap = NULL; + vm_area_free(vma); + return err; +} diff --git a/mm/vma_init.c b/mm/vma_init.c new file mode 100644 index 000000000000..8e53c7943561 --- /dev/null +++ b/mm/vma_init.c @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* + * Functions for initialisaing, allocating, freeing and duplicating VMAs. Shared + * between CONFIG_MMU and non-CONFIG_MMU kernel configurations. + */ + +#include "vma_internal.h" +#include "vma.h" + +/* SLAB cache for vm_area_struct structures */ +static struct kmem_cache *vm_area_cachep; + +void __init vma_state_init(void) +{ + struct kmem_cache_args args = { + .use_freeptr_offset = true, + .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr), + }; + + vm_area_cachep = kmem_cache_create("vm_area_struct", + sizeof(struct vm_area_struct), &args, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| + SLAB_ACCOUNT); +} + +struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + if (!vma) + return NULL; + + vma_init(vma, mm); + + return vma; +} + +static void vm_area_init_from(const struct vm_area_struct *src, + struct vm_area_struct *dest) +{ + dest->vm_mm = src->vm_mm; + dest->vm_ops = src->vm_ops; + dest->vm_start = src->vm_start; + dest->vm_end = src->vm_end; + dest->anon_vma = src->anon_vma; + dest->vm_pgoff = src->vm_pgoff; + dest->vm_file = src->vm_file; + dest->vm_private_data = src->vm_private_data; + vm_flags_init(dest, src->vm_flags); + memcpy(&dest->vm_page_prot, &src->vm_page_prot, + sizeof(dest->vm_page_prot)); + /* + * src->shared.rb may be modified concurrently when called from + * dup_mmap(), but the clone will reinitialize it. + */ + data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared))); + memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx, + sizeof(dest->vm_userfaultfd_ctx)); +#ifdef CONFIG_ANON_VMA_NAME + dest->anon_name = src->anon_name; +#endif +#ifdef CONFIG_SWAP + memcpy(&dest->swap_readahead_info, &src->swap_readahead_info, + sizeof(dest->swap_readahead_info)); +#endif +#ifndef CONFIG_MMU + dest->vm_region = src->vm_region; +#endif +#ifdef CONFIG_NUMA + dest->vm_policy = src->vm_policy; +#endif +#ifdef __HAVE_PFNMAP_TRACKING + dest->pfnmap_track_ctx = NULL; +#endif +} + +#ifdef __HAVE_PFNMAP_TRACKING +static inline int vma_pfnmap_track_ctx_dup(struct vm_area_struct *orig, + struct vm_area_struct *new) +{ + struct pfnmap_track_ctx *ctx = orig->pfnmap_track_ctx; + + if (likely(!ctx)) + return 0; + + /* + * We don't expect to ever hit this. If ever required, we would have + * to duplicate the tracking. + */ + if (unlikely(kref_read(&ctx->kref) >= REFCOUNT_MAX)) + return -ENOMEM; + kref_get(&ctx->kref); + new->pfnmap_track_ctx = ctx; + return 0; +} + +static inline void vma_pfnmap_track_ctx_release(struct vm_area_struct *vma) +{ + struct pfnmap_track_ctx *ctx = vma->pfnmap_track_ctx; + + if (likely(!ctx)) + return; + + kref_put(&ctx->kref, pfnmap_track_ctx_release); + vma->pfnmap_track_ctx = NULL; +} +#else +static inline int vma_pfnmap_track_ctx_dup(struct vm_area_struct *orig, + struct vm_area_struct *new) +{ + return 0; +} +static inline void vma_pfnmap_track_ctx_release(struct vm_area_struct *vma) +{ +} +#endif + +struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) +{ + struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + + if (!new) + return NULL; + + ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); + ASSERT_EXCLUSIVE_WRITER(orig->vm_file); + vm_area_init_from(orig, new); + + if (vma_pfnmap_track_ctx_dup(orig, new)) { + kmem_cache_free(vm_area_cachep, new); + return NULL; + } + vma_lock_init(new, true); + INIT_LIST_HEAD(&new->anon_vma_chain); + vma_numab_state_init(new); + dup_anon_vma_name(orig, new); + + return new; +} + +void vm_area_free(struct vm_area_struct *vma) +{ + /* The vma should be detached while being destroyed. */ + vma_assert_detached(vma); + vma_numab_state_free(vma); + free_anon_vma_name(vma); + vma_pfnmap_track_ctx_release(vma); + kmem_cache_free(vm_area_cachep, vma); +} diff --git a/mm/vma_internal.h b/mm/vma_internal.h index fc5f172a36bd..2f05735ff190 100644 --- a/mm/vma_internal.h +++ b/mm/vma_internal.h @@ -35,6 +35,7 @@ #include <linux/mutex.h> #include <linux/pagemap.h> #include <linux/perf_event.h> +#include <linux/personality.h> #include <linux/pfn.h> #include <linux/rcupdate.h> #include <linux/rmap.h> diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7ed39d104201..ab986dd09b6a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -104,6 +104,9 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) return -ENOMEM; + + arch_enter_lazy_mmu_mode(); + do { if (unlikely(!pte_none(ptep_get(pte)))) { if (pfn_valid(pfn)) { @@ -127,6 +130,8 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); pfn++; } while (pte += PFN_DOWN(size), addr += size, addr != end); + + arch_leave_lazy_mmu_mode(); *mask |= PGTBL_PTE_MODIFIED; return 0; } @@ -350,12 +355,30 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) { pte_t *pte; + pte_t ptent; + unsigned long size = PAGE_SIZE; pte = pte_offset_kernel(pmd, addr); + arch_enter_lazy_mmu_mode(); + do { - pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); +#ifdef CONFIG_HUGETLB_PAGE + size = arch_vmap_pte_range_unmap_size(addr, pte); + if (size != PAGE_SIZE) { + if (WARN_ON(!IS_ALIGNED(addr, size))) { + addr = ALIGN_DOWN(addr, size); + pte = PTR_ALIGN_DOWN(pte, sizeof(*pte) * (size >> PAGE_SHIFT)); + } + ptent = huge_ptep_get_and_clear(&init_mm, addr, pte, size); + if (WARN_ON(end - addr < size)) + size = end - addr; + } else +#endif + ptent = ptep_get_and_clear(&init_mm, addr, pte); WARN_ON(!pte_none(ptent) && !pte_present(ptent)); - } while (pte++, addr += PAGE_SIZE, addr != end); + } while (pte += (size >> PAGE_SHIFT), addr += size, addr != end); + + arch_leave_lazy_mmu_mode(); *mask |= PGTBL_PTE_MODIFIED; } @@ -374,8 +397,10 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, if (cleared || pmd_bad(*pmd)) *mask |= PGTBL_PMD_MODIFIED; - if (cleared) + if (cleared) { + WARN_ON(next - addr < PMD_SIZE); continue; + } if (pmd_none_or_clear_bad(pmd)) continue; vunmap_pte_range(pmd, addr, next, mask); @@ -399,8 +424,10 @@ static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, if (cleared || pud_bad(*pud)) *mask |= PGTBL_PUD_MODIFIED; - if (cleared) + if (cleared) { + WARN_ON(next - addr < PUD_SIZE); continue; + } if (pud_none_or_clear_bad(pud)) continue; vunmap_pmd_range(pud, addr, next, mask); @@ -497,6 +524,9 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) return -ENOMEM; + + arch_enter_lazy_mmu_mode(); + do { struct page *page = pages[*nr]; @@ -510,6 +540,8 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); (*nr)++; } while (pte++, addr += PAGE_SIZE, addr != end); + + arch_leave_lazy_mmu_mode(); *mask |= PGTBL_PTE_MODIFIED; return 0; } @@ -586,13 +618,13 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, mask |= PGTBL_PGD_MODIFIED; err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); if (err) - return err; + break; } while (pgd++, addr = next, addr != end); if (mask & ARCH_PAGE_TABLE_SYNC_MASK) arch_sync_kernel_mappings(start, end); - return 0; + return err; } /* @@ -900,6 +932,11 @@ static struct vmap_node *vmap_nodes = &single; static __read_mostly unsigned int nr_vmap_nodes = 1; static __read_mostly unsigned int vmap_zone_size = 1; +/* A simple iterator over all vmap-nodes. */ +#define for_each_vmap_node(vn) \ + for ((vn) = &vmap_nodes[0]; \ + (vn) < &vmap_nodes[nr_vmap_nodes]; (vn)++) + static inline unsigned int addr_to_node_id(unsigned long addr) { @@ -918,6 +955,19 @@ id_to_node(unsigned int id) return &vmap_nodes[id % nr_vmap_nodes]; } +static inline unsigned int +node_to_id(struct vmap_node *node) +{ + /* Pointer arithmetic. */ + unsigned int id = node - vmap_nodes; + + if (likely(id < nr_vmap_nodes)) + return id; + + WARN_ONCE(1, "An address 0x%p is out-of-bounds.\n", node); + return 0; +} + /* * We use the value 0 to represent "no node", that is why * an encoded value will be the node-id incremented by 1. @@ -990,7 +1040,8 @@ static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); static void drain_vmap_area_work(struct work_struct *work); static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work); -static atomic_long_t nr_vmalloc_pages; +static __cacheline_aligned_in_smp atomic_long_t nr_vmalloc_pages; +static __cacheline_aligned_in_smp atomic_long_t vmap_lazy_nr; unsigned long vmalloc_nr_pages(void) { @@ -1056,12 +1107,11 @@ find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va) { unsigned long va_start_lowest; struct vmap_node *vn; - int i; repeat: - for (i = 0, va_start_lowest = 0; i < nr_vmap_nodes; i++) { - vn = &vmap_nodes[i]; + va_start_lowest = 0; + for_each_vmap_node(vn) { spin_lock(&vn->busy.lock); *va = __find_vmap_area_exceed_addr(addr, &vn->busy.root); @@ -1698,7 +1748,7 @@ va_clip(struct rb_root *root, struct list_head *head, */ lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); if (!lva) - return -1; + return -ENOMEM; } /* @@ -1712,7 +1762,7 @@ va_clip(struct rb_root *root, struct list_head *head, */ va->va_start = nva_start_addr + size; } else { - return -1; + return -EINVAL; } if (type != FL_FIT_TYPE) { @@ -1741,19 +1791,19 @@ va_alloc(struct vmap_area *va, /* Check the "vend" restriction. */ if (nva_start_addr + size > vend) - return vend; + return -ERANGE; /* Update the free vmap_area. */ ret = va_clip(root, head, va, nva_start_addr, size); if (WARN_ON_ONCE(ret)) - return vend; + return ret; return nva_start_addr; } /* * Returns a start address of the newly allocated area, if success. - * Otherwise a vend is returned that indicates failure. + * Otherwise an error value is returned that indicates failure. */ static __always_inline unsigned long __alloc_vmap_area(struct rb_root *root, struct list_head *head, @@ -1778,14 +1828,13 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head, va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size); if (unlikely(!va)) - return vend; + return -ENOENT; nva_start_addr = va_alloc(va, root, head, size, align, vstart, vend); - if (nva_start_addr == vend) - return vend; #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK - find_vmap_lowest_match_check(root, head, size, align); + if (!IS_ERR_VALUE(nva_start_addr)) + find_vmap_lowest_match_check(root, head, size, align); #endif return nva_start_addr; @@ -1915,7 +1964,7 @@ node_alloc(unsigned long size, unsigned long align, struct vmap_area *va; *vn_id = 0; - *addr = vend; + *addr = -EINVAL; /* * Fallback to a global heap if not vmalloc or there @@ -1940,7 +1989,7 @@ static inline void setup_vmalloc_vm(struct vm_struct *vm, { vm->flags = flags; vm->addr = (void *)va->va_start; - vm->size = va_size(va); + vm->size = vm->requested_size = va_size(va); vm->caller = caller; va->vm = vm; } @@ -1995,20 +2044,20 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, } retry: - if (addr == vend) { + if (IS_ERR_VALUE(addr)) { preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node); addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list, size, align, vstart, vend); spin_unlock(&free_vmap_area_lock); } - trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend); + trace_alloc_vmap_area(addr, size, align, vstart, vend, IS_ERR_VALUE(addr)); /* - * If an allocation fails, the "vend" address is + * If an allocation fails, the error value is * returned. Therefore trigger the overflow path. */ - if (unlikely(addr == vend)) + if (IS_ERR_VALUE(addr)) goto overflow; va->va_start = addr; @@ -2100,8 +2149,6 @@ static unsigned long lazy_max_pages(void) return log * (32UL * 1024 * 1024 / PAGE_SIZE); } -static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0); - /* * Serialize vmap purging. There is no actual critical section protected * by this lock, but we want to avoid concurrent calls for performance @@ -2111,7 +2158,6 @@ static DEFINE_MUTEX(vmap_purge_lock); /* for per-CPU blocks */ static void purge_fragmented_blocks_allcpus(void); -static cpumask_t purge_nodes; static void reclaim_list_global(struct list_head *head) @@ -2134,7 +2180,7 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay) LIST_HEAD(decay_list); struct rb_root decay_root = RB_ROOT; struct vmap_area *va, *nva; - unsigned long n_decay; + unsigned long n_decay, pool_len; int i; for (i = 0; i < MAX_VA_SIZE_PAGES; i++) { @@ -2148,22 +2194,20 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay) list_replace_init(&vn->pool[i].head, &tmp_list); spin_unlock(&vn->pool_lock); - if (full_decay) - WRITE_ONCE(vn->pool[i].len, 0); + pool_len = n_decay = vn->pool[i].len; + WRITE_ONCE(vn->pool[i].len, 0); /* Decay a pool by ~25% out of left objects. */ - n_decay = vn->pool[i].len >> 2; + if (!full_decay) + n_decay >>= 2; + pool_len -= n_decay; list_for_each_entry_safe(va, nva, &tmp_list, list) { + if (!n_decay--) + break; + list_del_init(&va->list); merge_or_add_vmap_area(va, &decay_root, &decay_list); - - if (!full_decay) { - WRITE_ONCE(vn->pool[i].len, vn->pool[i].len - 1); - - if (!--n_decay) - break; - } } /* @@ -2172,9 +2216,10 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay) * can populate the pool therefore a simple list replace * operation takes place here. */ - if (!full_decay && !list_empty(&tmp_list)) { + if (!list_empty(&tmp_list)) { spin_lock(&vn->pool_lock); list_replace_init(&tmp_list, &vn->pool[i].head); + WRITE_ONCE(vn->pool[i].len, pool_len); spin_unlock(&vn->pool_lock); } } @@ -2244,6 +2289,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end, { unsigned long nr_purged_areas = 0; unsigned int nr_purge_helpers; + static cpumask_t purge_nodes; unsigned int nr_purge_nodes; struct vmap_node *vn; int i; @@ -2255,9 +2301,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end, */ purge_nodes = CPU_MASK_NONE; - for (i = 0; i < nr_vmap_nodes; i++) { - vn = &vmap_nodes[i]; - + for_each_vmap_node(vn) { INIT_LIST_HEAD(&vn->purge_list); vn->skip_populate = full_pool_decay; decay_va_pool_node(vn, full_pool_decay); @@ -2276,7 +2320,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end, end = max(end, list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end); - cpumask_set_cpu(i, &purge_nodes); + cpumask_set_cpu(node_to_id(vn), &purge_nodes); } nr_purge_nodes = cpumask_weight(&purge_nodes); @@ -2355,7 +2399,7 @@ static void free_vmap_area_noflush(struct vmap_area *va) if (WARN_ON_ONCE(!list_empty(&va->list))) return; - nr_lazy = atomic_long_add_return(va_size(va) >> PAGE_SHIFT, + nr_lazy = atomic_long_add_return_relaxed(va_size(va) >> PAGE_SHIFT, &vmap_lazy_nr); /* @@ -2421,7 +2465,7 @@ struct vmap_area *find_vmap_area(unsigned long addr) if (va) return va; - } while ((i = (i + 1) % nr_vmap_nodes) != j); + } while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j); return NULL; } @@ -2447,7 +2491,7 @@ static struct vmap_area *find_unlink_vmap_area(unsigned long addr) if (va) return va; - } while ((i = (i + 1) % nr_vmap_nodes) != j); + } while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j); return NULL; } @@ -2916,10 +2960,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) */ void vm_unmap_aliases(void) { - unsigned long start = ULONG_MAX, end = 0; - int flush = 0; - - _vm_unmap_aliases(start, end, flush); + _vm_unmap_aliases(ULONG_MAX, 0, 0); } EXPORT_SYMBOL_GPL(vm_unmap_aliases); @@ -3100,7 +3141,7 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm) /* * Before removing VM_UNINITIALIZED, * we should make sure that vm has proper values. - * Pair with smp_rmb() in show_numa_info(). + * Pair with smp_rmb() in vread_iter() and vmalloc_info_show(). */ smp_wmb(); vm->flags &= ~VM_UNINITIALIZED; @@ -3133,6 +3174,7 @@ struct vm_struct *__get_vm_area_node(unsigned long size, area->flags = flags; area->caller = caller; + area->requested_size = requested_size; va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area); if (IS_ERR(va)) { @@ -3370,11 +3412,13 @@ void vfree(const void *addr) if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS)) vm_reset_perms(vm); + /* All pages of vm should be charged to same memcg, so use first one. */ + if (vm->nr_pages && !(vm->flags & VM_MAP_PUT_PAGES)) + mod_memcg_page_state(vm->pages[0], MEMCG_VMALLOC, -vm->nr_pages); for (i = 0; i < vm->nr_pages; i++) { struct page *page = vm->pages[i]; BUG_ON(!page); - mod_memcg_page_state(page, MEMCG_VMALLOC, -1); /* * High-order allocs for huge vmallocs are split, so * can be freed as an array of order-0 allocations @@ -3382,7 +3426,8 @@ void vfree(const void *addr) __free_page(page); cond_resched(); } - atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages); + if (!(vm->flags & VM_MAP_PUT_PAGES)) + atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages); kvfree(vm->pages); kfree(vm); } @@ -3560,11 +3605,11 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * but mempolicy wants to alloc memory by interleaving. */ if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) - nr = alloc_pages_bulk_array_mempolicy_noprof(gfp, + nr = alloc_pages_bulk_mempolicy_noprof(gfp, nr_pages_request, pages + nr_allocated); else - nr = alloc_pages_bulk_array_node_noprof(gfp, nid, + nr = alloc_pages_bulk_node_noprof(gfp, nid, nr_pages_request, pages + nr_allocated); @@ -3669,12 +3714,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, node, page_order, nr_small_pages, area->pages); atomic_long_add(area->nr_pages, &nr_vmalloc_pages); - if (gfp_mask & __GFP_ACCOUNT) { - int i; - - for (i = 0; i < area->nr_pages; i++) - mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1); - } + /* All pages of vm should be charged to same memcg, so use first one. */ + if (gfp_mask & __GFP_ACCOUNT && area->nr_pages) + mod_memcg_page_state(area->pages[0], MEMCG_VMALLOC, + area->nr_pages); /* * If not enough pages were obtained to accomplish an @@ -3769,8 +3812,7 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, struct vm_struct *area; void *ret; kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE; - unsigned long real_size = size; - unsigned long real_align = align; + unsigned long original_align = align; unsigned int shift = PAGE_SHIFT; if (WARN_ON_ONCE(!size)) @@ -3779,7 +3821,7 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, if ((size >> PAGE_SHIFT) > totalram_pages()) { warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, exceeds total pages", - real_size); + size); return NULL; } @@ -3796,19 +3838,18 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, else shift = arch_vmap_pte_supported_shift(size); - align = max(real_align, 1UL << shift); - size = ALIGN(real_size, 1UL << shift); + align = max(original_align, 1UL << shift); } again: - area = __get_vm_area_node(real_size, align, shift, VM_ALLOC | + area = __get_vm_area_node(size, align, shift, VM_ALLOC | VM_UNINITIALIZED | vm_flags, start, end, node, gfp_mask, caller); if (!area) { bool nofail = gfp_mask & __GFP_NOFAIL; warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, vm_struct allocation failed%s", - real_size, (nofail) ? ". Retrying." : ""); + size, (nofail) ? ". Retrying." : ""); if (nofail) { schedule_timeout_uninterruptible(1); goto again; @@ -3858,7 +3899,7 @@ again: (gfp_mask & __GFP_SKIP_ZERO)) kasan_flags |= KASAN_VMALLOC_INIT; /* KASAN_VMALLOC_PROT_NORMAL already set if required. */ - area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags); + area->addr = kasan_unpoison_vmalloc(area->addr, size, kasan_flags); /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED @@ -3867,17 +3908,15 @@ again: */ clear_vm_uninitialized_flag(area); - size = PAGE_ALIGN(size); if (!(vm_flags & VM_DEFER_KMEMLEAK)) - kmemleak_vmalloc(area, size, gfp_mask); + kmemleak_vmalloc(area, PAGE_ALIGN(size), gfp_mask); return area->addr; fail: if (shift > PAGE_SHIFT) { shift = PAGE_SHIFT; - align = real_align; - size = real_size; + align = original_align; goto again; } @@ -3945,9 +3984,10 @@ void *vmalloc_noprof(unsigned long size) EXPORT_SYMBOL(vmalloc_noprof); /** - * vmalloc_huge - allocate virtually contiguous memory, allow huge pages + * vmalloc_huge_node - allocate virtually contiguous memory, allow huge pages * @size: allocation size * @gfp_mask: flags for the page level allocator + * @node: node to use for allocation or NUMA_NO_NODE * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. @@ -3956,13 +3996,13 @@ EXPORT_SYMBOL(vmalloc_noprof); * * Return: pointer to the allocated memory or %NULL on error */ -void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) +void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node) { return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, - gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, - NUMA_NO_NODE, __builtin_return_address(0)); + gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, + node, __builtin_return_address(0)); } -EXPORT_SYMBOL_GPL(vmalloc_huge_noprof); +EXPORT_SYMBOL_GPL(vmalloc_huge_node_noprof); /** * vzalloc - allocate virtually contiguous memory with zero fill @@ -4065,6 +4105,8 @@ EXPORT_SYMBOL(vzalloc_node_noprof); */ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) { + struct vm_struct *vm = NULL; + size_t alloced_size = 0; size_t old_size = 0; void *n; @@ -4074,15 +4116,17 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) } if (p) { - struct vm_struct *vm; - vm = find_vm_area(p); if (unlikely(!vm)) { WARN(1, "Trying to vrealloc() nonexistent vm area (%p)\n", p); return NULL; } - old_size = get_vm_area_size(vm); + alloced_size = get_vm_area_size(vm); + old_size = vm->requested_size; + if (WARN(alloced_size < old_size, + "vrealloc() has mismatched area vs requested sizes (%p)\n", p)) + return NULL; } /* @@ -4090,10 +4134,26 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) * would be a good heuristic for when to shrink the vm_area? */ if (size <= old_size) { - /* Zero out spare memory. */ - if (want_init_on_alloc(flags)) + /* Zero out "freed" memory, potentially for future realloc. */ + if (want_init_on_free() || want_init_on_alloc(flags)) memset((void *)p + size, 0, old_size - size); + vm->requested_size = size; + kasan_poison_vmalloc(p + size, old_size - size); + return (void *)p; + } + /* + * We already have the bytes available in the allocation; use them. + */ + if (size <= alloced_size) { + kasan_unpoison_vmalloc(p + old_size, size - old_size, + KASAN_VMALLOC_PROT_NORMAL); + /* + * No need to zero memory here, as unused memory will have + * already been zeroed at initial allocation time or during + * realloc shrink time. + */ + vm->requested_size = size; return (void *)p; } @@ -4915,39 +4975,37 @@ bool vmalloc_dump_obj(void *object) #endif #ifdef CONFIG_PROC_FS -static void show_numa_info(struct seq_file *m, struct vm_struct *v) -{ - if (IS_ENABLED(CONFIG_NUMA)) { - unsigned int nr, *counters = m->private; - unsigned int step = 1U << vm_area_page_order(v); - if (!counters) - return; +/* + * Print number of pages allocated on each memory node. + * + * This function can only be called if CONFIG_NUMA is enabled + * and VM_UNINITIALIZED bit in v->flags is disabled. + */ +static void show_numa_info(struct seq_file *m, struct vm_struct *v, + unsigned int *counters) +{ + unsigned int nr; + unsigned int step = 1U << vm_area_page_order(v); - if (v->flags & VM_UNINITIALIZED) - return; - /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ - smp_rmb(); + if (!counters) + return; - memset(counters, 0, nr_node_ids * sizeof(unsigned int)); + memset(counters, 0, nr_node_ids * sizeof(unsigned int)); - for (nr = 0; nr < v->nr_pages; nr += step) - counters[page_to_nid(v->pages[nr])] += step; - for_each_node_state(nr, N_HIGH_MEMORY) - if (counters[nr]) - seq_printf(m, " N%u=%u", nr, counters[nr]); - } + for (nr = 0; nr < v->nr_pages; nr += step) + counters[page_to_nid(v->pages[nr])] += step; + for_each_node_state(nr, N_HIGH_MEMORY) + if (counters[nr]) + seq_printf(m, " N%u=%u", nr, counters[nr]); } static void show_purge_info(struct seq_file *m) { struct vmap_node *vn; struct vmap_area *va; - int i; - - for (i = 0; i < nr_vmap_nodes; i++) { - vn = &vmap_nodes[i]; + for_each_vmap_node(vn) { spin_lock(&vn->lazy.lock); list_for_each_entry(va, &vn->lazy.head, list) { seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", @@ -4963,11 +5021,12 @@ static int vmalloc_info_show(struct seq_file *m, void *p) struct vmap_node *vn; struct vmap_area *va; struct vm_struct *v; - int i; + unsigned int *counters; - for (i = 0; i < nr_vmap_nodes; i++) { - vn = &vmap_nodes[i]; + if (IS_ENABLED(CONFIG_NUMA)) + counters = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); + for_each_vmap_node(vn) { spin_lock(&vn->busy.lock); list_for_each_entry(va, &vn->busy.head, list) { if (!va->vm) { @@ -4980,6 +5039,11 @@ static int vmalloc_info_show(struct seq_file *m, void *p) } v = va->vm; + if (v->flags & VM_UNINITIALIZED) + continue; + + /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ + smp_rmb(); seq_printf(m, "0x%pK-0x%pK %7ld", v->addr, v->addr + v->size, v->size); @@ -5014,7 +5078,9 @@ static int vmalloc_info_show(struct seq_file *m, void *p) if (is_vmalloc_addr(v->pages)) seq_puts(m, " vpages"); - show_numa_info(m, v); + if (IS_ENABLED(CONFIG_NUMA)) + show_numa_info(m, v, counters); + seq_putc(m, '\n'); } spin_unlock(&vn->busy.lock); @@ -5024,19 +5090,14 @@ static int vmalloc_info_show(struct seq_file *m, void *p) * As a final step, dump "unpurged" areas. */ show_purge_info(m); + if (IS_ENABLED(CONFIG_NUMA)) + kfree(counters); return 0; } static int __init proc_vmalloc_init(void) { - void *priv_data = NULL; - - if (IS_ENABLED(CONFIG_NUMA)) - priv_data = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); - - proc_create_single_data("vmallocinfo", - 0400, NULL, vmalloc_info_show, priv_data); - + proc_create_single("vmallocinfo", 0400, NULL, vmalloc_info_show); return 0; } module_init(proc_vmalloc_init); @@ -5088,7 +5149,7 @@ static void __init vmap_init_free_space(void) static void vmap_init_nodes(void) { struct vmap_node *vn; - int i, n; + int i; #if BITS_PER_LONG == 64 /* @@ -5105,7 +5166,7 @@ static void vmap_init_nodes(void) * set of cores. Therefore a per-domain purging is supposed to * be added as well as a per-domain balancing. */ - n = clamp_t(unsigned int, num_possible_cpus(), 1, 128); + int n = clamp_t(unsigned int, num_possible_cpus(), 1, 128); if (n > 1) { vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN); @@ -5120,8 +5181,7 @@ static void vmap_init_nodes(void) } #endif - for (n = 0; n < nr_vmap_nodes; n++) { - vn = &vmap_nodes[n]; + for_each_vmap_node(vn) { vn->busy.root = RB_ROOT; INIT_LIST_HEAD(&vn->busy.head); spin_lock_init(&vn->busy.lock); @@ -5142,15 +5202,13 @@ static void vmap_init_nodes(void) static unsigned long vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { - unsigned long count; + unsigned long count = 0; struct vmap_node *vn; - int i, j; - - for (count = 0, i = 0; i < nr_vmap_nodes; i++) { - vn = &vmap_nodes[i]; + int i; - for (j = 0; j < MAX_VA_SIZE_PAGES; j++) - count += READ_ONCE(vn->pool[j].len); + for_each_vmap_node(vn) { + for (i = 0; i < MAX_VA_SIZE_PAGES; i++) + count += READ_ONCE(vn->pool[i].len); } return count ? count : SHRINK_EMPTY; @@ -5159,10 +5217,10 @@ vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc) static unsigned long vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { - int i; + struct vmap_node *vn; - for (i = 0; i < nr_vmap_nodes; i++) - decay_va_pool_node(&vmap_nodes[i], true); + for_each_vmap_node(vn) + decay_va_pool_node(vn, true); return SHRINK_STOP; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 76378bc257e3..f8dfd2864bbf 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -271,6 +271,25 @@ static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg) } #endif +/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to + * and including the specified highidx + * @zone: The current zone in the iterator + * @pgdat: The pgdat which node_zones are being iterated + * @idx: The index variable + * @highidx: The index of the highest zone to return + * + * This macro iterates through all managed zones up to and including the specified highidx. + * The zone iterator enters an invalid state after macro call and must be reinitialized + * before it can be used again. + */ +#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx) \ + for ((idx) = 0, (zone) = (pgdat)->node_zones; \ + (idx) <= (highidx); \ + (idx)++, (zone)++) \ + if (!managed_zone(zone)) \ + continue; \ + else + static void set_task_reclaim_state(struct task_struct *task, struct reclaim_state *rs) { @@ -323,16 +342,22 @@ static void flush_reclaim_state(struct scan_control *sc) } } -static bool can_demote(int nid, struct scan_control *sc) +static bool can_demote(int nid, struct scan_control *sc, + struct mem_cgroup *memcg) { + int demotion_nid; + if (!numa_demotion_enabled) return false; if (sc && sc->no_demotion) return false; - if (next_demotion_node(nid) == NUMA_NO_NODE) + + demotion_nid = next_demotion_node(nid); + if (demotion_nid == NUMA_NO_NODE) return false; - return true; + /* If demotion node isn't in the cgroup's mems_allowed, fall back */ + return mem_cgroup_node_allowed(memcg, demotion_nid); } static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, @@ -357,7 +382,7 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, * * Can it be reclaimed from this node via demotion? */ - return can_demote(nid, sc); + return can_demote(nid, sc, memcg); } /* @@ -374,7 +399,14 @@ unsigned long zone_reclaimable_pages(struct zone *zone) if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL)) nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON); - + /* + * If there are no reclaimable file-backed or anonymous pages, + * ensure zones with sufficient free pages are not skipped. + * This prevents zones like DMA32 from being ignored in reclaim + * scenarios where they can still help alleviate memory pressure. + */ + if (nr == 0) + nr = zone_page_state_snapshot(zone, NR_FREE_PAGES); return nr; } @@ -389,13 +421,9 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, { unsigned long size = 0; int zid; + struct zone *zone; - for (zid = 0; zid <= zone_idx; zid++) { - struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; - - if (!managed_zone(zone)) - continue; - + for_each_managed_zone_pgdat(zone, lruvec_pgdat(lruvec), zid, zone_idx) { if (!mem_cgroup_disabled()) size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid); else @@ -434,21 +462,26 @@ void drop_slab(void) } while ((freed >> shift++) > 1); } -static int reclaimer_offset(void) +#define CHECK_RECLAIMER_OFFSET(type) \ + do { \ + BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD != \ + PGDEMOTE_##type - PGDEMOTE_KSWAPD); \ + BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD != \ + PGSCAN_##type - PGSCAN_KSWAPD); \ + } while (0) + +static int reclaimer_offset(struct scan_control *sc) { - BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD != - PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD); - BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD != - PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD); - BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD != - PGSCAN_DIRECT - PGSCAN_KSWAPD); - BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD != - PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD); + CHECK_RECLAIMER_OFFSET(DIRECT); + CHECK_RECLAIMER_OFFSET(KHUGEPAGED); + CHECK_RECLAIMER_OFFSET(PROACTIVE); if (current_is_kswapd()) return 0; if (current_is_khugepaged()) return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD; + if (sc->proactive) + return PGSTEAL_PROACTIVE - PGSTEAL_KSWAPD; return PGSTEAL_DIRECT - PGSTEAL_KSWAPD; } @@ -488,7 +521,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat) { int reclaimable = 0, write_pending = 0; int i; - + struct zone *zone; /* * If kswapd is disabled, reschedule if necessary but do not * throttle as the system is likely near OOM. @@ -501,12 +534,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat) * throttle as throttling will occur when the folios cycle * towards the end of the LRU if still under writeback. */ - for (i = 0; i < MAX_NR_ZONES; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!managed_zone(zone)) - continue; - + for_each_managed_zone_pgdat(zone, pgdat, i, MAX_NR_ZONES - 1) { reclaimable += zone_reclaimable_pages(zone); write_pending += zone_page_state_snapshot(zone, NR_ZONE_WRITE_PENDING); @@ -626,21 +654,20 @@ typedef enum { /* * pageout is called by shrink_folio_list() for each dirty folio. - * Calls ->writepage(). */ static pageout_t pageout(struct folio *folio, struct address_space *mapping, struct swap_iocb **plug, struct list_head *folio_list) { + int (*writeout)(struct folio *, struct writeback_control *); + /* - * If the folio is dirty, only perform writeback if that write - * will be non-blocking. To prevent this allocation from being - * stalled by pagecache activity. But note that there may be - * stalls if we need to run get_block(). We could test - * PagePrivate for that. - * - * If this process is currently in __generic_file_write_iter() against - * this folio's queue, we can perform writeback even if that - * will block. + * We no longer attempt to writeback filesystem folios here, other + * than tmpfs/shmem. That's taken care of in page-writeback. + * If we find a dirty filesystem folio at the end of the LRU list, + * typically that means the filesystem is saturating the storage + * with contiguous writes and telling it to write a folio here + * would only make the situation worse by injecting an element + * of random access. * * If the folio is swapcache, write it back even if that would * block, for some throttling. This happens by accident, because @@ -663,7 +690,11 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping, } return PAGE_KEEP; } - if (mapping->a_ops->writepage == NULL) + if (shmem_mapping(mapping)) + writeout = shmem_writeout; + else if (folio_test_anon(folio)) + writeout = swap_writeout; + else return PAGE_ACTIVATE; if (folio_clear_dirty_for_io(folio)) { @@ -686,7 +717,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping, wbc.list = folio_list; folio_set_reclaim(folio); - res = mapping->a_ops->writepage(&folio->page, &wbc); + res = writeout(folio, &wbc); if (res < 0) handle_write_error(mapping, folio, res); if (res == AOP_WRITEPAGE_ACTIVATE) { @@ -695,7 +726,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping, } if (!folio_test_writeback(folio)) { - /* synchronous write or broken a_ops? */ + /* synchronous write? */ folio_clear_reclaim(folio); } trace_mm_vmscan_write_folio(folio); @@ -762,7 +793,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(folio, target_memcg); __delete_from_swap_cache(folio, swap, shadow); - mem_cgroup_swapout(folio, swap); + memcg1_swapout(folio, swap); xa_unlock_irq(&mapping->i_pages); put_swap_folio(folio, swap); } else { @@ -855,6 +886,31 @@ enum folio_references { FOLIOREF_ACTIVATE, }; +#ifdef CONFIG_LRU_GEN +/* + * Only used on a mapped folio in the eviction (rmap walk) path, where promotion + * needs to be done by taking the folio off the LRU list and then adding it back + * with PG_active set. In contrast, the aging (page table walk) path uses + * folio_update_gen(). + */ +static bool lru_gen_set_refs(struct folio *folio) +{ + /* see the comment on LRU_REFS_FLAGS */ + if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { + set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); + return false; + } + + set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_workingset)); + return true; +} +#else +static bool lru_gen_set_refs(struct folio *folio) +{ + return false; +} +#endif /* CONFIG_LRU_GEN */ + static enum folio_references folio_check_references(struct folio *folio, struct scan_control *sc) { @@ -863,7 +919,6 @@ static enum folio_references folio_check_references(struct folio *folio, referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup, &vm_flags); - referenced_folio = folio_test_clear_referenced(folio); /* * The supposedly reclaimable folio was found to be in a VM_LOCKED vma. @@ -881,6 +936,15 @@ static enum folio_references folio_check_references(struct folio *folio, if (referenced_ptes == -1) return FOLIOREF_KEEP; + if (lru_gen_enabled()) { + if (!referenced_ptes) + return FOLIOREF_RECLAIM; + + return lru_gen_set_refs(folio) ? FOLIOREF_ACTIVATE : FOLIOREF_KEEP; + } + + referenced_folio = folio_test_clear_referenced(folio); + if (referenced_ptes) { /* * All mapped folios start out with page table @@ -1041,12 +1105,13 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) */ static unsigned int shrink_folio_list(struct list_head *folio_list, struct pglist_data *pgdat, struct scan_control *sc, - struct reclaim_stat *stat, bool ignore_references) + struct reclaim_stat *stat, bool ignore_references, + struct mem_cgroup *memcg) { struct folio_batch free_folios; LIST_HEAD(ret_folios); LIST_HEAD(demote_folios); - unsigned int nr_reclaimed = 0; + unsigned int nr_reclaimed = 0, nr_demoted = 0; unsigned int pgactivate = 0; bool do_demote_pass; struct swap_iocb *plug = NULL; @@ -1054,7 +1119,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, folio_batch_init(&free_folios); memset(stat, 0, sizeof(*stat)); cond_resched(); - do_demote_pass = can_demote(pgdat->node_id, sc); + do_demote_pass = can_demote(pgdat->node_id, sc, memcg); retry: while (!list_empty(folio_list)) { @@ -1072,6 +1137,13 @@ retry: if (!folio_trylock(folio)) goto keep; + if (folio_contain_hwpoisoned_page(folio)) { + unmap_poisoned_folio(folio, folio_pfn(folio), false); + folio_unlock(folio); + folio_put(folio); + continue; + } + VM_BUG_ON_FOLIO(folio_test_active(folio), folio); nr_pages = folio_nr_pages(folio); @@ -1085,11 +1157,6 @@ retry: if (!sc->may_unmap && folio_mapped(folio)) goto keep_locked; - /* folio_update_gen() tried to promote this page? */ - if (lru_gen_enabled() && !ignore_references && - folio_mapped(folio) && folio_test_referenced(folio)) - goto keep_locked; - /* * The number of dirty pages determines if a node is marked * reclaim_congested. kswapd will stall and start writing @@ -1130,8 +1197,10 @@ retry: * 2) Global or new memcg reclaim encounters a folio that is * not marked for immediate reclaim, or the caller does not * have __GFP_FS (or __GFP_IO if it's simply going to swap, - * not to fs). In this case mark the folio for immediate - * reclaim and continue scanning. + * not to fs), or the folio belongs to a mapping where + * waiting on writeback during reclaim may lead to a deadlock. + * In this case mark the folio for immediate reclaim and + * continue scanning. * * Require may_enter_fs() because we would wait on fs, which * may not have submitted I/O yet. And the loop driver might @@ -1156,6 +1225,8 @@ retry: * takes to write them to disk. */ if (folio_test_writeback(folio)) { + mapping = folio_mapping(folio); + /* Case 1 above */ if (current_is_kswapd() && folio_test_reclaim(folio) && @@ -1166,7 +1237,9 @@ retry: /* Case 2 above */ } else if (writeback_throttling_sane(sc) || !folio_test_reclaim(folio) || - !may_enter_fs(folio, sc->gfp_mask)) { + !may_enter_fs(folio, sc->gfp_mask) || + (mapping && + mapping_writeback_may_deadlock_on_reclaim(mapping))) { /* * This is slightly racy - * folio_end_writeback() might have @@ -1244,7 +1317,7 @@ retry: split_folio_to_list(folio, folio_list)) goto activate_locked; } - if (!add_to_swap(folio)) { + if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) { int __maybe_unused order = folio_order(folio); if (!folio_test_large(folio)) @@ -1260,9 +1333,21 @@ retry: } #endif count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); - if (!add_to_swap(folio)) + if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) goto activate_locked_split; } + /* + * Normally the folio will be dirtied in unmap because its + * pte should be dirty. A special case is MADV_FREE page. The + * page's pte could have dirty bit cleared but the folio's + * SwapBacked flag is still set because clearing the dirty bit + * and SwapBacked flag has no lock protected. For such folio, + * unmap will not set dirty bit for it, so folio reclaim will + * not write the folio out. This can cause data corruption when + * the folio is swapped in later. Always setting the dirty flag + * for the folio solves the problem. + */ + folio_mark_dirty(folio); } } @@ -1515,8 +1600,9 @@ keep: /* 'folio_list' is always empty here */ /* Migrate folios selected for demotion */ - stat->nr_demoted = demote_folio_list(&demote_folios, pgdat); - nr_reclaimed += stat->nr_demoted; + nr_demoted = demote_folio_list(&demote_folios, pgdat); + nr_reclaimed += nr_demoted; + stat->nr_demoted += nr_demoted; /* Folios that could not be demoted are still in @demote_folios */ if (!list_empty(&demote_folios)) { /* Folios which weren't demoted go back on @folio_list */ @@ -1588,7 +1674,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, */ noreclaim_flag = memalloc_noreclaim_save(); nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc, - &stat, true); + &stat, true, NULL); memalloc_noreclaim_restore(noreclaim_flag); list_splice(&clean_folios, folio_list); @@ -1655,12 +1741,11 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan, unsigned long nr_taken = 0; unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; - unsigned long skipped = 0; - unsigned long scan, total_scan, nr_pages; + unsigned long skipped = 0, total_scan = 0, scan = 0; + unsigned long nr_pages; + unsigned long max_nr_skipped = 0; LIST_HEAD(folios_skipped); - total_scan = 0; - scan = 0; while (scan < nr_to_scan && !list_empty(src)) { struct list_head *move_to = src; struct folio *folio; @@ -1671,9 +1756,12 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan, nr_pages = folio_nr_pages(folio); total_scan += nr_pages; - if (folio_zonenum(folio) > sc->reclaim_idx) { + /* Using max_nr_skipped to prevent hard LOCKUP*/ + if (max_nr_skipped < SWAP_CLUSTER_MAX_SKIPPED && + (folio_zonenum(folio) > sc->reclaim_idx)) { nr_skipped[folio_zonenum(folio)] += nr_pages; move_to = &folios_skipped; + max_nr_skipped++; goto move; } @@ -1946,10 +2034,10 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); - item = PGSCAN_KSWAPD + reclaimer_offset(); + item = PGSCAN_KSWAPD + reclaimer_offset(sc); if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_scanned); - __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); + count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); __count_vm_events(PGSCAN_ANON + file, nr_scanned); spin_unlock_irq(&lruvec->lru_lock); @@ -1957,18 +2045,19 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, if (nr_taken == 0) return 0; - nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false); + nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false, + lruvec_memcg(lruvec)); spin_lock_irq(&lruvec->lru_lock); move_folios_to_lru(lruvec, &folio_list); - __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(), + __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), stat.nr_demoted); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - item = PGSTEAL_KSWAPD + reclaimer_offset(); + item = PGSTEAL_KSWAPD + reclaimer_offset(sc); if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_reclaimed); - __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); + count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); spin_unlock_irq(&lruvec->lru_lock); @@ -2058,7 +2147,7 @@ static void shrink_active_list(unsigned long nr_to_scan, if (!cgroup_reclaim(sc)) __count_vm_events(PGREFILL, nr_scanned); - __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); + count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); spin_unlock_irq(&lruvec->lru_lock); @@ -2115,7 +2204,7 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_deactivate = move_folios_to_lru(lruvec, &l_inactive); __count_vm_events(PGDEACTIVATE, nr_deactivate); - __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); + count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&lruvec->lru_lock); @@ -2140,7 +2229,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list, .no_demotion = 1, }; - nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true); + nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true, NULL); while (!list_empty(folio_list)) { folio = lru_to_folio(folio_list); list_del(&folio->lru); @@ -2332,17 +2421,13 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc) unsigned long total_high_wmark = 0; unsigned long free, anon; int z; + struct zone *zone; free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); file = node_page_state(pgdat, NR_ACTIVE_FILE) + node_page_state(pgdat, NR_INACTIVE_FILE); - for (z = 0; z < MAX_NR_ZONES; z++) { - struct zone *zone = &pgdat->node_zones[z]; - - if (!managed_zone(zone)) - continue; - + for_each_managed_zone_pgdat(zone, pgdat, z, MAX_NR_ZONES - 1) { total_high_wmark += high_wmark_pages(zone); } @@ -2360,6 +2445,43 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc) } } +static inline void calculate_pressure_balance(struct scan_control *sc, + int swappiness, u64 *fraction, u64 *denominator) +{ + unsigned long anon_cost, file_cost, total_cost; + unsigned long ap, fp; + + /* + * Calculate the pressure balance between anon and file pages. + * + * The amount of pressure we put on each LRU is inversely + * proportional to the cost of reclaiming each list, as + * determined by the share of pages that are refaulting, times + * the relative IO cost of bringing back a swapped out + * anonymous page vs reloading a filesystem page (swappiness). + * + * Although we limit that influence to ensure no list gets + * left behind completely: at least a third of the pressure is + * applied, before swappiness. + * + * With swappiness at 100, anon and file have equal IO cost. + */ + total_cost = sc->anon_cost + sc->file_cost; + anon_cost = total_cost + sc->anon_cost; + file_cost = total_cost + sc->file_cost; + total_cost = anon_cost + file_cost; + + ap = swappiness * (total_cost + 1); + ap /= anon_cost + 1; + + fp = (MAX_SWAPPINESS - swappiness) * (total_cost + 1); + fp /= file_cost + 1; + + fraction[WORKINGSET_ANON] = ap; + fraction[WORKINGSET_FILE] = fp; + *denominator = ap + fp; +} + /* * Determine how aggressively the anon and file LRU lists should be * scanned. @@ -2372,12 +2494,10 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, { struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct mem_cgroup *memcg = lruvec_memcg(lruvec); - unsigned long anon_cost, file_cost, total_cost; int swappiness = sc_swappiness(sc, memcg); u64 fraction[ANON_AND_FILE]; u64 denominator = 0; /* gcc */ enum scan_balance scan_balance; - unsigned long ap, fp; enum lru_list lru; /* If we have no swap space, do not bother scanning anon folios. */ @@ -2398,6 +2518,13 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, goto out; } + /* Proactive reclaim initiated by userspace for anonymous memory only */ + if (swappiness == SWAPPINESS_ANON_ONLY) { + WARN_ON_ONCE(!sc->proactive); + scan_balance = SCAN_ANON; + goto out; + } + /* * Do not apply any pressure balancing cleverness when the * system is close to OOM, scan both anon and file equally @@ -2418,7 +2545,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, /* * If there is enough inactive page cache, we do not reclaim - * anything from the anonymous working right now. + * anything from the anonymous working right now to make sure + * a streaming file access pattern doesn't cause swapping. */ if (sc->cache_trim_mode) { scan_balance = SCAN_FILE; @@ -2426,35 +2554,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, } scan_balance = SCAN_FRACT; - /* - * Calculate the pressure balance between anon and file pages. - * - * The amount of pressure we put on each LRU is inversely - * proportional to the cost of reclaiming each list, as - * determined by the share of pages that are refaulting, times - * the relative IO cost of bringing back a swapped out - * anonymous page vs reloading a filesystem page (swappiness). - * - * Although we limit that influence to ensure no list gets - * left behind completely: at least a third of the pressure is - * applied, before swappiness. - * - * With swappiness at 100, anon and file have equal IO cost. - */ - total_cost = sc->anon_cost + sc->file_cost; - anon_cost = total_cost + sc->anon_cost; - file_cost = total_cost + sc->file_cost; - total_cost = anon_cost + file_cost; - - ap = swappiness * (total_cost + 1); - ap /= anon_cost + 1; - - fp = (MAX_SWAPPINESS - swappiness) * (total_cost + 1); - fp /= file_cost + 1; + calculate_pressure_balance(sc, swappiness, fraction, &denominator); - fraction[0] = ap; - fraction[1] = fp; - denominator = ap + fp; out: for_each_evictable_lru(lru) { bool file = is_file_lru(lru); @@ -2568,7 +2669,7 @@ out: * Anonymous LRU management is a waste if there is * ultimately no way to reclaim the memory. */ -static bool can_age_anon_pages(struct pglist_data *pgdat, +static bool can_age_anon_pages(struct lruvec *lruvec, struct scan_control *sc) { /* Aging the anon LRU is valuable if swap is present: */ @@ -2576,7 +2677,8 @@ static bool can_age_anon_pages(struct pglist_data *pgdat, return true; /* Also valuable if anon pages can be demoted: */ - return can_demote(pgdat->node_id, sc); + return can_demote(lruvec_pgdat(lruvec)->node_id, sc, + lruvec_memcg(lruvec)); } #ifdef CONFIG_LRU_GEN @@ -2612,11 +2714,21 @@ static bool should_clear_pmd_young(void) READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \ } +/* Get the min/max evictable type based on swappiness */ +#define min_type(swappiness) (!(swappiness)) +#define max_type(swappiness) ((swappiness) < SWAPPINESS_ANON_ONLY) + +#define evictable_min_seq(min_seq, swappiness) \ + min((min_seq)[min_type(swappiness)], (min_seq)[max_type(swappiness)]) + #define for_each_gen_type_zone(gen, type, zone) \ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) +#define for_each_evictable_type(type, swappiness) \ + for ((type) = min_type(swappiness); (type) <= max_type(swappiness); (type)++) + #define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS) #define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS) @@ -2648,7 +2760,7 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) if (!sc->may_swap) return 0; - if (!can_demote(pgdat->node_id, sc) && + if (!can_demote(pgdat->node_id, sc, memcg) && mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) return 0; @@ -2662,10 +2774,16 @@ static int get_nr_gens(struct lruvec *lruvec, int type) static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) { - /* see the comment on lru_gen_folio */ - return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && - get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && - get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; + int type; + + for (type = 0; type < ANON_AND_FILE; type++) { + int n = get_nr_gens(lruvec, type); + + if (n < MIN_NR_GENS || n > MAX_NR_GENS) + return false; + } + + return true; } /****************************************************************************** @@ -3066,16 +3184,20 @@ struct ctrl_pos { static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, struct ctrl_pos *pos) { + int i; struct lru_gen_folio *lrugen = &lruvec->lrugen; int hist = lru_hist_from_seq(lrugen->min_seq[type]); - pos->refaulted = lrugen->avg_refaulted[type][tier] + - atomic_long_read(&lrugen->refaulted[hist][type][tier]); - pos->total = lrugen->avg_total[type][tier] + - atomic_long_read(&lrugen->evicted[hist][type][tier]); - if (tier) - pos->total += lrugen->protected[hist][type][tier - 1]; pos->gain = gain; + pos->refaulted = pos->total = 0; + + for (i = tier % MAX_NR_TIERS; i <= min(tier, MAX_NR_TIERS - 1); i++) { + pos->refaulted += lrugen->avg_refaulted[type][i] + + atomic_long_read(&lrugen->refaulted[hist][type][i]); + pos->total += lrugen->avg_total[type][i] + + lrugen->protected[hist][type][i] + + atomic_long_read(&lrugen->evicted[hist][type][i]); + } } static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) @@ -3101,17 +3223,15 @@ static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); sum = lrugen->avg_total[type][tier] + + lrugen->protected[hist][type][tier] + atomic_long_read(&lrugen->evicted[hist][type][tier]); - if (tier) - sum += lrugen->protected[hist][type][tier - 1]; WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); } if (clear) { atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); atomic_long_set(&lrugen->evicted[hist][type][tier], 0); - if (tier) - WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); + WRITE_ONCE(lrugen->protected[hist][type][tier], 0); } } } @@ -3138,16 +3258,19 @@ static int folio_update_gen(struct folio *folio, int gen) VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); + /* see the comment on LRU_REFS_FLAGS */ + if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { + set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); + return -1; + } + do { /* lru_gen_del_folio() has isolated this page? */ - if (!(old_flags & LRU_GEN_MASK)) { - /* for shrink_folio_list() */ - new_flags = old_flags | BIT(PG_referenced); - continue; - } + if (!(old_flags & LRU_GEN_MASK)) + return -1; - new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); - new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS); + new_flags |= ((gen + 1UL) << LRU_GEN_PGOFF) | BIT(PG_workingset); } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; @@ -3171,7 +3294,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai new_gen = (old_gen + 1) % MAX_NR_GENS; - new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS); new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; /* for folio_end_writeback() */ if (reclaiming) @@ -3246,7 +3369,7 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal return true; if (vma_is_anonymous(vma)) - return !walk->can_swap; + return !walk->swappiness; if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) return true; @@ -3256,7 +3379,10 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal return true; if (shmem_mapping(mapping)) - return !walk->can_swap; + return !walk->swappiness; + + if (walk->swappiness > MAX_SWAPPINESS) + return true; /* to exclude special mappings like dax, etc. */ return !mapping->a_ops->read_folio; @@ -3344,19 +3470,17 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned } static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, - struct pglist_data *pgdat, bool can_swap) + struct pglist_data *pgdat) { - struct folio *folio; + struct folio *folio = pfn_folio(pfn); - folio = pfn_folio(pfn); - if (folio_nid(folio) != pgdat->node_id) + if (folio_lru_gen(folio) < 0) return NULL; - if (folio_memcg(folio) != memcg) + if (folio_nid(folio) != pgdat->node_id) return NULL; - /* file VMAs can contain anon pages from COW */ - if (!folio_is_file_lru(folio) && !can_swap) + if (folio_memcg(folio) != memcg) return NULL; return folio; @@ -3370,29 +3494,55 @@ static bool suitable_to_scan(int total, int young) return young * n >= total; } +static void walk_update_folio(struct lru_gen_mm_walk *walk, struct folio *folio, + int new_gen, bool dirty) +{ + int old_gen; + + if (!folio) + return; + + if (dirty && !folio_test_dirty(folio) && + !(folio_test_anon(folio) && folio_test_swapbacked(folio) && + !folio_test_swapcache(folio))) + folio_mark_dirty(folio); + + if (walk) { + old_gen = folio_update_gen(folio, new_gen); + if (old_gen >= 0 && old_gen != new_gen) + update_batch_size(walk, folio, old_gen, new_gen); + } else if (lru_gen_set_refs(folio)) { + old_gen = folio_lru_gen(folio); + if (old_gen >= 0 && old_gen != new_gen) + folio_activate(folio); + } +} + static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *args) { int i; + bool dirty; pte_t *pte; spinlock_t *ptl; unsigned long addr; int total = 0; int young = 0; + struct folio *last = NULL; struct lru_gen_mm_walk *walk = args->private; struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); DEFINE_MAX_SEQ(walk->lruvec); - int old_gen, new_gen = lru_gen_from_seq(max_seq); + int gen = lru_gen_from_seq(max_seq); pmd_t pmdval; - pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, - &ptl); + pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, &ptl); if (!pte) return false; + if (!spin_trylock(ptl)) { pte_unmap(pte); - return false; + return true; } if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) { @@ -3414,26 +3564,30 @@ restart: if (pfn == -1) continue; - folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); + folio = get_pfn_folio(pfn, memcg, pgdat); if (!folio) continue; if (!ptep_clear_young_notify(args->vma, addr, pte + i)) continue; - young++; - walk->mm_stats[MM_LEAF_YOUNG]++; + if (last != folio) { + walk_update_folio(walk, last, gen, dirty); + + last = folio; + dirty = false; + } - if (pte_dirty(ptent) && !folio_test_dirty(folio) && - !(folio_test_anon(folio) && folio_test_swapbacked(folio) && - !folio_test_swapcache(folio))) - folio_mark_dirty(folio); + if (pte_dirty(ptent)) + dirty = true; - old_gen = folio_update_gen(folio, new_gen); - if (old_gen >= 0 && old_gen != new_gen) - update_batch_size(walk, folio, old_gen, new_gen); + young++; + walk->mm_stats[MM_LEAF_YOUNG]++; } + walk_update_folio(walk, last, gen, dirty); + last = NULL; + if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) goto restart; @@ -3447,13 +3601,15 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area struct mm_walk *args, unsigned long *bitmap, unsigned long *first) { int i; + bool dirty; pmd_t *pmd; spinlock_t *ptl; + struct folio *last = NULL; struct lru_gen_mm_walk *walk = args->private; struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); DEFINE_MAX_SEQ(walk->lruvec); - int old_gen, new_gen = lru_gen_from_seq(max_seq); + int gen = lru_gen_from_seq(max_seq); VM_WARN_ON_ONCE(pud_leaf(*pud)); @@ -3499,27 +3655,30 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area if (pfn == -1) goto next; - folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); + folio = get_pfn_folio(pfn, memcg, pgdat); if (!folio) goto next; if (!pmdp_clear_young_notify(vma, addr, pmd + i)) goto next; - walk->mm_stats[MM_LEAF_YOUNG]++; + if (last != folio) { + walk_update_folio(walk, last, gen, dirty); - if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) && - !(folio_test_anon(folio) && folio_test_swapbacked(folio) && - !folio_test_swapcache(folio))) - folio_mark_dirty(folio); + last = folio; + dirty = false; + } - old_gen = folio_update_gen(folio, new_gen); - if (old_gen >= 0 && old_gen != new_gen) - update_batch_size(walk, folio, old_gen, new_gen); + if (pmd_dirty(pmd[i])) + dirty = true; + + walk->mm_stats[MM_LEAF_YOUNG]++; next: i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1; } while (i <= MIN_LRU_BATCH); + walk_update_folio(walk, last, gen, dirty); + arch_leave_lazy_mmu_mode(); spin_unlock(ptl); done: @@ -3711,22 +3870,30 @@ static void clear_mm_walk(void) kfree(walk); } -static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) +static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness) { int zone; int remaining = MAX_LRU_BATCH; struct lru_gen_folio *lrugen = &lruvec->lrugen; + int hist = lru_hist_from_seq(lrugen->min_seq[type]); int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); - if (type == LRU_GEN_ANON && !can_swap) + /* For file type, skip the check if swappiness is anon only */ + if (type && (swappiness == SWAPPINESS_ANON_ONLY)) + goto done; + + /* For anon type, skip the check if swappiness is zero (file only) */ + if (!type && !swappiness) goto done; - /* prevent cold/hot inversion if force_scan is true */ + /* prevent cold/hot inversion if the type is evictable */ for (zone = 0; zone < MAX_NR_ZONES; zone++) { struct list_head *head = &lrugen->folios[old_gen][type][zone]; while (!list_empty(head)) { struct folio *folio = lru_to_folio(head); + int refs = folio_lru_refs(folio); + bool workingset = folio_test_workingset(folio); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); @@ -3736,6 +3903,15 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) new_gen = folio_inc_gen(lruvec, folio, false); list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]); + /* don't count the workingset being lazily promoted */ + if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) { + int tier = lru_tier_from_refs(refs, workingset); + int delta = folio_nr_pages(folio); + + WRITE_ONCE(lrugen->protected[hist][type][tier], + lrugen->protected[hist][type][tier] + delta); + } + if (!--remaining) return false; } @@ -3747,7 +3923,7 @@ done: return true; } -static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) +static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) { int gen, type, zone; bool success = false; @@ -3757,7 +3933,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); /* find the oldest populated generation */ - for (type = !can_swap; type < ANON_AND_FILE; type++) { + for_each_evictable_type(type, swappiness) { while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) { gen = lru_gen_from_seq(min_seq[type]); @@ -3773,13 +3949,17 @@ next: } /* see the comment on lru_gen_folio */ - if (can_swap) { - min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]); - min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); + if (swappiness && swappiness <= MAX_SWAPPINESS) { + unsigned long seq = lrugen->max_seq - MIN_NR_GENS; + + if (min_seq[LRU_GEN_ANON] > seq && min_seq[LRU_GEN_FILE] < seq) + min_seq[LRU_GEN_ANON] = seq; + else if (min_seq[LRU_GEN_FILE] > seq && min_seq[LRU_GEN_ANON] < seq) + min_seq[LRU_GEN_FILE] = seq; } - for (type = !can_swap; type < ANON_AND_FILE; type++) { - if (min_seq[type] == lrugen->min_seq[type]) + for_each_evictable_type(type, swappiness) { + if (min_seq[type] <= lrugen->min_seq[type]) continue; reset_ctrl_pos(lruvec, type, true); @@ -3790,8 +3970,7 @@ next: return success; } -static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, - bool can_swap, bool force_scan) +static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness) { bool success; int prev, next; @@ -3809,13 +3988,11 @@ restart: if (!success) goto unlock; - for (type = ANON_AND_FILE - 1; type >= 0; type--) { + for (type = 0; type < ANON_AND_FILE; type++) { if (get_nr_gens(lruvec, type) != MAX_NR_GENS) continue; - VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap)); - - if (inc_min_seq(lruvec, type, can_swap)) + if (inc_min_seq(lruvec, type, swappiness)) continue; spin_unlock_irq(&lruvec->lru_lock); @@ -3859,7 +4036,7 @@ unlock: } static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, - bool can_swap, bool force_scan) + int swappiness, bool force_scan) { bool success; struct lru_gen_mm_walk *walk; @@ -3870,7 +4047,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, VM_WARN_ON_ONCE(seq > READ_ONCE(lrugen->max_seq)); if (!mm_state) - return inc_max_seq(lruvec, seq, can_swap, force_scan); + return inc_max_seq(lruvec, seq, swappiness); /* see the comment in iterate_mm_list() */ if (seq <= READ_ONCE(mm_state->seq)) @@ -3895,7 +4072,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, walk->lruvec = lruvec; walk->seq = seq; - walk->can_swap = can_swap; + walk->swappiness = swappiness; walk->force_scan = force_scan; do { @@ -3905,7 +4082,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, } while (mm); done: if (success) { - success = inc_max_seq(lruvec, seq, can_swap, force_scan); + success = inc_max_seq(lruvec, seq, swappiness); WARN_ON_ONCE(!success); } @@ -3946,13 +4123,13 @@ static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) { int gen, type, zone; unsigned long total = 0; - bool can_swap = get_swappiness(lruvec, sc); + int swappiness = get_swappiness(lruvec, sc); struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); DEFINE_MIN_SEQ(lruvec); - for (type = !can_swap; type < ANON_AND_FILE; type++) { + for_each_evictable_type(type, swappiness) { unsigned long seq; for (seq = min_seq[type]; seq <= max_seq; seq++) { @@ -3972,6 +4149,7 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc { int gen; unsigned long birth; + int swappiness = get_swappiness(lruvec, sc); struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MIN_SEQ(lruvec); @@ -3981,8 +4159,7 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc if (!lruvec_is_sizable(lruvec, sc)) return false; - /* see the comment on lru_gen_folio */ - gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); + gen = lru_gen_from_seq(evictable_min_seq(min_seq, swappiness)); birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); return time_is_before_jiffies(birth + min_ttl); @@ -4041,21 +4218,22 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { int i; + bool dirty; unsigned long start; unsigned long end; struct lru_gen_mm_walk *walk; + struct folio *last = NULL; int young = 1; pte_t *pte = pvmw->pte; unsigned long addr = pvmw->address; struct vm_area_struct *vma = pvmw->vma; struct folio *folio = pfn_folio(pvmw->pfn); - bool can_swap = !folio_is_file_lru(folio); struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); DEFINE_MAX_SEQ(lruvec); - int old_gen, new_gen = lru_gen_from_seq(max_seq); + int gen = lru_gen_from_seq(max_seq); lockdep_assert_held(pvmw->ptl); VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); @@ -4102,37 +4280,28 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) if (pfn == -1) continue; - folio = get_pfn_folio(pfn, memcg, pgdat, can_swap); + folio = get_pfn_folio(pfn, memcg, pgdat); if (!folio) continue; if (!ptep_clear_young_notify(vma, addr, pte + i)) continue; - young++; - - if (pte_dirty(ptent) && !folio_test_dirty(folio) && - !(folio_test_anon(folio) && folio_test_swapbacked(folio) && - !folio_test_swapcache(folio))) - folio_mark_dirty(folio); - - if (walk) { - old_gen = folio_update_gen(folio, new_gen); - if (old_gen >= 0 && old_gen != new_gen) - update_batch_size(walk, folio, old_gen, new_gen); + if (last != folio) { + walk_update_folio(walk, last, gen, dirty); - continue; + last = folio; + dirty = false; } - old_gen = folio_lru_gen(folio); - if (old_gen < 0) - folio_set_referenced(folio); - else if (old_gen != new_gen) { - folio_clear_lru_refs(folio); - folio_activate(folio); - } + if (pte_dirty(ptent)) + dirty = true; + + young++; } + walk_update_folio(walk, last, gen, dirty); + arch_leave_lazy_mmu_mode(); /* feedback from rmap walkers to page table walkers */ @@ -4290,7 +4459,8 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); int refs = folio_lru_refs(folio); - int tier = lru_tier_from_refs(refs); + bool workingset = folio_test_workingset(folio); + int tier = lru_tier_from_refs(refs, workingset); struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); @@ -4312,14 +4482,17 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c } /* protected */ - if (tier > tier_idx || refs == BIT(LRU_REFS_WIDTH)) { - int hist = lru_hist_from_seq(lrugen->min_seq[type]); - + if (tier > tier_idx || refs + workingset == BIT(LRU_REFS_WIDTH) + 1) { gen = folio_inc_gen(lruvec, folio, false); - list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); + list_move(&folio->lru, &lrugen->folios[gen][type][zone]); - WRITE_ONCE(lrugen->protected[hist][type][tier - 1], - lrugen->protected[hist][type][tier - 1] + delta); + /* don't count the workingset being lazily promoted */ + if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) { + int hist = lru_hist_from_seq(lrugen->min_seq[type]); + + WRITE_ONCE(lrugen->protected[hist][type][tier], + lrugen->protected[hist][type][tier] + delta); + } return true; } @@ -4339,8 +4512,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c } /* waiting for writeback */ - if (folio_test_locked(folio) || writeback || - (type == LRU_GEN_FILE && dirty)) { + if (writeback || (type == LRU_GEN_FILE && dirty)) { gen = folio_inc_gen(lruvec, folio, true); list_move(&folio->lru, &lrugen->folios[gen][type][zone]); return true; @@ -4369,13 +4541,12 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca return false; } - /* see the comment on MAX_NR_TIERS */ + /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) - folio_clear_lru_refs(folio); + set_mask_bits(&folio->flags, LRU_REFS_MASK, 0); /* for shrink_folio_list() */ folio_clear_reclaim(folio); - folio_clear_referenced(folio); success = lru_gen_del_folio(lruvec, folio, true); VM_WARN_ON_ONCE_FOLIO(!success, folio); @@ -4445,13 +4616,13 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, break; } - item = PGSCAN_KSWAPD + reclaimer_offset(); + item = PGSCAN_KSWAPD + reclaimer_offset(sc); if (!cgroup_reclaim(sc)) { __count_vm_events(item, isolated); __count_vm_events(PGREFILL, sorted); } - __count_memcg_events(memcg, item, isolated); - __count_memcg_events(memcg, PGREFILL, sorted); + count_memcg_events(memcg, item, isolated); + count_memcg_events(memcg, PGREFILL, sorted); __count_vm_events(PGSCAN_ANON + type, isolated); trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, MAX_LRU_BATCH, scanned, skipped, isolated, @@ -4471,13 +4642,13 @@ static int get_tier_idx(struct lruvec *lruvec, int type) struct ctrl_pos sp, pv; /* - * To leave a margin for fluctuations, use a larger gain factor (1:2). + * To leave a margin for fluctuations, use a larger gain factor (2:3). * This value is chosen because any other tier would have at least twice * as many refaults as the first tier. */ - read_ctrl_pos(lruvec, type, 0, 1, &sp); + read_ctrl_pos(lruvec, type, 0, 2, &sp); for (tier = 1; tier < MAX_NR_TIERS; tier++) { - read_ctrl_pos(lruvec, type, tier, 2, &pv); + read_ctrl_pos(lruvec, type, tier, 3, &pv); if (!positive_ctrl_err(&sp, &pv)) break; } @@ -4485,79 +4656,45 @@ static int get_tier_idx(struct lruvec *lruvec, int type) return tier - 1; } -static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) +static int get_type_to_scan(struct lruvec *lruvec, int swappiness) { - int type, tier; struct ctrl_pos sp, pv; - int gain[ANON_AND_FILE] = { swappiness, MAX_SWAPPINESS - swappiness }; + if (swappiness <= MIN_SWAPPINESS + 1) + return LRU_GEN_FILE; + + if (swappiness >= MAX_SWAPPINESS) + return LRU_GEN_ANON; /* - * Compare the first tier of anon with that of file to determine which - * type to scan. Also need to compare other tiers of the selected type - * with the first tier of the other type to determine the last tier (of - * the selected type) to evict. + * Compare the sum of all tiers of anon with that of file to determine + * which type to scan. */ - read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp); - read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv); - type = positive_ctrl_err(&sp, &pv); + read_ctrl_pos(lruvec, LRU_GEN_ANON, MAX_NR_TIERS, swappiness, &sp); + read_ctrl_pos(lruvec, LRU_GEN_FILE, MAX_NR_TIERS, MAX_SWAPPINESS - swappiness, &pv); - read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); - for (tier = 1; tier < MAX_NR_TIERS; tier++) { - read_ctrl_pos(lruvec, type, tier, gain[type], &pv); - if (!positive_ctrl_err(&sp, &pv)) - break; - } - - *tier_idx = tier - 1; - - return type; + return positive_ctrl_err(&sp, &pv); } static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, int *type_scanned, struct list_head *list) { int i; - int type; - int scanned; - int tier = -1; - DEFINE_MIN_SEQ(lruvec); + int type = get_type_to_scan(lruvec, swappiness); - /* - * Try to make the obvious choice first, and if anon and file are both - * available from the same generation, - * 1. Interpret swappiness 1 as file first and MAX_SWAPPINESS as anon - * first. - * 2. If !__GFP_IO, file first since clean pagecache is more likely to - * exist than clean swapcache. - */ - if (!swappiness) - type = LRU_GEN_FILE; - else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) - type = LRU_GEN_ANON; - else if (swappiness == 1) - type = LRU_GEN_FILE; - else if (swappiness == MAX_SWAPPINESS) - type = LRU_GEN_ANON; - else if (!(sc->gfp_mask & __GFP_IO)) - type = LRU_GEN_FILE; - else - type = get_type_to_scan(lruvec, swappiness, &tier); + for_each_evictable_type(i, swappiness) { + int scanned; + int tier = get_tier_idx(lruvec, type); - for (i = !swappiness; i < ANON_AND_FILE; i++) { - if (tier < 0) - tier = get_tier_idx(lruvec, type); + *type_scanned = type; scanned = scan_folios(lruvec, sc, type, tier, list); if (scanned) - break; + return scanned; type = !type; - tier = -1; } - *type_scanned = type; - - return scanned; + return 0; } static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness) @@ -4573,6 +4710,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap struct reclaim_stat stat; struct lru_gen_mm_walk *walk; bool skip_retry = false; + struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); @@ -4582,7 +4720,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap scanned += try_to_inc_min_seq(lruvec, swappiness); - if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS) + if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq) scanned = 0; spin_unlock_irq(&lruvec->lru_lock); @@ -4590,7 +4728,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap if (list_empty(&list)) return scanned; retry: - reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false); + reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false, memcg); sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; sc->nr_reclaimed += reclaimed; trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, @@ -4598,31 +4736,24 @@ retry: type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); list_for_each_entry_safe_reverse(folio, next, &list, lru) { + DEFINE_MIN_SEQ(lruvec); + if (!folio_evictable(folio)) { list_del(&folio->lru); folio_putback_lru(folio); continue; } - if (folio_test_reclaim(folio) && - (folio_test_dirty(folio) || folio_test_writeback(folio))) { - /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ - if (folio_test_workingset(folio)) - folio_set_referenced(folio); - continue; - } - - if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) || - folio_mapped(folio) || folio_test_locked(folio) || - folio_test_dirty(folio) || folio_test_writeback(folio)) { - /* don't add rejected folios to the oldest generation */ - set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, - BIT(PG_active)); + /* retry folios that may have missed folio_rotate_reclaimable() */ + if (!skip_retry && !folio_test_active(folio) && !folio_mapped(folio) && + !folio_test_dirty(folio) && !folio_test_writeback(folio)) { + list_move(&folio->lru, &clean); continue; } - /* retry folios that may have missed folio_rotate_reclaimable() */ - list_move(&folio->lru, &clean); + /* don't add rejected folios to the oldest generation */ + if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type]) + set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active)); } spin_lock_irq(&lruvec->lru_lock); @@ -4635,10 +4766,13 @@ retry: reset_batch_size(walk); } - item = PGSTEAL_KSWAPD + reclaimer_offset(); + __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), + stat.nr_demoted); + + item = PGSTEAL_KSWAPD + reclaimer_offset(sc); if (!cgroup_reclaim(sc)) __count_vm_events(item, reclaimed); - __count_memcg_events(memcg, item, reclaimed); + count_memcg_events(memcg, item, reclaimed); __count_vm_events(PGSTEAL_ANON + type, reclaimed); spin_unlock_irq(&lruvec->lru_lock); @@ -4654,63 +4788,32 @@ retry: } static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, - bool can_swap, unsigned long *nr_to_scan) + int swappiness, unsigned long *nr_to_scan) { int gen, type, zone; - unsigned long old = 0; - unsigned long young = 0; - unsigned long total = 0; + unsigned long size = 0; struct lru_gen_folio *lrugen = &lruvec->lrugen; DEFINE_MIN_SEQ(lruvec); - /* whether this lruvec is completely out of cold folios */ - if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) { - *nr_to_scan = 0; + *nr_to_scan = 0; + /* have to run aging, since eviction is not possible anymore */ + if (evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS > max_seq) return true; - } - for (type = !can_swap; type < ANON_AND_FILE; type++) { + for_each_evictable_type(type, swappiness) { unsigned long seq; for (seq = min_seq[type]; seq <= max_seq; seq++) { - unsigned long size = 0; - gen = lru_gen_from_seq(seq); for (zone = 0; zone < MAX_NR_ZONES; zone++) size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); - - total += size; - if (seq == max_seq) - young += size; - else if (seq + MIN_NR_GENS == max_seq) - old += size; } } - *nr_to_scan = total; - - /* - * The aging tries to be lazy to reduce the overhead, while the eviction - * stalls when the number of generations reaches MIN_NR_GENS. Hence, the - * ideal number of generations is MIN_NR_GENS+1. - */ - if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) - return false; - - /* - * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) - * of the total number of pages for each generation. A reasonable range - * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The - * aging cares about the upper bound of hot pages, while the eviction - * cares about the lower bound of cold pages. - */ - if (young * MIN_NR_GENS > total) - return true; - if (old * (MIN_NR_GENS + 2) < total) - return true; - - return false; + *nr_to_scan = size; + /* better to run aging even though eviction is still possible */ + return evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS == max_seq; } /* @@ -4718,7 +4821,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg * reclaim. */ -static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap) +static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness) { bool success; unsigned long nr_to_scan; @@ -4728,7 +4831,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) return -1; - success = should_run_aging(lruvec, max_seq, can_swap, &nr_to_scan); + success = should_run_aging(lruvec, max_seq, swappiness, &nr_to_scan); /* try to scrape all its memory if this memcg was deleted */ if (nr_to_scan && !mem_cgroup_online(memcg)) @@ -4739,7 +4842,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool return nr_to_scan >> sc->priority; /* stop scanning this lruvec as it's low on cold folios */ - return try_to_inc_max_seq(lruvec, max_seq, can_swap, false) ? -1 : 0; + return try_to_inc_max_seq(lruvec, max_seq, swappiness, false) ? -1 : 0; } static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc) @@ -5283,8 +5386,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, s = "rep"; n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); - if (tier) - n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]); + n[2] = READ_ONCE(lrugen->protected[hist][type][tier]); } for (i = 0; i < 3; i++) @@ -5339,7 +5441,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v) seq_printf(m, " node %5d\n", nid); if (!full) - seq = min_seq[LRU_GEN_ANON]; + seq = evictable_min_seq(min_seq, MAX_SWAPPINESS / 2); else if (max_seq >= MAX_NR_GENS) seq = max_seq - MAX_NR_GENS + 1; else @@ -5379,23 +5481,14 @@ static const struct seq_operations lru_gen_seq_ops = { }; static int run_aging(struct lruvec *lruvec, unsigned long seq, - bool can_swap, bool force_scan) + int swappiness, bool force_scan) { DEFINE_MAX_SEQ(lruvec); - DEFINE_MIN_SEQ(lruvec); - - if (seq < max_seq) - return 0; if (seq > max_seq) return -EINVAL; - if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq) - return -ERANGE; - - try_to_inc_max_seq(lruvec, max_seq, can_swap, force_scan); - - return 0; + return try_to_inc_max_seq(lruvec, max_seq, swappiness, force_scan) ? 0 : -EEXIST; } static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, @@ -5411,7 +5504,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co while (!signal_pending(current)) { DEFINE_MIN_SEQ(lruvec); - if (seq < min_seq[!swappiness]) + if (seq < evictable_min_seq(min_seq, swappiness)) return 0; if (sc->nr_reclaimed >= nr_to_reclaim) @@ -5456,7 +5549,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, if (swappiness < MIN_SWAPPINESS) swappiness = get_swappiness(lruvec, sc); - else if (swappiness > MAX_SWAPPINESS) + else if (swappiness > SWAPPINESS_ANON_ONLY) goto done; switch (cmd) { @@ -5513,24 +5606,35 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, while ((cur = strsep(&next, ",;\n"))) { int n; int end; - char cmd; + char cmd, swap_string[5]; unsigned int memcg_id; unsigned int nid; unsigned long seq; - unsigned int swappiness = -1; + unsigned int swappiness; unsigned long opt = -1; cur = skip_spaces(cur); if (!*cur) continue; - n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid, - &seq, &end, &swappiness, &end, &opt, &end); + n = sscanf(cur, "%c %u %u %lu %n %4s %n %lu %n", &cmd, &memcg_id, &nid, + &seq, &end, swap_string, &end, &opt, &end); if (n < 4 || cur[end]) { err = -EINVAL; break; } + if (n == 4) { + swappiness = -1; + } else if (!strcmp("max", swap_string)) { + /* set by userspace for anonymous memory only */ + swappiness = SWAPPINESS_ANON_ONLY; + } else { + err = kstrtouint(swap_string, 0, &swappiness); + if (err) + break; + } + err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt); if (err) break; @@ -5790,7 +5894,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) && + if (can_age_anon_pages(lruvec, sc) && inactive_is_low(lruvec, LRU_INACTIVE_ANON)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); @@ -5821,6 +5925,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, unsigned long pages_for_compaction; unsigned long inactive_lru_pages; int z; + struct zone *zone; /* If not in reclaim/compaction mode, stop */ if (!in_reclaim_compaction(sc)) @@ -5840,17 +5945,16 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, return false; /* If compaction would go ahead or the allocation would succeed, stop */ - for (z = 0; z <= sc->reclaim_idx; z++) { - struct zone *zone = &pgdat->node_zones[z]; - if (!managed_zone(zone)) - continue; + for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) { + unsigned long watermark = min_wmark_pages(zone); /* Allocation can already succeed, nothing to do */ - if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), + if (zone_watermark_ok(zone, sc->order, watermark, sc->reclaim_idx, 0)) return false; - if (compaction_suitable(zone, sc->order, sc->reclaim_idx)) + if (compaction_suitable(zone, sc->order, watermark, + sc->reclaim_idx)) return false; } @@ -6077,22 +6181,21 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) sc->reclaim_idx, 0)) return true; - /* Compaction cannot yet proceed. Do reclaim. */ - if (!compaction_suitable(zone, sc->order, sc->reclaim_idx)) - return false; - /* - * Compaction is already possible, but it takes time to run and there - * are potentially other callers using the pages just freed. So proceed - * with reclaim to make a buffer of free pages available to give - * compaction a reasonable chance of completing and allocating the page. + * Direct reclaim usually targets the min watermark, but compaction + * takes time to run and there are potentially other callers using the + * pages just freed. So target a higher buffer to give compaction a + * reasonable chance of completing and allocating the pages. + * * Note that we won't actually reclaim the whole buffer in one attempt * as the target watermark in should_continue_reclaim() is lower. But if * we are already above the high+gap watermark, don't reclaim at all. */ - watermark = high_wmark_pages(zone) + compact_gap(sc->order); + watermark = high_wmark_pages(zone); + if (compaction_suitable(zone, sc->order, watermark, sc->reclaim_idx)) + return true; - return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx); + return false; } static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc) @@ -6371,11 +6474,7 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) return true; - for (i = 0; i <= ZONE_NORMAL; i++) { - zone = &pgdat->node_zones[i]; - if (!managed_zone(zone)) - continue; - + for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) { if (!zone_reclaimable_pages(zone)) continue; @@ -6626,10 +6725,10 @@ static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) return; } - if (!can_age_anon_pages(pgdat, sc)) + lruvec = mem_cgroup_lruvec(NULL, pgdat); + if (!can_age_anon_pages(lruvec, sc)) return; - lruvec = mem_cgroup_lruvec(NULL, pgdat); if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON)) return; @@ -6680,17 +6779,48 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) * Check watermarks bottom-up as lower zones are more likely to * meet watermarks. */ - for (i = 0; i <= highest_zoneidx; i++) { - zone = pgdat->node_zones + i; - - if (!managed_zone(zone)) - continue; + for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) { + enum zone_stat_item item; + unsigned long free_pages; if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) mark = promo_wmark_pages(zone); else mark = high_wmark_pages(zone); - if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx)) + + /* + * In defrag_mode, watermarks must be met in whole + * blocks to avoid polluting allocator fallbacks. + * + * However, kswapd usually cannot accomplish this on + * its own and needs kcompactd support. Once it's + * reclaimed a compaction gap, and kswapd_shrink_node + * has dropped order, simply ensure there are enough + * base pages for compaction, wake kcompactd & sleep. + */ + if (defrag_mode && order) + item = NR_FREE_PAGES_BLOCKS; + else + item = NR_FREE_PAGES; + + /* + * When there is a high number of CPUs in the system, + * the cumulative error from the vmstat per-cpu cache + * can blur the line between the watermarks. In that + * case, be safe and get an accurate snapshot. + * + * TODO: NR_FREE_PAGES_BLOCKS moves in steps of + * pageblock_nr_pages, while the vmstat pcp threshold + * is limited to 125. On many configurations that + * counter won't actually be per-cpu cached. But keep + * things simple for now; revisit when somebody cares. + */ + free_pages = zone_page_state(zone, item); + if (zone->percpu_drift_mark && free_pages < zone->percpu_drift_mark) + free_pages = zone_page_state_snapshot(zone, item); + + if (__zone_watermark_ok(zone, order, mark, highest_zoneidx, + 0, free_pages)) return true; } @@ -6770,11 +6900,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, /* Reclaim a number of pages proportional to the number of zones */ sc->nr_to_reclaim = 0; - for (z = 0; z <= sc->reclaim_idx; z++) { - zone = pgdat->node_zones + z; - if (!managed_zone(zone)) - continue; - + for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) { sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX); } @@ -6805,12 +6931,7 @@ update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active) int i; struct zone *zone; - for (i = 0; i <= highest_zoneidx; i++) { - zone = pgdat->node_zones + i; - - if (!managed_zone(zone)) - continue; - + for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) { if (active) set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); else @@ -6871,11 +6992,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) * stall or direct reclaim until kswapd is finished. */ nr_boost_reclaim = 0; - for (i = 0; i <= highest_zoneidx; i++) { - zone = pgdat->node_zones + i; - if (!managed_zone(zone)) - continue; - + for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) { nr_boost_reclaim += zone->watermark_boost; zone_boosts[i] = zone->watermark_boost; } @@ -7182,10 +7299,6 @@ static int kswapd(void *p) unsigned int highest_zoneidx = MAX_NR_ZONES - 1; pg_data_t *pgdat = (pg_data_t *)p; struct task_struct *tsk = current; - const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); - - if (!cpumask_empty(cpumask)) - set_cpus_allowed_ptr(tsk, cpumask); /* * Tell the memory management that we're a "memory allocator", @@ -7354,13 +7467,15 @@ void __meminit kswapd_run(int nid) pgdat_kswapd_lock(pgdat); if (!pgdat->kswapd) { - pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); + pgdat->kswapd = kthread_create_on_node(kswapd, pgdat, nid, "kswapd%d", nid); if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ pr_err("Failed to start kswapd on node %d,ret=%ld\n", nid, PTR_ERR(pgdat->kswapd)); BUG_ON(system_state < SYSTEM_RUNNING); pgdat->kswapd = NULL; + } else { + wake_up_process(pgdat->kswapd); } } pgdat_kswapd_unlock(pgdat); @@ -7384,6 +7499,28 @@ void __meminit kswapd_stop(int nid) pgdat_kswapd_unlock(pgdat); } +static const struct ctl_table vmscan_sysctl_table[] = { + { + .procname = "swappiness", + .data = &vm_swappiness, + .maxlen = sizeof(vm_swappiness), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO_HUNDRED, + }, +#ifdef CONFIG_NUMA + { + .procname = "zone_reclaim_mode", + .data = &node_reclaim_mode, + .maxlen = sizeof(node_reclaim_mode), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + } +#endif +}; + static int __init kswapd_init(void) { int nid; @@ -7391,6 +7528,7 @@ static int __init kswapd_init(void) swap_setup(); for_each_node_state(nid, N_MEMORY) kswapd_run(nid); + register_sysctl_init("vm", vmscan_sysctl_table); return 0; } @@ -7556,11 +7694,11 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id()) return NODE_RECLAIM_NOSCAN; - if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags)) + if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, &pgdat->flags)) return NODE_RECLAIM_NOSCAN; ret = __node_reclaim(pgdat, gfp_mask, order); - clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags); + clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags); if (ret) count_vm_event(PGSCAN_ZONE_RECLAIM_SUCCESS); diff --git a/mm/vmstat.c b/mm/vmstat.c index 4d016314a56c..429ae5339bfe 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -7,7 +7,7 @@ * * zoned VM statistics * Copyright (C) 2006 Silicon Graphics, Inc., - * Christoph Lameter <christoph@lameter.com> + * Christoph Lameter <cl@gentwo.org> * Copyright (C) 2008-2014 Christoph Lameter */ #include <linux/fs.h> @@ -31,8 +31,10 @@ #include "internal.h" +#ifdef CONFIG_PROC_FS #ifdef CONFIG_NUMA -int sysctl_vm_numa_stat = ENABLE_NUMA_STAT; +#define ENABLE_NUMA_STAT 1 +static int sysctl_vm_numa_stat = ENABLE_NUMA_STAT; /* zero numa counters within a zone */ static void zero_zone_numa_counters(struct zone *zone) @@ -74,7 +76,7 @@ static void invalid_numa_statistics(void) static DEFINE_MUTEX(vm_numa_stat_lock); -int sysctl_vm_numa_stat_handler(const struct ctl_table *table, int write, +static int sysctl_vm_numa_stat_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int ret, oldval; @@ -102,6 +104,7 @@ out: return ret; } #endif +#endif /* CONFIG_PROC_FS */ #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; @@ -1190,6 +1193,7 @@ int fragmentation_index(struct zone *zone, unsigned int order) const char * const vmstat_text[] = { /* enum zone_stat_item counters */ "nr_free_pages", + "nr_free_pages_blocks", "nr_zone_inactive_anon", "nr_zone_active_anon", "nr_zone_inactive_file", @@ -1197,7 +1201,6 @@ const char * const vmstat_text[] = { "nr_zone_unevictable", "nr_zone_write_pending", "nr_mlock", - "nr_bounce", #if IS_ENABLED(CONFIG_ZSMALLOC) "nr_zspages", #endif @@ -1273,9 +1276,11 @@ const char * const vmstat_text[] = { "pgdemote_kswapd", "pgdemote_direct", "pgdemote_khugepaged", + "pgdemote_proactive", #ifdef CONFIG_HUGETLB_PAGE "nr_hugetlb", #endif + "nr_balloon_pages", /* system-wide enum vm_stat_item counters */ "nr_dirty_threshold", "nr_dirty_background_threshold", @@ -1307,9 +1312,11 @@ const char * const vmstat_text[] = { "pgsteal_kswapd", "pgsteal_direct", "pgsteal_khugepaged", + "pgsteal_proactive", "pgscan_kswapd", "pgscan_direct", "pgscan_khugepaged", + "pgscan_proactive", "pgscan_direct_throttle", "pgscan_anon", "pgscan_file", @@ -1339,6 +1346,8 @@ const char * const vmstat_text[] = { "numa_hint_faults", "numa_hint_faults_local", "numa_pages_migrated", + "numa_task_migrated", + "numa_task_swapped", #endif #ifdef CONFIG_MIGRATION "pgmigrate_success", @@ -1435,6 +1444,8 @@ const char * const vmstat_text[] = { #ifdef CONFIG_X86 "direct_map_level2_splits", "direct_map_level3_splits", + "direct_map_level2_collapses", + "direct_map_level3_collapses", #endif #ifdef CONFIG_PER_VMA_LOCK_STATS "vma_lock_success", @@ -1938,7 +1949,7 @@ static const struct seq_operations vmstat_op = { #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct delayed_work, vmstat_work); -int sysctl_stat_interval __read_mostly = HZ; +static int sysctl_stat_interval __read_mostly = HZ; static int vmstat_late_init_done; #ifdef CONFIG_PROC_FS @@ -1947,7 +1958,7 @@ static void refresh_vm_stats(struct work_struct *work) refresh_cpu_vm_stats(true); } -int vmstat_refresh(const struct ctl_table *table, int write, +static int vmstat_refresh(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { long val; @@ -2122,10 +2133,20 @@ static void __init start_shepherd_timer(void) { int cpu; - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), vmstat_update); + /* + * For secondary CPUs during CPU hotplug scenarios, + * vmstat_cpu_online() will enable the work. + * mm/vmstat:online enables and disables vmstat_work + * symmetrically during CPU hotplug events. + */ + if (!cpu_online(cpu)) + disable_delayed_work_sync(&per_cpu(vmstat_work, cpu)); + } + schedule_delayed_work(&shepherd, round_jiffies_relative(sysctl_stat_interval)); } @@ -2148,13 +2169,14 @@ static int vmstat_cpu_online(unsigned int cpu) if (!node_state(cpu_to_node(cpu), N_CPU)) { node_set_state(cpu_to_node(cpu), N_CPU); } + enable_delayed_work(&per_cpu(vmstat_work, cpu)); return 0; } static int vmstat_cpu_down_prep(unsigned int cpu) { - cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); + disable_delayed_work_sync(&per_cpu(vmstat_work, cpu)); return 0; } @@ -2185,6 +2207,38 @@ static int __init vmstat_late_init(void) late_initcall(vmstat_late_init); #endif +#ifdef CONFIG_PROC_FS +static const struct ctl_table vmstat_table[] = { +#ifdef CONFIG_SMP + { + .procname = "stat_interval", + .data = &sysctl_stat_interval, + .maxlen = sizeof(sysctl_stat_interval), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "stat_refresh", + .data = NULL, + .maxlen = 0, + .mode = 0600, + .proc_handler = vmstat_refresh, + }, +#endif +#ifdef CONFIG_NUMA + { + .procname = "numa_stat", + .data = &sysctl_vm_numa_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_vm_numa_stat_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif +}; +#endif + struct workqueue_struct *mm_percpu_wq; void __init init_mm_internals(void) @@ -2216,6 +2270,7 @@ void __init init_mm_internals(void) proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op); proc_create_seq("vmstat", 0444, NULL, &vmstat_op); proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op); + register_sysctl_init("vm", vmstat_table); #endif } diff --git a/mm/workingset.c b/mm/workingset.c index a4705e196545..6e7f4cb1b9a7 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -239,7 +239,8 @@ static void *lru_gen_eviction(struct folio *folio) int type = folio_is_file_lru(folio); int delta = folio_nr_pages(folio); int refs = folio_lru_refs(folio); - int tier = lru_tier_from_refs(refs); + bool workingset = folio_test_workingset(folio); + int tier = lru_tier_from_refs(refs, workingset); struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); @@ -253,18 +254,18 @@ static void *lru_gen_eviction(struct folio *folio) hist = lru_hist_from_seq(min_seq); atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); - return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs); + return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset); } /* * Tests if the shadow entry is for a folio that was recently evicted. * Fills in @lruvec, @token, @workingset with the values unpacked from shadow. */ -static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec, +static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, unsigned long *token, bool *workingset) { int memcg_id; - unsigned long min_seq; + unsigned long max_seq; struct mem_cgroup *memcg; struct pglist_data *pgdat; @@ -273,8 +274,10 @@ static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec, memcg = mem_cgroup_from_id(memcg_id); *lruvec = mem_cgroup_lruvec(memcg, pgdat); - min_seq = READ_ONCE((*lruvec)->lrugen.min_seq[file]); - return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)); + max_seq = READ_ONCE((*lruvec)->lrugen.max_seq); + max_seq &= EVICTION_MASK >> LRU_REFS_WIDTH; + + return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS; } static void lru_gen_refault(struct folio *folio, void *shadow) @@ -290,7 +293,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow) rcu_read_lock(); - recent = lru_gen_test_recent(shadow, type, &lruvec, &token, &workingset); + recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset); if (lruvec != folio_lruvec(folio)) goto unlock; @@ -302,24 +305,20 @@ static void lru_gen_refault(struct folio *folio, void *shadow) lrugen = &lruvec->lrugen; hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type])); - /* see the comment in folio_lru_refs() */ - refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset; - tier = lru_tier_from_refs(refs); + refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + 1; + tier = lru_tier_from_refs(refs, workingset); atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); - mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); - /* - * Count the following two cases as stalls: - * 1. For pages accessed through page tables, hotter pages pushed out - * hot pages which refaulted immediately. - * 2. For pages accessed multiple times through file descriptors, - * they would have been protected by sort_folio(). - */ - if (lru_gen_in_fault() || refs >= BIT(LRU_REFS_WIDTH) - 1) { - set_mask_bits(&folio->flags, 0, LRU_REFS_MASK | BIT(PG_workingset)); + /* see folio_add_lru() where folio_set_active() will be called */ + if (lru_gen_in_fault()) + mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); + + if (workingset) { + folio_set_workingset(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); - } + } else + set_mask_bits(&folio->flags, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF); unlock: rcu_read_unlock(); } @@ -331,7 +330,7 @@ static void *lru_gen_eviction(struct folio *folio) return NULL; } -static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec, +static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, unsigned long *token, bool *workingset) { return false; @@ -428,17 +427,16 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset, struct pglist_data *pgdat; unsigned long eviction; - rcu_read_lock(); - if (lru_gen_enabled()) { - bool recent = lru_gen_test_recent(shadow, file, - &eviction_lruvec, &eviction, workingset); + bool recent; + rcu_read_lock(); + recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction, workingset); rcu_read_unlock(); return recent; } - + rcu_read_lock(); unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset); eviction <<= bucket_order; @@ -459,14 +457,12 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset, * configurations instead. */ eviction_memcg = mem_cgroup_from_id(memcgid); - if (!mem_cgroup_disabled() && - (!eviction_memcg || !mem_cgroup_tryget(eviction_memcg))) { - rcu_read_unlock(); - return false; - } - + if (!mem_cgroup_tryget(eviction_memcg)) + eviction_memcg = NULL; rcu_read_unlock(); + if (!mem_cgroup_disabled() && !eviction_memcg) + return false; /* * Flush stats (and potentially sleep) outside the RCU read section. * @@ -544,6 +540,8 @@ void workingset_refault(struct folio *folio, void *shadow) bool workingset; long nr; + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + if (lru_gen_enabled()) { lru_gen_refault(folio, shadow); return; @@ -558,7 +556,6 @@ void workingset_refault(struct folio *folio, void *shadow) * is actually experiencing the refault event. Make sure the folio is * locked to guarantee folio_memcg() stability throughout. */ - VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); nr = folio_nr_pages(folio); memcg = folio_memcg(folio); pgdat = folio_pgdat(folio); @@ -615,7 +612,6 @@ struct list_lru shadow_nodes; void workingset_update_node(struct xa_node *node) { - struct address_space *mapping; struct page *page = virt_to_page(node); /* @@ -626,8 +622,7 @@ void workingset_update_node(struct xa_node *node) * already where they should be. The list_empty() test is safe * as node->private_list is protected by the i_pages lock. */ - mapping = container_of(node->array, struct address_space, i_pages); - lockdep_assert_held(&mapping->i_pages.xa_lock); + lockdep_assert_held(&node->array->xa_lock); if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { diff --git a/mm/z3fold.c b/mm/z3fold.c deleted file mode 100644 index 379d24b4fef9..000000000000 --- a/mm/z3fold.c +++ /dev/null @@ -1,1447 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * z3fold.c - * - * Author: Vitaly Wool <vitaly.wool@konsulko.com> - * Copyright (C) 2016, Sony Mobile Communications Inc. - * - * This implementation is based on zbud written by Seth Jennings. - * - * z3fold is an special purpose allocator for storing compressed pages. It - * can store up to three compressed pages per page which improves the - * compression ratio of zbud while retaining its main concepts (e. g. always - * storing an integral number of objects per page) and simplicity. - * It still has simple and deterministic reclaim properties that make it - * preferable to a higher density approach (with no requirement on integral - * number of object per page) when reclaim is used. - * - * As in zbud, pages are divided into "chunks". The size of the chunks is - * fixed at compile time and is determined by NCHUNKS_ORDER below. - * - * z3fold doesn't export any API and is meant to be used via zpool API. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/atomic.h> -#include <linux/sched.h> -#include <linux/cpumask.h> -#include <linux/list.h> -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/page-flags.h> -#include <linux/migrate.h> -#include <linux/node.h> -#include <linux/compaction.h> -#include <linux/percpu.h> -#include <linux/preempt.h> -#include <linux/workqueue.h> -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/zpool.h> -#include <linux/kmemleak.h> - -/* - * NCHUNKS_ORDER determines the internal allocation granularity, effectively - * adjusting internal fragmentation. It also determines the number of - * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the - * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks - * in the beginning of an allocated page are occupied by z3fold header, so - * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y), - * which shows the max number of free chunks in z3fold page, also there will - * be 63, or 62, respectively, freelists per pool. - */ -#define NCHUNKS_ORDER 6 - -#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) -#define CHUNK_SIZE (1 << CHUNK_SHIFT) -#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE) -#define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT) -#define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT) -#define NCHUNKS (TOTAL_CHUNKS - ZHDR_CHUNKS) - -#define BUDDY_MASK (0x3) -#define BUDDY_SHIFT 2 -#define SLOTS_ALIGN (0x40) - -/***************** - * Structures -*****************/ -struct z3fold_pool; - -enum buddy { - HEADLESS = 0, - FIRST, - MIDDLE, - LAST, - BUDDIES_MAX = LAST -}; - -struct z3fold_buddy_slots { - /* - * we are using BUDDY_MASK in handle_to_buddy etc. so there should - * be enough slots to hold all possible variants - */ - unsigned long slot[BUDDY_MASK + 1]; - unsigned long pool; /* back link */ - rwlock_t lock; -}; -#define HANDLE_FLAG_MASK (0x03) - -/* - * struct z3fold_header - z3fold page metadata occupying first chunks of each - * z3fold page, except for HEADLESS pages - * @buddy: links the z3fold page into the relevant list in the - * pool - * @page_lock: per-page lock - * @refcount: reference count for the z3fold page - * @work: work_struct for page layout optimization - * @slots: pointer to the structure holding buddy slots - * @pool: pointer to the containing pool - * @cpu: CPU which this page "belongs" to - * @first_chunks: the size of the first buddy in chunks, 0 if free - * @middle_chunks: the size of the middle buddy in chunks, 0 if free - * @last_chunks: the size of the last buddy in chunks, 0 if free - * @first_num: the starting number (for the first handle) - * @mapped_count: the number of objects currently mapped - */ -struct z3fold_header { - struct list_head buddy; - spinlock_t page_lock; - struct kref refcount; - struct work_struct work; - struct z3fold_buddy_slots *slots; - struct z3fold_pool *pool; - short cpu; - unsigned short first_chunks; - unsigned short middle_chunks; - unsigned short last_chunks; - unsigned short start_middle; - unsigned short first_num:2; - unsigned short mapped_count:2; - unsigned short foreign_handles:2; -}; - -/** - * struct z3fold_pool - stores metadata for each z3fold pool - * @name: pool name - * @lock: protects pool unbuddied lists - * @stale_lock: protects pool stale page list - * @unbuddied: per-cpu array of lists tracking z3fold pages that contain 2- - * buddies; the list each z3fold page is added to depends on - * the size of its free region. - * @stale: list of pages marked for freeing - * @pages_nr: number of z3fold pages in the pool. - * @c_handle: cache for z3fold_buddy_slots allocation - * @compact_wq: workqueue for page layout background optimization - * @release_wq: workqueue for safe page release - * @work: work_struct for safe page release - * - * This structure is allocated at pool creation time and maintains metadata - * pertaining to a particular z3fold pool. - */ -struct z3fold_pool { - const char *name; - spinlock_t lock; - spinlock_t stale_lock; - struct list_head __percpu *unbuddied; - struct list_head stale; - atomic64_t pages_nr; - struct kmem_cache *c_handle; - struct workqueue_struct *compact_wq; - struct workqueue_struct *release_wq; - struct work_struct work; -}; - -/* - * Internal z3fold page flags - */ -enum z3fold_page_flags { - PAGE_HEADLESS = 0, - MIDDLE_CHUNK_MAPPED, - NEEDS_COMPACTING, - PAGE_STALE, - PAGE_CLAIMED, /* by either reclaim or free */ - PAGE_MIGRATED, /* page is migrated and soon to be released */ -}; - -/* - * handle flags, go under HANDLE_FLAG_MASK - */ -enum z3fold_handle_flags { - HANDLES_NOFREE = 0, -}; - -/* - * Forward declarations - */ -static struct z3fold_header *__z3fold_alloc(struct z3fold_pool *, size_t, bool); -static void compact_page_work(struct work_struct *w); - -/***************** - * Helpers -*****************/ - -/* Converts an allocation size in bytes to size in z3fold chunks */ -static int size_to_chunks(size_t size) -{ - return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; -} - -#define for_each_unbuddied_list(_iter, _begin) \ - for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++) - -static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool, - gfp_t gfp) -{ - struct z3fold_buddy_slots *slots = kmem_cache_zalloc(pool->c_handle, - gfp); - - if (slots) { - /* It will be freed separately in free_handle(). */ - kmemleak_not_leak(slots); - slots->pool = (unsigned long)pool; - rwlock_init(&slots->lock); - } - - return slots; -} - -static inline struct z3fold_pool *slots_to_pool(struct z3fold_buddy_slots *s) -{ - return (struct z3fold_pool *)(s->pool & ~HANDLE_FLAG_MASK); -} - -static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle) -{ - return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1)); -} - -/* Lock a z3fold page */ -static inline void z3fold_page_lock(struct z3fold_header *zhdr) -{ - spin_lock(&zhdr->page_lock); -} - -/* Try to lock a z3fold page */ -static inline int z3fold_page_trylock(struct z3fold_header *zhdr) -{ - return spin_trylock(&zhdr->page_lock); -} - -/* Unlock a z3fold page */ -static inline void z3fold_page_unlock(struct z3fold_header *zhdr) -{ - spin_unlock(&zhdr->page_lock); -} - -/* return locked z3fold page if it's not headless */ -static inline struct z3fold_header *get_z3fold_header(unsigned long handle) -{ - struct z3fold_buddy_slots *slots; - struct z3fold_header *zhdr; - int locked = 0; - - if (!(handle & (1 << PAGE_HEADLESS))) { - slots = handle_to_slots(handle); - do { - unsigned long addr; - - read_lock(&slots->lock); - addr = *(unsigned long *)handle; - zhdr = (struct z3fold_header *)(addr & PAGE_MASK); - locked = z3fold_page_trylock(zhdr); - read_unlock(&slots->lock); - if (locked) { - struct page *page = virt_to_page(zhdr); - - if (!test_bit(PAGE_MIGRATED, &page->private)) - break; - z3fold_page_unlock(zhdr); - } - cpu_relax(); - } while (true); - } else { - zhdr = (struct z3fold_header *)(handle & PAGE_MASK); - } - - return zhdr; -} - -static inline void put_z3fold_header(struct z3fold_header *zhdr) -{ - struct page *page = virt_to_page(zhdr); - - if (!test_bit(PAGE_HEADLESS, &page->private)) - z3fold_page_unlock(zhdr); -} - -static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr) -{ - struct z3fold_buddy_slots *slots; - int i; - bool is_free; - - if (WARN_ON(*(unsigned long *)handle == 0)) - return; - - slots = handle_to_slots(handle); - write_lock(&slots->lock); - *(unsigned long *)handle = 0; - - if (test_bit(HANDLES_NOFREE, &slots->pool)) { - write_unlock(&slots->lock); - return; /* simple case, nothing else to do */ - } - - if (zhdr->slots != slots) - zhdr->foreign_handles--; - - is_free = true; - for (i = 0; i <= BUDDY_MASK; i++) { - if (slots->slot[i]) { - is_free = false; - break; - } - } - write_unlock(&slots->lock); - - if (is_free) { - struct z3fold_pool *pool = slots_to_pool(slots); - - if (zhdr->slots == slots) - zhdr->slots = NULL; - kmem_cache_free(pool->c_handle, slots); - } -} - -/* Initializes the z3fold header of a newly allocated z3fold page */ -static struct z3fold_header *init_z3fold_page(struct page *page, bool headless, - struct z3fold_pool *pool, gfp_t gfp) -{ - struct z3fold_header *zhdr = page_address(page); - struct z3fold_buddy_slots *slots; - - clear_bit(PAGE_HEADLESS, &page->private); - clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); - clear_bit(NEEDS_COMPACTING, &page->private); - clear_bit(PAGE_STALE, &page->private); - clear_bit(PAGE_CLAIMED, &page->private); - clear_bit(PAGE_MIGRATED, &page->private); - if (headless) - return zhdr; - - slots = alloc_slots(pool, gfp); - if (!slots) - return NULL; - - memset(zhdr, 0, sizeof(*zhdr)); - spin_lock_init(&zhdr->page_lock); - kref_init(&zhdr->refcount); - zhdr->cpu = -1; - zhdr->slots = slots; - zhdr->pool = pool; - INIT_LIST_HEAD(&zhdr->buddy); - INIT_WORK(&zhdr->work, compact_page_work); - return zhdr; -} - -/* Resets the struct page fields and frees the page */ -static void free_z3fold_page(struct page *page, bool headless) -{ - if (!headless) { - lock_page(page); - __ClearPageMovable(page); - unlock_page(page); - } - __free_page(page); -} - -/* Helper function to build the index */ -static inline int __idx(struct z3fold_header *zhdr, enum buddy bud) -{ - return (bud + zhdr->first_num) & BUDDY_MASK; -} - -/* - * Encodes the handle of a particular buddy within a z3fold page. - * Zhdr->page_lock should be held as this function accesses first_num - * if bud != HEADLESS. - */ -static unsigned long __encode_handle(struct z3fold_header *zhdr, - struct z3fold_buddy_slots *slots, - enum buddy bud) -{ - unsigned long h = (unsigned long)zhdr; - int idx = 0; - - /* - * For a headless page, its handle is its pointer with the extra - * PAGE_HEADLESS bit set - */ - if (bud == HEADLESS) - return h | (1 << PAGE_HEADLESS); - - /* otherwise, return pointer to encoded handle */ - idx = __idx(zhdr, bud); - h += idx; - if (bud == LAST) - h |= (zhdr->last_chunks << BUDDY_SHIFT); - - write_lock(&slots->lock); - slots->slot[idx] = h; - write_unlock(&slots->lock); - return (unsigned long)&slots->slot[idx]; -} - -static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) -{ - return __encode_handle(zhdr, zhdr->slots, bud); -} - -/* only for LAST bud, returns zero otherwise */ -static unsigned short handle_to_chunks(unsigned long handle) -{ - struct z3fold_buddy_slots *slots = handle_to_slots(handle); - unsigned long addr; - - read_lock(&slots->lock); - addr = *(unsigned long *)handle; - read_unlock(&slots->lock); - return (addr & ~PAGE_MASK) >> BUDDY_SHIFT; -} - -/* - * (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle - * but that doesn't matter. because the masking will result in the - * correct buddy number. - */ -static enum buddy handle_to_buddy(unsigned long handle) -{ - struct z3fold_header *zhdr; - struct z3fold_buddy_slots *slots = handle_to_slots(handle); - unsigned long addr; - - read_lock(&slots->lock); - WARN_ON(handle & (1 << PAGE_HEADLESS)); - addr = *(unsigned long *)handle; - read_unlock(&slots->lock); - zhdr = (struct z3fold_header *)(addr & PAGE_MASK); - return (addr - zhdr->first_num) & BUDDY_MASK; -} - -static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr) -{ - return zhdr->pool; -} - -static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) -{ - struct page *page = virt_to_page(zhdr); - struct z3fold_pool *pool = zhdr_to_pool(zhdr); - - WARN_ON(!list_empty(&zhdr->buddy)); - set_bit(PAGE_STALE, &page->private); - clear_bit(NEEDS_COMPACTING, &page->private); - spin_lock(&pool->lock); - spin_unlock(&pool->lock); - - if (locked) - z3fold_page_unlock(zhdr); - - spin_lock(&pool->stale_lock); - list_add(&zhdr->buddy, &pool->stale); - queue_work(pool->release_wq, &pool->work); - spin_unlock(&pool->stale_lock); - - atomic64_dec(&pool->pages_nr); -} - -static void release_z3fold_page_locked(struct kref *ref) -{ - struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, - refcount); - WARN_ON(z3fold_page_trylock(zhdr)); - __release_z3fold_page(zhdr, true); -} - -static void release_z3fold_page_locked_list(struct kref *ref) -{ - struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, - refcount); - struct z3fold_pool *pool = zhdr_to_pool(zhdr); - - spin_lock(&pool->lock); - list_del_init(&zhdr->buddy); - spin_unlock(&pool->lock); - - WARN_ON(z3fold_page_trylock(zhdr)); - __release_z3fold_page(zhdr, true); -} - -static inline int put_z3fold_locked(struct z3fold_header *zhdr) -{ - return kref_put(&zhdr->refcount, release_z3fold_page_locked); -} - -static inline int put_z3fold_locked_list(struct z3fold_header *zhdr) -{ - return kref_put(&zhdr->refcount, release_z3fold_page_locked_list); -} - -static void free_pages_work(struct work_struct *w) -{ - struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work); - - spin_lock(&pool->stale_lock); - while (!list_empty(&pool->stale)) { - struct z3fold_header *zhdr = list_first_entry(&pool->stale, - struct z3fold_header, buddy); - struct page *page = virt_to_page(zhdr); - - list_del(&zhdr->buddy); - if (WARN_ON(!test_bit(PAGE_STALE, &page->private))) - continue; - spin_unlock(&pool->stale_lock); - cancel_work_sync(&zhdr->work); - free_z3fold_page(page, false); - cond_resched(); - spin_lock(&pool->stale_lock); - } - spin_unlock(&pool->stale_lock); -} - -/* - * Returns the number of free chunks in a z3fold page. - * NB: can't be used with HEADLESS pages. - */ -static int num_free_chunks(struct z3fold_header *zhdr) -{ - int nfree; - /* - * If there is a middle object, pick up the bigger free space - * either before or after it. Otherwise just subtract the number - * of chunks occupied by the first and the last objects. - */ - if (zhdr->middle_chunks != 0) { - int nfree_before = zhdr->first_chunks ? - 0 : zhdr->start_middle - ZHDR_CHUNKS; - int nfree_after = zhdr->last_chunks ? - 0 : TOTAL_CHUNKS - - (zhdr->start_middle + zhdr->middle_chunks); - nfree = max(nfree_before, nfree_after); - } else - nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks; - return nfree; -} - -/* Add to the appropriate unbuddied list */ -static inline void add_to_unbuddied(struct z3fold_pool *pool, - struct z3fold_header *zhdr) -{ - if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 || - zhdr->middle_chunks == 0) { - struct list_head *unbuddied; - int freechunks = num_free_chunks(zhdr); - - migrate_disable(); - unbuddied = this_cpu_ptr(pool->unbuddied); - spin_lock(&pool->lock); - list_add(&zhdr->buddy, &unbuddied[freechunks]); - spin_unlock(&pool->lock); - zhdr->cpu = smp_processor_id(); - migrate_enable(); - } -} - -static inline enum buddy get_free_buddy(struct z3fold_header *zhdr, int chunks) -{ - enum buddy bud = HEADLESS; - - if (zhdr->middle_chunks) { - if (!zhdr->first_chunks && - chunks <= zhdr->start_middle - ZHDR_CHUNKS) - bud = FIRST; - else if (!zhdr->last_chunks) - bud = LAST; - } else { - if (!zhdr->first_chunks) - bud = FIRST; - else if (!zhdr->last_chunks) - bud = LAST; - else - bud = MIDDLE; - } - - return bud; -} - -static inline void *mchunk_memmove(struct z3fold_header *zhdr, - unsigned short dst_chunk) -{ - void *beg = zhdr; - return memmove(beg + (dst_chunk << CHUNK_SHIFT), - beg + (zhdr->start_middle << CHUNK_SHIFT), - zhdr->middle_chunks << CHUNK_SHIFT); -} - -static inline bool buddy_single(struct z3fold_header *zhdr) -{ - return !((zhdr->first_chunks && zhdr->middle_chunks) || - (zhdr->first_chunks && zhdr->last_chunks) || - (zhdr->middle_chunks && zhdr->last_chunks)); -} - -static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr) -{ - struct z3fold_pool *pool = zhdr_to_pool(zhdr); - void *p = zhdr; - unsigned long old_handle = 0; - size_t sz = 0; - struct z3fold_header *new_zhdr = NULL; - int first_idx = __idx(zhdr, FIRST); - int middle_idx = __idx(zhdr, MIDDLE); - int last_idx = __idx(zhdr, LAST); - unsigned short *moved_chunks = NULL; - - /* - * No need to protect slots here -- all the slots are "local" and - * the page lock is already taken - */ - if (zhdr->first_chunks && zhdr->slots->slot[first_idx]) { - p += ZHDR_SIZE_ALIGNED; - sz = zhdr->first_chunks << CHUNK_SHIFT; - old_handle = (unsigned long)&zhdr->slots->slot[first_idx]; - moved_chunks = &zhdr->first_chunks; - } else if (zhdr->middle_chunks && zhdr->slots->slot[middle_idx]) { - p += zhdr->start_middle << CHUNK_SHIFT; - sz = zhdr->middle_chunks << CHUNK_SHIFT; - old_handle = (unsigned long)&zhdr->slots->slot[middle_idx]; - moved_chunks = &zhdr->middle_chunks; - } else if (zhdr->last_chunks && zhdr->slots->slot[last_idx]) { - p += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT); - sz = zhdr->last_chunks << CHUNK_SHIFT; - old_handle = (unsigned long)&zhdr->slots->slot[last_idx]; - moved_chunks = &zhdr->last_chunks; - } - - if (sz > 0) { - enum buddy new_bud = HEADLESS; - short chunks = size_to_chunks(sz); - void *q; - - new_zhdr = __z3fold_alloc(pool, sz, false); - if (!new_zhdr) - return NULL; - - if (WARN_ON(new_zhdr == zhdr)) - goto out_fail; - - new_bud = get_free_buddy(new_zhdr, chunks); - q = new_zhdr; - switch (new_bud) { - case FIRST: - new_zhdr->first_chunks = chunks; - q += ZHDR_SIZE_ALIGNED; - break; - case MIDDLE: - new_zhdr->middle_chunks = chunks; - new_zhdr->start_middle = - new_zhdr->first_chunks + ZHDR_CHUNKS; - q += new_zhdr->start_middle << CHUNK_SHIFT; - break; - case LAST: - new_zhdr->last_chunks = chunks; - q += PAGE_SIZE - (new_zhdr->last_chunks << CHUNK_SHIFT); - break; - default: - goto out_fail; - } - new_zhdr->foreign_handles++; - memcpy(q, p, sz); - write_lock(&zhdr->slots->lock); - *(unsigned long *)old_handle = (unsigned long)new_zhdr + - __idx(new_zhdr, new_bud); - if (new_bud == LAST) - *(unsigned long *)old_handle |= - (new_zhdr->last_chunks << BUDDY_SHIFT); - write_unlock(&zhdr->slots->lock); - add_to_unbuddied(pool, new_zhdr); - z3fold_page_unlock(new_zhdr); - - *moved_chunks = 0; - } - - return new_zhdr; - -out_fail: - if (new_zhdr && !put_z3fold_locked(new_zhdr)) { - add_to_unbuddied(pool, new_zhdr); - z3fold_page_unlock(new_zhdr); - } - return NULL; - -} - -#define BIG_CHUNK_GAP 3 -/* Has to be called with lock held */ -static int z3fold_compact_page(struct z3fold_header *zhdr) -{ - struct page *page = virt_to_page(zhdr); - - if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private)) - return 0; /* can't move middle chunk, it's used */ - - if (unlikely(PageIsolated(page))) - return 0; - - if (zhdr->middle_chunks == 0) - return 0; /* nothing to compact */ - - if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { - /* move to the beginning */ - mchunk_memmove(zhdr, ZHDR_CHUNKS); - zhdr->first_chunks = zhdr->middle_chunks; - zhdr->middle_chunks = 0; - zhdr->start_middle = 0; - zhdr->first_num++; - return 1; - } - - /* - * moving data is expensive, so let's only do that if - * there's substantial gain (at least BIG_CHUNK_GAP chunks) - */ - if (zhdr->first_chunks != 0 && zhdr->last_chunks == 0 && - zhdr->start_middle - (zhdr->first_chunks + ZHDR_CHUNKS) >= - BIG_CHUNK_GAP) { - mchunk_memmove(zhdr, zhdr->first_chunks + ZHDR_CHUNKS); - zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS; - return 1; - } else if (zhdr->last_chunks != 0 && zhdr->first_chunks == 0 && - TOTAL_CHUNKS - (zhdr->last_chunks + zhdr->start_middle - + zhdr->middle_chunks) >= - BIG_CHUNK_GAP) { - unsigned short new_start = TOTAL_CHUNKS - zhdr->last_chunks - - zhdr->middle_chunks; - mchunk_memmove(zhdr, new_start); - zhdr->start_middle = new_start; - return 1; - } - - return 0; -} - -static void do_compact_page(struct z3fold_header *zhdr, bool locked) -{ - struct z3fold_pool *pool = zhdr_to_pool(zhdr); - struct page *page; - - page = virt_to_page(zhdr); - if (locked) - WARN_ON(z3fold_page_trylock(zhdr)); - else - z3fold_page_lock(zhdr); - if (WARN_ON(!test_and_clear_bit(NEEDS_COMPACTING, &page->private))) { - z3fold_page_unlock(zhdr); - return; - } - spin_lock(&pool->lock); - list_del_init(&zhdr->buddy); - spin_unlock(&pool->lock); - - if (put_z3fold_locked(zhdr)) - return; - - if (test_bit(PAGE_STALE, &page->private) || - test_and_set_bit(PAGE_CLAIMED, &page->private)) { - z3fold_page_unlock(zhdr); - return; - } - - if (!zhdr->foreign_handles && buddy_single(zhdr) && - zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) { - if (!put_z3fold_locked(zhdr)) { - clear_bit(PAGE_CLAIMED, &page->private); - z3fold_page_unlock(zhdr); - } - return; - } - - z3fold_compact_page(zhdr); - add_to_unbuddied(pool, zhdr); - clear_bit(PAGE_CLAIMED, &page->private); - z3fold_page_unlock(zhdr); -} - -static void compact_page_work(struct work_struct *w) -{ - struct z3fold_header *zhdr = container_of(w, struct z3fold_header, - work); - - do_compact_page(zhdr, false); -} - -/* returns _locked_ z3fold page header or NULL */ -static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool, - size_t size, bool can_sleep) -{ - struct z3fold_header *zhdr = NULL; - struct page *page; - struct list_head *unbuddied; - int chunks = size_to_chunks(size), i; - -lookup: - migrate_disable(); - /* First, try to find an unbuddied z3fold page. */ - unbuddied = this_cpu_ptr(pool->unbuddied); - for_each_unbuddied_list(i, chunks) { - struct list_head *l = &unbuddied[i]; - - zhdr = list_first_entry_or_null(READ_ONCE(l), - struct z3fold_header, buddy); - - if (!zhdr) - continue; - - /* Re-check under lock. */ - spin_lock(&pool->lock); - if (unlikely(zhdr != list_first_entry(READ_ONCE(l), - struct z3fold_header, buddy)) || - !z3fold_page_trylock(zhdr)) { - spin_unlock(&pool->lock); - zhdr = NULL; - migrate_enable(); - if (can_sleep) - cond_resched(); - goto lookup; - } - list_del_init(&zhdr->buddy); - zhdr->cpu = -1; - spin_unlock(&pool->lock); - - page = virt_to_page(zhdr); - if (test_bit(NEEDS_COMPACTING, &page->private) || - test_bit(PAGE_CLAIMED, &page->private)) { - z3fold_page_unlock(zhdr); - zhdr = NULL; - migrate_enable(); - if (can_sleep) - cond_resched(); - goto lookup; - } - - /* - * this page could not be removed from its unbuddied - * list while pool lock was held, and then we've taken - * page lock so kref_put could not be called before - * we got here, so it's safe to just call kref_get() - */ - kref_get(&zhdr->refcount); - break; - } - migrate_enable(); - - if (!zhdr) { - int cpu; - - /* look for _exact_ match on other cpus' lists */ - for_each_online_cpu(cpu) { - struct list_head *l; - - unbuddied = per_cpu_ptr(pool->unbuddied, cpu); - spin_lock(&pool->lock); - l = &unbuddied[chunks]; - - zhdr = list_first_entry_or_null(READ_ONCE(l), - struct z3fold_header, buddy); - - if (!zhdr || !z3fold_page_trylock(zhdr)) { - spin_unlock(&pool->lock); - zhdr = NULL; - continue; - } - list_del_init(&zhdr->buddy); - zhdr->cpu = -1; - spin_unlock(&pool->lock); - - page = virt_to_page(zhdr); - if (test_bit(NEEDS_COMPACTING, &page->private) || - test_bit(PAGE_CLAIMED, &page->private)) { - z3fold_page_unlock(zhdr); - zhdr = NULL; - if (can_sleep) - cond_resched(); - continue; - } - kref_get(&zhdr->refcount); - break; - } - } - - if (zhdr && !zhdr->slots) { - zhdr->slots = alloc_slots(pool, GFP_ATOMIC); - if (!zhdr->slots) - goto out_fail; - } - return zhdr; - -out_fail: - if (!put_z3fold_locked(zhdr)) { - add_to_unbuddied(pool, zhdr); - z3fold_page_unlock(zhdr); - } - return NULL; -} - -/* - * API Functions - */ - -/** - * z3fold_create_pool() - create a new z3fold pool - * @name: pool name - * @gfp: gfp flags when allocating the z3fold pool structure - * - * Return: pointer to the new z3fold pool or NULL if the metadata allocation - * failed. - */ -static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp) -{ - struct z3fold_pool *pool = NULL; - int i, cpu; - - pool = kzalloc(sizeof(struct z3fold_pool), gfp); - if (!pool) - goto out; - pool->c_handle = kmem_cache_create("z3fold_handle", - sizeof(struct z3fold_buddy_slots), - SLOTS_ALIGN, 0, NULL); - if (!pool->c_handle) - goto out_c; - spin_lock_init(&pool->lock); - spin_lock_init(&pool->stale_lock); - pool->unbuddied = __alloc_percpu(sizeof(struct list_head) * NCHUNKS, - __alignof__(struct list_head)); - if (!pool->unbuddied) - goto out_pool; - for_each_possible_cpu(cpu) { - struct list_head *unbuddied = - per_cpu_ptr(pool->unbuddied, cpu); - for_each_unbuddied_list(i, 0) - INIT_LIST_HEAD(&unbuddied[i]); - } - INIT_LIST_HEAD(&pool->stale); - atomic64_set(&pool->pages_nr, 0); - pool->name = name; - pool->compact_wq = create_singlethread_workqueue(pool->name); - if (!pool->compact_wq) - goto out_unbuddied; - pool->release_wq = create_singlethread_workqueue(pool->name); - if (!pool->release_wq) - goto out_wq; - INIT_WORK(&pool->work, free_pages_work); - return pool; - -out_wq: - destroy_workqueue(pool->compact_wq); -out_unbuddied: - free_percpu(pool->unbuddied); -out_pool: - kmem_cache_destroy(pool->c_handle); -out_c: - kfree(pool); -out: - return NULL; -} - -/** - * z3fold_destroy_pool() - destroys an existing z3fold pool - * @pool: the z3fold pool to be destroyed - * - * The pool should be emptied before this function is called. - */ -static void z3fold_destroy_pool(struct z3fold_pool *pool) -{ - kmem_cache_destroy(pool->c_handle); - - /* - * We need to destroy pool->compact_wq before pool->release_wq, - * as any pending work on pool->compact_wq will call - * queue_work(pool->release_wq, &pool->work). - * - * There are still outstanding pages until both workqueues are drained, - * so we cannot unregister migration until then. - */ - - destroy_workqueue(pool->compact_wq); - destroy_workqueue(pool->release_wq); - free_percpu(pool->unbuddied); - kfree(pool); -} - -static const struct movable_operations z3fold_mops; - -/** - * z3fold_alloc() - allocates a region of a given size - * @pool: z3fold pool from which to allocate - * @size: size in bytes of the desired allocation - * @gfp: gfp flags used if the pool needs to grow - * @handle: handle of the new allocation - * - * This function will attempt to find a free region in the pool large enough to - * satisfy the allocation request. A search of the unbuddied lists is - * performed first. If no suitable free region is found, then a new page is - * allocated and added to the pool to satisfy the request. - * - * Return: 0 if success and handle is set, otherwise -EINVAL if the size or - * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate - * a new page. - */ -static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, - unsigned long *handle) -{ - int chunks = size_to_chunks(size); - struct z3fold_header *zhdr = NULL; - struct page *page = NULL; - enum buddy bud; - bool can_sleep = gfpflags_allow_blocking(gfp); - - if (!size || (gfp & __GFP_HIGHMEM)) - return -EINVAL; - - if (size > PAGE_SIZE) - return -ENOSPC; - - if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) - bud = HEADLESS; - else { -retry: - zhdr = __z3fold_alloc(pool, size, can_sleep); - if (zhdr) { - bud = get_free_buddy(zhdr, chunks); - if (bud == HEADLESS) { - if (!put_z3fold_locked(zhdr)) - z3fold_page_unlock(zhdr); - pr_err("No free chunks in unbuddied\n"); - WARN_ON(1); - goto retry; - } - page = virt_to_page(zhdr); - goto found; - } - bud = FIRST; - } - - page = alloc_page(gfp); - if (!page) - return -ENOMEM; - - zhdr = init_z3fold_page(page, bud == HEADLESS, pool, gfp); - if (!zhdr) { - __free_page(page); - return -ENOMEM; - } - atomic64_inc(&pool->pages_nr); - - if (bud == HEADLESS) { - set_bit(PAGE_HEADLESS, &page->private); - goto headless; - } - if (can_sleep) { - lock_page(page); - __SetPageMovable(page, &z3fold_mops); - unlock_page(page); - } else { - WARN_ON(!trylock_page(page)); - __SetPageMovable(page, &z3fold_mops); - unlock_page(page); - } - z3fold_page_lock(zhdr); - -found: - if (bud == FIRST) - zhdr->first_chunks = chunks; - else if (bud == LAST) - zhdr->last_chunks = chunks; - else { - zhdr->middle_chunks = chunks; - zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS; - } - add_to_unbuddied(pool, zhdr); - -headless: - spin_lock(&pool->lock); - *handle = encode_handle(zhdr, bud); - spin_unlock(&pool->lock); - if (bud != HEADLESS) - z3fold_page_unlock(zhdr); - - return 0; -} - -/** - * z3fold_free() - frees the allocation associated with the given handle - * @pool: pool in which the allocation resided - * @handle: handle associated with the allocation returned by z3fold_alloc() - * - * In the case that the z3fold page in which the allocation resides is under - * reclaim, as indicated by the PAGE_CLAIMED flag being set, this function - * only sets the first|middle|last_chunks to 0. The page is actually freed - * once all buddies are evicted (see z3fold_reclaim_page() below). - */ -static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) -{ - struct z3fold_header *zhdr; - struct page *page; - enum buddy bud; - bool page_claimed; - - zhdr = get_z3fold_header(handle); - page = virt_to_page(zhdr); - page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private); - - if (test_bit(PAGE_HEADLESS, &page->private)) { - /* if a headless page is under reclaim, just leave. - * NB: we use test_and_set_bit for a reason: if the bit - * has not been set before, we release this page - * immediately so we don't care about its value any more. - */ - if (!page_claimed) { - put_z3fold_header(zhdr); - free_z3fold_page(page, true); - atomic64_dec(&pool->pages_nr); - } - return; - } - - /* Non-headless case */ - bud = handle_to_buddy(handle); - - switch (bud) { - case FIRST: - zhdr->first_chunks = 0; - break; - case MIDDLE: - zhdr->middle_chunks = 0; - break; - case LAST: - zhdr->last_chunks = 0; - break; - default: - pr_err("%s: unknown bud %d\n", __func__, bud); - WARN_ON(1); - put_z3fold_header(zhdr); - return; - } - - if (!page_claimed) - free_handle(handle, zhdr); - if (put_z3fold_locked_list(zhdr)) - return; - if (page_claimed) { - /* the page has not been claimed by us */ - put_z3fold_header(zhdr); - return; - } - if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) { - clear_bit(PAGE_CLAIMED, &page->private); - put_z3fold_header(zhdr); - return; - } - if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) { - zhdr->cpu = -1; - kref_get(&zhdr->refcount); - clear_bit(PAGE_CLAIMED, &page->private); - do_compact_page(zhdr, true); - return; - } - kref_get(&zhdr->refcount); - clear_bit(PAGE_CLAIMED, &page->private); - queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work); - put_z3fold_header(zhdr); -} - -/** - * z3fold_map() - maps the allocation associated with the given handle - * @pool: pool in which the allocation resides - * @handle: handle associated with the allocation to be mapped - * - * Extracts the buddy number from handle and constructs the pointer to the - * correct starting chunk within the page. - * - * Returns: a pointer to the mapped allocation - */ -static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle) -{ - struct z3fold_header *zhdr; - struct page *page; - void *addr; - enum buddy buddy; - - zhdr = get_z3fold_header(handle); - addr = zhdr; - page = virt_to_page(zhdr); - - if (test_bit(PAGE_HEADLESS, &page->private)) - goto out; - - buddy = handle_to_buddy(handle); - switch (buddy) { - case FIRST: - addr += ZHDR_SIZE_ALIGNED; - break; - case MIDDLE: - addr += zhdr->start_middle << CHUNK_SHIFT; - set_bit(MIDDLE_CHUNK_MAPPED, &page->private); - break; - case LAST: - addr += PAGE_SIZE - (handle_to_chunks(handle) << CHUNK_SHIFT); - break; - default: - pr_err("unknown buddy id %d\n", buddy); - WARN_ON(1); - addr = NULL; - break; - } - - if (addr) - zhdr->mapped_count++; -out: - put_z3fold_header(zhdr); - return addr; -} - -/** - * z3fold_unmap() - unmaps the allocation associated with the given handle - * @pool: pool in which the allocation resides - * @handle: handle associated with the allocation to be unmapped - */ -static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle) -{ - struct z3fold_header *zhdr; - struct page *page; - enum buddy buddy; - - zhdr = get_z3fold_header(handle); - page = virt_to_page(zhdr); - - if (test_bit(PAGE_HEADLESS, &page->private)) - return; - - buddy = handle_to_buddy(handle); - if (buddy == MIDDLE) - clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); - zhdr->mapped_count--; - put_z3fold_header(zhdr); -} - -/** - * z3fold_get_pool_pages() - gets the z3fold pool size in pages - * @pool: pool whose size is being queried - * - * Returns: size in pages of the given pool. - */ -static u64 z3fold_get_pool_pages(struct z3fold_pool *pool) -{ - return atomic64_read(&pool->pages_nr); -} - -static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) -{ - struct z3fold_header *zhdr; - struct z3fold_pool *pool; - - VM_BUG_ON_PAGE(PageIsolated(page), page); - - if (test_bit(PAGE_HEADLESS, &page->private)) - return false; - - zhdr = page_address(page); - z3fold_page_lock(zhdr); - if (test_bit(NEEDS_COMPACTING, &page->private) || - test_bit(PAGE_STALE, &page->private)) - goto out; - - if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) - goto out; - - if (test_and_set_bit(PAGE_CLAIMED, &page->private)) - goto out; - pool = zhdr_to_pool(zhdr); - spin_lock(&pool->lock); - if (!list_empty(&zhdr->buddy)) - list_del_init(&zhdr->buddy); - spin_unlock(&pool->lock); - - kref_get(&zhdr->refcount); - z3fold_page_unlock(zhdr); - return true; - -out: - z3fold_page_unlock(zhdr); - return false; -} - -static int z3fold_page_migrate(struct page *newpage, struct page *page, - enum migrate_mode mode) -{ - struct z3fold_header *zhdr, *new_zhdr; - struct z3fold_pool *pool; - - VM_BUG_ON_PAGE(!PageIsolated(page), page); - VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page); - VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); - - zhdr = page_address(page); - pool = zhdr_to_pool(zhdr); - - if (!z3fold_page_trylock(zhdr)) - return -EAGAIN; - if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) { - clear_bit(PAGE_CLAIMED, &page->private); - z3fold_page_unlock(zhdr); - return -EBUSY; - } - if (work_pending(&zhdr->work)) { - z3fold_page_unlock(zhdr); - return -EAGAIN; - } - new_zhdr = page_address(newpage); - memcpy(new_zhdr, zhdr, PAGE_SIZE); - newpage->private = page->private; - set_bit(PAGE_MIGRATED, &page->private); - z3fold_page_unlock(zhdr); - spin_lock_init(&new_zhdr->page_lock); - INIT_WORK(&new_zhdr->work, compact_page_work); - /* - * z3fold_page_isolate() ensures that new_zhdr->buddy is empty, - * so we only have to reinitialize it. - */ - INIT_LIST_HEAD(&new_zhdr->buddy); - __ClearPageMovable(page); - - get_page(newpage); - z3fold_page_lock(new_zhdr); - if (new_zhdr->first_chunks) - encode_handle(new_zhdr, FIRST); - if (new_zhdr->last_chunks) - encode_handle(new_zhdr, LAST); - if (new_zhdr->middle_chunks) - encode_handle(new_zhdr, MIDDLE); - set_bit(NEEDS_COMPACTING, &newpage->private); - new_zhdr->cpu = smp_processor_id(); - __SetPageMovable(newpage, &z3fold_mops); - z3fold_page_unlock(new_zhdr); - - queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work); - - /* PAGE_CLAIMED and PAGE_MIGRATED are cleared now. */ - page->private = 0; - put_page(page); - return 0; -} - -static void z3fold_page_putback(struct page *page) -{ - struct z3fold_header *zhdr; - struct z3fold_pool *pool; - - zhdr = page_address(page); - pool = zhdr_to_pool(zhdr); - - z3fold_page_lock(zhdr); - if (!list_empty(&zhdr->buddy)) - list_del_init(&zhdr->buddy); - INIT_LIST_HEAD(&page->lru); - if (put_z3fold_locked(zhdr)) - return; - if (list_empty(&zhdr->buddy)) - add_to_unbuddied(pool, zhdr); - clear_bit(PAGE_CLAIMED, &page->private); - z3fold_page_unlock(zhdr); -} - -static const struct movable_operations z3fold_mops = { - .isolate_page = z3fold_page_isolate, - .migrate_page = z3fold_page_migrate, - .putback_page = z3fold_page_putback, -}; - -/***************** - * zpool - ****************/ - -static void *z3fold_zpool_create(const char *name, gfp_t gfp) -{ - return z3fold_create_pool(name, gfp); -} - -static void z3fold_zpool_destroy(void *pool) -{ - z3fold_destroy_pool(pool); -} - -static int z3fold_zpool_malloc(void *pool, size_t size, gfp_t gfp, - unsigned long *handle) -{ - return z3fold_alloc(pool, size, gfp, handle); -} -static void z3fold_zpool_free(void *pool, unsigned long handle) -{ - z3fold_free(pool, handle); -} - -static void *z3fold_zpool_map(void *pool, unsigned long handle, - enum zpool_mapmode mm) -{ - return z3fold_map(pool, handle); -} -static void z3fold_zpool_unmap(void *pool, unsigned long handle) -{ - z3fold_unmap(pool, handle); -} - -static u64 z3fold_zpool_total_pages(void *pool) -{ - return z3fold_get_pool_pages(pool); -} - -static struct zpool_driver z3fold_zpool_driver = { - .type = "z3fold", - .sleep_mapped = true, - .owner = THIS_MODULE, - .create = z3fold_zpool_create, - .destroy = z3fold_zpool_destroy, - .malloc = z3fold_zpool_malloc, - .free = z3fold_zpool_free, - .map = z3fold_zpool_map, - .unmap = z3fold_zpool_unmap, - .total_pages = z3fold_zpool_total_pages, -}; - -MODULE_ALIAS("zpool-z3fold"); - -static int __init init_z3fold(void) -{ - /* - * Make sure the z3fold header is not larger than the page size and - * there has remaining spaces for its buddy. - */ - BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE - CHUNK_SIZE); - zpool_register_driver(&z3fold_zpool_driver); - - return 0; -} - -static void __exit exit_z3fold(void) -{ - zpool_unregister_driver(&z3fold_zpool_driver); -} - -module_init(init_z3fold); -module_exit(exit_z3fold); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Vitaly Wool <vitalywool@gmail.com>"); -MODULE_DESCRIPTION("3-Fold Allocator for Compressed Pages"); diff --git a/mm/zbud.c b/mm/zbud.c deleted file mode 100644 index e9836fff9438..000000000000 --- a/mm/zbud.c +++ /dev/null @@ -1,455 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * zbud.c - * - * Copyright (C) 2013, Seth Jennings, IBM - * - * Concepts based on zcache internal zbud allocator by Dan Magenheimer. - * - * zbud is an special purpose allocator for storing compressed pages. Contrary - * to what its name may suggest, zbud is not a buddy allocator, but rather an - * allocator that "buddies" two compressed pages together in a single memory - * page. - * - * While this design limits storage density, it has simple and deterministic - * reclaim properties that make it preferable to a higher density approach when - * reclaim will be used. - * - * zbud works by storing compressed pages, or "zpages", together in pairs in a - * single memory page called a "zbud page". The first buddy is "left - * justified" at the beginning of the zbud page, and the last buddy is "right - * justified" at the end of the zbud page. The benefit is that if either - * buddy is freed, the freed buddy space, coalesced with whatever slack space - * that existed between the buddies, results in the largest possible free region - * within the zbud page. - * - * zbud also provides an attractive lower bound on density. The ratio of zpages - * to zbud pages can not be less than 1. This ensures that zbud can never "do - * harm" by using more pages to store zpages than the uncompressed zpages would - * have used on their own. - * - * zbud pages are divided into "chunks". The size of the chunks is fixed at - * compile time and determined by NCHUNKS_ORDER below. Dividing zbud pages - * into chunks allows organizing unbuddied zbud pages into a manageable number - * of unbuddied lists according to the number of free chunks available in the - * zbud page. - * - * The zbud API differs from that of conventional allocators in that the - * allocation function, zbud_alloc(), returns an opaque handle to the user, - * not a dereferenceable pointer. The user must map the handle using - * zbud_map() in order to get a usable pointer by which to access the - * allocation data and unmap the handle with zbud_unmap() when operations - * on the allocation data are complete. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/atomic.h> -#include <linux/list.h> -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/preempt.h> -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/zpool.h> - -/***************** - * Structures -*****************/ -/* - * NCHUNKS_ORDER determines the internal allocation granularity, effectively - * adjusting internal fragmentation. It also determines the number of - * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the - * allocation granularity will be in chunks of size PAGE_SIZE/64. As one chunk - * in allocated page is occupied by zbud header, NCHUNKS will be calculated to - * 63 which shows the max number of free chunks in zbud page, also there will be - * 63 freelists per pool. - */ -#define NCHUNKS_ORDER 6 - -#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) -#define CHUNK_SIZE (1 << CHUNK_SHIFT) -#define ZHDR_SIZE_ALIGNED CHUNK_SIZE -#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT) - -struct zbud_pool; - -/** - * struct zbud_pool - stores metadata for each zbud pool - * @lock: protects all pool fields and first|last_chunk fields of any - * zbud page in the pool - * @unbuddied: array of lists tracking zbud pages that only contain one buddy; - * the lists each zbud page is added to depends on the size of - * its free region. - * @buddied: list tracking the zbud pages that contain two buddies; - * these zbud pages are full - * @pages_nr: number of zbud pages in the pool. - * - * This structure is allocated at pool creation time and maintains metadata - * pertaining to a particular zbud pool. - */ -struct zbud_pool { - spinlock_t lock; - union { - /* - * Reuse unbuddied[0] as buddied on the ground that - * unbuddied[0] is unused. - */ - struct list_head buddied; - struct list_head unbuddied[NCHUNKS]; - }; - u64 pages_nr; -}; - -/* - * struct zbud_header - zbud page metadata occupying the first chunk of each - * zbud page. - * @buddy: links the zbud page into the unbuddied/buddied lists in the pool - * @first_chunks: the size of the first buddy in chunks, 0 if free - * @last_chunks: the size of the last buddy in chunks, 0 if free - */ -struct zbud_header { - struct list_head buddy; - unsigned int first_chunks; - unsigned int last_chunks; -}; - -/***************** - * Helpers -*****************/ -/* Just to make the code easier to read */ -enum buddy { - FIRST, - LAST -}; - -/* Converts an allocation size in bytes to size in zbud chunks */ -static int size_to_chunks(size_t size) -{ - return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; -} - -#define for_each_unbuddied_list(_iter, _begin) \ - for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++) - -/* Initializes the zbud header of a newly allocated zbud page */ -static struct zbud_header *init_zbud_page(struct page *page) -{ - struct zbud_header *zhdr = page_address(page); - zhdr->first_chunks = 0; - zhdr->last_chunks = 0; - INIT_LIST_HEAD(&zhdr->buddy); - return zhdr; -} - -/* Resets the struct page fields and frees the page */ -static void free_zbud_page(struct zbud_header *zhdr) -{ - __free_page(virt_to_page(zhdr)); -} - -/* - * Encodes the handle of a particular buddy within a zbud page - * Pool lock should be held as this function accesses first|last_chunks - */ -static unsigned long encode_handle(struct zbud_header *zhdr, enum buddy bud) -{ - unsigned long handle; - - /* - * For now, the encoded handle is actually just the pointer to the data - * but this might not always be the case. A little information hiding. - * Add CHUNK_SIZE to the handle if it is the first allocation to jump - * over the zbud header in the first chunk. - */ - handle = (unsigned long)zhdr; - if (bud == FIRST) - /* skip over zbud header */ - handle += ZHDR_SIZE_ALIGNED; - else /* bud == LAST */ - handle += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT); - return handle; -} - -/* Returns the zbud page where a given handle is stored */ -static struct zbud_header *handle_to_zbud_header(unsigned long handle) -{ - return (struct zbud_header *)(handle & PAGE_MASK); -} - -/* Returns the number of free chunks in a zbud page */ -static int num_free_chunks(struct zbud_header *zhdr) -{ - /* - * Rather than branch for different situations, just use the fact that - * free buddies have a length of zero to simplify everything. - */ - return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks; -} - -/***************** - * API Functions -*****************/ -/** - * zbud_create_pool() - create a new zbud pool - * @gfp: gfp flags when allocating the zbud pool structure - * - * Return: pointer to the new zbud pool or NULL if the metadata allocation - * failed. - */ -static struct zbud_pool *zbud_create_pool(gfp_t gfp) -{ - struct zbud_pool *pool; - int i; - - pool = kzalloc(sizeof(struct zbud_pool), gfp); - if (!pool) - return NULL; - spin_lock_init(&pool->lock); - for_each_unbuddied_list(i, 0) - INIT_LIST_HEAD(&pool->unbuddied[i]); - INIT_LIST_HEAD(&pool->buddied); - pool->pages_nr = 0; - return pool; -} - -/** - * zbud_destroy_pool() - destroys an existing zbud pool - * @pool: the zbud pool to be destroyed - * - * The pool should be emptied before this function is called. - */ -static void zbud_destroy_pool(struct zbud_pool *pool) -{ - kfree(pool); -} - -/** - * zbud_alloc() - allocates a region of a given size - * @pool: zbud pool from which to allocate - * @size: size in bytes of the desired allocation - * @gfp: gfp flags used if the pool needs to grow - * @handle: handle of the new allocation - * - * This function will attempt to find a free region in the pool large enough to - * satisfy the allocation request. A search of the unbuddied lists is - * performed first. If no suitable free region is found, then a new page is - * allocated and added to the pool to satisfy the request. - * - * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used - * as zbud pool pages. - * - * Return: 0 if success and handle is set, otherwise -EINVAL if the size or - * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate - * a new page. - */ -static int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp, - unsigned long *handle) -{ - int chunks, i, freechunks; - struct zbud_header *zhdr = NULL; - enum buddy bud; - struct page *page; - - if (!size || (gfp & __GFP_HIGHMEM)) - return -EINVAL; - if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) - return -ENOSPC; - chunks = size_to_chunks(size); - spin_lock(&pool->lock); - - /* First, try to find an unbuddied zbud page. */ - for_each_unbuddied_list(i, chunks) { - if (!list_empty(&pool->unbuddied[i])) { - zhdr = list_first_entry(&pool->unbuddied[i], - struct zbud_header, buddy); - list_del(&zhdr->buddy); - if (zhdr->first_chunks == 0) - bud = FIRST; - else - bud = LAST; - goto found; - } - } - - /* Couldn't find unbuddied zbud page, create new one */ - spin_unlock(&pool->lock); - page = alloc_page(gfp); - if (!page) - return -ENOMEM; - spin_lock(&pool->lock); - pool->pages_nr++; - zhdr = init_zbud_page(page); - bud = FIRST; - -found: - if (bud == FIRST) - zhdr->first_chunks = chunks; - else - zhdr->last_chunks = chunks; - - if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0) { - /* Add to unbuddied list */ - freechunks = num_free_chunks(zhdr); - list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); - } else { - /* Add to buddied list */ - list_add(&zhdr->buddy, &pool->buddied); - } - - *handle = encode_handle(zhdr, bud); - spin_unlock(&pool->lock); - - return 0; -} - -/** - * zbud_free() - frees the allocation associated with the given handle - * @pool: pool in which the allocation resided - * @handle: handle associated with the allocation returned by zbud_alloc() - */ -static void zbud_free(struct zbud_pool *pool, unsigned long handle) -{ - struct zbud_header *zhdr; - int freechunks; - - spin_lock(&pool->lock); - zhdr = handle_to_zbud_header(handle); - - /* If first buddy, handle will be page aligned */ - if ((handle - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK) - zhdr->last_chunks = 0; - else - zhdr->first_chunks = 0; - - /* Remove from existing buddy list */ - list_del(&zhdr->buddy); - - if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { - /* zbud page is empty, free */ - free_zbud_page(zhdr); - pool->pages_nr--; - } else { - /* Add to unbuddied list */ - freechunks = num_free_chunks(zhdr); - list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); - } - - spin_unlock(&pool->lock); -} - -/** - * zbud_map() - maps the allocation associated with the given handle - * @pool: pool in which the allocation resides - * @handle: handle associated with the allocation to be mapped - * - * While trivial for zbud, the mapping functions for others allocators - * implementing this allocation API could have more complex information encoded - * in the handle and could create temporary mappings to make the data - * accessible to the user. - * - * Returns: a pointer to the mapped allocation - */ -static void *zbud_map(struct zbud_pool *pool, unsigned long handle) -{ - return (void *)(handle); -} - -/** - * zbud_unmap() - maps the allocation associated with the given handle - * @pool: pool in which the allocation resides - * @handle: handle associated with the allocation to be unmapped - */ -static void zbud_unmap(struct zbud_pool *pool, unsigned long handle) -{ -} - -/** - * zbud_get_pool_pages() - gets the zbud pool size in pages - * @pool: pool whose size is being queried - * - * Returns: size in pages of the given pool. The pool lock need not be - * taken to access pages_nr. - */ -static u64 zbud_get_pool_pages(struct zbud_pool *pool) -{ - return pool->pages_nr; -} - -/***************** - * zpool - ****************/ - -static void *zbud_zpool_create(const char *name, gfp_t gfp) -{ - return zbud_create_pool(gfp); -} - -static void zbud_zpool_destroy(void *pool) -{ - zbud_destroy_pool(pool); -} - -static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp, - unsigned long *handle) -{ - return zbud_alloc(pool, size, gfp, handle); -} -static void zbud_zpool_free(void *pool, unsigned long handle) -{ - zbud_free(pool, handle); -} - -static void *zbud_zpool_map(void *pool, unsigned long handle, - enum zpool_mapmode mm) -{ - return zbud_map(pool, handle); -} -static void zbud_zpool_unmap(void *pool, unsigned long handle) -{ - zbud_unmap(pool, handle); -} - -static u64 zbud_zpool_total_pages(void *pool) -{ - return zbud_get_pool_pages(pool); -} - -static struct zpool_driver zbud_zpool_driver = { - .type = "zbud", - .sleep_mapped = true, - .owner = THIS_MODULE, - .create = zbud_zpool_create, - .destroy = zbud_zpool_destroy, - .malloc = zbud_zpool_malloc, - .free = zbud_zpool_free, - .map = zbud_zpool_map, - .unmap = zbud_zpool_unmap, - .total_pages = zbud_zpool_total_pages, -}; - -MODULE_ALIAS("zpool-zbud"); - -static int __init init_zbud(void) -{ - /* Make sure the zbud header will fit in one chunk */ - BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED); - pr_info("loaded\n"); - - zpool_register_driver(&zbud_zpool_driver); - - return 0; -} - -static void __exit exit_zbud(void) -{ - zpool_unregister_driver(&zbud_zpool_driver); - pr_info("unloaded\n"); -} - -module_init(init_zbud); -module_exit(exit_zbud); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>"); -MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages"); diff --git a/mm/zpdesc.h b/mm/zpdesc.h new file mode 100644 index 000000000000..d3df316e5bb7 --- /dev/null +++ b/mm/zpdesc.h @@ -0,0 +1,185 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* zpdesc.h: zswap.zpool memory descriptor + * + * Written by Alex Shi <alexs@kernel.org> + * Hyeonggon Yoo <42.hyeyoo@gmail.com> + */ +#ifndef __MM_ZPDESC_H__ +#define __MM_ZPDESC_H__ + +#include <linux/migrate.h> +#include <linux/pagemap.h> + +/* + * struct zpdesc - Memory descriptor for zpool memory. + * @flags: Page flags, mostly unused by zsmalloc. + * @lru: Indirectly used by page migration. + * @movable_ops: Used by page migration. + * @next: Next zpdesc in a zspage in zsmalloc zpool. + * @handle: For huge zspage in zsmalloc zpool. + * @zspage: Points to the zspage this zpdesc is a part of. + * @first_obj_offset: First object offset in zsmalloc zpool. + * @_refcount: The number of references to this zpdesc. + * + * This struct overlays struct page for now. Do not modify without a good + * understanding of the issues. In particular, do not expand into the overlap + * with memcg_data. + * + * Page flags used: + * * PG_private identifies the first component page. + * * PG_locked is used by page migration code. + */ +struct zpdesc { + unsigned long flags; + struct list_head lru; + unsigned long movable_ops; + union { + struct zpdesc *next; + unsigned long handle; + }; + struct zspage *zspage; + /* + * Only the lower 24 bits are available for offset, limiting a page + * to 16 MiB. The upper 8 bits are reserved for PGTY_zsmalloc. + * + * Do not access this field directly. + * Instead, use {get,set}_first_obj_offset() helpers. + */ + unsigned int first_obj_offset; + atomic_t _refcount; +}; +#define ZPDESC_MATCH(pg, zp) \ + static_assert(offsetof(struct page, pg) == offsetof(struct zpdesc, zp)) + +ZPDESC_MATCH(flags, flags); +ZPDESC_MATCH(lru, lru); +ZPDESC_MATCH(mapping, movable_ops); +ZPDESC_MATCH(__folio_index, next); +ZPDESC_MATCH(__folio_index, handle); +ZPDESC_MATCH(private, zspage); +ZPDESC_MATCH(page_type, first_obj_offset); +ZPDESC_MATCH(_refcount, _refcount); +#undef ZPDESC_MATCH +static_assert(sizeof(struct zpdesc) <= sizeof(struct page)); + +/* + * zpdesc_page - The first struct page allocated for a zpdesc + * @zp: The zpdesc. + * + * A convenience wrapper for converting zpdesc to the first struct page of the + * underlying folio, to communicate with code not yet converted to folio or + * struct zpdesc. + * + */ +#define zpdesc_page(zp) (_Generic((zp), \ + const struct zpdesc *: (const struct page *)(zp), \ + struct zpdesc *: (struct page *)(zp))) + +/** + * zpdesc_folio - The folio allocated for a zpdesc + * @zp: The zpdesc. + * + * Zpdescs are descriptors for zpool memory. The zpool memory itself is + * allocated as folios that contain the zpool objects, and zpdesc uses specific + * fields in the first struct page of the folio - those fields are now accessed + * by struct zpdesc. + * + * It is occasionally necessary convert to back to a folio in order to + * communicate with the rest of the mm. Please use this helper function + * instead of casting yourself, as the implementation may change in the future. + */ +#define zpdesc_folio(zp) (_Generic((zp), \ + const struct zpdesc *: (const struct folio *)(zp), \ + struct zpdesc *: (struct folio *)(zp))) +/** + * page_zpdesc - Converts from first struct page to zpdesc. + * @p: The first (either head of compound or single) page of zpdesc. + * + * A temporary wrapper to convert struct page to struct zpdesc in situations + * where we know the page is the compound head, or single order-0 page. + * + * Long-term ideally everything would work with struct zpdesc directly or go + * through folio to struct zpdesc. + * + * Return: The zpdesc which contains this page + */ +#define page_zpdesc(p) (_Generic((p), \ + const struct page *: (const struct zpdesc *)(p), \ + struct page *: (struct zpdesc *)(p))) + +static inline void zpdesc_lock(struct zpdesc *zpdesc) +{ + folio_lock(zpdesc_folio(zpdesc)); +} + +static inline bool zpdesc_trylock(struct zpdesc *zpdesc) +{ + return folio_trylock(zpdesc_folio(zpdesc)); +} + +static inline void zpdesc_unlock(struct zpdesc *zpdesc) +{ + folio_unlock(zpdesc_folio(zpdesc)); +} + +static inline void zpdesc_wait_locked(struct zpdesc *zpdesc) +{ + folio_wait_locked(zpdesc_folio(zpdesc)); +} + +static inline void zpdesc_get(struct zpdesc *zpdesc) +{ + folio_get(zpdesc_folio(zpdesc)); +} + +static inline void zpdesc_put(struct zpdesc *zpdesc) +{ + folio_put(zpdesc_folio(zpdesc)); +} + +static inline void *kmap_local_zpdesc(struct zpdesc *zpdesc) +{ + return kmap_local_page(zpdesc_page(zpdesc)); +} + +static inline unsigned long zpdesc_pfn(struct zpdesc *zpdesc) +{ + return page_to_pfn(zpdesc_page(zpdesc)); +} + +static inline struct zpdesc *pfn_zpdesc(unsigned long pfn) +{ + return page_zpdesc(pfn_to_page(pfn)); +} + +static inline void __zpdesc_set_movable(struct zpdesc *zpdesc, + const struct movable_operations *mops) +{ + __SetPageMovable(zpdesc_page(zpdesc), mops); +} + +static inline void __zpdesc_set_zsmalloc(struct zpdesc *zpdesc) +{ + __SetPageZsmalloc(zpdesc_page(zpdesc)); +} + +static inline void __zpdesc_clear_zsmalloc(struct zpdesc *zpdesc) +{ + __ClearPageZsmalloc(zpdesc_page(zpdesc)); +} + +static inline bool zpdesc_is_isolated(struct zpdesc *zpdesc) +{ + return PageIsolated(zpdesc_page(zpdesc)); +} + +static inline struct zone *zpdesc_zone(struct zpdesc *zpdesc) +{ + return page_zone(zpdesc_page(zpdesc)); +} + +static inline bool zpdesc_is_locked(struct zpdesc *zpdesc) +{ + return folio_test_locked(zpdesc_folio(zpdesc)); +} +#endif diff --git a/mm/zpool.c b/mm/zpool.c index b9fda1fa857d..0a71d03369f1 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -95,7 +95,7 @@ static void zpool_put_driver(struct zpool_driver *driver) /** * zpool_has_pool() - Check if the pool driver is available - * @type: The type of the zpool to check (e.g. zbud, zsmalloc) + * @type: The type of the zpool to check (e.g. zsmalloc) * * This checks if the @type pool driver is available. This will try to load * the requested module, if needed, but there is no guarantee the module will @@ -130,7 +130,7 @@ EXPORT_SYMBOL(zpool_has_pool); /** * zpool_create_pool() - Create a new zpool - * @type: The type of the zpool to create (e.g. zbud, zsmalloc) + * @type: The type of the zpool to create (e.g. zsmalloc) * @name: The name of the zpool (e.g. zram0, zswap) * @gfp: The GFP flags to use when allocating the pool. * @@ -221,41 +221,27 @@ const char *zpool_get_type(struct zpool *zpool) } /** - * zpool_malloc_support_movable() - Check if the zpool supports - * allocating movable memory - * @zpool: The zpool to check - * - * This returns if the zpool supports allocating movable memory. - * - * Implementations must guarantee this to be thread-safe. - * - * Returns: true if the zpool supports allocating movable memory, false if not - */ -bool zpool_malloc_support_movable(struct zpool *zpool) -{ - return zpool->driver->malloc_support_movable; -} - -/** * zpool_malloc() - Allocate memory * @zpool: The zpool to allocate from. * @size: The amount of memory to allocate. * @gfp: The GFP flags to use when allocating memory. * @handle: Pointer to the handle to set + * @nid: The preferred node id. * * This allocates the requested amount of memory from the pool. * The gfp flags will be used when allocating memory, if the * implementation supports it. The provided @handle will be - * set to the allocated object handle. + * set to the allocated object handle. The allocation will + * prefer the NUMA node specified by @nid. * * Implementations must guarantee this to be thread-safe. * * Returns: 0 on success, negative value on error. */ int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp, - unsigned long *handle) + unsigned long *handle, const int nid) { - return zpool->driver->malloc(zpool->pool, size, gfp, handle); + return zpool->driver->malloc(zpool->pool, size, gfp, handle, nid); } /** @@ -278,46 +264,51 @@ void zpool_free(struct zpool *zpool, unsigned long handle) } /** - * zpool_map_handle() - Map a previously allocated handle into memory + * zpool_obj_read_begin() - Start reading from a previously allocated handle. * @zpool: The zpool that the handle was allocated from - * @handle: The handle to map - * @mapmode: How the memory should be mapped + * @handle: The handle to read from + * @local_copy: A local buffer to use if needed. * - * This maps a previously allocated handle into memory. The @mapmode - * param indicates to the implementation how the memory will be - * used, i.e. read-only, write-only, read-write. If the - * implementation does not support it, the memory will be treated - * as read-write. + * This starts a read operation of a previously allocated handle. The passed + * @local_copy buffer may be used if needed by copying the memory into. + * zpool_obj_read_end() MUST be called after the read is completed to undo any + * actions taken (e.g. release locks). * - * This may hold locks, disable interrupts, and/or preemption, - * and the zpool_unmap_handle() must be called to undo those - * actions. The code that uses the mapped handle should complete - * its operations on the mapped handle memory quickly and unmap - * as soon as possible. As the implementation may use per-cpu - * data, multiple handles should not be mapped concurrently on - * any cpu. + * Returns: A pointer to the handle memory to be read, if @local_copy is used, + * the returned pointer is @local_copy. + */ +void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle, + void *local_copy) +{ + return zpool->driver->obj_read_begin(zpool->pool, handle, local_copy); +} + +/** + * zpool_obj_read_end() - Finish reading from a previously allocated handle. + * @zpool: The zpool that the handle was allocated from + * @handle: The handle to read from + * @handle_mem: The pointer returned by zpool_obj_read_begin() * - * Returns: A pointer to the handle's mapped memory area. + * Finishes a read operation previously started by zpool_obj_read_begin(). */ -void *zpool_map_handle(struct zpool *zpool, unsigned long handle, - enum zpool_mapmode mapmode) +void zpool_obj_read_end(struct zpool *zpool, unsigned long handle, + void *handle_mem) { - return zpool->driver->map(zpool->pool, handle, mapmode); + zpool->driver->obj_read_end(zpool->pool, handle, handle_mem); } /** - * zpool_unmap_handle() - Unmap a previously mapped handle + * zpool_obj_write() - Write to a previously allocated handle. * @zpool: The zpool that the handle was allocated from - * @handle: The handle to unmap + * @handle: The handle to read from + * @handle_mem: The memory to copy from into the handle. + * @mem_len: The length of memory to be written. * - * This unmaps a previously mapped handle. Any locks or other - * actions that the implementation took in zpool_map_handle() - * will be undone here. The memory area returned from - * zpool_map_handle() should no longer be used after this. */ -void zpool_unmap_handle(struct zpool *zpool, unsigned long handle) +void zpool_obj_write(struct zpool *zpool, unsigned long handle, + void *handle_mem, size_t mem_len) { - zpool->driver->unmap(zpool->pool, handle); + zpool->driver->obj_write(zpool->pool, handle, handle_mem, mem_len); } /** @@ -333,23 +324,5 @@ u64 zpool_get_total_pages(struct zpool *zpool) return zpool->driver->total_pages(zpool->pool); } -/** - * zpool_can_sleep_mapped - Test if zpool can sleep when do mapped. - * @zpool: The zpool to test - * - * Some allocators enter non-preemptible context in ->map() callback (e.g. - * disable pagefaults) and exit that context in ->unmap(), which limits what - * we can do with the mapped object. For instance, we cannot wait for - * asynchronous crypto API to decompress such an object or take mutexes - * since those will call into the scheduler. This function tells us whether - * we use such an allocator. - * - * Returns: true if zpool can sleep; false otherwise. - */ -bool zpool_can_sleep_mapped(struct zpool *zpool) -{ - return zpool->driver->sleep_mapped; -} - MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>"); MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 64b66a4d3e6e..999b513c7fdf 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -13,30 +13,12 @@ * Released under the terms of GNU General Public License Version 2.0 */ -/* - * Following is how we use various fields and flags of underlying - * struct page(s) to form a zspage. - * - * Usage of struct page fields: - * page->private: points to zspage - * page->index: links together all component pages of a zspage - * For the huge page, this is always 0, so we use this field - * to store handle. - * page->page_type: PGTY_zsmalloc, lower 24 bits locate the first object - * offset in a subpage of a zspage - * - * Usage of struct page flags: - * PG_private: identifies the first component page - * PG_owner_priv_1: identifies the huge component page - * - */ - #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt /* * lock ordering: * page_lock - * pool->migrate_lock + * pool->lock * class->lock * zspage->lock */ @@ -44,17 +26,10 @@ #include <linux/module.h> #include <linux/kernel.h> #include <linux/sched.h> -#include <linux/bitops.h> #include <linux/errno.h> #include <linux/highmem.h> #include <linux/string.h> #include <linux/slab.h> -#include <linux/pgtable.h> -#include <asm/tlbflush.h> -#include <linux/cpumask.h> -#include <linux/cpu.h> -#include <linux/vmalloc.h> -#include <linux/preempt.h> #include <linux/spinlock.h> #include <linux/sprintf.h> #include <linux/shrinker.h> @@ -62,11 +37,9 @@ #include <linux/debugfs.h> #include <linux/zsmalloc.h> #include <linux/zpool.h> -#include <linux/migrate.h> -#include <linux/wait.h> -#include <linux/pagemap.h> #include <linux/fs.h> -#include <linux/local_lock.h> +#include <linux/workqueue.h> +#include "zpdesc.h" #define ZSPAGE_MAGIC 0x58 @@ -240,11 +213,49 @@ struct zs_pool { #ifdef CONFIG_COMPACTION struct work_struct free_work; #endif - /* protect page/zspage migration */ - rwlock_t migrate_lock; + /* protect zspage migration/compaction */ + rwlock_t lock; atomic_t compaction_in_progress; }; +static inline void zpdesc_set_first(struct zpdesc *zpdesc) +{ + SetPagePrivate(zpdesc_page(zpdesc)); +} + +static inline void zpdesc_inc_zone_page_state(struct zpdesc *zpdesc) +{ + inc_zone_page_state(zpdesc_page(zpdesc), NR_ZSPAGES); +} + +static inline void zpdesc_dec_zone_page_state(struct zpdesc *zpdesc) +{ + dec_zone_page_state(zpdesc_page(zpdesc), NR_ZSPAGES); +} + +static inline struct zpdesc *alloc_zpdesc(gfp_t gfp, const int nid) +{ + struct page *page = alloc_pages_node(nid, gfp, 0); + + return page_zpdesc(page); +} + +static inline void free_zpdesc(struct zpdesc *zpdesc) +{ + struct page *page = zpdesc_page(zpdesc); + + __free_page(page); +} + +#define ZS_PAGE_UNLOCKED 0 +#define ZS_PAGE_WRLOCKED -1 + +struct zspage_lock { + spinlock_t lock; + int cnt; + struct lockdep_map dep_map; +}; + struct zspage { struct { unsigned int huge:HUGE_BITS; @@ -254,18 +265,89 @@ struct zspage { }; unsigned int inuse; unsigned int freeobj; - struct page *first_page; + struct zpdesc *first_zpdesc; struct list_head list; /* fullness list */ struct zs_pool *pool; - rwlock_t lock; + struct zspage_lock zsl; }; -struct mapping_area { - local_lock_t lock; - char *vm_buf; /* copy buffer for objects that span pages */ - char *vm_addr; /* address of kmap_local_page()'ed pages */ - enum zs_mapmode vm_mm; /* mapping mode */ -}; +static void zspage_lock_init(struct zspage *zspage) +{ + static struct lock_class_key __key; + struct zspage_lock *zsl = &zspage->zsl; + + lockdep_init_map(&zsl->dep_map, "zspage->lock", &__key, 0); + spin_lock_init(&zsl->lock); + zsl->cnt = ZS_PAGE_UNLOCKED; +} + +/* + * The zspage lock can be held from atomic contexts, but it needs to remain + * preemptible when held for reading because it remains held outside of those + * atomic contexts, otherwise we unnecessarily lose preemptibility. + * + * To achieve this, the following rules are enforced on readers and writers: + * + * - Writers are blocked by both writers and readers, while readers are only + * blocked by writers (i.e. normal rwlock semantics). + * + * - Writers are always atomic (to allow readers to spin waiting for them). + * + * - Writers always use trylock (as the lock may be held be sleeping readers). + * + * - Readers may spin on the lock (as they can only wait for atomic writers). + * + * - Readers may sleep while holding the lock (as writes only use trylock). + */ +static void zspage_read_lock(struct zspage *zspage) +{ + struct zspage_lock *zsl = &zspage->zsl; + + rwsem_acquire_read(&zsl->dep_map, 0, 0, _RET_IP_); + + spin_lock(&zsl->lock); + zsl->cnt++; + spin_unlock(&zsl->lock); + + lock_acquired(&zsl->dep_map, _RET_IP_); +} + +static void zspage_read_unlock(struct zspage *zspage) +{ + struct zspage_lock *zsl = &zspage->zsl; + + rwsem_release(&zsl->dep_map, _RET_IP_); + + spin_lock(&zsl->lock); + zsl->cnt--; + spin_unlock(&zsl->lock); +} + +static __must_check bool zspage_write_trylock(struct zspage *zspage) +{ + struct zspage_lock *zsl = &zspage->zsl; + + spin_lock(&zsl->lock); + if (zsl->cnt == ZS_PAGE_UNLOCKED) { + zsl->cnt = ZS_PAGE_WRLOCKED; + rwsem_acquire(&zsl->dep_map, 0, 1, _RET_IP_); + lock_acquired(&zsl->dep_map, _RET_IP_); + return true; + } + + spin_unlock(&zsl->lock); + return false; +} + +static void zspage_write_unlock(struct zspage *zspage) +{ + struct zspage_lock *zsl = &zspage->zsl; + + rwsem_release(&zsl->dep_map, _RET_IP_); + + zsl->cnt = ZS_PAGE_UNLOCKED; + spin_unlock(&zsl->lock); +} /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ static void SetZsHugePage(struct zspage *zspage) @@ -278,12 +360,6 @@ static bool ZsHugePage(struct zspage *zspage) return zspage->huge; } -static void migrate_lock_init(struct zspage *zspage); -static void migrate_read_lock(struct zspage *zspage); -static void migrate_read_unlock(struct zspage *zspage); -static void migrate_write_lock(struct zspage *zspage); -static void migrate_write_unlock(struct zspage *zspage); - #ifdef CONFIG_COMPACTION static void kick_deferred_free(struct zs_pool *pool); static void init_deferred_free(struct zs_pool *pool); @@ -376,9 +452,9 @@ static void zs_zpool_destroy(void *pool) } static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, - unsigned long *handle) + unsigned long *handle, const int nid) { - *handle = zs_malloc(pool, size, gfp); + *handle = zs_malloc(pool, size, gfp, nid); if (IS_ERR_VALUE(*handle)) return PTR_ERR((void *)*handle); @@ -389,29 +465,22 @@ static void zs_zpool_free(void *pool, unsigned long handle) zs_free(pool, handle); } -static void *zs_zpool_map(void *pool, unsigned long handle, - enum zpool_mapmode mm) +static void *zs_zpool_obj_read_begin(void *pool, unsigned long handle, + void *local_copy) { - enum zs_mapmode zs_mm; - - switch (mm) { - case ZPOOL_MM_RO: - zs_mm = ZS_MM_RO; - break; - case ZPOOL_MM_WO: - zs_mm = ZS_MM_WO; - break; - case ZPOOL_MM_RW: - default: - zs_mm = ZS_MM_RW; - break; - } + return zs_obj_read_begin(pool, handle, local_copy); +} - return zs_map_object(pool, handle, zs_mm); +static void zs_zpool_obj_read_end(void *pool, unsigned long handle, + void *handle_mem) +{ + zs_obj_read_end(pool, handle, handle_mem); } -static void zs_zpool_unmap(void *pool, unsigned long handle) + +static void zs_zpool_obj_write(void *pool, unsigned long handle, + void *handle_mem, size_t mem_len) { - zs_unmap_object(pool, handle); + zs_obj_write(pool, handle, handle_mem, mem_len); } static u64 zs_zpool_total_pages(void *pool) @@ -424,25 +493,20 @@ static struct zpool_driver zs_zpool_driver = { .owner = THIS_MODULE, .create = zs_zpool_create, .destroy = zs_zpool_destroy, - .malloc_support_movable = true, .malloc = zs_zpool_malloc, .free = zs_zpool_free, - .map = zs_zpool_map, - .unmap = zs_zpool_unmap, + .obj_read_begin = zs_zpool_obj_read_begin, + .obj_read_end = zs_zpool_obj_read_end, + .obj_write = zs_zpool_obj_write, .total_pages = zs_zpool_total_pages, }; MODULE_ALIAS("zpool-zsmalloc"); #endif /* CONFIG_ZPOOL */ -/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ -static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = { - .lock = INIT_LOCAL_LOCK(lock), -}; - -static __maybe_unused int is_first_page(struct page *page) +static inline bool __maybe_unused is_first_zpdesc(struct zpdesc *zpdesc) { - return PagePrivate(page); + return PagePrivate(zpdesc_page(zpdesc)); } /* Protected by class->lock */ @@ -451,36 +515,35 @@ static inline int get_zspage_inuse(struct zspage *zspage) return zspage->inuse; } - static inline void mod_zspage_inuse(struct zspage *zspage, int val) { zspage->inuse += val; } -static inline struct page *get_first_page(struct zspage *zspage) +static struct zpdesc *get_first_zpdesc(struct zspage *zspage) { - struct page *first_page = zspage->first_page; + struct zpdesc *first_zpdesc = zspage->first_zpdesc; - VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); - return first_page; + VM_BUG_ON_PAGE(!is_first_zpdesc(first_zpdesc), zpdesc_page(first_zpdesc)); + return first_zpdesc; } #define FIRST_OBJ_PAGE_TYPE_MASK 0xffffff -static inline unsigned int get_first_obj_offset(struct page *page) +static inline unsigned int get_first_obj_offset(struct zpdesc *zpdesc) { - VM_WARN_ON_ONCE(!PageZsmalloc(page)); - return page->page_type & FIRST_OBJ_PAGE_TYPE_MASK; + VM_WARN_ON_ONCE(!PageZsmalloc(zpdesc_page(zpdesc))); + return zpdesc->first_obj_offset & FIRST_OBJ_PAGE_TYPE_MASK; } -static inline void set_first_obj_offset(struct page *page, unsigned int offset) +static inline void set_first_obj_offset(struct zpdesc *zpdesc, unsigned int offset) { /* With 24 bits available, we can support offsets into 16 MiB pages. */ BUILD_BUG_ON(PAGE_SIZE > SZ_16M); - VM_WARN_ON_ONCE(!PageZsmalloc(page)); + VM_WARN_ON_ONCE(!PageZsmalloc(zpdesc_page(zpdesc))); VM_WARN_ON_ONCE(offset & ~FIRST_OBJ_PAGE_TYPE_MASK); - page->page_type &= ~FIRST_OBJ_PAGE_TYPE_MASK; - page->page_type |= offset & FIRST_OBJ_PAGE_TYPE_MASK; + zpdesc->first_obj_offset &= ~FIRST_OBJ_PAGE_TYPE_MASK; + zpdesc->first_obj_offset |= offset & FIRST_OBJ_PAGE_TYPE_MASK; } static inline unsigned int get_freeobj(struct zspage *zspage) @@ -733,52 +796,52 @@ out: return newfg; } -static struct zspage *get_zspage(struct page *page) +static struct zspage *get_zspage(struct zpdesc *zpdesc) { - struct zspage *zspage = (struct zspage *)page_private(page); + struct zspage *zspage = zpdesc->zspage; BUG_ON(zspage->magic != ZSPAGE_MAGIC); return zspage; } -static struct page *get_next_page(struct page *page) +static struct zpdesc *get_next_zpdesc(struct zpdesc *zpdesc) { - struct zspage *zspage = get_zspage(page); + struct zspage *zspage = get_zspage(zpdesc); if (unlikely(ZsHugePage(zspage))) return NULL; - return (struct page *)page->index; + return zpdesc->next; } /** - * obj_to_location - get (<page>, <obj_idx>) from encoded object value + * obj_to_location - get (<zpdesc>, <obj_idx>) from encoded object value * @obj: the encoded object value - * @page: page object resides in zspage + * @zpdesc: zpdesc object resides in zspage * @obj_idx: object index */ -static void obj_to_location(unsigned long obj, struct page **page, +static void obj_to_location(unsigned long obj, struct zpdesc **zpdesc, unsigned int *obj_idx) { - *page = pfn_to_page(obj >> OBJ_INDEX_BITS); + *zpdesc = pfn_zpdesc(obj >> OBJ_INDEX_BITS); *obj_idx = (obj & OBJ_INDEX_MASK); } -static void obj_to_page(unsigned long obj, struct page **page) +static void obj_to_zpdesc(unsigned long obj, struct zpdesc **zpdesc) { - *page = pfn_to_page(obj >> OBJ_INDEX_BITS); + *zpdesc = pfn_zpdesc(obj >> OBJ_INDEX_BITS); } /** - * location_to_obj - get obj value encoded from (<page>, <obj_idx>) - * @page: page object resides in zspage + * location_to_obj - get obj value encoded from (<zpdesc>, <obj_idx>) + * @zpdesc: zpdesc object resides in zspage * @obj_idx: object index */ -static unsigned long location_to_obj(struct page *page, unsigned int obj_idx) +static unsigned long location_to_obj(struct zpdesc *zpdesc, unsigned int obj_idx) { unsigned long obj; - obj = page_to_pfn(page) << OBJ_INDEX_BITS; + obj = zpdesc_pfn(zpdesc) << OBJ_INDEX_BITS; obj |= obj_idx & OBJ_INDEX_MASK; return obj; @@ -789,15 +852,15 @@ static unsigned long handle_to_obj(unsigned long handle) return *(unsigned long *)handle; } -static inline bool obj_allocated(struct page *page, void *obj, +static inline bool obj_allocated(struct zpdesc *zpdesc, void *obj, unsigned long *phandle) { unsigned long handle; - struct zspage *zspage = get_zspage(page); + struct zspage *zspage = get_zspage(zpdesc); if (unlikely(ZsHugePage(zspage))) { - VM_BUG_ON_PAGE(!is_first_page(page), page); - handle = page->index; + VM_BUG_ON_PAGE(!is_first_zpdesc(zpdesc), zpdesc_page(zpdesc)); + handle = zpdesc->handle; } else handle = *(unsigned long *)obj; @@ -809,22 +872,24 @@ static inline bool obj_allocated(struct page *page, void *obj, return true; } -static void reset_page(struct page *page) +static void reset_zpdesc(struct zpdesc *zpdesc) { + struct page *page = zpdesc_page(zpdesc); + __ClearPageMovable(page); ClearPagePrivate(page); - set_page_private(page, 0); - page->index = 0; + zpdesc->zspage = NULL; + zpdesc->next = NULL; __ClearPageZsmalloc(page); } static int trylock_zspage(struct zspage *zspage) { - struct page *cursor, *fail; + struct zpdesc *cursor, *fail; - for (cursor = get_first_page(zspage); cursor != NULL; cursor = - get_next_page(cursor)) { - if (!trylock_page(cursor)) { + for (cursor = get_first_zpdesc(zspage); cursor != NULL; cursor = + get_next_zpdesc(cursor)) { + if (!zpdesc_trylock(cursor)) { fail = cursor; goto unlock; } @@ -832,9 +897,9 @@ static int trylock_zspage(struct zspage *zspage) return 1; unlock: - for (cursor = get_first_page(zspage); cursor != fail; cursor = - get_next_page(cursor)) - unlock_page(cursor); + for (cursor = get_first_zpdesc(zspage); cursor != fail; cursor = + get_next_zpdesc(cursor)) + zpdesc_unlock(cursor); return 0; } @@ -842,23 +907,23 @@ unlock: static void __free_zspage(struct zs_pool *pool, struct size_class *class, struct zspage *zspage) { - struct page *page, *next; + struct zpdesc *zpdesc, *next; assert_spin_locked(&class->lock); VM_BUG_ON(get_zspage_inuse(zspage)); VM_BUG_ON(zspage->fullness != ZS_INUSE_RATIO_0); - next = page = get_first_page(zspage); + next = zpdesc = get_first_zpdesc(zspage); do { - VM_BUG_ON_PAGE(!PageLocked(page), page); - next = get_next_page(page); - reset_page(page); - unlock_page(page); - dec_zone_page_state(page, NR_ZSPAGES); - put_page(page); - page = next; - } while (page != NULL); + VM_BUG_ON_PAGE(!zpdesc_is_locked(zpdesc), zpdesc_page(zpdesc)); + next = get_next_zpdesc(zpdesc); + reset_zpdesc(zpdesc); + zpdesc_unlock(zpdesc); + zpdesc_dec_zone_page_state(zpdesc); + zpdesc_put(zpdesc); + zpdesc = next; + } while (zpdesc != NULL); cache_free_zspage(pool, zspage); @@ -891,16 +956,16 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) { unsigned int freeobj = 1; unsigned long off = 0; - struct page *page = get_first_page(zspage); + struct zpdesc *zpdesc = get_first_zpdesc(zspage); - while (page) { - struct page *next_page; + while (zpdesc) { + struct zpdesc *next_zpdesc; struct link_free *link; void *vaddr; - set_first_obj_offset(page, off); + set_first_obj_offset(zpdesc, off); - vaddr = kmap_local_page(page); + vaddr = kmap_local_zpdesc(zpdesc); link = (struct link_free *)vaddr + off / sizeof(*link); while ((off += class->size) < PAGE_SIZE) { @@ -913,8 +978,8 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) * page, which must point to the first object on the next * page (if present) */ - next_page = get_next_page(page); - if (next_page) { + next_zpdesc = get_next_zpdesc(zpdesc); + if (next_zpdesc) { link->next = freeobj++ << OBJ_TAG_BITS; } else { /* @@ -924,7 +989,7 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) link->next = -1UL << OBJ_TAG_BITS; } kunmap_local(vaddr); - page = next_page; + zpdesc = next_zpdesc; off %= PAGE_SIZE; } @@ -932,35 +997,35 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) } static void create_page_chain(struct size_class *class, struct zspage *zspage, - struct page *pages[]) + struct zpdesc *zpdescs[]) { int i; - struct page *page; - struct page *prev_page = NULL; - int nr_pages = class->pages_per_zspage; + struct zpdesc *zpdesc; + struct zpdesc *prev_zpdesc = NULL; + int nr_zpdescs = class->pages_per_zspage; /* * Allocate individual pages and link them together as: - * 1. all pages are linked together using page->index - * 2. each sub-page point to zspage using page->private + * 1. all pages are linked together using zpdesc->next + * 2. each sub-page point to zspage using zpdesc->zspage * - * we set PG_private to identify the first page (i.e. no other sub-page + * we set PG_private to identify the first zpdesc (i.e. no other zpdesc * has this flag set). */ - for (i = 0; i < nr_pages; i++) { - page = pages[i]; - set_page_private(page, (unsigned long)zspage); - page->index = 0; + for (i = 0; i < nr_zpdescs; i++) { + zpdesc = zpdescs[i]; + zpdesc->zspage = zspage; + zpdesc->next = NULL; if (i == 0) { - zspage->first_page = page; - SetPagePrivate(page); + zspage->first_zpdesc = zpdesc; + zpdesc_set_first(zpdesc); if (unlikely(class->objs_per_zspage == 1 && class->pages_per_zspage == 1)) SetZsHugePage(zspage); } else { - prev_page->index = (unsigned long)page; + prev_zpdesc->next = zpdesc; } - prev_page = page; + prev_zpdesc = zpdesc; } } @@ -968,42 +1033,42 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage, * Allocate a zspage for the given size class */ static struct zspage *alloc_zspage(struct zs_pool *pool, - struct size_class *class, - gfp_t gfp) + struct size_class *class, + gfp_t gfp, const int nid) { int i; - struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE]; + struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE]; struct zspage *zspage = cache_alloc_zspage(pool, gfp); if (!zspage) return NULL; zspage->magic = ZSPAGE_MAGIC; - migrate_lock_init(zspage); + zspage->pool = pool; + zspage->class = class->index; + zspage_lock_init(zspage); for (i = 0; i < class->pages_per_zspage; i++) { - struct page *page; + struct zpdesc *zpdesc; - page = alloc_page(gfp); - if (!page) { + zpdesc = alloc_zpdesc(gfp, nid); + if (!zpdesc) { while (--i >= 0) { - dec_zone_page_state(pages[i], NR_ZSPAGES); - __ClearPageZsmalloc(pages[i]); - __free_page(pages[i]); + zpdesc_dec_zone_page_state(zpdescs[i]); + __zpdesc_clear_zsmalloc(zpdescs[i]); + free_zpdesc(zpdescs[i]); } cache_free_zspage(pool, zspage); return NULL; } - __SetPageZsmalloc(page); + __zpdesc_set_zsmalloc(zpdesc); - inc_zone_page_state(page, NR_ZSPAGES); - pages[i] = page; + zpdesc_inc_zone_page_state(zpdesc); + zpdescs[i] = zpdesc; } - create_page_chain(class, zspage, pages); + create_page_chain(class, zspage, zpdescs); init_zspage(class, zspage); - zspage->pool = pool; - zspage->class = class->index; return zspage; } @@ -1023,93 +1088,6 @@ static struct zspage *find_get_zspage(struct size_class *class) return zspage; } -static inline int __zs_cpu_up(struct mapping_area *area) -{ - /* - * Make sure we don't leak memory if a cpu UP notification - * and zs_init() race and both call zs_cpu_up() on the same cpu - */ - if (area->vm_buf) - return 0; - area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL); - if (!area->vm_buf) - return -ENOMEM; - return 0; -} - -static inline void __zs_cpu_down(struct mapping_area *area) -{ - kfree(area->vm_buf); - area->vm_buf = NULL; -} - -static void *__zs_map_object(struct mapping_area *area, - struct page *pages[2], int off, int size) -{ - size_t sizes[2]; - char *buf = area->vm_buf; - - /* disable page faults to match kmap_local_page() return conditions */ - pagefault_disable(); - - /* no read fastpath */ - if (area->vm_mm == ZS_MM_WO) - goto out; - - sizes[0] = PAGE_SIZE - off; - sizes[1] = size - sizes[0]; - - /* copy object to per-cpu buffer */ - memcpy_from_page(buf, pages[0], off, sizes[0]); - memcpy_from_page(buf + sizes[0], pages[1], 0, sizes[1]); -out: - return area->vm_buf; -} - -static void __zs_unmap_object(struct mapping_area *area, - struct page *pages[2], int off, int size) -{ - size_t sizes[2]; - char *buf; - - /* no write fastpath */ - if (area->vm_mm == ZS_MM_RO) - goto out; - - buf = area->vm_buf; - buf = buf + ZS_HANDLE_SIZE; - size -= ZS_HANDLE_SIZE; - off += ZS_HANDLE_SIZE; - - sizes[0] = PAGE_SIZE - off; - sizes[1] = size - sizes[0]; - - /* copy per-cpu buffer to object */ - memcpy_to_page(pages[0], off, buf, sizes[0]); - memcpy_to_page(pages[1], 0, buf + sizes[0], sizes[1]); - -out: - /* enable page faults to match kunmap_local() return conditions */ - pagefault_enable(); -} - -static int zs_cpu_prepare(unsigned int cpu) -{ - struct mapping_area *area; - - area = &per_cpu(zs_map_area, cpu); - return __zs_cpu_up(area); -} - -static int zs_cpu_dead(unsigned int cpu) -{ - struct mapping_area *area; - - area = &per_cpu(zs_map_area, cpu); - __zs_cpu_down(area); - return 0; -} - static bool can_merge(struct size_class *prev, int pages_per_zspage, int objs_per_zspage) { @@ -1157,116 +1135,130 @@ unsigned long zs_get_total_pages(struct zs_pool *pool) } EXPORT_SYMBOL_GPL(zs_get_total_pages); -/** - * zs_map_object - get address of allocated object from handle. - * @pool: pool from which the object was allocated - * @handle: handle returned from zs_malloc - * @mm: mapping mode to use - * - * Before using an object allocated from zs_malloc, it must be mapped using - * this function. When done with the object, it must be unmapped using - * zs_unmap_object. - * - * Only one object can be mapped per cpu at a time. There is no protection - * against nested mappings. - * - * This function returns with preemption and page faults disabled. - */ -void *zs_map_object(struct zs_pool *pool, unsigned long handle, - enum zs_mapmode mm) +void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle, + void *local_copy) { struct zspage *zspage; - struct page *page; + struct zpdesc *zpdesc; unsigned long obj, off; unsigned int obj_idx; - struct size_class *class; - struct mapping_area *area; - struct page *pages[2]; - void *ret; - - /* - * Because we use per-cpu mapping areas shared among the - * pools/users, we can't allow mapping in interrupt context - * because it can corrupt another users mappings. - */ - BUG_ON(in_interrupt()); + void *addr; - /* It guarantees it can get zspage from handle safely */ - read_lock(&pool->migrate_lock); + /* Guarantee we can get zspage from handle safely */ + read_lock(&pool->lock); obj = handle_to_obj(handle); - obj_to_location(obj, &page, &obj_idx); - zspage = get_zspage(page); + obj_to_location(obj, &zpdesc, &obj_idx); + zspage = get_zspage(zpdesc); - /* - * migration cannot move any zpages in this zspage. Here, class->lock - * is too heavy since callers would take some time until they calls - * zs_unmap_object API so delegate the locking from class to zspage - * which is smaller granularity. - */ - migrate_read_lock(zspage); - read_unlock(&pool->migrate_lock); + /* Make sure migration doesn't move any pages in this zspage */ + zspage_read_lock(zspage); + read_unlock(&pool->lock); class = zspage_class(pool, zspage); off = offset_in_page(class->size * obj_idx); - local_lock(&zs_map_area.lock); - area = this_cpu_ptr(&zs_map_area); - area->vm_mm = mm; if (off + class->size <= PAGE_SIZE) { /* this object is contained entirely within a page */ - area->vm_addr = kmap_local_page(page); - ret = area->vm_addr + off; - goto out; + addr = kmap_local_zpdesc(zpdesc); + addr += off; + } else { + size_t sizes[2]; + + /* this object spans two pages */ + sizes[0] = PAGE_SIZE - off; + sizes[1] = class->size - sizes[0]; + addr = local_copy; + + memcpy_from_page(addr, zpdesc_page(zpdesc), + off, sizes[0]); + zpdesc = get_next_zpdesc(zpdesc); + memcpy_from_page(addr + sizes[0], + zpdesc_page(zpdesc), + 0, sizes[1]); } - /* this object spans two pages */ - pages[0] = page; - pages[1] = get_next_page(page); - BUG_ON(!pages[1]); - - ret = __zs_map_object(area, pages, off, class->size); -out: - if (likely(!ZsHugePage(zspage))) - ret += ZS_HANDLE_SIZE; + if (!ZsHugePage(zspage)) + addr += ZS_HANDLE_SIZE; - return ret; + return addr; } -EXPORT_SYMBOL_GPL(zs_map_object); +EXPORT_SYMBOL_GPL(zs_obj_read_begin); -void zs_unmap_object(struct zs_pool *pool, unsigned long handle) +void zs_obj_read_end(struct zs_pool *pool, unsigned long handle, + void *handle_mem) { struct zspage *zspage; - struct page *page; + struct zpdesc *zpdesc; unsigned long obj, off; unsigned int obj_idx; - struct size_class *class; - struct mapping_area *area; obj = handle_to_obj(handle); - obj_to_location(obj, &page, &obj_idx); - zspage = get_zspage(page); + obj_to_location(obj, &zpdesc, &obj_idx); + zspage = get_zspage(zpdesc); class = zspage_class(pool, zspage); off = offset_in_page(class->size * obj_idx); - area = this_cpu_ptr(&zs_map_area); - if (off + class->size <= PAGE_SIZE) - kunmap_local(area->vm_addr); - else { - struct page *pages[2]; + if (off + class->size <= PAGE_SIZE) { + if (!ZsHugePage(zspage)) + off += ZS_HANDLE_SIZE; + handle_mem -= off; + kunmap_local(handle_mem); + } + + zspage_read_unlock(zspage); +} +EXPORT_SYMBOL_GPL(zs_obj_read_end); + +void zs_obj_write(struct zs_pool *pool, unsigned long handle, + void *handle_mem, size_t mem_len) +{ + struct zspage *zspage; + struct zpdesc *zpdesc; + unsigned long obj, off; + unsigned int obj_idx; + struct size_class *class; + + /* Guarantee we can get zspage from handle safely */ + read_lock(&pool->lock); + obj = handle_to_obj(handle); + obj_to_location(obj, &zpdesc, &obj_idx); + zspage = get_zspage(zpdesc); + + /* Make sure migration doesn't move any pages in this zspage */ + zspage_read_lock(zspage); + read_unlock(&pool->lock); + + class = zspage_class(pool, zspage); + off = offset_in_page(class->size * obj_idx); - pages[0] = page; - pages[1] = get_next_page(page); - BUG_ON(!pages[1]); + if (!ZsHugePage(zspage)) + off += ZS_HANDLE_SIZE; - __zs_unmap_object(area, pages, off, class->size); + if (off + mem_len <= PAGE_SIZE) { + /* this object is contained entirely within a page */ + void *dst = kmap_local_zpdesc(zpdesc); + + memcpy(dst + off, handle_mem, mem_len); + kunmap_local(dst); + } else { + /* this object spans two pages */ + size_t sizes[2]; + + sizes[0] = PAGE_SIZE - off; + sizes[1] = mem_len - sizes[0]; + + memcpy_to_page(zpdesc_page(zpdesc), off, + handle_mem, sizes[0]); + zpdesc = get_next_zpdesc(zpdesc); + memcpy_to_page(zpdesc_page(zpdesc), 0, + handle_mem + sizes[0], sizes[1]); } - local_unlock(&zs_map_area.lock); - migrate_read_unlock(zspage); + zspage_read_unlock(zspage); } -EXPORT_SYMBOL_GPL(zs_unmap_object); +EXPORT_SYMBOL_GPL(zs_obj_write); /** * zs_huge_class_size() - Returns the size (in bytes) of the first huge @@ -1290,12 +1282,12 @@ EXPORT_SYMBOL_GPL(zs_huge_class_size); static unsigned long obj_malloc(struct zs_pool *pool, struct zspage *zspage, unsigned long handle) { - int i, nr_page, offset; + int i, nr_zpdesc, offset; unsigned long obj; struct link_free *link; struct size_class *class; - struct page *m_page; + struct zpdesc *m_zpdesc; unsigned long m_offset; void *vaddr; @@ -1303,27 +1295,26 @@ static unsigned long obj_malloc(struct zs_pool *pool, obj = get_freeobj(zspage); offset = obj * class->size; - nr_page = offset >> PAGE_SHIFT; + nr_zpdesc = offset >> PAGE_SHIFT; m_offset = offset_in_page(offset); - m_page = get_first_page(zspage); + m_zpdesc = get_first_zpdesc(zspage); - for (i = 0; i < nr_page; i++) - m_page = get_next_page(m_page); + for (i = 0; i < nr_zpdesc; i++) + m_zpdesc = get_next_zpdesc(m_zpdesc); - vaddr = kmap_local_page(m_page); + vaddr = kmap_local_zpdesc(m_zpdesc); link = (struct link_free *)vaddr + m_offset / sizeof(*link); set_freeobj(zspage, link->next >> OBJ_TAG_BITS); if (likely(!ZsHugePage(zspage))) /* record handle in the header of allocated chunk */ link->handle = handle | OBJ_ALLOCATED_TAG; else - /* record handle to page->index */ - zspage->first_page->index = handle | OBJ_ALLOCATED_TAG; + zspage->first_zpdesc->handle = handle | OBJ_ALLOCATED_TAG; kunmap_local(vaddr); mod_zspage_inuse(zspage, 1); - obj = location_to_obj(m_page, obj); + obj = location_to_obj(m_zpdesc, obj); record_obj(handle, obj); return obj; @@ -1335,12 +1326,14 @@ static unsigned long obj_malloc(struct zs_pool *pool, * @pool: pool to allocate from * @size: size of block to allocate * @gfp: gfp flags when allocating object + * @nid: The preferred node id to allocate new zspage (if needed) * * On success, handle to the allocated object is returned, * otherwise an ERR_PTR(). * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. */ -unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) +unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp, + const int nid) { unsigned long handle; struct size_class *class; @@ -1375,7 +1368,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) spin_unlock(&class->lock); - zspage = alloc_zspage(pool, class, gfp); + zspage = alloc_zspage(pool, class, gfp, nid); if (!zspage) { cache_free_handle(pool, handle); return (unsigned long)ERR_PTR(-ENOMEM); @@ -1402,23 +1395,24 @@ static void obj_free(int class_size, unsigned long obj) { struct link_free *link; struct zspage *zspage; - struct page *f_page; + struct zpdesc *f_zpdesc; unsigned long f_offset; unsigned int f_objidx; void *vaddr; - obj_to_location(obj, &f_page, &f_objidx); + + obj_to_location(obj, &f_zpdesc, &f_objidx); f_offset = offset_in_page(class_size * f_objidx); - zspage = get_zspage(f_page); + zspage = get_zspage(f_zpdesc); - vaddr = kmap_local_page(f_page); + vaddr = kmap_local_zpdesc(f_zpdesc); link = (struct link_free *)(vaddr + f_offset); /* Insert this object in containing zspage's freelist */ if (likely(!ZsHugePage(zspage))) link->next = get_freeobj(zspage) << OBJ_TAG_BITS; else - f_page->index = 0; + f_zpdesc->handle = 0; set_freeobj(zspage, f_objidx); kunmap_local(vaddr); @@ -1428,7 +1422,7 @@ static void obj_free(int class_size, unsigned long obj) void zs_free(struct zs_pool *pool, unsigned long handle) { struct zspage *zspage; - struct page *f_page; + struct zpdesc *f_zpdesc; unsigned long obj; struct size_class *class; int fullness; @@ -1437,16 +1431,16 @@ void zs_free(struct zs_pool *pool, unsigned long handle) return; /* - * The pool->migrate_lock protects the race with zpage's migration + * The pool->lock protects the race with zpage's migration * so it's safe to get the page from handle. */ - read_lock(&pool->migrate_lock); + read_lock(&pool->lock); obj = handle_to_obj(handle); - obj_to_page(obj, &f_page); - zspage = get_zspage(f_page); + obj_to_zpdesc(obj, &f_zpdesc); + zspage = get_zspage(f_zpdesc); class = zspage_class(pool, zspage); spin_lock(&class->lock); - read_unlock(&pool->migrate_lock); + read_unlock(&pool->lock); class_stat_sub(class, ZS_OBJS_INUSE, 1); obj_free(class->size, obj); @@ -1463,7 +1457,7 @@ EXPORT_SYMBOL_GPL(zs_free); static void zs_object_copy(struct size_class *class, unsigned long dst, unsigned long src) { - struct page *s_page, *d_page; + struct zpdesc *s_zpdesc, *d_zpdesc; unsigned int s_objidx, d_objidx; unsigned long s_off, d_off; void *s_addr, *d_addr; @@ -1472,8 +1466,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, s_size = d_size = class->size; - obj_to_location(src, &s_page, &s_objidx); - obj_to_location(dst, &d_page, &d_objidx); + obj_to_location(src, &s_zpdesc, &s_objidx); + obj_to_location(dst, &d_zpdesc, &d_objidx); s_off = offset_in_page(class->size * s_objidx); d_off = offset_in_page(class->size * d_objidx); @@ -1484,8 +1478,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, if (d_off + class->size > PAGE_SIZE) d_size = PAGE_SIZE - d_off; - s_addr = kmap_local_page(s_page); - d_addr = kmap_local_page(d_page); + s_addr = kmap_local_zpdesc(s_zpdesc); + d_addr = kmap_local_zpdesc(d_zpdesc); while (1) { size = min(s_size, d_size); @@ -1510,17 +1504,17 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, if (s_off >= PAGE_SIZE) { kunmap_local(d_addr); kunmap_local(s_addr); - s_page = get_next_page(s_page); - s_addr = kmap_local_page(s_page); - d_addr = kmap_local_page(d_page); + s_zpdesc = get_next_zpdesc(s_zpdesc); + s_addr = kmap_local_zpdesc(s_zpdesc); + d_addr = kmap_local_zpdesc(d_zpdesc); s_size = class->size - written; s_off = 0; } if (d_off >= PAGE_SIZE) { kunmap_local(d_addr); - d_page = get_next_page(d_page); - d_addr = kmap_local_page(d_page); + d_zpdesc = get_next_zpdesc(d_zpdesc); + d_addr = kmap_local_zpdesc(d_zpdesc); d_size = class->size - written; d_off = 0; } @@ -1535,18 +1529,18 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, * return handle. */ static unsigned long find_alloced_obj(struct size_class *class, - struct page *page, int *obj_idx) + struct zpdesc *zpdesc, int *obj_idx) { unsigned int offset; int index = *obj_idx; unsigned long handle = 0; - void *addr = kmap_local_page(page); + void *addr = kmap_local_zpdesc(zpdesc); - offset = get_first_obj_offset(page); + offset = get_first_obj_offset(zpdesc); offset += class->size * index; while (offset < PAGE_SIZE) { - if (obj_allocated(page, addr + offset, &handle)) + if (obj_allocated(zpdesc, addr + offset, &handle)) break; offset += class->size; @@ -1566,14 +1560,14 @@ static void migrate_zspage(struct zs_pool *pool, struct zspage *src_zspage, unsigned long used_obj, free_obj; unsigned long handle; int obj_idx = 0; - struct page *s_page = get_first_page(src_zspage); + struct zpdesc *s_zpdesc = get_first_zpdesc(src_zspage); struct size_class *class = pool->size_class[src_zspage->class]; while (1) { - handle = find_alloced_obj(class, s_page, &obj_idx); + handle = find_alloced_obj(class, s_zpdesc, &obj_idx); if (!handle) { - s_page = get_next_page(s_page); - if (!s_page) + s_zpdesc = get_next_zpdesc(s_zpdesc); + if (!s_zpdesc) break; obj_idx = 0; continue; @@ -1653,93 +1647,70 @@ static int putback_zspage(struct size_class *class, struct zspage *zspage) */ static void lock_zspage(struct zspage *zspage) { - struct page *curr_page, *page; + struct zpdesc *curr_zpdesc, *zpdesc; /* * Pages we haven't locked yet can be migrated off the list while we're * trying to lock them, so we need to be careful and only attempt to - * lock each page under migrate_read_lock(). Otherwise, the page we lock + * lock each page under zspage_read_lock(). Otherwise, the page we lock * may no longer belong to the zspage. This means that we may wait for * the wrong page to unlock, so we must take a reference to the page - * prior to waiting for it to unlock outside migrate_read_lock(). + * prior to waiting for it to unlock outside zspage_read_lock(). */ while (1) { - migrate_read_lock(zspage); - page = get_first_page(zspage); - if (trylock_page(page)) + zspage_read_lock(zspage); + zpdesc = get_first_zpdesc(zspage); + if (zpdesc_trylock(zpdesc)) break; - get_page(page); - migrate_read_unlock(zspage); - wait_on_page_locked(page); - put_page(page); + zpdesc_get(zpdesc); + zspage_read_unlock(zspage); + zpdesc_wait_locked(zpdesc); + zpdesc_put(zpdesc); } - curr_page = page; - while ((page = get_next_page(curr_page))) { - if (trylock_page(page)) { - curr_page = page; + curr_zpdesc = zpdesc; + while ((zpdesc = get_next_zpdesc(curr_zpdesc))) { + if (zpdesc_trylock(zpdesc)) { + curr_zpdesc = zpdesc; } else { - get_page(page); - migrate_read_unlock(zspage); - wait_on_page_locked(page); - put_page(page); - migrate_read_lock(zspage); + zpdesc_get(zpdesc); + zspage_read_unlock(zspage); + zpdesc_wait_locked(zpdesc); + zpdesc_put(zpdesc); + zspage_read_lock(zspage); } } - migrate_read_unlock(zspage); + zspage_read_unlock(zspage); } #endif /* CONFIG_COMPACTION */ -static void migrate_lock_init(struct zspage *zspage) -{ - rwlock_init(&zspage->lock); -} - -static void migrate_read_lock(struct zspage *zspage) __acquires(&zspage->lock) -{ - read_lock(&zspage->lock); -} - -static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock) -{ - read_unlock(&zspage->lock); -} - -static void migrate_write_lock(struct zspage *zspage) -{ - write_lock(&zspage->lock); -} - -static void migrate_write_unlock(struct zspage *zspage) -{ - write_unlock(&zspage->lock); -} - #ifdef CONFIG_COMPACTION static const struct movable_operations zsmalloc_mops; static void replace_sub_page(struct size_class *class, struct zspage *zspage, - struct page *newpage, struct page *oldpage) + struct zpdesc *newzpdesc, struct zpdesc *oldzpdesc) { - struct page *page; - struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, }; + struct zpdesc *zpdesc; + struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, }; + unsigned int first_obj_offset; int idx = 0; - page = get_first_page(zspage); + zpdesc = get_first_zpdesc(zspage); do { - if (page == oldpage) - pages[idx] = newpage; + if (zpdesc == oldzpdesc) + zpdescs[idx] = newzpdesc; else - pages[idx] = page; + zpdescs[idx] = zpdesc; idx++; - } while ((page = get_next_page(page)) != NULL); + } while ((zpdesc = get_next_zpdesc(zpdesc)) != NULL); - create_page_chain(class, zspage, pages); - set_first_obj_offset(newpage, get_first_obj_offset(oldpage)); + create_page_chain(class, zspage, zpdescs); + first_obj_offset = get_first_obj_offset(oldzpdesc); + set_first_obj_offset(newzpdesc, first_obj_offset); if (unlikely(ZsHugePage(zspage))) - newpage->index = oldpage->index; - __SetPageMovable(newpage, &zsmalloc_mops); + newzpdesc->handle = oldzpdesc->handle; + __zpdesc_set_movable(newzpdesc, &zsmalloc_mops); } static bool zs_page_isolate(struct page *page, isolate_mode_t mode) @@ -1759,76 +1730,81 @@ static int zs_page_migrate(struct page *newpage, struct page *page, struct zs_pool *pool; struct size_class *class; struct zspage *zspage; - struct page *dummy; + struct zpdesc *dummy; + struct zpdesc *newzpdesc = page_zpdesc(newpage); + struct zpdesc *zpdesc = page_zpdesc(page); void *s_addr, *d_addr, *addr; unsigned int offset; unsigned long handle; unsigned long old_obj, new_obj; unsigned int obj_idx; - VM_BUG_ON_PAGE(!PageIsolated(page), page); - - /* We're committed, tell the world that this is a Zsmalloc page. */ - __SetPageZsmalloc(newpage); + VM_BUG_ON_PAGE(!zpdesc_is_isolated(zpdesc), zpdesc_page(zpdesc)); /* The page is locked, so this pointer must remain valid */ - zspage = get_zspage(page); + zspage = get_zspage(zpdesc); pool = zspage->pool; /* * The pool migrate_lock protects the race between zpage migration * and zs_free. */ - write_lock(&pool->migrate_lock); + write_lock(&pool->lock); class = zspage_class(pool, zspage); /* * the class lock protects zpage alloc/free in the zspage. */ spin_lock(&class->lock); - /* the migrate_write_lock protects zpage access via zs_map_object */ - migrate_write_lock(zspage); + /* the zspage write_lock protects zpage access via zs_obj_read/write() */ + if (!zspage_write_trylock(zspage)) { + spin_unlock(&class->lock); + write_unlock(&pool->lock); + return -EINVAL; + } - offset = get_first_obj_offset(page); - s_addr = kmap_local_page(page); + /* We're committed, tell the world that this is a Zsmalloc page. */ + __zpdesc_set_zsmalloc(newzpdesc); + + offset = get_first_obj_offset(zpdesc); + s_addr = kmap_local_zpdesc(zpdesc); /* * Here, any user cannot access all objects in the zspage so let's move. */ - d_addr = kmap_local_page(newpage); + d_addr = kmap_local_zpdesc(newzpdesc); copy_page(d_addr, s_addr); kunmap_local(d_addr); for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE; addr += class->size) { - if (obj_allocated(page, addr, &handle)) { + if (obj_allocated(zpdesc, addr, &handle)) { old_obj = handle_to_obj(handle); obj_to_location(old_obj, &dummy, &obj_idx); - new_obj = (unsigned long)location_to_obj(newpage, - obj_idx); + new_obj = (unsigned long)location_to_obj(newzpdesc, obj_idx); record_obj(handle, new_obj); } } kunmap_local(s_addr); - replace_sub_page(class, zspage, newpage, page); + replace_sub_page(class, zspage, newzpdesc, zpdesc); /* * Since we complete the data copy and set up new zspage structure, * it's okay to release migration_lock. */ - write_unlock(&pool->migrate_lock); + write_unlock(&pool->lock); spin_unlock(&class->lock); - migrate_write_unlock(zspage); + zspage_write_unlock(zspage); - get_page(newpage); - if (page_zone(newpage) != page_zone(page)) { - dec_zone_page_state(page, NR_ZSPAGES); - inc_zone_page_state(newpage, NR_ZSPAGES); + zpdesc_get(newzpdesc); + if (zpdesc_zone(newzpdesc) != zpdesc_zone(zpdesc)) { + zpdesc_dec_zone_page_state(zpdesc); + zpdesc_inc_zone_page_state(newzpdesc); } - reset_page(page); - put_page(page); + reset_zpdesc(zpdesc); + zpdesc_put(zpdesc); return MIGRATEPAGE_SUCCESS; } @@ -1897,13 +1873,13 @@ static void init_deferred_free(struct zs_pool *pool) static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) { - struct page *page = get_first_page(zspage); + struct zpdesc *zpdesc = get_first_zpdesc(zspage); do { - WARN_ON(!trylock_page(page)); - __SetPageMovable(page, &zsmalloc_mops); - unlock_page(page); - } while ((page = get_next_page(page)) != NULL); + WARN_ON(!zpdesc_trylock(zpdesc)); + __zpdesc_set_movable(zpdesc, &zsmalloc_mops); + zpdesc_unlock(zpdesc); + } while ((zpdesc = get_next_zpdesc(zpdesc)) != NULL); } #else static inline void zs_flush_migration(struct zs_pool *pool) { } @@ -1940,7 +1916,7 @@ static unsigned long __zs_compact(struct zs_pool *pool, * protect the race between zpage migration and zs_free * as well as zpage allocation/free */ - write_lock(&pool->migrate_lock); + write_lock(&pool->lock); spin_lock(&class->lock); while (zs_can_compact(class)) { int fg; @@ -1955,9 +1931,11 @@ static unsigned long __zs_compact(struct zs_pool *pool, if (!src_zspage) break; - migrate_write_lock(src_zspage); + if (!zspage_write_trylock(src_zspage)) + break; + migrate_zspage(pool, src_zspage, dst_zspage); - migrate_write_unlock(src_zspage); + zspage_write_unlock(src_zspage); fg = putback_zspage(class, src_zspage); if (fg == ZS_INUSE_RATIO_0) { @@ -1967,14 +1945,14 @@ static unsigned long __zs_compact(struct zs_pool *pool, src_zspage = NULL; if (get_fullness_group(class, dst_zspage) == ZS_INUSE_RATIO_100 - || rwlock_is_contended(&pool->migrate_lock)) { + || rwlock_is_contended(&pool->lock)) { putback_zspage(class, dst_zspage); dst_zspage = NULL; spin_unlock(&class->lock); - write_unlock(&pool->migrate_lock); + write_unlock(&pool->lock); cond_resched(); - write_lock(&pool->migrate_lock); + write_lock(&pool->lock); spin_lock(&class->lock); } } @@ -1986,7 +1964,7 @@ static unsigned long __zs_compact(struct zs_pool *pool, putback_zspage(class, dst_zspage); spin_unlock(&class->lock); - write_unlock(&pool->migrate_lock); + write_unlock(&pool->lock); return pages_freed; } @@ -1998,10 +1976,10 @@ unsigned long zs_compact(struct zs_pool *pool) unsigned long pages_freed = 0; /* - * Pool compaction is performed under pool->migrate_lock so it is basically + * Pool compaction is performed under pool->lock so it is basically * single-threaded. Having more than one thread in __zs_compact() - * will increase pool->migrate_lock contention, which will impact other - * zsmalloc operations that need pool->migrate_lock. + * will increase pool->lock contention, which will impact other + * zsmalloc operations that need pool->lock. */ if (atomic_xchg(&pool->compaction_in_progress, 1)) return 0; @@ -2123,7 +2101,7 @@ struct zs_pool *zs_create_pool(const char *name) return NULL; init_deferred_free(pool); - rwlock_init(&pool->migrate_lock); + rwlock_init(&pool->lock); atomic_set(&pool->compaction_in_progress, 0); pool->name = kstrdup(name, GFP_KERNEL); @@ -2262,23 +2240,11 @@ EXPORT_SYMBOL_GPL(zs_destroy_pool); static int __init zs_init(void) { - int ret; - - ret = cpuhp_setup_state(CPUHP_MM_ZS_PREPARE, "mm/zsmalloc:prepare", - zs_cpu_prepare, zs_cpu_dead); - if (ret) - goto out; - #ifdef CONFIG_ZPOOL zpool_register_driver(&zs_zpool_driver); #endif - zs_stat_init(); - return 0; - -out: - return ret; } static void __exit zs_exit(void) @@ -2286,8 +2252,6 @@ static void __exit zs_exit(void) #ifdef CONFIG_ZPOOL zpool_unregister_driver(&zs_zpool_driver); #endif - cpuhp_remove_state(CPUHP_MM_ZS_PREPARE); - zs_stat_exit(); } diff --git a/mm/zswap.c b/mm/zswap.c index f6316b66fb23..455e9425c5f5 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -43,7 +43,7 @@ * statistics **********************************/ /* The number of compressed pages currently stored in zswap */ -atomic_long_t zswap_stored_pages = ATOMIC_INIT(0); +atomic_long_t zswap_stored_pages = ATOMIC_LONG_INIT(0); /* * The statistics below are not protected from concurrent access for @@ -62,6 +62,8 @@ static u64 zswap_reject_reclaim_fail; static u64 zswap_reject_compress_fail; /* Compressed page was too big for the allocator to (optimally) store */ static u64 zswap_reject_compress_poor; +/* Load or writeback failed due to decompression failure */ +static u64 zswap_decompress_fail; /* Store failed because underlying allocator could not get memory */ static u64 zswap_reject_alloc_fail; /* Store failed because the entry metadata could not be allocated (rare) */ @@ -251,7 +253,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) struct zswap_pool *pool; char name[38]; /* 'zswap' + 32 char (max) num + \0 */ gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; - int ret; + int ret, cpu; if (!zswap_has_pool) { /* if either are unset, pool initialization failed, and we @@ -285,6 +287,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) goto error; } + for_each_possible_cpu(cpu) + mutex_init(&per_cpu_ptr(pool->acomp_ctx, cpu)->mutex); + ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); if (ret) @@ -817,36 +822,41 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) { struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); - struct crypto_acomp *acomp; - struct acomp_req *req; + struct crypto_acomp *acomp = NULL; + struct acomp_req *req = NULL; + u8 *buffer = NULL; int ret; - mutex_init(&acomp_ctx->mutex); - - acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); - if (!acomp_ctx->buffer) - return -ENOMEM; + buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); + if (!buffer) { + ret = -ENOMEM; + goto fail; + } acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); if (IS_ERR(acomp)) { pr_err("could not alloc crypto acomp %s : %ld\n", pool->tfm_name, PTR_ERR(acomp)); ret = PTR_ERR(acomp); - goto acomp_fail; + goto fail; } - acomp_ctx->acomp = acomp; - acomp_ctx->is_sleepable = acomp_is_async(acomp); - req = acomp_request_alloc(acomp_ctx->acomp); + req = acomp_request_alloc(acomp); if (!req) { pr_err("could not alloc crypto acomp_request %s\n", pool->tfm_name); ret = -ENOMEM; - goto req_fail; + goto fail; } - acomp_ctx->req = req; + /* + * Only hold the mutex after completing allocations, otherwise we may + * recurse into zswap through reclaim and attempt to hold the mutex + * again resulting in a deadlock. + */ + mutex_lock(&acomp_ctx->mutex); crypto_init_wait(&acomp_ctx->wait); + /* * if the backend of acomp is async zip, crypto_req_done() will wakeup * crypto_wait_req(); if the backend of acomp is scomp, the callback @@ -855,12 +865,17 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, crypto_req_done, &acomp_ctx->wait); + acomp_ctx->buffer = buffer; + acomp_ctx->acomp = acomp; + acomp_ctx->is_sleepable = acomp_is_async(acomp); + acomp_ctx->req = req; + mutex_unlock(&acomp_ctx->mutex); return 0; -req_fail: - crypto_free_acomp(acomp_ctx->acomp); -acomp_fail: - kfree(acomp_ctx->buffer); +fail: + if (acomp) + crypto_free_acomp(acomp); + kfree(buffer); return ret; } @@ -868,18 +883,60 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) { struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); + struct acomp_req *req; + struct crypto_acomp *acomp; + u8 *buffer; - if (!IS_ERR_OR_NULL(acomp_ctx)) { - if (!IS_ERR_OR_NULL(acomp_ctx->req)) - acomp_request_free(acomp_ctx->req); - if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) - crypto_free_acomp(acomp_ctx->acomp); - kfree(acomp_ctx->buffer); - } + if (IS_ERR_OR_NULL(acomp_ctx)) + return 0; + + mutex_lock(&acomp_ctx->mutex); + req = acomp_ctx->req; + acomp = acomp_ctx->acomp; + buffer = acomp_ctx->buffer; + acomp_ctx->req = NULL; + acomp_ctx->acomp = NULL; + acomp_ctx->buffer = NULL; + mutex_unlock(&acomp_ctx->mutex); + + /* + * Do the actual freeing after releasing the mutex to avoid subtle + * locking dependencies causing deadlocks. + */ + if (!IS_ERR_OR_NULL(req)) + acomp_request_free(req); + if (!IS_ERR_OR_NULL(acomp)) + crypto_free_acomp(acomp); + kfree(buffer); return 0; } +static struct crypto_acomp_ctx *acomp_ctx_get_cpu_lock(struct zswap_pool *pool) +{ + struct crypto_acomp_ctx *acomp_ctx; + + for (;;) { + acomp_ctx = raw_cpu_ptr(pool->acomp_ctx); + mutex_lock(&acomp_ctx->mutex); + if (likely(acomp_ctx->req)) + return acomp_ctx; + /* + * It is possible that we were migrated to a different CPU after + * getting the per-CPU ctx but before the mutex was acquired. If + * the old CPU got offlined, zswap_cpu_comp_dead() could have + * already freed ctx->req (among other things) and set it to + * NULL. Just try again on the new CPU that we ended up on. + */ + mutex_unlock(&acomp_ctx->mutex); + } +} + +static void acomp_ctx_put_unlock(struct crypto_acomp_ctx *acomp_ctx) +{ + mutex_unlock(&acomp_ctx->mutex); +} + static bool zswap_compress(struct page *page, struct zswap_entry *entry, struct zswap_pool *pool) { @@ -889,14 +946,10 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, unsigned int dlen = PAGE_SIZE; unsigned long handle; struct zpool *zpool; - char *buf; gfp_t gfp; u8 *dst; - acomp_ctx = raw_cpu_ptr(pool->acomp_ctx); - - mutex_lock(&acomp_ctx->mutex); - + acomp_ctx = acomp_ctx_get_cpu_lock(pool); dst = acomp_ctx->buffer; sg_init_table(&input, 1); sg_set_page(&input, page, PAGE_SIZE, 0); @@ -927,17 +980,12 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, goto unlock; zpool = pool->zpool; - gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; - if (zpool_malloc_support_movable(zpool)) - gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; - alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle); + gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE; + alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle, page_to_nid(page)); if (alloc_ret) goto unlock; - buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); - memcpy(buf, dst, dlen); - zpool_unmap_handle(zpool, handle); - + zpool_obj_write(zpool, handle, dst, dlen); entry->handle = handle; entry->length = dlen; @@ -949,47 +997,53 @@ unlock: else if (alloc_ret) zswap_reject_alloc_fail++; - mutex_unlock(&acomp_ctx->mutex); + acomp_ctx_put_unlock(acomp_ctx); return comp_ret == 0 && alloc_ret == 0; } -static void zswap_decompress(struct zswap_entry *entry, struct folio *folio) +static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) { struct zpool *zpool = entry->pool->zpool; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; - u8 *src; + int decomp_ret, dlen; + u8 *src, *obj; - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - mutex_lock(&acomp_ctx->mutex); + acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool); + obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffer); - src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); /* - * If zpool_map_handle is atomic, we cannot reliably utilize its mapped buffer - * to do crypto_acomp_decompress() which might sleep. In such cases, we must - * resort to copying the buffer to a temporary one. - * Meanwhile, zpool_map_handle() might return a non-linearly mapped buffer, - * such as a kmap address of high memory or even ever a vmap address. - * However, sg_init_one is only equipped to handle linearly mapped low memory. - * In such cases, we also must copy the buffer to a temporary and lowmem one. + * zpool_obj_read_begin() might return a kmap address of highmem when + * acomp_ctx->buffer is not used. However, sg_init_one() does not + * handle highmem addresses, so copy the object to acomp_ctx->buffer. */ - if ((acomp_ctx->is_sleepable && !zpool_can_sleep_mapped(zpool)) || - !virt_addr_valid(src)) { - memcpy(acomp_ctx->buffer, src, entry->length); + if (virt_addr_valid(obj)) { + src = obj; + } else { + WARN_ON_ONCE(obj == acomp_ctx->buffer); + memcpy(acomp_ctx->buffer, obj, entry->length); src = acomp_ctx->buffer; - zpool_unmap_handle(zpool, entry->handle); } sg_init_one(&input, src, entry->length); sg_init_table(&output, 1); sg_set_folio(&output, folio, PAGE_SIZE, 0); acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE); - BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)); - BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); - mutex_unlock(&acomp_ctx->mutex); + decomp_ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); + dlen = acomp_ctx->req->dlen; - if (src != acomp_ctx->buffer) - zpool_unmap_handle(zpool, entry->handle); + zpool_obj_read_end(zpool, entry->handle, obj); + acomp_ctx_put_unlock(acomp_ctx); + + if (!decomp_ret && dlen == PAGE_SIZE) + return true; + + zswap_decompress_fail++; + pr_alert_ratelimited("Decompression error from zswap (%d:%lu %s %u->%d)\n", + swp_type(entry->swpentry), + swp_offset(entry->swpentry), + entry->pool->tfm_name, entry->length, dlen); + return false; } /********************************* @@ -1015,14 +1069,21 @@ static int zswap_writeback_entry(struct zswap_entry *entry, struct folio *folio; struct mempolicy *mpol; bool folio_was_allocated; + struct swap_info_struct *si; struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, }; + int ret = 0; /* try to allocate swap cache folio */ + si = get_swap_device(swpentry); + if (!si) + return -EEXIST; + mpol = get_task_policy(current); folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, - NO_INTERLEAVE_INDEX, &folio_was_allocated, true); + NO_INTERLEAVE_INDEX, &folio_was_allocated, true); + put_swap_device(si); if (!folio) return -ENOMEM; @@ -1034,8 +1095,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry, * and freed when invalidated by the concurrent shrinker anyway. */ if (!folio_was_allocated) { - folio_put(folio); - return -EEXIST; + ret = -EEXIST; + goto out; } /* @@ -1048,14 +1109,17 @@ static int zswap_writeback_entry(struct zswap_entry *entry, * be dereferenced. */ tree = swap_zswap_tree(swpentry); - if (entry != xa_cmpxchg(tree, offset, entry, NULL, GFP_KERNEL)) { - delete_from_swap_cache(folio); - folio_unlock(folio); - folio_put(folio); - return -ENOMEM; + if (entry != xa_load(tree, offset)) { + ret = -ENOMEM; + goto out; + } + + if (!zswap_decompress(entry, folio)) { + ret = -EIO; + goto out; } - zswap_decompress(entry, folio); + xa_erase(tree, offset); count_vm_event(ZSWPWB); if (entry->objcg) @@ -1071,9 +1135,14 @@ static int zswap_writeback_entry(struct zswap_entry *entry, /* start writeback */ __swap_writepage(folio, &wbc); - folio_put(folio); - return 0; +out: + if (ret && ret != -EEXIST) { + delete_from_swap_cache(folio); + folio_unlock(folio); + } + folio_put(folio); + return ret; } /********************************* @@ -1156,7 +1225,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o /* * It's safe to drop the lock here because we return either - * LRU_REMOVED_RETRY or LRU_RETRY. + * LRU_REMOVED_RETRY, LRU_RETRY or LRU_STOP. */ spin_unlock(&l->lock); @@ -1409,9 +1478,9 @@ resched: * main API **********************************/ -static ssize_t zswap_store_page(struct page *page, - struct obj_cgroup *objcg, - struct zswap_pool *pool) +static bool zswap_store_page(struct page *page, + struct obj_cgroup *objcg, + struct zswap_pool *pool) { swp_entry_t page_swpentry = page_swap_entry(page); struct zswap_entry *entry, *old; @@ -1420,7 +1489,7 @@ static ssize_t zswap_store_page(struct page *page, entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); if (!entry) { zswap_reject_kmemcache_fail++; - return -EINVAL; + return false; } if (!zswap_compress(page, entry, pool)) @@ -1447,13 +1516,17 @@ static ssize_t zswap_store_page(struct page *page, /* * The entry is successfully compressed and stored in the tree, there is - * no further possibility of failure. Grab refs to the pool and objcg. - * These refs will be dropped by zswap_entry_free() when the entry is - * removed from the tree. + * no further possibility of failure. Grab refs to the pool and objcg, + * charge zswap memory, and increment zswap_stored_pages. + * The opposite actions will be performed by zswap_entry_free() + * when the entry is removed from the tree. */ zswap_pool_get(pool); - if (objcg) + if (objcg) { obj_cgroup_get(objcg); + obj_cgroup_charge_zswap(objcg, entry->length); + } + atomic_long_inc(&zswap_stored_pages); /* * We finish initializing the entry while it's already in xarray. @@ -1474,13 +1547,13 @@ static ssize_t zswap_store_page(struct page *page, zswap_lru_add(&zswap_list_lru, entry); } - return entry->length; + return true; store_failed: zpool_free(pool->zpool, entry->handle); compress_failed: zswap_entry_cache_free(entry); - return -EINVAL; + return false; } bool zswap_store(struct folio *folio) @@ -1490,7 +1563,6 @@ bool zswap_store(struct folio *folio) struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; struct zswap_pool *pool; - size_t compressed_bytes = 0; bool ret = false; long index; @@ -1528,20 +1600,14 @@ bool zswap_store(struct folio *folio) for (index = 0; index < nr_pages; ++index) { struct page *page = folio_page(folio, index); - ssize_t bytes; - bytes = zswap_store_page(page, objcg, pool); - if (bytes < 0) + if (!zswap_store_page(page, objcg, pool)) goto put_pool; - compressed_bytes += bytes; } - if (objcg) { - obj_cgroup_charge_zswap(objcg, compressed_bytes); + if (objcg) count_objcg_events(objcg, ZSWPOUT, nr_pages); - } - atomic_long_add(nr_pages, &zswap_stored_pages); count_vm_events(ZSWPOUT, nr_pages); ret = true; @@ -1576,7 +1642,27 @@ check_old: return ret; } -bool zswap_load(struct folio *folio) +/** + * zswap_load() - load a folio from zswap + * @folio: folio to load + * + * Return: 0 on success, with the folio unlocked and marked up-to-date, or one + * of the following error codes: + * + * -EIO: if the swapped out content was in zswap, but could not be loaded + * into the page due to a decompression failure. The folio is unlocked, but + * NOT marked up-to-date, so that an IO error is emitted (e.g. do_swap_page() + * will SIGBUS). + * + * -EINVAL: if the swapped out content was in zswap, but the page belongs + * to a large folio, which is not supported by zswap. The folio is unlocked, + * but NOT marked up-to-date, so that an IO error is emitted (e.g. + * do_swap_page() will SIGBUS). + * + * -ENOENT: if the swapped out content was not in zswap. The folio remains + * locked on return. + */ +int zswap_load(struct folio *folio) { swp_entry_t swp = folio->swap; pgoff_t offset = swp_offset(swp); @@ -1587,18 +1673,32 @@ bool zswap_load(struct folio *folio) VM_WARN_ON_ONCE(!folio_test_locked(folio)); if (zswap_never_enabled()) - return false; + return -ENOENT; /* * Large folios should not be swapped in while zswap is being used, as * they are not properly handled. Zswap does not properly load large * folios, and a large folio may only be partially in zswap. - * - * Return true without marking the folio uptodate so that an IO error is - * emitted (e.g. do_swap_page() will sigbus). */ - if (WARN_ON_ONCE(folio_test_large(folio))) - return true; + if (WARN_ON_ONCE(folio_test_large(folio))) { + folio_unlock(folio); + return -EINVAL; + } + + entry = xa_load(tree, offset); + if (!entry) + return -ENOENT; + + if (!zswap_decompress(entry, folio)) { + folio_unlock(folio); + return -EIO; + } + + folio_mark_uptodate(folio); + + count_vm_event(ZSWPIN); + if (entry->objcg) + count_objcg_events(entry->objcg, ZSWPIN, 1); /* * When reading into the swapcache, invalidate our entry. The @@ -1612,27 +1712,14 @@ bool zswap_load(struct folio *folio) * files, which reads into a private page and may free it if * the fault fails. We remain the primary owner of the entry.) */ - if (swapcache) - entry = xa_erase(tree, offset); - else - entry = xa_load(tree, offset); - - if (!entry) - return false; - - zswap_decompress(entry, folio); - - count_vm_event(ZSWPIN); - if (entry->objcg) - count_objcg_events(entry->objcg, ZSWPIN, 1); - if (swapcache) { - zswap_entry_free(entry); folio_mark_dirty(folio); + xa_erase(tree, offset); + zswap_entry_free(entry); } - folio_mark_uptodate(folio); - return true; + folio_unlock(folio); + return 0; } void zswap_invalidate(swp_entry_t swp) @@ -1727,6 +1814,8 @@ static int zswap_debugfs_init(void) zswap_debugfs_root, &zswap_reject_compress_fail); debugfs_create_u64("reject_compress_poor", 0444, zswap_debugfs_root, &zswap_reject_compress_poor); + debugfs_create_u64("decompress_fail", 0444, + zswap_debugfs_root, &zswap_decompress_fail); debugfs_create_u64("written_back_pages", 0444, zswap_debugfs_root, &zswap_written_back_pages); debugfs_create_file("pool_total_size", 0444, |