From 8b7b85384fad6e21e8a28628e7ebacb5a6329de4 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Mon, 23 Mar 2026 09:20:42 +0200 Subject: memblock: move reserve_bootmem_region() to memblock.c and make it static reserve_bootmem_region() is only called from memmap_init_reserved_pages() and it was in mm/mm_init.c because of its dependencies on the static init_deferred_page(). Since init_deferred_page() is not static anymore, move reserve_bootmem_region(), rename it to memmap_init_reserved_range() and make it static. Update the comment describing it to better reflect what the function does and drop the bogus comment about reserved pages in free_bootmem_page(). Update memblock test stubs to reflect the core changes. Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: David Hildenbrand (Arm) Link: https://patch.msgid.link/20260323072042.3651061-1-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) --- include/linux/mm.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index abb4963c1f06..764d10fdfb5d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3686,9 +3686,6 @@ extern unsigned long free_reserved_area(void *start, void *end, extern void adjust_managed_page_count(struct page *page, long count); -extern void reserve_bootmem_region(phys_addr_t start, - phys_addr_t end, int nid); - /* Free the reserved page into the buddy system, so it gets managed. */ void free_reserved_page(struct page *page); -- cgit v1.2.3 From 2b8acf8450f577d3785dacfd616630b76dc8f88d Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 25 Feb 2026 16:13:58 +0000 Subject: mm: introduce vm_mmap_shadow_stack() as a helper for VM_SHADOW_STACK mappings Patch series "mm: arch/shstk: Common shadow stack mapping helper and VM_NOHUGEPAGE", v2. A series to extract the common shadow stack mmap into a separate helper for arm64, riscv and x86. This patch (of 5): arm64, riscv and x86 use a similar pattern for mapping the user shadow stack (cloned from x86). Extract this into a helper to facilitate code reuse. The call to do_mmap() from the new helper uses PROT_READ|PROT_WRITE prot bits instead of PROT_READ with an explicit VM_WRITE vm_flag. The x86 intent was to avoid PROT_WRITE implying normal write since the shadow stack is not writable by normal stores. However, from a kernel perspective, the vma is writeable. Functionally there is no difference. Link: https://lkml.kernel.org/r/20260225161404.3157851-1-catalin.marinas@arm.com Link: https://lkml.kernel.org/r/20260225161404.3157851-2-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Tested-by: Deepak Gupta Reviewed-by: Mark Brown Acked-by: David Hildenbrand (Arm) Reviewed-by: Mike Rapoport (Microsoft) Cc: Albert Ou Cc: Alexandre Ghiti Cc: "Borislav Petkov (AMD)" Cc: "Edgecombe, Rick P" Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "Liam R.
Howlett" Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Palmer Dabbelt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Will Deacon Cc: Dave Hansen Cc: Paul Walmsley Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ mm/util.c | 25 +++++++++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index abb4963c1f06..bb0cfe38ca19 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3903,6 +3903,8 @@ extern int vm_munmap(unsigned long, size_t); extern unsigned long __must_check vm_mmap(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); +extern unsigned long __must_check vm_mmap_shadow_stack(unsigned long addr, + unsigned long len, unsigned long flags); struct vm_unmapped_area_info { #define VM_UNMAPPED_AREA_TOPDOWN 1 diff --git a/mm/util.c b/mm/util.c index b05ab6f97e11..51f7f417e91f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -618,6 +618,31 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, } EXPORT_SYMBOL(vm_mmap); +#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK +/* + * Perform a userland memory mapping for a shadow stack into the current + * process address space. This is intended to be used by architectures that + * support user shadow stacks. + */ +unsigned long vm_mmap_shadow_stack(unsigned long addr, unsigned long len, + unsigned long flags) +{ + struct mm_struct *mm = current->mm; + unsigned long ret, unused; + + flags |= MAP_ANONYMOUS | MAP_PRIVATE; + if (addr) + flags |= MAP_FIXED_NOREPLACE; + + mmap_write_lock(mm); + ret = do_mmap(NULL, addr, len, PROT_READ | PROT_WRITE, flags, + VM_SHADOW_STACK, 0, &unused, NULL); + mmap_write_unlock(mm); + + return ret; +} +#endif /* CONFIG_ARCH_HAS_USER_SHADOW_STACK */ + /** * __vmalloc_array - allocate memory for a virtually contiguous array. * @n: number of elements. -- cgit v1.2.3 From a2c77ec320a99581e8272868ccfa53a7d7a7b168 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:39 +0000 Subject: mm: move MAX_FOLIO_ORDER definition to mmzone.h Patch series "mm: Eliminate fake head pages from vmemmap optimization", v7. This series removes "fake head pages" from the HugeTLB vmemmap optimization (HVO) by changing how tail pages encode their relationship to the head page. It simplifies compound_head() and page_ref_add_unless(). Both are in the hot path. Background ========== HVO reduces memory overhead by freeing vmemmap pages for HugeTLB pages and remapping the freed virtual addresses to a single physical page. Previously, all tail page vmemmap entries were remapped to the first vmemmap page (containing the head struct page), creating "fake heads" - tail pages that appear to have PG_head set when accessed through the deduplicated vmemmap. This required special handling in compound_head() to detect and work around fake heads, adding complexity and overhead to a very hot path. New Approach ============ For architectures/configs where sizeof(struct page) is a power of 2 (the common case), this series changes how position of the head page is encoded in the tail pages. Instead of storing a pointer to the head page, the ->compound_info (renamed from ->compound_head) now stores a mask. The mask can be applied to any tail page's virtual address to compute the head page address. Critically, all tail pages of the same order now have identical compound_info values, regardless of which compound page they belong to. 
In v7, these shared tail pages are allocated per-zone. This ensures that zone information (stored in page->flags) is correct even for shared tail pages, removing the need for the special-casing in page_zonenum() proposed in earlier versions. To support per-zone shared pages for boot-allocated gigantic pages, the vmemmap population is deferred until zones are initialized. This simplifies the logic significantly and allows the removal of vmemmap_undo_hvo(). Benefits ======== 1. Simplified compound_head(): No fake head detection needed, can be implemented in a branchless manner. 2. Simplified page_ref_add_unless(): RCU protection removed since there's no race with fake head remapping. 3. Cleaner architecture: The shared tail pages are truly read-only and contain valid tail page metadata. If sizeof(struct page) is not a power of 2, there are no functional changes. HVO is not supported in this configuration. I had hoped to see performance improvement, but my testing thus far has shown either no change or only a slight improvement within the noise. Series Organization =================== Patch 1: Move MAX_FOLIO_ORDER definition to mmzone.h. Patches 2-4: Refactoring of field names and interfaces. Patches 5-6: Architecture alignment for LoongArch and RISC-V. Patch 7: Mask-based compound_head() implementation. Patch 8: Add memmap alignment checks. Patch 9: Branchless compound_head() optimization. Patch 10: Defer vmemmap population for bootmem hugepages. Patch 11: Refactor vmemmap_walk. Patch 12: x86 vDSO build fix. Patch 13: Eliminate fake heads with per-zone shared tail pages. Patches 14-16: Cleanup of fake head infrastructure. Patch 17: Documentation update. Patch 18: Use compound_head() in page_slab(). This patch (of 18): Move MAX_FOLIO_ORDER definition from mm.h to mmzone.h. This is preparation for adding the vmemmap_tails array to struct zone, which requires MAX_FOLIO_ORDER to be available in mmzone.h. Link: https://lkml.kernel.org/r/20260227194302.274384-1-kas@kernel.org Link: https://lkml.kernel.org/r/20260227194302.274384-2-kas@kernel.org Signed-off-by: Kiryl Shutsemau Acked-by: David Hildenbrand (Red Hat) Acked-by: Zi Yan Acked-by: Muchun Song Acked-by: Usama Arif Reviewed-by: Vlastimil Babka Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: WANG Xuerui Signed-off-by: Andrew Morton --- include/linux/mm.h | 31 ------------------------------- include/linux/mmzone.h | 31 +++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 31 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index bb0cfe38ca19..4e999c21d89a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -2479,36 +2478,6 @@ static inline unsigned long folio_nr_pages(const struct folio *folio) return folio_large_nr_pages(folio); } -#if !defined(CONFIG_HAVE_GIGANTIC_FOLIOS) -/* - * We don't expect any folios that exceed buddy sizes (and consequently - * memory sections).
- */ -#define MAX_FOLIO_ORDER MAX_PAGE_ORDER -#elif defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) -/* - * Only pages within a single memory section are guaranteed to be - * contiguous. By limiting folios to a single memory section, all folio - * pages are guaranteed to be contiguous. - */ -#define MAX_FOLIO_ORDER PFN_SECTION_SHIFT -#elif defined(CONFIG_HUGETLB_PAGE) -/* - * There is no real limit on the folio size. We limit them to the maximum we - * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect - * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit. - */ -#define MAX_FOLIO_ORDER get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G) -#else -/* - * Without hugetlb, gigantic folios that are bigger than a single PUD are - * currently impossible. - */ -#define MAX_FOLIO_ORDER PUD_ORDER -#endif - -#define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER) - /* * compound_nr() returns the number of pages in this potentially compound * page. compound_nr() can be called on a tail page, and is defined to diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index db41b18a919d..4c481ec77da9 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -23,6 +23,7 @@ #include #include #include +#include #include /* Free memory management - zoned buddy allocator. */ @@ -61,6 +62,36 @@ */ #define PAGE_ALLOC_COSTLY_ORDER 3 +#if !defined(CONFIG_HAVE_GIGANTIC_FOLIOS) +/* + * We don't expect any folios that exceed buddy sizes (and consequently + * memory sections). + */ +#define MAX_FOLIO_ORDER MAX_PAGE_ORDER +#elif defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) +/* + * Only pages within a single memory section are guaranteed to be + * contiguous. By limiting folios to a single memory section, all folio + * pages are guaranteed to be contiguous. + */ +#define MAX_FOLIO_ORDER PFN_SECTION_SHIFT +#elif defined(CONFIG_HUGETLB_PAGE) +/* + * There is no real limit on the folio size. We limit them to the maximum we + * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect + * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit. + */ +#define MAX_FOLIO_ORDER get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G) +#else +/* + * Without hugetlb, gigantic folios that are bigger than a single PUD are + * currently impossible. + */ +#define MAX_FOLIO_ORDER PUD_ORDER +#endif + +#define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER) + enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, -- cgit v1.2.3 From 209e6d9eb13aaf1b6e0fc6f76afc00d055e5ba12 Mon Sep 17 00:00:00 2001 From: "Kiryl Shutsemau (Meta)" Date: Fri, 27 Feb 2026 19:42:47 +0000 Subject: mm/hugetlb: defer vmemmap population for bootmem hugepages Currently, the vmemmap for bootmem-allocated gigantic pages is populated early in hugetlb_vmemmap_init_early(). However, the zone information is only available after zones are initialized. If it is later discovered that a page spans multiple zones, the HVO mapping must be undone and replaced with a normal mapping using vmemmap_undo_hvo(). Defer the actual vmemmap population to hugetlb_vmemmap_init_late(). At this stage, zones are already initialized, so it can be checked if the page is valid for HVO before deciding how to populate the vmemmap. This allows us to remove vmemmap_undo_hvo() and the complex logic required to rollback HVO mappings. In hugetlb_vmemmap_init_late(), if HVO population fails or if the zones are invalid, fall back to a normal vmemmap population. 
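Condensed, the per-page flow in hugetlb_vmemmap_init_late() added by this patch looks roughly like the sketch below; the wrapper name is illustrative, and list handling and page accounting are elided (the real code is inline in the hunks that follow):

static void __init init_late_one_page(int nid, struct huge_bootmem_page *m,
				      unsigned long start, unsigned long end)
{
	if (!hugetlb_bootmem_page_zones_valid(nid, m)) {
		/* Page spans multiple zones: HVO impossible, map normally. */
		vmemmap_populate(start, end, nid, NULL);
		return;
	}

	if (vmemmap_populate_hvo(start, end, nid,
				 HUGETLB_VMEMMAP_RESERVE_SIZE) < 0)
		/* HVO population failed: fall back to a normal mapping. */
		vmemmap_populate(start, end, nid, NULL);
	else
		m->flags |= HUGE_BOOTMEM_ZONES_VALID;
}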
Postponing population until hugetlb_vmemmap_init_late() also makes zone information available from within vmemmap_populate_hvo(). Link: https://lkml.kernel.org/r/20260227194302.274384-10-kas@kernel.org Signed-off-by: Kiryl Shutsemau (Meta) Acked-by: David Hildenbrand (Arm) Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: Vlastimil Babka Cc: WANG Xuerui Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- mm/hugetlb_vmemmap.c | 37 ++++++++++++++++++------------------ mm/sparse-vmemmap.c | 53 ---------------------------------------------------- 3 files changed, 18 insertions(+), 74 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4e999c21d89a..d7e53532a109 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4481,8 +4481,6 @@ int vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); int vmemmap_populate_hvo(unsigned long start, unsigned long end, int node, unsigned long headsize); -int vmemmap_undo_hvo(unsigned long start, unsigned long end, int node, - unsigned long headsize); void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node, unsigned long headsize); void vmemmap_populate_print_last(void); diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index a9280259e12a..935ec5829be9 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -790,7 +790,6 @@ void __init hugetlb_vmemmap_init_early(int nid) { unsigned long psize, paddr, section_size; unsigned long ns, i, pnum, pfn, nr_pages; - unsigned long start, end; struct huge_bootmem_page *m = NULL; void *map; @@ -808,14 +807,6 @@ void __init hugetlb_vmemmap_init_early(int nid) paddr = virt_to_phys(m); pfn = PHYS_PFN(paddr); map = pfn_to_page(pfn); - start = (unsigned long)map; - end = start + nr_pages * sizeof(struct page); - - if (vmemmap_populate_hvo(start, end, nid, - HUGETLB_VMEMMAP_RESERVE_SIZE) < 0) - continue; - - memmap_boot_pages_add(HUGETLB_VMEMMAP_RESERVE_SIZE / PAGE_SIZE); pnum = pfn_to_section_nr(pfn); ns = psize / section_size; @@ -850,28 +841,36 @@ void __init hugetlb_vmemmap_init_late(int nid) h = m->hstate; pfn = PHYS_PFN(phys); nr_pages = pages_per_huge_page(h); + map = pfn_to_page(pfn); + start = (unsigned long)map; + end = start + nr_pages * sizeof(struct page); if (!hugetlb_bootmem_page_zones_valid(nid, m)) { /* * Oops, the hugetlb page spans multiple zones. - * Remove it from the list, and undo HVO. + * Remove it from the list, and populate it normally. 
*/ list_del(&m->list); - map = pfn_to_page(pfn); - - start = (unsigned long)map; - end = start + nr_pages * sizeof(struct page); - - vmemmap_undo_hvo(start, end, nid, - HUGETLB_VMEMMAP_RESERVE_SIZE); - nr_mmap = end - start - HUGETLB_VMEMMAP_RESERVE_SIZE; + vmemmap_populate(start, end, nid, NULL); + nr_mmap = end - start; memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE)); memblock_phys_free(phys, huge_page_size(h)); continue; - } else + } + + if (vmemmap_populate_hvo(start, end, nid, + HUGETLB_VMEMMAP_RESERVE_SIZE) < 0) { + /* Fallback if HVO population fails */ + vmemmap_populate(start, end, nid, NULL); + nr_mmap = end - start; + } else { m->flags |= HUGE_BOOTMEM_ZONES_VALID; + nr_mmap = HUGETLB_VMEMMAP_RESERVE_SIZE; + } + + memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE)); } } #endif diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 37522d6cb398..032a81450838 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -302,59 +302,6 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end, return vmemmap_populate_range(start, end, node, altmap, -1, 0); } -/* - * Undo populate_hvo, and replace it with a normal base page mapping. - * Used in memory init in case a HVO mapping needs to be undone. - * - * This can happen when it is discovered that a memblock allocated - * hugetlb page spans multiple zones, which can only be verified - * after zones have been initialized. - * - * We know that: - * 1) The first @headsize / PAGE_SIZE vmemmap pages were individually - * allocated through memblock, and mapped. - * - * 2) The rest of the vmemmap pages are mirrors of the last head page. - */ -int __meminit vmemmap_undo_hvo(unsigned long addr, unsigned long end, - int node, unsigned long headsize) -{ - unsigned long maddr, pfn; - pte_t *pte; - int headpages; - - /* - * Should only be called early in boot, so nothing will - * be accessing these page structures. - */ - WARN_ON(!early_boot_irqs_disabled); - - headpages = headsize >> PAGE_SHIFT; - - /* - * Clear mirrored mappings for tail page structs. - */ - for (maddr = addr + headsize; maddr < end; maddr += PAGE_SIZE) { - pte = virt_to_kpte(maddr); - pte_clear(&init_mm, maddr, pte); - } - - /* - * Clear and free mappings for head page and first tail page - * structs. - */ - for (maddr = addr; headpages-- > 0; maddr += PAGE_SIZE) { - pte = virt_to_kpte(maddr); - pfn = pte_pfn(ptep_get(pte)); - pte_clear(&init_mm, maddr, pte); - memblock_phys_free(PFN_PHYS(pfn), PAGE_SIZE); - } - - flush_tlb_kernel_range(addr, end); - - return vmemmap_populate(addr, end, node, NULL); -} - /* * Write protect the mirrored tail page structs for HVO. This will be * called from the hugetlb code when gathering and initializing the -- cgit v1.2.3 From 622026e87c4019e609010811757e31193cc23847 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:50 +0000 Subject: mm/hugetlb: remove fake head pages HugeTLB Vmemmap Optimization (HVO) reduces memory usage by freeing most vmemmap pages for huge pages and remapping the freed range to a single page containing the struct page metadata. With the new mask-based compound_info encoding (for power-of-2 struct page sizes), all tail pages of the same order are now identical regardless of which compound page they belong to. This means the tail pages can be truly shared without fake heads. Allocate a single page of initialized tail struct pages per zone per order in the vmemmap_tails[] array in struct zone. 
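As a concrete illustration of the savings (assuming x86-64 with 4 KiB base pages and a 64-byte struct page, and that HUGETLB_VMEMMAP_RESERVE_SIZE is one page, as in current kernels): a 2 MiB huge page is order 9, so its vmemmap is 512 * 64 bytes = 32 KiB = 8 pages of struct pages. The first vmemmap page (head plus the used tail metadata) stays unique per huge page; the remaining 7 are mapped to the zone's single shared tail page for order 9. Per-hugepage vmemmap overhead thus drops from 32 KiB to 4 KiB (roughly 1.6% to 0.2% of the huge page size), the same saving HVO achieved before, now without fake heads.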
All huge pages of that order in the zone share this tail page, mapped read-only into their vmemmap. The head page remains unique per huge page. Redefine MAX_FOLIO_ORDER using ilog2(). The define has to produce a compile-time constant, as it is used to specify the size of the vmemmap_tails array. For some reason, the compiler is not able to resolve get_order() at compile time, but ilog2() works. Avoid using PUD_ORDER to define MAX_FOLIO_ORDER, as it adds a header dependency that generates a hard-to-break include loop. This eliminates fake heads while maintaining the same memory savings, and simplifies compound_head() by removing fake head detection. Link: https://lkml.kernel.org/r/20260227194302.274384-13-kas@kernel.org Signed-off-by: Kiryl Shutsemau Reviewed-by: Vlastimil Babka (SUSE) Acked-by: David Hildenbrand (Arm) Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: WANG Xuerui Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 ++- include/linux/mmzone.h | 19 +++++++++++-- mm/hugetlb_vmemmap.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++--- mm/internal.h | 9 +++++++ mm/sparse-vmemmap.c | 55 +++++++++++++++++++++++++++++++------ 5 files changed, 145 insertions(+), 14 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index d7e53532a109..19619e5efeba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4479,7 +4479,8 @@ int vmemmap_populate_hugepages(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); int vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); -int vmemmap_populate_hvo(unsigned long start, unsigned long end, int node, +int vmemmap_populate_hvo(unsigned long start, unsigned long end, + unsigned int order, struct zone *zone, unsigned long headsize); void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node, unsigned long headsize); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0bef68e41f19..5c3ae0348754 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -81,13 +81,17 @@ * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit. */ -#define MAX_FOLIO_ORDER get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G) +#ifdef CONFIG_64BIT +#define MAX_FOLIO_ORDER (ilog2(SZ_16G) - PAGE_SHIFT) +#else +#define MAX_FOLIO_ORDER (ilog2(SZ_1G) - PAGE_SHIFT) +#endif #else /* * Without hugetlb, gigantic folios that are bigger than a single PUD are * currently impossible. */ -#define MAX_FOLIO_ORDER PUD_ORDER +#define MAX_FOLIO_ORDER (PUD_SHIFT - PAGE_SHIFT) #endif #define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER) @@ -103,6 +107,14 @@ is_power_of_2(sizeof(struct page)) ? \ MAX_FOLIO_NR_PAGES * sizeof(struct page) : 0) +/* + * vmemmap optimization (like HVO) is only possible for page orders that fill + * two or more pages with struct pages. + */ +#define VMEMMAP_TAIL_MIN_ORDER (ilog2(2 * PAGE_SIZE / sizeof(struct page))) +#define __NR_VMEMMAP_TAILS (MAX_FOLIO_ORDER - VMEMMAP_TAIL_MIN_ORDER + 1) +#define NR_VMEMMAP_TAILS (__NR_VMEMMAP_TAILS > 0 ?
__NR_VMEMMAP_TAILS : 0) + enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, @@ -1113,6 +1125,9 @@ struct zone { /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP + struct page *vmemmap_tails[NR_VMEMMAP_TAILS]; +#endif } ____cacheline_internodealigned_in_smp; enum pgdat_flags { diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 3628fb5b2a28..92330f172eb7 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -19,6 +19,7 @@ #include #include "hugetlb_vmemmap.h" +#include "internal.h" /** * struct vmemmap_remap_walk - walk vmemmap page table @@ -505,6 +506,32 @@ static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio * return true; } +static struct page *vmemmap_get_tail(unsigned int order, struct zone *zone) +{ + const unsigned int idx = order - VMEMMAP_TAIL_MIN_ORDER; + struct page *tail, *p; + int node = zone_to_nid(zone); + + tail = READ_ONCE(zone->vmemmap_tails[idx]); + if (likely(tail)) + return tail; + + tail = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (!tail) + return NULL; + + p = page_to_virt(tail); + for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++) + init_compound_tail(p + i, NULL, order, zone); + + if (cmpxchg(&zone->vmemmap_tails[idx], NULL, tail)) { + __free_page(tail); + tail = READ_ONCE(zone->vmemmap_tails[idx]); + } + + return tail; +} + static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio, struct list_head *vmemmap_pages, @@ -520,6 +547,11 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, if (!vmemmap_should_optimize_folio(h, folio)) return ret; + nid = folio_nid(folio); + vmemmap_tail = vmemmap_get_tail(h->order, folio_zone(folio)); + if (!vmemmap_tail) + return -ENOMEM; + static_branch_inc(&hugetlb_optimize_vmemmap_key); if (flags & VMEMMAP_SYNCHRONIZE_RCU) @@ -537,7 +569,6 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, */ folio_set_hugetlb_vmemmap_optimized(folio); - nid = folio_nid(folio); vmemmap_head = alloc_pages_node(nid, GFP_KERNEL, 0); if (!vmemmap_head) { ret = -ENOMEM; @@ -548,7 +579,6 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, list_add(&vmemmap_head->lru, vmemmap_pages); memmap_pages_add(1); - vmemmap_tail = vmemmap_head; vmemmap_start = (unsigned long)&folio->page; vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); @@ -776,11 +806,26 @@ void __init hugetlb_vmemmap_init_early(int nid) } } +static struct zone *pfn_to_zone(unsigned nid, unsigned long pfn) +{ + struct zone *zone; + enum zone_type zone_type; + + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { + zone = &NODE_DATA(nid)->node_zones[zone_type]; + if (zone_spans_pfn(zone, pfn)) + return zone; + } + + return NULL; +} + void __init hugetlb_vmemmap_init_late(int nid) { struct huge_bootmem_page *m, *tm; unsigned long phys, nr_pages, start, end; unsigned long pfn, nr_mmap; + struct zone *zone = NULL; struct hstate *h; void *map; @@ -814,7 +859,12 @@ void __init hugetlb_vmemmap_init_late(int nid) continue; } - if (vmemmap_populate_hvo(start, end, nid, + if (!zone || !zone_spans_pfn(zone, pfn)) + zone = pfn_to_zone(nid, pfn); + if (WARN_ON_ONCE(!zone)) + continue; + + if (vmemmap_populate_hvo(start, end, huge_page_order(h), zone, HUGETLB_VMEMMAP_RESERVE_SIZE) < 0) { /* Fallback if HVO population fails */ vmemmap_populate(start, end, nid, NULL); @@ -842,10 +892,27 @@ static const struct ctl_table 
hugetlb_vmemmap_sysctls[] = { static int __init hugetlb_vmemmap_init(void) { const struct hstate *h; + struct zone *zone; /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */ BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES); + for_each_zone(zone) { + for (int i = 0; i < NR_VMEMMAP_TAILS; i++) { + struct page *tail, *p; + unsigned int order; + + tail = zone->vmemmap_tails[i]; + if (!tail) + continue; + + order = i + VMEMMAP_TAIL_MIN_ORDER; + p = page_to_virt(tail); + for (int j = 0; j < PAGE_SIZE / sizeof(struct page); j++) + init_compound_tail(p + j, NULL, order, zone); + } + } + for_each_hstate(h) { if (hugetlb_vmemmap_optimizable(h)) { register_sysctl_init("vm", hugetlb_vmemmap_sysctls); diff --git a/mm/internal.h b/mm/internal.h index 9cfbd8e41914..84167b0570c9 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -905,6 +905,15 @@ static inline void prep_compound_tail(struct page *tail, set_page_private(tail, 0); } +static inline void init_compound_tail(struct page *tail, + const struct page *head, unsigned int order, struct zone *zone) +{ + atomic_set(&tail->_mapcount, -1); + set_page_node(tail, zone_to_nid(zone)); + set_page_zone(tail, zone_idx(zone)); + prep_compound_tail(tail, head, order); +} + void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern bool free_pages_prepare(struct page *page, unsigned int order); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 032a81450838..842ed2f0bce6 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -325,16 +325,54 @@ void vmemmap_wrprotect_hvo(unsigned long addr, unsigned long end, } } -/* - * Populate vmemmap pages HVO-style. The first page contains the head - * page and needed tail pages, the other ones are mirrors of the first - * page. - */ +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP +static __meminit struct page *vmemmap_get_tail(unsigned int order, struct zone *zone) +{ + struct page *p, *tail; + unsigned int idx; + int node = zone_to_nid(zone); + + if (WARN_ON_ONCE(order < VMEMMAP_TAIL_MIN_ORDER)) + return NULL; + if (WARN_ON_ONCE(order > MAX_FOLIO_ORDER)) + return NULL; + + idx = order - VMEMMAP_TAIL_MIN_ORDER; + tail = zone->vmemmap_tails[idx]; + if (tail) + return tail; + + /* + * Only allocate the page, but do not initialize it. + * + * Any initialization done here will be overwritten by memmap_init(). + * + * hugetlb_vmemmap_init() will take care of initialization after + * memmap_init(). + */ + + p = vmemmap_alloc_block_zero(PAGE_SIZE, node); + if (!p) + return NULL; + + tail = virt_to_page(p); + zone->vmemmap_tails[idx] = tail; + + return tail; +} + int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end, - int node, unsigned long headsize) + unsigned int order, struct zone *zone, + unsigned long headsize) { - pte_t *pte; unsigned long maddr; + struct page *tail; + pte_t *pte; + int node = zone_to_nid(zone); + + tail = vmemmap_get_tail(order, zone); + if (!tail) + return -ENOMEM; for (maddr = addr; maddr < addr + headsize; maddr += PAGE_SIZE) { pte = vmemmap_populate_address(maddr, node, NULL, -1, 0); @@ -346,8 +384,9 @@ int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end, * Reuse the last page struct page mapped above for the rest. 
*/ return vmemmap_populate_range(maddr, end, node, NULL, - pte_pfn(ptep_get(pte)), 0); + page_to_pfn(tail), 0); } +#endif void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, unsigned long addr, unsigned long next) -- cgit v1.2.3 From 28266ac94a50e585c267a79d9ef5c2803d4dcd7a Mon Sep 17 00:00:00 2001 From: Gladyshev Ilya Date: Sun, 1 Mar 2026 13:19:39 +0000 Subject: mm: make ref_unless functions unless_zero only There are no users of (folio/page)_ref_add_unless(page, nr, u) with u != 0 [1] and all current users are "internal" for page refcounting API. This allows us to safely drop this parameter and reduce function semantics to the "unless zero" cases only. If needed, these functions for the u!=0 cases can be trivially reintroduced later using the same atomic_add_unless operations as before. [1]: The last user was dropped in v5.18 kernel, commit 27674ef6c73f ("mm: remove the extra ZONE_DEVICE struct page refcount"). There is no trace of discussion as to why this cleanup wasn't done earlier. Link: https://lkml.kernel.org/r/a0c89b49d38c671a0bdd35069d15ee13e08314d2.1772370066.git.gladyshev.ilya1@h-partners.com Co-developed-by: Gorbunov Ivan Signed-off-by: Gorbunov Ivan Signed-off-by: Gladyshev Ilya Acked-by: David Hildenbrand (Arm) Acked-by: Kiryl Shutsemau Acked-by: Zi Yan Reviewed-by: Lorenzo Stoakes Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- include/linux/page_ref.h | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 19619e5efeba..08b743aab92a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1506,7 +1506,7 @@ static inline int folio_put_testzero(struct folio *folio) */ static inline bool get_page_unless_zero(struct page *page) { - return page_ref_add_unless(page, 1, 0); + return page_ref_add_unless_zero(page, 1); } static inline struct folio *folio_get_nontail_page(struct page *page) diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 490d0ad6e56d..94d3f0e71c06 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -228,18 +228,18 @@ static inline int folio_ref_dec_return(struct folio *folio) return page_ref_dec_return(&folio->page); } -static inline bool page_ref_add_unless(struct page *page, int nr, int u) +static inline bool page_ref_add_unless_zero(struct page *page, int nr) { - bool ret = atomic_add_unless(&page->_refcount, nr, u); + bool ret = atomic_add_unless(&page->_refcount, nr, 0); if (page_ref_tracepoint_active(page_ref_mod_unless)) __page_ref_mod_unless(page, nr, ret); return ret; } -static inline bool folio_ref_add_unless(struct folio *folio, int nr, int u) +static inline bool folio_ref_add_unless_zero(struct folio *folio, int nr) { - return page_ref_add_unless(&folio->page, nr, u); + return page_ref_add_unless_zero(&folio->page, nr); } /** @@ -255,12 +255,12 @@ static inline bool folio_ref_add_unless(struct folio *folio, int nr, int u) */ static inline bool folio_try_get(struct folio *folio) { - return folio_ref_add_unless(folio, 1, 0); + return folio_ref_add_unless_zero(folio, 1); } static inline bool folio_ref_try_add(struct folio *folio, int count) { - return folio_ref_add_unless(folio, count, 0); + return folio_ref_add_unless_zero(folio, count); } static inline int page_ref_freeze(struct page *page, int count) -- cgit v1.2.3 From 
de008c9ba5684f14e83bcf86cd45fb0e4e6c4d82 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:33 +0100 Subject: mm/memory: remove "zap_details" parameter from zap_page_range_single() Nobody except memory.c should really set that parameter to non-NULL. So let's just drop it and make unmap_mapping_range_vma() use zap_page_range_single_batched() instead. [david@kernel.org: format on a single line] Link: https://lkml.kernel.org/r/8a27e9ac-2025-4724-a46d-0a7c90894ba7@kernel.org Link: https://lkml.kernel.org/r/20260227200848.114019-3-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: Puranjay Mohan Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- arch/s390/mm/gmap_helpers.c | 2 +- drivers/android/binder_alloc.c | 2 +- include/linux/mm.h | 5 ++--- kernel/bpf/arena.c | 3 +-- kernel/events/core.c | 2 +- mm/madvise.c | 3 +-- mm/memory.c | 16 ++++++++++------ net/ipv4/tcp.c | 5 ++--- rust/kernel/mm/virt.rs | 4 +--- 9 files changed, 20 insertions(+), 22 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c index dea83e3103e5..ae2d59a19313 100644 --- a/arch/s390/mm/gmap_helpers.c +++ b/arch/s390/mm/gmap_helpers.c @@ -89,7 +89,7 @@ void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned lo if (!vma) return; if (!is_vm_hugetlb_page(vma)) - zap_page_range_single(vma, vmaddr, min(end, vma->vm_end) - vmaddr, NULL); + zap_page_range_single(vma, vmaddr, min(end, vma->vm_end) - vmaddr); vmaddr = vma->vm_end; } } diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 241f16a9b63d..dd2046bd5cde 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -1185,7 +1185,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item, if (vma) { trace_binder_unmap_user_start(alloc, index); - zap_page_range_single(vma, page_addr, PAGE_SIZE, NULL); + zap_page_range_single(vma, page_addr, PAGE_SIZE); trace_binder_unmap_user_end(alloc, index); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 08b743aab92a..6512d70c5852 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2804,11 +2804,10 @@ struct page *vm_normal_page_pud(struct vm_area_struct *vma, unsigned long addr, void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, - unsigned long size, struct zap_details *details); + unsigned 
long size); static inline void zap_vma_pages(struct vm_area_struct *vma) { - zap_page_range_single(vma, vma->vm_start, - vma->vm_end - vma->vm_start, NULL); + zap_page_range_single(vma, vma->vm_start, vma->vm_end - vma->vm_start); } struct mmu_notifier_range; diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index f355cf1c1a16..19cca936eb9d 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -656,8 +656,7 @@ static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt) guard(mutex)(&arena->lock); /* iterate link list under lock */ list_for_each_entry(vml, &arena->vma_list, head) - zap_page_range_single(vml->vma, uaddr, - PAGE_SIZE * page_cnt, NULL); + zap_page_range_single(vml->vma, uaddr, PAGE_SIZE * page_cnt); } static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable) diff --git a/kernel/events/core.c b/kernel/events/core.c index 89b40e439717..2ecdaabf1b4d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7213,7 +7213,7 @@ static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma) #ifdef CONFIG_MMU /* Clear any partial mappings on error. */ if (err) - zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE, NULL); + zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE); #endif return err; diff --git a/mm/madvise.c b/mm/madvise.c index 1313166c5514..e4a2728593a8 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1193,8 +1193,7 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior) * OK some of the range have non-guard pages mapped, zap * them. This leaves existing guard pages in place. */ - zap_page_range_single(vma, range->start, - range->end - range->start, NULL); + zap_page_range_single(vma, range->start, range->end - range->start); } /* diff --git a/mm/memory.c b/mm/memory.c index f78ab3869f8d..fbd02d5bd520 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2203,17 +2203,16 @@ void zap_page_range_single_batched(struct mmu_gather *tlb, * @vma: vm_area_struct holding the applicable pages * @address: starting address of pages to zap * @size: number of bytes to zap - * @details: details of shared cache invalidation * * The range must fit into one VMA. */ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, - unsigned long size, struct zap_details *details) + unsigned long size) { struct mmu_gather tlb; tlb_gather_mmu(&tlb, vma->vm_mm); - zap_page_range_single_batched(&tlb, vma, address, size, details); + zap_page_range_single_batched(&tlb, vma, address, size, NULL); tlb_finish_mmu(&tlb); } @@ -2235,7 +2234,7 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, !(vma->vm_flags & VM_PFNMAP)) return; - zap_page_range_single(vma, address, size, NULL); + zap_page_range_single(vma, address, size); } EXPORT_SYMBOL_GPL(zap_vma_ptes); @@ -3003,7 +3002,7 @@ static int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long add * maintain page reference counts, and callers may free * pages due to the error. So zap it early. 
*/ - zap_page_range_single(vma, addr, size, NULL); + zap_page_range_single(vma, addr, size); return error; } @@ -4226,7 +4225,12 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, struct zap_details *details) { - zap_page_range_single(vma, start_addr, end_addr - start_addr, details); + struct mmu_gather tlb; + + tlb_gather_mmu(&tlb, vma->vm_mm); + zap_page_range_single_batched(&tlb, vma, start_addr, + end_addr - start_addr, details); + tlb_finish_mmu(&tlb); } static inline void unmap_mapping_range_tree(struct rb_root_cached *root, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 202a4e57a218..89c962672e51 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2105,7 +2105,7 @@ static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, maybe_zap_len = total_bytes_to_map - /* All bytes to map */ *length + /* Mapped or pending */ (pages_remaining * PAGE_SIZE); /* Failed map. */ - zap_page_range_single(vma, *address, maybe_zap_len, NULL); + zap_page_range_single(vma, *address, maybe_zap_len); err = 0; } @@ -2270,8 +2270,7 @@ static int tcp_zerocopy_receive(struct sock *sk, total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1); if (total_bytes_to_map) { if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT)) - zap_page_range_single(vma, address, total_bytes_to_map, - NULL); + zap_page_range_single(vma, address, total_bytes_to_map); zc->length = total_bytes_to_map; zc->recv_skip_hint = 0; } else { diff --git a/rust/kernel/mm/virt.rs b/rust/kernel/mm/virt.rs index da21d65ccd20..6bfd91cfa1f4 100644 --- a/rust/kernel/mm/virt.rs +++ b/rust/kernel/mm/virt.rs @@ -123,9 +123,7 @@ impl VmaRef { // SAFETY: By the type invariants, the caller has read access to this VMA, which is // sufficient for this method call. This method has no requirements on the vma flags. The // address range is checked to be within the vma. - unsafe { - bindings::zap_page_range_single(self.as_ptr(), address, size, core::ptr::null_mut()) - }; + unsafe { bindings::zap_page_range_single(self.as_ptr(), address, size) }; } /// If the [`VM_MIXEDMAP`] flag is set, returns a [`VmaMixedMap`] to this VMA, otherwise -- cgit v1.2.3 From 599a59e6037838ea7cd6264d7980ea63de244994 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:35 +0100 Subject: mm/memory: simplify calculation in unmap_mapping_range_tree() Let's simplify the calculation a bit further to make it easier to get, reusing vma_last_pgoff() which we move from interval_tree.c to mm.h. Link: https://lkml.kernel.org/r/20260227200848.114019-5-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. 
Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +++++ mm/interval_tree.c | 5 ----- mm/memory.c | 12 +++++------- 3 files changed, 10 insertions(+), 12 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6512d70c5852..771d021b7948 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3969,6 +3969,11 @@ static inline unsigned long vma_pages(const struct vm_area_struct *vma) return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } +static inline unsigned long vma_last_pgoff(struct vm_area_struct *vma) +{ + return vma->vm_pgoff + vma_pages(vma) - 1; +} + static inline unsigned long vma_desc_size(const struct vm_area_desc *desc) { return desc->end - desc->start; diff --git a/mm/interval_tree.c b/mm/interval_tree.c index 32e390c42c53..32bcfbfcf15f 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -15,11 +15,6 @@ static inline unsigned long vma_start_pgoff(struct vm_area_struct *v) return v->vm_pgoff; } -static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) -{ - return v->vm_pgoff + vma_pages(v) - 1; -} - INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb, unsigned long, shared.rb_subtree_last, vma_start_pgoff, vma_last_pgoff, /* empty */, vma_interval_tree) diff --git a/mm/memory.c b/mm/memory.c index f1c5d6b01a62..24b768885379 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4227,17 +4227,15 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root, struct zap_details *details) { struct vm_area_struct *vma; - pgoff_t vba, vea, zba, zea; unsigned long start, size; struct mmu_gather tlb; vma_interval_tree_foreach(vma, root, first_index, last_index) { - vba = vma->vm_pgoff; - vea = vba + vma_pages(vma) - 1; - zba = max(first_index, vba); - zea = min(last_index, vea); - start = ((zba - vba) << PAGE_SHIFT) + vma->vm_start; - size = (zea - zba + 1) << PAGE_SHIFT; + const pgoff_t start_idx = max(first_index, vma->vm_pgoff); + const pgoff_t end_idx = min(last_index, vma_last_pgoff(vma)) + 1; + + start = vma->vm_start + ((start_idx - vma->vm_pgoff) << PAGE_SHIFT); + size = (end_idx - start_idx) << PAGE_SHIFT; tlb_gather_mmu(&tlb, vma->vm_mm); zap_page_range_single_batched(&tlb, vma, start, size, details); -- cgit v1.2.3 From a97bc13d15f472c7f8ede1b38660fb55b6dab68d Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:40 +0100 Subject: mm/memory: convert details->even_cows into details->skip_cows The current semantics are confusing: simply because someone specifies an empty zap_detail struct suddenly makes should_zap_cows() behave differently. The default should be to also zap CoW'ed anonymous pages. Really only unmap_mapping_pages() and friends want to skip zapping of these anon folios. 
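The confusion is easy to demonstrate with a zero-initialized zap_details (an illustration, not code from this patch):

	struct zap_details details = { };

	/*
	 * Old semantics: .even_cows is false here, so passing &details
	 * silently skipped CoW'ed anonymous pages, while passing NULL
	 * zapped them. With the inverted .skip_cows, the zero-initialized
	 * struct and NULL behave the same: CoW'ed anon pages are zapped
	 * unless a caller such as unmap_mapping_pages() explicitly sets
	 * .skip_cows = true.
	 */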
So let's invert the meaning; turn the confusing "reclaim_pt" check that overrides other properties in should_zap_cows() into a safety check. Note that the only caller that sets reclaim_pt=true is madvise_dontneed_single_vma(), which wants to zap any pages. Link: https://lkml.kernel.org/r/20260227200848.114019-10-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- mm/madvise.c | 1 - mm/memory.c | 12 ++++++------ 3 files changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 771d021b7948..cb4f5fbccaf0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2767,7 +2767,7 @@ extern void pagefault_out_of_memory(void); */ struct zap_details { struct folio *single_folio; /* Locked folio to be unmapped */ - bool even_cows; /* Zap COWed private pages too? */ + bool skip_cows; /* Do not zap COWed private pages */ bool reclaim_pt; /* Need reclaim page tables? */ zap_flags_t zap_flags; /* Extra flags for zapping */ }; diff --git a/mm/madvise.c b/mm/madvise.c index e4a2728593a8..e86228682842 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -853,7 +853,6 @@ static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior) struct madvise_behavior_range *range = &madv_behavior->range; struct zap_details details = { .reclaim_pt = true, - .even_cows = true, }; zap_page_range_single_batched( diff --git a/mm/memory.c b/mm/memory.c index 7e5d52534ee9..c66b7b8b47eb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1554,11 +1554,13 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) static inline bool should_zap_cows(struct zap_details *details) { /* By default, zap all pages */ - if (!details || details->reclaim_pt) + if (!details) return true; + VM_WARN_ON_ONCE(details->skip_cows && details->reclaim_pt); + /* Or, we zap COWed pages only if the caller wants to */ - return details->even_cows; + return !details->skip_cows; } /* Decides whether we should zap this folio with the folio pointer specified */ @@ -2149,8 +2151,6 @@ void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap) struct mmu_notifier_range range; struct zap_details details = { .zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP, - /* Careful - we need to zap private pages too! 
*/ - .even_cows = true, }; vma = unmap->first; @@ -4282,7 +4282,7 @@ void unmap_mapping_folio(struct folio *folio) first_index = folio->index; last_index = folio_next_index(folio) - 1; - details.even_cows = false; + details.skip_cows = true; details.single_folio = folio; details.zap_flags = ZAP_FLAG_DROP_MARKER; @@ -4312,7 +4312,7 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t first_index = start; pgoff_t last_index = start + nr - 1; - details.even_cows = even_cows; + details.skip_cows = !even_cows; if (last_index < first_index) last_index = ULONG_MAX; -- cgit v1.2.3 From 5f10cbbddc2bd80a5944f1c783830f7ebf648ad2 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:41 +0100 Subject: mm/memory: use __zap_vma_range() in zap_vma_for_reaping() Let's call __zap_vma_range() instead of unmap_page_range() to prepare for further cleanups. To keep the existing behavior, whereby we do not call uprobe_munmap() which could block, add a new "reaping" member to zap_details and use it. Likely we should handle the possible blocking in uprobe_munmap() differently, but for now keep it unchanged. Link: https://lkml.kernel.org/r/20260227200848.114019-11-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + mm/memory.c | 13 +++++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index cb4f5fbccaf0..488a144c9161 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2769,6 +2769,7 @@ struct zap_details { struct folio *single_folio; /* Locked folio to be unmapped */ bool skip_cows; /* Do not zap COWed private pages */ bool reclaim_pt; /* Need reclaim page tables? */ + bool reaping; /* Reaping, do not block. */ zap_flags_t zap_flags; /* Extra flags for zapping */ }; diff --git a/mm/memory.c b/mm/memory.c index c66b7b8b47eb..d1fd3cdd677a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2079,14 +2079,18 @@ static void __zap_vma_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct zap_details *details) { + const bool reaping = details && details->reaping; + VM_WARN_ON_ONCE(start >= end || !range_in_vma(vma, start, end)); - if (vma->vm_file) + /* uprobe_munmap() might sleep, so skip it when reaping. 
*/ + if (vma->vm_file && !reaping) uprobe_munmap(vma, start, end); if (unlikely(is_vm_hugetlb_page(vma))) { zap_flags_t zap_flags = details ? details->zap_flags : 0; + VM_WARN_ON_ONCE(reaping); /* * vm_file will be NULL when we fail early while instantiating * a new mapping. In this case, no pages were mapped yet and @@ -2111,11 +2115,12 @@ static void __zap_vma_range(struct mmu_gather *tlb, struct vm_area_struct *vma, */ int zap_vma_for_reaping(struct vm_area_struct *vma) { + struct zap_details details = { + .reaping = true, + }; struct mmu_notifier_range range; struct mmu_gather tlb; - VM_WARN_ON_ONCE(is_vm_hugetlb_page(vma)); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, vma->vm_start, vma->vm_end); tlb_gather_mmu(&tlb, vma->vm_mm); @@ -2123,7 +2128,7 @@ int zap_vma_for_reaping(struct vm_area_struct *vma) tlb_finish_mmu(&tlb); return -EBUSY; } - unmap_page_range(&tlb, vma, range.start, range.end, NULL); + __zap_vma_range(&tlb, vma, range.start, range.end, &details); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); return 0; -- cgit v1.2.3 From 32bc7fe4a6f4d359b6de96cbc106d2cac695154e Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:43 +0100 Subject: mm: rename zap_vma_pages() to zap_vma() Let's rename it to an even simpler name. While at it, add some simplistic kernel doc. Link: https://lkml.kernel.org/r/20260227200848.114019-13-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- arch/powerpc/platforms/book3s/vas-api.c | 2 +- arch/powerpc/platforms/pseries/vas.c | 2 +- include/linux/mm.h | 6 +++++- lib/vdso/datastore.c | 2 +- mm/page-writeback.c | 2 +- 5 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/powerpc/platforms/book3s/vas-api.c b/arch/powerpc/platforms/book3s/vas-api.c index ea4ffa63f043..e96d79db69fe 100644 --- a/arch/powerpc/platforms/book3s/vas-api.c +++ b/arch/powerpc/platforms/book3s/vas-api.c @@ -414,7 +414,7 @@ static vm_fault_t vas_mmap_fault(struct vm_fault *vmf) /* * When the LPAR lost credits due to core removal or during * migration, invalidate the existing mapping for the current - * paste addresses and set windows in-active (zap_vma_pages in + * paste addresses and set windows in-active (zap_vma() in * reconfig_close_windows()). * New mapping will be done later after migration or new credits * available. 
So continue to receive faults if the user space diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c index ceb0a8788c0a..fa05f04364fe 100644 --- a/arch/powerpc/platforms/pseries/vas.c +++ b/arch/powerpc/platforms/pseries/vas.c @@ -807,7 +807,7 @@ static int reconfig_close_windows(struct vas_caps *vcap, int excess_creds, * is done before the original mmap() and after the ioctl. */ if (vma) - zap_vma_pages(vma); + zap_vma(vma); mutex_unlock(&task_ref->mmap_mutex); mmap_write_unlock(task_ref->mm); diff --git a/include/linux/mm.h b/include/linux/mm.h index 488a144c9161..60c13d40c65c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2806,7 +2806,11 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, unsigned long size); -static inline void zap_vma_pages(struct vm_area_struct *vma) +/** + * zap_vma - zap all page table entries in a vma + * @vma: The vma to zap. + */ +static inline void zap_vma(struct vm_area_struct *vma) { zap_page_range_single(vma, vma->vm_start, vma->vm_end - vma->vm_start); } diff --git a/lib/vdso/datastore.c b/lib/vdso/datastore.c index a565c30c71a0..222c143aebf7 100644 --- a/lib/vdso/datastore.c +++ b/lib/vdso/datastore.c @@ -121,7 +121,7 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) mmap_read_lock(mm); for_each_vma(vmi, vma) { if (vma_is_special_mapping(vma, &vdso_vvar_mapping)) - zap_vma_pages(vma); + zap_vma(vma); } mmap_read_unlock(mm); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 1009bb042ba4..8dc47b59ca18 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2645,7 +2645,7 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) * while this function is in progress, although it may have been truncated * before this function is called. Most callers have the folio locked. * A few have the folio blocked from truncation through other means (e.g. - * zap_vma_pages() has it mapped and is holding the page table lock). + * zap_vma() has it mapped and is holding the page table lock). * When called from mark_buffer_dirty(), the filesystem should hold a * reference to the buffer_head that is being marked dirty, which causes * try_to_free_buffers() to fail. -- cgit v1.2.3 From 0326440c3545c86b6501c7c636fcf018d6e87b8c Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:45 +0100 Subject: mm: rename zap_page_range_single() to zap_vma_range() Let's rename it to make it better match our new naming scheme. While at it, polish the kerneldoc. [akpm@linux-foundation.org: fix rustfmtcheck] Link: https://lkml.kernel.org/r/20260227200848.114019-15-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: Puranjay Mohan Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. 
Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- arch/s390/mm/gmap_helpers.c | 2 +- drivers/android/binder/page_range.rs | 4 ++-- drivers/android/binder_alloc.c | 2 +- include/linux/mm.h | 4 ++-- kernel/bpf/arena.c | 2 +- kernel/events/core.c | 2 +- mm/madvise.c | 4 ++-- mm/memory.c | 14 +++++++------- net/ipv4/tcp.c | 6 +++--- rust/kernel/mm/virt.rs | 4 ++-- 10 files changed, 22 insertions(+), 22 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c index ae2d59a19313..f8789ffcc05c 100644 --- a/arch/s390/mm/gmap_helpers.c +++ b/arch/s390/mm/gmap_helpers.c @@ -89,7 +89,7 @@ void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned lo if (!vma) return; if (!is_vm_hugetlb_page(vma)) - zap_page_range_single(vma, vmaddr, min(end, vma->vm_end) - vmaddr); + zap_vma_range(vma, vmaddr, min(end, vma->vm_end) - vmaddr); vmaddr = vma->vm_end; } } diff --git a/drivers/android/binder/page_range.rs b/drivers/android/binder/page_range.rs index 9dfc154e5dd4..8882fd18d9f3 100644 --- a/drivers/android/binder/page_range.rs +++ b/drivers/android/binder/page_range.rs @@ -130,7 +130,7 @@ pub(crate) struct ShrinkablePageRange { pid: Pid, /// The mm for the relevant process. mm: ARef, - /// Used to synchronize calls to `vm_insert_page` and `zap_page_range_single`. + /// Used to synchronize calls to `vm_insert_page` and `zap_vma_range`. #[pin] mm_lock: Mutex<()>, /// Spinlock protecting changes to pages. 
@@ -762,7 +762,7 @@ unsafe extern "C" fn rust_shrink_free_page( if let Some(unchecked_vma) = mmap_read.vma_lookup(vma_addr) { if let Some(vma) = check_vma(unchecked_vma, range_ptr) { let user_page_addr = vma_addr + (page_index << PAGE_SHIFT); - vma.zap_page_range_single(user_page_addr, PAGE_SIZE); + vma.zap_vma_range(user_page_addr, PAGE_SIZE); } } diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index dd2046bd5cde..e4488ad86a65 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -1185,7 +1185,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item, if (vma) { trace_binder_unmap_user_start(alloc, index); - zap_page_range_single(vma, page_addr, PAGE_SIZE); + zap_vma_range(vma, page_addr, PAGE_SIZE); trace_binder_unmap_user_end(alloc, index); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 60c13d40c65c..10a5b9ba4eeb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2804,7 +2804,7 @@ struct page *vm_normal_page_pud(struct vm_area_struct *vma, unsigned long addr, void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); -void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, +void zap_vma_range(struct vm_area_struct *vma, unsigned long address, unsigned long size); /** * zap_vma - zap all page table entries in a vma @@ -2812,7 +2812,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, */ static inline void zap_vma(struct vm_area_struct *vma) { - zap_page_range_single(vma, vma->vm_start, vma->vm_end - vma->vm_start); + zap_vma_range(vma, vma->vm_start, vma->vm_end - vma->vm_start); } struct mmu_notifier_range; diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 19cca936eb9d..08d008cc471e 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -656,7 +656,7 @@ static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt) guard(mutex)(&arena->lock); /* iterate link list under lock */ list_for_each_entry(vml, &arena->vma_list, head) - zap_page_range_single(vml->vma, uaddr, PAGE_SIZE * page_cnt); + zap_vma_range(vml->vma, uaddr, PAGE_SIZE * page_cnt); } static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable) diff --git a/kernel/events/core.c b/kernel/events/core.c index 2ecdaabf1b4d..d5b21077e829 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7213,7 +7213,7 @@ static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma) #ifdef CONFIG_MMU /* Clear any partial mappings on error. */ if (err) - zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE); + zap_vma_range(vma, vma->vm_start, nr_pages * PAGE_SIZE); #endif return err; diff --git a/mm/madvise.c b/mm/madvise.c index a50ec5f90e3e..afe0f01765c4 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -832,7 +832,7 @@ static int madvise_free_single_vma(struct madvise_behavior *madv_behavior) * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about * data it wants to keep. Be sure to free swap resources too. The - * zap_page_range_single call sets things up for shrink_active_list to actually + * zap_vma_range call sets things up for shrink_active_list to actually * free these pages later if no one else has touched them in the meantime, * although we could add these pages to a global reuse list for * shrink_active_list to pick up before reclaiming other pages. 
@@ -1191,7 +1191,7 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior) * OK some of the range have non-guard pages mapped, zap * them. This leaves existing guard pages in place. */ - zap_page_range_single(vma, range->start, range->end - range->start); + zap_vma_range(vma, range->start, range->end - range->start); } /* diff --git a/mm/memory.c b/mm/memory.c index 879858e466ef..dd80fbf6473a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2215,14 +2215,14 @@ void zap_vma_range_batched(struct mmu_gather *tlb, } /** - * zap_page_range_single - remove user pages in a given range - * @vma: vm_area_struct holding the applicable pages - * @address: starting address of pages to zap + * zap_vma_range - zap all page table entries in a vma range + * @vma: the vma covering the range to zap + * @address: starting address of the range to zap * @size: number of bytes to zap * - * The range must fit into one VMA. + * The provided address range must be fully contained within @vma. */ -void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, +void zap_vma_range(struct vm_area_struct *vma, unsigned long address, unsigned long size) { struct mmu_gather tlb; @@ -2250,7 +2250,7 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, !(vma->vm_flags & VM_PFNMAP)) return; - zap_page_range_single(vma, address, size); + zap_vma_range(vma, address, size); } EXPORT_SYMBOL_GPL(zap_vma_ptes); @@ -3018,7 +3018,7 @@ static int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long add * maintain page reference counts, and callers may free * pages due to the error. So zap it early. */ - zap_page_range_single(vma, addr, size); + zap_vma_range(vma, addr, size); return error; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 89c962672e51..9573ce9b0ac1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2105,7 +2105,7 @@ static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, maybe_zap_len = total_bytes_to_map - /* All bytes to map */ *length + /* Mapped or pending */ (pages_remaining * PAGE_SIZE); /* Failed map. */ - zap_page_range_single(vma, *address, maybe_zap_len); + zap_vma_range(vma, *address, maybe_zap_len); err = 0; } @@ -2113,7 +2113,7 @@ static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, unsigned long leftover_pages = pages_remaining; int bytes_mapped; - /* We called zap_page_range_single, try to reinsert. */ + /* We called zap_vma_range, try to reinsert. */ err = vm_insert_pages(vma, *address, pending_pages, &pages_remaining); @@ -2270,7 +2270,7 @@ static int tcp_zerocopy_receive(struct sock *sk, total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1); if (total_bytes_to_map) { if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT)) - zap_page_range_single(vma, address, total_bytes_to_map); + zap_vma_range(vma, address, total_bytes_to_map); zc->length = total_bytes_to_map; zc->recv_skip_hint = 0; } else { diff --git a/rust/kernel/mm/virt.rs b/rust/kernel/mm/virt.rs index 6bfd91cfa1f4..63eb730b0b05 100644 --- a/rust/kernel/mm/virt.rs +++ b/rust/kernel/mm/virt.rs @@ -113,7 +113,7 @@ impl VmaRef { /// kernel goes further in freeing unused page tables, but for the purposes of this operation /// we must only assume that the leaf level is cleared. 
#[inline] - pub fn zap_page_range_single(&self, address: usize, size: usize) { + pub fn zap_vma_range(&self, address: usize, size: usize) { let (end, did_overflow) = address.overflowing_add(size); if did_overflow || address < self.start() || self.end() < end { // TODO: call WARN_ONCE once Rust version of it is added @@ -123,7 +123,7 @@ impl VmaRef { // SAFETY: By the type invariants, the caller has read access to this VMA, which is // sufficient for this method call. This method has no requirements on the vma flags. The // address range is checked to be within the vma. - unsafe { bindings::zap_page_range_single(self.as_ptr(), address, size) }; + unsafe { bindings::zap_vma_range(self.as_ptr(), address, size) }; } /// If the [`VM_MIXEDMAP`] flag is set, returns a [`VmaMixedMap`] to this VMA, otherwise -- cgit v1.2.3 From 52a9e9cd181fab8b03cf4e982533224697669976 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:46 +0100 Subject: mm: rename zap_vma_ptes() to zap_special_vma_range() zap_vma_ptes() is the only zapping function we export to modules. It's essentially a wrapper around zap_vma_range(), however, with some safety checks: * That the passed range fits fully into the VMA * That it's only used for VM_PFNMAP We will add support for VM_MIXEDMAP next, so use the more-generic term "special vma", although "special" is a bit overloaded. Maybe we'll later just support any VM_SPECIAL flag. While at it, improve the kerneldoc. Link: https://lkml.kernel.org/r/20260227200848.114019-16-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Acked-by: Leon Romanovsky [drivers/infiniband] Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. 
Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- arch/x86/kernel/cpu/sgx/encl.c | 2 +- drivers/comedi/comedi_fops.c | 2 +- drivers/gpu/drm/i915/i915_mm.c | 4 ++-- drivers/infiniband/core/uverbs_main.c | 6 +++--- drivers/misc/sgi-gru/grumain.c | 2 +- include/linux/mm.h | 2 +- mm/memory.c | 16 +++++++--------- 7 files changed, 16 insertions(+), 18 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index ac60ebde5d9b..3f0222d10f6e 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -1220,7 +1220,7 @@ void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr) ret = sgx_encl_find(encl_mm->mm, addr, &vma); if (!ret && encl == vma->vm_private_data) - zap_vma_ptes(vma, addr, PAGE_SIZE); + zap_special_vma_range(vma, addr, PAGE_SIZE); mmap_read_unlock(encl_mm->mm); diff --git a/drivers/comedi/comedi_fops.c b/drivers/comedi/comedi_fops.c index 48a8a607a84c..b91e0b5ac394 100644 --- a/drivers/comedi/comedi_fops.c +++ b/drivers/comedi/comedi_fops.c @@ -2588,7 +2588,7 @@ static int comedi_mmap(struct file *file, struct vm_area_struct *vma) * remap_pfn_range() because we call remap_pfn_range() in a loop. */ if (retval) - zap_vma_ptes(vma, vma->vm_start, size); + zap_special_vma_range(vma, vma->vm_start, size); #endif if (retval == 0) { diff --git a/drivers/gpu/drm/i915/i915_mm.c b/drivers/gpu/drm/i915/i915_mm.c index c33bd3d83069..fd89e7c7d8d6 100644 --- a/drivers/gpu/drm/i915/i915_mm.c +++ b/drivers/gpu/drm/i915/i915_mm.c @@ -108,7 +108,7 @@ int remap_io_mapping(struct vm_area_struct *vma, err = apply_to_page_range(r.mm, addr, size, remap_pfn, &r); if (unlikely(err)) { - zap_vma_ptes(vma, addr, (r.pfn - pfn) << PAGE_SHIFT); + zap_special_vma_range(vma, addr, (r.pfn - pfn) << PAGE_SHIFT); return err; } @@ -156,7 +156,7 @@ int remap_io_sg(struct vm_area_struct *vma, err = apply_to_page_range(r.mm, addr, size, remap_sg, &r); if (unlikely(err)) { - zap_vma_ptes(vma, addr, r.pfn << PAGE_SHIFT); + zap_special_vma_range(vma, addr, r.pfn << PAGE_SHIFT); return err; } diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 7b68967a6301..f5837da47299 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -756,7 +756,7 @@ out_zap: * point, so zap it. */ vma->vm_private_data = NULL; - zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); + zap_special_vma_range(vma, vma->vm_start, vma->vm_end - vma->vm_start); } static void rdma_umap_close(struct vm_area_struct *vma) @@ -782,7 +782,7 @@ static void rdma_umap_close(struct vm_area_struct *vma) } /* - * Once the zap_vma_ptes has been called touches to the VMA will come here and + * Once the zap_special_vma_range has been called touches to the VMA will come here and * we return a dummy writable zero page for all the pfns. 
*/ static vm_fault_t rdma_umap_fault(struct vm_fault *vmf) @@ -878,7 +878,7 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) continue; list_del_init(&priv->list); - zap_vma_ptes(vma, vma->vm_start, + zap_special_vma_range(vma, vma->vm_start, vma->vm_end - vma->vm_start); if (priv->entry) { diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c index 8d749f345246..278b76cbd281 100644 --- a/drivers/misc/sgi-gru/grumain.c +++ b/drivers/misc/sgi-gru/grumain.c @@ -542,7 +542,7 @@ void gru_unload_context(struct gru_thread_state *gts, int savestate) int ctxnum = gts->ts_ctxnum; if (!is_kernel_context(gts)) - zap_vma_ptes(gts->ts_vma, UGRUADDR(gts), GRU_GSEG_PAGESIZE); + zap_special_vma_range(gts->ts_vma, UGRUADDR(gts), GRU_GSEG_PAGESIZE); cch = get_cch(gru->gs_gru_base_vaddr, ctxnum); gru_dbg(grudev, "gts %p, cbrmap 0x%lx, dsrmap 0x%lx\n", diff --git a/include/linux/mm.h b/include/linux/mm.h index 10a5b9ba4eeb..c516d5177211 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2802,7 +2802,7 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, struct page *vm_normal_page_pud(struct vm_area_struct *vma, unsigned long addr, pud_t pud); -void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, +void zap_special_vma_range(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_vma_range(struct vm_area_struct *vma, unsigned long address, unsigned long size); diff --git a/mm/memory.c b/mm/memory.c index dd80fbf6473a..3dc4664c9af7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2233,17 +2233,15 @@ void zap_vma_range(struct vm_area_struct *vma, unsigned long address, } /** - * zap_vma_ptes - remove ptes mapping the vma - * @vma: vm_area_struct holding ptes to be zapped - * @address: starting address of pages to zap + * zap_special_vma_range - zap all page table entries in a special vma range + * @vma: the vma covering the range to zap + * @address: starting address of the range to zap * @size: number of bytes to zap * - * This function only unmaps ptes assigned to VM_PFNMAP vmas. - * - * The entire address range must be fully contained within the vma. - * + * This function does nothing when the provided address range is not fully + * contained in @vma, or when the @vma is not VM_PFNMAP. */ -void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, +void zap_special_vma_range(struct vm_area_struct *vma, unsigned long address, unsigned long size) { if (!range_in_vma(vma, address, address + size) || @@ -2252,7 +2250,7 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, zap_vma_range(vma, address, size); } -EXPORT_SYMBOL_GPL(zap_vma_ptes); +EXPORT_SYMBOL_GPL(zap_special_vma_range); static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr) { -- cgit v1.2.3 From e650bb30ca532901da6def04c7d1de72ae59ea4e Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Thu, 5 Mar 2026 10:50:14 +0000 Subject: mm: rename VMA flag helpers to be more readable Patch series "mm: vma flag tweaks". The ongoing work around introducing non-system word VMA flags has introduced a number of helper functions and macros to make life easier when working with these flags and to make conversions from the legacy use of VM_xxx flags more straightforward. This series improves these to reduce confusion as to what they do and to improve consistency and readability. 
Firstly the series renames vma_flags_test() to vma_flags_test_any() to make it abundantly clear that this function tests whether any of the flags are set (as opposed to vma_flags_test_all()). It then renames vma_desc_test_flags() to vma_desc_test_any() for the same reason. Note that we drop the 'flags' suffix here, as vma_desc_test_any_flags() would be cumbersome and 'test' implies a flag test. Similarly, we rename vma_test_all_flags() to vma_test_all() for consistency. Next, we have a couple of instances (erofs, zonefs) where we are now testing for vma_desc_test_any(desc, VMA_SHARED_BIT) && vma_desc_test_any(desc, VMA_MAYWRITE_BIT). This is silly, so this series introduces vma_desc_test_all() so these callers can instead invoke vma_desc_test_all(desc, VMA_SHARED_BIT, VMA_MAYWRITE_BIT). We then observe that quite a few instances of vma_flags_test_any() and vma_desc_test_any() are in fact only testing against a single flag. Using the _any() variant here is just confusing - 'any' of a single item reads strangely and is liable to cause confusion. So in these instances the series reintroduces vma_flags_test() and vma_desc_test() as helpers which test against a single flag. The fact that vma_flags_t is a struct and that vma_flag_t utilises sparse to avoid confusion with vm_flags_t makes it impossible for a user to misuse these helpers without it getting flagged somewhere. The series also updates __mk_vma_flags() and functions invoked by it to explicitly mark them always inline to match expectation and to be consistent with other VMA flag helpers. It also renames vma_flag_set() to vma_flags_set_flag() (a function only used by __mk_vma_flags()) to be consistent with other VMA flag helpers. Finally it updates the VMA tests for each of these changes, and introduces explicit tests for vma_flags_test() and vma_desc_test() to assert that they behave as expected. This patch (of 6): On reflection, it's confusing to have vma_flags_test() and vma_desc_test_flags() test whether any comma-separated VMA flag bit is set, while also having vma_flags_test_all() and vma_test_all_flags() separately test whether all flags are set. Firstly, rename vma_flags_test() to vma_flags_test_any() to eliminate this confusion. Secondly, since the VMA descriptor flag functions are becoming rather cumbersome, prefer vma_desc_test*() to vma_desc_test_flags*(), and also rename vma_desc_test_flags() to vma_desc_test_any(). Finally, rename vma_test_all_flags() to vma_test_all() to keep the VMA-specific helper consistent with the VMA descriptor naming convention and to help avoid confusion vs. vma_flags_test_all(). While we're here, also update whitespace to be consistent in helper functions. 
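To illustrate the renamed semantics, a minimal sketch (hypothetical callers, not taken from this series):

	/*
	 * Hypothetical callers, for illustration only: vma_flags_test_any()
	 * is true if at least one of the listed bits is set, while
	 * vma_flags_test_all() is true only if every listed bit is set.
	 */
	static bool may_be_readable(const vma_flags_t *flags)
	{
		return vma_flags_test_any(flags, VMA_READ_BIT, VMA_MAYREAD_BIT);
	}

	static bool is_shared_and_maywrite(const vma_flags_t *flags)
	{
		return vma_flags_test_all(flags, VMA_SHARED_BIT, VMA_MAYWRITE_BIT);
	}
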
Link: https://lkml.kernel.org/r/cover.1772704455.git.ljs@kernel.org Link: https://lkml.kernel.org/r/0f9cb3c511c478344fac0b3b3b0300bb95be95e9.1772704455.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Suggested-by: Pedro Falcato Acked-by: David Hildenbrand (Arm) Reviewed-by: Pedro Falcato Cc: Arnd Bergmann Cc: Babu Moger Cc: Baolin Wang Cc: Chao Yu Cc: Chatre, Reinette Cc: Chunhai Guo Cc: Damien Le Maol Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Hongbo Li Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jeffle Xu Cc: Johannes Thumshirn Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naohiro Aota Cc: Oscar Salvador Cc: Sandeep Dhavale Cc: Suren Baghdasaryan Cc: Vishal Verma Cc: Vlastimil Babka Cc: Yue Hu Signed-off-by: Andrew Morton --- drivers/char/mem.c | 2 +- drivers/dax/device.c | 2 +- fs/erofs/data.c | 4 ++-- fs/hugetlbfs/inode.c | 2 +- fs/ntfs3/file.c | 2 +- fs/resctrl/pseudo_lock.c | 2 +- fs/zonefs/file.c | 4 ++-- include/linux/dax.h | 4 ++-- include/linux/hugetlb_inline.h | 2 +- include/linux/mm.h | 48 +++++++++++++++++++++-------------------- mm/hugetlb.c | 14 ++++++------ mm/memory.c | 2 +- mm/secretmem.c | 2 +- mm/shmem.c | 4 ++-- tools/testing/vma/include/dup.h | 20 ++++++++--------- tools/testing/vma/tests/vma.c | 28 ++++++++++++------------ 16 files changed, 72 insertions(+), 70 deletions(-) (limited to 'include/linux/mm.h') diff --git a/drivers/char/mem.c b/drivers/char/mem.c index cca4529431f8..5118787d0954 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -520,7 +520,7 @@ static int mmap_zero_prepare(struct vm_area_desc *desc) #ifndef CONFIG_MMU return -ENOSYS; #endif - if (vma_desc_test_flags(desc, VMA_SHARED_BIT)) + if (vma_desc_test_any(desc, VMA_SHARED_BIT)) return shmem_zero_setup_desc(desc); desc->action.success_hook = mmap_zero_private_success; diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 528e81240c4d..381021c2e031 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -24,7 +24,7 @@ static int __check_vma(struct dev_dax *dev_dax, vma_flags_t flags, return -ENXIO; /* prevent private mappings from being established */ - if (!vma_flags_test(&flags, VMA_MAYSHARE_BIT)) { + if (!vma_flags_test_any(&flags, VMA_MAYSHARE_BIT)) { dev_info_ratelimited(dev, "%s: %s: fail, attempted private mapping\n", current->comm, func); diff --git a/fs/erofs/data.c b/fs/erofs/data.c index f79ee80627d9..6774d9b5ee82 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -473,8 +473,8 @@ static int erofs_file_mmap_prepare(struct vm_area_desc *desc) if (!IS_DAX(file_inode(desc->file))) return generic_file_readonly_mmap_prepare(desc); - if (vma_desc_test_flags(desc, VMA_SHARED_BIT) && - vma_desc_test_flags(desc, VMA_MAYWRITE_BIT)) + if (vma_desc_test_any(desc, VMA_SHARED_BIT) && + vma_desc_test_any(desc, VMA_MAYWRITE_BIT)) return -EINVAL; desc->vm_ops = &erofs_dax_vm_ops; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 2ec3e4231252..079ffaaf1f6c 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -164,7 +164,7 @@ static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) goto out; ret = 0; - if (vma_desc_test_flags(desc, VMA_WRITE_BIT) && inode->i_size < len) + if (vma_desc_test_any(desc, VMA_WRITE_BIT) && inode->i_size < len) i_size_write(inode, len); out: inode_unlock(inode); diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 
7eecf1e01f74..c5e2181f9f02 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -276,7 +276,7 @@ static int ntfs_file_mmap_prepare(struct vm_area_desc *desc) struct file *file = desc->file; struct inode *inode = file_inode(file); struct ntfs_inode *ni = ntfs_i(inode); - const bool rw = vma_desc_test_flags(desc, VMA_WRITE_BIT); + const bool rw = vma_desc_test_any(desc, VMA_WRITE_BIT); int err; /* Avoid any operation if inode is bad. */ diff --git a/fs/resctrl/pseudo_lock.c b/fs/resctrl/pseudo_lock.c index fa3687d69ebd..79a006c6f26c 100644 --- a/fs/resctrl/pseudo_lock.c +++ b/fs/resctrl/pseudo_lock.c @@ -1044,7 +1044,7 @@ static int pseudo_lock_dev_mmap_prepare(struct vm_area_desc *desc) * Ensure changes are carried directly to the memory being mapped, * do not allow copy-on-write mapping. */ - if (!vma_desc_test_flags(desc, VMA_SHARED_BIT)) { + if (!vma_desc_test_any(desc, VMA_SHARED_BIT)) { mutex_unlock(&rdtgroup_mutex); return -EINVAL; } diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 8a7161fc49e5..9f9273ecf71a 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -333,8 +333,8 @@ static int zonefs_file_mmap_prepare(struct vm_area_desc *desc) * ordering between msync() and page cache writeback. */ if (zonefs_inode_is_seq(file_inode(file)) && - vma_desc_test_flags(desc, VMA_SHARED_BIT) && - vma_desc_test_flags(desc, VMA_MAYWRITE_BIT)) + vma_desc_test_any(desc, VMA_SHARED_BIT) && + vma_desc_test_any(desc, VMA_MAYWRITE_BIT)) return -EINVAL; file_accessed(file); diff --git a/include/linux/dax.h b/include/linux/dax.h index bf103f317cac..535019001577 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -69,7 +69,7 @@ static inline bool daxdev_mapping_supported(const struct vm_area_desc *desc, const struct inode *inode, struct dax_device *dax_dev) { - if (!vma_desc_test_flags(desc, VMA_SYNC_BIT)) + if (!vma_desc_test_any(desc, VMA_SYNC_BIT)) return true; if (!IS_DAX(inode)) return false; @@ -115,7 +115,7 @@ static inline bool daxdev_mapping_supported(const struct vm_area_desc *desc, const struct inode *inode, struct dax_device *dax_dev) { - return !vma_desc_test_flags(desc, VMA_SYNC_BIT); + return !vma_desc_test_any(desc, VMA_SYNC_BIT); } static inline size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h index 593f5d4e108b..84afc3c3e2e4 100644 --- a/include/linux/hugetlb_inline.h +++ b/include/linux/hugetlb_inline.h @@ -13,7 +13,7 @@ static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) static inline bool is_vma_hugetlb_flags(const vma_flags_t *flags) { - return vma_flags_test(flags, VMA_HUGETLB_BIT); + return vma_flags_test_any(flags, VMA_HUGETLB_BIT); } #else diff --git a/include/linux/mm.h b/include/linux/mm.h index c516d5177211..ee7671d6c5eb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1062,7 +1062,7 @@ static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits) (const vma_flag_t []){__VA_ARGS__}) /* Test each of to_test flags in flags, non-atomically. */ -static __always_inline bool vma_flags_test_mask(const vma_flags_t *flags, +static __always_inline bool vma_flags_test_any_mask(const vma_flags_t *flags, vma_flags_t to_test) { const unsigned long *bitmap = flags->__vma_flags; @@ -1074,10 +1074,10 @@ static __always_inline bool vma_flags_test_mask(const vma_flags_t *flags, /* * Test whether any specified VMA flag is set, e.g.: * - * if (vma_flags_test(flags, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... 
} + * if (vma_flags_test_any(flags, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... } */ -#define vma_flags_test(flags, ...) \ - vma_flags_test_mask(flags, mk_vma_flags(__VA_ARGS__)) +#define vma_flags_test_any(flags, ...) \ + vma_flags_test_any_mask(flags, mk_vma_flags(__VA_ARGS__)) /* Test that ALL of the to_test flags are set, non-atomically. */ static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags, @@ -1098,7 +1098,8 @@ static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags, vma_flags_test_all_mask(flags, mk_vma_flags(__VA_ARGS__)) /* Set each of the to_set flags in flags, non-atomically. */ -static __always_inline void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t to_set) +static __always_inline void vma_flags_set_mask(vma_flags_t *flags, + vma_flags_t to_set) { unsigned long *bitmap = flags->__vma_flags; const unsigned long *bitmap_to_set = to_set.__vma_flags; @@ -1115,7 +1116,8 @@ static __always_inline void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t t vma_flags_set_mask(flags, mk_vma_flags(__VA_ARGS__)) /* Clear all of the to-clear flags in flags, non-atomically. */ -static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t to_clear) +static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, + vma_flags_t to_clear) { unsigned long *bitmap = flags->__vma_flags; const unsigned long *bitmap_to_clear = to_clear.__vma_flags; @@ -1137,8 +1139,8 @@ static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t * Note: appropriate locks must be held, this function does not acquire them for * you. */ -static inline bool vma_test_all_flags_mask(const struct vm_area_struct *vma, - vma_flags_t flags) +static inline bool vma_test_all_mask(const struct vm_area_struct *vma, + vma_flags_t flags) { return vma_flags_test_all_mask(&vma->flags, flags); } @@ -1146,10 +1148,10 @@ static inline bool vma_test_all_flags_mask(const struct vm_area_struct *vma, /* * Helper macro for checking that ALL specified flags are set in a VMA, e.g.: * - * if (vma_test_all_flags(vma, VMA_READ_BIT, VMA_MAYREAD_BIT) { ... } + * if (vma_test_all(vma, VMA_READ_BIT, VMA_MAYREAD_BIT) { ... } */ -#define vma_test_all_flags(vma, ...) \ - vma_test_all_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) +#define vma_test_all(vma, ...) \ + vma_test_all_mask(vma, mk_vma_flags(__VA_ARGS__)) /* * Helper to set all VMA flags in a VMA. @@ -1158,7 +1160,7 @@ static inline bool vma_test_all_flags_mask(const struct vm_area_struct *vma, * you. */ static inline void vma_set_flags_mask(struct vm_area_struct *vma, - vma_flags_t flags) + vma_flags_t flags) { vma_flags_set_mask(&vma->flags, flags); } @@ -1176,25 +1178,25 @@ static inline void vma_set_flags_mask(struct vm_area_struct *vma, vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) /* Helper to test all VMA flags in a VMA descriptor. */ -static inline bool vma_desc_test_flags_mask(const struct vm_area_desc *desc, - vma_flags_t flags) +static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, + vma_flags_t flags) { - return vma_flags_test_mask(&desc->vma_flags, flags); + return vma_flags_test_any_mask(&desc->vma_flags, flags); } /* * Helper macro for testing VMA flags for an input pointer to a struct * vm_area_desc object describing a proposed VMA, e.g.: * - * if (vma_desc_test_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, + * if (vma_desc_test_any(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT)) { ... } */ -#define vma_desc_test_flags(desc, ...) 
\ - vma_desc_test_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) +#define vma_desc_test_any(desc, ...) \ + vma_desc_test_any_mask(desc, mk_vma_flags(__VA_ARGS__)) /* Helper to set all VMA flags in a VMA descriptor. */ static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, - vma_flags_t flags) + vma_flags_t flags) { vma_flags_set_mask(&desc->vma_flags, flags); } @@ -1211,7 +1213,7 @@ static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, /* Helper to clear all VMA flags in a VMA descriptor. */ static inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, - vma_flags_t flags) + vma_flags_t flags) { vma_flags_clear_mask(&desc->vma_flags, flags); } @@ -1936,8 +1938,8 @@ static inline bool vma_desc_is_cow_mapping(struct vm_area_desc *desc) { const vma_flags_t *flags = &desc->vma_flags; - return vma_flags_test(flags, VMA_MAYWRITE_BIT) && - !vma_flags_test(flags, VMA_SHARED_BIT); + return vma_flags_test_any(flags, VMA_MAYWRITE_BIT) && + !vma_flags_test_any(flags, VMA_SHARED_BIT); } #ifndef CONFIG_MMU @@ -1956,7 +1958,7 @@ static inline bool is_nommu_shared_mapping(vm_flags_t flags) static inline bool is_nommu_shared_vma_flags(const vma_flags_t *flags) { - return vma_flags_test(flags, VMA_MAYSHARE_BIT, VMA_MAYOVERLAY_BIT); + return vma_flags_test_any(flags, VMA_MAYSHARE_BIT, VMA_MAYOVERLAY_BIT); } #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1d41fa3dd43e..fbbe74f94426 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1194,7 +1194,7 @@ static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map) { VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); - VM_WARN_ON_ONCE(vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)); + VM_WARN_ON_ONCE(vma_desc_test_any(desc, VMA_MAYSHARE_BIT)); desc->private_data = map; } @@ -1202,7 +1202,7 @@ static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *ma static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags) { VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); - VM_WARN_ON_ONCE(vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)); + VM_WARN_ON_ONCE(vma_desc_test_any(desc, VMA_MAYSHARE_BIT)); desc->private_data = (void *)((unsigned long)desc->private_data | flags); } @@ -6593,7 +6593,7 @@ long hugetlb_reserve_pages(struct inode *inode, * attempt will be made for VM_NORESERVE to allocate a page * without using reserves */ - if (vma_flags_test(&vma_flags, VMA_NORESERVE_BIT)) + if (vma_flags_test_any(&vma_flags, VMA_NORESERVE_BIT)) return 0; /* @@ -6602,7 +6602,7 @@ long hugetlb_reserve_pages(struct inode *inode, * to reserve the full area even if read-only as mprotect() may be * called to make the mapping read-write. Assume !desc is a shm mapping */ - if (!desc || vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)) { + if (!desc || vma_desc_test_any(desc, VMA_MAYSHARE_BIT)) { /* * resv_map can not be NULL as hugetlb_reserve_pages is only * called for inodes for which resv_maps were created (see @@ -6636,7 +6636,7 @@ long hugetlb_reserve_pages(struct inode *inode, if (err < 0) goto out_err; - if (desc && !vma_desc_test_flags(desc, VMA_MAYSHARE_BIT) && h_cg) { + if (desc && !vma_desc_test_any(desc, VMA_MAYSHARE_BIT) && h_cg) { /* For private mappings, the hugetlb_cgroup uncharge info hangs * of the resv_map. */ @@ -6673,7 +6673,7 @@ long hugetlb_reserve_pages(struct inode *inode, * consumed reservations are stored in the map. 
Hence, nothing * else has to be done for private mappings here */ - if (!desc || vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)) { + if (!desc || vma_desc_test_any(desc, VMA_MAYSHARE_BIT)) { add = region_add(resv_map, from, to, regions_needed, h, h_cg); if (unlikely(add < 0)) { @@ -6737,7 +6737,7 @@ out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); out_err: - if (!desc || vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)) + if (!desc || vma_desc_test_any(desc, VMA_MAYSHARE_BIT)) /* Only call region_abort if the region_chg succeeded but the * region_add failed or didn't run. */ diff --git a/mm/memory.c b/mm/memory.c index b1c062bf5fc1..f21c804b50bf 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2982,7 +2982,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad if (WARN_ON_ONCE(!PAGE_ALIGNED(addr))) return -EINVAL; - VM_WARN_ON_ONCE(!vma_test_all_flags_mask(vma, VMA_REMAP_FLAGS)); + VM_WARN_ON_ONCE(!vma_test_all_mask(vma, VMA_REMAP_FLAGS)); BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; diff --git a/mm/secretmem.c b/mm/secretmem.c index 11a779c812a7..5f57ac4720d3 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -122,7 +122,7 @@ static int secretmem_mmap_prepare(struct vm_area_desc *desc) { const unsigned long len = vma_desc_size(desc); - if (!vma_desc_test_flags(desc, VMA_SHARED_BIT, VMA_MAYSHARE_BIT)) + if (!vma_desc_test_any(desc, VMA_SHARED_BIT, VMA_MAYSHARE_BIT)) return -EINVAL; vma_desc_set_flags(desc, VMA_LOCKED_BIT, VMA_DONTDUMP_BIT); diff --git a/mm/shmem.c b/mm/shmem.c index 5e7dcf5bc5d3..965a8908200b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3086,7 +3086,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, spin_lock_init(&info->lock); atomic_set(&info->stop_eviction, 0); info->seals = F_SEAL_SEAL; - info->flags = vma_flags_test(&flags, VMA_NORESERVE_BIT) + info->flags = vma_flags_test_any(&flags, VMA_NORESERVE_BIT) ? SHMEM_F_NORESERVE : 0; info->i_crtime = inode_get_mtime(inode); info->fsflags = (dir == NULL) ? 0 : @@ -5827,7 +5827,7 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, unsigned int i_flags) { const unsigned long shmem_flags = - vma_flags_test(&flags, VMA_NORESERVE_BIT) ? SHMEM_F_NORESERVE : 0; + vma_flags_test_any(&flags, VMA_NORESERVE_BIT) ? SHMEM_F_NORESERVE : 0; struct inode *inode; struct file *res; diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 3078ff1487d3..c46b523e428d 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -843,7 +843,7 @@ static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits); #define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ (const vma_flag_t []){__VA_ARGS__}) -static __always_inline bool vma_flags_test_mask(const vma_flags_t *flags, +static __always_inline bool vma_flags_test_any_mask(const vma_flags_t *flags, vma_flags_t to_test) { const unsigned long *bitmap = flags->__vma_flags; @@ -852,8 +852,8 @@ static __always_inline bool vma_flags_test_mask(const vma_flags_t *flags, return bitmap_intersects(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS); } -#define vma_flags_test(flags, ...) \ - vma_flags_test_mask(flags, mk_vma_flags(__VA_ARGS__)) +#define vma_flags_test_any(flags, ...) 
\ + vma_flags_test_any_mask(flags, mk_vma_flags(__VA_ARGS__)) static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags, vma_flags_t to_test) @@ -889,14 +889,14 @@ static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t #define vma_flags_clear(flags, ...) \ vma_flags_clear_mask(flags, mk_vma_flags(__VA_ARGS__)) -static inline bool vma_test_all_flags_mask(const struct vm_area_struct *vma, +static inline bool vma_test_all_mask(const struct vm_area_struct *vma, vma_flags_t flags) { return vma_flags_test_all_mask(&vma->flags, flags); } -#define vma_test_all_flags(vma, ...) \ - vma_test_all_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) +#define vma_test_all(vma, ...) \ + vma_test_all_mask(vma, mk_vma_flags(__VA_ARGS__)) static inline bool is_shared_maywrite_vm_flags(vm_flags_t vm_flags) { @@ -913,14 +913,14 @@ static inline void vma_set_flags_mask(struct vm_area_struct *vma, #define vma_set_flags(vma, ...) \ vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) -static inline bool vma_desc_test_flags_mask(const struct vm_area_desc *desc, +static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, vma_flags_t flags) { - return vma_flags_test_mask(&desc->vma_flags, flags); + return vma_flags_test_any_mask(&desc->vma_flags, flags); } -#define vma_desc_test_flags(desc, ...) \ - vma_desc_test_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) +#define vma_desc_test_any(desc, ...) \ + vma_desc_test_any_mask(desc, mk_vma_flags(__VA_ARGS__)) static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, vma_flags_t flags) diff --git a/tools/testing/vma/tests/vma.c b/tools/testing/vma/tests/vma.c index c54ffc954f11..f031e6dfb474 100644 --- a/tools/testing/vma/tests/vma.c +++ b/tools/testing/vma/tests/vma.c @@ -159,8 +159,8 @@ static bool test_vma_flags_word(void) return true; } -/* Ensure that vma_flags_test() and friends works correctly. */ -static bool test_vma_flags_test(void) +/* Ensure that vma_flags_test_any() and friends works correctly. */ +static bool test_vma_flags_test_any(void) { const vma_flags_t flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, 64, 65); @@ -171,16 +171,16 @@ static bool test_vma_flags_test(void) desc.vma_flags = flags; #define do_test(...) \ - ASSERT_TRUE(vma_flags_test(&flags, __VA_ARGS__)); \ - ASSERT_TRUE(vma_desc_test_flags(&desc, __VA_ARGS__)) + ASSERT_TRUE(vma_flags_test_any(&flags, __VA_ARGS__)); \ + ASSERT_TRUE(vma_desc_test_any(&desc, __VA_ARGS__)) #define do_test_all_true(...) \ ASSERT_TRUE(vma_flags_test_all(&flags, __VA_ARGS__)); \ - ASSERT_TRUE(vma_test_all_flags(&vma, __VA_ARGS__)) + ASSERT_TRUE(vma_test_all(&vma, __VA_ARGS__)) #define do_test_all_false(...) \ ASSERT_FALSE(vma_flags_test_all(&flags, __VA_ARGS__)); \ - ASSERT_FALSE(vma_test_all_flags(&vma, __VA_ARGS__)) + ASSERT_FALSE(vma_test_all(&vma, __VA_ARGS__)) /* * Testing for some flags that are present, some that are not - should @@ -200,7 +200,7 @@ static bool test_vma_flags_test(void) * Check _mask variant. We don't need to test extensively as macro * helper is the equivalent. */ - ASSERT_TRUE(vma_flags_test_mask(&flags, flags)); + ASSERT_TRUE(vma_flags_test_any_mask(&flags, flags)); ASSERT_TRUE(vma_flags_test_all_mask(&flags, flags)); /* Single bits. 
*/ @@ -268,9 +268,9 @@ static bool test_vma_flags_clear(void) vma_flags_clear_mask(&flags, mask); vma_flags_clear_mask(&vma.flags, mask); vma_desc_clear_flags_mask(&desc, mask); - ASSERT_FALSE(vma_flags_test(&flags, VMA_EXEC_BIT, 64)); - ASSERT_FALSE(vma_flags_test(&vma.flags, VMA_EXEC_BIT, 64)); - ASSERT_FALSE(vma_desc_test_flags(&desc, VMA_EXEC_BIT, 64)); + ASSERT_FALSE(vma_flags_test_any(&flags, VMA_EXEC_BIT, 64)); + ASSERT_FALSE(vma_flags_test_any(&vma.flags, VMA_EXEC_BIT, 64)); + ASSERT_FALSE(vma_desc_test_any(&desc, VMA_EXEC_BIT, 64)); /* Reset. */ vma_flags_set(&flags, VMA_EXEC_BIT, 64); vma_set_flags(&vma, VMA_EXEC_BIT, 64); @@ -284,9 +284,9 @@ static bool test_vma_flags_clear(void) vma_flags_clear(&flags, __VA_ARGS__); \ vma_flags_clear(&vma.flags, __VA_ARGS__); \ vma_desc_clear_flags(&desc, __VA_ARGS__); \ - ASSERT_FALSE(vma_flags_test(&flags, __VA_ARGS__)); \ - ASSERT_FALSE(vma_flags_test(&vma.flags, __VA_ARGS__)); \ - ASSERT_FALSE(vma_desc_test_flags(&desc, __VA_ARGS__)); \ + ASSERT_FALSE(vma_flags_test_any(&flags, __VA_ARGS__)); \ + ASSERT_FALSE(vma_flags_test_any(&vma.flags, __VA_ARGS__)); \ + ASSERT_FALSE(vma_desc_test_any(&desc, __VA_ARGS__)); \ vma_flags_set(&flags, __VA_ARGS__); \ vma_set_flags(&vma, __VA_ARGS__); \ vma_desc_set_flags(&desc, __VA_ARGS__) @@ -334,6 +334,6 @@ static void run_vma_tests(int *num_tests, int *num_fail) TEST(vma_flags_unchanged); TEST(vma_flags_cleared); TEST(vma_flags_word); - TEST(vma_flags_test); + TEST(vma_flags_test_any); TEST(vma_flags_clear); } -- cgit v1.2.3 From 0b3ed2a495b5c10296d9371502d70ce4398f0c58 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Thu, 5 Mar 2026 10:50:15 +0000 Subject: mm: add vma_desc_test_all() and use it erofs and zonefs are using vma_desc_test_any() twice to check whether all of VMA_SHARED_BIT and VMA_MAYWRITE_BIT are set. This is silly, so add vma_desc_test_all() to test all flags and update erofs and zonefs to use it. While we're here, update the helper function comments to be more consistent. Also add the same to the VMA test headers. 
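As a sketch of the resulting call-site shape, a hypothetical mmap_prepare hook mirroring the erofs/zonefs pattern:

	/* Hypothetical hook, for illustration; not part of this series. */
	static int example_file_mmap_prepare(struct vm_area_desc *desc)
	{
		/* Reject mappings that are both shared and possibly writable. */
		if (vma_desc_test_all(desc, VMA_SHARED_BIT, VMA_MAYWRITE_BIT))
			return -EINVAL;
		return 0;
	}
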
Link: https://lkml.kernel.org/r/568c8f8d6a84ff64014f997517cba7a629f7eed6.1772704455.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Vlastimil Babka (SUSE) Acked-by: David Hildenbrand (Arm) Reviewed-by: Pedro Falcato Cc: Arnd Bergmann Cc: Babu Moger Cc: Baolin Wang Cc: Chao Yu Cc: Chatre, Reinette Cc: Chunhai Guo Cc: Damien Le Maol Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Hongbo Li Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jeffle Xu Cc: Johannes Thumshirn Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naohiro Aota Cc: Oscar Salvador Cc: Sandeep Dhavale Cc: Suren Baghdasaryan Cc: Vishal Verma Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/erofs/data.c | 3 +-- fs/zonefs/file.c | 3 +-- include/linux/mm.h | 24 ++++++++++++++++++++---- tools/testing/vma/include/dup.h | 9 +++++++++ 4 files changed, 31 insertions(+), 8 deletions(-) (limited to 'include/linux/mm.h') diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 6774d9b5ee82..b33dd4d8710e 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -473,8 +473,7 @@ static int erofs_file_mmap_prepare(struct vm_area_desc *desc) if (!IS_DAX(file_inode(desc->file))) return generic_file_readonly_mmap_prepare(desc); - if (vma_desc_test_any(desc, VMA_SHARED_BIT) && - vma_desc_test_any(desc, VMA_MAYWRITE_BIT)) + if (vma_desc_test_all(desc, VMA_SHARED_BIT, VMA_MAYWRITE_BIT)) return -EINVAL; desc->vm_ops = &erofs_dax_vm_ops; diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 9f9273ecf71a..5ada33f70bb4 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -333,8 +333,7 @@ static int zonefs_file_mmap_prepare(struct vm_area_desc *desc) * ordering between msync() and page cache writeback. */ if (zonefs_inode_is_seq(file_inode(file)) && - vma_desc_test_any(desc, VMA_SHARED_BIT) && - vma_desc_test_any(desc, VMA_MAYWRITE_BIT)) + vma_desc_test_all(desc, VMA_SHARED_BIT, VMA_MAYWRITE_BIT)) return -EINVAL; file_accessed(file); diff --git a/include/linux/mm.h b/include/linux/mm.h index ee7671d6c5eb..f964e4050583 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1177,7 +1177,7 @@ static inline void vma_set_flags_mask(struct vm_area_struct *vma, #define vma_set_flags(vma, ...) \ vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) -/* Helper to test all VMA flags in a VMA descriptor. */ +/* Helper to test any VMA flags in a VMA descriptor. */ static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, vma_flags_t flags) { @@ -1185,8 +1185,8 @@ static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, } /* - * Helper macro for testing VMA flags for an input pointer to a struct - * vm_area_desc object describing a proposed VMA, e.g.: + * Helper macro for testing whether any VMA flags are set in a VMA descriptor, + * e.g.: * * if (vma_desc_test_any(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT)) { ... } @@ -1194,6 +1194,22 @@ static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, #define vma_desc_test_any(desc, ...) \ vma_desc_test_any_mask(desc, mk_vma_flags(__VA_ARGS__)) +/* Helper to test all VMA flags in a VMA descriptor. 
*/ +static inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, + vma_flags_t flags) +{ + return vma_flags_test_all_mask(&desc->vma_flags, flags); +} + +/* + * Helper macro for testing whether ALL VMA flags are set in a VMA descriptor, + * e.g.: + * + * if (vma_desc_test_all(desc, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... } + */ +#define vma_desc_test_all(desc, ...) \ + vma_desc_test_all_mask(desc, mk_vma_flags(__VA_ARGS__)) + /* Helper to set all VMA flags in a VMA descriptor. */ static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, vma_flags_t flags) @@ -1206,7 +1222,7 @@ static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, * vm_area_desc object describing a proposed VMA, e.g.: * * vma_desc_set_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT, - * VMA_DONTDUMP_BIT); + * VMA_DONTDUMP_BIT); */ #define vma_desc_set_flags(desc, ...) \ vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index c46b523e428d..59788bc14d75 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -922,6 +922,15 @@ static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, #define vma_desc_test_any(desc, ...) \ vma_desc_test_any_mask(desc, mk_vma_flags(__VA_ARGS__)) +static inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, + vma_flags_t flags) +{ + return vma_flags_test_all_mask(&desc->vma_flags, flags); +} + +#define vma_desc_test_all(desc, ...) \ + vma_desc_test_all_mask(desc, mk_vma_flags(__VA_ARGS__)) + static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, vma_flags_t flags) { -- cgit v1.2.3 From a5eee1128de526ba199bd4c7be39b849223e5001 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Thu, 5 Mar 2026 10:50:16 +0000 Subject: mm: always inline __mk_vma_flags() and invoked functions Be explicit about __mk_vma_flags() (which is used by the mk_vma_flags() macro) always being inline, as we rely on the compiler to evaluate the loop in this function and determine that it can replace the code with an equivalent constant value, e.g. that: __mk_vma_flags(2, (const vma_flag_t []){ VMA_WRITE_BIT, VMA_EXEC_BIT }); Can be replaced with: (1UL << VMA_WRITE_BIT) | (1UL << VMA_EXEC_BIT) = (1UL << 1) | (1UL << 2) = 6 Most likely an 'inline' will suffice for this, but be as explicit as we can be. Also update all of the functions __mk_vma_flags() ultimately invokes to be always inline too. Note that test_bitmap_const_eval() asserts that the relevant bitmap functions result in build time constant values. Additionally, vma_flag_set() operates on a vma_flags_t type, so it is inconsistently named versus other VMA flags functions. We only use vma_flag_set() in __mk_vma_flags() so we don't need to worry about its new name being rather cumbersome, so rename it to vma_flags_set_flag() to disambiguate it from vma_flags_set(). Also update the VMA test headers to reflect the changes. 
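A sketch of how the folding could be asserted at build time, in the spirit of test_bitmap_const_eval() (hypothetical check; assumes the flag bitmap fits in a single unsigned long word):

	/*
	 * Hypothetical build-time assertion: with __mk_vma_flags() and the
	 * helpers it calls marked __always_inline, the initializer below
	 * should fold to the constant (1UL << 1) | (1UL << 2) == 6.
	 */
	static void assert_mk_vma_flags_folds(void)
	{
		const vma_flags_t flags = mk_vma_flags(VMA_WRITE_BIT, VMA_EXEC_BIT);

		BUILD_BUG_ON(!__builtin_constant_p(flags.__vma_flags[0]));
	}
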
Link: https://lkml.kernel.org/r/241f49c52074d436edbb9c6a6662a8dc142a8f43.1772704455.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Reviewed-by: Pedro Falcato Cc: Arnd Bergmann Cc: Babu Moger Cc: Baolin Wang Cc: Chao Yu Cc: Chatre, Reinette Cc: Chunhai Guo Cc: Damien Le Maol Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Hongbo Li Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jeffle Xu Cc: Johannes Thumshirn Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naohiro Aota Cc: Oscar Salvador Cc: Sandeep Dhavale Cc: Suren Baghdasaryan Cc: Vishal Verma Cc: Vlastimil Babka Cc: Yue Hu Signed-off-by: Andrew Morton --- include/linux/mm.h | 8 +++++--- include/linux/mm_types.h | 2 +- tools/testing/vma/include/custom.h | 5 +++-- tools/testing/vma/include/dup.h | 5 +++-- 4 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index f964e4050583..9dcdf13570fb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1030,21 +1030,23 @@ static inline bool vma_test_atomic_flag(struct vm_area_struct *vma, vma_flag_t b } /* Set an individual VMA flag in flags, non-atomically. */ -static inline void vma_flag_set(vma_flags_t *flags, vma_flag_t bit) +static __always_inline void vma_flags_set_flag(vma_flags_t *flags, + vma_flag_t bit) { unsigned long *bitmap = flags->__vma_flags; __set_bit((__force int)bit, bitmap); } -static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits) +static __always_inline vma_flags_t __mk_vma_flags(size_t count, + const vma_flag_t *bits) { vma_flags_t flags; int i; vma_flags_clear_all(&flags); for (i = 0; i < count; i++) - vma_flag_set(&flags, bits[i]); + vma_flags_set_flag(&flags, bits[i]); return flags; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7bc82a2b889f..f22aecb047b7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1056,7 +1056,7 @@ struct vm_area_struct { } __randomize_layout; /* Clears all bits in the VMA flags bitmap, non-atomically. 
*/ -static inline void vma_flags_clear_all(vma_flags_t *flags) +static __always_inline void vma_flags_clear_all(vma_flags_t *flags) { bitmap_zero(flags->__vma_flags, NUM_VMA_FLAG_BITS); } diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h index 802a76317245..833ff4d7f799 100644 --- a/tools/testing/vma/include/custom.h +++ b/tools/testing/vma/include/custom.h @@ -102,7 +102,8 @@ static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) refcount_set(&vma->vm_refcnt, 0); } -static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits) +static __always_inline vma_flags_t __mk_vma_flags(size_t count, + const vma_flag_t *bits) { vma_flags_t flags; int i; @@ -114,6 +115,6 @@ static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits) vma_flags_clear_all(&flags); for (i = 0; i < count; i++) if (bits[i] < NUM_VMA_FLAG_BITS) - vma_flag_set(&flags, bits[i]); + vma_flags_set_flag(&flags, bits[i]); return flags; } diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 59788bc14d75..ef6b9d963acc 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -780,12 +780,13 @@ static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value) *bitmap &= ~value; } -static inline void vma_flags_clear_all(vma_flags_t *flags) +static __always_inline void vma_flags_clear_all(vma_flags_t *flags) { bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS); } -static inline void vma_flag_set(vma_flags_t *flags, vma_flag_t bit) +static __always_inline void vma_flags_set_flag(vma_flags_t *flags, + vma_flag_t bit) { unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); -- cgit v1.2.3 From 5e6d45d720ca299cc82d84948c4ba622fff64f22 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Thu, 5 Mar 2026 10:50:17 +0000 Subject: mm: reintroduce vma_flags_test() as a singular flag test Since we've now renamed vma_flags_test() to vma_flags_test_any() to be very clear as to what we are in fact testing, we now have the opportunity to bring vma_flags_test() back, but for explicitly testing a single VMA flag. This is useful, as often flag tests are against a single flag, and vma_flags_test_any(flags, VMA_READ_BIT) reads oddly and potentially causes confusion. We use sparse to enforce that users won't accidentally pass vm_flags_t to this function without it being flagged, so this should make it harder to get this wrong. Of course, passing vma_flags_t to the function is impossible, as it is a struct. Also update the VMA tests to reflect this change. 
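For illustration, the intended reading after this change (hypothetical snippet, not taken from this series):

	/*
	 * Hypothetical snippet: a single-bit query now uses vma_flags_test(),
	 * while multi-bit queries keep the explicit _any()/_all() suffix.
	 */
	static bool is_noreserve(const vma_flags_t *flags)
	{
		return vma_flags_test(flags, VMA_NORESERVE_BIT);
	}
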
Link: https://lkml.kernel.org/r/f33f8d7f16c3f3d286a1dc2cba12c23683073134.1772704455.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Reviewed-by: Pedro Falcato Cc: Arnd Bergmann Cc: Babu Moger Cc: Baolin Wang Cc: Chao Yu Cc: Chatre, Reinette Cc: Chunhai Guo Cc: Damien Le Maol Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Hongbo Li Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jeffle Xu Cc: Johannes Thumshirn Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naohiro Aota Cc: Oscar Salvador Cc: Sandeep Dhavale Cc: Suren Baghdasaryan Cc: Vishal Verma Cc: Vlastimil Babka Cc: Yue Hu Signed-off-by: Andrew Morton --- include/linux/mm.h | 17 +++++++++++++++-- mm/hugetlb.c | 2 +- mm/shmem.c | 4 ++-- tools/testing/vma/include/dup.h | 8 ++++++++ 4 files changed, 26 insertions(+), 5 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9dcdf13570fb..9392723a5c50 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1050,6 +1050,19 @@ static __always_inline vma_flags_t __mk_vma_flags(size_t count, return flags; } +/* + * Test whether a specific VMA flag is set, e.g.: + * + * if (vma_flags_test(flags, VMA_READ_BIT)) { ... } + */ +static __always_inline bool vma_flags_test(const vma_flags_t *flags, + vma_flag_t bit) +{ + const unsigned long *bitmap = flags->__vma_flags; + + return test_bit((__force int)bit, bitmap); +} + /* * Helper macro which bitwise-or combines the specified input flags into a * vma_flags_t bitmap value. E.g.: @@ -1956,8 +1969,8 @@ static inline bool vma_desc_is_cow_mapping(struct vm_area_desc *desc) { const vma_flags_t *flags = &desc->vma_flags; - return vma_flags_test_any(flags, VMA_MAYWRITE_BIT) && - !vma_flags_test_any(flags, VMA_SHARED_BIT); + return vma_flags_test(flags, VMA_MAYWRITE_BIT) && + !vma_flags_test(flags, VMA_SHARED_BIT); } #ifndef CONFIG_MMU diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fbbe74f94426..9363b6072c0a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6593,7 +6593,7 @@ long hugetlb_reserve_pages(struct inode *inode, * attempt will be made for VM_NORESERVE to allocate a page * without using reserves */ - if (vma_flags_test_any(&vma_flags, VMA_NORESERVE_BIT)) + if (vma_flags_test(&vma_flags, VMA_NORESERVE_BIT)) return 0; /* diff --git a/mm/shmem.c b/mm/shmem.c index 965a8908200b..5e7dcf5bc5d3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3086,7 +3086,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, spin_lock_init(&info->lock); atomic_set(&info->stop_eviction, 0); info->seals = F_SEAL_SEAL; - info->flags = vma_flags_test_any(&flags, VMA_NORESERVE_BIT) + info->flags = vma_flags_test(&flags, VMA_NORESERVE_BIT) ? SHMEM_F_NORESERVE : 0; info->i_crtime = inode_get_mtime(inode); info->fsflags = (dir == NULL) ? 0 : @@ -5827,7 +5827,7 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, unsigned int i_flags) { const unsigned long shmem_flags = - vma_flags_test_any(&flags, VMA_NORESERVE_BIT) ? SHMEM_F_NORESERVE : 0; + vma_flags_test(&flags, VMA_NORESERVE_BIT) ? 
SHMEM_F_NORESERVE : 0; struct inode *inode; struct file *res; diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index ef6b9d963acc..630478f0d583 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -844,6 +844,14 @@ static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits); #define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ (const vma_flag_t []){__VA_ARGS__}) +static __always_inline bool vma_flags_test(const vma_flags_t *flags, + vma_flag_t bit) +{ + const unsigned long *bitmap = flags->__vma_flags; + + return test_bit((__force int)bit, bitmap); +} + static __always_inline bool vma_flags_test_any_mask(const vma_flags_t *flags, vma_flags_t to_test) { -- cgit v1.2.3 From 0c2aa6635716a5aa19576deef062efab5322072f Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Thu, 5 Mar 2026 10:50:18 +0000 Subject: mm: reintroduce vma_desc_test() as a singular flag test Similar to vma_flags_test(), we have previously renamed vma_desc_test() to vma_desc_test_any(). Now that is in place, we can reintroduce vma_desc_test() to explicitly check for a single VMA flag. As with vma_flags_test(), this is useful as often flag tests are against a single flag, and vma_desc_test_any(flags, VMA_READ_BIT) reads oddly and potentially causes confusion. As with vma_flags_test() a combination of sparse and vma_flags_t being a struct means that users cannot misuse this function without it getting flagged. Also update the VMA tests to reflect this change. Link: https://lkml.kernel.org/r/3a65ca23defb05060333f0586428fe279a484564.1772704455.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Reviewed-by: Pedro Falcato Cc: Arnd Bergmann Cc: Babu Moger Cc: Baolin Wang Cc: Chao Yu Cc: Chatre, Reinette Cc: Chunhai Guo Cc: Damien Le Maol Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Hongbo Li Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jeffle Xu Cc: Johannes Thumshirn Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naohiro Aota Cc: Oscar Salvador Cc: Sandeep Dhavale Cc: Suren Baghdasaryan Cc: Vishal Verma Cc: Vlastimil Babka Cc: Yue Hu Signed-off-by: Andrew Morton --- drivers/char/mem.c | 2 +- fs/hugetlbfs/inode.c | 2 +- fs/ntfs3/file.c | 2 +- fs/resctrl/pseudo_lock.c | 2 +- include/linux/dax.h | 4 ++-- include/linux/mm.h | 11 +++++++++++ mm/hugetlb.c | 12 ++++++------ tools/testing/vma/include/dup.h | 6 ++++++ 8 files changed, 29 insertions(+), 12 deletions(-) (limited to 'include/linux/mm.h') diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 5118787d0954..5fd421e48c04 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -520,7 +520,7 @@ static int mmap_zero_prepare(struct vm_area_desc *desc) #ifndef CONFIG_MMU return -ENOSYS; #endif - if (vma_desc_test_any(desc, VMA_SHARED_BIT)) + if (vma_desc_test(desc, VMA_SHARED_BIT)) return shmem_zero_setup_desc(desc); desc->action.success_hook = mmap_zero_private_success; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 079ffaaf1f6c..cd6b22f6e2b1 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -164,7 +164,7 @@ static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) goto out; ret = 0; - if (vma_desc_test_any(desc, VMA_WRITE_BIT) && inode->i_size < len) + if (vma_desc_test(desc, VMA_WRITE_BIT) && inode->i_size < len) 
i_size_write(inode, len); out: inode_unlock(inode); diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index c5e2181f9f02..fbdfaf989a31 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -276,7 +276,7 @@ static int ntfs_file_mmap_prepare(struct vm_area_desc *desc) struct file *file = desc->file; struct inode *inode = file_inode(file); struct ntfs_inode *ni = ntfs_i(inode); - const bool rw = vma_desc_test_any(desc, VMA_WRITE_BIT); + const bool rw = vma_desc_test(desc, VMA_WRITE_BIT); int err; /* Avoid any operation if inode is bad. */ diff --git a/fs/resctrl/pseudo_lock.c b/fs/resctrl/pseudo_lock.c index 79a006c6f26c..d1cb0986006e 100644 --- a/fs/resctrl/pseudo_lock.c +++ b/fs/resctrl/pseudo_lock.c @@ -1044,7 +1044,7 @@ static int pseudo_lock_dev_mmap_prepare(struct vm_area_desc *desc) * Ensure changes are carried directly to the memory being mapped, * do not allow copy-on-write mapping. */ - if (!vma_desc_test_any(desc, VMA_SHARED_BIT)) { + if (!vma_desc_test(desc, VMA_SHARED_BIT)) { mutex_unlock(&rdtgroup_mutex); return -EINVAL; } diff --git a/include/linux/dax.h b/include/linux/dax.h index 535019001577..10a7cc79aea5 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -69,7 +69,7 @@ static inline bool daxdev_mapping_supported(const struct vm_area_desc *desc, const struct inode *inode, struct dax_device *dax_dev) { - if (!vma_desc_test_any(desc, VMA_SYNC_BIT)) + if (!vma_desc_test(desc, VMA_SYNC_BIT)) return true; if (!IS_DAX(inode)) return false; @@ -115,7 +115,7 @@ static inline bool daxdev_mapping_supported(const struct vm_area_desc *desc, const struct inode *inode, struct dax_device *dax_dev) { - return !vma_desc_test_any(desc, VMA_SYNC_BIT); + return !vma_desc_test(desc, VMA_SYNC_BIT); } static inline size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9392723a5c50..63d1f619260e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1192,6 +1192,17 @@ static inline void vma_set_flags_mask(struct vm_area_struct *vma, #define vma_set_flags(vma, ...) \ vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) +/* + * Test whether a specific VMA flag is set in a VMA descriptor, e.g.: + * + * if (vma_desc_test(desc, VMA_READ_BIT)) { ... } + */ +static __always_inline bool vma_desc_test(const struct vm_area_desc *desc, + vma_flag_t bit) +{ + return vma_flags_test(&desc->vma_flags, bit); +} + /* Helper to test any VMA flags in a VMA descriptor. 
*/ static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, vma_flags_t flags) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9363b6072c0a..992c1632d26a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1194,7 +1194,7 @@ static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map) { VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); - VM_WARN_ON_ONCE(vma_desc_test_any(desc, VMA_MAYSHARE_BIT)); + VM_WARN_ON_ONCE(vma_desc_test(desc, VMA_MAYSHARE_BIT)); desc->private_data = map; } @@ -1202,7 +1202,7 @@ static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *ma static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags) { VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); - VM_WARN_ON_ONCE(vma_desc_test_any(desc, VMA_MAYSHARE_BIT)); + VM_WARN_ON_ONCE(vma_desc_test(desc, VMA_MAYSHARE_BIT)); desc->private_data = (void *)((unsigned long)desc->private_data | flags); } @@ -6602,7 +6602,7 @@ long hugetlb_reserve_pages(struct inode *inode, * to reserve the full area even if read-only as mprotect() may be * called to make the mapping read-write. Assume !desc is a shm mapping */ - if (!desc || vma_desc_test_any(desc, VMA_MAYSHARE_BIT)) { + if (!desc || vma_desc_test(desc, VMA_MAYSHARE_BIT)) { /* * resv_map can not be NULL as hugetlb_reserve_pages is only * called for inodes for which resv_maps were created (see @@ -6636,7 +6636,7 @@ long hugetlb_reserve_pages(struct inode *inode, if (err < 0) goto out_err; - if (desc && !vma_desc_test_any(desc, VMA_MAYSHARE_BIT) && h_cg) { + if (desc && !vma_desc_test(desc, VMA_MAYSHARE_BIT) && h_cg) { /* For private mappings, the hugetlb_cgroup uncharge info hangs * of the resv_map. */ @@ -6673,7 +6673,7 @@ long hugetlb_reserve_pages(struct inode *inode, * consumed reservations are stored in the map. Hence, nothing * else has to be done for private mappings here */ - if (!desc || vma_desc_test_any(desc, VMA_MAYSHARE_BIT)) { + if (!desc || vma_desc_test(desc, VMA_MAYSHARE_BIT)) { add = region_add(resv_map, from, to, regions_needed, h, h_cg); if (unlikely(add < 0)) { @@ -6737,7 +6737,7 @@ out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); out_err: - if (!desc || vma_desc_test_any(desc, VMA_MAYSHARE_BIT)) + if (!desc || vma_desc_test(desc, VMA_MAYSHARE_BIT)) /* Only call region_abort if the region_chg succeeded but the * region_add failed or didn't run. */ diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 630478f0d583..5eb313beb43d 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -922,6 +922,12 @@ static inline void vma_set_flags_mask(struct vm_area_struct *vma, #define vma_set_flags(vma, ...) 
\ vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) +static __always_inline bool vma_desc_test(const struct vm_area_desc *desc, + vma_flag_t bit) +{ + return vma_flags_test(&desc->vma_flags, bit); +} + static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, vma_flags_t flags) { -- cgit v1.2.3 From db359fccf212e7fa3136e6edbed6228475646fd7 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Tue, 24 Feb 2026 14:13:47 +0900 Subject: mm: introduce a new page type for page pool in page type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the condition 'page->pp_magic == PP_SIGNATURE' is used to determine if a page belongs to a page pool. However, with the planned removal of @pp_magic, we should instead leverage the page_type in struct page, such as PGTY_netpp, for this purpose. Introduce and use the page type APIs e.g. PageNetpp(), __SetPageNetpp(), and __ClearPageNetpp() instead, and remove the existing APIs accessing @pp_magic e.g. page_pool_page_is_pp(), netmem_or_pp_magic(), and netmem_clear_pp_magic(). Plus, add @page_type to struct net_iov at the same offset as struct page so as to use the page_type APIs for struct net_iov as well. While at it, reorder @type and @owner in struct net_iov to avoid a hole and an increase in the struct size. This work was inspired by the following link: https://lore.kernel.org/all/582f41c0-2742-4400-9c81-0d46bf4e8314@gmail.com/ While at it, move the page pool sanity check to the free path. [byungchul@sk.com: gate the sanity check, per Johannes] Link: https://lkml.kernel.org/r/20260316223113.20097-1-byungchul@sk.com Link: https://lkml.kernel.org/r/20260224051347.19621-1-byungchul@sk.com Co-developed-by: Pavel Begunkov Signed-off-by: Pavel Begunkov Signed-off-by: Byungchul Park Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Acked-by: Zi Yan Acked-by: Vlastimil Babka Reviewed-by: Toke Høiland-Jørgensen Acked-by: Mike Rapoport (Microsoft) Acked-by: Johannes Weiner Acked-by: Jakub Kicinski Acked-by: Jesper Dangaard Brouer Acked-by: Ilias Apalodimas Cc: Alexei Starovoitov Cc: Andrew Lunn Cc: Baolin Wang Cc: Brendan Jackman Cc: Christian Brauner Cc: Daniel Borkmann Cc: David S.
Miller Cc: David Wei Cc: Dragos Tatulea Cc: Eric Dumazet Cc: John Fastabend Cc: Leon Romanovsky Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Mark Bloch Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mina Almasry Cc: Paolo Abeni Cc: Saeed Mahameed Cc: Simon Horman Cc: Stanislav Fomichev Cc: Stehen Rothwell Cc: Suren Baghdasaryan Cc: Taehee Yoo Cc: Tariq Toukan Cc: Usama Arif Cc: Yu Zhao Signed-off-by: Andrew Morton --- drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 2 +- include/linux/mm.h | 27 +++--------------------- include/linux/page-flags.h | 6 ++++++ include/net/netmem.h | 15 +++++++++++-- mm/page_alloc.c | 13 ++++++++---- net/core/netmem_priv.h | 23 +++++++++----------- net/core/page_pool.c | 24 +++++++++++++++++++-- 7 files changed, 64 insertions(+), 46 deletions(-) (limited to 'include/linux/mm.h') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 80f9fc10877a..7d90d2485c78 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -707,7 +707,7 @@ static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq, xdpi = mlx5e_xdpi_fifo_pop(xdpi_fifo); page = xdpi.page.page; - /* No need to check page_pool_page_is_pp() as we + /* No need to check PageNetpp() as we * know this is a page_pool page. */ page_pool_recycle_direct(pp_page_to_nmdesc(page)->pp, diff --git a/include/linux/mm.h b/include/linux/mm.h index 63d1f619260e..c758f4e68727 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4840,10 +4840,9 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); * DMA mapping IDs for page_pool * * When DMA-mapping a page, page_pool allocates an ID (from an xarray) and - * stashes it in the upper bits of page->pp_magic. We always want to be able to - * unambiguously identify page pool pages (using page_pool_page_is_pp()). Non-PP - * pages can have arbitrary kernel pointers stored in the same field as pp_magic - * (since it overlaps with page->lru.next), so we must ensure that we cannot + * stashes it in the upper bits of page->pp_magic. Non-PP pages can have + * arbitrary kernel pointers stored in the same field as pp_magic (since + * it overlaps with page->lru.next), so we must ensure that we cannot * mistake a valid kernel pointer with any of the values we write into this * field. * @@ -4878,26 +4877,6 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); #define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \ PP_DMA_INDEX_SHIFT) -/* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is - * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for - * the head page of compound page and bit 1 for pfmemalloc page, as well as the - * bits used for the DMA index. page_is_pfmemalloc() is checked in - * __page_pool_put_page() to avoid recycling the pfmemalloc page. 
- */ -#define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL) - -#ifdef CONFIG_PAGE_POOL -static inline bool page_pool_page_is_pp(const struct page *page) -{ - return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE; -} -#else -static inline bool page_pool_page_is_pp(const struct page *page) -{ - return false; -} -#endif - #define PAGE_SNAPSHOT_FAITHFUL (1 << 0) #define PAGE_SNAPSHOT_PG_BUDDY (1 << 1) #define PAGE_SNAPSHOT_PG_IDLE (1 << 2) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 7223f6f4e2b4..0e03d816e8b9 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -923,6 +923,7 @@ enum pagetype { PGTY_zsmalloc = 0xf6, PGTY_unaccepted = 0xf7, PGTY_large_kmalloc = 0xf8, + PGTY_netpp = 0xf9, PGTY_mapcount_underflow = 0xff }; @@ -1055,6 +1056,11 @@ PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted) PAGE_TYPE_OPS(LargeKmalloc, large_kmalloc, large_kmalloc) +/* + * Marks page_pool allocated pages. + */ +PAGE_TYPE_OPS(Netpp, netpp, netpp) + /** * PageHuge - Determine if the page belongs to hugetlbfs * @page: The page to test. diff --git a/include/net/netmem.h b/include/net/netmem.h index a96b3e5e5574..85e3b26ec547 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -110,10 +110,21 @@ struct net_iov { atomic_long_t pp_ref_count; }; }; - struct net_iov_area *owner; + + unsigned int page_type; enum net_iov_type type; + struct net_iov_area *owner; }; +/* Make sure 'the offset of page_type in struct page == the offset of + * type in struct net_iov'. + */ +#define NET_IOV_ASSERT_OFFSET(pg, iov) \ + static_assert(offsetof(struct page, pg) == \ + offsetof(struct net_iov, iov)) +NET_IOV_ASSERT_OFFSET(page_type, page_type); +#undef NET_IOV_ASSERT_OFFSET + struct net_iov_area { /* Array of net_iovs for this area. 
*/ struct net_iov *niovs; @@ -256,7 +267,7 @@ static inline unsigned long netmem_pfn_trace(netmem_ref netmem) */ #define pp_page_to_nmdesc(p) \ ({ \ - DEBUG_NET_WARN_ON_ONCE(!page_pool_page_is_pp(p)); \ + DEBUG_NET_WARN_ON_ONCE(!PageNetpp(p)); \ __pp_page_to_nmdesc(p); \ }) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f11f38ba2e12..fdcc2fde565b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1043,7 +1043,6 @@ static inline bool page_expected_state(struct page *page, #ifdef CONFIG_MEMCG page->memcg_data | #endif - page_pool_page_is_pp(page) | (page->flags.f & check_flags))) return false; @@ -1070,8 +1069,6 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) if (unlikely(page->memcg_data)) bad_reason = "page still charged to cgroup"; #endif - if (unlikely(page_pool_page_is_pp(page))) - bad_reason = "page_pool leak"; return bad_reason; } @@ -1380,9 +1377,17 @@ __always_inline bool __free_pages_prepare(struct page *page, mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); folio->mapping = NULL; } - if (unlikely(page_has_type(page))) + if (unlikely(page_has_type(page))) { + /* networking expects to clear its page type before releasing */ + if (is_check_pages_enabled()) { + if (unlikely(PageNetpp(page))) { + bad_page(page, "page_pool leak"); + return false; + } + } /* Reset the page_type (which overlays _mapcount) */ page->page_type = UINT_MAX; + } if (is_check_pages_enabled()) { if (free_page_is_bad(page)) diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h index 23175cb2bd86..3e6fde8f1726 100644 --- a/net/core/netmem_priv.h +++ b/net/core/netmem_priv.h @@ -8,21 +8,18 @@ static inline unsigned long netmem_get_pp_magic(netmem_ref netmem) return netmem_to_nmdesc(netmem)->pp_magic & ~PP_DMA_INDEX_MASK; } -static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) -{ - netmem_to_nmdesc(netmem)->pp_magic |= pp_magic; -} - -static inline void netmem_clear_pp_magic(netmem_ref netmem) -{ - WARN_ON_ONCE(netmem_to_nmdesc(netmem)->pp_magic & PP_DMA_INDEX_MASK); - - netmem_to_nmdesc(netmem)->pp_magic = 0; -} - static inline bool netmem_is_pp(netmem_ref netmem) { - return (netmem_get_pp_magic(netmem) & PP_MAGIC_MASK) == PP_SIGNATURE; + struct page *page; + + /* XXX: Now that the offset of page_type is shared between + * struct page and net_iov, just cast the netmem to struct page + * unconditionally by clearing NET_IOV if any, no matter whether + * it comes from struct net_iov or struct page. This should be + * adjusted once the offset is no longer shared. + */ + page = (struct page *)((__force unsigned long)netmem & ~NET_IOV); + return PageNetpp(page); } static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 265a729431bb..877bbf7a1938 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -702,8 +702,18 @@ s32 page_pool_inflight(const struct page_pool *pool, bool strict) void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) { + struct page *page; + netmem_set_pp(netmem, pool); - netmem_or_pp_magic(netmem, PP_SIGNATURE); + + /* XXX: Now that the offset of page_type is shared between + * struct page and net_iov, just cast the netmem to struct page + * unconditionally by clearing NET_IOV if any, no matter whether + * it comes from struct net_iov or struct page. This should be + * adjusted once the offset is no longer shared. 
+ */ + page = (struct page *)((__force unsigned long)netmem & ~NET_IOV); + __SetPageNetpp(page); /* Ensuring all pages have been split into one fragment initially: * page_pool_set_pp_info() is only called once for every page when it @@ -718,7 +728,17 @@ void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) void page_pool_clear_pp_info(netmem_ref netmem) { - netmem_clear_pp_magic(netmem); + struct page *page; + + /* XXX: Now that the offset of page_type is shared between + * struct page and net_iov, just cast the netmem to struct page + * unconditionally by clearing NET_IOV if any, no matter whether + * it comes from struct net_iov or struct page. This should be + * adjusted once the offset is no longer shared. + */ + page = (struct page *)((__force unsigned long)netmem & ~NET_IOV); + __ClearPageNetpp(page); + netmem_set_pp(netmem, NULL); } -- cgit v1.2.3 From 341ffe82a7a3a1e0756b58999405b6df0c2b3e8d Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Mon, 9 Mar 2026 16:18:58 +0100 Subject: mm: move vma_kernel_pagesize() from hugetlb to mm.h Patch series "mm: move vma_(kernel|mmu)_pagesize() out of hugetlb.c", v2. Looking into vma_(kernel|mmu)_pagesize(), I realized that there is one scenario where DAX would not do the right thing when the kernel is not compiled with hugetlb support. Without hugetlb support, vma_(kernel|mmu)_pagesize() will always return PAGE_SIZE instead of using the ->pagesize() result provided by dax-device code. Fix that by moving vma_kernel_pagesize() to core MM code, where it belongs. I don't think this is stable material, but am not 100% sure. Also, move vma_mmu_pagesize() while at it. Remove the unnecessary hugetlb.h inclusion from KVM code. This patch (of 4): In the past, only hugetlb had special "vma_kernel_pagesize()" requirements, so it provided its own implementation. In commit 05ea88608d4e ("mm, hugetlbfs: introduce ->pagesize() to vm_operations_struct") we generalized that approach by providing a vm_ops->pagesize() callback to be used by device-dax. Once device-dax started using that callback in commit c1d53b92b95c ("device-dax: implement ->pagesize() for smaps to report MMUPageSize") it was missed that CONFIG_DEV_DAX does not depend on hugetlb support. So building a kernel with CONFIG_DEV_DAX but without CONFIG_HUGETLBFS would not pick up that value. Fix it by moving vma_kernel_pagesize() to mm.h, providing only a single implementation. While at it, improve the kerneldoc a bit. Ideally, we'd move vma_mmu_pagesize() as well to the header. However, its __weak symbol might be overwritten by a PPC variant in hugetlb code. So let's leave it in there for now, as it really only matters for some hugetlb oddities. This was found by code inspection. 
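To spell out the failure mode (a sketch based on the stub removed below; the 2M figure is an illustrative device-dax alignment, not from this patch): with CONFIG_DEV_DAX=y but CONFIG_HUGETLBFS=n, callers resolved to the !CONFIG_HUGETLB_PAGE stub in hugetlb.h:

	/* Old stub, compiled in whenever hugetlb support was disabled. */
	static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
	{
		return PAGE_SIZE;	/* device-dax ->pagesize() never consulted */
	}

so a device-dax VMA backed by 2M pages would still be reported at PAGE_SIZE granularity. The single implementation in mm.h below always honours vm_ops->pagesize() when the callback is provided.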
Link: https://lkml.kernel.org/r/20260309151901.123947-1-david@kernel.org Link: https://lkml.kernel.org/r/20260309151901.123947-2-david@kernel.org Fixes: c1d53b92b95c ("device-dax: implement ->pagesize() for smaps to report MMUPageSize") Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: Mike Rapoport (Microsoft) Cc: Dan Williams Cc: "Christophe Leroy (CS GROUP)" Cc: Jann Horn Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Paolo Bonzini Cc: Pedro Falcato Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 7 ------- include/linux/mm.h | 20 ++++++++++++++++++++ mm/hugetlb.c | 17 ----------------- 3 files changed, 20 insertions(+), 24 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 65910437be1c..44c1848a2c21 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -777,8 +777,6 @@ static inline unsigned long huge_page_size(const struct hstate *h) return (unsigned long)PAGE_SIZE << h->order; } -extern unsigned long vma_kernel_pagesize(struct vm_area_struct *vma); - extern unsigned long vma_mmu_pagesize(struct vm_area_struct *vma); static inline unsigned long huge_page_mask(struct hstate *h) @@ -1177,11 +1175,6 @@ static inline unsigned long huge_page_mask(struct hstate *h) return PAGE_MASK; } -static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) -{ - return PAGE_SIZE; -} - static inline unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { return PAGE_SIZE; diff --git a/include/linux/mm.h b/include/linux/mm.h index c758f4e68727..e62cea754b0e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1351,6 +1351,26 @@ static inline bool vma_is_shared_maywrite(const struct vm_area_struct *vma) return is_shared_maywrite(&vma->flags); } +/** + * vma_kernel_pagesize - Default page size granularity for this VMA. + * @vma: The user mapping. + * + * The kernel page size specifies in which granularity VMA modifications + * can be performed. Folios in this VMA will be aligned to, and at least + * the size of the number of bytes returned by this function. + * + * The default kernel page size is not affected by Transparent Huge Pages + * being in effect. + * + * Return: The default page size granularity for this VMA. + */ +static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) +{ + if (unlikely(vma->vm_ops && vma->vm_ops->pagesize)) + return vma->vm_ops->pagesize(vma); + return PAGE_SIZE; +} + static inline struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 992c1632d26a..66761ae5ce71 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1017,23 +1017,6 @@ static pgoff_t vma_hugecache_offset(struct hstate *h, (vma->vm_pgoff >> huge_page_order(h)); } -/** - * vma_kernel_pagesize - Page size granularity for this VMA. - * @vma: The user mapping. - * - * Folios in this VMA will be aligned to, and at least the size of the - * number of bytes returned by this function. - * - * Return: The default size of the folios allocated when backing a VMA. - */ -unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) -{ - if (vma->vm_ops && vma->vm_ops->pagesize) - return vma->vm_ops->pagesize(vma); - return PAGE_SIZE; -} -EXPORT_SYMBOL_GPL(vma_kernel_pagesize); - /* * Return the page size being used by the MMU to back a VMA. 
In the majority * of cases, the page size used by the kernel matches the MMU size. On -- cgit v1.2.3 From a9496e9e4b7c5785e82000a26b1118b4a1fd85c7 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Mon, 9 Mar 2026 16:18:59 +0100 Subject: mm: move vma_mmu_pagesize() from hugetlb to vma.c vma_mmu_pagesize() is also queried on non-hugetlb VMAs and does not really belong into hugetlb.c. PPC64 provides a custom overwrite with CONFIG_HUGETLB_PAGE, see arch/powerpc/mm/book3s64/slice.c, so we cannot easily make this a static inline function. So let's move it to vma.c and add some proper kerneldoc. To make vma tests happy, add a simple vma_kernel_pagesize() stub in tools/testing/vma/include/custom.h. Link: https://lkml.kernel.org/r/20260309151901.123947-3-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: Mike Rapoport (Microsoft) Cc: "Christophe Leroy (CS GROUP)" Cc: Dan Williams Cc: Jann Horn Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Paolo Bonzini Cc: Pedro Falcato Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 7 ------- include/linux/mm.h | 2 ++ mm/hugetlb.c | 11 ----------- mm/vma.c | 21 +++++++++++++++++++++ tools/testing/vma/include/custom.h | 5 +++++ 5 files changed, 28 insertions(+), 18 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 44c1848a2c21..aaf3d472e6b5 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -777,8 +777,6 @@ static inline unsigned long huge_page_size(const struct hstate *h) return (unsigned long)PAGE_SIZE << h->order; } -extern unsigned long vma_mmu_pagesize(struct vm_area_struct *vma); - static inline unsigned long huge_page_mask(struct hstate *h) { return h->mask; @@ -1175,11 +1173,6 @@ static inline unsigned long huge_page_mask(struct hstate *h) return PAGE_MASK; } -static inline unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) -{ - return PAGE_SIZE; -} - static inline unsigned int huge_page_order(struct hstate *h) { return 0; diff --git a/include/linux/mm.h b/include/linux/mm.h index e62cea754b0e..efb8be5d259c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1371,6 +1371,8 @@ static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) return PAGE_SIZE; } +unsigned long vma_mmu_pagesize(struct vm_area_struct *vma); + static inline struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 66761ae5ce71..a786034ac95c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1017,17 +1017,6 @@ static pgoff_t vma_hugecache_offset(struct hstate *h, (vma->vm_pgoff >> huge_page_order(h)); } -/* - * Return the page size being used by the MMU to back a VMA. In the majority - * of cases, the page size used by the kernel matches the MMU size. On - * architectures where it differs, an architecture-specific 'strong' - * version of this symbol is required. - */ -__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) -{ - return vma_kernel_pagesize(vma); -} - /* * Flags for MAP_PRIVATE reservations. 
These are stored in the bottom * bits of the reservation map pointer, which are always clear due to diff --git a/mm/vma.c b/mm/vma.c index be64f781a3aa..e95fd5a5fe5c 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -3300,3 +3300,24 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) return 0; } + +/** + * vma_mmu_pagesize - Default MMU page size granularity for this VMA. + * @vma: The user mapping. + * + * In the common case, the default page size used by the MMU matches the + * default page size used by the kernel (see vma_kernel_pagesize()). On + * architectures where it differs, an architecture-specific 'strong' version + * of this symbol is required. + * + * The default MMU page size is not affected by Transparent Huge Pages + * being in effect, or any usage of larger MMU page sizes (either through + * architectural huge-page mappings or other explicit/implicit coalescing of + * virtual ranges performed by the MMU). + * + * Return: The default MMU page size granularity for this VMA. + */ +__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) +{ + return vma_kernel_pagesize(vma); +} diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h index 833ff4d7f799..7150e09122b2 100644 --- a/tools/testing/vma/include/custom.h +++ b/tools/testing/vma/include/custom.h @@ -118,3 +118,8 @@ static __always_inline vma_flags_t __mk_vma_flags(size_t count, vma_flags_set_flag(&flags, bits[i]); return flags; } + +static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) +{ + return PAGE_SIZE; +} -- cgit v1.2.3 From 2d1e54aab6fd01f7502af20e125312e06a15bf9c Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Wed, 11 Mar 2026 17:24:37 +0000 Subject: mm: abstract reading sysctl_max_map_count, and READ_ONCE() Concurrent reads and writes of sysctl_max_map_count are possible, so we should use READ_ONCE() and WRITE_ONCE(). The sysctl procfs logic already enforces WRITE_ONCE(), so abstract the read side with get_sysctl_max_map_count(). While we're here, also move the variable's declaration to mm/internal.h and add the getter there; since only mm interacts with it, there's no need for anybody else to have access. Finally, update the VMA userland tests to reflect the change.
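For illustration only (a sketch of the hazard, not code from this patch; 'headroom' is a made-up local), a plain read of the global leaves the compiler free to reload it, so two uses in one function can observe different limits if a write races in between:

	/* Racy pattern: */
	if (mm->map_count + 2 > sysctl_max_map_count)		/* load #1 */
		return -ENOMEM;
	headroom = sysctl_max_map_count - mm->map_count;	/* load #2 may differ */

	/* With the helper, each call site performs exactly one tear-free load: */
	if (mm->map_count + 2 > get_sysctl_max_map_count())
		return -ENOMEM;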
Link: https://lkml.kernel.org/r/0715259eb37cbdfde4f9e5db92a20ec7110a1ce5.1773249037.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Pedro Falcato Cc: Jann Horn Cc: Jianzhou Zhao Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- mm/internal.h | 6 ++++++ mm/mmap.c | 2 +- mm/mremap.c | 4 ++-- mm/nommu.c | 2 +- mm/vma.c | 6 +++--- tools/testing/vma/include/custom.h | 3 --- tools/testing/vma/include/dup.h | 9 +++++++++ tools/testing/vma/main.c | 2 ++ 9 files changed, 24 insertions(+), 12 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index efb8be5d259c..25ba5816e02b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -207,8 +207,6 @@ static inline void __mm_zero_struct_page(struct page *page) #define MAPCOUNT_ELF_CORE_MARGIN (5) #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) -extern int sysctl_max_map_count; - extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; diff --git a/mm/internal.h b/mm/internal.h index f50a0376b87e..62d80fd37ae1 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1863,4 +1863,10 @@ static inline int pmdp_test_and_clear_young_notify(struct vm_area_struct *vma, #endif /* CONFIG_MMU_NOTIFIER */ +extern int sysctl_max_map_count; +static inline int get_sysctl_max_map_count(void) +{ + return READ_ONCE(sysctl_max_map_count); +} + #endif /* __MM_INTERNAL_H */ diff --git a/mm/mmap.c b/mm/mmap.c index 843160946aa5..79544d893411 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -375,7 +375,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, return -EOVERFLOW; /* Too many mappings? */ - if (mm->map_count > sysctl_max_map_count) + if (mm->map_count > get_sysctl_max_map_count()) return -ENOMEM; /* diff --git a/mm/mremap.c b/mm/mremap.c index e8c3021dd841..ba6c690f6c1b 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1045,7 +1045,7 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm) * which may not merge, then (if MREMAP_DONTUNMAP is not set) unmap the * source, which may split, causing a net increase of 2 mappings. */ - if (current->mm->map_count + 2 > sysctl_max_map_count) + if (current->mm->map_count + 2 > get_sysctl_max_map_count()) return -ENOMEM; if (vma->vm_ops && vma->vm_ops->may_split) { @@ -1813,7 +1813,7 @@ static unsigned long check_mremap_params(struct vma_remap_struct *vrm) * net increased map count of 2. In move_vma() we check for headroom of * 2 additional mappings, so check early to avoid bailing out then. 
*/ - if (current->mm->map_count + 4 > sysctl_max_map_count) + if (current->mm->map_count + 4 > get_sysctl_max_map_count()) return -ENOMEM; return 0; diff --git a/mm/nommu.c b/mm/nommu.c index c3a23b082adb..ed3934bc2de4 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1317,7 +1317,7 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, return -ENOMEM; mm = vma->vm_mm; - if (mm->map_count >= sysctl_max_map_count) + if (mm->map_count >= get_sysctl_max_map_count()) return -ENOMEM; region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL); diff --git a/mm/vma.c b/mm/vma.c index b7055c264b5d..4d21e7d8e93c 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -590,7 +590,7 @@ out_free_vma: static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { - if (vma->vm_mm->map_count >= sysctl_max_map_count) + if (vma->vm_mm->map_count >= get_sysctl_max_map_count()) return -ENOMEM; return __split_vma(vmi, vma, addr, new_below); @@ -1394,7 +1394,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, * its limit temporarily, to help free resources as expected. */ if (vms->end < vms->vma->vm_end && - vms->vma->vm_mm->map_count >= sysctl_max_map_count) { + vms->vma->vm_mm->map_count >= get_sysctl_max_map_count()) { error = -ENOMEM; goto map_count_exceeded; } @@ -2868,7 +2868,7 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) return -ENOMEM; - if (mm->map_count > sysctl_max_map_count) + if (mm->map_count > get_sysctl_max_map_count()) return -ENOMEM; if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h index 7150e09122b2..6c62a38a2f6f 100644 --- a/tools/testing/vma/include/custom.h +++ b/tools/testing/vma/include/custom.h @@ -21,9 +21,6 @@ extern unsigned long dac_mmap_min_addr; #define VM_BUG_ON(_expr) (BUG_ON(_expr)) #define VM_BUG_ON_VMA(_expr, _vma) (BUG_ON(_expr)) -/* We hardcode this for now. */ -#define sysctl_max_map_count 0x1000000UL - #define TASK_SIZE ((1ul << 47)-PAGE_SIZE) /* diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 5eb313beb43d..8865ffe046d8 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -419,6 +419,9 @@ struct vma_iterator { #define EMPTY_VMA_FLAGS ((vma_flags_t){ }) +#define MAPCOUNT_ELF_CORE_MARGIN (5) +#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) + /* What action should be taken after an .mmap_prepare call is complete? */ enum mmap_action_type { MMAP_NOTHING, /* Mapping is complete, no further action. */ @@ -1342,3 +1345,9 @@ static inline void vma_set_file(struct vm_area_struct *vma, struct file *file) swap(vma->vm_file, file); fput(file); } + +extern int sysctl_max_map_count; +static inline int get_sysctl_max_map_count(void) +{ + return READ_ONCE(sysctl_max_map_count); +} diff --git a/tools/testing/vma/main.c b/tools/testing/vma/main.c index 49b09e97a51f..18338f5d29e0 100644 --- a/tools/testing/vma/main.c +++ b/tools/testing/vma/main.c @@ -14,6 +14,8 @@ #include "tests/mmap.c" #include "tests/vma.c" +int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; + /* Helper functions which utilise static kernel functions. 
struct vm_area_struct *merge_existing(struct vma_merge_struct *vmg) -- cgit v1.2.3 From a91fd9f710490a89713823be3e7790ac59a085f8 Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Wed, 25 Mar 2026 05:40:18 -0600 Subject: mm: consolidate anonymous folio PTE mapping into helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: khugepaged cleanups and mTHP prerequisites", v4. The following series contains cleanups and prerequisites for my work on khugepaged mTHP support [1]. These have been separated out to ease review. The first patch in the series refactors the page fault folio-to-PTE mapping and follows a similar convention to that defined by map_anon_folio_pmd_(no)pf(). This not only cleans up the current implementation of do_anonymous_page(), but will allow for reuse later in the khugepaged mTHP implementation. The second patch adds a small is_pmd_order() helper to check if an order is the PMD order. This check is open-coded in a number of places. This patch aims to clean this up and will be used more in the khugepaged mTHP work. The third patch also adds a small DEFINE for (HPAGE_PMD_NR - 1) which is used often across the khugepaged code. The fourth and fifth patches come from the khugepaged mTHP patchset [1]. These two patches include the rename of function prefixes, and the unification of khugepaged and madvise_collapse via a new collapse_single_pmd function. Patch 1: refactor do_anonymous_page into map_anon_folio_pte_(no)pf Patch 2: add is_pmd_order helper Patch 3: Add define for (HPAGE_PMD_NR - 1) Patch 4: Refactor/rename hpage_collapse Patch 5: Refactoring to combine madvise_collapse and khugepaged A big thanks to everyone who has reviewed, tested, and participated in the development process. This patch (of 5): The anonymous page fault handler in do_anonymous_page() open-codes the sequence to map a newly allocated anonymous folio at the PTE level: - construct the PTE entry - add rmap - add to LRU - set the PTEs - update the MMU cache. Introduce two helpers to consolidate this duplicated logic, mirroring the existing map_anon_folio_pmd_nopf() pattern for PMD-level mappings: map_anon_folio_pte_nopf(): constructs the PTE entry, takes folio references, adds anon rmap and LRU. This function also handles the uffd_wp that can occur in the pf variant. The future khugepaged mTHP code calls this to handle mapping the new collapsed mTHP to its folio. map_anon_folio_pte_pf(): extends the nopf variant to handle MM_ANONPAGES counter updates, and mTHP fault allocation statistics for the page fault path. The zero-page read path in do_anonymous_page() is also untangled from the shared setpte label, since it does not allocate a folio and should not share the same mapping sequence as the write path. We can now leave nr_pages uninitialized at its declaration, and use the single-page update_mmu_cache() function to handle the zero-page update.
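As a rough sketch of the intended reuse (a hypothetical future caller, not part of this series; 'pte' and 'haddr' are assumed to be set up by the collapse code), a collapse path that has already allocated and populated an mTHP-sized folio could map it in one step:

	/* pte points at the first PTE of the aligned range covered by folio. */
	map_anon_folio_pte_nopf(folio, pte, vma, haddr, /* uffd_wp */ false);
	add_mm_counter(vma->vm_mm, MM_ANONPAGES, folio_nr_pages(folio));

leaving only counter and statistics updates to the caller, mirroring what map_anon_folio_pte_pf() does on the fault path below.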
This refactoring also helps reduce code duplication between mm/memory.c and mm/khugepaged.c, and provides a clean API for PTE-level anonymous folio mapping that can be reused by future callers (like khugepaged mTHP support). Link: https://lkml.kernel.org/r/20260325114022.444081-1-npache@redhat.com Link: https://lkml.kernel.org/r/20260325114022.444081-2-npache@redhat.com Link: https://lore.kernel.org/all/20260122192841.128719-1-npache@redhat.com Signed-off-by: Nico Pache Suggested-by: Lorenzo Stoakes (Oracle) Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Dev Jain Reviewed-by: Lance Yang Acked-by: David Hildenbrand (Arm) Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Baolin Wang Cc: Barry Song Cc: Brendan Jackman Cc: Byungchul Park Cc: Catalin Marinas Cc: David Rientjes Cc: Gregory Price Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Joshua Hahn Cc: Kefeng Wang Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Matthew Brost Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Nanyong Sun Cc: Pedro Falcato Cc: Peter Xu Cc: Rafael Aquini Cc: Rakie Kim Cc: Randy Dunlap Cc: Ryan Roberts Cc: Shivank Garg Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Takashi Iwai (SUSE) Cc: Thomas Hellström Cc: Usama Arif Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Wei Yang Cc: Will Deacon Cc: Yang Shi Cc: Zach O'Keefe Cc: Zi Yan Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ++++ mm/memory.c | 61 ++++++++++++++++++++++++++++++++++++------------ 2 files changed, 45 insertions(+), 20 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 25ba5816e02b..16a1ad9a3397 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4916,4 +4916,8 @@ static inline bool snapshot_page_is_faithful(const struct page_snapshot *ps) void snapshot_page(struct page_snapshot *ps, const struct page *page); +void map_anon_folio_pte_nopf(struct folio *folio, pte_t *pte, + struct vm_area_struct *vma, unsigned long addr, + bool uffd_wp); + #endif /* _LINUX_MM_H */ diff --git a/mm/memory.c b/mm/memory.c index f21c804b50bf..7c350a38fecf 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5197,6 +5197,37 @@ fallback: return folio_prealloc(vma->vm_mm, vma, vmf->address, true); } +void map_anon_folio_pte_nopf(struct folio *folio, pte_t *pte, + struct vm_area_struct *vma, unsigned long addr, + bool uffd_wp) +{ + const unsigned int nr_pages = folio_nr_pages(folio); + pte_t entry = folio_mk_pte(folio, vma->vm_page_prot); + + entry = pte_sw_mkyoung(entry); + + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry), vma); + if (uffd_wp) + entry = pte_mkuffd_wp(entry); + + folio_ref_add(folio, nr_pages - 1); + folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); + folio_add_lru_vma(folio, vma); + set_ptes(vma->vm_mm, addr, pte, entry, nr_pages); + update_mmu_cache_range(NULL, vma, addr, pte, nr_pages); +} + +static void map_anon_folio_pte_pf(struct folio *folio, pte_t *pte, + struct vm_area_struct *vma, unsigned long addr, bool uffd_wp) +{ + const unsigned int order = folio_order(folio); + + map_anon_folio_pte_nopf(folio, pte, vma, addr, uffd_wp); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1L << order); + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_ALLOC); +} + /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked.
@@ -5208,7 +5239,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) unsigned long addr = vmf->address; struct folio *folio; vm_fault_t ret = 0; - int nr_pages = 1; + int nr_pages; pte_t entry; /* File mapping without ->vm_ops ? */ @@ -5243,7 +5274,13 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_MISSING); } - goto setpte; + if (vmf_orig_pte_uffd_wp(vmf)) + entry = pte_mkuffd_wp(entry); + set_pte_at(vma->vm_mm, addr, vmf->pte, entry); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, addr, vmf->pte); + goto unlock; } /* Allocate our own private page. */ @@ -5267,11 +5304,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) */ __folio_mark_uptodate(folio); - entry = folio_mk_pte(folio, vma->vm_page_prot); - entry = pte_sw_mkyoung(entry); - if (vma->vm_flags & VM_WRITE) - entry = pte_mkwrite(pte_mkdirty(entry), vma); - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); if (!vmf->pte) goto release; @@ -5293,19 +5325,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) folio_put(folio); return handle_userfault(vmf, VM_UFFD_MISSING); } - - folio_ref_add(folio, nr_pages - 1); - add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); - count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC); - folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); - folio_add_lru_vma(folio, vma); -setpte: - if (vmf_orig_pte_uffd_wp(vmf)) - entry = pte_mkuffd_wp(entry); - set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages); - - /* No need to invalidate - it was non-present before */ - update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages); + map_anon_folio_pte_pf(folio, vmf->pte, vma, addr, + vmf_orig_pte_uffd_wp(vmf)); unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); -- cgit v1.2.3 From 6bc0987d0b508b3768808efafa1e90041713526b Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:18 +0000 Subject: mm/vma: add vma_flags_empty(), vma_flags_and(), vma_flags_diff_pair() Patch series "mm/vma: convert vm_flags_t to vma_flags_t in vma code", v4. This series converts a lot of the existing use of the legacy vm_flags_t data type to the new vma_flags_t type which replaces it. In order to do so it adds a number of additional helpers: * vma_flags_empty() - Determines whether a vma_flags_t value has no bits set. * vma_flags_and() - Performs a bitwise AND between two vma_flags_t values. * vma_flags_diff_pair() - Determines which flags are not shared between a pair of VMA flags (typically non-constant values). * append_vma_flags() - Similar to mk_vma_flags(), but allows a vma_flags_t value to be specified (typically a constant value), which is copied and has the additionally specified flags appended to it to create a new vma_flags_t value. * vma_flags_same() - Determines if a vma_flags_t value is exactly equal to a set of VMA flags. * vma_flags_same_mask() - Determines if a vma_flags_t value is exactly equal to another vma_flags_t value (typically constant). * vma_flags_same_pair() - Determines if a pair of vma_flags_t values are exactly equal to one another (typically both non-constant). * vma_flags_to_legacy() - Converts a vma_flags_t value to a vm_flags_t value, used to enable more iterative introduction of the use of vma_flags_t. * legacy_to_vma_flags() - Converts a vm_flags_t value to a vma_flags_t value, for the same purpose.
* vma_flags_test_single_mask() - Tests whether a vma_flags_t value contains the single flag specified in an input vma_flags_t flag mask, or if that flag mask is empty, is defined to return false. Useful for config-predicated VMA flag mask defines. * vma_test() - Tests whether a VMA's flags contain a specific singular VMA flag. * vma_test_any() - Tests whether a VMA's flags contain any of a set of VMA flags. * vma_test_any_mask() - Tests whether a VMA's flags contain any of the flags specified in another, typically constant, vma_flags_t value. * vma_test_single_mask() - Tests whether a VMA's flags contain the single flag specified in an input vma_flags_t flag mask, or if that flag mask is empty, is defined to return false. Useful for config-predicated VMA flag mask defines. * vma_clear_flags() - Clears a specific set of VMA flags from a vma_flags_t value. * vma_clear_flags_mask() - Clears those flags set in a vma_flags_t value (typically constant) from a (typically not constant) vma_flags_t value. The series mostly focuses on the VMA-specific code, especially that contained in mm/vma.c and mm/vma.h. It updates both brk() and mmap() logic to use vma_flags_t values as much as is practically possible at this point, changing surrounding logic to be able to do so. It also updates the vma_modify_xxx() functions where they interact with VMA flags directly to use vma_flags_t values where possible. There is extensive testing added in the VMA userland tests to assert that all of these new VMA flag functions work correctly. This patch (of 25): Firstly, add the ability to determine if VMA flags are empty, that is no flags are set in a vma_flags_t value. Next, add the ability to obtain the equivalent of the bitwise AND of two vma_flags_t values, via vma_flags_and_mask(). Next, add the ability to obtain the difference between two sets of VMA flags, that is, the equivalent of the exclusive bitwise OR of the two sets of flags, via vma_flags_diff_pair(). vma_flags_xxx_mask() typically operates on a pointer to a vma_flags_t value, which is assumed to be an lvalue of some kind (such as a field in a struct or a stack variable), and an rvalue of some kind (typically a constant set of VMA flags obtained e.g. via mk_vma_flags() or equivalent). However, vma_flags_diff_pair() is intended to operate on two lvalues, so use the _pair() suffix to make this clear. Finally, update VMA userland tests to add these helpers. We also port bitmap_xor() and __bitmap_xor() to the tools/ headers and source to allow the tests to work with vma_flags_diff_pair(). Link: https://lkml.kernel.org/r/cover.1774034900.git.ljs@kernel.org Link: https://lkml.kernel.org/r/53ab55b7da91425775e42c03177498ad6de88ef4.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H.
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 60 ++++++++++++++++++++++++++++++++++------- include/linux/mm_types.h | 8 ++++++ tools/include/linux/bitmap.h | 13 +++++++++ tools/lib/bitmap.c | 10 +++++++ tools/testing/vma/include/dup.h | 36 ++++++++++++++++++++++++- 5 files changed, 117 insertions(+), 10 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 16a1ad9a3397..7954a7a2b811 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1048,6 +1048,19 @@ static __always_inline vma_flags_t __mk_vma_flags(size_t count, return flags; } +/* + * Helper macro which bitwise-or combines the specified input flags into a + * vma_flags_t bitmap value. E.g.: + * + * vma_flags_t flags = mk_vma_flags(VMA_IO_BIT, VMA_PFNMAP_BIT, + * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT); + * + * The compiler cleverly optimises away all of the work and this ends up being + * equivalent to aggregating the values manually. + */ +#define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ + (const vma_flag_t []){__VA_ARGS__}) + /* * Test whether a specific VMA flag is set, e.g.: * @@ -1062,17 +1075,30 @@ static __always_inline bool vma_flags_test(const vma_flags_t *flags, } /* - * Helper macro which bitwise-or combines the specified input flags into a - * vma_flags_t bitmap value. E.g.: - * - * vma_flags_t flags = mk_vma_flags(VMA_IO_BIT, VMA_PFNMAP_BIT, - * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT); + * Obtain a set of VMA flags which contain the overlapping flags contained + * within flags and to_and. + */ +static __always_inline vma_flags_t vma_flags_and_mask(const vma_flags_t *flags, + vma_flags_t to_and) +{ + vma_flags_t dst; + unsigned long *bitmap_dst = dst.__vma_flags; + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_to_and = to_and.__vma_flags; + + bitmap_and(bitmap_dst, bitmap, bitmap_to_and, NUM_VMA_FLAG_BITS); + return dst; +} + +/* + * Obtain a set of VMA flags which contains the specified overlapping flags, + * e.g.: * - * The compiler cleverly optimises away all of the work and this ends up being - * equivalent to aggregating the values manually. + * vma_flags_t read_flags = vma_flags_and(&flags, VMA_READ_BIT, + * VMA_MAY_READ_BIT); */ -#define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ - (const vma_flag_t []){__VA_ARGS__}) +#define vma_flags_and(flags, ...) \ + vma_flags_and_mask(flags, mk_vma_flags(__VA_ARGS__)) /* Test each of to_test flags in flags, non-atomically. */ static __always_inline bool vma_flags_test_any_mask(const vma_flags_t *flags, @@ -1146,6 +1172,22 @@ static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, #define vma_flags_clear(flags, ...) \ vma_flags_clear_mask(flags, mk_vma_flags(__VA_ARGS__)) +/* + * Obtain a VMA flags value containing those flags that are present in flags or + * flags_other but not in both. 
+ */ +static __always_inline vma_flags_t vma_flags_diff_pair(const vma_flags_t *flags, + const vma_flags_t *flags_other) +{ + vma_flags_t dst; + const unsigned long *bitmap_other = flags_other->__vma_flags; + const unsigned long *bitmap = flags->__vma_flags; + unsigned long *bitmap_dst = dst.__vma_flags; + + bitmap_xor(bitmap_dst, bitmap, bitmap_other, NUM_VMA_FLAG_BITS); + return dst; +} + /* * Helper to test that ALL specified flags are set in a VMA. * diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f22aecb047b7..321aa150c1ee 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -870,6 +870,14 @@ typedef struct { #define EMPTY_VMA_FLAGS ((vma_flags_t){ }) +/* Are no flags set in the specified VMA flags? */ +static __always_inline bool vma_flags_empty(const vma_flags_t *flags) +{ + const unsigned long *bitmap = flags->__vma_flags; + + return bitmap_empty(bitmap, NUM_VMA_FLAG_BITS); +} + /* * Describes a VMA that is about to be mmap()'ed. Drivers may choose to * manipulate mutable fields which will cause those fields to be updated in the diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index 250883090a5d..845eda759f67 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -28,6 +28,8 @@ bool __bitmap_subset(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); +void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) @@ -209,4 +211,15 @@ static inline void bitmap_clear(unsigned long *map, unsigned int start, else __bitmap_clear(map, start, nbits); } + +static __always_inline +void bitmap_xor(unsigned long *dst, const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) +{ + if (small_const_nbits(nbits)) + *dst = *src1 ^ *src2; + else + __bitmap_xor(dst, src1, src2, nbits); +} + #endif /* _TOOLS_LINUX_BITMAP_H */ diff --git a/tools/lib/bitmap.c b/tools/lib/bitmap.c index aa83d22c45e3..fedc9070f0e4 100644 --- a/tools/lib/bitmap.c +++ b/tools/lib/bitmap.c @@ -169,3 +169,13 @@ bool __bitmap_subset(const unsigned long *bitmap1, return false; return true; } + +void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int bits) +{ + unsigned int k; + unsigned int nr = BITS_TO_LONGS(bits); + + for (k = 0; k < nr; k++) + dst[k] = bitmap1[k] ^ bitmap2[k]; +} diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 8865ffe046d8..8091a5caaeb8 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -422,6 +422,13 @@ struct vma_iterator { #define MAPCOUNT_ELF_CORE_MARGIN (5) #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) +static __always_inline bool vma_flags_empty(const vma_flags_t *flags) +{ + const unsigned long *bitmap = flags->__vma_flags; + + return bitmap_empty(bitmap, NUM_VMA_FLAG_BITS); +} + /* What action should be taken after an .mmap_prepare call is complete? */ enum mmap_action_type { MMAP_NOTHING, /* Mapping is complete, no further action. 
*/ @@ -855,6 +862,21 @@ static __always_inline bool vma_flags_test(const vma_flags_t *flags, return test_bit((__force int)bit, bitmap); } +static __always_inline vma_flags_t vma_flags_and_mask(const vma_flags_t *flags, + vma_flags_t to_and) +{ + vma_flags_t dst; + unsigned long *bitmap_dst = dst.__vma_flags; + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_to_and = to_and.__vma_flags; + + bitmap_and(bitmap_dst, bitmap, bitmap_to_and, NUM_VMA_FLAG_BITS); + return dst; +} + +#define vma_flags_and(flags, ...) \ + vma_flags_and_mask(flags, mk_vma_flags(__VA_ARGS__)) + static __always_inline bool vma_flags_test_any_mask(const vma_flags_t *flags, vma_flags_t to_test) { @@ -901,8 +923,20 @@ static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t #define vma_flags_clear(flags, ...) \ vma_flags_clear_mask(flags, mk_vma_flags(__VA_ARGS__)) +static __always_inline vma_flags_t vma_flags_diff_pair(const vma_flags_t *flags, + const vma_flags_t *flags_other) +{ + vma_flags_t dst; + const unsigned long *bitmap_other = flags_other->__vma_flags; + const unsigned long *bitmap = flags->__vma_flags; + unsigned long *bitmap_dst = dst.__vma_flags; + + bitmap_xor(bitmap_dst, bitmap, bitmap_other, NUM_VMA_FLAG_BITS); + return dst; +} + static inline bool vma_test_all_mask(const struct vm_area_struct *vma, - vma_flags_t flags) + vma_flags_t flags) { return vma_flags_test_all_mask(&vma->flags, flags); } -- cgit v1.2.3 From 7ec1885a7e283caaf6566aedc1eea5988d545f97 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:22 +0000 Subject: mm/vma: use new VMA flags for sticky flags logic Use the new vma_flags_t flags implementation to perform the logic around sticky flags and what flags are ignored on VMA merge. We make use of the new vma_flags_empty(), vma_flags_diff_pair(), and vma_flags_and_mask() functionality. Also update the VMA tests accordingly. Link: https://lkml.kernel.org/r/369574f06360ffa44707047e3b58eb4897345fba.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 32 ++++++++++++++----------- mm/vma.c | 48 ++++++++++++++++++++++++++++---------- tools/testing/vma/include/custom.h | 5 ---- tools/testing/vma/include/dup.h | 9 +++++-- 4 files changed, 62 insertions(+), 32 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7954a7a2b811..d7e647e31742 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -540,6 +540,7 @@ enum { /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) +#define VMA_ACCESS_FLAGS mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT) /* * Special vmas that are non-mergable, non-mlock()able. @@ -585,27 +586,32 @@ enum { * possesses it but the other does not, the merged VMA should nonetheless have * applied to it: * - * VM_SOFTDIRTY - if a VMA is marked soft-dirty, that is has not had its - * references cleared via /proc/$pid/clear_refs, any merged VMA - * should be considered soft-dirty also as it operates at a VMA - * granularity. + * VMA_SOFTDIRTY_BIT - if a VMA is marked soft-dirty, that is has not had its + * references cleared via /proc/$pid/clear_refs, any + * merged VMA should be considered soft-dirty also as it + * operates at a VMA granularity. * - * VM_MAYBE_GUARD - If a VMA may have guard regions in place it implies that - * mapped page tables may contain metadata not described by the - * VMA and thus any merged VMA may also contain this metadata, - * and thus we must make this flag sticky. + * VMA_MAYBE_GUARD_BIT - If a VMA may have guard regions in place it implies + * that mapped page tables may contain metadata not + * described by the VMA and thus any merged VMA may also + * contain this metadata, and thus we must make this flag + * sticky. */ -#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD) +#ifdef CONFIG_MEM_SOFT_DIRTY +#define VMA_STICKY_FLAGS mk_vma_flags(VMA_SOFTDIRTY_BIT, VMA_MAYBE_GUARD_BIT) +#else +#define VMA_STICKY_FLAGS mk_vma_flags(VMA_MAYBE_GUARD_BIT) +#endif /* * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one * of these flags and the other not does not preclude a merge. * - * VM_STICKY - When merging VMAs, VMA flags must match, unless they are - * 'sticky'. If any sticky flags exist in either VMA, we simply - * set all of them on the merged VMA. + * VMA_STICKY_FLAGS - When merging VMAs, VMA flags must match, unless they + * are 'sticky'. If any sticky flags exist in either VMA, + * we simply set all of them on the merged VMA. */ -#define VM_IGNORE_MERGE VM_STICKY +#define VMA_IGNORE_MERGE_FLAGS VMA_STICKY_FLAGS /* * Flags which should result in page tables being copied on fork. These are diff --git a/mm/vma.c b/mm/vma.c index 4d21e7d8e93c..6af26619e020 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -86,10 +86,15 @@ static bool vma_is_fork_child(struct vm_area_struct *vma) static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next) { struct vm_area_struct *vma = merge_next ? 
vmg->next : vmg->prev; + vma_flags_t diff; if (!mpol_equal(vmg->policy, vma_policy(vma))) return false; - if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_IGNORE_MERGE) + + diff = vma_flags_diff_pair(&vma->flags, &vmg->vma_flags); + vma_flags_clear_mask(&diff, VMA_IGNORE_MERGE_FLAGS); + + if (!vma_flags_empty(&diff)) return false; if (vma->vm_file != vmg->file) return false; @@ -805,7 +810,8 @@ static bool can_merge_remove_vma(struct vm_area_struct *vma) static __must_check struct vm_area_struct *vma_merge_existing_range( struct vma_merge_struct *vmg) { - vm_flags_t sticky_flags = vmg->vm_flags & VM_STICKY; + vma_flags_t sticky_flags = vma_flags_and_mask(&vmg->vma_flags, + VMA_STICKY_FLAGS); struct vm_area_struct *middle = vmg->middle; struct vm_area_struct *prev = vmg->prev; struct vm_area_struct *next; @@ -898,15 +904,22 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( vma_start_write(middle); if (merge_right) { + vma_flags_t next_sticky; + vma_start_write(next); vmg->target = next; - sticky_flags |= (next->vm_flags & VM_STICKY); + next_sticky = vma_flags_and_mask(&next->flags, VMA_STICKY_FLAGS); + vma_flags_set_mask(&sticky_flags, next_sticky); } if (merge_left) { + vma_flags_t prev_sticky; + vma_start_write(prev); vmg->target = prev; - sticky_flags |= (prev->vm_flags & VM_STICKY); + + prev_sticky = vma_flags_and_mask(&prev->flags, VMA_STICKY_FLAGS); + vma_flags_set_mask(&sticky_flags, prev_sticky); } if (merge_both) { @@ -976,7 +989,7 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( if (err || commit_merge(vmg)) goto abort; - vm_flags_set(vmg->target, sticky_flags); + vma_set_flags_mask(vmg->target, sticky_flags); khugepaged_enter_vma(vmg->target, vmg->vm_flags); vmg->state = VMA_MERGE_SUCCESS; return vmg->target; @@ -1154,12 +1167,16 @@ int vma_expand(struct vma_merge_struct *vmg) struct vm_area_struct *target = vmg->target; struct vm_area_struct *next = vmg->next; bool remove_next = false; - vm_flags_t sticky_flags; + vma_flags_t sticky_flags = + vma_flags_and_mask(&vmg->vma_flags, VMA_STICKY_FLAGS); + vma_flags_t target_sticky; int ret = 0; mmap_assert_write_locked(vmg->mm); vma_start_write(target); + target_sticky = vma_flags_and_mask(&target->flags, VMA_STICKY_FLAGS); + if (next && target != next && vmg->end == next->vm_end) remove_next = true; @@ -1174,10 +1191,7 @@ int vma_expand(struct vma_merge_struct *vmg) VM_WARN_ON_VMG(target->vm_start < vmg->start || target->vm_end > vmg->end, vmg); - sticky_flags = vmg->vm_flags & VM_STICKY; - sticky_flags |= target->vm_flags & VM_STICKY; - if (remove_next) - sticky_flags |= next->vm_flags & VM_STICKY; + vma_flags_set_mask(&sticky_flags, target_sticky); /* * If we are removing the next VMA or copying from a VMA @@ -1194,13 +1208,18 @@ int vma_expand(struct vma_merge_struct *vmg) return ret; if (remove_next) { + vma_flags_t next_sticky; + vma_start_write(next); vmg->__remove_next = true; + + next_sticky = vma_flags_and_mask(&next->flags, VMA_STICKY_FLAGS); + vma_flags_set_mask(&sticky_flags, next_sticky); } if (commit_merge(vmg)) goto nomem; - vm_flags_set(target, sticky_flags); + vma_set_flags_mask(target, sticky_flags); return 0; nomem: @@ -1950,10 +1969,15 @@ out: */ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) { + vma_flags_t diff = vma_flags_diff_pair(&a->flags, &b->flags); + + vma_flags_clear_mask(&diff, VMA_ACCESS_FLAGS); + vma_flags_clear_mask(&diff, VMA_IGNORE_MERGE_FLAGS); + return a->vm_end == b->vm_start && mpol_equal(vma_policy(a), vma_policy(b)) && 
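		/*
		 * Same backing file, and no flag differences beyond the
		 * access and merge-ignorable bits masked out of diff above:
		 */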
a->vm_file == b->vm_file && - !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_IGNORE_MERGE)) && + vma_flags_empty(&diff) && b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); } diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h index 6200f938e586..7cdd0f60600a 100644 --- a/tools/testing/vma/include/custom.h +++ b/tools/testing/vma/include/custom.h @@ -134,8 +134,3 @@ static __always_inline bool vma_flags_same_mask(vma_flags_t *flags, vma_flags_same_mask(flags, mk_vma_flags(__VA_ARGS__)) #define VMA_SPECIAL_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_DONTEXPAND_BIT, \ VMA_PFNMAP_BIT, VMA_MIXEDMAP_BIT) -#ifdef CONFIG_MEM_SOFT_DIRTY -#define VMA_STICKY_FLAGS mk_vma_flags(VMA_SOFTDIRTY_BIT, VMA_MAYBE_GUARD_BIT) -#else -#define VMA_STICKY_FLAGS mk_vma_flags(VMA_MAYBE_GUARD_BIT) -#endif diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 1dee78c34872..65134303b645 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -338,6 +338,7 @@ enum { /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) +#define VMA_ACCESS_FLAGS mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT) /* * Special vmas that are non-mergable, non-mlock()able. @@ -363,9 +364,13 @@ enum { #define CAP_IPC_LOCK 14 -#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD) +#ifdef CONFIG_MEM_SOFT_DIRTY +#define VMA_STICKY_FLAGS mk_vma_flags(VMA_SOFTDIRTY_BIT, VMA_MAYBE_GUARD_BIT) +#else +#define VMA_STICKY_FLAGS mk_vma_flags(VMA_MAYBE_GUARD_BIT) +#endif -#define VM_IGNORE_MERGE VM_STICKY +#define VMA_IGNORE_MERGE_FLAGS VMA_STICKY_FLAGS #define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD) -- cgit v1.2.3 From e8d464f4a94ccbcae8c9d3137ac5621b57ddd8a1 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:24 +0000 Subject: mm/vma: add append_vma_flags() helper In order to be able to efficiently combine VMA flag masks with additional VMA flag bits we need to extend the concept introduced in mk_vma_flags() and __mk_vma_flags() by allowing the specification of a VMA flag mask to append VMA flag bits to. Update __mk_vma_flags() to allow for this and update mk_vma_flags() accordingly, and also provide append_vma_flags() to allow for the caller to specify which VMA flags mask to append to. Finally, update the VMA flags tests to reflect the change. Link: https://lkml.kernel.org/r/9f928cd4688270002f2c0c3777fcc9b49cc7a8ea.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 20 ++++++++++++++------ tools/testing/vma/include/dup.h | 14 +++++++------- 2 files changed, 21 insertions(+), 13 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index d7e647e31742..26cfb2fbe4db 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1042,13 +1042,11 @@ static __always_inline void vma_flags_set_flag(vma_flags_t *flags, __set_bit((__force int)bit, bitmap); } -static __always_inline vma_flags_t __mk_vma_flags(size_t count, - const vma_flag_t *bits) +static __always_inline vma_flags_t __mk_vma_flags(vma_flags_t flags, + size_t count, const vma_flag_t *bits) { - vma_flags_t flags; int i; - vma_flags_clear_all(&flags); for (i = 0; i < count; i++) vma_flags_set_flag(&flags, bits[i]); return flags; @@ -1064,8 +1062,18 @@ static __always_inline vma_flags_t __mk_vma_flags(size_t count, * The compiler cleverly optimises away all of the work and this ends up being * equivalent to aggregating the values manually. */ -#define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ - (const vma_flag_t []){__VA_ARGS__}) +#define mk_vma_flags(...) __mk_vma_flags(EMPTY_VMA_FLAGS, \ + COUNT_ARGS(__VA_ARGS__), (const vma_flag_t []){__VA_ARGS__}) + +/* + * Helper macro which acts like mk_vma_flags, only appending to a copy of the + * specified flags rather than establishing new flags. E.g.: + * + * vma_flags_t flags = append_vma_flags(VMA_STACK_DEFAULT_FLAGS, VMA_STACK_BIT, + * VMA_ACCOUNT_BIT); + */ +#define append_vma_flags(flags, ...) __mk_vma_flags(flags, \ + COUNT_ARGS(__VA_ARGS__), (const vma_flag_t []){__VA_ARGS__}) /* * Test whether a specific VMA flag is set, e.g.: diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 3005e33d1ede..a2f311b5ea82 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -854,21 +854,21 @@ static inline void vm_flags_clear(struct vm_area_struct *vma, vma_flags_clear_word(&vma->flags, flags); } -static __always_inline vma_flags_t __mk_vma_flags(size_t count, - const vma_flag_t *bits) +static __always_inline vma_flags_t __mk_vma_flags(vma_flags_t flags, + size_t count, const vma_flag_t *bits) { - vma_flags_t flags; int i; - vma_flags_clear_all(&flags); for (i = 0; i < count; i++) vma_flags_set_flag(&flags, bits[i]); - return flags; } -#define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ - (const vma_flag_t []){__VA_ARGS__}) +#define mk_vma_flags(...) __mk_vma_flags(EMPTY_VMA_FLAGS, \ + COUNT_ARGS(__VA_ARGS__), (const vma_flag_t []){__VA_ARGS__}) + +#define append_vma_flags(flags, ...) 
__mk_vma_flags(flags, \ + COUNT_ARGS(__VA_ARGS__), (const vma_flag_t []){__VA_ARGS__}) static __always_inline bool vma_flags_test(const vma_flags_t *flags, vma_flag_t bit) -- cgit v1.2.3 From 5fb55e951cf591c5e2d45273ceadbdcd0c44932c Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:26 +0000 Subject: mm: unexport vm_brk_flags() and eliminate vm_flags parameter This function is only used by elf_load(), and that is a static function that doesn't need an exported symbol to invoke an internal function, so un-EXPORT_SYMBOLS() it. Also, the vm_flags parameter is unnecessary, as we only ever set VM_EXEC, so simply make this parameter a boolean. While we're here, clean up the mm.h definitions for the various vm_xxx() helpers so we actually specify parameter names and elide the redundant extern's. Link: https://lkml.kernel.org/r/7bada48ddf3f9dbd3e6c4fc50ec2f4de97706f52.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- fs/binfmt_elf.c | 3 +-- include/linux/mm.h | 12 ++++++------ mm/mmap.c | 8 ++------ 3 files changed, 9 insertions(+), 14 deletions(-) (limited to 'include/linux/mm.h') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index fb857faaf0d6..16a56b6b3f6c 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -453,14 +453,13 @@ static unsigned long elf_load(struct file *filep, unsigned long addr, zero_end = ELF_PAGEALIGN(zero_end); error = vm_brk_flags(zero_start, zero_end - zero_start, - prot & PROT_EXEC ? 
VM_EXEC : 0); + prot & PROT_EXEC); if (error) map_addr = error; } return map_addr; } - static unsigned long total_mapping_size(const struct elf_phdr *phdr, int nr) { elf_addr_t min_addr = -1; diff --git a/include/linux/mm.h b/include/linux/mm.h index 26cfb2fbe4db..5b85ffc2760c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3991,12 +3991,12 @@ static inline void mm_populate(unsigned long addr, unsigned long len) {} #endif /* This takes the mm semaphore itself */ -extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long); -extern int vm_munmap(unsigned long, size_t); -extern unsigned long __must_check vm_mmap(struct file *, unsigned long, - unsigned long, unsigned long, - unsigned long, unsigned long); -extern unsigned long __must_check vm_mmap_shadow_stack(unsigned long addr, +int __must_check vm_brk_flags(unsigned long addr, unsigned long request, bool is_exec); +int vm_munmap(unsigned long start, size_t len); +unsigned long __must_check vm_mmap(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long offset); +unsigned long __must_check vm_mmap_shadow_stack(unsigned long addr, unsigned long len, unsigned long flags); struct vm_unmapped_area_info { diff --git a/mm/mmap.c b/mm/mmap.c index 79544d893411..2d2b814978bf 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1201,8 +1201,9 @@ out: return ret; } -int vm_brk_flags(unsigned long addr, unsigned long request, vm_flags_t vm_flags) +int vm_brk_flags(unsigned long addr, unsigned long request, bool is_exec) { + const vm_flags_t vm_flags = is_exec ? VM_EXEC : 0; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; unsigned long len; @@ -1217,10 +1218,6 @@ int vm_brk_flags(unsigned long addr, unsigned long request, vm_flags_t vm_flags) if (!len) return 0; - /* Until we need other flags, refuse anything except VM_EXEC. */ - if ((vm_flags & (~VM_EXEC)) != 0) - return -EINVAL; - if (mmap_write_lock_killable(mm)) return -EINTR; @@ -1246,7 +1243,6 @@ limits_failed: mmap_write_unlock(mm); return ret; } -EXPORT_SYMBOL(vm_brk_flags); static unsigned long tear_down_vmas(struct mm_struct *mm, struct vma_iterator *vmi, -- cgit v1.2.3 From 3ee584538259c356c66146ac46f2e4fd2ba28bee Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:27 +0000 Subject: mm/vma: introduce vma_flags_same[_mask/_pair]() Add helpers to determine if two sets of VMA flags are precisely the same, that is - that every flag set one is set in another, and neither contain any flags not set in the other. We also introduce vma_flags_same_pair() for cases where we want to compare two sets of VMA flags which are both non-const values. Also update the VMA tests to reflect the change, we already implicitly test that this functions correctly having used it for testing purposes previously. Link: https://lkml.kernel.org/r/4f764bf619e77205837c7c819b62139ef6337ca3.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 28 ++++++++++++++++++++++++++++ tools/testing/vma/include/custom.h | 11 ----------- tools/testing/vma/include/dup.h | 21 +++++++++++++++++++++ 3 files changed, 49 insertions(+), 11 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5b85ffc2760c..1f3e9100164d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1202,6 +1202,34 @@ static __always_inline vma_flags_t vma_flags_diff_pair(const vma_flags_t *flags, return dst; } +/* Determine if flags and flags_other have precisely the same flags set. */ +static __always_inline bool vma_flags_same_pair(const vma_flags_t *flags, + const vma_flags_t *flags_other) +{ + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_other = flags_other->__vma_flags; + + return bitmap_equal(bitmap, bitmap_other, NUM_VMA_FLAG_BITS); +} + +/* Determine if flags and flags_other have precisely the same flags set. */ +static __always_inline bool vma_flags_same_mask(const vma_flags_t *flags, + vma_flags_t flags_other) +{ + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_other = flags_other.__vma_flags; + + return bitmap_equal(bitmap, bitmap_other, NUM_VMA_FLAG_BITS); +} + +/* + * Helper macro to determine if only the specific flags are set, e.g.: + * + * if (vma_flags_same(&flags, VMA_WRITE_BIT) { ... } + */ +#define vma_flags_same(flags, ...) \ + vma_flags_same_mask(flags, mk_vma_flags(__VA_ARGS__)) + /* * Helper to test that ALL specified flags are set in a VMA. * diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h index 8f33df02816a..2c498e713fbd 100644 --- a/tools/testing/vma/include/custom.h +++ b/tools/testing/vma/include/custom.h @@ -102,16 +102,5 @@ static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) return PAGE_SIZE; } -/* Place here until needed in the kernel code. */ -static __always_inline bool vma_flags_same_mask(vma_flags_t *flags, - vma_flags_t flags_other) -{ - const unsigned long *bitmap = flags->__vma_flags; - const unsigned long *bitmap_other = flags_other.__vma_flags; - - return bitmap_equal(bitmap, bitmap_other, NUM_VMA_FLAG_BITS); -} -#define vma_flags_same(flags, ...) 
\ - vma_flags_same_mask(flags, mk_vma_flags(__VA_ARGS__)) #define VMA_SPECIAL_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_DONTEXPAND_BIT, \ VMA_PFNMAP_BIT, VMA_MIXEDMAP_BIT) diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 802b3d97b627..65f630923461 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -954,6 +954,27 @@ static __always_inline vma_flags_t vma_flags_diff_pair(const vma_flags_t *flags, return dst; } +static __always_inline bool vma_flags_same_pair(const vma_flags_t *flags, + const vma_flags_t *flags_other) +{ + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_other = flags_other->__vma_flags; + + return bitmap_equal(bitmap, bitmap_other, NUM_VMA_FLAG_BITS); +} + +static __always_inline bool vma_flags_same_mask(const vma_flags_t *flags, + vma_flags_t flags_other) +{ + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_other = flags_other.__vma_flags; + + return bitmap_equal(bitmap, bitmap_other, NUM_VMA_FLAG_BITS); +} + +#define vma_flags_same(flags, ...) \ + vma_flags_same_mask(flags, mk_vma_flags(__VA_ARGS__)) + static inline bool vma_test_all_mask(const struct vm_area_struct *vma, vma_flags_t flags) { -- cgit v1.2.3 From fb67bba5d9b8561f433695c8916c097910193561 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:30 +0000 Subject: mm/vma: introduce vma_test[_any[_mask]](), and make inlining consistent Introduce helper functions and macros to make it convenient to test flags and flag masks for VMAs, specifically: * vma_test() - determine if a single VMA flag is set in a VMA. * vma_test_any_mask() - determine if any flags in a vma_flags_t value are set in a VMA. * vma_test_any() - Helper macro to test if any of specific flags are set. Also, there are a mix of 'inline's and '__always_inline's in VMA helper function declarations, update to consistently use __always_inline. Finally, update the VMA tests to reflect the changes. Link: https://lkml.kernel.org/r/be1d71f08307d747a82232cbd8664a88c0f41419.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 49 +++++++++++++++++++++++++++------ include/linux/mm_types.h | 12 +++++--- tools/testing/vma/include/dup.h | 61 +++++++++++++++++++++++++++-------------- 3 files changed, 88 insertions(+), 34 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1f3e9100164d..f704d7cf2871 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -994,7 +994,8 @@ static inline void vm_flags_mod(struct vm_area_struct *vma, __vm_flags_mod(vma, set, clear); } -static inline bool __vma_atomic_valid_flag(struct vm_area_struct *vma, vma_flag_t bit) +static __always_inline bool __vma_atomic_valid_flag(struct vm_area_struct *vma, + vma_flag_t bit) { const vm_flags_t mask = BIT((__force int)bit); @@ -1009,7 +1010,8 @@ static inline bool __vma_atomic_valid_flag(struct vm_area_struct *vma, vma_flag_ * Set VMA flag atomically. Requires only VMA/mmap read lock. Only specific * valid flags are allowed to do this. */ -static inline void vma_set_atomic_flag(struct vm_area_struct *vma, vma_flag_t bit) +static __always_inline void vma_set_atomic_flag(struct vm_area_struct *vma, + vma_flag_t bit) { unsigned long *bitmap = vma->flags.__vma_flags; @@ -1025,7 +1027,8 @@ static inline void vma_set_atomic_flag(struct vm_area_struct *vma, vma_flag_t bi * This is necessarily racey, so callers must ensure that serialisation is * achieved through some other means, or that races are permissible. */ -static inline bool vma_test_atomic_flag(struct vm_area_struct *vma, vma_flag_t bit) +static __always_inline bool vma_test_atomic_flag(struct vm_area_struct *vma, + vma_flag_t bit) { if (__vma_atomic_valid_flag(vma, bit)) return test_bit((__force int)bit, &vma->vm_flags); @@ -1230,13 +1233,41 @@ static __always_inline bool vma_flags_same_mask(const vma_flags_t *flags, #define vma_flags_same(flags, ...) \ vma_flags_same_mask(flags, mk_vma_flags(__VA_ARGS__)) +/* + * Test whether a specific flag in the VMA is set, e.g.: + * + * if (vma_test(vma, VMA_READ_BIT)) { ... } + */ +static __always_inline bool vma_test(const struct vm_area_struct *vma, + vma_flag_t bit) +{ + return vma_flags_test(&vma->flags, bit); +} + +/* Helper to test any VMA flags in a VMA . */ +static __always_inline bool vma_test_any_mask(const struct vm_area_struct *vma, + vma_flags_t flags) +{ + return vma_flags_test_any_mask(&vma->flags, flags); +} + +/* + * Helper macro for testing whether any VMA flags are set in a VMA, + * e.g.: + * + * if (vma_test_any(vma, VMA_IO_BIT, VMA_PFNMAP_BIT, + * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT)) { ... } + */ +#define vma_test_any(vma, ...) \ + vma_test_any_mask(vma, mk_vma_flags(__VA_ARGS__)) + /* * Helper to test that ALL specified flags are set in a VMA. * * Note: appropriate locks must be held, this function does not acquire them for * you. 
*/ -static inline bool vma_test_all_mask(const struct vm_area_struct *vma, +static __always_inline bool vma_test_all_mask(const struct vm_area_struct *vma, vma_flags_t flags) { return vma_flags_test_all_mask(&vma->flags, flags); @@ -1256,7 +1287,7 @@ static inline bool vma_test_all_mask(const struct vm_area_struct *vma, * Note: appropriate locks must be held, this function does not acquire them for * you. */ -static inline void vma_set_flags_mask(struct vm_area_struct *vma, +static __always_inline void vma_set_flags_mask(struct vm_area_struct *vma, vma_flags_t flags) { vma_flags_set_mask(&vma->flags, flags); @@ -1286,7 +1317,7 @@ static __always_inline bool vma_desc_test(const struct vm_area_desc *desc, } /* Helper to test any VMA flags in a VMA descriptor. */ -static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, +static __always_inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, vma_flags_t flags) { return vma_flags_test_any_mask(&desc->vma_flags, flags); @@ -1303,7 +1334,7 @@ static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, vma_desc_test_any_mask(desc, mk_vma_flags(__VA_ARGS__)) /* Helper to test all VMA flags in a VMA descriptor. */ -static inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, +static __always_inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, vma_flags_t flags) { return vma_flags_test_all_mask(&desc->vma_flags, flags); @@ -1319,7 +1350,7 @@ static inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, vma_desc_test_all_mask(desc, mk_vma_flags(__VA_ARGS__)) /* Helper to set all VMA flags in a VMA descriptor. */ -static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, +static __always_inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, vma_flags_t flags) { vma_flags_set_mask(&desc->vma_flags, flags); @@ -1336,7 +1367,7 @@ static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) /* Helper to clear all VMA flags in a VMA descriptor. */ -static inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, +static __always_inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, vma_flags_t flags) { vma_flags_clear_mask(&desc->vma_flags, flags); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1da8fb04133f..38fe6b915024 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1087,7 +1087,8 @@ static __always_inline vm_flags_t vma_flags_to_legacy(vma_flags_t flags) * IMPORTANT: This does not overwrite bytes past the first system word. The * caller must account for this. */ -static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value) +static __always_inline void vma_flags_overwrite_word(vma_flags_t *flags, + unsigned long value) { unsigned long *bitmap = flags->__vma_flags; @@ -1114,7 +1115,8 @@ static __always_inline vma_flags_t legacy_to_vma_flags(vm_flags_t flags) * IMPORTANT: This does not overwrite bytes past the first system word. The * caller must account for this. */ -static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value) +static __always_inline void vma_flags_overwrite_word_once(vma_flags_t *flags, + unsigned long value) { unsigned long *bitmap = flags->__vma_flags; @@ -1122,7 +1124,8 @@ static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned lo } /* Update the first system word of VMA flags setting bits, non-atomically. 
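 * A sketch: value is a legacy vm_flags_t-style word, so e.g.
 *
 *	vma_flags_set_word(&vma->flags, VM_READ | VM_WRITE);
 *
 * ORs the corresponding low-order flag bits into the bitmap.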
*/ -static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) +static __always_inline void vma_flags_set_word(vma_flags_t *flags, + unsigned long value) { unsigned long *bitmap = flags->__vma_flags; @@ -1130,7 +1133,8 @@ static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) } /* Update the first system word of VMA flags clearing bits, non-atomically. */ -static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value) +static __always_inline void vma_flags_clear_word(vma_flags_t *flags, + unsigned long value) { unsigned long *bitmap = flags->__vma_flags; diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index f49af21319ba..f9fe07a8a443 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -764,7 +764,8 @@ static inline bool mm_flags_test(int flag, const struct mm_struct *mm) * IMPORTANT: This does not overwrite bytes past the first system word. The * caller must account for this. */ -static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value) +static __always_inline void vma_flags_overwrite_word(vma_flags_t *flags, + unsigned long value) { unsigned long *bitmap = flags->__vma_flags; @@ -777,7 +778,8 @@ static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long va * IMPORTANT: This does not overwrite bytes past the first system word. The * caller must account for this. */ -static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value) +static __always_inline void vma_flags_overwrite_word_once(vma_flags_t *flags, + unsigned long value) { unsigned long *bitmap = flags->__vma_flags; @@ -785,7 +787,8 @@ static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned lo } /* Update the first system word of VMA flags setting bits, non-atomically. */ -static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) +static __always_inline void vma_flags_set_word(vma_flags_t *flags, + unsigned long value) { unsigned long *bitmap = flags->__vma_flags; @@ -793,7 +796,8 @@ static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) } /* Update the first system word of VMA flags clearing bits, non-atomically. */ -static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value) +static __always_inline void vma_flags_clear_word(vma_flags_t *flags, + unsigned long value) { unsigned long *bitmap = flags->__vma_flags; @@ -1003,23 +1007,32 @@ static __always_inline bool vma_flags_same_mask(const vma_flags_t *flags, #define vma_flags_same(flags, ...) \ vma_flags_same_mask(flags, mk_vma_flags(__VA_ARGS__)) -static inline bool vma_test_all_mask(const struct vm_area_struct *vma, - vma_flags_t flags) +static __always_inline bool vma_test(const struct vm_area_struct *vma, + vma_flag_t bit) { - return vma_flags_test_all_mask(&vma->flags, flags); + return vma_flags_test(&vma->flags, bit); } -#define vma_test_all(vma, ...) \ - vma_test_all_mask(vma, mk_vma_flags(__VA_ARGS__)) +static __always_inline bool vma_test_any_mask(const struct vm_area_struct *vma, + vma_flags_t flags) +{ + return vma_flags_test_any_mask(&vma->flags, flags); +} -static inline bool is_shared_maywrite_vm_flags(vm_flags_t vm_flags) +#define vma_test_any(vma, ...) 
\ + vma_test_any_mask(vma, mk_vma_flags(__VA_ARGS__)) + +static __always_inline bool vma_test_all_mask(const struct vm_area_struct *vma, + vma_flags_t flags) { - return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == - (VM_SHARED | VM_MAYWRITE); + return vma_flags_test_all_mask(&vma->flags, flags); } -static inline void vma_set_flags_mask(struct vm_area_struct *vma, - vma_flags_t flags) +#define vma_test_all(vma, ...) \ + vma_test_all_mask(vma, mk_vma_flags(__VA_ARGS__)) + +static __always_inline void vma_set_flags_mask(struct vm_area_struct *vma, + vma_flags_t flags) { vma_flags_set_mask(&vma->flags, flags); } @@ -1033,8 +1046,8 @@ static __always_inline bool vma_desc_test(const struct vm_area_desc *desc, return vma_flags_test(&desc->vma_flags, bit); } -static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, - vma_flags_t flags) +static __always_inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, + vma_flags_t flags) { return vma_flags_test_any_mask(&desc->vma_flags, flags); } @@ -1042,7 +1055,7 @@ static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, #define vma_desc_test_any(desc, ...) \ vma_desc_test_any_mask(desc, mk_vma_flags(__VA_ARGS__)) -static inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, +static __always_inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, vma_flags_t flags) { return vma_flags_test_all_mask(&desc->vma_flags, flags); @@ -1051,8 +1064,8 @@ static inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, #define vma_desc_test_all(desc, ...) \ vma_desc_test_all_mask(desc, mk_vma_flags(__VA_ARGS__)) -static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, - vma_flags_t flags) +static __always_inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, + vma_flags_t flags) { vma_flags_set_mask(&desc->vma_flags, flags); } @@ -1060,8 +1073,8 @@ static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, #define vma_desc_set_flags(desc, ...) \ vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) -static inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, - vma_flags_t flags) +static __always_inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, + vma_flags_t flags) { vma_flags_clear_mask(&desc->vma_flags, flags); } @@ -1069,6 +1082,12 @@ static inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, #define vma_desc_clear_flags(desc, ...) \ vma_desc_clear_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) +static inline bool is_shared_maywrite_vm_flags(vm_flags_t vm_flags) +{ + return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == + (VM_SHARED | VM_MAYWRITE); +} + static inline bool is_shared_maywrite(const vma_flags_t *flags) { return vma_flags_test_all(flags, VMA_SHARED_BIT, VMA_MAYWRITE_BIT); -- cgit v1.2.3 From e79d1c500f52506b9eab39e81017e30b76f2864d Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:32 +0000 Subject: mm: introduce vma_flags_count() and vma[_flags]_test_single_mask() vma_flags_count() determines how many bits are set in VMA flags, using bitmap_weight(). vma_flags_test_single_mask() determines if a vma_flags_t set of flags contains a single flag specified as another vma_flags_t value, or if the sought flag mask is empty, it is defined to return false. This is useful when we want to declare a VMA flag as optionally a single flag in a mask or empty depending on kernel configuration. This allows us to have VM_NONE-like semantics when checking whether the flag is set. 
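To make the semantics concrete, here is a sketch (the #if mirrors the
VMA_DROPPABLE definition a subsequent patch in this series adds;
handle_droppable() is a placeholder):

	#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32)
	#define VMA_DROPPABLE mk_vma_flags(VMA_DROPPABLE_BIT)
	#else
	#define VMA_DROPPABLE EMPTY_VMA_FLAGS
	#endif

	/* True only if the flag exists in this config AND is set: */
	if (vma_test_single_mask(vma, VMA_DROPPABLE))
		handle_droppable(vma);

With the empty mask, vma_flags_test_single_mask() simply returns false,
giving the VM_NONE-like semantics described above.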
In a subsequent patch, we introduce the use of VMA_DROPPABLE of type vma_flags_t using precisely these semantics. It would be actively confusing to use vma_flags_test_any_single_mask() for this (and vma_flags_test_all_mask() is not correct to use here, as it trivially returns true when tested against an empty vma flags mask). We introduce vma_flags_count() to be able to assert that the compared flag mask is singular or empty, checked when CONFIG_DEBUG_VM is enabled. Also update the VMA tests as part of this change. Link: https://lkml.kernel.org/r/cd778dd02b9f2a01eb54d25a49dea8ec2ddf7753.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 46 ++++++++++++++++++++++++++++++++++++++ tools/testing/vma/include/custom.h | 6 ----- tools/testing/vma/include/dup.h | 21 +++++++++++++++++ tools/testing/vma/vma_internal.h | 6 +++++ 4 files changed, 73 insertions(+), 6 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index f704d7cf2871..de72382efac2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1078,6 +1078,14 @@ static __always_inline vma_flags_t __mk_vma_flags(vma_flags_t flags, #define append_vma_flags(flags, ...) __mk_vma_flags(flags, \ COUNT_ARGS(__VA_ARGS__), (const vma_flag_t []){__VA_ARGS__}) +/* Calculates the number of set bits in the specified VMA flags. */ +static __always_inline int vma_flags_count(const vma_flags_t *flags) +{ + const unsigned long *bitmap = flags->__vma_flags; + + return bitmap_weight(bitmap, NUM_VMA_FLAG_BITS); +} + /* * Test whether a specific VMA flag is set, e.g.: * @@ -1153,6 +1161,26 @@ static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags, #define vma_flags_test_all(flags, ...) \ vma_flags_test_all_mask(flags, mk_vma_flags(__VA_ARGS__)) +/* + * Helper to test that a flag mask of type vma_flags_t has a SINGLE flag set + * (returning false if flagmask has no flags set). + * + * This is defined to make the semantics clearer when testing an optionally + * defined VMA flags mask, e.g.: + * + * if (vma_flags_test_single_mask(&flags, VMA_DROPPABLE)) { ... } + * + * When VMA_DROPPABLE is defined if available, or set to EMPTY_VMA_FLAGS + * otherwise. + */ +static __always_inline bool vma_flags_test_single_mask(const vma_flags_t *flags, + vma_flags_t flagmask) +{ + VM_WARN_ON_ONCE(vma_flags_count(&flagmask) > 1); + + return vma_flags_test_any_mask(flags, flagmask); +} + /* Set each of the to_set flags in flags, non-atomically. 
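 * e.g., as the merge code in mm/vma.c uses it to accumulate sticky flags:
 *
 *	next_sticky = vma_flags_and_mask(&next->flags, VMA_STICKY_FLAGS);
 *	vma_flags_set_mask(&sticky_flags, next_sticky);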
*/ static __always_inline void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t to_set) @@ -1281,6 +1309,24 @@ static __always_inline bool vma_test_all_mask(const struct vm_area_struct *vma, #define vma_test_all(vma, ...) \ vma_test_all_mask(vma, mk_vma_flags(__VA_ARGS__)) +/* + * Helper to test that a flag mask of type vma_flags_t has a SINGLE flag set + * (returning false if flagmask has no flags set). + * + * This is useful when a flag needs to be either defined or not depending upon + * kernel configuration, e.g.: + * + * if (vma_test_single_mask(vma, VMA_DROPPABLE)) { ... } + * + * When VMA_DROPPABLE is defined if available, or set to EMPTY_VMA_FLAGS + * otherwise. + */ +static __always_inline bool +vma_test_single_mask(const struct vm_area_struct *vma, vma_flags_t flagmask) +{ + return vma_flags_test_single_mask(&vma->flags, flagmask); +} + /* * Helper to set all VMA flags in a VMA. * diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h index 2c498e713fbd..b7d9eb0a44e4 100644 --- a/tools/testing/vma/include/custom.h +++ b/tools/testing/vma/include/custom.h @@ -15,12 +15,6 @@ extern unsigned long dac_mmap_min_addr; #define dac_mmap_min_addr 0UL #endif -#define VM_WARN_ON(_expr) (WARN_ON(_expr)) -#define VM_WARN_ON_ONCE(_expr) (WARN_ON_ONCE(_expr)) -#define VM_WARN_ON_VMG(_expr, _vmg) (WARN_ON(_expr)) -#define VM_BUG_ON(_expr) (BUG_ON(_expr)) -#define VM_BUG_ON_VMA(_expr, _vma) (BUG_ON(_expr)) - #define TASK_SIZE ((1ul << 47)-PAGE_SIZE) /* diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index f9fe07a8a443..244ee02dc21d 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -905,6 +905,13 @@ static __always_inline vma_flags_t __mk_vma_flags(vma_flags_t flags, #define append_vma_flags(flags, ...) __mk_vma_flags(flags, \ COUNT_ARGS(__VA_ARGS__), (const vma_flag_t []){__VA_ARGS__}) +static __always_inline int vma_flags_count(const vma_flags_t *flags) +{ + const unsigned long *bitmap = flags->__vma_flags; + + return bitmap_weight(bitmap, NUM_VMA_FLAG_BITS); +} + static __always_inline bool vma_flags_test(const vma_flags_t *flags, vma_flag_t bit) { @@ -952,6 +959,14 @@ static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags, #define vma_flags_test_all(flags, ...) \ vma_flags_test_all_mask(flags, mk_vma_flags(__VA_ARGS__)) +static __always_inline bool vma_flags_test_single_mask(const vma_flags_t *flags, + vma_flags_t flagmask) +{ + VM_WARN_ON_ONCE(vma_flags_count(&flagmask) > 1); + + return vma_flags_test_any_mask(flags, flagmask); +} + static __always_inline void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t to_set) { unsigned long *bitmap = flags->__vma_flags; @@ -1031,6 +1046,12 @@ static __always_inline bool vma_test_all_mask(const struct vm_area_struct *vma, #define vma_test_all(vma, ...) 
\ vma_test_all_mask(vma, mk_vma_flags(__VA_ARGS__)) +static __always_inline bool +vma_test_single_mask(const struct vm_area_struct *vma, vma_flags_t flagmask) +{ + return vma_flags_test_single_mask(&vma->flags, flagmask); +} + static __always_inline void vma_set_flags_mask(struct vm_area_struct *vma, vma_flags_t flags) { diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 0e1121e2ef23..e12ab2c80f95 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -51,6 +51,12 @@ typedef unsigned long pgprotval_t; typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; typedef __bitwise unsigned int vm_fault_t; +#define VM_WARN_ON(_expr) (WARN_ON(_expr)) +#define VM_WARN_ON_ONCE(_expr) (WARN_ON_ONCE(_expr)) +#define VM_WARN_ON_VMG(_expr, _vmg) (WARN_ON(_expr)) +#define VM_BUG_ON(_expr) (BUG_ON(_expr)) +#define VM_BUG_ON_VMA(_expr, _vma) (BUG_ON(_expr)) + #include "include/stubs.h" #include "include/dup.h" #include "include/custom.h" -- cgit v1.2.3 From 3a6455d56bd7c4cfb1ea35ddae052943065e338e Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:34 +0000 Subject: mm: convert do_brk_flags() to use vma_flags_t In order to be able to do this, we need to change VM_DATA_DEFAULT_FLAGS and friends and update the architecture-specific definitions also. We then have to update some KSM logic to handle VMA flags, and introduce VMA_STACK_FLAGS to define the vma_flags_t equivalent of VM_STACK_FLAGS. We also introduce two helper functions for use during the time we are converting legacy flags to vma_flags_t values - vma_flags_to_legacy() and legacy_to_vma_flags(). This enables us to iteratively make changes to break these changes up into separate parts. We use these explicitly here to keep VM_STACK_FLAGS around for certain users which need to maintain the legacy vm_flags_t values for the time being. We are no longer able to rely on the simple VM_xxx being set to zero if the feature is not enabled, so in the case of VM_DROPPABLE we introduce VMA_DROPPABLE as the vma_flags_t equivalent, which is set to EMPTY_VMA_FLAGS if the droppable flag is not available. While we're here, we make the description of do_brk_flags() into a kdoc comment, as it almost was already. We use vma_flags_to_legacy() to not need to update the vm_get_page_prot() logic as this time. Note that in create_init_stack_vma() we have to replace the BUILD_BUG_ON() with a VM_WARN_ON_ONCE() as the tested values are no longer build time available. We also update mprotect_fixup() to use VMA flags where possible, though we have to live with a little duplication between vm_flags_t and vma_flags_t values for the time being until further conversions are made. While we're here, update VM_SPECIAL to be defined in terms of VMA_SPECIAL_FLAGS now we have vma_flags_to_legacy(). Finally, we update the VMA tests to reflect these changes. Link: https://lkml.kernel.org/r/d02e3e45d9a33d7904b149f5604904089fd640ae.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Paul Moore [SELinux] Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- arch/arc/include/asm/page.h | 2 +- arch/arm/include/asm/page.h | 2 +- arch/arm64/include/asm/page.h | 7 +++++- arch/hexagon/include/asm/page.h | 2 +- arch/loongarch/include/asm/page.h | 2 +- arch/mips/include/asm/page.h | 2 +- arch/nios2/include/asm/page.h | 2 +- arch/powerpc/include/asm/page.h | 4 ++-- arch/powerpc/include/asm/page_32.h | 2 +- arch/powerpc/include/asm/page_64.h | 12 +++++----- arch/riscv/include/asm/page.h | 2 +- arch/s390/include/asm/page.h | 2 +- arch/x86/include/asm/page_types.h | 2 +- arch/x86/um/asm/vm-flags.h | 4 ++-- include/linux/ksm.h | 10 ++++---- include/linux/mm.h | 49 +++++++++++++++++++++++--------------- mm/internal.h | 3 +++ mm/ksm.c | 43 +++++++++++++++++---------------- mm/mmap.c | 13 ++++++---- mm/mprotect.c | 46 +++++++++++++++++++++-------------- mm/mremap.c | 6 ++--- mm/vma.c | 34 ++++++++++++++------------ mm/vma.h | 14 ++++++++--- mm/vma_exec.c | 5 ++-- security/selinux/hooks.c | 4 +++- tools/testing/vma/include/custom.h | 3 --- tools/testing/vma/include/dup.h | 42 +++++++++++++++++--------------- tools/testing/vma/include/stubs.h | 9 +++---- tools/testing/vma/tests/merge.c | 3 +-- 29 files changed, 191 insertions(+), 140 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/arc/include/asm/page.h b/arch/arc/include/asm/page.h index 38214e126c6d..facc7a03b250 100644 --- a/arch/arc/include/asm/page.h +++ b/arch/arc/include/asm/page.h @@ -131,7 +131,7 @@ static inline unsigned long virt_to_pfn(const void *kaddr) #define virt_addr_valid(kaddr) pfn_valid(virt_to_pfn(kaddr)) /* Default Permissions for stack/heaps pages (Non Executable) */ -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_NON_EXEC #define WANT_PAGE_VIRTUAL 1 diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h index ef11b721230e..fa4c1225dde5 100644 --- a/arch/arm/include/asm/page.h +++ b/arch/arm/include/asm/page.h @@ -184,7 +184,7 @@ extern int pfn_valid(unsigned long); #include -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_TSK_EXEC #include #include diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index b39cc1127e1f..e25d0d18f6d7 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -46,7 +46,12 @@ int pfn_is_map_memory(unsigned long pfn); #endif /* !__ASSEMBLER__ */ -#define VM_DATA_DEFAULT_FLAGS (VM_DATA_FLAGS_TSK_EXEC | VM_MTE_ALLOWED) +#ifdef CONFIG_ARM64_MTE +#define VMA_DATA_DEFAULT_FLAGS append_vma_flags(VMA_DATA_FLAGS_TSK_EXEC, \ + VMA_MTE_ALLOWED_BIT) +#else +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_TSK_EXEC +#endif #include diff --git a/arch/hexagon/include/asm/page.h b/arch/hexagon/include/asm/page.h index f0aed3ed812b..6d82572a7f21 100644 --- a/arch/hexagon/include/asm/page.h +++ b/arch/hexagon/include/asm/page.h @@ -90,7 +90,7 @@ struct page; #define virt_to_page(kaddr) pfn_to_page(PFN_DOWN(__pa(kaddr))) /* Default vm area behavior is non-executable. 
*/ -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_NON_EXEC #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) diff --git a/arch/loongarch/include/asm/page.h b/arch/loongarch/include/asm/page.h index 327bf0bc92bf..79235f4fc399 100644 --- a/arch/loongarch/include/asm/page.h +++ b/arch/loongarch/include/asm/page.h @@ -104,7 +104,7 @@ struct page *tlb_virt_to_page(unsigned long kaddr); extern int __virt_addr_valid(volatile void *kaddr); #define virt_addr_valid(kaddr) __virt_addr_valid((volatile void *)(kaddr)) -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_TSK_EXEC #include #include diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h index 5ec428fcc887..50a382a0d8f6 100644 --- a/arch/mips/include/asm/page.h +++ b/arch/mips/include/asm/page.h @@ -213,7 +213,7 @@ extern bool __virt_addr_valid(const volatile void *kaddr); #define virt_addr_valid(kaddr) \ __virt_addr_valid((const volatile void *) (kaddr)) -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_TSK_EXEC extern unsigned long __kaslr_offset; static inline unsigned long kaslr_offset(void) diff --git a/arch/nios2/include/asm/page.h b/arch/nios2/include/asm/page.h index 722956ac0bf8..71eb7c1b67d4 100644 --- a/arch/nios2/include/asm/page.h +++ b/arch/nios2/include/asm/page.h @@ -85,7 +85,7 @@ extern struct page *mem_map; # define virt_to_page(vaddr) pfn_to_page(PFN_DOWN(virt_to_phys(vaddr))) # define virt_addr_valid(vaddr) pfn_valid(PFN_DOWN(virt_to_phys(vaddr))) -# define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC +# define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_NON_EXEC #include diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index f2bb1f98eebe..281f25e071a3 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -240,8 +240,8 @@ static inline const void *pfn_to_kaddr(unsigned long pfn) * and needs to be executable. This means the whole heap ends * up being executable. */ -#define VM_DATA_DEFAULT_FLAGS32 VM_DATA_FLAGS_TSK_EXEC -#define VM_DATA_DEFAULT_FLAGS64 VM_DATA_FLAGS_NON_EXEC +#define VMA_DATA_DEFAULT_FLAGS32 VMA_DATA_FLAGS_TSK_EXEC +#define VMA_DATA_DEFAULT_FLAGS64 VMA_DATA_FLAGS_NON_EXEC #ifdef __powerpc64__ #include diff --git a/arch/powerpc/include/asm/page_32.h b/arch/powerpc/include/asm/page_32.h index 25482405a811..1fd8c21f0a42 100644 --- a/arch/powerpc/include/asm/page_32.h +++ b/arch/powerpc/include/asm/page_32.h @@ -10,7 +10,7 @@ #endif #endif -#define VM_DATA_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS32 +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_DEFAULT_FLAGS32 #if defined(CONFIG_PPC_256K_PAGES) || \ (defined(CONFIG_PPC_8xx) && defined(CONFIG_PPC_16K_PAGES)) diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h index 0f564a06bf68..d96c984d023b 100644 --- a/arch/powerpc/include/asm/page_64.h +++ b/arch/powerpc/include/asm/page_64.h @@ -84,9 +84,9 @@ extern u64 ppc64_pft_size; #endif /* __ASSEMBLER__ */ -#define VM_DATA_DEFAULT_FLAGS \ +#define VMA_DATA_DEFAULT_FLAGS \ (is_32bit_task() ? \ - VM_DATA_DEFAULT_FLAGS32 : VM_DATA_DEFAULT_FLAGS64) + VMA_DATA_DEFAULT_FLAGS32 : VMA_DATA_DEFAULT_FLAGS64) /* * This is the default if a program doesn't have a PT_GNU_STACK @@ -94,12 +94,12 @@ extern u64 ppc64_pft_size; * stack by default, so in the absence of a PT_GNU_STACK program header * we turn execute permission off. 
*/ -#define VM_STACK_DEFAULT_FLAGS32 VM_DATA_FLAGS_EXEC -#define VM_STACK_DEFAULT_FLAGS64 VM_DATA_FLAGS_NON_EXEC +#define VMA_STACK_DEFAULT_FLAGS32 VMA_DATA_FLAGS_EXEC +#define VMA_STACK_DEFAULT_FLAGS64 VMA_DATA_FLAGS_NON_EXEC -#define VM_STACK_DEFAULT_FLAGS \ +#define VMA_STACK_DEFAULT_FLAGS \ (is_32bit_task() ? \ - VM_STACK_DEFAULT_FLAGS32 : VM_STACK_DEFAULT_FLAGS64) + VMA_STACK_DEFAULT_FLAGS32 : VMA_STACK_DEFAULT_FLAGS64) #include diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 187aad0a7b03..c78017061b17 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -204,7 +204,7 @@ static __always_inline void *pfn_to_kaddr(unsigned long pfn) (unsigned long)(_addr) >= PAGE_OFFSET && pfn_valid(virt_to_pfn(_addr)); \ }) -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_NON_EXEC #include #include diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index f339258135f7..56da819a79e6 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -277,7 +277,7 @@ static inline unsigned long virt_to_pfn(const void *kaddr) #define virt_addr_valid(kaddr) pfn_valid(phys_to_pfn(__pa_nodebug((unsigned long)(kaddr)))) -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_NON_EXEC #endif /* !__ASSEMBLER__ */ diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 018a8d906ca3..3e0801a0f782 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -26,7 +26,7 @@ #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_TSK_EXEC /* Physical address where kernel should be loaded. 
*/ #define LOAD_PHYSICAL_ADDR __ALIGN_KERNEL_MASK(CONFIG_PHYSICAL_START, CONFIG_PHYSICAL_ALIGN - 1) diff --git a/arch/x86/um/asm/vm-flags.h b/arch/x86/um/asm/vm-flags.h index df7a3896f5dd..622d36d6ddff 100644 --- a/arch/x86/um/asm/vm-flags.h +++ b/arch/x86/um/asm/vm-flags.h @@ -9,11 +9,11 @@ #ifdef CONFIG_X86_32 -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_TSK_EXEC #else -#define VM_STACK_DEFAULT_FLAGS (VM_GROWSDOWN | VM_DATA_FLAGS_EXEC) +#define VMA_STACK_DEFAULT_FLAGS append_vma_flags(VMA_DATA_FLAGS_EXEC, VMA_GROWSDOWN_BIT) #endif #endif diff --git a/include/linux/ksm.h b/include/linux/ksm.h index c982694c987b..d39d0d5483a2 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -17,8 +17,8 @@ #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, vm_flags_t *vm_flags); -vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, - vm_flags_t vm_flags); +vma_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, + vma_flags_t vma_flags); int ksm_enable_merge_any(struct mm_struct *mm); int ksm_disable_merge_any(struct mm_struct *mm); int ksm_disable(struct mm_struct *mm); @@ -103,10 +103,10 @@ bool ksm_process_mergeable(struct mm_struct *mm); #else /* !CONFIG_KSM */ -static inline vm_flags_t ksm_vma_flags(struct mm_struct *mm, - const struct file *file, vm_flags_t vm_flags) +static inline vma_flags_t ksm_vma_flags(struct mm_struct *mm, + const struct file *file, vma_flags_t vma_flags) { - return vm_flags; + return vma_flags; } static inline int ksm_disable(struct mm_struct *mm) diff --git a/include/linux/mm.h b/include/linux/mm.h index de72382efac2..4042a584671e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -346,9 +346,9 @@ enum { * if KVM does not lock down the memory type. */ DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39), -#ifdef CONFIG_PPC32 +#if defined(CONFIG_PPC32) DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1), -#else +#elif defined(CONFIG_64BIT) DECLARE_VMA_BIT(DROPPABLE, 40), #endif DECLARE_VMA_BIT(UFFD_MINOR, 41), @@ -503,31 +503,42 @@ enum { #endif #if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) #define VM_DROPPABLE INIT_VM_FLAG(DROPPABLE) +#define VMA_DROPPABLE mk_vma_flags(VMA_DROPPABLE_BIT) #else #define VM_DROPPABLE VM_NONE +#define VMA_DROPPABLE EMPTY_VMA_FLAGS #endif /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) -#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) +#define TASK_EXEC_BIT ((current->personality & READ_IMPLIES_EXEC) ? 
\ + VMA_EXEC_BIT : VMA_READ_BIT) /* Common data flag combinations */ -#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -#define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \ - VM_MAYWRITE | VM_MAYEXEC) -#define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) - -#ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */ -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC +#define VMA_DATA_FLAGS_TSK_EXEC mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \ + TASK_EXEC_BIT, VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT, \ + VMA_MAYEXEC_BIT) +#define VMA_DATA_FLAGS_NON_EXEC mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \ + VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT, VMA_MAYEXEC_BIT) +#define VMA_DATA_FLAGS_EXEC mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \ + VMA_EXEC_BIT, VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT, \ + VMA_MAYEXEC_BIT) + +#ifndef VMA_DATA_DEFAULT_FLAGS /* arch can override this */ +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_EXEC #endif -#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ -#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS +#ifndef VMA_STACK_DEFAULT_FLAGS /* arch can override this */ +#define VMA_STACK_DEFAULT_FLAGS VMA_DATA_DEFAULT_FLAGS #endif +#define VMA_STACK_FLAGS append_vma_flags(VMA_STACK_DEFAULT_FLAGS, \ + VMA_STACK_BIT, VMA_ACCOUNT_BIT) + +/* Temporary until VMA flags conversion complete. */ +#define VM_STACK_FLAGS vma_flags_to_legacy(VMA_STACK_FLAGS) + #define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) #ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS @@ -536,8 +547,6 @@ enum { #define VM_SEALED_SYSMAP VM_NONE #endif -#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) - /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) #define VMA_ACCESS_FLAGS mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT) @@ -545,7 +554,10 @@ enum { /* * Special vmas that are non-mergable, non-mlock()able. */ -#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) + +#define VMA_SPECIAL_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_DONTEXPAND_BIT, \ + VMA_PFNMAP_BIT, VMA_MIXEDMAP_BIT) +#define VM_SPECIAL vma_flags_to_legacy(VMA_SPECIAL_FLAGS) /* * Physically remapped pages are special. Tell the @@ -1407,7 +1419,7 @@ static __always_inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, * vm_area_desc object describing a proposed VMA, e.g.: * * vma_desc_set_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT, - * VMA_DONTDUMP_BIT); + * VMA_DONTDUMP_BIT); */ #define vma_desc_set_flags(desc, ...) 
\ + vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) @@ -4045,7 +4057,6 @@ extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); extern struct file *get_task_exe_file(struct task_struct *task); -extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages); extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages); extern bool vma_is_special_mapping(const struct vm_area_struct *vma, diff --git a/mm/internal.h b/mm/internal.h index 3d3fa35e5fd1..ce954bab8a37 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1916,4 +1916,7 @@ static inline int get_sysctl_max_map_count(void) return READ_ONCE(sysctl_max_map_count); } +bool may_expand_vm(struct mm_struct *mm, const vma_flags_t *vma_flags, + unsigned long npages); + #endif /* __MM_INTERNAL_H */ diff --git a/mm/ksm.c b/mm/ksm.c index 2a2f2f005fc3..7d5b76478f0b 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -735,21 +735,24 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr, return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; } -static bool ksm_compatible(const struct file *file, vm_flags_t vm_flags) +static bool ksm_compatible(const struct file *file, vma_flags_t vma_flags) { - if (vm_flags & (VM_SHARED | VM_MAYSHARE | VM_SPECIAL | - VM_HUGETLB | VM_DROPPABLE)) - return false; /* just ignore the advice */ - + /* Just ignore the advice. */ + if (vma_flags_test_any(&vma_flags, VMA_SHARED_BIT, VMA_MAYSHARE_BIT, + VMA_HUGETLB_BIT)) + return false; + if (vma_flags_test_single_mask(&vma_flags, VMA_DROPPABLE)) + return false; + if (vma_flags_test_any_mask(&vma_flags, VMA_SPECIAL_FLAGS)) + return false; if (file_is_dax(file)) return false; - #ifdef VM_SAO - if (vm_flags & VM_SAO) + if (vma_flags_test(&vma_flags, VMA_SAO_BIT)) return false; #endif #ifdef VM_SPARC_ADI - if (vm_flags & VM_SPARC_ADI) + if (vma_flags_test(&vma_flags, VMA_SPARC_ADI_BIT)) return false; #endif @@ -758,7 +761,7 @@ static bool ksm_compatible(const struct file *file, vm_flags_t vm_flags) static bool vma_ksm_compatible(struct vm_area_struct *vma) { - return ksm_compatible(vma->vm_file, vma->vm_flags); + return ksm_compatible(vma->vm_file, vma->flags); } static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm, @@ -2825,17 +2828,17 @@ static int ksm_scan_thread(void *nothing) return 0; } -static bool __ksm_should_add_vma(const struct file *file, vm_flags_t vm_flags) +static bool __ksm_should_add_vma(const struct file *file, vma_flags_t vma_flags) { - if (vm_flags & VM_MERGEABLE) + if (vma_flags_test(&vma_flags, VMA_MERGEABLE_BIT)) return false; - return ksm_compatible(file, vm_flags); + return ksm_compatible(file, vma_flags); } static void __ksm_add_vma(struct vm_area_struct *vma) { - if (__ksm_should_add_vma(vma->vm_file, vma->vm_flags)) + if (__ksm_should_add_vma(vma->vm_file, vma->flags)) vm_flags_set(vma, VM_MERGEABLE); } @@ -2860,16 +2863,16 @@ static int __ksm_del_vma(struct vm_area_struct *vma) * * @mm: Proposed VMA's mm_struct * @file: Proposed VMA's file-backed mapping, if any. - * @vm_flags: Proposed VMA"s flags. + * @vma_flags: Proposed VMA's flags. * - * Returns: @vm_flags possibly updated to mark mergeable. + * Returns: @vma_flags possibly updated to mark mergeable.
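 *
 * A typical call site (sketch mirroring update_ksm_flags() in mm/vma.c):
 *
 *   map->vma_flags = ksm_vma_flags(map->mm, map->file, map->vma_flags);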
*/ -vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, - vm_flags_t vm_flags) +vma_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, + vma_flags_t vma_flags) { if (mm_flags_test(MMF_VM_MERGE_ANY, mm) && - __ksm_should_add_vma(file, vm_flags)) { - vm_flags |= VM_MERGEABLE; + __ksm_should_add_vma(file, vma_flags)) { + vma_flags_set(&vma_flags, VMA_MERGEABLE_BIT); /* * Generally, the flags here always include MMF_VM_MERGEABLE. * However, in rare cases, this flag may be cleared by ksmd who @@ -2879,7 +2882,7 @@ vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, __ksm_enter(mm); } - return vm_flags; + return vma_flags; } static void ksm_add_vmas(struct mm_struct *mm) diff --git a/mm/mmap.c b/mm/mmap.c index 2d2b814978bf..5754d1c36462 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -192,7 +192,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) brkvma = vma_prev_limit(&vmi, mm->start_brk); /* Ok, looks good - let it rip. */ - if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, 0) < 0) + if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, + EMPTY_VMA_FLAGS) < 0) goto out; mm->brk = brk; @@ -1203,7 +1204,8 @@ out: int vm_brk_flags(unsigned long addr, unsigned long request, bool is_exec) { - const vm_flags_t vm_flags = is_exec ? VM_EXEC : 0; + const vma_flags_t vma_flags = is_exec ? + mk_vma_flags(VMA_EXEC_BIT) : EMPTY_VMA_FLAGS; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; unsigned long len; @@ -1230,7 +1232,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, bool is_exec) goto munmap_failed; vma = vma_prev(&vmi); - ret = do_brk_flags(&vmi, vma, addr, len, vm_flags); + ret = do_brk_flags(&vmi, vma, addr, len, vma_flags); populate = ((mm->def_flags & VM_LOCKED) != 0); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); @@ -1328,12 +1330,13 @@ destroy: * Return true if the calling process may expand its vm space by the passed * number of pages */ -bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) +bool may_expand_vm(struct mm_struct *mm, const vma_flags_t *vma_flags, + unsigned long npages) { if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT) return false; - if (is_data_mapping(flags) && + if (is_data_mapping_vma_flags(vma_flags) && mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) { /* Workaround for Valgrind */ if (rlimit(RLIMIT_DATA) == 0 && diff --git a/mm/mprotect.c b/mm/mprotect.c index 9681f055b9fc..eaa724b99908 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -697,7 +697,8 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, unsigned long start, unsigned long end, vm_flags_t newflags) { struct mm_struct *mm = vma->vm_mm; - vm_flags_t oldflags = READ_ONCE(vma->vm_flags); + const vma_flags_t old_vma_flags = READ_ONCE(vma->flags); + vma_flags_t new_vma_flags = legacy_to_vma_flags(newflags); long nrpages = (end - start) >> PAGE_SHIFT; unsigned int mm_cp_flags = 0; unsigned long charged = 0; @@ -706,7 +707,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, if (vma_is_sealed(vma)) return -EPERM; - if (newflags == oldflags) { + if (vma_flags_same_pair(&old_vma_flags, &new_vma_flags)) { *pprev = vma; return 0; } @@ -717,8 +718,9 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, * uncommon case, so doesn't need to be very optimized. 
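 *
 * e.g. mprotect(addr, len, PROT_NONE) on a VM_PFNMAP mapping takes this
 * path, as the new flags then contain none of the VMA_ACCESS_FLAGS bits.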
*/ if (arch_has_pfn_modify_check() && - (oldflags & (VM_PFNMAP|VM_MIXEDMAP)) && - (newflags & VM_ACCESS_FLAGS) == 0) { + vma_flags_test_any(&old_vma_flags, VMA_PFNMAP_BIT, + VMA_MIXEDMAP_BIT) && + !vma_flags_test_any_mask(&new_vma_flags, VMA_ACCESS_FLAGS)) { pgprot_t new_pgprot = vm_get_page_prot(newflags); error = walk_page_range(current->mm, start, end, @@ -736,28 +738,31 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, * hugetlb mapping were accounted for even if read-only so there is * no need to account for them here. */ - if (newflags & VM_WRITE) { + if (vma_flags_test(&new_vma_flags, VMA_WRITE_BIT)) { /* Check space limits when area turns into data. */ - if (!may_expand_vm(mm, newflags, nrpages) && - may_expand_vm(mm, oldflags, nrpages)) + if (!may_expand_vm(mm, &new_vma_flags, nrpages) && + may_expand_vm(mm, &old_vma_flags, nrpages)) return -ENOMEM; - if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB| - VM_SHARED|VM_NORESERVE))) { + if (!vma_flags_test_any(&old_vma_flags, + VMA_ACCOUNT_BIT, VMA_WRITE_BIT, VMA_HUGETLB_BIT, + VMA_SHARED_BIT, VMA_NORESERVE_BIT)) { charged = nrpages; if (security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; - newflags |= VM_ACCOUNT; + vma_flags_set(&new_vma_flags, VMA_ACCOUNT_BIT); } - } else if ((oldflags & VM_ACCOUNT) && vma_is_anonymous(vma) && - !vma->anon_vma) { - newflags &= ~VM_ACCOUNT; + } else if (vma_flags_test(&old_vma_flags, VMA_ACCOUNT_BIT) && + vma_is_anonymous(vma) && !vma->anon_vma) { + vma_flags_clear(&new_vma_flags, VMA_ACCOUNT_BIT); } + newflags = vma_flags_to_legacy(new_vma_flags); vma = vma_modify_flags(vmi, *pprev, vma, start, end, &newflags); if (IS_ERR(vma)) { error = PTR_ERR(vma); goto fail; } + new_vma_flags = legacy_to_vma_flags(newflags); *pprev = vma; @@ -773,19 +778,24 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, change_protection(tlb, vma, start, end, mm_cp_flags); - if ((oldflags & VM_ACCOUNT) && !(newflags & VM_ACCOUNT)) + if (vma_flags_test(&old_vma_flags, VMA_ACCOUNT_BIT) && + !vma_flags_test(&new_vma_flags, VMA_ACCOUNT_BIT)) vm_unacct_memory(nrpages); /* * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major * fault on access. 
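 *
 * e.g. (sketch): a region mlock()ed while PROT_READ and later made
 * PROT_READ|PROT_WRITE has VMA_LOCKED_BIT set but neither VMA_WRITE_BIT
 * nor VMA_SHARED_BIT in its old flags, so the range is populated below.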
*/ - if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED && - (newflags & VM_WRITE)) { - populate_vma_page_range(vma, start, end, NULL); + if (vma_flags_test(&new_vma_flags, VMA_WRITE_BIT)) { + const vma_flags_t mask = + vma_flags_and(&old_vma_flags, VMA_WRITE_BIT, + VMA_SHARED_BIT, VMA_LOCKED_BIT); + + if (vma_flags_same(&mask, VMA_LOCKED_BIT)) + populate_vma_page_range(vma, start, end, NULL); } - vm_stat_account(mm, oldflags, -nrpages); + vm_stat_account(mm, vma_flags_to_legacy(old_vma_flags), -nrpages); vm_stat_account(mm, newflags, nrpages); perf_event_mmap(vma); return 0; diff --git a/mm/mremap.c b/mm/mremap.c index 36b3f1caebad..e9c8b1d05832 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1472,10 +1472,10 @@ static unsigned long mremap_to(struct vma_remap_struct *vrm) /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */ if (vrm->flags & MREMAP_DONTUNMAP) { - vm_flags_t vm_flags = vrm->vma->vm_flags; + vma_flags_t vma_flags = vrm->vma->flags; unsigned long pages = vrm->old_len >> PAGE_SHIFT; - if (!may_expand_vm(mm, vm_flags, pages)) + if (!may_expand_vm(mm, &vma_flags, pages)) return -ENOMEM; } @@ -1813,7 +1813,7 @@ static int check_prep_vma(struct vma_remap_struct *vrm) if (!mlock_future_ok(mm, vma->vm_flags & VM_LOCKED, vrm->delta)) return -EAGAIN; - if (!may_expand_vm(mm, vma->vm_flags, vrm->delta >> PAGE_SHIFT)) + if (!may_expand_vm(mm, &vma->flags, vrm->delta >> PAGE_SHIFT)) return -ENOMEM; return 0; diff --git a/mm/vma.c b/mm/vma.c index 6af26619e020..9362860389ae 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2385,7 +2385,7 @@ static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms, static void update_ksm_flags(struct mmap_state *map) { - map->vm_flags = ksm_vma_flags(map->mm, map->file, map->vm_flags); + map->vma_flags = ksm_vma_flags(map->mm, map->file, map->vma_flags); } static void set_desc_from_map(struct vm_area_desc *desc, @@ -2446,7 +2446,7 @@ static int __mmap_setup(struct mmap_state *map, struct vm_area_desc *desc, } /* Check against address space limit. */ - if (!may_expand_vm(map->mm, map->vm_flags, map->pglen - vms->nr_pages)) + if (!may_expand_vm(map->mm, &map->vma_flags, map->pglen - vms->nr_pages)) return -ENOMEM; /* Private writable mapping: check memory availability. */ @@ -2866,20 +2866,22 @@ unsigned long mmap_region(struct file *file, unsigned long addr, return ret; } -/* +/** * do_brk_flags() - Increase the brk vma if the flags match. * @vmi: The vma iterator * @addr: The start address * @len: The length of the increase * @vma: The vma, - * @vm_flags: The VMA Flags + * @vma_flags: The VMA Flags * * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags * do not match then create a new anonymous VMA. Eventually we may be able to * do some brk-specific accounting here. + * + * Returns: %0 on success, or otherwise an error. */ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, unsigned long len, vm_flags_t vm_flags) + unsigned long addr, unsigned long len, vma_flags_t vma_flags) { struct mm_struct *mm = current->mm; @@ -2887,9 +2889,12 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, * Check against address space limits by the changed size * Note: This happens *after* clearing old mappings in some code paths. 
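 *
 * The composition below is the vma_flags_t analogue of the old
 * "vm_flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;",
 * built up with vma_flags_set() and vma_flags_set_mask() instead.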
*/ - vm_flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - vm_flags = ksm_vma_flags(mm, NULL, vm_flags); - if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) + vma_flags_set_mask(&vma_flags, VMA_DATA_DEFAULT_FLAGS); + vma_flags_set(&vma_flags, VMA_ACCOUNT_BIT); + vma_flags_set_mask(&vma_flags, mm->def_vma_flags); + + vma_flags = ksm_vma_flags(mm, NULL, vma_flags); + if (!may_expand_vm(mm, &vma_flags, len >> PAGE_SHIFT)) return -ENOMEM; if (mm->map_count > get_sysctl_max_map_count()) @@ -2903,7 +2908,7 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, * occur after forking, so the expand will only happen on new VMAs. */ if (vma && vma->vm_end == addr) { - VMG_STATE(vmg, mm, vmi, addr, addr + len, vm_flags, PHYS_PFN(addr)); + VMG_STATE(vmg, mm, vmi, addr, addr + len, vma_flags, PHYS_PFN(addr)); vmg.prev = vma; /* vmi is positioned at prev, which this mode expects. */ @@ -2924,8 +2929,8 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_set_anonymous(vma); vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT); - vm_flags_init(vma, vm_flags); - vma->vm_page_prot = vm_get_page_prot(vm_flags); + vma->flags = vma_flags; + vma->vm_page_prot = vm_get_page_prot(vma_flags_to_legacy(vma_flags)); vma_start_write(vma); if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) goto mas_store_fail; @@ -2936,10 +2941,10 @@ out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; mm->data_vm += len >> PAGE_SHIFT; - if (vm_flags & VM_LOCKED) + if (vma_flags_test(&vma_flags, VMA_LOCKED_BIT)) mm->locked_vm += (len >> PAGE_SHIFT); if (pgtable_supports_soft_dirty()) - vm_flags_set(vma, VM_SOFTDIRTY); + vma_set_flags(vma, VMA_SOFTDIRTY_BIT); return 0; mas_store_fail: @@ -3070,7 +3075,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long new_start; /* address space limit tests */ - if (!may_expand_vm(mm, vma->vm_flags, grow)) + if (!may_expand_vm(mm, &vma->flags, grow)) return -ENOMEM; /* Stack limit test */ @@ -3289,7 +3294,6 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { unsigned long charged = vma_pages(vma); - if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) return -ENOMEM; diff --git a/mm/vma.h b/mm/vma.h index cf8926558bf6..1f2de6cb3b97 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -237,13 +237,13 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start); } -#define VMG_STATE(name, mm_, vmi_, start_, end_, vm_flags_, pgoff_) \ +#define VMG_STATE(name, mm_, vmi_, start_, end_, vma_flags_, pgoff_) \ struct vma_merge_struct name = { \ .mm = mm_, \ .vmi = vmi_, \ .start = start_, \ .end = end_, \ - .vm_flags = vm_flags_, \ + .vma_flags = vma_flags_, \ .pgoff = pgoff_, \ .state = VMA_MERGE_START, \ } @@ -465,7 +465,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, struct list_head *uf); int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, - unsigned long addr, unsigned long request, unsigned long flags); + unsigned long addr, unsigned long request, + vma_flags_t vma_flags); unsigned long unmapped_area(struct vm_unmapped_area_info *info); unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info); @@ -527,6 +528,13 @@ static inline bool is_data_mapping(vm_flags_t flags) return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; } +static inline bool is_data_mapping_vma_flags(const vma_flags_t *vma_flags) +{ + const vma_flags_t mask = vma_flags_and(vma_flags, + VMA_WRITE_BIT, 
VMA_SHARED_BIT, VMA_STACK_BIT); + + return vma_flags_same(&mask, VMA_WRITE_BIT); +} static inline void vma_iter_config(struct vma_iterator *vmi, unsigned long index, unsigned long last) diff --git a/mm/vma_exec.c b/mm/vma_exec.c index 8134e1afca68..5cee8b7efa0f 100644 --- a/mm/vma_exec.c +++ b/mm/vma_exec.c @@ -36,7 +36,8 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) unsigned long new_start = old_start - shift; unsigned long new_end = old_end - shift; VMA_ITERATOR(vmi, mm, new_start); - VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff); + VMG_STATE(vmg, mm, &vmi, new_start, old_end, EMPTY_VMA_FLAGS, + vma->vm_pgoff); struct vm_area_struct *next; struct mmu_gather tlb; PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length); @@ -135,7 +136,7 @@ int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, * use STACK_TOP because that can depend on attributes which aren't * configured yet. */ - BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); + VM_WARN_ON_ONCE(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; if (pgtable_supports_soft_dirty()) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index d8224ea113d1..903303e084c2 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -7713,6 +7713,8 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = { static __init int selinux_init(void) { + vma_flags_t data_default_flags = VMA_DATA_DEFAULT_FLAGS; + pr_info("SELinux: Initializing.\n"); memset(&selinux_state, 0, sizeof(selinux_state)); @@ -7729,7 +7731,7 @@ static __init int selinux_init(void) AUDIT_CFG_LSM_SECCTX_SUBJECT | AUDIT_CFG_LSM_SECCTX_OBJECT); - default_noexec = !(VM_DATA_DEFAULT_FLAGS & VM_EXEC); + default_noexec = !vma_flags_test(&data_default_flags, VMA_EXEC_BIT); if (!default_noexec) pr_notice("SELinux: virtual memory is executable by default\n"); diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h index b7d9eb0a44e4..744fe874c168 100644 --- a/tools/testing/vma/include/custom.h +++ b/tools/testing/vma/include/custom.h @@ -95,6 +95,3 @@ static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) { return PAGE_SIZE; } - -#define VMA_SPECIAL_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_DONTEXPAND_BIT, \ - VMA_PFNMAP_BIT, VMA_MIXEDMAP_BIT) diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 244ee02dc21d..36373b81ad24 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -314,27 +314,33 @@ enum { /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) -#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) +#define TASK_EXEC_BIT ((current->personality & READ_IMPLIES_EXEC) ? 
\ + VMA_EXEC_BIT : VMA_READ_BIT) /* Common data flag combinations */ -#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -#define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \ - VM_MAYWRITE | VM_MAYEXEC) -#define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) - -#ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */ -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC +#define VMA_DATA_FLAGS_TSK_EXEC mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \ + TASK_EXEC_BIT, VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT, \ + VMA_MAYEXEC_BIT) +#define VMA_DATA_FLAGS_NON_EXEC mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \ + VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT, VMA_MAYEXEC_BIT) +#define VMA_DATA_FLAGS_EXEC mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \ + VMA_EXEC_BIT, VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT, \ + VMA_MAYEXEC_BIT) + +#ifndef VMA_DATA_DEFAULT_FLAGS /* arch can override this */ +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_EXEC #endif -#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ -#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS +#ifndef VMA_STACK_DEFAULT_FLAGS /* arch can override this */ +#define VMA_STACK_DEFAULT_FLAGS VMA_DATA_DEFAULT_FLAGS #endif -#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) +#define VMA_STACK_FLAGS append_vma_flags(VMA_STACK_DEFAULT_FLAGS, \ + VMA_STACK_BIT, VMA_ACCOUNT_BIT) +/* Temporary until VMA flags conversion complete. */ +#define VM_STACK_FLAGS vma_flags_to_legacy(VMA_STACK_FLAGS) -#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) +#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) @@ -345,6 +351,9 @@ enum { */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) +#define VMA_SPECIAL_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_DONTEXPAND_BIT, \ + VMA_PFNMAP_BIT, VMA_MIXEDMAP_BIT) + #define VMA_REMAP_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_PFNMAP_BIT, \ VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT) @@ -357,11 +366,6 @@ enum { /* This mask represents all the VMA flag bits used by mlock */ #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) -#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ?
VM_EXEC : 0) - -#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) - #define RLIMIT_STACK 3 /* max stack size */ #define RLIMIT_MEMLOCK 8 /* max locked-in-memory address space */ diff --git a/tools/testing/vma/include/stubs.h b/tools/testing/vma/include/stubs.h index 416bb93f5005..b5dced3b0bd4 100644 --- a/tools/testing/vma/include/stubs.h +++ b/tools/testing/vma/include/stubs.h @@ -101,10 +101,10 @@ static inline bool shmem_file(struct file *file) return false; } -static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm, - const struct file *file, vm_flags_t vm_flags) +static inline vma_flags_t ksm_vma_flags(struct mm_struct *mm, + const struct file *file, vma_flags_t vma_flags) { - return vm_flags; + return vma_flags; } static inline void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn) @@ -239,7 +239,8 @@ static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) return 0; } -static inline bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, +static inline bool may_expand_vm(struct mm_struct *mm, + const vma_flags_t *vma_flags, unsigned long npages) { return true; diff --git a/tools/testing/vma/tests/merge.c b/tools/testing/vma/tests/merge.c index d3e725dc0000..44e3977e3fc0 100644 --- a/tools/testing/vma/tests/merge.c +++ b/tools/testing/vma/tests/merge.c @@ -1429,11 +1429,10 @@ static bool test_expand_only_mode(void) { vma_flags_t vma_flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT); - vm_flags_t legacy_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; struct mm_struct mm = {}; VMA_ITERATOR(vmi, &mm, 0); struct vm_area_struct *vma_prev, *vma; - VMG_STATE(vmg, &mm, &vmi, 0x5000, 0x9000, legacy_flags, 5); + VMG_STATE(vmg, &mm, &vmi, 0x5000, 0x9000, vma_flags, 5); /* * Place a VMA prior to the one we're expanding so we assert that we do -- cgit v1.2.3 From d720b81d01b137dfc23e07461b05b76f822af6ab Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:36 +0000 Subject: mm/vma: introduce vma_clear_flags[_mask]() Introduce a helper function and helper macro to easily clear a VMA's flags using the new vma_flags_t vma->flags field: * vma_clear_flags_mask() - Clears all of the flags in a specified mask in the VMA's flags field. * vma_clear_flags() - Clears all of the specified individual VMA flag bits in a VMA's flags field. Also update the VMA tests to reflect the change. Link: https://lkml.kernel.org/r/9bd15da35c2c90e7441265adf01b5c2d3b5c6d41.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 ++++++++++++++++ tools/testing/vma/include/dup.h | 9 +++++++++ 2 files changed, 25 insertions(+) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4042a584671e..6b614f8af045 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1363,6 +1363,22 @@ static __always_inline void vma_set_flags_mask(struct vm_area_struct *vma, #define vma_set_flags(vma, ...) \ vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) +/* Helper to clear all VMA flags in a VMA. */ +static __always_inline void vma_clear_flags_mask(struct vm_area_struct *vma, + vma_flags_t flags) +{ + vma_flags_clear_mask(&vma->flags, flags); +} + +/* + * Helper macro for clearing VMA flags, e.g.: + * + * vma_clear_flags(vma, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT, + * VMA_DONTDUMP_BIT); + */ +#define vma_clear_flags(vma, ...) \ + vma_clear_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) + /* * Test whether a specific VMA flag is set in a VMA descriptor, e.g.: * diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 36373b81ad24..93ea600d0895 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -1065,6 +1065,15 @@ static __always_inline void vma_set_flags_mask(struct vm_area_struct *vma, #define vma_set_flags(vma, ...) \ vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) +static __always_inline void vma_clear_flags_mask(struct vm_area_struct *vma, + vma_flags_t flags) +{ + vma_flags_clear_mask(&vma->flags, flags); +} + +#define vma_clear_flags(vma, ...) \ + vma_clear_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) + static __always_inline bool vma_desc_test(const struct vm_area_desc *desc, vma_flag_t bit) { -- cgit v1.2.3 From 769669bd9ca4cbae2562d57fe753efdcf17a196d Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:38 +0000 Subject: mm/vma: convert as much as we can in mm/vma.c to vma_flags_t Now we have established a good foundation for vm_flags_t to vma_flags_t changes, update mm/vma.c to utilise vma_flags_t wherever possible. We are able to convert VM_STARTGAP_FLAGS entirely as this is only used in mm/vma.c, and to account for the fact we can't use VM_NONE to make life easier, place the definition of this within existing #ifdef's to be cleaner. Generally the remaining changes are mechanical. Also update the VMA tests to reflect the changes. Link: https://lkml.kernel.org/r/5fdeaf8af9a12c2a5d68497495f52fa627d05a5b.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 ++- mm/vma.c | 89 ++++++++++++++++++++++----------------- tools/testing/vma/include/dup.h | 4 ++ tools/testing/vma/include/stubs.h | 2 +- 4 files changed, 59 insertions(+), 42 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6b614f8af045..c6b40dc88918 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -463,8 +463,10 @@ enum { #if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS) || \ defined(CONFIG_RISCV_USER_CFI) #define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK) +#define VMA_STARTGAP_FLAGS mk_vma_flags(VMA_GROWSDOWN_BIT, VMA_SHADOW_STACK_BIT) #else #define VM_SHADOW_STACK VM_NONE +#define VMA_STARTGAP_FLAGS mk_vma_flags(VMA_GROWSDOWN_BIT) #endif #if defined(CONFIG_PPC64) #define VM_SAO INIT_VM_FLAG(SAO) @@ -539,8 +541,6 @@ enum { /* Temporary until VMA flags conversion complete. */ #define VM_STACK_FLAGS vma_flags_to_legacy(VMA_STACK_FLAGS) -#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) - #ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS #define VM_SEALED_SYSMAP VM_SEALED #else @@ -584,6 +584,8 @@ enum { /* This mask represents all the VMA flag bits used by mlock */ #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) +#define VMA_LOCKED_MASK mk_vma_flags(VMA_LOCKED_BIT, VMA_LOCKONFAULT_BIT) + /* These flags can be updated atomically via VMA/mmap read lock. */ #define VM_ATOMIC_SET_ALLOWED VM_MAYBE_GUARD diff --git a/mm/vma.c b/mm/vma.c index 9362860389ae..9d194f8e7acb 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -185,7 +185,7 @@ static void init_multi_vma_prep(struct vma_prepare *vp, } /* - * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) + * Return true if we can merge this (vma_flags,anon_vma,file,vm_pgoff) * in front of (at a lower virtual address and file offset than) the vma. * * We cannot merge two vmas if they have differently assigned (non-NULL) @@ -211,7 +211,7 @@ static bool can_vma_merge_before(struct vma_merge_struct *vmg) } /* - * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) + * Return true if we can merge this (vma_flags,anon_vma,file,vm_pgoff) * beyond (at a higher virtual address and file offset than) the vma. * * We cannot merge two vmas if they have differently assigned (non-NULL) @@ -850,7 +850,8 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( * furthermost left or right side of the VMA, then we have no chance of * merging and should abort. */ - if (vmg->vm_flags & VM_SPECIAL || (!left_side && !right_side)) + if (vma_flags_test_any_mask(&vmg->vma_flags, VMA_SPECIAL_FLAGS) || + (!left_side && !right_side)) return NULL; if (left_side) @@ -1072,7 +1073,8 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) vmg->state = VMA_MERGE_NOMERGE; /* Special VMAs are unmergeable, also if no prev/next. 
*/ - if ((vmg->vm_flags & VM_SPECIAL) || (!prev && !next)) + if (vma_flags_test_any_mask(&vmg->vma_flags, VMA_SPECIAL_FLAGS) || + (!prev && !next)) return NULL; can_merge_left = can_vma_merge_left(vmg); @@ -1459,17 +1461,17 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, nrpages = vma_pages(next); vms->nr_pages += nrpages; - if (next->vm_flags & VM_LOCKED) + if (vma_test(next, VMA_LOCKED_BIT)) vms->locked_vm += nrpages; - if (next->vm_flags & VM_ACCOUNT) + if (vma_test(next, VMA_ACCOUNT_BIT)) vms->nr_accounted += nrpages; if (is_exec_mapping(next->vm_flags)) vms->exec_vm += nrpages; else if (is_stack_mapping(next->vm_flags)) vms->stack_vm += nrpages; - else if (is_data_mapping(next->vm_flags)) + else if (is_data_mapping_vma_flags(&next->flags)) vms->data_vm += nrpages; if (vms->uf) { @@ -2065,14 +2067,13 @@ static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops) static bool vma_is_shared_writable(struct vm_area_struct *vma) { - return (vma->vm_flags & (VM_WRITE | VM_SHARED)) == - (VM_WRITE | VM_SHARED); + return vma_test_all(vma, VMA_WRITE_BIT, VMA_SHARED_BIT); } static bool vma_fs_can_writeback(struct vm_area_struct *vma) { /* No managed pages to writeback. */ - if (vma->vm_flags & VM_PFNMAP) + if (vma_test(vma, VMA_PFNMAP_BIT)) return false; return vma->vm_file && vma->vm_file->f_mapping && @@ -2338,8 +2339,11 @@ void mm_drop_all_locks(struct mm_struct *mm) * We account for memory if it's a private writeable mapping, * not hugepages and VM_NORESERVE wasn't set. */ -static bool accountable_mapping(struct file *file, vm_flags_t vm_flags) +static bool accountable_mapping(struct mmap_state *map) { + const struct file *file = map->file; + vma_flags_t mask; + /* * hugetlb has its own accounting separate from the core VM * VM_HUGETLB may not be set yet so we cannot check for that flag. @@ -2347,7 +2351,9 @@ static bool accountable_mapping(struct file *file, vm_flags_t vm_flags) if (file && is_file_hugepages(file)) return false; - return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; + mask = vma_flags_and(&map->vma_flags, VMA_NORESERVE_BIT, VMA_SHARED_BIT, + VMA_WRITE_BIT); + return vma_flags_same(&mask, VMA_WRITE_BIT); } /* @@ -2450,7 +2456,7 @@ static int __mmap_setup(struct mmap_state *map, struct vm_area_desc *desc, return -ENOMEM; /* Private writable mapping: check memory availability. */ - if (accountable_mapping(map->file, map->vm_flags)) { + if (accountable_mapping(map)) { map->charged = map->pglen; map->charged -= vms->nr_accounted; if (map->charged) { @@ -2460,7 +2466,7 @@ static int __mmap_setup(struct mmap_state *map, struct vm_area_desc *desc, } vms->nr_accounted = 0; - map->vm_flags |= VM_ACCOUNT; + vma_flags_set(&map->vma_flags, VMA_ACCOUNT_BIT); } /* @@ -2508,12 +2514,12 @@ static int __mmap_new_file_vma(struct mmap_state *map, * Drivers should not permit writability when previously it was * disallowed. 
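 *
 * i.e. warn if a driver's ->mmap() hook set VMA_MAYWRITE_BIT on the
 * new VMA when the caller's requested flags did not include it.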
*/ - VM_WARN_ON_ONCE(map->vm_flags != vma->vm_flags && - !(map->vm_flags & VM_MAYWRITE) && - (vma->vm_flags & VM_MAYWRITE)); + VM_WARN_ON_ONCE(!vma_flags_same_pair(&map->vma_flags, &vma->flags) && + !vma_flags_test(&map->vma_flags, VMA_MAYWRITE_BIT) && + vma_test(vma, VMA_MAYWRITE_BIT)); map->file = vma->vm_file; - map->vm_flags = vma->vm_flags; + map->vma_flags = vma->flags; return 0; } @@ -2544,7 +2550,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) vma_iter_config(vmi, map->addr, map->end); vma_set_range(vma, map->addr, map->end, map->pgoff); - vm_flags_init(vma, map->vm_flags); + vma->flags = map->vma_flags; vma->vm_page_prot = map->page_prot; if (vma_iter_prealloc(vmi, vma)) { @@ -2554,7 +2560,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) if (map->file) error = __mmap_new_file_vma(map, vma); - else if (map->vm_flags & VM_SHARED) + else if (vma_flags_test(&map->vma_flags, VMA_SHARED_BIT)) error = shmem_zero_setup(vma); else vma_set_anonymous(vma); @@ -2564,7 +2570,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) if (!map->check_ksm_early) { update_ksm_flags(map); - vm_flags_init(vma, map->vm_flags); + vma->flags = map->vma_flags; } #ifdef CONFIG_SPARC64 @@ -2604,7 +2610,6 @@ free_vma: static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) { struct mm_struct *mm = map->mm; - vm_flags_t vm_flags = vma->vm_flags; perf_event_mmap(vma); @@ -2612,9 +2617,9 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) vms_complete_munmap_vmas(&map->vms, &map->mas_detach); vm_stat_account(mm, vma->vm_flags, map->pglen); - if (vm_flags & VM_LOCKED) { + if (vma_test(vma, VMA_LOCKED_BIT)) { if (!vma_supports_mlock(vma)) - vm_flags_clear(vma, VM_LOCKED_MASK); + vma_clear_flags_mask(vma, VMA_LOCKED_MASK); else mm->locked_vm += map->pglen; } @@ -2630,7 +2635,7 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) * a completely new data area). */ if (pgtable_supports_soft_dirty()) - vm_flags_set(vma, VM_SOFTDIRTY); + vma_set_flags(vma, VMA_SOFTDIRTY_BIT); vma_set_page_prot(vma); } @@ -2993,7 +2998,8 @@ retry: gap = vma_iter_addr(&vmi) + info->start_gap; gap += (info->align_offset - gap) & info->align_mask; tmp = vma_next(&vmi); - if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ + /* Avoid prev check if possible */ + if (tmp && vma_test_any_mask(tmp, VMA_STARTGAP_FLAGS)) { if (vm_start_gap(tmp) < gap + length - 1) { low_limit = tmp->vm_end; vma_iter_reset(&vmi); @@ -3045,7 +3051,8 @@ retry: gap -= (gap - info->align_offset) & info->align_mask; gap_end = vma_iter_end(&vmi); tmp = vma_next(&vmi); - if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ + /* Avoid prev check if possible */ + if (tmp && vma_test_any_mask(tmp, VMA_STARTGAP_FLAGS)) { if (vm_start_gap(tmp) < gap_end) { high_limit = vm_start_gap(tmp); vma_iter_reset(&vmi); @@ -3083,12 +3090,16 @@ static int acct_stack_growth(struct vm_area_struct *vma, return -ENOMEM; /* mlock limit tests */ - if (!mlock_future_ok(mm, vma->vm_flags & VM_LOCKED, grow << PAGE_SHIFT)) + if (!mlock_future_ok(mm, vma_test(vma, VMA_LOCKED_BIT), + grow << PAGE_SHIFT)) return -ENOMEM; /* Check to ensure the stack will not grow into a hugetlb-only region */ - new_start = (vma->vm_flags & VM_GROWSUP) ? 
vma->vm_start : - vma->vm_end - size; + new_start = vma->vm_end - size; +#ifdef CONFIG_STACK_GROWSUP + if (vma_test(vma, VMA_GROWSUP_BIT)) + new_start = vma->vm_start; +#endif if (is_hugepage_only_range(vma->vm_mm, new_start, size)) return -EFAULT; @@ -3102,7 +3113,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, return 0; } -#if defined(CONFIG_STACK_GROWSUP) +#ifdef CONFIG_STACK_GROWSUP /* * PA-RISC uses this for its stack. * vma is the last one with address > vma->vm_end. Have to extend vma. @@ -3115,7 +3126,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) int error = 0; VMA_ITERATOR(vmi, mm, vma->vm_start); - if (!(vma->vm_flags & VM_GROWSUP)) + if (!vma_test(vma, VMA_GROWSUP_BIT)) return -EFAULT; mmap_assert_write_locked(mm); @@ -3135,7 +3146,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) next = find_vma_intersection(mm, vma->vm_end, gap_addr); if (next && vma_is_accessible(next)) { - if (!(next->vm_flags & VM_GROWSUP)) + if (!vma_test(next, VMA_GROWSUP_BIT)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? */ } @@ -3169,7 +3180,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { - if (vma->vm_flags & VM_LOCKED) + if (vma_test(vma, VMA_LOCKED_BIT)) mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); @@ -3200,7 +3211,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) int error = 0; VMA_ITERATOR(vmi, mm, vma->vm_start); - if (!(vma->vm_flags & VM_GROWSDOWN)) + if (!vma_test(vma, VMA_GROWSDOWN_BIT)) return -EFAULT; mmap_assert_write_locked(mm); @@ -3213,7 +3224,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) prev = vma_prev(&vmi); /* Check that both stack segments have the same anon_vma? 
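 *
 * e.g. an accessible prev VMA that is not VMA_GROWSDOWN and ends
 * within stack_guard_gap of @address blocks the expansion below.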
*/ if (prev) { - if (!(prev->vm_flags & VM_GROWSDOWN) && + if (!vma_test(prev, VMA_GROWSDOWN_BIT) && vma_is_accessible(prev) && (address - prev->vm_end < stack_guard_gap)) return -ENOMEM; @@ -3248,7 +3259,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) if (grow <= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { - if (vma->vm_flags & VM_LOCKED) + if (vma_test(vma, VMA_LOCKED_BIT)) mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); @@ -3297,7 +3308,7 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) return -ENOMEM; - if ((vma->vm_flags & VM_ACCOUNT) && + if (vma_test(vma, VMA_ACCOUNT_BIT) && security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; @@ -3319,7 +3330,7 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) } if (vma_link(mm, vma)) { - if (vma->vm_flags & VM_ACCOUNT) + if (vma_test(vma, VMA_ACCOUNT_BIT)) vm_unacct_memory(charged); return -ENOMEM; } diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 93ea600d0895..58a621ec389f 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -267,8 +267,10 @@ enum { #endif /* CONFIG_ARCH_HAS_PKEYS */ #if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS) #define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK) +#define VMA_STARTGAP_FLAGS mk_vma_flags(VMA_GROWSDOWN_BIT, VMA_SHADOW_STACK_BIT) #else #define VM_SHADOW_STACK VM_NONE +#define VMA_STARTGAP_FLAGS mk_vma_flags(VMA_GROWSDOWN_BIT) #endif #if defined(CONFIG_PPC64) #define VM_SAO INIT_VM_FLAG(SAO) @@ -366,6 +368,8 @@ enum { /* This mask represents all the VMA flag bits used by mlock */ #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) +#define VMA_LOCKED_MASK mk_vma_flags(VMA_LOCKED_BIT, VMA_LOCKONFAULT_BIT) + #define RLIMIT_STACK 3 /* max stack size */ #define RLIMIT_MEMLOCK 8 /* max locked-in-memory address space */ diff --git a/tools/testing/vma/include/stubs.h b/tools/testing/vma/include/stubs.h index b5dced3b0bd4..5afb0afe2d48 100644 --- a/tools/testing/vma/include/stubs.h +++ b/tools/testing/vma/include/stubs.h @@ -229,7 +229,7 @@ static inline bool signal_pending(void *p) return false; } -static inline bool is_file_hugepages(struct file *file) +static inline bool is_file_hugepages(const struct file *file) { return false; } -- cgit v1.2.3 From a06eb2f8279e0b2b42799d42041f144377f5a086 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:40 +0000 Subject: mm/vma: convert vma_modify_flags[_uffd]() to use vma_flags_t Update the vma_modify_flags() and vma_modify_flags_uffd() functions to accept a vma_flags_t parameter rather than a vm_flags_t one, and propagate the changes as needed to implement this change. Also add vma_flags_reset_once() as a replacement for vm_flags_reset_once(). We still need to be careful here because we need to avoid tearing, so maintain the assumption that the flags in the first system word are the only ones that require protection from tearing, and retain this functionality. We can copy the remainder of VMA flags above 64 bits normally. But hopefully by the time that happens, we will have replaced the logic that requires these WRITE_ONCE()'s with something else. We also replace instances of vm_flags_reset() with a simple write of VMA flags. We no longer perform a number of checks, most notably the VMA flags asserts, because: 1.
We might be operating on a VMA that is not yet added to the tree. 2. We might be operating on a VMA that is now detached. 3. Really in all but core code, you should be using vma_desc_xxx(). 4. Other VMA fields are manipulated with no such checks. 5. It'd be egregious to have to add variants of flag functions just to account for cases such as the above, especially when we don't do so for other VMA fields. Drivers are the problematic cases here, and the reason these checks were especially important (also for debugging as VMA locks were introduced); the mmap_prepare work is solving this generally. Additionally, we can fairly safely assume by this point the soft dirty flags are being set correctly, so it's reasonable to drop this also. Finally, update the VMA tests to reflect this. Link: https://lkml.kernel.org/r/51afbb2b8c3681003cc7926647e37335d793836e.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 22 ++++++++++------------ include/linux/userfaultfd_k.h | 3 +++ mm/madvise.c | 10 ++++++---- mm/mlock.c | 38 +++++++++++++++++++++----------------- mm/mprotect.c | 7 +++---- mm/mseal.c | 11 +++++++---- mm/userfaultfd.c | 21 ++++++++++++++------- mm/vma.c | 15 ++++++++------- mm/vma.h | 15 +++++++-------- tools/testing/vma/include/dup.h | 22 +++++++++++++--------- tools/testing/vma/tests/merge.c | 3 +-- 11 files changed, 93 insertions(+), 74 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index c6b40dc88918..72bc5016094b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -954,22 +954,20 @@ static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_init(vma, flags); } -static inline void vm_flags_reset_once(struct vm_area_struct *vma, - vm_flags_t flags) +static inline void vma_flags_reset_once(struct vm_area_struct *vma, + vma_flags_t *flags) { - vma_assert_write_locked(vma); - /* - * If VMA flags exist beyond the first system word, also clear these. It - * is assumed the write once behaviour is required only for the first - * system word. - */ + const unsigned long word = flags->__vma_flags[0]; + + /* It is assumed only the first system word must be written once. */ + vma_flags_overwrite_word_once(&vma->flags, word); + /* The remainder can be copied normally.
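 *
 * (Assumption carried over by this series: flag words beyond the first
 * are never read in a way that needs WRITE_ONCE(), so a plain
 * bitmap_copy() of the upper words suffices.)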
*/ if (NUM_VMA_FLAG_BITS > BITS_PER_LONG) { - unsigned long *bitmap = vma->flags.__vma_flags; + unsigned long *dst = &vma->flags.__vma_flags[1]; + const unsigned long *src = &flags->__vma_flags[1]; - bitmap_zero(&bitmap[1], NUM_VMA_FLAG_BITS - BITS_PER_LONG); + bitmap_copy(dst, src, NUM_VMA_FLAG_BITS - BITS_PER_LONG); } - - vma_flags_overwrite_word_once(&vma->flags, flags); } static inline void vm_flags_set(struct vm_area_struct *vma, diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index fd5f42765497..d83e349900a3 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -23,6 +23,9 @@ /* The set of all possible UFFD-related VM flags. */ #define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR) +#define __VMA_UFFD_FLAGS mk_vma_flags(VMA_UFFD_MISSING_BIT, VMA_UFFD_WP_BIT, \ + VMA_UFFD_MINOR_BIT) + /* * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining * new flags, since they might collide with O_* ones. We want diff --git a/mm/madvise.c b/mm/madvise.c index afe0f01765c4..69708e953cf5 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -151,13 +151,15 @@ static int madvise_update_vma(vm_flags_t new_flags, struct madvise_behavior *madv_behavior) { struct vm_area_struct *vma = madv_behavior->vma; + vma_flags_t new_vma_flags = legacy_to_vma_flags(new_flags); struct madvise_behavior_range *range = &madv_behavior->range; struct anon_vma_name *anon_name = madv_behavior->anon_name; bool set_new_anon_name = madv_behavior->behavior == __MADV_SET_ANON_VMA_NAME; VMA_ITERATOR(vmi, madv_behavior->mm, range->start); - if (new_flags == vma->vm_flags && (!set_new_anon_name || - anon_vma_name_eq(anon_vma_name(vma), anon_name))) + if (vma_flags_same_mask(&vma->flags, new_vma_flags) && + (!set_new_anon_name || + anon_vma_name_eq(anon_vma_name(vma), anon_name))) return 0; if (set_new_anon_name) @@ -165,7 +167,7 @@ static int madvise_update_vma(vm_flags_t new_flags, range->start, range->end, anon_name); else vma = vma_modify_flags(&vmi, madv_behavior->prev, vma, - range->start, range->end, &new_flags); + range->start, range->end, &new_vma_flags); if (IS_ERR(vma)) return PTR_ERR(vma); @@ -174,7 +176,7 @@ static int madvise_update_vma(vm_flags_t new_flags, /* vm_flags is protected by the mmap_lock held in write mode. */ vma_start_write(vma); - vm_flags_reset(vma, new_flags); + vma->flags = new_vma_flags; if (set_new_anon_name) return replace_anon_vma_name(vma, anon_name); diff --git a/mm/mlock.c b/mm/mlock.c index fd648138bc72..fdbd1434a35f 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -415,13 +415,14 @@ out: * @vma - vma containing range to be mlock()ed or munlock()ed * @start - start address in @vma of the range * @end - end of range in @vma - * @newflags - the new set of flags for @vma. + * @new_vma_flags - the new set of flags for @vma. * * Called for mlock(), mlock2() and mlockall(), to set @vma VM_LOCKED; * called for munlock() and munlockall(), to clear VM_LOCKED from @vma. */ static void mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, vm_flags_t newflags) + unsigned long start, unsigned long end, + vma_flags_t *new_vma_flags) { static const struct mm_walk_ops mlock_walk_ops = { .pmd_entry = mlock_pte_range, @@ -439,18 +440,18 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, * combination should not be visible to other mmap_lock users; * but WRITE_ONCE so rmap walkers must see VM_IO if VM_LOCKED. 
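 *
 * Sketch of the sequence below: publish VMA_IO_BIT alongside
 * VMA_LOCKED_BIT, mlock the pages via the pagewalk, then clear
 * VMA_IO_BIT again with a second vma_flags_reset_once().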
*/ - if (newflags & VM_LOCKED) - newflags |= VM_IO; + if (vma_flags_test(new_vma_flags, VMA_LOCKED_BIT)) + vma_flags_set(new_vma_flags, VMA_IO_BIT); vma_start_write(vma); - vm_flags_reset_once(vma, newflags); + vma_flags_reset_once(vma, new_vma_flags); lru_add_drain(); walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL); lru_add_drain(); - if (newflags & VM_IO) { - newflags &= ~VM_IO; - vm_flags_reset_once(vma, newflags); + if (vma_flags_test(new_vma_flags, VMA_IO_BIT)) { + vma_flags_clear(new_vma_flags, VMA_IO_BIT); + vma_flags_reset_once(vma, new_vma_flags); } } @@ -467,20 +468,22 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, vm_flags_t newflags) { + vma_flags_t new_vma_flags = legacy_to_vma_flags(newflags); + const vma_flags_t old_vma_flags = vma->flags; struct mm_struct *mm = vma->vm_mm; int nr_pages; int ret = 0; - vm_flags_t oldflags = vma->vm_flags; - if (newflags == oldflags || vma_is_secretmem(vma) || - !vma_supports_mlock(vma)) + if (vma_flags_same_pair(&old_vma_flags, &new_vma_flags) || + vma_is_secretmem(vma) || !vma_supports_mlock(vma)) { /* * Don't set VM_LOCKED or VM_LOCKONFAULT and don't count. * For secretmem, don't allow the memory to be unlocked. */ goto out; + } - vma = vma_modify_flags(vmi, *prev, vma, start, end, &newflags); + vma = vma_modify_flags(vmi, *prev, vma, start, end, &new_vma_flags); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; @@ -490,9 +493,9 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, * Keep track of amount of locked VM. */ nr_pages = (end - start) >> PAGE_SHIFT; - if (!(newflags & VM_LOCKED)) + if (!vma_flags_test(&new_vma_flags, VMA_LOCKED_BIT)) nr_pages = -nr_pages; - else if (oldflags & VM_LOCKED) + else if (vma_flags_test(&old_vma_flags, VMA_LOCKED_BIT)) nr_pages = 0; mm->locked_vm += nr_pages; @@ -501,12 +504,13 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, * It's okay if try_to_unmap_one unmaps a page just after we * set VM_LOCKED, populate_vma_page_range will bring it back. */ - if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) { + if (vma_flags_test(&new_vma_flags, VMA_LOCKED_BIT) && + vma_flags_test(&old_vma_flags, VMA_LOCKED_BIT)) { /* No work to do, and mlocking twice would be wrong */ vma_start_write(vma); - vm_flags_reset(vma, newflags); + vma->flags = new_vma_flags; } else { - mlock_vma_pages_range(vma, start, end, newflags); + mlock_vma_pages_range(vma, start, end, &new_vma_flags); } out: *prev = vma; diff --git a/mm/mprotect.c b/mm/mprotect.c index eaa724b99908..941f1211da0d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -756,13 +756,11 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, vma_flags_clear(&new_vma_flags, VMA_ACCOUNT_BIT); } - newflags = vma_flags_to_legacy(new_vma_flags); - vma = vma_modify_flags(vmi, *pprev, vma, start, end, &newflags); + vma = vma_modify_flags(vmi, *pprev, vma, start, end, &new_vma_flags); if (IS_ERR(vma)) { error = PTR_ERR(vma); goto fail; } - new_vma_flags = legacy_to_vma_flags(newflags); *pprev = vma; @@ -771,7 +769,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, * held in write mode. 
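 *
 * (vma_start_write() below also write-locks the VMA itself, so readers
 * under the per-VMA lock cannot observe a half-updated flags word.)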
*/ vma_start_write(vma); - vm_flags_reset_once(vma, newflags); + vma_flags_reset_once(vma, &new_vma_flags); if (vma_wants_manual_pte_write_upgrade(vma)) mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; vma_set_page_prot(vma); @@ -796,6 +794,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, } vm_stat_account(mm, vma_flags_to_legacy(old_vma_flags), -nrpages); + newflags = vma_flags_to_legacy(new_vma_flags); vm_stat_account(mm, newflags, nrpages); perf_event_mmap(vma); return 0; diff --git a/mm/mseal.c b/mm/mseal.c index ac58643181f7..e2093ae3d25c 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -68,14 +68,17 @@ static int mseal_apply(struct mm_struct *mm, const unsigned long curr_start = MAX(vma->vm_start, start); const unsigned long curr_end = MIN(vma->vm_end, end); - if (!(vma->vm_flags & VM_SEALED)) { - vm_flags_t vm_flags = vma->vm_flags | VM_SEALED; + if (!vma_test(vma, VMA_SEALED_BIT)) { + vma_flags_t vma_flags = vma->flags; + + vma_flags_set(&vma_flags, VMA_SEALED_BIT); vma = vma_modify_flags(&vmi, prev, vma, curr_start, - curr_end, &vm_flags); + curr_end, &vma_flags); if (IS_ERR(vma)) return PTR_ERR(vma); - vm_flags_set(vma, VM_SEALED); + vma_start_write(vma); + vma_set_flags(vma, VMA_SEALED_BIT); } prev = vma; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 2c565c7134b6..89879c3ba344 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1976,6 +1976,9 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, { struct vm_area_struct *ret; bool give_up_on_oom = false; + vma_flags_t new_vma_flags = vma->flags; + + vma_flags_clear_mask(&new_vma_flags, __VMA_UFFD_FLAGS); /* * If we are modifying only and not splitting, just give up on the merge @@ -1989,8 +1992,8 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, uffd_wp_range(vma, start, end - start, false); ret = vma_modify_flags_uffd(vmi, prev, vma, start, end, - vma->vm_flags & ~__VM_UFFD_FLAGS, - NULL_VM_UFFD_CTX, give_up_on_oom); + &new_vma_flags, NULL_VM_UFFD_CTX, + give_up_on_oom); /* * In the vma_merge() successful mprotect-like case 8: @@ -2010,10 +2013,11 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx, unsigned long start, unsigned long end, bool wp_async) { + vma_flags_t vma_flags = legacy_to_vma_flags(vm_flags); VMA_ITERATOR(vmi, ctx->mm, start); struct vm_area_struct *prev = vma_prev(&vmi); unsigned long vma_end; - vm_flags_t new_flags; + vma_flags_t new_vma_flags; if (vma->vm_start < start) prev = vma; @@ -2024,23 +2028,26 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx, VM_WARN_ON_ONCE(!vma_can_userfault(vma, vm_flags, wp_async)); VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); - VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)); + VM_WARN_ON_ONCE(!vma_test(vma, VMA_MAYWRITE_BIT)); /* * Nothing to do: this vma is already registered into this * userfaultfd and with the right tracking mode too. 
*/ if (vma->vm_userfaultfd_ctx.ctx == ctx && - (vma->vm_flags & vm_flags) == vm_flags) + vma_test_all_mask(vma, vma_flags)) goto skip; if (vma->vm_start > start) start = vma->vm_start; vma_end = min(end, vma->vm_end); - new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; + new_vma_flags = vma->flags; + vma_flags_clear_mask(&new_vma_flags, __VMA_UFFD_FLAGS); + vma_flags_set_mask(&new_vma_flags, vma_flags); + vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, - new_flags, + &new_vma_flags, (struct vm_userfaultfd_ctx){ctx}, /* give_up_on_oom = */false); if (IS_ERR(vma)) diff --git a/mm/vma.c b/mm/vma.c index 9d194f8e7acb..16a1d708c978 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -1710,13 +1710,13 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg) struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, - vm_flags_t *vm_flags_ptr) + vma_flags_t *vma_flags_ptr) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); - const vm_flags_t vm_flags = *vm_flags_ptr; + const vma_flags_t vma_flags = *vma_flags_ptr; struct vm_area_struct *ret; - vmg.vm_flags = vm_flags; + vmg.vma_flags = vma_flags; ret = vma_modify(&vmg); if (IS_ERR(ret)) @@ -1728,7 +1728,7 @@ struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, * them to the caller. */ if (vmg.state == VMA_MERGE_SUCCESS) - *vm_flags_ptr = ret->vm_flags; + *vma_flags_ptr = ret->flags; return ret; } @@ -1758,12 +1758,13 @@ struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, - unsigned long start, unsigned long end, vm_flags_t vm_flags, - struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom) + unsigned long start, unsigned long end, + const vma_flags_t *vma_flags, struct vm_userfaultfd_ctx new_ctx, + bool give_up_on_oom) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); - vmg.vm_flags = vm_flags; + vmg.vma_flags = *vma_flags; vmg.uffd_ctx = new_ctx; if (give_up_on_oom) vmg.give_up_on_oom = true; diff --git a/mm/vma.h b/mm/vma.h index 1f2de6cb3b97..270008e5babc 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -342,24 +342,23 @@ void unmap_region(struct unmap_desc *unmap); * @vma: The VMA containing the range @start to @end to be updated. * @start: The start of the range to update. May be offset within @vma. * @end: The exclusive end of the range to update, may be offset within @vma. - * @vm_flags_ptr: A pointer to the VMA flags that the @start to @end range is + * @vma_flags_ptr: A pointer to the VMA flags that the @start to @end range is * about to be set to. On merge, this will be updated to include sticky flags. * * IMPORTANT: The actual modification being requested here is NOT applied, * rather the VMA is perhaps split, perhaps merged to accommodate the change, * and the caller is expected to perform the actual modification. * - * In order to account for sticky VMA flags, the @vm_flags_ptr parameter points + * In order to account for sticky VMA flags, the @vma_flags_ptr parameter points * to the requested flags which are then updated so the caller, should they * overwrite any existing flags, correctly retains these. * * Returns: A VMA which contains the range @start to @end ready to have its - * flags altered to *@vm_flags. + * flags altered to *@vma_flags. 
*/ __must_check struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, - unsigned long start, unsigned long end, - vm_flags_t *vm_flags_ptr); + unsigned long start, unsigned long end, vma_flags_t *vma_flags_ptr); /** * vma_modify_name() - Perform any necessary split/merge in preparation for @@ -418,7 +417,7 @@ __must_check struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, * @vma: The VMA containing the range @start to @end to be updated. * @start: The start of the range to update. May be offset within @vma. * @end: The exclusive end of the range to update, may be offset within @vma. - * @vm_flags: The VMA flags that the @start to @end range is about to be set to. + * @vma_flags: The VMA flags that the @start to @end range is about to be set to. * @new_ctx: The userfaultfd context that the @start to @end range is about to * be set to. * @give_up_on_oom: If an out of memory condition occurs on merge, simply give @@ -429,11 +428,11 @@ __must_check struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, * and the caller is expected to perform the actual modification. * * Returns: A VMA which contains the range @start to @end ready to have its VMA - * flags changed to @vm_flags and its userfaultfd context changed to @new_ctx. + * flags changed to @vma_flags and its userfaultfd context changed to @new_ctx. */ __must_check struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, - unsigned long start, unsigned long end, vm_flags_t vm_flags, + unsigned long start, unsigned long end, const vma_flags_t *vma_flags, struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom); __must_check struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg); diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 58a621ec389f..9dd57f50ea6d 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -871,16 +871,20 @@ static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_init(vma, flags); } -static inline void vm_flags_reset_once(struct vm_area_struct *vma, - vm_flags_t flags) +static inline void vma_flags_reset_once(struct vm_area_struct *vma, + vma_flags_t *flags) { - vma_assert_write_locked(vma); - /* - * The user should only be interested in avoiding reordering of - * assignment to the first word. - */ - vma_flags_clear_all(&vma->flags); - vma_flags_overwrite_word_once(&vma->flags, flags); + const unsigned long word = flags->__vma_flags[0]; + + /* It is assumed only the first system word must be written once. */ + vma_flags_overwrite_word_once(&vma->flags, word); + /* The remainder can be copied normally. 
*/ + if (NUM_VMA_FLAG_BITS > BITS_PER_LONG) { + unsigned long *dst = &vma->flags.__vma_flags[1]; + const unsigned long *src = &flags->__vma_flags[1]; + + bitmap_copy(dst, src, NUM_VMA_FLAG_BITS - BITS_PER_LONG); + } } static inline void vm_flags_set(struct vm_area_struct *vma, diff --git a/tools/testing/vma/tests/merge.c b/tools/testing/vma/tests/merge.c index 44e3977e3fc0..03b6f9820e0a 100644 --- a/tools/testing/vma/tests/merge.c +++ b/tools/testing/vma/tests/merge.c @@ -132,7 +132,6 @@ static bool test_simple_modify(void) struct vm_area_struct *vma; vma_flags_t vma_flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT); - vm_flags_t legacy_flags = VM_READ | VM_WRITE; struct mm_struct mm = {}; struct vm_area_struct *init_vma = alloc_vma(&mm, 0, 0x3000, 0, vma_flags); VMA_ITERATOR(vmi, &mm, 0x1000); @@ -144,7 +143,7 @@ static bool test_simple_modify(void) * performs the merge/split only. */ vma = vma_modify_flags(&vmi, init_vma, init_vma, - 0x1000, 0x2000, &legacy_flags); + 0x1000, 0x2000, &vma_flags); ASSERT_NE(vma, NULL); /* We modify the provided VMA, and on split allocate new VMAs. */ ASSERT_EQ(vma, init_vma); -- cgit v1.2.3 From 90cb921c4d7bf92854344d3e76561f48784c613e Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:41 +0000 Subject: mm/vma: convert __mmap_region() to use vma_flags_t Update the mmap() logic implemented in __mmap_region() and the functions it invokes. The mmap_region() function converts its input vm_flags_t parameter to a vma_flags_t value, which it then passes to __mmap_region(), which uses the vma_flags_t value consistently from then on. As part of the change, we convert map_deny_write_exec() to use vma_flags_t (it was incorrectly using unsigned long before), and place it in vma.h, as it is only used internally in mm. With this change, we eliminate the legacy is_shared_maywrite_vm_flags() helper function, which is no longer required. We are also able to update the MMAP_STATE() and VMG_MMAP_STATE() macros to use the vma_flags_t value. Finally, we update the VMA tests to reflect the change. Link: https://lkml.kernel.org/r/1fc33a404c962f02da778da100387cc19bd62153.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H.
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 18 ++++++++++----- include/linux/mman.h | 49 --------------------------------------- mm/mprotect.c | 4 +++- mm/vma.c | 25 ++++++++++---------- mm/vma.h | 51 +++++++++++++++++++++++++++++++++++++++++ tools/testing/vma/include/dup.h | 34 ++++++--------------------- tools/testing/vma/tests/mmap.c | 18 +++++---------- 7 files changed, 92 insertions(+), 107 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 72bc5016094b..9472b3c9a22b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1522,12 +1522,6 @@ static inline bool vma_is_accessible(const struct vm_area_struct *vma) return vma->vm_flags & VM_ACCESS_FLAGS; } -static inline bool is_shared_maywrite_vm_flags(vm_flags_t vm_flags) -{ - return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == - (VM_SHARED | VM_MAYWRITE); -} - static inline bool is_shared_maywrite(const vma_flags_t *flags) { return vma_flags_test_all(flags, VMA_SHARED_BIT, VMA_MAYWRITE_BIT); @@ -4335,12 +4329,24 @@ static inline bool range_in_vma(const struct vm_area_struct *vma, #ifdef CONFIG_MMU pgprot_t vm_get_page_prot(vm_flags_t vm_flags); + +static inline pgprot_t vma_get_page_prot(vma_flags_t vma_flags) +{ + const vm_flags_t vm_flags = vma_flags_to_legacy(vma_flags); + + return vm_get_page_prot(vm_flags); +} + void vma_set_page_prot(struct vm_area_struct *vma); #else static inline pgprot_t vm_get_page_prot(vm_flags_t vm_flags) { return __pgprot(0); } +static inline pgprot_t vma_get_page_prot(vma_flags_t vma_flags) +{ + return __pgprot(0); +} static inline void vma_set_page_prot(struct vm_area_struct *vma) { vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); diff --git a/include/linux/mman.h b/include/linux/mman.h index 0ba8a7e8b90a..389521594c69 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -170,53 +170,4 @@ static inline bool arch_memory_deny_write_exec_supported(void) } #define arch_memory_deny_write_exec_supported arch_memory_deny_write_exec_supported #endif - -/* - * Denies creating a writable executable mapping or gaining executable permissions. - * - * This denies the following: - * - * a) mmap(PROT_WRITE | PROT_EXEC) - * - * b) mmap(PROT_WRITE) - * mprotect(PROT_EXEC) - * - * c) mmap(PROT_WRITE) - * mprotect(PROT_READ) - * mprotect(PROT_EXEC) - * - * But allows the following: - * - * d) mmap(PROT_READ | PROT_EXEC) - * mmap(PROT_READ | PROT_EXEC | PROT_BTI) - * - * This is only applicable if the user has set the Memory-Deny-Write-Execute - * (MDWE) protection mask for the current process. - * - * @old specifies the VMA flags the VMA originally possessed, and @new the ones - * we propose to set. - * - * Return: false if proposed change is OK, true if not ok and should be denied. - */ -static inline bool map_deny_write_exec(unsigned long old, unsigned long new) -{ - /* If MDWE is disabled, we have nothing to deny. */ - if (!mm_flags_test(MMF_HAS_MDWE, current->mm)) - return false; - - /* If the new VMA is not executable, we have nothing to deny. 
*/ - if (!(new & VM_EXEC)) - return false; - - /* Under MDWE we do not accept newly writably executable VMAs... */ - if (new & VM_WRITE) - return true; - - /* ...nor previously non-executable VMAs becoming executable. */ - if (!(old & VM_EXEC)) - return true; - - return false; -} - #endif /* _LINUX_MMAN_H */ diff --git a/mm/mprotect.c b/mm/mprotect.c index 941f1211da0d..007d9a72b2f0 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -882,6 +882,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, tmp = vma->vm_start; for_each_vma_range(vmi, vma, end) { vm_flags_t mask_off_old_flags; + vma_flags_t new_vma_flags; vm_flags_t newflags; int new_vma_pkey; @@ -904,6 +905,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey); newflags = calc_vm_prot_bits(prot, new_vma_pkey); newflags |= (vma->vm_flags & ~mask_off_old_flags); + new_vma_flags = legacy_to_vma_flags(newflags); /* newflags >> 4 shift VM_MAY% in place of VM_% */ if ((newflags & ~(newflags >> 4)) & VM_ACCESS_FLAGS) { @@ -911,7 +913,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, break; } - if (map_deny_write_exec(vma->vm_flags, newflags)) { + if (map_deny_write_exec(&vma->flags, &new_vma_flags)) { error = -EACCES; break; } diff --git a/mm/vma.c b/mm/vma.c index 16a1d708c978..c335f989586f 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -44,7 +44,7 @@ struct mmap_state { bool file_doesnt_need_get :1; }; -#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vm_flags_, file_) \ +#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vma_flags_, file_) \ struct mmap_state name = { \ .mm = mm_, \ .vmi = vmi_, \ @@ -52,9 +52,9 @@ struct mmap_state { .end = (addr_) + (len_), \ .pgoff = pgoff_, \ .pglen = PHYS_PFN(len_), \ - .vm_flags = vm_flags_, \ + .vma_flags = vma_flags_, \ .file = file_, \ - .page_prot = vm_get_page_prot(vm_flags_), \ + .page_prot = vma_get_page_prot(vma_flags_), \ } #define VMG_MMAP_STATE(name, map_, vma_) \ @@ -63,7 +63,7 @@ struct mmap_state { .vmi = (map_)->vmi, \ .start = (map_)->addr, \ .end = (map_)->end, \ - .vm_flags = (map_)->vm_flags, \ + .vma_flags = (map_)->vma_flags, \ .pgoff = (map_)->pgoff, \ .file = (map_)->file, \ .prev = (map_)->prev, \ @@ -2746,14 +2746,14 @@ static int call_action_complete(struct mmap_state *map, } static unsigned long __mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf) + unsigned long len, vma_flags_t vma_flags, + unsigned long pgoff, struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; bool have_mmap_prepare = file && file->f_op->mmap_prepare; VMA_ITERATOR(vmi, mm, addr); - MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file); + MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vma_flags, file); struct vm_area_desc desc = { .mm = mm, .file = file, @@ -2837,16 +2837,17 @@ abort_munmap: * been performed. */ unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf) + unsigned long len, vm_flags_t vm_flags, + unsigned long pgoff, struct list_head *uf) { unsigned long ret; bool writable_file_mapping = false; + const vma_flags_t vma_flags = legacy_to_vma_flags(vm_flags); mmap_assert_write_locked(current->mm); /* Check to see if MDWE is applicable. 
*/ - if (map_deny_write_exec(vm_flags, vm_flags)) + if (map_deny_write_exec(&vma_flags, &vma_flags)) return -EACCES; /* Allow architectures to sanity-check the vm_flags. */ @@ -2854,7 +2855,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, return -EINVAL; /* Map writable and ensure this isn't a sealed memfd. */ - if (file && is_shared_maywrite_vm_flags(vm_flags)) { + if (file && is_shared_maywrite(&vma_flags)) { int error = mapping_map_writable(file->f_mapping); if (error) @@ -2862,7 +2863,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, writable_file_mapping = true; } - ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf); + ret = __mmap_region(file, addr, len, vma_flags, pgoff, uf); /* Clear our write mapping regardless of error. */ if (writable_file_mapping) diff --git a/mm/vma.h b/mm/vma.h index 270008e5babc..adc18f7dd9f1 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -704,4 +704,55 @@ int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); #endif +#ifdef CONFIG_MMU +/* + * Denies creating a writable executable mapping or gaining executable permissions. + * + * This denies the following: + * + * a) mmap(PROT_WRITE | PROT_EXEC) + * + * b) mmap(PROT_WRITE) + * mprotect(PROT_EXEC) + * + * c) mmap(PROT_WRITE) + * mprotect(PROT_READ) + * mprotect(PROT_EXEC) + * + * But allows the following: + * + * d) mmap(PROT_READ | PROT_EXEC) + * mmap(PROT_READ | PROT_EXEC | PROT_BTI) + * + * This is only applicable if the user has set the Memory-Deny-Write-Execute + * (MDWE) protection mask for the current process. + * + * @old specifies the VMA flags the VMA originally possessed, and @new the ones + * we propose to set. + * + * Return: false if proposed change is OK, true if not ok and should be denied. + */ +static inline bool map_deny_write_exec(const vma_flags_t *old, + const vma_flags_t *new) +{ + /* If MDWE is disabled, we have nothing to deny. */ + if (!mm_flags_test(MMF_HAS_MDWE, current->mm)) + return false; + + /* If the new VMA is not executable, we have nothing to deny. */ + if (!vma_flags_test(new, VMA_EXEC_BIT)) + return false; + + /* Under MDWE we do not accept newly writably executable VMAs... */ + if (vma_flags_test(new, VMA_WRITE_BIT)) + return true; + + /* ...nor previously non-executable VMAs becoming executable. */ + if (!vma_flags_test(old, VMA_EXEC_BIT)) + return true; + + return false; +} +#endif + #endif /* __MM_VMA_H */ diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 9dd57f50ea6d..ab92358b082c 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -1124,12 +1124,6 @@ static __always_inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, #define vma_desc_clear_flags(desc, ...) \ vma_desc_clear_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) -static inline bool is_shared_maywrite_vm_flags(vm_flags_t vm_flags) -{ - return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == - (VM_SHARED | VM_MAYWRITE); -} - static inline bool is_shared_maywrite(const vma_flags_t *flags) { return vma_flags_test_all(flags, VMA_SHARED_BIT, VMA_MAYWRITE_BIT); @@ -1446,27 +1440,6 @@ static inline bool mlock_future_ok(const struct mm_struct *mm, return locked_pages <= limit_pages; } -static inline bool map_deny_write_exec(unsigned long old, unsigned long new) -{ - /* If MDWE is disabled, we have nothing to deny. 
*/ - if (mm_flags_test(MMF_HAS_MDWE, current->mm)) - return false; - - /* If the new VMA is not executable, we have nothing to deny. */ - if (!(new & VM_EXEC)) - return false; - - /* Under MDWE we do not accept newly writably executable VMAs... */ - if (new & VM_WRITE) - return true; - - /* ...nor previously non-executable VMAs becoming executable. */ - if (!(old & VM_EXEC)) - return true; - - return false; -} - static inline int mapping_map_writable(struct address_space *mapping) { return atomic_inc_unless_negative(&mapping->i_mmap_writable) ? @@ -1518,3 +1491,10 @@ static inline int get_sysctl_max_map_count(void) #ifndef pgtable_supports_soft_dirty #define pgtable_supports_soft_dirty() IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) #endif + +static inline pgprot_t vma_get_page_prot(vma_flags_t vma_flags) +{ + const vm_flags_t vm_flags = vma_flags_to_legacy(vma_flags); + + return vm_get_page_prot(vm_flags); +} diff --git a/tools/testing/vma/tests/mmap.c b/tools/testing/vma/tests/mmap.c index bded4ecbe5db..c85bc000d1cb 100644 --- a/tools/testing/vma/tests/mmap.c +++ b/tools/testing/vma/tests/mmap.c @@ -2,6 +2,8 @@ static bool test_mmap_region_basic(void) { + const vma_flags_t vma_flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, + VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT); struct mm_struct mm = {}; unsigned long addr; struct vm_area_struct *vma; @@ -10,27 +12,19 @@ static bool test_mmap_region_basic(void) current->mm = &mm; /* Map at 0x300000, length 0x3000. */ - addr = __mmap_region(NULL, 0x300000, 0x3000, - VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, - 0x300, NULL); + addr = __mmap_region(NULL, 0x300000, 0x3000, vma_flags, 0x300, NULL); ASSERT_EQ(addr, 0x300000); /* Map at 0x250000, length 0x3000. */ - addr = __mmap_region(NULL, 0x250000, 0x3000, - VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, - 0x250, NULL); + addr = __mmap_region(NULL, 0x250000, 0x3000, vma_flags, 0x250, NULL); ASSERT_EQ(addr, 0x250000); /* Map at 0x303000, merging to 0x300000 of length 0x6000. */ - addr = __mmap_region(NULL, 0x303000, 0x3000, - VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, - 0x303, NULL); + addr = __mmap_region(NULL, 0x303000, 0x3000, vma_flags, 0x303, NULL); ASSERT_EQ(addr, 0x303000); /* Map at 0x24d000, merging to 0x250000 of length 0x6000. */ - addr = __mmap_region(NULL, 0x24d000, 0x3000, - VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, - 0x24d, NULL); + addr = __mmap_region(NULL, 0x24d000, 0x3000, vma_flags, 0x24d, NULL); ASSERT_EQ(addr, 0x24d000); ASSERT_EQ(mm.map_count, 2); -- cgit v1.2.3 From 3e4bb2706817710d9461394da8b75be79981586b Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:27 +0000 Subject: mm: various small mmap_prepare cleanups Patch series "mm: expand mmap_prepare functionality and usage", v4. This series expands the mmap_prepare functionality, which is intended to replace the deprecated f_op->mmap hook, which has been the source of bugs and security issues for some time. This series starts with some cleanup of existing mmap_prepare logic, then adds documentation for the mmap_prepare call to make it easier for filesystem and driver writers to understand how it works. It then, importantly, adds a vm_ops->mapped hook, a key feature previously missing from mmap_prepare - this is invoked when a mapping established by a driver which specifies mmap_prepare has succeeded without being merged with another VMA. mmap_prepare is invoked prior to a merge being attempted, so you cannot manipulate state such as reference counts as if it were a new mapping.
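As a minimal sketch of how a driver might use the new hook (the "mydrv" names and the mydrv_state type are invented for illustration; the hook signature is the one this series adds to vm_operations_struct):

struct mydrv_state {
	atomic_t mappings;	/* number of live mappings of this device */
};

/*
 * Hypothetical driver hook, for illustration only. It runs once the new
 * mapping is established and nothing further can fail before mmap()
 * returns, so per-mapping state may safely be set up here.
 */
static int mydrv_mapped(unsigned long start, unsigned long end, pgoff_t pgoff,
			const struct file *file, void **vm_private_data)
{
	struct mydrv_state *state = file->private_data;

	atomic_inc(&state->mappings);
	*vm_private_data = state;	/* output parameter */
	return 0;			/* an error here unmaps the VMA */
}

static void mydrv_open(struct vm_area_struct *vma)
{
	struct mydrv_state *state = vma->vm_private_data;

	atomic_inc(&state->mappings);	/* VMA duplicated on split/fork */
}

static void mydrv_close(struct vm_area_struct *vma)
{
	struct mydrv_state *state = vma->vm_private_data;

	atomic_dec(&state->mappings);
}

static const struct vm_operations_struct mydrv_vm_ops = {
	.mapped	= mydrv_mapped,
	.open	= mydrv_open,
	.close	= mydrv_close,
};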
The vm_ops->mapped hook allows a driver to perform tasks required at this stage, and provides symmetry with subsequent vm_ops->open/close calls. The series uses this to correct the afs implementation, which wrongly manipulated a reference count at mmap_prepare time. It then adds an mmap_prepare equivalent of vm_iomap_memory() - mmap_action_simple_ioremap(), then uses this to update a number of drivers. It then splits out the mmap_prepare compatibility layer (which allows for invocation of mmap_prepare hooks in an mmap() hook) in such a way as to allow for more incremental implementation of mmap_prepare hooks. It then uses this to extend mmap_prepare usage in drivers. Finally, it adds an mmap_prepare equivalent of vm_map_pages(), which lays the foundation for future work which will extend mmap_prepare to DMA coherent mappings. This patch (of 21): Rather than passing arbitrary fields, pass a vm_area_desc pointer to mmap prepare functions, and an action and VMA pointer to mmap complete, in order to put all the action-specific logic in the function actually doing the work. Additionally, allow mmap prepare functions to return an error so we can error out as soon as possible if there is something logically incorrect in the input. Update remap_pfn_range_prepare() to properly check the input range for the CoW case. Also remove io_remap_pfn_range_complete(), as we can simply set up the fields correctly in io_remap_pfn_range_prepare() and use remap_pfn_range_complete() for this. While we're here, make remap_pfn_range_prepare_vma() a little neater, and pass mmap_action directly to call_action_complete(). Then, update compat_vma_mmap() to perform its logic directly, as __compat_vma_mmap() is not used by anything so we don't need to export it. Also update compat_vma_mmap() to use vfs_mmap_prepare() rather than calling the mmap_prepare op directly. Finally, update the VMA userland tests to reflect the changes. Link: https://lkml.kernel.org/r/cover.1774045440.git.ljs@kernel.org Link: https://lkml.kernel.org/r/99f408e4694f44ab12bdc55fe0bd9685d3bd1117.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K.
Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- include/linux/fs.h | 2 - include/linux/mm.h | 7 +-- mm/internal.h | 32 +++++----- mm/memory.c | 45 +++++++++----- mm/util.c | 121 +++++++++++++++++--------------------- mm/vma.c | 24 ++++---- tools/testing/vma/include/dup.h | 7 ++- tools/testing/vma/include/stubs.h | 8 +-- 8 files changed, 126 insertions(+), 120 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 8b3dd145b25e..a2628a12bd2b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2058,8 +2058,6 @@ static inline bool can_mmap_file(struct file *file) return true; } -int __compat_vma_mmap(const struct file_operations *f_op, - struct file *file, struct vm_area_struct *vma); int compat_vma_mmap(struct file *file, struct vm_area_struct *vma); static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9472b3c9a22b..6ca2fc5ae83f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4304,10 +4304,9 @@ static inline void mmap_action_ioremap_full(struct vm_area_desc *desc, mmap_action_ioremap(desc, desc->start, start_pfn, vma_desc_size(desc)); } -void mmap_action_prepare(struct mmap_action *action, - struct vm_area_desc *desc); -int mmap_action_complete(struct mmap_action *action, - struct vm_area_struct *vma); +int mmap_action_prepare(struct vm_area_desc *desc); +int mmap_action_complete(struct vm_area_struct *vma, + struct mmap_action *action); /* Look up the first VMA which exactly match the interval vm_start ... vm_end */ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, diff --git a/mm/internal.h b/mm/internal.h index 9c690f8635da..4dddd89153d4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1839,26 +1839,28 @@ int walk_page_range_debug(struct mm_struct *mm, unsigned long start, void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm); int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm); -void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn); -int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t pgprot); +int remap_pfn_range_prepare(struct vm_area_desc *desc); +int remap_pfn_range_complete(struct vm_area_struct *vma, + struct mmap_action *action); -static inline void io_remap_pfn_range_prepare(struct vm_area_desc *desc, - unsigned long orig_pfn, unsigned long size) +static inline int io_remap_pfn_range_prepare(struct vm_area_desc *desc) { + struct mmap_action *action = &desc->action; + const unsigned long orig_pfn = action->remap.start_pfn; + const pgprot_t orig_pgprot = action->remap.pgprot; + const unsigned long size = action->remap.size; const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size); + int err; - return remap_pfn_range_prepare(desc, pfn); -} + action->remap.start_pfn = pfn; + action->remap.pgprot = pgprot_decrypted(orig_pgprot); + err = remap_pfn_range_prepare(desc); + if (err) + return err; -static inline int io_remap_pfn_range_complete(struct vm_area_struct *vma, - unsigned long addr, unsigned long orig_pfn, unsigned long size, - pgprot_t orig_prot) -{ - const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size); - const pgprot_t prot = pgprot_decrypted(orig_prot); - - return 
remap_pfn_range_complete(vma, addr, pfn, size, prot); + /* Remap does the actual work. */ + action->type = MMAP_REMAP_PFN; + return 0; } #ifdef CONFIG_MMU_NOTIFIER diff --git a/mm/memory.c b/mm/memory.c index 425e852a2eb7..10a61dd81f97 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3099,26 +3099,34 @@ static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, } #endif -void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn) +int remap_pfn_range_prepare(struct vm_area_desc *desc) { - /* - * We set addr=VMA start, end=VMA end here, so this won't fail, but we - * check it again on complete and will fail there if specified addr is - * invalid. - */ - get_remap_pgoff(vma_desc_is_cow_mapping(desc), desc->start, desc->end, - desc->start, desc->end, pfn, &desc->pgoff); + const struct mmap_action *action = &desc->action; + const unsigned long start = action->remap.start; + const unsigned long end = start + action->remap.size; + const unsigned long pfn = action->remap.start_pfn; + const bool is_cow = vma_desc_is_cow_mapping(desc); + int err; + + err = get_remap_pgoff(is_cow, start, end, desc->start, desc->end, pfn, + &desc->pgoff); + if (err) + return err; + vma_desc_set_flags_mask(desc, VMA_REMAP_FLAGS); + return 0; } -static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size) +static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn, + unsigned long size) { - unsigned long end = addr + PAGE_ALIGN(size); + const unsigned long end = addr + PAGE_ALIGN(size); + const bool is_cow = is_cow_mapping(vma->vm_flags); int err; - err = get_remap_pgoff(is_cow_mapping(vma->vm_flags), addr, end, - vma->vm_start, vma->vm_end, pfn, &vma->vm_pgoff); + err = get_remap_pgoff(is_cow, addr, end, vma->vm_start, vma->vm_end, + pfn, &vma->vm_pgoff); if (err) return err; @@ -3151,10 +3159,15 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(remap_pfn_range); -int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t prot) +int remap_pfn_range_complete(struct vm_area_struct *vma, + struct mmap_action *action) { - return do_remap_pfn_range(vma, addr, pfn, size, prot); + const unsigned long start = action->remap.start; + const unsigned long pfn = action->remap.start_pfn; + const unsigned long size = action->remap.size; + const pgprot_t prot = action->remap.pgprot; + + return do_remap_pfn_range(vma, start, pfn, size, prot); } /** diff --git a/mm/util.c b/mm/util.c index ce7ae80047cf..73c97a748d8e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1163,43 +1163,6 @@ void flush_dcache_folio(struct folio *folio) EXPORT_SYMBOL(flush_dcache_folio); #endif -/** - * __compat_vma_mmap() - See description for compat_vma_mmap() - * for details. This is the same operation, only with a specific file operations - * struct which may or may not be the same as vma->vm_file->f_op. - * @f_op: The file operations whose .mmap_prepare() hook is specified. - * @file: The file which backs or will back the mapping. - * @vma: The VMA to apply the .mmap_prepare() hook to. - * Returns: 0 on success or error. 
- */ -int __compat_vma_mmap(const struct file_operations *f_op, - struct file *file, struct vm_area_struct *vma) -{ - struct vm_area_desc desc = { - .mm = vma->vm_mm, - .file = file, - .start = vma->vm_start, - .end = vma->vm_end, - - .pgoff = vma->vm_pgoff, - .vm_file = vma->vm_file, - .vma_flags = vma->flags, - .page_prot = vma->vm_page_prot, - - .action.type = MMAP_NOTHING, /* Default */ - }; - int err; - - err = f_op->mmap_prepare(&desc); - if (err) - return err; - - mmap_action_prepare(&desc.action, &desc); - set_vma_from_desc(vma, &desc); - return mmap_action_complete(&desc.action, vma); -} -EXPORT_SYMBOL(__compat_vma_mmap); - /** * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an * existing VMA and execute any requested actions. @@ -1228,7 +1191,31 @@ EXPORT_SYMBOL(__compat_vma_mmap); */ int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) { - return __compat_vma_mmap(file->f_op, file, vma); + struct vm_area_desc desc = { + .mm = vma->vm_mm, + .file = file, + .start = vma->vm_start, + .end = vma->vm_end, + + .pgoff = vma->vm_pgoff, + .vm_file = vma->vm_file, + .vma_flags = vma->flags, + .page_prot = vma->vm_page_prot, + + .action.type = MMAP_NOTHING, /* Default */ + }; + int err; + + err = vfs_mmap_prepare(file, &desc); + if (err) + return err; + + err = mmap_action_prepare(&desc); + if (err) + return err; + + set_vma_from_desc(vma, &desc); + return mmap_action_complete(vma, &desc.action); } EXPORT_SYMBOL(compat_vma_mmap); @@ -1320,8 +1307,8 @@ again: } } -static int mmap_action_finish(struct mmap_action *action, - const struct vm_area_struct *vma, int err) +static int mmap_action_finish(struct vm_area_struct *vma, + struct mmap_action *action, int err) { /* * If an error occurs, unmap the VMA altogether and return an error. We @@ -1353,37 +1340,38 @@ static int mmap_action_finish(struct mmap_action *action, /** * mmap_action_prepare - Perform preparatory setup for an VMA descriptor * action which need to be performed. - * @desc: The VMA descriptor to prepare for @action. - * @action: The action to perform. + * @desc: The VMA descriptor to prepare for its @desc->action. + * + * Returns: %0 on success, otherwise error. */ -void mmap_action_prepare(struct mmap_action *action, - struct vm_area_desc *desc) +int mmap_action_prepare(struct vm_area_desc *desc) { - switch (action->type) { + switch (desc->action.type) { case MMAP_NOTHING: - break; + return 0; case MMAP_REMAP_PFN: - remap_pfn_range_prepare(desc, action->remap.start_pfn); - break; + return remap_pfn_range_prepare(desc); case MMAP_IO_REMAP_PFN: - io_remap_pfn_range_prepare(desc, action->remap.start_pfn, - action->remap.size); - break; + return io_remap_pfn_range_prepare(desc); } + + WARN_ON_ONCE(1); + return -EINVAL; } EXPORT_SYMBOL(mmap_action_prepare); /** * mmap_action_complete - Execute VMA descriptor action. - * @action: The action to perform. * @vma: The VMA to perform the action upon. + * @action: The action to perform. * * Similar to mmap_action_prepare(). * * Return: 0 on success, or error, at which point the VMA will be unmapped. 
*/ -int mmap_action_complete(struct mmap_action *action, - struct vm_area_struct *vma) +int mmap_action_complete(struct vm_area_struct *vma, + struct mmap_action *action) + { int err = 0; @@ -1391,25 +1379,22 @@ int mmap_action_complete(struct mmap_action *action, case MMAP_NOTHING: break; case MMAP_REMAP_PFN: - err = remap_pfn_range_complete(vma, action->remap.start, - action->remap.start_pfn, action->remap.size, - action->remap.pgprot); + err = remap_pfn_range_complete(vma, action); break; case MMAP_IO_REMAP_PFN: - err = io_remap_pfn_range_complete(vma, action->remap.start, - action->remap.start_pfn, action->remap.size, - action->remap.pgprot); + /* Should have been delegated. */ + WARN_ON_ONCE(1); + err = -EINVAL; break; } - return mmap_action_finish(action, vma, err); + return mmap_action_finish(vma, action, err); } EXPORT_SYMBOL(mmap_action_complete); #else -void mmap_action_prepare(struct mmap_action *action, - struct vm_area_desc *desc) +int mmap_action_prepare(struct vm_area_desc *desc) { - switch (action->type) { + switch (desc->action.type) { case MMAP_NOTHING: break; case MMAP_REMAP_PFN: @@ -1417,11 +1402,13 @@ void mmap_action_prepare(struct mmap_action *action, WARN_ON_ONCE(1); /* nommu cannot handle these. */ break; } + + return 0; } EXPORT_SYMBOL(mmap_action_prepare); -int mmap_action_complete(struct mmap_action *action, - struct vm_area_struct *vma) +int mmap_action_complete(struct vm_area_struct *vma, + struct mmap_action *action) { int err = 0; @@ -1436,7 +1423,7 @@ int mmap_action_complete(struct mmap_action *action, break; } - return mmap_action_finish(action, vma, err); + return mmap_action_finish(vma, action, err); } EXPORT_SYMBOL(mmap_action_complete); #endif diff --git a/mm/vma.c b/mm/vma.c index a4b30a069153..1e2996a12d7f 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2640,15 +2640,18 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) vma_set_page_prot(vma); } -static void call_action_prepare(struct mmap_state *map, - struct vm_area_desc *desc) +static int call_action_prepare(struct mmap_state *map, + struct vm_area_desc *desc) { - struct mmap_action *action = &desc->action; + int err; - mmap_action_prepare(action, desc); + err = mmap_action_prepare(desc); + if (err) + return err; - if (action->hide_from_rmap_until_complete) + if (desc->action.hide_from_rmap_until_complete) map->hold_file_rmap_lock = true; + return 0; } /* @@ -2672,7 +2675,9 @@ static int call_mmap_prepare(struct mmap_state *map, if (err) return err; - call_action_prepare(map, desc); + err = call_action_prepare(map, desc); + if (err) + return err; /* Update fields permitted to be changed. */ map->pgoff = desc->pgoff; @@ -2727,13 +2732,12 @@ static bool can_set_ksm_flags_early(struct mmap_state *map) } static int call_action_complete(struct mmap_state *map, - struct vm_area_desc *desc, + struct mmap_action *action, struct vm_area_struct *vma) { - struct mmap_action *action = &desc->action; int ret; - ret = mmap_action_complete(action, vma); + ret = mmap_action_complete(vma, action); /* If we held the file rmap we need to release it. 
*/ if (map->hold_file_rmap_lock) { @@ -2795,7 +2799,7 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, __mmap_complete(&map, vma); if (have_mmap_prepare && allocated_new) { - error = call_action_complete(&map, &desc, vma); + error = call_action_complete(&map, &desc.action, vma); if (error) return error; diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index ab92358b082c..e7581efaf470 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -1277,9 +1277,12 @@ static inline int __compat_vma_mmap(const struct file_operations *f_op, if (err) return err; - mmap_action_prepare(&desc.action, &desc); + err = mmap_action_prepare(&desc); + if (err) + return err; + set_vma_from_desc(vma, &desc); - return mmap_action_complete(&desc.action, vma); + return mmap_action_complete(vma, &desc.action); } static inline int compat_vma_mmap(struct file *file, diff --git a/tools/testing/vma/include/stubs.h b/tools/testing/vma/include/stubs.h index 5afb0afe2d48..a30b8bc84955 100644 --- a/tools/testing/vma/include/stubs.h +++ b/tools/testing/vma/include/stubs.h @@ -81,13 +81,13 @@ static inline void free_anon_vma_name(struct vm_area_struct *vma) { } -static inline void mmap_action_prepare(struct mmap_action *action, - struct vm_area_desc *desc) +static inline int mmap_action_prepare(struct vm_area_desc *desc) { + return 0; } -static inline int mmap_action_complete(struct mmap_action *action, - struct vm_area_struct *vma) +static inline int mmap_action_complete(struct vm_area_struct *vma, + struct mmap_action *action) { return 0; } -- cgit v1.2.3 From 827e97cf4bf59e9a72bcec37944bcebb3139a457 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:29 +0000 Subject: mm: document vm_operations_struct->open the same as close() Describe when the operation is invoked and the context in which it is invoked, matching the description already added for vm_ops->close(). While we're here, update all outdated uses of the 'area' parameter name for VMAs to the more consistent 'vma'. Link: https://lkml.kernel.org/r/7d0ca833c12014320f0fa00f816f95e6e10076f2.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- include/linux/mm.h | 15 ++++++++++----- tools/testing/vma/include/dup.h | 15 ++++++++++----- 2 files changed, 20 insertions(+), 10 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6ca2fc5ae83f..21a2eef5f8fe 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -764,15 +764,20 @@ struct vm_fault { * to the functions called when a no-page or a wp-page exception occurs. */ struct vm_operations_struct { - void (*open)(struct vm_area_struct * area); + /** + * @open: Called when a VMA is remapped, split or forked. Not called + * upon first mapping a VMA. + * Context: User context. May sleep. Caller holds mmap_lock.
+ */ + void (*open)(struct vm_area_struct *vma); /** * @close: Called when the VMA is being removed from the MM. * Context: User context. May sleep. Caller holds mmap_lock. */ - void (*close)(struct vm_area_struct * area); + void (*close)(struct vm_area_struct *vma); /* Called any time before splitting to check if it's allowed */ - int (*may_split)(struct vm_area_struct *area, unsigned long addr); - int (*mremap)(struct vm_area_struct *area); + int (*may_split)(struct vm_area_struct *vma, unsigned long addr); + int (*mremap)(struct vm_area_struct *vma); /* * Called by mprotect() to make driver-specific permission * checks before mprotect() is finalised. The VMA must not @@ -784,7 +789,7 @@ struct vm_operations_struct { vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order); vm_fault_t (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); - unsigned long (*pagesize)(struct vm_area_struct * area); + unsigned long (*pagesize)(struct vm_area_struct *vma); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index e7581efaf470..5bc04c801504 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -632,15 +632,20 @@ struct vm_area_struct { } __randomize_layout; struct vm_operations_struct { - void (*open)(struct vm_area_struct * area); + /** + * @open: Called when a VMA is remapped, split or forked. Not called + * upon first mapping a VMA. + * Context: User context. May sleep. Caller holds mmap_lock. + */ + void (*open)(struct vm_area_struct *vma); /** * @close: Called when the VMA is being removed from the MM. * Context: User context. May sleep. Caller holds mmap_lock. */ - void (*close)(struct vm_area_struct * area); + void (*close)(struct vm_area_struct *vma); /* Called any time before splitting to check if it's allowed */ - int (*may_split)(struct vm_area_struct *area, unsigned long addr); - int (*mremap)(struct vm_area_struct *area); + int (*may_split)(struct vm_area_struct *vma, unsigned long addr); + int (*mremap)(struct vm_area_struct *vma); /* * Called by mprotect() to make driver-specific permission * checks before mprotect() is finalised. The VMA must not @@ -652,7 +657,7 @@ struct vm_operations_struct { vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order); vm_fault_t (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); - unsigned long (*pagesize)(struct vm_area_struct * area); + unsigned long (*pagesize)(struct vm_area_struct *vma); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ -- cgit v1.2.3 From c50ca15dd4962bdf834945c2fa29b904042f366a Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:34 +0000 Subject: mm: add vm_ops->mapped hook Previously, when a driver needed to do something like establish a reference count, it could do so in the mmap hook in the knowledge that the mapping would succeed. With the introduction of f_op->mmap_prepare this is no longer the case, as it is invoked prior to actually establishing the mapping. mmap_prepare is not appropriate for this kind of thing as it is called before any merge might take place, and an error might still occur after it has run, meaning resources could be leaked.
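As a minimal sketch of the hazard, reusing the hypothetical "mydrv" driver from the earlier sketch (the vm_area_desc vm_ops field and the SZ_1M bound are likewise illustrative assumptions):

/*
 * With the legacy f_op->mmap hook the mapping already exists when the
 * hook runs, so per-mapping accounting here was safe:
 */
static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct mydrv_state *state = file->private_data;

	atomic_inc(&state->mappings);	/* mapping is established */
	vma->vm_private_data = state;
	return 0;
}

/*
 * With f_op->mmap_prepare the hook runs before any merge is attempted
 * and before the mapping is guaranteed to succeed, so the same
 * accounting could leak if a later step fails - only validation and
 * descriptor setup belong here, with accounting deferred to the
 * vm_ops->mapped hook this patch introduces:
 */
static int mydrv_mmap_prepare(struct vm_area_desc *desc)
{
	if (vma_desc_size(desc) > SZ_1M)	/* hypothetical size limit */
		return -EINVAL;

	desc->vm_ops = &mydrv_vm_ops;	/* ->mapped performs the accounting */
	return 0;
}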
To take this into account, introduce a new vm_ops->mapped callback which is invoked when the VMA is first mapped (though notably - not when it is merged - which is correct and mirrors existing mmap/open/close behaviour). We do better than vm_ops->open() here, as this callback can return an error, at which point the VMA will be unmapped. Note that vm_ops->mapped() is invoked after any mmap action is complete (such as I/O remapping). We intentionally do not expose the VMA at this point, exposing only the fields that could be used, and an output parameter in case the operation needs to update the vma->vm_private_data field. In order to deal with stacked filesystems which invoke an inner filesystem's mmap() hook, add __compat_vma_mmap() and invoke it from vfs_mmap() (via compat_vma_mmap()) to ensure that the mapped callback is handled when an mmap() caller invokes a nested filesystem's mmap_prepare() callback. Update the mmap_prepare documentation to describe the mapped hook and make it clear what its intended use is. The vm_ops->mapped() call is handled by the mmap complete logic to ensure the same code paths are handled by both the compatibility and VMA layers. Additionally, update VMA userland test headers to reflect the change. Link: https://lkml.kernel.org/r/4c5e98297eb0aae9565c564e1c296a112702f144.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- Documentation/filesystems/mmap_prepare.rst | 15 +++++ include/linux/fs.h | 9 ++- include/linux/mm.h | 17 ++++++ mm/util.c | 90 +++++++++++++++++++++--------- mm/vma.c | 1 - tools/testing/vma/include/dup.h | 17 ++++++ 6 files changed, 120 insertions(+), 29 deletions(-) (limited to 'include/linux/mm.h') diff --git a/Documentation/filesystems/mmap_prepare.rst b/Documentation/filesystems/mmap_prepare.rst index ae484d371861..f14b35ee11d5 100644 --- a/Documentation/filesystems/mmap_prepare.rst +++ b/Documentation/filesystems/mmap_prepare.rst @@ -25,6 +25,21 @@ That is - no resources should be allocated nor state updated to reflect that a mapping has been established, as the mapping may either be merged, or fail to be mapped after the callback is complete. +Mapped callback +--------------- + +If resources need to be allocated per-mapping, or state such as a reference +count needs to be manipulated, this should be done using the ``vm_ops->mapped`` +hook, which itself should be set by the ->mmap_prepare hook. + +This callback is only invoked if a new mapping has been established and was not +merged with any other, and is invoked at a point where no further error may occur +before the mapping is complete. + +The callback itself may return an error, which will cause the mapping to +be unmapped and an error returned to the mmap() caller. This is useful if +resources need to be allocated, and that allocation might fail.
+ How To Use ========== diff --git a/include/linux/fs.h b/include/linux/fs.h index a2628a12bd2b..c390f5c667e3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2059,13 +2059,20 @@ static inline bool can_mmap_file(struct file *file) } int compat_vma_mmap(struct file *file, struct vm_area_struct *vma); +int __vma_check_mmap_hook(struct vm_area_struct *vma); static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) { + int err; + if (file->f_op->mmap_prepare) return compat_vma_mmap(file, vma); - return file->f_op->mmap(file, vma); + err = file->f_op->mmap(file, vma); + if (err) + return err; + + return __vma_check_mmap_hook(vma); } static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc) diff --git a/include/linux/mm.h b/include/linux/mm.h index 21a2eef5f8fe..81fbcfed44dd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -775,6 +775,23 @@ struct vm_operations_struct { * Context: User context. May sleep. Caller holds mmap_lock. */ void (*close)(struct vm_area_struct *vma); + /** + * @mapped: Called when the VMA is first mapped in the MM. Not called if + * the new VMA is merged with an adjacent VMA. + * + * The @vm_private_data field is an output field allowing the user to + * modify vma->vm_private_data as necessary. + * + * ONLY valid if set from f_op->mmap_prepare. Will result in an error if + * set from f_op->mmap. + * + * Returns %0 on success, or an error otherwise. On error, the VMA will + * be unmapped. + * + * Context: User context. May sleep. Caller holds mmap_lock. + */ + int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff, + const struct file *file, void **vm_private_data); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *vma, unsigned long addr); int (*mremap)(struct vm_area_struct *vma); diff --git a/mm/util.c b/mm/util.c index e272efca8c0e..98fe67e59ec3 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1163,33 +1163,7 @@ void flush_dcache_folio(struct folio *folio) EXPORT_SYMBOL(flush_dcache_folio); #endif -/** - * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an - * existing VMA and execute any requested actions. - * @file: The file which possesss an f_op->mmap_prepare() hook. - * @vma: The VMA to apply the .mmap_prepare() hook to. - * - * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain - * stacked filesystems invoke a nested mmap hook of an underlying file. - * - * Until all filesystems are converted to use .mmap_prepare(), we must be - * conservative and continue to invoke these stacked filesystems using the - * deprecated .mmap() hook. - * - * However we have a problem if the underlying file system possesses an - * .mmap_prepare() hook, as we are in a different context when we invoke the - * .mmap() hook, already having a VMA to deal with. - * - * compat_vma_mmap() is a compatibility function that takes VMA state, - * establishes a struct vm_area_desc descriptor, passes to the underlying - * .mmap_prepare() hook and applies any changes performed by it. - * - * Once the conversion of filesystems is complete this function will no longer - * be required and will be removed. - * - * Returns: 0 on success or error. 
- */ -int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) +static int __compat_vma_mmap(struct file *file, struct vm_area_struct *vma) { struct vm_area_desc desc = { .mm = vma->vm_mm, @@ -1221,8 +1195,49 @@ int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) set_vma_from_desc(vma, &desc); return mmap_action_complete(vma, action); } + +/** + * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an + * existing VMA and execute any requested actions. + * @file: The file which possesses an f_op->mmap_prepare() hook. + * @vma: The VMA to apply the .mmap_prepare() hook to. + * + * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain + * stacked filesystems invoke a nested mmap hook of an underlying file. + * + * Until all filesystems are converted to use .mmap_prepare(), we must be + * conservative and continue to invoke these stacked filesystems using the + * deprecated .mmap() hook. + * + * However, we have a problem if the underlying file system possesses an + * .mmap_prepare() hook, as we are in a different context when we invoke the + * .mmap() hook, already having a VMA to deal with. + * + * compat_vma_mmap() is a compatibility function that takes VMA state, + * establishes a struct vm_area_desc descriptor, passes to the underlying + * .mmap_prepare() hook and applies any changes performed by it. + * + * Once the conversion of filesystems is complete this function will no longer + * be required and will be removed. + * + * Returns: 0 on success or error. + */ +int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) +{ + return __compat_vma_mmap(file, vma); +} EXPORT_SYMBOL(compat_vma_mmap); +int __vma_check_mmap_hook(struct vm_area_struct *vma) +{ + /* vm_ops->mapped is not valid if mmap() is specified. */ + if (vma->vm_ops && WARN_ON_ONCE(vma->vm_ops->mapped)) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL(__vma_check_mmap_hook); + static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio, const struct page *page) { @@ -1311,11 +1326,32 @@ again: } } +static int call_vma_mapped(struct vm_area_struct *vma) +{ + const struct vm_operations_struct *vm_ops = vma->vm_ops; + void *vm_private_data = vma->vm_private_data; + int err; + + if (!vm_ops || !vm_ops->mapped) + return 0; + + err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff, + vma->vm_file, &vm_private_data); + if (err) + return err; + + if (vm_private_data != vma->vm_private_data) + vma->vm_private_data = vm_private_data; + return 0; +} + static int mmap_action_finish(struct vm_area_struct *vma, struct mmap_action *action, int err) { size_t len; + if (!err) + err = call_vma_mapped(vma); if (!err && action->success_hook) err = action->success_hook(vma); diff --git a/mm/vma.c b/mm/vma.c index e1950ae048e2..a43f3c5d4b3d 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2781,7 +2781,6 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, if (have_mmap_prepare && allocated_new) { error = mmap_action_complete(vma, &desc.action); - if (error) return error; } diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index a95a4b07f68b..1fb7bcae4f31 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -643,6 +643,23 @@ struct vm_operations_struct { * Context: User context. May sleep. Caller holds mmap_lock. */ void (*close)(struct vm_area_struct *vma); + /** + * @mapped: Called when the VMA is first mapped in the MM.
Not called if + the new VMA is merged with an adjacent VMA. + * + * The @vm_private_data field is an output field allowing the user to + * modify vma->vm_private_data as necessary. + * + * ONLY valid if set from f_op->mmap_prepare. Will result in an error if + * set from f_op->mmap. + * + * Returns %0 on success, or an error otherwise. On error, the VMA will + * be unmapped. + * + * Context: User context. May sleep. Caller holds mmap_lock. + */ + int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff, + const struct file *file, void **vm_private_data); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *vma, unsigned long addr); int (*mremap)(struct vm_area_struct *vma); -- cgit v1.2.3 From a1b7fb40cb71a33c68a609fcee0946425d698415 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:37 +0000 Subject: mm: add mmap_action_simple_ioremap() Currently drivers use vm_iomap_memory() as a simple helper function for I/O remapping memory over a range starting at a specified physical address and spanning a specified length. In order to utilise this from mmap_prepare, separate out the core logic into __simple_ioremap_prep(), update vm_iomap_memory() to use it, and add simple_ioremap_prepare() to do the same with a VMA descriptor object. We also add MMAP_SIMPLE_IO_REMAP and relevant fields to the struct mmap_action type to permit this operation. We use mmap_action_ioremap() to set up the actual I/O remap operation once we have checked and computed the parameters, which makes simple_ioremap_prepare() easy to implement. We then add mmap_action_simple_ioremap() to allow drivers to make use of this mode. We update the mmap_prepare documentation to describe this mode. Finally, we update the VMA tests to reflect this change. Link: https://lkml.kernel.org/r/a08ef1c4542202684da63bb37f459d5dbbeddd91.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Suren Baghdasaryan Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- Documentation/filesystems/mmap_prepare.rst | 3 ++ include/linux/mm.h | 24 ++++++++- include/linux/mm_types.h | 6 ++- mm/internal.h | 1 + mm/memory.c | 85 +++++++++++++++++++++--------- mm/util.c | 5 ++ tools/testing/vma/include/dup.h | 6 ++- 7 files changed, 102 insertions(+), 28 deletions(-) (limited to 'include/linux/mm.h') diff --git a/Documentation/filesystems/mmap_prepare.rst b/Documentation/filesystems/mmap_prepare.rst index f14b35ee11d5..14bb057be564 100644 --- a/Documentation/filesystems/mmap_prepare.rst +++ b/Documentation/filesystems/mmap_prepare.rst @@ -153,5 +153,8 @@ pointer. These are: * mmap_action_ioremap_full() - Same as mmap_action_ioremap(), only remaps the entire mapping from ``start_pfn`` onward. +* mmap_action_simple_ioremap() - Sets up an I/O remap from a specified + physical address and over a specified length.
+ **NOTE:** The ``action`` field should never normally be manipulated directly, rather you ought to use one of these helpers. diff --git a/include/linux/mm.h b/include/linux/mm.h index 81fbcfed44dd..53b21de40f87 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4321,11 +4321,33 @@ static inline void mmap_action_ioremap(struct vm_area_desc *desc, * @start_pfn: The first PFN in the range to remap. */ static inline void mmap_action_ioremap_full(struct vm_area_desc *desc, - unsigned long start_pfn) + unsigned long start_pfn) { mmap_action_ioremap(desc, desc->start, start_pfn, vma_desc_size(desc)); } +/** + * mmap_action_simple_ioremap - helper for mmap_prepare hook to specify that the + * physical range in [start_phys_addr, start_phys_addr + size) should be I/O + * remapped. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start_phys_addr: Start of the physical memory to be mapped. + * @size: Size of the area to map. + * + * NOTE: Some drivers might want to tweak desc->page_prot for purposes of + * write-combine or similar. + */ +static inline void mmap_action_simple_ioremap(struct vm_area_desc *desc, + phys_addr_t start_phys_addr, + unsigned long size) +{ + struct mmap_action *action = &desc->action; + + action->simple_ioremap.start_phys_addr = start_phys_addr; + action->simple_ioremap.size = size; + action->type = MMAP_SIMPLE_IO_REMAP; +} + int mmap_action_prepare(struct vm_area_desc *desc); int mmap_action_complete(struct vm_area_struct *vma, struct mmap_action *action); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 38fe6b915024..91a3db174d78 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -814,6 +814,7 @@ enum mmap_action_type { MMAP_NOTHING, /* Mapping is complete, no further action. */ MMAP_REMAP_PFN, /* Remap PFN range. */ MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ + MMAP_SIMPLE_IO_REMAP, /* I/O remap with guardrails. */ }; /* @@ -822,13 +823,16 @@ enum mmap_action_type { */ struct mmap_action { union { - /* Remap range. */ struct { unsigned long start; unsigned long start_pfn; unsigned long size; pgprot_t pgprot; } remap; + struct { + phys_addr_t start_phys_addr; + unsigned long size; + } simple_ioremap; }; enum mmap_action_type type; diff --git a/mm/internal.h b/mm/internal.h index 241510e21f4b..c693646e5b3f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1842,6 +1842,7 @@ int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm); int remap_pfn_range_prepare(struct vm_area_desc *desc); int remap_pfn_range_complete(struct vm_area_struct *vma, struct mmap_action *action); +int simple_ioremap_prepare(struct vm_area_desc *desc); static inline int io_remap_pfn_range_prepare(struct vm_area_desc *desc) { diff --git a/mm/memory.c b/mm/memory.c index 10a61dd81f97..c1c323512939 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3170,6 +3170,58 @@ int remap_pfn_range_complete(struct vm_area_struct *vma, return do_remap_pfn_range(vma, start, pfn, size, prot); } +static int __simple_ioremap_prep(unsigned long vm_len, pgoff_t vm_pgoff, + phys_addr_t start_phys, unsigned long size, + unsigned long *pfnp) +{ + unsigned long pfn, pages; + + /* Check that the physical memory area passed in looks valid */ + if (start_phys + size < start_phys) + return -EINVAL; + /* + * You *really* shouldn't map things that aren't page-aligned, + * but we've historically allowed it because IO memory might + * just have smaller alignment. 
+ */ + size += start_phys & ~PAGE_MASK; + pfn = start_phys >> PAGE_SHIFT; + pages = (size + ~PAGE_MASK) >> PAGE_SHIFT; + if (pfn + pages < pfn) + return -EINVAL; + + /* We start the mapping 'vm_pgoff' pages into the area */ + if (vm_pgoff > pages) + return -EINVAL; + pfn += vm_pgoff; + pages -= vm_pgoff; + + /* Can we fit all of the mapping? */ + if ((vm_len >> PAGE_SHIFT) > pages) + return -EINVAL; + + *pfnp = pfn; + return 0; +} + +int simple_ioremap_prepare(struct vm_area_desc *desc) +{ + struct mmap_action *action = &desc->action; + const phys_addr_t start = action->simple_ioremap.start_phys_addr; + const unsigned long size = action->simple_ioremap.size; + unsigned long pfn; + int err; + + err = __simple_ioremap_prep(vma_desc_size(desc), desc->pgoff, + start, size, &pfn); + if (err) + return err; + + /* The I/O remap logic does the heavy lifting. */ + mmap_action_ioremap_full(desc, pfn); + return io_remap_pfn_range_prepare(desc); +} + /** * vm_iomap_memory - remap memory to userspace * @vma: user vma to map to @@ -3187,32 +3239,15 @@ int remap_pfn_range_complete(struct vm_area_struct *vma, */ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) { - unsigned long vm_len, pfn, pages; - - /* Check that the physical memory area passed in looks valid */ - if (start + len < start) - return -EINVAL; - /* - * You *really* shouldn't map things that aren't page-aligned, - * but we've historically allowed it because IO memory might - * just have smaller alignment. - */ - len += start & ~PAGE_MASK; - pfn = start >> PAGE_SHIFT; - pages = (len + ~PAGE_MASK) >> PAGE_SHIFT; - if (pfn + pages < pfn) - return -EINVAL; - - /* We start the mapping 'vm_pgoff' pages into the area */ - if (vma->vm_pgoff > pages) - return -EINVAL; - pfn += vma->vm_pgoff; - pages -= vma->vm_pgoff; + const unsigned long vm_start = vma->vm_start; + const unsigned long vm_end = vma->vm_end; + const unsigned long vm_len = vm_end - vm_start; + unsigned long pfn; + int err; - /* Can we fit all of the mapping? */ - vm_len = vma->vm_end - vma->vm_start; - if (vm_len >> PAGE_SHIFT > pages) - return -EINVAL; + err = __simple_ioremap_prep(vm_len, vma->vm_pgoff, start, len, &pfn); + if (err) + return err; /* Ok, let it rip */ return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); diff --git a/mm/util.c b/mm/util.c index 98fe67e59ec3..9a27d33273fd 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1393,6 +1393,8 @@ int mmap_action_prepare(struct vm_area_desc *desc) return remap_pfn_range_prepare(desc); case MMAP_IO_REMAP_PFN: return io_remap_pfn_range_prepare(desc); + case MMAP_SIMPLE_IO_REMAP: + return simple_ioremap_prepare(desc); } WARN_ON_ONCE(1); @@ -1421,6 +1423,7 @@ int mmap_action_complete(struct vm_area_struct *vma, err = remap_pfn_range_complete(vma, action); break; case MMAP_IO_REMAP_PFN: + case MMAP_SIMPLE_IO_REMAP: /* Should have been delegated. */ WARN_ON_ONCE(1); err = -EINVAL; @@ -1438,6 +1441,7 @@ int mmap_action_prepare(struct vm_area_desc *desc) break; case MMAP_REMAP_PFN: case MMAP_IO_REMAP_PFN: + case MMAP_SIMPLE_IO_REMAP: WARN_ON_ONCE(1); /* nommu cannot handle these. */ break; } @@ -1456,6 +1460,7 @@ int mmap_action_complete(struct vm_area_struct *vma, break; case MMAP_REMAP_PFN: case MMAP_IO_REMAP_PFN: + case MMAP_SIMPLE_IO_REMAP: WARN_ON_ONCE(1); /* nommu cannot handle this. 
*/ err = -EINVAL; diff --git a/tools/testing/vma/include/dup.h index 1fb7bcae4f31..b31207bbe10d 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -453,6 +453,7 @@ enum mmap_action_type { MMAP_NOTHING, /* Mapping is complete, no further action. */ MMAP_REMAP_PFN, /* Remap PFN range. */ MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ + MMAP_SIMPLE_IO_REMAP, /* I/O remap with guardrails. */ }; /* @@ -461,13 +462,16 @@ struct mmap_action { union { - /* Remap range. */ struct { unsigned long start; unsigned long start_pfn; unsigned long size; pgprot_t pgprot; } remap; + struct { + phys_addr_t start_phys_addr; + unsigned long size; + } simple_ioremap; }; enum mmap_action_type type; -- cgit v1.2.3 From 62c65fd740e979a3967db08971b93aefcec510d4 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:46 +0000 Subject: mm: add mmap_action_map_kernel_pages[_full]() A user can invoke mmap_action_map_kernel_pages() to specify that the mapping should map a given number of kernel pages, supplied in an array, starting from desc->start. In order to implement this, adjust mmap_action_prepare() to be able to return an error code: it makes sense to assert that the specified parameters are valid as early as possible, and the VMA flags must be updated to include VMA_MIXEDMAP_BIT as necessary. This provides an mmap_prepare equivalent of vm_insert_pages(). We additionally update the existing vm_insert_pages() code to use range_in_vma() and add a new range_in_vma_desc() helper function for the mmap_prepare case, sharing the code between the two in range_is_subset(). We add both mmap_action_map_kernel_pages() and mmap_action_map_kernel_pages_full() to allow for both partial and full VMA mappings. We update the documentation to reflect the new features. Finally, we update the VMA tests accordingly to reflect the changes. Link: https://lkml.kernel.org/r/926ac961690d856e67ec847bee2370ab3c6b9046.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Suren Baghdasaryan Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- Documentation/filesystems/mmap_prepare.rst | 8 +++ include/linux/mm.h | 95 +++++++++++++++++++++++++++++- include/linux/mm_types.h | 7 +++ mm/memory.c | 42 +++++++++++-- mm/util.c | 7 +++ tools/testing/vma/include/dup.h | 7 +++ 6 files changed, 160 insertions(+), 6 deletions(-) (limited to 'include/linux/mm.h') diff --git a/Documentation/filesystems/mmap_prepare.rst b/Documentation/filesystems/mmap_prepare.rst index 14bb057be564..82c99c95ad85 100644 --- a/Documentation/filesystems/mmap_prepare.rst +++ b/Documentation/filesystems/mmap_prepare.rst @@ -156,5 +156,13 @@ pointer. These are: * mmap_action_simple_ioremap() - Sets up an I/O remap from a specified physical address and over a specified length.
+* mmap_action_map_kernel_pages() - Maps a specified array of ``struct page`` + pointers in the VMA from a specific offset. + +* mmap_action_map_kernel_pages_full() - Maps a specified array of ``struct + page`` pointers over the entire VMA. The caller must ensure there are + sufficient entries in the page array to cover the entire range of the + described VMA. + **NOTE:** The ``action`` field should never normally be manipulated directly, rather you ought to use one of these helpers. diff --git a/include/linux/mm.h b/include/linux/mm.h index 53b21de40f87..61dff7f03554 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2905,7 +2905,7 @@ static inline bool folio_maybe_mapped_shared(struct folio *folio) * The caller must add any reference (e.g., from folio_try_get()) it might be * holding itself to the result. * - * Returns the expected folio refcount. + * Returns: the expected folio refcount. */ static inline int folio_expected_ref_count(const struct folio *folio) { @@ -4348,6 +4348,45 @@ static inline void mmap_action_simple_ioremap(struct vm_area_desc *desc, action->type = MMAP_SIMPLE_IO_REMAP; } +/** + * mmap_action_map_kernel_pages - helper for mmap_prepare hook to specify that + * @nr_pages kernel pages contained in the @pages array should be mapped to userland + * starting at virtual address @start. + * @desc: The VMA descriptor for the VMA requiring kernel pages to be mapped. + * @start: The virtual address from which to map them. + * @pages: An array of struct page pointers describing the memory to map. + * @nr_pages: The number of entries in the @pages array. + */ +static inline void mmap_action_map_kernel_pages(struct vm_area_desc *desc, + unsigned long start, struct page **pages, + unsigned long nr_pages) +{ + struct mmap_action *action = &desc->action; + + action->type = MMAP_MAP_KERNEL_PAGES; + action->map_kernel.start = start; + action->map_kernel.pages = pages; + action->map_kernel.nr_pages = nr_pages; + action->map_kernel.pgoff = desc->pgoff; +} + +/** + * mmap_action_map_kernel_pages_full - helper for mmap_prepare hook to specify that + * kernel pages contained in the @pages array should be mapped to userland + * from @desc->start to @desc->end. + * @desc: The VMA descriptor for the VMA requiring kernel pages to be mapped. + * @pages: An array of struct page pointers describing the memory to map. + * + * The caller must ensure that @pages contains sufficient entries to cover the + * entire range described by @desc. + */ +static inline void mmap_action_map_kernel_pages_full(struct vm_area_desc *desc, + struct page **pages) +{ + mmap_action_map_kernel_pages(desc, desc->start, pages, + vma_desc_pages(desc)); +} + int mmap_action_prepare(struct vm_area_desc *desc); int mmap_action_complete(struct vm_area_struct *vma, struct mmap_action *action); @@ -4364,10 +4403,59 @@ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, return vma; } +/** + * range_is_subset - Is the specified inner range a subset of the outer range? + * @outer_start: The start of the outer range. + * @outer_end: The exclusive end of the outer range. + * @inner_start: The start of the inner range. + * @inner_end: The exclusive end of the inner range. + * + * Returns: %true if [inner_start, inner_end) is a subset of [outer_start, + * outer_end), otherwise %false.
+ */ +static inline bool range_is_subset(unsigned long outer_start, + unsigned long outer_end, + unsigned long inner_start, + unsigned long inner_end) +{ + return outer_start <= inner_start && inner_end <= outer_end; +} + +/** + * range_in_vma - is the specified [@start, @end) range a subset of the VMA? + * @vma: The VMA against which we want to check [@start, @end). + * @start: The start of the range we wish to check. + * @end: The exclusive end of the range we wish to check. + * + * Returns: %true if [@start, @end) is a subset of [@vma->vm_start, + * @vma->vm_end), %false otherwise. + */ static inline bool range_in_vma(const struct vm_area_struct *vma, unsigned long start, unsigned long end) { - return (vma && vma->vm_start <= start && end <= vma->vm_end); + if (!vma) + return false; + + return range_is_subset(vma->vm_start, vma->vm_end, start, end); +} + +/** + * range_in_vma_desc - is the specified [@start, @end) range a subset of the VMA + * described by @desc, a VMA descriptor? + * @desc: The VMA descriptor against which we want to check [@start, @end). + * @start: The start of the range we wish to check. + * @end: The exclusive end of the range we wish to check. + * + * Returns: %true if [@start, @end) is a subset of [@desc->start, @desc->end), + * %false otherwise. + */ +static inline bool range_in_vma_desc(const struct vm_area_desc *desc, + unsigned long start, unsigned long end) +{ + if (!desc) + return false; + + return range_is_subset(desc->start, desc->end, start, end); } #ifdef CONFIG_MMU @@ -4411,6 +4499,9 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num); +int map_kernel_pages_prepare(struct vm_area_desc *desc); +int map_kernel_pages_complete(struct vm_area_struct *vma, + struct mmap_action *action); int vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num); int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index b702c63bf0e0..a308e2c23b82 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -815,6 +815,7 @@ enum mmap_action_type { MMAP_REMAP_PFN, /* Remap PFN range. */ MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ MMAP_SIMPLE_IO_REMAP, /* I/O remap with guardrails. */ + MMAP_MAP_KERNEL_PAGES, /* Map kernel page range from array. 
*/ }; /* @@ -833,6 +834,12 @@ struct mmap_action { phys_addr_t start_phys_addr; unsigned long size; } simple_ioremap; + struct { + unsigned long start; + struct page **pages; + unsigned long nr_pages; + pgoff_t pgoff; + } map_kernel; }; enum mmap_action_type type; diff --git a/mm/memory.c b/mm/memory.c index c1c323512939..5d032b5293c6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2484,13 +2484,14 @@ out: int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num) { - const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1; + const unsigned long nr_pages = *num; + const unsigned long end = addr + PAGE_SIZE * nr_pages; - if (addr < vma->vm_start || end_addr >= vma->vm_end) + if (!range_in_vma(vma, addr, end)) return -EFAULT; if (!(vma->vm_flags & VM_MIXEDMAP)) { - BUG_ON(mmap_read_trylock(vma->vm_mm)); - BUG_ON(vma->vm_flags & VM_PFNMAP); + VM_WARN_ON_ONCE(mmap_read_trylock(vma->vm_mm)); + VM_WARN_ON_ONCE(vma->vm_flags & VM_PFNMAP); vm_flags_set(vma, VM_MIXEDMAP); } /* Defer page refcount checking till we're about to map that page. */ @@ -2498,6 +2499,39 @@ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_pages); +int map_kernel_pages_prepare(struct vm_area_desc *desc) +{ + const struct mmap_action *action = &desc->action; + const unsigned long addr = action->map_kernel.start; + unsigned long nr_pages, end; + + if (!vma_desc_test(desc, VMA_MIXEDMAP_BIT)) { + VM_WARN_ON_ONCE(mmap_read_trylock(desc->mm)); + VM_WARN_ON_ONCE(vma_desc_test(desc, VMA_PFNMAP_BIT)); + vma_desc_set_flags(desc, VMA_MIXEDMAP_BIT); + } + + nr_pages = action->map_kernel.nr_pages; + end = addr + PAGE_SIZE * nr_pages; + if (!range_in_vma_desc(desc, addr, end)) + return -EFAULT; + + return 0; +} +EXPORT_SYMBOL(map_kernel_pages_prepare); + +int map_kernel_pages_complete(struct vm_area_struct *vma, + struct mmap_action *action) +{ + unsigned long nr_pages; + + nr_pages = action->map_kernel.nr_pages; + return insert_pages(vma, action->map_kernel.start, + action->map_kernel.pages, + &nr_pages, vma->vm_page_prot); +} +EXPORT_SYMBOL(map_kernel_pages_complete); + /** * vm_insert_page - insert single page into user vma * @vma: user vma to map to diff --git a/mm/util.c b/mm/util.c index 5ae20876ef2c..f063fd4de1e8 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1448,6 +1448,8 @@ int mmap_action_prepare(struct vm_area_desc *desc) return io_remap_pfn_range_prepare(desc); case MMAP_SIMPLE_IO_REMAP: return simple_ioremap_prepare(desc); + case MMAP_MAP_KERNEL_PAGES: + return map_kernel_pages_prepare(desc); } WARN_ON_ONCE(1); @@ -1475,6 +1477,9 @@ int mmap_action_complete(struct vm_area_struct *vma, case MMAP_REMAP_PFN: err = remap_pfn_range_complete(vma, action); break; + case MMAP_MAP_KERNEL_PAGES: + err = map_kernel_pages_complete(vma, action); + break; case MMAP_IO_REMAP_PFN: case MMAP_SIMPLE_IO_REMAP: /* Should have been delegated. */ @@ -1495,6 +1500,7 @@ int mmap_action_prepare(struct vm_area_desc *desc) case MMAP_REMAP_PFN: case MMAP_IO_REMAP_PFN: case MMAP_SIMPLE_IO_REMAP: + case MMAP_MAP_KERNEL_PAGES: WARN_ON_ONCE(1); /* nommu cannot handle these. */ break; } @@ -1514,6 +1520,7 @@ int mmap_action_complete(struct vm_area_struct *vma, case MMAP_REMAP_PFN: case MMAP_IO_REMAP_PFN: case MMAP_SIMPLE_IO_REMAP: + case MMAP_MAP_KERNEL_PAGES: WARN_ON_ONCE(1); /* nommu cannot handle this. 
*/ err = -EINVAL; diff --git a/tools/testing/vma/include/dup.h index ecd47d0f7d17..b4864aad2db0 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -454,6 +454,7 @@ enum mmap_action_type { MMAP_REMAP_PFN, /* Remap PFN range. */ MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ MMAP_SIMPLE_IO_REMAP, /* I/O remap with guardrails. */ + MMAP_MAP_KERNEL_PAGES, /* Map kernel page range from an array. */ }; /* @@ -472,6 +473,12 @@ struct mmap_action { union { struct { unsigned long start; unsigned long start_pfn; unsigned long size; pgprot_t pgprot; } remap; struct { phys_addr_t start_phys_addr; unsigned long size; } simple_ioremap; + struct { + unsigned long start; + struct page **pages; + unsigned long nr_pages; + pgoff_t pgoff; + } map_kernel; }; enum mmap_action_type type; -- cgit v1.2.3 From c0ea52c18c78c33c68c350eb9d3dcdf8c513254d Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 18:07:18 +0000 Subject: mm/huge_memory: simplify vma_is_special_huge() Patch series "mm/huge_memory: refactor zap_huge_pmd()", v3. zap_huge_pmd() is overly complicated; clean it up and also add an assert in the case that we encounter a buggy PMD entry that doesn't match expectations. This is motivated by a bug discovered [0] where the PMD entry was none of: * A non-DAX, PFN or mixed map. * The huge zero folio * A present PMD entry * A softleaf entry In zap_huge_pmd(), but due to the bug we managed to reach this code. It is useful to explicitly call this out rather than have an arbitrary NULL pointer dereference happen, which also improves understanding of what's going on. The series goes further to make use of vm_normal_folio_pmd() rather than implementing custom logic for retrieving the folio, and extends softleaf functionality to provide and use an equivalent softleaf function. This patch (of 13): This function is confused - it overloads the term 'special' yet again, checks for DAX but in many cases the code explicitly excludes DAX before invoking the predicate. It also unnecessarily checks for vma->vm_file - this has to be present for a driver to have set VMA_MIXEDMAP_BIT or VMA_PFNMAP_BIT. In fact, a far simpler form of this is to reverse the DAX predicate and return false if DAX is set. This makes sense from the point of view of 'special' as in vm_normal_page(), as DAX actually does potentially have retrievable folios. Also there's no need to have this in mm.h so move it to huge_memory.c. No functional change intended. Link: https://lkml.kernel.org/r/cover.1774029655.git.ljs@kernel.org Link: https://lkml.kernel.org/r/d2b65883dc4895f197c4b4a69fbf27a063463412.1774029655.git.ljs@kernel.org Link: https://lore.kernel.org/all/6b3d7ad7-49e1-407a-903d-3103704160d8@lucifer.local/ [0] Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Suren Baghdasaryan Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Zi Yan Cc: Qi Zheng Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 4 ++-- include/linux/mm.h | 16 ---------------- mm/huge_memory.c | 30 +++++++++++++++++++++++------- 3 files changed, 25 insertions(+), 25 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index bd7f0e1d8094..61fda1672b29 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -83,7 +83,7 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; * file is never split and the MAX_PAGECACHE_ORDER limit does not apply to * it.
Same to PFNMAPs where there's neither page* nor pagecache. */ -#define THP_ORDERS_ALL_SPECIAL \ +#define THP_ORDERS_ALL_SPECIAL_DAX \ (BIT(PMD_ORDER) | BIT(PUD_ORDER)) #define THP_ORDERS_ALL_FILE_DEFAULT \ ((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0)) @@ -92,7 +92,7 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; * Mask of all large folio orders supported for THP. */ #define THP_ORDERS_ALL \ - (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_SPECIAL | THP_ORDERS_ALL_FILE_DEFAULT) + (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_SPECIAL_DAX | THP_ORDERS_ALL_FILE_DEFAULT) enum tva_type { TVA_SMAPS, /* Exposing "THPeligible:" in smaps. */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 61dff7f03554..8260e28205e9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -5068,22 +5068,6 @@ long copy_folio_from_user(struct folio *dst_folio, const void __user *usr_src, bool allow_pagefault); -/** - * vma_is_special_huge - Are transhuge page-table entries considered special? - * @vma: Pointer to the struct vm_area_struct to consider - * - * Whether transhuge page-table entries are considered "special" following - * the definition in vm_normal_page(). - * - * Return: true if transhuge page-table entries should be considered special, - * false otherwise. - */ -static inline bool vma_is_special_huge(const struct vm_area_struct *vma) -{ - return vma_is_dax(vma) || (vma->vm_file && - (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))); -} - #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #if MAX_NUMNODES > 1 diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1c1a7cf7b209..db390b0098d9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -100,6 +100,14 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); } +/* If this returns true, we are unable to access the VMA's folios. */ +static bool vma_is_special_huge(const struct vm_area_struct *vma) +{ + if (vma_is_dax(vma)) + return false; + return vma_test_any(vma, VMA_PFNMAP_BIT, VMA_MIXEDMAP_BIT); +} + unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, vm_flags_t vm_flags, enum tva_type type, @@ -113,8 +121,8 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, /* Check the intersection of requested and supported orders.
*/ if (vma_is_anonymous(vma)) supported_orders = THP_ORDERS_ALL_ANON; - else if (vma_is_special_huge(vma)) - supported_orders = THP_ORDERS_ALL_SPECIAL; + else if (vma_is_dax(vma) || vma_is_special_huge(vma)) + supported_orders = THP_ORDERS_ALL_SPECIAL_DAX; else supported_orders = THP_ORDERS_ALL_FILE_DEFAULT; @@ -2415,7 +2423,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb->fullmm); arch_check_zapped_pmd(vma, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); - if (!vma_is_dax(vma) && vma_is_special_huge(vma)) { + if (vma_is_special_huge(vma)) { if (arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); spin_unlock(ptl); @@ -2917,7 +2925,7 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm); arch_check_zapped_pud(vma, orig_pud); tlb_remove_pud_tlb_entry(tlb, pud, addr); - if (!vma_is_dax(vma) && vma_is_special_huge(vma)) { + if (vma_is_special_huge(vma)) { spin_unlock(ptl); /* No zero page support yet */ } else { @@ -3068,7 +3076,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, */ if (arch_needs_pgtable_deposit()) zap_deposited_table(mm, pmd); - if (!vma_is_dax(vma) && vma_is_special_huge(vma)) + if (vma_is_special_huge(vma)) return; if (unlikely(pmd_is_migration_entry(old_pmd))) { const softleaf_t old_entry = softleaf_from_pmd(old_pmd); @@ -4629,8 +4637,16 @@ next: static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma) { - return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) || - is_vm_hugetlb_page(vma); + if (vma_is_dax(vma)) + return true; + if (vma_is_special_huge(vma)) + return true; + if (vma_test(vma, VMA_IO_BIT)) + return true; + if (is_vm_hugetlb_page(vma)) + return true; + + return false; } static int split_huge_pages_pid(int pid, unsigned long vaddr_start, -- cgit v1.2.3 From 0f48947c4232c934885711dde0b49066f9d8ee87 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 2 Apr 2026 07:11:48 +0300 Subject: userfaultfd: introduce vm_uffd_ops The current userfaultfd implementation works only with memory managed by the core MM: anonymous, shmem and hugetlb. First, there is no fundamental reason to limit userfaultfd support only to the core memory types: userfaults can be handled similarly to regular page faults provided a VMA owner implements appropriate callbacks. Second, historically various code paths were conditioned on vma_is_anonymous(), vma_is_shmem() and is_vm_hugetlb_page() and some of these conditions can be expressed as operations implemented by a particular memory type. Introduce a vm_uffd_ops extension to vm_operations_struct that will delegate memory type specific operations to a VMA owner. Operations for anonymous memory are handled internally in userfaultfd using anon_uffd_ops, which is implicitly assigned to anonymous VMAs. Start with a single operation, ->can_userfault(), that verifies that a VMA meets requirements for userfaultfd support at registration time. Implement that method for anonymous, shmem and hugetlb and move relevant parts of vma_can_userfault() into the new callbacks.
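For illustration only (this sketch is not part of the patch): with the ops table below, a VMA owner outside the core MM could opt into userfaultfd by implementing ->can_userfault and hanging a vm_uffd_ops off its vm_operations_struct. The mydrv_* names are hypothetical, and the policy shown simply mirrors the anonymous-memory rule of rejecting MINOR mode.

static bool mydrv_can_userfault(struct vm_area_struct *vma,
				vm_flags_t vm_flags)
{
	/* Hypothetical policy: like anonymous memory, reject MINOR mode. */
	if (vm_flags & VM_UFFD_MINOR)
		return false;
	return true;
}

static const struct vm_uffd_ops mydrv_uffd_ops = {
	.can_userfault	= mydrv_can_userfault,
};

static const struct vm_operations_struct mydrv_vm_ops = {
#ifdef CONFIG_USERFAULTFD
	.uffd_ops	= &mydrv_uffd_ops,
#endif
};

Registration then reaches the callback through vma_uffd_ops() in vma_can_userfault(), as the mm/userfaultfd.c hunk below shows.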
[rppt@kernel.org: relocate VM_DROPPABLE test, per Tal] Link: https://lore.kernel.org/adffgfM5ANxtPIEF@kernel.org Link: https://lore.kernel.org/20260402041156.1377214-8-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Andrea Arcangeli Cc: Andrei Vagin Cc: Axel Rasmussen Cc: Baolin Wang Cc: David Hildenbrand (Arm) Cc: Harry Yoo Cc: Harry Yoo (Oracle) Cc: Hugh Dickins Cc: James Houghton Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Nikita Kalyazin Cc: Oscar Salvador Cc: Paolo Bonzini Cc: Peter Xu Cc: Sean Christopherson Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: David Carlier Cc: Tal Zussman Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +++++ include/linux/userfaultfd_k.h | 6 ++++++ mm/hugetlb.c | 15 +++++++++++++++ mm/shmem.c | 15 +++++++++++++++ mm/userfaultfd.c | 38 ++++++++++++++++++++++++++++---------- 5 files changed, 69 insertions(+), 10 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8260e28205e9..633bbf9a184a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -758,6 +758,8 @@ struct vm_fault { */ }; +struct vm_uffd_ops; + /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer @@ -865,6 +867,9 @@ struct vm_operations_struct { struct page *(*find_normal_page)(struct vm_area_struct *vma, unsigned long addr); #endif /* CONFIG_FIND_NORMAL_PAGE */ +#ifdef CONFIG_USERFAULTFD + const struct vm_uffd_ops *uffd_ops; +#endif }; #ifdef CONFIG_NUMA_BALANCING diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index ce0201c3dd82..6d445dbfe8ff 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -83,6 +83,12 @@ struct userfaultfd_ctx { extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); +/* VMA userfaultfd operations */ +struct vm_uffd_ops { + /* Checks if a VMA can support userfaultfd */ + bool (*can_userfault)(struct vm_area_struct *vma, vm_flags_t vm_flags); +}; + /* A combined operation mode + behavior flags. */ typedef unsigned int __bitwise uffd_flags_t; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a786034ac95c..88009cd2a846 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4792,6 +4792,18 @@ static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) return 0; } +#ifdef CONFIG_USERFAULTFD +static bool hugetlb_can_userfault(struct vm_area_struct *vma, + vm_flags_t vm_flags) +{ + return true; +} + +static const struct vm_uffd_ops hugetlb_uffd_ops = { + .can_userfault = hugetlb_can_userfault, +}; +#endif + /* * When a new function is introduced to vm_operations_struct and added * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops. 
@@ -4805,6 +4817,9 @@ const struct vm_operations_struct hugetlb_vm_ops = { .close = hugetlb_vm_op_close, .may_split = hugetlb_vm_op_split, .pagesize = hugetlb_vm_op_pagesize, +#ifdef CONFIG_USERFAULTFD + .uffd_ops = &hugetlb_uffd_ops, +#endif }; static pte_t make_huge_pte(struct vm_area_struct *vma, struct folio *folio, diff --git a/mm/shmem.c b/mm/shmem.c index 6fa1e8340c93..389b2d76396e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3288,6 +3288,15 @@ out_unacct_blocks: shmem_inode_unacct_blocks(inode, 1); return ret; } + +static bool shmem_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags) +{ + return true; +} + +static const struct vm_uffd_ops shmem_uffd_ops = { + .can_userfault = shmem_can_userfault, +}; #endif /* CONFIG_USERFAULTFD */ #ifdef CONFIG_TMPFS @@ -5307,6 +5316,9 @@ static const struct vm_operations_struct shmem_vm_ops = { .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif +#ifdef CONFIG_USERFAULTFD + .uffd_ops = &shmem_uffd_ops, +#endif }; static const struct vm_operations_struct shmem_anon_vm_ops = { @@ -5316,6 +5328,9 @@ static const struct vm_operations_struct shmem_anon_vm_ops = { .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif +#ifdef CONFIG_USERFAULTFD + .uffd_ops = &shmem_uffd_ops, +#endif }; int shmem_init_fs_context(struct fs_context *fc) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index ebdc6e24a2c7..3a824e034a09 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -34,6 +34,25 @@ struct mfill_state { pmd_t *pmd; }; +static bool anon_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags) +{ + /* anonymous memory does not support MINOR mode */ + if (vm_flags & VM_UFFD_MINOR) + return false; + return true; +} + +static const struct vm_uffd_ops anon_uffd_ops = { + .can_userfault = anon_can_userfault, +}; + +static const struct vm_uffd_ops *vma_uffd_ops(struct vm_area_struct *vma) +{ + if (vma_is_anonymous(vma)) + return &anon_uffd_ops; + return vma->vm_ops ? vma->vm_ops->uffd_ops : NULL; +} + static __always_inline bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end) { @@ -2021,34 +2040,33 @@ out: bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags, bool wp_async) { - vm_flags &= __VM_UFFD_FLAGS; + const struct vm_uffd_ops *ops = vma_uffd_ops(vma); if (vma->vm_flags & VM_DROPPABLE) return false; - if ((vm_flags & VM_UFFD_MINOR) && - (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) - return false; + vm_flags &= __VM_UFFD_FLAGS; /* - * If wp async enabled, and WP is the only mode enabled, allow any + * If WP is the only mode enabled and context is wp async, allow any * memory type. */ if (wp_async && (vm_flags == VM_UFFD_WP)) return true; + /* For any other mode reject VMAs that don't implement vm_uffd_ops */ + if (!ops) + return false; + /* * If user requested uffd-wp but not enabled pte markers for - * uffd-wp, then shmem & hugetlbfs are not supported but only - * anonymous. + * uffd-wp, then only anonymous memory is supported */ if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma)) return false; - /* By default, allow any of anon|shmem|hugetlb */ - return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || - vma_is_shmem(vma); + return ops->can_userfault(vma, vm_flags); } static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, -- cgit v1.2.3
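Taken together, the mmap_prepare action helpers added across this log reduce a driver's mmap path to parameter checks plus a single action-helper call, with the core performing the mapping once the VMA is established. A minimal sketch under stated assumptions: the mydrv_* names, the MYDRV_* constants and the mydrv_pages array are hypothetical; only helpers introduced above are used.

/* I/O memory: describe the physical range; the core I/O remaps it. */
static int mydrv_mmap_prepare_io(struct vm_area_desc *desc)
{
	if (vma_desc_size(desc) > MYDRV_BAR_SIZE)
		return -EINVAL;

	/* Drivers may tweak desc->page_prot first, e.g. for write-combine. */
	mmap_action_simple_ioremap(desc, MYDRV_BAR_PHYS, MYDRV_BAR_SIZE);
	return 0;
}

/* Kernel pages: map a page array over the whole VMA. */
static int mydrv_mmap_prepare_pages(struct vm_area_desc *desc)
{
	/* The array must cover the entire range described by desc. */
	if (vma_desc_pages(desc) > MYDRV_NR_PAGES)
		return -EINVAL;

	mmap_action_map_kernel_pages_full(desc, mydrv_pages);
	return 0;
}

static const struct file_operations mydrv_fops = {
	.owner		= THIS_MODULE,
	.mmap_prepare	= mydrv_mmap_prepare_io,
};

In either case mmap_action_prepare() validates the chosen action up front, and the core completes (or, for the ioremap actions, delegates) the actual mapping, so the hooks never touch a live VMA.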