From 1aceed565ff172fc0331dd1d5e7e65139b711139 Mon Sep 17 00:00:00 2001 From: Bing Jiao Date: Wed, 14 Jan 2026 20:53:02 +0000 Subject: mm/vmscan: fix demotion targets checks in reclaim/demotion Patch series "mm/vmscan: fix demotion targets checks in reclaim/demotion", v9. This patch series addresses two issues in demote_folio_list(), can_demote(), and next_demotion_node() in reclaim/demotion. 1. demote_folio_list() and can_demote() do not correctly check demotion targets against cpuset.mems_effective, which will cause (a) pages to be demoted to not-allowed nodes and (b) pages to fail demotion even if the system still has allowed demotion nodes. Patch 1 fixes this bug by updating cpuset_node_allowed() and mem_cgroup_node_allowed() to return effective_mems, so that the result can be ANDed directly with the demotion targets. 2. next_demotion_node() returns a preferred demotion target, but it does not check the node against allowed nodes. Patch 2 ensures that next_demotion_node() filters against the allowed node mask and selects the closest demotion target to the source node. This patch (of 2): Fix two bugs in demote_folio_list() and can_demote() due to incorrect demotion target checks against cpuset.mems_effective in reclaim/demotion. Commit 7d709f49babc ("vmscan,cgroup: apply mems_effective to reclaim") introduces the cpuset.mems_effective check and applies it to can_demote(). However: 1. It does not apply this check in demote_folio_list(), which leads to situations where pages are demoted to nodes that are explicitly excluded from the task's cpuset.mems. 2. It checks only the nodes in the immediate next demotion hierarchy and does not check all allowed demotion targets in can_demote(). This can cause pages to never be demoted if the nodes in the next demotion hierarchy are not set in mems_effective. These bugs break resource isolation provided by cpuset.mems. 
This is visible from userspace because pages can either fail to be demoted entirely or be demoted to nodes that are not allowed in multi-tier memory systems. To address these bugs, update cpuset_node_allowed() and mem_cgroup_node_allowed() to return effective_mems, so that the result can be ANDed directly with the demotion targets. Also update can_demote() and demote_folio_list() accordingly. Bug 1 reproduction: Assume a system with 4 nodes, where nodes 0-1 are top-tier and nodes 2-3 are far-tier memory. All nodes have equal capacity. Test script: echo 1 > /sys/kernel/mm/numa/demotion_enabled mkdir /sys/fs/cgroup/test echo +cpuset > /sys/fs/cgroup/cgroup.subtree_control echo "0-2" > /sys/fs/cgroup/test/cpuset.mems echo $$ > /sys/fs/cgroup/test/cgroup.procs swapoff -a # Expectation: Should respect node 0-2 limit. # Observation: Node 3 shows significant allocation (MemFree drops) stress-ng --oomable --vm 1 --vm-bytes 150% --mbind 0,1 Bug 2 reproduction: Assume a system with 6 nodes, where nodes 0-2 are top-tier, node 3 is a far-tier node, and nodes 4-5 are the farthest-tier nodes. All nodes have equal capacity. Test script: echo 1 > /sys/kernel/mm/numa/demotion_enabled mkdir /sys/fs/cgroup/test echo +cpuset > /sys/fs/cgroup/cgroup.subtree_control echo "0-2,4-5" > /sys/fs/cgroup/test/cpuset.mems echo $$ > /sys/fs/cgroup/test/cgroup.procs swapoff -a # Expectation: Pages are demoted to Nodes 4-5 # Observation: No pages are demoted before oom. 
stress-ng --oomable --vm 1 --vm-bytes 150% --mbind 0,1,2 Link: https://lkml.kernel.org/r/20260114205305.2869796-1-bingjiao@google.com Link: https://lkml.kernel.org/r/20260114205305.2869796-2-bingjiao@google.com Fixes: 7d709f49babc ("vmscan,cgroup: apply mems_effective to reclaim") Signed-off-by: Bing Jiao Acked-by: Shakeel Butt Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Gregory Price Cc: Johannes Weiner Cc: Joshua Hahn Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Vlastimil Babka Cc: Waiman Long Cc: Wei Xu Cc: Yuanchu Xie Cc: Signed-off-by: Andrew Morton --- include/linux/cpuset.h | 6 +++--- include/linux/memcontrol.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index a98d3330385c..631577384677 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -174,7 +174,7 @@ static inline void set_mems_allowed(nodemask_t nodemask) task_unlock(current); } -extern bool cpuset_node_allowed(struct cgroup *cgroup, int nid); +extern void cpuset_nodes_allowed(struct cgroup *cgroup, nodemask_t *mask); #else /* !CONFIG_CPUSETS */ static inline bool cpusets_enabled(void) { return false; } @@ -301,9 +301,9 @@ static inline bool read_mems_allowed_retry(unsigned int seq) return false; } -static inline bool cpuset_node_allowed(struct cgroup *cgroup, int nid) +static inline void cpuset_nodes_allowed(struct cgroup *cgroup, nodemask_t *mask) { - return true; + nodes_copy(*mask, node_states[N_MEMORY]); } #endif /* !CONFIG_CPUSETS */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ed4764e1a30e..b6c82c8f73e1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1736,7 +1736,7 @@ static inline void count_objcg_events(struct obj_cgroup *objcg, rcu_read_unlock(); } -bool mem_cgroup_node_allowed(struct mem_cgroup 
*memcg, int nid); +void mem_cgroup_node_filter_allowed(struct mem_cgroup *memcg, nodemask_t *mask); void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg); @@ -1807,9 +1807,9 @@ static inline ino_t page_cgroup_ino(struct page *page) return 0; } -static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid) +static inline void mem_cgroup_node_filter_allowed(struct mem_cgroup *memcg, + nodemask_t *mask) { - return true; } static inline void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg) -- cgit v1.2.3 From 7ec9ecf217f8e565577bde8a47915a51491ef3a3 Mon Sep 17 00:00:00 2001 From: Bing Jiao Date: Wed, 14 Jan 2026 20:53:03 +0000 Subject: mm/vmscan: select the closest preferred node in demote_folio_list() The preferred demotion node (migration_target_control.nid) should be the one closest to the source node to minimize migration latency. Currently, a discrepancy exists where demote_folio_list() randomly selects an allowed node if the preferred node from next_demotion_node() is not set in mems_effective. To address this, update next_demotion_node() to select the preferred target from among the allowed nodes, and to return the closest allowed demotion target if none of the preferred nodes are set in mems_effective. This ensures that the preferred demotion target is consistently the closest available node to the source node. 
[akpm@linux-foundation.org: fix comment typo, per Shakeel] Link: https://lkml.kernel.org/r/20260114205305.2869796-3-bingjiao@google.com Signed-off-by: Bing Jiao Acked-by: Shakeel Butt Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Gregory Price Cc: Johannes Weiner Cc: Joshua Hahn Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Vlastimil Babka Cc: Waiman Long Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/memory-tiers.h | 6 +++--- mm/memory-tiers.c | 21 ++++++++++++++++----- mm/vmscan.c | 5 ++--- 3 files changed, 21 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 7a805796fcfd..96987d9d95a8 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -53,11 +53,11 @@ struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct list_head *memory_types); void mt_put_memory_types(struct list_head *memory_types); #ifdef CONFIG_MIGRATION -int next_demotion_node(int node); +int next_demotion_node(int node, const nodemask_t *allowed_mask); void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets); bool node_is_toptier(int node); #else -static inline int next_demotion_node(int node) +static inline int next_demotion_node(int node, const nodemask_t *allowed_mask) { return NUMA_NO_NODE; } @@ -101,7 +101,7 @@ static inline void clear_node_memory_type(int node, struct memory_dev_type *memt } -static inline int next_demotion_node(int node) +static inline int next_demotion_node(int node, const nodemask_t *allowed_mask) { return NUMA_NO_NODE; } diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 0ae8bec86346..545e34626df7 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -320,16 +320,17 @@ void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets) /** * next_demotion_node() - Get the next node in 
the demotion path * @node: The starting node to lookup the next node + * @allowed_mask: The pointer to allowed node mask * * Return: node id for next memory node in the demotion path hierarchy * from @node; NUMA_NO_NODE if @node is terminal. This does not keep * @node online or guarantee that it *continues* to be the next demotion * target. */ -int next_demotion_node(int node) +int next_demotion_node(int node, const nodemask_t *allowed_mask) { struct demotion_nodes *nd; - int target; + nodemask_t mask; if (!node_demotion) return NUMA_NO_NODE; @@ -344,6 +345,10 @@ int next_demotion_node(int node) * node_demotion[] reads need to be consistent. */ rcu_read_lock(); + /* Filter out nodes that are not in allowed_mask. */ + nodes_and(mask, nd->preferred, *allowed_mask); + rcu_read_unlock(); + /* * If there are multiple target nodes, just select one * target node randomly. @@ -356,10 +361,16 @@ int next_demotion_node(int node) * caching issue, which seems more complicated. So selecting * target node randomly seems better until now. */ - target = node_random(&nd->preferred); - rcu_read_unlock(); + if (!nodes_empty(mask)) + return node_random(&mask); - return target; + /* + * Preferred nodes are not in allowed_mask. Flip bits in + * allowed_mask as used node mask. Then, use it to get the + * closest demotion target. + */ + nodes_complement(mask, *allowed_mask); + return find_next_best_node(node, &mask); } static void disable_all_demotion_targets(void) diff --git a/mm/vmscan.c b/mm/vmscan.c index 911614723689..44e4fcd6463c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1046,12 +1046,11 @@ static unsigned int demote_folio_list(struct list_head *demote_folios, if (nodes_empty(allowed_mask)) return 0; - target_nid = next_demotion_node(pgdat->node_id); + target_nid = next_demotion_node(pgdat->node_id, &allowed_mask); if (target_nid == NUMA_NO_NODE) /* No lower-tier nodes or nodes were hot-unplugged. 
*/ return 0; - if (!node_isset(target_nid, allowed_mask)) - target_nid = node_random(&allowed_mask); + mtc.nid = target_nid; /* Demotion ignores all cpuset and mempolicy settings */ -- cgit v1.2.3 From bed76bec3111c956a9756643d759d5c7a2193b37 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 21 Jan 2026 11:49:36 -0500 Subject: mm: relocate the page table ceiling and floor definitions Patch series "Remove XA_ZERO from error recovery of dup_mmap()", v3. It is possible that the dup_mmap() call fails on allocating or setting up a vma after the maple tree of the oldmm is copied. Today, that failure point is marked by inserting an XA_ZERO entry over the failure point so that the exact location does not need to be communicated through to exit_mmap(). However, a race exists in the tear down process because dup_mmap() drops the mmap lock before exit_mmap() can remove the partially set up vma tree. This means that other tasks may get to the mm tree and find the invalid vma pointer (since it's an XA_ZERO entry), even though the mm is marked as MMF_OOM_SKIP and MMF_UNSTABLE. To remove the race fully, the tree must be cleaned up before dropping the lock. This is accomplished by extracting the vma cleanup in exit_mmap() and changing the required functions to pass through the vma search limit. Any other tree modifications would require extra cycles which should be spent on freeing memory. This does run the risk of increasing the possibility of finding no vmas (which is already possible!) in code that isn't careful. The final four patches are to address the excessive argument lists being passed between the functions. Using the struct unmap_desc also allows some special-case code to be removed in favour of the struct setup differences. This patch (of 11): pgtable.h defines fallbacks for the ceiling and floor of the page tables within the CONFIG_MMU section. Moving the definitions outside of the CONFIG_MMU section allows for using them in generic code. 
[akpm@linux-foundation.org: remove stray newline, per SeongJae] Link: https://lkml.kernel.org/r/20260121164946.2093480-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20260121164946.2093480-2-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Suggested-by: Lorenzo Stoakes Suggested-by: SeongJae Park Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: Jann Horn Cc: Kairui Song Cc: Kemeng Shi Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Nhat Pham Cc: Pedro Falcato Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 38 +++++++++++++++++++------------------- mm/vma_internal.h | 1 + 2 files changed, 20 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 827dca25c0bc..21b67d937555 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -22,25 +22,6 @@ #error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{P4D,PUD,PMD}_FOLDED #endif -/* - * On almost all architectures and configurations, 0 can be used as the - * upper ceiling to free_pgtables(): on many architectures it has the same - * effect as using TASK_SIZE. However, there is one configuration which - * must impose a more careful limit, to avoid freeing kernel pgtables. - */ -#ifndef USER_PGTABLES_CEILING -#define USER_PGTABLES_CEILING 0UL -#endif - -/* - * This defines the first usable user address. Platforms - * can override its value with custom FIRST_USER_ADDRESS - * defined in their respective . - */ -#ifndef FIRST_USER_ADDRESS -#define FIRST_USER_ADDRESS 0UL -#endif - /* * This defines the generic helper for accessing PMD page * table page. 
Although platforms can still override this @@ -1629,6 +1610,25 @@ void arch_sync_kernel_mappings(unsigned long start, unsigned long end); #endif /* CONFIG_MMU */ +/* + * On almost all architectures and configurations, 0 can be used as the + * upper ceiling to free_pgtables(): on many architectures it has the same + * effect as using TASK_SIZE. However, there is one configuration which + * must impose a more careful limit, to avoid freeing kernel pgtables. + */ +#ifndef USER_PGTABLES_CEILING +#define USER_PGTABLES_CEILING 0UL +#endif + +/* + * This defines the first usable user address. Platforms + * can override its value with custom FIRST_USER_ADDRESS + * defined in their respective . + */ +#ifndef FIRST_USER_ADDRESS +#define FIRST_USER_ADDRESS 0UL +#endif + /* * No-op macros that just return the current protection value. Defined here * because these macros can be used even if CONFIG_MMU is not defined. diff --git a/mm/vma_internal.h b/mm/vma_internal.h index 2f05735ff190..2da6d224c1a8 100644 --- a/mm/vma_internal.h +++ b/mm/vma_internal.h @@ -46,6 +46,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From 0df5a8d3948da979b8ab811a692b34635e1b146d Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 21 Jan 2026 11:49:44 -0500 Subject: mm/vma: use unmap_desc in exit_mmap() and vms_clear_ptes() Convert vms_clear_ptes() to use unmap_desc to call unmap_vmas() instead of the large argument list. The UNMAP_STATE() cannot be used because the vma iterator in the vms does not point to the correct maple state (mas_detach), and the tree_end will be set incorrectly. Setting up the arguments manually avoids setting the struct up incorrectly and doing extra work to get the correct pagetable range. exit_mmap() also calls unmap_vmas() with many arguments. Using the unmap_all_init() function to set the unmap descriptor for all vmas makes this a bit easier to read. Update to the vma test code is necessary to ensure testing continues to function. 
No functional changes intended. Link: https://lkml.kernel.org/r/20260121164946.2093480-10-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: David Hildenbrand Cc: Jann Horn Cc: Kairui Song Cc: Kemeng Shi Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Nhat Pham Cc: Pedro Falcato Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ---- mm/internal.h | 3 +++ mm/memory.c | 20 ++++++++------------ mm/mmap.c | 4 +++- mm/vma.c | 27 ++++++++++++++++++++++----- mm/vma.h | 14 ++++++++++++++ tools/testing/vma/vma_internal.h | 6 +++--- 7 files changed, 53 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2c6c6d00ed73..945902d23d47 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2625,10 +2625,6 @@ static inline void zap_vma_pages(struct vm_area_struct *vma) zap_page_range_single(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL); } -void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, - struct vm_area_struct *start_vma, unsigned long start, - unsigned long end, unsigned long tree_end); - struct mmu_notifier_range; void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, diff --git a/mm/internal.h b/mm/internal.h index 2a0e42e36b48..0f3ad8665d95 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -197,6 +197,9 @@ static inline void vma_close(struct vm_area_struct *vma) } } +/* unmap_vmas is in mm/memory.c */ +void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap); + #ifdef CONFIG_MMU static inline void get_anon_vma(struct anon_vma *anon_vma) diff --git a/mm/memory.c b/mm/memory.c index 6033cf6c93de..d68f8f082b1c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2144,11 +2144,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, /** * unmap_vmas - unmap a range of memory covered by a list of 
vma's * @tlb: address of the caller's struct mmu_gather - * @mas: the maple state - * @vma: the starting vma - * @start_addr: virtual address at which to start unmapping - * @end_addr: virtual address at which to end unmapping - * @tree_end: The maximum index to check + * @unmap: The unmap_desc * * Unmap all pages in the vma list. * @@ -2161,10 +2157,9 @@ static void unmap_single_vma(struct mmu_gather *tlb, * ensure that any thus-far unmapped pages are flushed before unmap_vmas() * drops the lock and schedules. */ -void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, - struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr, unsigned long tree_end) +void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap) { + struct vm_area_struct *vma; struct mmu_notifier_range range; struct zap_details details = { .zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP, @@ -2172,16 +2167,17 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, .even_cows = true, }; + vma = unmap->first; mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, - start_addr, end_addr); + unmap->vma_start, unmap->vma_end); mmu_notifier_invalidate_range_start(&range); do { - unsigned long start = start_addr; - unsigned long end = end_addr; + unsigned long start = unmap->vma_start; + unsigned long end = unmap->vma_end; hugetlb_zap_begin(vma, &start, &end); unmap_single_vma(tlb, vma, start, end, &details); hugetlb_zap_end(vma, &details); - vma = mas_find(mas, tree_end - 1); + vma = mas_find(unmap->mas, unmap->tree_end - 1); } while (vma); mmu_notifier_invalidate_range_end(&range); } diff --git a/mm/mmap.c b/mm/mmap.c index 4500e61a0d5e..042b6b4b6ab8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1277,6 +1277,7 @@ void exit_mmap(struct mm_struct *mm) struct vm_area_struct *vma; unsigned long nr_accounted = 0; VMA_ITERATOR(vmi, mm, 0); + struct unmap_desc unmap; /* mm's last user has gone, and its about to be pulled down */ mmu_notifier_release(mm); @@ 
-1292,11 +1293,12 @@ void exit_mmap(struct mm_struct *mm) goto destroy; } + unmap_all_init(&unmap, &vmi, vma); flush_cache_mm(mm); tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */ - unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX); + unmap_vmas(&tlb, &unmap); mmap_read_unlock(mm); /* diff --git a/mm/vma.c b/mm/vma.c index 75c68c74c062..b46c869d4bb0 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -480,8 +480,7 @@ void unmap_region(struct unmap_desc *unmap) tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); - unmap_vmas(&tlb, mas, unmap->first, unmap->vma_start, unmap->vma_end, - unmap->vma_end); + unmap_vmas(&tlb, unmap); mas_set(mas, unmap->tree_reset); free_pgtables(&tlb, mas, unmap->first, unmap->pg_start, unmap->pg_end, unmap->tree_end, unmap->mm_wr_locked); @@ -1257,6 +1256,26 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms, struct ma_state *mas_detach, bool mm_wr_locked) { struct mmu_gather tlb; + struct unmap_desc unmap = { + .mas = mas_detach, + .first = vms->vma, + /* start and end may be different if there is no prev or next vma. */ + .pg_start = vms->unmap_start, + .pg_end = vms->unmap_end, + .vma_start = vms->start, + .vma_end = vms->end, + /* + * The tree limits and reset differ from the normal case since it's a + * side-tree + */ + .tree_reset = 1, + .tree_end = vms->vma_count, + /* + * We can free page tables without write-locking mmap_lock because VMAs + * were isolated before we downgraded mmap_lock. 
+ */ + .mm_wr_locked = mm_wr_locked, + }; if (!vms->clear_ptes) /* Nothing to do */ return; @@ -1268,9 +1287,7 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms, mas_set(mas_detach, 1); tlb_gather_mmu(&tlb, vms->vma->vm_mm); update_hiwater_rss(vms->vma->vm_mm); - unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end, - vms->vma_count); - + unmap_vmas(&tlb, &unmap); mas_set(mas_detach, 1); /* start and end may be different if there is no prev or next vma. */ free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start, diff --git a/mm/vma.h b/mm/vma.h index cca7553c7d64..bb7fa5d2bde2 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -167,6 +167,20 @@ struct unmap_desc { bool mm_wr_locked; /* If the mmap write lock is held */ }; +static inline void unmap_all_init(struct unmap_desc *unmap, + struct vma_iterator *vmi, struct vm_area_struct *vma) +{ + unmap->mas = &vmi->mas; + unmap->first = vma; + unmap->pg_start = FIRST_USER_ADDRESS; + unmap->pg_end = USER_PGTABLES_CEILING; + unmap->vma_start = 0; + unmap->vma_end = ULONG_MAX; + unmap->tree_end = ULONG_MAX; + unmap->tree_reset = vma->vm_end; + unmap->mm_wr_locked = false; +} + #define UNMAP_STATE(name, _vmi, _vma, _vma_start, _vma_end, _prev, _next) \ struct unmap_desc name = { \ .mas = &(_vmi)->mas, \ diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index f50b8ddee612..0b4918aac8d6 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -1131,9 +1131,9 @@ static inline void update_hiwater_vm(struct mm_struct *mm) { } -static inline void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, - struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr, unsigned long tree_end) +struct unmap_desc; + +static inline void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap) { } -- cgit v1.2.3 From a30de4c6b79a83944d0d6a54cd6ae63014b62ef7 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 22 Jan 2026 16:06:10 
+0000 Subject: mm/vma: remove __private sparse decoration from vma_flags_t Patch series "mm: add bitmap VMA flag helpers and convert all mmap_prepare to use them", v2. We introduced the bitmap VMA type vma_flags_t in the aptly named commit 9ea35a25d51b ("mm: introduce VMA flags bitmap type") in order to permit future growth in VMA flags and to prevent the asinine requirement that VMA flags be available to 64-bit kernels only if they happened to use a bit number above 32 bits. This is a long-term project as there are very many users of VMA flags within the kernel that need to be updated in order to utilise this new type. In order to further this aim, this series adds a number of helper functions to enable ordinary interactions with VMA flags - that is testing, setting and clearing them. In order to make working with VMA bit numbers less cumbersome this series introduces the mk_vma_flags() helper macro which generates a vma_flags_t from a variadic parameter list, e.g.: vma_flags_t flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT); It turns out that the compiler optimises this very well to the point that this is just as efficient as using VM_xxx pre-computed bitmap values. This series then introduces the following functions: bool vma_flags_test_mask(vma_flags_t flags, vma_flags_t to_test); bool vma_flags_test_all_mask(vma_flags_t flags, vma_flags_t to_test); void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t to_set); void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t to_clear); These provide a means of testing any flag, testing all flags, setting, and clearing a specific vma_flags_t mask. 
For convenience, helper macros are provided - vma_flags_test(), vma_flags_set() and vma_flags_clear(), each of which utilises mk_vma_flags() to make these operations easier, as well as an EMPTY_VMA_FLAGS macro to make initialisation of an empty vma_flags_t value easier, e.g.: vma_flags_t flags = EMPTY_VMA_FLAGS; vma_flags_set(&flags, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT); ... if (vma_flags_test(flags, VMA_READ_BIT)) { ... } ... if (vma_flags_test_all_mask(flags, VMA_REMAP_FLAGS)) { ... } ... vma_flags_clear(&flags, VMA_READ_BIT); Since callers are often dealing with a vm_area_struct (VMA) or vm_area_desc (VMA descriptor as used in .mmap_prepare) object, this series further provides helpers for these - firstly vma_set_flags_mask() and vma_set_flags() for a VMA: vma_flags_t flags = EMPTY_VMA_FLAGS; vma_flags_set(&flags, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT); ... vma_set_flags_mask(&vma, flags); ... vma_set_flags(&vma, VMA_DONTDUMP_BIT); Note that these do NOT ensure appropriate locks are taken and assume the caller takes care of this. For VMA descriptors this series adds vma_desc_[test, set, clear]_flags_mask() and vma_desc_[test, set, clear]_flags() for a VMA descriptor, e.g.: static int foo_mmap_prepare(struct vm_area_desc *desc) { ... vma_desc_set_flags(desc, VMA_SEQ_READ_BIT); vma_desc_clear_flags(desc, VMA_RAND_READ_BIT); ... if (vma_desc_test_flags(desc, VMA_SHARED_BIT)) { ... } ... } With these helpers introduced, this series then updates all mmap_prepare users to make use of the vma_flags_t vm_area_desc->vma_flags field rather than the legacy vm_flags_t vm_area_desc->vm_flags field. In order to do so, several other related functions need to be updated, with separate patches for larger changes in hugetlbfs, secretmem and shmem before finally removing vm_area_desc->vm_flags altogether. 
This lays the foundations for future elimination of vm_flags_t and associated defines and functionality altogether in the long run, and elimination of the use of vm_flags_t in f_op->mmap() hooks in the near term as mmap_prepare replaces these. There is a useful synergy between the VMA flags and mmap_prepare work here as with this change in place, converting f_op->mmap() to f_op->mmap_prepare naturally also converts use of vm_flags_t to vma_flags_t in all drivers which declare mmap handlers. This accounts for the majority of the users of the legacy vm_flags_*() helpers and thus a large number of drivers which need to interact with VMA flags in general. This series also updates the userland VMA tests to account for the change, and adds unit tests for these helper functions to assert that they behave as expected. In order to facilitate this change in a sensible way, the series also separates out the VMA unit tests into: code that is duplicated from the kernel that should be kept in sync, code that is customised for test purposes, and code that is stubbed out. We also separate out the VMA userland tests into separate files to make them easier to manage and to provide a sensible baseline for adding the userland tests for these helpers. This patch (of 13): We need to pass around these values and access them in a way that sparse does not allow, as __private implies noderef, i.e. disallowing dereference of the value, which manifests as sparse warnings even when passed around benignly. Link: https://lkml.kernel.org/r/cover.1769097829.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/64fa89f416f22a60ae74cfff8fd565e7677be192.1769097829.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Pedro Falcato Reviewed-by: Liam R. Howlett Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Jason Gunthorpe Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Cc: Damien Le Moal Cc: "Darrick J. 
Wong" Cc: Jarkko Sakkinen Cc: Yury Norov Cc: Chris Mason Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ++-- include/linux/mm_types.h | 14 ++++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 945902d23d47..c27d79f6b8c0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -943,7 +943,7 @@ static inline void vm_flags_reset_once(struct vm_area_struct *vma, * system word. */ if (NUM_VMA_FLAG_BITS > BITS_PER_LONG) { - unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags); + unsigned long *bitmap = vma->flags.__vma_flags; bitmap_zero(&bitmap[1], NUM_VMA_FLAG_BITS - BITS_PER_LONG); } @@ -1006,7 +1006,7 @@ static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma, static inline void vma_flag_set_atomic(struct vm_area_struct *vma, vma_flag_t bit) { - unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags); + unsigned long *bitmap = vma->flags.__vma_flags; vma_assert_stabilised(vma); if (__vma_flag_atomic_valid(vma, bit)) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8731606d8d36..ed0e128361f7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -866,7 +866,7 @@ struct mmap_action { #define NUM_VMA_FLAG_BITS BITS_PER_LONG typedef struct { DECLARE_BITMAP(__vma_flags, NUM_VMA_FLAG_BITS); -} __private vma_flags_t; +} vma_flags_t; /* * Describes a VMA that is about to be mmap()'ed. Drivers may choose to @@ -1059,7 +1059,7 @@ struct vm_area_struct { /* Clears all bits in the VMA flags bitmap, non-atomically. 
*/ static inline void vma_flags_clear_all(vma_flags_t *flags) { - bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS); + bitmap_zero(flags->__vma_flags, NUM_VMA_FLAG_BITS); } /* @@ -1070,7 +1070,9 @@ static inline void vma_flags_clear_all(vma_flags_t *flags) */ static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value) { - *ACCESS_PRIVATE(flags, __vma_flags) = value; + unsigned long *bitmap = flags->__vma_flags; + + bitmap[0] = value; } /* @@ -1081,7 +1083,7 @@ static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long va */ static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value) { - unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + unsigned long *bitmap = flags->__vma_flags; WRITE_ONCE(*bitmap, value); } @@ -1089,7 +1091,7 @@ static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned lo /* Update the first system word of VMA flags setting bits, non-atomically. */ static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) { - unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + unsigned long *bitmap = flags->__vma_flags; *bitmap |= value; } @@ -1097,7 +1099,7 @@ static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) /* Update the first system word of VMA flags clearing bits, non-atomically. 
*/ static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value) { - unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + unsigned long *bitmap = flags->__vma_flags; *bitmap &= ~value; } -- cgit v1.2.3 From e388d31257eddc1077a02ed786513d606c9e3266 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 22 Jan 2026 16:06:11 +0000 Subject: mm: rename vma_flag_test/set_atomic() to vma_test/set_atomic_flag() In order to stay consistent between functions which manipulate a vma_flags_t argument of the form of vma_flags_...() and those which manipulate a VMA (in this case the flags field of a VMA), rename vma_flag_[test/set]_atomic() to vma_[test/set]_atomic_flag(). This lays the groundwork for adding VMA flag manipulation functions in a subsequent commit. Link: https://lkml.kernel.org/r/033dcf12e819dee5064582bced9b12ea346d1607.1769097829.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Pedro Falcato Reviewed-by: Liam R. Howlett Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Jason Gunthorpe Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Cc: Damien Le Moal Cc: "Darrick J. 
Wong" Cc: Jarkko Sakkinen Cc: Yury Norov Cc: Chris Mason Cc: Pedro Falcato Signed-off-by: Andrew Morton --- include/linux/mm.h | 13 +++++-------- mm/khugepaged.c | 2 +- mm/madvise.c | 2 +- 3 files changed, 7 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index c27d79f6b8c0..67b80f0ea225 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -987,8 +987,7 @@ static inline void vm_flags_mod(struct vm_area_struct *vma, __vm_flags_mod(vma, set, clear); } -static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma, - vma_flag_t bit) +static inline bool __vma_atomic_valid_flag(struct vm_area_struct *vma, vma_flag_t bit) { const vm_flags_t mask = BIT((__force int)bit); @@ -1003,13 +1002,12 @@ static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma, * Set VMA flag atomically. Requires only VMA/mmap read lock. Only specific * valid flags are allowed to do this. */ -static inline void vma_flag_set_atomic(struct vm_area_struct *vma, - vma_flag_t bit) +static inline void vma_set_atomic_flag(struct vm_area_struct *vma, vma_flag_t bit) { unsigned long *bitmap = vma->flags.__vma_flags; vma_assert_stabilised(vma); - if (__vma_flag_atomic_valid(vma, bit)) + if (__vma_atomic_valid_flag(vma, bit)) set_bit((__force int)bit, bitmap); } @@ -1020,10 +1018,9 @@ static inline void vma_flag_set_atomic(struct vm_area_struct *vma, * This is necessarily racey, so callers must ensure that serialisation is * achieved through some other means, or that races are permissible. 
*/ -static inline bool vma_flag_test_atomic(struct vm_area_struct *vma, - vma_flag_t bit) +static inline bool vma_test_atomic_flag(struct vm_area_struct *vma, vma_flag_t bit) { - if (__vma_flag_atomic_valid(vma, bit)) + if (__vma_atomic_valid_flag(vma, bit)) return test_bit((__force int)bit, &vma->vm_flags); return false; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 1b8faae5b448..fa1e57fd2c46 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1741,7 +1741,7 @@ static bool file_backed_vma_is_retractable(struct vm_area_struct *vma) * obtained on guard region installation after the flag is set, so this * check being performed under this lock excludes races. */ - if (vma_flag_test_atomic(vma, VMA_MAYBE_GUARD_BIT)) + if (vma_test_atomic_flag(vma, VMA_MAYBE_GUARD_BIT)) return false; return true; diff --git a/mm/madvise.c b/mm/madvise.c index 1f3040688f04..8debb2d434aa 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1140,7 +1140,7 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior) * acquire an mmap/VMA write lock to read it. All remaining readers may * or may not see the flag set, but we don't care. */ - vma_flag_set_atomic(vma, VMA_MAYBE_GUARD_BIT); + vma_set_atomic_flag(vma, VMA_MAYBE_GUARD_BIT); /* * If anonymous and we are establishing page tables the VMA ought to -- cgit v1.2.3 From 1c628004e0de0383a5a56facdb0bf28a54441b5f Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 22 Jan 2026 16:06:12 +0000 Subject: mm: add mk_vma_flags() bitmap flag macro helper This patch introduces the mk_vma_flags() macro helper to allow easy manipulation of VMA flags utilising the new bitmap representation implemented of VMA flags defined by the vma_flags_t type. It is a variadic macro which provides a bitwise-or'd representation of all of each individual VMA flag specified. 
Note that, while we maintain VM_xxx flags for backwards compatibility until the conversion is complete, we define VMA flags of type vma_flag_t using VMA_xxx_BIT to avoid confusing the two. This helper macro therefore can be used thusly: vma_flags_t flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT); Testing has demonstrated that the compiler optimises this code such that it generates the same assembly utilising this macro as it does if the flags were specified manually, for instance: vma_flags_t get_flags(void) { return mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT); } Generates the same code as: vma_flags_t get_flags(void) { vma_flags_t flags; vma_flags_clear_all(&flags); vma_flag_set(&flags, VMA_READ_BIT); vma_flag_set(&flags, VMA_WRITE_BIT); vma_flag_set(&flags, VMA_EXEC_BIT); return flags; } And: vma_flags_t get_flags(void) { vma_flags_t flags; unsigned long *bitmap = ACCESS_PRIVATE(&flags, __vma_flags); *bitmap = 1UL << (__force int)VMA_READ_BIT; *bitmap |= 1UL << (__force int)VMA_WRITE_BIT; *bitmap |= 1UL << (__force int)VMA_EXEC_BIT; return flags; } That is: get_flags: movl $7, %eax ret Link: https://lkml.kernel.org/r/fde00df6ff7fb8c4b42cc0defa5a4924c7a1943a.1769097829.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Suggested-by: Jason Gunthorpe Reviewed-by: Pedro Falcato Reviewed-by: Liam R. Howlett Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Cc: Damien Le Moal Cc: "Darrick J. 
Wong" Cc: Jarkko Sakkinen Cc: Yury Norov Cc: Chris Mason Signed-off-by: Andrew Morton --- include/linux/mm.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 67b80f0ea225..d3d10c769d6f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2,6 +2,7 @@ #ifndef _LINUX_MM_H #define _LINUX_MM_H +#include #include #include #include @@ -1026,6 +1027,38 @@ static inline bool vma_test_atomic_flag(struct vm_area_struct *vma, vma_flag_t b return false; } +/* Set an individual VMA flag in flags, non-atomically. */ +static inline void vma_flag_set(vma_flags_t *flags, vma_flag_t bit) +{ + unsigned long *bitmap = flags->__vma_flags; + + __set_bit((__force int)bit, bitmap); +} + +static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits) +{ + vma_flags_t flags; + int i; + + vma_flags_clear_all(&flags); + for (i = 0; i < count; i++) + vma_flag_set(&flags, bits[i]); + return flags; +} + +/* + * Helper macro which bitwise-or combines the specified input flags into a + * vma_flags_t bitmap value. E.g.: + * + * vma_flags_t flags = mk_vma_flags(VMA_IO_BIT, VMA_PFNMAP_BIT, + * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT); + * + * The compiler cleverly optimises away all of the work and this ends up being + * equivalent to aggregating the values manually. + */ +#define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ + (const vma_flag_t []){__VA_ARGS__}) + static inline void vma_set_anonymous(struct vm_area_struct *vma) { vma->vm_ops = NULL; -- cgit v1.2.3 From bae0ba7c7c0a022287d8b093da63ebcb794d77ea Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 22 Jan 2026 16:06:14 +0000 Subject: mm: add basic VMA flag operation helper functions Now we have the mk_vma_flags() macro helper which permits easy specification of any number of VMA flags, add helper functions which operate with vma_flags_t parameters. 
This patch provides vma_flags_test[_mask](), vma_flags_set[_mask]() and vma_flags_clear[_mask]() respectively testing, setting and clearing flags with the _mask variants accepting vma_flags_t parameters, and the non-mask variants implemented as macros which accept a list of flags. This allows us to trivially test/set/clear aggregate VMA flag values as necessary, for instance: if (vma_flags_test(&flags, VMA_READ_BIT, VMA_WRITE_BIT)) goto readwrite; vma_flags_set(&flags, VMA_READ_BIT, VMA_WRITE_BIT); vma_flags_clear(&flags, VMA_READ_BIT, VMA_WRITE_BIT); We also add a function for testing that ALL flags are set for convenience, e.g.: if (vma_flags_test_all(&flags, VMA_READ_BIT, VMA_MAYREAD_BIT)) { /* Both READ and MAYREAD flags set */ ... } The compiler generates optimal assembly for each such that they behave as if the caller were setting the bitmap flags manually. This is important for e.g. drivers which manipulate flag values rather than a VMA's specific flag values. We also add helpers for testing, setting and clearing flags for VMAs and VMA descriptors to reduce boilerplate. Also add the EMPTY_VMA_FLAGS define to aid initialisation of empty flags. Finally, update the userland VMA tests to add the helpers there so they can be utilised as part of userland testing. Link: https://lkml.kernel.org/r/885d4897d67a6a57c0b07fa182a7055ad752df11.1769097829.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Pedro Falcato Reviewed-by: Liam R. Howlett Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Jason Gunthorpe Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Cc: Damien Le Moal Cc: "Darrick J. 
Wong" Cc: Jarkko Sakkinen Cc: Yury Norov Cc: Chris Mason Signed-off-by: Andrew Morton --- include/linux/mm.h | 165 +++++++++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 4 +- tools/testing/vma/vma_internal.h | 147 +++++++++++++++++++++++++++++----- 3 files changed, 295 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index d3d10c769d6f..aa99b28e7a8a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1059,6 +1059,171 @@ static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits) #define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ (const vma_flag_t []){__VA_ARGS__}) +/* Test each of to_test flags in flags, non-atomically. */ +static __always_inline bool vma_flags_test_mask(const vma_flags_t *flags, + vma_flags_t to_test) +{ + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_to_test = to_test.__vma_flags; + + return bitmap_intersects(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS); +} + +/* + * Test whether any specified VMA flag is set, e.g.: + * + * if (vma_flags_test(flags, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... } + */ +#define vma_flags_test(flags, ...) \ + vma_flags_test_mask(flags, mk_vma_flags(__VA_ARGS__)) + +/* Test that ALL of the to_test flags are set, non-atomically. */ +static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags, + vma_flags_t to_test) +{ + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_to_test = to_test.__vma_flags; + + return bitmap_subset(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS); +} + +/* + * Test whether ALL specified VMA flags are set, e.g.: + * + * if (vma_flags_test_all(flags, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... } + */ +#define vma_flags_test_all(flags, ...) \ + vma_flags_test_all_mask(flags, mk_vma_flags(__VA_ARGS__)) + +/* Set each of the to_set flags in flags, non-atomically. 
*/ +static __always_inline void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t to_set) +{ + unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_to_set = to_set.__vma_flags; + + bitmap_or(bitmap, bitmap, bitmap_to_set, NUM_VMA_FLAG_BITS); +} + +/* + * Set all specified VMA flags, e.g.: + * + * vma_flags_set(&flags, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT); + */ +#define vma_flags_set(flags, ...) \ + vma_flags_set_mask(flags, mk_vma_flags(__VA_ARGS__)) + +/* Clear all of the to-clear flags in flags, non-atomically. */ +static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t to_clear) +{ + unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_to_clear = to_clear.__vma_flags; + + bitmap_andnot(bitmap, bitmap, bitmap_to_clear, NUM_VMA_FLAG_BITS); +} + +/* + * Clear all specified individual flags, e.g.: + * + * vma_flags_clear(&flags, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT); + */ +#define vma_flags_clear(flags, ...) \ + vma_flags_clear_mask(flags, mk_vma_flags(__VA_ARGS__)) + +/* + * Helper to test that ALL specified flags are set in a VMA. + * + * Note: appropriate locks must be held, this function does not acquire them for + * you. + */ +static inline bool vma_test_all_flags_mask(const struct vm_area_struct *vma, + vma_flags_t flags) +{ + return vma_flags_test_all_mask(&vma->flags, flags); +} + +/* + * Helper macro for checking that ALL specified flags are set in a VMA, e.g.: + * + * if (vma_test_all_flags(vma, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... } + */ +#define vma_test_all_flags(vma, ...) \ + vma_test_all_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) + +/* + * Helper to set all VMA flags in a VMA. + * + * Note: appropriate locks must be held, this function does not acquire them for + * you. 
+ */ +static inline void vma_set_flags_mask(struct vm_area_struct *vma, + vma_flags_t flags) +{ + vma_flags_set_mask(&vma->flags, flags); +} + +/* + * Helper macro for specifying VMA flags in a VMA, e.g.: + * + * vma_set_flags(vma, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT, + * VMA_DONTDUMP_BIT); + * + * Note: appropriate locks must be held, this function does not acquire them for + * you. + */ +#define vma_set_flags(vma, ...) \ + vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) + +/* Helper to test all VMA flags in a VMA descriptor. */ +static inline bool vma_desc_test_flags_mask(const struct vm_area_desc *desc, + vma_flags_t flags) +{ + return vma_flags_test_mask(&desc->vma_flags, flags); +} + +/* + * Helper macro for testing VMA flags for an input pointer to a struct + * vm_area_desc object describing a proposed VMA, e.g.: + * + * if (vma_desc_test_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, + * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT)) { ... } + */ +#define vma_desc_test_flags(desc, ...) \ + vma_desc_test_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) + +/* Helper to set all VMA flags in a VMA descriptor. */ +static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, + vma_flags_t flags) +{ + vma_flags_set_mask(&desc->vma_flags, flags); +} + +/* + * Helper macro for specifying VMA flags for an input pointer to a struct + * vm_area_desc object describing a proposed VMA, e.g.: + * + * vma_desc_set_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT, + * VMA_DONTDUMP_BIT); + */ +#define vma_desc_set_flags(desc, ...) \ + vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) + +/* Helper to clear all VMA flags in a VMA descriptor. 
*/ +static inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, + vma_flags_t flags) +{ + vma_flags_clear_mask(&desc->vma_flags, flags); +} + +/* + * Helper macro for clearing VMA flags for an input pointer to a struct + * vm_area_desc object describing a proposed VMA, e.g.: + * + * vma_desc_clear_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT, + * VMA_DONTDUMP_BIT); + */ +#define vma_desc_clear_flags(desc, ...) \ + vma_desc_clear_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) + static inline void vma_set_anonymous(struct vm_area_struct *vma) { vma->vm_ops = NULL; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ed0e128361f7..9b4311cfd5e8 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -844,7 +844,7 @@ struct mmap_action { /* * If specified, this hook is invoked when an error occurred when - * attempting the selection action. + * attempting the selected action. * * The hook can return an error code in order to filter the error, but * it is not valid to clear the error here. @@ -868,6 +868,8 @@ typedef struct { DECLARE_BITMAP(__vma_flags, NUM_VMA_FLAG_BITS); } vma_flags_t; +#define EMPTY_VMA_FLAGS ((vma_flags_t){ }) + /* * Describes a VMA that is about to be mmap()'ed. 
Drivers may choose to * manipulate mutable fields which will cause those fields to be updated in the diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index ca4eb563b29b..2b01794cbd61 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -21,7 +21,13 @@ #include +#ifdef __CONCAT +#undef __CONCAT +#endif + +#include #include +#include #include #include #include @@ -38,6 +44,8 @@ extern unsigned long dac_mmap_min_addr; #define dac_mmap_min_addr 0UL #endif +#define ACCESS_PRIVATE(p, member) ((p)->member) + #define VM_WARN_ON(_expr) (WARN_ON(_expr)) #define VM_WARN_ON_ONCE(_expr) (WARN_ON_ONCE(_expr)) #define VM_WARN_ON_VMG(_expr, _vmg) (WARN_ON(_expr)) @@ -533,6 +541,8 @@ typedef struct { DECLARE_BITMAP(__vma_flags, NUM_VMA_FLAG_BITS); } __private vma_flags_t; +#define EMPTY_VMA_FLAGS ((vma_flags_t){ }) + struct mm_struct { struct maple_tree mm_mt; int map_count; /* number of VMAs */ @@ -882,6 +892,123 @@ static inline pgprot_t vm_get_page_prot(vm_flags_t vm_flags) return __pgprot(vm_flags); } +static inline void vma_flags_clear_all(vma_flags_t *flags) +{ + bitmap_zero(flags->__vma_flags, NUM_VMA_FLAG_BITS); +} + +static inline void vma_flag_set(vma_flags_t *flags, vma_flag_t bit) +{ + unsigned long *bitmap = flags->__vma_flags; + + __set_bit((__force int)bit, bitmap); +} + +static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits) +{ + vma_flags_t flags; + int i; + + vma_flags_clear_all(&flags); + for (i = 0; i < count; i++) + vma_flag_set(&flags, bits[i]); + return flags; +} + +#define mk_vma_flags(...) 
__mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ + (const vma_flag_t []){__VA_ARGS__}) + +static __always_inline bool vma_flags_test_mask(const vma_flags_t *flags, + vma_flags_t to_test) +{ + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_to_test = to_test.__vma_flags; + + return bitmap_intersects(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS); +} + +#define vma_flags_test(flags, ...) \ + vma_flags_test_mask(flags, mk_vma_flags(__VA_ARGS__)) + +static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags, + vma_flags_t to_test) +{ + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_to_test = to_test.__vma_flags; + + return bitmap_subset(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS); +} + +#define vma_flags_test_all(flags, ...) \ + vma_flags_test_all_mask(flags, mk_vma_flags(__VA_ARGS__)) + +static __always_inline void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t to_set) +{ + unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_to_set = to_set.__vma_flags; + + bitmap_or(bitmap, bitmap, bitmap_to_set, NUM_VMA_FLAG_BITS); +} + +#define vma_flags_set(flags, ...) \ + vma_flags_set_mask(flags, mk_vma_flags(__VA_ARGS__)) + +static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t to_clear) +{ + unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_to_clear = to_clear.__vma_flags; + + bitmap_andnot(bitmap, bitmap, bitmap_to_clear, NUM_VMA_FLAG_BITS); +} + +#define vma_flags_clear(flags, ...) \ + vma_flags_clear_mask(flags, mk_vma_flags(__VA_ARGS__)) + +static inline bool vma_test_all_flags_mask(const struct vm_area_struct *vma, + vma_flags_t flags) +{ + return vma_flags_test_all_mask(&vma->flags, flags); +} + +#define vma_test_all_flags(vma, ...) 
\ + vma_test_all_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) + +static inline void vma_set_flags_mask(struct vm_area_struct *vma, + vma_flags_t flags) +{ + vma_flags_set_mask(&vma->flags, flags); +} + +#define vma_set_flags(vma, ...) \ + vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) + +static inline bool vma_desc_test_flags_mask(const struct vm_area_desc *desc, + vma_flags_t flags) +{ + return vma_flags_test_mask(&desc->vma_flags, flags); +} + +#define vma_desc_test_flags(desc, ...) \ + vma_desc_test_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) + +static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, + vma_flags_t flags) +{ + vma_flags_set_mask(&desc->vma_flags, flags); +} + +#define vma_desc_set_flags(desc, ...) \ + vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) + +static inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, + vma_flags_t flags) +{ + vma_flags_clear_mask(&desc->vma_flags, flags); +} + +#define vma_desc_clear_flags(desc, ...) \ + vma_desc_clear_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) + static inline bool is_shared_maywrite(vm_flags_t vm_flags) { return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == @@ -1540,31 +1667,11 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm, { } -#define ACCESS_PRIVATE(p, member) ((p)->member) - -#define bitmap_size(nbits) (ALIGN(nbits, BITS_PER_LONG) / BITS_PER_BYTE) - -static __always_inline void bitmap_zero(unsigned long *dst, unsigned int nbits) -{ - unsigned int len = bitmap_size(nbits); - - if (small_const_nbits(nbits)) - *dst = 0; - else - memset(dst, 0, len); -} - static inline bool mm_flags_test(int flag, const struct mm_struct *mm) { return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } -/* Clears all bits in the VMA flags bitmap, non-atomically. 
*/ -static inline void vma_flags_clear_all(vma_flags_t *flags) -{ - bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS); -} - /* * Copy value to the first system word of VMA flags, non-atomically. * -- cgit v1.2.3 From 097e8db5e22b03d6791abc243183f597d0f76a7b Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 22 Jan 2026 16:06:15 +0000 Subject: mm: update hugetlbfs to use VMA flags on mmap_prepare In order to update all mmap_prepare users to utilise the new VMA flags type vma_flags_t and associated helper functions, we start by updating hugetlbfs which has a lot of additional logic that requires updating to make this change. This is laying the groundwork for eliminating vm_flags_t from struct vm_area_desc and using vma_flags_t only, which further lays the ground for removing the deprecated vm_flags_t type altogether. No functional changes intended. Link: https://lkml.kernel.org/r/9226bec80c9aa3447cc2b83354f733841dba8a50.1769097829.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Jason Gunthorpe Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Cc: Damien Le Moal Cc: "Darrick J. 
Wong" Cc: Jarkko Sakkinen Cc: Yury Norov Cc: Chris Mason Cc: Pedro Falcato Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 14 +++++++------- include/linux/hugetlb.h | 6 +++--- include/linux/hugetlb_inline.h | 10 ++++++++++ ipc/shm.c | 12 +++++++----- mm/hugetlb.c | 22 +++++++++++----------- mm/memfd.c | 4 ++-- mm/mmap.c | 2 +- 7 files changed, 41 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 3b4c152c5c73..95a5b23b4808 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -109,7 +109,7 @@ static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) loff_t len, vma_len; int ret; struct hstate *h = hstate_file(file); - vm_flags_t vm_flags; + vma_flags_t vma_flags; /* * vma address alignment (but not the pgoff alignment) has @@ -119,7 +119,7 @@ static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) * way when do_mmap unwinds (may be important on powerpc * and ia64). */ - desc->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; + vma_desc_set_flags(desc, VMA_HUGETLB_BIT, VMA_DONTEXPAND_BIT); desc->vm_ops = &hugetlb_vm_ops; /* @@ -148,23 +148,23 @@ static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) ret = -ENOMEM; - vm_flags = desc->vm_flags; + vma_flags = desc->vma_flags; /* * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip * reserving here. Note: only for SHM hugetlbfs file, the inode * flag S_PRIVATE is set. 
*/ if (inode->i_flags & S_PRIVATE) - vm_flags |= VM_NORESERVE; + vma_flags_set(&vma_flags, VMA_NORESERVE_BIT); if (hugetlb_reserve_pages(inode, desc->pgoff >> huge_page_order(h), len >> huge_page_shift(h), desc, - vm_flags) < 0) + vma_flags) < 0) goto out; ret = 0; - if ((desc->vm_flags & VM_WRITE) && inode->i_size < len) + if (vma_desc_test_flags(desc, VMA_WRITE_BIT) && inode->i_size < len) i_size_write(inode, len); out: inode_unlock(inode); @@ -1527,7 +1527,7 @@ static int get_hstate_idx(int page_size_log) * otherwise hugetlb_reserve_pages reserves one less hugepages than intended. */ struct file *hugetlb_file_setup(const char *name, size_t size, - vm_flags_t acctflag, int creat_flags, + vma_flags_t acctflag, int creat_flags, int page_size_log) { struct inode *inode; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 94a03591990c..4e72bf66077e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -150,7 +150,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct folio **foliop); #endif /* CONFIG_USERFAULTFD */ long hugetlb_reserve_pages(struct inode *inode, long from, long to, - struct vm_area_desc *desc, vm_flags_t vm_flags); + struct vm_area_desc *desc, vma_flags_t vma_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); @@ -529,7 +529,7 @@ static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) } extern const struct vm_operations_struct hugetlb_vm_ops; -struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, +struct file *hugetlb_file_setup(const char *name, size_t size, vma_flags_t acct, int creat_flags, int page_size_log); static inline bool is_file_hugepages(const struct file *file) @@ -545,7 +545,7 @@ static inline struct hstate *hstate_inode(struct inode *i) #define is_file_hugepages(file) false static inline struct file * -hugetlb_file_setup(const char *name, 
size_t size, vm_flags_t acctflag, +hugetlb_file_setup(const char *name, size_t size, vma_flags_t acctflag, int creat_flags, int page_size_log) { return ERR_PTR(-ENOSYS); diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h index a27aa0162918..593f5d4e108b 100644 --- a/include/linux/hugetlb_inline.h +++ b/include/linux/hugetlb_inline.h @@ -11,6 +11,11 @@ static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) return !!(vm_flags & VM_HUGETLB); } +static inline bool is_vma_hugetlb_flags(const vma_flags_t *flags) +{ + return vma_flags_test(flags, VMA_HUGETLB_BIT); +} + #else static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) @@ -18,6 +23,11 @@ static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) return false; } +static inline bool is_vma_hugetlb_flags(const vma_flags_t *flags) +{ + return false; +} + #endif static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) diff --git a/ipc/shm.c b/ipc/shm.c index 3db36773dd10..2c7379c4c647 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -707,9 +707,9 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) int error; struct shmid_kernel *shp; size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + const bool has_no_reserve = shmflg & SHM_NORESERVE; struct file *file; char name[13]; - vm_flags_t acctflag = 0; if (size < SHMMIN || size > ns->shm_ctlmax) return -EINVAL; @@ -738,6 +738,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) sprintf(name, "SYSV%08x", key); if (shmflg & SHM_HUGETLB) { + vma_flags_t acctflag = EMPTY_VMA_FLAGS; struct hstate *hs; size_t hugesize; @@ -749,17 +750,18 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) hugesize = ALIGN(size, huge_page_size(hs)); /* hugetlb_file_setup applies strict accounting */ - if (shmflg & SHM_NORESERVE) - acctflag = VM_NORESERVE; + if (has_no_reserve) + vma_flags_set(&acctflag, VMA_NORESERVE_BIT); file = hugetlb_file_setup(name, hugesize, acctflag, 
HUGETLB_SHMFS_INODE, (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); } else { + vm_flags_t acctflag = 0; + /* * Do not allow no accounting for OVERCOMMIT_NEVER, even * if it's asked for. */ - if ((shmflg & SHM_NORESERVE) && - sysctl_overcommit_memory != OVERCOMMIT_NEVER) + if (has_no_reserve && sysctl_overcommit_memory != OVERCOMMIT_NEVER) acctflag = VM_NORESERVE; file = shmem_kernel_file_setup(name, size, acctflag); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0b005e944ee3..4cb3e1c86e3a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1193,16 +1193,16 @@ static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map) { - VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); - VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE); + VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); + VM_WARN_ON_ONCE(vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)); desc->private_data = map; } static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags) { - VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); - VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE); + VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); + VM_WARN_ON_ONCE(vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)); desc->private_data = (void *)((unsigned long)desc->private_data | flags); } @@ -1216,7 +1216,7 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) static bool is_vma_desc_resv_set(struct vm_area_desc *desc, unsigned long flag) { - VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); + VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); return ((unsigned long)desc->private_data) & flag; } @@ -6571,7 +6571,7 @@ next: long hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_desc *desc, - vm_flags_t vm_flags) + vma_flags_t vma_flags) { long chg = -1, add = -1, spool_resv, gbl_resv; struct hstate *h = hstate_inode(inode); @@ -6592,7 
+6592,7 @@ long hugetlb_reserve_pages(struct inode *inode, * attempt will be made for VM_NORESERVE to allocate a page * without using reserves */ - if (vm_flags & VM_NORESERVE) + if (vma_flags_test(&vma_flags, VMA_NORESERVE_BIT)) return 0; /* @@ -6601,7 +6601,7 @@ long hugetlb_reserve_pages(struct inode *inode, * to reserve the full area even if read-only as mprotect() may be * called to make the mapping read-write. Assume !desc is a shm mapping */ - if (!desc || desc->vm_flags & VM_MAYSHARE) { + if (!desc || vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)) { /* * resv_map can not be NULL as hugetlb_reserve_pages is only * called for inodes for which resv_maps were created (see @@ -6635,7 +6635,7 @@ long hugetlb_reserve_pages(struct inode *inode, if (err < 0) goto out_err; - if (desc && !(desc->vm_flags & VM_MAYSHARE) && h_cg) { + if (desc && !vma_desc_test_flags(desc, VMA_MAYSHARE_BIT) && h_cg) { /* For private mappings, the hugetlb_cgroup uncharge info hangs * of the resv_map. */ @@ -6672,7 +6672,7 @@ long hugetlb_reserve_pages(struct inode *inode, * consumed reservations are stored in the map. Hence, nothing * else has to be done for private mappings here */ - if (!desc || desc->vm_flags & VM_MAYSHARE) { + if (!desc || vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)) { add = region_add(resv_map, from, to, regions_needed, h, h_cg); if (unlikely(add < 0)) { @@ -6727,7 +6727,7 @@ out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); out_err: - if (!desc || desc->vm_flags & VM_MAYSHARE) + if (!desc || vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)) /* Only call region_abort if the region_chg succeeded but the * region_add failed or didn't run. 
*/ diff --git a/mm/memfd.c b/mm/memfd.c index 82a3f38aa30a..3e8f3bc4f72d 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -86,7 +86,7 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) gfp_mask &= ~(__GFP_HIGHMEM | __GFP_MOVABLE); idx >>= huge_page_order(h); - nr_resv = hugetlb_reserve_pages(inode, idx, idx + 1, NULL, 0); + nr_resv = hugetlb_reserve_pages(inode, idx, idx + 1, NULL, EMPTY_VMA_FLAGS); if (nr_resv < 0) return ERR_PTR(nr_resv); @@ -463,7 +463,7 @@ struct file *memfd_alloc_file(const char *name, unsigned int flags) int err = 0; if (flags & MFD_HUGETLB) { - file = hugetlb_file_setup(name, 0, VM_NORESERVE, + file = hugetlb_file_setup(name, 0, mk_vma_flags(VMA_NORESERVE_BIT), HUGETLB_ANONHUGE_INODE, (flags >> MFD_HUGE_SHIFT) & MFD_HUGE_MASK); diff --git a/mm/mmap.c b/mm/mmap.c index a03b7681e13c..c62abc616485 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -594,7 +594,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, * taken when vm_ops->mmap() is called */ file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, - VM_NORESERVE, + mk_vma_flags(VMA_NORESERVE_BIT), HUGETLB_ANONHUGE_INODE, (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); if (IS_ERR(file)) -- cgit v1.2.3 From 590d356aa433074ece2b0d02faa5f959b26d54d6 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 22 Jan 2026 16:06:17 +0000 Subject: mm: update shmem_[kernel]_file_*() functions to use vma_flags_t In order to be able to use only vma_flags_t in vm_area_desc we must adjust shmem file setup functions to operate in terms of vma_flags_t rather than vm_flags_t. This patch makes this change and updates all callers to use the new functions. No functional changes intended. [akpm@linux-foundation.org: comment fixes, per Baolin] Link: https://lkml.kernel.org/r/736febd280eb484d79cef5cf55b8a6f79ad832d2.1769097829.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Baolin Wang Reviewed-by: Jarkko Sakkinen Reviewed-by: Liam R. 
Howlett Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Jason Gunthorpe Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Cc: "Darrick J. Wong" Cc: Damien Le Moal Cc: Yury Norov Cc: Chris Mason Cc: Pedro Falcato Signed-off-by: Andrew Morton --- arch/x86/kernel/cpu/sgx/ioctl.c | 2 +- drivers/gpu/drm/drm_gem.c | 5 ++- drivers/gpu/drm/i915/gem/i915_gem_shmem.c | 2 +- drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 3 +- drivers/gpu/drm/i915/gt/shmem_utils.c | 3 +- drivers/gpu/drm/ttm/tests/ttm_tt_test.c | 2 +- drivers/gpu/drm/ttm/ttm_backup.c | 3 +- drivers/gpu/drm/ttm/ttm_tt.c | 2 +- fs/xfs/scrub/xfile.c | 3 +- fs/xfs/xfs_buf_mem.c | 2 +- include/linux/shmem_fs.h | 8 ++-- ipc/shm.c | 6 +-- mm/memfd.c | 2 +- mm/shmem.c | 61 +++++++++++++++++-------------- security/keys/big_key.c | 2 +- 15 files changed, 57 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 9322a9287dc7..0bc36957979d 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -83,7 +83,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) encl_size = secs->size + PAGE_SIZE; backing = shmem_file_setup("SGX backing", encl_size + (encl_size >> 5), - VM_NORESERVE); + mk_vma_flags(VMA_NORESERVE_BIT)); if (IS_ERR(backing)) { ret = PTR_ERR(backing); goto err_out_shrink; diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index e4df43427394..be4dca2bc34e 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -130,14 +130,15 @@ int drm_gem_object_init_with_mnt(struct drm_device *dev, struct vfsmount *gemfs) { struct file *filp; + const vma_flags_t flags = mk_vma_flags(VMA_NORESERVE_BIT); drm_gem_private_object_init(dev, obj, size); if (gemfs) filp = shmem_file_setup_with_mnt(gemfs, "drm mm object", size, - VM_NORESERVE); + flags); else - filp = shmem_file_setup("drm mm object", size, VM_NORESERVE); + filp = shmem_file_setup("drm mm 
object", size, flags); if (IS_ERR(filp)) return PTR_ERR(filp); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c index 26dda55a07ff..fe1843497b27 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c @@ -496,7 +496,7 @@ static int __create_shmem(struct drm_i915_private *i915, struct drm_gem_object *obj, resource_size_t size) { - unsigned long flags = VM_NORESERVE; + const vma_flags_t flags = mk_vma_flags(VMA_NORESERVE_BIT); struct file *filp; drm_gem_private_object_init(&i915->drm, obj, size); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c index f65fe86c02b5..7b1a7d01db2b 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c @@ -200,7 +200,8 @@ static int i915_ttm_tt_shmem_populate(struct ttm_device *bdev, struct address_space *mapping; gfp_t mask; - filp = shmem_file_setup("i915-shmem-tt", size, VM_NORESERVE); + filp = shmem_file_setup("i915-shmem-tt", size, + mk_vma_flags(VMA_NORESERVE_BIT)); if (IS_ERR(filp)) return PTR_ERR(filp); diff --git a/drivers/gpu/drm/i915/gt/shmem_utils.c b/drivers/gpu/drm/i915/gt/shmem_utils.c index 365c4b8b04f4..5f37c699a320 100644 --- a/drivers/gpu/drm/i915/gt/shmem_utils.c +++ b/drivers/gpu/drm/i915/gt/shmem_utils.c @@ -19,7 +19,8 @@ struct file *shmem_create_from_data(const char *name, void *data, size_t len) struct file *file; int err; - file = shmem_file_setup(name, PAGE_ALIGN(len), VM_NORESERVE); + file = shmem_file_setup(name, PAGE_ALIGN(len), + mk_vma_flags(VMA_NORESERVE_BIT)); if (IS_ERR(file)) return file; diff --git a/drivers/gpu/drm/ttm/tests/ttm_tt_test.c b/drivers/gpu/drm/ttm/tests/ttm_tt_test.c index 61ec6f580b62..bd5f7d0b9b62 100644 --- a/drivers/gpu/drm/ttm/tests/ttm_tt_test.c +++ b/drivers/gpu/drm/ttm/tests/ttm_tt_test.c @@ -143,7 +143,7 @@ static void ttm_tt_fini_shmem(struct kunit *test) err = ttm_tt_init(tt, bo, 0, caching, 
0); KUNIT_ASSERT_EQ(test, err, 0); - shmem = shmem_file_setup("ttm swap", BO_SIZE, 0); + shmem = shmem_file_setup("ttm swap", BO_SIZE, EMPTY_VMA_FLAGS); tt->swap_storage = shmem; ttm_tt_fini(tt); diff --git a/drivers/gpu/drm/ttm/ttm_backup.c b/drivers/gpu/drm/ttm/ttm_backup.c index 32530c75f038..6bd4c123d94c 100644 --- a/drivers/gpu/drm/ttm/ttm_backup.c +++ b/drivers/gpu/drm/ttm/ttm_backup.c @@ -178,5 +178,6 @@ EXPORT_SYMBOL_GPL(ttm_backup_bytes_avail); */ struct file *ttm_backup_shmem_create(loff_t size) { - return shmem_file_setup("ttm shmem backup", size, 0); + return shmem_file_setup("ttm shmem backup", size, + EMPTY_VMA_FLAGS); } diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c index 611d20ab966d..f73a5ce87645 100644 --- a/drivers/gpu/drm/ttm/ttm_tt.c +++ b/drivers/gpu/drm/ttm/ttm_tt.c @@ -330,7 +330,7 @@ int ttm_tt_swapout(struct ttm_device *bdev, struct ttm_tt *ttm, struct page *to_page; int i, ret; - swap_storage = shmem_file_setup("ttm swap", size, 0); + swap_storage = shmem_file_setup("ttm swap", size, EMPTY_VMA_FLAGS); if (IS_ERR(swap_storage)) { pr_err("Failed allocating swap storage\n"); return PTR_ERR(swap_storage); diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index c753c79df203..fe0584a39f16 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -61,7 +61,8 @@ xfile_create( if (!xf) return -ENOMEM; - xf->file = shmem_kernel_file_setup(description, isize, VM_NORESERVE); + xf->file = shmem_kernel_file_setup(description, isize, + mk_vma_flags(VMA_NORESERVE_BIT)); if (IS_ERR(xf->file)) { error = PTR_ERR(xf->file); goto out_xfile; diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c index dcbfa274e06d..fd6f0a5bc0ea 100644 --- a/fs/xfs/xfs_buf_mem.c +++ b/fs/xfs/xfs_buf_mem.c @@ -62,7 +62,7 @@ xmbuf_alloc( if (!btp) return -ENOMEM; - file = shmem_kernel_file_setup(descr, 0, 0); + file = shmem_kernel_file_setup(descr, 0, EMPTY_VMA_FLAGS); if (IS_ERR(file)) { error = PTR_ERR(file); goto out_free_btp; diff 
--git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index e2069b3179c4..a8273b32e041 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -102,12 +102,10 @@ static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) extern const struct fs_parameter_spec shmem_fs_parameters[]; extern void shmem_init(void); extern int shmem_init_fs_context(struct fs_context *fc); -extern struct file *shmem_file_setup(const char *name, - loff_t size, unsigned long flags); -extern struct file *shmem_kernel_file_setup(const char *name, loff_t size, - unsigned long flags); +struct file *shmem_file_setup(const char *name, loff_t size, vma_flags_t flags); +struct file *shmem_kernel_file_setup(const char *name, loff_t size, vma_flags_t vma_flags); extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, - const char *name, loff_t size, unsigned long flags); + const char *name, loff_t size, vma_flags_t flags); int shmem_zero_setup(struct vm_area_struct *vma); int shmem_zero_setup_desc(struct vm_area_desc *desc); extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, diff --git a/ipc/shm.c b/ipc/shm.c index 2c7379c4c647..e8c7d1924c50 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -708,6 +708,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) struct shmid_kernel *shp; size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; const bool has_no_reserve = shmflg & SHM_NORESERVE; + vma_flags_t acctflag = EMPTY_VMA_FLAGS; struct file *file; char name[13]; @@ -738,7 +739,6 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) sprintf(name, "SYSV%08x", key); if (shmflg & SHM_HUGETLB) { - vma_flags_t acctflag = EMPTY_VMA_FLAGS; struct hstate *hs; size_t hugesize; @@ -755,14 +755,12 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) file = hugetlb_file_setup(name, hugesize, acctflag, HUGETLB_SHMFS_INODE, (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); } else { - vm_flags_t 
acctflag = 0; - /* * Do not allow no accounting for OVERCOMMIT_NEVER, even * if it's asked for. */ if (has_no_reserve && sysctl_overcommit_memory != OVERCOMMIT_NEVER) - acctflag = VM_NORESERVE; + vma_flags_set(&acctflag, VMA_NORESERVE_BIT); file = shmem_kernel_file_setup(name, size, acctflag); } error = PTR_ERR(file); diff --git a/mm/memfd.c b/mm/memfd.c index 3e8f3bc4f72d..919c2a53eb96 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -468,7 +468,7 @@ struct file *memfd_alloc_file(const char *name, unsigned int flags) (flags >> MFD_HUGE_SHIFT) & MFD_HUGE_MASK); } else { - file = shmem_file_setup(name, 0, VM_NORESERVE); + file = shmem_file_setup(name, 0, mk_vma_flags(VMA_NORESERVE_BIT)); } if (IS_ERR(file)) return file; diff --git a/mm/shmem.c b/mm/shmem.c index b8499871e830..9a9a6e4148c9 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3056,9 +3056,9 @@ static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode) } static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, - struct super_block *sb, - struct inode *dir, umode_t mode, - dev_t dev, unsigned long flags) + struct super_block *sb, + struct inode *dir, umode_t mode, + dev_t dev, vma_flags_t flags) { struct inode *inode; struct shmem_inode_info *info; @@ -3086,7 +3086,8 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, spin_lock_init(&info->lock); atomic_set(&info->stop_eviction, 0); info->seals = F_SEAL_SEAL; - info->flags = (flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : 0; + info->flags = vma_flags_test(&flags, VMA_NORESERVE_BIT) + ? SHMEM_F_NORESERVE : 0; info->i_crtime = inode_get_mtime(inode); info->fsflags = (dir == NULL) ? 
0 : SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; @@ -3139,7 +3140,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, #ifdef CONFIG_TMPFS_QUOTA static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, - umode_t mode, dev_t dev, unsigned long flags) + umode_t mode, dev_t dev, vma_flags_t flags) { int err; struct inode *inode; @@ -3165,9 +3166,9 @@ errout: return ERR_PTR(err); } #else -static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, +static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, - umode_t mode, dev_t dev, unsigned long flags) + umode_t mode, dev_t dev, vma_flags_t flags) { return __shmem_get_inode(idmap, sb, dir, mode, dev, flags); } @@ -3874,7 +3875,8 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, if (!generic_ci_validate_strict_name(dir, &dentry->d_name)) return -EINVAL; - inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE); + inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, + mk_vma_flags(VMA_NORESERVE_BIT)); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -3909,7 +3911,8 @@ shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode; int error; - inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE); + inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, + mk_vma_flags(VMA_NORESERVE_BIT)); if (IS_ERR(inode)) { error = PTR_ERR(inode); goto err_out; @@ -4106,7 +4109,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, return -ENAMETOOLONG; inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0, - VM_NORESERVE); + mk_vma_flags(VMA_NORESERVE_BIT)); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -5107,7 +5110,8 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) #endif /* CONFIG_TMPFS_QUOTA */ inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, - S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); + S_IFDIR | 
sbinfo->mode, 0, + mk_vma_flags(VMA_NORESERVE_BIT)); if (IS_ERR(inode)) { error = PTR_ERR(inode); goto failed; @@ -5807,7 +5811,7 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size) static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, - umode_t mode, dev_t dev, unsigned long flags) + umode_t mode, dev_t dev, vma_flags_t flags) { struct inode *inode = ramfs_get_inode(sb, dir, mode, dev); return inode ? inode : ERR_PTR(-ENOSPC); @@ -5818,10 +5822,11 @@ static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, /* common code */ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, - loff_t size, unsigned long vm_flags, + loff_t size, vma_flags_t flags, unsigned int i_flags) { - unsigned long flags = (vm_flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : 0; + const unsigned long shmem_flags = + vma_flags_test(&flags, VMA_NORESERVE_BIT) ? SHMEM_F_NORESERVE : 0; struct inode *inode; struct file *res; @@ -5834,13 +5839,13 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, if (is_idmapped_mnt(mnt)) return ERR_PTR(-EINVAL); - if (shmem_acct_size(flags, size)) + if (shmem_acct_size(shmem_flags, size)) return ERR_PTR(-ENOMEM); inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL, - S_IFREG | S_IRWXUGO, 0, vm_flags); + S_IFREG | S_IRWXUGO, 0, flags); if (IS_ERR(inode)) { - shmem_unacct_size(flags, size); + shmem_unacct_size(shmem_flags, size); return ERR_CAST(inode); } inode->i_flags |= i_flags; @@ -5863,9 +5868,10 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, * checks are provided at the key or shm level rather than the inode. 
* @name: name for dentry (to be seen in /proc//maps) * @size: size to be set for the file - * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size + * @flags: VMA_NORESERVE_BIT suppresses pre-accounting of the entire object size */ -struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags) +struct file *shmem_kernel_file_setup(const char *name, loff_t size, + vma_flags_t flags) { return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE); } @@ -5875,9 +5881,9 @@ EXPORT_SYMBOL_GPL(shmem_kernel_file_setup); * shmem_file_setup - get an unlinked file living in tmpfs * @name: name for dentry (to be seen in /proc//maps) * @size: size to be set for the file - * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size + * @flags: VMA_NORESERVE_BIT suppresses pre-accounting of the entire object size */ -struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) +struct file *shmem_file_setup(const char *name, loff_t size, vma_flags_t flags) { return __shmem_file_setup(shm_mnt, name, size, flags, 0); } @@ -5888,16 +5894,17 @@ EXPORT_SYMBOL_GPL(shmem_file_setup); * @mnt: the tmpfs mount where the file will be created * @name: name for dentry (to be seen in /proc//maps) * @size: size to be set for the file - * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size + * @flags: VMA_NORESERVE_BIT suppresses pre-accounting of the entire object size */ struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, - loff_t size, unsigned long flags) + loff_t size, vma_flags_t flags) { return __shmem_file_setup(mnt, name, size, flags, 0); } EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt); -static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, vm_flags_t vm_flags) +static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, + vma_flags_t flags) { loff_t size = end - start; @@ -5907,7 +5914,7 @@ static struct file 
*__shmem_zero_setup(unsigned long start, unsigned long end, v * accessible to the user through its mapping, use S_PRIVATE flag to * bypass file security, in the same way as shmem_kernel_file_setup(). */ - return shmem_kernel_file_setup("dev/zero", size, vm_flags); + return shmem_kernel_file_setup("dev/zero", size, flags); } /** @@ -5917,7 +5924,7 @@ static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, v */ int shmem_zero_setup(struct vm_area_struct *vma) { - struct file *file = __shmem_zero_setup(vma->vm_start, vma->vm_end, vma->vm_flags); + struct file *file = __shmem_zero_setup(vma->vm_start, vma->vm_end, vma->flags); if (IS_ERR(file)) return PTR_ERR(file); @@ -5938,7 +5945,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) */ int shmem_zero_setup_desc(struct vm_area_desc *desc) { - struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vm_flags); + struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vma_flags); if (IS_ERR(file)) return PTR_ERR(file); diff --git a/security/keys/big_key.c b/security/keys/big_key.c index d46862ab90d6..268f702df380 100644 --- a/security/keys/big_key.c +++ b/security/keys/big_key.c @@ -103,7 +103,7 @@ int big_key_preparse(struct key_preparsed_payload *prep) 0, enckey); /* save aligned data to file */ - file = shmem_kernel_file_setup("", enclen, 0); + file = shmem_kernel_file_setup("", enclen, EMPTY_VMA_FLAGS); if (IS_ERR(file)) { ret = PTR_ERR(file); goto err_enckey; -- cgit v1.2.3 From 5bd2c0650a9030007af5c2cf2a01dccdc67a6991 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 22 Jan 2026 16:06:18 +0000 Subject: mm: update all remaining mmap_prepare users to use vma_flags_t We will be shortly removing the vm_flags_t field from vm_area_desc so we need to update all mmap_prepare users to only use the desc->vma_flags field. This patch achieves that and makes all ancillary changes required to make this possible.
This lays the groundwork for future work to eliminate the use of vm_flags_t in vm_area_desc altogether and more broadly throughout the kernel. While we're here, we take the opportunity to replace VM_REMAP_FLAGS with VMA_REMAP_FLAGS, the vma_flags_t equivalent. No functional changes intended. Link: https://lkml.kernel.org/r/fb1f55323799f09fe6a36865b31550c9ec67c225.1769097829.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Damien Le Moal [zonefs] Acked-by: "Darrick J. Wong" Acked-by: Pedro Falcato Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Jason Gunthorpe Cc: Liam Howlett Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Cc: Jarkko Sakkinen Cc: Yury Norov Cc: Chris Mason Signed-off-by: Andrew Morton --- drivers/char/mem.c | 6 +++--- drivers/dax/device.c | 10 +++++----- fs/aio.c | 2 +- fs/erofs/data.c | 5 +++-- fs/ext4/file.c | 4 ++-- fs/ntfs3/file.c | 2 +- fs/orangefs/file.c | 4 ++-- fs/ramfs/file-nommu.c | 2 +- fs/resctrl/pseudo_lock.c | 2 +- fs/romfs/mmap-nommu.c | 2 +- fs/xfs/xfs_file.c | 4 ++-- fs/zonefs/file.c | 3 ++- include/linux/dax.h | 8 ++++---- include/linux/mm.h | 24 +++++++++++++++++++----- kernel/relay.c | 2 +- mm/memory.c | 17 ++++++++--------- 16 files changed, 56 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 52039fae1594..cca4529431f8 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -306,7 +306,7 @@ static unsigned zero_mmap_capabilities(struct file *file) /* can't do an in-place private mapping if there's no MMU */ static inline int private_mapping_ok(struct vm_area_desc *desc) { - return is_nommu_shared_mapping(desc->vm_flags); + return is_nommu_shared_vma_flags(&desc->vma_flags); } #else @@ -360,7 +360,7 @@ static int mmap_mem_prepare(struct vm_area_desc *desc) desc->vm_ops = &mmap_mem_ops; - /* Remap-pfn-range will mark the range VM_IO. */ + /* Remap-pfn-range will mark the range with the I/O flag. 
*/ mmap_action_remap_full(desc, desc->pgoff); /* We filter remap errors to -EAGAIN. */ desc->action.error_hook = mmap_filter_error; @@ -520,7 +520,7 @@ static int mmap_zero_prepare(struct vm_area_desc *desc) #ifndef CONFIG_MMU return -ENOSYS; #endif - if (desc->vm_flags & VM_SHARED) + if (vma_desc_test_flags(desc, VMA_SHARED_BIT)) return shmem_zero_setup_desc(desc); desc->action.success_hook = mmap_zero_private_success; diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 22999a402e02..528e81240c4d 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -13,7 +13,7 @@ #include "dax-private.h" #include "bus.h" -static int __check_vma(struct dev_dax *dev_dax, vm_flags_t vm_flags, +static int __check_vma(struct dev_dax *dev_dax, vma_flags_t flags, unsigned long start, unsigned long end, struct file *file, const char *func) { @@ -24,7 +24,7 @@ static int __check_vma(struct dev_dax *dev_dax, vm_flags_t vm_flags, return -ENXIO; /* prevent private mappings from being established */ - if ((vm_flags & VM_MAYSHARE) != VM_MAYSHARE) { + if (!vma_flags_test(&flags, VMA_MAYSHARE_BIT)) { dev_info_ratelimited(dev, "%s: %s: fail, attempted private mapping\n", current->comm, func); @@ -53,7 +53,7 @@ static int __check_vma(struct dev_dax *dev_dax, vm_flags_t vm_flags, static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, const char *func) { - return __check_vma(dev_dax, vma->vm_flags, vma->vm_start, vma->vm_end, + return __check_vma(dev_dax, vma->flags, vma->vm_start, vma->vm_end, vma->vm_file, func); } @@ -306,14 +306,14 @@ static int dax_mmap_prepare(struct vm_area_desc *desc) * fault time. 
*/ id = dax_read_lock(); - rc = __check_vma(dev_dax, desc->vm_flags, desc->start, desc->end, filp, + rc = __check_vma(dev_dax, desc->vma_flags, desc->start, desc->end, filp, __func__); dax_read_unlock(id); if (rc) return rc; desc->vm_ops = &dax_vm_ops; - desc->vm_flags |= VM_HUGEPAGE; + vma_desc_set_flags(desc, VMA_HUGEPAGE_BIT); return 0; } diff --git a/fs/aio.c b/fs/aio.c index 0a23a8c0717f..59b67b8da1b2 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -394,7 +394,7 @@ static const struct vm_operations_struct aio_ring_vm_ops = { static int aio_ring_mmap_prepare(struct vm_area_desc *desc) { - desc->vm_flags |= VM_DONTEXPAND; + vma_desc_set_flags(desc, VMA_DONTEXPAND_BIT); desc->vm_ops = &aio_ring_vm_ops; return 0; } diff --git a/fs/erofs/data.c b/fs/erofs/data.c index bb13c4cb8455..e7bc29e764c6 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -438,11 +438,12 @@ static int erofs_file_mmap_prepare(struct vm_area_desc *desc) if (!IS_DAX(file_inode(desc->file))) return generic_file_readonly_mmap_prepare(desc); - if ((desc->vm_flags & VM_SHARED) && (desc->vm_flags & VM_MAYWRITE)) + if (vma_desc_test_flags(desc, VMA_SHARED_BIT) && + vma_desc_test_flags(desc, VMA_MAYWRITE_BIT)) return -EINVAL; desc->vm_ops = &erofs_dax_vm_ops; - desc->vm_flags |= VM_HUGEPAGE; + vma_desc_set_flags(desc, VMA_HUGEPAGE_BIT); return 0; } #else diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 7a8b30932189..dfd5f4fe1647 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -822,13 +822,13 @@ static int ext4_file_mmap_prepare(struct vm_area_desc *desc) * We don't support synchronous mappings for non-DAX files and * for DAX files if underneath dax_device is not synchronous. 
*/ - if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file), dax_dev)) + if (!daxdev_mapping_supported(desc, file_inode(file), dax_dev)) return -EOPNOTSUPP; file_accessed(file); if (IS_DAX(file_inode(file))) { desc->vm_ops = &ext4_dax_vm_ops; - desc->vm_flags |= VM_HUGEPAGE; + vma_desc_set_flags(desc, VMA_HUGEPAGE_BIT); } else { desc->vm_ops = &ext4_file_vm_ops; } diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 2e7b2e566ebe..2902fc6d9a85 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -347,7 +347,7 @@ static int ntfs_file_mmap_prepare(struct vm_area_desc *desc) struct inode *inode = file_inode(file); struct ntfs_inode *ni = ntfs_i(inode); u64 from = ((u64)desc->pgoff << PAGE_SHIFT); - bool rw = desc->vm_flags & VM_WRITE; + const bool rw = vma_desc_test_flags(desc, VMA_WRITE_BIT); int err; /* Avoid any operation if inode is bad. */ diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 919f99b16834..c75aa3f419b1 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -411,8 +411,8 @@ static int orangefs_file_mmap_prepare(struct vm_area_desc *desc) "orangefs_file_mmap: called on %pD\n", file); /* set the sequential readahead hint */ - desc->vm_flags |= VM_SEQ_READ; - desc->vm_flags &= ~VM_RAND_READ; + vma_desc_set_flags(desc, VMA_SEQ_READ_BIT); + vma_desc_clear_flags(desc, VMA_RAND_READ_BIT); file_accessed(file); desc->vm_ops = &orangefs_file_vm_ops; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 77b8ca2757e0..0f8e838ece07 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -264,7 +264,7 @@ out: */ static int ramfs_nommu_mmap_prepare(struct vm_area_desc *desc) { - if (!is_nommu_shared_mapping(desc->vm_flags)) + if (!is_nommu_shared_vma_flags(&desc->vma_flags)) return -ENOSYS; file_accessed(desc->file); diff --git a/fs/resctrl/pseudo_lock.c b/fs/resctrl/pseudo_lock.c index 0bfc13c5b96d..e81d71abfe54 100644 --- a/fs/resctrl/pseudo_lock.c +++ b/fs/resctrl/pseudo_lock.c @@ -1044,7 +1044,7 @@ static int 
pseudo_lock_dev_mmap_prepare(struct vm_area_desc *desc) * Ensure changes are carried directly to the memory being mapped, * do not allow copy-on-write mapping. */ - if (!(desc->vm_flags & VM_SHARED)) { + if (!vma_desc_test_flags(desc, VMA_SHARED_BIT)) { mutex_unlock(&rdtgroup_mutex); return -EINVAL; } diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c index 4b77c6dc4418..7c3a1a7fecee 100644 --- a/fs/romfs/mmap-nommu.c +++ b/fs/romfs/mmap-nommu.c @@ -63,7 +63,7 @@ static unsigned long romfs_get_unmapped_area(struct file *file, */ static int romfs_mmap_prepare(struct vm_area_desc *desc) { - return is_nommu_shared_mapping(desc->vm_flags) ? 0 : -ENOSYS; + return is_nommu_shared_vma_flags(&desc->vma_flags) ? 0 : -ENOSYS; } static unsigned romfs_mmap_capabilities(struct file *file) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 7874cf745af3..1238ec018bc7 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1974,14 +1974,14 @@ xfs_file_mmap_prepare( * We don't support synchronous mappings for non-DAX files and * for DAX files if underneath dax_device is not synchronous. */ - if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file), + if (!daxdev_mapping_supported(desc, file_inode(file), target->bt_daxdev)) return -EOPNOTSUPP; file_accessed(file); desc->vm_ops = &xfs_file_vm_ops; if (IS_DAX(inode)) - desc->vm_flags |= VM_HUGEPAGE; + vma_desc_set_flags(desc, VMA_HUGEPAGE_BIT); return 0; } diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index c1e5e30e90a0..8a7161fc49e5 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -333,7 +333,8 @@ static int zonefs_file_mmap_prepare(struct vm_area_desc *desc) * ordering between msync() and page cache writeback. 
*/ if (zonefs_inode_is_seq(file_inode(file)) && - (desc->vm_flags & VM_SHARED) && (desc->vm_flags & VM_MAYWRITE)) + vma_desc_test_flags(desc, VMA_SHARED_BIT) && + vma_desc_test_flags(desc, VMA_MAYWRITE_BIT)) return -EINVAL; file_accessed(file); diff --git a/include/linux/dax.h b/include/linux/dax.h index 9d624f4d9df6..bf103f317cac 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -65,11 +65,11 @@ size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, /* * Check if given mapping is supported by the file / underlying device. */ -static inline bool daxdev_mapping_supported(vm_flags_t vm_flags, +static inline bool daxdev_mapping_supported(const struct vm_area_desc *desc, const struct inode *inode, struct dax_device *dax_dev) { - if (!(vm_flags & VM_SYNC)) + if (!vma_desc_test_flags(desc, VMA_SYNC_BIT)) return true; if (!IS_DAX(inode)) return false; @@ -111,11 +111,11 @@ static inline void set_dax_nomc(struct dax_device *dax_dev) static inline void set_dax_synchronous(struct dax_device *dax_dev) { } -static inline bool daxdev_mapping_supported(vm_flags_t vm_flags, +static inline bool daxdev_mapping_supported(const struct vm_area_desc *desc, const struct inode *inode, struct dax_device *dax_dev) { - return !(vm_flags & VM_SYNC); + return !vma_desc_test_flags(desc, VMA_SYNC_BIT); } static inline size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) diff --git a/include/linux/mm.h b/include/linux/mm.h index aa99b28e7a8a..05d950805701 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -550,17 +550,18 @@ enum { /* * Physically remapped pages are special. Tell the * rest of the world about it: - * VM_IO tells people not to look at these pages + * IO tells people not to look at these pages * (accesses can have side effects). 
- * VM_PFNMAP tells the core MM that the base pages are just + * PFNMAP tells the core MM that the base pages are just * raw PFN mappings, and do not have a "struct page" associated * with them. - * VM_DONTEXPAND + * DONTEXPAND * Disable vma merging and expanding with mremap(). - * VM_DONTDUMP + * DONTDUMP * Omit vma from core dump, even when VM_IO turned off. */ -#define VM_REMAP_FLAGS (VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP) +#define VMA_REMAP_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_PFNMAP_BIT, \ + VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT) /* This mask prevents VMA from being scanned with khugepaged */ #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) @@ -1925,6 +1926,14 @@ static inline bool is_cow_mapping(vm_flags_t flags) return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } +static inline bool vma_desc_is_cow_mapping(struct vm_area_desc *desc) +{ + const vma_flags_t *flags = &desc->vma_flags; + + return vma_flags_test(flags, VMA_MAYWRITE_BIT) && + !vma_flags_test(flags, VMA_SHARED_BIT); +} + #ifndef CONFIG_MMU static inline bool is_nommu_shared_mapping(vm_flags_t flags) { @@ -1938,6 +1947,11 @@ static inline bool is_nommu_shared_mapping(vm_flags_t flags) */ return flags & (VM_MAYSHARE | VM_MAYOVERLAY); } + +static inline bool is_nommu_shared_vma_flags(const vma_flags_t *flags) +{ + return vma_flags_test(flags, VMA_MAYSHARE_BIT, VMA_MAYOVERLAY_BIT); +} #endif #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) diff --git a/kernel/relay.c b/kernel/relay.c index e36f6b926f7f..1c8e88259df0 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -92,7 +92,7 @@ static int relay_mmap_prepare_buf(struct rchan_buf *buf, return -EINVAL; desc->vm_ops = &relay_file_mmap_ops; - desc->vm_flags |= VM_DONTEXPAND; + vma_desc_set_flags(desc, VMA_DONTEXPAND_BIT); desc->private_data = buf; return 0; diff --git a/mm/memory.c b/mm/memory.c index 136b80ca357b..9ee60d87969b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2957,7 +2957,7 @@ static inline int 
remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, return 0; } -static int get_remap_pgoff(vm_flags_t vm_flags, unsigned long addr, +static int get_remap_pgoff(bool is_cow, unsigned long addr, unsigned long end, unsigned long vm_start, unsigned long vm_end, unsigned long pfn, pgoff_t *vm_pgoff_p) { @@ -2967,7 +2967,7 @@ static int get_remap_pgoff(vm_flags_t vm_flags, unsigned long addr, * un-COW'ed pages by matching them up with "vma->vm_pgoff". * See vm_normal_page() for details. */ - if (is_cow_mapping(vm_flags)) { + if (is_cow) { if (addr != vm_start || end != vm_end) return -EINVAL; *vm_pgoff_p = pfn; @@ -2988,7 +2988,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad if (WARN_ON_ONCE(!PAGE_ALIGNED(addr))) return -EINVAL; - VM_WARN_ON_ONCE((vma->vm_flags & VM_REMAP_FLAGS) != VM_REMAP_FLAGS); + VM_WARN_ON_ONCE(!vma_test_all_flags_mask(vma, VMA_REMAP_FLAGS)); BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; @@ -3112,9 +3112,9 @@ void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn) * check it again on complete and will fail there if specified addr is * invalid. 
*/ - get_remap_pgoff(desc->vm_flags, desc->start, desc->end, + get_remap_pgoff(vma_desc_is_cow_mapping(desc), desc->start, desc->end, desc->start, desc->end, pfn, &desc->pgoff); - desc->vm_flags |= VM_REMAP_FLAGS; + vma_desc_set_flags_mask(desc, VMA_REMAP_FLAGS); } static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, unsigned long addr, @@ -3123,13 +3123,12 @@ static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, unsigned long unsigned long end = addr + PAGE_ALIGN(size); int err; - err = get_remap_pgoff(vma->vm_flags, addr, end, - vma->vm_start, vma->vm_end, - pfn, &vma->vm_pgoff); + err = get_remap_pgoff(is_cow_mapping(vma->vm_flags), addr, end, + vma->vm_start, vma->vm_end, pfn, &vma->vm_pgoff); if (err) return err; - vm_flags_set(vma, VM_REMAP_FLAGS); + vma_set_flags_mask(vma, VMA_REMAP_FLAGS); return 0; } -- cgit v1.2.3 From 53f1d936445131cb5da2212c2b60884a25cb0330 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 22 Jan 2026 16:06:19 +0000 Subject: mm: make vm_area_desc utilise vma_flags_t only Now we have eliminated all uses of vm_area_desc->vm_flags, eliminate this field, and have mmap_prepare users utilise the vma_flags_t vm_area_desc->vma_flags field only. As part of this change we alter is_shared_maywrite() to accept a vma_flags_t parameter, and introduce is_shared_maywrite_vm_flags() for use with legacy vm_flags_t flags. We also update struct mmap_state to add a union between vma_flags and vm_flags temporarily until the mmap logic is also converted to using vma_flags_t. Also update the VMA userland tests to reflect this change. Link: https://lkml.kernel.org/r/fd2a2938b246b4505321954062b1caba7acfc77a.1769097829.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Pedro Falcato Reviewed-by: Liam R. Howlett Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Jason Gunthorpe Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Cc: Damien Le Moal Cc: "Darrick J. 
Wong" Cc: Jarkko Sakkinen Cc: Yury Norov Cc: Chris Mason Signed-off-by: Andrew Morton --- include/linux/mm.h | 9 +++++++-- include/linux/mm_types.h | 5 +---- mm/filemap.c | 2 +- mm/util.c | 2 +- mm/vma.c | 11 +++++++---- mm/vma.h | 3 +-- tools/testing/vma/vma_internal.h | 9 +++++++-- 7 files changed, 25 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 05d950805701..f8a8fd47399c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1290,15 +1290,20 @@ static inline bool vma_is_accessible(const struct vm_area_struct *vma) return vma->vm_flags & VM_ACCESS_FLAGS; } -static inline bool is_shared_maywrite(vm_flags_t vm_flags) +static inline bool is_shared_maywrite_vm_flags(vm_flags_t vm_flags) { return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == (VM_SHARED | VM_MAYWRITE); } +static inline bool is_shared_maywrite(const vma_flags_t *flags) +{ + return vma_flags_test_all(flags, VMA_SHARED_BIT, VMA_MAYWRITE_BIT); +} + static inline bool vma_is_shared_maywrite(const struct vm_area_struct *vma) { - return is_shared_maywrite(vma->vm_flags); + return is_shared_maywrite(&vma->flags); } static inline diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9b4311cfd5e8..3cc8ae722886 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -887,10 +887,7 @@ struct vm_area_desc { /* Mutable fields. Populated with initial state. */ pgoff_t pgoff; struct file *vm_file; - union { - vm_flags_t vm_flags; - vma_flags_t vma_flags; - }; + vma_flags_t vma_flags; pgprot_t page_prot; /* Write-only fields. 
*/ diff --git a/mm/filemap.c b/mm/filemap.c index ebd75684cb0a..6cd7974d4ada 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -4012,7 +4012,7 @@ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc) { - if (is_shared_maywrite(desc->vm_flags)) + if (is_shared_maywrite(&desc->vma_flags)) return -EINVAL; return generic_file_mmap_prepare(desc); } diff --git a/mm/util.c b/mm/util.c index 97cae40c0209..b05ab6f97e11 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1154,7 +1154,7 @@ int __compat_vma_mmap(const struct file_operations *f_op, .pgoff = vma->vm_pgoff, .vm_file = vma->vm_file, - .vm_flags = vma->vm_flags, + .vma_flags = vma->flags, .page_prot = vma->vm_page_prot, .action.type = MMAP_NOTHING, /* Default */ diff --git a/mm/vma.c b/mm/vma.c index 39dcd9ddd4ba..be64f781a3aa 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -15,7 +15,10 @@ struct mmap_state { unsigned long end; pgoff_t pgoff; unsigned long pglen; - vm_flags_t vm_flags; + union { + vm_flags_t vm_flags; + vma_flags_t vma_flags; + }; struct file *file; pgprot_t page_prot; @@ -2369,7 +2372,7 @@ static void set_desc_from_map(struct vm_area_desc *desc, desc->pgoff = map->pgoff; desc->vm_file = map->file; - desc->vm_flags = map->vm_flags; + desc->vma_flags = map->vma_flags; desc->page_prot = map->page_prot; } @@ -2650,7 +2653,7 @@ static int call_mmap_prepare(struct mmap_state *map, map->file_doesnt_need_get = true; map->file = desc->vm_file; } - map->vm_flags = desc->vm_flags; + map->vma_flags = desc->vma_flags; map->page_prot = desc->page_prot; /* User-defined fields. */ map->vm_ops = desc->vm_ops; @@ -2823,7 +2826,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, return -EINVAL; /* Map writable and ensure this isn't a sealed memfd. 
*/ - if (file && is_shared_maywrite(vm_flags)) { + if (file && is_shared_maywrite_vm_flags(vm_flags)) { int error = mapping_map_writable(file->f_mapping); if (error) diff --git a/mm/vma.h b/mm/vma.h index de30c69bceaf..eba388c61ef4 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -309,8 +309,7 @@ static inline void set_vma_from_desc(struct vm_area_struct *vma, vma->vm_pgoff = desc->pgoff; if (desc->vm_file != vma->vm_file) vma_set_file(vma, desc->vm_file); - if (desc->vm_flags != vma->vm_flags) - vm_flags_set(vma, desc->vm_flags); + vma->flags = desc->vma_flags; vma->vm_page_prot = desc->page_prot; /* User-defined fields. */ diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 2b01794cbd61..2743f12ecf32 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -1009,15 +1009,20 @@ static inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, #define vma_desc_clear_flags(desc, ...) \ vma_desc_clear_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) -static inline bool is_shared_maywrite(vm_flags_t vm_flags) +static inline bool is_shared_maywrite_vm_flags(vm_flags_t vm_flags) { return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == (VM_SHARED | VM_MAYWRITE); } +static inline bool is_shared_maywrite(const vma_flags_t *flags) +{ + return vma_flags_test_all(flags, VMA_SHARED_BIT, VMA_MAYWRITE_BIT); +} + static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma) { - return is_shared_maywrite(vma->vm_flags); + return is_shared_maywrite(&vma->flags); } static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) -- cgit v1.2.3 From 52e054f7184097bea009963e033cdd54af7bf8a2 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 9 Feb 2026 22:07:24 +0800 Subject: mm: rmap: support batched checks of the references for large folios Patch series "support batch checking of references and unmapping for large folios", v6. 
Currently, folio_referenced_one() always checks the young flag for each PTE sequentially, which is inefficient for large folios. This inefficiency is especially noticeable when reclaiming clean file-backed large folios, where folio_referenced() is observed as a significant performance hotspot. Moreover, on the Arm architecture, which supports contiguous PTEs, there is already an optimization to clear the young flags for PTEs within a contiguous range. However, this is not sufficient. We can extend this to perform batched operations for the entire large folio (which might exceed the contiguous range: CONT_PTE_SIZE). Similar to folio_referenced_one(), we can also apply batched unmapping for large file folios to optimize the performance of file folio reclamation. By supporting batched checking of the young flags, flushing TLB entries, and unmapping, I observed significant performance improvements in my performance tests for file folio reclamation. Please check the performance data in the commit message of each patch. This patch (of 5): Currently, folio_referenced_one() always checks the young flag for each PTE sequentially, which is inefficient for large folios. This inefficiency is especially noticeable when reclaiming clean file-backed large folios, where folio_referenced() is observed as a significant performance hotspot. Moreover, on the Arm64 architecture, which supports contiguous PTEs, there is already an optimization to clear the young flags for PTEs within a contiguous range. However, this is not sufficient. We can extend this to perform batched operations for the entire large folio (which might exceed the contiguous range: CONT_PTE_SIZE). Introduce a new API: clear_flush_young_ptes() to facilitate batched checking of the young flags and flushing TLB entries, thereby improving performance during large folio reclamation. It will be overridden by architectures that implement a more efficient batch operation in the following patches. 
While we are at it, rename ptep_clear_flush_young_notify() to clear_flush_young_ptes_notify() to indicate that this is a batch operation. Link: https://lkml.kernel.org/r/cover.1770645603.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/12132694536834262062d1fb304f8f8a064b6750.1770645603.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Harry Yoo Reviewed-by: Ryan Roberts Acked-by: David Hildenbrand (Arm) Cc: Catalin Marinas Cc: Jann Horn Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Rik van Riel Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Will Deacon Cc: Barry Song Signed-off-by: Andrew Morton --- include/linux/mmu_notifier.h | 9 +++++---- include/linux/pgtable.h | 35 +++++++++++++++++++++++++++++++++++ mm/rmap.c | 28 +++++++++++++++++++++++++--- 3 files changed, 65 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index d1094c2d5fb6..07a2bbaf86e9 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -515,16 +515,17 @@ static inline void mmu_notifier_range_init_owner( range->owner = owner; } -#define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ +#define clear_flush_young_ptes_notify(__vma, __address, __ptep, __nr) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ - __young = ptep_clear_flush_young(___vma, ___address, __ptep); \ + unsigned int ___nr = __nr; \ + __young = clear_flush_young_ptes(___vma, ___address, __ptep, ___nr); \ __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ ___address, \ ___address + \ - PAGE_SIZE); \ + ___nr * PAGE_SIZE); \ __young; \ }) @@ -650,7 +651,7 @@ static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) #define mmu_notifier_range_update_to_read_only(r) false -#define ptep_clear_flush_young_notify ptep_clear_flush_young +#define 
clear_flush_young_ptes_notify clear_flush_young_ptes #define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_young_notify ptep_test_and_clear_young #define pmdp_clear_young_notify pmdp_test_and_clear_young diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 21b67d937555..a50df42a893f 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1068,6 +1068,41 @@ static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr, } #endif +#ifndef clear_flush_young_ptes +/** + * clear_flush_young_ptes - Mark PTEs that map consecutive pages of the same + * folio as old and flush the TLB. + * @vma: The virtual memory area the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear access bit. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over ptep_clear_flush_young(). + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline int clear_flush_young_ptes(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr) +{ + int young = 0; + + for (;;) { + young |= ptep_clear_flush_young(vma, addr, ptep); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } + + return young; +} +#endif + /* * On some architectures hardware does not set page access bit when accessing * memory page, it is responsibility of software setting this bit. 
It brings diff --git a/mm/rmap.c b/mm/rmap.c index ab099405151f..3dbc2c4e02dc 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -913,9 +913,11 @@ static bool folio_referenced_one(struct folio *folio, struct folio_referenced_arg *pra = arg; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); int ptes = 0, referenced = 0; + unsigned int nr; while (page_vma_mapped_walk(&pvmw)) { address = pvmw.address; + nr = 1; if (vma->vm_flags & VM_LOCKED) { ptes++; @@ -960,9 +962,21 @@ static bool folio_referenced_one(struct folio *folio, if (lru_gen_look_around(&pvmw)) referenced++; } else if (pvmw.pte) { - if (ptep_clear_flush_young_notify(vma, address, - pvmw.pte)) + if (folio_test_large(folio)) { + unsigned long end_addr = pmd_addr_end(address, vma->vm_end); + unsigned int max_nr = (end_addr - address) >> PAGE_SHIFT; + pte_t pteval = ptep_get(pvmw.pte); + + nr = folio_pte_batch(folio, pvmw.pte, + pteval, max_nr); + } + + ptes += nr; + if (clear_flush_young_ptes_notify(vma, address, pvmw.pte, nr)) referenced++; + /* Skip the batched PTEs */ + pvmw.pte += nr - 1; + pvmw.address += (nr - 1) * PAGE_SIZE; } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (pmdp_clear_flush_young_notify(vma, address, pvmw.pmd)) @@ -972,7 +986,15 @@ static bool folio_referenced_one(struct folio *folio, WARN_ON_ONCE(1); } - pra->mapcount--; + pra->mapcount -= nr; + /* + * If we are sure that we batched the entire folio, + * we can just optimize and stop right here. + */ + if (ptes == pvmw.nr_pages) { + page_vma_mapped_walk_done(&pvmw); + break; + } } if (referenced) -- cgit v1.2.3