From 37d17925480404f1293f24d027fbf3c9975603d7 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 29 Sep 2025 11:46:43 +0100 Subject: mm/thp: drop follow_devmap_pmd() default stub follow_devmap_pmd() has already been dropped by the commit fd2825b0760a ("mm/gup: remove pXX_devmap usage from get_user_pages()"). The fallback stub in the header which is now redundant, can be dropped off as well. Link: https://lkml.kernel.org/r/20250929104643.1100421-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Dev Jain Reviewed-by: Alistair Popple Reviewed-by: Wei Yang Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 71ac78b9f834..fee4cf7fa300 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -682,12 +682,6 @@ static inline void mm_put_huge_zero_folio(struct mm_struct *mm) return; } -static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap) -{ - return NULL; -} - static inline bool thp_migration_supported(void) { return false; -- cgit v1.2.3 From 9c47753167a6a585d0305663c6912f042e131c2d Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Oct 2025 14:20:29 +0200 Subject: mm/vmalloc: defer freeing partly initialized vm_struct __vmalloc_area_node() may call free_vmap_area() or vfree() on error paths, both of which can sleep. This becomes problematic if the function is invoked from an atomic context, such as when GFP_ATOMIC or GFP_NOWAIT is passed via gfp_mask. To fix this, unify error paths and defer the cleanup of partly initialized vm_struct objects to a workqueue. This ensures that freeing happens in a process context and avoids invalid sleeps in atomic regions. Link: https://lkml.kernel.org/r/20251007122035.56347-5-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Michal Hocko Reviewed-by: Baoquan He Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Marco Elver Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/vmalloc.h | 6 +++++- mm/vmalloc.c | 34 +++++++++++++++++++++++++++++++--- 2 files changed, 36 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index eb54b7b3202f..1e43181369f1 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -50,7 +50,11 @@ struct iov_iter; /* in uio.h */ #endif struct vm_struct { - struct vm_struct *next; + union { + struct vm_struct *next; /* Early registration of vm_areas. */ + struct llist_node llnode; /* Asynchronous freeing on error paths. */ + }; + void *addr; unsigned long size; unsigned long flags; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d83c01caaabe..9e29dd767c41 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3687,6 +3687,35 @@ vm_area_alloc_pages(gfp_t gfp, int nid, return nr_allocated; } +static LLIST_HEAD(pending_vm_area_cleanup); +static void cleanup_vm_area_work(struct work_struct *work) +{ + struct vm_struct *area, *tmp; + struct llist_node *head; + + head = llist_del_all(&pending_vm_area_cleanup); + if (!head) + return; + + llist_for_each_entry_safe(area, tmp, head, llnode) { + if (!area->pages) + free_vm_area(area); + else + vfree(area->addr); + } +} + +/* + * Helper for __vmalloc_area_node() to defer cleanup + * of partially initialized vm_struct in error paths. + */ +static DECLARE_WORK(cleanup_vm_area, cleanup_vm_area_work); +static void defer_vm_area_cleanup(struct vm_struct *area) +{ + if (llist_add(&area->llnode, &pending_vm_area_cleanup)) + schedule_work(&cleanup_vm_area); +} + static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, unsigned int page_shift, int node) @@ -3718,8 +3747,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to allocated page array size %lu", nr_small_pages * PAGE_SIZE, array_size); - free_vm_area(area); - return NULL; + goto fail; } set_vm_area_page_order(area, page_shift - PAGE_SHIFT); @@ -3796,7 +3824,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, return area->addr; fail: - vfree(area->addr); + defer_vm_area_cleanup(area); return NULL; } -- cgit v1.2.3 From 8da89ba18ed4e9000d9b9b5b1f699e5004f4abf6 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Oct 2025 14:20:30 +0200 Subject: mm/vmalloc: handle non-blocking GFP in __vmalloc_area_node() Make __vmalloc_area_node() respect non-blocking GFP masks such as GFP_ATOMIC and GFP_NOWAIT. - Add memalloc_apply_gfp_scope()/memalloc_restore_scope() helpers to apply a proper scope. - Apply memalloc_apply_gfp_scope()/memalloc_restore_scope() around vmap_pages_range() for page table setup. - Set "nofail" to false if a non-blocking mask is used, as they are mutually exclusive. This is particularly important for page table allocations that internally use GFP_PGTABLE_KERNEL, which may sleep unless such scope restrictions are applied. For example: __pte_alloc_kernel() pte_alloc_one_kernel(&init_mm); pagetable_alloc_noprof(GFP_PGTABLE_KERNEL & ~__GFP_HIGHMEM, 0); Note: in most cases, PTE entries are established only up to the level required by current vmap space usage, meaning the page tables are typically fully populated during the mapping process. Link: https://lkml.kernel.org/r/20251007122035.56347-6-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Michal Hocko Reviewed-by: Baoquan He Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Marco Elver Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/vmalloc.h | 2 ++ mm/vmalloc.c | 52 +++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 44 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 1e43181369f1..e8e94f90d686 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -332,4 +332,6 @@ bool vmalloc_dump_obj(void *object); static inline bool vmalloc_dump_obj(void *object) { return false; } #endif +unsigned int memalloc_apply_gfp_scope(gfp_t gfp_mask); +void memalloc_restore_scope(unsigned int flags); #endif /* _LINUX_VMALLOC_H */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9e29dd767c41..d8bcd87239b5 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3716,6 +3716,42 @@ static void defer_vm_area_cleanup(struct vm_struct *area) schedule_work(&cleanup_vm_area); } +/* + * Page tables allocations ignore external GFP. Enforces it by + * the memalloc scope API. It is used by vmalloc internals and + * KASAN shadow population only. + * + * GFP to scope mapping: + * + * non-blocking (no __GFP_DIRECT_RECLAIM) - memalloc_noreclaim_save() + * GFP_NOFS - memalloc_nofs_save() + * GFP_NOIO - memalloc_noio_save() + * + * Returns a flag cookie to pair with restore. + */ +unsigned int +memalloc_apply_gfp_scope(gfp_t gfp_mask) +{ + unsigned int flags = 0; + + if (!gfpflags_allow_blocking(gfp_mask)) + flags = memalloc_noreclaim_save(); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + flags = memalloc_nofs_save(); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + flags = memalloc_noio_save(); + + /* 0 - no scope applied. */ + return flags; +} + +void +memalloc_restore_scope(unsigned int flags) +{ + if (flags) + memalloc_flags_restore(flags); +} + static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, unsigned int page_shift, int node) @@ -3732,6 +3768,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, array_size = (unsigned long)nr_small_pages * sizeof(struct page *); + /* __GFP_NOFAIL and "noblock" flags are mutually exclusive. */ + if (!gfpflags_allow_blocking(gfp_mask)) + nofail = false; + if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) gfp_mask |= __GFP_HIGHMEM; @@ -3797,22 +3837,14 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, * page tables allocations ignore external gfp mask, enforce it * by the scope API */ - if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) - flags = memalloc_nofs_save(); - else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) - flags = memalloc_noio_save(); - + flags = memalloc_apply_gfp_scope(gfp_mask); do { ret = vmap_pages_range(addr, addr + size, prot, area->pages, page_shift); if (nofail && (ret < 0)) schedule_timeout_uninterruptible(1); } while (nofail && (ret < 0)); - - if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) - memalloc_nofs_restore(flags); - else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) - memalloc_noio_restore(flags); + memalloc_restore_scope(flags); if (ret < 0) { warn_alloc(gfp_mask, NULL, -- cgit v1.2.3 From b186a94227b753f2fdcab0df29dfc636c63ac329 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Oct 2025 14:20:32 +0200 Subject: kmsan: remove hard-coded GFP_KERNEL flags kmsan_vmap_pages_range_noflush() allocates its temp s_pages/o_pages arrays with GFP_KERNEL, which may sleep. This is inconsistent with vmalloc() as it will support non-blocking requests later. Plumb gfp_mask through the kmsan_vmap_pages_range_noflush(), so it can use it internally for its demand. Please note, the subsequent __vmap_pages_range_noflush() still uses GFP_KERNEL and can sleep. If a caller runs under reclaim constraints, sleeping is forbidden, it must establish the appropriate memalloc scope API. Link: https://lkml.kernel.org/r/20251007122035.56347-8-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Alexander Potapenko Cc: Marco Elver Cc: Andrey Ryabinin Cc: Baoquan He Cc: Michal Hocko Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 6 ++++-- mm/internal.h | 4 ++-- mm/kmsan/shadow.c | 6 +++--- mm/percpu-vm.c | 2 +- mm/vmalloc.c | 26 +++++++++++++++++--------- 5 files changed, 27 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index f2fd221107bb..7da9fd506b39 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -133,6 +133,7 @@ void kmsan_kfree_large(const void *ptr); * @prot: page protection flags used for vmap. * @pages: array of pages. * @page_shift: page_shift passed to vmap_range_noflush(). + * @gfp_mask: gfp_mask to use internally. * * KMSAN maps shadow and origin pages of @pages into contiguous ranges in * vmalloc metadata address range. Returns 0 on success, callers must check @@ -142,7 +143,8 @@ int __must_check kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, pgprot_t prot, struct page **pages, - unsigned int page_shift); + unsigned int page_shift, + gfp_t gfp_mask); /** * kmsan_vunmap_kernel_range_noflush() - Notify KMSAN about a vunmap. @@ -347,7 +349,7 @@ static inline void kmsan_kfree_large(const void *ptr) static inline int __must_check kmsan_vmap_pages_range_noflush( unsigned long start, unsigned long end, pgprot_t prot, - struct page **pages, unsigned int page_shift) + struct page **pages, unsigned int page_shift, gfp_t gfp_mask) { return 0; } diff --git a/mm/internal.h b/mm/internal.h index 1561fc2ff5b8..e623c8103358 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1355,7 +1355,7 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, #ifdef CONFIG_MMU void __init vmalloc_init(void); int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end, - pgprot_t prot, struct page **pages, unsigned int page_shift); + pgprot_t prot, struct page **pages, unsigned int page_shift, gfp_t gfp_mask); unsigned int get_vm_area_page_order(struct vm_struct *vm); #else static inline void vmalloc_init(void) @@ -1364,7 +1364,7 @@ static inline void vmalloc_init(void) static inline int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end, - pgprot_t prot, struct page **pages, unsigned int page_shift) + pgprot_t prot, struct page **pages, unsigned int page_shift, gfp_t gfp_mask) { return -EINVAL; } diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 55fdea199aaf..e7f554a31bb4 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -215,7 +215,7 @@ void kmsan_free_page(struct page *page, unsigned int order) int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, pgprot_t prot, struct page **pages, - unsigned int page_shift) + unsigned int page_shift, gfp_t gfp_mask) { unsigned long shadow_start, origin_start, shadow_end, origin_end; struct page **s_pages, **o_pages; @@ -230,8 +230,8 @@ int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, return 0; nr = (end - start) / PAGE_SIZE; - s_pages = kcalloc(nr, sizeof(*s_pages), GFP_KERNEL); - o_pages = kcalloc(nr, sizeof(*o_pages), GFP_KERNEL); + s_pages = kcalloc(nr, sizeof(*s_pages), gfp_mask); + o_pages = kcalloc(nr, sizeof(*o_pages), gfp_mask); if (!s_pages || !o_pages) { err = -ENOMEM; goto ret; diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index cd69caf6aa8d..4f5937090590 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -194,7 +194,7 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages, int nr_pages) { return vmap_pages_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT), - PAGE_KERNEL, pages, PAGE_SHIFT); + PAGE_KERNEL, pages, PAGE_SHIFT, GFP_KERNEL); } /** diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d8bcd87239b5..d7e7049e01f8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -671,16 +671,28 @@ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, } int vmap_pages_range_noflush(unsigned long addr, unsigned long end, - pgprot_t prot, struct page **pages, unsigned int page_shift) + pgprot_t prot, struct page **pages, unsigned int page_shift, + gfp_t gfp_mask) { int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages, - page_shift); + page_shift, gfp_mask); if (ret) return ret; return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift); } +static int __vmap_pages_range(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift, + gfp_t gfp_mask) +{ + int err; + + err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift, gfp_mask); + flush_cache_vmap(addr, end); + return err; +} + /** * vmap_pages_range - map pages to a kernel virtual address * @addr: start of the VM area to map @@ -696,11 +708,7 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end, int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { - int err; - - err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift); - flush_cache_vmap(addr, end); - return err; + return __vmap_pages_range(addr, end, prot, pages, page_shift, GFP_KERNEL); } static int check_sparse_vm_area(struct vm_struct *area, unsigned long start, @@ -3839,8 +3847,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, */ flags = memalloc_apply_gfp_scope(gfp_mask); do { - ret = vmap_pages_range(addr, addr + size, prot, area->pages, - page_shift); + ret = __vmap_pages_range(addr, addr + size, prot, area->pages, + page_shift, nested_gfp); if (nofail && (ret < 0)) schedule_timeout_uninterruptible(1); } while (nofail && (ret < 0)); -- cgit v1.2.3 From 7241bb2ea33d5ff50b77a5981342bcc826bef52a Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Oct 2025 14:20:33 +0200 Subject: mm: skip might_alloc() warnings when PF_MEMALLOC is set might_alloc() catches invalid blocking allocations in contexts where sleeping is not allowed. However when PF_MEMALLOC is set, the page allocator already skips reclaim and other blocking paths. In such cases, a blocking gfp_mask does not actually lead to blocking, so triggering might_alloc() splats is misleading. Adjust might_alloc() to skip warnings when the current task has PF_MEMALLOC set, matching the allocator's actual blocking behaviour. Link: https://lkml.kernel.org/r/20251007122035.56347-9-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Acked-by: Michal Hocko Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Marco Elver Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/sched/mm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 0232d983b715..a74582aed747 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -318,6 +318,9 @@ static inline void might_alloc(gfp_t gfp_mask) fs_reclaim_acquire(gfp_mask); fs_reclaim_release(gfp_mask); + if (current->flags & PF_MEMALLOC) + return; + might_sleep_if(gfpflags_allow_blocking(gfp_mask)); } -- cgit v1.2.3 From 590c03ca6a3fbb114396673314e2aa483839608b Mon Sep 17 00:00:00 2001 From: xu xin Date: Tue, 7 Oct 2025 18:28:21 +0800 Subject: mm/ksm: fix exec/fork inheritance support for prctl Patch series "ksm: fix exec/fork inheritance", v2. This series fixes exec/fork inheritance. See the detailed description of the issue below. This patch (of 2): Background ========== commit d7597f59d1d33 ("mm: add new api to enable ksm per process") introduced MMF_VM_MERGE_ANY for mm->flags, and allowed user to set it by prctl() so that the process's VMAs are forcibly scanned by ksmd. Subsequently, the 3c6f33b7273a ("mm/ksm: support fork/exec for prctl") supported inheriting the MMF_VM_MERGE_ANY flag when a task calls execve(). Finally, commit 3a9e567ca45fb ("mm/ksm: fix ksm exec support for prctl") fixed the issue that ksmd doesn't scan the mm_struct with MMF_VM_MERGE_ANY by adding the mm_slot to ksm_mm_head in __bprm_mm_init(). Problem ======= In some extreme scenarios, however, this inheritance of MMF_VM_MERGE_ANY during exec/fork can fail. For example, when the scanning frequency of ksmd is tuned extremely high, a process carrying MMF_VM_MERGE_ANY may still fail to pass it to the newly exec'd process. This happens because ksm_execve() is executed too early in the do_execve flow (prematurely adding the new mm_struct to the ksm_mm_slot list). As a result, before do_execve completes, ksmd may have already performed a scan and found that this new mm_struct has no VM_MERGEABLE VMAs, thus clearing its MMF_VM_MERGE_ANY flag. Consequently, when the new program executes, the flag MMF_VM_MERGE_ANY inheritance missed. Root reason =========== commit d7597f59d1d33 ("mm: add new api to enable ksm per process") clear the flag MMF_VM_MERGE_ANY when ksmd found no VM_MERGEABLE VMAs. Solution ======== Firstly, Don't clear MMF_VM_MERGE_ANY when ksmd found no VM_MERGEABLE VMAs, because perhaps their mm_struct has just been added to ksm_mm_slot list, and its process has not yet officially started running or has not yet performed mmap/brk to allocate anonymous VMAS. Secondly, recheck MMF_VM_MERGEABLE again if a process takes MMF_VM_MERGE_ANY, and create a mm_slot and join it into ksm_scan_list again. Link: https://lkml.kernel.org/r/20251007182504440BJgK8VXRHh8TD7IGSUIY4@zte.com.cn Link: https://lkml.kernel.org/r/20251007182821572h_SoFqYZXEP1mvWI4n9VL@zte.com.cn Fixes: 3c6f33b7273a ("mm/ksm: support fork/exec for prctl") Fixes: d7597f59d1d3 ("mm: add new api to enable ksm per process") Signed-off-by: xu xin Cc: Stefan Roesch Cc: David Hildenbrand Cc: Jinjiang Tu Cc: Wang Yaxin Cc: Yang Yang Cc: Signed-off-by: Andrew Morton --- include/linux/ksm.h | 4 ++-- mm/ksm.c | 20 +++++++++++++++++--- 2 files changed, 19 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 067538fc4d58..c982694c987b 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -17,7 +17,7 @@ #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, vm_flags_t *vm_flags); -vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file, +vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags); int ksm_enable_merge_any(struct mm_struct *mm); int ksm_disable_merge_any(struct mm_struct *mm); @@ -103,7 +103,7 @@ bool ksm_process_mergeable(struct mm_struct *mm); #else /* !CONFIG_KSM */ -static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm, +static inline vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags) { return vm_flags; diff --git a/mm/ksm.c b/mm/ksm.c index cdefba633856..4f672f4f2140 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2712,8 +2712,14 @@ no_vmas: spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); + /* + * Only clear MMF_VM_MERGEABLE. We must not clear + * MMF_VM_MERGE_ANY, because for those MMF_VM_MERGE_ANY process, + * perhaps their mm_struct has just been added to ksm_mm_slot + * list, and its process has not yet officially started running + * or has not yet performed mmap/brk to allocate anonymous VMAS. + */ mm_flags_clear(MMF_VM_MERGEABLE, mm); - mm_flags_clear(MMF_VM_MERGE_ANY, mm); mmap_read_unlock(mm); mmdrop(mm); } else { @@ -2831,12 +2837,20 @@ static int __ksm_del_vma(struct vm_area_struct *vma) * * Returns: @vm_flags possibly updated to mark mergeable. */ -vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file, +vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags) { if (mm_flags_test(MMF_VM_MERGE_ANY, mm) && - __ksm_should_add_vma(file, vm_flags)) + __ksm_should_add_vma(file, vm_flags)) { vm_flags |= VM_MERGEABLE; + /* + * Generally, the flags here always include MMF_VM_MERGEABLE. + * However, in rare cases, this flag may be cleared by ksmd who + * scans a cycle without finding any mergeable vma. + */ + if (unlikely(!mm_flags_test(MMF_VM_MERGEABLE, mm))) + __ksm_enter(mm); + } return vm_flags; } -- cgit v1.2.3 From 9ac09bb9feaccc2f45e5606dc48a3f748d478dc4 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 3 Oct 2025 16:53:04 +0100 Subject: mm: consistently use current->mm in mm_get_unmapped_area() mm_get_unmapped_area() is a wrapper around arch_get_unmapped_area() / arch_get_unmapped_area_topdown(), both of which search current->mm for some free space. Neither take an mm_struct - they implicitly operate on current->mm. But the wrapper takes an mm_struct and uses it to decide whether to search bottom up or top down. All callers pass in current->mm for this, so everything is working consistently. But it feels like an accident waiting to happen; eventually someone will call that function with a different mm, expecting to find free space in it, but what gets returned is free space in the current mm. So let's simplify by removing the parameter and have the wrapper use current->mm to decide which end to start at. Now everything is consistent and self-documenting. Link: https://lkml.kernel.org/r/20251003155306.2147572-1-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Reviewed-by: Oscar Salvador Reviewed-by: Dev Jain Reviewed-by: Anshuman Khandual Reviewed-by: Lorenzo Stoakes Reviewed-by: Baolin Wang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/sparc/kernel/sys_sparc_64.c | 6 +++--- arch/x86/kernel/cpu/sgx/driver.c | 2 +- drivers/char/mem.c | 2 +- drivers/dax/device.c | 5 ++--- fs/hugetlbfs/inode.c | 3 +-- fs/proc/inode.c | 2 +- fs/ramfs/file-mmu.c | 2 +- include/linux/sched/mm.h | 9 ++++----- io_uring/memmap.c | 2 +- kernel/bpf/arena.c | 2 +- kernel/bpf/syscall.c | 2 +- mm/huge_memory.c | 4 ++-- mm/mmap.c | 17 +++++++---------- mm/shmem.c | 8 +++----- 14 files changed, 29 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 55faf2effa46..dbf118b40601 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -241,7 +241,7 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, u if (flags & MAP_FIXED) { /* Ok, don't mess with it. */ - return mm_get_unmapped_area(current->mm, NULL, orig_addr, len, pgoff, flags); + return mm_get_unmapped_area(NULL, orig_addr, len, pgoff, flags); } flags &= ~MAP_SHARED; @@ -254,7 +254,7 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, u align_goal = (64UL * 1024); do { - addr = mm_get_unmapped_area(current->mm, NULL, orig_addr, + addr = mm_get_unmapped_area(NULL, orig_addr, len + (align_goal - PAGE_SIZE), pgoff, flags); if (!(addr & ~PAGE_MASK)) { addr = (addr + (align_goal - 1UL)) & ~(align_goal - 1UL); @@ -273,7 +273,7 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, u * be obtained. */ if (addr & ~PAGE_MASK) - addr = mm_get_unmapped_area(current->mm, NULL, orig_addr, len, pgoff, flags); + addr = mm_get_unmapped_area(NULL, orig_addr, len, pgoff, flags); return addr; } diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c index 7f8d1e11dbee..3b3efadb8cae 100644 --- a/arch/x86/kernel/cpu/sgx/driver.c +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -113,7 +113,7 @@ static unsigned long sgx_get_unmapped_area(struct file *file, if (flags & MAP_FIXED) return addr; - return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); + return mm_get_unmapped_area(file, addr, len, pgoff, flags); } #ifdef CONFIG_COMPAT diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 34b815901b20..db1ca53a6d01 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -542,7 +542,7 @@ static unsigned long get_unmapped_area_zero(struct file *file, #ifdef CONFIG_TRANSPARENT_HUGEPAGE return thp_get_unmapped_area(file, addr, len, pgoff, flags); #else - return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); + return mm_get_unmapped_area(file, addr, len, pgoff, flags); #endif } #endif /* CONFIG_MMU */ diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 2bb40a6060af..7f1ed0db8337 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -330,14 +330,13 @@ static unsigned long dax_get_unmapped_area(struct file *filp, if ((off + len_align) < off) goto out; - addr_align = mm_get_unmapped_area(current->mm, filp, addr, len_align, - pgoff, flags); + addr_align = mm_get_unmapped_area(filp, addr, len_align, pgoff, flags); if (!IS_ERR_VALUE(addr_align)) { addr_align += (off - addr_align) & (align - 1); return addr_align; } out: - return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); + return mm_get_unmapped_area(filp, addr, len, pgoff, flags); } static const struct address_space_operations dev_dax_aops = { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f42548ee9083..ce8e40d35032 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -184,8 +184,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (addr) addr0 = ALIGN(addr, huge_page_size(h)); - return mm_get_unmapped_area_vmflags(current->mm, file, addr0, len, pgoff, - flags, 0); + return mm_get_unmapped_area_vmflags(file, addr0, len, pgoff, flags, 0); } /* diff --git a/fs/proc/inode.c b/fs/proc/inode.c index d9b7ef122343..2d3425cfa94b 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -443,7 +443,7 @@ pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned lo return pde->proc_ops->proc_get_unmapped_area(file, orig_addr, len, pgoff, flags); #ifdef CONFIG_MMU - return mm_get_unmapped_area(current->mm, file, orig_addr, len, pgoff, flags); + return mm_get_unmapped_area(file, orig_addr, len, pgoff, flags); #endif return orig_addr; diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index b11f5b20b78b..c3ed1c5117b2 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -35,7 +35,7 @@ static unsigned long ramfs_mmu_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); + return mm_get_unmapped_area(file, addr, len, pgoff, flags); } const struct file_operations ramfs_file_operations = { diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index a74582aed747..0e1d73955fa5 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -189,12 +189,11 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t); -unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags); +unsigned long mm_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags); -unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, - struct file *filp, +unsigned long mm_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, diff --git a/io_uring/memmap.c b/io_uring/memmap.c index add03ca75cb9..63fcfa757bb8 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -387,7 +387,7 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr, #else addr = 0UL; #endif - return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); + return mm_get_unmapped_area(filp, addr, len, pgoff, flags); } #else /* !CONFIG_MMU */ diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 1074ac4459f2..872dc0e41c65 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -334,7 +334,7 @@ static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long ad return -EINVAL; } - ret = mm_get_unmapped_area(current->mm, filp, addr, len * 2, 0, flags); + ret = mm_get_unmapped_area(filp, addr, len * 2, 0, flags); if (IS_ERR_VALUE(ret)) return ret; if ((ret >> 32) == ((ret + len - 1) >> 32)) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8a129746bd6c..d77685f2c6cb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1162,7 +1162,7 @@ static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr if (map->ops->map_get_unmapped_area) return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags); #ifdef CONFIG_MMU - return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); + return mm_get_unmapped_area(filp, addr, len, pgoff, flags); #else return addr; #endif diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2f2a521e5d68..32479ae27400 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1127,7 +1127,7 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, if (len_pad < len || (off + len_pad) < off) return 0; - ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad, + ret = mm_get_unmapped_area_vmflags(filp, addr, len_pad, off >> PAGE_SHIFT, flags, vm_flags); /* @@ -1164,7 +1164,7 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add if (ret) return ret; - return mm_get_unmapped_area_vmflags(current->mm, filp, addr, len, pgoff, flags, + return mm_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, vm_flags); } diff --git a/mm/mmap.c b/mm/mmap.c index 5fd3b80fda1d..644f02071a41 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -797,12 +797,11 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, } #endif -unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *filp, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags, - vm_flags_t vm_flags) +unsigned long mm_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags) { - if (mm_flags_test(MMF_TOPDOWN, mm)) + if (mm_flags_test(MMF_TOPDOWN, current->mm)) return arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags, vm_flags); return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); @@ -848,7 +847,7 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, addr = thp_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, vm_flags); } else { - addr = mm_get_unmapped_area_vmflags(current->mm, file, addr, len, + addr = mm_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, vm_flags); } if (IS_ERR_VALUE(addr)) @@ -864,12 +863,10 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, } unsigned long -mm_get_unmapped_area(struct mm_struct *mm, struct file *file, - unsigned long addr, unsigned long len, +mm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - return mm_get_unmapped_area_vmflags(mm, file, addr, len, - pgoff, flags, 0); + return mm_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, 0); } EXPORT_SYMBOL(mm_get_unmapped_area); diff --git a/mm/shmem.c b/mm/shmem.c index 58701d14dd96..0eecb486a0cb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2759,8 +2759,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (len > TASK_SIZE) return -ENOMEM; - addr = mm_get_unmapped_area(current->mm, file, uaddr, len, pgoff, - flags); + addr = mm_get_unmapped_area(file, uaddr, len, pgoff, flags); if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return addr; @@ -2838,8 +2837,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (inflated_len < len) return addr; - inflated_addr = mm_get_unmapped_area(current->mm, NULL, uaddr, - inflated_len, 0, flags); + inflated_addr = mm_get_unmapped_area(NULL, uaddr, inflated_len, 0, flags); if (IS_ERR_VALUE(inflated_addr)) return addr; if (inflated_addr & ~PAGE_MASK) @@ -5775,7 +5773,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); + return mm_get_unmapped_area(file, addr, len, pgoff, flags); } #endif -- cgit v1.2.3 From ada5cbe33a5321f8c896a3362c3aafa0bf262110 Mon Sep 17 00:00:00 2001 From: Sabyrzhan Tasbolatov Date: Thu, 9 Oct 2025 20:54:03 +0500 Subject: kasan: cleanup of kasan_enabled() checks Deduplication of kasan_enabled() checks which are already used by callers. * Altered functions: check_page_allocation Delete the check because callers have it already in __wrappers in include/linux/kasan.h: __kasan_kfree_large __kasan_mempool_poison_pages __kasan_mempool_poison_object kasan_populate_vmalloc, kasan_release_vmalloc Add __wrappers in include/linux/kasan.h. They are called externally in mm/vmalloc.c. __kasan_unpoison_vmalloc, __kasan_poison_vmalloc Delete checks because there're already kasan_enabled() checks in respective __wrappers in include/linux/kasan.h. release_free_meta -- Delete the check because the higher caller path has it already. See the stack trace: __kasan_slab_free -- has the check already __kasan_mempool_poison_object -- has the check already poison_slab_object kasan_save_free_info release_free_meta kasan_enabled() -- Delete here Link: https://lkml.kernel.org/r/20251009155403.1379150-3-snovitoll@gmail.com Signed-off-by: Sabyrzhan Tasbolatov Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Baoquan He Cc: Christophe Leroy Cc: Dmitriy Vyukov Cc: "Ritesh Harjani (IBM)" Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- include/linux/kasan.h | 20 ++++++++++++++++++-- mm/kasan/common.c | 3 --- mm/kasan/generic.c | 3 --- mm/kasan/shadow.c | 20 ++++---------------- 4 files changed, 22 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d12e1a5f5a9a..f335c1d7b61d 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -571,11 +571,27 @@ static inline void kasan_init_hw_tags(void) { } #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); -int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask); -void kasan_release_vmalloc(unsigned long start, unsigned long end, +int __kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask); +static inline int kasan_populate_vmalloc(unsigned long addr, + unsigned long size, gfp_t gfp_mask) +{ + if (kasan_enabled()) + return __kasan_populate_vmalloc(addr, size, gfp_mask); + return 0; +} +void __kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end, unsigned long flags); +static inline void kasan_release_vmalloc(unsigned long start, unsigned long end, + unsigned long free_region_start, + unsigned long free_region_end, + unsigned long flags) +{ + if (kasan_enabled()) + return __kasan_release_vmalloc(start, end, free_region_start, + free_region_end, flags); +} #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ diff --git a/mm/kasan/common.c b/mm/kasan/common.c index d4c14359feaf..22e5d67ff064 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -305,9 +305,6 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, static inline bool check_page_allocation(void *ptr, unsigned long ip) { - if (!kasan_enabled()) - return false; - if (ptr != page_address(virt_to_head_page(ptr))) { kasan_report_invalid_free(ptr, ip, KASAN_REPORT_INVALID_FREE); return true; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 516b49accc4f..2b8e73f5f6a7 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -506,9 +506,6 @@ static void release_alloc_meta(struct kasan_alloc_meta *meta) static void release_free_meta(const void *object, struct kasan_free_meta *meta) { - if (!kasan_enabled()) - return; - /* Check if free meta is valid. */ if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREE_META) return; diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index a30d84bfdd52..29a751a8a08d 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -354,7 +354,7 @@ static int ___alloc_pages_bulk(struct page **pages, int nr_pages, gfp_t gfp_mask return 0; } -static int __kasan_populate_vmalloc(unsigned long start, unsigned long end, gfp_t gfp_mask) +static int __kasan_populate_vmalloc_do(unsigned long start, unsigned long end, gfp_t gfp_mask) { unsigned long nr_pages, nr_total = PFN_UP(end - start); struct vmalloc_populate_data data; @@ -395,14 +395,11 @@ static int __kasan_populate_vmalloc(unsigned long start, unsigned long end, gfp_ return ret; } -int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask) +int __kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask) { unsigned long shadow_start, shadow_end; int ret; - if (!kasan_enabled()) - return 0; - if (!is_vmalloc_or_module_addr((void *)addr)) return 0; @@ -424,7 +421,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mas shadow_start = PAGE_ALIGN_DOWN(shadow_start); shadow_end = PAGE_ALIGN(shadow_end); - ret = __kasan_populate_vmalloc(shadow_start, shadow_end, gfp_mask); + ret = __kasan_populate_vmalloc_do(shadow_start, shadow_end, gfp_mask); if (ret) return ret; @@ -566,7 +563,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, * pages entirely covered by the free region, we will not run in to any * trouble - any simultaneous allocations will be for disjoint regions. */ -void kasan_release_vmalloc(unsigned long start, unsigned long end, +void __kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end, unsigned long flags) @@ -575,9 +572,6 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long region_start, region_end; unsigned long size; - if (!kasan_enabled()) - return; - region_start = ALIGN(start, KASAN_MEMORY_PER_SHADOW_PAGE); region_end = ALIGN_DOWN(end, KASAN_MEMORY_PER_SHADOW_PAGE); @@ -626,9 +620,6 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, * with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored. */ - if (!kasan_enabled()) - return (void *)start; - if (!is_vmalloc_or_module_addr(start)) return (void *)start; @@ -651,9 +642,6 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, */ void __kasan_poison_vmalloc(const void *start, unsigned long size) { - if (!kasan_enabled()) - return; - if (!is_vmalloc_or_module_addr(start)) return; -- cgit v1.2.3 From eb8762dc220c0b0573100a941bfc68df34ece74f Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Tue, 14 Oct 2025 21:09:16 +0530 Subject: drivers/base/node: fold register_node() into register_one_node() Patch series "drivers/base/node: fold node register and unregister functions", v2. The first patch merges register_one_node() and register_node(), leaving a single register_node() function. The second patch merges unregister_one_node() and unregister_node(), leaving a single unregister_node() function. There are no functional changes in these patches. This patch (of 2): register_node() is only called from register_one_node(). This patch folds register_node() into its only caller and renames register_one_node() to register_node(). This reduces unnecessary indirection and simplifies the code structure. No functional changes are introduced. [akpm@linux-foundation.org: fix kerneldoc, per David] Link: https://lkml.kernel.org/r/cover.1760097207.git.donettom@linux.ibm.com Link: https://lkml.kernel.org/r/910853c9dd61f7a2190a56cba101e73e9c6859be.1760097207.git.donettom@linux.ibm.com Signed-off-by: Donet Tom Acked-by: Mike Rapoport (Microsoft) Acked-by: SeongJae Park Acked-by: David Hildenbrand Cc: Aboorva Devarajan Cc: Christophe Leroy Cc: Danilo Krummrich Cc: Dave Jiang Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Madhavan Srinivasan Cc: Oscar Salvador Cc: Peter Zijlstra Cc: "Ritesh Harjani (IBM)" Signed-off-by: Andrew Morton --- arch/powerpc/platforms/pseries/pci_dlpar.c | 2 +- arch/x86/mm/numa.c | 4 +-- drivers/base/node.c | 52 ++++++++++++------------------ include/linux/node.h | 4 +-- mm/memory_hotplug.c | 4 +-- mm/mm_init.c | 2 +- 6 files changed, 28 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index aeb8633a3d00..8c77ec7980de 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -29,7 +29,7 @@ struct pci_controller *init_phb_dynamic(struct device_node *dn) nid = of_node_to_nid(dn); if (likely((nid) >= 0)) { if (!node_online(nid)) { - if (register_one_node(nid)) { + if (register_node(nid)) { pr_err("PCI: Failed to register node %d\n", nid); } else { update_numa_distance(dn); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index c24890c40138..7a97327140df 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -262,7 +262,7 @@ void __init init_gi_nodes(void) * bringup_nonboot_cpus * cpu_up * __try_online_node - * register_one_node + * register_node * because node_subsys is not initialized yet. * TODO remove dependency on node_online */ @@ -303,7 +303,7 @@ void __init init_cpu_to_node(void) * bringup_nonboot_cpus * cpu_up * __try_online_node - * register_one_node + * register_node * because node_subsys is not initialized yet. * TODO remove dependency on node_online */ diff --git a/drivers/base/node.c b/drivers/base/node.c index 83aeb0518e1d..17d7b90403ff 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -676,33 +676,6 @@ static void node_device_release(struct device *dev) kfree(to_node(dev)); } -/* - * register_node - Setup a sysfs device for a node. - * @num - Node number to use when creating the device. - * - * Initialize and register the node device. - */ -static int register_node(struct node *node, int num) -{ - int error; - - node->dev.id = num; - node->dev.bus = &node_subsys; - node->dev.release = node_device_release; - node->dev.groups = node_dev_groups; - error = device_register(&node->dev); - - if (error) { - put_device(&node->dev); - } else { - hugetlb_register_node(node); - compaction_register_node(node); - reclaim_register_node(node); - } - - return error; -} - /** * unregister_node - unregister a node device * @node: node going away @@ -907,7 +880,13 @@ void register_memory_blocks_under_node_hotplug(int nid, unsigned long start_pfn, } #endif /* CONFIG_MEMORY_HOTPLUG */ -int register_one_node(int nid) +/** + * register_node - Initialize and register the node device. + * @nid: Node number to use when creating the device. + * + * Return: 0 on success, -errno otherwise + */ +int register_node(int nid) { int error; int cpu; @@ -918,14 +897,23 @@ int register_one_node(int nid) return -ENOMEM; INIT_LIST_HEAD(&node->access_list); - node_devices[nid] = node; - error = register_node(node_devices[nid], nid); + node->dev.id = nid; + node->dev.bus = &node_subsys; + node->dev.release = node_device_release; + node->dev.groups = node_dev_groups; + + error = device_register(&node->dev); if (error) { - node_devices[nid] = NULL; + put_device(&node->dev); return error; } + node_devices[nid] = node; + hugetlb_register_node(node); + compaction_register_node(node); + reclaim_register_node(node); + /* link cpu under this node */ for_each_present_cpu(cpu) { if (cpu_to_node(cpu) == nid) @@ -1018,7 +1006,7 @@ void __init node_dev_init(void) * to already created cpu devices. */ for_each_online_node(i) { - ret = register_one_node(i); + ret = register_node(i); if (ret) panic("%s() failed to add node: %d\n", __func__, ret); } diff --git a/include/linux/node.h b/include/linux/node.h index 866e3323f1fd..b7028d3ec3b4 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -176,7 +176,7 @@ static inline int hotplug_node_notifier(notifier_fn_t fn, int pri) #ifdef CONFIG_NUMA extern void node_dev_init(void); /* Core of the node registration - only memory hotplug should use this */ -extern int register_one_node(int nid); +int register_node(int nid); extern void unregister_one_node(int nid); extern int register_cpu_under_node(unsigned int cpu, unsigned int nid); extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid); @@ -189,7 +189,7 @@ extern int register_memory_node_under_compute_node(unsigned int mem_nid, static inline void node_dev_init(void) { } -static inline int register_one_node(int nid) +static inline int register_node(int nid) { return 0; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0be83039c3b5..6c050d867031 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1311,7 +1311,7 @@ static int __try_online_node(int nid, bool set_node_online) if (set_node_online) { node_set_online(nid); - ret = register_one_node(nid); + ret = register_node(nid); BUG_ON(ret); } out: @@ -1542,7 +1542,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) goto error_memblock_remove; if (ret) { node_set_online(nid); - ret = register_one_node(nid); + ret = register_node(nid); if (WARN_ON(ret)) { node_set_offline(nid); goto error_memblock_remove; diff --git a/mm/mm_init.c b/mm/mm_init.c index 7712d887b696..c6812b4dbb2e 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1909,7 +1909,7 @@ void __init free_area_init(unsigned long *max_zone_pfn) free_area_init_node(nid); /* - * No sysfs hierarchy will be created via register_one_node() + * No sysfs hierarchy will be created via register_node() *for memory-less node because here it's not marked as N_MEMORY *and won't be set online later. The benefit is userspace *program won't be confused by sysfs files/directories of -- cgit v1.2.3 From d945667dcb1996ddf00ffa8408b579e4ce573652 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Tue, 14 Oct 2025 21:09:17 +0530 Subject: drivers/base/node: fold unregister_node() into unregister_one_node() unregister_node() is only called from unregister_one_node(). This patch folds unregister_node() into its only caller and renames unregister_one_node() to unregister_node(). This reduces unnecessary indirection and simplifies the code structure. No functional changes are introduced. [donettom@linux.ibm.com: remove extra spaces before @nid and "All"] Link: https://lkml.kernel.org/r/cff01514-9074-4c97-bcf1-d4e3594e48b0@linux.ibm.com Link: https://lkml.kernel.org/r/32b7d5d8f0f30d313c3e1d8798f591459c8746f9.1760097208.git.donettom@linux.ibm.com Signed-off-by: Donet Tom Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Acked-by: SeongJae Park Cc: Aboorva Devarajan Cc: Christophe Leroy Cc: Danilo Krummrich Cc: Dave Jiang Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Madhavan Srinivasan Cc: Oscar Salvador Cc: Peter Zijlstra Cc: "Ritesh Harjani (IBM)" Signed-off-by: Andrew Morton --- drivers/base/node.c | 38 +++++++++++++++++--------------------- include/linux/node.h | 6 ++---- mm/memory_hotplug.c | 4 ++-- 3 files changed, 21 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/node.c b/drivers/base/node.c index 17d7b90403ff..00cf4532f121 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -676,23 +676,6 @@ static void node_device_release(struct device *dev) kfree(to_node(dev)); } -/** - * unregister_node - unregister a node device - * @node: node going away - * - * Unregisters a node device @node. All the devices on the node must be - * unregistered before calling this function. - */ -void unregister_node(struct node *node) -{ - hugetlb_unregister_node(node); - compaction_unregister_node(node); - reclaim_unregister_node(node); - node_remove_accesses(node); - node_remove_caches(node); - device_unregister(&node->dev); -} - struct node *node_devices[MAX_NUMNODES]; /* @@ -924,13 +907,26 @@ int register_node(int nid) return error; } - -void unregister_one_node(int nid) +/** + * unregister_node - unregister a node device + * @nid: nid of the node going away + * + * Unregisters the node device at node id @nid. All the devices on the + * node must be unregistered before calling this function. + */ +void unregister_node(int nid) { - if (!node_devices[nid]) + struct node *node = node_devices[nid]; + + if (!node) return; - unregister_node(node_devices[nid]); + hugetlb_unregister_node(node); + compaction_unregister_node(node); + reclaim_unregister_node(node); + node_remove_accesses(node); + node_remove_caches(node); + device_unregister(&node->dev); node_devices[nid] = NULL; } diff --git a/include/linux/node.h b/include/linux/node.h index b7028d3ec3b4..0269b064ba65 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -132,8 +132,6 @@ static inline void register_memory_blocks_under_nodes(void) } #endif -extern void unregister_node(struct node *node); - struct node_notify { int nid; }; @@ -177,7 +175,7 @@ static inline int hotplug_node_notifier(notifier_fn_t fn, int pri) extern void node_dev_init(void); /* Core of the node registration - only memory hotplug should use this */ int register_node(int nid); -extern void unregister_one_node(int nid); +void unregister_node(int nid); extern int register_cpu_under_node(unsigned int cpu, unsigned int nid); extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid); extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk); @@ -193,7 +191,7 @@ static inline int register_node(int nid) { return 0; } -static inline int unregister_one_node(int nid) +static inline int unregister_node(int nid) { return 0; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6c050d867031..94a8f6e8811a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1596,7 +1596,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) error: if (new_node) { node_set_offline(nid); - unregister_one_node(nid); + unregister_node(nid); } error_memblock_remove: if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) @@ -2201,7 +2201,7 @@ void try_offline_node(int nid) * node now. */ node_set_offline(nid); - unregister_one_node(nid); + unregister_node(nid); } EXPORT_SYMBOL(try_offline_node); -- cgit v1.2.3 From 0acc67c4030c39f39ac90413cc5d0abddd3a9527 Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Tue, 14 Oct 2025 07:50:08 -0700 Subject: mm/page_alloc/vmstat: simplify refresh_cpu_vm_stats change detection Patch series "mm/page_alloc: Batch callers of free_pcppages_bulk", v5. Motivation & Approach ===================== While testing workloads with high sustained memory pressure on large machines in the Meta fleet (1Tb memory, 316 CPUs), we saw an unexpectedly high number of softlockups. Further investigation showed that the zone lock in free_pcppages_bulk was being held for a long time, and was called to free 2k+ pages over 100 times just during boot. This causes starvation in other processes for the zone lock, which can lead to the system stalling as multiple threads cannot make progress without the locks. We can see these issues manifesting as warnings: [ 4512.591979] rcu: INFO: rcu_sched self-detected stall on CPU [ 4512.604370] rcu: 20-....: (9312 ticks this GP) idle=a654/1/0x4000000000000000 softirq=309340/309344 fqs=5426 [ 4512.626401] rcu: hardirqs softirqs csw/system [ 4512.638793] rcu: number: 0 145 0 [ 4512.651177] rcu: cputime: 30 10410 174 ==> 10558(ms) [ 4512.666657] rcu: (t=21077 jiffies g=783665 q=1242213 ncpus=316) While these warnings don't indicate a crash or a kernel panic, they do point to the underlying issue of lock contention. To prevent starvation in both locks, batch the freeing of pages using pcp->batch. Because free_pcppages_bulk is called with the pcp lock and acquires the zone lock, relinquishing and reacquiring the locks are only effective when both of them are broken together (unless the system was built with queued spinlocks). Thus, instead of modifying free_pcppages_bulk to break both locks, batch the freeing from its callers instead. A similar fix has been implemented in the Meta fleet, and we have seen significantly less softlockups. Testing ======= The following are a few synthetic benchmarks, made on three machines. The first is a large machine with 754GiB memory and 316 processors. The second is a relatively smaller machine with 251GiB memory and 176 processors. The third and final is the smallest of the three, which has 62GiB memory and 36 processors. On all machines, I kick off a kernel build with -j$(nproc). Negative delta is better (faster compilation). Large machine (754GiB memory, 316 processors) make -j$(nproc) +------------+---------------+-----------+ | Metric (s) | Variation (%) | Delta(%) | +------------+---------------+-----------+ | real | 0.8070 | - 1.4865 | | user | 0.2823 | + 0.4081 | | sys | 5.0267 | -11.8737 | +------------+---------------+-----------+ Medium machine (251GiB memory, 176 processors) make -j$(nproc) +------------+---------------+----------+ | Metric (s) | Variation (%) | Delta(%) | +------------+---------------+----------+ | real | 0.2806 | +0.0351 | | user | 0.0994 | +0.3170 | | sys | 0.6229 | -0.6277 | +------------+---------------+----------+ Small machine (62GiB memory, 36 processors) make -j$(nproc) +------------+---------------+----------+ | Metric (s) | Variation (%) | Delta(%) | +------------+---------------+----------+ | real | 0.1503 | -2.6585 | | user | 0.0431 | -2.2984 | | sys | 0.1870 | -3.2013 | +------------+---------------+----------+ Here, variation is the coefficient of variation, i.e. standard deviation / mean. Based on these results, it seems like there are varying degrees to how much lock contention this reduces. For the largest and smallest machines that I ran the tests on, it seems like there is quite some significant reduction. There is also some performance increases visible from userspace. Interestingly, the performance gains don't scale with the size of the machine, but rather there seems to be a dip in the gain there is for the medium-sized machine. One possible theory is that because the high watermark depends on both memory and the number of local CPUs, what impacts zone contention the most is not these individual values, but rather the ratio of mem:processors. This patch (of 5): Currently, refresh_cpu_vm_stats returns an int, indicating how many changes were made during its updates. Using this information, callers like vmstat_update can heuristically determine if more work will be done in the future. However, all of refresh_cpu_vm_stats's callers either (a) ignore the result, only caring about performing the updates, or (b) only care about whether changes were made, but not *how many* changes were made. Simplify the code by returning a bool instead to indicate if updates were made. In addition, simplify fold_diff and decay_pcp_high to return a bool for the same reason. Link: https://lkml.kernel.org/r/20251014145011.3427205-1-joshua.hahnjy@gmail.com Link: https://lkml.kernel.org/r/20251014145011.3427205-2-joshua.hahnjy@gmail.com Signed-off-by: Joshua Hahn Reviewed-by: Vlastimil Babka Reviewed-by: SeongJae Park Cc: Brendan Jackman Cc: Chris Mason Cc: Johannes Weiner Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/gfp.h | 2 +- mm/page_alloc.c | 8 ++++---- mm/vmstat.c | 28 +++++++++++++++------------- 3 files changed, 20 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 623bee335383..b155929af5b1 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -387,7 +387,7 @@ extern void free_pages(unsigned long addr, unsigned int order); #define free_page(addr) free_pages((addr), 0) void page_alloc_init_cpuhp(void); -int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp); +bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp); void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(struct zone *zone); void drain_local_pages(struct zone *zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 10a908793b4c..f057ce5ea7da 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2557,10 +2557,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * Called from the vmstat counter updater to decay the PCP high. * Return whether there are addition works to do. */ -int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) +bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) { int high_min, to_drain, batch; - int todo = 0; + bool todo = false; high_min = READ_ONCE(pcp->high_min); batch = READ_ONCE(pcp->batch); @@ -2573,7 +2573,7 @@ int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX), pcp->high - (pcp->high >> 3), high_min); if (pcp->high > high_min) - todo++; + todo = true; } to_drain = pcp->count - pcp->high; @@ -2581,7 +2581,7 @@ int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) spin_lock(&pcp->lock); free_pcppages_bulk(zone, to_drain, pcp, 0); spin_unlock(&pcp->lock); - todo++; + todo = true; } return todo; diff --git a/mm/vmstat.c b/mm/vmstat.c index bb09c032eecf..98855f31294d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -771,25 +771,25 @@ EXPORT_SYMBOL(dec_node_page_state); /* * Fold a differential into the global counters. - * Returns the number of counters updated. + * Returns whether counters were updated. */ static int fold_diff(int *zone_diff, int *node_diff) { int i; - int changes = 0; + bool changed = false; for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) if (zone_diff[i]) { atomic_long_add(zone_diff[i], &vm_zone_stat[i]); - changes++; + changed = true; } for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) if (node_diff[i]) { atomic_long_add(node_diff[i], &vm_node_stat[i]); - changes++; + changed = true; } - return changes; + return changed; } /* @@ -806,16 +806,16 @@ static int fold_diff(int *zone_diff, int *node_diff) * with the global counters. These could cause remote node cache line * bouncing and will have to be only done when necessary. * - * The function returns the number of global counters updated. + * The function returns whether global counters were updated. */ -static int refresh_cpu_vm_stats(bool do_pagesets) +static bool refresh_cpu_vm_stats(bool do_pagesets) { struct pglist_data *pgdat; struct zone *zone; int i; int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; - int changes = 0; + bool changed = false; for_each_populated_zone(zone) { struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; @@ -839,7 +839,8 @@ static int refresh_cpu_vm_stats(bool do_pagesets) if (do_pagesets) { cond_resched(); - changes += decay_pcp_high(zone, this_cpu_ptr(pcp)); + if (decay_pcp_high(zone, this_cpu_ptr(pcp))) + changed = true; #ifdef CONFIG_NUMA /* * Deal with draining the remote pageset of this @@ -861,13 +862,13 @@ static int refresh_cpu_vm_stats(bool do_pagesets) } if (__this_cpu_dec_return(pcp->expire)) { - changes++; + changed = true; continue; } if (__this_cpu_read(pcp->count)) { drain_zone_pages(zone, this_cpu_ptr(pcp)); - changes++; + changed = true; } #endif } @@ -887,8 +888,9 @@ static int refresh_cpu_vm_stats(bool do_pagesets) } } - changes += fold_diff(global_zone_diff, global_node_diff); - return changes; + if (fold_diff(global_zone_diff, global_node_diff)) + changed = true; + return changed; } /* -- cgit v1.2.3 From d929525c2e30abee621bf71f143ba6104c81ff2b Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 16 Oct 2025 09:10:35 -0700 Subject: memcg: net: track network throttling due to memcg memory pressure The kernel can throttle network sockets if the memory cgroup associated with the corresponding socket is under memory pressure. The throttling actions include clamping the transmit window, failing to expand receive or send buffers, aggressively prune out-of-order receive queue, FIN deferred to a retransmitted packet and more. Let's add memcg metric to track such throttling actions. At the moment memcg memory pressure is defined through vmpressure and in future it may be defined using PSI or we may add more flexible way for the users to define memory pressure, maybe through ebpf. However the potential throttling actions will remain the same, so this newly introduced metric will continue to track throttling actions irrespective of how memcg memory pressure is defined. Link: https://lkml.kernel.org/r/20251016161035.86161-1-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Reviewed-by: Kuniyuki Iwashima Reviewed-by: Daniel Sedlak Cc: David S. Miller Cc: Eric Dumazet Cc: Jakub Kacinski Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Neal Cardwell Cc: Paolo Abeni Cc: Simon Horman Cc: Tejun Heo Cc: Willem de Bruijn Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 4 ++++ include/linux/memcontrol.h | 1 + include/net/sock.h | 6 +++++- kernel/cgroup/cgroup.c | 1 + mm/memcontrol.c | 3 +++ 5 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 0e6c67ac585a..3345961c30ac 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1515,6 +1515,10 @@ The following nested keys are defined. oom_group_kill The number of times a group OOM has occurred. + sock_throttled + The number of times network sockets associated with + this cgroup are throttled. + memory.events.local Similar to memory.events but the fields in the file are local to the cgroup i.e. not hierarchical. The file modified event diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 873e510d6f8d..5fe254813123 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -52,6 +52,7 @@ enum memcg_memory_event { MEMCG_SWAP_HIGH, MEMCG_SWAP_MAX, MEMCG_SWAP_FAIL, + MEMCG_SOCK_THROTTLED, MEMCG_NR_MEMORY_EVENTS, }; diff --git a/include/net/sock.h b/include/net/sock.h index 60bcb13f045c..ff7d49af1619 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2635,8 +2635,12 @@ static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk) #endif /* CONFIG_MEMCG_V1 */ do { - if (time_before64(get_jiffies_64(), mem_cgroup_get_socket_pressure(memcg))) + if (time_before64(get_jiffies_64(), + mem_cgroup_get_socket_pressure(memcg))) { + memcg_memory_event(mem_cgroup_from_sk(sk), + MEMCG_SOCK_THROTTLED); return true; + } } while ((memcg = parent_mem_cgroup(memcg))); return false; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index fdee387f0d6b..8df671c59987 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4704,6 +4704,7 @@ void cgroup_file_notify(struct cgroup_file *cfile) } spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } +EXPORT_SYMBOL_GPL(cgroup_file_notify); /** * cgroup_file_show - show or hide a hidden cgroup file diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3ae5cbcaed75..976412c8196e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -81,6 +81,7 @@ struct cgroup_subsys memory_cgrp_subsys __read_mostly; EXPORT_SYMBOL(memory_cgrp_subsys); struct mem_cgroup *root_mem_cgroup __read_mostly; +EXPORT_SYMBOL(root_mem_cgroup); /* Active memory cgroup to use from an interrupt context */ DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg); @@ -4463,6 +4464,8 @@ static void __memory_events_show(struct seq_file *m, atomic_long_t *events) atomic_long_read(&events[MEMCG_OOM_KILL])); seq_printf(m, "oom_group_kill %lu\n", atomic_long_read(&events[MEMCG_OOM_GROUP_KILL])); + seq_printf(m, "sock_throttled %lu\n", + atomic_long_read(&events[MEMCG_SOCK_THROTTLED])); } static int memory_events_show(struct seq_file *m, void *v) -- cgit v1.2.3 From 2f05435df9320e70f7a98149eb4b043ff361a120 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 17 Oct 2025 15:53:07 +0800 Subject: mm: vmscan: simplify the logic for activating dirty file folios After commit 6b0dfabb3555 ("fs: Remove aops->writepage"), we no longer attempt to write back filesystem folios through reclaim. However, in the shrink_folio_list() function, there still remains some logic related to writeback control of dirty file folios. The original logic was that, for direct reclaim, or when folio_test_reclaim() is false, or the PGDAT_DIRTY flag is not set, the dirty file folios would be directly activated to avoid being scanned again; otherwise, it will try to writeback the dirty file folios. However, since we can no longer perform writeback on dirty folios, the dirty file folios will still be activated. Additionally, under the original logic, if we continue to try writeback dirty file folios, we will also check the references flag, sc->may_writepage, and may_enter_fs(), which may result in dirty file folios being left in the inactive list. This is unreasonable. Even if these dirty folios are scanned again, we still cannot clean them. Therefore, the checks on these dirty file folios appear to be redundant and can be removed. Dirty file folios should be directly moved to the active list to avoid being scanned again. Since we set the PG_reclaim flag for the dirty folios, once the writeback is completed, they will be moved back to the tail of the inactive list to be retried for quick reclaim. Link: https://lkml.kernel.org/r/ba5c49955fd93c6850bcc19abf0e02e1573768aa.1760687075.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Qi Zheng Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 4 ---- mm/vmscan.c | 25 +++---------------------- 2 files changed, 3 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7fb7331c5725..4398e027f450 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1060,10 +1060,6 @@ struct zone { } ____cacheline_internodealigned_in_smp; enum pgdat_flags { - PGDAT_DIRTY, /* reclaim scanning has recently found - * many dirty file pages at the tail - * of the LRU. - */ PGDAT_WRITEBACK, /* reclaim scanning has recently found * many pages under writeback */ diff --git a/mm/vmscan.c b/mm/vmscan.c index e53ac12cc802..ecc90517b791 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1409,21 +1409,7 @@ retry: mapping = folio_mapping(folio); if (folio_test_dirty(folio)) { - /* - * Only kswapd can writeback filesystem folios - * to avoid risk of stack overflow. But avoid - * injecting inefficient single-folio I/O into - * flusher writeback as much as possible: only - * write folios when we've encountered many - * dirty folios, and when we've already scanned - * the rest of the LRU for clean folios and see - * the same dirty folios again (with the reclaim - * flag set). - */ - if (folio_is_file_lru(folio) && - (!current_is_kswapd() || - !folio_test_reclaim(folio) || - !test_bit(PGDAT_DIRTY, &pgdat->flags))) { + if (folio_is_file_lru(folio)) { /* * Immediately reclaim when written back. * Similar in principle to folio_deactivate() @@ -1432,7 +1418,8 @@ retry: */ node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE, nr_pages); - folio_set_reclaim(folio); + if (!folio_test_reclaim(folio)) + folio_set_reclaim(folio); goto activate_locked; } @@ -6127,11 +6114,6 @@ again: if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) set_bit(PGDAT_WRITEBACK, &pgdat->flags); - /* Allow kswapd to start writing pages during reclaim.*/ - if (sc->nr.unqueued_dirty && - sc->nr.unqueued_dirty == sc->nr.file_taken) - set_bit(PGDAT_DIRTY, &pgdat->flags); - /* * If kswapd scans pages marked for immediate * reclaim and under writeback (nr_immediate), it @@ -6872,7 +6854,6 @@ static void clear_pgdat_congested(pg_data_t *pgdat) clear_bit(LRUVEC_NODE_CONGESTED, &lruvec->flags); clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags); - clear_bit(PGDAT_DIRTY, &pgdat->flags); clear_bit(PGDAT_WRITEBACK, &pgdat->flags); } -- cgit v1.2.3 From d3946c5f4c1c5db63532eb433a55c7d881de1389 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 17 Oct 2025 14:26:53 -0700 Subject: mm/damon: document damos_quota_goal->nid use case Patch series "mm/damon: allow DAMOS auto-tuned for per-memcg per-node memory usage". Introduce two new DAMOS quota auto-tuning target metrics for per-cgroup per-NUMA node memory utilization. Expected use cases are cgroup level access-aware NUMA memory managements, such as memory tiering or proactive reclamation on cgroup-based multi-tenant NUMA systems. Background ========== The aim-oriented aggressiveness auto-tuning feature of DAMOS is a highly recommended way for modern DAMOS use cases. Using it, users can specify what system status they want to achieve with what access-aware system operations. For example, reclaim cold memory aiming for 0.5 percent of memory pressure (proactive reclaim), or migrate hot and cold memory between NUMA nodes having different speed (memory tiering). Then DAMOS automatically adjusts the aggressiveness of the system operation (e.g., increase/decrease reclaim target coldness threshold) based on current status of the system. The use case is limited by the supported system status metrics for specifying the target system status. Two new system metrics for per-node memory usage ratio, namely DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, were recently added to extend the use cases for access-aware NUMA nodes management, such as memory tiering. Those are expected to be useful for not only memory tiering but also general access-aware inter-NUMA node page migration, though. Limitation ---------- The per-node memory usage based auto-tuning can be applied only system-wide. For cgroups-based multi-tenant systems, it could arguably harm the fairness. For example, a cgroup may use faster NUMA node memory more than other cgroup, depending on their access pattern. If the user of each cgroup are promised to get the same quality and amount of the system resource, this can arguably be an unfair situation. DAMOS supports cgroup level system operations via DAMOS filter. But the quota auto-tuning system is not aware of cgroups. New DAMOS Quota Tuning Metrics for Per-Cgroup Per-NUMA Memory Usage =================================================================== To overcome the limitation, introduce two new DAMOS quota auto-tuning goal metrics, namely DAMOS_QUOTA_NODE_MEMCG_{USED,FREE}_BP. Those can be thought of as a variant of DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP that extended for cgroups. The two metrics specifies per-cgroup, per-node amount of used and unused memory in ratio to the total memory of the node. For example, let's assume a system has two NUMA nodes of size 100 GiB and 50 GiB. And two cgroups are using 40 GiB and 60 GiB of node 0, 20 GiB and 10 GiB of node 1, respectively, as illustrated by the below table. node-0 node-1 Total memory 100 GiB 50 GiB Cgroup A usage 40 GiB 20 GiB Cgroup B usage 60 GiB 10 GiB Then, DAMOS_QUOTA_NODE_MEMCG_USED_BP for the cgroups for the first node are, 40 GiB / 100 GiB = 4,000 bp (40 percent) and 60 GiB / 100 GiB = 6,000 bp (60 percent), respectively. Those for the second node are, 20 GiB / 50 GiB = 4000 bp (40 percent) and 10 GiB / 50 GiB = 2000 bp (20 percent), respectively. DAMOS_QUOTA_NODE_MEMCG_FREE_BP for the four cases are, 60 GiB /100 GiB = 6000 bp, 40 GiB / 100 GiB = 4000 bp, 30 GiB / 50 GiB = 6000 bp, and 40 GiB / 50 GiB = 8000 bp, respectively. DAMOS_QUOTA_NODE_MEMCG_USED_BP for cgroup A node-0: 4000 bp DAMOS_QUOTA_NODE_MEMCG_USED_BP for cgroup B node-0: 6000 bp DAMOS_QUOTA_NODE_MEMCG_USED_BP for cgroup A node-1: 4000 bp DAMOS_QUOTA_NODE_MEMCG_USED_BP for cgroup B node-1: 2000 bp DAMOS_QUOTA_NODE_MEMCG_FREE_BP for cgroup A node-0: 6000 bp DAMOS_QUOTA_NODE_MEMCG_FREE_BP for cgroup B node-0: 4000 bp DAMOS_QUOTA_NODE_MEMCG_FREE_BP for cgroup A node-1: 6000 bp DAMOS_QUOTA_NODE_MEMCG_FREE_BP for cgroup B node-1: 8000 bp Using these, users can specify how much [un]used amount of memory for per-cgroup and per-node DAMOS should make as a result of the auto-tuning. Example Usecase: Cgroup Level Memory Tiering ============================================ Let's suppose a typical and simple tiered memory system. The system equips two NUMA nodes. The first node (node 0) is CPU-attached and fast. The second node (node 1) is CPU-unattached and slow. It runs two cgroups that desire to use about 30 percent and 70 percent of the faster node as much as possible for their hot data, respectively. Then, the user can implement DAMOS-based memory tiering for the system using the DAMON user-space tool (damo), like below. # ./damo start \ `# kdamond for node 1 (slow)` \ --numa_node 1 --monitoring_intervals_goal 4% 3 5ms 10s \ `# promotion scheme for cgroup a` \ --damos_action migrate_hot 0 --damos_access_rate 5% max \ --damos_apply_interval 1s \ --damos_filter allow memcg /workloads/a \ --damos_filter allow young \ --damos_quota_interval 1s --damos_quota_space 200MB \ --damos_quota_goal node_memcg_used_bp 29.7% 0 /workloads/a \ \ `# promotion scheme for cgroup b` \ --damos_action migrate_hot 0 --damos_access_rate 5% max \ --damos_apply_interval 1s \ --damos_filter allow memcg /workloads/b \ --damos_filter allow young \ --damos_quota_interval 1s --damos_quota_space 200MB \ --damos_quota_goal node_memcg_used_bp 69.7% 0 workloads/b \ \ `# kdamond for node 0 (fast)` \ --numa_node 0 --monitoring_intervals_goal 4% 3 5ms 10s \ `# demotion scheme for cgroup a` \ --damos_action migrate_cold 1 --damos_access_rate 0% 0% \ --damos_apply_interval 1s \ --damos_filter allow memcg /workloads/a \ --damos_filter reject young \ --damos_quota_interval 1s --damos_quota_space 200MB \ --damos_quota_goal node_memcg_free_bp 70.5% 0 \ \ `# demotion scheme for cgroup b` \ --damos_action migrate_cold 1 --damos_access_rate 0% 0% \ --damos_apply_interval 1s \ --damos_filter allow memcg /workloads/a \ --damos_filter reject young \ --damos_quota_interval 1s --damos_quota_space 200MB \ --damos_quota_goal node_memcg_free_bp 30.5% 0 \ \ --damos_nr_quota_goals 1 1 1 1 --damos_nr_filters 1 1 1 1 \ --nr_targets 1 1 --nr_schemes 2 2 --nr_ctxs 1 1 With the command, the user-space tool will ask DAMON to spawn two kernel threads, each for monitoring accesses to node 1 (slow) and node 0 (fast), respectively. It installs two DAMOS schemes on each thread. Let's call them "promotion scheme for cgroup a/b", and "demotion scheme for cgroup a/b" in the order. The promotion schemes are installed on the DAMON thread for node 1 (slow), and demotion schemes are installed on the DAMON thread for node 0 (fast). Cgroup Level Hot Pages Migration (Promotion) -------------------------------------------- Promotion schemes will find memory regions on node 1 (slow), that some access was detected. The schemes will then migrate the found memory to node 0 (fast), hottest pages first. For accurate and effective migration, these schemes use two page level filters. First, the migration will be filtered for only cgroup A and cgroup B. That is, "promotion scheme for cgroup B" will not do the migration if the page is for cgroup A. Secondly, the schemes will ignore pages that having their page table's Accessed bits unset. The per-page Accessed bit check logic will also unset the bit if it was set, for the next check. For controlled amounts of system resource consumption and aiming on the target memory usage, the schemes use quotas setup. The migration is limited to be done only up to 200 MiB per second, to limit the peak system resource usage. And DAMOS_QUOTA_NODE_MEMCG_USED_BP target is set for 29.7% and 69.7% of node 0 (fast), respectively. The target value is lower than the high level goal (30% and 70% system memory), to give headroom on node 0 (fast). DAMOS will adjust the speed of the pages migration based on the target and current per-cgroup node 0 memory usage. For example, if cgroup A is utilizing only 10% of node 0, DAMOS will try to migrate more of cgroup A hot pages from node 1 to node 0, up to 200 MiB per second. If cgroup A utilizes more than 29.7% of node 0 memory, the cgroup A hot pages migration from node 1 to node 0 will be slowed and eventually stopped. Cgroup Level Cold Pages Migration (Demotion) -------------------------------------------- Demotion schemes are similar to promotion schemes, but differ in filtering setup and quota tuning setup. Those filter out pages having their page table Accessed bits set. And set 70.5% and 30.5% of node 0 memory free rate for the cgroup A and B, respectively. Hence, if promotion schemes or something made cgroup A and/or B uses more than 29.5% and 69.5% of node 0, demotion schemes will start migrating cold pages of appropriate cgroups in node 0 to node 1, under the 200 MiB per second speed cap, while adjusting the speed based on how much more than wanted memory is being used. The quota target values are set to overlap with promotion targets, to keep a minimum level of page exchanges between the nodes. This is to avoid a case that the target memory utilization is met, and then access pattern changes (pages in node 1 become hotter than pages in node 0) while the memory utilization is unchanged. Without the overlap, neither promotion of hotter pages in node 1, nor demotion of colder pages in node 0 will happen since both goals are met. As a result, the faster and slower node will unexpectedly serve cold and hot data. Test: Per-cgroup Memory Tiering =============================== I ran a simplified cgroup level memory tiering using the feature, and confirmed it works as intended. Setup ----- I configured a QEMU virtual machine representing a simplified version of the system that described on the above cgroup level memory tiering example use case. The system equips 40 CPU cores and two NUMA nodes each having 30 GiB physical memory. The first node (node 0) represents the faster NUMA node, and the second node (node 1) represents the slower NUMA node. In specific, below qemu command line options are used. [...] -object memory-backend-ram,size=30G,id=m0 \ -object memory-backend-ram,size=30G,id=m1 \ -numa node,cpus=0-39,memdev=m0 \ -numa node,memdev=m1 \ [...] I booted the virtual machine with a kernel that this patch series is applied. On the virtual machine, I created two cgroups, namely workload_a and workload_b. And ran a test program in each cgroup, resulting in one process per cgroup. The test program allocates 10 GiB memory and evenly split it into 10 regions. After the allocation, it repeatedly access the first region for one minute, than the second one for one minute, and so on. After the one minute repeated access for the 10-th region is done, it repeats the access from the first region. So the process has 10 GiB of data in total, but only 1 GiB of it is hot at a given moment, and the hot data is gradually changed. While the processes are running, run DAMON for a simple access-aware memory tiering using below script. It migrates hot and cold data of the cgroups into node 0 and node 1, aiming the first and the second cgroups (workload_a and workload_b, respectively) utilizing about 9.7 percent and 19.7 percent of node 0, respectively. Note that this setup is a simplified version of the above example use case, for ease of test. Also note that we assigned 30 GiB physical memory to node 0, but DAMON in this setup works for only 27 GiB of the memory. It is due to an internal implementation detail of DAMON user-space tool that not really important for this test. #!/bin/bash damo start \ --numa_node 1 \ --damos_action migrate_hot 0 --damos_access_rate 5% max \ --damos_apply_interval 1s \ --damos_filter allow memcg /workload_a \ --damos_filter allow young \ --damos_quota_interval 1s \ --damos_quota_goal node_memcg_used_bp 9.7% 0 /workload_a \ --damos_action migrate_hot 0 --damos_access_rate 5% max \ --damos_apply_interval 1s \ --damos_filter allow memcg /workload_b \ --damos_filter allow young \ --damos_quota_interval 1s \ --damos_quota_goal node_memcg_used_bp 19.7% 0 /workload_b \ --numa_node 0 \ --damos_action migrate_cold 1 --damos_access_rate 0% 0% \ --damos_apply_interval 1s \ --damos_filter allow memcg /workload_a \ --damos_filter reject young \ --damos_quota_interval 1s \ --damos_quota_goal node_memcg_free_bp 90.5% 0 /workload_a \ --damos_action migrate_cold 1 --damos_access_rate 0% 0% \ --damos_apply_interval 1s \ --damos_filter allow memcg /workload_b \ --damos_filter reject young \ --damos_quota_interval 1s \ --damos_quota_goal node_memcg_free_bp 80.5% 0 /workload_b \ --damos_nr_quota_goals 1 1 1 1 --damos_nr_filters 2 2 2 2 \ --nr_targets 1 1 --nr_schemes 2 2 --nr_ctxs 1 1 After starting DAMON, the pages continuously be migrated across nodes. A few minutes later, the memory usage of the cgroups converges into the aimed amounts, and keeps the level, as expected. To confirm the status is kept in the target level as expected, I collected the memory usage stat of the cgroups using memory.numa_stat file, after the stats are converged. I repeat the stat collection 42 times with 5 seconds delay between each of the collections. The results are as below: node0_memory_usage average stdev workload_a 2.79GiB 522.06MiB workload_b 5.15GiB 739.10MiB The average values are quite close to the targeted values: 27 GiB * 9.7% = 2.619 GiB for workload_a, and 27 GiB * 19.7% = 5.319 GiB. A level of variances are expected, given the overlap of the promotion/demotion targets, and dynamic data access pattern of the workloads. Give that, the measured variances are at a reasonable level. Patches Sequence ================ The first patch (patch 1) updates the kernel-doc comment of damos_quota_goal struct to clarify usage of optional fields of the struct, since later patches will add such optional fields. Following four patches (patches 2-5) implement a new DAMOS quota goal metric for per-cgroup per-node memory usage. Those extends the core layer interface for the new metric (patch 2), implement the metric value calculation on the core layer (patch 3), add DAMON sysfs interface file for the target cgroup specification (patch 4), and implement support of the new metric on DAMON sysfs interface (patch 5). Next two patches implment the second new DAMOS quota goal metric for per-cgroup per-node free (or, unused) memory. Those implement it in the core layer (patch 6) and DAMON sysfs interface (patch 7), extending the existing implementation for memory usage metric. Final three patches update the design (patch 8), the usage (patch 9), and the ABI (patch 10) documents for the changes that are introduced by this patch series. This patch (of 10): damos_quota_goal kerneldoc comment is not explaining when @metric is used. Update the comment for that. Link: https://lkml.kernel.org/r/20251017212706.183502-1-sj@kernel.org Link: https://lkml.kernel.org/r/20251017212706.183502-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index cae8c613c5fc..dc9c310e0e75 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -176,6 +176,9 @@ enum damos_quota_goal_metric { * If @metric is DAMOS_QUOTA_USER_INPUT, @current_value should be manually * entered by the user, probably inside the kdamond callbacks. Otherwise, * DAMON sets @current_value with self-measured value of @metric. + * + * If @metric is DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, @nid represents the node + * id of the target node to account the used/free memory. */ struct damos_quota_goal { enum damos_quota_goal_metric metric; -- cgit v1.2.3 From 6a18bbe48361acad1eae8d86aa47d353b1cfe619 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 17 Oct 2025 14:26:54 -0700 Subject: mm/damon: add DAMOS quota goal type for per-memcg per-node memory usage Define a new DAMOS quota auto-tuning target metric for per-cgroup per-node memory usage. For specifying the cgroup of the interest, add a field, namely memcg_id, to damos_quota_goal struct. Note that this commit is only implementing the interface. The handling of the interface (the metric value calculation) will be implemented in the following commit. Link: https://lkml.kernel.org/r/20251017212706.183502-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index dc9c310e0e75..0d63ceb7e6ef 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -147,6 +147,7 @@ enum damos_action { * @DAMOS_QUOTA_SOME_MEM_PSI_US: System level some memory PSI in us. * @DAMOS_QUOTA_NODE_MEM_USED_BP: MemUsed ratio of a node. * @DAMOS_QUOTA_NODE_MEM_FREE_BP: MemFree ratio of a node. + * @DAMOS_QUOTA_NODE_MEMCG_USED_BP: MemUsed ratio of a node for a cgroup. * @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics. * * Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported. @@ -156,6 +157,7 @@ enum damos_quota_goal_metric { DAMOS_QUOTA_SOME_MEM_PSI_US, DAMOS_QUOTA_NODE_MEM_USED_BP, DAMOS_QUOTA_NODE_MEM_FREE_BP, + DAMOS_QUOTA_NODE_MEMCG_USED_BP, NR_DAMOS_QUOTA_GOAL_METRICS, }; @@ -166,6 +168,7 @@ enum damos_quota_goal_metric { * @current_value: Current value of @metric. * @last_psi_total: Last measured total PSI * @nid: Node id. + * @memcg_id: Memcg id. * @list: List head for siblings. * * Data structure for getting the current score of the quota tuning goal. The @@ -179,6 +182,9 @@ enum damos_quota_goal_metric { * * If @metric is DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, @nid represents the node * id of the target node to account the used/free memory. + * + * If @metric is DAMOS_QUOTA_NODE_MEMCG_USED_BP, @nid and @memcg_id represents + * the node id and the cgroup to account the used memory for. */ struct damos_quota_goal { enum damos_quota_goal_metric metric; @@ -187,7 +193,10 @@ struct damos_quota_goal { /* metric-dependent fields */ union { u64 last_psi_total; - int nid; + struct { + int nid; + unsigned short memcg_id; + }; }; struct list_head list; }; -- cgit v1.2.3 From 98fdce76fb7ed7070df21afbee46a4b36cb6a7c6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 17 Oct 2025 14:26:58 -0700 Subject: mm/damon/core: add DAMOS quota gaol metric for per-memcg per-numa free memory Add a variant of DAMOS_QUOTA_NODE_MEMCG_USED_BP, for the free memory portion. The value of the metric is implemented as the entire memory of the given NUMA node subtracted by the given cgroup's usage. So from a perspective, "unused" could be a better term than "free". But arguably it is not very clear what is better, so use the term "free". Link: https://lkml.kernel.org/r/20251017212706.183502-7-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++-- mm/damon/core.c | 10 ++++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 0d63ceb7e6ef..0edf41d36ea1 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -148,6 +148,7 @@ enum damos_action { * @DAMOS_QUOTA_NODE_MEM_USED_BP: MemUsed ratio of a node. * @DAMOS_QUOTA_NODE_MEM_FREE_BP: MemFree ratio of a node. * @DAMOS_QUOTA_NODE_MEMCG_USED_BP: MemUsed ratio of a node for a cgroup. + * @DAMOS_QUOTA_NODE_MEMCG_FREE_BP: MemFree ratio of a node for a cgroup. * @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics. * * Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported. @@ -158,6 +159,7 @@ enum damos_quota_goal_metric { DAMOS_QUOTA_NODE_MEM_USED_BP, DAMOS_QUOTA_NODE_MEM_FREE_BP, DAMOS_QUOTA_NODE_MEMCG_USED_BP, + DAMOS_QUOTA_NODE_MEMCG_FREE_BP, NR_DAMOS_QUOTA_GOAL_METRICS, }; @@ -183,8 +185,8 @@ enum damos_quota_goal_metric { * If @metric is DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, @nid represents the node * id of the target node to account the used/free memory. * - * If @metric is DAMOS_QUOTA_NODE_MEMCG_USED_BP, @nid and @memcg_id represents - * the node id and the cgroup to account the used memory for. + * If @metric is DAMOS_QUOTA_NODE_MEMCG_{USED,FREE}_BP, @nid and @memcg_id + * represents the node id and the cgroup to account the used memory for. */ struct damos_quota_goal { enum damos_quota_goal_metric metric; diff --git a/mm/damon/core.c b/mm/damon/core.c index 8aa8d269df90..a9c11d2d37b0 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -790,6 +790,7 @@ static void damos_commit_quota_goal_union( dst->nid = src->nid; break; case DAMOS_QUOTA_NODE_MEMCG_USED_BP: + case DAMOS_QUOTA_NODE_MEMCG_FREE_BP: dst->nid = src->nid; dst->memcg_id = src->memcg_id; break; @@ -2046,7 +2047,7 @@ static unsigned long damos_get_node_memcg_used_bp( { struct mem_cgroup *memcg; struct lruvec *lruvec; - unsigned long used_pages; + unsigned long used_pages, numerator; struct sysinfo i; rcu_read_lock(); @@ -2066,7 +2067,11 @@ static unsigned long damos_get_node_memcg_used_bp( used_pages += lruvec_page_state(lruvec, NR_INACTIVE_FILE); si_meminfo_node(&i, goal->nid); - return used_pages * 10000 / i.totalram; + if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP) + numerator = used_pages; + else /* DAMOS_QUOTA_NODE_MEMCG_FREE_BP */ + numerator = i.totalram - used_pages; + return numerator * 10000 / i.totalram; } #else static __kernel_ulong_t damos_get_node_mem_bp( @@ -2101,6 +2106,7 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) goal->current_value = damos_get_node_mem_bp(goal); break; case DAMOS_QUOTA_NODE_MEMCG_USED_BP: + case DAMOS_QUOTA_NODE_MEMCG_FREE_BP: goal->current_value = damos_get_node_memcg_used_bp(goal); break; default: -- cgit v1.2.3 From e859a224fad65cb4848fe202aea9896a14fdb7f4 Mon Sep 17 00:00:00 2001 From: Quanmin Yan Date: Mon, 20 Oct 2025 21:01:24 +0800 Subject: mm/damon: add a min_sz_region parameter to damon_set_region_biggest_system_ram_default() Patch series "mm/damon: fixes for address alignment issues in DAMON_LRU_SORT and DAMON_RECLAIM", v2. In DAMON_LRU_SORT and DAMON_RECLAIM, damon_set_regions() will apply DAMON_MIN_REGION as the core address alignment, and the monitoring target address ranges would be aligned on DAMON_MIN_REGION * addr_unit. When users 1) set addr_unit to a value larger than 1, and 2) set the monitoring target address range as not aligned on DAMON_MIN_REGION * addr_unit, it will cause DAMON_LRU_SORT and DAMON_RECLAIM to operate on unexpectedly large physical address ranges. For example, if the user sets the monitoring target address range to [4, 8) and addr_unit as 1024, the aimed monitoring target address range is [4 KiB, 8 KiB). Assuming DAMON_MIN_REGION is 4096, so resulting target address range will be [0, 4096) in the DAMON core layer address system, and [0, 4 MiB) in the physical address space, which is an unexpected range. To fix the issue, add a min_sz_region parameter to damon_set_region_biggest_system_ram_default() and use it when calling damon_set_regions(), replacing the direct use of DAMON_MIN_REGION. This patch (of 2): In DAMON_LRU_SORT, damon_set_regions() will apply DAMON_MIN_REGION as the core address alignment, and the monitoring target address ranges would be aligned on DAMON_MIN_REGION * addr_unit. When users 1) set addr_unit to a value larger than 1, and 2) set the monitoring target address range as not aligned on DAMON_MIN_REGION * addr_unit, it will cause DAMON_LRU_SORT to operate on unexpectedly large physical address ranges. For example, if the user sets the monitoring target address range to [4, 8) and addr_unit as 1024, the aimed monitoring target address range is [4 KiB, 8 KiB). Assuming DAMON_MIN_REGION is 4096, so resulting target address range will be [0, 4096) in the DAMON core layer address system, and [0, 4 MiB) in the physical address space, which is an unexpected range. To fix the issue, add a min_sz_region parameter to damon_set_region_biggest_system_ram_default() and use it when calling damon_set_regions(), replacing the direct use of DAMON_MIN_REGION. Link: https://lkml.kernel.org/r/20251020130125.2875164-1-yanquanmin1@huawei.com Link: https://lkml.kernel.org/r/20251020130125.2875164-2-yanquanmin1@huawei.com Fixes: 2e0fe9245d6b ("mm/damon/lru_sort: support addr_unit for DAMON_LRU_SORT") Signed-off-by: Quanmin Yan Reviewed-by: SeongJae Park Cc: Kefeng Wang Cc: ze zuo Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 ++- mm/damon/core.c | 6 ++++-- mm/damon/lru_sort.c | 3 ++- mm/damon/reclaim.c | 3 ++- mm/damon/stat.c | 3 ++- 5 files changed, 12 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 0edf41d36ea1..9ee026c2db53 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -961,7 +961,8 @@ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control); int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control); int damon_set_region_biggest_system_ram_default(struct damon_target *t, - unsigned long *start, unsigned long *end); + unsigned long *start, unsigned long *end, + unsigned long min_sz_region); #endif /* CONFIG_DAMON */ diff --git a/mm/damon/core.c b/mm/damon/core.c index a9c11d2d37b0..82546d138a5a 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2818,6 +2818,7 @@ static bool damon_find_biggest_system_ram(unsigned long *start, * @t: The monitoring target to set the region. * @start: The pointer to the start address of the region. * @end: The pointer to the end address of the region. + * @min_sz_region: Minimum region size. * * This function sets the region of @t as requested by @start and @end. If the * values of @start and @end are zero, however, this function finds the biggest @@ -2828,7 +2829,8 @@ static bool damon_find_biggest_system_ram(unsigned long *start, * Return: 0 on success, negative error code otherwise. */ int damon_set_region_biggest_system_ram_default(struct damon_target *t, - unsigned long *start, unsigned long *end) + unsigned long *start, unsigned long *end, + unsigned long min_sz_region) { struct damon_addr_range addr_range; @@ -2841,7 +2843,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t, addr_range.start = *start; addr_range.end = *end; - return damon_set_regions(t, &addr_range, 1, DAMON_MIN_REGION); + return damon_set_regions(t, &addr_range, 1, min_sz_region); } /* diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 42b9a656f9de..49b4bc294f4e 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -242,7 +242,8 @@ static int damon_lru_sort_apply_parameters(void) err = damon_set_region_biggest_system_ram_default(param_target, &monitor_region_start, - &monitor_region_end); + &monitor_region_end, + param_ctx->min_sz_region); if (err) goto out; err = damon_commit_ctx(ctx, param_ctx); diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 7ba3d0f9a19a..e30811cafe90 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -250,7 +250,8 @@ static int damon_reclaim_apply_parameters(void) err = damon_set_region_biggest_system_ram_default(param_target, &monitor_region_start, - &monitor_region_end); + &monitor_region_end, + DAMON_MIN_REGION); if (err) goto out; err = damon_commit_ctx(ctx, param_ctx); diff --git a/mm/damon/stat.c b/mm/damon/stat.c index bf8626859902..ed8e3629d31a 100644 --- a/mm/damon/stat.c +++ b/mm/damon/stat.c @@ -188,7 +188,8 @@ static struct damon_ctx *damon_stat_build_ctx(void) if (!target) goto free_out; damon_add_target(ctx, target); - if (damon_set_region_biggest_system_ram_default(target, &start, &end)) + if (damon_set_region_biggest_system_ram_default(target, &start, &end, + ctx->min_sz_region)) goto free_out; return ctx; free_out: -- cgit v1.2.3 From 54c58a2f5fa191839cf192fa4ebab39395272a3e Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:21 +0100 Subject: mm: add vma_desc_size(), vma_desc_pages() helpers It's useful to be able to determine the size of a VMA descriptor range used on f_op->mmap_prepare, expressed both in bytes and pages, so add helpers for both and update code that could make use of it to do so. Link: https://lkml.kernel.org/r/74ef338203c9ff08a9ace73a8f1f6116a79112a0.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jan Kara Acked-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Reviewed-by: Pedro Falcato Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- fs/ntfs3/file.c | 2 +- include/linux/mm.h | 10 ++++++++++ mm/secretmem.c | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 4c90ec2fa2ea..2f344e1ed756 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -332,7 +332,7 @@ static int ntfs_file_mmap_prepare(struct vm_area_desc *desc) if (rw) { u64 to = min_t(loff_t, i_size_read(inode), - from + desc->end - desc->start); + from + vma_desc_size(desc)); if (is_sparsed(ni)) { /* Allocate clusters for rw map. */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 7c79b3369b82..5752b0c516f2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3583,6 +3583,16 @@ static inline unsigned long vma_pages(const struct vm_area_struct *vma) return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } +static inline unsigned long vma_desc_size(const struct vm_area_desc *desc) +{ + return desc->end - desc->start; +} + +static inline unsigned long vma_desc_pages(const struct vm_area_desc *desc) +{ + return vma_desc_size(desc) >> PAGE_SHIFT; +} + /* Look up the first VMA which exactly match the interval vm_start ... vm_end */ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end) diff --git a/mm/secretmem.c b/mm/secretmem.c index 9b0f5d9ec6f4..37f6d1097853 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -120,7 +120,7 @@ static int secretmem_release(struct inode *inode, struct file *file) static int secretmem_mmap_prepare(struct vm_area_desc *desc) { - const unsigned long len = desc->end - desc->start; + const unsigned long len = vma_desc_size(desc); if ((desc->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) return -EINVAL; -- cgit v1.2.3 From 51e38e7d40d617965504f4dcba569ecf9302f245 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:24 +0100 Subject: mm: add remap_pfn_range_prepare(), remap_pfn_range_complete() We need the ability to split PFN remap between updating the VMA and performing the actual remap, in order to do away with the legacy f_op->mmap hook. To do so, update the PFN remap code to provide shared logic, and also make remap_pfn_range_notrack() static, as its one user, io_mapping_map_user() was removed in commit 9a4f90e24661 ("mm: remove mm/io-mapping.c"). Then, introduce remap_pfn_range_prepare(), which accepts VMA descriptor and PFN parameters, and remap_pfn_range_complete() which accepts the same parameters as remap_pfn_rangte(). remap_pfn_range_prepare() will set the cow vma->vm_pgoff if necessary, so it must be supplied with a correct PFN to do so. While we're here, also clean up the duplicated #ifdef __HAVE_PFNMAP_TRACKING check and put into a single #ifdef/#else block. We keep these internal to mm as they should only be used by internal helpers. Link: https://lkml.kernel.org/r/75b55de63249b3aa0fd5b3b08ed1d3ff19255d0d.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jason Gunthorpe Acked-by: Pedro Falcato Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 22 +++++++-- mm/internal.h | 4 ++ mm/memory.c | 132 +++++++++++++++++++++++++++++++++++------------------ 3 files changed, 110 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5752b0c516f2..ca5565f4fac4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -489,6 +489,21 @@ extern unsigned int kobjsize(const void *objp); */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) +/* + * Physically remapped pages are special. Tell the + * rest of the world about it: + * VM_IO tells people not to look at these pages + * (accesses can have side effects). + * VM_PFNMAP tells the core MM that the base pages are just + * raw PFN mappings, and do not have a "struct page" associated + * with them. + * VM_DONTEXPAND + * Disable vma merging and expanding with mremap(). + * VM_DONTDUMP + * Omit vma from core dump, even when VM_IO turned off. + */ +#define VM_REMAP_FLAGS (VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP) + /* This mask prevents VMA from being scanned with khugepaged */ #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) @@ -3634,10 +3649,9 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, struct vm_area_struct *find_extend_vma_locked(struct mm_struct *, unsigned long addr); -int remap_pfn_range(struct vm_area_struct *, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t); -int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t prot); +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t pgprot); + int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num); diff --git a/mm/internal.h b/mm/internal.h index 56a9a714709a..5ca1e7842b19 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1677,4 +1677,8 @@ static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm); int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm); +void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn); +int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t pgprot); + #endif /* __MM_INTERNAL_H */ diff --git a/mm/memory.c b/mm/memory.c index f13b20b702f6..8e02b8d75535 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2900,6 +2900,25 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, return 0; } +static int get_remap_pgoff(vm_flags_t vm_flags, unsigned long addr, + unsigned long end, unsigned long vm_start, unsigned long vm_end, + unsigned long pfn, pgoff_t *vm_pgoff_p) +{ + /* + * There's a horrible special case to handle copy-on-write + * behaviour that some programs depend on. We mark the "original" + * un-COW'ed pages by matching them up with "vma->vm_pgoff". + * See vm_normal_page() for details. + */ + if (is_cow_mapping(vm_flags)) { + if (addr != vm_start || end != vm_end) + return -EINVAL; + *vm_pgoff_p = pfn; + } + + return 0; +} + static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { @@ -2912,31 +2931,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad if (WARN_ON_ONCE(!PAGE_ALIGNED(addr))) return -EINVAL; - /* - * Physically remapped pages are special. Tell the - * rest of the world about it: - * VM_IO tells people not to look at these pages - * (accesses can have side effects). - * VM_PFNMAP tells the core MM that the base pages are just - * raw PFN mappings, and do not have a "struct page" associated - * with them. - * VM_DONTEXPAND - * Disable vma merging and expanding with mremap(). - * VM_DONTDUMP - * Omit vma from core dump, even when VM_IO turned off. - * - * There's a horrible special case to handle copy-on-write - * behaviour that some programs depend on. We mark the "original" - * un-COW'ed pages by matching them up with "vma->vm_pgoff". - * See vm_normal_page() for details. - */ - if (is_cow_mapping(vma->vm_flags)) { - if (addr != vma->vm_start || end != vma->vm_end) - return -EINVAL; - vma->vm_pgoff = pfn; - } - - vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); + VM_WARN_ON_ONCE((vma->vm_flags & VM_REMAP_FLAGS) != VM_REMAP_FLAGS); BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; @@ -2957,7 +2952,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad * Variant of remap_pfn_range that does not call track_pfn_remap. The caller * must have pre-validated the caching bits of the pgprot_t. */ -int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, +static int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { int error = remap_pfn_range_internal(vma, addr, pfn, size, prot); @@ -3002,23 +2997,9 @@ void pfnmap_track_ctx_release(struct kref *ref) pfnmap_untrack(ctx->pfn, ctx->size); kfree(ctx); } -#endif /* __HAVE_PFNMAP_TRACKING */ -/** - * remap_pfn_range - remap kernel memory to userspace - * @vma: user vma to map to - * @addr: target page aligned user address to start at - * @pfn: page frame number of kernel physical memory address - * @size: size of mapping area - * @prot: page protection flags for this mapping - * - * Note: this is only safe if the mm semaphore is held when called. - * - * Return: %0 on success, negative error code otherwise. - */ -#ifdef __HAVE_PFNMAP_TRACKING -int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t prot) +static int remap_pfn_range_track(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) { struct pfnmap_track_ctx *ctx = NULL; int err; @@ -3054,15 +3035,78 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, return err; } +static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + return remap_pfn_range_track(vma, addr, pfn, size, prot); +} #else -int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t prot) +static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) { return remap_pfn_range_notrack(vma, addr, pfn, size, prot); } #endif + +void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn) +{ + /* + * We set addr=VMA start, end=VMA end here, so this won't fail, but we + * check it again on complete and will fail there if specified addr is + * invalid. + */ + get_remap_pgoff(desc->vm_flags, desc->start, desc->end, + desc->start, desc->end, pfn, &desc->pgoff); + desc->vm_flags |= VM_REMAP_FLAGS; +} + +static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size) +{ + unsigned long end = addr + PAGE_ALIGN(size); + int err; + + err = get_remap_pgoff(vma->vm_flags, addr, end, + vma->vm_start, vma->vm_end, + pfn, &vma->vm_pgoff); + if (err) + return err; + + vm_flags_set(vma, VM_REMAP_FLAGS); + return 0; +} + +/** + * remap_pfn_range - remap kernel memory to userspace + * @vma: user vma to map to + * @addr: target page aligned user address to start at + * @pfn: page frame number of kernel physical memory address + * @size: size of mapping area + * @prot: page protection flags for this mapping + * + * Note: this is only safe if the mm semaphore is held when called. + * + * Return: %0 on success, negative error code otherwise. + */ +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + int err; + + err = remap_pfn_range_prepare_vma(vma, addr, pfn, size); + if (err) + return err; + + return do_remap_pfn_range(vma, addr, pfn, size, prot); +} EXPORT_SYMBOL(remap_pfn_range); +int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + return do_remap_pfn_range(vma, addr, pfn, size, prot); +} + /** * vm_iomap_memory - remap memory to userspace * @vma: user vma to map to -- cgit v1.2.3 From c707a68f9468e4ef4a3546b636a9dd088fe7b7f1 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:25 +0100 Subject: mm: abstract io_remap_pfn_range() based on PFN The only instances in which we customise this function are ones in which we customise the PFN used. Instances where architectures were not passing the pgprot value through pgprot_decrypted() are ones where pgprot_decrypted() was a no-op anyway, so we can simply always pass pgprot through this function. Use this fact to simplify the use of io_remap_pfn_range(), by abstracting the PFN via io_remap_pfn_range_pfn() and using this instead of providing a general io_remap_pfn_range() function per-architecture. Link: https://lkml.kernel.org/r/d086191bf431b58ce3b231b4f4f555d080f60327.1760959442.git.lorenzo.stoakes@oracle.com Suggested-by: Jason Gunthorpe Signed-off-by: Lorenzo Stoakes Reviewed-by: Jason Gunthorpe Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Pedro Falcato Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/csky/include/asm/pgtable.h | 3 --- arch/mips/alchemy/common/setup.c | 9 +++++---- arch/mips/include/asm/pgtable.h | 5 ++--- arch/sparc/include/asm/pgtable_32.h | 12 ++++-------- arch/sparc/include/asm/pgtable_64.h | 12 ++++-------- include/linux/mm.h | 19 ++++++++++++++----- 6 files changed, 29 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h index 5a394be09c35..d606afbabce1 100644 --- a/arch/csky/include/asm/pgtable.h +++ b/arch/csky/include/asm/pgtable.h @@ -263,7 +263,4 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, #define update_mmu_cache(vma, addr, ptep) \ update_mmu_cache_range(NULL, vma, addr, ptep, 1) -#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ - remap_pfn_range(vma, vaddr, pfn, size, prot) - #endif /* __ASM_CSKY_PGTABLE_H */ diff --git a/arch/mips/alchemy/common/setup.c b/arch/mips/alchemy/common/setup.c index a7a6d31a7a41..c35b4f809d51 100644 --- a/arch/mips/alchemy/common/setup.c +++ b/arch/mips/alchemy/common/setup.c @@ -94,12 +94,13 @@ phys_addr_t fixup_bigphys_addr(phys_addr_t phys_addr, phys_addr_t size) return phys_addr; } -int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long vaddr, - unsigned long pfn, unsigned long size, pgprot_t prot) +static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn, + unsigned long size) { phys_addr_t phys_addr = fixup_bigphys_addr(pfn << PAGE_SHIFT, size); - return remap_pfn_range(vma, vaddr, phys_addr >> PAGE_SHIFT, size, prot); + return phys_addr >> PAGE_SHIFT; } -EXPORT_SYMBOL(io_remap_pfn_range); +EXPORT_SYMBOL(io_remap_pfn_range_pfn); + #endif /* CONFIG_MIPS_FIXUP_BIGPHYS_ADDR */ diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index ae73ecf4c41a..9c06a612d33a 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -604,9 +604,8 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, */ #ifdef CONFIG_MIPS_FIXUP_BIGPHYS_ADDR phys_addr_t fixup_bigphys_addr(phys_addr_t addr, phys_addr_t size); -int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long vaddr, - unsigned long pfn, unsigned long size, pgprot_t prot); -#define io_remap_pfn_range io_remap_pfn_range +unsigned long io_remap_pfn_range_pfn(unsigned long pfn, unsigned long size); +#define io_remap_pfn_range_pfn io_remap_pfn_range_pfn #else #define fixup_bigphys_addr(addr, size) (addr) #endif /* CONFIG_MIPS_FIXUP_BIGPHYS_ADDR */ diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index f1538a48484a..a9f802d1dd64 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -395,12 +395,8 @@ __get_iospace (unsigned long addr) #define GET_IOSPACE(pfn) (pfn >> (BITS_PER_LONG - 4)) #define GET_PFN(pfn) (pfn & 0x0fffffffUL) -int remap_pfn_range(struct vm_area_struct *, unsigned long, unsigned long, - unsigned long, pgprot_t); - -static inline int io_remap_pfn_range(struct vm_area_struct *vma, - unsigned long from, unsigned long pfn, - unsigned long size, pgprot_t prot) +static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn, + unsigned long size) { unsigned long long offset, space, phys_base; @@ -408,9 +404,9 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma, space = GET_IOSPACE(pfn); phys_base = offset | (space << 32ULL); - return remap_pfn_range(vma, from, phys_base >> PAGE_SHIFT, size, prot); + return phys_base >> PAGE_SHIFT; } -#define io_remap_pfn_range io_remap_pfn_range +#define io_remap_pfn_range_pfn io_remap_pfn_range_pfn #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS #define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \ diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 64b85ff9c766..615f460c50af 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -1048,9 +1048,6 @@ int page_in_phys_avail(unsigned long paddr); #define GET_IOSPACE(pfn) (pfn >> (BITS_PER_LONG - 4)) #define GET_PFN(pfn) (pfn & 0x0fffffffffffffffUL) -int remap_pfn_range(struct vm_area_struct *, unsigned long, unsigned long, - unsigned long, pgprot_t); - void adi_restore_tags(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t pte); @@ -1084,9 +1081,8 @@ static inline int arch_unmap_one(struct mm_struct *mm, return 0; } -static inline int io_remap_pfn_range(struct vm_area_struct *vma, - unsigned long from, unsigned long pfn, - unsigned long size, pgprot_t prot) +static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn, + unsigned long size) { unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT; int space = GET_IOSPACE(pfn); @@ -1094,9 +1090,9 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma, phys_base = offset | (((unsigned long) space) << 32UL); - return remap_pfn_range(vma, from, phys_base >> PAGE_SHIFT, size, prot); + return phys_base >> PAGE_SHIFT; } -#define io_remap_pfn_range io_remap_pfn_range +#define io_remap_pfn_range_pfn io_remap_pfn_range_pfn static inline unsigned long __untagged_addr(unsigned long start) { diff --git a/include/linux/mm.h b/include/linux/mm.h index ca5565f4fac4..4441ceec913f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3684,15 +3684,24 @@ static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, return VM_FAULT_NOPAGE; } -#ifndef io_remap_pfn_range -static inline int io_remap_pfn_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long pfn, - unsigned long size, pgprot_t prot) +#ifndef io_remap_pfn_range_pfn +static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn, + unsigned long size) { - return remap_pfn_range(vma, addr, pfn, size, pgprot_decrypted(prot)); + return pfn; } #endif +static inline int io_remap_pfn_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long orig_pfn, + unsigned long size, pgprot_t orig_prot) +{ + const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size); + const pgprot_t prot = pgprot_decrypted(orig_prot); + + return remap_pfn_range(vma, addr, pfn, size, prot); +} + static inline vm_fault_t vmf_error(int err) { if (err == -ENOMEM) -- cgit v1.2.3 From ac0a3fc9c07df79dc8a4ce9d274df00afc7bf12d Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:27 +0100 Subject: mm: add ability to take further action in vm_area_desc Some drivers/filesystems need to perform additional tasks after the VMA is set up. This is typically in the form of pre-population. The forms of pre-population most likely to be performed are a PFN remap or the insertion of normal folios and PFNs into a mixed map. We start by implementing the PFN remap functionality, ensuring that we perform the appropriate actions at the appropriate time - that is setting flags at the point of .mmap_prepare, and performing the actual remap at the point at which the VMA is fully established. This prevents the driver from doing anything too crazy with a VMA at any stage, and we retain complete control over how the mm functionality is applied. Unfortunately callers still do often require some kind of custom action, so we add an optional success/error _hook to allow the caller to do something after the action has succeeded or failed. This is done at the point when the VMA has already been established, so the harm that can be done is limited. The error hook can be used to filter errors if necessary. There may be cases in which the caller absolutely must hold the file rmap lock until the operation is entirely complete. It is an edge case, but certainly the hugetlbfs mmap hook requires it. To accommodate this, we add the hide_from_rmap_until_complete flag to the mmap_action type. In this case, if a new VMA is allocated, we will hold the file rmap lock until the operation is entirely completed (including any success/error hooks). Note that we do not need to update __compat_vma_mmap() to accommodate this flag, as this function will be invoked from an .mmap handler whose VMA is not yet visible, so we implicitly hide it from the rmap. If any error arises on these final actions, we simply unmap the VMA altogether. Also update the stacked filesystem compatibility layer to utilise the action behaviour, and update the VMA tests accordingly. While we're here, rename __compat_vma_mmap_prepare() to __compat_vma_mmap() as we are now performing actions invoked by the mmap_prepare in addition to just the mmap_prepare hook. Link: https://lkml.kernel.org/r/2601199a7b2eaeadfcd8ab6e199c6d1706650c94.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Pedro Falcato Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/fs.h | 6 +- include/linux/mm.h | 74 ++++++++++++++++++++ include/linux/mm_types.h | 53 ++++++++++++++ mm/util.c | 146 ++++++++++++++++++++++++++++++++++++--- mm/vma.c | 113 ++++++++++++++++++++++-------- tools/testing/vma/vma_internal.h | 98 ++++++++++++++++++++++++-- 6 files changed, 441 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..8cf9547a881c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2393,14 +2393,14 @@ static inline bool can_mmap_file(struct file *file) return true; } -int __compat_vma_mmap_prepare(const struct file_operations *f_op, +int __compat_vma_mmap(const struct file_operations *f_op, struct file *file, struct vm_area_struct *vma); -int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma); +int compat_vma_mmap(struct file *file, struct vm_area_struct *vma); static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) { if (file->f_op->mmap_prepare) - return compat_vma_mmap_prepare(file, vma); + return compat_vma_mmap(file, vma); return file->f_op->mmap(file, vma); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 4441ceec913f..2d060081caa5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3608,6 +3608,80 @@ static inline unsigned long vma_desc_pages(const struct vm_area_desc *desc) return vma_desc_size(desc) >> PAGE_SHIFT; } +/** + * mmap_action_remap - helper for mmap_prepare hook to specify that a pure PFN + * remap is required. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start: The virtual address to start the remap from, must be within the VMA. + * @start_pfn: The first PFN in the range to remap. + * @size: The size of the range to remap, in bytes, at most spanning to the end + * of the VMA. + */ +static inline void mmap_action_remap(struct vm_area_desc *desc, + unsigned long start, + unsigned long start_pfn, + unsigned long size) +{ + struct mmap_action *action = &desc->action; + + /* [start, start + size) must be within the VMA. */ + WARN_ON_ONCE(start < desc->start || start >= desc->end); + WARN_ON_ONCE(start + size > desc->end); + + action->type = MMAP_REMAP_PFN; + action->remap.start = start; + action->remap.start_pfn = start_pfn; + action->remap.size = size; + action->remap.pgprot = desc->page_prot; +} + +/** + * mmap_action_remap_full - helper for mmap_prepare hook to specify that the + * entirety of a VMA should be PFN remapped. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start_pfn: The first PFN in the range to remap. + */ +static inline void mmap_action_remap_full(struct vm_area_desc *desc, + unsigned long start_pfn) +{ + mmap_action_remap(desc, desc->start, start_pfn, vma_desc_size(desc)); +} + +/** + * mmap_action_ioremap - helper for mmap_prepare hook to specify that a pure PFN + * I/O remap is required. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start: The virtual address to start the remap from, must be within the VMA. + * @start_pfn: The first PFN in the range to remap. + * @size: The size of the range to remap, in bytes, at most spanning to the end + * of the VMA. + */ +static inline void mmap_action_ioremap(struct vm_area_desc *desc, + unsigned long start, + unsigned long start_pfn, + unsigned long size) +{ + mmap_action_remap(desc, start, start_pfn, size); + desc->action.type = MMAP_IO_REMAP_PFN; +} + +/** + * mmap_action_ioremap_full - helper for mmap_prepare hook to specify that the + * entirety of a VMA should be PFN I/O remapped. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start_pfn: The first PFN in the range to remap. + */ +static inline void mmap_action_ioremap_full(struct vm_area_desc *desc, + unsigned long start_pfn) +{ + mmap_action_ioremap(desc, desc->start, start_pfn, vma_desc_size(desc)); +} + +void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc); +int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma); + /* Look up the first VMA which exactly match the interval vm_start ... vm_end */ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 90e5790c318f..5021047485a9 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -773,6 +773,56 @@ struct pfnmap_track_ctx { }; #endif +/* What action should be taken after an .mmap_prepare call is complete? */ +enum mmap_action_type { + MMAP_NOTHING, /* Mapping is complete, no further action. */ + MMAP_REMAP_PFN, /* Remap PFN range. */ + MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ +}; + +/* + * Describes an action an mmap_prepare hook can instruct to be taken to complete + * the mapping of a VMA. Specified in vm_area_desc. + */ +struct mmap_action { + union { + /* Remap range. */ + struct { + unsigned long start; + unsigned long start_pfn; + unsigned long size; + pgprot_t pgprot; + } remap; + }; + enum mmap_action_type type; + + /* + * If specified, this hook is invoked after the selected action has been + * successfully completed. Note that the VMA write lock still held. + * + * The absolute minimum ought to be done here. + * + * Returns 0 on success, or an error code. + */ + int (*success_hook)(const struct vm_area_struct *vma); + + /* + * If specified, this hook is invoked when an error occurred when + * attempting the selection action. + * + * The hook can return an error code in order to filter the error, but + * it is not valid to clear the error here. + */ + int (*error_hook)(int err); + + /* + * This should be set in rare instances where the operation required + * that the rmap should not be able to access the VMA until + * completely set up. + */ + bool hide_from_rmap_until_complete :1; +}; + /* * Describes a VMA that is about to be mmap()'ed. Drivers may choose to * manipulate mutable fields which will cause those fields to be updated in the @@ -796,6 +846,9 @@ struct vm_area_desc { /* Write-only fields. */ const struct vm_operations_struct *vm_ops; void *private_data; + + /* Take further action? */ + struct mmap_action action; }; /* diff --git a/mm/util.c b/mm/util.c index 8989d5767528..97cae40c0209 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1135,7 +1135,7 @@ EXPORT_SYMBOL(flush_dcache_folio); #endif /** - * __compat_vma_mmap_prepare() - See description for compat_vma_mmap_prepare() + * __compat_vma_mmap() - See description for compat_vma_mmap() * for details. This is the same operation, only with a specific file operations * struct which may or may not be the same as vma->vm_file->f_op. * @f_op: The file operations whose .mmap_prepare() hook is specified. @@ -1143,7 +1143,7 @@ EXPORT_SYMBOL(flush_dcache_folio); * @vma: The VMA to apply the .mmap_prepare() hook to. * Returns: 0 on success or error. */ -int __compat_vma_mmap_prepare(const struct file_operations *f_op, +int __compat_vma_mmap(const struct file_operations *f_op, struct file *file, struct vm_area_struct *vma) { struct vm_area_desc desc = { @@ -1156,21 +1156,24 @@ int __compat_vma_mmap_prepare(const struct file_operations *f_op, .vm_file = vma->vm_file, .vm_flags = vma->vm_flags, .page_prot = vma->vm_page_prot, + + .action.type = MMAP_NOTHING, /* Default */ }; int err; err = f_op->mmap_prepare(&desc); if (err) return err; - set_vma_from_desc(vma, &desc); - return 0; + mmap_action_prepare(&desc.action, &desc); + set_vma_from_desc(vma, &desc); + return mmap_action_complete(&desc.action, vma); } -EXPORT_SYMBOL(__compat_vma_mmap_prepare); +EXPORT_SYMBOL(__compat_vma_mmap); /** - * compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an - * existing VMA. + * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an + * existing VMA and execute any requested actions. * @file: The file which possesss an f_op->mmap_prepare() hook. * @vma: The VMA to apply the .mmap_prepare() hook to. * @@ -1185,7 +1188,7 @@ EXPORT_SYMBOL(__compat_vma_mmap_prepare); * .mmap_prepare() hook, as we are in a different context when we invoke the * .mmap() hook, already having a VMA to deal with. * - * compat_vma_mmap_prepare() is a compatibility function that takes VMA state, + * compat_vma_mmap() is a compatibility function that takes VMA state, * establishes a struct vm_area_desc descriptor, passes to the underlying * .mmap_prepare() hook and applies any changes performed by it. * @@ -1194,11 +1197,11 @@ EXPORT_SYMBOL(__compat_vma_mmap_prepare); * * Returns: 0 on success or error. */ -int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma) +int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) { - return __compat_vma_mmap_prepare(file->f_op, file, vma); + return __compat_vma_mmap(file->f_op, file, vma); } -EXPORT_SYMBOL(compat_vma_mmap_prepare); +EXPORT_SYMBOL(compat_vma_mmap); static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio, const struct page *page) @@ -1280,6 +1283,127 @@ again: } } +static int mmap_action_finish(struct mmap_action *action, + const struct vm_area_struct *vma, int err) +{ + /* + * If an error occurs, unmap the VMA altogether and return an error. We + * only clear the newly allocated VMA, since this function is only + * invoked if we do NOT merge, so we only clean up the VMA we created. + */ + if (err) { + const size_t len = vma_pages(vma) << PAGE_SHIFT; + + do_munmap(current->mm, vma->vm_start, len, NULL); + + if (action->error_hook) { + /* We may want to filter the error. */ + err = action->error_hook(err); + + /* The caller should not clear the error. */ + VM_WARN_ON_ONCE(!err); + } + return err; + } + + if (action->success_hook) + return action->success_hook(vma); + + return 0; +} + +#ifdef CONFIG_MMU +/** + * mmap_action_prepare - Perform preparatory setup for an VMA descriptor + * action which need to be performed. + * @desc: The VMA descriptor to prepare for @action. + * @action: The action to perform. + */ +void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc) +{ + switch (action->type) { + case MMAP_NOTHING: + break; + case MMAP_REMAP_PFN: + remap_pfn_range_prepare(desc, action->remap.start_pfn); + break; + case MMAP_IO_REMAP_PFN: + io_remap_pfn_range_prepare(desc, action->remap.start_pfn, + action->remap.size); + break; + } +} +EXPORT_SYMBOL(mmap_action_prepare); + +/** + * mmap_action_complete - Execute VMA descriptor action. + * @action: The action to perform. + * @vma: The VMA to perform the action upon. + * + * Similar to mmap_action_prepare(). + * + * Return: 0 on success, or error, at which point the VMA will be unmapped. + */ +int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma) +{ + int err = 0; + + switch (action->type) { + case MMAP_NOTHING: + break; + case MMAP_REMAP_PFN: + err = remap_pfn_range_complete(vma, action->remap.start, + action->remap.start_pfn, action->remap.size, + action->remap.pgprot); + break; + case MMAP_IO_REMAP_PFN: + err = io_remap_pfn_range_complete(vma, action->remap.start, + action->remap.start_pfn, action->remap.size, + action->remap.pgprot); + break; + } + + return mmap_action_finish(action, vma, err); +} +EXPORT_SYMBOL(mmap_action_complete); +#else +void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc) +{ + switch (action->type) { + case MMAP_NOTHING: + break; + case MMAP_REMAP_PFN: + case MMAP_IO_REMAP_PFN: + WARN_ON_ONCE(1); /* nommu cannot handle these. */ + break; + } +} +EXPORT_SYMBOL(mmap_action_prepare); + +int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma) +{ + int err = 0; + + switch (action->type) { + case MMAP_NOTHING: + break; + case MMAP_REMAP_PFN: + case MMAP_IO_REMAP_PFN: + WARN_ON_ONCE(1); /* nommu cannot handle this. */ + + err = -EINVAL; + break; + } + + return mmap_action_finish(action, vma, err); +} +EXPORT_SYMBOL(mmap_action_complete); +#endif + #ifdef CONFIG_MMU /** * folio_pte_batch - detect a PTE batch for a large folio diff --git a/mm/vma.c b/mm/vma.c index eb2f711c03a1..919d1fc63a52 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -34,7 +34,9 @@ struct mmap_state { struct maple_tree mt_detach; /* Determine if we can check KSM flags early in mmap() logic. */ - bool check_ksm_early; + bool check_ksm_early :1; + /* If we map new, hold the file rmap lock on mapping. */ + bool hold_file_rmap_lock :1; }; #define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vm_flags_, file_) \ @@ -1754,7 +1756,7 @@ void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb) unlink_file_vma_batch_process(vb); } -static void vma_link_file(struct vm_area_struct *vma) +static void vma_link_file(struct vm_area_struct *vma, bool hold_rmap_lock) { struct file *file = vma->vm_file; struct address_space *mapping; @@ -1763,7 +1765,8 @@ static void vma_link_file(struct vm_area_struct *vma) mapping = file->f_mapping; i_mmap_lock_write(mapping); __vma_link_file(vma, mapping); - i_mmap_unlock_write(mapping); + if (!hold_rmap_lock) + i_mmap_unlock_write(mapping); } } @@ -1777,7 +1780,7 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) vma_start_write(vma); vma_iter_store_new(&vmi, vma); - vma_link_file(vma); + vma_link_file(vma, /* hold_rmap_lock= */false); mm->map_count++; validate_mm(mm); return 0; @@ -2311,17 +2314,33 @@ static void update_ksm_flags(struct mmap_state *map) map->vm_flags = ksm_vma_flags(map->mm, map->file, map->vm_flags); } +static void set_desc_from_map(struct vm_area_desc *desc, + const struct mmap_state *map) +{ + desc->start = map->addr; + desc->end = map->end; + + desc->pgoff = map->pgoff; + desc->vm_file = map->file; + desc->vm_flags = map->vm_flags; + desc->page_prot = map->page_prot; +} + /* * __mmap_setup() - Prepare to gather any overlapping VMAs that need to be * unmapped once the map operation is completed, check limits, account mapping * and clean up any pre-existing VMAs. * + * As a result it sets up the @map and @desc objects. + * * @map: Mapping state. + * @desc: VMA descriptor * @uf: Userfaultfd context list. * * Returns: 0 on success, error code otherwise. */ -static int __mmap_setup(struct mmap_state *map, struct list_head *uf) +static int __mmap_setup(struct mmap_state *map, struct vm_area_desc *desc, + struct list_head *uf) { int error; struct vma_iterator *vmi = map->vmi; @@ -2378,6 +2397,7 @@ static int __mmap_setup(struct mmap_state *map, struct list_head *uf) */ vms_clean_up_area(vms, &map->mas_detach); + set_desc_from_map(desc, map); return 0; } @@ -2479,7 +2499,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) vma_start_write(vma); vma_iter_store_new(vmi, vma); map->mm->map_count++; - vma_link_file(vma); + vma_link_file(vma, map->hold_file_rmap_lock); /* * vma_merge_new_range() calls khugepaged_enter_vma() too, the below @@ -2539,6 +2559,17 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) vma_set_page_prot(vma); } +static void call_action_prepare(struct mmap_state *map, + struct vm_area_desc *desc) +{ + struct mmap_action *action = &desc->action; + + mmap_action_prepare(action, desc); + + if (action->hide_from_rmap_until_complete) + map->hold_file_rmap_lock = true; +} + /* * Invoke the f_op->mmap_prepare() callback for a file-backed mapping that * specifies it. @@ -2550,34 +2581,26 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) * * Returns 0 on success, or an error code otherwise. */ -static int call_mmap_prepare(struct mmap_state *map) +static int call_mmap_prepare(struct mmap_state *map, + struct vm_area_desc *desc) { int err; - struct vm_area_desc desc = { - .mm = map->mm, - .file = map->file, - .start = map->addr, - .end = map->end, - - .pgoff = map->pgoff, - .vm_file = map->file, - .vm_flags = map->vm_flags, - .page_prot = map->page_prot, - }; /* Invoke the hook. */ - err = vfs_mmap_prepare(map->file, &desc); + err = vfs_mmap_prepare(map->file, desc); if (err) return err; + call_action_prepare(map, desc); + /* Update fields permitted to be changed. */ - map->pgoff = desc.pgoff; - map->file = desc.vm_file; - map->vm_flags = desc.vm_flags; - map->page_prot = desc.page_prot; + map->pgoff = desc->pgoff; + map->file = desc->vm_file; + map->vm_flags = desc->vm_flags; + map->page_prot = desc->page_prot; /* User-defined fields. */ - map->vm_ops = desc.vm_ops; - map->vm_private_data = desc.private_data; + map->vm_ops = desc->vm_ops; + map->vm_private_data = desc->private_data; return 0; } @@ -2619,22 +2642,48 @@ static bool can_set_ksm_flags_early(struct mmap_state *map) return false; } +static int call_action_complete(struct mmap_state *map, + struct vm_area_desc *desc, + struct vm_area_struct *vma) +{ + struct mmap_action *action = &desc->action; + int ret; + + ret = mmap_action_complete(action, vma); + + /* If we held the file rmap we need to release it. */ + if (map->hold_file_rmap_lock) { + struct file *file = vma->vm_file; + + i_mmap_unlock_write(file->f_mapping); + } + return ret; +} + static unsigned long __mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; - int error; bool have_mmap_prepare = file && file->f_op->mmap_prepare; VMA_ITERATOR(vmi, mm, addr); MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file); + struct vm_area_desc desc = { + .mm = mm, + .file = file, + .action = { + .type = MMAP_NOTHING, /* Default to no further action. */ + }, + }; + bool allocated_new = false; + int error; map.check_ksm_early = can_set_ksm_flags_early(&map); - error = __mmap_setup(&map, uf); + error = __mmap_setup(&map, &desc, uf); if (!error && have_mmap_prepare) - error = call_mmap_prepare(&map); + error = call_mmap_prepare(&map, &desc); if (error) goto abort_munmap; @@ -2653,6 +2702,7 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, error = __mmap_new_vma(&map, &vma); if (error) goto unacct_error; + allocated_new = true; } if (have_mmap_prepare) @@ -2660,6 +2710,13 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, __mmap_complete(&map, vma); + if (have_mmap_prepare && allocated_new) { + error = call_action_complete(&map, &desc, vma); + + if (error) + return error; + } + return addr; /* Accounting was done by __mmap_setup(). */ diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index dc976a285ad2..d873667704e8 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -275,6 +275,57 @@ struct mm_struct { struct vm_area_struct; + +/* What action should be taken after an .mmap_prepare call is complete? */ +enum mmap_action_type { + MMAP_NOTHING, /* Mapping is complete, no further action. */ + MMAP_REMAP_PFN, /* Remap PFN range. */ + MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ +}; + +/* + * Describes an action an mmap_prepare hook can instruct to be taken to complete + * the mapping of a VMA. Specified in vm_area_desc. + */ +struct mmap_action { + union { + /* Remap range. */ + struct { + unsigned long start; + unsigned long start_pfn; + unsigned long size; + pgprot_t pgprot; + } remap; + }; + enum mmap_action_type type; + + /* + * If specified, this hook is invoked after the selected action has been + * successfully completed. Note that the VMA write lock still held. + * + * The absolute minimum ought to be done here. + * + * Returns 0 on success, or an error code. + */ + int (*success_hook)(const struct vm_area_struct *vma); + + /* + * If specified, this hook is invoked when an error occurred when + * attempting the selection action. + * + * The hook can return an error code in order to filter the error, but + * it is not valid to clear the error here. + */ + int (*error_hook)(int err); + + /* + * This should be set in rare instances where the operation required + * that the rmap should not be able to access the VMA until + * completely set up. + */ + bool hide_from_rmap_until_complete :1; +}; + /* * Describes a VMA that is about to be mmap()'ed. Drivers may choose to * manipulate mutable fields which will cause those fields to be updated in the @@ -298,6 +349,9 @@ struct vm_area_desc { /* Write-only fields. */ const struct vm_operations_struct *vm_ops; void *private_data; + + /* Take further action? */ + struct mmap_action action; }; struct file_operations { @@ -1326,12 +1380,23 @@ static inline void free_anon_vma_name(struct vm_area_struct *vma) static inline void set_vma_from_desc(struct vm_area_struct *vma, struct vm_area_desc *desc); -static inline int __compat_vma_mmap_prepare(const struct file_operations *f_op, +static inline void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc) +{ +} + +static inline int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma) +{ + return 0; +} + +static inline int __compat_vma_mmap(const struct file_operations *f_op, struct file *file, struct vm_area_struct *vma) { struct vm_area_desc desc = { .mm = vma->vm_mm, - .file = vma->vm_file, + .file = file, .start = vma->vm_start, .end = vma->vm_end, @@ -1339,21 +1404,24 @@ static inline int __compat_vma_mmap_prepare(const struct file_operations *f_op, .vm_file = vma->vm_file, .vm_flags = vma->vm_flags, .page_prot = vma->vm_page_prot, + + .action.type = MMAP_NOTHING, /* Default */ }; int err; err = f_op->mmap_prepare(&desc); if (err) return err; - set_vma_from_desc(vma, &desc); - return 0; + mmap_action_prepare(&desc.action, &desc); + set_vma_from_desc(vma, &desc); + return mmap_action_complete(&desc.action, vma); } -static inline int compat_vma_mmap_prepare(struct file *file, +static inline int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) { - return __compat_vma_mmap_prepare(file->f_op, file, vma); + return __compat_vma_mmap(file->f_op, file, vma); } /* Did the driver provide valid mmap hook configuration? */ @@ -1374,7 +1442,7 @@ static inline bool can_mmap_file(struct file *file) static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) { if (file->f_op->mmap_prepare) - return compat_vma_mmap_prepare(file, vma); + return compat_vma_mmap(file, vma); return file->f_op->mmap(file, vma); } @@ -1407,4 +1475,20 @@ static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm, return vm_flags; } +static inline void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn) +{ +} + +static inline int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t pgprot) +{ + return 0; +} + +static inline int do_munmap(struct mm_struct *, unsigned long, size_t, + struct list_head *uf) +{ + return 0; +} + #endif /* __MM_VMA_INTERNAL_H */ -- cgit v1.2.3 From ea52cb24cd3fb121283754ab82b2cb3044609359 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:29 +0100 Subject: mm/hugetlbfs: update hugetlbfs to use mmap_prepare Since we can now perform actions after the VMA is established via mmap_prepare, use desc->action_success_hook to set up the hugetlb lock once the VMA is setup. We also make changes throughout hugetlbfs to make this possible. Note that we must hide newly established hugetlb VMAs from the rmap until the operation is entirely complete as we establish a hugetlb lock during VMA setup that can be raced by rmap users. Link: https://lkml.kernel.org/r/b1afa16d3cfa585a03df9ae215ae9f905b3f0ed7.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jason Gunthorpe Tested-by: Sumanth Korikkar Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Pedro Falcato Cc: Robin Murohy Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 46 ++++++++++++++++++------- include/linux/hugetlb.h | 9 +++-- include/linux/hugetlb_inline.h | 15 +++++--- mm/hugetlb.c | 77 ++++++++++++++++++++++++------------------ 4 files changed, 95 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index ce8e40d35032..3919fca56553 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -96,8 +96,15 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = { #define PGOFF_LOFFT_MAX \ (((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1))) -static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) +static int hugetlb_file_mmap_prepare_success(const struct vm_area_struct *vma) { + /* Unfortunate we have to reassign vma->vm_private_data. */ + return hugetlb_vma_lock_alloc((struct vm_area_struct *)vma); +} + +static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) +{ + struct file *file = desc->file; struct inode *inode = file_inode(file); loff_t len, vma_len; int ret; @@ -112,8 +119,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * way when do_mmap unwinds (may be important on powerpc * and ia64). */ - vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND); - vma->vm_ops = &hugetlb_vm_ops; + desc->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; + desc->vm_ops = &hugetlb_vm_ops; /* * page based offset in vm_pgoff could be sufficiently large to @@ -122,16 +129,16 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * sizeof(unsigned long). So, only check in those instances. */ if (sizeof(unsigned long) == sizeof(loff_t)) { - if (vma->vm_pgoff & PGOFF_LOFFT_MAX) + if (desc->pgoff & PGOFF_LOFFT_MAX) return -EINVAL; } /* must be huge page aligned */ - if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) + if (desc->pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) return -EINVAL; - vma_len = (loff_t)(vma->vm_end - vma->vm_start); - len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + vma_len = (loff_t)vma_desc_size(desc); + len = vma_len + ((loff_t)desc->pgoff << PAGE_SHIFT); /* check for overflow */ if (len < vma_len) return -EINVAL; @@ -141,7 +148,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) ret = -ENOMEM; - vm_flags = vma->vm_flags; + vm_flags = desc->vm_flags; /* * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip * reserving here. Note: only for SHM hugetlbfs file, the inode @@ -151,17 +158,30 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vm_flags |= VM_NORESERVE; if (hugetlb_reserve_pages(inode, - vma->vm_pgoff >> huge_page_order(h), - len >> huge_page_shift(h), vma, - vm_flags) < 0) + desc->pgoff >> huge_page_order(h), + len >> huge_page_shift(h), desc, + vm_flags) < 0) goto out; ret = 0; - if (vma->vm_flags & VM_WRITE && inode->i_size < len) + if ((desc->vm_flags & VM_WRITE) && inode->i_size < len) i_size_write(inode, len); out: inode_unlock(inode); + if (!ret) { + /* Allocate the VMA lock after we set it up. */ + desc->action.success_hook = hugetlb_file_mmap_prepare_success; + /* + * We cannot permit the rmap finding this VMA in the time + * between the VMA being inserted into the VMA tree and the + * completion/success hook being invoked. + * + * This is because we establish a per-VMA hugetlb lock which can + * be raced by rmap. + */ + desc->action.hide_from_rmap_until_complete = true; + } return ret; } @@ -1220,7 +1240,7 @@ static void init_once(void *foo) static const struct file_operations hugetlbfs_file_operations = { .read_iter = hugetlbfs_read_iter, - .mmap = hugetlbfs_file_mmap, + .mmap_prepare = hugetlbfs_file_mmap_prepare, .fsync = noop_fsync, .get_unmapped_area = hugetlb_get_unmapped_area, .llseek = default_llseek, diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 8e63e46b8e1f..2387513d6ae5 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -150,8 +150,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct folio **foliop); #endif /* CONFIG_USERFAULTFD */ long hugetlb_reserve_pages(struct inode *inode, long from, long to, - struct vm_area_struct *vma, - vm_flags_t vm_flags); + struct vm_area_desc *desc, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); @@ -280,6 +279,7 @@ bool is_hugetlb_entry_hwpoisoned(pte_t pte); void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); void fixup_hugetlb_reservations(struct vm_area_struct *vma); void hugetlb_split(struct vm_area_struct *vma, unsigned long addr); +int hugetlb_vma_lock_alloc(struct vm_area_struct *vma); #else /* !CONFIG_HUGETLB_PAGE */ @@ -466,6 +466,11 @@ static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma) static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {} +static inline int hugetlb_vma_lock_alloc(struct vm_area_struct *vma) +{ + return 0; +} + #endif /* !CONFIG_HUGETLB_PAGE */ #ifndef pgd_write diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h index 0660a03d37d9..a27aa0162918 100644 --- a/include/linux/hugetlb_inline.h +++ b/include/linux/hugetlb_inline.h @@ -2,22 +2,27 @@ #ifndef _LINUX_HUGETLB_INLINE_H #define _LINUX_HUGETLB_INLINE_H -#ifdef CONFIG_HUGETLB_PAGE - #include -static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) +#ifdef CONFIG_HUGETLB_PAGE + +static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) { - return !!(vma->vm_flags & VM_HUGETLB); + return !!(vm_flags & VM_HUGETLB); } #else -static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) +static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) { return false; } #endif +static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) +{ + return is_vm_hugetlb_flags(vma->vm_flags); +} + #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7774c286b3b7..86e672fcb305 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -119,7 +119,6 @@ struct mutex *hugetlb_fault_mutex_table __ro_after_init; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); static void hugetlb_vma_lock_free(struct vm_area_struct *vma); -static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool take_locks); @@ -438,17 +437,21 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma) } } -static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) +/* + * vma specific semaphore used for pmd sharing and fault/truncation + * synchronization + */ +int hugetlb_vma_lock_alloc(struct vm_area_struct *vma) { struct hugetlb_vma_lock *vma_lock; /* Only establish in (flags) sharable vmas */ if (!vma || !(vma->vm_flags & VM_MAYSHARE)) - return; + return 0; /* Should never get here with non-NULL vm_private_data */ if (vma->vm_private_data) - return; + return -EINVAL; vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL); if (!vma_lock) { @@ -463,13 +466,15 @@ static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) * allocation failure. */ pr_warn_once("HugeTLB: unable to allocate vma specific lock\n"); - return; + return -EINVAL; } kref_init(&vma_lock->refs); init_rwsem(&vma_lock->rw_sema); vma_lock->vma = vma; vma->vm_private_data = vma_lock; + + return 0; } /* Helper that removes a struct file_region from the resv_map cache and returns @@ -1201,20 +1206,28 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma) } } -static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) +static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) { - VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); - VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); + VM_WARN_ON_ONCE_VMA(!is_vm_hugetlb_page(vma), vma); + VM_WARN_ON_ONCE_VMA(vma->vm_flags & VM_MAYSHARE, vma); - set_vma_private_data(vma, (unsigned long)map); + set_vma_private_data(vma, get_vma_private_data(vma) | flags); } -static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) +static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map) { - VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); - VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); + VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); + VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE); - set_vma_private_data(vma, get_vma_private_data(vma) | flags); + desc->private_data = map; +} + +static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags) +{ + VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); + VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE); + + desc->private_data = (void *)((unsigned long)desc->private_data | flags); } static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) @@ -1224,6 +1237,13 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) return (get_vma_private_data(vma) & flag) != 0; } +static bool is_vma_desc_resv_set(struct vm_area_desc *desc, unsigned long flag) +{ + VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); + + return ((unsigned long)desc->private_data) & flag; +} + bool __vma_private_lock(struct vm_area_struct *vma) { return !(vma->vm_flags & VM_MAYSHARE) && @@ -7270,9 +7290,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma, */ long hugetlb_reserve_pages(struct inode *inode, - long from, long to, - struct vm_area_struct *vma, - vm_flags_t vm_flags) + long from, long to, + struct vm_area_desc *desc, + vm_flags_t vm_flags) { long chg = -1, add = -1, spool_resv, gbl_resv; struct hstate *h = hstate_inode(inode); @@ -7287,12 +7307,6 @@ long hugetlb_reserve_pages(struct inode *inode, return -EINVAL; } - /* - * vma specific semaphore used for pmd sharing and fault/truncation - * synchronization - */ - hugetlb_vma_lock_alloc(vma); - /* * Only apply hugepage reservation if asked. At fault time, an * attempt will be made for VM_NORESERVE to allocate a page @@ -7305,9 +7319,9 @@ long hugetlb_reserve_pages(struct inode *inode, * Shared mappings base their reservation on the number of pages that * are already allocated on behalf of the file. Private mappings need * to reserve the full area even if read-only as mprotect() may be - * called to make the mapping read-write. Assume !vma is a shm mapping + * called to make the mapping read-write. Assume !desc is a shm mapping */ - if (!vma || vma->vm_flags & VM_MAYSHARE) { + if (!desc || desc->vm_flags & VM_MAYSHARE) { /* * resv_map can not be NULL as hugetlb_reserve_pages is only * called for inodes for which resv_maps were created (see @@ -7324,8 +7338,8 @@ long hugetlb_reserve_pages(struct inode *inode, chg = to - from; - set_vma_resv_map(vma, resv_map); - set_vma_resv_flags(vma, HPAGE_RESV_OWNER); + set_vma_desc_resv_map(desc, resv_map); + set_vma_desc_resv_flags(desc, HPAGE_RESV_OWNER); } if (chg < 0) @@ -7335,7 +7349,7 @@ long hugetlb_reserve_pages(struct inode *inode, chg * pages_per_huge_page(h), &h_cg) < 0) goto out_err; - if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) { + if (desc && !(desc->vm_flags & VM_MAYSHARE) && h_cg) { /* For private mappings, the hugetlb_cgroup uncharge info hangs * of the resv_map. */ @@ -7369,7 +7383,7 @@ long hugetlb_reserve_pages(struct inode *inode, * consumed reservations are stored in the map. Hence, nothing * else has to be done for private mappings here */ - if (!vma || vma->vm_flags & VM_MAYSHARE) { + if (!desc || desc->vm_flags & VM_MAYSHARE) { add = region_add(resv_map, from, to, regions_needed, h, h_cg); if (unlikely(add < 0)) { @@ -7423,16 +7437,15 @@ out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); out_err: - hugetlb_vma_lock_free(vma); - if (!vma || vma->vm_flags & VM_MAYSHARE) + if (!desc || desc->vm_flags & VM_MAYSHARE) /* Only call region_abort if the region_chg succeeded but the * region_add failed or didn't run. */ if (chg >= 0 && add < 0) region_abort(resv_map, from, to, regions_needed); - if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + if (desc && is_vma_desc_resv_set(desc, HPAGE_RESV_OWNER)) { kref_put(&resv_map->refs, resv_map_release); - set_vma_resv_map(vma, NULL); + set_vma_desc_resv_map(desc, NULL); } return chg < 0 ? chg : add < 0 ? add : -EINVAL; } -- cgit v1.2.3 From 89646d9c748c0902600090f37ae585f3b99deb4d Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:30 +0100 Subject: mm: add shmem_zero_setup_desc() Add the ability to set up a shared anonymous mapping based on a VMA descriptor rather than a VMA. This is a prerequisite for converting to the char mm driver to use the mmap_prepare hook. Link: https://lkml.kernel.org/r/d9181517a7e3d6b014a5697c6990d3722c2c9fcd.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jason Gunthorpe Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Pedro Falcato Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 3 ++- mm/shmem.c | 41 +++++++++++++++++++++++++++++++++-------- 2 files changed, 35 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 0e47465ef0fd..5b368f9549d6 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -94,7 +94,8 @@ extern struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags); extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, loff_t size, unsigned long flags); -extern int shmem_zero_setup(struct vm_area_struct *); +int shmem_zero_setup(struct vm_area_struct *vma); +int shmem_zero_setup_desc(struct vm_area_desc *desc); extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts); diff --git a/mm/shmem.c b/mm/shmem.c index 8b9fcdd144c8..da1df4270309 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -5880,14 +5880,9 @@ struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, } EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt); -/** - * shmem_zero_setup - setup a shared anonymous mapping - * @vma: the vma to be mmapped is prepared by do_mmap - */ -int shmem_zero_setup(struct vm_area_struct *vma) +static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, vm_flags_t vm_flags) { - struct file *file; - loff_t size = vma->vm_end - vma->vm_start; + loff_t size = end - start; /* * Cloning a new file under mmap_lock leads to a lock ordering conflict @@ -5895,7 +5890,18 @@ int shmem_zero_setup(struct vm_area_struct *vma) * accessible to the user through its mapping, use S_PRIVATE flag to * bypass file security, in the same way as shmem_kernel_file_setup(). */ - file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags); + return shmem_kernel_file_setup("dev/zero", size, vm_flags); +} + +/** + * shmem_zero_setup - setup a shared anonymous mapping + * @vma: the vma to be mmapped is prepared by do_mmap + * Returns: 0 on success, or error + */ +int shmem_zero_setup(struct vm_area_struct *vma) +{ + struct file *file = __shmem_zero_setup(vma->vm_start, vma->vm_end, vma->vm_flags); + if (IS_ERR(file)) return PTR_ERR(file); @@ -5907,6 +5913,25 @@ int shmem_zero_setup(struct vm_area_struct *vma) return 0; } +/** + * shmem_zero_setup_desc - same as shmem_zero_setup, but determined by VMA + * descriptor for convenience. + * @desc: Describes VMA + * Returns: 0 on success, or error + */ +int shmem_zero_setup_desc(struct vm_area_desc *desc) +{ + struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vm_flags); + + if (IS_ERR(file)) + return PTR_ERR(file); + + desc->vm_file = file; + desc->vm_ops = &shmem_anon_vm_ops; + + return 0; +} + /** * shmem_read_folio_gfp - read into page cache, using specified page allocation flags. * @mapping: the folio's address_space -- cgit v1.2.3 From 5ff592bec75ad79ed7f1a817477ab6eef8dc5efc Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 21 Oct 2025 16:44:25 -0700 Subject: memcg: manually uninline __memcg_memory_event __memcg_memory_event() has been unnecessarily marked inline even when it is not really performance critical. It is usually called to track extreme conditions. Over the time, it has evolved to include more functionality and inlining it is causing more harm. Before the patch: $ size mm/memcontrol.o net/ipv4/tcp_input.o net/ipv4/tcp_output.o text data bss dec hex filename 35645 10574 4192 50411 c4eb mm/memcontrol.o 54738 1658 0 56396 dc4c net/ipv4/tcp_input.o 34644 1065 0 35709 8b7d net/ipv4/tcp_output.o After the patch: $ size mm/memcontrol.o net/ipv4/tcp_input.o net/ipv4/tcp_output.o text data bss dec hex filename 35137 10446 4192 49775 c26f mm/memcontrol.o 54322 1562 0 55884 da4c net/ipv4/tcp_input.o 34492 1017 0 35509 8ab5 net/ipv4/tcp_output.o [akpm@linux-foundation.org: use EXPORT_SYMBOL_GPL for __memcg_memory_event, per Michal and Christoph] Link: https://lkml.kernel.org/r/20251021234425.1885471-1-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: SeongJae Park Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Muchun Song Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 32 ++------------------------------ mm/memcontrol.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5fe254813123..8c0f15e5978f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1002,36 +1002,8 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, count_memcg_events_mm(mm, idx, 1); } -static inline void __memcg_memory_event(struct mem_cgroup *memcg, - enum memcg_memory_event event, - bool allow_spinning) -{ - bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX || - event == MEMCG_SWAP_FAIL; - - /* For now only MEMCG_MAX can happen with !allow_spinning context. */ - VM_WARN_ON_ONCE(!allow_spinning && event != MEMCG_MAX); - - atomic_long_inc(&memcg->memory_events_local[event]); - if (!swap_event && allow_spinning) - cgroup_file_notify(&memcg->events_local_file); - - do { - atomic_long_inc(&memcg->memory_events[event]); - if (allow_spinning) { - if (swap_event) - cgroup_file_notify(&memcg->swap_events_file); - else - cgroup_file_notify(&memcg->events_file); - } - - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - break; - if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) - break; - } while ((memcg = parent_mem_cgroup(memcg)) && - !mem_cgroup_is_root(memcg)); -} +void __memcg_memory_event(struct mem_cgroup *memcg, + enum memcg_memory_event event, bool allow_spinning); static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 976412c8196e..025da46d9959 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1626,6 +1626,37 @@ unsigned long mem_cgroup_size(struct mem_cgroup *memcg) return page_counter_read(&memcg->memory); } +void __memcg_memory_event(struct mem_cgroup *memcg, + enum memcg_memory_event event, bool allow_spinning) +{ + bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX || + event == MEMCG_SWAP_FAIL; + + /* For now only MEMCG_MAX can happen with !allow_spinning context. */ + VM_WARN_ON_ONCE(!allow_spinning && event != MEMCG_MAX); + + atomic_long_inc(&memcg->memory_events_local[event]); + if (!swap_event && allow_spinning) + cgroup_file_notify(&memcg->events_local_file); + + do { + atomic_long_inc(&memcg->memory_events[event]); + if (allow_spinning) { + if (swap_event) + cgroup_file_notify(&memcg->swap_events_file); + else + cgroup_file_notify(&memcg->events_file); + } + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + break; + if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) + break; + } while ((memcg = parent_mem_cgroup(memcg)) && + !mem_cgroup_is_root(memcg)); +} +EXPORT_SYMBOL_GPL(__memcg_memory_event); + static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, int order) { -- cgit v1.2.3 From 27bfafac65d87c58639f5d7af1353ec1e7886963 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 22 Oct 2025 16:26:28 +0800 Subject: mm: add a ptdesc flag to mark kernel page tables The page tables used to map the kernel and userspace often have very different handling rules. There are frequently *_kernel() variants of functions just for kernel page tables. That's not great and has lead to code duplication. Instead of having completely separate call paths, allow a 'ptdesc' to be marked as being for kernel mappings. Introduce helpers to set and clear this status. Note: this uses the PG_referenced bit. Page flags are a great fit for this since it is truly a single bit of information. Use PG_referenced itself because it's a fairly benign flag (as opposed to things like PG_lock). It's also (according to Willy) unlikely to go away any time soon. PG_referenced is not in PAGE_FLAGS_CHECK_AT_FREE. It does not need to be cleared before freeing the page, and pages coming out of the allocator should have it cleared. Regardless, introduce an API to clear it anyway. Having symmetry in the API makes it easier to change the underlying implementation later, like if there was a need to move to a PAGE_FLAGS_CHECK_AT_FREE bit. Link: https://lkml.kernel.org/r/20251022082635.2462433-3-baolu.lu@linux.intel.com Signed-off-by: Dave Hansen Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Cc: Alistair Popple Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Ingo Molnar Cc: Jann Horn Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Peter Zijlstra Cc: Robin Murohy Cc: Thomas Gleinxer Cc: "Uladzislau Rezki (Sony)" Cc: Vasant Hegde Cc: Vinicius Costa Gomes Cc: Vlastimil Babka Cc: Will Deacon Cc: Yi Lai Signed-off-by: Andrew Morton --- include/linux/mm.h | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2d060081caa5..5c887c4ea29e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2962,6 +2962,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a #endif /* CONFIG_MMU */ enum pt_flags { + PT_kernel = PG_referenced, PT_reserved = PG_reserved, /* High bits are used for zone/node/section */ }; @@ -2987,6 +2988,46 @@ static inline bool pagetable_is_reserved(struct ptdesc *pt) return test_bit(PT_reserved, &pt->pt_flags.f); } +/** + * ptdesc_set_kernel - Mark a ptdesc used to map the kernel + * @ptdesc: The ptdesc to be marked + * + * Kernel page tables often need special handling. Set a flag so that + * the handling code knows this ptdesc will not be used for userspace. + */ +static inline void ptdesc_set_kernel(struct ptdesc *ptdesc) +{ + set_bit(PT_kernel, &ptdesc->pt_flags.f); +} + +/** + * ptdesc_clear_kernel - Mark a ptdesc as no longer used to map the kernel + * @ptdesc: The ptdesc to be unmarked + * + * Use when the ptdesc is no longer used to map the kernel and no longer + * needs special handling. + */ +static inline void ptdesc_clear_kernel(struct ptdesc *ptdesc) +{ + /* + * Note: the 'PG_referenced' bit does not strictly need to be + * cleared before freeing the page. But this is nice for + * symmetry. + */ + clear_bit(PT_kernel, &ptdesc->pt_flags.f); +} + +/** + * ptdesc_test_kernel - Check if a ptdesc is used to map the kernel + * @ptdesc: The ptdesc being tested + * + * Call to tell if the ptdesc used to map the kernel. + */ +static inline bool ptdesc_test_kernel(const struct ptdesc *ptdesc) +{ + return test_bit(PT_kernel, &ptdesc->pt_flags.f); +} + /** * pagetable_alloc - Allocate pagetables * @gfp: GFP flags -- cgit v1.2.3 From 977870522af34359b461060597ee3a86f27450d6 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 22 Oct 2025 16:26:29 +0800 Subject: mm: actually mark kernel page table pages Now that the API is in place, mark kernel page table pages just after they are allocated. Unmark them just before they are freed. Note: Unconditionally clearing the 'kernel' marking (via ptdesc_clear_kernel()) would be functionally identical to what is here. But having the if() makes it logically clear that this function can be used for kernel and non-kernel page tables. Link: https://lkml.kernel.org/r/20251022082635.2462433-4-baolu.lu@linux.intel.com Signed-off-by: Dave Hansen Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Cc: Alistair Popple Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Ingo Molnar Cc: Jann Horn Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Peter Zijlstra Cc: Robin Murohy Cc: Thomas Gleinxer Cc: "Uladzislau Rezki (Sony)" Cc: Vasant Hegde Cc: Vinicius Costa Gomes Cc: Vlastimil Babka Cc: Will Deacon Cc: Yi Lai Signed-off-by: Andrew Morton --- include/asm-generic/pgalloc.h | 18 ++++++++++++++++++ include/linux/mm.h | 3 +++ 2 files changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 3c8ec3bfea44..b9d2a7c79b93 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -28,6 +28,8 @@ static inline pte_t *__pte_alloc_one_kernel_noprof(struct mm_struct *mm) return NULL; } + ptdesc_set_kernel(ptdesc); + return ptdesc_address(ptdesc); } #define __pte_alloc_one_kernel(...) alloc_hooks(__pte_alloc_one_kernel_noprof(__VA_ARGS__)) @@ -146,6 +148,10 @@ static inline pmd_t *pmd_alloc_one_noprof(struct mm_struct *mm, unsigned long ad pagetable_free(ptdesc); return NULL; } + + if (mm == &init_mm) + ptdesc_set_kernel(ptdesc); + return ptdesc_address(ptdesc); } #define pmd_alloc_one(...) alloc_hooks(pmd_alloc_one_noprof(__VA_ARGS__)) @@ -179,6 +185,10 @@ static inline pud_t *__pud_alloc_one_noprof(struct mm_struct *mm, unsigned long return NULL; pagetable_pud_ctor(ptdesc); + + if (mm == &init_mm) + ptdesc_set_kernel(ptdesc); + return ptdesc_address(ptdesc); } #define __pud_alloc_one(...) alloc_hooks(__pud_alloc_one_noprof(__VA_ARGS__)) @@ -233,6 +243,10 @@ static inline p4d_t *__p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long return NULL; pagetable_p4d_ctor(ptdesc); + + if (mm == &init_mm) + ptdesc_set_kernel(ptdesc); + return ptdesc_address(ptdesc); } #define __p4d_alloc_one(...) alloc_hooks(__p4d_alloc_one_noprof(__VA_ARGS__)) @@ -277,6 +291,10 @@ static inline pgd_t *__pgd_alloc_noprof(struct mm_struct *mm, unsigned int order return NULL; pagetable_pgd_ctor(ptdesc); + + if (mm == &init_mm) + ptdesc_set_kernel(ptdesc); + return ptdesc_address(ptdesc); } #define __pgd_alloc(...) alloc_hooks(__pgd_alloc_noprof(__VA_ARGS__)) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5c887c4ea29e..8f46048875a7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3057,6 +3057,9 @@ static inline void pagetable_free(struct ptdesc *pt) { struct page *page = ptdesc_page(pt); + if (ptdesc_test_kernel(pt)) + ptdesc_clear_kernel(pt); + __free_pages(page, compound_order(page)); } -- cgit v1.2.3 From 01894295672335ff304beed4359f30d14d5765f2 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 22 Oct 2025 16:26:31 +0800 Subject: mm: introduce pure page table freeing function The pages used for ptdescs are currently freed back to the allocator in a single location. They will shortly be freed from a second location. Create a simple helper that just frees them back to the allocator. Link: https://lkml.kernel.org/r/20251022082635.2462433-6-baolu.lu@linux.intel.com Signed-off-by: Dave Hansen Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Cc: Alistair Popple Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Ingo Molnar Cc: Jann Horn Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Peter Zijlstra Cc: Robin Murohy Cc: Thomas Gleinxer Cc: "Uladzislau Rezki (Sony)" Cc: Vasant Hegde Cc: Vinicius Costa Gomes Cc: Vlastimil Babka Cc: Will Deacon Cc: Yi Lai Signed-off-by: Andrew Morton --- include/linux/mm.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8f46048875a7..88c0a0fae43a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3046,6 +3046,13 @@ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int orde } #define pagetable_alloc(...) alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__)) +static inline void __pagetable_free(struct ptdesc *pt) +{ + struct page *page = ptdesc_page(pt); + + __free_pages(page, compound_order(page)); +} + /** * pagetable_free - Free pagetables * @pt: The page table descriptor @@ -3055,12 +3062,10 @@ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int orde */ static inline void pagetable_free(struct ptdesc *pt) { - struct page *page = ptdesc_page(pt); - if (ptdesc_test_kernel(pt)) ptdesc_clear_kernel(pt); - __free_pages(page, compound_order(page)); + __pagetable_free(pt); } #if defined(CONFIG_SPLIT_PTE_PTLOCKS) -- cgit v1.2.3 From 5ba2f0a1556479638ac11a3c201421f5515e89f5 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 22 Oct 2025 16:26:33 +0800 Subject: mm: introduce deferred freeing for kernel page tables This introduces a conditional asynchronous mechanism, enabled by CONFIG_ASYNC_KERNEL_PGTABLE_FREE. When enabled, this mechanism defers the freeing of pages that are used as page tables for kernel address mappings. These pages are now queued to a work struct instead of being freed immediately. This deferred freeing allows for batch-freeing of page tables, providing a safe context for performing a single expensive operation (TLB flush) for a batch of kernel page tables instead of performing that expensive operation for each page table. Link: https://lkml.kernel.org/r/20251022082635.2462433-8-baolu.lu@linux.intel.com Signed-off-by: Dave Hansen Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Cc: Alistair Popple Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Ingo Molnar Cc: Jann Horn Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Peter Zijlstra Cc: Robin Murohy Cc: Thomas Gleinxer Cc: "Uladzislau Rezki (Sony)" Cc: Vasant Hegde Cc: Vinicius Costa Gomes Cc: Vlastimil Babka Cc: Will Deacon Cc: Yi Lai Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 +++++++++++++--- mm/Kconfig | 3 +++ mm/pgtable-generic.c | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 88c0a0fae43a..a6fd9f5aaf30 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3053,6 +3053,14 @@ static inline void __pagetable_free(struct ptdesc *pt) __free_pages(page, compound_order(page)); } +#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE +void pagetable_free_kernel(struct ptdesc *pt); +#else +static inline void pagetable_free_kernel(struct ptdesc *pt) +{ + __pagetable_free(pt); +} +#endif /** * pagetable_free - Free pagetables * @pt: The page table descriptor @@ -3062,10 +3070,12 @@ static inline void __pagetable_free(struct ptdesc *pt) */ static inline void pagetable_free(struct ptdesc *pt) { - if (ptdesc_test_kernel(pt)) + if (ptdesc_test_kernel(pt)) { ptdesc_clear_kernel(pt); - - __pagetable_free(pt); + pagetable_free_kernel(pt); + } else { + __pagetable_free(pt); + } } #if defined(CONFIG_SPLIT_PTE_PTLOCKS) diff --git a/mm/Kconfig b/mm/Kconfig index 4971436c8697..682a5c39a1a6 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -906,6 +906,9 @@ config HAVE_GIGANTIC_FOLIOS def_bool (HUGETLB_PAGE && ARCH_HAS_GIGANTIC_PAGE) || \ (ZONE_DEVICE && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) +config ASYNC_KERNEL_PGTABLE_FREE + def_bool n + # TODO: Allow to be enabled without THP config ARCH_SUPPORTS_HUGE_PFNMAP def_bool n diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 567e2d084071..1c7caa8ef164 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -406,3 +406,40 @@ again: pte_unmap_unlock(pte, ptl); goto again; } + +#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE +static void kernel_pgtable_work_func(struct work_struct *work); + +static struct { + struct list_head list; + /* protect above ptdesc lists */ + spinlock_t lock; + struct work_struct work; +} kernel_pgtable_work = { + .list = LIST_HEAD_INIT(kernel_pgtable_work.list), + .lock = __SPIN_LOCK_UNLOCKED(kernel_pgtable_work.lock), + .work = __WORK_INITIALIZER(kernel_pgtable_work.work, kernel_pgtable_work_func), +}; + +static void kernel_pgtable_work_func(struct work_struct *work) +{ + struct ptdesc *pt, *next; + LIST_HEAD(page_list); + + spin_lock(&kernel_pgtable_work.lock); + list_splice_tail_init(&kernel_pgtable_work.list, &page_list); + spin_unlock(&kernel_pgtable_work.lock); + + list_for_each_entry_safe(pt, next, &page_list, pt_list) + __pagetable_free(pt); +} + +void pagetable_free_kernel(struct ptdesc *pt) +{ + spin_lock(&kernel_pgtable_work.lock); + list_add(&pt->pt_list, &kernel_pgtable_work.list); + spin_unlock(&kernel_pgtable_work.lock); + + schedule_work(&kernel_pgtable_work.work); +} +#endif -- cgit v1.2.3 From e37d5a2d60a338c5917c45296bac65da1382eda5 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 22 Oct 2025 16:26:34 +0800 Subject: iommu/sva: invalidate stale IOTLB entries for kernel address space Introduce a new IOMMU interface to flush IOTLB paging cache entries for the CPU kernel address space. This interface is invoked from the x86 architecture code that manages combined user and kernel page tables, specifically before any kernel page table page is freed and reused. This addresses the main issue with vfree() which is a common occurrence and can be triggered by unprivileged users. While this resolves the primary problem, it doesn't address some extremely rare case related to memory unplug of memory that was present as reserved memory at boot, which cannot be triggered by unprivileged users. The discussion can be found at the link below. Enable SVA on x86 architecture since the IOMMU can now receive notification to flush the paging cache before freeing the CPU kernel page table pages. Link: https://lkml.kernel.org/r/20251022082635.2462433-9-baolu.lu@linux.intel.com Link: https://lore.kernel.org/linux-iommu/04983c62-3b1d-40d4-93ae-34ca04b827e5@intel.com/ Co-developed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe Signed-off-by: Lu Baolu Suggested-by: Jann Horn Reviewed-by: Jason Gunthorpe Reviewed-by: Vasant Hegde Reviewed-by: Kevin Tian Cc: Alistair Popple Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Dave Hansen Cc: David Hildenbrand Cc: Ingo Molnar Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: Peter Zijlstra Cc: Robin Murohy Cc: Thomas Gleinxer Cc: "Uladzislau Rezki (Sony)" Cc: Vinicius Costa Gomes Cc: Vlastimil Babka Cc: Will Deacon Cc: Yi Lai Signed-off-by: Andrew Morton --- arch/x86/Kconfig | 1 + drivers/iommu/iommu-sva.c | 32 ++++++++++++++++++++++++++++---- include/linux/iommu.h | 4 ++++ mm/pgtable-generic.c | 2 ++ 4 files changed, 35 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fa3b616af03a..a3700766a8c0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -279,6 +279,7 @@ config X86 select HAVE_PCI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select ASYNC_KERNEL_PGTABLE_FREE if IOMMU_SVA select MMU_GATHER_RCU_TABLE_FREE select MMU_GATHER_MERGE_VMAS select HAVE_POSIX_CPU_TIMERS_TASK_WORK diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index a0442faad952..d236aef80a8d 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -10,6 +10,8 @@ #include "iommu-priv.h" static DEFINE_MUTEX(iommu_sva_lock); +static bool iommu_sva_present; +static LIST_HEAD(iommu_sva_mms); static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm); @@ -42,6 +44,7 @@ static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct de return ERR_PTR(-ENOSPC); } iommu_mm->pasid = pasid; + iommu_mm->mm = mm; INIT_LIST_HEAD(&iommu_mm->sva_domains); /* * Make sure the write to mm->iommu_mm is not reordered in front of @@ -77,9 +80,6 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm if (!group) return ERR_PTR(-ENODEV); - if (IS_ENABLED(CONFIG_X86)) - return ERR_PTR(-EOPNOTSUPP); - mutex_lock(&iommu_sva_lock); /* Allocate mm->pasid if necessary. */ @@ -135,8 +135,13 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm if (ret) goto out_free_domain; domain->users = 1; - list_add(&domain->next, &mm->iommu_mm->sva_domains); + if (list_empty(&iommu_mm->sva_domains)) { + if (list_empty(&iommu_sva_mms)) + iommu_sva_present = true; + list_add(&iommu_mm->mm_list_elm, &iommu_sva_mms); + } + list_add(&domain->next, &iommu_mm->sva_domains); out: refcount_set(&handle->users, 1); mutex_unlock(&iommu_sva_lock); @@ -178,6 +183,13 @@ void iommu_sva_unbind_device(struct iommu_sva *handle) list_del(&domain->next); iommu_domain_free(domain); } + + if (list_empty(&iommu_mm->sva_domains)) { + list_del(&iommu_mm->mm_list_elm); + if (list_empty(&iommu_sva_mms)) + iommu_sva_present = false; + } + mutex_unlock(&iommu_sva_lock); kfree(handle); } @@ -315,3 +327,15 @@ static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, return domain; } + +void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) +{ + struct iommu_mm_data *iommu_mm; + + guard(mutex)(&iommu_sva_lock); + if (!iommu_sva_present) + return; + + list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm) + mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end); +} diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c30d12e16473..66e4abb2df0d 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1134,7 +1134,9 @@ struct iommu_sva { struct iommu_mm_data { u32 pasid; + struct mm_struct *mm; struct list_head sva_domains; + struct list_head mm_list_elm; }; int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode); @@ -1615,6 +1617,7 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm); void iommu_sva_unbind_device(struct iommu_sva *handle); u32 iommu_sva_get_pasid(struct iommu_sva *handle); +void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end); #else static inline struct iommu_sva * iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) @@ -1639,6 +1642,7 @@ static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm) } static inline void mm_pasid_drop(struct mm_struct *mm) {} +static inline void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) {} #endif /* CONFIG_IOMMU_SVA */ #ifdef CONFIG_IOMMU_IOPF diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 1c7caa8ef164..8c22be79b734 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -430,6 +431,7 @@ static void kernel_pgtable_work_func(struct work_struct *work) list_splice_tail_init(&kernel_pgtable_work.list, &page_list); spin_unlock(&kernel_pgtable_work.lock); + iommu_sva_invalidate_kva_range(PAGE_OFFSET, TLB_FLUSH_ALL); list_for_each_entry_safe(pt, next, &page_list, pt_list) __pagetable_free(pt); } -- cgit v1.2.3 From a983471cfc454afeba23526ee5d17fd8cdc7876f Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Fri, 24 Oct 2025 02:00:41 +0800 Subject: mm, swap: cleanup swap entry allocation parameter We no longer need this GFP parameter after commit 8578e0c00dcf ("mm, swap: use the swap table for the swap cache and switch API"). Before that commit the GFP parameter is already almost identical for all callers, so nothing changed by that commit. Swap table just moved the GFP to lower layer and make it more defined and changes depend on atomic or sleep allocation. Now this parameter is no longer used, just remove it. No behavior change. Link: https://lkml.kernel.org/r/20251024-swap-clean-after-swap-table-p1-v2-3-a709469052e7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Acked-by: Nhat Pham Reviewed-by: Baolin Wang Reviewed-by: David Hildenbrand Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Kemeng Shi Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 ++-- mm/shmem.c | 2 +- mm/swapfile.c | 3 +-- mm/vmscan.c | 4 ++-- 4 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index e818fbade1e2..a4b264817735 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -462,7 +462,7 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask); +int folio_alloc_swap(struct folio *folio); bool folio_free_swap(struct folio *folio); void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); @@ -560,7 +560,7 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -static inline int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask) +static inline int folio_alloc_swap(struct folio *folio) { return -EINVAL; } diff --git a/mm/shmem.c b/mm/shmem.c index da1df4270309..e1dc2d8e939c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1617,7 +1617,7 @@ try_split: folio_mark_uptodate(folio); } - if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) { + if (!folio_alloc_swap(folio)) { bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages); int error; diff --git a/mm/swapfile.c b/mm/swapfile.c index 808052319c0b..d87b562ae661 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1417,7 +1417,6 @@ static bool swap_sync_discard(void) /** * folio_alloc_swap - allocate swap space for a folio * @folio: folio we want to move to swap - * @gfp: gfp mask for shadow nodes * * Allocate swap space for the folio and add the folio to the * swap cache. @@ -1425,7 +1424,7 @@ static bool swap_sync_discard(void) * Context: Caller needs to hold the folio lock. * Return: Whether the folio was added to the swap cache. */ -int folio_alloc_swap(struct folio *folio, gfp_t gfp) +int folio_alloc_swap(struct folio *folio) { unsigned int order = folio_order(folio); unsigned int size = 1 << order; diff --git a/mm/vmscan.c b/mm/vmscan.c index ecc90517b791..c23c9616052a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1318,7 +1318,7 @@ retry: split_folio_to_list(folio, folio_list)) goto activate_locked; } - if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) { + if (folio_alloc_swap(folio)) { int __maybe_unused order = folio_order(folio); if (!folio_test_large(folio)) @@ -1334,7 +1334,7 @@ retry: } #endif count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); - if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) + if (folio_alloc_swap(folio)) goto activate_locked_split; } /* -- cgit v1.2.3 From adf7d6cdd716e1f3826789befc453c961dfafcf2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 22 Oct 2025 18:25:25 -0700 Subject: mm/damon/core: add damon_target->obsolete for pin-point removal Patch series "mm/damon: support pin-point targets removal". DAMON maintains the targets in a list, and allows committing only an entire list of targets having the new parameters. Targets having same index on the lists are treated as matching source and destination targets. If an existing target cannot find a matching one in the sources list, the target is removed. This means that there is no way to remove only a specific monitoring target in the middle of the current targets list. Such pin-point target removal is really needed in some use cases, though. Monitoring access patterns on virtual address spaces of processes that spawned from the same ancestor is one example. If a process of the group is terminated, the user may want to remove the matching DAMON target as soon as possible, to save in-kernel memory usage for the unnecessary target data. The user may also want to do that without turning DAMON off or removing unnecessary targets, to keep the current monitoring results for other active processes. Extend DAMON kernel API and sysfs ABI to support the pin-point removal in the following way. For API, add a new damon_target field, namely 'obsolete'. If the field on parameters commit source target is set, it means the matching destination target is obsolete. Then the parameters commit logic removes the destination target from the existing targets list. For sysfs ABI, add a new file under the target directory, namely 'obsolete_target'. It is connected with the 'obsolete' field of the commit source targets, so internally using the new API. Also add a selftest for the new feature. The related helper scripts for manipulating the sysfs interface and dumping in-kernel DAMON status are also extended for this. Note that the selftest part was initially posted as an individual RFC series [1], but now merged into this one. Bijan Tabatabai has originally reported this issue, and participated in this solution design on a GitHub issue [1] for DAMON user-space tool. This patch (of 9): DAMON's monitoring targets parameters update function, damon_commit_targets(), is not providing a way to remove a target in the middle of the existing targets list. Extend the API by adding a field to struct damon_target. If the field of a damon_commit_targets() source target is set, it indicates the matching target on the existing targets list is obsolete. damon_commit_targets() understands that and removes those from the list, while respecting the index based matching for other non-obsolete targets. Link: https://lkml.kernel.org/r/20251023012535.69625-1-sj@kernel.org Link: https://lkml.kernel.org/r/20251023012535.69625-2-sj@kernel.org Link: https://github.com/damonitor/damo/issues/36 [1] Signed-off-by: SeongJae Park Reviewed-by: Bijan Tabatabai Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++++ mm/damon/core.c | 10 +++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 9ee026c2db53..f3566b978cdf 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -91,17 +91,23 @@ struct damon_region { * @nr_regions: Number of monitoring target regions of this target. * @regions_list: Head of the monitoring target regions of this target. * @list: List head for siblings. + * @obsolete: Whether the commit destination target is obsolete. * * Each monitoring context could have multiple targets. For example, a context * for virtual memory address spaces could have multiple target processes. The * @pid should be set for appropriate &struct damon_operations including the * virtual address spaces monitoring operations. + * + * @obsolete is used only for damon_commit_targets() source targets, to specify + * the matching destination targets are obsolete. Read damon_commit_targets() + * to see how it is handled. */ struct damon_target { struct pid *pid; unsigned int nr_regions; struct list_head regions_list; struct list_head list; + bool obsolete; }; /** diff --git a/mm/damon/core.c b/mm/damon/core.c index 769da97fcb26..06ad359024ad 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -479,6 +479,7 @@ struct damon_target *damon_new_target(void) t->nr_regions = 0; INIT_LIST_HEAD(&t->regions_list); INIT_LIST_HEAD(&t->list); + t->obsolete = false; return t; } @@ -1187,7 +1188,11 @@ static int damon_commit_targets( damon_for_each_target_safe(dst_target, next, dst) { src_target = damon_nth_target(i++, src); - if (src_target) { + /* + * If src target is obsolete, do not commit the parameters to + * the dst target, and further remove the dst target. + */ + if (src_target && !src_target->obsolete) { err = damon_commit_target( dst_target, damon_target_has_pid(dst), src_target, damon_target_has_pid(src), @@ -1210,6 +1215,9 @@ static int damon_commit_targets( damon_for_each_target_safe(src_target, next, src) { if (j++ < i) continue; + /* target to remove has no matching dst */ + if (src_target->obsolete) + return -EINVAL; new_target = damon_new_target(); if (!new_target) return -ENOMEM; -- cgit v1.2.3 From b734b9d973ccd7ad1cfebc2e1f7db693824a37ef Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 24 Oct 2025 10:09:02 +0100 Subject: mm/vma: small VMA lock cleanups We declare vma_start_read() as a static function in mm/mmap_lock.c, so there is no need to provide a stub for !CONFIG_PER_VMA_LOCK. __is_vma_write_locked() is declared in a header and should therefore be static inline. Put parens around (refcnt & VMA_LOCK_OFFSET) in is_vma_writer_only() to make precedence clear. Link: https://lkml.kernel.org/r/20251024090902.1118174-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Reviewed-by: Vlastimil Babka Cc: Liam Howlett Signed-off-by: Andrew Morton --- include/linux/mmap_lock.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 2c9fffa58714..e05da70dc0cb 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -130,7 +130,7 @@ static inline bool is_vma_writer_only(int refcnt) * a detached vma happens only in vma_mark_detached() and is a rare * case, therefore most of the time there will be no unnecessary wakeup. */ - return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1; + return (refcnt & VMA_LOCK_OFFSET) && refcnt <= VMA_LOCK_OFFSET + 1; } static inline void vma_refcount_put(struct vm_area_struct *vma) @@ -183,7 +183,7 @@ static inline void vma_end_read(struct vm_area_struct *vma) } /* WARNING! Can only be used if mmap_lock is expected to be write-locked */ -static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) +static inline bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) { mmap_assert_write_locked(vma->vm_mm); @@ -281,9 +281,6 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int return true; } static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} -static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, - struct vm_area_struct *vma) - { return NULL; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} static inline void vma_assert_write_locked(struct vm_area_struct *vma) -- cgit v1.2.3 From 272239dc8fcb109b9f1ec1a73bb85405dac92eda Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 21 Oct 2025 03:56:38 +0100 Subject: mm: make INVALID_PHYS_ADDR a generic macro INVALID_PHYS_ADDR has very similar definitions across the code base. Hence just move that inside header for more generic usage. Also drop the now redundant ones which are no longer required. Link: https://lkml.kernel.org/r/20251021025638.2420216-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Alexander Gordeev [s390] Signed-off-by: Andrew Morton --- arch/arm64/mm/mmu.c | 2 -- arch/s390/boot/vmem.c | 1 - drivers/vdpa/vdpa_user/iova_domain.h | 2 -- include/linux/mm.h | 2 ++ kernel/dma/swiotlb.c | 2 -- 5 files changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index b8d37eb037fc..94e29e3574ff 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -470,8 +470,6 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, mutex_unlock(&fixmap_lock); } -#define INVALID_PHYS_ADDR (-1ULL) - static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp, enum pgtable_type pgtable_type) { diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c index cea3de4dce8c..fbe64ffdfb96 100644 --- a/arch/s390/boot/vmem.c +++ b/arch/s390/boot/vmem.c @@ -16,7 +16,6 @@ #include "decompressor.h" #include "boot.h" -#define INVALID_PHYS_ADDR (~(phys_addr_t)0) struct ctlreg __bootdata_preserved(s390_invalid_asce); #ifdef CONFIG_PROC_FS diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h index 775cad5238f3..a923971a64f5 100644 --- a/drivers/vdpa/vdpa_user/iova_domain.h +++ b/drivers/vdpa/vdpa_user/iova_domain.h @@ -17,8 +17,6 @@ #define IOVA_START_PFN 1 -#define INVALID_PHYS_ADDR (~(phys_addr_t)0) - #define BOUNCE_MAP_SHIFT 12 #define BOUNCE_MAP_SIZE (1 << BOUNCE_MAP_SHIFT) #define BOUNCE_MAP_MASK (~(BOUNCE_MAP_SIZE - 1)) diff --git a/include/linux/mm.h b/include/linux/mm.h index a6fd9f5aaf30..7bcd9e6fbc3c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -105,6 +105,8 @@ extern int mmap_rnd_compat_bits __read_mostly; # endif #endif +#define INVALID_PHYS_ADDR (~(phys_addr_t)0) + #include #include diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 0d37da3d95b6..a547c7693135 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -61,8 +61,6 @@ */ #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) -#define INVALID_PHYS_ADDR (~(phys_addr_t)0) - /** * struct io_tlb_slot - IO TLB slot descriptor * @orig_addr: The original address corresponding to a mapped entry. -- cgit v1.2.3 From 8e689f8ea45ffdae20350246dd37d124d7092c92 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 28 Oct 2025 11:43:07 +0800 Subject: mm/swap: do not choose swap device according to numa node Patch series "mm/swapfile.c: select swap devices of default priority round robin", v5. Currently, on system with multiple swap devices, swap allocation will select one swap device according to priority. The swap device with the highest priority will be chosen to allocate firstly. People can specify a priority from 0 to 32767 when swapon a swap device, or the system will set it from -2 then downwards by default. Meanwhile, on NUMA system, the swap device with node_id will be considered first on that NUMA node of the node_id. In the current code, an array of plist, swap_avail_heads[nid], is used to organize swap devices on each NUMA node. For each NUMA node, there is a plist organizing all swap devices. The 'prio' value in the plist is the negated value of the device's priority due to plist being sorted from low to high. The swap device owning one node_id will be promoted to the front position on that NUMA node, then other swap devices are put in order of their default priority. E.g I got a system with 8 NUMA nodes, and I setup 4 zram partition as swap devices. Current behaviour: their priorities will be(note that -1 is skipped): NAME TYPE SIZE USED PRIO /dev/zram0 partition 16G 0B -2 /dev/zram1 partition 16G 0B -3 /dev/zram2 partition 16G 0B -4 /dev/zram3 partition 16G 0B -5 And their positions in the 8 swap_avail_lists[nid] will be: swap_avail_lists[0]: /* node 0's available swap device list */ zram0 -> zram1 -> zram2 -> zram3 prio:1 prio:3 prio:4 prio:5 swap_avali_lists[1]: /* node 1's available swap device list */ zram1 -> zram0 -> zram2 -> zram3 prio:1 prio:2 prio:4 prio:5 swap_avail_lists[2]: /* node 2's available swap device list */ zram2 -> zram0 -> zram1 -> zram3 prio:1 prio:2 prio:3 prio:5 swap_avail_lists[3]: /* node 3's available swap device list */ zram3 -> zram0 -> zram1 -> zram2 prio:1 prio:2 prio:3 prio:4 swap_avail_lists[4-7]: /* node 4,5,6,7's available swap device list */ zram0 -> zram1 -> zram2 -> zram3 prio:2 prio:3 prio:4 prio:5 The adjustment for swap device with node_id intended to decrease the pressure of lock contention for one swap device by taking different swap device on different node. The adjustment was introduced in commit a2468cc9bfdf ("swap: choose swap device according to numa node"). However, the adjustment is a little coarse-grained. On the node, the swap device sharing the node's id will always be selected firstly by node's CPUs until exhausted, then next one. And on other nodes where no swap device shares its node id, swap device with priority '-2' will be selected firstly until exhausted, then next with priority '-3'. This is the swapon output during the process high pressure vm-scability test is being taken. It's clearly showing zram0 is heavily exploited until exhausted. =================================== [root@hp-dl385g10-03 ~]# swapon NAME TYPE SIZE USED PRIO /dev/zram0 partition 16G 15.7G -2 /dev/zram1 partition 16G 3.4G -3 /dev/zram2 partition 16G 3.4G -4 /dev/zram3 partition 16G 2.6G -5 The node based strategy on selecting swap device is much better then the old way one by one selecting swap device. However it is still unreasonable because swap devices are assumed to have similar accessing speed if no priority is specified when swapon. It's unfair and doesn't make sense just because one swap device is swapped on firstly, its priority will be higher than the one swapped on later. So in this patchset, change is made to select the swap device round robin if default priority. In code, the plist array swap_avail_heads[nid] is replaced with a plist swap_avail_head which reverts commit a2468cc9bfdf. Meanwhile, on top of the revert, further change is taken to make any device w/o specified priority get the same default priority '-1'. Surely, swap device with specified priority are always put foremost, this is not impacted. If you care about their different accessing speed, then use 'swapon -p xx' to deploy priority for your swap devices. New behaviour: swap_avail_list: /* one global available swap device list */ zram0 -> zram1 -> zram2 -> zram3 prio:1 prio:1 prio:1 prio:1 This is the swapon output during the process high pressure vm-scability being taken, all is selected round robin: ======================================= [root@hp-dl385g10-03 linux]# swapon NAME TYPE SIZE USED PRIO /dev/zram0 partition 16G 12.6G -1 /dev/zram1 partition 16G 12.6G -1 /dev/zram2 partition 16G 12.6G -1 /dev/zram3 partition 16G 12.6G -1 With the change, we can see about 18% efficiency promotion as below: vm-scability test: ================== Test with: usemem --init-time -O -y -x -n 31 2G (4G memcg, zram as swap) Before: After: System time: 637.92 s 526.74 s (lower is better) Sum Throughput: 3546.56 MB/s 4207.56 MB/s (higher is better) Single process Throughput: 114.40 MB/s 135.72 MB/s (higher is better) free latency: 10138455.99 us 6810119.01 us (low is better) This patch (of 2): This reverts commit a2468cc9bfdf ("swap: choose swap device according to numa node"). After this patch, the behaviour will change back to pre-commit a2468cc9bfdf. Means the priority will be set from -1 then downwards by default, and when swapping, it will exhault swap device one by one according to priority from high to low. This is preparation work for later change. [root@hp-dl385g10-03 ~]# swapon NAME TYPE SIZE USED PRIO /dev/zram0 partition 16G 16G -1 /dev/zram1 partition 16G 966.2M -2 /dev/zram2 partition 16G 0B -3 /dev/zram3 partition 16G 0B -4 Link: https://lkml.kernel.org/r/20251028034308.929550-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20251028034308.929550-2-bhe@redhat.com Signed-off-by: Baoquan He Suggested-by: Chris Li Acked-by: Chris Li Acked-by: Nhat Pham Reviewed-by: Kairui Song Cc: Barry Song Cc: Kemeng Shi Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/index.rst | 1 - Documentation/admin-guide/mm/swap_numa.rst | 78 ----------------------------- include/linux/swap.h | 11 +--- mm/swapfile.c | 80 ++++++------------------------ 4 files changed, 15 insertions(+), 155 deletions(-) delete mode 100644 Documentation/admin-guide/mm/swap_numa.rst (limited to 'include/linux') diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst index ebc83ca20fdc..bbb563cba5d2 100644 --- a/Documentation/admin-guide/mm/index.rst +++ b/Documentation/admin-guide/mm/index.rst @@ -39,7 +39,6 @@ the Linux memory management. shrinker_debugfs slab soft-dirty - swap_numa transhuge userfaultfd zswap diff --git a/Documentation/admin-guide/mm/swap_numa.rst b/Documentation/admin-guide/mm/swap_numa.rst deleted file mode 100644 index 2e630627bcee..000000000000 --- a/Documentation/admin-guide/mm/swap_numa.rst +++ /dev/null @@ -1,78 +0,0 @@ -=========================================== -Automatically bind swap device to numa node -=========================================== - -If the system has more than one swap device and swap device has the node -information, we can make use of this information to decide which swap -device to use in get_swap_pages() to get better performance. - - -How to use this feature -======================= - -Swap device has priority and that decides the order of it to be used. To make -use of automatically binding, there is no need to manipulate priority settings -for swap devices. e.g. on a 2 node machine, assume 2 swap devices swapA and -swapB, with swapA attached to node 0 and swapB attached to node 1, are going -to be swapped on. Simply swapping them on by doing:: - - # swapon /dev/swapA - # swapon /dev/swapB - -Then node 0 will use the two swap devices in the order of swapA then swapB and -node 1 will use the two swap devices in the order of swapB then swapA. Note -that the order of them being swapped on doesn't matter. - -A more complex example on a 4 node machine. Assume 6 swap devices are going to -be swapped on: swapA and swapB are attached to node 0, swapC is attached to -node 1, swapD and swapE are attached to node 2 and swapF is attached to node3. -The way to swap them on is the same as above:: - - # swapon /dev/swapA - # swapon /dev/swapB - # swapon /dev/swapC - # swapon /dev/swapD - # swapon /dev/swapE - # swapon /dev/swapF - -Then node 0 will use them in the order of:: - - swapA/swapB -> swapC -> swapD -> swapE -> swapF - -swapA and swapB will be used in a round robin mode before any other swap device. - -node 1 will use them in the order of:: - - swapC -> swapA -> swapB -> swapD -> swapE -> swapF - -node 2 will use them in the order of:: - - swapD/swapE -> swapA -> swapB -> swapC -> swapF - -Similaly, swapD and swapE will be used in a round robin mode before any -other swap devices. - -node 3 will use them in the order of:: - - swapF -> swapA -> swapB -> swapC -> swapD -> swapE - - -Implementation details -====================== - -The current code uses a priority based list, swap_avail_list, to decide -which swap device to use and if multiple swap devices share the same -priority, they are used round robin. This change here replaces the single -global swap_avail_list with a per-numa-node list, i.e. for each numa node, -it sees its own priority based list of available swap devices. Swap -device's priority can be promoted on its matching node's swap_avail_list. - -The current swap device's priority is set as: user can set a >=0 value, -or the system will pick one starting from -1 then downwards. The priority -value in the swap_avail_list is the negated value of the swap device's -due to plist being sorted from low to high. The new policy doesn't change -the semantics for priority >=0 cases, the previous starting from -1 then -downwards now becomes starting from -2 then downwards and -1 is reserved -as the promoted value. So if multiple swap devices are attached to the same -node, they will all be promoted to priority -1 on that node's plist and will -be used round robin before any other swap devices. diff --git a/include/linux/swap.h b/include/linux/swap.h index a4b264817735..38ca3df68716 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -301,16 +301,7 @@ struct swap_info_struct { struct work_struct discard_work; /* discard worker */ struct work_struct reclaim_work; /* reclaim worker */ struct list_head discard_clusters; /* discard clusters list */ - struct plist_node avail_lists[]; /* - * entries in swap_avail_heads, one - * entry per node. - * Must be last as the number of the - * array is nr_node_ids, which is not - * a fixed value so have to allocate - * dynamically. - * And it has to be an array so that - * plist_for_each_* can work. - */ + struct plist_node avail_list; /* entry in swap_avail_head */ }; static inline swp_entry_t page_swap_entry(struct page *page) diff --git a/mm/swapfile.c b/mm/swapfile.c index 125d893bb706..ce3580e2f4f4 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -74,7 +74,7 @@ atomic_long_t nr_swap_pages; EXPORT_SYMBOL_GPL(nr_swap_pages); /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; -static int least_priority = -1; +static int least_priority; unsigned long swapfile_maximum_size; #ifdef CONFIG_MIGRATION bool swap_migration_ad_supported; @@ -103,7 +103,7 @@ static PLIST_HEAD(swap_active_head); * is held and the locking order requires swap_lock to be taken * before any swap_info_struct->lock. */ -static struct plist_head *swap_avail_heads; +static PLIST_HEAD(swap_avail_head); static DEFINE_SPINLOCK(swap_avail_lock); struct swap_info_struct *swap_info[MAX_SWAPFILES]; @@ -1130,7 +1130,6 @@ done: /* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff) { - int nid; unsigned long pages; spin_lock(&swap_avail_lock); @@ -1159,8 +1158,7 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff) goto skip; } - for_each_node(nid) - plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]); + plist_del(&si->avail_list, &swap_avail_head); skip: spin_unlock(&swap_avail_lock); @@ -1169,7 +1167,6 @@ skip: /* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */ static void add_to_avail_list(struct swap_info_struct *si, bool swapon) { - int nid; long val; unsigned long pages; @@ -1202,8 +1199,7 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon) goto skip; } - for_each_node(nid) - plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]); + plist_add(&si->avail_list, &swap_avail_head); skip: spin_unlock(&swap_avail_lock); @@ -1346,16 +1342,14 @@ static bool swap_alloc_fast(swp_entry_t *entry, static bool swap_alloc_slow(swp_entry_t *entry, int order) { - int node; unsigned long offset; struct swap_info_struct *si, *next; - node = numa_node_id(); spin_lock(&swap_avail_lock); start_over: - plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { + plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { /* Rotate the device and switch to a new cluster */ - plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); + plist_requeue(&si->avail_list, &swap_avail_head); spin_unlock(&swap_avail_lock); if (get_swap_device_info(si)) { offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE); @@ -1380,7 +1374,7 @@ start_over: * still in the swap_avail_head list then try it, otherwise * start over if we have not gotten any slots. */ - if (plist_node_empty(&next->avail_lists[node])) + if (plist_node_empty(&si->avail_list)) goto start_over; } spin_unlock(&swap_avail_lock); @@ -1394,11 +1388,10 @@ start_over: static bool swap_sync_discard(void) { bool ret = false; - int nid = numa_node_id(); struct swap_info_struct *si, *next; spin_lock(&swap_avail_lock); - plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], avail_lists[nid]) { + plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { spin_unlock(&swap_avail_lock); if (get_swap_device_info(si)) { if (si->flags & SWP_PAGE_DISCARD) @@ -2709,25 +2702,11 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) return generic_swapfile_activate(sis, swap_file, span); } -static int swap_node(struct swap_info_struct *si) -{ - struct block_device *bdev; - - if (si->bdev) - bdev = si->bdev; - else - bdev = si->swap_file->f_inode->i_sb->s_bdev; - - return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; -} - static void setup_swap_info(struct swap_info_struct *si, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info, unsigned long *zeromap) { - int i; - if (prio >= 0) si->prio = prio; else @@ -2737,16 +2716,7 @@ static void setup_swap_info(struct swap_info_struct *si, int prio, * low-to-high, while swap ordering is high-to-low */ si->list.prio = -si->prio; - for_each_node(i) { - if (si->prio >= 0) - si->avail_lists[i].prio = -si->prio; - else { - if (swap_node(si) == i) - si->avail_lists[i].prio = 1; - else - si->avail_lists[i].prio = -si->prio; - } - } + si->avail_list.prio = -si->prio; si->swap_map = swap_map; si->cluster_info = cluster_info; si->zeromap = zeromap; @@ -2919,15 +2889,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) del_from_avail_list(p, true); if (p->prio < 0) { struct swap_info_struct *si = p; - int nid; plist_for_each_entry_continue(si, &swap_active_head, list) { si->prio++; si->list.prio--; - for_each_node(nid) { - if (si->avail_lists[nid].prio != 1) - si->avail_lists[nid].prio--; - } + si->avail_list.prio--; } least_priority++; } @@ -3168,9 +3134,8 @@ static struct swap_info_struct *alloc_swap_info(void) struct swap_info_struct *p; struct swap_info_struct *defer = NULL; unsigned int type; - int i; - p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL); + p = kvzalloc(sizeof(struct swap_info_struct), GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); @@ -3209,8 +3174,7 @@ static struct swap_info_struct *alloc_swap_info(void) } p->swap_extent_root = RB_ROOT; plist_node_init(&p->list, 0); - for_each_node(i) - plist_node_init(&p->avail_lists[i], 0); + plist_node_init(&p->avail_list, 0); p->flags = SWP_USED; spin_unlock(&swap_lock); if (defer) { @@ -3467,9 +3431,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!swap_avail_heads) - return -ENOMEM; - si = alloc_swap_info(); if (IS_ERR(si)) return PTR_ERR(si); @@ -4079,7 +4040,6 @@ static bool __has_usable_swap(void) void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { struct swap_info_struct *si, *next; - int nid = folio_nid(folio); if (!(gfp & __GFP_IO)) return; @@ -4098,8 +4058,8 @@ void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) return; spin_lock(&swap_avail_lock); - plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], - avail_lists[nid]) { + plist_for_each_entry_safe(si, next, &swap_avail_head, + avail_list) { if (si->bdev) { blkcg_schedule_throttle(si->bdev->bd_disk, true); break; @@ -4111,18 +4071,6 @@ void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) static int __init swapfile_init(void) { - int nid; - - swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head), - GFP_KERNEL); - if (!swap_avail_heads) { - pr_emerg("Not enough memory for swap heads, swap is disabled\n"); - return -ENOMEM; - } - - for_each_node(nid) - plist_head_init(&swap_avail_heads[nid]); - swapfile_maximum_size = arch_max_swapfile_size(); /* -- cgit v1.2.3 From 1a4f70f6851a1916c4f0e52731c7ecfe99bf36e6 Mon Sep 17 00:00:00 2001 From: Israel Batista Date: Wed, 29 Oct 2025 19:56:28 +0000 Subject: mm: convert memory block states (MEM_*) macros to enum Patch series "mm: Convert memory block states (MEM_*) macros to enums", v2. The MEM_* constants indicating the state of a memory block are currently defined as macros, meaning their definitions will be omitted from the debuginfo on most kernel builds. This makes it harder for debuggers to correctly map the block state at runtime, which can be quite useful when analysing errors related to memory hot plugging and unplugging with tools such as drgn. Converting the constants to an enum ensures the correct information is emitted by the compiler and available for the debugger, without needing to hard-code them into the debugger and track their changes. This patch series aims to replace the current macros with a newly created enum named memory_block_state, while also taking advantage of the compile time guarantees that we get when using enums. The first patch does the conversion of the macros to an enum, while the 2nd and 3rd patches use this enum to clean up some type declarations and make sure that only valid values are used. This patch (of 3): Converting the MEM_* constants from macros to an enum ensures that their values will be correctly emitted in the debug symbols, making it easier to trace the meaning of each value when debugging with tools such as drgn, without the need to hard-code the values. Since the values are mutually exclusive and they are not exposed directly to userspace, I also dropped the misleading pattern (1< Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Lorenzo Stoakes Cc: Omar Sandoval Cc: Randy Dunlap Signed-off-by: Andrew Morton --- include/linux/memory.h | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index 0c214256216f..f4e358477c6a 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -64,6 +64,18 @@ struct memory_group { }; }; +enum memory_block_state { + /* These states are exposed to userspace as text strings in sysfs */ + MEM_ONLINE, /* exposed to userspace */ + MEM_GOING_OFFLINE, /* exposed to userspace */ + MEM_OFFLINE, /* exposed to userspace */ + MEM_GOING_ONLINE, + MEM_CANCEL_ONLINE, + MEM_CANCEL_OFFLINE, + MEM_PREPARE_ONLINE, + MEM_FINISH_OFFLINE, +}; + struct memory_block { unsigned long start_section_nr; unsigned long state; /* serialized by the dev->lock */ @@ -89,16 +101,6 @@ int arch_get_memory_phys_device(unsigned long start_pfn); unsigned long memory_block_size_bytes(void); int set_memory_block_size_order(unsigned int order); -/* These states are exposed to userspace as text strings in sysfs */ -#define MEM_ONLINE (1<<0) /* exposed to userspace */ -#define MEM_GOING_OFFLINE (1<<1) /* exposed to userspace */ -#define MEM_OFFLINE (1<<2) /* exposed to userspace */ -#define MEM_GOING_ONLINE (1<<3) -#define MEM_CANCEL_ONLINE (1<<4) -#define MEM_CANCEL_OFFLINE (1<<5) -#define MEM_PREPARE_ONLINE (1<<6) -#define MEM_FINISH_OFFLINE (1<<7) - struct memory_notify { /* * The altmap_start_pfn and altmap_nr_pages fields are designated for -- cgit v1.2.3 From 8bc7ba3d265d6ee698de4b1941b7e8f7d91a0562 Mon Sep 17 00:00:00 2001 From: Israel Batista Date: Wed, 29 Oct 2025 19:56:30 +0000 Subject: mm: change type of state in struct memory_block The state of a memory block should be restricted to values specified in the documentation of the memory hotplug API. However, since the state field in the memory_block struct was defined as an unsigned long, this restriction was not enforced at compile time. With the introduction of the enum memory_block_state, it is now possible to incorporate the desired semantics in the field declaration and enforce these restrictions at compile time. [akpm@linux-foundation.org: fix whitespace, per Randy] Link: https://lkml.kernel.org/r/20251029195617.2210700-3-linux@israelbatista.dev.br Signed-off-by: Israel Batista Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Lorenzo Stoakes Cc: Omar Sandoval Cc: Randy Dunlap Signed-off-by: Andrew Morton --- drivers/base/memory.c | 2 +- include/linux/memory.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 6d84a02cfa5d..3d17dd774947 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -198,7 +198,7 @@ static ssize_t state_show(struct device *dev, struct device_attribute *attr, break; default: WARN_ON(1); - return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state); + return sysfs_emit(buf, "ERROR-UNKNOWN-%d\n", mem->state); } return sysfs_emit(buf, "%s\n", output); diff --git a/include/linux/memory.h b/include/linux/memory.h index f4e358477c6a..ca20cbdd71f2 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -78,7 +78,7 @@ enum memory_block_state { struct memory_block { unsigned long start_section_nr; - unsigned long state; /* serialized by the dev->lock */ + enum memory_block_state state; /* serialized by the dev->lock */ int online_type; /* for passing data to online routine */ int nid; /* NID for this memory block */ /* -- cgit v1.2.3 From ed1f8855dd7b82a0ad87960b1729a3e848dc5589 Mon Sep 17 00:00:00 2001 From: Israel Batista Date: Wed, 29 Oct 2025 19:56:32 +0000 Subject: mm: change type of parameter for memory_notify memory_notify() is responsible for sending events related to memory hotplugging to a notification queue. Since all the events must match one of the values from the enum memory_block_state, it is appropriate to change the function parameter type to make this condition explicit at compile time. Link: https://lkml.kernel.org/r/20251029195617.2210700-4-linux@israelbatista.dev.br Signed-off-by: Israel Batista Acked-by: Mike Rapoport (Microsoft) Cc: David Hildenbrand Cc: Lorenzo Stoakes Cc: Omar Sandoval Cc: Randy Dunlap Signed-off-by: Andrew Morton --- drivers/base/memory.c | 4 ++-- include/linux/memory.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 3d17dd774947..c03f3b5e5e6f 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -204,9 +204,9 @@ static ssize_t state_show(struct device *dev, struct device_attribute *attr, return sysfs_emit(buf, "%s\n", output); } -int memory_notify(unsigned long val, void *v) +int memory_notify(enum memory_block_state state, void *v) { - return blocking_notifier_call_chain(&memory_chain, val, v); + return blocking_notifier_call_chain(&memory_chain, state, v); } #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) diff --git a/include/linux/memory.h b/include/linux/memory.h index ca20cbdd71f2..ca3eb1db6cc8 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -141,7 +141,7 @@ static inline int register_memory_notifier(struct notifier_block *nb) static inline void unregister_memory_notifier(struct notifier_block *nb) { } -static inline int memory_notify(unsigned long val, void *v) +static inline int memory_notify(enum memory_block_state state, void *v) { return 0; } @@ -165,7 +165,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size, struct memory_group *group); void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); -extern int memory_notify(unsigned long val, void *v); +extern int memory_notify(enum memory_block_state state, void *v); extern struct memory_block *find_memory_block(unsigned long section_nr); typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *); extern int walk_memory_blocks(unsigned long start, unsigned long size, -- cgit v1.2.3 From 2ec41967189cd65a8f79c760dd1b50c4f56e8ac6 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Sun, 2 Nov 2025 18:44:33 +0000 Subject: mm: handle poisoning of pfn without struct pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Poison (or ECC) errors can be very common on a large size cluster. The kernel MM currently does not handle ECC errors / poison on a memory region that is not backed by struct pages. If a memory region mapped using remap_pfn_range() for example, but not added to the kernel, MM will not have associated struct pages. Add a new mechanism to handle memory failure on such memory. Make kernel MM expose a function to allow modules managing the device memory to register the device memory SPA and the address space associated it. MM maintains this information as an interval tree. On poison, MM can search for the range that the poisoned PFN belong and use the address_space to determine the mapping VMA. In this implementation, kernel MM follows the following sequence that is largely similar to the memory_failure() handler for struct page backed memory: 1. memory_failure() is triggered on reception of a poison error. An absence of struct page is detected and consequently memory_failure_pfn() is executed. 2. memory_failure_pfn() collects the processes mapped to the PFN. 3. memory_failure_pfn() sends SIGBUS to all the processes mapping the faulty PFN using kill_procs(). Note that there is one primary difference versus the handling of the poison on struct pages, which is to skip unmapping to the faulty PFN. This is done to handle the huge PFNMAP support added recently [1] that enables VM_PFNMAP vmas to map at PMD or PUD level. A poison to a PFN mapped in such as way would need breaking the PMD/PUD mapping into PTEs that will get mirrored into the S2. This can greatly increase the cost of table walks and have a major performance impact. Link: https://lore.kernel.org/all/20240826204353.2228736-1-peterx@redhat.com/ [1] Link: https://lkml.kernel.org/r/20251102184434.2406-3-ankita@nvidia.com Signed-off-by: Ankit Agrawal Cc: Aniket Agashe Cc: Borislav Betkov Cc: David Hildenbrand Cc: Hanjun Guo Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Kevin Tian Cc: Kirti Wankhede Cc: Len Brown Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Luck, Tony" Cc: Matthew R. Ochs Cc: Mauro Carvalho Chehab Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Naoya Horiguchi Cc: Neo Jia Cc: Peter Zijlstra Cc: Shuai Xue Cc: Smita Koralahalli Channabasappa Cc: Suren Baghdasaryan Cc: Tarun Gupta Cc: Uwe Kleine-König Cc: Vikram Sethi Cc: Vlastimil Babka Cc: Zhi Wang Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + include/linux/memory-failure.h | 17 +++++ include/linux/mm.h | 1 + include/ras/ras_event.h | 1 + mm/Kconfig | 1 + mm/memory-failure.c | 145 ++++++++++++++++++++++++++++++++++++++++- 6 files changed, 165 insertions(+), 1 deletion(-) create mode 100644 include/linux/memory-failure.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 2625bc3d53d8..5cf6873569d3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11557,6 +11557,7 @@ M: Miaohe Lin R: Naoya Horiguchi L: linux-mm@kvack.org S: Maintained +F: include/linux/memory-failure.h F: mm/hwpoison-inject.c F: mm/memory-failure.c diff --git a/include/linux/memory-failure.h b/include/linux/memory-failure.h new file mode 100644 index 000000000000..bc326503d2d2 --- /dev/null +++ b/include/linux/memory-failure.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_MEMORY_FAILURE_H +#define _LINUX_MEMORY_FAILURE_H + +#include + +struct pfn_address_space; + +struct pfn_address_space { + struct interval_tree_node node; + struct address_space *mapping; +}; + +int register_pfn_address_space(struct pfn_address_space *pfn_space); +void unregister_pfn_address_space(struct pfn_address_space *pfn_space); + +#endif /* _LINUX_MEMORY_FAILURE_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 7bcd9e6fbc3c..b636d12bb651 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4285,6 +4285,7 @@ enum mf_action_page_type { MF_MSG_DAX, MF_MSG_UNSPLIT_THP, MF_MSG_ALREADY_POISONED, + MF_MSG_PFN_MAP, MF_MSG_UNKNOWN, }; diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index c8cd0f00c845..fecfeb7c8be7 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -375,6 +375,7 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_DAX, "dax page" ) \ EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \ EM ( MF_MSG_ALREADY_POISONED, "already poisoned" ) \ + EM ( MF_MSG_PFN_MAP, "non struct page pfn" ) \ EMe ( MF_MSG_UNKNOWN, "unknown page" ) /* diff --git a/mm/Kconfig b/mm/Kconfig index eae03b14f7de..d548976d0e0a 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -741,6 +741,7 @@ config MEMORY_FAILURE depends on ARCH_SUPPORTS_MEMORY_FAILURE bool "Enable recovery from hardware memory errors" select RAS + select INTERVAL_TREE help Enables code to recover from some memory failures on systems with MCA recovery. This allows a system to continue running diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 560884dd6250..77391b6f9f76 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -38,6 +38,7 @@ #include #include +#include #include #include #include @@ -154,6 +155,10 @@ static const struct ctl_table memory_failure_table[] = { } }; +static struct rb_root_cached pfn_space_itree = RB_ROOT_CACHED; + +static DEFINE_MUTEX(pfn_space_lock); + /* * Return values: * 1: the page is dissolved (if needed) and taken off from buddy, @@ -885,6 +890,7 @@ static const char * const action_page_types[] = { [MF_MSG_DAX] = "dax page", [MF_MSG_UNSPLIT_THP] = "unsplit thp", [MF_MSG_ALREADY_POISONED] = "already poisoned page", + [MF_MSG_PFN_MAP] = "non struct page pfn", [MF_MSG_UNKNOWN] = "unknown page", }; @@ -1277,7 +1283,7 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type, { trace_memory_failure_event(pfn, type, result); - if (type != MF_MSG_ALREADY_POISONED) { + if (type != MF_MSG_ALREADY_POISONED && type != MF_MSG_PFN_MAP) { num_poisoned_pages_inc(pfn); update_per_node_mf_stats(pfn, result); } @@ -2147,6 +2153,135 @@ static void kill_procs_now(struct page *p, unsigned long pfn, int flags, kill_procs(&tokill, true, pfn, flags); } +int register_pfn_address_space(struct pfn_address_space *pfn_space) +{ + guard(mutex)(&pfn_space_lock); + + if (interval_tree_iter_first(&pfn_space_itree, + pfn_space->node.start, + pfn_space->node.last)) + return -EBUSY; + + interval_tree_insert(&pfn_space->node, &pfn_space_itree); + + return 0; +} +EXPORT_SYMBOL_GPL(register_pfn_address_space); + +void unregister_pfn_address_space(struct pfn_address_space *pfn_space) +{ + guard(mutex)(&pfn_space_lock); + + if (interval_tree_iter_first(&pfn_space_itree, + pfn_space->node.start, + pfn_space->node.last)) + interval_tree_remove(&pfn_space->node, &pfn_space_itree); +} +EXPORT_SYMBOL_GPL(unregister_pfn_address_space); + +static void add_to_kill_pfn(struct task_struct *tsk, + struct vm_area_struct *vma, + struct list_head *to_kill, + unsigned long pfn) +{ + struct to_kill *tk; + + tk = kmalloc(sizeof(*tk), GFP_ATOMIC); + if (!tk) { + pr_info("Unable to kill proc %d\n", tsk->pid); + return; + } + + /* Check for pgoff not backed by struct page */ + tk->addr = vma_address(vma, pfn, 1); + tk->size_shift = PAGE_SHIFT; + + if (tk->addr == -EFAULT) + pr_info("Unable to find address %lx in %s\n", + pfn, tsk->comm); + + get_task_struct(tsk); + tk->tsk = tsk; + list_add_tail(&tk->nd, to_kill); +} + +/* + * Collect processes when the error hit a PFN not backed by struct page. + */ +static void collect_procs_pfn(struct address_space *mapping, + unsigned long pfn, struct list_head *to_kill) +{ + struct vm_area_struct *vma; + struct task_struct *tsk; + + i_mmap_lock_read(mapping); + rcu_read_lock(); + for_each_process(tsk) { + struct task_struct *t = tsk; + + t = task_early_kill(tsk, true); + if (!t) + continue; + vma_interval_tree_foreach(vma, &mapping->i_mmap, pfn, pfn) { + if (vma->vm_mm == t->mm) + add_to_kill_pfn(t, vma, to_kill, pfn); + } + } + rcu_read_unlock(); + i_mmap_unlock_read(mapping); +} + +/** + * memory_failure_pfn - Handle memory failure on a page not backed by + * struct page. + * @pfn: Page Number of the corrupted page + * @flags: fine tune action taken + * + * Return: + * 0 - success, + * -EBUSY - Page PFN does not belong to any address space mapping. + */ +static int memory_failure_pfn(unsigned long pfn, int flags) +{ + struct interval_tree_node *node; + LIST_HEAD(tokill); + + scoped_guard(mutex, &pfn_space_lock) { + bool mf_handled = false; + + /* + * Modules registers with MM the address space mapping to + * the device memory they manage. Iterate to identify + * exactly which address space has mapped to this failing + * PFN. + */ + for (node = interval_tree_iter_first(&pfn_space_itree, pfn, pfn); node; + node = interval_tree_iter_next(node, pfn, pfn)) { + struct pfn_address_space *pfn_space = + container_of(node, struct pfn_address_space, node); + + collect_procs_pfn(pfn_space->mapping, pfn, &tokill); + + mf_handled = true; + } + + if (!mf_handled) + return action_result(pfn, MF_MSG_PFN_MAP, MF_IGNORED); + } + + /* + * Unlike System-RAM there is no possibility to swap in a different + * physical page at a given virtual address, so all userspace + * consumption of direct PFN memory necessitates SIGBUS (i.e. + * MF_MUST_KILL) + */ + flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; + + kill_procs(&tokill, true, pfn, flags); + + return action_result(pfn, MF_MSG_PFN_MAP, MF_RECOVERED); +} + /** * memory_failure - Handle memory failure of a page. * @pfn: Page Number of the corrupted page @@ -2196,6 +2331,14 @@ int memory_failure(unsigned long pfn, int flags) if (res == 0) goto unlock_mutex; + if (!pfn_valid(pfn) && !arch_is_platform_page(PFN_PHYS(pfn))) { + /* + * The PFN is not backed by struct page. + */ + res = memory_failure_pfn(pfn, flags); + goto unlock_mutex; + } + if (pfn_valid(pfn)) { pgmap = get_dev_pagemap(pfn); put_ref_page(pfn, flags); -- cgit v1.2.3 From 340b59816bc417c306cd76b867914cfb4f386d2d Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 4 Nov 2025 16:57:09 +0800 Subject: mm: kill mm_wr_locked from unmap_vmas() and unmap_single_vma() Kill mm_wr_locked since commit f8e97613fed2 ("mm: convert VM_PFNMAP tracking to pfnmap_track() + pfnmap_untrack()") remove the user. Link: https://lkml.kernel.org/r/20251104085709.2688433-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Acked-by: David Hildenbrand (Red Hat) Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- mm/memory.c | 12 ++++-------- mm/mmap.c | 2 +- mm/vma.c | 5 ++--- tools/testing/vma/vma_internal.h | 3 +-- 5 files changed, 9 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index b636d12bb651..df9f258a017c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2480,7 +2480,7 @@ static inline void zap_vma_pages(struct vm_area_struct *vma) } void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *start_vma, unsigned long start, - unsigned long end, unsigned long tree_end, bool mm_wr_locked); + unsigned long end, unsigned long tree_end); struct mmu_notifier_range; diff --git a/mm/memory.c b/mm/memory.c index 8d8c36adafa8..b09de6274da3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2023,8 +2023,7 @@ void unmap_page_range(struct mmu_gather *tlb, static void unmap_single_vma(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr, - struct zap_details *details, bool mm_wr_locked) + unsigned long end_addr, struct zap_details *details) { unsigned long start = max(vma->vm_start, start_addr); unsigned long end; @@ -2070,7 +2069,6 @@ static void unmap_single_vma(struct mmu_gather *tlb, * @start_addr: virtual address at which to start unmapping * @end_addr: virtual address at which to end unmapping * @tree_end: The maximum index to check - * @mm_wr_locked: lock flag * * Unmap all pages in the vma list. * @@ -2085,8 +2083,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, */ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr, unsigned long tree_end, - bool mm_wr_locked) + unsigned long end_addr, unsigned long tree_end) { struct mmu_notifier_range range; struct zap_details details = { @@ -2102,8 +2099,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, unsigned long start = start_addr; unsigned long end = end_addr; hugetlb_zap_begin(vma, &start, &end); - unmap_single_vma(tlb, vma, start, end, &details, - mm_wr_locked); + unmap_single_vma(tlb, vma, start, end, &details); hugetlb_zap_end(vma, &details); vma = mas_find(mas, tree_end - 1); } while (vma && likely(!xa_is_zero(vma))); @@ -2139,7 +2135,7 @@ void zap_page_range_single_batched(struct mmu_gather *tlb, * unmap 'address-end' not 'range.start-range.end' as range * could have been expanded for hugetlb pmd sharing. */ - unmap_single_vma(tlb, vma, address, end, details, false); + unmap_single_vma(tlb, vma, address, end, details); mmu_notifier_invalidate_range_end(&range); if (is_vm_hugetlb_page(vma)) { /* diff --git a/mm/mmap.c b/mm/mmap.c index 644f02071a41..4f51ca644903 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1274,7 +1274,7 @@ void exit_mmap(struct mm_struct *mm) tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */ - unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX, false); + unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX); mmap_read_unlock(mm); /* diff --git a/mm/vma.c b/mm/vma.c index 919d1fc63a52..0c5e391fe2e2 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -483,8 +483,7 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); - unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end, - /* mm_wr_locked = */ true); + unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end); mas_set(mas, vma->vm_end); free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, next ? next->vm_start : USER_PGTABLES_CEILING, @@ -1228,7 +1227,7 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms, tlb_gather_mmu(&tlb, vms->vma->vm_mm); update_hiwater_rss(vms->vma->vm_mm); unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end, - vms->vma_count, mm_wr_locked); + vms->vma_count); mas_set(mas_detach, 1); /* start and end may be different if there is no prev or next vma. */ diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index d873667704e8..c68d382dac81 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -848,8 +848,7 @@ static inline void update_hiwater_vm(struct mm_struct *mm) static inline void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr, unsigned long tree_end, - bool mm_wr_locked) + unsigned long end_addr, unsigned long tree_end) { } -- cgit v1.2.3 From 5dba5cc2e0ffa76f2f6c8922a04469dc9602c396 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 18 Nov 2025 10:17:43 +0000 Subject: mm: introduce VM_MAYBE_GUARD and make visible in /proc/$pid/smaps Patch series "introduce VM_MAYBE_GUARD and make it sticky", v4. Currently, guard regions are not visible to users except through /proc/$pid/pagemap, with no explicit visibility at the VMA level. This makes the feature less useful, as it isn't entirely apparent which VMAs may have these entries present, especially when performing actions which walk through memory regions such as those performed by CRIU. This series addresses this issue by introducing the VM_MAYBE_GUARD flag which fulfils this role, updating the smaps logic to display an entry for these. The semantics of this flag are that a guard region MAY be present if set (we cannot be sure, as we can't efficiently track whether an MADV_GUARD_REMOVE finally removes all the guard regions in a VMA) - but if not set the VMA definitely does NOT have any guard regions present. It's problematic to establish this flag without further action, because that means that VMAs with guard regions in them become non-mergeable with adjacent VMAs for no especially good reason. To work around this, this series also introduces the concept of 'sticky' VMA flags - that is flags which: a. if set in one VMA and not in another still permit those VMAs to be merged (if otherwise compatible). b. When they are merged, the resultant VMA must have the flag set. The VMA logic is updated to propagate these flags correctly. Additionally, VM_MAYBE_GUARD being an explicit VMA flag allows us to solve an issue with file-backed guard regions - previously these established an anon_vma object for file-backed mappings solely to have vma_needs_copy() correctly propagate guard region mappings to child processes. We introduce a new flag alias VM_COPY_ON_FORK (which currently only specifies VM_MAYBE_GUARD) and update vma_needs_copy() to check explicitly for this flag and to copy page tables if it is present, which resolves this issue. Additionally, we add the ability for allow-listed VMA flags to be atomically writable with only mmap/VMA read locks held. The only flag we allow so far is VM_MAYBE_GUARD, which we carefully ensure does not cause any races by being allowed to do so. This allows us to maintain guard region installation as a read-locked operation and not endure the overhead of obtaining a write lock here. Finally we introduce extensive VMA userland tests to assert that the sticky VMA logic behaves correctly as well as guard region self tests to assert that smaps visibility is correctly implemented. This patch (of 9): Currently, if a user needs to determine if guard regions are present in a range, they have to scan all VMAs (or have knowledge of which ones might have guard regions). Since commit 8e2f2aeb8b48 ("fs/proc/task_mmu: add guard region bit to pagemap") and the related commit a516403787e0 ("fs/proc: extend the PAGEMAP_SCAN ioctl to report guard regions"), users can use either /proc/$pid/pagemap or the PAGEMAP_SCAN functionality to perform this operation at a virtual address level. This is not ideal, and it gives no visibility at a /proc/$pid/smaps level that guard regions exist in ranges. This patch remedies the situation by establishing a new VMA flag, VM_MAYBE_GUARD, to indicate that a VMA may contain guard regions (it is uncertain because we cannot reasonably determine whether a MADV_GUARD_REMOVE call has removed all of the guard regions in a VMA, and additionally VMAs may change across merge/split). We utilise 0x800 for this flag which makes it available to 32-bit architectures also, a flag that was previously used by VM_DENYWRITE, which was removed in commit 8d0920bde5eb ("mm: remove VM_DENYWRITE") and hasn't bee reused yet. We also update the smaps logic and documentation to identify these VMAs. Another major use of this functionality is that we can use it to identify that we ought to copy page tables on fork. We do not actually implement usage of this flag in mm/madvise.c yet as we need to allow some VMA flags to be applied atomically under mmap/VMA read lock in order to avoid the need to acquire a write lock for this purpose. Link: https://lkml.kernel.org/r/cover.1763460113.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/cf8ef821eba29b6c5b5e138fffe95d6dcabdedb9.1763460113.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Pedro Falcato Reviewed-by: Vlastimil Babka Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Lance Yang Cc: Andrei Vagin Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Jann Horn Cc: Jonathan Corbet Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.rst | 5 +++-- fs/proc/task_mmu.c | 1 + include/linux/mm.h | 3 +++ include/trace/events/mmflags.h | 1 + mm/memory.c | 4 ++++ tools/testing/vma/vma_internal.h | 1 + 6 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 0b86a8022fa1..8256e857e2d7 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -553,7 +553,7 @@ otherwise. kernel flags associated with the particular virtual memory area in two letter encoded manner. The codes are the following: - == ======================================= + == ============================================================= rd readable wr writeable ex executable @@ -591,7 +591,8 @@ encoded manner. The codes are the following: sl sealed lf lock on fault pages dp always lazily freeable mapping - == ======================================= + gu maybe contains guard regions (if not set, definitely doesn't) + == ============================================================= Note that there is no guarantee that every flag and associated mnemonic will be present in all further kernel releases. Things get changed, the flags may diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index fc35a0543f01..db16ed91c269 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1146,6 +1146,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_MAYSHARE)] = "ms", [ilog2(VM_GROWSDOWN)] = "gd", [ilog2(VM_PFNMAP)] = "pf", + [ilog2(VM_MAYBE_GUARD)] = "gu", [ilog2(VM_LOCKED)] = "lo", [ilog2(VM_IO)] = "io", [ilog2(VM_SEQ_READ)] = "sr", diff --git a/include/linux/mm.h b/include/linux/mm.h index df9f258a017c..36b9418c00fc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -271,6 +271,8 @@ extern struct rw_semaphore nommu_region_sem; extern unsigned int kobjsize(const void *objp); #endif +#define VM_MAYBE_GUARD_BIT 11 + /* * vm_flags in vm_area_struct, see mm_types.h. * When changing, update also include/trace/events/mmflags.h @@ -296,6 +298,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_UFFD_MISSING 0 #endif /* CONFIG_MMU */ #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ +#define VM_MAYBE_GUARD BIT(VM_MAYBE_GUARD_BIT) /* The VMA maybe contains guard regions. */ #define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ #define VM_LOCKED 0x00002000 diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index aa441f593e9a..a6e5a44c9b42 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -213,6 +213,7 @@ IF_HAVE_PG_ARCH_3(arch_3) {VM_UFFD_MISSING, "uffd_missing" }, \ IF_HAVE_UFFD_MINOR(VM_UFFD_MINOR, "uffd_minor" ) \ {VM_PFNMAP, "pfnmap" }, \ + {VM_MAYBE_GUARD, "maybe_guard" }, \ {VM_UFFD_WP, "uffd_wp" }, \ {VM_LOCKED, "locked" }, \ {VM_IO, "io" }, \ diff --git a/mm/memory.c b/mm/memory.c index b09de6274da3..d1728d0538d6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1478,6 +1478,10 @@ vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) if (src_vma->anon_vma) return true; + /* Guard regions have modified page tables that require copying. */ + if (src_vma->vm_flags & VM_MAYBE_GUARD) + return true; + /* * Don't copy ptes where a page fault will fill them correctly. Fork * becomes much lighter when there are big shared or private readonly diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index c68d382dac81..46acb4df45de 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -56,6 +56,7 @@ extern unsigned long dac_mmap_min_addr; #define VM_MAYEXEC 0x00000040 #define VM_GROWSDOWN 0x00000100 #define VM_PFNMAP 0x00000400 +#define VM_MAYBE_GUARD 0x00000800 #define VM_LOCKED 0x00002000 #define VM_IO 0x00004000 #define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ -- cgit v1.2.3 From 568822502383acd57d7cc1c72ee43932c45a9524 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 18 Nov 2025 10:17:44 +0000 Subject: mm: add atomic VMA flags and set VM_MAYBE_GUARD as such This patch adds the ability to atomically set VMA flags with only the mmap read/VMA read lock held. As this could be hugely problematic for VMA flags in general given that all other accesses are non-atomic and serialised by the mmap/VMA locks, we implement this with a strict allow-list - that is, only designated flags are allowed to do this. We make VM_MAYBE_GUARD one of these flags. Link: https://lkml.kernel.org/r/97e57abed09f2663077ed7a36fb8206e243171a9.1763460113.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Pedro Falcato Reviewed-by: Vlastimil Babka Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Lance Yang Cc: Andrei Vagin Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Jann Horn Cc: Jonathan Corbet Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 36b9418c00fc..03776aab3837 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -518,6 +518,9 @@ extern unsigned int kobjsize(const void *objp); /* This mask represents all the VMA flag bits used by mlock */ #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) +/* These flags can be updated atomically via VMA/mmap read lock. */ +#define VM_ATOMIC_SET_ALLOWED VM_MAYBE_GUARD + /* Arch-specific flags to clear when updating VM flags on protection change */ #ifndef VM_ARCH_CLEAR # define VM_ARCH_CLEAR VM_NONE @@ -860,6 +863,47 @@ static inline void vm_flags_mod(struct vm_area_struct *vma, __vm_flags_mod(vma, set, clear); } +static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma, + int bit) +{ + const vm_flags_t mask = BIT(bit); + + /* Only specific flags are permitted */ + if (WARN_ON_ONCE(!(mask & VM_ATOMIC_SET_ALLOWED))) + return false; + + return true; +} + +/* + * Set VMA flag atomically. Requires only VMA/mmap read lock. Only specific + * valid flags are allowed to do this. + */ +static inline void vma_flag_set_atomic(struct vm_area_struct *vma, int bit) +{ + /* mmap read lock/VMA read lock must be held. */ + if (!rwsem_is_locked(&vma->vm_mm->mmap_lock)) + vma_assert_locked(vma); + + if (__vma_flag_atomic_valid(vma, bit)) + set_bit(bit, &ACCESS_PRIVATE(vma, __vm_flags)); +} + +/* + * Test for VMA flag atomically. Requires no locks. Only specific valid flags + * are allowed to do this. + * + * This is necessarily racey, so callers must ensure that serialisation is + * achieved through some other means, or that races are permissible. + */ +static inline bool vma_flag_test_atomic(struct vm_area_struct *vma, int bit) +{ + if (__vma_flag_atomic_valid(vma, bit)) + return test_bit(bit, &vma->vm_flags); + + return false; +} + static inline void vma_set_anonymous(struct vm_area_struct *vma) { vma->vm_ops = NULL; -- cgit v1.2.3 From 64212ba02e66e705cabce188453ba4e61e9d7325 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 18 Nov 2025 10:17:46 +0000 Subject: mm: implement sticky VMA flags It is useful to be able to designate that certain flags are 'sticky', that is, if two VMAs are merged one with a flag of this nature and one without, the merged VMA sets this flag. As a result we ignore these flags for the purposes of determining VMA flag differences between VMAs being considered for merge. This patch therefore updates the VMA merge logic to perform this action, with flags possessing this property being described in the VM_STICKY bitmap. Those flags which ought to be ignored for the purposes of VMA merge are described in the VM_IGNORE_MERGE bitmap, which the VMA merge logic is also updated to use. As part of this change we place VM_SOFTDIRTY in VM_IGNORE_MERGE as it already had this behaviour, alongside VM_STICKY as sticky flags by implication must not disallow merge. Ultimately it seems that we should make VM_SOFTDIRTY a sticky flag in its own right, but this change is out of scope for this series. The only sticky flag designated as such is VM_MAYBE_GUARD, so as a result of this change, once the VMA flag is set upon guard region installation, VMAs with guard ranges will now not have their merge behaviour impacted as a result and can be freely merged with other VMAs without VM_MAYBE_GUARD set. Also update the comments for vma_modify_flags() to directly reference sticky flags now we have established the concept. We also update the VMA userland tests to account for the changes. Link: https://lkml.kernel.org/r/22ad5269f7669d62afb42ce0c79bad70b994c58d.1763460113.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Pedro Falcato Reviewed-by: Vlastimil Babka Cc: Andrei Vagin Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand (Red Hat) Cc: Dev Jain Cc: Jann Horn Cc: Jonathan Corbet Cc: Lance Yang Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 28 ++++++++++++++++++++++++++++ mm/vma.c | 28 +++++++++++++++------------- mm/vma.h | 10 ++++------ tools/testing/vma/vma_internal.h | 28 ++++++++++++++++++++++++++++ 4 files changed, 75 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 03776aab3837..fea113d1d723 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -527,6 +527,34 @@ extern unsigned int kobjsize(const void *objp); #endif #define VM_FLAGS_CLEAR (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR) +/* + * Flags which should be 'sticky' on merge - that is, flags which, when one VMA + * possesses it but the other does not, the merged VMA should nonetheless have + * applied to it: + * + * VM_MAYBE_GUARD - If a VMA may have guard regions in place it implies that + * mapped page tables may contain metadata not described by the + * VMA and thus any merged VMA may also contain this metadata, + * and thus we must make this flag sticky. + */ +#define VM_STICKY VM_MAYBE_GUARD + +/* + * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one + * of these flags and the other not does not preclude a merge. + * + * VM_SOFTDIRTY - Should not prevent from VMA merging, if we match the flags but + * dirty bit -- the caller should mark merged VMA as dirty. If + * dirty bit won't be excluded from comparison, we increase + * pressure on the memory system forcing the kernel to generate + * new VMAs when old one could be extended instead. + * + * VM_STICKY - When merging VMAs, VMA flags must match, unless they are + * 'sticky'. If any sticky flags exist in either VMA, we simply + * set all of them on the merged VMA. + */ +#define VM_IGNORE_MERGE (VM_SOFTDIRTY | VM_STICKY) + /* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. diff --git a/mm/vma.c b/mm/vma.c index 47469c036a72..4e21c988054d 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -89,15 +89,7 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex if (!mpol_equal(vmg->policy, vma_policy(vma))) return false; - /* - * VM_SOFTDIRTY should not prevent from VMA merging, if we - * match the flags but dirty bit -- the caller should mark - * merged VMA as dirty. If dirty bit won't be excluded from - * comparison, we increase pressure on the memory system forcing - * the kernel to generate new VMAs when old one could be - * extended instead. - */ - if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_SOFTDIRTY) + if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_IGNORE_MERGE) return false; if (vma->vm_file != vmg->file) return false; @@ -808,6 +800,7 @@ static bool can_merge_remove_vma(struct vm_area_struct *vma) static __must_check struct vm_area_struct *vma_merge_existing_range( struct vma_merge_struct *vmg) { + vm_flags_t sticky_flags = vmg->vm_flags & VM_STICKY; struct vm_area_struct *middle = vmg->middle; struct vm_area_struct *prev = vmg->prev; struct vm_area_struct *next; @@ -900,11 +893,13 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( if (merge_right) { vma_start_write(next); vmg->target = next; + sticky_flags |= (next->vm_flags & VM_STICKY); } if (merge_left) { vma_start_write(prev); vmg->target = prev; + sticky_flags |= (prev->vm_flags & VM_STICKY); } if (merge_both) { @@ -974,6 +969,7 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( if (err || commit_merge(vmg)) goto abort; + vm_flags_set(vmg->target, sticky_flags); khugepaged_enter_vma(vmg->target, vmg->vm_flags); vmg->state = VMA_MERGE_SUCCESS; return vmg->target; @@ -1124,6 +1120,10 @@ int vma_expand(struct vma_merge_struct *vmg) bool remove_next = false; struct vm_area_struct *target = vmg->target; struct vm_area_struct *next = vmg->next; + vm_flags_t sticky_flags; + + sticky_flags = vmg->vm_flags & VM_STICKY; + sticky_flags |= target->vm_flags & VM_STICKY; VM_WARN_ON_VMG(!target, vmg); @@ -1133,6 +1133,7 @@ int vma_expand(struct vma_merge_struct *vmg) if (next && (target != next) && (vmg->end == next->vm_end)) { int ret; + sticky_flags |= next->vm_flags & VM_STICKY; remove_next = true; /* This should already have been checked by this point. */ VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg); @@ -1159,6 +1160,7 @@ int vma_expand(struct vma_merge_struct *vmg) if (commit_merge(vmg)) goto nomem; + vm_flags_set(target, sticky_flags); return 0; nomem: @@ -1654,9 +1656,9 @@ struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, return ret; /* - * For a merge to succeed, the flags must match those requested. For - * flags which do not obey typical merge rules (i.e. do not need to - * match), we must let the caller know about them. + * For a merge to succeed, the flags must match those + * requested. However, sticky flags may have been retained, so propagate + * them to the caller. */ if (vmg.state == VMA_MERGE_SUCCESS) *vm_flags_ptr = ret->vm_flags; @@ -1906,7 +1908,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct * return a->vm_end == b->vm_start && mpol_equal(vma_policy(a), vma_policy(b)) && a->vm_file == b->vm_file && - !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) && + !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_IGNORE_MERGE)) && b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); } diff --git a/mm/vma.h b/mm/vma.h index 75f1d9c7204b..abada6a64c4e 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -273,17 +273,15 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, * @start: The start of the range to update. May be offset within @vma. * @end: The exclusive end of the range to update, may be offset within @vma. * @vm_flags_ptr: A pointer to the VMA flags that the @start to @end range is - * about to be set to. On merge, this will be updated to include any additional - * flags which remain in place. + * about to be set to. On merge, this will be updated to include sticky flags. * * IMPORTANT: The actual modification being requested here is NOT applied, * rather the VMA is perhaps split, perhaps merged to accommodate the change, * and the caller is expected to perform the actual modification. * - * In order to account for VMA flags which may persist (e.g. soft-dirty), the - * @vm_flags_ptr parameter points to the requested flags which are then updated - * so the caller, should they overwrite any existing flags, correctly retains - * these. + * In order to account for sticky VMA flags, the @vm_flags_ptr parameter points + * to the requested flags which are then updated so the caller, should they + * overwrite any existing flags, correctly retains these. * * Returns: A VMA which contains the range @start to @end ready to have its * flags altered to *@vm_flags. diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 46acb4df45de..73c2025777e6 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -117,6 +117,34 @@ extern unsigned long dac_mmap_min_addr; #define VM_SEALED VM_NONE #endif +/* + * Flags which should be 'sticky' on merge - that is, flags which, when one VMA + * possesses it but the other does not, the merged VMA should nonetheless have + * applied to it: + * + * VM_MAYBE_GUARD - If a VMA may have guard regions in place it implies that + * mapped page tables may contain metadata not described by the + * VMA and thus any merged VMA may also contain this metadata, + * and thus we must make this flag sticky. + */ +#define VM_STICKY VM_MAYBE_GUARD + +/* + * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one + * of these flags and the other not does not preclude a merge. + * + * VM_SOFTDIRTY - Should not prevent from VMA merging, if we match the flags but + * dirty bit -- the caller should mark merged VMA as dirty. If + * dirty bit won't be excluded from comparison, we increase + * pressure on the memory system forcing the kernel to generate + * new VMAs when old one could be extended instead. + * + * VM_STICKY - When merging VMAs, VMA flags must match, unless they are + * 'sticky'. If any sticky flags exist in either VMA, we simply + * set all of them on the merged VMA. + */ +#define VM_IGNORE_MERGE (VM_SOFTDIRTY | VM_STICKY) + #define FIRST_USER_ADDRESS 0UL #define USER_PGTABLES_CEILING 0UL -- cgit v1.2.3 From ab04b530e7e8bd5cf9fb0c1ad20e0deee8f569ec Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 18 Nov 2025 10:17:47 +0000 Subject: mm: introduce copy-on-fork VMAs and make VM_MAYBE_GUARD one Gather all the VMA flags whose presence implies that page tables must be copied on fork into a single bitmap - VM_COPY_ON_FORK - and use this rather than specifying individual flags in vma_needs_copy(). We also add VM_MAYBE_GUARD to this list, as it being set on a VMA implies that there may be metadata contained in the page tables (that is - guard markers) which would will not and cannot be propagated upon fork. This was already being done manually previously in vma_needs_copy(), but this makes it very explicit, alongside VM_PFNMAP, VM_MIXEDMAP and VM_UFFD_WP all of which imply the same. Note that VM_STICKY flags ought generally to be marked VM_COPY_ON_FORK too - because equally a flag being VM_STICKY indicates that the VMA contains metadat that is not propagated by being faulted in - i.e. that the VMA metadata does not fully describe the VMA alone, and thus we must propagate whatever metadata there is on a fork. However, for maximum flexibility, we do not make this necessarily the case here. Link: https://lkml.kernel.org/r/5d41b24e7bc622cda0af92b6d558d7f4c0d1bc8c.1763460113.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Pedro Falcato Reviewed-by: Vlastimil Babka Acked-by: David Hildenbrand (Red Hat) Cc: Andrei Vagin Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Jann Horn Cc: Jonathan Corbet Cc: Lance Yang Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 26 ++++++++++++++++++++++++++ mm/memory.c | 18 ++++-------------- tools/testing/vma/vma_internal.h | 26 ++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index fea113d1d723..af2904aeb163 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -555,6 +555,32 @@ extern unsigned int kobjsize(const void *objp); */ #define VM_IGNORE_MERGE (VM_SOFTDIRTY | VM_STICKY) +/* + * Flags which should result in page tables being copied on fork. These are + * flags which indicate that the VMA maps page tables which cannot be + * reconsistuted upon page fault, so necessitate page table copying upon + * + * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be + * reasonably reconstructed on page fault. + * + * VM_UFFD_WP - Encodes metadata about an installed uffd + * write protect handler, which cannot be + * reconstructed on page fault. + * + * We always copy pgtables when dst_vma has uffd-wp + * enabled even if it's file-backed + * (e.g. shmem). Because when uffd-wp is enabled, + * pgtable contains uffd-wp protection information, + * that's something we can't retrieve from page cache, + * and skip copying will lose those info. + * + * VM_MAYBE_GUARD - Could contain page guard region markers which + * by design are a property of the page tables + * only and thus cannot be reconstructed on page + * fault. + */ +#define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD) + /* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. diff --git a/mm/memory.c b/mm/memory.c index d1728d0538d6..27bc457b32c2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1463,25 +1463,15 @@ copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, static bool vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { + if (src_vma->vm_flags & VM_COPY_ON_FORK) + return true; /* - * Always copy pgtables when dst_vma has uffd-wp enabled even if it's - * file-backed (e.g. shmem). Because when uffd-wp is enabled, pgtable - * contains uffd-wp protection information, that's something we can't - * retrieve from page cache, and skip copying will lose those info. + * The presence of an anon_vma indicates an anonymous VMA has page + * tables which naturally cannot be reconstituted on page fault. */ - if (userfaultfd_wp(dst_vma)) - return true; - - if (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) - return true; - if (src_vma->anon_vma) return true; - /* Guard regions have modified page tables that require copying. */ - if (src_vma->vm_flags & VM_MAYBE_GUARD) - return true; - /* * Don't copy ptes where a page fault will fill them correctly. Fork * becomes much lighter when there are big shared or private readonly diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 73c2025777e6..233819a9e7ee 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -145,6 +145,32 @@ extern unsigned long dac_mmap_min_addr; */ #define VM_IGNORE_MERGE (VM_SOFTDIRTY | VM_STICKY) +/* + * Flags which should result in page tables being copied on fork. These are + * flags which indicate that the VMA maps page tables which cannot be + * reconsistuted upon page fault, so necessitate page table copying upon + * + * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be + * reasonably reconstructed on page fault. + * + * VM_UFFD_WP - Encodes metadata about an installed uffd + * write protect handler, which cannot be + * reconstructed on page fault. + * + * We always copy pgtables when dst_vma has uffd-wp + * enabled even if it's file-backed + * (e.g. shmem). Because when uffd-wp is enabled, + * pgtable contains uffd-wp protection information, + * that's something we can't retrieve from page cache, + * and skip copying will lose those info. + * + * VM_MAYBE_GUARD - Could contain page guard region markers which + * by design are a property of the page tables + * only and thus cannot be reconstructed on page + * fault. + */ +#define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD) + #define FIRST_USER_ADDRESS 0UL #define USER_PGTABLES_CEILING 0UL -- cgit v1.2.3 From bc8e51c05ad50a5a0b02114d3cc94d151a332595 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 7 Nov 2025 15:40:41 -0800 Subject: mm: memcg: dump memcg protection info on oom or alloc failures Currently kernel dumps memory state on oom and allocation failures. One of the question usually raised on those dumps is why the kernel has not reclaimed the reclaimable memory instead of triggering oom. One potential reason is the usage of memory protection provided by memcg. So, let's also dump the memory protected by the memcg in such reports to ease the debugging. Link: https://lkml.kernel.org/r/20251107234041.3632644-1-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Roman Gushchin Cc: Johannes Weiner Cc: Shakeel Butt Cc: Muchun Song Cc: David Rientjes Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 5 +++++ mm/memcontrol.c | 13 +++++++++++++ mm/oom_kill.c | 1 + mm/page_alloc.c | 1 + 4 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 8c0f15e5978f..966f7c1a0128 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1764,6 +1764,7 @@ static inline void count_objcg_events(struct obj_cgroup *objcg, bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid); +void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg); #else static inline bool mem_cgroup_kmem_disabled(void) { @@ -1830,6 +1831,10 @@ static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid) { return true; } + +static inline void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg) +{ +} #endif /* CONFIG_MEMCG */ #if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 025da46d9959..bfc986da3289 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5635,3 +5635,16 @@ bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid) { return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true; } + +void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg) +{ + if (mem_cgroup_disabled() || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return; + + if (!memcg) + memcg = root_mem_cgroup; + + pr_warn("Memory cgroup min protection %lukB -- low protection %lukB", + K(atomic_long_read(&memcg->memory.children_min_usage)*PAGE_SIZE), + K(atomic_long_read(&memcg->memory.children_low_usage)*PAGE_SIZE)); +} diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c145b0feecc1..5eb11fbba704 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -472,6 +472,7 @@ static void dump_header(struct oom_control *oc) if (should_dump_unreclaim_slab()) dump_unreclaimable_slab(); } + mem_cgroup_show_protected_memory(oc->memcg); if (sysctl_oom_dump_tasks) dump_tasks(oc); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e4efda1158b2..26be5734253f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3977,6 +3977,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) filter &= ~SHOW_MEM_FILTER_NODES; __show_mem(filter, nodemask, gfp_zone(gfp_mask)); + mem_cgroup_show_protected_memory(NULL); } void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) -- cgit v1.2.3 From 2197bb60f89077603cc580ff752c5cf6388c1099 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 10 Nov 2025 20:32:01 +0000 Subject: mm: add vma_start_write_killable() Patch series "vma_start_write_killable"", v2. When we added the VMA lock, we made a major oversight in not adding a killable variant. That can run us into trouble where a thread takes the VMA lock for read (eg handling a page fault) and then goes out to lunch for an hour (eg doing reclaim). Another thread tries to modify the VMA, taking the mmap_lock for write, then attempts to lock the VMA for write. That blocks on the first thread, and ensures that every other page fault now tries to take the mmap_lock for read. Because everything's in an uninterruptible sleep, we can't kill the task, which makes me angry. This patchset just adds vma_start_write_killable() and converts one caller to use it. Most users are somewhat tricky to convert, so expect follow-up individual patches per call-site which need careful analysis to make sure we've done proper cleanup. This patch (of 2): The vma can be held read-locked for a substantial period of time, eg if memory allocation needs to go into reclaim. It's useful to be able to send fatal signals to threads which are waiting for the write lock. Link: https://lkml.kernel.org/r/20251110203204.1454057-1-willy@infradead.org Link: https://lkml.kernel.org/r/20251110203204.1454057-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Reviewed-by: Liam R. Howlett Reviewed-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Cc: Chris Li Cc: Jann Horn Cc: Matthew Wilcox (Oracle) Cc: Shakeel Butt Signed-off-by: Andrew Morton --- Documentation/mm/process_addrs.rst | 9 ++++++++- include/linux/mmap_lock.h | 30 ++++++++++++++++++++++++++++-- mm/mmap_lock.c | 34 +++++++++++++++++++++++++--------- tools/testing/vma/vma_internal.h | 8 ++++++++ 4 files changed, 69 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/Documentation/mm/process_addrs.rst b/Documentation/mm/process_addrs.rst index be49e2a269e4..7f2f3e87071d 100644 --- a/Documentation/mm/process_addrs.rst +++ b/Documentation/mm/process_addrs.rst @@ -48,7 +48,8 @@ Terminology * **VMA locks** - The VMA lock is at VMA granularity (of course) which behaves as a read/write semaphore in practice. A VMA read lock is obtained via :c:func:`!lock_vma_under_rcu` (and unlocked via :c:func:`!vma_end_read`) and a - write lock via :c:func:`!vma_start_write` (all VMA write locks are unlocked + write lock via vma_start_write() or vma_start_write_killable() + (all VMA write locks are unlocked automatically when the mmap write lock is released). To take a VMA write lock you **must** have already acquired an :c:func:`!mmap_write_lock`. * **rmap locks** - When trying to access VMAs through the reverse mapping via a @@ -907,3 +908,9 @@ Stack expansion Stack expansion throws up additional complexities in that we cannot permit there to be racing page faults, as a result we invoke :c:func:`!vma_start_write` to prevent this in :c:func:`!expand_downwards` or :c:func:`!expand_upwards`. + +------------------------ +Functions and structures +------------------------ + +.. kernel-doc:: include/linux/mmap_lock.h diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index e05da70dc0cb..d53f72dba7fe 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -195,7 +195,8 @@ static inline bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned in return (vma->vm_lock_seq == *mm_lock_seq); } -void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq); +int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, + int state); /* * Begin writing to a VMA. @@ -209,7 +210,30 @@ static inline void vma_start_write(struct vm_area_struct *vma) if (__is_vma_write_locked(vma, &mm_lock_seq)) return; - __vma_start_write(vma, mm_lock_seq); + __vma_start_write(vma, mm_lock_seq, TASK_UNINTERRUPTIBLE); +} + +/** + * vma_start_write_killable - Begin writing to a VMA. + * @vma: The VMA we are going to modify. + * + * Exclude concurrent readers under the per-VMA lock until the currently + * write-locked mmap_lock is dropped or downgraded. + * + * Context: May sleep while waiting for readers to drop the vma read lock. + * Caller must already hold the mmap_lock for write. + * + * Return: 0 for a successful acquisition. -EINTR if a fatal signal was + * received. + */ +static inline __must_check +int vma_start_write_killable(struct vm_area_struct *vma) +{ + unsigned int mm_lock_seq; + + if (__is_vma_write_locked(vma, &mm_lock_seq)) + return 0; + return __vma_start_write(vma, mm_lock_seq, TASK_KILLABLE); } static inline void vma_assert_write_locked(struct vm_area_struct *vma) @@ -283,6 +307,8 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} +static inline __must_check +int vma_start_write_killable(struct vm_area_struct *vma) { return 0; } static inline void vma_assert_write_locked(struct vm_area_struct *vma) { mmap_assert_write_locked(vma->vm_mm); } static inline void vma_assert_attached(struct vm_area_struct *vma) {} diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 0a0db5849b8e..39f341caf32c 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -45,8 +45,15 @@ EXPORT_SYMBOL(__mmap_lock_do_trace_released); #ifdef CONFIG_MMU #ifdef CONFIG_PER_VMA_LOCK -static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching) +/* + * Return value: 0 if vma detached, + * 1 if vma attached with no readers, + * -EINTR if signal received, + */ +static inline int __vma_enter_locked(struct vm_area_struct *vma, + bool detaching, int state) { + int err; unsigned int tgt_refcnt = VMA_LOCK_OFFSET; /* Additional refcnt if the vma is attached. */ @@ -58,15 +65,19 @@ static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached(). */ if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt)) - return false; + return 0; rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_); - rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, + err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, refcount_read(&vma->vm_refcnt) == tgt_refcnt, - TASK_UNINTERRUPTIBLE); + state); + if (err) { + rwsem_release(&vma->vmlock_dep_map, _RET_IP_); + return err; + } lock_acquired(&vma->vmlock_dep_map, _RET_IP_); - return true; + return 1; } static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) @@ -75,16 +86,19 @@ static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) rwsem_release(&vma->vmlock_dep_map, _RET_IP_); } -void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq) +int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, + int state) { - bool locked; + int locked; /* * __vma_enter_locked() returns false immediately if the vma is not * attached, otherwise it waits until refcnt is indicating that vma * is attached with no readers. */ - locked = __vma_enter_locked(vma, false); + locked = __vma_enter_locked(vma, false, state); + if (locked < 0) + return locked; /* * We should use WRITE_ONCE() here because we can have concurrent reads @@ -100,6 +114,8 @@ void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq) __vma_exit_locked(vma, &detached); WARN_ON_ONCE(detached); /* vma should remain attached */ } + + return 0; } EXPORT_SYMBOL_GPL(__vma_start_write); @@ -118,7 +134,7 @@ void vma_mark_detached(struct vm_area_struct *vma) */ if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { /* Wait until vma is detached with no readers. */ - if (__vma_enter_locked(vma, true)) { + if (__vma_enter_locked(vma, true, TASK_UNINTERRUPTIBLE)) { bool detached; __vma_exit_locked(vma, &detached); diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 233819a9e7ee..73a899ba2686 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -952,6 +952,14 @@ static inline void vma_start_write(struct vm_area_struct *vma) vma->vm_lock_seq++; } +static inline __must_check +int vma_start_write_killable(struct vm_area_struct *vma) +{ + /* Used to indicate to tests that a write operation has begun. */ + vma->vm_lock_seq++; + return 0; +} + static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, -- cgit v1.2.3 From 8b02baf37311754518dfe78073583db03fbb0c07 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 12 Nov 2025 07:41:04 -0800 Subject: mm/damon: rename damos core filter helpers to have word core Patch series "mm/damon: misc cleanups". Yet another batch of misc cleanups and refactoring for DAMON code, tests, and documents. First two patches (1and 2) rename DAMOS core filters related code for readability. Three following patches (3-5) refactor page table walk callback functions in DAMON, as suggested by Hugh and David, and I promised. Next two patches (6 and 7) refactor DAMON core layer kunit test and sysfs interface selftest to be simple and deduplicated. Final two patches (8 and 9) fix up sphinx and grammatical errors on documents. This patch (of 9): DAMOS filters handled by the core layer are called core filters, while those handled by the ops layer are called ops filters. They share the same type but are managed in different places since core filters are evaluated before the ops filters. They also have different helper functions that depend on their managed places. The helper functions for ops filters have '_ops_' keyword on their name, so it is easy to know they are for ops filters. Meanwhile, the helper functions for core filters are not having the 'core' keyword on their name. This makes it easy to be mistakenly used for ops filters. Actually there was such a bug. To avoid future mistakes from similar confusions, rename DAMOS core filters helper functions to have a keyword 'core' on their names. Link: https://lkml.kernel.org/r/20251112154114.66053-1-sj@kernel.org Link: https://lkml.kernel.org/r/20251112154114.66053-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Bill Wendling Cc: Brendan Higgins Cc: David Gow Cc: Jonathan Corbet Cc: Justin Stitt Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Nathan Chancellor Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Hugh Dickins Signed-off-by: Andrew Morton --- .clang-format | 4 ++-- include/linux/damon.h | 4 ++-- mm/damon/core.c | 14 +++++++------- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/.clang-format b/.clang-format index f371a13b4d19..748efbe791ad 100644 --- a/.clang-format +++ b/.clang-format @@ -140,8 +140,8 @@ ForEachMacros: - 'damon_for_each_scheme_safe' - 'damon_for_each_target' - 'damon_for_each_target_safe' - - 'damos_for_each_filter' - - 'damos_for_each_filter_safe' + - 'damos_for_each_core_filter' + - 'damos_for_each_core_filter_safe' - 'damos_for_each_ops_filter' - 'damos_for_each_ops_filter_safe' - 'damos_for_each_quota_goal' diff --git a/include/linux/damon.h b/include/linux/damon.h index f3566b978cdf..6e3db165fe60 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -871,10 +871,10 @@ static inline unsigned long damon_sz_region(struct damon_region *r) #define damos_for_each_quota_goal_safe(goal, next, quota) \ list_for_each_entry_safe(goal, next, &(quota)->goals, list) -#define damos_for_each_filter(f, scheme) \ +#define damos_for_each_core_filter(f, scheme) \ list_for_each_entry(f, &(scheme)->filters, list) -#define damos_for_each_filter_safe(f, next, scheme) \ +#define damos_for_each_core_filter_safe(f, next, scheme) \ list_for_each_entry_safe(f, next, &(scheme)->filters, list) #define damos_for_each_ops_filter(f, scheme) \ diff --git a/mm/damon/core.c b/mm/damon/core.c index a14cc73c2cab..d4cb11ced13f 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -450,7 +450,7 @@ void damon_destroy_scheme(struct damos *s) damos_for_each_quota_goal_safe(g, g_next, &s->quota) damos_destroy_quota_goal(g); - damos_for_each_filter_safe(f, next, s) + damos_for_each_core_filter_safe(f, next, s) damos_destroy_filter(f); damos_for_each_ops_filter_safe(f, next, s) @@ -864,12 +864,12 @@ static int damos_commit_quota(struct damos_quota *dst, struct damos_quota *src) return 0; } -static struct damos_filter *damos_nth_filter(int n, struct damos *s) +static struct damos_filter *damos_nth_core_filter(int n, struct damos *s) { struct damos_filter *filter; int i = 0; - damos_for_each_filter(filter, s) { + damos_for_each_core_filter(filter, s) { if (i++ == n) return filter; } @@ -923,15 +923,15 @@ static int damos_commit_core_filters(struct damos *dst, struct damos *src) struct damos_filter *dst_filter, *next, *src_filter, *new_filter; int i = 0, j = 0; - damos_for_each_filter_safe(dst_filter, next, dst) { - src_filter = damos_nth_filter(i++, src); + damos_for_each_core_filter_safe(dst_filter, next, dst) { + src_filter = damos_nth_core_filter(i++, src); if (src_filter) damos_commit_filter(dst_filter, src_filter); else damos_destroy_filter(dst_filter); } - damos_for_each_filter_safe(src_filter, next, src) { + damos_for_each_core_filter_safe(src_filter, next, src) { if (j++ < i) continue; @@ -1767,7 +1767,7 @@ static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t, struct damos_filter *filter; s->core_filters_allowed = false; - damos_for_each_filter(filter, s) { + damos_for_each_core_filter(filter, s) { if (damos_filter_match(ctx, t, r, filter, ctx->min_sz_region)) { if (filter->allow) s->core_filters_allowed = true; -- cgit v1.2.3 From 53298afe456e62ad2c2dc8bc7aa54bb86a67ba2f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 12 Nov 2025 07:41:05 -0800 Subject: mm/damon: rename damos->filters to damos->core_filters DAMOS filters that are handled by the ops layer are linked to damos->ops_filters. Owing to the ops_ prefix on the name, it is easy to understand it is for ops layer handled filters. The other types of filters, which are handled by the core layer, are linked to damos->filters. Because of the name, it is easy to confuse the list is there for not only core layer handled ones but all filters. Avoid such confusions by renaming the field to core_filters. Link: https://lkml.kernel.org/r/20251112154114.66053-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Bill Wendling Cc: Brendan Higgins Cc: David Gow Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jonathan Corbet Cc: Justin Stitt Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Nathan Chancellor Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 10 +++++----- mm/damon/core.c | 6 +++--- mm/damon/tests/core-kunit.h | 4 ++-- tools/testing/selftests/damon/drgn_dump_damon_status.py | 8 ++++---- tools/testing/selftests/damon/sysfs.py | 2 +- 5 files changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 6e3db165fe60..3813373a9200 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -492,7 +492,7 @@ struct damos_migrate_dests { * @wmarks: Watermarks for automated (in)activation of this scheme. * @migrate_dests: Destination nodes if @action is "migrate_{hot,cold}". * @target_nid: Destination node if @action is "migrate_{hot,cold}". - * @filters: Additional set of &struct damos_filter for &action. + * @core_filters: Additional set of &struct damos_filter for &action. * @ops_filters: ops layer handling &struct damos_filter objects list. * @last_applied: Last @action applied ops-managing entity. * @stat: Statistics of this scheme. @@ -518,7 +518,7 @@ struct damos_migrate_dests { * * Before applying the &action to a memory region, &struct damon_operations * implementation could check pages of the region and skip &action to respect - * &filters + * &core_filters * * The minimum entity that @action can be applied depends on the underlying * &struct damon_operations. Since it may not be aligned with the core layer @@ -562,7 +562,7 @@ struct damos { struct damos_migrate_dests migrate_dests; }; }; - struct list_head filters; + struct list_head core_filters; struct list_head ops_filters; void *last_applied; struct damos_stat stat; @@ -872,10 +872,10 @@ static inline unsigned long damon_sz_region(struct damon_region *r) list_for_each_entry_safe(goal, next, &(quota)->goals, list) #define damos_for_each_core_filter(f, scheme) \ - list_for_each_entry(f, &(scheme)->filters, list) + list_for_each_entry(f, &(scheme)->core_filters, list) #define damos_for_each_core_filter_safe(f, next, scheme) \ - list_for_each_entry_safe(f, next, &(scheme)->filters, list) + list_for_each_entry_safe(f, next, &(scheme)->core_filters, list) #define damos_for_each_ops_filter(f, scheme) \ list_for_each_entry(f, &(scheme)->ops_filters, list) diff --git a/mm/damon/core.c b/mm/damon/core.c index d4cb11ced13f..aedb315b075a 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -306,7 +306,7 @@ void damos_add_filter(struct damos *s, struct damos_filter *f) if (damos_filter_for_ops(f->type)) list_add_tail(&f->list, &s->ops_filters); else - list_add_tail(&f->list, &s->filters); + list_add_tail(&f->list, &s->core_filters); } static void damos_del_filter(struct damos_filter *f) @@ -397,7 +397,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, */ scheme->next_apply_sis = 0; scheme->walk_completed = false; - INIT_LIST_HEAD(&scheme->filters); + INIT_LIST_HEAD(&scheme->core_filters); INIT_LIST_HEAD(&scheme->ops_filters); scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); @@ -995,7 +995,7 @@ static void damos_set_filters_default_reject(struct damos *s) s->core_filters_default_reject = false; else s->core_filters_default_reject = - damos_filters_default_reject(&s->filters); + damos_filters_default_reject(&s->core_filters); s->ops_filters_default_reject = damos_filters_default_reject(&s->ops_filters); } diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 0d2d8cda8631..4380d0312d24 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -876,7 +876,7 @@ static void damos_test_commit_filter(struct kunit *test) static void damos_test_help_initailize_scheme(struct damos *scheme) { INIT_LIST_HEAD(&scheme->quota.goals); - INIT_LIST_HEAD(&scheme->filters); + INIT_LIST_HEAD(&scheme->core_filters); INIT_LIST_HEAD(&scheme->ops_filters); } @@ -1140,7 +1140,7 @@ static void damon_test_set_filters_default_reject(struct kunit *test) struct damos scheme; struct damos_filter *target_filter, *anon_filter; - INIT_LIST_HEAD(&scheme.filters); + INIT_LIST_HEAD(&scheme.core_filters); INIT_LIST_HEAD(&scheme.ops_filters); damos_set_filters_default_reject(&scheme); diff --git a/tools/testing/selftests/damon/drgn_dump_damon_status.py b/tools/testing/selftests/damon/drgn_dump_damon_status.py index cb4fdbe68acb..5374d18d1fa8 100755 --- a/tools/testing/selftests/damon/drgn_dump_damon_status.py +++ b/tools/testing/selftests/damon/drgn_dump_damon_status.py @@ -175,11 +175,11 @@ def scheme_to_dict(scheme): ['target_nid', int], ['migrate_dests', damos_migrate_dests_to_dict], ]) - filters = [] + core_filters = [] for f in list_for_each_entry( - 'struct damos_filter', scheme.filters.address_of_(), 'list'): - filters.append(damos_filter_to_dict(f)) - dict_['filters'] = filters + 'struct damos_filter', scheme.core_filters.address_of_(), 'list'): + core_filters.append(damos_filter_to_dict(f)) + dict_['core_filters'] = core_filters ops_filters = [] for f in list_for_each_entry( 'struct damos_filter', scheme.ops_filters.address_of_(), 'list'): diff --git a/tools/testing/selftests/damon/sysfs.py b/tools/testing/selftests/damon/sysfs.py index b34aea0a6775..b4c5ef5c4d69 100755 --- a/tools/testing/selftests/damon/sysfs.py +++ b/tools/testing/selftests/damon/sysfs.py @@ -132,7 +132,7 @@ def assert_scheme_committed(scheme, dump): assert_watermarks_committed(scheme.watermarks, dump['wmarks']) # TODO: test filters directory for idx, f in enumerate(scheme.core_filters.filters): - assert_filter_committed(f, dump['filters'][idx]) + assert_filter_committed(f, dump['core_filters'][idx]) for idx, f in enumerate(scheme.ops_filters.filters): assert_filter_committed(f, dump['ops_filters'][idx]) -- cgit v1.2.3 From 6707915e030a3258868355f989b80140c1a45bbe Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 17 Nov 2025 17:33:38 +0000 Subject: mm: propagate VM_SOFTDIRTY on merge Patch series "make VM_SOFTDIRTY a sticky VMA flag", v2. Currently we set VM_SOFTDIRTY when a new mapping is set up (whether by establishing a new VMA, or via merge) as implemented in __mmap_complete() and do_brk_flags(). However, when performing a merge of existing mappings such as when performing mprotect(), we may lose the VM_SOFTDIRTY flag. Now we have the concept of making VMA flags 'sticky', that is that they both don't prevent merge and, importantly, are propagated to merged VMAs, this seems a sensible alternative to the existing special-casing of VM_SOFTDIRTY. We additionally add a self-test that demonstrates that this logic behaves as expected. This patch (of 2): Currently we set VM_SOFTDIRTY when a new mapping is set up (whether by establishing a new VMA, or via merge) as implemented in __mmap_complete() and do_brk_flags(). However, when performing a merge of existing mappings such as when performing mprotect(), we may lose the VM_SOFTDIRTY flag. This is because currently we simply ignore VM_SOFTDIRTY for the purposes of merge, so one VMA may possess the flag and another not, and whichever happens to be the target VMA will be the one upon which the merge is performed which may or may not have VM_SOFTDIRTY set. Now we have the concept of 'sticky' VMA flags, let's make VM_SOFTDIRTY one which solves this issue. Additionally update VMA userland tests to propagate changes. [akpm@linux-foundation.org: update comments, per Lorenzo] Link: https://lkml.kernel.org/r/0019e0b8-ee1e-4359-b5ee-94225cbe5588@lucifer.local Link: https://lkml.kernel.org/r/cover.1763399675.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/955478b5170715c895d1ef3b7f68e0cd77f76868.1763399675.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Suggested-by: Vlastimil Babka Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Pedro Falcato Acked-by: Andrey Vagin Reviewed-by: Vlastimil Babka Acked-by: Cyrill Gorcunov Cc: Jann Horn Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mm.h | 15 +++++++-------- tools/testing/vma/vma_internal.h | 18 ++++++------------ 2 files changed, 13 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index af2904aeb163..bf660d5b6e97 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -532,28 +532,27 @@ extern unsigned int kobjsize(const void *objp); * possesses it but the other does not, the merged VMA should nonetheless have * applied to it: * + * VM_SOFTDIRTY - if a VMA is marked soft-dirty, that is has not had its + * references cleared via /proc/$pid/clear_refs, any merged VMA + * should be considered soft-dirty also as it operates at a VMA + * granularity. + * * VM_MAYBE_GUARD - If a VMA may have guard regions in place it implies that * mapped page tables may contain metadata not described by the * VMA and thus any merged VMA may also contain this metadata, * and thus we must make this flag sticky. */ -#define VM_STICKY VM_MAYBE_GUARD +#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD) /* * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one * of these flags and the other not does not preclude a merge. * - * VM_SOFTDIRTY - Should not prevent from VMA merging, if we match the flags but - * dirty bit -- the caller should mark merged VMA as dirty. If - * dirty bit won't be excluded from comparison, we increase - * pressure on the memory system forcing the kernel to generate - * new VMAs when old one could be extended instead. - * * VM_STICKY - When merging VMAs, VMA flags must match, unless they are * 'sticky'. If any sticky flags exist in either VMA, we simply * set all of them on the merged VMA. */ -#define VM_IGNORE_MERGE (VM_SOFTDIRTY | VM_STICKY) +#define VM_IGNORE_MERGE VM_STICKY /* * Flags which should result in page tables being copied on fork. These are diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 73a899ba2686..81b501f51948 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -122,28 +122,22 @@ extern unsigned long dac_mmap_min_addr; * possesses it but the other does not, the merged VMA should nonetheless have * applied to it: * - * VM_MAYBE_GUARD - If a VMA may have guard regions in place it implies that - * mapped page tables may contain metadata not described by the - * VMA and thus any merged VMA may also contain this metadata, - * and thus we must make this flag sticky. + * VM_SOFTDIRTY - if a VMA is marked soft-dirty, that is has not had its + * references cleared via /proc/$pid/clear_refs, any merged VMA + * should be considered soft-dirty also as it operates at a VMA + * granularity. */ -#define VM_STICKY VM_MAYBE_GUARD +#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD) /* * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one * of these flags and the other not does not preclude a merge. * - * VM_SOFTDIRTY - Should not prevent from VMA merging, if we match the flags but - * dirty bit -- the caller should mark merged VMA as dirty. If - * dirty bit won't be excluded from comparison, we increase - * pressure on the memory system forcing the kernel to generate - * new VMAs when old one could be extended instead. - * * VM_STICKY - When merging VMAs, VMA flags must match, unless they are * 'sticky'. If any sticky flags exist in either VMA, we simply * set all of them on the merged VMA. */ -#define VM_IGNORE_MERGE (VM_SOFTDIRTY | VM_STICKY) +#define VM_IGNORE_MERGE VM_STICKY /* * Flags which should result in page tables being copied on fork. These are -- cgit v1.2.3 From d245f9b4ab806733a77e51a218ca7b8bc3135cd9 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:56:52 +1000 Subject: mm/zone_device: support large zone device private folios MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: support device-private THP", v7. This patch series introduces support for Transparent Huge Page (THP) migration in zone device-private memory. The implementation enables efficient migration of large folios between system memory and device-private memory Background Current zone device-private memory implementation only supports PAGE_SIZE granularity, leading to: - Increased TLB pressure - Inefficient migration between CPU and device memory This series extends the existing zone device-private infrastructure to support THP, leading to: - Reduced page table overhead - Improved memory bandwidth utilization - Seamless fallback to base pages when needed In my local testing (using lib/test_hmm) and a throughput test, the series shows a 350% improvement in data transfer throughput and a 80% improvement in latency These patches build on the earlier posts by Ralph Campbell [1] Two new flags are added in vma_migration to select and mark compound pages. migrate_vma_setup(), migrate_vma_pages() and migrate_vma_finalize() support migration of these pages when MIGRATE_VMA_SELECT_COMPOUND is passed in as arguments. The series also adds zone device awareness to (m)THP pages along with fault handling of large zone device private pages. page vma walk and the rmap code is also zone device aware. Support has also been added for folios that might need to be split in the middle of migration (when the src and dst do not agree on MIGRATE_PFN_COMPOUND), that occurs when src side of the migration can migrate large pages, but the destination has not been able to allocate large pages. The code supported and used folio_split() when migrating THP pages, this is used when MIGRATE_VMA_SELECT_COMPOUND is not passed as an argument to migrate_vma_setup(). The test infrastructure lib/test_hmm.c has been enhanced to support THP migration. A new ioctl to emulate failure of large page allocations has been added to test the folio split code path. hmm-tests.c has new test cases for huge page migration and to test the folio split path. A new throughput test has been added as well. The nouveau dmem code has been enhanced to use the new THP migration capability. mTHP support: The patches hard code, HPAGE_PMD_NR in a few places, but the code has been kept generic to support various order sizes. With additional refactoring of the code support of different order sizes should be possible. The future plan is to post enhancements to support mTHP with a rough design as follows: 1. Add the notion of allowable thp orders to the HMM based test driver 2. For non PMD based THP paths in migrate_device.c, check to see if a suitable order is found and supported by the driver 3. Iterate across orders to check the highest supported order for migration 4. Migrate and finalize The mTHP patches can be built on top of this series, the key design elements that need to be worked out are infrastructure and driver support for multiple ordered pages and their migration. HMM support for large folios was added in 10b9feee2d0d ("mm/hmm: populate PFNs from PMD swap entry"). This patch (of 16) Add routines to support allocation of large order zone device folios and helper functions for zone device folios, to check if a folio is device private and helpers for setting zone device data. When large folios are used, the existing page_free() callback in pgmap is called when the folio is freed, this is true for both PAGE_SIZE and higher order pages. Zone device private large folios do not support deferred split and scan like normal THP folios. Link: https://lkml.kernel.org/r/20251001065707.920170-1-balbirs@nvidia.com Link: https://lkml.kernel.org/r/20251001065707.920170-2-balbirs@nvidia.com Link: https://lore.kernel.org/linux-mm/20201106005147.20113-1-rcampbell@nvidia.com/ [1] Signed-off-by: Balbir Singh Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Cc: Madhavan Srinivasan Cc: Christophe Leroy Cc: Felix Kuehling Cc: Alex Deucher Cc: "Christian König" Signed-off-by: Andrew Morton --- arch/powerpc/kvm/book3s_hv_uvmem.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 2 +- drivers/gpu/drm/drm_pagemap.c | 2 +- drivers/gpu/drm/nouveau/nouveau_dmem.c | 2 +- include/linux/memremap.h | 10 +++++++++- lib/test_hmm.c | 2 +- mm/memremap.c | 26 +++++++++++++++----------- mm/rmap.c | 6 +++++- 8 files changed, 34 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 03f8c34fa0a2..91f763410673 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -723,7 +723,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm) dpage = pfn_to_page(uvmem_pfn); dpage->zone_device_data = pvt; - zone_device_page_init(dpage); + zone_device_page_init(dpage, 0); return dpage; out_clear: spin_lock(&kvmppc_uvmem_bitmap_lock); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index 59a5a3fea65d..f6198e66dc5a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -218,7 +218,7 @@ svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn) page = pfn_to_page(pfn); svm_range_bo_ref(prange->svm_bo); page->zone_device_data = prange->svm_bo; - zone_device_page_init(page); + zone_device_page_init(page, 0); } static void diff --git a/drivers/gpu/drm/drm_pagemap.c b/drivers/gpu/drm/drm_pagemap.c index 22c44807e3fe..46a8edb279dc 100644 --- a/drivers/gpu/drm/drm_pagemap.c +++ b/drivers/gpu/drm/drm_pagemap.c @@ -196,7 +196,7 @@ static void drm_pagemap_get_devmem_page(struct page *page, struct drm_pagemap_zdd *zdd) { page->zone_device_data = drm_pagemap_zdd_get(zdd); - zone_device_page_init(page); + zone_device_page_init(page, 0); } /** diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index ca4932a150e3..53cc1926b9da 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -318,7 +318,7 @@ nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm) return NULL; } - zone_device_page_init(page); + zone_device_page_init(page, 0); return page; } diff --git a/include/linux/memremap.h b/include/linux/memremap.h index e5951ba12a28..d2487a19cba2 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -206,7 +206,7 @@ static inline bool is_fsdax_page(const struct page *page) } #ifdef CONFIG_ZONE_DEVICE -void zone_device_page_init(struct page *page); +void zone_device_page_init(struct page *page, unsigned int order); void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); @@ -215,6 +215,14 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn); bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn); unsigned long memremap_compat_align(void); + +static inline void zone_device_folio_init(struct folio *folio, unsigned int order) +{ + zone_device_page_init(&folio->page, order); + if (order) + folio_set_large_rmappable(folio); +} + #else static inline void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 83e3d8208a54..24d82121cde8 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -627,7 +627,7 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) goto error; } - zone_device_page_init(dpage); + zone_device_page_init(dpage, 0); dpage->zone_device_data = rpage; return dpage; diff --git a/mm/memremap.c b/mm/memremap.c index 46cb1b0b6f72..e45dfb568710 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -416,20 +416,19 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap); void free_zone_device_folio(struct folio *folio) { struct dev_pagemap *pgmap = folio->pgmap; + unsigned long nr = folio_nr_pages(folio); + int i; if (WARN_ON_ONCE(!pgmap)) return; mem_cgroup_uncharge(folio); - /* - * Note: we don't expect anonymous compound pages yet. Once supported - * and we could PTE-map them similar to THP, we'd have to clear - * PG_anon_exclusive on all tail pages. - */ if (folio_test_anon(folio)) { - VM_BUG_ON_FOLIO(folio_test_large(folio), folio); - __ClearPageAnonExclusive(folio_page(folio, 0)); + for (i = 0; i < nr; i++) + __ClearPageAnonExclusive(folio_page(folio, i)); + } else { + VM_WARN_ON_ONCE(folio_test_large(folio)); } /* @@ -456,8 +455,8 @@ void free_zone_device_folio(struct folio *folio) case MEMORY_DEVICE_COHERENT: if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free)) break; - pgmap->ops->page_free(folio_page(folio, 0)); - put_dev_pagemap(pgmap); + pgmap->ops->page_free(&folio->page); + percpu_ref_put_many(&folio->pgmap->ref, nr); break; case MEMORY_DEVICE_GENERIC: @@ -480,14 +479,19 @@ void free_zone_device_folio(struct folio *folio) } } -void zone_device_page_init(struct page *page) +void zone_device_page_init(struct page *page, unsigned int order) { + VM_WARN_ON_ONCE(order > MAX_ORDER_NR_PAGES); + /* * Drivers shouldn't be allocating pages after calling * memunmap_pages(). */ - WARN_ON_ONCE(!percpu_ref_tryget_live(&page_pgmap(page)->ref)); + WARN_ON_ONCE(!percpu_ref_tryget_many(&page_pgmap(page)->ref, 1 << order)); set_page_count(page, 1); lock_page(page); + + if (order) + prep_compound_page(page, order); } EXPORT_SYMBOL_GPL(zone_device_page_init); diff --git a/mm/rmap.c b/mm/rmap.c index 3c3cf3efa5f6..eaed5dfbb9b7 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1733,9 +1733,13 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, * the folio is unmapped and at least one page is still mapped. * * Check partially_mapped first to ensure it is a large folio. + * + * Device private folios do not support deferred splitting and + * shrinker based scanning of the folios to free. */ if (partially_mapped && folio_test_anon(folio) && - !folio_test_partially_mapped(folio)) + !folio_test_partially_mapped(folio) && + !folio_is_device_private(folio)) deferred_split_folio(folio, true); __folio_mod_stat(folio, -nr, -nr_pmdmapped); -- cgit v1.2.3 From 3a5a06554566fcc9f7de7327cfc365ed384d396c Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:56:53 +1000 Subject: mm/zone_device: rename page_free callback to folio_free MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change page_free to folio_free to make the folio support for zone device-private more consistent. The PCI P2PDMA callback has also been updated and changed to folio_free() as a result. For drivers that do not support folios (yet), the folio is converted back into page via &folio->page and the page is used as is, in the current callback implementation. Link: https://lkml.kernel.org/r/20251001065707.920170-3-balbirs@nvidia.com Signed-off-by: Balbir Singh Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Cc: Madhavan Srinivasan Cc: Christophe Leroy Cc: Felix Kuehling Cc: Alex Deucher Cc: "Christian König" Signed-off-by: Andrew Morton --- Documentation/mm/memory-model.rst | 2 +- arch/powerpc/kvm/book3s_hv_uvmem.c | 5 +++-- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 5 +++-- drivers/gpu/drm/drm_pagemap.c | 10 +++++----- drivers/gpu/drm/nouveau/nouveau_dmem.c | 5 +++-- drivers/pci/p2pdma.c | 5 +++-- include/linux/memremap.h | 6 +++--- lib/test_hmm.c | 5 +++-- mm/memremap.c | 16 ++++++++-------- 9 files changed, 32 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/Documentation/mm/memory-model.rst b/Documentation/mm/memory-model.rst index 5f3eafbbc520..7957122039e8 100644 --- a/Documentation/mm/memory-model.rst +++ b/Documentation/mm/memory-model.rst @@ -165,7 +165,7 @@ The users of `ZONE_DEVICE` are: * pmem: Map platform persistent memory to be used as a direct-I/O target via DAX mappings. -* hmm: Extend `ZONE_DEVICE` with `->page_fault()` and `->page_free()` +* hmm: Extend `ZONE_DEVICE` with `->page_fault()` and `->folio_free()` event callbacks to allow a device-driver to coordinate memory management events related to device-memory, typically GPU memory. See Documentation/mm/hmm.rst. diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 91f763410673..e5000bef90f2 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -1014,8 +1014,9 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf) * to a normal PFN during H_SVM_PAGE_OUT. * Gets called with kvm->arch.uvmem_lock held. */ -static void kvmppc_uvmem_page_free(struct page *page) +static void kvmppc_uvmem_folio_free(struct folio *folio) { + struct page *page = &folio->page; unsigned long pfn = page_to_pfn(page) - (kvmppc_uvmem_pgmap.range.start >> PAGE_SHIFT); struct kvmppc_uvmem_page_pvt *pvt; @@ -1034,7 +1035,7 @@ static void kvmppc_uvmem_page_free(struct page *page) } static const struct dev_pagemap_ops kvmppc_uvmem_ops = { - .page_free = kvmppc_uvmem_page_free, + .folio_free = kvmppc_uvmem_folio_free, .migrate_to_ram = kvmppc_uvmem_migrate_to_ram, }; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index f6198e66dc5a..6f1617436f4b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -568,8 +568,9 @@ out: return r < 0 ? r : 0; } -static void svm_migrate_page_free(struct page *page) +static void svm_migrate_folio_free(struct folio *folio) { + struct page *page = &folio->page; struct svm_range_bo *svm_bo = page->zone_device_data; if (svm_bo) { @@ -1009,7 +1010,7 @@ out_mmput: } static const struct dev_pagemap_ops svm_migrate_pgmap_ops = { - .page_free = svm_migrate_page_free, + .folio_free = svm_migrate_folio_free, .migrate_to_ram = svm_migrate_to_ram, }; diff --git a/drivers/gpu/drm/drm_pagemap.c b/drivers/gpu/drm/drm_pagemap.c index 46a8edb279dc..37d7cfbbb3e8 100644 --- a/drivers/gpu/drm/drm_pagemap.c +++ b/drivers/gpu/drm/drm_pagemap.c @@ -752,15 +752,15 @@ err_out: } /** - * drm_pagemap_page_free() - Put GPU SVM zone device data associated with a page - * @page: Pointer to the page + * drm_pagemap_folio_free() - Put GPU SVM zone device data associated with a folio + * @folio: Pointer to the folio * * This function is a callback used to put the GPU SVM zone device data * associated with a page when it is being released. */ -static void drm_pagemap_page_free(struct page *page) +static void drm_pagemap_folio_free(struct folio *folio) { - drm_pagemap_zdd_put(page->zone_device_data); + drm_pagemap_zdd_put(folio->page.zone_device_data); } /** @@ -788,7 +788,7 @@ static vm_fault_t drm_pagemap_migrate_to_ram(struct vm_fault *vmf) } static const struct dev_pagemap_ops drm_pagemap_pagemap_ops = { - .page_free = drm_pagemap_page_free, + .folio_free = drm_pagemap_folio_free, .migrate_to_ram = drm_pagemap_migrate_to_ram, }; diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 53cc1926b9da..d34288ebe7d2 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -108,8 +108,9 @@ unsigned long nouveau_dmem_page_addr(struct page *page) return chunk->bo->offset + off; } -static void nouveau_dmem_page_free(struct page *page) +static void nouveau_dmem_folio_free(struct folio *folio) { + struct page *page = &folio->page; struct nouveau_dmem_chunk *chunk = nouveau_page_to_chunk(page); struct nouveau_dmem *dmem = chunk->drm->dmem; @@ -220,7 +221,7 @@ done: } static const struct dev_pagemap_ops nouveau_dmem_pagemap_ops = { - .page_free = nouveau_dmem_page_free, + .folio_free = nouveau_dmem_folio_free, .migrate_to_ram = nouveau_dmem_migrate_to_ram, }; diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 78e108e47254..ee74b75d3e1f 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -200,8 +200,9 @@ static const struct attribute_group p2pmem_group = { .name = "p2pmem", }; -static void p2pdma_page_free(struct page *page) +static void p2pdma_folio_free(struct folio *folio) { + struct page *page = &folio->page; struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page_pgmap(page)); /* safe to dereference while a reference is held to the percpu ref */ struct pci_p2pdma *p2pdma = @@ -214,7 +215,7 @@ static void p2pdma_page_free(struct page *page) } static const struct dev_pagemap_ops p2pdma_pgmap_ops = { - .page_free = p2pdma_page_free, + .folio_free = p2pdma_folio_free, }; static void pci_p2pdma_release(void *data) diff --git a/include/linux/memremap.h b/include/linux/memremap.h index d2487a19cba2..cd28d1666801 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -77,11 +77,11 @@ enum memory_type { struct dev_pagemap_ops { /* - * Called once the page refcount reaches 0. The reference count will be + * Called once the folio refcount reaches 0. The reference count will be * reset to one by the core code after the method is called to prepare - * for handing out the page again. + * for handing out the folio again. */ - void (*page_free)(struct page *page); + void (*folio_free)(struct folio *folio); /* * Used for private (un-addressable) device memory only. Must migrate diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 24d82121cde8..9dbf265d1036 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -1374,8 +1374,9 @@ static const struct file_operations dmirror_fops = { .owner = THIS_MODULE, }; -static void dmirror_devmem_free(struct page *page) +static void dmirror_devmem_free(struct folio *folio) { + struct page *page = &folio->page; struct page *rpage = BACKING_PAGE(page); struct dmirror_device *mdevice; @@ -1438,7 +1439,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) } static const struct dev_pagemap_ops dmirror_devmem_ops = { - .page_free = dmirror_devmem_free, + .folio_free = dmirror_devmem_free, .migrate_to_ram = dmirror_devmem_fault, }; diff --git a/mm/memremap.c b/mm/memremap.c index e45dfb568710..4c2e0d68eb27 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -289,8 +289,8 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) WARN(1, "Missing migrate_to_ram method\n"); return ERR_PTR(-EINVAL); } - if (!pgmap->ops->page_free) { - WARN(1, "Missing page_free method\n"); + if (!pgmap->ops->folio_free) { + WARN(1, "Missing folio_free method\n"); return ERR_PTR(-EINVAL); } if (!pgmap->owner) { @@ -299,8 +299,8 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) } break; case MEMORY_DEVICE_COHERENT: - if (!pgmap->ops->page_free) { - WARN(1, "Missing page_free method\n"); + if (!pgmap->ops->folio_free) { + WARN(1, "Missing folio_free method\n"); return ERR_PTR(-EINVAL); } if (!pgmap->owner) { @@ -453,9 +453,9 @@ void free_zone_device_folio(struct folio *folio) switch (pgmap->type) { case MEMORY_DEVICE_PRIVATE: case MEMORY_DEVICE_COHERENT: - if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free)) + if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->folio_free)) break; - pgmap->ops->page_free(&folio->page); + pgmap->ops->folio_free(folio); percpu_ref_put_many(&folio->pgmap->ref, nr); break; @@ -472,9 +472,9 @@ void free_zone_device_folio(struct folio *folio) break; case MEMORY_DEVICE_PCI_P2PDMA: - if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free)) + if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->folio_free)) break; - pgmap->ops->page_free(folio_page(folio, 0)); + pgmap->ops->folio_free(folio); break; } } -- cgit v1.2.3 From 368076f52ebeecd33e10a9f80905d7508b6b6149 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:56:54 +1000 Subject: mm/huge_memory: add device-private THP support to PMD operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend core huge page management functions to handle device-private THP entries. This enables proper handling of large device-private folios in fundamental MM operations. The following functions have been updated: - copy_huge_pmd(): Handle device-private entries during fork/clone - zap_huge_pmd(): Properly free device-private THP during munmap - change_huge_pmd(): Support protection changes on device-private THP - __pte_offset_map(): Add device-private entry awareness Link: https://lkml.kernel.org/r/20251001065707.920170-4-balbirs@nvidia.com Signed-off-by: Matthew Brost Signed-off-by: Balbir Singh Acked-by: Zi Yan Cc: David Hildenbrand Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Francois Dugast Signed-off-by: Andrew Morton --- include/linux/swapops.h | 32 ++++++++++++++++++++++++++++ mm/huge_memory.c | 56 +++++++++++++++++++++++++++++++++++++++++-------- mm/pgtable-generic.c | 2 +- 3 files changed, 80 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 64ea151a7ae3..2687928a8146 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -594,10 +594,42 @@ static inline int is_pmd_migration_entry(pmd_t pmd) } #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION) + +/** + * is_pmd_device_private_entry() - Check if PMD contains a device private swap entry + * @pmd: The PMD to check + * + * Returns true if the PMD contains a swap entry that represents a device private + * page mapping. This is used for zone device private pages that have been + * swapped out but still need special handling during various memory management + * operations. + * + * Return: 1 if PMD contains device private entry, 0 otherwise + */ +static inline int is_pmd_device_private_entry(pmd_t pmd) +{ + return is_swap_pmd(pmd) && is_device_private_entry(pmd_to_swp_entry(pmd)); +} + +#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ + +static inline int is_pmd_device_private_entry(pmd_t pmd) +{ + return 0; +} + +#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ + static inline int non_swap_entry(swp_entry_t entry) { return swp_type(entry) >= MAX_SWAPFILES; } +static inline int is_pmd_non_present_folio_entry(pmd_t pmd) +{ + return is_pmd_migration_entry(pmd) || is_pmd_device_private_entry(pmd); +} + #endif /* CONFIG_MMU */ #endif /* _LINUX_SWAPOPS_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3ae16b4a82de..19f0ee7373ae 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1704,17 +1704,45 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (unlikely(is_swap_pmd(pmd))) { swp_entry_t entry = pmd_to_swp_entry(pmd); - VM_BUG_ON(!is_pmd_migration_entry(pmd)); - if (!is_readable_migration_entry(entry)) { - entry = make_readable_migration_entry( - swp_offset(entry)); + VM_WARN_ON(!is_pmd_non_present_folio_entry(pmd)); + + if (is_writable_migration_entry(entry) || + is_readable_exclusive_migration_entry(entry)) { + entry = make_readable_migration_entry(swp_offset(entry)); pmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*src_pmd)) pmd = pmd_swp_mksoft_dirty(pmd); if (pmd_swp_uffd_wp(*src_pmd)) pmd = pmd_swp_mkuffd_wp(pmd); set_pmd_at(src_mm, addr, src_pmd, pmd); + } else if (is_device_private_entry(entry)) { + /* + * For device private entries, since there are no + * read exclusive entries, writable = !readable + */ + if (is_writable_device_private_entry(entry)) { + entry = make_readable_device_private_entry(swp_offset(entry)); + pmd = swp_entry_to_pmd(entry); + + if (pmd_swp_soft_dirty(*src_pmd)) + pmd = pmd_swp_mksoft_dirty(pmd); + if (pmd_swp_uffd_wp(*src_pmd)) + pmd = pmd_swp_mkuffd_wp(pmd); + set_pmd_at(src_mm, addr, src_pmd, pmd); + } + + src_folio = pfn_swap_entry_folio(entry); + VM_WARN_ON(!folio_test_large(src_folio)); + + folio_get(src_folio); + /* + * folio_try_dup_anon_rmap_pmd does not fail for + * device private entries. + */ + folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page, + dst_vma, src_vma); } + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); @@ -2212,15 +2240,16 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, folio_remove_rmap_pmd(folio, page, vma); WARN_ON_ONCE(folio_mapcount(folio) < 0); VM_BUG_ON_PAGE(!PageHead(page), page); - } else if (thp_migration_supported()) { + } else if (is_pmd_non_present_folio_entry(orig_pmd)) { swp_entry_t entry; - VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); entry = pmd_to_swp_entry(orig_pmd); folio = pfn_swap_entry_folio(entry); flush_needed = 0; - } else - WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); + + if (!thp_migration_supported()) + WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); + } if (folio_test_anon(folio)) { zap_deposited_table(tlb->mm, pmd); @@ -2240,6 +2269,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, folio_mark_accessed(folio); } + if (folio_is_device_private(folio)) { + folio_remove_rmap_pmd(folio, &folio->page, vma); + WARN_ON_ONCE(folio_mapcount(folio) < 0); + folio_put(folio); + } + spin_unlock(ptl); if (flush_needed) tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE); @@ -2368,7 +2403,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, struct folio *folio = pfn_swap_entry_folio(entry); pmd_t newpmd; - VM_BUG_ON(!is_pmd_migration_entry(*pmd)); + VM_WARN_ON(!is_pmd_non_present_folio_entry(*pmd)); if (is_writable_migration_entry(entry)) { /* * A protection check is difficult so @@ -2381,6 +2416,9 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, newpmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*pmd)) newpmd = pmd_swp_mksoft_dirty(newpmd); + } else if (is_writable_device_private_entry(entry)) { + entry = make_readable_device_private_entry(swp_offset(entry)); + newpmd = swp_entry_to_pmd(entry); } else { newpmd = *pmd; } diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index e46f0cf2159c..d3aec7a9926a 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -292,7 +292,7 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) if (pmdvalp) *pmdvalp = pmdval; - if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval))) + if (unlikely(pmd_none(pmdval) || !pmd_present(pmdval))) goto nomap; if (unlikely(pmd_trans_huge(pmdval))) goto nomap; -- cgit v1.2.3 From a30b48bf1b244f11bf9b6d20cdccfe0c2264130c Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:56:58 +1000 Subject: mm/migrate_device: implement THP migration of zone device pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MIGRATE_VMA_SELECT_COMPOUND will be used to select THP pages during migrate_vma_setup() and MIGRATE_PFN_COMPOUND will make migrating device pages as compound pages during device pfn migration. migrate_device code paths go through the collect, setup and finalize phases of migration. The entries in src and dst arrays passed to these functions still remain at a PAGE_SIZE granularity. When a compound page is passed, the first entry has the PFN along with MIGRATE_PFN_COMPOUND and other flags set (MIGRATE_PFN_MIGRATE, MIGRATE_PFN_VALID), the remaining entries (HPAGE_PMD_NR - 1) are filled with 0's. This representation allows for the compound page to be split into smaller page sizes. migrate_vma_collect_hole(), migrate_vma_collect_pmd() are now THP page aware. Two new helper functions migrate_vma_collect_huge_pmd() and migrate_vma_insert_huge_pmd_page() have been added. migrate_vma_collect_huge_pmd() can collect THP pages, but if for some reason this fails, there is fallback support to split the folio and migrate it. migrate_vma_insert_huge_pmd_page() closely follows the logic of migrate_vma_insert_page() Support for splitting pages as needed for migration will follow in later patches in this series. Link: https://lkml.kernel.org/r/20251001065707.920170-8-balbirs@nvidia.com Signed-off-by: Balbir Singh Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Signed-off-by: Andrew Morton --- include/linux/migrate.h | 2 + mm/migrate_device.c | 469 +++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 408 insertions(+), 63 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 1f0ac122c3bf..41b4cc05a450 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -125,6 +125,7 @@ static inline int migrate_misplaced_folio(struct folio *folio, int node) #define MIGRATE_PFN_VALID (1UL << 0) #define MIGRATE_PFN_MIGRATE (1UL << 1) #define MIGRATE_PFN_WRITE (1UL << 3) +#define MIGRATE_PFN_COMPOUND (1UL << 4) #define MIGRATE_PFN_SHIFT 6 static inline struct page *migrate_pfn_to_page(unsigned long mpfn) @@ -143,6 +144,7 @@ enum migrate_vma_direction { MIGRATE_VMA_SELECT_SYSTEM = 1 << 0, MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1, MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2, + MIGRATE_VMA_SELECT_COMPOUND = 1 << 3, }; struct migrate_vma { diff --git a/mm/migrate_device.c b/mm/migrate_device.c index e6bcd6dc5129..a0a315f3572a 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include "internal.h" @@ -44,6 +45,23 @@ static int migrate_vma_collect_hole(unsigned long start, if (!vma_is_anonymous(walk->vma)) return migrate_vma_collect_skip(start, end, walk); + if (thp_migration_supported() && + (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) && + (IS_ALIGNED(start, HPAGE_PMD_SIZE) && + IS_ALIGNED(end, HPAGE_PMD_SIZE))) { + migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE | + MIGRATE_PFN_COMPOUND; + migrate->dst[migrate->npages] = 0; + migrate->npages++; + migrate->cpages++; + + /* + * Collect the remaining entries as holes, in case we + * need to split later + */ + return migrate_vma_collect_skip(start + PAGE_SIZE, end, walk); + } + for (addr = start; addr < end; addr += PAGE_SIZE) { migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; migrate->dst[migrate->npages] = 0; @@ -103,57 +121,151 @@ static int migrate_vma_split_folio(struct folio *folio, return 0; } -static int migrate_vma_collect_pmd(pmd_t *pmdp, - unsigned long start, - unsigned long end, - struct mm_walk *walk) +/** migrate_vma_collect_huge_pmd - collect THP pages without splitting the + * folio for device private pages. + * @pmdp: pointer to pmd entry + * @start: start address of the range for migration + * @end: end address of the range for migration + * @walk: mm_walk callback structure + * @fault_folio: folio associated with the fault if any + * + * Collect the huge pmd entry at @pmdp for migration and set the + * MIGRATE_PFN_COMPOUND flag in the migrate src entry to indicate that + * migration will occur at HPAGE_PMD granularity + */ +static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start, + unsigned long end, struct mm_walk *walk, + struct folio *fault_folio) { + struct mm_struct *mm = walk->mm; + struct folio *folio; struct migrate_vma *migrate = walk->private; - struct folio *fault_folio = migrate->fault_page ? - page_folio(migrate->fault_page) : NULL; - struct vm_area_struct *vma = walk->vma; - struct mm_struct *mm = vma->vm_mm; - unsigned long addr = start, unmapped = 0; spinlock_t *ptl; - pte_t *ptep; + swp_entry_t entry; + int ret; + unsigned long write = 0; -again: - if (pmd_none(*pmdp)) + ptl = pmd_lock(mm, pmdp); + if (pmd_none(*pmdp)) { + spin_unlock(ptl); return migrate_vma_collect_hole(start, end, -1, walk); + } if (pmd_trans_huge(*pmdp)) { - struct folio *folio; - - ptl = pmd_lock(mm, pmdp); - if (unlikely(!pmd_trans_huge(*pmdp))) { + if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) { spin_unlock(ptl); - goto again; + return migrate_vma_collect_skip(start, end, walk); } folio = pmd_folio(*pmdp); if (is_huge_zero_folio(folio)) { spin_unlock(ptl); - split_huge_pmd(vma, pmdp, addr); - } else { - int ret; + return migrate_vma_collect_hole(start, end, -1, walk); + } + if (pmd_write(*pmdp)) + write = MIGRATE_PFN_WRITE; + } else if (!pmd_present(*pmdp)) { + entry = pmd_to_swp_entry(*pmdp); + folio = pfn_swap_entry_folio(entry); + + if (!is_device_private_entry(entry) || + !(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_PRIVATE) || + (folio->pgmap->owner != migrate->pgmap_owner)) { + spin_unlock(ptl); + return migrate_vma_collect_skip(start, end, walk); + } - folio_get(folio); + if (is_migration_entry(entry)) { + migration_entry_wait_on_locked(entry, ptl); spin_unlock(ptl); - /* FIXME: we don't expect THP for fault_folio */ - if (WARN_ON_ONCE(fault_folio == folio)) - return migrate_vma_collect_skip(start, end, - walk); - if (unlikely(!folio_trylock(folio))) - return migrate_vma_collect_skip(start, end, - walk); - ret = split_folio(folio); - if (fault_folio != folio) - folio_unlock(folio); - folio_put(folio); - if (ret) - return migrate_vma_collect_skip(start, end, - walk); + return -EAGAIN; + } + + if (is_writable_device_private_entry(entry)) + write = MIGRATE_PFN_WRITE; + } else { + spin_unlock(ptl); + return -EAGAIN; + } + + folio_get(folio); + if (folio != fault_folio && unlikely(!folio_trylock(folio))) { + spin_unlock(ptl); + folio_put(folio); + return migrate_vma_collect_skip(start, end, walk); + } + + if (thp_migration_supported() && + (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) && + (IS_ALIGNED(start, HPAGE_PMD_SIZE) && + IS_ALIGNED(end, HPAGE_PMD_SIZE))) { + + struct page_vma_mapped_walk pvmw = { + .ptl = ptl, + .address = start, + .pmd = pmdp, + .vma = walk->vma, + }; + + unsigned long pfn = page_to_pfn(folio_page(folio, 0)); + + migrate->src[migrate->npages] = migrate_pfn(pfn) | write + | MIGRATE_PFN_MIGRATE + | MIGRATE_PFN_COMPOUND; + migrate->dst[migrate->npages++] = 0; + migrate->cpages++; + ret = set_pmd_migration_entry(&pvmw, folio_page(folio, 0)); + if (ret) { + migrate->npages--; + migrate->cpages--; + migrate->src[migrate->npages] = 0; + migrate->dst[migrate->npages] = 0; + goto fallback; } + migrate_vma_collect_skip(start + PAGE_SIZE, end, walk); + spin_unlock(ptl); + return 0; + } + +fallback: + spin_unlock(ptl); + if (!folio_test_large(folio)) + goto done; + ret = split_folio(folio); + if (fault_folio != folio) + folio_unlock(folio); + folio_put(folio); + if (ret) + return migrate_vma_collect_skip(start, end, walk); + if (pmd_none(pmdp_get_lockless(pmdp))) + return migrate_vma_collect_hole(start, end, -1, walk); + +done: + return -ENOENT; +} + +static int migrate_vma_collect_pmd(pmd_t *pmdp, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + struct vm_area_struct *vma = walk->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long addr = start, unmapped = 0; + spinlock_t *ptl; + struct folio *fault_folio = migrate->fault_page ? + page_folio(migrate->fault_page) : NULL; + pte_t *ptep; + +again: + if (pmd_trans_huge(*pmdp) || !pmd_present(*pmdp)) { + int ret = migrate_vma_collect_huge_pmd(pmdp, start, end, walk, fault_folio); + + if (ret == -EAGAIN) + goto again; + if (ret == 0) + return 0; } ptep = pte_offset_map_lock(mm, pmdp, start, &ptl); @@ -243,8 +355,7 @@ again: mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; } - /* FIXME support THP */ - if (!page || !page->mapping || PageTransCompound(page)) { + if (!page || !page->mapping) { mpfn = 0; goto next; } @@ -415,14 +526,6 @@ static bool migrate_vma_check_page(struct page *page, struct page *fault_page) */ int extra = 1 + (page == fault_page); - /* - * FIXME support THP (transparent huge page), it is bit more complex to - * check them than regular pages, because they can be mapped with a pmd - * or with a pte (split pte mapping). - */ - if (folio_test_large(folio)) - return false; - /* Page from ZONE_DEVICE have one extra reference */ if (folio_is_zone_device(folio)) extra++; @@ -453,17 +556,24 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, lru_add_drain(); - for (i = 0; i < npages; i++) { + for (i = 0; i < npages; ) { struct page *page = migrate_pfn_to_page(src_pfns[i]); struct folio *folio; + unsigned int nr = 1; if (!page) { if (src_pfns[i] & MIGRATE_PFN_MIGRATE) unmapped++; - continue; + goto next; } folio = page_folio(page); + nr = folio_nr_pages(folio); + + if (nr > 1) + src_pfns[i] |= MIGRATE_PFN_COMPOUND; + + /* ZONE_DEVICE folios are not on LRU */ if (!folio_is_zone_device(folio)) { if (!folio_test_lru(folio) && allow_drain) { @@ -475,7 +585,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, if (!folio_isolate_lru(folio)) { src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; restore++; - continue; + goto next; } /* Drop the reference we took in collect */ @@ -494,10 +604,12 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; restore++; - continue; + goto next; } unmapped++; +next: + i += nr; } for (i = 0; i < npages && restore; i++) { @@ -643,6 +755,160 @@ int migrate_vma_setup(struct migrate_vma *args) } EXPORT_SYMBOL(migrate_vma_setup); +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +/** + * migrate_vma_insert_huge_pmd_page: Insert a huge folio into @migrate->vma->vm_mm + * at @addr. folio is already allocated as a part of the migration process with + * large page. + * + * @page needs to be initialized and setup after it's allocated. The code bits + * here follow closely the code in __do_huge_pmd_anonymous_page(). This API does + * not support THP zero pages. + * + * @migrate: migrate_vma arguments + * @addr: address where the folio will be inserted + * @page: page to be inserted at @addr + * @src: src pfn which is being migrated + * @pmdp: pointer to the pmd + */ +static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate, + unsigned long addr, + struct page *page, + unsigned long *src, + pmd_t *pmdp) +{ + struct vm_area_struct *vma = migrate->vma; + gfp_t gfp = vma_thp_gfp_mask(vma); + struct folio *folio = page_folio(page); + int ret; + vm_fault_t csa_ret; + spinlock_t *ptl; + pgtable_t pgtable; + pmd_t entry; + bool flush = false; + unsigned long i; + + VM_WARN_ON_FOLIO(!folio, folio); + VM_WARN_ON_ONCE(!pmd_none(*pmdp) && !is_huge_zero_pmd(*pmdp)); + + if (!thp_vma_suitable_order(vma, addr, HPAGE_PMD_ORDER)) + return -EINVAL; + + ret = anon_vma_prepare(vma); + if (ret) + return ret; + + folio_set_order(folio, HPAGE_PMD_ORDER); + folio_set_large_rmappable(folio); + + if (mem_cgroup_charge(folio, migrate->vma->vm_mm, gfp)) { + count_vm_event(THP_FAULT_FALLBACK); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); + ret = -ENOMEM; + goto abort; + } + + __folio_mark_uptodate(folio); + + pgtable = pte_alloc_one(vma->vm_mm); + if (unlikely(!pgtable)) + goto abort; + + if (folio_is_device_private(folio)) { + swp_entry_t swp_entry; + + if (vma->vm_flags & VM_WRITE) + swp_entry = make_writable_device_private_entry( + page_to_pfn(page)); + else + swp_entry = make_readable_device_private_entry( + page_to_pfn(page)); + entry = swp_entry_to_pmd(swp_entry); + } else { + if (folio_is_zone_device(folio) && + !folio_is_device_coherent(folio)) { + goto abort; + } + entry = folio_mk_pmd(folio, vma->vm_page_prot); + if (vma->vm_flags & VM_WRITE) + entry = pmd_mkwrite(pmd_mkdirty(entry), vma); + } + + ptl = pmd_lock(vma->vm_mm, pmdp); + csa_ret = check_stable_address_space(vma->vm_mm); + if (csa_ret) + goto abort; + + /* + * Check for userfaultfd but do not deliver the fault. Instead, + * just back off. + */ + if (userfaultfd_missing(vma)) + goto unlock_abort; + + if (!pmd_none(*pmdp)) { + if (!is_huge_zero_pmd(*pmdp)) + goto unlock_abort; + flush = true; + } else if (!pmd_none(*pmdp)) + goto unlock_abort; + + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); + folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); + if (!folio_is_zone_device(folio)) + folio_add_lru_vma(folio, vma); + folio_get(folio); + + if (flush) { + pte_free(vma->vm_mm, pgtable); + flush_cache_page(vma, addr, addr + HPAGE_PMD_SIZE); + pmdp_invalidate(vma, addr, pmdp); + } else { + pgtable_trans_huge_deposit(vma->vm_mm, pmdp, pgtable); + mm_inc_nr_ptes(vma->vm_mm); + } + set_pmd_at(vma->vm_mm, addr, pmdp, entry); + update_mmu_cache_pmd(vma, addr, pmdp); + + spin_unlock(ptl); + + count_vm_event(THP_FAULT_ALLOC); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); + + return 0; + +unlock_abort: + spin_unlock(ptl); +abort: + for (i = 0; i < HPAGE_PMD_NR; i++) + src[i] &= ~MIGRATE_PFN_MIGRATE; + return 0; +} +#else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */ +static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate, + unsigned long addr, + struct page *page, + unsigned long *src, + pmd_t *pmdp) +{ + return 0; +} +#endif + +static unsigned long migrate_vma_nr_pages(unsigned long *src) +{ + unsigned long nr = 1; +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + if (*src & MIGRATE_PFN_COMPOUND) + nr = HPAGE_PMD_NR; +#else + if (*src & MIGRATE_PFN_COMPOUND) + VM_WARN_ON_ONCE(true); +#endif + return nr; +} + /* * This code closely matches the code in: * __handle_mm_fault() @@ -653,9 +919,10 @@ EXPORT_SYMBOL(migrate_vma_setup); */ static void migrate_vma_insert_page(struct migrate_vma *migrate, unsigned long addr, - struct page *page, + unsigned long *dst, unsigned long *src) { + struct page *page = migrate_pfn_to_page(*dst); struct folio *folio = page_folio(page); struct vm_area_struct *vma = migrate->vma; struct mm_struct *mm = vma->vm_mm; @@ -683,8 +950,24 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, pmdp = pmd_alloc(mm, pudp, addr); if (!pmdp) goto abort; - if (pmd_trans_huge(*pmdp)) - goto abort; + + if (thp_migration_supported() && (*dst & MIGRATE_PFN_COMPOUND)) { + int ret = migrate_vma_insert_huge_pmd_page(migrate, addr, page, + src, pmdp); + if (ret) + goto abort; + return; + } + + if (!pmd_none(*pmdp)) { + if (pmd_trans_huge(*pmdp)) { + if (!is_huge_zero_pmd(*pmdp)) + goto abort; + split_huge_pmd(vma, pmdp, addr); + } else if (pmd_leaf(*pmdp)) + goto abort; + } + if (pte_alloc(mm, pmdp)) goto abort; if (unlikely(anon_vma_prepare(vma))) @@ -775,23 +1058,24 @@ static void __migrate_device_pages(unsigned long *src_pfns, unsigned long i; bool notified = false; - for (i = 0; i < npages; i++) { + for (i = 0; i < npages; ) { struct page *newpage = migrate_pfn_to_page(dst_pfns[i]); struct page *page = migrate_pfn_to_page(src_pfns[i]); struct address_space *mapping; struct folio *newfolio, *folio; int r, extra_cnt = 0; + unsigned long nr = 1; if (!newpage) { src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; - continue; + goto next; } if (!page) { unsigned long addr; if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE)) - continue; + goto next; /* * The only time there is no vma is when called from @@ -809,15 +1093,47 @@ static void __migrate_device_pages(unsigned long *src_pfns, migrate->pgmap_owner); mmu_notifier_invalidate_range_start(&range); } - migrate_vma_insert_page(migrate, addr, newpage, + + if ((src_pfns[i] & MIGRATE_PFN_COMPOUND) && + (!(dst_pfns[i] & MIGRATE_PFN_COMPOUND))) { + nr = migrate_vma_nr_pages(&src_pfns[i]); + src_pfns[i] &= ~MIGRATE_PFN_COMPOUND; + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; + goto next; + } + + migrate_vma_insert_page(migrate, addr, &dst_pfns[i], &src_pfns[i]); - continue; + goto next; } newfolio = page_folio(newpage); folio = page_folio(page); mapping = folio_mapping(folio); + /* + * If THP migration is enabled, check if both src and dst + * can migrate large pages + */ + if (thp_migration_supported()) { + if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) && + (src_pfns[i] & MIGRATE_PFN_COMPOUND) && + !(dst_pfns[i] & MIGRATE_PFN_COMPOUND)) { + + if (!migrate) { + src_pfns[i] &= ~(MIGRATE_PFN_MIGRATE | + MIGRATE_PFN_COMPOUND); + goto next; + } + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; + } else if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) && + (dst_pfns[i] & MIGRATE_PFN_COMPOUND) && + !(src_pfns[i] & MIGRATE_PFN_COMPOUND)) { + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; + } + } + + if (folio_is_device_private(newfolio) || folio_is_device_coherent(newfolio)) { if (mapping) { @@ -830,7 +1146,7 @@ static void __migrate_device_pages(unsigned long *src_pfns, if (!folio_test_anon(folio) || !folio_free_swap(folio)) { src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; - continue; + goto next; } } } else if (folio_is_zone_device(newfolio)) { @@ -838,7 +1154,7 @@ static void __migrate_device_pages(unsigned long *src_pfns, * Other types of ZONE_DEVICE page are not supported. */ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; - continue; + goto next; } BUG_ON(folio_test_writeback(folio)); @@ -850,6 +1166,8 @@ static void __migrate_device_pages(unsigned long *src_pfns, src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; else folio_migrate_flags(newfolio, folio); +next: + i += nr; } if (notified) @@ -1011,10 +1329,23 @@ static unsigned long migrate_device_pfn_lock(unsigned long pfn) int migrate_device_range(unsigned long *src_pfns, unsigned long start, unsigned long npages) { - unsigned long i, pfn; + unsigned long i, j, pfn; + + for (pfn = start, i = 0; i < npages; pfn++, i++) { + struct page *page = pfn_to_page(pfn); + struct folio *folio = page_folio(page); + unsigned int nr = 1; - for (pfn = start, i = 0; i < npages; pfn++, i++) src_pfns[i] = migrate_device_pfn_lock(pfn); + nr = folio_nr_pages(folio); + if (nr > 1) { + src_pfns[i] |= MIGRATE_PFN_COMPOUND; + for (j = 1; j < nr; j++) + src_pfns[i+j] = 0; + i += j - 1; + pfn += j - 1; + } + } migrate_device_unmap(src_pfns, npages, NULL); @@ -1032,10 +1363,22 @@ EXPORT_SYMBOL(migrate_device_range); */ int migrate_device_pfns(unsigned long *src_pfns, unsigned long npages) { - unsigned long i; + unsigned long i, j; + + for (i = 0; i < npages; i++) { + struct page *page = pfn_to_page(src_pfns[i]); + struct folio *folio = page_folio(page); + unsigned int nr = 1; - for (i = 0; i < npages; i++) src_pfns[i] = migrate_device_pfn_lock(src_pfns[i]); + nr = folio_nr_pages(folio); + if (nr > 1) { + src_pfns[i] |= MIGRATE_PFN_COMPOUND; + for (j = 1; j < nr; j++) + src_pfns[i+j] = 0; + i += j - 1; + } + } migrate_device_unmap(src_pfns, npages, NULL); -- cgit v1.2.3 From 4964099163d0524a769d039ffa886bb4515136d0 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:56:59 +1000 Subject: mm/memory/fault: add THP fault handling for zone device private pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement CPU fault handling for zone device THP entries through do_huge_pmd_device_private(), enabling transparent migration of device-private large pages back to system memory on CPU access. When the CPU accesses a zone device THP entry, the fault handler calls the device driver's migrate_to_ram() callback to migrate the entire large page back to system memory. Link: https://lkml.kernel.org/r/20251001065707.920170-9-balbirs@nvidia.com Signed-off-by: Balbir Singh Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 7 +++++++ mm/huge_memory.c | 38 ++++++++++++++++++++++++++++++++++++++ mm/memory.c | 5 +++-- 3 files changed, 48 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index fee4cf7fa300..82408c90b396 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -481,6 +481,8 @@ static inline bool folio_test_pmd_mappable(struct folio *folio) vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf); +vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf); + extern struct folio *huge_zero_folio; extern unsigned long huge_zero_pfn; @@ -662,6 +664,11 @@ static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) return 0; } +static inline vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf) +{ + return 0; +} + static inline bool is_huge_zero_folio(const struct folio *folio) { return false; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 23db562cde07..ded707a50af8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1288,6 +1288,44 @@ release: } +vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + vm_fault_t ret = 0; + spinlock_t *ptl; + swp_entry_t swp_entry; + struct page *page; + struct folio *folio; + + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + + ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd))) { + spin_unlock(ptl); + return 0; + } + + swp_entry = pmd_to_swp_entry(vmf->orig_pmd); + page = pfn_swap_entry_to_page(swp_entry); + folio = page_folio(page); + vmf->page = page; + vmf->pte = NULL; + if (folio_trylock(folio)) { + folio_get(folio); + spin_unlock(ptl); + ret = page_pgmap(page)->ops->migrate_to_ram(vmf); + folio_unlock(folio); + folio_put(folio); + } else { + spin_unlock(ptl); + } + + return ret; +} + /* * always: directly stall for all thp allocations * defer: wake kswapd and fail if not immediately available diff --git a/mm/memory.c b/mm/memory.c index 27bc457b32c2..732414852570 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6345,8 +6345,9 @@ retry_pud: vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); if (unlikely(is_swap_pmd(vmf.orig_pmd))) { - VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(vmf.orig_pmd)); + if (is_pmd_device_private_entry(vmf.orig_pmd)) + return do_huge_pmd_device_private(&vmf); + if (is_pmd_migration_entry(vmf.orig_pmd)) pmd_migration_entry_wait(mm, vmf.pmd); return 0; -- cgit v1.2.3 From 775465fd26a325359887f9c3129444fcc76c6298 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:57:00 +1000 Subject: lib/test_hmm: add zone device private THP test infrastructure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enhance the hmm test driver (lib/test_hmm) with support for THP pages. A new pool of free_folios() has now been added to the dmirror device, which can be allocated when a request for a THP zone device private page is made. Add compound page awareness to the allocation function during normal migration and fault based migration. These routines also copy folio_nr_pages() when moving data between system memory and device memory. args.src and args.dst used to hold migration entries are now dynamically allocated (as they need to hold HPAGE_PMD_NR entries or more). Split and migrate support will be added in future patches in this series. Link: https://lkml.kernel.org/r/20251001065707.920170-10-balbirs@nvidia.com Signed-off-by: Balbir Singh Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Signed-off-by: Andrew Morton --- include/linux/memremap.h | 12 ++ lib/test_hmm.c | 368 +++++++++++++++++++++++++++++++++++++---------- 2 files changed, 304 insertions(+), 76 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index cd28d1666801..7df4dd037b69 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -177,6 +177,18 @@ static inline bool folio_is_pci_p2pdma(const struct folio *folio) folio->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; } +static inline void *folio_zone_device_data(const struct folio *folio) +{ + VM_WARN_ON_FOLIO(!folio_is_device_private(folio), folio); + return folio->page.zone_device_data; +} + +static inline void folio_set_zone_device_data(struct folio *folio, void *data) +{ + VM_WARN_ON_FOLIO(!folio_is_device_private(folio), folio); + folio->page.zone_device_data = data; +} + static inline bool is_pci_p2pdma_page(const struct page *page) { return IS_ENABLED(CONFIG_PCI_P2PDMA) && diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 9dbf265d1036..32d402e80bcc 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -119,6 +119,7 @@ struct dmirror_device { unsigned long calloc; unsigned long cfree; struct page *free_pages; + struct folio *free_folios; spinlock_t lock; /* protects the above */ }; @@ -492,7 +493,7 @@ fini: } static int dmirror_allocate_chunk(struct dmirror_device *mdevice, - struct page **ppage) + struct page **ppage, bool is_large) { struct dmirror_chunk *devmem; struct resource *res = NULL; @@ -572,20 +573,45 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice, pfn_first, pfn_last); spin_lock(&mdevice->lock); - for (pfn = pfn_first; pfn < pfn_last; pfn++) { + for (pfn = pfn_first; pfn < pfn_last; ) { struct page *page = pfn_to_page(pfn); + if (is_large && IS_ALIGNED(pfn, HPAGE_PMD_NR) + && (pfn + HPAGE_PMD_NR <= pfn_last)) { + page->zone_device_data = mdevice->free_folios; + mdevice->free_folios = page_folio(page); + pfn += HPAGE_PMD_NR; + continue; + } + page->zone_device_data = mdevice->free_pages; mdevice->free_pages = page; + pfn++; } + + ret = 0; if (ppage) { - *ppage = mdevice->free_pages; - mdevice->free_pages = (*ppage)->zone_device_data; - mdevice->calloc++; + if (is_large) { + if (!mdevice->free_folios) { + ret = -ENOMEM; + goto err_unlock; + } + *ppage = folio_page(mdevice->free_folios, 0); + mdevice->free_folios = (*ppage)->zone_device_data; + mdevice->calloc += HPAGE_PMD_NR; + } else if (mdevice->free_pages) { + *ppage = mdevice->free_pages; + mdevice->free_pages = (*ppage)->zone_device_data; + mdevice->calloc++; + } else { + ret = -ENOMEM; + goto err_unlock; + } } +err_unlock: spin_unlock(&mdevice->lock); - return 0; + return ret; err_release: mutex_unlock(&mdevice->devmem_lock); @@ -598,10 +624,13 @@ err_devmem: return ret; } -static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) +static struct page *dmirror_devmem_alloc_page(struct dmirror *dmirror, + bool is_large) { struct page *dpage = NULL; struct page *rpage = NULL; + unsigned int order = is_large ? HPAGE_PMD_ORDER : 0; + struct dmirror_device *mdevice = dmirror->mdevice; /* * For ZONE_DEVICE private type, this is a fake device so we allocate @@ -610,49 +639,55 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) * data and ignore rpage. */ if (dmirror_is_private_zone(mdevice)) { - rpage = alloc_page(GFP_HIGHUSER); + rpage = folio_page(folio_alloc(GFP_HIGHUSER, order), 0); if (!rpage) return NULL; } spin_lock(&mdevice->lock); - if (mdevice->free_pages) { + if (is_large && mdevice->free_folios) { + dpage = folio_page(mdevice->free_folios, 0); + mdevice->free_folios = dpage->zone_device_data; + mdevice->calloc += 1 << order; + spin_unlock(&mdevice->lock); + } else if (!is_large && mdevice->free_pages) { dpage = mdevice->free_pages; mdevice->free_pages = dpage->zone_device_data; mdevice->calloc++; spin_unlock(&mdevice->lock); } else { spin_unlock(&mdevice->lock); - if (dmirror_allocate_chunk(mdevice, &dpage)) + if (dmirror_allocate_chunk(mdevice, &dpage, is_large)) goto error; } - zone_device_page_init(dpage, 0); + zone_device_folio_init(page_folio(dpage), order); dpage->zone_device_data = rpage; return dpage; error: if (rpage) - __free_page(rpage); + __free_pages(rpage, order); return NULL; } static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, struct dmirror *dmirror) { - struct dmirror_device *mdevice = dmirror->mdevice; const unsigned long *src = args->src; unsigned long *dst = args->dst; unsigned long addr; - for (addr = args->start; addr < args->end; addr += PAGE_SIZE, - src++, dst++) { + for (addr = args->start; addr < args->end; ) { struct page *spage; struct page *dpage; struct page *rpage; + bool is_large = *src & MIGRATE_PFN_COMPOUND; + int write = (*src & MIGRATE_PFN_WRITE) ? MIGRATE_PFN_WRITE : 0; + unsigned long nr = 1; if (!(*src & MIGRATE_PFN_MIGRATE)) - continue; + goto next; /* * Note that spage might be NULL which is OK since it is an @@ -662,17 +697,45 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, if (WARN(spage && is_zone_device_page(spage), "page already in device spage pfn: 0x%lx\n", page_to_pfn(spage))) + goto next; + + dpage = dmirror_devmem_alloc_page(dmirror, is_large); + if (!dpage) { + struct folio *folio; + unsigned long i; + unsigned long spfn = *src >> MIGRATE_PFN_SHIFT; + struct page *src_page; + + if (!is_large) + goto next; + + if (!spage && is_large) { + nr = HPAGE_PMD_NR; + } else { + folio = page_folio(spage); + nr = folio_nr_pages(folio); + } + + for (i = 0; i < nr && addr < args->end; i++) { + dpage = dmirror_devmem_alloc_page(dmirror, false); + rpage = BACKING_PAGE(dpage); + rpage->zone_device_data = dmirror; + + *dst = migrate_pfn(page_to_pfn(dpage)) | write; + src_page = pfn_to_page(spfn + i); + + if (spage) + copy_highpage(rpage, src_page); + else + clear_highpage(rpage); + src++; + dst++; + addr += PAGE_SIZE; + } continue; - - dpage = dmirror_devmem_alloc_page(mdevice); - if (!dpage) - continue; + } rpage = BACKING_PAGE(dpage); - if (spage) - copy_highpage(rpage, spage); - else - clear_highpage(rpage); /* * Normally, a device would use the page->zone_device_data to @@ -684,10 +747,42 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n", page_to_pfn(spage), page_to_pfn(dpage)); - *dst = migrate_pfn(page_to_pfn(dpage)); - if ((*src & MIGRATE_PFN_WRITE) || - (!spage && args->vma->vm_flags & VM_WRITE)) - *dst |= MIGRATE_PFN_WRITE; + + *dst = migrate_pfn(page_to_pfn(dpage)) | write; + + if (is_large) { + int i; + struct folio *folio = page_folio(dpage); + *dst |= MIGRATE_PFN_COMPOUND; + + if (folio_test_large(folio)) { + for (i = 0; i < folio_nr_pages(folio); i++) { + struct page *dst_page = + pfn_to_page(page_to_pfn(rpage) + i); + struct page *src_page = + pfn_to_page(page_to_pfn(spage) + i); + + if (spage) + copy_highpage(dst_page, src_page); + else + clear_highpage(dst_page); + src++; + dst++; + addr += PAGE_SIZE; + } + continue; + } + } + + if (spage) + copy_highpage(rpage, spage); + else + clear_highpage(rpage); + +next: + src++; + dst++; + addr += PAGE_SIZE; } } @@ -734,14 +829,17 @@ static int dmirror_migrate_finalize_and_map(struct migrate_vma *args, const unsigned long *src = args->src; const unsigned long *dst = args->dst; unsigned long pfn; + const unsigned long start_pfn = start >> PAGE_SHIFT; + const unsigned long end_pfn = end >> PAGE_SHIFT; /* Map the migrated pages into the device's page tables. */ mutex_lock(&dmirror->mutex); - for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++, - src++, dst++) { + for (pfn = start_pfn; pfn < end_pfn; pfn++, src++, dst++) { struct page *dpage; void *entry; + int nr, i; + struct page *rpage; if (!(*src & MIGRATE_PFN_MIGRATE)) continue; @@ -750,13 +848,25 @@ static int dmirror_migrate_finalize_and_map(struct migrate_vma *args, if (!dpage) continue; - entry = BACKING_PAGE(dpage); - if (*dst & MIGRATE_PFN_WRITE) - entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE); - entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC); - if (xa_is_err(entry)) { - mutex_unlock(&dmirror->mutex); - return xa_err(entry); + if (*dst & MIGRATE_PFN_COMPOUND) + nr = folio_nr_pages(page_folio(dpage)); + else + nr = 1; + + WARN_ON_ONCE(end_pfn < start_pfn + nr); + + rpage = BACKING_PAGE(dpage); + VM_WARN_ON(folio_nr_pages(page_folio(rpage)) != nr); + + for (i = 0; i < nr; i++) { + entry = folio_page(page_folio(rpage), i); + if (*dst & MIGRATE_PFN_WRITE) + entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE); + entry = xa_store(&dmirror->pt, pfn + i, entry, GFP_ATOMIC); + if (xa_is_err(entry)) { + mutex_unlock(&dmirror->mutex); + return xa_err(entry); + } } } @@ -829,31 +939,66 @@ static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args, unsigned long start = args->start; unsigned long end = args->end; unsigned long addr; + unsigned int order = 0; + int i; - for (addr = start; addr < end; addr += PAGE_SIZE, - src++, dst++) { + for (addr = start; addr < end; ) { struct page *dpage, *spage; spage = migrate_pfn_to_page(*src); - if (!spage || !(*src & MIGRATE_PFN_MIGRATE)) - continue; + if (!spage || !(*src & MIGRATE_PFN_MIGRATE)) { + addr += PAGE_SIZE; + goto next; + } if (WARN_ON(!is_device_private_page(spage) && - !is_device_coherent_page(spage))) - continue; + !is_device_coherent_page(spage))) { + addr += PAGE_SIZE; + goto next; + } + spage = BACKING_PAGE(spage); - dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); - if (!dpage) - continue; - pr_debug("migrating from dev to sys pfn src: 0x%lx pfn dst: 0x%lx\n", - page_to_pfn(spage), page_to_pfn(dpage)); + order = folio_order(page_folio(spage)); + if (order) + dpage = folio_page(vma_alloc_folio(GFP_HIGHUSER_MOVABLE, + order, args->vma, addr), 0); + else + dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); + + /* Try with smaller pages if large allocation fails */ + if (!dpage && order) { + dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); + if (!dpage) + return VM_FAULT_OOM; + order = 0; + } + + pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n", + page_to_pfn(spage), page_to_pfn(dpage)); lock_page(dpage); xa_erase(&dmirror->pt, addr >> PAGE_SHIFT); copy_highpage(dpage, spage); *dst = migrate_pfn(page_to_pfn(dpage)); if (*src & MIGRATE_PFN_WRITE) *dst |= MIGRATE_PFN_WRITE; + if (order) + *dst |= MIGRATE_PFN_COMPOUND; + + for (i = 0; i < (1 << order); i++) { + struct page *src_page; + struct page *dst_page; + + src_page = pfn_to_page(page_to_pfn(spage) + i); + dst_page = pfn_to_page(page_to_pfn(dpage) + i); + + xa_erase(&dmirror->pt, addr >> PAGE_SHIFT); + copy_highpage(dst_page, src_page); + } +next: + addr += PAGE_SIZE << order; + src += 1 << order; + dst += 1 << order; } return 0; } @@ -879,11 +1024,14 @@ static int dmirror_migrate_to_system(struct dmirror *dmirror, unsigned long size = cmd->npages << PAGE_SHIFT; struct mm_struct *mm = dmirror->notifier.mm; struct vm_area_struct *vma; - unsigned long src_pfns[32] = { 0 }; - unsigned long dst_pfns[32] = { 0 }; struct migrate_vma args = { 0 }; unsigned long next; int ret; + unsigned long *src_pfns; + unsigned long *dst_pfns; + + src_pfns = kvcalloc(PTRS_PER_PTE, sizeof(*src_pfns), GFP_KERNEL | __GFP_NOFAIL); + dst_pfns = kvcalloc(PTRS_PER_PTE, sizeof(*dst_pfns), GFP_KERNEL | __GFP_NOFAIL); start = cmd->addr; end = start + size; @@ -902,7 +1050,7 @@ static int dmirror_migrate_to_system(struct dmirror *dmirror, ret = -EINVAL; goto out; } - next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT)); + next = min(end, addr + (PTRS_PER_PTE << PAGE_SHIFT)); if (next > vma->vm_end) next = vma->vm_end; @@ -912,7 +1060,7 @@ static int dmirror_migrate_to_system(struct dmirror *dmirror, args.start = addr; args.end = next; args.pgmap_owner = dmirror->mdevice; - args.flags = dmirror_select_device(dmirror); + args.flags = dmirror_select_device(dmirror) | MIGRATE_VMA_SELECT_COMPOUND; ret = migrate_vma_setup(&args); if (ret) @@ -928,6 +1076,8 @@ static int dmirror_migrate_to_system(struct dmirror *dmirror, out: mmap_read_unlock(mm); mmput(mm); + kvfree(src_pfns); + kvfree(dst_pfns); return ret; } @@ -939,12 +1089,12 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror, unsigned long size = cmd->npages << PAGE_SHIFT; struct mm_struct *mm = dmirror->notifier.mm; struct vm_area_struct *vma; - unsigned long src_pfns[32] = { 0 }; - unsigned long dst_pfns[32] = { 0 }; struct dmirror_bounce bounce; struct migrate_vma args = { 0 }; unsigned long next; int ret; + unsigned long *src_pfns = NULL; + unsigned long *dst_pfns = NULL; start = cmd->addr; end = start + size; @@ -955,6 +1105,18 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror, if (!mmget_not_zero(mm)) return -EINVAL; + ret = -ENOMEM; + src_pfns = kvcalloc(PTRS_PER_PTE, sizeof(*src_pfns), + GFP_KERNEL | __GFP_NOFAIL); + if (!src_pfns) + goto free_mem; + + dst_pfns = kvcalloc(PTRS_PER_PTE, sizeof(*dst_pfns), + GFP_KERNEL | __GFP_NOFAIL); + if (!dst_pfns) + goto free_mem; + + ret = 0; mmap_read_lock(mm); for (addr = start; addr < end; addr = next) { vma = vma_lookup(mm, addr); @@ -962,7 +1124,7 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror, ret = -EINVAL; goto out; } - next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT)); + next = min(end, addr + (PTRS_PER_PTE << PAGE_SHIFT)); if (next > vma->vm_end) next = vma->vm_end; @@ -972,7 +1134,8 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror, args.start = addr; args.end = next; args.pgmap_owner = dmirror->mdevice; - args.flags = MIGRATE_VMA_SELECT_SYSTEM; + args.flags = MIGRATE_VMA_SELECT_SYSTEM | + MIGRATE_VMA_SELECT_COMPOUND; ret = migrate_vma_setup(&args); if (ret) goto out; @@ -992,7 +1155,7 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror, */ ret = dmirror_bounce_init(&bounce, start, size); if (ret) - return ret; + goto free_mem; mutex_lock(&dmirror->mutex); ret = dmirror_do_read(dmirror, start, end, &bounce); mutex_unlock(&dmirror->mutex); @@ -1003,11 +1166,14 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror, } cmd->cpages = bounce.cpages; dmirror_bounce_fini(&bounce); - return ret; + goto free_mem; out: mmap_read_unlock(mm); mmput(mm); +free_mem: + kfree(src_pfns); + kfree(dst_pfns); return ret; } @@ -1200,6 +1366,7 @@ static void dmirror_device_evict_chunk(struct dmirror_chunk *chunk) unsigned long i; unsigned long *src_pfns; unsigned long *dst_pfns; + unsigned int order = 0; src_pfns = kvcalloc(npages, sizeof(*src_pfns), GFP_KERNEL | __GFP_NOFAIL); dst_pfns = kvcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL | __GFP_NOFAIL); @@ -1215,13 +1382,25 @@ static void dmirror_device_evict_chunk(struct dmirror_chunk *chunk) if (WARN_ON(!is_device_private_page(spage) && !is_device_coherent_page(spage))) continue; + + order = folio_order(page_folio(spage)); spage = BACKING_PAGE(spage); - dpage = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_NOFAIL); + if (src_pfns[i] & MIGRATE_PFN_COMPOUND) { + dpage = folio_page(folio_alloc(GFP_HIGHUSER_MOVABLE, + order), 0); + } else { + dpage = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_NOFAIL); + order = 0; + } + + /* TODO Support splitting here */ lock_page(dpage); - copy_highpage(dpage, spage); dst_pfns[i] = migrate_pfn(page_to_pfn(dpage)); if (src_pfns[i] & MIGRATE_PFN_WRITE) dst_pfns[i] |= MIGRATE_PFN_WRITE; + if (order) + dst_pfns[i] |= MIGRATE_PFN_COMPOUND; + folio_copy(page_folio(dpage), page_folio(spage)); } migrate_device_pages(src_pfns, dst_pfns, npages); migrate_device_finalize(src_pfns, dst_pfns, npages); @@ -1234,7 +1413,12 @@ static void dmirror_remove_free_pages(struct dmirror_chunk *devmem) { struct dmirror_device *mdevice = devmem->mdevice; struct page *page; + struct folio *folio; + + for (folio = mdevice->free_folios; folio; folio = folio_zone_device_data(folio)) + if (dmirror_page_to_chunk(folio_page(folio, 0)) == devmem) + mdevice->free_folios = folio_zone_device_data(folio); for (page = mdevice->free_pages; page; page = page->zone_device_data) if (dmirror_page_to_chunk(page) == devmem) mdevice->free_pages = page->zone_device_data; @@ -1265,6 +1449,7 @@ static void dmirror_device_remove_chunks(struct dmirror_device *mdevice) mdevice->devmem_count = 0; mdevice->devmem_capacity = 0; mdevice->free_pages = NULL; + mdevice->free_folios = NULL; kfree(mdevice->devmem_chunks); mdevice->devmem_chunks = NULL; } @@ -1379,18 +1564,30 @@ static void dmirror_devmem_free(struct folio *folio) struct page *page = &folio->page; struct page *rpage = BACKING_PAGE(page); struct dmirror_device *mdevice; + struct folio *rfolio = page_folio(rpage); + unsigned int order = folio_order(rfolio); - if (rpage != page) - __free_page(rpage); + if (rpage != page) { + if (order) + __free_pages(rpage, order); + else + __free_page(rpage); + rpage = NULL; + } mdevice = dmirror_page_to_device(page); spin_lock(&mdevice->lock); /* Return page to our allocator if not freeing the chunk */ if (!dmirror_page_to_chunk(page)->remove) { - mdevice->cfree++; - page->zone_device_data = mdevice->free_pages; - mdevice->free_pages = page; + mdevice->cfree += 1 << order; + if (order) { + page->zone_device_data = mdevice->free_folios; + mdevice->free_folios = page_folio(page); + } else { + page->zone_device_data = mdevice->free_pages; + mdevice->free_pages = page; + } } spin_unlock(&mdevice->lock); } @@ -1398,36 +1595,52 @@ static void dmirror_devmem_free(struct folio *folio) static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) { struct migrate_vma args = { 0 }; - unsigned long src_pfns = 0; - unsigned long dst_pfns = 0; struct page *rpage; struct dmirror *dmirror; - vm_fault_t ret; + vm_fault_t ret = 0; + unsigned int order, nr; /* * Normally, a device would use the page->zone_device_data to point to * the mirror but here we use it to hold the page for the simulated * device memory and that page holds the pointer to the mirror. */ - rpage = vmf->page->zone_device_data; + rpage = folio_zone_device_data(page_folio(vmf->page)); dmirror = rpage->zone_device_data; /* FIXME demonstrate how we can adjust migrate range */ + order = folio_order(page_folio(vmf->page)); + nr = 1 << order; + + /* + * Consider a per-cpu cache of src and dst pfns, but with + * large number of cpus that might not scale well. + */ + args.start = ALIGN_DOWN(vmf->address, (PAGE_SIZE << order)); args.vma = vmf->vma; - args.start = vmf->address; - args.end = args.start + PAGE_SIZE; - args.src = &src_pfns; - args.dst = &dst_pfns; + args.end = args.start + (PAGE_SIZE << order); + + nr = (args.end - args.start) >> PAGE_SHIFT; + args.src = kcalloc(nr, sizeof(unsigned long), GFP_KERNEL); + args.dst = kcalloc(nr, sizeof(unsigned long), GFP_KERNEL); args.pgmap_owner = dmirror->mdevice; args.flags = dmirror_select_device(dmirror); args.fault_page = vmf->page; + if (!args.src || !args.dst) { + ret = VM_FAULT_OOM; + goto err; + } + + if (order) + args.flags |= MIGRATE_VMA_SELECT_COMPOUND; + if (migrate_vma_setup(&args)) return VM_FAULT_SIGBUS; ret = dmirror_devmem_fault_alloc_and_copy(&args, dmirror); if (ret) - return ret; + goto err; migrate_vma_pages(&args); /* * No device finalize step is needed since @@ -1435,7 +1648,10 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) * invalidated the device page table. */ migrate_vma_finalize(&args); - return 0; +err: + kfree(args.src); + kfree(args.dst); + return ret; } static const struct dev_pagemap_ops dmirror_devmem_ops = { @@ -1466,7 +1682,7 @@ static int dmirror_device_init(struct dmirror_device *mdevice, int id) return ret; /* Build a list of free ZONE_DEVICE struct pages */ - return dmirror_allocate_chunk(mdevice, NULL); + return dmirror_allocate_chunk(mdevice, NULL, false); } static void dmirror_device_remove(struct dmirror_device *mdevice) -- cgit v1.2.3 From 56ef398996435a0021569b86293d376649f12540 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:57:01 +1000 Subject: mm/memremap: add driver callback support for folio splitting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a zone device page is split (via huge pmd folio split). The driver callback for folio_split is invoked to let the device driver know that the folio size has been split into a smaller order. Provide a default implementation for drivers that do not provide this callback that copies the pgmap and mapping fields for the split folios. Update the HMM test driver to handle the split. Link: https://lkml.kernel.org/r/20251001065707.920170-11-balbirs@nvidia.com Signed-off-by: Balbir Singh Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Signed-off-by: Andrew Morton --- include/linux/memremap.h | 29 +++++++++++++++++++++++++++++ lib/test_hmm.c | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 7df4dd037b69..aca2b16d6889 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -100,6 +100,13 @@ struct dev_pagemap_ops { */ int (*memory_failure)(struct dev_pagemap *pgmap, unsigned long pfn, unsigned long nr_pages, int mf_flags); + + /* + * Used for private (un-addressable) device memory only. + * This callback is used when a folio is split into + * a smaller folio + */ + void (*folio_split)(struct folio *head, struct folio *tail); }; #define PGMAP_ALTMAP_VALID (1 << 0) @@ -235,6 +242,23 @@ static inline void zone_device_folio_init(struct folio *folio, unsigned int orde folio_set_large_rmappable(folio); } +static inline void zone_device_private_split_cb(struct folio *original_folio, + struct folio *new_folio) +{ + if (folio_is_device_private(original_folio)) { + if (!original_folio->pgmap->ops->folio_split) { + if (new_folio) { + new_folio->pgmap = original_folio->pgmap; + new_folio->page.mapping = + original_folio->page.mapping; + } + } else { + original_folio->pgmap->ops->folio_split(original_folio, + new_folio); + } + } +} + #else static inline void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) @@ -268,6 +292,11 @@ static inline unsigned long memremap_compat_align(void) { return PAGE_SIZE; } + +static inline void zone_device_private_split_cb(struct folio *original_folio, + struct folio *new_folio) +{ +} #endif /* CONFIG_ZONE_DEVICE */ static inline void put_dev_pagemap(struct dev_pagemap *pgmap) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 32d402e80bcc..46fa9e200db8 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -1654,9 +1654,44 @@ err: return ret; } +static void dmirror_devmem_folio_split(struct folio *head, struct folio *tail) +{ + struct page *rpage = BACKING_PAGE(folio_page(head, 0)); + struct page *rpage_tail; + struct folio *rfolio; + unsigned long offset = 0; + + if (!rpage) { + tail->page.zone_device_data = NULL; + return; + } + + rfolio = page_folio(rpage); + + if (tail == NULL) { + folio_reset_order(rfolio); + rfolio->mapping = NULL; + folio_set_count(rfolio, 1); + return; + } + + offset = folio_pfn(tail) - folio_pfn(head); + + rpage_tail = folio_page(rfolio, offset); + tail->page.zone_device_data = rpage_tail; + rpage_tail->zone_device_data = rpage->zone_device_data; + clear_compound_head(rpage_tail); + rpage_tail->mapping = NULL; + + folio_page(tail, 0)->mapping = folio_page(head, 0)->mapping; + tail->pgmap = head->pgmap; + folio_set_count(page_folio(rpage_tail), 1); +} + static const struct dev_pagemap_ops dmirror_devmem_ops = { .folio_free = dmirror_devmem_free, .migrate_to_ram = dmirror_devmem_fault, + .folio_split = dmirror_devmem_folio_split, }; static int dmirror_device_init(struct dmirror_device *mdevice, int id) -- cgit v1.2.3 From 4265d67e405a41562634279ca1ededf79fdadcd7 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:57:02 +1000 Subject: mm/migrate_device: add THP splitting during migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement migrate_vma_split_pages() to handle THP splitting during the migration process when destination cannot allocate compound pages. This addresses the common scenario where migrate_vma_setup() succeeds with MIGRATE_PFN_COMPOUND pages, but the destination device cannot allocate large pages during the migration phase. Key changes: - migrate_vma_split_pages(): Split already-isolated pages during migration - Enhanced folio_split() and __split_unmapped_folio() with isolated parameter to avoid redundant unmap/remap operations This provides a fallback mechansim to ensure migration succeeds even when large page allocation fails at the destination. [matthew.brost@intel.com: add THP splitting during migration] Link: https://lkml.kernel.org/r/20251120230825.181072-2-matthew.brost@intel.com Link: https://lkml.kernel.org/r/20251001065707.920170-12-balbirs@nvidia.com Signed-off-by: Balbir Singh Signed-off-by: Matthew Brost Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Francois Dugast Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 11 +++++-- lib/test_hmm.c | 9 +++++ mm/huge_memory.c | 46 ++++++++++++++------------ mm/migrate_device.c | 87 ++++++++++++++++++++++++++++++++++++++++++------- 4 files changed, 119 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 82408c90b396..ed99e6bd31ac 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -365,8 +365,8 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add vm_flags_t vm_flags); bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins); -int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, - unsigned int new_order); +int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list, + unsigned int new_order, bool unmapped); int min_order_for_split(struct folio *folio); int split_folio_to_list(struct folio *folio, struct list_head *list); bool uniform_split_supported(struct folio *folio, unsigned int new_order, @@ -375,6 +375,13 @@ bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, bool warns); int folio_split(struct folio *folio, unsigned int new_order, struct page *page, struct list_head *list); + +static inline int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, + unsigned int new_order) +{ + return __split_huge_page_to_list_to_order(page, list, new_order, false); +} + /* * try_folio_split_to_order - try to split a @folio at @page to @new_order using * non uniform split. diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 46fa9e200db8..df429670633e 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -1612,6 +1612,15 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) order = folio_order(page_folio(vmf->page)); nr = 1 << order; + /* + * When folios are partially mapped, we can't rely on the folio + * order of vmf->page as the folio might not be fully split yet + */ + if (vmf->pte) { + order = 0; + nr = 1; + } + /* * Consider a per-cpu cache of src and dst pfns, but with * large number of cpus that might not scale well. diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ded707a50af8..81e511f1ed26 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3452,15 +3452,6 @@ static void __split_folio_to_order(struct folio *folio, int old_order, new_folio->mapping = folio->mapping; new_folio->index = folio->index + i; - /* - * page->private should not be set in tail pages. Fix up and warn once - * if private is unexpectedly set. - */ - if (unlikely(new_folio->private)) { - VM_WARN_ON_ONCE_PAGE(true, new_head); - new_folio->private = NULL; - } - if (folio_test_swapcache(folio)) new_folio->swap.val = folio->swap.val + i; @@ -3661,6 +3652,7 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order, * @lock_at: a page within @folio to be left locked to caller * @list: after-split folios will be put on it if non NULL * @uniform_split: perform uniform split or not (non-uniform split) + * @unmapped: The pages are already unmapped, they are migration entries. * * It calls __split_unmapped_folio() to perform uniform and non-uniform split. * It is in charge of checking whether the split is supported or not and @@ -3676,7 +3668,7 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order, */ static int __folio_split(struct folio *folio, unsigned int new_order, struct page *split_at, struct page *lock_at, - struct list_head *list, bool uniform_split) + struct list_head *list, bool uniform_split, bool unmapped) { struct deferred_split *ds_queue = get_deferred_split_queue(folio); XA_STATE(xas, &folio->mapping->i_pages, folio->index); @@ -3736,13 +3728,15 @@ static int __folio_split(struct folio *folio, unsigned int new_order, * is taken to serialise against parallel split or collapse * operations. */ - anon_vma = folio_get_anon_vma(folio); - if (!anon_vma) { - ret = -EBUSY; - goto out; + if (!unmapped) { + anon_vma = folio_get_anon_vma(folio); + if (!anon_vma) { + ret = -EBUSY; + goto out; + } + anon_vma_lock_write(anon_vma); } mapping = NULL; - anon_vma_lock_write(anon_vma); } else { unsigned int min_order; gfp_t gfp; @@ -3795,7 +3789,8 @@ static int __folio_split(struct folio *folio, unsigned int new_order, goto out_unlock; } - unmap_folio(folio); + if (!unmapped) + unmap_folio(folio); /* block interrupt reentry in xa_lock and spinlock */ local_irq_disable(); @@ -3882,10 +3877,13 @@ static int __folio_split(struct folio *folio, unsigned int new_order, next = folio_next(new_folio); + zone_device_private_split_cb(folio, new_folio); + expected_refs = folio_expected_ref_count(new_folio) + 1; folio_ref_unfreeze(new_folio, expected_refs); - lru_add_split_folio(folio, new_folio, lruvec, list); + if (!unmapped) + lru_add_split_folio(folio, new_folio, lruvec, list); /* * Anonymous folio with swap cache. @@ -3916,6 +3914,8 @@ static int __folio_split(struct folio *folio, unsigned int new_order, __filemap_remove_folio(new_folio, NULL); folio_put_refs(new_folio, nr_pages); } + + zone_device_private_split_cb(folio, NULL); /* * Unfreeze @folio only after all page cache entries, which * used to point to it, have been updated with new folios. @@ -3939,6 +3939,9 @@ fail: local_irq_enable(); + if (unmapped) + return ret; + if (nr_shmem_dropped) shmem_uncharge(mapping->host, nr_shmem_dropped); @@ -4029,12 +4032,13 @@ out: * Returns -EINVAL when trying to split to an order that is incompatible * with the folio. Splitting to order 0 is compatible with all folios. */ -int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, - unsigned int new_order) +int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list, + unsigned int new_order, bool unmapped) { struct folio *folio = page_folio(page); - return __folio_split(folio, new_order, &folio->page, page, list, true); + return __folio_split(folio, new_order, &folio->page, page, list, true, + unmapped); } /* @@ -4063,7 +4067,7 @@ int folio_split(struct folio *folio, unsigned int new_order, struct page *split_at, struct list_head *list) { return __folio_split(folio, new_order, split_at, &folio->page, list, - false); + false, false); } int min_order_for_split(struct folio *folio) diff --git a/mm/migrate_device.c b/mm/migrate_device.c index a0a315f3572a..ab373fd38961 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -309,6 +309,25 @@ again: pgmap->owner != migrate->pgmap_owner) goto next; + folio = page_folio(page); + if (folio_test_large(folio)) { + int ret; + + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(ptep, ptl); + ret = migrate_vma_split_folio(folio, + migrate->fault_page); + + if (ret) { + if (unmapped) + flush_tlb_range(walk->vma, start, end); + + return migrate_vma_collect_skip(addr, end, walk); + } + + goto again; + } + mpfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE; if (is_writable_device_private_entry(entry)) @@ -885,6 +904,29 @@ abort: src[i] &= ~MIGRATE_PFN_MIGRATE; return 0; } + +static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate, + unsigned long idx, unsigned long addr, + struct folio *folio) +{ + unsigned long i; + unsigned long pfn; + unsigned long flags; + int ret = 0; + + folio_get(folio); + split_huge_pmd_address(migrate->vma, addr, true); + ret = __split_huge_page_to_list_to_order(folio_page(folio, 0), NULL, + 0, true); + if (ret) + return ret; + migrate->src[idx] &= ~MIGRATE_PFN_COMPOUND; + flags = migrate->src[idx] & ((1UL << MIGRATE_PFN_SHIFT) - 1); + pfn = migrate->src[idx] >> MIGRATE_PFN_SHIFT; + for (i = 1; i < HPAGE_PMD_NR; i++) + migrate->src[i+idx] = migrate_pfn(pfn + i) | flags; + return ret; +} #else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */ static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate, unsigned long addr, @@ -894,6 +936,13 @@ static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate, { return 0; } + +static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate, + unsigned long idx, unsigned long addr, + struct folio *folio) +{ + return 0; +} #endif static unsigned long migrate_vma_nr_pages(unsigned long *src) @@ -1055,8 +1104,9 @@ static void __migrate_device_pages(unsigned long *src_pfns, struct migrate_vma *migrate) { struct mmu_notifier_range range; - unsigned long i; + unsigned long i, j; bool notified = false; + unsigned long addr; for (i = 0; i < npages; ) { struct page *newpage = migrate_pfn_to_page(dst_pfns[i]); @@ -1098,12 +1148,16 @@ static void __migrate_device_pages(unsigned long *src_pfns, (!(dst_pfns[i] & MIGRATE_PFN_COMPOUND))) { nr = migrate_vma_nr_pages(&src_pfns[i]); src_pfns[i] &= ~MIGRATE_PFN_COMPOUND; - src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; - goto next; + } else { + nr = 1; } - migrate_vma_insert_page(migrate, addr, &dst_pfns[i], - &src_pfns[i]); + for (j = 0; j < nr && i + j < npages; j++) { + src_pfns[i+j] |= MIGRATE_PFN_MIGRATE; + migrate_vma_insert_page(migrate, + addr + j * PAGE_SIZE, + &dst_pfns[i+j], &src_pfns[i+j]); + } goto next; } @@ -1125,7 +1179,13 @@ static void __migrate_device_pages(unsigned long *src_pfns, MIGRATE_PFN_COMPOUND); goto next; } - src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; + nr = 1 << folio_order(folio); + addr = migrate->start + i * PAGE_SIZE; + if (migrate_vma_split_unmapped_folio(migrate, i, addr, folio)) { + src_pfns[i] &= ~(MIGRATE_PFN_MIGRATE | + MIGRATE_PFN_COMPOUND); + goto next; + } } else if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) && (dst_pfns[i] & MIGRATE_PFN_COMPOUND) && !(src_pfns[i] & MIGRATE_PFN_COMPOUND)) { @@ -1161,11 +1221,16 @@ static void __migrate_device_pages(unsigned long *src_pfns, if (migrate && migrate->fault_page == page) extra_cnt = 1; - r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt); - if (r) - src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; - else - folio_migrate_flags(newfolio, folio); + for (j = 0; j < nr && i + j < npages; j++) { + folio = page_folio(migrate_pfn_to_page(src_pfns[i+j])); + newfolio = page_folio(migrate_pfn_to_page(dst_pfns[i+j])); + + r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt); + if (r) + src_pfns[i+j] &= ~MIGRATE_PFN_MIGRATE; + else + folio_migrate_flags(newfolio, folio); + } next: i += nr; } -- cgit v1.2.3 From ac7756771a34f19c9a757eb86efe028e51f57b23 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 8 Oct 2025 09:54:53 +0000 Subject: mm/khugepaged: unify pmd folio installation with map_anon_folio_pmd() Currently we install pmd folio with map_anon_folio_pmd() in __do_huge_pmd_anonymous_page() and do_huge_zero_wp_pmd(). While in collapse_huge_page(), it is done with identical code except statistics adjustment. Unify the process with map_anon_folio_pmd() to install pmd folio. Split it to map_anon_folio_pmd_pf() and map_anon_folio_pmd_nopf() to be used in page fault or not respectively. No functional change is intended. [akpm@linux-foundation.org: remove unneeded map_anon_folio_pmd_nopf() stub, per Wei & David] Link: https://lkml.kernel.org/r/20251008095453.18772-3-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: David Hildenbrand Reviewed-by: Zi Yan Acked-by: Lance Yang Cc: David Hildenbrand Cc: Lance Yang Cc: Dev Jain Cc: Zi Yan Cc: Usama Arif Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 2 ++ mm/huge_memory.c | 14 ++++++++++---- mm/khugepaged.c | 9 +-------- 3 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ed99e6bd31ac..396d9e3d1d46 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -533,6 +533,8 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, bool freeze); bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, struct folio *folio); +void map_anon_folio_pmd_nopf(struct folio *folio, pmd_t *pmd, + struct vm_area_struct *vma, unsigned long haddr); #else /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a2a2fda2bff8..05bf419513ad 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1218,7 +1218,7 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma, return folio; } -static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd, +void map_anon_folio_pmd_nopf(struct folio *folio, pmd_t *pmd, struct vm_area_struct *vma, unsigned long haddr) { pmd_t entry; @@ -1229,11 +1229,17 @@ static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd, folio_add_lru_vma(folio, vma); set_pmd_at(vma->vm_mm, haddr, pmd, entry); update_mmu_cache_pmd(vma, haddr, pmd); + deferred_split_folio(folio, false); +} + +static void map_anon_folio_pmd_pf(struct folio *folio, pmd_t *pmd, + struct vm_area_struct *vma, unsigned long haddr) +{ + map_anon_folio_pmd_nopf(folio, pmd, vma, haddr); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); count_vm_event(THP_FAULT_ALLOC); count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); - deferred_split_folio(folio, false); } static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) @@ -1272,7 +1278,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) return ret; } pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); - map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); + map_anon_folio_pmd_pf(folio, vmf->pmd, vma, haddr); mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); } @@ -1944,7 +1950,7 @@ static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf) if (ret) goto release; (void)pmdp_huge_clear_flush(vma, haddr, vmf->pmd); - map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); + map_anon_folio_pmd_pf(folio, vmf->pmd, vma, haddr); goto unlock; release: folio_put(folio); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 1b5c2e942df9..af1c162c9a94 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1226,17 +1226,10 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, __folio_mark_uptodate(folio); pgtable = pmd_pgtable(_pmd); - _pmd = folio_mk_pmd(folio, vma->vm_page_prot); - _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); - spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); - folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE); - folio_add_lru_vma(folio, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); - set_pmd_at(mm, address, pmd, _pmd); - update_mmu_cache_pmd(vma, address, pmd); - deferred_split_folio(folio, false); + map_anon_folio_pmd_nopf(folio, pmd, vma, address); spin_unlock(pmd_ptl); folio = NULL; -- cgit v1.2.3 From a7ef12c64fd991c0f42b2e1bf0c4f09068575864 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 31 Oct 2025 12:19:59 -0400 Subject: mm/huge_memory: add split_huge_page_to_order() Patch series "Optimize folio split in memory failure", v5. This patchset optimizes folio split operations in memory failure code by always splitting a folio to min_order_for_split() to minimize unusable pages, even if min_order_for_split() is non zero and memory failure code would take the failed path eventually for a successfully split folio. This means instead of making the entire original folio unusable memory failure code would only make its after-split folio, which has order of min_order_for_split() and contains HWPoison page, unusable. For soft offline case, since the original folio is still accessible, no split is performed if the folio cannot be split to order-0 to prevent potential performance loss. In addition, add split_huge_page_to_order() to improve code readability and fix kernel-doc comment format for folio_split() and other related functions. Background ========== This patchset is a follow-up of "[PATCH v3] mm/huge_memory: do not change split_huge_page*() target order silently."[1] and [PATCH v4] mm/huge_memory: preserve PG_has_hwpoisoned if a folio is split to >0 order[2], since both are separated out as hotfixes. It improves how memory failure code handles large block size(LBS) folios with min_order_for_split() > 0. By splitting a large folio containing HW poisoned pages to min_order_for_split(), the after-split folios without HW poisoned pages could be freed for reuse. To achieve this, folio split code needs to set has_hwpoisoned on after-split folios containing HW poisoned pages and it is done in the hotfix in [2]. This patchset includes: 1. A patch adds split_huge_page_to_order(), 2. Patch 2 and Patch 3 of "[PATCH v2 0/3] Do not change split folio target order"[3], This patch (of 3): When the caller does not supply a list to split_huge_page_to_list_to_order(), use split_huge_page_to_order() instead. Link: https://lkml.kernel.org/r/20251031162001.670503-1-ziy@nvidia.com Link: https://lkml.kernel.org/r/20251031162001.670503-2-ziy@nvidia.com Link: https://lore.kernel.org/all/20251017013630.139907-1-ziy@nvidia.com/ [1] Link: https://lore.kernel.org/all/20251023030521.473097-1-ziy@nvidia.com/ [2] Link: https://lore.kernel.org/all/20251016033452.125479-1-ziy@nvidia.com/ [3] Signed-off-by: Zi Yan Acked-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Reviewed-by: Wei Yang Reviewed-by: Miaohe Lin Reviewed-by: Barry Song Reviewed-by: Lance Yang Cc: Baolin Wang Cc: Dev Jain Cc: Jane Chu Cc: Liam Howlett Cc: Luis Chamberalin Cc: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Cc: Nico Pache Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 396d9e3d1d46..a06924cf4065 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -381,6 +381,10 @@ static inline int split_huge_page_to_list_to_order(struct page *page, struct lis { return __split_huge_page_to_list_to_order(page, list, new_order, false); } +static inline int split_huge_page_to_order(struct page *page, unsigned int new_order) +{ + return split_huge_page_to_list_to_order(page, NULL, new_order); +} /* * try_folio_split_to_order - try to split a @folio at @page to @new_order using @@ -400,8 +404,7 @@ static inline int try_folio_split_to_order(struct folio *folio, struct page *page, unsigned int new_order) { if (!non_uniform_split_supported(folio, new_order, /* warns= */ false)) - return split_huge_page_to_list_to_order(&folio->page, NULL, - new_order); + return split_huge_page_to_order(&folio->page, new_order); return folio_split(folio, new_order, page, NULL); } static inline int split_huge_page(struct page *page) @@ -587,6 +590,11 @@ split_huge_page_to_list_to_order(struct page *page, struct list_head *list, VM_WARN_ON_ONCE_PAGE(1, page); return -EINVAL; } +static inline int split_huge_page_to_order(struct page *page, unsigned int new_order) +{ + VM_WARN_ON_ONCE_PAGE(1, page); + return -EINVAL; +} static inline int split_huge_page(struct page *page) { VM_WARN_ON_ONCE_PAGE(1, page); -- cgit v1.2.3 From 50d0598cf2c9d33e1f08c3b1a357752ea8a9b94a Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 31 Oct 2025 12:20:01 -0400 Subject: mm/huge_memory: fix kernel-doc comments for folio_split() and related try_folio_split_to_order(), folio_split, __folio_split(), and __split_unmapped_folio() do not have correct kernel-doc comment format. Fix them. [ziy@nvidia.com: kernel-doc fixup] Link: https://lkml.kernel.org/r/BE7AC5F3-9E64-4923-861D-C2C4E0CB91EB@nvidia.com [ziy@nvidia.com: add newline to fix an error and a warning from docutils] Link: https://lkml.kernel.org/r/040B38C0-23C6-4AEA-B069-69AE6DAA828B@nvidia.com Link: https://lkml.kernel.org/r/20251031162001.670503-4-ziy@nvidia.com Signed-off-by: Zi Yan Reviewed-by: Lorenzo Stoakes Reviewed-by: Lance Yang Reviewed-by: Barry Song Reviewed-by: Miaohe Lin Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Wei Yang Cc: Baolin Wang Cc: Dev Jain Cc: Jane Chu Cc: Liam Howlett Cc: Luis Chamberalin Cc: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Cc: Nico Pache Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 10 ++++++---- mm/huge_memory.c | 52 ++++++++++++++++++++++++++----------------------- 2 files changed, 34 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a06924cf4065..9f7f7d772fe5 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -386,9 +386,9 @@ static inline int split_huge_page_to_order(struct page *page, unsigned int new_o return split_huge_page_to_list_to_order(page, NULL, new_order); } -/* - * try_folio_split_to_order - try to split a @folio at @page to @new_order using - * non uniform split. +/** + * try_folio_split_to_order() - try to split a @folio at @page to @new_order + * using non uniform split. * @folio: folio to be split * @page: split to @new_order at the given page * @new_order: the target split order @@ -398,7 +398,7 @@ static inline int split_huge_page_to_order(struct page *page, unsigned int new_o * folios are put back to LRU list. Use min_order_for_split() to get the lower * bound of @new_order. * - * Return: 0: split is successful, otherwise split failed. + * Return: 0 - split is successful, otherwise split failed. */ static inline int try_folio_split_to_order(struct folio *folio, struct page *page, unsigned int new_order) @@ -483,6 +483,8 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, /** * folio_test_pmd_mappable - Can we map this folio with a PMD? * @folio: The folio to test + * + * Return: true - @folio can be mapped, false - @folio cannot be mapped. */ static inline bool folio_test_pmd_mappable(struct folio *folio) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 30d6afc79016..3d87127c02cf 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3493,8 +3493,9 @@ static void __split_folio_to_order(struct folio *folio, int old_order, ClearPageCompound(&folio->page); } -/* - * It splits an unmapped @folio to lower order smaller folios in two ways. +/** + * __split_unmapped_folio() - splits an unmapped @folio to lower order folios in + * two ways: uniform split or non-uniform split. * @folio: the to-be-split folio * @new_order: the smallest order of the after split folios (since buddy * allocator like split generates folios with orders from @folio's @@ -3511,26 +3512,27 @@ static void __split_folio_to_order(struct folio *folio, int old_order, * uniform_split is true. * 2. buddy allocator like (non-uniform) split: the given @folio is split into * half and one of the half (containing the given page) is split into half - * until the given @page's order becomes @new_order. This is done when + * until the given @folio's order becomes @new_order. This is done when * uniform_split is false. * * The high level flow for these two methods are: - * 1. uniform split: a single __split_folio_to_order() is called to split the - * @folio into @new_order, then we traverse all the resulting folios one by - * one in PFN ascending order and perform stats, unfreeze, adding to list, - * and file mapping index operations. - * 2. non-uniform split: in general, folio_order - @new_order calls to - * __split_folio_to_order() are made in a for loop to split the @folio - * to one lower order at a time. The resulting small folios are processed - * like what is done during the traversal in 1, except the one containing - * @page, which is split in next for loop. + * + * 1. uniform split: @xas is split with no expectation of failure and a single + * __split_folio_to_order() is called to split the @folio into @new_order + * along with stats update. + * 2. non-uniform split: folio_order - @new_order calls to + * __split_folio_to_order() are expected to be made in a for loop to split + * the @folio to one lower order at a time. The folio containing @split_at + * is split in each iteration. @xas is split into half in each iteration and + * can fail. A failed @xas split leaves split folios as is without merging + * them back. * * After splitting, the caller's folio reference will be transferred to the - * folio containing @page. The caller needs to unlock and/or free after-split - * folios if necessary. + * folio containing @split_at. The caller needs to unlock and/or free + * after-split folios if necessary. * - * For !uniform_split, when -ENOMEM is returned, the original folio might be - * split. The caller needs to check the input folio. + * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be + * split but not to @new_order, the caller needs to check) */ static int __split_unmapped_folio(struct folio *folio, int new_order, struct page *split_at, struct xa_state *xas, @@ -3650,8 +3652,8 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order, return true; } -/* - * __folio_split: split a folio at @split_at to a @new_order folio +/** + * __folio_split() - split a folio at @split_at to a @new_order folio * @folio: folio to split * @new_order: the order of the new folio * @split_at: a page within the new folio @@ -3669,7 +3671,7 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order, * 1. for uniform split, @lock_at points to one of @folio's subpages; * 2. for buddy allocator like (non-uniform) split, @lock_at points to @folio. * - * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be + * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be * split but not to @new_order, the caller needs to check) */ static int __folio_split(struct folio *folio, unsigned int new_order, @@ -4047,14 +4049,13 @@ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list unmapped); } -/* - * folio_split: split a folio at @split_at to a @new_order folio +/** + * folio_split() - split a folio at @split_at to a @new_order folio * @folio: folio to split * @new_order: the order of the new folio * @split_at: a page within the new folio - * - * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be - * split but not to @new_order, the caller needs to check) + * @list: after-split folios are added to @list if not null, otherwise to LRU + * list * * It has the same prerequisites and returns as * split_huge_page_to_list_to_order(). @@ -4068,6 +4069,9 @@ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list * [order-4, {order-3}, order-3, order-5, order-6, order-7, order-8]. * * After split, folio is left locked for caller. + * + * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be + * split but not to @new_order, the caller needs to check) */ int folio_split(struct folio *folio, unsigned int new_order, struct page *split_at, struct list_head *list) -- cgit v1.2.3 From c467061fbb6eb483d59f546c145b2ff2249455e4 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 6 Nov 2025 03:41:54 +0000 Subject: mm/huge_memory: introduce enum split_type for clarity Patch series "mm/huge_memory: Define split_type and consolidate split support checks", v3. This two-patch series focuses on improving code clarity and removing redundancy in the huge memory handling logic related to folio splitting. The series is based on an original proposal to merge two significantly identical functions that check folio split support[1]. During this process, we found an opportunity to improve readability by explicitly defining the split types. Patch 1: define split_type and use it Patch 2: merge uniform_split_supported() and non_uniform_split_supported() This patch (of 2): We currently handle two distinct types of large folio splitting: * uniform split * non-uniform split Differentiating between these types using a simple boolean variable is not obvious and can harm code readability. This commit introduces enum split_type to explicitly define these two types. Replacing the existing boolean variable with this enumeration significantly improves code clarity and expressiveness when dealing with folio splitting logic. No functional change is expected. [akpm@linux-foundation.org: tweak layout, per David] Link: https://lkml.kernel.org/r/20251106034155.21398-1-richard.weiyang@gmail.com Link: https://lkml.kernel.org/r/20251106034155.21398-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Zi Yan Cc: "David Hildenbrand (Red Hat)" Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Nico Pache Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 5 +++++ mm/huge_memory.c | 30 +++++++++++++++--------------- 2 files changed, 20 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 9f7f7d772fe5..b74708dc5b5f 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -364,6 +364,11 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); +enum split_type { + SPLIT_TYPE_UNIFORM, + SPLIT_TYPE_NON_UNIFORM, +}; + bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins); int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order, bool unmapped); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3d87127c02cf..4118f330c55e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3504,16 +3504,16 @@ static void __split_folio_to_order(struct folio *folio, int old_order, * will be split until its order becomes @new_order. * @xas: xa_state pointing to folio->mapping->i_pages and locked by caller * @mapping: @folio->mapping - * @uniform_split: if the split is uniform or not (buddy allocator like split) + * @split_type: if the split is uniform or not (buddy allocator like split) * * * 1. uniform split: the given @folio into multiple @new_order small folios, * where all small folios have the same order. This is done when - * uniform_split is true. + * split_type is SPLIT_TYPE_UNIFORM. * 2. buddy allocator like (non-uniform) split: the given @folio is split into * half and one of the half (containing the given page) is split into half * until the given @folio's order becomes @new_order. This is done when - * uniform_split is false. + * split_type is SPLIT_TYPE_NON_UNIFORM. * * The high level flow for these two methods are: * @@ -3536,11 +3536,11 @@ static void __split_folio_to_order(struct folio *folio, int old_order, */ static int __split_unmapped_folio(struct folio *folio, int new_order, struct page *split_at, struct xa_state *xas, - struct address_space *mapping, bool uniform_split) + struct address_space *mapping, enum split_type split_type) { const bool is_anon = folio_test_anon(folio); int old_order = folio_order(folio); - int start_order = uniform_split ? new_order : old_order - 1; + int start_order = split_type == SPLIT_TYPE_UNIFORM ? new_order : old_order - 1; int split_order; /* @@ -3562,7 +3562,7 @@ static int __split_unmapped_folio(struct folio *folio, int new_order, * irq is disabled to allocate enough memory, whereas * non-uniform split can handle ENOMEM. */ - if (uniform_split) + if (split_type == SPLIT_TYPE_UNIFORM) xas_split(xas, folio, old_order); else { xas_set_order(xas, folio->index, split_order); @@ -3659,7 +3659,7 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order, * @split_at: a page within the new folio * @lock_at: a page within @folio to be left locked to caller * @list: after-split folios will be put on it if non NULL - * @uniform_split: perform uniform split or not (non-uniform split) + * @split_type: perform uniform split or not (non-uniform split) * @unmapped: The pages are already unmapped, they are migration entries. * * It calls __split_unmapped_folio() to perform uniform and non-uniform split. @@ -3676,7 +3676,7 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order, */ static int __folio_split(struct folio *folio, unsigned int new_order, struct page *split_at, struct page *lock_at, - struct list_head *list, bool uniform_split, bool unmapped) + struct list_head *list, enum split_type split_type, bool unmapped) { struct deferred_split *ds_queue = get_deferred_split_queue(folio); XA_STATE(xas, &folio->mapping->i_pages, folio->index); @@ -3711,10 +3711,10 @@ static int __folio_split(struct folio *folio, unsigned int new_order, if (new_order >= old_order) return -EINVAL; - if (uniform_split && !uniform_split_supported(folio, new_order, true)) + if (split_type == SPLIT_TYPE_UNIFORM && !uniform_split_supported(folio, new_order, true)) return -EINVAL; - if (!uniform_split && + if (split_type == SPLIT_TYPE_NON_UNIFORM && !non_uniform_split_supported(folio, new_order, true)) return -EINVAL; @@ -3764,7 +3764,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, goto out; } - if (uniform_split) { + if (split_type == SPLIT_TYPE_UNIFORM) { xas_set_order(&xas, folio->index, new_order); xas_split_alloc(&xas, folio, old_order, gfp); if (xas_error(&xas)) { @@ -3869,7 +3869,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, lruvec = folio_lruvec_lock(folio); ret = __split_unmapped_folio(folio, new_order, split_at, &xas, - mapping, uniform_split); + mapping, split_type); /* * Unfreeze after-split folios and put them back to the right @@ -4045,8 +4045,8 @@ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list { struct folio *folio = page_folio(page); - return __folio_split(folio, new_order, &folio->page, page, list, true, - unmapped); + return __folio_split(folio, new_order, &folio->page, page, list, + SPLIT_TYPE_UNIFORM, unmapped); } /** @@ -4077,7 +4077,7 @@ int folio_split(struct folio *folio, unsigned int new_order, struct page *split_at, struct list_head *list) { return __folio_split(folio, new_order, split_at, &folio->page, list, - false, false); + SPLIT_TYPE_NON_UNIFORM, false); } int min_order_for_split(struct folio *folio) -- cgit v1.2.3 From 8a0e4bdddd1c998b894d879a1d22f1e745606215 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 6 Nov 2025 03:41:55 +0000 Subject: mm/huge_memory: merge uniform_split_supported() and non_uniform_split_supported() uniform_split_supported() and non_uniform_split_supported() share significantly similar logic. The only functional difference is that uniform_split_supported() includes an additional check on the requested @new_order. The reason for this check comes from the following two aspects: * some file system or swap cache just supports order-0 folio * the behavioral difference between uniform/non-uniform split The behavioral difference between uniform split and non-uniform: * uniform split splits folio directly to @new_order * non-uniform split creates after-split folios with orders from folio_order(folio) - 1 to new_order. This means for non-uniform split or !new_order split we should check the file system and swap cache respectively. This commit unifies the logic and merge the two functions into a single combined helper, removing redundant code and simplifying the split support checking mechanism. Link: https://lkml.kernel.org/r/20251106034155.21398-3-richard.weiyang@gmail.com Fixes: c010d47f107f ("mm: thp: split huge page to any lower order pages") Signed-off-by: Wei Yang Reviewed-by: Zi Yan Cc: Zi Yan Cc: "David Hildenbrand (Red Hat)" Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Nico Pache Cc: Ryan Roberts Cc: Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 8 +++--- mm/huge_memory.c | 71 +++++++++++++++++++++---------------------------- 2 files changed, 33 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index b74708dc5b5f..19d4a5f52ca2 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -374,10 +374,8 @@ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list unsigned int new_order, bool unmapped); int min_order_for_split(struct folio *folio); int split_folio_to_list(struct folio *folio, struct list_head *list); -bool uniform_split_supported(struct folio *folio, unsigned int new_order, - bool warns); -bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, - bool warns); +bool folio_split_supported(struct folio *folio, unsigned int new_order, + enum split_type split_type, bool warns); int folio_split(struct folio *folio, unsigned int new_order, struct page *page, struct list_head *list); @@ -408,7 +406,7 @@ static inline int split_huge_page_to_order(struct page *page, unsigned int new_o static inline int try_folio_split_to_order(struct folio *folio, struct page *page, unsigned int new_order) { - if (!non_uniform_split_supported(folio, new_order, /* warns= */ false)) + if (!folio_split_supported(folio, new_order, SPLIT_TYPE_NON_UNIFORM, /* warns= */ false)) return split_huge_page_to_order(&folio->page, new_order); return folio_split(folio, new_order, page, NULL); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4118f330c55e..d79a4bb363de 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3593,8 +3593,8 @@ static int __split_unmapped_folio(struct folio *folio, int new_order, return 0; } -bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, - bool warns) +bool folio_split_supported(struct folio *folio, unsigned int new_order, + enum split_type split_type, bool warns) { if (folio_test_anon(folio)) { /* order-1 is not supported for anonymous THP. */ @@ -3602,48 +3602,41 @@ bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, "Cannot split to order-1 folio"); if (new_order == 1) return false; - } else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && - !mapping_large_folio_support(folio->mapping)) { - /* - * No split if the file system does not support large folio. - * Note that we might still have THPs in such mappings due to - * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping - * does not actually support large folios properly. - */ - VM_WARN_ONCE(warns, - "Cannot split file folio to non-0 order"); - return false; - } - - /* Only swapping a whole PMD-mapped folio is supported */ - if (folio_test_swapcache(folio)) { - VM_WARN_ONCE(warns, - "Cannot split swapcache folio to non-0 order"); - return false; - } - - return true; -} - -/* See comments in non_uniform_split_supported() */ -bool uniform_split_supported(struct folio *folio, unsigned int new_order, - bool warns) -{ - if (folio_test_anon(folio)) { - VM_WARN_ONCE(warns && new_order == 1, - "Cannot split to order-1 folio"); - if (new_order == 1) - return false; - } else if (new_order) { + } else if (split_type == SPLIT_TYPE_NON_UNIFORM || new_order) { if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !mapping_large_folio_support(folio->mapping)) { + /* + * We can always split a folio down to a single page + * (new_order == 0) uniformly. + * + * For any other scenario + * a) uniform split targeting a large folio + * (new_order > 0) + * b) any non-uniform split + * we must confirm that the file system supports large + * folios. + * + * Note that we might still have THPs in such + * mappings, which is created from khugepaged when + * CONFIG_READ_ONLY_THP_FOR_FS is enabled. But in that + * case, the mapping does not actually support large + * folios properly. + */ VM_WARN_ONCE(warns, "Cannot split file folio to non-0 order"); return false; } } - if (new_order && folio_test_swapcache(folio)) { + /* + * swapcache folio could only be split to order 0 + * + * non-uniform split creates after-split folios with orders from + * folio_order(folio) - 1 to new_order, making it not suitable for any + * swapcache folio split. Only uniform split to order-0 can be used + * here. + */ + if ((split_type == SPLIT_TYPE_NON_UNIFORM || new_order) && folio_test_swapcache(folio)) { VM_WARN_ONCE(warns, "Cannot split swapcache folio to non-0 order"); return false; @@ -3711,11 +3704,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, if (new_order >= old_order) return -EINVAL; - if (split_type == SPLIT_TYPE_UNIFORM && !uniform_split_supported(folio, new_order, true)) - return -EINVAL; - - if (split_type == SPLIT_TYPE_NON_UNIFORM && - !non_uniform_split_supported(folio, new_order, true)) + if (!folio_split_supported(folio, new_order, split_type, /* warn = */ true)) return -EINVAL; is_hzp = is_huge_zero_folio(folio); -- cgit v1.2.3 From c093cf451094a9a03c4d4929bc30122a53038b7b Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:19 +0000 Subject: mm: correctly handle UFFD PTE markers Patch series "mm: remove is_swap_[pte, pmd]() + non-swap entries, introduce leaf entries", v3. There's an established convention in the kernel that we treat leaf page tables (so far at the PTE, PMD level) as containing 'swap entries' should they be neither empty (i.e. p**_none() evaluating true) nor present (i.e. p**_present() evaluating true). However, at the same time we also have helper predicates - is_swap_pte(), is_swap_pmd() - which are inconsistently used. This is problematic, as it is logical to assume that should somebody wish to operate upon a page table swap entry they should first check to see if it is in fact one. It also implies that perhaps, in future, we might introduce a non-present, none page table entry that is not a swap entry. This series resolves this issue by systematically eliminating all use of the is_swap_pte() and is swap_pmd() predicates so we retain only the convention that should a leaf page table entry be neither none nor present it is a swap entry. We also have the further issue that 'swap entry' is unfortunately a really rather overloaded term and in fact refers to both entries for swap and for other information such as migration entries, page table markers, and device private entries. We therefore have the rather 'unique' concept of a 'non-swap' swap entry. This series therefore introduces the concept of 'software leaf entries', of type softleaf_t, to eliminate this confusion. A software leaf entry in this sense is any page table entry which is non-present, and represented by the softleaf_t type. That is - page table leaf entries which are software-controlled by the kernel. This includes 'none' or empty entries, which are simply represented by an zero leaf entry value. In order to maintain compatibility as we transition the kernel to this new type, we simply typedef swp_entry_t to softleaf_t. We introduce a number of predicates and helpers to interact with software leaf entries in include/linux/leafops.h which, as it imports swapops.h, can be treated as a drop-in replacement for swapops.h wherever leaf entry helpers are used. Since softleaf_from_[pte, pmd]() treats present entries as they were empty/none leaf entries, this allows for a great deal of simplification of code throughout the code base, which this series utilises a great deal. We additionally change from swap entry to software leaf entry handling where it makes sense to and eliminate functions from swapops.h where software leaf entries obviate the need for the functions. This patch (of 16): PTE markers were previously only concerned with UFFD-specific logic - that is, PTE entries with the UFFD WP marker set or those marked via UFFDIO_POISON. However since the introduction of guard markers in commit 7c53dfbdb024 ("mm: add PTE_MARKER_GUARD PTE marker"), this has no longer been the case. Issues have been avoided as guard regions are not permitted in conjunction with UFFD, but it still leaves very confusing logic in place, most notably the misleading and poorly named pte_none_mostly() and huge_pte_none_mostly(). This predicate returns true for PTE entries that ought to be treated as none, but only in certain circumstances, and on the assumption we are dealing with H/W poison markers or UFFD WP markers. This patch removes these functions and makes each invocation of these functions instead explicitly check what it needs to check. As part of this effort it introduces is_uffd_pte_marker() to explicitly determine if a marker in fact is used as part of UFFD or not. In the HMM logic we note that the only time we would need to check for a fault is in the case of a UFFD WP marker, otherwise we simply encounter a fault error (VM_FAULT_HWPOISON for H/W poisoned marker, VM_FAULT_SIGSEGV for a guard marker), so only check for the UFFD WP case. While we're here we also refactor code to make it easier to understand. [akpm@linux-foundation.org: fix comment typo, per Mike] Link: https://lkml.kernel.org/r/cover.1762812360.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/c38625fd9a1c1f1cf64ae8a248858e45b3dcdf11.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 93 ++++++++++++++++++++++++++----------------- include/asm-generic/hugetlb.h | 8 ---- include/linux/swapops.h | 18 --------- include/linux/userfaultfd_k.h | 21 ++++++++++ mm/hmm.c | 7 +++- mm/hugetlb.c | 47 +++++++++++----------- mm/mincore.c | 17 ++++++-- mm/userfaultfd.c | 27 ++++++++----- 8 files changed, 138 insertions(+), 100 deletions(-) (limited to 'include/linux') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 54c6cc7fe9c6..94c4d68f0818 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -233,40 +233,48 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, { struct vm_area_struct *vma = vmf->vma; pte_t *ptep, pte; - bool ret = true; assert_fault_locked(vmf); ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma)); if (!ptep) - goto out; + return true; - ret = false; pte = huge_ptep_get(vma->vm_mm, vmf->address, ptep); /* * Lockless access: we're in a wait_event so it's ok if it - * changes under us. PTE markers should be handled the same as none - * ptes here. + * changes under us. + */ + + /* Entry is still missing, wait for userspace to resolve the fault. */ + if (huge_pte_none(pte)) + return true; + /* UFFD PTE markers require userspace to resolve the fault. */ + if (is_uffd_pte_marker(pte)) + return true; + /* + * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to + * resolve the fault. */ - if (huge_pte_none_mostly(pte)) - ret = true; if (!huge_pte_write(pte) && (reason & VM_UFFD_WP)) - ret = true; -out: - return ret; + return true; + + return false; } #else static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, struct vm_fault *vmf, unsigned long reason) { - return false; /* should never get here */ + /* Should never get here. */ + VM_WARN_ON_ONCE(1); + return false; } #endif /* CONFIG_HUGETLB_PAGE */ /* - * Verify the pagetables are still not ok after having reigstered into + * Verify the pagetables are still not ok after having registered into * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any * userfault that has already been resolved, if userfaultfd_read_iter and * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different @@ -284,53 +292,63 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, pmd_t *pmd, _pmd; pte_t *pte; pte_t ptent; - bool ret = true; + bool ret; assert_fault_locked(vmf); pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) - goto out; + return true; p4d = p4d_offset(pgd, address); if (!p4d_present(*p4d)) - goto out; + return true; pud = pud_offset(p4d, address); if (!pud_present(*pud)) - goto out; + return true; pmd = pmd_offset(pud, address); again: _pmd = pmdp_get_lockless(pmd); if (pmd_none(_pmd)) - goto out; + return true; - ret = false; + /* + * A race could arise which would result in a softleaf entry such as + * migration entry unexpectedly being present in the PMD, so explicitly + * check for this and bail out if so. + */ if (!pmd_present(_pmd)) - goto out; + return false; - if (pmd_trans_huge(_pmd)) { - if (!pmd_write(_pmd) && (reason & VM_UFFD_WP)) - ret = true; - goto out; - } + if (pmd_trans_huge(_pmd)) + return !pmd_write(_pmd) && (reason & VM_UFFD_WP); pte = pte_offset_map(pmd, address); - if (!pte) { - ret = true; + if (!pte) goto again; - } + /* * Lockless access: we're in a wait_event so it's ok if it - * changes under us. PTE markers should be handled the same as none - * ptes here. + * changes under us. */ ptent = ptep_get(pte); - if (pte_none_mostly(ptent)) - ret = true; + + ret = true; + /* Entry is still missing, wait for userspace to resolve the fault. */ + if (pte_none(ptent)) + goto out; + /* UFFD PTE markers require userspace to resolve the fault. */ + if (is_uffd_pte_marker(ptent)) + goto out; + /* + * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to + * resolve the fault. + */ if (!pte_write(ptent) && (reason & VM_UFFD_WP)) - ret = true; - pte_unmap(pte); + goto out; + ret = false; out: + pte_unmap(pte); return ret; } @@ -490,12 +508,13 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) set_current_state(blocking_state); spin_unlock_irq(&ctx->fault_pending_wqh.lock); - if (!is_vm_hugetlb_page(vma)) - must_wait = userfaultfd_must_wait(ctx, vmf, reason); - else + if (is_vm_hugetlb_page(vma)) { must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason); - if (is_vm_hugetlb_page(vma)) hugetlb_vma_unlock_read(vma); + } else { + must_wait = userfaultfd_must_wait(ctx, vmf, reason); + } + release_fault_lock(vmf); if (likely(must_wait && !READ_ONCE(ctx->released))) { diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index dcb8727f2b82..e1a2e1b7c8e7 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -97,14 +97,6 @@ static inline int huge_pte_none(pte_t pte) } #endif -/* Please refer to comments above pte_none_mostly() for the usage */ -#ifndef __HAVE_ARCH_HUGE_PTE_NONE_MOSTLY -static inline int huge_pte_none_mostly(pte_t pte) -{ - return huge_pte_none(pte) || is_pte_marker(pte); -} -#endif - #ifndef __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 2687928a8146..d1f665935cfc 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -469,24 +469,6 @@ static inline int is_guard_swp_entry(swp_entry_t entry) (pte_marker_get(entry) & PTE_MARKER_GUARD); } -/* - * This is a special version to check pte_none() just to cover the case when - * the pte is a pte marker. It existed because in many cases the pte marker - * should be seen as a none pte; it's just that we have stored some information - * onto the none pte so it becomes not-none any more. - * - * It should be used when the pte is file-backed, ram-based and backing - * userspace pages, like shmem. It is not needed upon pgtables that do not - * support pte markers at all. For example, it's not needed on anonymous - * memory, kernel-only memory (including when the system is during-boot), - * non-ram based generic file-system. It's fine to be used even there, but the - * extra pte marker check will be pure overhead. - */ -static inline int pte_none_mostly(pte_t pte) -{ - return pte_none(pte) || is_pte_marker(pte); -} - static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) { struct page *p = pfn_to_page(swp_offset_pfn(entry)); diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index c0e716aec26a..da0b4fcc566f 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -479,4 +479,25 @@ static inline bool pte_swp_uffd_wp_any(pte_t pte) return false; } + +static inline bool is_uffd_pte_marker(pte_t pte) +{ + swp_entry_t entry; + + if (pte_present(pte)) + return false; + + entry = pte_to_swp_entry(pte); + if (!is_pte_marker_entry(entry)) + return false; + + /* UFFD WP, poisoned swap entries are UFFD handled. */ + if (pte_marker_entry_uffd_wp(entry)) + return true; + if (is_poisoned_swp_entry(entry)) + return true; + + return false; +} + #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/mm/hmm.c b/mm/hmm.c index a56081d67ad6..387a38bbaf6a 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -244,7 +244,12 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, uint64_t pfn_req_flags = *hmm_pfn; uint64_t new_pfn_flags = 0; - if (pte_none_mostly(pte)) { + /* + * Any other marker than a UFFD WP marker will result in a fault error + * that will be correctly handled, so we need only check for UFFD WP + * here. + */ + if (pte_none(pte) || pte_marker_uffd_wp(pte)) { required_fault = hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); if (required_fault) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 106e61f6e12c..96c991f54f7a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6037,29 +6037,28 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } vmf.orig_pte = huge_ptep_get(mm, vmf.address, vmf.pte); - if (huge_pte_none_mostly(vmf.orig_pte)) { - if (is_pte_marker(vmf.orig_pte)) { - pte_marker marker = - pte_marker_get(pte_to_swp_entry(vmf.orig_pte)); - - if (marker & PTE_MARKER_POISONED) { - ret = VM_FAULT_HWPOISON_LARGE | - VM_FAULT_SET_HINDEX(hstate_index(h)); - goto out_mutex; - } else if (WARN_ON_ONCE(marker & PTE_MARKER_GUARD)) { - /* This isn't supported in hugetlb. */ - ret = VM_FAULT_SIGSEGV; - goto out_mutex; - } - } - + if (huge_pte_none(vmf.orig_pte)) /* - * Other PTE markers should be handled the same way as none PTE. - * * hugetlb_no_page will drop vma lock and hugetlb fault * mutex internally, which make us return immediately. */ return hugetlb_no_page(mapping, &vmf); + + if (is_pte_marker(vmf.orig_pte)) { + const pte_marker marker = + pte_marker_get(pte_to_swp_entry(vmf.orig_pte)); + + if (marker & PTE_MARKER_POISONED) { + ret = VM_FAULT_HWPOISON_LARGE | + VM_FAULT_SET_HINDEX(hstate_index(h)); + goto out_mutex; + } else if (WARN_ON_ONCE(marker & PTE_MARKER_GUARD)) { + /* This isn't supported in hugetlb. */ + ret = VM_FAULT_SIGSEGV; + goto out_mutex; + } + + return hugetlb_no_page(mapping, &vmf); } ret = 0; @@ -6228,6 +6227,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, int ret = -ENOMEM; struct folio *folio; bool folio_in_pagecache = false; + pte_t dst_ptep; if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { ptl = huge_pte_lock(h, dst_mm, dst_pte); @@ -6367,13 +6367,14 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, if (folio_test_hwpoison(folio)) goto out_release_unlock; + ret = -EEXIST; + + dst_ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte); /* - * We allow to overwrite a pte marker: consider when both MISSING|WP - * registered, we firstly wr-protect a none pte which has no page cache - * page backing it, then access the page. + * See comment about UFFD marker overwriting in + * mfill_atomic_install_pte(). */ - ret = -EEXIST; - if (!huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte))) + if (!huge_pte_none(dst_ptep) && !is_uffd_pte_marker(dst_ptep)) goto out_release_unlock; if (folio_in_pagecache) diff --git a/mm/mincore.c b/mm/mincore.c index 8ec4719370e1..fb80becd6119 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -32,11 +32,22 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, spinlock_t *ptl; ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); + /* * Hugepages under user process are always in RAM and never * swapped out, but theoretically it needs to be checked. */ - present = pte && !huge_pte_none_mostly(huge_ptep_get(walk->mm, addr, pte)); + if (!pte) { + present = 0; + } else { + const pte_t ptep = huge_ptep_get(walk->mm, addr, pte); + + if (huge_pte_none(ptep) || is_pte_marker(ptep)) + present = 0; + else + present = 1; + } + for (; addr != end; vec++, addr += PAGE_SIZE) *vec = present; walk->private = vec; @@ -175,8 +186,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t pte = ptep_get(ptep); step = 1; - /* We need to do cache lookup too for pte markers */ - if (pte_none_mostly(pte)) + /* We need to do cache lookup too for markers */ + if (pte_none(pte) || is_pte_marker(pte)) __mincore_unmapped_range(addr, addr + PAGE_SIZE, vma, vec); else if (pte_present(pte)) { diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 00122f42718c..cc4ce205bbec 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -178,6 +178,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, spinlock_t *ptl; struct folio *folio = page_folio(page); bool page_in_cache = folio_mapping(folio); + pte_t dst_ptep; _dst_pte = mk_pte(page, dst_vma->vm_page_prot); _dst_pte = pte_mkdirty(_dst_pte); @@ -199,12 +200,15 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, } ret = -EEXIST; + + dst_ptep = ptep_get(dst_pte); + /* - * We allow to overwrite a pte marker: consider when both MISSING|WP - * registered, we firstly wr-protect a none pte which has no page cache - * page backing it, then access the page. + * We are allowed to overwrite a UFFD pte marker: consider when both + * MISSING|WP registered, we firstly wr-protect a none pte which has no + * page cache page backing it, then access the page. */ - if (!pte_none_mostly(ptep_get(dst_pte))) + if (!pte_none(dst_ptep) && !is_uffd_pte_marker(dst_ptep)) goto out_unlock; if (page_in_cache) { @@ -583,12 +587,15 @@ retry: goto out_unlock; } - if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) && - !huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte))) { - err = -EEXIST; - hugetlb_vma_unlock_read(dst_vma); - mutex_unlock(&hugetlb_fault_mutex_table[hash]); - goto out_unlock; + if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) { + const pte_t ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte); + + if (!huge_pte_none(ptep) && !is_uffd_pte_marker(ptep)) { + err = -EEXIST; + hugetlb_vma_unlock_read(dst_vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + goto out_unlock; + } } err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr, -- cgit v1.2.3 From 68aa2fdbf57f769e552f472ddb762aba028a207e Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:20 +0000 Subject: mm: introduce leaf entry type and use to simplify leaf entry logic The kernel maintains leaf page table entries which contain either: The kernel maintains leaf page table entries which contain either: - Nothing ('none' entries) - Present entries* - Everything else that will cause a fault which the kernel handles * Present entries are either entries the hardware can navigate without page fault or special cases like NUMA hint protnone or PMD with cleared present bit which contain hardware-valid entries modulo the present bit. In the 'everything else' group we include swap entries, but we also include a number of other things such as migration entries, device private entries and marker entries. Unfortunately this 'everything else' group expresses everything through a swp_entry_t type, and these entries are referred to swap entries even though they may well not contain a... swap entry. This is compounded by the rather mind-boggling concept of a non-swap swap entry (checked via non_swap_entry()) and the means by which we twist and turn to satisfy this. This patch lays the foundation for reducing this confusion. We refer to 'everything else' as a 'software-define leaf entry' or 'softleaf'. for short And in fact we scoop up the 'none' entries into this concept also so we are left with: - Present entries. - Softleaf entries (which may be empty). This allows for radical simplification across the board - one can simply convert any leaf page table entry to a leaf entry via softleaf_from_pte(). If the entry is present, we return an empty leaf entry, so it is assumed the caller is aware that they must differentiate between the two categories of page table entries, checking for the former via pte_present(). As a result, we can eliminate a number of places where we would otherwise need to use predicates to see if we can proceed with leaf page table entry conversion and instead just go ahead and do it unconditionally. We do so where we can, adjusting surrounding logic as necessary to integrate the new softleaf_t logic as far as seems reasonable at this stage. We typedef swp_entry_t to softleaf_t for the time being until the conversion can be complete, meaning everything remains compatible regardless of which type is used. We will eventually remove swp_entry_t when the conversion is complete. We introduce a new header file to keep things clear - leafops.h - this imports swapops.h so can direct replace swapops imports without issue, and we do so in all the files that require it. Additionally, add new leafops.h file to core mm maintainers entry. Link: https://lkml.kernel.org/r/c879383aac77d96a03e4d38f7daba893cd35fc76.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Zi Yan Reviewed-by: Vlastimil Babka Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + fs/proc/task_mmu.c | 26 +-- fs/userfaultfd.c | 6 +- include/linux/leafops.h | 387 ++++++++++++++++++++++++++++++++++++++++++ include/linux/mm_inline.h | 6 +- include/linux/mm_types.h | 25 +++ include/linux/swapops.h | 28 --- include/linux/userfaultfd_k.h | 51 +----- mm/hmm.c | 2 +- mm/hugetlb.c | 37 ++-- mm/madvise.c | 16 +- mm/memory.c | 41 ++--- mm/mincore.c | 6 +- mm/mprotect.c | 6 +- mm/mremap.c | 4 +- mm/page_vma_mapped.c | 11 +- mm/shmem.c | 7 +- mm/userfaultfd.c | 6 +- 18 files changed, 502 insertions(+), 164 deletions(-) create mode 100644 include/linux/leafops.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 6168d3aebdc1..5ca4caf73021 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16263,6 +16263,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm F: include/linux/gfp.h F: include/linux/gfp_types.h F: include/linux/highmem.h +F: include/linux/leafops.h F: include/linux/memory.h F: include/linux/mm.h F: include/linux/mm_*.h diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index db16ed91c269..5a1e897b0973 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include @@ -1231,11 +1231,11 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, if (pte_present(ptent)) { folio = page_folio(pte_page(ptent)); present = true; - } else if (is_swap_pte(ptent)) { - swp_entry_t swpent = pte_to_swp_entry(ptent); + } else { + const softleaf_t entry = softleaf_from_pte(ptent); - if (is_pfn_swap_entry(swpent)) - folio = pfn_swap_entry_folio(swpent); + if (softleaf_has_pfn(entry)) + folio = softleaf_to_folio(entry); } if (folio) { @@ -1956,9 +1956,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, flags |= PM_SWAP; if (is_pfn_swap_entry(entry)) page = pfn_swap_entry_to_page(entry); - if (pte_marker_entry_uffd_wp(entry)) + if (softleaf_is_uffd_wp_marker(entry)) flags |= PM_UFFD_WP; - if (is_guard_swp_entry(entry)) + if (softleaf_is_guard_marker(entry)) flags |= PM_GUARD_REGION; } @@ -2331,18 +2331,18 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p, if (pte_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; } else if (is_swap_pte(pte)) { - swp_entry_t swp; + softleaf_t entry; categories |= PAGE_IS_SWAPPED; if (!pte_swp_uffd_wp_any(pte)) categories |= PAGE_IS_WRITTEN; - swp = pte_to_swp_entry(pte); - if (is_guard_swp_entry(swp)) + entry = softleaf_from_pte(pte); + if (softleaf_is_guard_marker(entry)) categories |= PAGE_IS_GUARD; else if ((p->masks_of_interest & PAGE_IS_FILE) && - is_pfn_swap_entry(swp) && - !folio_test_anon(pfn_swap_entry_folio(swp))) + softleaf_has_pfn(entry) && + !folio_test_anon(softleaf_to_folio(entry))) categories |= PAGE_IS_FILE; if (pte_swp_soft_dirty(pte)) @@ -2467,7 +2467,7 @@ static void make_uffd_wp_huge_pte(struct vm_area_struct *vma, { unsigned long psize; - if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent)) + if (is_hugetlb_entry_hwpoisoned(ptent) || pte_is_marker(ptent)) return; psize = huge_page_size(hstate_vma(vma)); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 94c4d68f0818..3f539aabc3b3 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include @@ -251,7 +251,7 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, if (huge_pte_none(pte)) return true; /* UFFD PTE markers require userspace to resolve the fault. */ - if (is_uffd_pte_marker(pte)) + if (pte_is_uffd_marker(pte)) return true; /* * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to @@ -337,7 +337,7 @@ again: if (pte_none(ptent)) goto out; /* UFFD PTE markers require userspace to resolve the fault. */ - if (is_uffd_pte_marker(ptent)) + if (pte_is_uffd_marker(ptent)) goto out; /* * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to diff --git a/include/linux/leafops.h b/include/linux/leafops.h new file mode 100644 index 000000000000..cff9d94fd5d1 --- /dev/null +++ b/include/linux/leafops.h @@ -0,0 +1,387 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Describes operations that can be performed on software-defined page table + * leaf entries. These are abstracted from the hardware page table entries + * themselves by the softleaf_t type, see mm_types.h. + */ +#ifndef _LINUX_LEAFOPS_H +#define _LINUX_LEAFOPS_H + +#include +#include +#include + +#ifdef CONFIG_MMU + +/* Temporary until swp_entry_t eliminated. */ +#define LEAF_TYPE_SHIFT SWP_TYPE_SHIFT + +enum softleaf_type { + /* Fundamental types. */ + SOFTLEAF_NONE, + SOFTLEAF_SWAP, + /* Migration types. */ + SOFTLEAF_MIGRATION_READ, + SOFTLEAF_MIGRATION_READ_EXCLUSIVE, + SOFTLEAF_MIGRATION_WRITE, + /* Device types. */ + SOFTLEAF_DEVICE_PRIVATE_READ, + SOFTLEAF_DEVICE_PRIVATE_WRITE, + SOFTLEAF_DEVICE_EXCLUSIVE, + /* H/W posion types. */ + SOFTLEAF_HWPOISON, + /* Marker types. */ + SOFTLEAF_MARKER, +}; + +/** + * softleaf_mk_none() - Create an empty ('none') leaf entry. + * Returns: empty leaf entry. + */ +static inline softleaf_t softleaf_mk_none(void) +{ + return ((softleaf_t) { 0 }); +} + +/** + * softleaf_from_pte() - Obtain a leaf entry from a PTE entry. + * @pte: PTE entry. + * + * If @pte is present (therefore not a leaf entry) the function returns an empty + * leaf entry. Otherwise, it returns a leaf entry. + * + * Returns: Leaf entry. + */ +static inline softleaf_t softleaf_from_pte(pte_t pte) +{ + if (pte_present(pte) || pte_none(pte)) + return softleaf_mk_none(); + + /* Temporary until swp_entry_t eliminated. */ + return pte_to_swp_entry(pte); +} + +/** + * softleaf_is_none() - Is the leaf entry empty? + * @entry: Leaf entry. + * + * Empty entries are typically the result of a 'none' page table leaf entry + * being converted to a leaf entry. + * + * Returns: true if the entry is empty, false otherwise. + */ +static inline bool softleaf_is_none(softleaf_t entry) +{ + return entry.val == 0; +} + +/** + * softleaf_type() - Identify the type of leaf entry. + * @enntry: Leaf entry. + * + * Returns: the leaf entry type associated with @entry. + */ +static inline enum softleaf_type softleaf_type(softleaf_t entry) +{ + unsigned int type_num; + + if (softleaf_is_none(entry)) + return SOFTLEAF_NONE; + + type_num = entry.val >> LEAF_TYPE_SHIFT; + + if (type_num < MAX_SWAPFILES) + return SOFTLEAF_SWAP; + + switch (type_num) { +#ifdef CONFIG_MIGRATION + case SWP_MIGRATION_READ: + return SOFTLEAF_MIGRATION_READ; + case SWP_MIGRATION_READ_EXCLUSIVE: + return SOFTLEAF_MIGRATION_READ_EXCLUSIVE; + case SWP_MIGRATION_WRITE: + return SOFTLEAF_MIGRATION_WRITE; +#endif +#ifdef CONFIG_DEVICE_PRIVATE + case SWP_DEVICE_WRITE: + return SOFTLEAF_DEVICE_PRIVATE_WRITE; + case SWP_DEVICE_READ: + return SOFTLEAF_DEVICE_PRIVATE_READ; + case SWP_DEVICE_EXCLUSIVE: + return SOFTLEAF_DEVICE_EXCLUSIVE; +#endif +#ifdef CONFIG_MEMORY_FAILURE + case SWP_HWPOISON: + return SOFTLEAF_HWPOISON; +#endif + case SWP_PTE_MARKER: + return SOFTLEAF_MARKER; + } + + /* Unknown entry type. */ + VM_WARN_ON_ONCE(1); + return SOFTLEAF_NONE; +} + +/** + * softleaf_is_swap() - Is this leaf entry a swap entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a swap entry, otherwise false. + */ +static inline bool softleaf_is_swap(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_SWAP; +} + +/** + * softleaf_is_migration() - Is this leaf entry a migration entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a migration entry, otherwise false. + */ +static inline bool softleaf_is_migration(softleaf_t entry) +{ + switch (softleaf_type(entry)) { + case SOFTLEAF_MIGRATION_READ: + case SOFTLEAF_MIGRATION_READ_EXCLUSIVE: + case SOFTLEAF_MIGRATION_WRITE: + return true; + default: + return false; + } +} + +/** + * softleaf_is_device_private() - Is this leaf entry a device private entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a device private entry, otherwise false. + */ +static inline bool softleaf_is_device_private(softleaf_t entry) +{ + switch (softleaf_type(entry)) { + case SOFTLEAF_DEVICE_PRIVATE_WRITE: + case SOFTLEAF_DEVICE_PRIVATE_READ: + return true; + default: + return false; + } +} + +/** + * softleaf_is_device_exclusive() - Is this leaf entry a device exclusive entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a device exclusive entry, otherwise false. + */ +static inline bool softleaf_is_device_exclusive(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_DEVICE_EXCLUSIVE; +} + +/** + * softleaf_is_hwpoison() - Is this leaf entry a hardware poison entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a hardware poison entry, otherwise false. + */ +static inline bool softleaf_is_hwpoison(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_HWPOISON; +} + +/** + * softleaf_is_marker() - Is this leaf entry a marker? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a marker entry, otherwise false. + */ +static inline bool softleaf_is_marker(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_MARKER; +} + +/** + * softleaf_to_marker() - Obtain marker associated with leaf entry. + * @entry: Leaf entry, softleaf_is_marker(@entry) must return true. + * + * Returns: Marker associated with the leaf entry. + */ +static inline pte_marker softleaf_to_marker(softleaf_t entry) +{ + VM_WARN_ON_ONCE(!softleaf_is_marker(entry)); + + return swp_offset(entry) & PTE_MARKER_MASK; +} + +/** + * softleaf_has_pfn() - Does this leaf entry encode a valid PFN number? + * @entry: Leaf entry. + * + * A pfn swap entry is a special type of swap entry that always has a pfn stored + * in the swap offset. They can either be used to represent unaddressable device + * memory, to restrict access to a page undergoing migration or to represent a + * pfn which has been hwpoisoned and unmapped. + * + * Returns: true if the leaf entry encodes a PFN, otherwise false. + */ +static inline bool softleaf_has_pfn(softleaf_t entry) +{ + /* Make sure the swp offset can always store the needed fields. */ + BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS); + + if (softleaf_is_migration(entry)) + return true; + if (softleaf_is_device_private(entry)) + return true; + if (softleaf_is_device_exclusive(entry)) + return true; + if (softleaf_is_hwpoison(entry)) + return true; + + return false; +} + +/** + * softleaf_to_pfn() - Obtain PFN encoded within leaf entry. + * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true. + * + * Returns: The PFN associated with the leaf entry. + */ +static inline unsigned long softleaf_to_pfn(softleaf_t entry) +{ + VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); + + /* Temporary until swp_entry_t eliminated. */ + return swp_offset_pfn(entry); +} + +/** + * softleaf_to_page() - Obtains struct page for PFN encoded within leaf entry. + * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true. + * + * Returns: Pointer to the struct page associated with the leaf entry's PFN. + */ +static inline struct page *softleaf_to_page(softleaf_t entry) +{ + VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); + + /* Temporary until swp_entry_t eliminated. */ + return pfn_swap_entry_to_page(entry); +} + +/** + * softleaf_to_folio() - Obtains struct folio for PFN encoded within leaf entry. + * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true. + * + * Returns: Pointer to the struct folio associated with the leaf entry's PFN. + */ +static inline struct folio *softleaf_to_folio(softleaf_t entry) +{ + VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); + + /* Temporary until swp_entry_t eliminated. */ + return pfn_swap_entry_folio(entry); +} + +/** + * softleaf_is_poison_marker() - Is this leaf entry a poison marker? + * @entry: Leaf entry. + * + * The poison marker is set via UFFDIO_POISON. Userfaultfd-specific. + * + * Returns: true if the leaf entry is a poison marker, otherwise false. + */ +static inline bool softleaf_is_poison_marker(softleaf_t entry) +{ + if (!softleaf_is_marker(entry)) + return false; + + return softleaf_to_marker(entry) & PTE_MARKER_POISONED; +} + +/** + * softleaf_is_guard_marker() - Is this leaf entry a guard region marker? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a guard marker, otherwise false. + */ +static inline bool softleaf_is_guard_marker(softleaf_t entry) +{ + if (!softleaf_is_marker(entry)) + return false; + + return softleaf_to_marker(entry) & PTE_MARKER_GUARD; +} + +/** + * softleaf_is_uffd_wp_marker() - Is this leaf entry a userfautlfd write protect + * marker? + * @entry: Leaf entry. + * + * Userfaultfd-specific. + * + * Returns: true if the leaf entry is a UFFD WP marker, otherwise false. + */ +static inline bool softleaf_is_uffd_wp_marker(softleaf_t entry) +{ + if (!softleaf_is_marker(entry)) + return false; + + return softleaf_to_marker(entry) & PTE_MARKER_UFFD_WP; +} + +/** + * pte_is_marker() - Does the PTE entry encode a marker leaf entry? + * @pte: PTE entry. + * + * Returns: true if this PTE is a marker leaf entry, otherwise false. + */ +static inline bool pte_is_marker(pte_t pte) +{ + return softleaf_is_marker(softleaf_from_pte(pte)); +} + +/** + * pte_is_uffd_wp_marker() - Does this PTE entry encode a userfaultfd write + * protect marker leaf entry? + * @pte: PTE entry. + * + * Returns: true if this PTE is a UFFD WP marker leaf entry, otherwise false. + */ +static inline bool pte_is_uffd_wp_marker(pte_t pte) +{ + const softleaf_t entry = softleaf_from_pte(pte); + + return softleaf_is_uffd_wp_marker(entry); +} + +/** + * pte_is_uffd_marker() - Does this PTE entry encode a userfault-specific marker + * leaf entry? + * @entry: Leaf entry. + * + * It's useful to be able to determine which leaf entries encode UFFD-specific + * markers so we can handle these correctly. + * + * Returns: true if this PTE entry is a UFFD-specific marker, otherwise false. + */ +static inline bool pte_is_uffd_marker(pte_t pte) +{ + const softleaf_t entry = softleaf_from_pte(pte); + + if (!softleaf_is_marker(entry)) + return false; + + /* UFFD WP, poisoned swap entries are UFFD-handled. */ + if (softleaf_is_uffd_wp_marker(entry)) + return true; + if (softleaf_is_poison_marker(entry)) + return true; + + return false; +} + +#endif /* CONFIG_MMU */ +#endif /* _LINUX_LEAFOPS_H */ diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index f6a2b2d20016..ca7a18351797 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include /** * folio_is_file_lru - Should the folio be on a file LRU or anon LRU? @@ -541,9 +541,9 @@ static inline bool mm_tlb_flush_nested(const struct mm_struct *mm) * The caller should insert a new pte created with make_pte_marker(). */ static inline pte_marker copy_pte_marker( - swp_entry_t entry, struct vm_area_struct *dst_vma) + softleaf_t entry, struct vm_area_struct *dst_vma) { - pte_marker srcm = pte_marker_get(entry); + const pte_marker srcm = softleaf_to_marker(entry); /* Always copy error entries. */ pte_marker dstm = srcm & (PTE_MARKER_POISONED | PTE_MARKER_GUARD); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5021047485a9..4f66a3206a63 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -285,6 +285,31 @@ typedef struct { unsigned long val; } swp_entry_t; +/** + * typedef softleaf_t - Describes a page table software leaf entry, abstracted + * from its architecture-specific encoding. + * + * Page table leaf entries are those which do not reference any descendent page + * tables but rather either reference a data page, are an empty (or 'none' + * entry), or contain a non-present entry. + * + * If referencing another page table or a data page then the page table entry is + * pertinent to hardware - that is it tells the hardware how to decode the page + * table entry. + * + * Otherwise it is a software-defined leaf page table entry, which this type + * describes. See leafops.h and specifically @softleaf_type for a list of all + * possible kinds of software leaf entry. + * + * A softleaf_t entry is abstracted from the hardware page table entry, so is + * not architecture-specific. + * + * NOTE: While we transition from the confusing swp_entry_t type used for this + * purpose, we simply alias this type. This will be removed once the + * transition is complete. + */ +typedef swp_entry_t softleaf_t; + #if defined(CONFIG_MEMCG) || defined(CONFIG_SLAB_OBJ_EXT) /* We have some extra room after the refcount in tail pages. */ #define NR_PAGES_IN_LARGE_FOLIO diff --git a/include/linux/swapops.h b/include/linux/swapops.h index d1f665935cfc..0a4b3f51ecf5 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -426,21 +426,6 @@ static inline swp_entry_t make_pte_marker_entry(pte_marker marker) return swp_entry(SWP_PTE_MARKER, marker); } -static inline bool is_pte_marker_entry(swp_entry_t entry) -{ - return swp_type(entry) == SWP_PTE_MARKER; -} - -static inline pte_marker pte_marker_get(swp_entry_t entry) -{ - return swp_offset(entry) & PTE_MARKER_MASK; -} - -static inline bool is_pte_marker(pte_t pte) -{ - return is_swap_pte(pte) && is_pte_marker_entry(pte_to_swp_entry(pte)); -} - static inline pte_t make_pte_marker(pte_marker marker) { return swp_entry_to_pte(make_pte_marker_entry(marker)); @@ -451,24 +436,11 @@ static inline swp_entry_t make_poisoned_swp_entry(void) return make_pte_marker_entry(PTE_MARKER_POISONED); } -static inline int is_poisoned_swp_entry(swp_entry_t entry) -{ - return is_pte_marker_entry(entry) && - (pte_marker_get(entry) & PTE_MARKER_POISONED); - -} - static inline swp_entry_t make_guard_swp_entry(void) { return make_pte_marker_entry(PTE_MARKER_GUARD); } -static inline int is_guard_swp_entry(swp_entry_t entry) -{ - return is_pte_marker_entry(entry) && - (pte_marker_get(entry) & PTE_MARKER_GUARD); -} - static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) { struct page *p = pfn_to_page(swp_offset_pfn(entry)); diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index da0b4fcc566f..983c860a00f1 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include @@ -434,32 +434,6 @@ static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) return userfaultfd_wp_unpopulated(vma); } -static inline bool pte_marker_entry_uffd_wp(swp_entry_t entry) -{ -#ifdef CONFIG_PTE_MARKER_UFFD_WP - return is_pte_marker_entry(entry) && - (pte_marker_get(entry) & PTE_MARKER_UFFD_WP); -#else - return false; -#endif -} - -static inline bool pte_marker_uffd_wp(pte_t pte) -{ -#ifdef CONFIG_PTE_MARKER_UFFD_WP - swp_entry_t entry; - - if (!is_swap_pte(pte)) - return false; - - entry = pte_to_swp_entry(pte); - - return pte_marker_entry_uffd_wp(entry); -#else - return false; -#endif -} - /* * Returns true if this is a swap pte and was uffd-wp wr-protected in either * forms (pte marker or a normal swap pte), false otherwise. @@ -473,31 +447,10 @@ static inline bool pte_swp_uffd_wp_any(pte_t pte) if (pte_swp_uffd_wp(pte)) return true; - if (pte_marker_uffd_wp(pte)) + if (pte_is_uffd_wp_marker(pte)) return true; #endif return false; } - -static inline bool is_uffd_pte_marker(pte_t pte) -{ - swp_entry_t entry; - - if (pte_present(pte)) - return false; - - entry = pte_to_swp_entry(pte); - if (!is_pte_marker_entry(entry)) - return false; - - /* UFFD WP, poisoned swap entries are UFFD handled. */ - if (pte_marker_entry_uffd_wp(entry)) - return true; - if (is_poisoned_swp_entry(entry)) - return true; - - return false; -} - #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/mm/hmm.c b/mm/hmm.c index 387a38bbaf6a..e350d0cc9d41 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -249,7 +249,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, * that will be correctly handled, so we need only check for UFFD WP * here. */ - if (pte_none(pte) || pte_marker_uffd_wp(pte)) { + if (pte_none(pte) || pte_is_uffd_wp_marker(pte)) { required_fault = hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); if (required_fault) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 96c991f54f7a..12853cdefc9b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include @@ -4956,17 +4956,17 @@ again: entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry, sz); } else if (unlikely(is_hugetlb_entry_migration(entry))) { - swp_entry_t swp_entry = pte_to_swp_entry(entry); + softleaf_t softleaf = softleaf_from_pte(entry); bool uffd_wp = pte_swp_uffd_wp(entry); - if (!is_readable_migration_entry(swp_entry) && cow) { + if (!is_readable_migration_entry(softleaf) && cow) { /* * COW mappings require pages in both * parent and child to be set to read. */ - swp_entry = make_readable_migration_entry( - swp_offset(swp_entry)); - entry = swp_entry_to_pte(swp_entry); + softleaf = make_readable_migration_entry( + swp_offset(softleaf)); + entry = swp_entry_to_pte(softleaf); if (userfaultfd_wp(src_vma) && uffd_wp) entry = pte_swp_mkuffd_wp(entry); set_huge_pte_at(src, addr, src_pte, entry, sz); @@ -4974,9 +4974,9 @@ again: if (!userfaultfd_wp(dst_vma)) entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry, sz); - } else if (unlikely(is_pte_marker(entry))) { - pte_marker marker = copy_pte_marker( - pte_to_swp_entry(entry), dst_vma); + } else if (unlikely(pte_is_marker(entry))) { + const softleaf_t softleaf = softleaf_from_pte(entry); + const pte_marker marker = copy_pte_marker(softleaf, dst_vma); if (marker) set_huge_pte_at(dst, addr, dst_pte, @@ -5092,7 +5092,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz); - if (need_clear_uffd_wp && pte_marker_uffd_wp(pte)) + if (need_clear_uffd_wp && pte_is_uffd_wp_marker(pte)) huge_pte_clear(mm, new_addr, dst_pte, sz); else { if (need_clear_uffd_wp) { @@ -5911,7 +5911,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, * If this pte was previously wr-protected, keep it wr-protected even * if populated. */ - if (unlikely(pte_marker_uffd_wp(vmf->orig_pte))) + if (unlikely(pte_is_uffd_wp_marker(vmf->orig_pte))) new_pte = huge_pte_mkuffd_wp(new_pte); set_huge_pte_at(mm, vmf->address, vmf->pte, new_pte, huge_page_size(h)); @@ -6044,9 +6044,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ return hugetlb_no_page(mapping, &vmf); - if (is_pte_marker(vmf.orig_pte)) { + if (pte_is_marker(vmf.orig_pte)) { const pte_marker marker = - pte_marker_get(pte_to_swp_entry(vmf.orig_pte)); + softleaf_to_marker(softleaf_from_pte(vmf.orig_pte)); if (marker & PTE_MARKER_POISONED) { ret = VM_FAULT_HWPOISON_LARGE | @@ -6374,7 +6374,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, * See comment about UFFD marker overwriting in * mfill_atomic_install_pte(). */ - if (!huge_pte_none(dst_ptep) && !is_uffd_pte_marker(dst_ptep)) + if (!huge_pte_none(dst_ptep) && !pte_is_uffd_marker(dst_ptep)) goto out_release_unlock; if (folio_in_pagecache) @@ -6495,8 +6495,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma, if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { /* Nothing to do. */ } else if (unlikely(is_hugetlb_entry_migration(pte))) { - swp_entry_t entry = pte_to_swp_entry(pte); - struct folio *folio = pfn_swap_entry_folio(entry); + softleaf_t entry = softleaf_from_pte(pte); + + struct folio *folio = softleaf_to_folio(entry); pte_t newpte = pte; if (is_writable_migration_entry(entry)) { @@ -6516,14 +6517,14 @@ long hugetlb_change_protection(struct vm_area_struct *vma, newpte = pte_swp_clear_uffd_wp(newpte); if (!pte_same(pte, newpte)) set_huge_pte_at(mm, address, ptep, newpte, psize); - } else if (unlikely(is_pte_marker(pte))) { + } else if (unlikely(pte_is_marker(pte))) { /* * Do nothing on a poison marker; page is * corrupted, permissions do not apply. Here * pte_marker_uffd_wp()==true implies !poison * because they're mutual exclusive. */ - if (pte_marker_uffd_wp(pte) && uffd_wp_resolve) + if (pte_is_uffd_wp_marker(pte) && uffd_wp_resolve) /* Safe to modify directly (non-present->none). */ huge_pte_clear(mm, address, ptep, psize); } else if (!huge_pte_none(pte)) { diff --git a/mm/madvise.c b/mm/madvise.c index 2a165e9beb5b..c8381b954235 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include @@ -690,17 +690,16 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, * (page allocation + zeroing). */ if (!pte_present(ptent)) { - swp_entry_t entry; + softleaf_t entry = softleaf_from_pte(ptent); - entry = pte_to_swp_entry(ptent); - if (!non_swap_entry(entry)) { + if (softleaf_is_swap(entry)) { max_nr = (end - addr) / PAGE_SIZE; nr = swap_pte_batch(pte, max_nr, ptent); nr_swap -= nr; free_swap_and_cache_nr(entry, nr); clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); - } else if (is_hwpoison_entry(entry) || - is_poisoned_swp_entry(entry)) { + } else if (softleaf_is_hwpoison(entry) || + softleaf_is_poison_marker(entry)) { pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } continue; @@ -1071,8 +1070,9 @@ static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked) static bool is_guard_pte_marker(pte_t ptent) { - return is_swap_pte(ptent) && - is_guard_swp_entry(pte_to_swp_entry(ptent)); + const softleaf_t entry = softleaf_from_pte(ptent); + + return softleaf_is_guard_marker(entry); } static int guard_install_pud_entry(pud_t *pud, unsigned long addr, diff --git a/mm/memory.c b/mm/memory.c index 732414852570..0caf8c5c8c68 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -60,7 +60,7 @@ #include #include #include -#include +#include #include #include #include @@ -109,7 +109,7 @@ static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf) if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) return false; - return pte_marker_uffd_wp(vmf->orig_pte); + return pte_is_uffd_wp_marker(vmf->orig_pte); } /* @@ -927,10 +927,10 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, { vm_flags_t vm_flags = dst_vma->vm_flags; pte_t orig_pte = ptep_get(src_pte); + softleaf_t entry = softleaf_from_pte(orig_pte); pte_t pte = orig_pte; struct folio *folio; struct page *page; - swp_entry_t entry = pte_to_swp_entry(orig_pte); if (likely(!non_swap_entry(entry))) { if (swap_duplicate(entry) < 0) @@ -1016,7 +1016,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (try_restore_exclusive_pte(src_vma, addr, src_pte, orig_pte)) return -EBUSY; return -ENOENT; - } else if (is_pte_marker_entry(entry)) { + } else if (softleaf_is_marker(entry)) { pte_marker marker = copy_pte_marker(entry, dst_vma); if (marker) @@ -1711,14 +1711,14 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb, unsigned int max_nr, unsigned long addr, struct zap_details *details, int *rss, bool *any_skipped) { - swp_entry_t entry; + softleaf_t entry; int nr = 1; *any_skipped = true; - entry = pte_to_swp_entry(ptent); - if (is_device_private_entry(entry) || - is_device_exclusive_entry(entry)) { - struct page *page = pfn_swap_entry_to_page(entry); + entry = softleaf_from_pte(ptent); + if (softleaf_is_device_private(entry) || + softleaf_is_device_exclusive(entry)) { + struct page *page = softleaf_to_page(entry); struct folio *folio = page_folio(page); if (unlikely(!should_zap_folio(details, folio))) @@ -1733,7 +1733,7 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb, rss[mm_counter(folio)]--; folio_remove_rmap_pte(folio, page, vma); folio_put(folio); - } else if (!non_swap_entry(entry)) { + } else if (softleaf_is_swap(entry)) { /* Genuine swap entries, hence a private anon pages */ if (!should_zap_cows(details)) return 1; @@ -1741,20 +1741,20 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb, nr = swap_pte_batch(pte, max_nr, ptent); rss[MM_SWAPENTS] -= nr; free_swap_and_cache_nr(entry, nr); - } else if (is_migration_entry(entry)) { - struct folio *folio = pfn_swap_entry_folio(entry); + } else if (softleaf_is_migration(entry)) { + struct folio *folio = softleaf_to_folio(entry); if (!should_zap_folio(details, folio)) return 1; rss[mm_counter(folio)]--; - } else if (pte_marker_entry_uffd_wp(entry)) { + } else if (softleaf_is_uffd_wp_marker(entry)) { /* * For anon: always drop the marker; for file: only * drop the marker if explicitly requested. */ if (!vma_is_anonymous(vma) && !zap_drop_markers(details)) return 1; - } else if (is_guard_swp_entry(entry)) { + } else if (softleaf_is_guard_marker(entry)) { /* * Ordinary zapping should not remove guard PTE * markers. Only do so if we should remove PTE markers @@ -1762,7 +1762,8 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb, */ if (!zap_drop_markers(details)) return 1; - } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) { + } else if (softleaf_is_hwpoison(entry) || + softleaf_is_poison_marker(entry)) { if (!should_zap_cows(details)) return 1; } else { @@ -4380,7 +4381,7 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf) * * This should also cover the case where e.g. the pte changed * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED. - * So is_pte_marker() check is not enough to safely drop the pte. + * So pte_is_marker() check is not enough to safely drop the pte. */ if (pte_same(vmf->orig_pte, ptep_get(vmf->pte))) pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte); @@ -4414,8 +4415,8 @@ static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf) static vm_fault_t handle_pte_marker(struct vm_fault *vmf) { - swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte); - unsigned long marker = pte_marker_get(entry); + const softleaf_t entry = softleaf_from_pte(vmf->orig_pte); + const pte_marker marker = softleaf_to_marker(entry); /* * PTE markers should never be empty. If anything weird happened, @@ -4432,7 +4433,7 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) if (marker & PTE_MARKER_GUARD) return VM_FAULT_SIGSEGV; - if (pte_marker_entry_uffd_wp(entry)) + if (softleaf_is_uffd_wp_marker(entry)) return pte_marker_handle_uffd_wp(vmf); /* This is an unknown pte marker */ @@ -4680,7 +4681,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) } } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; - } else if (is_pte_marker_entry(entry)) { + } else if (softleaf_is_marker(entry)) { ret = handle_pte_marker(vmf); } else { print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL); diff --git a/mm/mincore.c b/mm/mincore.c index fb80becd6119..b3682488a65d 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include @@ -42,7 +42,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, } else { const pte_t ptep = huge_ptep_get(walk->mm, addr, pte); - if (huge_pte_none(ptep) || is_pte_marker(ptep)) + if (huge_pte_none(ptep) || pte_is_marker(ptep)) present = 0; else present = 1; @@ -187,7 +187,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, step = 1; /* We need to do cache lookup too for markers */ - if (pte_none(pte) || is_pte_marker(pte)) + if (pte_none(pte) || pte_is_marker(pte)) __mincore_unmapped_range(addr, addr + PAGE_SIZE, vma, vec); else if (pte_present(pte)) { diff --git a/mm/mprotect.c b/mm/mprotect.c index db93d3bb1a5e..918a64cc6033 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -326,14 +326,14 @@ static long change_pte_range(struct mmu_gather *tlb, newpte = swp_entry_to_pte(entry); if (pte_swp_uffd_wp(oldpte)) newpte = pte_swp_mkuffd_wp(newpte); - } else if (is_pte_marker_entry(entry)) { + } else if (softleaf_is_marker(entry)) { /* * Ignore error swap entries unconditionally, * because any access should sigbus/sigsegv * anyway. */ - if (is_poisoned_swp_entry(entry) || - is_guard_swp_entry(entry)) + if (softleaf_is_poison_marker(entry) || + softleaf_is_guard_marker(entry)) continue; /* * If this is uffd-wp pte marker and we'd like diff --git a/mm/mremap.c b/mm/mremap.c index 8ad06cf50783..7c21b2ad13f6 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -288,7 +288,7 @@ static int move_ptes(struct pagetable_move_control *pmc, pte = move_pte(pte, old_addr, new_addr); pte = move_soft_dirty_pte(pte); - if (need_clear_uffd_wp && pte_marker_uffd_wp(pte)) + if (need_clear_uffd_wp && pte_is_uffd_wp_marker(pte)) pte_clear(mm, new_addr, new_ptep); else { if (need_clear_uffd_wp) { diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 137ce27ff68c..be20468fb5a9 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include "internal.h" @@ -107,15 +107,12 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw, unsigned long pte_nr) pte_t ptent = ptep_get(pvmw->pte); if (pvmw->flags & PVMW_MIGRATION) { - swp_entry_t entry; - if (!is_swap_pte(ptent)) - return false; - entry = pte_to_swp_entry(ptent); + const softleaf_t entry = softleaf_from_pte(ptent); - if (!is_migration_entry(entry)) + if (!softleaf_is_migration(entry)) return false; - pfn = swp_offset_pfn(entry); + pfn = softleaf_to_pfn(entry); } else if (is_swap_pte(ptent)) { swp_entry_t entry; diff --git a/mm/shmem.c b/mm/shmem.c index 6580f3cd24bb..395ca58ac4a5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -66,7 +66,7 @@ static struct vfsmount *shm_mnt __ro_after_init; #include #include #include -#include +#include #include #include #include @@ -2286,7 +2286,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL; struct shmem_inode_info *info = SHMEM_I(inode); - swp_entry_t swap, index_entry; + swp_entry_t swap; + softleaf_t index_entry; struct swap_info_struct *si; struct folio *folio = NULL; bool skip_swapcache = false; @@ -2298,7 +2299,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, swap = index_entry; *foliop = NULL; - if (is_poisoned_swp_entry(index_entry)) + if (softleaf_is_poison_marker(index_entry)) return -EIO; si = get_swap_device(index_entry); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index cc4ce205bbec..055ec1050776 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include @@ -208,7 +208,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, * MISSING|WP registered, we firstly wr-protect a none pte which has no * page cache page backing it, then access the page. */ - if (!pte_none(dst_ptep) && !is_uffd_pte_marker(dst_ptep)) + if (!pte_none(dst_ptep) && !pte_is_uffd_marker(dst_ptep)) goto out_unlock; if (page_in_cache) { @@ -590,7 +590,7 @@ retry: if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) { const pte_t ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte); - if (!huge_pte_none(ptep) && !is_uffd_pte_marker(ptep)) { + if (!huge_pte_none(ptep) && !pte_is_uffd_marker(ptep)) { err = -EEXIST; hugetlb_vma_unlock_read(dst_vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); -- cgit v1.2.3 From fb888710e26a8a8a37dc0f8ed09a3c908c63eb71 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:21 +0000 Subject: mm: avoid unnecessary uses of is_swap_pte() There's an established convention in the kernel that we treat PTEs as containing swap entries (and the unfortunately named non-swap swap entries) should they be neither empty (i.e. pte_none() evaluating true) nor present (i.e. pte_present() evaluating true). However, there is some inconsistency in how this is applied, as we also have the is_swap_pte() helper which explicitly performs this check: /* check whether a pte points to a swap entry */ static inline int is_swap_pte(pte_t pte) { return !pte_none(pte) && !pte_present(pte); } As this represents a predicate, and it's logical to assume that in order to establish that a PTE entry can correctly be manipulated as a swap/non-swap entry, this predicate seems as if it must first be checked. But we instead, we far more often utilise the established convention of checking pte_none() / pte_present() before operating on entries as if they were swap/non-swap. This patch works towards correcting this inconsistency by removing all uses of is_swap_pte() where we are already in a position where we perform pte_none()/pte_present() checks anyway or otherwise it is clearly logical to do so. We also take advantage of the fact that pte_swp_uffd_wp() is only set on swap entries. Additionally, update comments referencing to is_swap_pte() and non_swap_entry(). No functional change intended. Link: https://lkml.kernel.org/r/17fd6d7f46a846517fd455fadd640af47fcd7c55.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Zi Yan Reviewed-by: Vlastimil Babka Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 49 ++++++++++++++++++++++++++++++------------- include/linux/userfaultfd_k.h | 3 +-- mm/hugetlb.c | 6 +++--- mm/internal.h | 6 ++---- mm/khugepaged.c | 29 ++++++++++++------------- mm/migrate.c | 2 +- mm/mprotect.c | 43 ++++++++++++++++++------------------- mm/mremap.c | 7 +++++-- mm/page_table_check.c | 13 +++++++----- mm/page_vma_mapped.c | 31 +++++++++++++-------------- 10 files changed, 104 insertions(+), 85 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5a1e897b0973..bf48fedaf128 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1017,7 +1017,9 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, young = pte_young(ptent); dirty = pte_dirty(ptent); present = true; - } else if (is_swap_pte(ptent)) { + } else if (pte_none(ptent)) { + smaps_pte_hole_lookup(addr, walk); + } else { swp_entry_t swpent = pte_to_swp_entry(ptent); if (!non_swap_entry(swpent)) { @@ -1038,9 +1040,6 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, present = true; page = pfn_swap_entry_to_page(swpent); } - } else { - smaps_pte_hole_lookup(addr, walk); - return; } if (!page) @@ -1612,6 +1611,9 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, */ pte_t ptent = ptep_get(pte); + if (pte_none(ptent)) + return; + if (pte_present(ptent)) { pte_t old_pte; @@ -1621,7 +1623,7 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, ptent = pte_wrprotect(old_pte); ptent = pte_clear_soft_dirty(ptent); ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); - } else if (is_swap_pte(ptent)) { + } else { ptent = pte_swp_clear_soft_dirty(ptent); set_pte_at(vma->vm_mm, addr, pte, ptent); } @@ -1924,6 +1926,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, struct page *page = NULL; struct folio *folio; + if (pte_none(pte)) + goto out; + if (pte_present(pte)) { if (pm->show_pfn) frame = pte_pfn(pte); @@ -1933,8 +1938,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, flags |= PM_SOFT_DIRTY; if (pte_uffd_wp(pte)) flags |= PM_UFFD_WP; - } else if (is_swap_pte(pte)) { + } else { swp_entry_t entry; + if (pte_swp_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; if (pte_swp_uffd_wp(pte)) @@ -1942,6 +1948,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, entry = pte_to_swp_entry(pte); if (pm->show_pfn) { pgoff_t offset; + /* * For PFN swap offsets, keeping the offset field * to be PFN only to be compatible with old smaps. @@ -1970,6 +1977,8 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, __folio_page_mapped_exclusively(folio, page)) flags |= PM_MMAP_EXCLUSIVE; } + +out: if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; @@ -2311,12 +2320,16 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p, struct vm_area_struct *vma, unsigned long addr, pte_t pte) { - unsigned long categories = 0; + unsigned long categories; + + if (pte_none(pte)) + return 0; if (pte_present(pte)) { struct page *page; - categories |= PAGE_IS_PRESENT; + categories = PAGE_IS_PRESENT; + if (!pte_uffd_wp(pte)) categories |= PAGE_IS_WRITTEN; @@ -2330,10 +2343,11 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p, categories |= PAGE_IS_PFNZERO; if (pte_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; - } else if (is_swap_pte(pte)) { + } else { softleaf_t entry; - categories |= PAGE_IS_SWAPPED; + categories = PAGE_IS_SWAPPED; + if (!pte_swp_uffd_wp_any(pte)) categories |= PAGE_IS_WRITTEN; @@ -2361,12 +2375,12 @@ static void make_uffd_wp_pte(struct vm_area_struct *vma, old_pte = ptep_modify_prot_start(vma, addr, pte); ptent = pte_mkuffd_wp(old_pte); ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); - } else if (is_swap_pte(ptent)) { - ptent = pte_swp_mkuffd_wp(ptent); - set_pte_at(vma->vm_mm, addr, pte, ptent); - } else { + } else if (pte_none(ptent)) { set_pte_at(vma->vm_mm, addr, pte, make_pte_marker(PTE_MARKER_UFFD_WP)); + } else { + ptent = pte_swp_mkuffd_wp(ptent); + set_pte_at(vma->vm_mm, addr, pte, ptent); } } @@ -2435,6 +2449,9 @@ static unsigned long pagemap_hugetlb_category(pte_t pte) { unsigned long categories = PAGE_IS_HUGE; + if (pte_none(pte)) + return categories; + /* * According to pagemap_hugetlb_range(), file-backed HugeTLB * page cannot be swapped. So PAGE_IS_FILE is not checked for @@ -2442,6 +2459,7 @@ static unsigned long pagemap_hugetlb_category(pte_t pte) */ if (pte_present(pte)) { categories |= PAGE_IS_PRESENT; + if (!huge_pte_uffd_wp(pte)) categories |= PAGE_IS_WRITTEN; if (!PageAnon(pte_page(pte))) @@ -2450,8 +2468,9 @@ static unsigned long pagemap_hugetlb_category(pte_t pte) categories |= PAGE_IS_PFNZERO; if (pte_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; - } else if (is_swap_pte(pte)) { + } else { categories |= PAGE_IS_SWAPPED; + if (!pte_swp_uffd_wp_any(pte)) categories |= PAGE_IS_WRITTEN; if (pte_swp_soft_dirty(pte)) diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 983c860a00f1..96b089dff4ef 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -441,9 +441,8 @@ static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) static inline bool pte_swp_uffd_wp_any(pte_t pte) { #ifdef CONFIG_PTE_MARKER_UFFD_WP - if (!is_swap_pte(pte)) + if (pte_present(pte)) return false; - if (pte_swp_uffd_wp(pte)) return true; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 12853cdefc9b..59d91c36770c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5092,13 +5092,13 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz); - if (need_clear_uffd_wp && pte_is_uffd_wp_marker(pte)) + if (need_clear_uffd_wp && pte_is_uffd_wp_marker(pte)) { huge_pte_clear(mm, new_addr, dst_pte, sz); - else { + } else { if (need_clear_uffd_wp) { if (pte_present(pte)) pte = huge_pte_clear_uffd_wp(pte); - else if (is_swap_pte(pte)) + else pte = pte_swp_clear_uffd_wp(pte); } set_huge_pte_at(mm, new_addr, dst_pte, pte, sz); diff --git a/mm/internal.h b/mm/internal.h index 2bad3971813b..a9b38cadb192 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -325,8 +325,7 @@ unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte, /** * pte_move_swp_offset - Move the swap entry offset field of a swap pte * forward or backward by delta - * @pte: The initial pte state; is_swap_pte(pte) must be true and - * non_swap_entry() must be false. + * @pte: The initial pte state; must be a swap entry * @delta: The direction and the offset we are moving; forward if delta * is positive; backward if delta is negative * @@ -352,8 +351,7 @@ static inline pte_t pte_move_swp_offset(pte_t pte, long delta) /** * pte_next_swp_offset - Increment the swap entry offset field of a swap pte. - * @pte: The initial pte state; is_swap_pte(pte) must be true and - * non_swap_entry() must be false. + * @pte: The initial pte state; must be a swap entry. * * Increments the swap offset, while maintaining all other fields, including * swap type, and any swp pte bits. The resulting pte is returned. diff --git a/mm/khugepaged.c b/mm/khugepaged.c index af1c162c9a94..d7e71c2e2571 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1019,7 +1019,8 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm, } vmf.orig_pte = ptep_get_lockless(pte); - if (!is_swap_pte(vmf.orig_pte)) + if (pte_none(vmf.orig_pte) || + pte_present(vmf.orig_pte)) continue; vmf.pte = pte; @@ -1276,7 +1277,19 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, addr += PAGE_SIZE) { pte_t pteval = ptep_get(_pte); - if (is_swap_pte(pteval)) { + if (pte_none_or_zero(pteval)) { + ++none_or_zero; + if (!userfaultfd_armed(vma) && + (!cc->is_khugepaged || + none_or_zero <= khugepaged_max_ptes_none)) { + continue; + } else { + result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); + goto out_unmap; + } + } + if (!pte_present(pteval)) { ++unmapped; if (!cc->is_khugepaged || unmapped <= khugepaged_max_ptes_swap) { @@ -1296,18 +1309,6 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, goto out_unmap; } } - if (pte_none_or_zero(pteval)) { - ++none_or_zero; - if (!userfaultfd_armed(vma) && - (!cc->is_khugepaged || - none_or_zero <= khugepaged_max_ptes_none)) { - continue; - } else { - result = SCAN_EXCEED_NONE_PTE; - count_vm_event(THP_SCAN_EXCEED_NONE_PTE); - goto out_unmap; - } - } if (pte_uffd_wp(pteval)) { /* * Don't collapse the page if any of the small diff --git a/mm/migrate.c b/mm/migrate.c index d8f6cd14cdb7..847c1ec17628 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -492,7 +492,7 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, pte = ptep_get(ptep); pte_unmap(ptep); - if (!is_swap_pte(pte)) + if (pte_none(pte) || pte_present(pte)) goto out; entry = pte_to_swp_entry(pte); diff --git a/mm/mprotect.c b/mm/mprotect.c index 918a64cc6033..aa555dfbdfc5 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -297,7 +297,26 @@ static long change_pte_range(struct mmu_gather *tlb, prot_commit_flush_ptes(vma, addr, pte, oldpte, ptent, nr_ptes, /* idx = */ 0, /* set_write = */ false, tlb); pages += nr_ptes; - } else if (is_swap_pte(oldpte)) { + } else if (pte_none(oldpte)) { + /* + * Nobody plays with any none ptes besides + * userfaultfd when applying the protections. + */ + if (likely(!uffd_wp)) + continue; + + if (userfaultfd_wp_use_markers(vma)) { + /* + * For file-backed mem, we need to be able to + * wr-protect a none pte, because even if the + * pte is none, the page/swap cache could + * exist. Doing that by install a marker. + */ + set_pte_at(vma->vm_mm, addr, pte, + make_pte_marker(PTE_MARKER_UFFD_WP)); + pages++; + } + } else { swp_entry_t entry = pte_to_swp_entry(oldpte); pte_t newpte; @@ -358,28 +377,6 @@ static long change_pte_range(struct mmu_gather *tlb, set_pte_at(vma->vm_mm, addr, pte, newpte); pages++; } - } else { - /* It must be an none page, or what else?.. */ - WARN_ON_ONCE(!pte_none(oldpte)); - - /* - * Nobody plays with any none ptes besides - * userfaultfd when applying the protections. - */ - if (likely(!uffd_wp)) - continue; - - if (userfaultfd_wp_use_markers(vma)) { - /* - * For file-backed mem, we need to be able to - * wr-protect a none pte, because even if the - * pte is none, the page/swap cache could - * exist. Doing that by install a marker. - */ - set_pte_at(vma->vm_mm, addr, pte, - make_pte_marker(PTE_MARKER_UFFD_WP)); - pages++; - } } } while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); diff --git a/mm/mremap.c b/mm/mremap.c index 7c21b2ad13f6..62b6827abacf 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -158,6 +158,9 @@ static void drop_rmap_locks(struct vm_area_struct *vma) static pte_t move_soft_dirty_pte(pte_t pte) { + if (pte_none(pte)) + return pte; + /* * Set soft dirty bit so we can notice * in userspace the ptes were moved. @@ -165,7 +168,7 @@ static pte_t move_soft_dirty_pte(pte_t pte) #ifdef CONFIG_MEM_SOFT_DIRTY if (pte_present(pte)) pte = pte_mksoft_dirty(pte); - else if (is_swap_pte(pte)) + else pte = pte_swp_mksoft_dirty(pte); #endif return pte; @@ -294,7 +297,7 @@ static int move_ptes(struct pagetable_move_control *pmc, if (need_clear_uffd_wp) { if (pte_present(pte)) pte = pte_clear_uffd_wp(pte); - else if (is_swap_pte(pte)) + else pte = pte_swp_clear_uffd_wp(pte); } set_ptes(mm, new_addr, new_ptep, pte, nr_ptes); diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 4eeca782b888..43f75d2f7c36 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -185,12 +185,15 @@ static inline bool swap_cached_writable(swp_entry_t entry) is_writable_migration_entry(entry); } -static inline void page_table_check_pte_flags(pte_t pte) +static void page_table_check_pte_flags(pte_t pte) { - if (pte_present(pte) && pte_uffd_wp(pte)) - WARN_ON_ONCE(pte_write(pte)); - else if (is_swap_pte(pte) && pte_swp_uffd_wp(pte)) - WARN_ON_ONCE(swap_cached_writable(pte_to_swp_entry(pte))); + if (pte_present(pte)) { + WARN_ON_ONCE(pte_uffd_wp(pte) && pte_write(pte)); + } else if (pte_swp_uffd_wp(pte)) { + const swp_entry_t entry = pte_to_swp_entry(pte); + + WARN_ON_ONCE(swap_cached_writable(entry)); + } } void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index be20468fb5a9..a4e23818f37f 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -16,6 +16,7 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw) static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp, spinlock_t **ptlp) { + bool is_migration; pte_t ptent; if (pvmw->flags & PVMW_SYNC) { @@ -26,6 +27,7 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp, return !!pvmw->pte; } + is_migration = pvmw->flags & PVMW_MIGRATION; again: /* * It is important to return the ptl corresponding to pte, @@ -41,11 +43,14 @@ again: ptent = ptep_get(pvmw->pte); - if (pvmw->flags & PVMW_MIGRATION) { - if (!is_swap_pte(ptent)) + if (pte_none(ptent)) { + return false; + } else if (pte_present(ptent)) { + if (is_migration) return false; - } else if (is_swap_pte(ptent)) { + } else if (!is_migration) { swp_entry_t entry; + /* * Handle un-addressable ZONE_DEVICE memory. * @@ -66,8 +71,6 @@ again: if (!is_device_private_entry(entry) && !is_device_exclusive_entry(entry)) return false; - } else if (!pte_present(ptent)) { - return false; } spin_lock(*ptlp); if (unlikely(!pmd_same(*pmdvalp, pmdp_get_lockless(pvmw->pmd)))) { @@ -113,21 +116,17 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw, unsigned long pte_nr) return false; pfn = softleaf_to_pfn(entry); - } else if (is_swap_pte(ptent)) { - swp_entry_t entry; + } else if (pte_present(ptent)) { + pfn = pte_pfn(ptent); + } else { + const softleaf_t entry = softleaf_from_pte(ptent); /* Handle un-addressable ZONE_DEVICE memory */ - entry = pte_to_swp_entry(ptent); - if (!is_device_private_entry(entry) && - !is_device_exclusive_entry(entry)) - return false; - - pfn = swp_offset_pfn(entry); - } else { - if (!pte_present(ptent)) + if (!softleaf_is_device_private(entry) && + !softleaf_is_device_exclusive(entry)) return false; - pfn = pte_pfn(ptent); + pfn = softleaf_to_pfn(entry); } if ((pfn + pte_nr - 1) < pvmw->pfn) -- cgit v1.2.3 From fb410d8b89e89ef61b18326f07c477f563b631f6 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:23 +0000 Subject: mm: use leaf entries in debug pgtable + remove is_swap_pte() Remove invocations of is_swap_pte() in mm/debug_vm_pgtable.c and use softleaf_from_pte() and softleaf_is_swap() as necessary to replace this usage. We update the test code to use a 'true' swap entry throughout so we are guaranteed this is not a non-swap entry, so all asserts continue to operate correctly. With this change in place, we no longer use is_swap_pte() anywhere, so remove it. Link: https://lkml.kernel.org/r/222f352e7a99191b4bdfa77e835f2fc0dd83fa72.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/swapops.h | 6 ------ mm/debug_vm_pgtable.c | 39 ++++++++++++++++++++++++--------------- 2 files changed, 24 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 0a4b3f51ecf5..a66ac4f2105c 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -120,12 +120,6 @@ static inline unsigned long swp_offset_pfn(swp_entry_t entry) return swp_offset(entry) & SWP_PFN_MASK; } -/* check whether a pte points to a swap entry */ -static inline int is_swap_pte(pte_t pte) -{ - return !pte_none(pte) && !pte_present(pte); -} - /* * Convert the arch-dependent pte representation of a swp_entry_t into an * arch-independent swp_entry_t. diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 055e0e025b42..fff311830959 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include @@ -714,14 +714,16 @@ static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args) static void __init pte_swap_soft_dirty_tests(struct pgtable_debug_args *args) { pte_t pte; + softleaf_t entry; if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) return; pr_debug("Validating PTE swap soft dirty\n"); pte = swp_entry_to_pte(args->swp_entry); - WARN_ON(!is_swap_pte(pte)); + entry = softleaf_from_pte(pte); + WARN_ON(!softleaf_is_swap(entry)); WARN_ON(!pte_swp_soft_dirty(pte_swp_mksoft_dirty(pte))); WARN_ON(pte_swp_soft_dirty(pte_swp_clear_soft_dirty(pte))); } @@ -768,40 +770,47 @@ static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) { static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args) { - swp_entry_t entry, entry2; + swp_entry_t entry; + softleaf_t softleaf; pte_t pte; pr_debug("Validating PTE swap exclusive\n"); entry = args->swp_entry; pte = swp_entry_to_pte(entry); + softleaf = softleaf_from_pte(pte); + WARN_ON(pte_swp_exclusive(pte)); - WARN_ON(!is_swap_pte(pte)); - entry2 = pte_to_swp_entry(pte); - WARN_ON(memcmp(&entry, &entry2, sizeof(entry))); + WARN_ON(!softleaf_is_swap(softleaf)); + WARN_ON(memcmp(&entry, &softleaf, sizeof(entry))); pte = pte_swp_mkexclusive(pte); + softleaf = softleaf_from_pte(pte); + WARN_ON(!pte_swp_exclusive(pte)); - WARN_ON(!is_swap_pte(pte)); + WARN_ON(!softleaf_is_swap(softleaf)); WARN_ON(pte_swp_soft_dirty(pte)); - entry2 = pte_to_swp_entry(pte); - WARN_ON(memcmp(&entry, &entry2, sizeof(entry))); + WARN_ON(memcmp(&entry, &softleaf, sizeof(entry))); pte = pte_swp_clear_exclusive(pte); + softleaf = softleaf_from_pte(pte); + WARN_ON(pte_swp_exclusive(pte)); - WARN_ON(!is_swap_pte(pte)); - entry2 = pte_to_swp_entry(pte); - WARN_ON(memcmp(&entry, &entry2, sizeof(entry))); + WARN_ON(!softleaf_is_swap(softleaf)); + WARN_ON(memcmp(&entry, &softleaf, sizeof(entry))); } static void __init pte_swap_tests(struct pgtable_debug_args *args) { swp_entry_t arch_entry; + softleaf_t entry; pte_t pte1, pte2; pr_debug("Validating PTE swap\n"); pte1 = swp_entry_to_pte(args->swp_entry); - WARN_ON(!is_swap_pte(pte1)); + entry = softleaf_from_pte(pte1); + + WARN_ON(!softleaf_is_swap(entry)); arch_entry = __pte_to_swp_entry(pte1); pte2 = __swp_entry_to_pte(arch_entry); @@ -1218,8 +1227,8 @@ static int __init init_args(struct pgtable_debug_args *args) /* See generic_max_swapfile_size(): probe the maximum offset */ max_swap_offset = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL)))); - /* Create a swp entry with all possible bits set */ - args->swp_entry = swp_entry((1 << MAX_SWAPFILES_SHIFT) - 1, max_swap_offset); + /* Create a swp entry with all possible bits set while still being swap. */ + args->swp_entry = swp_entry(MAX_SWAPFILES - 1, max_swap_offset); /* * Allocate (huge) pages because some of the tests need to access -- cgit v1.2.3 From aa62204cb680d8ff32497181fc9e0dac4956f7e5 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:25 +0000 Subject: mm: avoid unnecessary use of is_swap_pmd() PMD 'non-swap' swap entries are currently used for PMD-level migration entries and device private entries. To add to the confusion in this terminology we use is_swap_pmd() in an inconsistent way similar to how is_swap_pte() was being used - sometimes adopting the convention that !pmd_none(), !pmd_present() implies PMD 'swap' entry, sometimes not. This patch handles the low-hanging fruit of cases where we can simply substitute other predicates for is_swap_pmd(). No functional change intended. Link: https://lkml.kernel.org/r/8a1704b36a009c18032d5bea4cb68e71448fbbe5.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 15 ++++++++++++--- include/linux/swapops.h | 16 ++++++++++++++-- mm/huge_memory.c | 4 +++- mm/memory.c | 50 +++++++++++++++++++++++++++---------------------- mm/page_table_check.c | 12 ++++++++---- 5 files changed, 65 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8c35ea48a93e..1bedf7fa5e79 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1059,10 +1059,12 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, bool present = false; struct folio *folio; + if (pmd_none(*pmd)) + return; if (pmd_present(*pmd)) { page = vm_normal_page_pmd(vma, addr, *pmd); present = true; - } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) { + } else if (unlikely(thp_migration_supported())) { swp_entry_t entry = pmd_to_swp_entry(*pmd); if (is_pfn_swap_entry(entry)) @@ -2000,6 +2002,9 @@ static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr, if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; + if (pmd_none(pmd)) + goto populate_pagemap; + if (pmd_present(pmd)) { page = pmd_page(pmd); @@ -2010,7 +2015,7 @@ static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr, flags |= PM_UFFD_WP; if (pm->show_pfn) frame = pmd_pfn(pmd) + idx; - } else if (thp_migration_supported() && is_swap_pmd(pmd)) { + } else if (thp_migration_supported()) { swp_entry_t entry = pmd_to_swp_entry(pmd); unsigned long offset; @@ -2037,6 +2042,7 @@ static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr, flags |= PM_FILE; } +populate_pagemap: for (; addr != end; addr += PAGE_SIZE, idx++) { u64 cur_flags = flags; pagemap_entry_t pme; @@ -2399,6 +2405,9 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, { unsigned long categories = PAGE_IS_HUGE; + if (pmd_none(pmd)) + return categories; + if (pmd_present(pmd)) { struct page *page; @@ -2416,7 +2425,7 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, categories |= PAGE_IS_PFNZERO; if (pmd_soft_dirty(pmd)) categories |= PAGE_IS_SOFT_DIRTY; - } else if (is_swap_pmd(pmd)) { + } else { swp_entry_t swp; categories |= PAGE_IS_SWAPPED; diff --git a/include/linux/swapops.h b/include/linux/swapops.h index a66ac4f2105c..3e8dd6ea94dd 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -509,7 +509,13 @@ static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) static inline int is_pmd_migration_entry(pmd_t pmd) { - return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd)); + swp_entry_t entry; + + if (pmd_present(pmd)) + return 0; + + entry = pmd_to_swp_entry(pmd); + return is_migration_entry(entry); } #else /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, @@ -557,7 +563,13 @@ static inline int is_pmd_migration_entry(pmd_t pmd) */ static inline int is_pmd_device_private_entry(pmd_t pmd) { - return is_swap_pmd(pmd) && is_device_private_entry(pmd_to_swp_entry(pmd)); + swp_entry_t entry; + + if (pmd_present(pmd)) + return 0; + + entry = pmd_to_swp_entry(pmd); + return is_device_private_entry(entry); } #else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d79a4bb363de..b88b4b866cb3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2354,9 +2354,11 @@ static pmd_t move_soft_dirty_pmd(pmd_t pmd) static pmd_t clear_uffd_wp_pmd(pmd_t pmd) { + if (pmd_none(pmd)) + return pmd; if (pmd_present(pmd)) pmd = pmd_clear_uffd_wp(pmd); - else if (is_swap_pmd(pmd)) + else pmd = pmd_swp_clear_uffd_wp(pmd); return pmd; diff --git a/mm/memory.c b/mm/memory.c index 0caf8c5c8c68..76c17feff88b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1376,6 +1376,7 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, next = pmd_addr_end(addr, end); if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)) { int err; + VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma); err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, dst_vma, src_vma); @@ -6340,35 +6341,40 @@ retry_pud: if (pmd_none(*vmf.pmd) && thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PMD_ORDER)) { ret = create_huge_pmd(&vmf); - if (!(ret & VM_FAULT_FALLBACK)) + if (ret & VM_FAULT_FALLBACK) + goto fallback; + else return ret; - } else { - vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); + } - if (unlikely(is_swap_pmd(vmf.orig_pmd))) { - if (is_pmd_device_private_entry(vmf.orig_pmd)) - return do_huge_pmd_device_private(&vmf); + vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); + if (pmd_none(vmf.orig_pmd)) + goto fallback; - if (is_pmd_migration_entry(vmf.orig_pmd)) - pmd_migration_entry_wait(mm, vmf.pmd); - return 0; - } - if (pmd_trans_huge(vmf.orig_pmd)) { - if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) - return do_huge_pmd_numa_page(&vmf); + if (unlikely(!pmd_present(vmf.orig_pmd))) { + if (is_pmd_device_private_entry(vmf.orig_pmd)) + return do_huge_pmd_device_private(&vmf); - if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && - !pmd_write(vmf.orig_pmd)) { - ret = wp_huge_pmd(&vmf); - if (!(ret & VM_FAULT_FALLBACK)) - return ret; - } else { - huge_pmd_set_accessed(&vmf); - return 0; - } + if (is_pmd_migration_entry(vmf.orig_pmd)) + pmd_migration_entry_wait(mm, vmf.pmd); + return 0; + } + if (pmd_trans_huge(vmf.orig_pmd)) { + if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) + return do_huge_pmd_numa_page(&vmf); + + if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && + !pmd_write(vmf.orig_pmd)) { + ret = wp_huge_pmd(&vmf); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + } else { + huge_pmd_set_accessed(&vmf); + return 0; } } +fallback: return handle_pte_fault(&vmf); } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 43f75d2f7c36..f5f25e120f69 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -215,10 +215,14 @@ EXPORT_SYMBOL(__page_table_check_ptes_set); static inline void page_table_check_pmd_flags(pmd_t pmd) { - if (pmd_present(pmd) && pmd_uffd_wp(pmd)) - WARN_ON_ONCE(pmd_write(pmd)); - else if (is_swap_pmd(pmd) && pmd_swp_uffd_wp(pmd)) - WARN_ON_ONCE(swap_cached_writable(pmd_to_swp_entry(pmd))); + if (pmd_present(pmd)) { + if (pmd_uffd_wp(pmd)) + WARN_ON_ONCE(pmd_write(pmd)); + } else if (pmd_swp_uffd_wp(pmd)) { + swp_entry_t entry = pmd_to_swp_entry(pmd); + + WARN_ON_ONCE(swap_cached_writable(entry)); + } } void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd, -- cgit v1.2.3 From 0ac881efe16468503e8c1e7d8a7210b75f027ce3 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:28 +0000 Subject: mm: replace pmd_to_swp_entry() with softleaf_from_pmd() Introduce softleaf_from_pmd() to do the equivalent operation for PMDs that softleaf_from_pte() fulfils, and cascade changes through code base accordingly, introducing helpers as necessary. We are then able to eliminate pmd_to_swp_entry(), is_pmd_migration_entry(), is_pmd_device_private_entry() and is_pmd_non_present_folio_entry(). This further establishes the use of leaf operations throughout the code base and further establishes the foundations for eliminating is_swap_pmd(). No functional change intended. [lorenzo.stoakes@oracle.com: check writable, not readable/writable, per Vlastimil] Link: https://lkml.kernel.org/r/cd97b6ec-00f9-45a4-9ae0-8f009c212a94@lucifer.local Link: https://lkml.kernel.org/r/3fb431699639ded8fdc63d2210aa77a38c8891f1.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: SeongJae Park \ Reviewed-by: Vlastimil Babka Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 27 +++--- include/linux/leafops.h | 218 +++++++++++++++++++++++++++++++++++++++++++++++- include/linux/migrate.h | 2 +- include/linux/swapops.h | 100 ---------------------- mm/damon/ops-common.c | 6 +- mm/filemap.c | 6 +- mm/hmm.c | 16 ++-- mm/huge_memory.c | 98 +++++++++++----------- mm/khugepaged.c | 4 +- mm/madvise.c | 2 +- mm/memory.c | 4 +- mm/mempolicy.c | 4 +- mm/migrate.c | 20 ++--- mm/migrate_device.c | 14 ++-- mm/page_table_check.c | 16 ++-- mm/page_vma_mapped.c | 15 ++-- mm/pagewalk.c | 8 +- mm/rmap.c | 4 +- 18 files changed, 339 insertions(+), 225 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 1bedf7fa5e79..898df952b6bc 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1065,10 +1065,10 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, page = vm_normal_page_pmd(vma, addr, *pmd); present = true; } else if (unlikely(thp_migration_supported())) { - swp_entry_t entry = pmd_to_swp_entry(*pmd); + const softleaf_t entry = softleaf_from_pmd(*pmd); - if (is_pfn_swap_entry(entry)) - page = pfn_swap_entry_to_page(entry); + if (softleaf_has_pfn(entry)) + page = softleaf_to_page(entry); } if (IS_ERR_OR_NULL(page)) return; @@ -1655,7 +1655,7 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, pmd = pmd_clear_soft_dirty(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); - } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { + } else if (pmd_is_migration_entry(pmd)) { pmd = pmd_swp_clear_soft_dirty(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } @@ -2016,12 +2016,12 @@ static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr, if (pm->show_pfn) frame = pmd_pfn(pmd) + idx; } else if (thp_migration_supported()) { - swp_entry_t entry = pmd_to_swp_entry(pmd); + const softleaf_t entry = softleaf_from_pmd(pmd); unsigned long offset; if (pm->show_pfn) { - if (is_pfn_swap_entry(entry)) - offset = swp_offset_pfn(entry) + idx; + if (softleaf_has_pfn(entry)) + offset = softleaf_to_pfn(entry) + idx; else offset = swp_offset(entry) + idx; frame = swp_type(entry) | @@ -2032,7 +2032,7 @@ static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr, flags |= PM_SOFT_DIRTY; if (pmd_swp_uffd_wp(pmd)) flags |= PM_UFFD_WP; - VM_WARN_ON_ONCE(!is_pmd_migration_entry(pmd)); + VM_WARN_ON_ONCE(!pmd_is_migration_entry(pmd)); page = pfn_swap_entry_to_page(entry); } @@ -2426,8 +2426,6 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, if (pmd_soft_dirty(pmd)) categories |= PAGE_IS_SOFT_DIRTY; } else { - swp_entry_t swp; - categories |= PAGE_IS_SWAPPED; if (!pmd_swp_uffd_wp(pmd)) categories |= PAGE_IS_WRITTEN; @@ -2435,9 +2433,10 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, categories |= PAGE_IS_SOFT_DIRTY; if (p->masks_of_interest & PAGE_IS_FILE) { - swp = pmd_to_swp_entry(pmd); - if (is_pfn_swap_entry(swp) && - !folio_test_anon(pfn_swap_entry_folio(swp))) + const softleaf_t entry = softleaf_from_pmd(pmd); + + if (softleaf_has_pfn(entry) && + !folio_test_anon(softleaf_to_folio(entry))) categories |= PAGE_IS_FILE; } } @@ -2454,7 +2453,7 @@ static void make_uffd_wp_pmd(struct vm_area_struct *vma, old = pmdp_invalidate_ad(vma, addr, pmdp); pmd = pmd_mkuffd_wp(old); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); - } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { + } else if (pmd_is_migration_entry(pmd)) { pmd = pmd_swp_mkuffd_wp(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } diff --git a/include/linux/leafops.h b/include/linux/leafops.h index cff9d94fd5d1..f5ea9b0385ff 100644 --- a/include/linux/leafops.h +++ b/include/linux/leafops.h @@ -61,6 +61,57 @@ static inline softleaf_t softleaf_from_pte(pte_t pte) return pte_to_swp_entry(pte); } +/** + * softleaf_to_pte() - Obtain a PTE entry from a leaf entry. + * @entry: Leaf entry. + * + * This generates an architecture-specific PTE entry that can be utilised to + * encode the metadata the leaf entry encodes. + * + * Returns: Architecture-specific PTE entry encoding leaf entry. + */ +static inline pte_t softleaf_to_pte(softleaf_t entry) +{ + /* Temporary until swp_entry_t eliminated. */ + return swp_entry_to_pte(entry); +} + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +/** + * softleaf_from_pmd() - Obtain a leaf entry from a PMD entry. + * @pmd: PMD entry. + * + * If @pmd is present (therefore not a leaf entry) the function returns an empty + * leaf entry. Otherwise, it returns a leaf entry. + * + * Returns: Leaf entry. + */ +static inline softleaf_t softleaf_from_pmd(pmd_t pmd) +{ + softleaf_t arch_entry; + + if (pmd_present(pmd) || pmd_none(pmd)) + return softleaf_mk_none(); + + if (pmd_swp_soft_dirty(pmd)) + pmd = pmd_swp_clear_soft_dirty(pmd); + if (pmd_swp_uffd_wp(pmd)) + pmd = pmd_swp_clear_uffd_wp(pmd); + arch_entry = __pmd_to_swp_entry(pmd); + + /* Temporary until swp_entry_t eliminated. */ + return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); +} + +#else + +static inline softleaf_t softleaf_from_pmd(pmd_t pmd) +{ + return softleaf_mk_none(); +} + +#endif + /** * softleaf_is_none() - Is the leaf entry empty? * @entry: Leaf entry. @@ -134,6 +185,43 @@ static inline bool softleaf_is_swap(softleaf_t entry) return softleaf_type(entry) == SOFTLEAF_SWAP; } +/** + * softleaf_is_migration_write() - Is this leaf entry a writable migration entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a writable migration entry, otherwise + * false. + */ +static inline bool softleaf_is_migration_write(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_MIGRATION_WRITE; +} + +/** + * softleaf_is_migration_read() - Is this leaf entry a readable migration entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a readable migration entry, otherwise + * false. + */ +static inline bool softleaf_is_migration_read(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_MIGRATION_READ; +} + +/** + * softleaf_is_migration_read_exclusive() - Is this leaf entry an exclusive + * readable migration entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is an exclusive readable migration entry, + * otherwise false. + */ +static inline bool softleaf_is_migration_read_exclusive(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_MIGRATION_READ_EXCLUSIVE; +} + /** * softleaf_is_migration() - Is this leaf entry a migration entry? * @entry: Leaf entry. @@ -152,6 +240,19 @@ static inline bool softleaf_is_migration(softleaf_t entry) } } +/** + * softleaf_is_device_private_write() - Is this leaf entry a device private + * writable entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a device private writable entry, otherwise + * false. + */ +static inline bool softleaf_is_device_private_write(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_DEVICE_PRIVATE_WRITE; +} + /** * softleaf_is_device_private() - Is this leaf entry a device private entry? * @entry: Leaf entry. @@ -170,10 +271,10 @@ static inline bool softleaf_is_device_private(softleaf_t entry) } /** - * softleaf_is_device_exclusive() - Is this leaf entry a device exclusive entry? + * softleaf_is_device_exclusive() - Is this leaf entry a device-exclusive entry? * @entry: Leaf entry. * - * Returns: true if the leaf entry is a device exclusive entry, otherwise false. + * Returns: true if the leaf entry is a device-exclusive entry, otherwise false. */ static inline bool softleaf_is_device_exclusive(softleaf_t entry) { @@ -332,6 +433,61 @@ static inline bool softleaf_is_uffd_wp_marker(softleaf_t entry) return softleaf_to_marker(entry) & PTE_MARKER_UFFD_WP; } +#ifdef CONFIG_MIGRATION + +/** + * softleaf_is_migration_young() - Does this migration entry contain an accessed + * bit? + * @entry: Leaf entry. + * + * If the architecture can support storing A/D bits in migration entries, this + * determines whether the accessed (or 'young') bit was set on the migrated page + * table entry. + * + * Returns: true if the entry contains an accessed bit, otherwise false. + */ +static inline bool softleaf_is_migration_young(softleaf_t entry) +{ + VM_WARN_ON_ONCE(!softleaf_is_migration(entry)); + + if (migration_entry_supports_ad()) + return swp_offset(entry) & SWP_MIG_YOUNG; + /* Keep the old behavior of aging page after migration */ + return false; +} + +/** + * softleaf_is_migration_dirty() - Does this migration entry contain a dirty bit? + * @entry: Leaf entry. + * + * If the architecture can support storing A/D bits in migration entries, this + * determines whether the dirty bit was set on the migrated page table entry. + * + * Returns: true if the entry contains a dirty bit, otherwise false. + */ +static inline bool softleaf_is_migration_dirty(softleaf_t entry) +{ + VM_WARN_ON_ONCE(!softleaf_is_migration(entry)); + + if (migration_entry_supports_ad()) + return swp_offset(entry) & SWP_MIG_DIRTY; + /* Keep the old behavior of clean page after migration */ + return false; +} + +#else /* CONFIG_MIGRATION */ + +static inline bool softleaf_is_migration_young(softleaf_t entry) +{ + return false; +} + +static inline bool softleaf_is_migration_dirty(softleaf_t entry) +{ + return false; +} +#endif /* CONFIG_MIGRATION */ + /** * pte_is_marker() - Does the PTE entry encode a marker leaf entry? * @pte: PTE entry. @@ -383,5 +539,63 @@ static inline bool pte_is_uffd_marker(pte_t pte) return false; } +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION) + +/** + * pmd_is_device_private_entry() - Check if PMD contains a device private swap + * entry. + * @pmd: The PMD to check. + * + * Returns true if the PMD contains a swap entry that represents a device private + * page mapping. This is used for zone device private pages that have been + * swapped out but still need special handling during various memory management + * operations. + * + * Return: true if PMD contains device private entry, false otherwise + */ +static inline bool pmd_is_device_private_entry(pmd_t pmd) +{ + return softleaf_is_device_private(softleaf_from_pmd(pmd)); +} + +#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ + +static inline bool pmd_is_device_private_entry(pmd_t pmd) +{ + return false; +} + +#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ + +/** + * pmd_is_migration_entry() - Does this PMD entry encode a migration entry? + * @pmd: PMD entry. + * + * Returns: true if the PMD encodes a migration entry, otherwise false. + */ +static inline bool pmd_is_migration_entry(pmd_t pmd) +{ + return softleaf_is_migration(softleaf_from_pmd(pmd)); +} + +/** + * pmd_is_valid_softleaf() - Is this PMD entry a valid leaf entry? + * @pmd: PMD entry. + * + * PMD leaf entries are valid only if they are device private or migration + * entries. This function asserts that a PMD leaf entry is valid in this + * respect. + * + * Returns: true if the PMD entry is a valid leaf entry, otherwise false. + */ +static inline bool pmd_is_valid_softleaf(pmd_t pmd) +{ + const softleaf_t entry = softleaf_from_pmd(pmd); + + /* Only device private, migration entries valid for PMD. */ + return softleaf_is_device_private(entry) || + softleaf_is_migration(entry); +} + #endif /* CONFIG_MMU */ #endif /* _LINUX_LEAFOPS_H */ diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 41b4cc05a450..26ca00c325d9 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -65,7 +65,7 @@ bool isolate_folio_to_list(struct folio *folio, struct list_head *list); int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src); -void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) +void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) __releases(ptl); void folio_migrate_flags(struct folio *newfolio, struct folio *folio); int folio_migrate_mapping(struct address_space *mapping, diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 3e8dd6ea94dd..f1277647262d 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -283,14 +283,6 @@ static inline swp_entry_t make_migration_entry_young(swp_entry_t entry) return entry; } -static inline bool is_migration_entry_young(swp_entry_t entry) -{ - if (migration_entry_supports_ad()) - return swp_offset(entry) & SWP_MIG_YOUNG; - /* Keep the old behavior of aging page after migration */ - return false; -} - static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) { if (migration_entry_supports_ad()) @@ -299,14 +291,6 @@ static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) return entry; } -static inline bool is_migration_entry_dirty(swp_entry_t entry) -{ - if (migration_entry_supports_ad()) - return swp_offset(entry) & SWP_MIG_DIRTY; - /* Keep the old behavior of clean page after migration */ - return false; -} - extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); extern void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *pte); @@ -349,20 +333,11 @@ static inline swp_entry_t make_migration_entry_young(swp_entry_t entry) return entry; } -static inline bool is_migration_entry_young(swp_entry_t entry) -{ - return false; -} - static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) { return entry; } -static inline bool is_migration_entry_dirty(swp_entry_t entry) -{ - return false; -} #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_MEMORY_FAILURE @@ -487,18 +462,6 @@ extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd); -static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd) -{ - swp_entry_t arch_entry; - - if (pmd_swp_soft_dirty(pmd)) - pmd = pmd_swp_clear_soft_dirty(pmd); - if (pmd_swp_uffd_wp(pmd)) - pmd = pmd_swp_clear_uffd_wp(pmd); - arch_entry = __pmd_to_swp_entry(pmd); - return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); -} - static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) { swp_entry_t arch_entry; @@ -507,23 +470,7 @@ static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) return __swp_entry_to_pmd(arch_entry); } -static inline int is_pmd_migration_entry(pmd_t pmd) -{ - swp_entry_t entry; - - if (pmd_present(pmd)) - return 0; - - entry = pmd_to_swp_entry(pmd); - return is_migration_entry(entry); -} #else /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ -static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, - struct page *page) -{ - BUILD_BUG(); -} - static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) { @@ -532,64 +479,17 @@ static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { } -static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd) -{ - return swp_entry(0, 0); -} - static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) { return __pmd(0); } -static inline int is_pmd_migration_entry(pmd_t pmd) -{ - return 0; -} #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ -#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION) - -/** - * is_pmd_device_private_entry() - Check if PMD contains a device private swap entry - * @pmd: The PMD to check - * - * Returns true if the PMD contains a swap entry that represents a device private - * page mapping. This is used for zone device private pages that have been - * swapped out but still need special handling during various memory management - * operations. - * - * Return: 1 if PMD contains device private entry, 0 otherwise - */ -static inline int is_pmd_device_private_entry(pmd_t pmd) -{ - swp_entry_t entry; - - if (pmd_present(pmd)) - return 0; - - entry = pmd_to_swp_entry(pmd); - return is_device_private_entry(entry); -} - -#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ - -static inline int is_pmd_device_private_entry(pmd_t pmd) -{ - return 0; -} - -#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ - static inline int non_swap_entry(swp_entry_t entry) { return swp_type(entry) >= MAX_SWAPFILES; } -static inline int is_pmd_non_present_folio_entry(pmd_t pmd) -{ - return is_pmd_migration_entry(pmd) || is_pmd_device_private_entry(pmd); -} - #endif /* CONFIG_MMU */ #endif /* _LINUX_SWAPOPS_H */ diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index 971df8a16ba4..a218d9922234 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include "../internal.h" #include "ops-common.h" @@ -51,7 +51,7 @@ void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr if (likely(pte_present(pteval))) pfn = pte_pfn(pteval); else - pfn = swp_offset_pfn(pte_to_swp_entry(pteval)); + pfn = softleaf_to_pfn(softleaf_from_pte(pteval)); folio = damon_get_folio(pfn); if (!folio) @@ -83,7 +83,7 @@ void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr if (likely(pmd_present(pmdval))) pfn = pmd_pfn(pmdval); else - pfn = swp_offset_pfn(pmd_to_swp_entry(pmdval)); + pfn = softleaf_to_pfn(softleaf_from_pmd(pmdval)); folio = damon_get_folio(pfn); if (!folio) diff --git a/mm/filemap.c b/mm/filemap.c index f0c36df1def7..02355aa46324 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include @@ -1402,7 +1402,7 @@ repeat: * This follows the same logic as folio_wait_bit_common() so see the comments * there. */ -void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) +void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) __releases(ptl) { struct wait_page_queue wait_page; @@ -1411,7 +1411,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) unsigned long pflags; bool in_thrashing; wait_queue_head_t *q; - struct folio *folio = pfn_swap_entry_folio(entry); + struct folio *folio = softleaf_to_folio(entry); q = folio_waitqueue(folio); if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) { diff --git a/mm/hmm.c b/mm/hmm.c index e350d0cc9d41..e9735a9b6102 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include @@ -339,19 +339,19 @@ static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start, struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; unsigned long npages = (end - start) >> PAGE_SHIFT; + const softleaf_t entry = softleaf_from_pmd(pmd); unsigned long addr = start; - swp_entry_t entry = pmd_to_swp_entry(pmd); unsigned int required_fault; - if (is_device_private_entry(entry) && - pfn_swap_entry_folio(entry)->pgmap->owner == + if (softleaf_is_device_private(entry) && + softleaf_to_folio(entry)->pgmap->owner == range->dev_private_owner) { unsigned long cpu_flags = HMM_PFN_VALID | hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT); - unsigned long pfn = swp_offset_pfn(entry); + unsigned long pfn = softleaf_to_pfn(entry); unsigned long i; - if (is_writable_device_private_entry(entry)) + if (softleaf_is_device_private_write(entry)) cpu_flags |= HMM_PFN_WRITE; /* @@ -370,7 +370,7 @@ static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start, required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0); if (required_fault) { - if (is_device_private_entry(entry)) + if (softleaf_is_device_private(entry)) return hmm_vma_fault(addr, end, required_fault, walk); else return -EFAULT; @@ -412,7 +412,7 @@ again: if (pmd_none(pmd)) return hmm_vma_walk_hole(start, end, -1, walk); - if (thp_migration_supported() && is_pmd_migration_entry(pmd)) { + if (thp_migration_supported() && pmd_is_migration_entry(pmd)) { if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) { hmm_vma_walk->last = addr; pmd_migration_entry_wait(walk->mm, pmdp); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0fdb3be39e31..9aa933723355 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1299,7 +1299,7 @@ vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = 0; spinlock_t *ptl; - swp_entry_t swp_entry; + softleaf_t entry; struct page *page; struct folio *folio; @@ -1314,8 +1314,8 @@ vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf) return 0; } - swp_entry = pmd_to_swp_entry(vmf->orig_pmd); - page = pfn_swap_entry_to_page(swp_entry); + entry = softleaf_from_pmd(vmf->orig_pmd); + page = softleaf_to_page(entry); folio = page_folio(page); vmf->page = page; vmf->pte = NULL; @@ -1705,13 +1705,13 @@ static void copy_huge_non_present_pmd( struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pmd_t pmd, pgtable_t pgtable) { - swp_entry_t entry = pmd_to_swp_entry(pmd); + softleaf_t entry = softleaf_from_pmd(pmd); struct folio *src_folio; - VM_WARN_ON(!is_pmd_non_present_folio_entry(pmd)); + VM_WARN_ON_ONCE(!pmd_is_valid_softleaf(pmd)); - if (is_writable_migration_entry(entry) || - is_readable_exclusive_migration_entry(entry)) { + if (softleaf_is_migration_write(entry) || + softleaf_is_migration_read_exclusive(entry)) { entry = make_readable_migration_entry(swp_offset(entry)); pmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*src_pmd)) @@ -1719,12 +1719,12 @@ static void copy_huge_non_present_pmd( if (pmd_swp_uffd_wp(*src_pmd)) pmd = pmd_swp_mkuffd_wp(pmd); set_pmd_at(src_mm, addr, src_pmd, pmd); - } else if (is_device_private_entry(entry)) { + } else if (softleaf_is_device_private(entry)) { /* * For device private entries, since there are no * read exclusive entries, writable = !readable */ - if (is_writable_device_private_entry(entry)) { + if (softleaf_is_device_private_write(entry)) { entry = make_readable_device_private_entry(swp_offset(entry)); pmd = swp_entry_to_pmd(entry); @@ -1735,7 +1735,7 @@ static void copy_huge_non_present_pmd( set_pmd_at(src_mm, addr, src_pmd, pmd); } - src_folio = pfn_swap_entry_folio(entry); + src_folio = softleaf_to_folio(entry); VM_WARN_ON(!folio_test_large(src_folio)); folio_get(src_folio); @@ -2195,7 +2195,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (unlikely(!pmd_present(orig_pmd))) { VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(orig_pmd)); + !pmd_is_migration_entry(orig_pmd)); goto out; } @@ -2293,11 +2293,10 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, folio_remove_rmap_pmd(folio, page, vma); WARN_ON_ONCE(folio_mapcount(folio) < 0); VM_BUG_ON_PAGE(!PageHead(page), page); - } else if (is_pmd_non_present_folio_entry(orig_pmd)) { - swp_entry_t entry; + } else if (pmd_is_valid_softleaf(orig_pmd)) { + const softleaf_t entry = softleaf_from_pmd(orig_pmd); - entry = pmd_to_swp_entry(orig_pmd); - folio = pfn_swap_entry_folio(entry); + folio = softleaf_to_folio(entry); flush_needed = 0; if (!thp_migration_supported()) @@ -2353,7 +2352,7 @@ static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, static pmd_t move_soft_dirty_pmd(pmd_t pmd) { #ifdef CONFIG_MEM_SOFT_DIRTY - if (unlikely(is_pmd_migration_entry(pmd))) + if (unlikely(pmd_is_migration_entry(pmd))) pmd = pmd_swp_mksoft_dirty(pmd); else if (pmd_present(pmd)) pmd = pmd_mksoft_dirty(pmd); @@ -2428,12 +2427,12 @@ static void change_non_present_huge_pmd(struct mm_struct *mm, unsigned long addr, pmd_t *pmd, bool uffd_wp, bool uffd_wp_resolve) { - swp_entry_t entry = pmd_to_swp_entry(*pmd); - struct folio *folio = pfn_swap_entry_folio(entry); + softleaf_t entry = softleaf_from_pmd(*pmd); + const struct folio *folio = softleaf_to_folio(entry); pmd_t newpmd; - VM_WARN_ON(!is_pmd_non_present_folio_entry(*pmd)); - if (is_writable_migration_entry(entry)) { + VM_WARN_ON(!pmd_is_valid_softleaf(*pmd)); + if (softleaf_is_migration_write(entry)) { /* * A protection check is difficult so * just be safe and disable write @@ -2445,7 +2444,7 @@ static void change_non_present_huge_pmd(struct mm_struct *mm, newpmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*pmd)) newpmd = pmd_swp_mksoft_dirty(newpmd); - } else if (is_writable_device_private_entry(entry)) { + } else if (softleaf_is_device_private_write(entry)) { entry = make_readable_device_private_entry(swp_offset(entry)); newpmd = swp_entry_to_pmd(entry); } else { @@ -2643,7 +2642,7 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm if (!pmd_trans_huge(src_pmdval)) { spin_unlock(src_ptl); - if (is_pmd_migration_entry(src_pmdval)) { + if (pmd_is_migration_entry(src_pmdval)) { pmd_migration_entry_wait(mm, &src_pmdval); return -EAGAIN; } @@ -2908,13 +2907,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr; pte_t *pte; int i; - swp_entry_t entry; VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); VM_BUG_ON_VMA(vma->vm_start > haddr, vma); VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); - VM_WARN_ON(!is_pmd_non_present_folio_entry(*pmd) && !pmd_trans_huge(*pmd)); + VM_WARN_ON_ONCE(!pmd_is_valid_softleaf(*pmd) && !pmd_trans_huge(*pmd)); count_vm_event(THP_SPLIT_PMD); @@ -2928,11 +2926,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, zap_deposited_table(mm, pmd); if (!vma_is_dax(vma) && vma_is_special_huge(vma)) return; - if (unlikely(is_pmd_migration_entry(old_pmd))) { - swp_entry_t entry; + if (unlikely(pmd_is_migration_entry(old_pmd))) { + const softleaf_t old_entry = softleaf_from_pmd(old_pmd); - entry = pmd_to_swp_entry(old_pmd); - folio = pfn_swap_entry_folio(entry); + folio = softleaf_to_folio(old_entry); } else if (is_huge_zero_pmd(old_pmd)) { return; } else { @@ -2962,31 +2959,34 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, return __split_huge_zero_page_pmd(vma, haddr, pmd); } + if (pmd_is_migration_entry(*pmd)) { + softleaf_t entry; - if (is_pmd_migration_entry(*pmd)) { old_pmd = *pmd; - entry = pmd_to_swp_entry(old_pmd); - page = pfn_swap_entry_to_page(entry); + entry = softleaf_from_pmd(old_pmd); + page = softleaf_to_page(entry); folio = page_folio(page); soft_dirty = pmd_swp_soft_dirty(old_pmd); uffd_wp = pmd_swp_uffd_wp(old_pmd); - write = is_writable_migration_entry(entry); + write = softleaf_is_migration_write(entry); if (PageAnon(page)) - anon_exclusive = is_readable_exclusive_migration_entry(entry); - young = is_migration_entry_young(entry); - dirty = is_migration_entry_dirty(entry); - } else if (is_pmd_device_private_entry(*pmd)) { + anon_exclusive = softleaf_is_migration_read_exclusive(entry); + young = softleaf_is_migration_young(entry); + dirty = softleaf_is_migration_dirty(entry); + } else if (pmd_is_device_private_entry(*pmd)) { + softleaf_t entry; + old_pmd = *pmd; - entry = pmd_to_swp_entry(old_pmd); - page = pfn_swap_entry_to_page(entry); + entry = softleaf_from_pmd(old_pmd); + page = softleaf_to_page(entry); folio = page_folio(page); soft_dirty = pmd_swp_soft_dirty(old_pmd); uffd_wp = pmd_swp_uffd_wp(old_pmd); - write = is_writable_device_private_entry(entry); + write = softleaf_is_device_private_write(entry); anon_exclusive = PageAnonExclusive(page); /* @@ -3090,7 +3090,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, * Note that NUMA hinting access restrictions are not transferred to * avoid any possibility of altering permissions across VMAs. */ - if (freeze || is_pmd_migration_entry(old_pmd)) { + if (freeze || pmd_is_migration_entry(old_pmd)) { pte_t entry; swp_entry_t swp_entry; @@ -3116,7 +3116,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, VM_WARN_ON(!pte_none(ptep_get(pte + i))); set_pte_at(mm, addr, pte + i, entry); } - } else if (is_pmd_device_private_entry(old_pmd)) { + } else if (pmd_is_device_private_entry(old_pmd)) { pte_t entry; swp_entry_t swp_entry; @@ -3166,7 +3166,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } pte_unmap(pte); - if (!is_pmd_migration_entry(*pmd)) + if (!pmd_is_migration_entry(*pmd)) folio_remove_rmap_pmd(folio, page, vma); if (freeze) put_page(page); @@ -3179,7 +3179,7 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, bool freeze) { VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE)); - if (pmd_trans_huge(*pmd) || is_pmd_non_present_folio_entry(*pmd)) + if (pmd_trans_huge(*pmd) || pmd_is_valid_softleaf(*pmd)) __split_huge_pmd_locked(vma, pmd, address, freeze); } @@ -4749,25 +4749,25 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) unsigned long address = pvmw->address; unsigned long haddr = address & HPAGE_PMD_MASK; pmd_t pmde; - swp_entry_t entry; + softleaf_t entry; if (!(pvmw->pmd && !pvmw->pte)) return; - entry = pmd_to_swp_entry(*pvmw->pmd); + entry = softleaf_from_pmd(*pvmw->pmd); folio_get(folio); pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot)); if (pmd_swp_soft_dirty(*pvmw->pmd)) pmde = pmd_mksoft_dirty(pmde); - if (is_writable_migration_entry(entry)) + if (softleaf_is_migration_write(entry)) pmde = pmd_mkwrite(pmde, vma); if (pmd_swp_uffd_wp(*pvmw->pmd)) pmde = pmd_mkuffd_wp(pmde); - if (!is_migration_entry_young(entry)) + if (!softleaf_is_migration_young(entry)) pmde = pmd_mkold(pmde); /* NOTE: this may contain setting soft-dirty on some archs */ - if (folio_test_dirty(folio) && is_migration_entry_dirty(entry)) + if (folio_test_dirty(folio) && softleaf_is_migration_dirty(entry)) pmde = pmd_mkdirty(pmde); if (folio_is_device_private(folio)) { @@ -4790,7 +4790,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) if (folio_test_anon(folio)) { rmap_t rmap_flags = RMAP_NONE; - if (!is_readable_migration_entry(entry)) + if (!softleaf_is_migration_read(entry)) rmap_flags |= RMAP_EXCLUSIVE; folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index d7e71c2e2571..7e8cb181d5bd 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -941,7 +941,7 @@ static inline int check_pmd_state(pmd_t *pmd) * collapse it. Migration success or failure will eventually end * up with a present PMD mapping a folio again. */ - if (is_pmd_migration_entry(pmde)) + if (pmd_is_migration_entry(pmde)) return SCAN_PMD_MAPPED; if (!pmd_present(pmde)) return SCAN_PMD_NULL; diff --git a/mm/madvise.c b/mm/madvise.c index 2d7dd7901bae..5979a4a39738 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -390,7 +390,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, if (unlikely(!pmd_present(orig_pmd))) { VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(orig_pmd)); + !pmd_is_migration_entry(orig_pmd)); goto huge_unlock; } diff --git a/mm/memory.c b/mm/memory.c index 76c17feff88b..9d0d527e95a8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6352,10 +6352,10 @@ retry_pud: goto fallback; if (unlikely(!pmd_present(vmf.orig_pmd))) { - if (is_pmd_device_private_entry(vmf.orig_pmd)) + if (pmd_is_device_private_entry(vmf.orig_pmd)) return do_huge_pmd_device_private(&vmf); - if (is_pmd_migration_entry(vmf.orig_pmd)) + if (pmd_is_migration_entry(vmf.orig_pmd)) pmd_migration_entry_wait(mm, vmf.pmd); return 0; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 7ae3f5e2dee6..01c3b98f87a6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -110,7 +110,7 @@ #include #include #include -#include +#include #include #include @@ -647,7 +647,7 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) struct folio *folio; struct queue_pages *qp = walk->private; - if (unlikely(is_pmd_migration_entry(*pmd))) { + if (unlikely(pmd_is_migration_entry(*pmd))) { qp->nr_failed++; return; } diff --git a/mm/migrate.c b/mm/migrate.c index 847c1ec17628..ca4ec170a89b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include @@ -353,7 +353,7 @@ static bool remove_migration_pte(struct folio *folio, rmap_t rmap_flags = RMAP_NONE; pte_t old_pte; pte_t pte; - swp_entry_t entry; + softleaf_t entry; struct page *new; unsigned long idx = 0; @@ -379,22 +379,22 @@ static bool remove_migration_pte(struct folio *folio, folio_get(folio); pte = mk_pte(new, READ_ONCE(vma->vm_page_prot)); - entry = pte_to_swp_entry(old_pte); - if (!is_migration_entry_young(entry)) + entry = softleaf_from_pte(old_pte); + if (!softleaf_is_migration_young(entry)) pte = pte_mkold(pte); - if (folio_test_dirty(folio) && is_migration_entry_dirty(entry)) + if (folio_test_dirty(folio) && softleaf_is_migration_dirty(entry)) pte = pte_mkdirty(pte); if (pte_swp_soft_dirty(old_pte)) pte = pte_mksoft_dirty(pte); else pte = pte_clear_soft_dirty(pte); - if (is_writable_migration_entry(entry)) + if (softleaf_is_migration_write(entry)) pte = pte_mkwrite(pte, vma); else if (pte_swp_uffd_wp(old_pte)) pte = pte_mkuffd_wp(pte); - if (folio_test_anon(folio) && !is_readable_migration_entry(entry)) + if (folio_test_anon(folio) && !softleaf_is_migration_read(entry)) rmap_flags |= RMAP_EXCLUSIVE; if (unlikely(is_device_private_page(new))) { @@ -404,7 +404,7 @@ static bool remove_migration_pte(struct folio *folio, else entry = make_readable_device_private_entry( page_to_pfn(new)); - pte = swp_entry_to_pte(entry); + pte = softleaf_to_pte(entry); if (pte_swp_soft_dirty(old_pte)) pte = pte_swp_mksoft_dirty(pte); if (pte_swp_uffd_wp(old_pte)) @@ -543,9 +543,9 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) spinlock_t *ptl; ptl = pmd_lock(mm, pmd); - if (!is_pmd_migration_entry(*pmd)) + if (!pmd_is_migration_entry(*pmd)) goto unlock; - migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), ptl); + migration_entry_wait_on_locked(softleaf_from_pmd(*pmd), ptl); return; unlock: spin_unlock(ptl); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index ab373fd38961..592b4561507c 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include "internal.h" @@ -141,7 +141,6 @@ static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start, struct folio *folio; struct migrate_vma *migrate = walk->private; spinlock_t *ptl; - swp_entry_t entry; int ret; unsigned long write = 0; @@ -165,23 +164,24 @@ static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start, if (pmd_write(*pmdp)) write = MIGRATE_PFN_WRITE; } else if (!pmd_present(*pmdp)) { - entry = pmd_to_swp_entry(*pmdp); - folio = pfn_swap_entry_folio(entry); + const softleaf_t entry = softleaf_from_pmd(*pmdp); - if (!is_device_private_entry(entry) || + folio = softleaf_to_folio(entry); + + if (!softleaf_is_device_private(entry) || !(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_PRIVATE) || (folio->pgmap->owner != migrate->pgmap_owner)) { spin_unlock(ptl); return migrate_vma_collect_skip(start, end, walk); } - if (is_migration_entry(entry)) { + if (softleaf_is_migration(entry)) { migration_entry_wait_on_locked(entry, ptl); spin_unlock(ptl); return -EAGAIN; } - if (is_writable_device_private_entry(entry)) + if (softleaf_is_device_private_write(entry)) write = MIGRATE_PFN_WRITE; } else { spin_unlock(ptl); diff --git a/mm/page_table_check.c b/mm/page_table_check.c index f5f25e120f69..741884645ab0 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #undef pr_fmt #define pr_fmt(fmt) "page_table_check: " fmt @@ -179,10 +179,10 @@ void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) EXPORT_SYMBOL(__page_table_check_pud_clear); /* Whether the swap entry cached writable information */ -static inline bool swap_cached_writable(swp_entry_t entry) +static inline bool softleaf_cached_writable(softleaf_t entry) { - return is_writable_device_private_entry(entry) || - is_writable_migration_entry(entry); + return softleaf_is_device_private_write(entry) || + softleaf_is_migration_write(entry); } static void page_table_check_pte_flags(pte_t pte) @@ -190,9 +190,9 @@ static void page_table_check_pte_flags(pte_t pte) if (pte_present(pte)) { WARN_ON_ONCE(pte_uffd_wp(pte) && pte_write(pte)); } else if (pte_swp_uffd_wp(pte)) { - const swp_entry_t entry = pte_to_swp_entry(pte); + const softleaf_t entry = softleaf_from_pte(pte); - WARN_ON_ONCE(swap_cached_writable(entry)); + WARN_ON_ONCE(softleaf_cached_writable(entry)); } } @@ -219,9 +219,9 @@ static inline void page_table_check_pmd_flags(pmd_t pmd) if (pmd_uffd_wp(pmd)) WARN_ON_ONCE(pmd_write(pmd)); } else if (pmd_swp_uffd_wp(pmd)) { - swp_entry_t entry = pmd_to_swp_entry(pmd); + const softleaf_t entry = softleaf_from_pmd(pmd); - WARN_ON_ONCE(swap_cached_writable(entry)); + WARN_ON_ONCE(softleaf_cached_writable(entry)); } } diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index a4e23818f37f..8137d2366722 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -242,18 +242,19 @@ restart: */ pmde = pmdp_get_lockless(pvmw->pmd); - if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) { + if (pmd_trans_huge(pmde) || pmd_is_migration_entry(pmde)) { pvmw->ptl = pmd_lock(mm, pvmw->pmd); pmde = *pvmw->pmd; if (!pmd_present(pmde)) { - swp_entry_t entry; + softleaf_t entry; if (!thp_migration_supported() || !(pvmw->flags & PVMW_MIGRATION)) return not_found(pvmw); - entry = pmd_to_swp_entry(pmde); - if (!is_migration_entry(entry) || - !check_pmd(swp_offset_pfn(entry), pvmw)) + entry = softleaf_from_pmd(pmde); + + if (!softleaf_is_migration(entry) || + !check_pmd(softleaf_to_pfn(entry), pvmw)) return not_found(pvmw); return true; } @@ -273,9 +274,9 @@ restart: * cannot return prematurely, while zap_huge_pmd() has * cleared *pmd but not decremented compound_mapcount(). */ - swp_entry_t entry = pmd_to_swp_entry(pmde); + const softleaf_t entry = softleaf_from_pmd(pmde); - if (is_device_private_entry(entry)) { + if (softleaf_is_device_private(entry)) { pvmw->ptl = pmd_lock(mm, pvmw->pmd); return true; } diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 8a29b7237bc6..378c774795fc 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include @@ -973,10 +973,10 @@ pmd_table: goto found; } } else if ((flags & FW_MIGRATION) && - is_pmd_migration_entry(pmd)) { - swp_entry_t entry = pmd_to_swp_entry(pmd); + pmd_is_migration_entry(pmd)) { + const softleaf_t entry = softleaf_from_pmd(pmd); - page = pfn_swap_entry_to_page(entry); + page = softleaf_to_page(entry); expose_page = false; goto found; } diff --git a/mm/rmap.c b/mm/rmap.c index 1954c538a991..775710115a41 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -57,7 +57,7 @@ #include #include #include -#include +#include #include #include #include @@ -2341,7 +2341,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, if (likely(pmd_present(pmdval))) pfn = pmd_pfn(pmdval); else - pfn = swp_offset_pfn(pmd_to_swp_entry(pmdval)); + pfn = softleaf_to_pfn(softleaf_from_pmd(pmdval)); subpage = folio_page(folio, pfn - folio_pfn(folio)); -- cgit v1.2.3 From 15eabc898dc58c9e97eb9ddd56dc6b893e7d0d0e Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:29 +0000 Subject: mm: introduce pmd_is_huge() and use where appropriate The leaf entry PMD case is confusing as only migration entries and device private entries are valid at PMD level, not true swap entries. We repeatedly perform checks of the form is_swap_pmd() || pmd_trans_huge() which is itself confusing - it implies that leaf entries at PMD level exist and are different from huge entries. Address this confusion by introduced pmd_is_huge() which checks for either case. Sadly due to header dependency issues (huge_mm.h is included very early on in headers and cannot really rely on much else) we cannot use pmd_is_valid_softleaf() here. However since these are the only valid, handled cases the function is still achieving what it intends to do. We then replace all instances of is_swap_pmd() || pmd_trans_huge() with pmd_is_huge() invocations and adjust logic accordingly to accommodate this. No functional change intended. Link: https://lkml.kernel.org/r/00f79db3b15293cac8f7040a48d69c52d00117e4.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 39 +++++++++++++++++++++++++++++++++++---- include/linux/swapops.h | 6 ++++++ mm/huge_memory.c | 3 ++- mm/memory.c | 4 ++-- mm/mprotect.c | 2 +- mm/mremap.c | 2 +- 6 files changed, 47 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 19d4a5f52ca2..5ab240d61dcc 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -419,10 +419,36 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped); void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze); +/** + * pmd_is_huge() - Is this PMD either a huge PMD entry or a software leaf entry? + * @pmd: The PMD to check. + * + * A huge PMD entry is a non-empty entry which is present and marked huge or a + * software leaf entry. This check be performed without the appropriate locks + * held, in which case the condition should be rechecked after they are + * acquired. + * + * Returns: true if this PMD is huge, false otherwise. + */ +static inline bool pmd_is_huge(pmd_t pmd) +{ + if (pmd_present(pmd)) { + return pmd_trans_huge(pmd); + } else if (!pmd_none(pmd)) { + /* + * Non-present PMDs must be valid huge non-present entries. We + * cannot assert that here due to header dependency issues. + */ + return true; + } + + return false; +} + #define split_huge_pmd(__vma, __pmd, __address) \ do { \ pmd_t *____pmd = (__pmd); \ - if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd)) \ + if (pmd_is_huge(*____pmd)) \ __split_huge_pmd(__vma, __pmd, __address, \ false); \ } while (0) @@ -469,10 +495,10 @@ static inline int is_swap_pmd(pmd_t pmd) static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { - if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)) + if (pmd_is_huge(*pmd)) return __pmd_trans_huge_lock(pmd, vma); - else - return NULL; + + return NULL; } static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) @@ -743,6 +769,11 @@ static inline struct folio *get_persistent_huge_zero_folio(void) { return NULL; } + +static inline bool pmd_is_huge(pmd_t pmd) +{ + return false; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline int split_folio_to_list_to_order(struct folio *folio, diff --git a/include/linux/swapops.h b/include/linux/swapops.h index f1277647262d..41cfc6d59054 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -471,6 +471,12 @@ static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) } #else /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ +static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, + struct page *page) +{ + BUILD_BUG(); +} + static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9aa933723355..71dc6e41f0c8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2735,8 +2735,9 @@ unlock_ptls: spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { spinlock_t *ptl; + ptl = pmd_lock(vma->vm_mm, pmd); - if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd))) + if (likely(pmd_is_huge(*pmd))) return ptl; spin_unlock(ptl); return NULL; diff --git a/mm/memory.c b/mm/memory.c index 9d0d527e95a8..95dac6a1cbc4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1374,7 +1374,7 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); - if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)) { + if (pmd_is_huge(*src_pmd)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma); @@ -1917,7 +1917,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)) { + if (pmd_is_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) __split_huge_pmd(vma, pmd, addr, false); else if (zap_huge_pmd(tlb, vma, pmd, addr)) { diff --git a/mm/mprotect.c b/mm/mprotect.c index aa555dfbdfc5..f910cbf41442 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -474,7 +474,7 @@ again: goto next; _pmd = pmdp_get_lockless(pmd); - if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd)) { + if (pmd_is_huge(_pmd)) { if ((next - addr != HPAGE_PMD_SIZE) || pgtable_split_needed(vma, cp_flags)) { __split_huge_pmd(vma, pmd, addr, false); diff --git a/mm/mremap.c b/mm/mremap.c index 62b6827abacf..fdb0485ede74 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -850,7 +850,7 @@ unsigned long move_page_tables(struct pagetable_move_control *pmc) if (!new_pmd) break; again: - if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd)) { + if (pmd_is_huge(*old_pmd)) { if (extent == HPAGE_PMD_SIZE && move_pgt_entry(pmc, HPAGE_PMD, old_pmd, new_pmd)) continue; -- cgit v1.2.3 From c0a80c2ce68d3a04daa52497fbf524ffb3a376e0 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:30 +0000 Subject: mm: remove remaining is_swap_pmd() users and is_swap_pmd() Update copy_huge_pmd() and change_huge_pmd() to use pmd_is_valid_softleaf() - as this checks for the only valid non-present huge PMD states. Also update mm/debug_vm_pgtable.c to explicitly test for a valid leaf PMD entry (which it was not before, which was incorrect), and have it test against pmd_is_huge() and pmd_is_valid_softleaf() rather than is_swap_pmd(). With these changes done there are no further users of is_swap_pmd(), so remove it. Link: https://lkml.kernel.org/r/1628b00b00c8498bbd2c20b82117ee87845fb738.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 9 --------- mm/debug_vm_pgtable.c | 25 +++++++++++++++---------- mm/huge_memory.c | 5 +++-- 3 files changed, 18 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 5ab240d61dcc..525624c285a6 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -486,11 +486,6 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma); spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma); -static inline int is_swap_pmd(pmd_t pmd) -{ - return !pmd_none(pmd) && !pmd_present(pmd); -} - /* mmap_lock must be held on entry */ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) @@ -692,10 +687,6 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, struct vm_area_struct *next) { } -static inline int is_swap_pmd(pmd_t pmd) -{ - return 0; -} static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index fff311830959..608d1011ce03 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -74,6 +74,7 @@ struct pgtable_debug_args { unsigned long fixed_pte_pfn; swp_entry_t swp_entry; + swp_entry_t leaf_entry; }; static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx) @@ -745,7 +746,7 @@ static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args) WARN_ON(pmd_soft_dirty(pmd_clear_soft_dirty(pmd))); } -static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) +static void __init pmd_leaf_soft_dirty_tests(struct pgtable_debug_args *args) { pmd_t pmd; @@ -757,15 +758,16 @@ static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) return; pr_debug("Validating PMD swap soft dirty\n"); - pmd = swp_entry_to_pmd(args->swp_entry); - WARN_ON(!is_swap_pmd(pmd)); + pmd = swp_entry_to_pmd(args->leaf_entry); + WARN_ON(!pmd_is_huge(pmd)); + WARN_ON(!pmd_is_valid_softleaf(pmd)); WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd))); WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd))); } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args) { } -static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) { } +static void __init pmd_leaf_soft_dirty_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args) @@ -818,7 +820,7 @@ static void __init pte_swap_tests(struct pgtable_debug_args *args) } #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION -static void __init pmd_swap_tests(struct pgtable_debug_args *args) +static void __init pmd_softleaf_tests(struct pgtable_debug_args *args) { swp_entry_t arch_entry; pmd_t pmd1, pmd2; @@ -827,15 +829,16 @@ static void __init pmd_swap_tests(struct pgtable_debug_args *args) return; pr_debug("Validating PMD swap\n"); - pmd1 = swp_entry_to_pmd(args->swp_entry); - WARN_ON(!is_swap_pmd(pmd1)); + pmd1 = swp_entry_to_pmd(args->leaf_entry); + WARN_ON(!pmd_is_huge(pmd1)); + WARN_ON(!pmd_is_valid_softleaf(pmd1)); arch_entry = __pmd_to_swp_entry(pmd1); pmd2 = __swp_entry_to_pmd(arch_entry); WARN_ON(memcmp(&pmd1, &pmd2, sizeof(pmd1))); } #else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */ -static void __init pmd_swap_tests(struct pgtable_debug_args *args) { } +static void __init pmd_softleaf_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ static void __init swap_migration_tests(struct pgtable_debug_args *args) @@ -1229,6 +1232,8 @@ static int __init init_args(struct pgtable_debug_args *args) max_swap_offset = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL)))); /* Create a swp entry with all possible bits set while still being swap. */ args->swp_entry = swp_entry(MAX_SWAPFILES - 1, max_swap_offset); + /* Create a non-present migration entry. */ + args->leaf_entry = make_writable_migration_entry(~0UL); /* * Allocate (huge) pages because some of the tests need to access @@ -1318,12 +1323,12 @@ static int __init debug_vm_pgtable(void) pte_soft_dirty_tests(&args); pmd_soft_dirty_tests(&args); pte_swap_soft_dirty_tests(&args); - pmd_swap_soft_dirty_tests(&args); + pmd_leaf_soft_dirty_tests(&args); pte_swap_exclusive_tests(&args); pte_swap_tests(&args); - pmd_swap_tests(&args); + pmd_softleaf_tests(&args); swap_migration_tests(&args); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 71dc6e41f0c8..e38b0d5e3102 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1800,7 +1800,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = -EAGAIN; pmd = *src_pmd; - if (unlikely(thp_migration_supported() && is_swap_pmd(pmd))) { + if (unlikely(thp_migration_supported() && + pmd_is_valid_softleaf(pmd))) { copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, dst_vma, src_vma, pmd, pgtable); ret = 0; @@ -2487,7 +2488,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (!ptl) return 0; - if (thp_migration_supported() && is_swap_pmd(*pmd)) { + if (thp_migration_supported() && pmd_is_valid_softleaf(*pmd)) { change_non_present_huge_pmd(mm, addr, pmd, uffd_wp, uffd_wp_resolve); goto unlock; -- cgit v1.2.3 From 9ff30bb9ab40b34908eefd661f12f99aa00d04c3 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:31 +0000 Subject: mm: remove non_swap_entry() and use softleaf helpers instead There is simply no need for the hugely confusing concept of 'non-swap' swap entries now we have the concept of softleaf entries and relevant softleaf_xxx() helpers. Adjust all callers to use these instead and remove non_swap_entry() altogether. No functional change intended. Link: https://lkml.kernel.org/r/2562093f37f4a9cffea0447058014485eb50aaaf.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/s390/mm/gmap_helpers.c | 20 ++++++++++---------- arch/s390/mm/pgtable.c | 12 ++++++------ fs/proc/task_mmu.c | 12 ++++++------ include/linux/swapops.h | 5 ----- mm/filemap.c | 2 +- mm/hmm.c | 16 ++++++++-------- mm/madvise.c | 2 +- mm/memory.c | 36 ++++++++++++++++++------------------ mm/mincore.c | 2 +- mm/userfaultfd.c | 24 ++++++++++++------------ 10 files changed, 63 insertions(+), 68 deletions(-) (limited to 'include/linux') diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c index d4c3c36855e2..549f14ad08af 100644 --- a/arch/s390/mm/gmap_helpers.c +++ b/arch/s390/mm/gmap_helpers.c @@ -11,27 +11,27 @@ #include #include #include -#include +#include #include #include #include #include /** - * ptep_zap_swap_entry() - discard a swap entry. + * ptep_zap_softleaf_entry() - discard a software leaf entry. * @mm: the mm - * @entry: the swap entry that needs to be zapped + * @entry: the software leaf entry that needs to be zapped * - * Discards the given swap entry. If the swap entry was an actual swap - * entry (and not a migration entry, for example), the actual swapped + * Discards the given software leaf entry. If the leaf entry was an actual + * swap entry (and not a migration entry, for example), the actual swapped * page is also discarded from swap. */ -static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) +static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) { - if (!non_swap_entry(entry)) + if (softleaf_is_swap(entry)) dec_mm_counter(mm, MM_SWAPENTS); - else if (is_migration_entry(entry)) - dec_mm_counter(mm, mm_counter(pfn_swap_entry_folio(entry))); + else if (softleaf_is_migration(entry)) + dec_mm_counter(mm, mm_counter(softleaf_to_folio(entry))); free_swap_and_cache(entry); } @@ -66,7 +66,7 @@ void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr) preempt_disable(); pgste = pgste_get_lock(ptep); - ptep_zap_swap_entry(mm, pte_to_swp_entry(*ptep)); + ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep)); pte_clear(mm, vmaddr, ptep); pgste_set_unlock(ptep, pgste); diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 0fde20bbc50b..d670bfb47d9b 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include @@ -683,12 +683,12 @@ void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep) pgste_set_unlock(ptep, pgste); } -static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) +static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) { - if (!non_swap_entry(entry)) + if (softleaf_is_swap(entry)) dec_mm_counter(mm, MM_SWAPENTS); - else if (is_migration_entry(entry)) { - struct folio *folio = pfn_swap_entry_folio(entry); + else if (softleaf_is_migration(entry)) { + struct folio *folio = softleaf_to_folio(entry); dec_mm_counter(mm, mm_counter(folio)); } @@ -710,7 +710,7 @@ void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, if (!reset && pte_swap(pte) && ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED || (pgstev & _PGSTE_GPS_ZERO))) { - ptep_zap_swap_entry(mm, pte_to_swp_entry(pte)); + ptep_zap_softleaf_entry(mm, softleaf_from_pte(pte)); pte_clear(mm, addr, ptep); } if (reset) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 898df952b6bc..1f49c81b3591 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1020,13 +1020,13 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } else if (pte_none(ptent)) { smaps_pte_hole_lookup(addr, walk); } else { - swp_entry_t swpent = pte_to_swp_entry(ptent); + const softleaf_t entry = softleaf_from_pte(ptent); - if (!non_swap_entry(swpent)) { + if (softleaf_is_swap(entry)) { int mapcount; mss->swap += PAGE_SIZE; - mapcount = swp_swapcount(swpent); + mapcount = swp_swapcount(entry); if (mapcount >= 2) { u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT; @@ -1035,10 +1035,10 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } else { mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT; } - } else if (is_pfn_swap_entry(swpent)) { - if (is_device_private_entry(swpent)) + } else if (softleaf_has_pfn(entry)) { + if (softleaf_is_device_private(entry)) present = true; - page = pfn_swap_entry_to_page(swpent); + page = softleaf_to_page(entry); } } diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 41cfc6d59054..c8e6f927da48 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -492,10 +492,5 @@ static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ -static inline int non_swap_entry(swp_entry_t entry) -{ - return swp_type(entry) >= MAX_SWAPFILES; -} - #endif /* CONFIG_MMU */ #endif /* _LINUX_SWAPOPS_H */ diff --git a/mm/filemap.c b/mm/filemap.c index 02355aa46324..07634b7d9934 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -4567,7 +4567,7 @@ static void filemap_cachestat(struct address_space *mapping, swp_entry_t swp = radix_to_swp_entry(folio); /* swapin error results in poisoned entry */ - if (non_swap_entry(swp)) + if (!softleaf_is_swap(swp)) goto resched; /* diff --git a/mm/hmm.c b/mm/hmm.c index e9735a9b6102..0158f2d1e027 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -258,17 +258,17 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, } if (!pte_present(pte)) { - swp_entry_t entry = pte_to_swp_entry(pte); + const softleaf_t entry = softleaf_from_pte(pte); /* * Don't fault in device private pages owned by the caller, * just report the PFN. */ - if (is_device_private_entry(entry) && - page_pgmap(pfn_swap_entry_to_page(entry))->owner == + if (softleaf_is_device_private(entry) && + page_pgmap(softleaf_to_page(entry))->owner == range->dev_private_owner) { cpu_flags = HMM_PFN_VALID; - if (is_writable_device_private_entry(entry)) + if (softleaf_is_device_private_write(entry)) cpu_flags |= HMM_PFN_WRITE; new_pfn_flags = swp_offset_pfn(entry) | cpu_flags; goto out; @@ -279,16 +279,16 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, if (!required_fault) goto out; - if (!non_swap_entry(entry)) + if (softleaf_is_swap(entry)) goto fault; - if (is_device_private_entry(entry)) + if (softleaf_is_device_private(entry)) goto fault; - if (is_device_exclusive_entry(entry)) + if (softleaf_is_device_exclusive(entry)) goto fault; - if (is_migration_entry(entry)) { + if (softleaf_is_migration(entry)) { pte_unmap(ptep); hmm_vma_walk->last = addr; migration_entry_wait(walk->mm, pmdp, addr); diff --git a/mm/madvise.c b/mm/madvise.c index 5979a4a39738..d8bc51e1bea7 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -249,7 +249,7 @@ static void shmem_swapin_range(struct vm_area_struct *vma, continue; entry = radix_to_swp_entry(folio); /* There might be swapin error entries in shmem mapping. */ - if (non_swap_entry(entry)) + if (!softleaf_is_swap(entry)) continue; addr = vma->vm_start + diff --git a/mm/memory.c b/mm/memory.c index 95dac6a1cbc4..a3f001a47ecf 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -932,7 +932,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, struct folio *folio; struct page *page; - if (likely(!non_swap_entry(entry))) { + if (likely(softleaf_is_swap(entry))) { if (swap_duplicate(entry) < 0) return -EIO; @@ -950,12 +950,12 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, set_pte_at(src_mm, addr, src_pte, pte); } rss[MM_SWAPENTS]++; - } else if (is_migration_entry(entry)) { - folio = pfn_swap_entry_folio(entry); + } else if (softleaf_is_migration(entry)) { + folio = softleaf_to_folio(entry); rss[mm_counter(folio)]++; - if (!is_readable_migration_entry(entry) && + if (!softleaf_is_migration_read(entry) && is_cow_mapping(vm_flags)) { /* * COW mappings require pages in both parent and child @@ -964,15 +964,15 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, */ entry = make_readable_migration_entry( swp_offset(entry)); - pte = swp_entry_to_pte(entry); + pte = softleaf_to_pte(entry); if (pte_swp_soft_dirty(orig_pte)) pte = pte_swp_mksoft_dirty(pte); if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } - } else if (is_device_private_entry(entry)) { - page = pfn_swap_entry_to_page(entry); + } else if (softleaf_is_device_private(entry)) { + page = softleaf_to_page(entry); folio = page_folio(page); /* @@ -996,7 +996,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, * when a device driver is involved (you cannot easily * save and restore device driver state). */ - if (is_writable_device_private_entry(entry) && + if (softleaf_is_device_private_write(entry) && is_cow_mapping(vm_flags)) { entry = make_readable_device_private_entry( swp_offset(entry)); @@ -1005,7 +1005,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } - } else if (is_device_exclusive_entry(entry)) { + } else if (softleaf_is_device_exclusive(entry)) { /* * Make device exclusive entries present by restoring the * original entry then copying as for a present pte. Device @@ -4625,7 +4625,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) rmap_t rmap_flags = RMAP_NONE; bool need_clear_cache = false; bool exclusive = false; - swp_entry_t entry; + softleaf_t entry; pte_t pte; vm_fault_t ret = 0; void *shadow = NULL; @@ -4637,15 +4637,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (!pte_unmap_same(vmf)) goto out; - entry = pte_to_swp_entry(vmf->orig_pte); - if (unlikely(non_swap_entry(entry))) { - if (is_migration_entry(entry)) { + entry = softleaf_from_pte(vmf->orig_pte); + if (unlikely(!softleaf_is_swap(entry))) { + if (softleaf_is_migration(entry)) { migration_entry_wait(vma->vm_mm, vmf->pmd, vmf->address); - } else if (is_device_exclusive_entry(entry)) { - vmf->page = pfn_swap_entry_to_page(entry); + } else if (softleaf_is_device_exclusive(entry)) { + vmf->page = softleaf_to_page(entry); ret = remove_device_exclusive_entry(vmf); - } else if (is_device_private_entry(entry)) { + } else if (softleaf_is_device_private(entry)) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) { /* * migrate_to_ram is not yet ready to operate @@ -4656,7 +4656,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out; } - vmf->page = pfn_swap_entry_to_page(entry); + vmf->page = softleaf_to_page(entry); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte || @@ -4680,7 +4680,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) } else { pte_unmap_unlock(vmf->pte, vmf->ptl); } - } else if (is_hwpoison_entry(entry)) { + } else if (softleaf_is_hwpoison(entry)) { ret = VM_FAULT_HWPOISON; } else if (softleaf_is_marker(entry)) { ret = handle_pte_marker(vmf); diff --git a/mm/mincore.c b/mm/mincore.c index b3682488a65d..9a908d8bb706 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -74,7 +74,7 @@ static unsigned char mincore_swap(swp_entry_t entry, bool shmem) * absent. Page table may contain migration or hwpoison * entries which are always uptodate. */ - if (non_swap_entry(entry)) + if (!softleaf_is_swap(entry)) return !shmem; /* diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 055ec1050776..bd1f74a7a5ac 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1256,7 +1256,6 @@ static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd unsigned long dst_addr, unsigned long src_addr, unsigned long len, __u64 mode) { - swp_entry_t entry; struct swap_info_struct *si = NULL; pte_t orig_src_pte, orig_dst_pte; pte_t src_folio_pte; @@ -1430,19 +1429,20 @@ retry: orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, dst_ptl, src_ptl, &src_folio, len); - } else { + } else { /* !pte_present() */ struct folio *folio = NULL; + const softleaf_t entry = softleaf_from_pte(orig_src_pte); - entry = pte_to_swp_entry(orig_src_pte); - if (non_swap_entry(entry)) { - if (is_migration_entry(entry)) { - pte_unmap(src_pte); - pte_unmap(dst_pte); - src_pte = dst_pte = NULL; - migration_entry_wait(mm, src_pmd, src_addr); - ret = -EAGAIN; - } else - ret = -EFAULT; + if (softleaf_is_migration(entry)) { + pte_unmap(src_pte); + pte_unmap(dst_pte); + src_pte = dst_pte = NULL; + migration_entry_wait(mm, src_pmd, src_addr); + + ret = -EAGAIN; + goto out; + } else if (!softleaf_is_swap(entry)) { + ret = -EFAULT; goto out; } -- cgit v1.2.3 From 03bfbc3ad6e496fb576ca9ace08211943232fdf9 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:32 +0000 Subject: mm: remove is_hugetlb_entry_[migration, hwpoisoned]() We do not need to have explicit helper functions for these, it adds a level of confusion and indirection when we can simply use software leaf entry logic here instead and spell out the special huge_pte_none() case we must consider. No functional change intended. Link: https://lkml.kernel.org/r/0e92d6924d3de88cd014ce1c53e20edc08fc152e.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 19 ++++++----- include/linux/hugetlb.h | 2 -- mm/hugetlb.c | 91 ++++++++++++++++++++----------------------------- mm/mempolicy.c | 17 +++++---- mm/migrate.c | 15 +++++--- 5 files changed, 69 insertions(+), 75 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 1f49c81b3591..92ada14eabc0 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -2500,22 +2500,23 @@ static void make_uffd_wp_huge_pte(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t ptent) { - unsigned long psize; + const unsigned long psize = huge_page_size(hstate_vma(vma)); + softleaf_t entry; - if (is_hugetlb_entry_hwpoisoned(ptent) || pte_is_marker(ptent)) - return; + if (huge_pte_none(ptent)) + set_huge_pte_at(vma->vm_mm, addr, ptep, + make_pte_marker(PTE_MARKER_UFFD_WP), psize); - psize = huge_page_size(hstate_vma(vma)); + entry = softleaf_from_pte(ptent); + if (softleaf_is_hwpoison(entry) || softleaf_is_marker(entry)) + return; - if (is_hugetlb_entry_migration(ptent)) + if (softleaf_is_migration(entry)) set_huge_pte_at(vma->vm_mm, addr, ptep, pte_swp_mkuffd_wp(ptent), psize); - else if (!huge_pte_none(ptent)) + else huge_ptep_modify_prot_commit(vma, addr, ptep, ptent, huge_pte_mkuffd_wp(ptent)); - else - set_huge_pte_at(vma->vm_mm, addr, ptep, - make_pte_marker(PTE_MARKER_UFFD_WP), psize); } #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 2387513d6ae5..457d48ac7bcd 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -274,8 +274,6 @@ void hugetlb_vma_lock_release(struct kref *kref); long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags); -bool is_hugetlb_entry_migration(pte_t pte); -bool is_hugetlb_entry_hwpoisoned(pte_t pte); void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); void fixup_hugetlb_reservations(struct vm_area_struct *vma); void hugetlb_split(struct vm_area_struct *vma, unsigned long addr); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 59d91c36770c..311c5d601310 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4846,32 +4846,6 @@ static void set_huge_ptep_maybe_writable(struct vm_area_struct *vma, set_huge_ptep_writable(vma, address, ptep); } -bool is_hugetlb_entry_migration(pte_t pte) -{ - swp_entry_t swp; - - if (huge_pte_none(pte) || pte_present(pte)) - return false; - swp = pte_to_swp_entry(pte); - if (is_migration_entry(swp)) - return true; - else - return false; -} - -bool is_hugetlb_entry_hwpoisoned(pte_t pte) -{ - swp_entry_t swp; - - if (huge_pte_none(pte) || pte_present(pte)) - return false; - swp = pte_to_swp_entry(pte); - if (is_hwpoison_entry(swp)) - return true; - else - return false; -} - static void hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, struct folio *new_folio, pte_t old, unsigned long sz) @@ -4900,6 +4874,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, unsigned long npages = pages_per_huge_page(h); struct mmu_notifier_range range; unsigned long last_addr_mask; + softleaf_t softleaf; int ret = 0; if (cow) { @@ -4947,16 +4922,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte); again: if (huge_pte_none(entry)) { - /* - * Skip if src entry none. - */ - ; - } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) { + /* Skip if src entry none. */ + goto next; + } + + softleaf = softleaf_from_pte(entry); + if (unlikely(softleaf_is_hwpoison(softleaf))) { if (!userfaultfd_wp(dst_vma)) entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry, sz); - } else if (unlikely(is_hugetlb_entry_migration(entry))) { - softleaf_t softleaf = softleaf_from_pte(entry); + } else if (unlikely(softleaf_is_migration(softleaf))) { bool uffd_wp = pte_swp_uffd_wp(entry); if (!is_readable_migration_entry(softleaf) && cow) { @@ -4975,7 +4950,6 @@ again: entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry, sz); } else if (unlikely(pte_is_marker(entry))) { - const softleaf_t softleaf = softleaf_from_pte(entry); const pte_marker marker = copy_pte_marker(softleaf, dst_vma); if (marker) @@ -5033,9 +5007,7 @@ again: } hugetlb_install_folio(dst_vma, dst_pte, addr, new_folio, src_pte_old, sz); - spin_unlock(src_ptl); - spin_unlock(dst_ptl); - continue; + goto next; } if (cow) { @@ -5056,6 +5028,8 @@ again: set_huge_pte_at(dst, addr, dst_pte, entry, sz); hugetlb_count_add(npages, dst); } + +next: spin_unlock(src_ptl); spin_unlock(dst_ptl); } @@ -6064,8 +6038,10 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = 0; /* Not present, either a migration or a hwpoisoned entry */ - if (!pte_present(vmf.orig_pte)) { - if (is_hugetlb_entry_migration(vmf.orig_pte)) { + if (!pte_present(vmf.orig_pte) && !huge_pte_none(vmf.orig_pte)) { + const softleaf_t softleaf = softleaf_from_pte(vmf.orig_pte); + + if (softleaf_is_migration(softleaf)) { /* * Release the hugetlb fault lock now, but retain * the vma lock, because it is needed to guard the @@ -6076,9 +6052,12 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, mutex_unlock(&hugetlb_fault_mutex_table[hash]); migration_entry_wait_huge(vma, vmf.address, vmf.pte); return 0; - } else if (is_hugetlb_entry_hwpoisoned(vmf.orig_pte)) + } + if (softleaf_is_hwpoison(softleaf)) { ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); + } + goto out_mutex; } @@ -6460,7 +6439,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma, i_mmap_lock_write(vma->vm_file->f_mapping); last_addr_mask = hugetlb_mask_last_page(h); for (; address < end; address += psize) { + softleaf_t entry; spinlock_t *ptl; + ptep = hugetlb_walk(vma, address, psize); if (!ptep) { if (!uffd_wp) { @@ -6492,15 +6473,23 @@ long hugetlb_change_protection(struct vm_area_struct *vma, continue; } pte = huge_ptep_get(mm, address, ptep); - if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { - /* Nothing to do. */ - } else if (unlikely(is_hugetlb_entry_migration(pte))) { - softleaf_t entry = softleaf_from_pte(pte); + if (huge_pte_none(pte)) { + if (unlikely(uffd_wp)) + /* Safe to modify directly (none->non-present). */ + set_huge_pte_at(mm, address, ptep, + make_pte_marker(PTE_MARKER_UFFD_WP), + psize); + goto next; + } + entry = softleaf_from_pte(pte); + if (unlikely(softleaf_is_hwpoison(entry))) { + /* Nothing to do. */ + } else if (unlikely(softleaf_is_migration(entry))) { struct folio *folio = softleaf_to_folio(entry); pte_t newpte = pte; - if (is_writable_migration_entry(entry)) { + if (softleaf_is_migration_write(entry)) { if (folio_test_anon(folio)) entry = make_readable_exclusive_migration_entry( swp_offset(entry)); @@ -6527,7 +6516,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma, if (pte_is_uffd_wp_marker(pte) && uffd_wp_resolve) /* Safe to modify directly (non-present->none). */ huge_pte_clear(mm, address, ptep, psize); - } else if (!huge_pte_none(pte)) { + } else { pte_t old_pte; unsigned int shift = huge_page_shift(hstate_vma(vma)); @@ -6540,16 +6529,10 @@ long hugetlb_change_protection(struct vm_area_struct *vma, pte = huge_pte_clear_uffd_wp(pte); huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); pages++; - } else { - /* None pte */ - if (unlikely(uffd_wp)) - /* Safe to modify directly (none->non-present). */ - set_huge_pte_at(mm, address, ptep, - make_pte_marker(PTE_MARKER_UFFD_WP), - psize); } - spin_unlock(ptl); +next: + spin_unlock(ptl); cond_resched(); } /* diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 01c3b98f87a6..dee95d5ecfd4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -768,16 +768,21 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, unsigned long flags = qp->flags; struct folio *folio; spinlock_t *ptl; - pte_t entry; + pte_t ptep; ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); - entry = huge_ptep_get(walk->mm, addr, pte); - if (!pte_present(entry)) { - if (unlikely(is_hugetlb_entry_migration(entry))) - qp->nr_failed++; + ptep = huge_ptep_get(walk->mm, addr, pte); + if (!pte_present(ptep)) { + if (!huge_pte_none(ptep)) { + const softleaf_t entry = softleaf_from_pte(ptep); + + if (unlikely(softleaf_is_migration(entry))) + qp->nr_failed++; + } + goto unlock; } - folio = pfn_folio(pte_pfn(entry)); + folio = pfn_folio(pte_pfn(ptep)); if (!queue_folio_required(folio, qp)) goto unlock; if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || diff --git a/mm/migrate.c b/mm/migrate.c index ca4ec170a89b..5edfd0b2f63d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -515,16 +515,18 @@ out: void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, ptep); + softleaf_t entry; pte_t pte; hugetlb_vma_assert_locked(vma); spin_lock(ptl); pte = huge_ptep_get(vma->vm_mm, addr, ptep); - if (unlikely(!is_hugetlb_entry_migration(pte))) { - spin_unlock(ptl); - hugetlb_vma_unlock_read(vma); - } else { + if (huge_pte_none(pte)) + goto fail; + + entry = softleaf_from_pte(pte); + if (softleaf_is_migration(entry)) { /* * If migration entry existed, safe to release vma lock * here because the pgtable page won't be freed without the @@ -533,7 +535,12 @@ void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, p */ hugetlb_vma_unlock_read(vma); migration_entry_wait_on_locked(pte_to_swp_entry(pte), ptl); + return; } + +fail: + spin_unlock(ptl); + hugetlb_vma_unlock_read(vma); } #endif -- cgit v1.2.3 From 93976a20345b4aff1ac7598ec1223d65ca33d49c Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:33 +0000 Subject: mm: eliminate further swapops predicates Having converted so much of the code base to software leaf entries, we can mop up some remaining cases. We replace is_pfn_swap_entry(), pfn_swap_entry_to_page(), is_writable_device_private_entry(), is_device_exclusive_entry(), is_migration_entry(), is_writable_migration_entry(), is_readable_migration_entry(), swp_offset_pfn() and pfn_swap_entry_folio() with softleaf equivalents. No functional change intended. Link: https://lkml.kernel.org/r/956bc9c031604811c0070d2f4bf2f1373f230213.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 14 +++--- include/linux/leafops.h | 25 +++++++--- include/linux/swapops.h | 121 +----------------------------------------------- mm/debug_vm_pgtable.c | 20 ++++---- mm/hmm.c | 2 +- mm/hugetlb.c | 2 +- mm/ksm.c | 6 +-- mm/memory-failure.c | 6 +-- mm/memory.c | 3 +- mm/mempolicy.c | 4 +- mm/migrate.c | 6 +-- mm/migrate_device.c | 10 ++-- mm/mprotect.c | 8 ++-- mm/page_vma_mapped.c | 8 ++-- mm/pagewalk.c | 7 ++- mm/rmap.c | 9 ++-- 16 files changed, 75 insertions(+), 176 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 92ada14eabc0..41b062ce6ad8 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1941,13 +1941,13 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, if (pte_uffd_wp(pte)) flags |= PM_UFFD_WP; } else { - swp_entry_t entry; + softleaf_t entry; if (pte_swp_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; if (pte_swp_uffd_wp(pte)) flags |= PM_UFFD_WP; - entry = pte_to_swp_entry(pte); + entry = softleaf_from_pte(pte); if (pm->show_pfn) { pgoff_t offset; @@ -1955,16 +1955,16 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, * For PFN swap offsets, keeping the offset field * to be PFN only to be compatible with old smaps. */ - if (is_pfn_swap_entry(entry)) - offset = swp_offset_pfn(entry); + if (softleaf_has_pfn(entry)) + offset = softleaf_to_pfn(entry); else offset = swp_offset(entry); frame = swp_type(entry) | (offset << MAX_SWAPFILES_SHIFT); } flags |= PM_SWAP; - if (is_pfn_swap_entry(entry)) - page = pfn_swap_entry_to_page(entry); + if (softleaf_has_pfn(entry)) + page = softleaf_to_page(entry); if (softleaf_is_uffd_wp_marker(entry)) flags |= PM_UFFD_WP; if (softleaf_is_guard_marker(entry)) @@ -2033,7 +2033,7 @@ static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr, if (pmd_swp_uffd_wp(pmd)) flags |= PM_UFFD_WP; VM_WARN_ON_ONCE(!pmd_is_migration_entry(pmd)); - page = pfn_swap_entry_to_page(entry); + page = softleaf_to_page(entry); } if (page) { diff --git a/include/linux/leafops.h b/include/linux/leafops.h index f5ea9b0385ff..d282fab866a1 100644 --- a/include/linux/leafops.h +++ b/include/linux/leafops.h @@ -355,7 +355,7 @@ static inline unsigned long softleaf_to_pfn(softleaf_t entry) VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); /* Temporary until swp_entry_t eliminated. */ - return swp_offset_pfn(entry); + return swp_offset(entry) & SWP_PFN_MASK; } /** @@ -366,10 +366,16 @@ static inline unsigned long softleaf_to_pfn(softleaf_t entry) */ static inline struct page *softleaf_to_page(softleaf_t entry) { + struct page *page = pfn_to_page(softleaf_to_pfn(entry)); + VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); + /* + * Any use of migration entries may only occur while the + * corresponding page is locked + */ + VM_WARN_ON_ONCE(softleaf_is_migration(entry) && !PageLocked(page)); - /* Temporary until swp_entry_t eliminated. */ - return pfn_swap_entry_to_page(entry); + return page; } /** @@ -380,10 +386,17 @@ static inline struct page *softleaf_to_page(softleaf_t entry) */ static inline struct folio *softleaf_to_folio(softleaf_t entry) { - VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); + struct folio *folio = pfn_folio(softleaf_to_pfn(entry)); - /* Temporary until swp_entry_t eliminated. */ - return pfn_swap_entry_folio(entry); + VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); + /* + * Any use of migration entries may only occur while the + * corresponding folio is locked. + */ + VM_WARN_ON_ONCE(softleaf_is_migration(entry) && + !folio_test_locked(folio)); + + return folio; } /** diff --git a/include/linux/swapops.h b/include/linux/swapops.h index c8e6f927da48..3d02b288c15e 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -28,7 +28,7 @@ #define SWP_OFFSET_MASK ((1UL << SWP_TYPE_SHIFT) - 1) /* - * Definitions only for PFN swap entries (see is_pfn_swap_entry()). To + * Definitions only for PFN swap entries (see leafeant_has_pfn()). To * store PFN, we only need SWP_PFN_BITS bits. Each of the pfn swap entries * can use the extra bits to store other information besides PFN. */ @@ -66,8 +66,6 @@ #define SWP_MIG_YOUNG BIT(SWP_MIG_YOUNG_BIT) #define SWP_MIG_DIRTY BIT(SWP_MIG_DIRTY_BIT) -static inline bool is_pfn_swap_entry(swp_entry_t entry); - /* Clear all flags but only keep swp_entry_t related information */ static inline pte_t pte_swp_clear_flags(pte_t pte) { @@ -109,17 +107,6 @@ static inline pgoff_t swp_offset(swp_entry_t entry) return entry.val & SWP_OFFSET_MASK; } -/* - * This should only be called upon a pfn swap entry to get the PFN stored - * in the swap entry. Please refers to is_pfn_swap_entry() for definition - * of pfn swap entry. - */ -static inline unsigned long swp_offset_pfn(swp_entry_t entry) -{ - VM_BUG_ON(!is_pfn_swap_entry(entry)); - return swp_offset(entry) & SWP_PFN_MASK; -} - /* * Convert the arch-dependent pte representation of a swp_entry_t into an * arch-independent swp_entry_t. @@ -169,27 +156,11 @@ static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset) return swp_entry(SWP_DEVICE_WRITE, offset); } -static inline bool is_device_private_entry(swp_entry_t entry) -{ - int type = swp_type(entry); - return type == SWP_DEVICE_READ || type == SWP_DEVICE_WRITE; -} - -static inline bool is_writable_device_private_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_DEVICE_WRITE); -} - static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) { return swp_entry(SWP_DEVICE_EXCLUSIVE, offset); } -static inline bool is_device_exclusive_entry(swp_entry_t entry) -{ - return swp_type(entry) == SWP_DEVICE_EXCLUSIVE; -} - #else /* CONFIG_DEVICE_PRIVATE */ static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { @@ -201,50 +172,14 @@ static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset) return swp_entry(0, 0); } -static inline bool is_device_private_entry(swp_entry_t entry) -{ - return false; -} - -static inline bool is_writable_device_private_entry(swp_entry_t entry) -{ - return false; -} - static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) { return swp_entry(0, 0); } -static inline bool is_device_exclusive_entry(swp_entry_t entry) -{ - return false; -} - #endif /* CONFIG_DEVICE_PRIVATE */ #ifdef CONFIG_MIGRATION -static inline int is_migration_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_MIGRATION_READ || - swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE || - swp_type(entry) == SWP_MIGRATION_WRITE); -} - -static inline int is_writable_migration_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE); -} - -static inline int is_readable_migration_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_MIGRATION_READ); -} - -static inline int is_readable_exclusive_migration_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE); -} static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { @@ -310,23 +245,10 @@ static inline swp_entry_t make_writable_migration_entry(pgoff_t offset) return swp_entry(0, 0); } -static inline int is_migration_entry(swp_entry_t swp) -{ - return 0; -} - static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } static inline void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { } -static inline int is_writable_migration_entry(swp_entry_t entry) -{ - return 0; -} -static inline int is_readable_migration_entry(swp_entry_t entry) -{ - return 0; -} static inline swp_entry_t make_migration_entry_young(swp_entry_t entry) { @@ -410,47 +332,6 @@ static inline swp_entry_t make_guard_swp_entry(void) return make_pte_marker_entry(PTE_MARKER_GUARD); } -static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) -{ - struct page *p = pfn_to_page(swp_offset_pfn(entry)); - - /* - * Any use of migration entries may only occur while the - * corresponding page is locked - */ - BUG_ON(is_migration_entry(entry) && !PageLocked(p)); - - return p; -} - -static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry) -{ - struct folio *folio = pfn_folio(swp_offset_pfn(entry)); - - /* - * Any use of migration entries may only occur while the - * corresponding folio is locked - */ - BUG_ON(is_migration_entry(entry) && !folio_test_locked(folio)); - - return folio; -} - -/* - * A pfn swap entry is a special type of swap entry that always has a pfn stored - * in the swap offset. They can either be used to represent unaddressable device - * memory, to restrict access to a page undergoing migration or to represent a - * pfn which has been hwpoisoned and unmapped. - */ -static inline bool is_pfn_swap_entry(swp_entry_t entry) -{ - /* Make sure the swp offset can always store the needed fields */ - BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS); - - return is_migration_entry(entry) || is_device_private_entry(entry) || - is_device_exclusive_entry(entry) || is_hwpoison_entry(entry); -} - struct page_vma_mapped_walk; #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 608d1011ce03..64db85a80558 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -844,7 +844,7 @@ static void __init pmd_softleaf_tests(struct pgtable_debug_args *args) { } static void __init swap_migration_tests(struct pgtable_debug_args *args) { struct page *page; - swp_entry_t swp; + softleaf_t entry; if (!IS_ENABLED(CONFIG_MIGRATION)) return; @@ -867,17 +867,17 @@ static void __init swap_migration_tests(struct pgtable_debug_args *args) * be locked, otherwise it stumbles upon a BUG_ON(). */ __SetPageLocked(page); - swp = make_writable_migration_entry(page_to_pfn(page)); - WARN_ON(!is_migration_entry(swp)); - WARN_ON(!is_writable_migration_entry(swp)); + entry = make_writable_migration_entry(page_to_pfn(page)); + WARN_ON(!softleaf_is_migration(entry)); + WARN_ON(!softleaf_is_migration_write(entry)); - swp = make_readable_migration_entry(swp_offset(swp)); - WARN_ON(!is_migration_entry(swp)); - WARN_ON(is_writable_migration_entry(swp)); + entry = make_readable_migration_entry(swp_offset(entry)); + WARN_ON(!softleaf_is_migration(entry)); + WARN_ON(softleaf_is_migration_write(entry)); - swp = make_readable_migration_entry(page_to_pfn(page)); - WARN_ON(!is_migration_entry(swp)); - WARN_ON(is_writable_migration_entry(swp)); + entry = make_readable_migration_entry(page_to_pfn(page)); + WARN_ON(!softleaf_is_migration(entry)); + WARN_ON(softleaf_is_migration_write(entry)); __ClearPageLocked(page); } diff --git a/mm/hmm.c b/mm/hmm.c index 0158f2d1e027..3912d92a2b9a 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -270,7 +270,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, cpu_flags = HMM_PFN_VALID; if (softleaf_is_device_private_write(entry)) cpu_flags |= HMM_PFN_WRITE; - new_pfn_flags = swp_offset_pfn(entry) | cpu_flags; + new_pfn_flags = softleaf_to_pfn(entry) | cpu_flags; goto out; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 311c5d601310..9e7815b4f058 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4934,7 +4934,7 @@ again: } else if (unlikely(softleaf_is_migration(softleaf))) { bool uffd_wp = pte_swp_uffd_wp(entry); - if (!is_readable_migration_entry(softleaf) && cow) { + if (!softleaf_is_migration_read(softleaf) && cow) { /* * COW mappings require pages in both * parent and child to be set to read. diff --git a/mm/ksm.c b/mm/ksm.c index f9a1a3658ead..cfc182255c7b 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -632,14 +632,14 @@ static int break_ksm_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long en if (pte_present(pte)) { folio = vm_normal_folio(walk->vma, addr, pte); } else if (!pte_none(pte)) { - swp_entry_t entry = pte_to_swp_entry(pte); + const softleaf_t entry = softleaf_from_pte(pte); /* * As KSM pages remain KSM pages until freed, no need to wait * here for migration to end. */ - if (is_migration_entry(entry)) - folio = pfn_swap_entry_folio(entry); + if (softleaf_is_migration(entry)) + folio = softleaf_to_folio(entry); } /* return 1 if the page is an normal ksm page or KSM-placed zero page */ found = (folio && folio_test_ksm(folio)) || diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1f7fb9bf287a..71652cfedcdf 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -693,10 +693,10 @@ static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift, if (pte_present(pte)) { pfn = pte_pfn(pte); } else { - swp_entry_t swp = pte_to_swp_entry(pte); + const softleaf_t entry = softleaf_from_pte(pte); - if (is_hwpoison_entry(swp)) - pfn = swp_offset_pfn(swp); + if (softleaf_is_hwpoison(entry)) + pfn = softleaf_to_pfn(entry); } if (!pfn || pfn != poisoned_pfn) diff --git a/mm/memory.c b/mm/memory.c index a3f001a47ecf..525da4479228 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -902,7 +902,8 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, static int try_restore_exclusive_pte(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t orig_pte) { - struct page *page = pfn_swap_entry_to_page(pte_to_swp_entry(orig_pte)); + const softleaf_t entry = softleaf_from_pte(orig_pte); + struct page *page = softleaf_to_page(entry); struct folio *folio = page_folio(page); if (folio_trylock(folio)) { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index dee95d5ecfd4..acb9bf89f619 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -705,7 +705,9 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, if (pte_none(ptent)) continue; if (!pte_present(ptent)) { - if (is_migration_entry(pte_to_swp_entry(ptent))) + const softleaf_t entry = softleaf_from_pte(ptent); + + if (softleaf_is_migration(entry)) qp->nr_failed++; continue; } diff --git a/mm/migrate.c b/mm/migrate.c index 5edfd0b2f63d..c39dfea1a925 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -483,7 +483,7 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, spinlock_t *ptl; pte_t *ptep; pte_t pte; - swp_entry_t entry; + softleaf_t entry; ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) @@ -495,8 +495,8 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, if (pte_none(pte) || pte_present(pte)) goto out; - entry = pte_to_swp_entry(pte); - if (!is_migration_entry(entry)) + entry = softleaf_from_pte(pte); + if (!softleaf_is_migration(entry)) goto out; migration_entry_wait_on_locked(entry, ptl); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 592b4561507c..b1ce6e3478d6 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -279,7 +279,7 @@ again: unsigned long mpfn = 0, pfn; struct folio *folio; struct page *page; - swp_entry_t entry; + softleaf_t entry; pte_t pte; pte = ptep_get(ptep); @@ -298,11 +298,11 @@ again: * page table entry. Other special swap entries are not * migratable, and we ignore regular swapped page. */ - entry = pte_to_swp_entry(pte); - if (!is_device_private_entry(entry)) + entry = softleaf_from_pte(pte); + if (!softleaf_is_device_private(entry)) goto next; - page = pfn_swap_entry_to_page(entry); + page = softleaf_to_page(entry); pgmap = page_pgmap(page); if (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_PRIVATE) || @@ -330,7 +330,7 @@ again: mpfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE; - if (is_writable_device_private_entry(entry)) + if (softleaf_is_device_private_write(entry)) mpfn |= MIGRATE_PFN_WRITE; } else { pfn = pte_pfn(pte); diff --git a/mm/mprotect.c b/mm/mprotect.c index f910cbf41442..283889e4f1ce 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -317,11 +317,11 @@ static long change_pte_range(struct mmu_gather *tlb, pages++; } } else { - swp_entry_t entry = pte_to_swp_entry(oldpte); + softleaf_t entry = softleaf_from_pte(oldpte); pte_t newpte; - if (is_writable_migration_entry(entry)) { - struct folio *folio = pfn_swap_entry_folio(entry); + if (softleaf_is_migration_write(entry)) { + const struct folio *folio = softleaf_to_folio(entry); /* * A protection check is difficult so @@ -335,7 +335,7 @@ static long change_pte_range(struct mmu_gather *tlb, newpte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(oldpte)) newpte = pte_swp_mksoft_dirty(newpte); - } else if (is_writable_device_private_entry(entry)) { + } else if (softleaf_is_device_private_write(entry)) { /* * We do not preserve soft-dirtiness. See * copy_nonpresent_pte() for explanation. diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 8137d2366722..b38a1d00c971 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -49,7 +49,7 @@ again: if (is_migration) return false; } else if (!is_migration) { - swp_entry_t entry; + softleaf_t entry; /* * Handle un-addressable ZONE_DEVICE memory. @@ -67,9 +67,9 @@ again: * For more details on device private memory see HMM * (include/linux/hmm.h or mm/hmm.c). */ - entry = pte_to_swp_entry(ptent); - if (!is_device_private_entry(entry) && - !is_device_exclusive_entry(entry)) + entry = softleaf_from_pte(ptent); + if (!softleaf_is_device_private(entry) && + !softleaf_is_device_exclusive(entry)) return false; } spin_lock(*ptlp); diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 378c774795fc..90cc346a6ecf 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -1007,11 +1007,10 @@ pte_table: goto found; } } else if (!pte_none(pte)) { - swp_entry_t entry = pte_to_swp_entry(pte); + const softleaf_t entry = softleaf_from_pte(pte); - if ((flags & FW_MIGRATION) && - is_migration_entry(entry)) { - page = pfn_swap_entry_to_page(entry); + if ((flags & FW_MIGRATION) && softleaf_is_migration(entry)) { + page = softleaf_to_page(entry); expose_page = false; goto found; } diff --git a/mm/rmap.c b/mm/rmap.c index 775710115a41..345466ad396b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1969,7 +1969,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, if (likely(pte_present(pteval))) { pfn = pte_pfn(pteval); } else { - pfn = swp_offset_pfn(pte_to_swp_entry(pteval)); + pfn = softleaf_to_pfn(pte_to_swp_entry(pteval)); VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); } @@ -2368,7 +2368,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, if (likely(pte_present(pteval))) { pfn = pte_pfn(pteval); } else { - pfn = swp_offset_pfn(pte_to_swp_entry(pteval)); + pfn = softleaf_to_pfn(pte_to_swp_entry(pteval)); VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); } @@ -2453,8 +2453,11 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, folio_mark_dirty(folio); writable = pte_write(pteval); } else { + const softleaf_t entry = softleaf_from_pte(pteval); + pte_clear(mm, address, pvmw.pte); - writable = is_writable_device_private_entry(pte_to_swp_entry(pteval)); + + writable = softleaf_is_device_private_write(entry); } VM_WARN_ON_FOLIO(writable && folio_test_anon(folio) && -- cgit v1.2.3 From a3a3e215c9c140c08760d4d96ba4e8bc485d0f14 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:34 +0000 Subject: mm: replace remaining pte_to_swp_entry() with softleaf_from_pte() There are straggler invocations of pte_to_swp_entry() lying around, replace all of these with the software leaf entry equivalent - softleaf_from_pte(). With those removed, eliminate pte_to_swp_entry() altogether. No functional change intended. Link: https://lkml.kernel.org/r/d8ee5ccefe4c42d7c4fe1a2e46f285ac40421cd3.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/leafops.h | 7 ++++++- include/linux/swapops.h | 13 ------------- mm/debug_vm_pgtable.c | 2 +- mm/internal.h | 7 +++++-- mm/memory-failure.c | 2 +- mm/memory.c | 16 ++++++++-------- mm/migrate.c | 2 +- mm/mincore.c | 4 +++- mm/rmap.c | 8 ++++++-- mm/swapfile.c | 13 +++++++++++-- 10 files changed, 42 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/leafops.h b/include/linux/leafops.h index d282fab866a1..cfafe7a5e7b1 100644 --- a/include/linux/leafops.h +++ b/include/linux/leafops.h @@ -54,11 +54,16 @@ static inline softleaf_t softleaf_mk_none(void) */ static inline softleaf_t softleaf_from_pte(pte_t pte) { + softleaf_t arch_entry; + if (pte_present(pte) || pte_none(pte)) return softleaf_mk_none(); + pte = pte_swp_clear_flags(pte); + arch_entry = __pte_to_swp_entry(pte); + /* Temporary until swp_entry_t eliminated. */ - return pte_to_swp_entry(pte); + return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); } /** diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 3d02b288c15e..8cfc966eae48 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -107,19 +107,6 @@ static inline pgoff_t swp_offset(swp_entry_t entry) return entry.val & SWP_OFFSET_MASK; } -/* - * Convert the arch-dependent pte representation of a swp_entry_t into an - * arch-independent swp_entry_t. - */ -static inline swp_entry_t pte_to_swp_entry(pte_t pte) -{ - swp_entry_t arch_entry; - - pte = pte_swp_clear_flags(pte); - arch_entry = __pte_to_swp_entry(pte); - return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); -} - /* * Convert the arch-independent representation of a swp_entry_t into the * arch-dependent pte representation. diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 64db85a80558..1eae87dbef73 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -1229,7 +1229,7 @@ static int __init init_args(struct pgtable_debug_args *args) init_fixed_pfns(args); /* See generic_max_swapfile_size(): probe the maximum offset */ - max_swap_offset = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL)))); + max_swap_offset = swp_offset(softleaf_from_pte(softleaf_to_pte(swp_entry(0, ~0UL)))); /* Create a swp entry with all possible bits set while still being swap. */ args->swp_entry = swp_entry(MAX_SWAPFILES - 1, max_swap_offset); /* Create a non-present migration entry. */ diff --git a/mm/internal.h b/mm/internal.h index 2ed041e6ebc3..929bc4a5dd98 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -334,7 +334,7 @@ unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte, */ static inline pte_t pte_move_swp_offset(pte_t pte, long delta) { - swp_entry_t entry = pte_to_swp_entry(pte); + const softleaf_t entry = softleaf_from_pte(pte); pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry), (swp_offset(entry) + delta))); @@ -389,11 +389,14 @@ static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte) cgroup_id = lookup_swap_cgroup_id(entry); while (ptep < end_ptep) { + softleaf_t entry; + pte = ptep_get(ptep); if (!pte_same(pte, expected_pte)) break; - if (lookup_swap_cgroup_id(pte_to_swp_entry(pte)) != cgroup_id) + entry = softleaf_from_pte(pte); + if (lookup_swap_cgroup_id(entry) != cgroup_id) break; expected_pte = pte_next_swp_offset(expected_pte); ptep++; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 71652cfedcdf..7f908ad795ad 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -51,7 +51,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/memory.c b/mm/memory.c index 525da4479228..50b93b45b174 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1218,7 +1218,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, spinlock_t *src_ptl, *dst_ptl; int progress, max_nr, ret = 0; int rss[NR_MM_COUNTERS]; - swp_entry_t entry = (swp_entry_t){0}; + softleaf_t entry = softleaf_mk_none(); struct folio *prealloc = NULL; int nr; @@ -1282,7 +1282,7 @@ again: dst_vma, src_vma, addr, rss); if (ret == -EIO) { - entry = pte_to_swp_entry(ptep_get(src_pte)); + entry = softleaf_from_pte(ptep_get(src_pte)); break; } else if (ret == -EBUSY) { break; @@ -4446,13 +4446,13 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio; - swp_entry_t entry; + softleaf_t entry; folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address); if (!folio) return NULL; - entry = pte_to_swp_entry(vmf->orig_pte); + entry = softleaf_from_pte(vmf->orig_pte); if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, GFP_KERNEL, entry)) { folio_put(folio); @@ -4470,7 +4470,7 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf) static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages) { unsigned long addr; - swp_entry_t entry; + softleaf_t entry; int idx; pte_t pte; @@ -4480,7 +4480,7 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages) if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx))) return false; - entry = pte_to_swp_entry(pte); + entry = softleaf_from_pte(pte); if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages) return false; @@ -4526,7 +4526,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) unsigned long orders; struct folio *folio; unsigned long addr; - swp_entry_t entry; + softleaf_t entry; spinlock_t *ptl; pte_t *pte; gfp_t gfp; @@ -4547,7 +4547,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) if (!zswap_never_enabled()) goto fallback; - entry = pte_to_swp_entry(vmf->orig_pte); + entry = softleaf_from_pte(vmf->orig_pte); /* * Get a list of all the (large) orders below PMD_ORDER that are enabled * and suitable for swapping THP. diff --git a/mm/migrate.c b/mm/migrate.c index c39dfea1a925..b2ad78bf85d5 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -534,7 +534,7 @@ void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, p * lock release in migration_entry_wait_on_locked(). */ hugetlb_vma_unlock_read(vma); - migration_entry_wait_on_locked(pte_to_swp_entry(pte), ptl); + migration_entry_wait_on_locked(entry, ptl); return; } diff --git a/mm/mincore.c b/mm/mincore.c index 9a908d8bb706..e5d13eea9234 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -202,7 +202,9 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, for (i = 0; i < step; i++) vec[i] = 1; } else { /* pte is a swap entry */ - *vec = mincore_swap(pte_to_swp_entry(pte), false); + const softleaf_t entry = softleaf_from_pte(pte); + + *vec = mincore_swap(entry, false); } vec += step; } diff --git a/mm/rmap.c b/mm/rmap.c index 345466ad396b..d871f2eb821c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1969,7 +1969,9 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, if (likely(pte_present(pteval))) { pfn = pte_pfn(pteval); } else { - pfn = softleaf_to_pfn(pte_to_swp_entry(pteval)); + const softleaf_t entry = softleaf_from_pte(pteval); + + pfn = softleaf_to_pfn(entry); VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); } @@ -2368,7 +2370,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, if (likely(pte_present(pteval))) { pfn = pte_pfn(pteval); } else { - pfn = softleaf_to_pfn(pte_to_swp_entry(pteval)); + const softleaf_t entry = softleaf_from_pte(pteval); + + pfn = softleaf_to_pfn(entry); VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); } diff --git a/mm/swapfile.c b/mm/swapfile.c index 8c7f14061f5b..94e0f0c54168 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3202,8 +3202,17 @@ static int claim_swapfile(struct swap_info_struct *si, struct inode *inode) */ unsigned long generic_max_swapfile_size(void) { - return swp_offset(pte_to_swp_entry( - swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; + swp_entry_t entry = swp_entry(0, ~0UL); + const pte_t pte = softleaf_to_pte(entry); + + /* + * Since the PTE can be an invalid softleaf entry (e.g. the none PTE), + * we need to do this manually. + */ + entry = __pte_to_swp_entry(pte); + entry = swp_entry(__swp_type(entry), __swp_offset(entry)); + + return swp_offset(entry) + 1; } /* Can be overridden by an architecture for additional checks. */ -- cgit v1.2.3 From ad7c7f4576a5977b4ec4ac5dd090ab3f81ca7c6f Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 10 Nov 2025 16:17:56 +0800 Subject: mm: thp: introduce folio_split_queue_lock and its variants In future memcg removal, the binding between a folio and a memcg may change, making the split lock within the memcg unstable when held. A new approach is required to reparent the split queue to its parent. This patch starts introducing a unified way to acquire the split lock for future work. It's a code-only refactoring with no functional changes. Link: https://lkml.kernel.org/r/a31a90bcac04dc754f775e87ae3205be3170b571.1762762324.git.zhengqi.arch@bytedance.com Signed-off-by: Muchun Song Signed-off-by: Qi Zheng Acked-by: Johannes Weiner Reviewed-by: Zi Yan Acked-by: Shakeel Butt Acked-by: David Hildenbrand Reviewed-by: Harry Yoo Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Hugh Dickins Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Muchun Song Cc: Nico Pache Cc: Roman Gushchin Cc: Ryan Roberts Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 10 ++++ mm/huge_memory.c | 119 ++++++++++++++++++++++++++++++++------------- 2 files changed, 94 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 966f7c1a0128..b0c6a4635c67 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1647,6 +1647,11 @@ int alloc_shrinker_info(struct mem_cgroup *memcg); void free_shrinker_info(struct mem_cgroup *memcg); void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); void reparent_shrinker_deferred(struct mem_cgroup *memcg); + +static inline int shrinker_id(struct shrinker *shrinker) +{ + return shrinker->id; +} #else #define mem_cgroup_sockets_enabled 0 @@ -1678,6 +1683,11 @@ static inline void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { } + +static inline int shrinker_id(struct shrinker *shrinker) +{ + return -1; +} #endif #ifdef CONFIG_MEMCG diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9758171d49c9..bfcb5d895f67 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1077,28 +1077,86 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) return pmd; } +static struct deferred_split *split_queue_node(int nid) +{ + struct pglist_data *pgdata = NODE_DATA(nid); + + return &pgdata->deferred_split_queue; +} + #ifdef CONFIG_MEMCG static inline -struct deferred_split *get_deferred_split_queue(struct folio *folio) +struct mem_cgroup *folio_split_queue_memcg(struct folio *folio, + struct deferred_split *queue) { - struct mem_cgroup *memcg = folio_memcg(folio); - struct pglist_data *pgdat = NODE_DATA(folio_nid(folio)); + if (mem_cgroup_disabled()) + return NULL; + if (split_queue_node(folio_nid(folio)) == queue) + return NULL; + return container_of(queue, struct mem_cgroup, deferred_split_queue); +} - if (memcg) - return &memcg->deferred_split_queue; - else - return &pgdat->deferred_split_queue; +static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg) +{ + return memcg ? &memcg->deferred_split_queue : split_queue_node(nid); } #else static inline -struct deferred_split *get_deferred_split_queue(struct folio *folio) +struct mem_cgroup *folio_split_queue_memcg(struct folio *folio, + struct deferred_split *queue) { - struct pglist_data *pgdat = NODE_DATA(folio_nid(folio)); + return NULL; +} - return &pgdat->deferred_split_queue; +static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg) +{ + return split_queue_node(nid); } #endif +static struct deferred_split *split_queue_lock(int nid, struct mem_cgroup *memcg) +{ + struct deferred_split *queue; + + queue = memcg_split_queue(nid, memcg); + spin_lock(&queue->split_queue_lock); + + return queue; +} + +static struct deferred_split * +split_queue_lock_irqsave(int nid, struct mem_cgroup *memcg, unsigned long *flags) +{ + struct deferred_split *queue; + + queue = memcg_split_queue(nid, memcg); + spin_lock_irqsave(&queue->split_queue_lock, *flags); + + return queue; +} + +static struct deferred_split *folio_split_queue_lock(struct folio *folio) +{ + return split_queue_lock(folio_nid(folio), folio_memcg(folio)); +} + +static struct deferred_split * +folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags) +{ + return split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags); +} + +static inline void split_queue_unlock(struct deferred_split *queue) +{ + spin_unlock(&queue->split_queue_lock); +} + +static inline void split_queue_unlock_irqrestore(struct deferred_split *queue, + unsigned long flags) +{ + spin_unlock_irqrestore(&queue->split_queue_lock, flags); +} + static inline bool is_transparent_hugepage(const struct folio *folio) { if (!folio_test_large(folio)) @@ -3690,7 +3748,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, struct page *split_at, struct page *lock_at, struct list_head *list, enum split_type split_type, bool unmapped) { - struct deferred_split *ds_queue = get_deferred_split_queue(folio); + struct deferred_split *ds_queue; XA_STATE(xas, &folio->mapping->i_pages, folio->index); struct folio *end_folio = folio_next(folio); bool is_anon = folio_test_anon(folio); @@ -3824,7 +3882,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, } /* Prevent deferred_split_scan() touching ->_refcount */ - spin_lock(&ds_queue->split_queue_lock); + ds_queue = folio_split_queue_lock(folio); if (folio_ref_freeze(folio, 1 + extra_pins)) { struct swap_cluster_info *ci = NULL; struct lruvec *lruvec; @@ -3846,7 +3904,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, */ list_del_init(&folio->_deferred_list); } - spin_unlock(&ds_queue->split_queue_lock); + split_queue_unlock(ds_queue); if (mapping) { int nr = folio_nr_pages(folio); @@ -3946,7 +4004,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, if (ci) swap_cluster_unlock(ci); } else { - spin_unlock(&ds_queue->split_queue_lock); + split_queue_unlock(ds_queue); ret = -EAGAIN; } fail: @@ -4129,8 +4187,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio) WARN_ON_ONCE(folio_ref_count(folio)); WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio)); - ds_queue = get_deferred_split_queue(folio); - spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + ds_queue = folio_split_queue_lock_irqsave(folio, &flags); if (!list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; if (folio_test_partially_mapped(folio)) { @@ -4141,7 +4198,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio) list_del_init(&folio->_deferred_list); unqueued = true; } - spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + split_queue_unlock_irqrestore(ds_queue, flags); return unqueued; /* useful for debug warnings */ } @@ -4149,10 +4206,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio) /* partially_mapped=false won't clear PG_partially_mapped folio flag */ void deferred_split_folio(struct folio *folio, bool partially_mapped) { - struct deferred_split *ds_queue = get_deferred_split_queue(folio); -#ifdef CONFIG_MEMCG - struct mem_cgroup *memcg = folio_memcg(folio); -#endif + struct deferred_split *ds_queue; unsigned long flags; /* @@ -4175,7 +4229,7 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped) if (folio_test_swapcache(folio)) return; - spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + ds_queue = folio_split_queue_lock_irqsave(folio, &flags); if (partially_mapped) { if (!folio_test_partially_mapped(folio)) { folio_set_partially_mapped(folio); @@ -4190,15 +4244,16 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped) VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio); } if (list_empty(&folio->_deferred_list)) { + struct mem_cgroup *memcg; + + memcg = folio_split_queue_memcg(folio, ds_queue); list_add_tail(&folio->_deferred_list, &ds_queue->split_queue); ds_queue->split_queue_len++; -#ifdef CONFIG_MEMCG if (memcg) set_shrinker_bit(memcg, folio_nid(folio), - deferred_split_shrinker->id); -#endif + shrinker_id(deferred_split_shrinker)); } - spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + split_queue_unlock_irqrestore(ds_queue, flags); } static unsigned long deferred_split_count(struct shrinker *shrink, @@ -4244,19 +4299,13 @@ static bool thp_underused(struct folio *folio) static unsigned long deferred_split_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct pglist_data *pgdata = NODE_DATA(sc->nid); - struct deferred_split *ds_queue = &pgdata->deferred_split_queue; + struct deferred_split *ds_queue; unsigned long flags; LIST_HEAD(list); struct folio *folio, *next, *prev = NULL; int split = 0, removed = 0; -#ifdef CONFIG_MEMCG - if (sc->memcg) - ds_queue = &sc->memcg->deferred_split_queue; -#endif - - spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags); /* Take pin on all head pages to avoid freeing them under us */ list_for_each_entry_safe(folio, next, &ds_queue->split_queue, _deferred_list) { @@ -4275,7 +4324,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, if (!--sc->nr_to_scan) break; } - spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + split_queue_unlock_irqrestore(ds_queue, flags); list_for_each_entry_safe(folio, next, &list, _deferred_list) { bool did_split = false; -- cgit v1.2.3 From 46156dba32cb68537d36877a97d672227f3e8134 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 10 Nov 2025 16:17:58 +0800 Subject: mm: thp: reparent the split queue during memcg offline Similar to list_lru, the split queue is relatively independent and does not need to be reparented along with objcg and LRU folios (holding objcg lock and lru lock). So let's apply the similar mechanism as list_lru to reparent the split queue separately when memcg is offine. This is also a preparation for reparenting LRU folios. Link: https://lkml.kernel.org/r/8703f907c4d1f7e8a2ef2bfed3036a84fa53028b.1762762324.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: Zi Yan Reviewed-by: Muchun Song Acked-by: David Hildenbrand Acked-by: Shakeel Butt Reviewed-by: Harry Yoo Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Hugh Dickins Cc: Johannes Weiner Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Muchun Song Cc: Nico Pache Cc: Roman Gushchin Cc: Ryan Roberts Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 4 ++++ include/linux/memcontrol.h | 11 +++++++++++ mm/huge_memory.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ mm/memcontrol.c | 1 + 4 files changed, 60 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 525624c285a6..e2e91aa1a042 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -415,6 +415,9 @@ static inline int split_huge_page(struct page *page) return split_huge_page_to_list_to_order(page, NULL, 0); } void deferred_split_folio(struct folio *folio, bool partially_mapped); +#ifdef CONFIG_MEMCG +void reparent_deferred_split_queue(struct mem_cgroup *memcg); +#endif void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze); @@ -647,6 +650,7 @@ static inline int try_folio_split_to_order(struct folio *folio, } static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {} +static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b0c6a4635c67..cc6db20d7dca 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1775,6 +1775,12 @@ static inline void count_objcg_events(struct obj_cgroup *objcg, bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid); void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg); + +static inline bool memcg_is_dying(struct mem_cgroup *memcg) +{ + return memcg ? css_is_dying(&memcg->css) : false; +} + #else static inline bool mem_cgroup_kmem_disabled(void) { @@ -1845,6 +1851,11 @@ static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid) static inline void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg) { } + +static inline bool memcg_is_dying(struct mem_cgroup *memcg) +{ + return false; +} #endif /* CONFIG_MEMCG */ #if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 13684e5376e8..d17d3810a882 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1118,8 +1118,19 @@ static struct deferred_split *split_queue_lock(int nid, struct mem_cgroup *memcg { struct deferred_split *queue; +retry: queue = memcg_split_queue(nid, memcg); spin_lock(&queue->split_queue_lock); + /* + * There is a period between setting memcg to dying and reparenting + * deferred split queue, and during this period the THPs in the deferred + * split queue will be hidden from the shrinker side. + */ + if (unlikely(memcg_is_dying(memcg))) { + spin_unlock(&queue->split_queue_lock); + memcg = parent_mem_cgroup(memcg); + goto retry; + } return queue; } @@ -1129,8 +1140,14 @@ split_queue_lock_irqsave(int nid, struct mem_cgroup *memcg, unsigned long *flags { struct deferred_split *queue; +retry: queue = memcg_split_queue(nid, memcg); spin_lock_irqsave(&queue->split_queue_lock, *flags); + if (unlikely(memcg_is_dying(memcg))) { + spin_unlock_irqrestore(&queue->split_queue_lock, *flags); + memcg = parent_mem_cgroup(memcg); + goto retry; + } return queue; } @@ -4389,6 +4406,33 @@ next: return split; } +#ifdef CONFIG_MEMCG +void reparent_deferred_split_queue(struct mem_cgroup *memcg) +{ + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + struct deferred_split *ds_queue = &memcg->deferred_split_queue; + struct deferred_split *parent_ds_queue = &parent->deferred_split_queue; + int nid; + + spin_lock_irq(&ds_queue->split_queue_lock); + spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING); + + if (!ds_queue->split_queue_len) + goto unlock; + + list_splice_tail_init(&ds_queue->split_queue, &parent_ds_queue->split_queue); + parent_ds_queue->split_queue_len += ds_queue->split_queue_len; + ds_queue->split_queue_len = 0; + + for_each_node(nid) + set_shrinker_bit(parent, nid, shrinker_id(deferred_split_shrinker)); + +unlock: + spin_unlock(&parent_ds_queue->split_queue_lock); + spin_unlock_irq(&ds_queue->split_queue_lock); +} +#endif + #ifdef CONFIG_DEBUG_FS static void split_huge_pages_all(void) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bfc986da3289..623446821b00 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3920,6 +3920,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) zswap_memcg_offline_cleanup(memcg); memcg_offline_kmem(memcg); + reparent_deferred_split_queue(memcg); reparent_shrinker_deferred(memcg); wb_memcg_offline(memcg); lru_gen_offline_memcg(memcg); -- cgit v1.2.3 From cab812d9c9642ec11b8961b7ea994f4bd0826159 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 14 Nov 2025 12:22:28 +1100 Subject: mm/huge_memory.c: introduce folio_split_unmapped MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unmapped was added as a parameter to __folio_split() and related call sites to support splitting of folios already in the midst of a migration. This special case arose for device private folio migration since during migration there could be a disconnect between source and destination on the folio size. Introduce folio_split_unmapped() to handle this special case. Also refactor code and add __folio_freeze_and_split_unmapped() helper that is common to both __folio_split() and folio_split_unmapped(). This in turn removes the special casing introduced by the unmapped parameter in __folio_split(). [balbirs@nvidia.com: v2] Link: https://lkml.kernel.org/r/20251115084041.3914728-1-balbirs@nvidia.com [balbirs@nvidia.com: fix clang-20 build] Link: https://lkml.kernel.org/r/20251120134232.3588203-1-balbirs@nvidia.com [akpm@linux-foundation.org: add `inline' to shmem_uncharge() stub, per Balbir] Link: https://lkml.kernel.org/r/20251114012228.2634882-1-balbirs@nvidia.com Signed-off-by: Balbir Singh Suggested-by: Zi Yan Acked-by: Zi Yan Cc: David Hildenbrand Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 5 +- include/linux/shmem_fs.h | 6 +- mm/huge_memory.c | 348 +++++++++++++++++++++++++++-------------------- mm/migrate_device.c | 3 +- 4 files changed, 211 insertions(+), 151 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index e2e91aa1a042..1d439de1ca2c 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -371,7 +371,8 @@ enum split_type { bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins); int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list, - unsigned int new_order, bool unmapped); + unsigned int new_order); +int folio_split_unmapped(struct folio *folio, unsigned int new_order); int min_order_for_split(struct folio *folio); int split_folio_to_list(struct folio *folio, struct list_head *list); bool folio_split_supported(struct folio *folio, unsigned int new_order, @@ -382,7 +383,7 @@ int folio_split(struct folio *folio, unsigned int new_order, struct page *page, static inline int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order) { - return __split_huge_page_to_list_to_order(page, list, new_order, false); + return __split_huge_page_to_list_to_order(page, list, new_order); } static inline int split_huge_page_to_order(struct page *page, unsigned int new_order) { diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 5b368f9549d6..d02270072a34 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -136,11 +136,16 @@ static inline bool shmem_hpage_pmd_enabled(void) #ifdef CONFIG_SHMEM extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); +extern void shmem_uncharge(struct inode *inode, long pages); #else static inline unsigned long shmem_swap_usage(struct vm_area_struct *vma) { return 0; } + +static inline void shmem_uncharge(struct inode *inode, long pages) +{ +} #endif extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); @@ -194,7 +199,6 @@ static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof) } extern bool shmem_charge(struct inode *inode, long pages); -extern void shmem_uncharge(struct inode *inode, long pages); #ifdef CONFIG_USERFAULTFD #ifdef CONFIG_SHMEM diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d17d3810a882..53a8d380eab2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3739,6 +3739,152 @@ bool folio_split_supported(struct folio *folio, unsigned int new_order, return true; } +static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int new_order, + struct page *split_at, struct xa_state *xas, + struct address_space *mapping, bool do_lru, + struct list_head *list, enum split_type split_type, + pgoff_t end, int *nr_shmem_dropped, int extra_pins) +{ + struct folio *end_folio = folio_next(folio); + struct folio *new_folio, *next; + int old_order = folio_order(folio); + int ret = 0; + struct deferred_split *ds_queue; + + VM_WARN_ON_ONCE(!mapping && end); + /* Prevent deferred_split_scan() touching ->_refcount */ + ds_queue = folio_split_queue_lock(folio); + if (folio_ref_freeze(folio, 1 + extra_pins)) { + struct swap_cluster_info *ci = NULL; + struct lruvec *lruvec; + int expected_refs; + + if (old_order > 1) { + if (!list_empty(&folio->_deferred_list)) { + ds_queue->split_queue_len--; + /* + * Reinitialize page_deferred_list after removing the + * page from the split_queue, otherwise a subsequent + * split will see list corruption when checking the + * page_deferred_list. + */ + list_del_init(&folio->_deferred_list); + } + if (folio_test_partially_mapped(folio)) { + folio_clear_partially_mapped(folio); + mod_mthp_stat(old_order, + MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); + } + } + split_queue_unlock(ds_queue); + if (mapping) { + int nr = folio_nr_pages(folio); + + if (folio_test_pmd_mappable(folio) && + new_order < HPAGE_PMD_ORDER) { + if (folio_test_swapbacked(folio)) { + __lruvec_stat_mod_folio(folio, + NR_SHMEM_THPS, -nr); + } else { + __lruvec_stat_mod_folio(folio, + NR_FILE_THPS, -nr); + filemap_nr_thps_dec(mapping); + } + } + } + + if (folio_test_swapcache(folio)) { + if (mapping) { + VM_WARN_ON_ONCE_FOLIO(mapping, folio); + return -EINVAL; + } + + ci = swap_cluster_get_and_lock(folio); + } + + /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ + if (do_lru) + lruvec = folio_lruvec_lock(folio); + + ret = __split_unmapped_folio(folio, new_order, split_at, xas, + mapping, split_type); + + /* + * Unfreeze after-split folios and put them back to the right + * list. @folio should be kept frozon until page cache + * entries are updated with all the other after-split folios + * to prevent others seeing stale page cache entries. + * As a result, new_folio starts from the next folio of + * @folio. + */ + for (new_folio = folio_next(folio); new_folio != end_folio; + new_folio = next) { + unsigned long nr_pages = folio_nr_pages(new_folio); + + next = folio_next(new_folio); + + zone_device_private_split_cb(folio, new_folio); + + expected_refs = folio_expected_ref_count(new_folio) + 1; + folio_ref_unfreeze(new_folio, expected_refs); + + if (do_lru) + lru_add_split_folio(folio, new_folio, lruvec, list); + + /* + * Anonymous folio with swap cache. + * NOTE: shmem in swap cache is not supported yet. + */ + if (ci) { + __swap_cache_replace_folio(ci, folio, new_folio); + continue; + } + + /* Anonymous folio without swap cache */ + if (!mapping) + continue; + + /* Add the new folio to the page cache. */ + if (new_folio->index < end) { + __xa_store(&mapping->i_pages, new_folio->index, + new_folio, 0); + continue; + } + + VM_WARN_ON_ONCE(!nr_shmem_dropped); + /* Drop folio beyond EOF: ->index >= end */ + if (shmem_mapping(mapping) && nr_shmem_dropped) + *nr_shmem_dropped += nr_pages; + else if (folio_test_clear_dirty(new_folio)) + folio_account_cleaned( + new_folio, inode_to_wb(mapping->host)); + __filemap_remove_folio(new_folio, NULL); + folio_put_refs(new_folio, nr_pages); + } + + zone_device_private_split_cb(folio, NULL); + /* + * Unfreeze @folio only after all page cache entries, which + * used to point to it, have been updated with new folios. + * Otherwise, a parallel folio_try_get() can grab @folio + * and its caller can see stale page cache entries. + */ + expected_refs = folio_expected_ref_count(folio) + 1; + folio_ref_unfreeze(folio, expected_refs); + + if (do_lru) + unlock_page_lruvec(lruvec); + + if (ci) + swap_cluster_unlock(ci); + } else { + split_queue_unlock(ds_queue); + return -EAGAIN; + } + + return ret; +} + /** * __folio_split() - split a folio at @split_at to a @new_order folio * @folio: folio to split @@ -3747,7 +3893,6 @@ bool folio_split_supported(struct folio *folio, unsigned int new_order, * @lock_at: a page within @folio to be left locked to caller * @list: after-split folios will be put on it if non NULL * @split_type: perform uniform split or not (non-uniform split) - * @unmapped: The pages are already unmapped, they are migration entries. * * It calls __split_unmapped_folio() to perform uniform and non-uniform split. * It is in charge of checking whether the split is supported or not and @@ -3763,9 +3908,8 @@ bool folio_split_supported(struct folio *folio, unsigned int new_order, */ static int __folio_split(struct folio *folio, unsigned int new_order, struct page *split_at, struct page *lock_at, - struct list_head *list, enum split_type split_type, bool unmapped) + struct list_head *list, enum split_type split_type) { - struct deferred_split *ds_queue; XA_STATE(xas, &folio->mapping->i_pages, folio->index); struct folio *end_folio = folio_next(folio); bool is_anon = folio_test_anon(folio); @@ -3776,7 +3920,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, int nr_shmem_dropped = 0; int remap_flags = 0; int extra_pins, ret; - pgoff_t end; + pgoff_t end = 0; bool is_hzp; VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); @@ -3819,14 +3963,12 @@ static int __folio_split(struct folio *folio, unsigned int new_order, * is taken to serialise against parallel split or collapse * operations. */ - if (!unmapped) { - anon_vma = folio_get_anon_vma(folio); - if (!anon_vma) { - ret = -EBUSY; - goto out; - } - anon_vma_lock_write(anon_vma); + anon_vma = folio_get_anon_vma(folio); + if (!anon_vma) { + ret = -EBUSY; + goto out; } + anon_vma_lock_write(anon_vma); mapping = NULL; } else { unsigned int min_order; @@ -3880,8 +4022,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, goto out_unlock; } - if (!unmapped) - unmap_folio(folio); + unmap_folio(folio); /* block interrupt reentry in xa_lock and spinlock */ local_irq_disable(); @@ -3898,142 +4039,15 @@ static int __folio_split(struct folio *folio, unsigned int new_order, } } - /* Prevent deferred_split_scan() touching ->_refcount */ - ds_queue = folio_split_queue_lock(folio); - if (folio_ref_freeze(folio, 1 + extra_pins)) { - struct swap_cluster_info *ci = NULL; - struct lruvec *lruvec; - int expected_refs; - - if (old_order > 1) { - if (!list_empty(&folio->_deferred_list)) { - ds_queue->split_queue_len--; - /* - * Reinitialize page_deferred_list after removing the - * page from the split_queue, otherwise a subsequent - * split will see list corruption when checking the - * page_deferred_list. - */ - list_del_init(&folio->_deferred_list); - } - if (folio_test_partially_mapped(folio)) { - folio_clear_partially_mapped(folio); - mod_mthp_stat(old_order, - MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); - } - } - split_queue_unlock(ds_queue); - if (mapping) { - int nr = folio_nr_pages(folio); - - if (folio_test_pmd_mappable(folio) && - new_order < HPAGE_PMD_ORDER) { - if (folio_test_swapbacked(folio)) { - __lruvec_stat_mod_folio(folio, - NR_SHMEM_THPS, -nr); - } else { - __lruvec_stat_mod_folio(folio, - NR_FILE_THPS, -nr); - filemap_nr_thps_dec(mapping); - } - } - } - - if (folio_test_swapcache(folio)) { - if (mapping) { - VM_WARN_ON_ONCE_FOLIO(mapping, folio); - ret = -EINVAL; - goto fail; - } - - ci = swap_cluster_get_and_lock(folio); - } - - /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ - lruvec = folio_lruvec_lock(folio); - - ret = __split_unmapped_folio(folio, new_order, split_at, &xas, - mapping, split_type); - - /* - * Unfreeze after-split folios and put them back to the right - * list. @folio should be kept frozon until page cache - * entries are updated with all the other after-split folios - * to prevent others seeing stale page cache entries. - * As a result, new_folio starts from the next folio of - * @folio. - */ - for (new_folio = folio_next(folio); new_folio != end_folio; - new_folio = next) { - unsigned long nr_pages = folio_nr_pages(new_folio); - - next = folio_next(new_folio); - - zone_device_private_split_cb(folio, new_folio); - - expected_refs = folio_expected_ref_count(new_folio) + 1; - folio_ref_unfreeze(new_folio, expected_refs); - - if (!unmapped) - lru_add_split_folio(folio, new_folio, lruvec, list); - - /* - * Anonymous folio with swap cache. - * NOTE: shmem in swap cache is not supported yet. - */ - if (ci) { - __swap_cache_replace_folio(ci, folio, new_folio); - continue; - } - - /* Anonymous folio without swap cache */ - if (!mapping) - continue; - - /* Add the new folio to the page cache. */ - if (new_folio->index < end) { - __xa_store(&mapping->i_pages, new_folio->index, - new_folio, 0); - continue; - } - - /* Drop folio beyond EOF: ->index >= end */ - if (shmem_mapping(mapping)) - nr_shmem_dropped += nr_pages; - else if (folio_test_clear_dirty(new_folio)) - folio_account_cleaned( - new_folio, inode_to_wb(mapping->host)); - __filemap_remove_folio(new_folio, NULL); - folio_put_refs(new_folio, nr_pages); - } - - zone_device_private_split_cb(folio, NULL); - /* - * Unfreeze @folio only after all page cache entries, which - * used to point to it, have been updated with new folios. - * Otherwise, a parallel folio_try_get() can grab @folio - * and its caller can see stale page cache entries. - */ - expected_refs = folio_expected_ref_count(folio) + 1; - folio_ref_unfreeze(folio, expected_refs); - - unlock_page_lruvec(lruvec); - - if (ci) - swap_cluster_unlock(ci); - } else { - split_queue_unlock(ds_queue); - ret = -EAGAIN; - } + ret = __folio_freeze_and_split_unmapped(folio, new_order, split_at, &xas, mapping, + true, list, split_type, end, &nr_shmem_dropped, + extra_pins); fail: if (mapping) xas_unlock(&xas); local_irq_enable(); - if (unmapped) - return ret; - if (nr_shmem_dropped) shmem_uncharge(mapping->host, nr_shmem_dropped); @@ -4077,6 +4091,48 @@ out: return ret; } +/** + * folio_split_unmapped() - split a large anon folio that is already unmapped + * @folio: folio to split + * @new_order: the order of folios after split + * + * This function is a helper for splitting folios that have already been + * unmapped. The use case is that the device or the CPU can refuse to migrate + * THP pages in the middle of migration, due to allocation issues on either + * side. + * + * anon_vma_lock is not required to be held, mmap_read_lock() or + * mmap_write_lock() should be held. @folio is expected to be locked by the + * caller. device-private and non device-private folios are supported along + * with folios that are in the swapcache. @folio should also be unmapped and + * isolated from LRU (if applicable) + * + * Upon return, the folio is not remapped, split folios are not added to LRU, + * free_folio_and_swap_cache() is not called, and new folios remain locked. + * + * Return: 0 on success, -EAGAIN if the folio cannot be split (e.g., due to + * insufficient reference count or extra pins). + */ +int folio_split_unmapped(struct folio *folio, unsigned int new_order) +{ + int extra_pins, ret = 0; + + VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_anon(folio), folio); + + if (!can_split_folio(folio, 1, &extra_pins)) + return -EAGAIN; + + local_irq_disable(); + ret = __folio_freeze_and_split_unmapped(folio, new_order, &folio->page, NULL, + NULL, false, NULL, SPLIT_TYPE_UNIFORM, + 0, NULL, extra_pins); + local_irq_enable(); + return ret; +} + /* * This function splits a large folio into smaller folios of order @new_order. * @page can point to any page of the large folio to split. The split operation @@ -4125,12 +4181,12 @@ out: * with the folio. Splitting to order 0 is compatible with all folios. */ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list, - unsigned int new_order, bool unmapped) + unsigned int new_order) { struct folio *folio = page_folio(page); return __folio_split(folio, new_order, &folio->page, page, list, - SPLIT_TYPE_UNIFORM, unmapped); + SPLIT_TYPE_UNIFORM); } /** @@ -4161,7 +4217,7 @@ int folio_split(struct folio *folio, unsigned int new_order, struct page *split_at, struct list_head *list) { return __folio_split(folio, new_order, split_at, &folio->page, list, - SPLIT_TYPE_NON_UNIFORM, false); + SPLIT_TYPE_NON_UNIFORM); } int min_order_for_split(struct folio *folio) diff --git a/mm/migrate_device.c b/mm/migrate_device.c index b1ce6e3478d6..23379663b1e1 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -916,8 +916,7 @@ static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate, folio_get(folio); split_huge_pmd_address(migrate->vma, addr, true); - ret = __split_huge_page_to_list_to_order(folio_page(folio, 0), NULL, - 0, true); + ret = folio_split_unmapped(folio, 0); if (ret) return ret; migrate->src[idx] &= ~MIGRATE_PFN_COMPOUND; -- cgit v1.2.3 From 7e44d00a13ca5691caf4f7c46541ee60bf75b208 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 10 Nov 2025 15:20:05 -0800 Subject: memcg: use mod_node_page_state to update stats Patch series "memcg: cleanup the memcg stats interfaces". The memcg stats are safe against irq (and nmi) context and thus does not require disabling irqs. However for some stats which are also maintained at node level, it is using irq unsafe interface and thus requiring the users to still disables irqs or use interfaces which explicitly disables irqs. Let's move memcg code to use irq safe node level stats function which is already optimized for architectures with HAVE_CMPXCHG_LOCAL (all major ones), so there will not be any performance penalty for its usage. This patch (of 4): The memcg stats are safe against irq (and nmi) context and thus does not require disabling irqs. However some code paths for memcg stats also update the node level stats and use irq unsafe interface and thus require the users to disable irqs. However node level stats, on architectures with HAVE_CMPXCHG_LOCAL (all major ones), has interface which does not require irq disabling. Let's move memcg stats code to start using that interface for node level stats. Link: https://lkml.kernel.org/r/20251110232008.1352063-1-shakeel.butt@linux.dev Link: https://lkml.kernel.org/r/20251110232008.1352063-2-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Reviewed-by: Harry Yoo Acked-by: Roman Gushchin Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Qi Zheng Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 +- include/linux/vmstat.h | 4 ++-- mm/memcontrol.c | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index cc6db20d7dca..1085d0460e66 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1408,7 +1408,7 @@ static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, { struct page *page = virt_to_head_page(p); - __mod_node_page_state(page_pgdat(page), idx, val); + mod_node_page_state(page_pgdat(page), idx, val); } static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index c287998908bf..11a37aaa4dd9 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -557,7 +557,7 @@ static inline void mod_lruvec_page_state(struct page *page, static inline void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { - __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); + mod_node_page_state(lruvec_pgdat(lruvec), idx, val); } static inline void mod_lruvec_state(struct lruvec *lruvec, @@ -569,7 +569,7 @@ static inline void mod_lruvec_state(struct lruvec *lruvec, static inline void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { - __mod_node_page_state(folio_pgdat(folio), idx, val); + mod_node_page_state(folio_pgdat(folio), idx, val); } static inline void lruvec_stat_mod_folio(struct folio *folio, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 623446821b00..7e6407b8bfb7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -770,7 +770,7 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { /* Update node */ - __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); + mod_node_page_state(lruvec_pgdat(lruvec), idx, val); /* Update memcg and lruvec */ if (!mem_cgroup_disabled()) @@ -789,7 +789,7 @@ void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, /* Untracked pages have no memcg, no lruvec. Update only the node */ if (!memcg) { rcu_read_unlock(); - __mod_node_page_state(pgdat, idx, val); + mod_node_page_state(pgdat, idx, val); return; } @@ -815,7 +815,7 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) * vmstats to keep it correct for the root memcg. */ if (!memcg) { - __mod_node_page_state(pgdat, idx, val); + mod_node_page_state(pgdat, idx, val); } else { lruvec = mem_cgroup_lruvec(memcg, pgdat); __mod_lruvec_state(lruvec, idx, val); -- cgit v1.2.3 From 469241fe7657dbec9e2948287ab7412955d8b73a Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 10 Nov 2025 15:20:06 -0800 Subject: memcg: remove __mod_lruvec_kmem_state __mod_lruvec_kmem_state() is already safe against irqs, so there is no need to have a separate interface (i.e. mod_lruvec_kmem_state) which wraps calls to it with irq disabling and reenabling. Let's rename __mod_lruvec_kmem_state() to mod_lruvec_kmem_state(). Link: https://lkml.kernel.org/r/20251110232008.1352063-3-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Reviewed-by: Harry Yoo Reviewed-by: Qi Zheng Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 28 +++++----------------------- mm/memcontrol.c | 2 +- mm/workingset.c | 2 +- 3 files changed, 7 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1085d0460e66..d35390f9892a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -957,17 +957,7 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec, void mem_cgroup_flush_stats(struct mem_cgroup *memcg); void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg); -void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val); - -static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, - int val) -{ - unsigned long flags; - - local_irq_save(flags); - __mod_lruvec_kmem_state(p, idx, val); - local_irq_restore(flags); -} +void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val); void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count); @@ -1403,14 +1393,6 @@ static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg) { } -static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, - int val) -{ - struct page *page = virt_to_head_page(p); - - mod_node_page_state(page_pgdat(page), idx, val); -} - static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { @@ -1470,14 +1452,14 @@ struct slabobj_ext { #endif } __aligned(8); -static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) +static inline void inc_lruvec_kmem_state(void *p, enum node_stat_item idx) { - __mod_lruvec_kmem_state(p, idx, 1); + mod_lruvec_kmem_state(p, idx, 1); } -static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx) +static inline void dec_lruvec_kmem_state(void *p, enum node_stat_item idx) { - __mod_lruvec_kmem_state(p, idx, -1); + mod_lruvec_kmem_state(p, idx, -1); } static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7e6407b8bfb7..ae154f51931e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -799,7 +799,7 @@ void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, } EXPORT_SYMBOL(__lruvec_stat_mod_folio); -void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) +void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { pg_data_t *pgdat = page_pgdat(virt_to_page(p)); struct mem_cgroup *memcg; diff --git a/mm/workingset.c b/mm/workingset.c index 68a76a91111f..6ff30369b758 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -749,7 +749,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, if (WARN_ON_ONCE(node->count != node->nr_values)) goto out_invalid; xa_delete_node(node, workingset_update_node); - __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM); + inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM); out_invalid: xa_unlock_irq(&mapping->i_pages); -- cgit v1.2.3 From 5b3eb779a20cf30d74bb346d2a1e525bc9072685 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 10 Nov 2025 15:20:07 -0800 Subject: memcg: remove __mod_lruvec_state __mod_lruvec_state() is already safe against irqs, so there is no need to have a separate interface (i.e. mod_lruvec_state) which wraps calls to it with irq disabling and reenabling. Let's rename __mod_lruvec_state() to mod_lruvec_state(). Link: https://lkml.kernel.org/r/20251110232008.1352063-4-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Reviewed-by: Harry Yoo Acked-by: Roman Gushchin Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Qi Zheng Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 2 +- include/linux/vmstat.h | 18 +----------------- mm/memcontrol.c | 8 ++++---- mm/migrate.c | 20 ++++++++++---------- mm/vmscan.c | 4 ++-- 5 files changed, 18 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index ca7a18351797..b58f34c4fe92 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -44,7 +44,7 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec, lockdep_assert_held(&lruvec->lru_lock); WARN_ON_ONCE(nr_pages != (int)nr_pages); - __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); + mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); __mod_zone_page_state(&pgdat->node_zones[zid], NR_ZONE_LRU_BASE + lru, nr_pages); } diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 11a37aaa4dd9..4eb7753e6e5c 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -520,19 +520,9 @@ static inline const char *vm_event_name(enum vm_event_item item) #ifdef CONFIG_MEMCG -void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, +void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); -static inline void mod_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx, int val) -{ - unsigned long flags; - - local_irq_save(flags); - __mod_lruvec_state(lruvec, idx, val); - local_irq_restore(flags); -} - void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val); @@ -554,12 +544,6 @@ static inline void mod_lruvec_page_state(struct page *page, #else -static inline void __mod_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx, int val) -{ - mod_node_page_state(lruvec_pgdat(lruvec), idx, val); -} - static inline void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ae154f51931e..9a659f16af77 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -757,7 +757,7 @@ static void mod_memcg_lruvec_state(struct lruvec *lruvec, } /** - * __mod_lruvec_state - update lruvec memory statistics + * mod_lruvec_state - update lruvec memory statistics * @lruvec: the lruvec * @idx: the stat item * @val: delta to add to the counter, can be negative @@ -766,7 +766,7 @@ static void mod_memcg_lruvec_state(struct lruvec *lruvec, * function updates the all three counters that are affected by a * change of state at this level: per-node, per-cgroup, per-lruvec. */ -void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, +void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { /* Update node */ @@ -794,7 +794,7 @@ void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, } lruvec = mem_cgroup_lruvec(memcg, pgdat); - __mod_lruvec_state(lruvec, idx, val); + mod_lruvec_state(lruvec, idx, val); rcu_read_unlock(); } EXPORT_SYMBOL(__lruvec_stat_mod_folio); @@ -818,7 +818,7 @@ void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) mod_node_page_state(pgdat, idx, val); } else { lruvec = mem_cgroup_lruvec(memcg, pgdat); - __mod_lruvec_state(lruvec, idx, val); + mod_lruvec_state(lruvec, idx, val); } rcu_read_unlock(); } diff --git a/mm/migrate.c b/mm/migrate.c index b2ad78bf85d5..5169f9717f60 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -675,27 +675,27 @@ static int __folio_migrate_mapping(struct address_space *mapping, old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat); new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat); - __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr); - __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr); + mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr); + mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr); if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) { - __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr); - __mod_lruvec_state(new_lruvec, NR_SHMEM, nr); + mod_lruvec_state(old_lruvec, NR_SHMEM, -nr); + mod_lruvec_state(new_lruvec, NR_SHMEM, nr); if (folio_test_pmd_mappable(folio)) { - __mod_lruvec_state(old_lruvec, NR_SHMEM_THPS, -nr); - __mod_lruvec_state(new_lruvec, NR_SHMEM_THPS, nr); + mod_lruvec_state(old_lruvec, NR_SHMEM_THPS, -nr); + mod_lruvec_state(new_lruvec, NR_SHMEM_THPS, nr); } } #ifdef CONFIG_SWAP if (folio_test_swapcache(folio)) { - __mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr); - __mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr); + mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr); + mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr); } #endif if (dirty && mapping_can_writeback(mapping)) { - __mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr); + mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr); __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr); - __mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr); + mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr); __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr); } } diff --git a/mm/vmscan.c b/mm/vmscan.c index 51ffd32e6e01..720772baf2a7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2018,7 +2018,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, spin_lock_irq(&lruvec->lru_lock); move_folios_to_lru(lruvec, &folio_list); - __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), + mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), stat.nr_demoted); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); item = PGSTEAL_KSWAPD + reclaimer_offset(sc); @@ -4744,7 +4744,7 @@ retry: reset_batch_size(walk); } - __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), + mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), stat.nr_demoted); item = PGSTEAL_KSWAPD + reclaimer_offset(sc); -- cgit v1.2.3 From c1bd09994c4d5b897571671bed16581335e93242 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 10 Nov 2025 15:20:08 -0800 Subject: memcg: remove __lruvec_stat_mod_folio __lruvec_stat_mod_folio() is already safe against irqs, so there is no need to have a separate interface (i.e. lruvec_stat_mod_folio) which wraps calls to it with irq disabling and reenabling. Let's rename __lruvec_stat_mod_folio() to lruvec_stat_mod_folio(). Link: https://lkml.kernel.org/r/20251110232008.1352063-5-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Reviewed-by: Harry Yoo Acked-by: Roman Gushchin Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Qi Zheng Signed-off-by: Andrew Morton --- include/linux/vmstat.h | 30 +----------------------------- mm/filemap.c | 20 ++++++++++---------- mm/huge_memory.c | 4 ++-- mm/khugepaged.c | 8 ++++---- mm/memcontrol.c | 4 ++-- mm/page-writeback.c | 2 +- mm/rmap.c | 4 ++-- mm/shmem.c | 6 +++--- 8 files changed, 25 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 4eb7753e6e5c..3398a345bda8 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -523,19 +523,9 @@ static inline const char *vm_event_name(enum vm_event_item item) void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); -void __lruvec_stat_mod_folio(struct folio *folio, +void lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val); -static inline void lruvec_stat_mod_folio(struct folio *folio, - enum node_stat_item idx, int val) -{ - unsigned long flags; - - local_irq_save(flags); - __lruvec_stat_mod_folio(folio, idx, val); - local_irq_restore(flags); -} - static inline void mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { @@ -550,12 +540,6 @@ static inline void mod_lruvec_state(struct lruvec *lruvec, mod_node_page_state(lruvec_pgdat(lruvec), idx, val); } -static inline void __lruvec_stat_mod_folio(struct folio *folio, - enum node_stat_item idx, int val) -{ - mod_node_page_state(folio_pgdat(folio), idx, val); -} - static inline void lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { @@ -570,18 +554,6 @@ static inline void mod_lruvec_page_state(struct page *page, #endif /* CONFIG_MEMCG */ -static inline void __lruvec_stat_add_folio(struct folio *folio, - enum node_stat_item idx) -{ - __lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio)); -} - -static inline void __lruvec_stat_sub_folio(struct folio *folio, - enum node_stat_item idx) -{ - __lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); -} - static inline void lruvec_stat_add_folio(struct folio *folio, enum node_stat_item idx) { diff --git a/mm/filemap.c b/mm/filemap.c index 07634b7d9934..7d15a9c216ef 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -182,13 +182,13 @@ static void filemap_unaccount_folio(struct address_space *mapping, nr = folio_nr_pages(folio); - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); + lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); if (folio_test_swapbacked(folio)) { - __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); + lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); if (folio_test_pmd_mappable(folio)) - __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); + lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); } else if (folio_test_pmd_mappable(folio)) { - __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); + lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } if (test_bit(AS_KERNEL_FILE, &folio->mapping->flags)) @@ -844,13 +844,13 @@ void replace_page_cache_folio(struct folio *old, struct folio *new) old->mapping = NULL; /* hugetlb pages do not participate in page cache accounting. */ if (!folio_test_hugetlb(old)) - __lruvec_stat_sub_folio(old, NR_FILE_PAGES); + lruvec_stat_sub_folio(old, NR_FILE_PAGES); if (!folio_test_hugetlb(new)) - __lruvec_stat_add_folio(new, NR_FILE_PAGES); + lruvec_stat_add_folio(new, NR_FILE_PAGES); if (folio_test_swapbacked(old)) - __lruvec_stat_sub_folio(old, NR_SHMEM); + lruvec_stat_sub_folio(old, NR_SHMEM); if (folio_test_swapbacked(new)) - __lruvec_stat_add_folio(new, NR_SHMEM); + lruvec_stat_add_folio(new, NR_SHMEM); xas_unlock_irq(&xas); if (free_folio) free_folio(old); @@ -933,9 +933,9 @@ noinline int __filemap_add_folio(struct address_space *mapping, /* hugetlb pages do not participate in page cache accounting */ if (!huge) { - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); + lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); if (folio_test_pmd_mappable(folio)) - __lruvec_stat_mod_folio(folio, + lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 53a8d380eab2..7af3e037d891 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3783,10 +3783,10 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n if (folio_test_pmd_mappable(folio) && new_order < HPAGE_PMD_ORDER) { if (folio_test_swapbacked(folio)) { - __lruvec_stat_mod_folio(folio, + lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); } else { - __lruvec_stat_mod_folio(folio, + lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 40f9d5939aa5..89c33ef7aac3 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2195,14 +2195,14 @@ immap_locked: } if (is_shmem) - __lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR); + lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR); else - __lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR); + lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR); if (nr_none) { - __lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none); + lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none); /* nr_none is always 0 for non-shmem. */ - __lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none); + lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none); } /* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9a659f16af77..9b07db2cb232 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -777,7 +777,7 @@ void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, mod_memcg_lruvec_state(lruvec, idx, val); } -void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, +void lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { struct mem_cgroup *memcg; @@ -797,7 +797,7 @@ void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, mod_lruvec_state(lruvec, idx, val); rcu_read_unlock(); } -EXPORT_SYMBOL(__lruvec_stat_mod_folio); +EXPORT_SYMBOL(lruvec_stat_mod_folio); void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 757bc4d3b5b5..d6b339cc876d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2658,7 +2658,7 @@ static void folio_account_dirtied(struct folio *folio, inode_attach_wb(inode, folio); wb = inode_to_wb(inode); - __lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr); + lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr); __zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); __node_stat_mod_folio(folio, NR_DIRTIED, nr); wb_stat_mod(wb, WB_RECLAIMABLE, nr); diff --git a/mm/rmap.c b/mm/rmap.c index d871f2eb821c..f955f02d570e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1212,12 +1212,12 @@ static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped) if (nr) { idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED; - __lruvec_stat_mod_folio(folio, idx, nr); + lruvec_stat_mod_folio(folio, idx, nr); } if (nr_pmdmapped) { if (folio_test_anon(folio)) { idx = NR_ANON_THPS; - __lruvec_stat_mod_folio(folio, idx, nr_pmdmapped); + lruvec_stat_mod_folio(folio, idx, nr_pmdmapped); } else { /* NR_*_PMDMAPPED are not maintained per-memcg */ idx = folio_test_swapbacked(folio) ? diff --git a/mm/shmem.c b/mm/shmem.c index fc835b3e4914..ad18172ff831 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -871,9 +871,9 @@ static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index static void shmem_update_stats(struct folio *folio, int nr_pages) { if (folio_test_pmd_mappable(folio)) - __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages); - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); - __lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages); + lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages); + lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); + lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages); } /* -- cgit v1.2.3 From 277a1ae3879a82a15a2e2d6741e38e31ea6487ee Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Thu, 13 Nov 2025 15:28:01 +0800 Subject: mm: softdirty: add pgtable_supports_soft_dirty() Patch series "mm: Add soft-dirty and uffd-wp support for RISC-V", v15. This patchset adds support for Svrsw60t59b [1] extension which is ratified now, also add soft dirty and userfaultfd write protect tracking for RISC-V. The patches 1 and 2 add macros to allow architectures to define their own checks if the soft-dirty / uffd_wp PTE bits are available, in other words for RISC-V, the Svrsw60t59b extension is supported on which device the kernel is running. Also patch1-2 are removing "ifdef CONFIG_MEM_SOFT_DIRTY" "ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP" and "ifdef CONFIG_PTE_MARKER_UFFD_WP" in favor of checks which if not overridden by the architecture, no change in behavior is expected. This patchset has been tested with kselftest mm suite in which soft-dirty, madv_populate, test_unmerge_uffd_wp, and uffd-unit-tests run and pass, and no regressions are observed in any of the other tests. This patch (of 6): Some platforms can customize the PTE PMD entry soft-dirty bit making it unavailable even if the architecture provides the resource. Add an API which architectures can define their specific implementations to detect if soft-dirty bit is available on which device the kernel is running. This patch is removing "ifdef CONFIG_MEM_SOFT_DIRTY" in favor of pgtable_supports_soft_dirty() checks that defaults to IS_ENABLED(CONFIG_MEM_SOFT_DIRTY), if not overridden by the architecture, no change in behavior is expected. We make sure to never set VM_SOFTDIRTY if !pgtable_supports_soft_dirty(), so we will never run into VM_SOFTDIRTY checks. [lorenzo.stoakes@oracle.com: fix VMA selftests] Link: https://lkml.kernel.org/r/dac6ddfe-773a-43d5-8f69-021b9ca4d24b@lucifer.local Link: https://lkml.kernel.org/r/20251113072806.795029-1-zhangchunyan@iscas.ac.cn Link: https://lkml.kernel.org/r/20251113072806.795029-2-zhangchunyan@iscas.ac.cn Link: https://github.com/riscv-non-isa/riscv-iommu/pull/543 [1] Signed-off-by: Chunyan Zhang Acked-by: David Hildenbrand Cc: Albert Ou Cc: Alexandre Ghiti Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Christian Brauner Cc: Conor Dooley Cc: Deepak Gupta Cc: Jan Kara Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Xu Cc: Rob Herring Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yuanchu Xie Cc: Alexandre Ghiti Cc: Andrew Jones Cc: Conor Dooley Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 15 ++++++--------- include/linux/mm.h | 3 +++ include/linux/pgtable.h | 12 ++++++++++++ mm/debug_vm_pgtable.c | 10 +++++----- mm/huge_memory.c | 13 +++++++------ mm/internal.h | 2 +- mm/mmap.c | 6 ++++-- mm/mremap.c | 13 +++++++------ mm/userfaultfd.c | 10 ++++------ mm/vma.c | 6 ++++-- mm/vma_exec.c | 5 ++++- tools/testing/vma/vma_internal.h | 2 ++ 12 files changed, 59 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 41b062ce6ad8..2b4ab5718ab5 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1584,8 +1584,6 @@ struct clear_refs_private { enum clear_refs_types type; }; -#ifdef CONFIG_MEM_SOFT_DIRTY - static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { struct folio *folio; @@ -1605,6 +1603,8 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, static inline void clear_soft_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { + if (!pgtable_supports_soft_dirty()) + return; /* * The soft-dirty tracker uses #PF-s to catch writes * to pages, so write-protect the pte as well. See the @@ -1630,19 +1630,16 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, set_pte_at(vma->vm_mm, addr, pte, ptent); } } -#else -static inline void clear_soft_dirty(struct vm_area_struct *vma, - unsigned long addr, pte_t *pte) -{ -} -#endif -#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { pmd_t old, pmd = *pmdp; + if (!pgtable_supports_soft_dirty()) + return; + if (pmd_present(pmd)) { /* See comment in change_huge_pmd() */ old = pmdp_invalidate(vma, addr, pmdp); diff --git a/include/linux/mm.h b/include/linux/mm.h index bf660d5b6e97..75f894c3f521 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -859,6 +859,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) static inline void vm_flags_init(struct vm_area_struct *vma, vm_flags_t flags) { + VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); ACCESS_PRIVATE(vma, __vm_flags) = flags; } @@ -870,6 +871,7 @@ static inline void vm_flags_init(struct vm_area_struct *vma, static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_t flags) { + VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); vma_assert_write_locked(vma); vm_flags_init(vma, flags); } @@ -891,6 +893,7 @@ static inline void vm_flags_set(struct vm_area_struct *vma, static inline void vm_flags_clear(struct vm_area_struct *vma, vm_flags_t flags) { + VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); vma_start_write(vma); ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 32e8457ad535..b13b6f42be3c 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1553,6 +1553,18 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define arch_start_context_switch(prev) do {} while (0) #endif +/* + * Some platforms can customize the PTE soft-dirty bit making it unavailable + * even if the architecture provides the resource. + * Adding this API allows architectures to add their own checks for the + * devices on which the kernel is running. + * Note: When overriding it, please make sure the CONFIG_MEM_SOFT_DIRTY + * is part of this macro. + */ +#ifndef pgtable_supports_soft_dirty +#define pgtable_supports_soft_dirty() IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) +#endif + #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY #ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 1eae87dbef73..ae9b9310d96f 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -704,7 +704,7 @@ static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args) { pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); - if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) + if (!pgtable_supports_soft_dirty()) return; pr_debug("Validating PTE soft dirty\n"); @@ -717,7 +717,7 @@ static void __init pte_swap_soft_dirty_tests(struct pgtable_debug_args *args) pte_t pte; softleaf_t entry; - if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) + if (!pgtable_supports_soft_dirty()) return; pr_debug("Validating PTE swap soft dirty\n"); @@ -734,7 +734,7 @@ static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args) { pmd_t pmd; - if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) + if (!pgtable_supports_soft_dirty()) return; if (!has_transparent_hugepage()) @@ -750,8 +750,8 @@ static void __init pmd_leaf_soft_dirty_tests(struct pgtable_debug_args *args) { pmd_t pmd; - if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) || - !IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION)) + if (!pgtable_supports_soft_dirty() || + !IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION)) return; if (!has_transparent_hugepage()) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7af3e037d891..041b554c7115 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2427,12 +2427,13 @@ static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, static pmd_t move_soft_dirty_pmd(pmd_t pmd) { -#ifdef CONFIG_MEM_SOFT_DIRTY - if (unlikely(pmd_is_migration_entry(pmd))) - pmd = pmd_swp_mksoft_dirty(pmd); - else if (pmd_present(pmd)) - pmd = pmd_mksoft_dirty(pmd); -#endif + if (pgtable_supports_soft_dirty()) { + if (unlikely(pmd_is_migration_entry(pmd))) + pmd = pmd_swp_mksoft_dirty(pmd); + else if (pmd_present(pmd)) + pmd = pmd_mksoft_dirty(pmd); + } + return pmd; } diff --git a/mm/internal.h b/mm/internal.h index 929bc4a5dd98..04c307ee33ae 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1554,7 +1554,7 @@ static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) * VM_SOFTDIRTY is defined as 0x0, then !(vm_flags & VM_SOFTDIRTY) * will be constantly true. */ - if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) + if (!pgtable_supports_soft_dirty()) return false; /* diff --git a/mm/mmap.c b/mm/mmap.c index dc51680824ec..4bdb9ffa9e25 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1448,8 +1448,10 @@ static struct vm_area_struct *__install_special_mapping( return ERR_PTR(-ENOMEM); vma_set_range(vma, addr, addr + len, 0); - vm_flags_init(vma, (vm_flags | mm->def_flags | - VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK); + vm_flags |= mm->def_flags | VM_DONTEXPAND; + if (pgtable_supports_soft_dirty()) + vm_flags |= VM_SOFTDIRTY; + vm_flags_init(vma, vm_flags & ~VM_LOCKED_MASK); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); vma->vm_ops = ops; diff --git a/mm/mremap.c b/mm/mremap.c index fdb0485ede74..672264807db6 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -165,12 +165,13 @@ static pte_t move_soft_dirty_pte(pte_t pte) * Set soft dirty bit so we can notice * in userspace the ptes were moved. */ -#ifdef CONFIG_MEM_SOFT_DIRTY - if (pte_present(pte)) - pte = pte_mksoft_dirty(pte); - else - pte = pte_swp_mksoft_dirty(pte); -#endif + if (pgtable_supports_soft_dirty()) { + if (pte_present(pte)) + pte = pte_mksoft_dirty(pte); + else + pte = pte_swp_mksoft_dirty(pte); + } + return pte; } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index bd1f74a7a5ac..e6dfd5f28acd 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1119,9 +1119,8 @@ static long move_present_ptes(struct mm_struct *mm, orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot); /* Set soft dirty bit so userspace can notice the pte was moved */ -#ifdef CONFIG_MEM_SOFT_DIRTY - orig_dst_pte = pte_mksoft_dirty(orig_dst_pte); -#endif + if (pgtable_supports_soft_dirty()) + orig_dst_pte = pte_mksoft_dirty(orig_dst_pte); if (pte_dirty(orig_src_pte)) orig_dst_pte = pte_mkdirty(orig_dst_pte); orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma); @@ -1208,9 +1207,8 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, } orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); -#ifdef CONFIG_MEM_SOFT_DIRTY - orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte); -#endif + if (pgtable_supports_soft_dirty()) + orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte); set_pte_at(mm, dst_addr, dst_pte, orig_src_pte); double_pt_unlock(dst_ptl, src_ptl); diff --git a/mm/vma.c b/mm/vma.c index 4e21c988054d..fc90befd162f 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2559,7 +2559,8 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) * then new mapped in-place (which must be aimed as * a completely new data area). */ - vm_flags_set(vma, VM_SOFTDIRTY); + if (pgtable_supports_soft_dirty()) + vm_flags_set(vma, VM_SOFTDIRTY); vma_set_page_prot(vma); } @@ -2864,7 +2865,8 @@ out: mm->data_vm += len >> PAGE_SHIFT; if (vm_flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); - vm_flags_set(vma, VM_SOFTDIRTY); + if (pgtable_supports_soft_dirty()) + vm_flags_set(vma, VM_SOFTDIRTY); return 0; mas_store_fail: diff --git a/mm/vma_exec.c b/mm/vma_exec.c index 922ee51747a6..8134e1afca68 100644 --- a/mm/vma_exec.c +++ b/mm/vma_exec.c @@ -107,6 +107,7 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, unsigned long *top_mem_p) { + unsigned long flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; int err; struct vm_area_struct *vma = vm_area_alloc(mm); @@ -137,7 +138,9 @@ int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; - vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP); + if (pgtable_supports_soft_dirty()) + flags |= VM_SOFTDIRTY; + vm_flags_init(vma, flags); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); err = insert_vm_struct(mm, vma); diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 81b501f51948..be99056c5d56 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -212,6 +212,8 @@ typedef __bitwise unsigned int vm_fault_t; #define ASSERT_EXCLUSIVE_WRITER(x) +#define pgtable_supports_soft_dirty() 1 + /** * swap - swap values of @a and @b * @a: first value -- cgit v1.2.3 From f59c0924d61aa2a2bb85936a593140f327112787 Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Thu, 13 Nov 2025 15:28:02 +0800 Subject: mm: userfaultfd: add pgtable_supports_uffd_wp() Some platforms can customize the PTE/PMD entry uffd-wp bit making it unavailable even if the architecture provides the resource. This patch adds a macro API pgtable_supports_uffd_wp() that allows architectures to define their specific implementations to check if the uffd-wp bit is available on which device the kernel is running. Also this patch is removing "ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP" and "ifdef CONFIG_PTE_MARKER_UFFD_WP" in favor of pgtable_supports_uffd_wp() and uffd_supports_wp_marker() checks respectively that default to IS_ENABLED(CONFIG_HAVE_ARCH_USERFAULTFD_WP) and "IS_ENABLED(CONFIG_HAVE_ARCH_USERFAULTFD_WP) && IS_ENABLED(CONFIG_PTE_MARKER_UFFD_WP)" if not overridden by the architecture, no change in behavior is expected. Link: https://lkml.kernel.org/r/20251113072806.795029-3-zhangchunyan@iscas.ac.cn Signed-off-by: Chunyan Zhang Acked-by: David Hildenbrand Cc: Albert Ou Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Al Viro Cc: Andrew Jones Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Christian Brauner Cc: Conor Dooley Cc: Conor Dooley Cc: Deepak Gupta Cc: Jan Kara Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Xu Cc: Rob Herring Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 22 ++++++------ include/asm-generic/pgtable_uffd.h | 17 ++++++++++ include/linux/mm_inline.h | 8 +++-- include/linux/userfaultfd_k.h | 69 ++++++++++++++++++++++---------------- mm/memory.c | 6 ++-- 5 files changed, 78 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 3f539aabc3b3..5a0d19dec7ba 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1289,9 +1289,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) vm_flags |= VM_UFFD_MISSING; if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) { -#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP - goto out; -#endif + if (!pgtable_supports_uffd_wp()) + goto out; + vm_flags |= VM_UFFD_WP; } if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) { @@ -1999,14 +1999,14 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, uffdio_api.features &= ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM); #endif -#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP - uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP; -#endif -#ifndef CONFIG_PTE_MARKER_UFFD_WP - uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; - uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; - uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC; -#endif + if (!pgtable_supports_uffd_wp()) + uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP; + + if (!uffd_supports_wp_marker()) { + uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; + uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; + uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC; + } ret = -EINVAL; if (features & ~uffdio_api.features) diff --git a/include/asm-generic/pgtable_uffd.h b/include/asm-generic/pgtable_uffd.h index 828966d4c281..0d85791efdf7 100644 --- a/include/asm-generic/pgtable_uffd.h +++ b/include/asm-generic/pgtable_uffd.h @@ -1,6 +1,23 @@ #ifndef _ASM_GENERIC_PGTABLE_UFFD_H #define _ASM_GENERIC_PGTABLE_UFFD_H +/* + * Some platforms can customize the uffd-wp bit, making it unavailable + * even if the architecture provides the resource. + * Adding this API allows architectures to add their own checks for the + * devices on which the kernel is running. + * Note: When overriding it, please make sure the + * CONFIG_HAVE_ARCH_USERFAULTFD_WP is part of this macro. + */ +#ifndef pgtable_supports_uffd_wp +#define pgtable_supports_uffd_wp() IS_ENABLED(CONFIG_HAVE_ARCH_USERFAULTFD_WP) +#endif + +static inline bool uffd_supports_wp_marker(void) +{ + return pgtable_supports_uffd_wp() && IS_ENABLED(CONFIG_PTE_MARKER_UFFD_WP); +} + #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP static __always_inline int pte_uffd_wp(pte_t pte) { diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index b58f34c4fe92..fa2d6ba811b5 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -553,7 +553,6 @@ static inline pte_marker copy_pte_marker( return dstm; } -#endif /* * If this pte is wr-protected by uffd-wp in any form, arm the special pte to @@ -571,9 +570,11 @@ static inline bool pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, pte_t pteval) { -#ifdef CONFIG_PTE_MARKER_UFFD_WP bool arm_uffd_pte = false; + if (!uffd_supports_wp_marker()) + return false; + /* The current status of the pte should be "cleared" before calling */ WARN_ON_ONCE(!pte_none(ptep_get(pte))); @@ -602,7 +603,7 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, make_pte_marker(PTE_MARKER_UFFD_WP)); return true; } -#endif + return false; } @@ -616,6 +617,7 @@ static inline bool vma_has_recency(const struct vm_area_struct *vma) return true; } +#endif /** * num_pages_contiguous() - determine the number of contiguous pages diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 96b089dff4ef..fd5f42765497 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -228,15 +228,14 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, if (wp_async && (vm_flags == VM_UFFD_WP)) return true; -#ifndef CONFIG_PTE_MARKER_UFFD_WP /* * If user requested uffd-wp but not enabled pte markers for * uffd-wp, then shmem & hugetlbfs are not supported but only * anonymous. */ - if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma)) + if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) && + !vma_is_anonymous(vma)) return false; -#endif /* By default, allow any of anon|shmem|hugetlb */ return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || @@ -291,6 +290,43 @@ void userfaultfd_release_new(struct userfaultfd_ctx *ctx); void userfaultfd_release_all(struct mm_struct *mm, struct userfaultfd_ctx *ctx); +static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) +{ + /* Only wr-protect mode uses pte markers */ + if (!userfaultfd_wp(vma)) + return false; + + /* File-based uffd-wp always need markers */ + if (!vma_is_anonymous(vma)) + return true; + + /* + * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED + * enabled (to apply markers on zero pages). + */ + return userfaultfd_wp_unpopulated(vma); +} + +/* + * Returns true if this is a swap pte and was uffd-wp wr-protected in either + * forms (pte marker or a normal swap pte), false otherwise. + */ +static inline bool pte_swp_uffd_wp_any(pte_t pte) +{ + if (!uffd_supports_wp_marker()) + return false; + + if (pte_present(pte)) + return false; + + if (pte_swp_uffd_wp(pte)) + return true; + + if (pte_is_uffd_wp_marker(pte)) + return true; + + return false; +} #else /* CONFIG_USERFAULTFD */ /* mm helpers */ @@ -415,23 +451,9 @@ static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma) return false; } -#endif /* CONFIG_USERFAULTFD */ - static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) { - /* Only wr-protect mode uses pte markers */ - if (!userfaultfd_wp(vma)) - return false; - - /* File-based uffd-wp always need markers */ - if (!vma_is_anonymous(vma)) - return true; - - /* - * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED - * enabled (to apply markers on zero pages). - */ - return userfaultfd_wp_unpopulated(vma); + return false; } /* @@ -440,16 +462,7 @@ static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) */ static inline bool pte_swp_uffd_wp_any(pte_t pte) { -#ifdef CONFIG_PTE_MARKER_UFFD_WP - if (pte_present(pte)) - return false; - if (pte_swp_uffd_wp(pte)) - return true; - - if (pte_is_uffd_wp_marker(pte)) - return true; -#endif return false; } - +#endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/mm/memory.c b/mm/memory.c index 50b93b45b174..6675e87eb7dd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1590,7 +1590,9 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, { bool was_installed = false; -#ifdef CONFIG_PTE_MARKER_UFFD_WP + if (!uffd_supports_wp_marker()) + return false; + /* Zap on anonymous always means dropping everything */ if (vma_is_anonymous(vma)) return false; @@ -1607,7 +1609,7 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, pte++; addr += PAGE_SIZE; } -#endif + return was_installed; } -- cgit v1.2.3 From 348ced3da52b3161f5ceec8868e81973ce48e11d Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Fri, 21 Nov 2025 14:48:59 -0500 Subject: hugetlb: add __read_mostly to sysctl_hugetlb_shm_group sysctl bits are mostly-read values. Link: https://lkml.kernel.org/r/20251121194859.265259-2-gourry@gourry.net Signed-off-by: Gregory Price Suggested-by: Andrew Morton Acked-by: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 457d48ac7bcd..019a1c5281e4 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -171,7 +171,7 @@ bool hugetlbfs_pagecache_present(struct hstate *h, struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio); -extern int sysctl_hugetlb_shm_group; +extern int sysctl_hugetlb_shm_group __read_mostly; extern struct list_head huge_boot_pages[MAX_NUMNODES]; void hugetlb_bootmem_alloc(void); -- cgit v1.2.3 From 2b6a3f061f11372af79b862d6184d43193ae927f Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 25 Nov 2025 10:00:59 +0000 Subject: mm: declare VMA flags by bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "initial work on making VMA flags a bitmap", v3. We are in the rather silly situation that we are running out of VMA flags as they are currently limited to a system word in size. This leads to absurd situations where we limit features to 64-bit architectures only because we simply do not have the ability to add a flag for 32-bit ones. This is very constraining and leads to hacks or, in the worst case, simply an inability to implement features we want for entirely arbitrary reasons. This also of course gives us something of a Y2K type situation in mm where we might eventually exhaust all of the VMA flags even on 64-bit systems. This series lays the groundwork for getting away from this limitation by establishing VMA flags as a bitmap whose size we can increase in future beyond 64 bits if required. This is necessarily a highly iterative process given the extensive use of VMA flags throughout the kernel, so we start by performing basic steps. Firstly, we declare VMA flags by bit number rather than by value, retaining the VM_xxx fields but in terms of these newly introduced VMA_xxx_BIT fields. While we are here, we use sparse annotations to ensure that, when dealing with VMA bit number parameters, we cannot be passed values which are not declared as such - providing some useful type safety. We then introduce an opaque VMA flag type, much like the opaque mm_struct flag type introduced in commit bb6525f2f8c4 ("mm: add bitmap mm->flags field"), which we establish in union with vma->vm_flags (but still set at system word size meaning there is no functional or data type size change). We update the vm_flags_xxx() helpers to use this new bitmap, introducing sensible helpers to do so. This series lays the foundation for further work to expand the use of bitmap VMA flags and eventually eliminate these arbitrary restrictions. This patch (of 4): In order to lay the groundwork for VMA flags being a bitmap rather than a system word in size, we need to be able to consistently refer to VMA flags by bit number rather than value. Take this opportunity to do so in an enum which we which is additionally useful for tooling to extract metadata from. This additionally makes it very clear which bits are being used for what at a glance. We use the VMA_ prefix for the bit values as it is logical to do so since these reference VMAs. We consistently suffix with _BIT to make it clear what the values refer to. We declare bit values even when the flags that use them would not be enabled by config options as this is simply clearer and clearly defines what bit numbers are used for what, at no additional cost. We declare a sparse-bitwise type vma_flag_t which ensures that users can't pass around invalid VMA flags by accident and prepares for future work towards VMA flags being a bitmap where we want to ensure bit values are type safe. To make life easier, we declare some macro helpers - DECLARE_VMA_BIT() allows us to avoid duplication in the enum bit number declarations (and maintaining the sparse __bitwise attribute), and INIT_VM_FLAG() is used to assist with declaration of flags. Unfortunately we can't declare both in the enum, as we run into issue with logic in the kernel requiring that flags are preprocessor definitions, and additionally we cannot have a macro which declares another macro so we must define each flag macro directly. Additionally, update the VMA userland testing vma_internal.h header to include these changes. We also have to fix the parameters to the vma_flag_*_atomic() functions since VMA_MAYBE_GUARD_BIT is now of type vma_flag_t and sparse will complain otherwise. We have to update some rather silly if-deffery found in mm/task_mmu.c which would otherwise break. Finally, we update the rust binding helper as now it cannot auto-detect the flags at all. Link: https://lkml.kernel.org/r/cover.1764064556.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/3a35e5a0bcfa00e84af24cbafc0653e74deda64a.1764064556.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Reviewed-by: Pedro Falcato Acked-by: Alice Ryhl [rust] Cc: Alex Gaynor Cc: Alistair Popple Cc: Andreas Hindborg Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Ben Segall Cc: Björn Roy Baron Cc: Boqun Feng Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Danilo Krummrich Cc: David Hildenbrand Cc: David Rientjes Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gary Guo Cc: Gregory Price Cc: "Huang, Ying" Cc: Ingo Molnar Cc: Jann Horn Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Joshua Hahn Cc: Juri Lelli Cc: Kairui Song Cc: Kees Cook Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Trevor Gross Cc: Valentin Schneider Cc: Vincent Guittot Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 4 +- include/linux/mm.h | 379 ++++++++++++++++++++++----------------- mm/khugepaged.c | 2 +- mm/madvise.c | 2 +- rust/bindgen_parameters | 25 +++ rust/bindings/bindings_helper.h | 25 +++ tools/testing/vma/vma_internal.h | 304 ++++++++++++++++++++++++++----- 7 files changed, 524 insertions(+), 217 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 2b4ab5718ab5..d00ac179d973 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1183,10 +1183,10 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_PKEY_BIT0)] = "", [ilog2(VM_PKEY_BIT1)] = "", [ilog2(VM_PKEY_BIT2)] = "", -#if VM_PKEY_BIT3 +#if CONFIG_ARCH_PKEY_BITS > 3 [ilog2(VM_PKEY_BIT3)] = "", #endif -#if VM_PKEY_BIT4 +#if CONFIG_ARCH_PKEY_BITS > 4 [ilog2(VM_PKEY_BIT4)] = "", #endif #endif /* CONFIG_ARCH_HAS_PKEYS */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 75f894c3f521..a2f38fb68840 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -271,185 +271,239 @@ extern struct rw_semaphore nommu_region_sem; extern unsigned int kobjsize(const void *objp); #endif -#define VM_MAYBE_GUARD_BIT 11 - /* * vm_flags in vm_area_struct, see mm_types.h. * When changing, update also include/trace/events/mmflags.h */ -#define VM_NONE 0x00000000 -#define VM_READ 0x00000001 /* currently active flags */ -#define VM_WRITE 0x00000002 -#define VM_EXEC 0x00000004 -#define VM_SHARED 0x00000008 +#define VM_NONE 0x00000000 -/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ -#define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */ -#define VM_MAYWRITE 0x00000020 -#define VM_MAYEXEC 0x00000040 -#define VM_MAYSHARE 0x00000080 +/** + * typedef vma_flag_t - specifies an individual VMA flag by bit number. + * + * This value is made type safe by sparse to avoid passing invalid flag values + * around. + */ +typedef int __bitwise vma_flag_t; -#define VM_GROWSDOWN 0x00000100 /* general info on the segment */ +#define DECLARE_VMA_BIT(name, bitnum) \ + VMA_ ## name ## _BIT = ((__force vma_flag_t)bitnum) +#define DECLARE_VMA_BIT_ALIAS(name, aliased) \ + VMA_ ## name ## _BIT = (VMA_ ## aliased ## _BIT) +enum { + DECLARE_VMA_BIT(READ, 0), + DECLARE_VMA_BIT(WRITE, 1), + DECLARE_VMA_BIT(EXEC, 2), + DECLARE_VMA_BIT(SHARED, 3), + /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ + DECLARE_VMA_BIT(MAYREAD, 4), /* limits for mprotect() etc. */ + DECLARE_VMA_BIT(MAYWRITE, 5), + DECLARE_VMA_BIT(MAYEXEC, 6), + DECLARE_VMA_BIT(MAYSHARE, 7), + DECLARE_VMA_BIT(GROWSDOWN, 8), /* general info on the segment */ #ifdef CONFIG_MMU -#define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */ -#else /* CONFIG_MMU */ -#define VM_MAYOVERLAY 0x00000200 /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ -#define VM_UFFD_MISSING 0 + DECLARE_VMA_BIT(UFFD_MISSING, 9),/* missing pages tracking */ +#else + /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ + DECLARE_VMA_BIT(MAYOVERLAY, 9), #endif /* CONFIG_MMU */ -#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ -#define VM_MAYBE_GUARD BIT(VM_MAYBE_GUARD_BIT) /* The VMA maybe contains guard regions. */ -#define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ - -#define VM_LOCKED 0x00002000 -#define VM_IO 0x00004000 /* Memory mapped I/O or similar */ - - /* Used by sys_madvise() */ -#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ -#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ - -#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ -#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ -#define VM_LOCKONFAULT 0x00080000 /* Lock the pages covered when they are faulted in */ -#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ -#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ -#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ -#define VM_SYNC 0x00800000 /* Synchronous page faults */ -#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ -#define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */ -#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ - + /* Page-ranges managed without "struct page", just pure PFN */ + DECLARE_VMA_BIT(PFNMAP, 10), + DECLARE_VMA_BIT(MAYBE_GUARD, 11), + DECLARE_VMA_BIT(UFFD_WP, 12), /* wrprotect pages tracking */ + DECLARE_VMA_BIT(LOCKED, 13), + DECLARE_VMA_BIT(IO, 14), /* Memory mapped I/O or similar */ + DECLARE_VMA_BIT(SEQ_READ, 15), /* App will access data sequentially */ + DECLARE_VMA_BIT(RAND_READ, 16), /* App will not benefit from clustered reads */ + DECLARE_VMA_BIT(DONTCOPY, 17), /* Do not copy this vma on fork */ + DECLARE_VMA_BIT(DONTEXPAND, 18),/* Cannot expand with mremap() */ + DECLARE_VMA_BIT(LOCKONFAULT, 19),/* Lock pages covered when faulted in */ + DECLARE_VMA_BIT(ACCOUNT, 20), /* Is a VM accounted object */ + DECLARE_VMA_BIT(NORESERVE, 21), /* should the VM suppress accounting */ + DECLARE_VMA_BIT(HUGETLB, 22), /* Huge TLB Page VM */ + DECLARE_VMA_BIT(SYNC, 23), /* Synchronous page faults */ + DECLARE_VMA_BIT(ARCH_1, 24), /* Architecture-specific flag */ + DECLARE_VMA_BIT(WIPEONFORK, 25),/* Wipe VMA contents in child. */ + DECLARE_VMA_BIT(DONTDUMP, 26), /* Do not include in the core dump */ + DECLARE_VMA_BIT(SOFTDIRTY, 27), /* NOT soft dirty clean area */ + DECLARE_VMA_BIT(MIXEDMAP, 28), /* Can contain struct page and pure PFN pages */ + DECLARE_VMA_BIT(HUGEPAGE, 29), /* MADV_HUGEPAGE marked this vma */ + DECLARE_VMA_BIT(NOHUGEPAGE, 30),/* MADV_NOHUGEPAGE marked this vma */ + DECLARE_VMA_BIT(MERGEABLE, 31), /* KSM may merge identical pages */ + /* These bits are reused, we define specific uses below. */ + DECLARE_VMA_BIT(HIGH_ARCH_0, 32), + DECLARE_VMA_BIT(HIGH_ARCH_1, 33), + DECLARE_VMA_BIT(HIGH_ARCH_2, 34), + DECLARE_VMA_BIT(HIGH_ARCH_3, 35), + DECLARE_VMA_BIT(HIGH_ARCH_4, 36), + DECLARE_VMA_BIT(HIGH_ARCH_5, 37), + DECLARE_VMA_BIT(HIGH_ARCH_6, 38), + /* + * This flag is used to connect VFIO to arch specific KVM code. It + * indicates that the memory under this VMA is safe for use with any + * non-cachable memory type inside KVM. Some VFIO devices, on some + * platforms, are thought to be unsafe and can cause machine crashes + * if KVM does not lock down the memory type. + */ + DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39), +#ifdef CONFIG_PPC32 + DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1), +#else + DECLARE_VMA_BIT(DROPPABLE, 40), +#endif + DECLARE_VMA_BIT(UFFD_MINOR, 41), + DECLARE_VMA_BIT(SEALED, 42), + /* Flags that reuse flags above. */ + DECLARE_VMA_BIT_ALIAS(PKEY_BIT0, HIGH_ARCH_0), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT1, HIGH_ARCH_1), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT2, HIGH_ARCH_2), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT3, HIGH_ARCH_3), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT4, HIGH_ARCH_4), +#if defined(CONFIG_X86_USER_SHADOW_STACK) + /* + * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of + * support core mm. + * + * These VMAs will get a single end guard page. This helps userspace + * protect itself from attacks. A single page is enough for current + * shadow stack archs (x86). See the comments near alloc_shstk() in + * arch/x86/kernel/shstk.c for more details on the guard size. + */ + DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_5), +#elif defined(CONFIG_ARM64_GCS) + /* + * arm64's Guarded Control Stack implements similar functionality and + * has similar constraints to shadow stacks. + */ + DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_6), +#endif + DECLARE_VMA_BIT_ALIAS(SAO, ARCH_1), /* Strong Access Ordering (powerpc) */ + DECLARE_VMA_BIT_ALIAS(GROWSUP, ARCH_1), /* parisc */ + DECLARE_VMA_BIT_ALIAS(SPARC_ADI, ARCH_1), /* sparc64 */ + DECLARE_VMA_BIT_ALIAS(ARM64_BTI, ARCH_1), /* arm64 */ + DECLARE_VMA_BIT_ALIAS(ARCH_CLEAR, ARCH_1), /* sparc64, arm64 */ + DECLARE_VMA_BIT_ALIAS(MAPPED_COPY, ARCH_1), /* !CONFIG_MMU */ + DECLARE_VMA_BIT_ALIAS(MTE, HIGH_ARCH_4), /* arm64 */ + DECLARE_VMA_BIT_ALIAS(MTE_ALLOWED, HIGH_ARCH_5),/* arm64 */ +#ifdef CONFIG_STACK_GROWSUP + DECLARE_VMA_BIT_ALIAS(STACK, GROWSUP), + DECLARE_VMA_BIT_ALIAS(STACK_EARLY, GROWSDOWN), +#else + DECLARE_VMA_BIT_ALIAS(STACK, GROWSDOWN), +#endif +}; +#undef DECLARE_VMA_BIT +#undef DECLARE_VMA_BIT_ALIAS + +#define INIT_VM_FLAG(name) BIT((__force int) VMA_ ## name ## _BIT) +#define VM_READ INIT_VM_FLAG(READ) +#define VM_WRITE INIT_VM_FLAG(WRITE) +#define VM_EXEC INIT_VM_FLAG(EXEC) +#define VM_SHARED INIT_VM_FLAG(SHARED) +#define VM_MAYREAD INIT_VM_FLAG(MAYREAD) +#define VM_MAYWRITE INIT_VM_FLAG(MAYWRITE) +#define VM_MAYEXEC INIT_VM_FLAG(MAYEXEC) +#define VM_MAYSHARE INIT_VM_FLAG(MAYSHARE) +#define VM_GROWSDOWN INIT_VM_FLAG(GROWSDOWN) +#ifdef CONFIG_MMU +#define VM_UFFD_MISSING INIT_VM_FLAG(UFFD_MISSING) +#else +#define VM_UFFD_MISSING VM_NONE +#define VM_MAYOVERLAY INIT_VM_FLAG(MAYOVERLAY) +#endif +#define VM_PFNMAP INIT_VM_FLAG(PFNMAP) +#define VM_MAYBE_GUARD INIT_VM_FLAG(MAYBE_GUARD) +#define VM_UFFD_WP INIT_VM_FLAG(UFFD_WP) +#define VM_LOCKED INIT_VM_FLAG(LOCKED) +#define VM_IO INIT_VM_FLAG(IO) +#define VM_SEQ_READ INIT_VM_FLAG(SEQ_READ) +#define VM_RAND_READ INIT_VM_FLAG(RAND_READ) +#define VM_DONTCOPY INIT_VM_FLAG(DONTCOPY) +#define VM_DONTEXPAND INIT_VM_FLAG(DONTEXPAND) +#define VM_LOCKONFAULT INIT_VM_FLAG(LOCKONFAULT) +#define VM_ACCOUNT INIT_VM_FLAG(ACCOUNT) +#define VM_NORESERVE INIT_VM_FLAG(NORESERVE) +#define VM_HUGETLB INIT_VM_FLAG(HUGETLB) +#define VM_SYNC INIT_VM_FLAG(SYNC) +#define VM_ARCH_1 INIT_VM_FLAG(ARCH_1) +#define VM_WIPEONFORK INIT_VM_FLAG(WIPEONFORK) +#define VM_DONTDUMP INIT_VM_FLAG(DONTDUMP) #ifdef CONFIG_MEM_SOFT_DIRTY -# define VM_SOFTDIRTY 0x08000000 /* Not soft dirty clean area */ +#define VM_SOFTDIRTY INIT_VM_FLAG(SOFTDIRTY) #else -# define VM_SOFTDIRTY 0 +#define VM_SOFTDIRTY VM_NONE +#endif +#define VM_MIXEDMAP INIT_VM_FLAG(MIXEDMAP) +#define VM_HUGEPAGE INIT_VM_FLAG(HUGEPAGE) +#define VM_NOHUGEPAGE INIT_VM_FLAG(NOHUGEPAGE) +#define VM_MERGEABLE INIT_VM_FLAG(MERGEABLE) +#define VM_STACK INIT_VM_FLAG(STACK) +#ifdef CONFIG_STACK_GROWS_UP +#define VM_STACK_EARLY INIT_VM_FLAG(STACK_EARLY) +#else +#define VM_STACK_EARLY VM_NONE #endif - -#define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ -#define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ -#define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ -#define VM_MERGEABLE BIT(31) /* KSM may merge identical pages */ - -#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS -#define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) -#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) -#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) -#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) -#define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) -#define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) -#define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6) -#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ - #ifdef CONFIG_ARCH_HAS_PKEYS -# define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0 -# define VM_PKEY_BIT0 VM_HIGH_ARCH_0 -# define VM_PKEY_BIT1 VM_HIGH_ARCH_1 -# define VM_PKEY_BIT2 VM_HIGH_ARCH_2 +#define VM_PKEY_SHIFT ((__force int)VMA_HIGH_ARCH_0_BIT) +/* Despite the naming, these are FLAGS not bits. */ +#define VM_PKEY_BIT0 INIT_VM_FLAG(PKEY_BIT0) +#define VM_PKEY_BIT1 INIT_VM_FLAG(PKEY_BIT1) +#define VM_PKEY_BIT2 INIT_VM_FLAG(PKEY_BIT2) #if CONFIG_ARCH_PKEY_BITS > 3 -# define VM_PKEY_BIT3 VM_HIGH_ARCH_3 +#define VM_PKEY_BIT3 INIT_VM_FLAG(PKEY_BIT3) #else -# define VM_PKEY_BIT3 0 -#endif +#define VM_PKEY_BIT3 VM_NONE +#endif /* CONFIG_ARCH_PKEY_BITS > 3 */ #if CONFIG_ARCH_PKEY_BITS > 4 -# define VM_PKEY_BIT4 VM_HIGH_ARCH_4 +#define VM_PKEY_BIT4 INIT_VM_FLAG(PKEY_BIT4) #else -# define VM_PKEY_BIT4 0 -#endif +#define VM_PKEY_BIT4 VM_NONE +#endif /* CONFIG_ARCH_PKEY_BITS > 4 */ #endif /* CONFIG_ARCH_HAS_PKEYS */ - -#ifdef CONFIG_X86_USER_SHADOW_STACK -/* - * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of - * support core mm. - * - * These VMAs will get a single end guard page. This helps userspace protect - * itself from attacks. A single page is enough for current shadow stack archs - * (x86). See the comments near alloc_shstk() in arch/x86/kernel/shstk.c - * for more details on the guard size. - */ -# define VM_SHADOW_STACK VM_HIGH_ARCH_5 -#endif - -#if defined(CONFIG_ARM64_GCS) -/* - * arm64's Guarded Control Stack implements similar functionality and - * has similar constraints to shadow stacks. - */ -# define VM_SHADOW_STACK VM_HIGH_ARCH_6 -#endif - -#ifndef VM_SHADOW_STACK -# define VM_SHADOW_STACK VM_NONE +#if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS) +#define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK) +#else +#define VM_SHADOW_STACK VM_NONE #endif - #if defined(CONFIG_PPC64) -# define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */ +#define VM_SAO INIT_VM_FLAG(SAO) #elif defined(CONFIG_PARISC) -# define VM_GROWSUP VM_ARCH_1 +#define VM_GROWSUP INIT_VM_FLAG(GROWSUP) #elif defined(CONFIG_SPARC64) -# define VM_SPARC_ADI VM_ARCH_1 /* Uses ADI tag for access control */ -# define VM_ARCH_CLEAR VM_SPARC_ADI +#define VM_SPARC_ADI INIT_VM_FLAG(SPARC_ADI) +#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) #elif defined(CONFIG_ARM64) -# define VM_ARM64_BTI VM_ARCH_1 /* BTI guarded page, a.k.a. GP bit */ -# define VM_ARCH_CLEAR VM_ARM64_BTI +#define VM_ARM64_BTI INIT_VM_FLAG(ARM64_BTI) +#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) #elif !defined(CONFIG_MMU) -# define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ -#endif - -#if defined(CONFIG_ARM64_MTE) -# define VM_MTE VM_HIGH_ARCH_4 /* Use Tagged memory for access control */ -# define VM_MTE_ALLOWED VM_HIGH_ARCH_5 /* Tagged memory permitted */ -#else -# define VM_MTE VM_NONE -# define VM_MTE_ALLOWED VM_NONE +#define VM_MAPPED_COPY INIT_VM_FLAG(MAPPED_COPY) #endif - #ifndef VM_GROWSUP -# define VM_GROWSUP VM_NONE +#define VM_GROWSUP VM_NONE +#endif +#ifdef CONFIG_ARM64_MTE +#define VM_MTE INIT_VM_FLAG(MTE) +#define VM_MTE_ALLOWED INIT_VM_FLAG(MTE_ALLOWED) +#else +#define VM_MTE VM_NONE +#define VM_MTE_ALLOWED VM_NONE #endif - #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR -# define VM_UFFD_MINOR_BIT 41 -# define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */ -#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ -# define VM_UFFD_MINOR VM_NONE -#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ - -/* - * This flag is used to connect VFIO to arch specific KVM code. It - * indicates that the memory under this VMA is safe for use with any - * non-cachable memory type inside KVM. Some VFIO devices, on some - * platforms, are thought to be unsafe and can cause machine crashes - * if KVM does not lock down the memory type. - */ -#ifdef CONFIG_64BIT -#define VM_ALLOW_ANY_UNCACHED_BIT 39 -#define VM_ALLOW_ANY_UNCACHED BIT(VM_ALLOW_ANY_UNCACHED_BIT) +#define VM_UFFD_MINOR INIT_VM_FLAG(UFFD_MINOR) #else -#define VM_ALLOW_ANY_UNCACHED VM_NONE +#define VM_UFFD_MINOR VM_NONE #endif - #ifdef CONFIG_64BIT -#define VM_DROPPABLE_BIT 40 -#define VM_DROPPABLE BIT(VM_DROPPABLE_BIT) -#elif defined(CONFIG_PPC32) -#define VM_DROPPABLE VM_ARCH_1 +#define VM_ALLOW_ANY_UNCACHED INIT_VM_FLAG(ALLOW_ANY_UNCACHED) +#define VM_SEALED INIT_VM_FLAG(SEALED) #else -#define VM_DROPPABLE VM_NONE +#define VM_ALLOW_ANY_UNCACHED VM_NONE +#define VM_SEALED VM_NONE #endif - -#ifdef CONFIG_64BIT -#define VM_SEALED_BIT 42 -#define VM_SEALED BIT(VM_SEALED_BIT) +#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) +#define VM_DROPPABLE INIT_VM_FLAG(DROPPABLE) #else -#define VM_SEALED VM_NONE +#define VM_DROPPABLE VM_NONE #endif /* Bits set in the VMA until the stack is in its final location */ @@ -475,12 +529,10 @@ extern unsigned int kobjsize(const void *objp); #define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) -#ifdef CONFIG_STACK_GROWSUP -#define VM_STACK VM_GROWSUP -#define VM_STACK_EARLY VM_GROWSDOWN +#ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS +#define VM_SEALED_SYSMAP VM_SEALED #else -#define VM_STACK VM_GROWSDOWN -#define VM_STACK_EARLY 0 +#define VM_SEALED_SYSMAP VM_NONE #endif #define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) @@ -488,7 +540,6 @@ extern unsigned int kobjsize(const void *objp); /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) - /* * Special vmas that are non-mergable, non-mlock()able. */ @@ -523,7 +574,7 @@ extern unsigned int kobjsize(const void *objp); /* Arch-specific flags to clear when updating VM flags on protection change */ #ifndef VM_ARCH_CLEAR -# define VM_ARCH_CLEAR VM_NONE +#define VM_ARCH_CLEAR VM_NONE #endif #define VM_FLAGS_CLEAR (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR) @@ -920,9 +971,9 @@ static inline void vm_flags_mod(struct vm_area_struct *vma, } static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma, - int bit) + vma_flag_t bit) { - const vm_flags_t mask = BIT(bit); + const vm_flags_t mask = BIT((__force int)bit); /* Only specific flags are permitted */ if (WARN_ON_ONCE(!(mask & VM_ATOMIC_SET_ALLOWED))) @@ -935,14 +986,15 @@ static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma, * Set VMA flag atomically. Requires only VMA/mmap read lock. Only specific * valid flags are allowed to do this. */ -static inline void vma_flag_set_atomic(struct vm_area_struct *vma, int bit) +static inline void vma_flag_set_atomic(struct vm_area_struct *vma, + vma_flag_t bit) { /* mmap read lock/VMA read lock must be held. */ if (!rwsem_is_locked(&vma->vm_mm->mmap_lock)) vma_assert_locked(vma); if (__vma_flag_atomic_valid(vma, bit)) - set_bit(bit, &ACCESS_PRIVATE(vma, __vm_flags)); + set_bit((__force int)bit, &ACCESS_PRIVATE(vma, __vm_flags)); } /* @@ -952,10 +1004,11 @@ static inline void vma_flag_set_atomic(struct vm_area_struct *vma, int bit) * This is necessarily racey, so callers must ensure that serialisation is * achieved through some other means, or that races are permissible. */ -static inline bool vma_flag_test_atomic(struct vm_area_struct *vma, int bit) +static inline bool vma_flag_test_atomic(struct vm_area_struct *vma, + vma_flag_t bit) { if (__vma_flag_atomic_valid(vma, bit)) - return test_bit(bit, &vma->vm_flags); + return test_bit((__force int)bit, &vma->vm_flags); return false; } @@ -4517,16 +4570,6 @@ int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *st int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); - -/* - * mseal of userspace process's system mappings. - */ -#ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS -#define VM_SEALED_SYSMAP VM_SEALED -#else -#define VM_SEALED_SYSMAP VM_NONE -#endif - /* * DMA mapping IDs for page_pool * diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 89c33ef7aac3..97d1b2824386 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1740,7 +1740,7 @@ static bool file_backed_vma_is_retractable(struct vm_area_struct *vma) * obtained on guard region installation after the flag is set, so this * check being performed under this lock excludes races. */ - if (vma_flag_test_atomic(vma, VM_MAYBE_GUARD_BIT)) + if (vma_flag_test_atomic(vma, VMA_MAYBE_GUARD_BIT)) return false; return true; diff --git a/mm/madvise.c b/mm/madvise.c index d8bc51e1bea7..b617b1be0f53 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1142,7 +1142,7 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior) * acquire an mmap/VMA write lock to read it. All remaining readers may * or may not see the flag set, but we don't care. */ - vma_flag_set_atomic(vma, VM_MAYBE_GUARD_BIT); + vma_flag_set_atomic(vma, VMA_MAYBE_GUARD_BIT); /* * If anonymous and we are establishing page tables the VMA ought to diff --git a/rust/bindgen_parameters b/rust/bindgen_parameters index e13c6f9dd17b..fd2fd1c3cb9a 100644 --- a/rust/bindgen_parameters +++ b/rust/bindgen_parameters @@ -35,6 +35,31 @@ # recognized, block generation of the non-helper constants. --blocklist-item ARCH_SLAB_MINALIGN --blocklist-item ARCH_KMALLOC_MINALIGN +--blocklist-item VM_MERGEABLE +--blocklist-item VM_READ +--blocklist-item VM_WRITE +--blocklist-item VM_EXEC +--blocklist-item VM_SHARED +--blocklist-item VM_MAYREAD +--blocklist-item VM_MAYWRITE +--blocklist-item VM_MAYEXEC +--blocklist-item VM_MAYEXEC +--blocklist-item VM_PFNMAP +--blocklist-item VM_IO +--blocklist-item VM_DONTCOPY +--blocklist-item VM_DONTEXPAND +--blocklist-item VM_LOCKONFAULT +--blocklist-item VM_ACCOUNT +--blocklist-item VM_NORESERVE +--blocklist-item VM_HUGETLB +--blocklist-item VM_SYNC +--blocklist-item VM_ARCH_1 +--blocklist-item VM_WIPEONFORK +--blocklist-item VM_DONTDUMP +--blocklist-item VM_SOFTDIRTY +--blocklist-item VM_MIXEDMAP +--blocklist-item VM_HUGEPAGE +--blocklist-item VM_NOHUGEPAGE # Structs should implement `Zeroable` when all of their fields do. --with-derive-custom-struct .*=MaybeZeroable diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index 2e43c66635a2..4c327db01ca0 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -108,7 +108,32 @@ const xa_mark_t RUST_CONST_HELPER_XA_PRESENT = XA_PRESENT; const gfp_t RUST_CONST_HELPER_XA_FLAGS_ALLOC = XA_FLAGS_ALLOC; const gfp_t RUST_CONST_HELPER_XA_FLAGS_ALLOC1 = XA_FLAGS_ALLOC1; + const vm_flags_t RUST_CONST_HELPER_VM_MERGEABLE = VM_MERGEABLE; +const vm_flags_t RUST_CONST_HELPER_VM_READ = VM_READ; +const vm_flags_t RUST_CONST_HELPER_VM_WRITE = VM_WRITE; +const vm_flags_t RUST_CONST_HELPER_VM_EXEC = VM_EXEC; +const vm_flags_t RUST_CONST_HELPER_VM_SHARED = VM_SHARED; +const vm_flags_t RUST_CONST_HELPER_VM_MAYREAD = VM_MAYREAD; +const vm_flags_t RUST_CONST_HELPER_VM_MAYWRITE = VM_MAYWRITE; +const vm_flags_t RUST_CONST_HELPER_VM_MAYEXEC = VM_MAYEXEC; +const vm_flags_t RUST_CONST_HELPER_VM_MAYSHARE = VM_MAYEXEC; +const vm_flags_t RUST_CONST_HELPER_VM_PFNMAP = VM_PFNMAP; +const vm_flags_t RUST_CONST_HELPER_VM_IO = VM_IO; +const vm_flags_t RUST_CONST_HELPER_VM_DONTCOPY = VM_DONTCOPY; +const vm_flags_t RUST_CONST_HELPER_VM_DONTEXPAND = VM_DONTEXPAND; +const vm_flags_t RUST_CONST_HELPER_VM_LOCKONFAULT = VM_LOCKONFAULT; +const vm_flags_t RUST_CONST_HELPER_VM_ACCOUNT = VM_ACCOUNT; +const vm_flags_t RUST_CONST_HELPER_VM_NORESERVE = VM_NORESERVE; +const vm_flags_t RUST_CONST_HELPER_VM_HUGETLB = VM_HUGETLB; +const vm_flags_t RUST_CONST_HELPER_VM_SYNC = VM_SYNC; +const vm_flags_t RUST_CONST_HELPER_VM_ARCH_1 = VM_ARCH_1; +const vm_flags_t RUST_CONST_HELPER_VM_WIPEONFORK = VM_WIPEONFORK; +const vm_flags_t RUST_CONST_HELPER_VM_DONTDUMP = VM_DONTDUMP; +const vm_flags_t RUST_CONST_HELPER_VM_SOFTDIRTY = VM_SOFTDIRTY; +const vm_flags_t RUST_CONST_HELPER_VM_MIXEDMAP = VM_MIXEDMAP; +const vm_flags_t RUST_CONST_HELPER_VM_HUGEPAGE = VM_HUGEPAGE; +const vm_flags_t RUST_CONST_HELPER_VM_NOHUGEPAGE = VM_NOHUGEPAGE; #if IS_ENABLED(CONFIG_ANDROID_BINDER_IPC_RUST) #include "../../drivers/android/binder/rust_binder.h" diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 8c2ac301a00e..b7e8fc9ccdf4 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -46,42 +46,271 @@ extern unsigned long dac_mmap_min_addr; #define MMF_HAS_MDWE 28 +/* + * vm_flags in vm_area_struct, see mm_types.h. + * When changing, update also include/trace/events/mmflags.h + */ + #define VM_NONE 0x00000000 -#define VM_READ 0x00000001 -#define VM_WRITE 0x00000002 -#define VM_EXEC 0x00000004 -#define VM_SHARED 0x00000008 -#define VM_MAYREAD 0x00000010 -#define VM_MAYWRITE 0x00000020 -#define VM_MAYEXEC 0x00000040 -#define VM_GROWSDOWN 0x00000100 -#define VM_PFNMAP 0x00000400 -#define VM_MAYBE_GUARD 0x00000800 -#define VM_LOCKED 0x00002000 -#define VM_IO 0x00004000 -#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ -#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ -#define VM_DONTEXPAND 0x00040000 -#define VM_LOCKONFAULT 0x00080000 -#define VM_ACCOUNT 0x00100000 -#define VM_NORESERVE 0x00200000 -#define VM_MIXEDMAP 0x10000000 -#define VM_STACK VM_GROWSDOWN -#define VM_SHADOW_STACK VM_NONE -#define VM_SOFTDIRTY 0 -#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ -#define VM_GROWSUP VM_NONE -#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) -#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) +/** + * typedef vma_flag_t - specifies an individual VMA flag by bit number. + * + * This value is made type safe by sparse to avoid passing invalid flag values + * around. + */ +typedef int __bitwise vma_flag_t; +#define DECLARE_VMA_BIT(name, bitnum) \ + VMA_ ## name ## _BIT = ((__force vma_flag_t)bitnum) +#define DECLARE_VMA_BIT_ALIAS(name, aliased) \ + VMA_ ## name ## _BIT = VMA_ ## aliased ## _BIT +enum { + DECLARE_VMA_BIT(READ, 0), + DECLARE_VMA_BIT(WRITE, 1), + DECLARE_VMA_BIT(EXEC, 2), + DECLARE_VMA_BIT(SHARED, 3), + /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ + DECLARE_VMA_BIT(MAYREAD, 4), /* limits for mprotect() etc. */ + DECLARE_VMA_BIT(MAYWRITE, 5), + DECLARE_VMA_BIT(MAYEXEC, 6), + DECLARE_VMA_BIT(MAYSHARE, 7), + DECLARE_VMA_BIT(GROWSDOWN, 8), /* general info on the segment */ +#ifdef CONFIG_MMU + DECLARE_VMA_BIT(UFFD_MISSING, 9),/* missing pages tracking */ +#else + /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ + DECLARE_VMA_BIT(MAYOVERLAY, 9), +#endif /* CONFIG_MMU */ + /* Page-ranges managed without "struct page", just pure PFN */ + DECLARE_VMA_BIT(PFNMAP, 10), + DECLARE_VMA_BIT(MAYBE_GUARD, 11), + DECLARE_VMA_BIT(UFFD_WP, 12), /* wrprotect pages tracking */ + DECLARE_VMA_BIT(LOCKED, 13), + DECLARE_VMA_BIT(IO, 14), /* Memory mapped I/O or similar */ + DECLARE_VMA_BIT(SEQ_READ, 15), /* App will access data sequentially */ + DECLARE_VMA_BIT(RAND_READ, 16), /* App will not benefit from clustered reads */ + DECLARE_VMA_BIT(DONTCOPY, 17), /* Do not copy this vma on fork */ + DECLARE_VMA_BIT(DONTEXPAND, 18),/* Cannot expand with mremap() */ + DECLARE_VMA_BIT(LOCKONFAULT, 19),/* Lock pages covered when faulted in */ + DECLARE_VMA_BIT(ACCOUNT, 20), /* Is a VM accounted object */ + DECLARE_VMA_BIT(NORESERVE, 21), /* should the VM suppress accounting */ + DECLARE_VMA_BIT(HUGETLB, 22), /* Huge TLB Page VM */ + DECLARE_VMA_BIT(SYNC, 23), /* Synchronous page faults */ + DECLARE_VMA_BIT(ARCH_1, 24), /* Architecture-specific flag */ + DECLARE_VMA_BIT(WIPEONFORK, 25),/* Wipe VMA contents in child. */ + DECLARE_VMA_BIT(DONTDUMP, 26), /* Do not include in the core dump */ + DECLARE_VMA_BIT(SOFTDIRTY, 27), /* NOT soft dirty clean area */ + DECLARE_VMA_BIT(MIXEDMAP, 28), /* Can contain struct page and pure PFN pages */ + DECLARE_VMA_BIT(HUGEPAGE, 29), /* MADV_HUGEPAGE marked this vma */ + DECLARE_VMA_BIT(NOHUGEPAGE, 30),/* MADV_NOHUGEPAGE marked this vma */ + DECLARE_VMA_BIT(MERGEABLE, 31), /* KSM may merge identical pages */ + /* These bits are reused, we define specific uses below. */ + DECLARE_VMA_BIT(HIGH_ARCH_0, 32), + DECLARE_VMA_BIT(HIGH_ARCH_1, 33), + DECLARE_VMA_BIT(HIGH_ARCH_2, 34), + DECLARE_VMA_BIT(HIGH_ARCH_3, 35), + DECLARE_VMA_BIT(HIGH_ARCH_4, 36), + DECLARE_VMA_BIT(HIGH_ARCH_5, 37), + DECLARE_VMA_BIT(HIGH_ARCH_6, 38), + /* + * This flag is used to connect VFIO to arch specific KVM code. It + * indicates that the memory under this VMA is safe for use with any + * non-cachable memory type inside KVM. Some VFIO devices, on some + * platforms, are thought to be unsafe and can cause machine crashes + * if KVM does not lock down the memory type. + */ + DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39), +#ifdef CONFIG_PPC32 + DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1), +#else + DECLARE_VMA_BIT(DROPPABLE, 40), +#endif + DECLARE_VMA_BIT(UFFD_MINOR, 41), + DECLARE_VMA_BIT(SEALED, 42), + /* Flags that reuse flags above. */ + DECLARE_VMA_BIT_ALIAS(PKEY_BIT0, HIGH_ARCH_0), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT1, HIGH_ARCH_1), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT2, HIGH_ARCH_2), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT3, HIGH_ARCH_3), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT4, HIGH_ARCH_4), +#if defined(CONFIG_X86_USER_SHADOW_STACK) + /* + * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of + * support core mm. + * + * These VMAs will get a single end guard page. This helps userspace + * protect itself from attacks. A single page is enough for current + * shadow stack archs (x86). See the comments near alloc_shstk() in + * arch/x86/kernel/shstk.c for more details on the guard size. + */ + DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_5), +#elif defined(CONFIG_ARM64_GCS) + /* + * arm64's Guarded Control Stack implements similar functionality and + * has similar constraints to shadow stacks. + */ + DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_6), +#endif + DECLARE_VMA_BIT_ALIAS(SAO, ARCH_1), /* Strong Access Ordering (powerpc) */ + DECLARE_VMA_BIT_ALIAS(GROWSUP, ARCH_1), /* parisc */ + DECLARE_VMA_BIT_ALIAS(SPARC_ADI, ARCH_1), /* sparc64 */ + DECLARE_VMA_BIT_ALIAS(ARM64_BTI, ARCH_1), /* arm64 */ + DECLARE_VMA_BIT_ALIAS(ARCH_CLEAR, ARCH_1), /* sparc64, arm64 */ + DECLARE_VMA_BIT_ALIAS(MAPPED_COPY, ARCH_1), /* !CONFIG_MMU */ + DECLARE_VMA_BIT_ALIAS(MTE, HIGH_ARCH_4), /* arm64 */ + DECLARE_VMA_BIT_ALIAS(MTE_ALLOWED, HIGH_ARCH_5),/* arm64 */ #ifdef CONFIG_STACK_GROWSUP -#define VM_STACK VM_GROWSUP -#define VM_STACK_EARLY VM_GROWSDOWN + DECLARE_VMA_BIT_ALIAS(STACK, GROWSUP), + DECLARE_VMA_BIT_ALIAS(STACK_EARLY, GROWSDOWN), #else -#define VM_STACK VM_GROWSDOWN -#define VM_STACK_EARLY 0 + DECLARE_VMA_BIT_ALIAS(STACK, GROWSDOWN), #endif +}; + +#define INIT_VM_FLAG(name) BIT((__force int) VMA_ ## name ## _BIT) +#define VM_READ INIT_VM_FLAG(READ) +#define VM_WRITE INIT_VM_FLAG(WRITE) +#define VM_EXEC INIT_VM_FLAG(EXEC) +#define VM_SHARED INIT_VM_FLAG(SHARED) +#define VM_MAYREAD INIT_VM_FLAG(MAYREAD) +#define VM_MAYWRITE INIT_VM_FLAG(MAYWRITE) +#define VM_MAYEXEC INIT_VM_FLAG(MAYEXEC) +#define VM_MAYSHARE INIT_VM_FLAG(MAYSHARE) +#define VM_GROWSDOWN INIT_VM_FLAG(GROWSDOWN) +#ifdef CONFIG_MMU +#define VM_UFFD_MISSING INIT_VM_FLAG(UFFD_MISSING) +#else +#define VM_UFFD_MISSING VM_NONE +#define VM_MAYOVERLAY INIT_VM_FLAG(MAYOVERLAY) +#endif +#define VM_PFNMAP INIT_VM_FLAG(PFNMAP) +#define VM_MAYBE_GUARD INIT_VM_FLAG(MAYBE_GUARD) +#define VM_UFFD_WP INIT_VM_FLAG(UFFD_WP) +#define VM_LOCKED INIT_VM_FLAG(LOCKED) +#define VM_IO INIT_VM_FLAG(IO) +#define VM_SEQ_READ INIT_VM_FLAG(SEQ_READ) +#define VM_RAND_READ INIT_VM_FLAG(RAND_READ) +#define VM_DONTCOPY INIT_VM_FLAG(DONTCOPY) +#define VM_DONTEXPAND INIT_VM_FLAG(DONTEXPAND) +#define VM_LOCKONFAULT INIT_VM_FLAG(LOCKONFAULT) +#define VM_ACCOUNT INIT_VM_FLAG(ACCOUNT) +#define VM_NORESERVE INIT_VM_FLAG(NORESERVE) +#define VM_HUGETLB INIT_VM_FLAG(HUGETLB) +#define VM_SYNC INIT_VM_FLAG(SYNC) +#define VM_ARCH_1 INIT_VM_FLAG(ARCH_1) +#define VM_WIPEONFORK INIT_VM_FLAG(WIPEONFORK) +#define VM_DONTDUMP INIT_VM_FLAG(DONTDUMP) +#ifdef CONFIG_MEM_SOFT_DIRTY +#define VM_SOFTDIRTY INIT_VM_FLAG(SOFTDIRTY) +#else +#define VM_SOFTDIRTY VM_NONE +#endif +#define VM_MIXEDMAP INIT_VM_FLAG(MIXEDMAP) +#define VM_HUGEPAGE INIT_VM_FLAG(HUGEPAGE) +#define VM_NOHUGEPAGE INIT_VM_FLAG(NOHUGEPAGE) +#define VM_MERGEABLE INIT_VM_FLAG(MERGEABLE) +#define VM_STACK INIT_VM_FLAG(STACK) +#ifdef CONFIG_STACK_GROWS_UP +#define VM_STACK_EARLY INIT_VM_FLAG(STACK_EARLY) +#else +#define VM_STACK_EARLY VM_NONE +#endif +#ifdef CONFIG_ARCH_HAS_PKEYS +#define VM_PKEY_SHIFT ((__force int)VMA_HIGH_ARCH_0_BIT) +/* Despite the naming, these are FLAGS not bits. */ +#define VM_PKEY_BIT0 INIT_VM_FLAG(PKEY_BIT0) +#define VM_PKEY_BIT1 INIT_VM_FLAG(PKEY_BIT1) +#define VM_PKEY_BIT2 INIT_VM_FLAG(PKEY_BIT2) +#if CONFIG_ARCH_PKEY_BITS > 3 +#define VM_PKEY_BIT3 INIT_VM_FLAG(PKEY_BIT3) +#else +#define VM_PKEY_BIT3 VM_NONE +#endif /* CONFIG_ARCH_PKEY_BITS > 3 */ +#if CONFIG_ARCH_PKEY_BITS > 4 +#define VM_PKEY_BIT4 INIT_VM_FLAG(PKEY_BIT4) +#else +#define VM_PKEY_BIT4 VM_NONE +#endif /* CONFIG_ARCH_PKEY_BITS > 4 */ +#endif /* CONFIG_ARCH_HAS_PKEYS */ +#if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS) +#define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK) +#else +#define VM_SHADOW_STACK VM_NONE +#endif +#if defined(CONFIG_PPC64) +#define VM_SAO INIT_VM_FLAG(SAO) +#elif defined(CONFIG_PARISC) +#define VM_GROWSUP INIT_VM_FLAG(GROWSUP) +#elif defined(CONFIG_SPARC64) +#define VM_SPARC_ADI INIT_VM_FLAG(SPARC_ADI) +#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) +#elif defined(CONFIG_ARM64) +#define VM_ARM64_BTI INIT_VM_FLAG(ARM64_BTI) +#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) +#elif !defined(CONFIG_MMU) +#define VM_MAPPED_COPY INIT_VM_FLAG(MAPPED_COPY) +#endif +#ifndef VM_GROWSUP +#define VM_GROWSUP VM_NONE +#endif +#ifdef CONFIG_ARM64_MTE +#define VM_MTE INIT_VM_FLAG(MTE) +#define VM_MTE_ALLOWED INIT_VM_FLAG(MTE_ALLOWED) +#else +#define VM_MTE VM_NONE +#define VM_MTE_ALLOWED VM_NONE +#endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR +#define VM_UFFD_MINOR INIT_VM_FLAG(UFFD_MINOR) +#else +#define VM_UFFD_MINOR VM_NONE +#endif +#ifdef CONFIG_64BIT +#define VM_ALLOW_ANY_UNCACHED INIT_VM_FLAG(ALLOW_ANY_UNCACHED) +#define VM_SEALED INIT_VM_FLAG(SEALED) +#else +#define VM_ALLOW_ANY_UNCACHED VM_NONE +#define VM_SEALED VM_NONE +#endif +#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) +#define VM_DROPPABLE INIT_VM_FLAG(DROPPABLE) +#else +#define VM_DROPPABLE VM_NONE +#endif + +/* Bits set in the VMA until the stack is in its final location */ +#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) + +#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) + +/* Common data flag combinations */ +#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \ + VM_MAYWRITE | VM_MAYEXEC) +#define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) + +#ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */ +#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC +#endif + +#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ +#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS +#endif + +#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) + +#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) + +/* VMA basic access permission flags */ +#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) + +/* + * Special vmas that are non-mergable, non-mlock()able. + */ +#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) #define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) #define TASK_SIZE_LOW DEFAULT_MAP_WINDOW @@ -97,26 +326,11 @@ extern unsigned long dac_mmap_min_addr; #define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC - -#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) - -#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS -#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) -#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) - #define RLIMIT_STACK 3 /* max stack size */ #define RLIMIT_MEMLOCK 8 /* max locked-in-memory address space */ #define CAP_IPC_LOCK 14 -#ifdef CONFIG_64BIT -#define VM_SEALED_BIT 42 -#define VM_SEALED BIT(VM_SEALED_BIT) -#else -#define VM_SEALED VM_NONE -#endif - /* * Flags which should be 'sticky' on merge - that is, flags which, when one VMA * possesses it but the other does not, the merged VMA should nonetheless have -- cgit v1.2.3 From 58eac97a8ba0bcfc5dffb347e40ea3006347ff38 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 25 Nov 2025 10:01:00 +0000 Subject: mm: simplify and rename mm flags function for clarity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The __mm_flags_set_word() function is slightly ambiguous - we use 'set' to refer to setting individual bits (such as in mm_flags_set()) but here we use it to refer to overwriting the value altogether. Rename it to __mm_flags_overwrite_word() to eliminate this ambiguity. We additionally simplify the functions, eliminating unnecessary bitmap_xxx() operations (the compiler would have optimised these out but it's worth being as clear as we can be here). Link: https://lkml.kernel.org/r/8f0bc556e1b90eca8ea5eba41f8d5d3f9cd7c98a.1764064557.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Reviewed-by: Pedro Falcato Acked-by: Alice Ryhl [rust] Cc: Alex Gaynor Cc: Alistair Popple Cc: Andreas Hindborg Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Ben Segall Cc: Björn Roy Baron Cc: Boqun Feng Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Danilo Krummrich Cc: David Hildenbrand Cc: David Rientjes Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gary Guo Cc: Gregory Price Cc: "Huang, Ying" Cc: Ingo Molnar Cc: Jann Horn Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Joshua Hahn Cc: Juri Lelli Cc: Kairui Song Cc: Kees Cook Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Trevor Gross Cc: Valentin Schneider Cc: Vincent Guittot Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 14 +++++--------- kernel/fork.c | 4 ++-- 2 files changed, 7 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4f66a3206a63..3550672e0f9e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1314,15 +1314,13 @@ struct mm_struct { unsigned long cpu_bitmap[]; }; -/* Set the first system word of mm flags, non-atomically. */ -static inline void __mm_flags_set_word(struct mm_struct *mm, unsigned long value) +/* Copy value to the first system word of mm flags, non-atomically. */ +static inline void __mm_flags_overwrite_word(struct mm_struct *mm, unsigned long value) { - unsigned long *bitmap = ACCESS_PRIVATE(&mm->flags, __mm_flags); - - bitmap_copy(bitmap, &value, BITS_PER_LONG); + *ACCESS_PRIVATE(&mm->flags, __mm_flags) = value; } -/* Obtain a read-only view of the bitmap. */ +/* Obtain a read-only view of the mm flags bitmap. */ static inline const unsigned long *__mm_flags_get_bitmap(const struct mm_struct *mm) { return (const unsigned long *)ACCESS_PRIVATE(&mm->flags, __mm_flags); @@ -1331,9 +1329,7 @@ static inline const unsigned long *__mm_flags_get_bitmap(const struct mm_struct /* Read the first system word of mm flags, non-atomically. */ static inline unsigned long __mm_flags_get_word(const struct mm_struct *mm) { - const unsigned long *bitmap = __mm_flags_get_bitmap(mm); - - return bitmap_read(bitmap, 0, BITS_PER_LONG); + return *__mm_flags_get_bitmap(mm); } /* diff --git a/kernel/fork.c b/kernel/fork.c index dd0bb5fe4305..5e3309a2332c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1061,10 +1061,10 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, if (current->mm) { unsigned long flags = __mm_flags_get_word(current->mm); - __mm_flags_set_word(mm, mmf_init_legacy_flags(flags)); + __mm_flags_overwrite_word(mm, mmf_init_legacy_flags(flags)); mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK; } else { - __mm_flags_set_word(mm, default_dump_filter); + __mm_flags_overwrite_word(mm, default_dump_filter); mm->def_flags = 0; } -- cgit v1.2.3 From 9ea35a25d51b13013b724943a177a7aaf4bfed71 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 25 Nov 2025 10:01:02 +0000 Subject: mm: introduce VMA flags bitmap type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is useful to transition to using a bitmap for VMA flags so we can avoid running out of flags, especially for 32-bit kernels which are constrained to 32 flags, necessitating some features to be limited to 64-bit kernels only. By doing so, we remove any constraint on the number of VMA flags moving forwards no matter the platform and can decide in future to extend beyond 64 if required. We start by declaring an opaque types, vma_flags_t (which resembles mm_struct flags of type mm_flags_t), setting it to precisely the same size as vm_flags_t, and place it in union with vm_flags in the VMA declaration. We additionally update struct vm_area_desc equivalently placing the new opaque type in union with vm_flags. This change therefore does not impact the size of struct vm_area_struct or struct vm_area_desc. In order for the change to be iterative and to avoid impacting performance, we designate VM_xxx declared bitmap flag values as those which must exist in the first system word of the VMA flags bitmap. We therefore declare vma_flags_clear_all(), vma_flags_overwrite_word(), vma_flags_overwrite_word(), vma_flags_overwrite_word_once(), vma_flags_set_word() and vma_flags_clear_word() in order to allow us to update the existing vm_flags_*() functions to utilise these helpers. This is a stepping stone towards converting users to the VMA flags bitmap and behaves precisely as before. By doing this, we can eliminate the existing private vma->__vm_flags field in the vma->vm_flags union and replace it with the newly introduced opaque type vma_flags, which we call flags so we refer to the new bitmap field as vma->flags. We update vma_flag_[test, set]_atomic() to account for the change also. We adapt vm_flags_reset_once() to only clear those bits above the first system word providing write-once semantics to the first system word (which it is presumed the caller requires - and in all current use cases this is so). As we currently only specify that the VMA flags bitmap size is equal to BITS_PER_LONG number of bits, this is a noop, but is defensive in preparation for a future change that increases this. We additionally update the VMA userland test declarations to implement the same changes there. Finally, we update the rust code to reference vma->vm_flags on update rather than vma->__vm_flags which has been removed. This is safe for now, albeit it is implicitly performing a const cast. Once we introduce flag helpers we can improve this more. No functional change intended. Link: https://lkml.kernel.org/r/bab179d7b153ac12f221b7d65caac2759282cfe9.1764064557.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Reviewed-by: Pedro Falcato Acked-by: Alice Ryhl [rust] Cc: Alex Gaynor Cc: Alistair Popple Cc: Andreas Hindborg Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Ben Segall Cc: Björn Roy Baron Cc: Boqun Feng Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Danilo Krummrich Cc: David Hildenbrand Cc: David Rientjes Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gary Guo Cc: Gregory Price Cc: "Huang, Ying" Cc: Ingo Molnar Cc: Jann Horn Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Joshua Hahn Cc: Juri Lelli Cc: Kairui Song Cc: Kees Cook Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Trevor Gross Cc: Valentin Schneider Cc: Vincent Guittot Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 24 +++++-- include/linux/mm_types.h | 64 ++++++++++++++++- rust/kernel/mm/virt.rs | 2 +- tools/testing/vma/vma_internal.h | 150 +++++++++++++++++++++++++++++++-------- 4 files changed, 202 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index a2f38fb68840..2887d3b34d3e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -911,7 +911,8 @@ static inline void vm_flags_init(struct vm_area_struct *vma, vm_flags_t flags) { VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); - ACCESS_PRIVATE(vma, __vm_flags) = flags; + vma_flags_clear_all(&vma->flags); + vma_flags_overwrite_word(&vma->flags, flags); } /* @@ -931,14 +932,25 @@ static inline void vm_flags_reset_once(struct vm_area_struct *vma, vm_flags_t flags) { vma_assert_write_locked(vma); - WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags); + /* + * If VMA flags exist beyond the first system word, also clear these. It + * is assumed the write once behaviour is required only for the first + * system word. + */ + if (NUM_VMA_FLAG_BITS > BITS_PER_LONG) { + unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags); + + bitmap_zero(&bitmap[1], NUM_VMA_FLAG_BITS - BITS_PER_LONG); + } + + vma_flags_overwrite_word_once(&vma->flags, flags); } static inline void vm_flags_set(struct vm_area_struct *vma, vm_flags_t flags) { vma_start_write(vma); - ACCESS_PRIVATE(vma, __vm_flags) |= flags; + vma_flags_set_word(&vma->flags, flags); } static inline void vm_flags_clear(struct vm_area_struct *vma, @@ -946,7 +958,7 @@ static inline void vm_flags_clear(struct vm_area_struct *vma, { VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); vma_start_write(vma); - ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; + vma_flags_clear_word(&vma->flags, flags); } /* @@ -989,12 +1001,14 @@ static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma, static inline void vma_flag_set_atomic(struct vm_area_struct *vma, vma_flag_t bit) { + unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags); + /* mmap read lock/VMA read lock must be held. */ if (!rwsem_is_locked(&vma->vm_mm->mmap_lock)) vma_assert_locked(vma); if (__vma_flag_atomic_valid(vma, bit)) - set_bit((__force int)bit, &ACCESS_PRIVATE(vma, __vm_flags)); + set_bit((__force int)bit, bitmap); } /* diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3550672e0f9e..b71625378ce3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -848,6 +848,15 @@ struct mmap_action { bool hide_from_rmap_until_complete :1; }; +/* + * Opaque type representing current VMA (vm_area_struct) flag state. Must be + * accessed via vma_flags_xxx() helper functions. + */ +#define NUM_VMA_FLAG_BITS BITS_PER_LONG +typedef struct { + DECLARE_BITMAP(__vma_flags, NUM_VMA_FLAG_BITS); +} __private vma_flags_t; + /* * Describes a VMA that is about to be mmap()'ed. Drivers may choose to * manipulate mutable fields which will cause those fields to be updated in the @@ -865,7 +874,10 @@ struct vm_area_desc { /* Mutable fields. Populated with initial state. */ pgoff_t pgoff; struct file *vm_file; - vm_flags_t vm_flags; + union { + vm_flags_t vm_flags; + vma_flags_t vma_flags; + }; pgprot_t page_prot; /* Write-only fields. */ @@ -910,10 +922,12 @@ struct vm_area_struct { /* * Flags, see mm.h. * To modify use vm_flags_{init|reset|set|clear|mod} functions. + * Preferably, use vma_flags_xxx() functions. */ union { + /* Temporary while VMA flags are being converted. */ const vm_flags_t vm_flags; - vm_flags_t __private __vm_flags; + vma_flags_t flags; }; #ifdef CONFIG_PER_VMA_LOCK @@ -994,6 +1008,52 @@ struct vm_area_struct { #endif } __randomize_layout; +/* Clears all bits in the VMA flags bitmap, non-atomically. */ +static inline void vma_flags_clear_all(vma_flags_t *flags) +{ + bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS); +} + +/* + * Copy value to the first system word of VMA flags, non-atomically. + * + * IMPORTANT: This does not overwrite bytes past the first system word. The + * caller must account for this. + */ +static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value) +{ + *ACCESS_PRIVATE(flags, __vma_flags) = value; +} + +/* + * Copy value to the first system word of VMA flags ONCE, non-atomically. + * + * IMPORTANT: This does not overwrite bytes past the first system word. The + * caller must account for this. + */ +static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + WRITE_ONCE(*bitmap, value); +} + +/* Update the first system word of VMA flags setting bits, non-atomically. */ +static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + *bitmap |= value; +} + +/* Update the first system word of VMA flags clearing bits, non-atomically. */ +static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + *bitmap &= ~value; +} + #ifdef CONFIG_NUMA #define vma_policy(vma) ((vma)->vm_policy) #else diff --git a/rust/kernel/mm/virt.rs b/rust/kernel/mm/virt.rs index a1bfa4e19293..da21d65ccd20 100644 --- a/rust/kernel/mm/virt.rs +++ b/rust/kernel/mm/virt.rs @@ -250,7 +250,7 @@ impl VmaNew { // SAFETY: This is not a data race: the vma is undergoing initial setup, so it's not yet // shared. Additionally, `VmaNew` is `!Sync`, so it cannot be used to write in parallel. // The caller promises that this does not set the flags to an invalid value. - unsafe { (*self.as_ptr()).__bindgen_anon_2.__vm_flags = flags }; + unsafe { (*self.as_ptr()).__bindgen_anon_2.vm_flags = flags }; } /// Set the `VM_MIXEDMAP` flag on this vma. diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index b7e8fc9ccdf4..9f0a9f5ed0fe 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -524,6 +524,15 @@ typedef struct { __private DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS); } mm_flags_t; +/* + * Opaque type representing current VMA (vm_area_struct) flag state. Must be + * accessed via vma_flags_xxx() helper functions. + */ +#define NUM_VMA_FLAG_BITS BITS_PER_LONG +typedef struct { + DECLARE_BITMAP(__vma_flags, NUM_VMA_FLAG_BITS); +} __private vma_flags_t; + struct mm_struct { struct maple_tree mm_mt; int map_count; /* number of VMAs */ @@ -608,7 +617,10 @@ struct vm_area_desc { /* Mutable fields. Populated with initial state. */ pgoff_t pgoff; struct file *vm_file; - vm_flags_t vm_flags; + union { + vm_flags_t vm_flags; + vma_flags_t vma_flags; + }; pgprot_t page_prot; /* Write-only fields. */ @@ -654,7 +666,7 @@ struct vm_area_struct { */ union { const vm_flags_t vm_flags; - vm_flags_t __private __vm_flags; + vma_flags_t flags; }; #ifdef CONFIG_PER_VMA_LOCK @@ -1368,26 +1380,6 @@ static inline bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, return true; } -static inline void vm_flags_init(struct vm_area_struct *vma, - vm_flags_t flags) -{ - vma->__vm_flags = flags; -} - -static inline void vm_flags_set(struct vm_area_struct *vma, - vm_flags_t flags) -{ - vma_start_write(vma); - vma->__vm_flags |= flags; -} - -static inline void vm_flags_clear(struct vm_area_struct *vma, - vm_flags_t flags) -{ - vma_start_write(vma); - vma->__vm_flags &= ~flags; -} - static inline int shmem_zero_setup(struct vm_area_struct *vma) { return 0; @@ -1544,13 +1536,118 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm, { } -# define ACCESS_PRIVATE(p, member) ((p)->member) +#define ACCESS_PRIVATE(p, member) ((p)->member) + +#define bitmap_size(nbits) (ALIGN(nbits, BITS_PER_LONG) / BITS_PER_BYTE) + +static __always_inline void bitmap_zero(unsigned long *dst, unsigned int nbits) +{ + unsigned int len = bitmap_size(nbits); + + if (small_const_nbits(nbits)) + *dst = 0; + else + memset(dst, 0, len); +} static inline bool mm_flags_test(int flag, const struct mm_struct *mm) { return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } +/* Clears all bits in the VMA flags bitmap, non-atomically. */ +static inline void vma_flags_clear_all(vma_flags_t *flags) +{ + bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS); +} + +/* + * Copy value to the first system word of VMA flags, non-atomically. + * + * IMPORTANT: This does not overwrite bytes past the first system word. The + * caller must account for this. + */ +static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value) +{ + *ACCESS_PRIVATE(flags, __vma_flags) = value; +} + +/* + * Copy value to the first system word of VMA flags ONCE, non-atomically. + * + * IMPORTANT: This does not overwrite bytes past the first system word. The + * caller must account for this. + */ +static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + WRITE_ONCE(*bitmap, value); +} + +/* Update the first system word of VMA flags setting bits, non-atomically. */ +static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + *bitmap |= value; +} + +/* Update the first system word of VMA flags clearing bits, non-atomically. */ +static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + *bitmap &= ~value; +} + + +/* Use when VMA is not part of the VMA tree and needs no locking */ +static inline void vm_flags_init(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_flags_clear_all(&vma->flags); + vma_flags_overwrite_word(&vma->flags, flags); +} + +/* + * Use when VMA is part of the VMA tree and modifications need coordination + * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and + * it should be locked explicitly beforehand. + */ +static inline void vm_flags_reset(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_assert_write_locked(vma); + vm_flags_init(vma, flags); +} + +static inline void vm_flags_reset_once(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_assert_write_locked(vma); + /* + * The user should only be interested in avoiding reordering of + * assignment to the first word. + */ + vma_flags_clear_all(&vma->flags); + vma_flags_overwrite_word_once(&vma->flags, flags); +} + +static inline void vm_flags_set(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_start_write(vma); + vma_flags_set_word(&vma->flags, flags); +} + +static inline void vm_flags_clear(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_start_write(vma); + vma_flags_clear_word(&vma->flags, flags); +} + /* * Denies creating a writable executable mapping or gaining executable permissions. * @@ -1763,11 +1860,4 @@ static inline int do_munmap(struct mm_struct *, unsigned long, size_t, return 0; } -static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_t flags) -{ - vm_flags_t *dst = (vm_flags_t *)(&vma->vm_flags); - - *dst = flags; -} - #endif /* __MM_VMA_INTERNAL_H */ -- cgit v1.2.3 From f3b566d726357df591602f195a9379494f005225 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Wed, 26 Nov 2025 02:04:35 +0000 Subject: memcg: remove inc/dec_lruvec_kmem_state helpers The dec_lruvec_kmem_state helper is unused by any caller and can be safely removed. Meanwhile, the inc_lruvec_kmem_state helper is only referenced by shadow_lru_isolate, retaining these two helpers is unnecessary. This patch removes both helper functions to eliminate redundant code. Link: https://lkml.kernel.org/r/20251126020435.1511637-1-chenridong@huaweicloud.com Signed-off-by: Chen Ridong Acked-by: Qi Zheng Acked-by: Shakeel Butt Cc: Axel Rasmussen Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Lu Jialin Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 10 ---------- mm/workingset.c | 2 +- 2 files changed, 1 insertion(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d35390f9892a..0651865a4564 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1452,16 +1452,6 @@ struct slabobj_ext { #endif } __aligned(8); -static inline void inc_lruvec_kmem_state(void *p, enum node_stat_item idx) -{ - mod_lruvec_kmem_state(p, idx, 1); -} - -static inline void dec_lruvec_kmem_state(void *p, enum node_stat_item idx) -{ - mod_lruvec_kmem_state(p, idx, -1); -} - static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) { struct mem_cgroup *memcg; diff --git a/mm/workingset.c b/mm/workingset.c index 6ff30369b758..1399d6da75a2 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -749,7 +749,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, if (WARN_ON_ONCE(node->count != node->nr_values)) goto out_invalid; xa_delete_node(node, workingset_update_node); - inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM); + mod_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM, 1); out_invalid: xa_unlock_irq(&mapping->i_pages); -- cgit v1.2.3