From 3d3544a6c996e88bb793bb6b2665c3e3f674f5eb Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 13 Apr 2026 11:57:13 +0100 Subject: mm/vma: remove __vma_check_mmap_hook() Commit c50ca15dd496 ("mm: add vm_ops->mapped hook") introduced __vma_check_mmap_hook() in order to assert that a driver doesn't incorrectly implement both an f_op->mmap() and a vm_ops->mapped hook, the latter of which would not ultimately get invoked. However, this did not correctly account for stacked drivers (or drivers that otherwise use the compatibility layer) which might recursively call an mmap_prepare hook via the compatibility layer. Thus the nested mmap_prepare() invocation might result in a VMA which has vm_ops->mapped set with an overlaying mmap() hook, causing the __vma_check_mmap_hook() to fail in vfs_mmap(), wrongly failing the operation. This patch resolves this by simply removing the check, as we can't be certain that an mmap() hook doesn't at some point invoke the compatibility layer, and it's not worth trying to track it. Link: https://lore.kernel.org/20260413105713.92625-1-ljs@kernel.org Fixes: c50ca15dd496 ("mm: add vm_ops->mapped hook") Reported-by: Shinichiro Kawasaki Closes: https://lore.kernel.org/all/adx2ws5z0NMIe5Yj@shinmob/ Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka (SUSE) Tested-by: Shinichiro Kawasaki Cc: Al Viro Cc: Christian Brauner Cc: David Hildenbrand Cc: Jan Kara Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/fs.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0bdccfa70b44..f3ca9b841892 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2062,20 +2062,13 @@ void compat_set_desc_from_vma(struct vm_area_desc *desc, const struct file *file const struct vm_area_struct *vma); int __compat_vma_mmap(struct vm_area_desc *desc, struct vm_area_struct *vma); int compat_vma_mmap(struct file *file, struct vm_area_struct *vma); -int __vma_check_mmap_hook(struct vm_area_struct *vma); static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) { - int err; - if (file->f_op->mmap_prepare) return compat_vma_mmap(file, vma); - err = file->f_op->mmap(file, vma); - if (err) - return err; - - return __vma_check_mmap_hook(vma); + return file->f_op->mmap(file, vma); } static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc) -- cgit v1.2.3 From db128b2c6b7d0c9b514327a0873425bbf18e739b Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 5 Mar 2026 19:52:21 +0800 Subject: mm: rename unlock_page_lruvec_irq and its variants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is inappropriate to use folio_lruvec_lock() variants in conjunction with unlock_page_lruvec() variants, as this involves the inconsistent operation of locking a folio while unlocking a page. To rectify this, the functions unlock_page_lruvec{_irq, _irqrestore} are renamed to lruvec_unlock{_irq,_irqrestore}. 
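For illustration, a minimal sketch of the resulting naming symmetry, using a hypothetical caller (example_touch_lru() is not part of this patch):

	static void example_touch_lru(struct folio *folio)
	{
		struct lruvec *lruvec;

		/* take the folio's lruvec lock, disabling interrupts */
		lruvec = folio_lruvec_lock_irq(folio);

		/* ... manipulate the folio's place on the LRU lists ... */

		/* release with the matching helper, formerly unlock_page_lruvec_irq() */
		lruvec_unlock_irq(lruvec);
	}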
Link: https://lore.kernel.org/4e5e05271a250df4d1812e1832be65636a78c957.1772711148.git.zhengqi.arch@bytedance.com Signed-off-by: Muchun Song Signed-off-by: Qi Zheng Acked-by: Roman Gushchin Acked-by: Johannes Weiner Reviewed-by: Harry Yoo Reviewed-by: Chen Ridong Acked-by: David Hildenbrand (Red Hat) Acked-by: Shakeel Butt Cc: Allen Pais Cc: Axel Rasmussen Cc: Baoquan He Cc: Chengming Zhou Cc: Hamza Mahfooz Cc: Hugh Dickins Cc: Imran Khan Cc: Kamalesh Babulal Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Suren Baghdasaryan Cc: Usama Arif Cc: Vlastimil Babka Cc: Wei Xu Cc: Yosry Ahmed Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 10 +++++----- mm/compaction.c | 14 +++++++------- mm/huge_memory.c | 2 +- mm/mlock.c | 2 +- mm/swap.c | 12 ++++++------ mm/vmscan.c | 4 ++-- 6 files changed, 22 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5173a9f16721..6e88288e90d8 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1479,17 +1479,17 @@ static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec)); } -static inline void unlock_page_lruvec(struct lruvec *lruvec) +static inline void lruvec_unlock(struct lruvec *lruvec) { spin_unlock(&lruvec->lru_lock); } -static inline void unlock_page_lruvec_irq(struct lruvec *lruvec) +static inline void lruvec_unlock_irq(struct lruvec *lruvec) { spin_unlock_irq(&lruvec->lru_lock); } -static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec, +static inline void lruvec_unlock_irqrestore(struct lruvec *lruvec, unsigned long flags) { spin_unlock_irqrestore(&lruvec->lru_lock, flags); @@ -1511,7 +1511,7 @@ static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio, if (folio_matches_lruvec(folio, locked_lruvec)) return locked_lruvec; - unlock_page_lruvec_irq(locked_lruvec); + lruvec_unlock_irq(locked_lruvec); } return folio_lruvec_lock_irq(folio); @@ -1525,7 +1525,7 @@ static inline void folio_lruvec_relock_irqsave(struct folio *folio, if (folio_matches_lruvec(folio, *lruvecp)) return; - unlock_page_lruvec_irqrestore(*lruvecp, *flags); + lruvec_unlock_irqrestore(*lruvecp, *flags); } *lruvecp = folio_lruvec_lock_irqsave(folio, flags); diff --git a/mm/compaction.c b/mm/compaction.c index 1e8f8eca318c..c3e338aaa0ff 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -913,7 +913,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, */ if (!(low_pfn % COMPACT_CLUSTER_MAX)) { if (locked) { - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); locked = NULL; } @@ -964,7 +964,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } /* for alloc_contig case */ if (locked) { - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); locked = NULL; } @@ -1053,7 +1053,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (unlikely(page_has_movable_ops(page)) && !PageMovableOpsIsolated(page)) { if (locked) { - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); locked = NULL; } @@ -1158,7 +1158,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* If we already hold the lock, we can skip some rechecking */ if (lruvec != 
locked) { if (locked) - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); compact_lock_irqsave(&lruvec->lru_lock, &flags, cc); locked = lruvec; @@ -1226,7 +1226,7 @@ isolate_success_no_list: isolate_fail_put: /* Avoid potential deadlock in freeing page under lru_lock */ if (locked) { - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); locked = NULL; } folio_put(folio); @@ -1242,7 +1242,7 @@ isolate_fail: */ if (nr_isolated) { if (locked) { - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); locked = NULL; } putback_movable_pages(&cc->migratepages); @@ -1274,7 +1274,7 @@ isolate_fail: isolate_abort: if (locked) - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); if (folio) { folio_set_lru(folio); folio_put(folio); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 42c983821c03..958b580c6619 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3994,7 +3994,7 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n folio_ref_unfreeze(folio, folio_cache_ref_count(folio) + 1); if (do_lru) - unlock_page_lruvec(lruvec); + lruvec_unlock(lruvec); if (ci) swap_cluster_unlock(ci); diff --git a/mm/mlock.c b/mm/mlock.c index fdbd1434a35f..8c227fefa2df 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -205,7 +205,7 @@ static void mlock_folio_batch(struct folio_batch *fbatch) } if (lruvec) - unlock_page_lruvec_irq(lruvec); + lruvec_unlock_irq(lruvec); folios_put(fbatch); } diff --git a/mm/swap.c b/mm/swap.c index 78b4aa811fc6..23df893e2ed7 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -91,7 +91,7 @@ static void page_cache_release(struct folio *folio) __page_cache_release(folio, &lruvec, &flags); if (lruvec) - unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec_unlock_irqrestore(lruvec, flags); } void __folio_put(struct folio *folio) @@ -175,7 +175,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) } if (lruvec) - unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec_unlock_irqrestore(lruvec, flags); folios_put(fbatch); } @@ -349,7 +349,7 @@ void folio_activate(struct folio *folio) lruvec = folio_lruvec_lock_irq(folio); lru_activate(lruvec, folio); - unlock_page_lruvec_irq(lruvec); + lruvec_unlock_irq(lruvec); folio_set_lru(folio); } #endif @@ -963,7 +963,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) if (folio_is_zone_device(folio)) { if (lruvec) { - unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec_unlock_irqrestore(lruvec, flags); lruvec = NULL; } if (folio_ref_sub_and_test(folio, nr_refs)) @@ -977,7 +977,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) /* hugetlb has its own memcg */ if (folio_test_hugetlb(folio)) { if (lruvec) { - unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec_unlock_irqrestore(lruvec, flags); lruvec = NULL; } free_huge_folio(folio); @@ -991,7 +991,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) j++; } if (lruvec) - unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec_unlock_irqrestore(lruvec, flags); if (!j) { folio_batch_reinit(folios); return; diff --git a/mm/vmscan.c b/mm/vmscan.c index 4bf091b1c8af..88bb3337e5eb 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1831,7 +1831,7 @@ bool folio_isolate_lru(struct folio *folio) folio_get(folio); lruvec = folio_lruvec_lock_irq(folio); lruvec_del_folio(lruvec, folio); - unlock_page_lruvec_irq(lruvec); + lruvec_unlock_irq(lruvec); ret 
= true; } @@ -7898,7 +7898,7 @@ void check_move_unevictable_folios(struct folio_batch *fbatch) if (lruvec) { __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); - unlock_page_lruvec_irq(lruvec); + lruvec_unlock_irq(lruvec); } else if (pgscanned) { count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); } -- cgit v1.2.3 From d5aa8c1d136e7de89defb06f42f8108992967a70 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 5 Mar 2026 19:52:25 +0800 Subject: mm: memcontrol: return root object cgroup for root memory cgroup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Memory cgroup functions such as get_mem_cgroup_from_folio() and get_mem_cgroup_from_mm() return a valid memory cgroup pointer, even for the root memory cgroup. In contrast, the situation for object cgroups has been different. Previously, the root object cgroup couldn't be returned because it didn't exist. Now that a valid root object cgroup exists, for the sake of consistency, it's necessary to align the behavior of object-cgroup-related operations with that of memory cgroup APIs. Link: https://lore.kernel.org/e9c3f40ba7681d9753372d4ee2ac7a0216848b95.1772711148.git.zhengqi.arch@bytedance.com Signed-off-by: Muchun Song Signed-off-by: Qi Zheng Acked-by: Johannes Weiner Acked-by: Shakeel Butt Reviewed-by: Harry Yoo Cc: Allen Pais Cc: Axel Rasmussen Cc: Baoquan He Cc: Chengming Zhou Cc: Chen Ridong Cc: David Hildenbrand Cc: Hamza Mahfooz Cc: Hugh Dickins Cc: Imran Khan Cc: Kamalesh Babulal Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Suren Baghdasaryan Cc: Usama Arif Cc: Vlastimil Babka Cc: Wei Xu Cc: Yosry Ahmed Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 26 ++++++++++++++++++++------ mm/memcontrol.c | 45 ++++++++++++++++++++++++--------------------- mm/percpu.c | 2 +- 3 files changed, 45 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6e88288e90d8..9a015258a2ff 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -332,6 +332,7 @@ struct mem_cgroup { #define MEMCG_CHARGE_BATCH 64U extern struct mem_cgroup *root_mem_cgroup; +extern struct obj_cgroup *root_obj_cgroup; enum page_memcg_data_flags { /* page->memcg_data is a pointer to an slabobj_ext vector */ @@ -548,6 +549,11 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) return (memcg == root_mem_cgroup); } +static inline bool obj_cgroup_is_root(const struct obj_cgroup *objcg) +{ + return objcg == root_obj_cgroup; +} + static inline bool mem_cgroup_disabled(void) { return !cgroup_subsys_enabled(memory_cgrp_subsys); @@ -774,23 +780,26 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ static inline bool obj_cgroup_tryget(struct obj_cgroup *objcg) { + if (obj_cgroup_is_root(objcg)) + return true; return percpu_ref_tryget(&objcg->refcnt); } -static inline void obj_cgroup_get(struct obj_cgroup *objcg) +static inline void obj_cgroup_get_many(struct obj_cgroup *objcg, + unsigned long nr) { - percpu_ref_get(&objcg->refcnt); + if (!obj_cgroup_is_root(objcg)) + percpu_ref_get_many(&objcg->refcnt, nr); } -static inline void obj_cgroup_get_many(struct obj_cgroup *objcg, - unsigned long nr) +static inline void obj_cgroup_get(struct obj_cgroup *objcg) { - percpu_ref_get_many(&objcg->refcnt, nr); + 
obj_cgroup_get_many(objcg, 1); } static inline void obj_cgroup_put(struct obj_cgroup *objcg) { - if (objcg) + if (objcg && !obj_cgroup_is_root(objcg)) percpu_ref_put(&objcg->refcnt); } @@ -1087,6 +1096,11 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) return true; } +static inline bool obj_cgroup_is_root(const struct obj_cgroup *objcg) +{ + return true; +} + static inline bool mem_cgroup_disabled(void) { return true; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2cb2d66579d3..e7022adcea7f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -83,6 +83,8 @@ EXPORT_SYMBOL(memory_cgrp_subsys); struct mem_cgroup *root_mem_cgroup __read_mostly; EXPORT_SYMBOL(root_mem_cgroup); +struct obj_cgroup *root_obj_cgroup __read_mostly; + /* Active memory cgroup to use from an interrupt context */ DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg); EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg); @@ -2693,15 +2695,14 @@ struct mem_cgroup *mem_cgroup_from_virt(void *p) static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) { - struct obj_cgroup *objcg = NULL; + for (; memcg; memcg = parent_mem_cgroup(memcg)) { + struct obj_cgroup *objcg = rcu_dereference(memcg->objcg); - for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { - objcg = rcu_dereference(memcg->objcg); if (likely(objcg && obj_cgroup_tryget(objcg))) - break; - objcg = NULL; + return objcg; } - return objcg; + + return NULL; } static struct obj_cgroup *current_objcg_update(void) @@ -2775,18 +2776,17 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void) * Objcg reference is kept by the task, so it's safe * to use the objcg by the current task. */ - return objcg; + return objcg ? : root_obj_cgroup; } memcg = this_cpu_read(int_active_memcg); if (unlikely(memcg)) goto from_memcg; - return NULL; + return root_obj_cgroup; from_memcg: - objcg = NULL; - for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { + for (; memcg; memcg = parent_mem_cgroup(memcg)) { /* * Memcg pointer is protected by scope (see set_active_memcg()) * and is pinning the corresponding objcg, so objcg can't go @@ -2795,10 +2795,10 @@ from_memcg: */ objcg = rcu_dereference_check(memcg->objcg, 1); if (likely(objcg)) - break; + return objcg; } - return objcg; + return root_obj_cgroup; } struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio) @@ -2812,14 +2812,8 @@ struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio) objcg = __folio_objcg(folio); obj_cgroup_get(objcg); } else { - struct mem_cgroup *memcg; - rcu_read_lock(); - memcg = __folio_memcg(folio); - if (memcg) - objcg = __get_obj_cgroup_from_memcg(memcg); - else - objcg = NULL; + objcg = __get_obj_cgroup_from_memcg(__folio_memcg(folio)); rcu_read_unlock(); } return objcg; @@ -2922,7 +2916,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) int ret = 0; objcg = current_obj_cgroup(); - if (objcg) { + if (objcg && !obj_cgroup_is_root(objcg)) { ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order); if (!ret) { obj_cgroup_get(objcg); @@ -3251,7 +3245,7 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, * obj_cgroup_get() is used to get a permanent reference. 
*/ objcg = current_obj_cgroup(); - if (!objcg) + if (!objcg || obj_cgroup_is_root(objcg)) return true; /* @@ -3927,6 +3921,9 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) if (!objcg) goto free_shrinker; + if (unlikely(mem_cgroup_is_root(memcg))) + root_obj_cgroup = objcg; + objcg->memcg = memcg; rcu_assign_pointer(memcg->objcg, objcg); obj_cgroup_get(objcg); @@ -5551,6 +5548,9 @@ void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size) if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return; + if (obj_cgroup_is_root(objcg)) + return; + VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC)); /* PF_MEMALLOC context, charging must succeed */ @@ -5580,6 +5580,9 @@ void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return; + if (obj_cgroup_is_root(objcg)) + return; + obj_cgroup_uncharge(objcg, size); rcu_read_lock(); diff --git a/mm/percpu.c b/mm/percpu.c index a2107bdebf0b..b0676b8054ed 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1622,7 +1622,7 @@ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, return true; objcg = current_obj_cgroup(); - if (!objcg) + if (!objcg || obj_cgroup_is_root(objcg)) return true; if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) -- cgit v1.2.3 From 49717c7bd6b8e14329c2d04b1e8ec691175b6f4e Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 5 Mar 2026 19:52:28 +0800 Subject: writeback: prevent memory cgroup release in writeback module MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the near future, a folio will no longer pin its corresponding memory cgroup. To ensure safety, it will only be appropriate to hold the rcu read lock or acquire a reference to the memory cgroup returned by folio_memcg(), thereby preventing it from being released. In the current patch, the function get_mem_cgroup_css_from_folio() and the rcu read lock are employed to safeguard against the release of the memory cgroup. This serves as a preparatory measure for the reparenting of the LRU pages. 
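As a rough sketch of the calling convention this establishes (the caller below is hypothetical and only illustrates the reference counting): the css returned by get_mem_cgroup_css_from_folio() is pinned and must be dropped with css_put() once the caller is done with it.

	static void example_account_folio(struct folio *folio)
	{
		struct cgroup_subsys_state *css;

		/* acquires a reference, so the memcg cannot be released underneath us */
		css = get_mem_cgroup_css_from_folio(folio);

		/* ... e.g. inspect css->id or pass css to wb_get_create() ... */

		/* drop the reference taken above */
		css_put(css);
	}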
Link: https://lore.kernel.org/645f99bc344575417f67def3744f975596df2793.1772711148.git.zhengqi.arch@bytedance.com Signed-off-by: Muchun Song Signed-off-by: Qi Zheng Reviewed-by: Harry Yoo Acked-by: Johannes Weiner Acked-by: Shakeel Butt Cc: Allen Pais Cc: Axel Rasmussen Cc: Baoquan He Cc: Chengming Zhou Cc: Chen Ridong Cc: David Hildenbrand Cc: Hamza Mahfooz Cc: Hugh Dickins Cc: Imran Khan Cc: Kamalesh Babulal Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Suren Baghdasaryan Cc: Usama Arif Cc: Vlastimil Babka Cc: Wei Xu Cc: Yosry Ahmed Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- fs/fs-writeback.c | 22 +++++++++++----------- include/linux/memcontrol.h | 9 +++++++-- include/trace/events/writeback.h | 3 +++ mm/memcontrol.c | 14 ++++++++------ 4 files changed, 29 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 7c75ed7e8979..c3442a38450c 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -280,15 +280,13 @@ void __inode_attach_wb(struct inode *inode, struct folio *folio) if (inode_cgwb_enabled(inode)) { struct cgroup_subsys_state *memcg_css; - if (folio) { - memcg_css = mem_cgroup_css_from_folio(folio); - wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); - } else { - /* must pin memcg_css, see wb_get_create() */ + /* must pin memcg_css, see wb_get_create() */ + if (folio) + memcg_css = get_mem_cgroup_css_from_folio(folio); + else memcg_css = task_get_css(current, memory_cgrp_id); - wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); - css_put(memcg_css); - } + wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + css_put(memcg_css); } if (!wb) @@ -979,16 +977,16 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio if (!wbc->wb || wbc->no_cgroup_owner) return; - css = mem_cgroup_css_from_folio(folio); + css = get_mem_cgroup_css_from_folio(folio); /* dead cgroups shouldn't contribute to inode ownership arbitration */ if (!css_is_online(css)) - return; + goto out; id = css->id; if (id == wbc->wb_id) { wbc->wb_bytes += bytes; - return; + goto out; } if (id == wbc->wb_lcand_id) @@ -1001,6 +999,8 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio wbc->wb_tcand_bytes += bytes; else wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes); +out: + css_put(css); } EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9a015258a2ff..4454f03a4acf 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -894,7 +894,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm, return match; } -struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio); +struct cgroup_subsys_state *get_mem_cgroup_css_from_folio(struct folio *folio); ino_t page_cgroup_ino(struct page *page); static inline bool mem_cgroup_online(struct mem_cgroup *memcg) @@ -1563,9 +1563,14 @@ static inline void mem_cgroup_track_foreign_dirty(struct folio *folio, if (mem_cgroup_disabled()) return; + if (!folio_memcg_charged(folio)) + return; + + rcu_read_lock(); memcg = folio_memcg(folio); - if (unlikely(memcg && &memcg->css != wb->memcg_css)) + if (unlikely(&memcg->css != wb->memcg_css)) mem_cgroup_track_foreign_dirty_slowpath(folio, wb); + rcu_read_unlock(); } void mem_cgroup_flush_foreign(struct bdi_writeback *wb); diff --git a/include/trace/events/writeback.h 
b/include/trace/events/writeback.h index 4d3d8c8f3a1b..b849b8cc96b1 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -294,7 +294,10 @@ TRACE_EVENT(track_foreign_dirty, __entry->ino = inode ? inode->i_ino : 0; __entry->memcg_id = wb->memcg_css->id; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); + + rcu_read_lock(); __entry->page_cgroup_ino = cgroup_ino(folio_memcg(folio)->css.cgroup); + rcu_read_unlock(); ), TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu", diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dbcf0d2bf114..d7d4b44c5af5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -243,7 +243,7 @@ DEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key); EXPORT_SYMBOL(memcg_bpf_enabled_key); /** - * mem_cgroup_css_from_folio - css of the memcg associated with a folio + * get_mem_cgroup_css_from_folio - acquire a css of the memcg associated with a folio * @folio: folio of interest * * If memcg is bound to the default hierarchy, css of the memcg associated @@ -253,14 +253,16 @@ EXPORT_SYMBOL(memcg_bpf_enabled_key); * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup * is returned. */ -struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio) +struct cgroup_subsys_state *get_mem_cgroup_css_from_folio(struct folio *folio) { - struct mem_cgroup *memcg = folio_memcg(folio); + struct mem_cgroup *memcg; - if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) - memcg = root_mem_cgroup; + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return &root_mem_cgroup->css; - return &memcg->css; + memcg = get_mem_cgroup_from_folio(folio); + + return memcg ? &memcg->css : &root_mem_cgroup->css; } /** -- cgit v1.2.3 From f995da5341c1854e59415c2c2c6f0b6406b498f2 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 5 Mar 2026 19:52:29 +0800 Subject: mm: memcontrol: prevent memory cgroup release in count_memcg_folio_events() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the near future, a folio will no longer pin its corresponding memory cgroup. To ensure safety, it will only be appropriate to hold the rcu read lock or acquire a reference to the memory cgroup returned by folio_memcg(), thereby preventing it from being released. In the current patch, the rcu read lock is employed to safeguard against the release of the memory cgroup in count_memcg_folio_events(). This serves as a preparatory measure for the reparenting of the LRU pages. 
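The same rule applies to any other helper that dereferences folio_memcg(); a minimal sketch of such a (hypothetical) helper, following the pattern used below:

	static inline void example_count_folio_event(struct folio *folio,
						     enum vm_event_item idx)
	{
		struct mem_cgroup *memcg;

		if (!folio_memcg_charged(folio))
			return;

		rcu_read_lock();
		memcg = folio_memcg(folio);	/* only stable while under RCU */
		count_memcg_events(memcg, idx, 1);
		rcu_read_unlock();
	}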
Link: https://lore.kernel.org/dea6aa0389367f7fd6b715c8837a2cf7506bd889.1772711148.git.zhengqi.arch@bytedance.com Signed-off-by: Muchun Song Signed-off-by: Qi Zheng Reviewed-by: Harry Yoo Acked-by: Johannes Weiner Acked-by: Shakeel Butt Cc: Allen Pais Cc: Axel Rasmussen Cc: Baoquan He Cc: Chengming Zhou Cc: Chen Ridong Cc: David Hildenbrand Cc: Hamza Mahfooz Cc: Hugh Dickins Cc: Imran Khan Cc: Kamalesh Babulal Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Suren Baghdasaryan Cc: Usama Arif Cc: Vlastimil Babka Cc: Wei Xu Cc: Yosry Ahmed Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 4454f03a4acf..ef26ba087844 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -975,10 +975,15 @@ void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, static inline void count_memcg_folio_events(struct folio *folio, enum vm_event_item idx, unsigned long nr) { - struct mem_cgroup *memcg = folio_memcg(folio); + struct mem_cgroup *memcg; - if (memcg) - count_memcg_events(memcg, idx, nr); + if (!folio_memcg_charged(folio)) + return; + + rcu_read_lock(); + memcg = folio_memcg(folio); + count_memcg_events(memcg, idx, nr); + rcu_read_unlock(); } static inline void count_memcg_events_mm(struct mm_struct *mm, -- cgit v1.2.3 From d14f87858178c64cc94ecd05bb41bba474c1c654 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 5 Mar 2026 19:52:41 +0800 Subject: mm: do not open-code lruvec lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now we have lruvec_unlock(), lruvec_unlock_irq() and lruvec_unlock_irqrestore(), but not the paired lruvec_lock(), lruvec_lock_irq() and lruvec_lock_irqsave(). There is currently no use case for lruvec_lock_irqsave(), so only introduce lruvec_lock_irq(), and change all open-coded call sites to use this helper function. This looks cleaner and prepares for reparenting LRU pages, preventing users from missing RCU lock calls due to open-coded lruvec locking.
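For illustration, the before/after shape of a typical call site (the function below is hypothetical, not taken from this patch):

	static void example_isolate(struct lruvec *lruvec)
	{
		/* previously open-coded as spin_lock_irq(&lruvec->lru_lock) */
		lruvec_lock_irq(lruvec);

		/* ... isolate folios from the LRU lists ... */

		/* paired with the existing unlock helper */
		lruvec_unlock_irq(lruvec);
	}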
Link: https://lore.kernel.org/2d0bafe7564e17ece46dfd58197af22ce57017dc.1772711148.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: Muchun Song Acked-by: Shakeel Butt Reviewed-by: Harry Yoo Cc: Allen Pais Cc: Axel Rasmussen Cc: Baoquan He Cc: Chengming Zhou Cc: Chen Ridong Cc: David Hildenbrand Cc: Hamza Mahfooz Cc: Hugh Dickins Cc: Imran Khan Cc: Johannes Weiner Cc: Kamalesh Babulal Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Suren Baghdasaryan Cc: Usama Arif Cc: Vlastimil Babka Cc: Wei Xu Cc: Yosry Ahmed Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 5 +++++ mm/vmscan.c | 38 +++++++++++++++++++------------------- 2 files changed, 24 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ef26ba087844..38f94c7271c1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1498,6 +1498,11 @@ static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec)); } +static inline void lruvec_lock_irq(struct lruvec *lruvec) +{ + spin_lock_irq(&lruvec->lru_lock); +} + static inline void lruvec_unlock(struct lruvec *lruvec) { spin_unlock(&lruvec->lru_lock); diff --git a/mm/vmscan.c b/mm/vmscan.c index 6f3f9e20ff67..d4b649abe645 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1998,7 +1998,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, lru_add_drain(); - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list, &nr_scanned, sc, lru); @@ -2008,7 +2008,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, mod_lruvec_state(lruvec, item, nr_scanned); mod_lruvec_state(lruvec, PGSCAN_ANON + file, nr_scanned); - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); if (nr_taken == 0) return 0; @@ -2025,7 +2025,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, mod_lruvec_state(lruvec, item, nr_reclaimed); mod_lruvec_state(lruvec, PGSTEAL_ANON + file, nr_reclaimed); - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); lru_note_cost_unlock_irq(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); @@ -2104,7 +2104,7 @@ static void shrink_active_list(unsigned long nr_to_scan, lru_add_drain(); - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, lru); @@ -2113,7 +2113,7 @@ static void shrink_active_list(unsigned long nr_to_scan, mod_lruvec_state(lruvec, PGREFILL, nr_scanned); - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); while (!list_empty(&l_hold)) { struct folio *folio; @@ -2169,7 +2169,7 @@ static void shrink_active_list(unsigned long nr_to_scan, count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); lru_note_cost_unlock_irq(lruvec, file, 0, nr_rotated); trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, nr_deactivate, nr_rotated, sc->priority, file); @@ -3803,9 +3803,9 @@ static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) } if (walk->batched) { - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); reset_batch_size(walk); - 
spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); } cond_resched(); @@ -3965,7 +3965,7 @@ restart: if (seq < READ_ONCE(lrugen->max_seq)) return false; - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); @@ -3980,7 +3980,7 @@ restart: if (inc_min_seq(lruvec, type, swappiness)) continue; - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); cond_resched(); goto restart; } @@ -4015,7 +4015,7 @@ restart: /* make sure preceding modifications appear */ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); unlock: - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); return success; } @@ -4715,7 +4715,7 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); scanned = isolate_folios(nr_to_scan, lruvec, sc, swappiness, &type, &list); @@ -4724,7 +4724,7 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq) scanned = 0; - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); if (list_empty(&list)) return scanned; @@ -4762,9 +4762,9 @@ retry: walk = current->reclaim_state->mm_walk; if (walk && walk->batched) { walk->lruvec = lruvec; - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); reset_batch_size(walk); - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); } mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), @@ -5202,7 +5202,7 @@ static void lru_gen_change_state(bool enabled) for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); VM_WARN_ON_ONCE(!state_is_valid(lruvec)); @@ -5210,12 +5210,12 @@ static void lru_gen_change_state(bool enabled) lruvec->lrugen.enabled = enabled; while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) { - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); cond_resched(); - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); } - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); } cond_resched(); -- cgit v1.2.3 From 31b54a5e8916fdd4819880e3aed93f65ecbb47e3 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 5 Mar 2026 19:52:42 +0800 Subject: mm: memcontrol: prepare for reparenting LRU pages for lruvec lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following diagram illustrates how to ensure the safety of the folio lruvec lock when LRU folios undergo reparenting. In the folio_lruvec_lock(folio) function: rcu_read_lock(); retry: lruvec = folio_lruvec(folio); /* There is a possibility of folio reparenting at this point. */ spin_lock(&lruvec->lru_lock); if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) { /* * The wrong lruvec lock was acquired, and a retry is required. * This is because the folio resides on the parent memcg lruvec * list. */ spin_unlock(&lruvec->lru_lock); goto retry; } /* Reaching here indicates that folio_memcg() is stable. */ In the memcg_reparent_objcgs(memcg) function: spin_lock(&lruvec->lru_lock); spin_lock(&lruvec_parent->lru_lock); /* Transfer folios from the lruvec list to the parent's. 
*/ spin_unlock(&lruvec_parent->lru_lock); spin_unlock(&lruvec->lru_lock); After acquiring the lruvec lock, it is necessary to verify whether the folio has been reparented. If reparenting has occurred, the new lruvec lock must be reacquired. During the LRU folio reparenting process, the lruvec lock will also be acquired (this will be implemented in a subsequent patch). Therefore, folio_memcg() remains unchanged while the lruvec lock is held. Given that lruvec_memcg(lruvec) is always equal to folio_memcg(folio) after the lruvec lock is acquired, the lruvec_memcg_debug() check is redundant. Hence, it is removed. This patch serves as a preparation for the reparenting of LRU folios. Link: https://lore.kernel.org/23f22cbb1419f277a3483018b32158ae2b86c666.1772711148.git.zhengqi.arch@bytedance.com Signed-off-by: Muchun Song Signed-off-by: Qi Zheng Acked-by: Johannes Weiner Acked-by: Shakeel Butt Cc: Allen Pais Cc: Axel Rasmussen Cc: Baoquan He Cc: Chengming Zhou Cc: Chen Ridong Cc: David Hildenbrand Cc: Hamza Mahfooz Cc: Harry Yoo Cc: Hugh Dickins Cc: Imran Khan Cc: Kamalesh Babulal Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Suren Baghdasaryan Cc: Usama Arif Cc: Vlastimil Babka Cc: Wei Xu Cc: Yosry Ahmed Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 34 ++++++++++++++--------------- include/linux/swap.h | 3 +-- mm/compaction.c | 29 +++++++++++++++++++------ mm/memcontrol.c | 53 +++++++++++++++++++++++----------------------- mm/swap.c | 6 +++++- 5 files changed, 73 insertions(+), 52 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 38f94c7271c1..12982875073e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -741,7 +741,15 @@ out: * folio_lruvec - return lruvec for isolating/putting an LRU folio * @folio: Pointer to the folio. * - * This function relies on folio->mem_cgroup being stable. + * Call with rcu_read_lock() held to ensure the lifetime of the returned lruvec. + * Note that this alone will NOT guarantee the stability of the folio->lruvec + * association; the folio can be reparented to an ancestor if this races with + * cgroup deletion. + * + * Use folio_lruvec_lock() to ensure both lifetime and stability of the binding. + * Once a lruvec is locked, folio_lruvec() can be called on other folios, and + * their binding is stable if the returned lruvec matches the one the caller has + * locked. Useful for lock batching. */ static inline struct lruvec *folio_lruvec(struct folio *folio) { @@ -764,15 +772,6 @@ struct lruvec *folio_lruvec_lock_irq(struct folio *folio); struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flags); -#ifdef CONFIG_DEBUG_VM -void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio); -#else -static inline -void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) -{ -} -#endif - static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? 
container_of(css, struct mem_cgroup, css) : NULL; @@ -1198,11 +1197,6 @@ static inline struct lruvec *folio_lruvec(struct folio *folio) return &pgdat->__lruvec; } -static inline -void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) -{ -} - static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { return NULL; @@ -1261,6 +1255,7 @@ static inline struct lruvec *folio_lruvec_lock(struct folio *folio) { struct pglist_data *pgdat = folio_pgdat(folio); + rcu_read_lock(); spin_lock(&pgdat->__lruvec.lru_lock); return &pgdat->__lruvec; } @@ -1269,6 +1264,7 @@ static inline struct lruvec *folio_lruvec_lock_irq(struct folio *folio) { struct pglist_data *pgdat = folio_pgdat(folio); + rcu_read_lock(); spin_lock_irq(&pgdat->__lruvec.lru_lock); return &pgdat->__lruvec; } @@ -1278,6 +1274,7 @@ static inline struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, { struct pglist_data *pgdat = folio_pgdat(folio); + rcu_read_lock(); spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp); return &pgdat->__lruvec; } @@ -1500,23 +1497,26 @@ static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) static inline void lruvec_lock_irq(struct lruvec *lruvec) { + rcu_read_lock(); spin_lock_irq(&lruvec->lru_lock); } static inline void lruvec_unlock(struct lruvec *lruvec) { spin_unlock(&lruvec->lru_lock); + rcu_read_unlock(); } static inline void lruvec_unlock_irq(struct lruvec *lruvec) { spin_unlock_irq(&lruvec->lru_lock); + rcu_read_unlock(); } -static inline void lruvec_unlock_irqrestore(struct lruvec *lruvec, - unsigned long flags) +static inline void lruvec_unlock_irqrestore(struct lruvec *lruvec, unsigned long flags) { spin_unlock_irqrestore(&lruvec->lru_lock, flags); + rcu_read_unlock(); } /* Test requires a stable folio->memcg binding, see folio_memcg() */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 4b1f13b5bbad..ea08e2afa2b4 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -310,8 +310,7 @@ extern unsigned long totalreserve_pages; /* linux/mm/swap.c */ void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, - unsigned int nr_io, unsigned int nr_rotated) - __releases(lruvec->lru_lock); + unsigned int nr_io, unsigned int nr_rotated); void lru_note_cost_refault(struct folio *); void folio_add_lru(struct folio *); void folio_add_lru_vma(struct folio *, struct vm_area_struct *); diff --git a/mm/compaction.c b/mm/compaction.c index c3e338aaa0ff..3648ce22c807 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -518,6 +518,24 @@ static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags, return true; } +static struct lruvec * +compact_folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flags, + struct compact_control *cc) +{ + struct lruvec *lruvec; + + rcu_read_lock(); +retry: + lruvec = folio_lruvec(folio); + compact_lock_irqsave(&lruvec->lru_lock, flags, cc); + if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) { + spin_unlock_irqrestore(&lruvec->lru_lock, *flags); + goto retry; + } + + return lruvec; +} + /* * Compaction requires the taking of some coarse locks that are potentially * very heavily contended. 
The lock should be periodically unlocked to avoid @@ -839,7 +857,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, { pg_data_t *pgdat = cc->zone->zone_pgdat; unsigned long nr_scanned = 0, nr_isolated = 0; - struct lruvec *lruvec; + struct lruvec *lruvec = NULL; unsigned long flags = 0; struct lruvec *locked = NULL; struct folio *folio = NULL; @@ -1153,18 +1171,17 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (!folio_test_clear_lru(folio)) goto isolate_fail_put; - lruvec = folio_lruvec(folio); + if (locked) + lruvec = folio_lruvec(folio); /* If we already hold the lock, we can skip some rechecking */ - if (lruvec != locked) { + if (lruvec != locked || !locked) { if (locked) lruvec_unlock_irqrestore(locked, flags); - compact_lock_irqsave(&lruvec->lru_lock, &flags, cc); + lruvec = compact_folio_lruvec_lock_irqsave(folio, &flags, cc); locked = lruvec; - lruvec_memcg_debug(lruvec, folio); - /* * Try get exclusive access under lock. If marked for * skip, the scan is aborted unless the current context diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 10021cef176b..0d4eaaea2b54 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1206,23 +1206,6 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, } } -#ifdef CONFIG_DEBUG_VM -void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) -{ - struct mem_cgroup *memcg; - - if (mem_cgroup_disabled()) - return; - - memcg = folio_memcg(folio); - - if (!memcg) - VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio); - else - VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio); -} -#endif - /** * folio_lruvec_lock - Lock the lruvec for a folio. * @folio: Pointer to the folio. @@ -1232,14 +1215,20 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) * - folio_test_lru false * - folio frozen (refcount of 0) * - * Return: The lruvec this folio is on with its lock held. + * Return: The lruvec this folio is on with its lock held and rcu read lock held. */ struct lruvec *folio_lruvec_lock(struct folio *folio) { - struct lruvec *lruvec = folio_lruvec(folio); + struct lruvec *lruvec; + rcu_read_lock(); +retry: + lruvec = folio_lruvec(folio); spin_lock(&lruvec->lru_lock); - lruvec_memcg_debug(lruvec, folio); + if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) { + spin_unlock(&lruvec->lru_lock); + goto retry; + } return lruvec; } @@ -1254,14 +1243,20 @@ struct lruvec *folio_lruvec_lock(struct folio *folio) * - folio frozen (refcount of 0) * * Return: The lruvec this folio is on with its lock held and interrupts - * disabled. + * disabled and rcu read lock held. */ struct lruvec *folio_lruvec_lock_irq(struct folio *folio) { - struct lruvec *lruvec = folio_lruvec(folio); + struct lruvec *lruvec; + rcu_read_lock(); +retry: + lruvec = folio_lruvec(folio); spin_lock_irq(&lruvec->lru_lock); - lruvec_memcg_debug(lruvec, folio); + if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) { + spin_unlock_irq(&lruvec->lru_lock); + goto retry; + } return lruvec; } @@ -1277,15 +1272,21 @@ struct lruvec *folio_lruvec_lock_irq(struct folio *folio) * - folio frozen (refcount of 0) * * Return: The lruvec this folio is on with its lock held and interrupts - * disabled. + * disabled and rcu read lock held. 
*/ struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flags) { - struct lruvec *lruvec = folio_lruvec(folio); + struct lruvec *lruvec; + rcu_read_lock(); +retry: + lruvec = folio_lruvec(folio); spin_lock_irqsave(&lruvec->lru_lock, *flags); - lruvec_memcg_debug(lruvec, folio); + if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) { + spin_unlock_irqrestore(&lruvec->lru_lock, *flags); + goto retry; + } return lruvec; } diff --git a/mm/swap.c b/mm/swap.c index 009b32d6d344..bcd2b52e5def 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -240,6 +240,7 @@ void folio_rotate_reclaimable(struct folio *folio) void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, unsigned int nr_io, unsigned int nr_rotated) __releases(lruvec->lru_lock) + __releases(rcu) { unsigned long cost; @@ -253,6 +254,7 @@ void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated; if (!cost) { spin_unlock_irq(&lruvec->lru_lock); + rcu_read_unlock(); return; } @@ -285,8 +287,10 @@ void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, spin_unlock_irq(&lruvec->lru_lock); lruvec = parent_lruvec(lruvec); - if (!lruvec) + if (!lruvec) { + rcu_read_unlock(); break; + } spin_lock_irq(&lruvec->lru_lock); } } -- cgit v1.2.3 From 07a6e9a2c199fed361f528781284d56771d0016f Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 5 Mar 2026 19:52:43 +0800 Subject: mm: vmscan: prepare for reparenting traditional LRU folios MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To resolve the dying memcg issue, we need to reparent LRU folios of child memcg to its parent memcg. For traditional LRU list, each lruvec of every memcg comprises four LRU lists. Due to the symmetry of the LRU lists, it is feasible to transfer the LRU lists from a memcg to its parent memcg during the reparenting process. This commit implements the specific function, which will be used during the reparenting process. 
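A rough sketch of how the new helper is expected to be driven (the caller below is hypothetical; the actual offline-path wiring and the required lruvec locking land in later patches of this series):

	static void example_reparent_all_nodes(struct mem_cgroup *memcg)
	{
		struct mem_cgroup *parent = parent_mem_cgroup(memcg);
		int nid;

		for_each_node(nid) {
			/* splice the per-node LRU lists of @memcg into @parent
			 * and fold in the LRU sizes and anon/file costs;
			 * locking is omitted here for brevity */
			lru_reparent_memcg(memcg, parent, nid);
		}
	}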
Link: https://lore.kernel.org/a92d217a9fc82bd0c401210204a095caaf615b1c.1772711148.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Harry Yoo Acked-by: Johannes Weiner Acked-by: Muchun Song Acked-by: Shakeel Butt Cc: Allen Pais Cc: Axel Rasmussen Cc: Baoquan He Cc: Chengming Zhou Cc: Chen Ridong Cc: David Hildenbrand Cc: Hamza Mahfooz Cc: Hugh Dickins Cc: Imran Khan Cc: Kamalesh Babulal Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Suren Baghdasaryan Cc: Usama Arif Cc: Vlastimil Babka Cc: Wei Xu Cc: Yosry Ahmed Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/swap.h | 21 +++++++++++++++++++++ mm/swap.c | 33 +++++++++++++++++++++++++++++++++ mm/vmscan.c | 19 ------------------- 3 files changed, 54 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index ea08e2afa2b4..d653fe050b8f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -546,6 +546,8 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) return READ_ONCE(memcg->swappiness); } + +void lru_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid); #else static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) { @@ -610,5 +612,24 @@ static inline bool mem_cgroup_swap_full(struct folio *folio) } #endif +/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to + * and including the specified highidx + * @zone: The current zone in the iterator + * @pgdat: The pgdat which node_zones are being iterated + * @idx: The index variable + * @highidx: The index of the highest zone to return + * + * This macro iterates through all managed zones up to and including the specified highidx. + * The zone iterator enters an invalid state after macro call and must be reinitialized + * before it can be used again. 
+ */ +#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx) \ + for ((idx) = 0, (zone) = (pgdat)->node_zones; \ + (idx) <= (highidx); \ + (idx)++, (zone)++) \ + if (!managed_zone(zone)) \ + continue; \ + else + #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ diff --git a/mm/swap.c b/mm/swap.c index bcd2b52e5def..5cc44f0de987 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1090,6 +1090,39 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch) fbatch->nr = j; } +#ifdef CONFIG_MEMCG +static void lruvec_reparent_lru(struct lruvec *child_lruvec, + struct lruvec *parent_lruvec, + enum lru_list lru, int nid) +{ + int zid; + struct zone *zone; + + if (lru != LRU_UNEVICTABLE) + list_splice_tail_init(&child_lruvec->lists[lru], &parent_lruvec->lists[lru]); + + for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1) { + unsigned long size = mem_cgroup_get_zone_lru_size(child_lruvec, lru, zid); + + mem_cgroup_update_lru_size(parent_lruvec, lru, zid, size); + } +} + +void lru_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid) +{ + enum lru_list lru; + struct lruvec *child_lruvec, *parent_lruvec; + + child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); + parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid)); + parent_lruvec->anon_cost += child_lruvec->anon_cost; + parent_lruvec->file_cost += child_lruvec->file_cost; + + for_each_lru(lru) + lruvec_reparent_lru(child_lruvec, parent_lruvec, lru, nid); +} +#endif + static const struct ctl_table swap_sysctl_table[] = { { .procname = "page-cluster", diff --git a/mm/vmscan.c b/mm/vmscan.c index d4b649abe645..d225e84b5263 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -269,25 +269,6 @@ static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg) } #endif -/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to - * and including the specified highidx - * @zone: The current zone in the iterator - * @pgdat: The pgdat which node_zones are being iterated - * @idx: The index variable - * @highidx: The index of the highest zone to return - * - * This macro iterates through all managed zones up to and including the specified highidx. - * The zone iterator enters an invalid state after macro call and must be reinitialized - * before it can be used again. - */ -#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx) \ - for ((idx) = 0, (zone) = (pgdat)->node_zones; \ - (idx) <= (highidx); \ - (idx)++, (zone)++) \ - if (!managed_zone(zone)) \ - continue; \ - else - static void set_task_reclaim_state(struct task_struct *task, struct reclaim_state *rs) { -- cgit v1.2.3 From f304652609eae3814b0e9d11c75c0e0cb62da31f Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 5 Mar 2026 19:52:44 +0800 Subject: mm: vmscan: prepare for reparenting MGLRU folios MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Similar to traditional LRU folios, in order to solve the dying memcg problem, we also need to reparent MGLRU folios to the parent memcg when a memcg is offlined. However, there are the following challenges: 1. Each lruvec has between MIN_NR_GENS and MAX_NR_GENS generations, the number of generations of the parent and child memcg may be different, so we cannot simply transfer MGLRU folios in the child memcg to the parent memcg as we did for traditional LRU folios. 2. The generation information is stored in folio->flags, but we cannot traverse these folios while holding the lru lock, otherwise it may cause softlockup.
3. In walk_update_folio(), the gen of folio and corresponding lru size may be updated, but the folio is not immediately moved to the corresponding lru list. Therefore, there may be folios of different generations on an LRU list. 4. In lru_gen_del_folio(), the generation to which the folio belongs is found based on the generation information in folio->flags, and the corresponding LRU size will be updated. Therefore, we need to update the lru size correctly during reparenting, otherwise the lru size may be updated incorrectly in lru_gen_del_folio(). Finally, this patch chose a compromise method, which is to splice the lru list in the child memcg to the lru list of the same generation in the parent memcg during reparenting. And in order to ensure that the parent memcg has the same generation, we need to increase the generations in the parent memcg to the MAX_NR_GENS before reparenting. Of course, the same generation has different meanings in the parent and child memcg, this will cause confusion in the hot and cold information of folios. But other than that, this method is simple enough, the lru size is correct, and there is no need to consider some concurrency issues (such as lru_gen_del_folio()). To prepare for the above work, this commit implements the specific functions, which will be used during reparenting. [zhengqi.arch@bytedance.com: use list_splice_tail_init() to reparent child folios] Link: https://lore.kernel.org/20260324114937.28569-1-qi.zheng@linux.dev Link: https://lore.kernel.org/e75050354cdbc42221a04f7cf133292b61105548.1772711148.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Suggested-by: Harry Yoo Suggested-by: Imran Khan Acked-by: Harry Yoo Cc: Allen Pais Cc: Axel Rasmussen Cc: Baoquan He Cc: Chengming Zhou Cc: Chen Ridong Cc: David Hildenbrand Cc: Hamza Mahfooz Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kamalesh Babulal Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Muchun Song Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Usama Arif Cc: Vlastimil Babka Cc: Wei Xu Cc: Yosry Ahmed Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 17 ++++++ mm/vmscan.c | 142 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 159 insertions(+) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4a20df132258..20f920dede65 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -692,6 +692,9 @@ void lru_gen_online_memcg(struct mem_cgroup *memcg); void lru_gen_offline_memcg(struct mem_cgroup *memcg); void lru_gen_release_memcg(struct mem_cgroup *memcg); void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid); +void max_lru_gen_memcg(struct mem_cgroup *memcg, int nid); +bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg, int nid); +void lru_gen_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid); #else /* !CONFIG_LRU_GEN */ @@ -733,6 +736,20 @@ static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) { } +static inline void max_lru_gen_memcg(struct mem_cgroup *memcg, int nid) +{ +} + +static inline bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg, int nid) +{ + return true; +} + +static inline +void lru_gen_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid) +{ +} + #endif /* CONFIG_LRU_GEN */ struct lruvec { diff --git a/mm/vmscan.c b/mm/vmscan.c index d225e84b5263..8472aa4bddd5 100644 --- 
a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4426,6 +4426,148 @@ void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); } +bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg, int nid) +{ + struct lruvec *lruvec = get_lruvec(memcg, nid); + int type; + + for (type = 0; type < ANON_AND_FILE; type++) { + if (get_nr_gens(lruvec, type) != MAX_NR_GENS) + return false; + } + + return true; +} + +static void try_to_inc_max_seq_nowalk(struct mem_cgroup *memcg, + struct lruvec *lruvec) +{ + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); + int swappiness = mem_cgroup_swappiness(memcg); + DEFINE_MAX_SEQ(lruvec); + bool success = false; + + /* + * We are not iterating the mm_list here, updating mm_state->seq is just + * to make mm walkers work properly. + */ + if (mm_state) { + spin_lock(&mm_list->lock); + VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq); + if (max_seq > mm_state->seq) { + WRITE_ONCE(mm_state->seq, mm_state->seq + 1); + success = true; + } + spin_unlock(&mm_list->lock); + } else { + success = true; + } + + if (success) + inc_max_seq(lruvec, max_seq, swappiness); +} + +/* + * We need to ensure that the folios of child memcg can be reparented to the + * same gen of the parent memcg, so the gens of the parent memcg needed be + * incremented to the MAX_NR_GENS before reparenting. + */ +void max_lru_gen_memcg(struct mem_cgroup *memcg, int nid) +{ + struct lruvec *lruvec = get_lruvec(memcg, nid); + int type; + + for (type = 0; type < ANON_AND_FILE; type++) { + while (get_nr_gens(lruvec, type) < MAX_NR_GENS) { + try_to_inc_max_seq_nowalk(memcg, lruvec); + cond_resched(); + } + } +} + +/* + * Compared to traditional LRU, MGLRU faces the following challenges: + * + * 1. Each lruvec has between MIN_NR_GENS and MAX_NR_GENS generations, the + * number of generations of the parent and child memcg may be different, + * so we cannot simply transfer MGLRU folios in the child memcg to the + * parent memcg as we did for traditional LRU folios. + * 2. The generation information is stored in folio->flags, but we cannot + * traverse these folios while holding the lru lock, otherwise it may + * cause softlockup. + * 3. In walk_update_folio(), the gen of folio and corresponding lru size + * may be updated, but the folio is not immediately moved to the + * corresponding lru list. Therefore, there may be folios of different + * generations on an LRU list. + * 4. In lru_gen_del_folio(), the generation to which the folio belongs is + * found based on the generation information in folio->flags, and the + * corresponding LRU size will be updated. Therefore, we need to update + * the lru size correctly during reparenting, otherwise the lru size may + * be updated incorrectly in lru_gen_del_folio(). + * + * Finally, we choose a compromise method, which is to splice the lru list in + * the child memcg to the lru list of the same generation in the parent memcg + * during reparenting. + * + * The same generation has different meanings in the parent and child memcg, + * so this compromise method will cause the LRU inversion problem. But as the + * system runs, this problem will be fixed automatically. 
+ */ +static void __lru_gen_reparent_memcg(struct lruvec *child_lruvec, struct lruvec *parent_lruvec, + int zone, int type) +{ + struct lru_gen_folio *child_lrugen, *parent_lrugen; + enum lru_list lru = type * LRU_INACTIVE_FILE; + int i; + + child_lrugen = &child_lruvec->lrugen; + parent_lrugen = &parent_lruvec->lrugen; + + for (i = 0; i < get_nr_gens(child_lruvec, type); i++) { + int gen = lru_gen_from_seq(child_lrugen->max_seq - i); + long nr_pages = child_lrugen->nr_pages[gen][type][zone]; + int child_lru_active = lru_gen_is_active(child_lruvec, gen) ? LRU_ACTIVE : 0; + int parent_lru_active = lru_gen_is_active(parent_lruvec, gen) ? LRU_ACTIVE : 0; + + /* Assuming that child pages are colder than parent pages */ + list_splice_tail_init(&child_lrugen->folios[gen][type][zone], + &parent_lrugen->folios[gen][type][zone]); + + WRITE_ONCE(child_lrugen->nr_pages[gen][type][zone], 0); + WRITE_ONCE(parent_lrugen->nr_pages[gen][type][zone], + parent_lrugen->nr_pages[gen][type][zone] + nr_pages); + + if (lru_gen_is_active(child_lruvec, gen) != lru_gen_is_active(parent_lruvec, gen)) { + __update_lru_size(child_lruvec, lru + child_lru_active, zone, -nr_pages); + __update_lru_size(parent_lruvec, lru + parent_lru_active, zone, nr_pages); + } + } +} + +void lru_gen_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid) +{ + struct lruvec *child_lruvec, *parent_lruvec; + int type, zid; + struct zone *zone; + enum lru_list lru; + + child_lruvec = get_lruvec(memcg, nid); + parent_lruvec = get_lruvec(parent, nid); + + for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1) + for (type = 0; type < ANON_AND_FILE; type++) + __lru_gen_reparent_memcg(child_lruvec, parent_lruvec, zid, type); + + for_each_lru(lru) { + for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1) { + unsigned long size = mem_cgroup_get_zone_lru_size(child_lruvec, lru, zid); + + mem_cgroup_update_lru_size(parent_lruvec, lru, zid, size); + } + } +} + #endif /* CONFIG_MEMCG */ /****************************************************************************** -- cgit v1.2.3 From 7404bd37cfbeb2aa06249418c1788ca94bae2875 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 5 Mar 2026 19:52:46 +0800 Subject: mm: workingset: use lruvec_lru_size() to get the number of lru pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For cgroup v2, count_shadow_nodes() is the only place to read non-hierarchical stats (lruvec_stats->state_local). To avoid the need to consider cgroup v2 during subsequent non-hierarchical stats reparenting, use lruvec_lru_size() instead of lruvec_page_state_local() to get the number of lru pages. For NR_SLAB_RECLAIMABLE_B and NR_SLAB_UNRECLAIMABLE_B cases, it appears that the statistics here have already been problematic for a while since slab pages have been reparented. So just ignore it for now. 
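For context only (not part of this patch): lruvec_lru_size() derives its count from the per-zone LRU bookkeeping maintained by mem_cgroup_update_lru_size() rather than from the vmstats machinery, which is why it can replace the non-hierarchical stat read in count_shadow_nodes(). A simplified sketch of that summation, assuming memcg is enabled (the real helper also skips unmanaged zones and falls back to zone_page_state() when memcg is disabled):

	unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
				      int zone_idx)
	{
		unsigned long size = 0;
		int zid;

		/* Sum the per-zone LRU sizes up to and including zone_idx. */
		for (zid = 0; zid <= zone_idx; zid++)
			size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);

		return size;
	}
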
Link: https://lore.kernel.org/b1d448c667a8fb377c3390d9aba43bdb7e4d5739.1772711148.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: Shakeel Butt Acked-by: Muchun Song Cc: Allen Pais Cc: Axel Rasmussen Cc: Baoquan He Cc: Chengming Zhou Cc: Chen Ridong Cc: David Hildenbrand Cc: Hamza Mahfooz Cc: Harry Yoo Cc: Hugh Dickins Cc: Imran Khan Cc: Johannes Weiner Cc: Kamalesh Babulal Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Suren Baghdasaryan Cc: Usama Arif Cc: Vlastimil Babka Cc: Wei Xu Cc: Yosry Ahmed Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 + mm/vmscan.c | 3 +-- mm/workingset.c | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index d653fe050b8f..7a09df6977a5 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -352,6 +352,7 @@ extern void swap_setup(void); extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); +unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx); #define MEMCG_RECLAIM_MAY_SWAP (1 << 1) #define MEMCG_RECLAIM_PROACTIVE (1 << 2) diff --git a/mm/vmscan.c b/mm/vmscan.c index 8472aa4bddd5..1ac4f959ec1c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -390,8 +390,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone) * @lru: lru to use * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list) */ -static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, - int zone_idx) +unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { unsigned long size = 0; int zid; diff --git a/mm/workingset.c b/mm/workingset.c index 95d722a452e1..07e6836d0502 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -691,9 +691,10 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, mem_cgroup_flush_stats_ratelimited(sc->memcg); lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); + for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) - pages += lruvec_page_state_local(lruvec, - NR_LRU_BASE + i); + pages += lruvec_lru_size(lruvec, i, MAX_NR_ZONES - 1); + pages += lruvec_page_state_local( lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT; pages += lruvec_page_state_local( -- cgit v1.2.3 From 01b9da291c4969354807b52956f4aae1f41b4924 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 5 Mar 2026 19:52:49 +0800 Subject: mm: memcontrol: convert objcg to be per-memcg per-node type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert objcg to be per-memcg per-node type, so that when reparent LRU folios later, we can hold the lru lock at the node level, thus avoiding holding too many lru locks at once. [zhengqi.arch@bytedance.com: reset pn->orig_objcg to NULL] Link: https://lore.kernel.org/20260309112939.31937-1-qi.zheng@linux.dev [akpm@linux-foundation.org: fix comment typo, per Usama. 
Reflow comment to 80 cols] [devnexen@gmail.com: fix obj_cgroup leak in mem_cgroup_css_online() error path] Link: https://lore.kernel.org/20260322193631.45457-1-devnexen@gmail.com [devnexen@gmail.com: add newline, per Qi Zheng] Link: https://lore.kernel.org/20260323063007.7783-1-devnexen@gmail.com Link: https://lore.kernel.org/56c04b1c5d54f75ccdc12896df6c1ca35403ecc3.1772711148.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Signed-off-by: David Carlier Acked-by: Shakeel Butt Cc: Allen Pais Cc: Axel Rasmussen Cc: Baoquan He Cc: Chengming Zhou Cc: Chen Ridong Cc: David Hildenbrand Cc: Hamza Mahfooz Cc: Harry Yoo Cc: Hugh Dickins Cc: Imran Khan Cc: Johannes Weiner Cc: Kamalesh Babulal Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Muchun Song Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Suren Baghdasaryan Cc: Usama Arif Cc: Vlastimil Babka Cc: Wei Xu Cc: Yosry Ahmed Cc: Yuanchu Xie Cc: Zi Yan Cc: Usama Arif Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 23 ++++++------ include/linux/sched.h | 2 +- mm/memcontrol.c | 92 +++++++++++++++++++++++++++++++--------------- 3 files changed, 75 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 12982875073e..3e836b56bfcb 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -115,6 +115,16 @@ struct mem_cgroup_per_node { unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter; + /* + * objcg is wiped out as a part of the objcg repaprenting process. + * orig_objcg preserves a pointer (and a reference) to the original + * objcg until the end of live of memcg. + */ + struct obj_cgroup __rcu *objcg; + struct obj_cgroup *orig_objcg; + /* list of inherited objcgs, protected by objcg_lock */ + struct list_head objcg_list; + #ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC /* slab stats for nmi context */ atomic_t slab_reclaimable; @@ -179,6 +189,7 @@ struct obj_cgroup { struct list_head list; /* protected by objcg_lock */ struct rcu_head rcu; }; + bool is_root; }; /* @@ -257,15 +268,6 @@ struct mem_cgroup { seqlock_t socket_pressure_seqlock; #endif int kmemcg_id; - /* - * memcg->objcg is wiped out as a part of the objcg repaprenting - * process. memcg->orig_objcg preserves a pointer (and a reference) - * to the original objcg until the end of live of memcg. 
- */ - struct obj_cgroup __rcu *objcg; - struct obj_cgroup *orig_objcg; - /* list of inherited objcgs, protected by objcg_lock */ - struct list_head objcg_list; struct memcg_vmstats_percpu __percpu *vmstats_percpu; @@ -332,7 +334,6 @@ struct mem_cgroup { #define MEMCG_CHARGE_BATCH 64U extern struct mem_cgroup *root_mem_cgroup; -extern struct obj_cgroup *root_obj_cgroup; enum page_memcg_data_flags { /* page->memcg_data is a pointer to an slabobj_ext vector */ @@ -551,7 +552,7 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) static inline bool obj_cgroup_is_root(const struct obj_cgroup *objcg) { - return objcg == root_obj_cgroup; + return objcg->is_root; } static inline bool mem_cgroup_disabled(void) diff --git a/include/linux/sched.h b/include/linux/sched.h index 5a5d3dbc9cdf..0d27775546f8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1533,7 +1533,7 @@ struct task_struct { /* Used by memcontrol for targeted memcg charge: */ struct mem_cgroup *active_memcg; - /* Cache for current->cgroups->memcg->objcg lookups: */ + /* Cache for current->cgroups->memcg->nodeinfo[nid]->objcg lookups: */ struct obj_cgroup *objcg; #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c9e5ea0d9fc6..1aaa66f729b3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -83,8 +83,6 @@ EXPORT_SYMBOL(memory_cgrp_subsys); struct mem_cgroup *root_mem_cgroup __read_mostly; EXPORT_SYMBOL(root_mem_cgroup); -struct obj_cgroup *root_obj_cgroup __read_mostly; - /* Active memory cgroup to use from an interrupt context */ DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg); EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg); @@ -209,18 +207,21 @@ static struct obj_cgroup *obj_cgroup_alloc(void) } static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memcg, - struct mem_cgroup *parent) + struct mem_cgroup *parent, + int nid) { struct obj_cgroup *objcg, *iter; + struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; + struct mem_cgroup_per_node *parent_pn = parent->nodeinfo[nid]; - objcg = rcu_replace_pointer(memcg->objcg, NULL, true); + objcg = rcu_replace_pointer(pn->objcg, NULL, true); /* 1) Ready to reparent active objcg. */ - list_add(&objcg->list, &memcg->objcg_list); + list_add(&objcg->list, &pn->objcg_list); /* 2) Reparent active objcg and already reparented objcgs to parent. 
*/ - list_for_each_entry(iter, &memcg->objcg_list, list) + list_for_each_entry(iter, &pn->objcg_list, list) WRITE_ONCE(iter->memcg, parent); /* 3) Move already reparented objcgs to the parent's list */ - list_splice(&memcg->objcg_list, &parent->objcg_list); + list_splice(&pn->objcg_list, &parent_pn->objcg_list); return objcg; } @@ -267,14 +268,17 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg) { struct obj_cgroup *objcg; struct mem_cgroup *parent = parent_mem_cgroup(memcg); + int nid; - reparent_locks(memcg, parent); + for_each_node(nid) { + reparent_locks(memcg, parent); - objcg = __memcg_reparent_objcgs(memcg, parent); + objcg = __memcg_reparent_objcgs(memcg, parent, nid); - reparent_unlocks(memcg, parent); + reparent_unlocks(memcg, parent); - percpu_ref_kill(&objcg->refcnt); + percpu_ref_kill(&objcg->refcnt); + } } /* @@ -2830,8 +2834,10 @@ struct mem_cgroup *mem_cgroup_from_virt(void *p) static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) { + int nid = numa_node_id(); + for (; memcg; memcg = parent_mem_cgroup(memcg)) { - struct obj_cgroup *objcg = rcu_dereference(memcg->objcg); + struct obj_cgroup *objcg = rcu_dereference(memcg->nodeinfo[nid]->objcg); if (likely(objcg && obj_cgroup_tryget(objcg))) return objcg; @@ -2895,6 +2901,7 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void) { struct mem_cgroup *memcg; struct obj_cgroup *objcg; + int nid = numa_node_id(); if (IS_ENABLED(CONFIG_MEMCG_NMI_UNSAFE) && in_nmi()) return NULL; @@ -2911,14 +2918,14 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void) * Objcg reference is kept by the task, so it's safe * to use the objcg by the current task. */ - return objcg ? : root_obj_cgroup; + return objcg ? : rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1); } memcg = this_cpu_read(int_active_memcg); if (unlikely(memcg)) goto from_memcg; - return root_obj_cgroup; + return rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1); from_memcg: for (; memcg; memcg = parent_mem_cgroup(memcg)) { @@ -2928,12 +2935,12 @@ from_memcg: * away and can be used within the scope without any additional * protection. 
*/ - objcg = rcu_dereference_check(memcg->objcg, 1); + objcg = rcu_dereference_check(memcg->nodeinfo[nid]->objcg, 1); if (likely(objcg)) return objcg; } - return root_obj_cgroup; + return rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1); } struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio) @@ -3876,6 +3883,8 @@ static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) if (!pn->lruvec_stats_percpu) goto fail; + INIT_LIST_HEAD(&pn->objcg_list); + lruvec_init(&pn->lruvec); pn->memcg = memcg; @@ -3890,10 +3899,14 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) { int node; - obj_cgroup_put(memcg->orig_objcg); + for_each_node(node) { + struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; + if (!pn) + continue; - for_each_node(node) - free_mem_cgroup_per_node_info(memcg->nodeinfo[node]); + obj_cgroup_put(pn->orig_objcg); + free_mem_cgroup_per_node_info(pn); + } memcg1_free_events(memcg); kfree(memcg->vmstats); free_percpu(memcg->vmstats_percpu); @@ -3964,7 +3977,6 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) #endif memcg1_memcg_init(memcg); memcg->kmemcg_id = -1; - INIT_LIST_HEAD(&memcg->objcg_list); #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) @@ -4041,6 +4053,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct obj_cgroup *objcg; + int nid; memcg_online_kmem(memcg); @@ -4052,17 +4065,19 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) if (alloc_shrinker_info(memcg)) goto offline_kmem; - objcg = obj_cgroup_alloc(); - if (!objcg) - goto free_shrinker; + for_each_node(nid) { + objcg = obj_cgroup_alloc(); + if (!objcg) + goto free_objcg; - if (unlikely(mem_cgroup_is_root(memcg))) - root_obj_cgroup = objcg; + if (unlikely(mem_cgroup_is_root(memcg))) + objcg->is_root = true; - objcg->memcg = memcg; - rcu_assign_pointer(memcg->objcg, objcg); - obj_cgroup_get(objcg); - memcg->orig_objcg = objcg; + objcg->memcg = memcg; + rcu_assign_pointer(memcg->nodeinfo[nid]->objcg, objcg); + obj_cgroup_get(objcg); + memcg->nodeinfo[nid]->orig_objcg = objcg; + } if (unlikely(mem_cgroup_is_root(memcg)) && !mem_cgroup_disabled()) queue_delayed_work(system_dfl_wq, &stats_flush_dwork, @@ -4086,7 +4101,24 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) xa_store(&mem_cgroup_private_ids, memcg->id.id, memcg, GFP_KERNEL); return 0; -free_shrinker: +free_objcg: + for_each_node(nid) { + struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; + + objcg = rcu_replace_pointer(pn->objcg, NULL, true); + if (objcg) + percpu_ref_kill(&objcg->refcnt); + + if (pn->orig_objcg) { + obj_cgroup_put(pn->orig_objcg); + /* + * Reset pn->orig_objcg to NULL to prevent + * obj_cgroup_put() from being called again in + * __mem_cgroup_free(). + */ + pn->orig_objcg = NULL; + } + } free_shrinker_info(memcg); offline_kmem: memcg_offline_kmem(memcg); -- cgit v1.2.3 From f1cf8d2f36dc369688bbe61ce064fbd829dbc9e1 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 5 Mar 2026 19:52:50 +0800 Subject: mm: memcontrol: eliminate the problem of dying memory cgroup for LRU folios MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that everything is set up, switch folio->memcg_data pointers to objcgs, update the accessors, and execute reparenting on cgroup death. 
Finally, folio->memcg_data of LRU folios and kmem folios will always point to an object cgroup pointer. The folio->memcg_data of slab folios will point to an vector of object cgroups. Link: https://lore.kernel.org/80cb7af198dc6f2173fe616d1207a4c315ece141.1772711148.git.zhengqi.arch@bytedance.com Signed-off-by: Muchun Song Signed-off-by: Qi Zheng Acked-by: Shakeel Butt Cc: Allen Pais Cc: Axel Rasmussen Cc: Baoquan He Cc: Chengming Zhou Cc: Chen Ridong Cc: David Hildenbrand Cc: Hamza Mahfooz Cc: Harry Yoo Cc: Hugh Dickins Cc: Imran Khan Cc: Johannes Weiner Cc: Kamalesh Babulal Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Roman Gushchin Cc: Suren Baghdasaryan Cc: Usama Arif Cc: Vlastimil Babka Cc: Wei Xu Cc: Yosry Ahmed Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 77 ++++++------------ mm/memcontrol-v1.c | 15 ++-- mm/memcontrol.c | 194 +++++++++++++++++++++++++++------------------ 3 files changed, 151 insertions(+), 135 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3e836b56bfcb..086158969529 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -369,9 +369,6 @@ enum objext_flags { #define OBJEXTS_FLAGS_MASK (__NR_OBJEXTS_FLAGS - 1) #ifdef CONFIG_MEMCG - -static inline bool folio_memcg_kmem(struct folio *folio); - /* * After the initialization objcg->memcg is always pointing at * a valid memcg, but can be atomically swapped to the parent memcg. @@ -385,43 +382,19 @@ static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg) } /* - * __folio_memcg - Get the memory cgroup associated with a non-kmem folio - * @folio: Pointer to the folio. - * - * Returns a pointer to the memory cgroup associated with the folio, - * or NULL. This function assumes that the folio is known to have a - * proper memory cgroup pointer. It's not safe to call this function - * against some type of folios, e.g. slab folios or ex-slab folios or - * kmem folios. - */ -static inline struct mem_cgroup *__folio_memcg(struct folio *folio) -{ - unsigned long memcg_data = folio->memcg_data; - - VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); - VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio); - VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio); - - return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); -} - -/* - * __folio_objcg - get the object cgroup associated with a kmem folio. + * folio_objcg - get the object cgroup associated with a folio. * @folio: Pointer to the folio. * * Returns a pointer to the object cgroup associated with the folio, * or NULL. This function assumes that the folio is known to have a - * proper object cgroup pointer. It's not safe to call this function - * against some type of folios, e.g. slab folios or ex-slab folios or - * LRU folios. + * proper object cgroup pointer. */ -static inline struct obj_cgroup *__folio_objcg(struct folio *folio) +static inline struct obj_cgroup *folio_objcg(struct folio *folio) { unsigned long memcg_data = folio->memcg_data; VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio); - VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio); return (struct obj_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); } @@ -435,21 +408,30 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio) * proper memory cgroup pointer. 
It's not safe to call this function * against some type of folios, e.g. slab folios or ex-slab folios. * - * For a non-kmem folio any of the following ensures folio and memcg binding - * stability: + * For a folio any of the following ensures folio and objcg binding stability: * * - the folio lock * - LRU isolation * - exclusive reference * - * For a kmem folio a caller should hold an rcu read lock to protect memcg - * associated with a kmem folio from being released. + * Based on the stable binding of folio and objcg, for a folio any of the + * following ensures folio and memcg binding stability: + * + * - cgroup_mutex + * - the lruvec lock + * + * If the caller only want to ensure that the page counters of memcg are + * updated correctly, ensure that the binding stability of folio and objcg + * is sufficient. + * + * Note: The caller should hold an rcu read lock or cgroup_mutex to protect + * memcg associated with a folio from being released. */ static inline struct mem_cgroup *folio_memcg(struct folio *folio) { - if (folio_memcg_kmem(folio)) - return obj_cgroup_memcg(__folio_objcg(folio)); - return __folio_memcg(folio); + struct obj_cgroup *objcg = folio_objcg(folio); + + return objcg ? obj_cgroup_memcg(objcg) : NULL; } /* @@ -473,15 +455,10 @@ static inline bool folio_memcg_charged(struct folio *folio) * has an associated memory cgroup pointer or an object cgroups vector or * an object cgroup. * - * For a non-kmem folio any of the following ensures folio and memcg binding - * stability: + * The page and objcg or memcg binding rules can refer to folio_memcg(). * - * - the folio lock - * - LRU isolation - * - exclusive reference - * - * For a kmem folio a caller should hold an rcu read lock to protect memcg - * associated with a kmem folio from being released. + * A caller should hold an rcu read lock to protect memcg associated with a + * page from being released. */ static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) { @@ -490,18 +467,14 @@ static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) * for slabs, READ_ONCE() should be used here. */ unsigned long memcg_data = READ_ONCE(folio->memcg_data); + struct obj_cgroup *objcg; if (memcg_data & MEMCG_DATA_OBJEXTS) return NULL; - if (memcg_data & MEMCG_DATA_KMEM) { - struct obj_cgroup *objcg; - - objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK); - return obj_cgroup_memcg(objcg); - } + objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK); - return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); + return objcg ? 
obj_cgroup_memcg(objcg) : NULL; } static inline struct mem_cgroup *page_memcg_check(struct page *page) diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 8380adfa0f68..433bba9dfe71 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -613,6 +613,7 @@ void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg) void memcg1_swapout(struct folio *folio, swp_entry_t entry) { struct mem_cgroup *memcg, *swap_memcg; + struct obj_cgroup *objcg; unsigned int nr_entries; VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); @@ -624,12 +625,13 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry) if (!do_memsw_account()) return; - memcg = folio_memcg(folio); - - VM_WARN_ON_ONCE_FOLIO(!memcg, folio); - if (!memcg) + objcg = folio_objcg(folio); + VM_WARN_ON_ONCE_FOLIO(!objcg, folio); + if (!objcg) return; + rcu_read_lock(); + memcg = obj_cgroup_memcg(objcg); /* * In case the memcg owning these pages has been offlined and doesn't * have an ID allocated to it anymore, charge the closest online @@ -644,7 +646,7 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry) folio_unqueue_deferred_split(folio); folio->memcg_data = 0; - if (!mem_cgroup_is_root(memcg)) + if (!obj_cgroup_is_root(objcg)) page_counter_uncharge(&memcg->memory, nr_entries); if (memcg != swap_memcg) { @@ -665,7 +667,8 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry) preempt_enable_nested(); memcg1_check_events(memcg, folio_nid(folio)); - css_put(&memcg->css); + rcu_read_unlock(); + obj_cgroup_put(objcg); } /* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1aaa66f729b3..b696823b34d0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -254,13 +254,17 @@ static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgr } #endif -static inline void reparent_locks(struct mem_cgroup *memcg, struct mem_cgroup *parent) +static inline void reparent_locks(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid) { spin_lock_irq(&objcg_lock); + spin_lock_nested(&mem_cgroup_lruvec(memcg, NODE_DATA(nid))->lru_lock, 1); + spin_lock_nested(&mem_cgroup_lruvec(parent, NODE_DATA(nid))->lru_lock, 2); } -static inline void reparent_unlocks(struct mem_cgroup *memcg, struct mem_cgroup *parent) +static inline void reparent_unlocks(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid) { + spin_unlock(&mem_cgroup_lruvec(parent, NODE_DATA(nid))->lru_lock); + spin_unlock(&mem_cgroup_lruvec(memcg, NODE_DATA(nid))->lru_lock); spin_unlock_irq(&objcg_lock); } @@ -271,14 +275,31 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg) int nid; for_each_node(nid) { - reparent_locks(memcg, parent); +retry: + if (lru_gen_enabled()) + max_lru_gen_memcg(parent, nid); + + reparent_locks(memcg, parent, nid); + + if (lru_gen_enabled()) { + if (!recheck_lru_gen_max_memcg(parent, nid)) { + reparent_unlocks(memcg, parent, nid); + cond_resched(); + goto retry; + } + lru_gen_reparent_memcg(memcg, parent, nid); + } else { + lru_reparent_memcg(memcg, parent, nid); + } objcg = __memcg_reparent_objcgs(memcg, parent, nid); - reparent_unlocks(memcg, parent); + reparent_unlocks(memcg, parent, nid); percpu_ref_kill(&objcg->refcnt); } + + reparent_state_local(memcg, parent); } /* @@ -823,6 +844,7 @@ static void __mod_memcg_state(struct mem_cgroup *memcg, this_cpu_add(memcg->vmstats_percpu->state[i], val); val = memcg_state_val_in_pages(idx, val); memcg_rstat_updated(memcg, val, cpu); + trace_mod_memcg_state(memcg, idx, val); put_cpu(); @@ -840,7 +862,9 @@ void mod_memcg_state(struct mem_cgroup *memcg, 
enum memcg_stat_item idx, if (mem_cgroup_disabled()) return; + memcg = get_non_dying_memcg_start(memcg); __mod_memcg_state(memcg, idx, val); + get_non_dying_memcg_end(); } #ifdef CONFIG_MEMCG_V1 @@ -900,11 +924,17 @@ static void mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { + struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct mem_cgroup_per_node *pn; + struct mem_cgroup *memcg; pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + memcg = get_non_dying_memcg_start(pn->memcg); + pn = memcg->nodeinfo[pgdat->node_id]; __mod_memcg_lruvec_state(pn, idx, val); + + get_non_dying_memcg_end(); } /** @@ -1127,6 +1157,8 @@ again: /** * get_mem_cgroup_from_folio - Obtain a reference on a given folio's memcg. * @folio: folio from which memcg should be extracted. + * + * See folio_memcg() for folio->objcg/memcg binding rules. */ struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio) { @@ -2722,17 +2754,17 @@ static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, return try_charge_memcg(memcg, gfp_mask, nr_pages); } -static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) +static void commit_charge(struct folio *folio, struct obj_cgroup *objcg) { VM_BUG_ON_FOLIO(folio_memcg_charged(folio), folio); /* - * Any of the following ensures page's memcg stability: + * Any of the following ensures folio's objcg stability: * * - the page lock * - LRU isolation * - exclusive reference */ - folio->memcg_data = (unsigned long)memcg; + folio->memcg_data = (unsigned long)objcg; } #ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC @@ -2846,6 +2878,17 @@ static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) return NULL; } +static inline struct obj_cgroup *get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) +{ + struct obj_cgroup *objcg; + + rcu_read_lock(); + objcg = __get_obj_cgroup_from_memcg(memcg); + rcu_read_unlock(); + + return objcg; +} + static struct obj_cgroup *current_objcg_update(void) { struct mem_cgroup *memcg; @@ -2947,17 +2990,10 @@ struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio) { struct obj_cgroup *objcg; - if (!memcg_kmem_online()) - return NULL; - - if (folio_memcg_kmem(folio)) { - objcg = __folio_objcg(folio); + objcg = folio_objcg(folio); + if (objcg) obj_cgroup_get(objcg); - } else { - rcu_read_lock(); - objcg = __get_obj_cgroup_from_memcg(__folio_memcg(folio)); - rcu_read_unlock(); - } + return objcg; } @@ -3519,7 +3555,7 @@ void folio_split_memcg_refs(struct folio *folio, unsigned old_order, return; new_refs = (1 << (old_order - new_order)) - 1; - css_get_many(&__folio_memcg(folio)->css, new_refs); + obj_cgroup_get_many(folio_objcg(folio), new_refs); } static void memcg_online_kmem(struct mem_cgroup *memcg) @@ -4955,16 +4991,20 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root, static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg, gfp_t gfp) { - int ret; - - ret = try_charge(memcg, gfp, folio_nr_pages(folio)); - if (ret) - goto out; + int ret = 0; + struct obj_cgroup *objcg; - css_get(&memcg->css); - commit_charge(folio, memcg); + objcg = get_obj_cgroup_from_memcg(memcg); + /* Do not account at the root objcg level. 
*/ + if (!obj_cgroup_is_root(objcg)) + ret = try_charge_memcg(memcg, gfp, folio_nr_pages(folio)); + if (ret) { + obj_cgroup_put(objcg); + return ret; + } + commit_charge(folio, objcg); memcg1_commit_charge(folio, memcg); -out: + return ret; } @@ -5050,7 +5090,7 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, } struct uncharge_gather { - struct mem_cgroup *memcg; + struct obj_cgroup *objcg; unsigned long nr_memory; unsigned long pgpgout; unsigned long nr_kmem; @@ -5064,58 +5104,52 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug) static void uncharge_batch(const struct uncharge_gather *ug) { + struct mem_cgroup *memcg; + + rcu_read_lock(); + memcg = obj_cgroup_memcg(ug->objcg); if (ug->nr_memory) { - memcg_uncharge(ug->memcg, ug->nr_memory); + memcg_uncharge(memcg, ug->nr_memory); if (ug->nr_kmem) { - mod_memcg_state(ug->memcg, MEMCG_KMEM, -ug->nr_kmem); - memcg1_account_kmem(ug->memcg, -ug->nr_kmem); + mod_memcg_state(memcg, MEMCG_KMEM, -ug->nr_kmem); + memcg1_account_kmem(memcg, -ug->nr_kmem); } - memcg1_oom_recover(ug->memcg); + memcg1_oom_recover(memcg); } - memcg1_uncharge_batch(ug->memcg, ug->pgpgout, ug->nr_memory, ug->nid); + memcg1_uncharge_batch(memcg, ug->pgpgout, ug->nr_memory, ug->nid); + rcu_read_unlock(); /* drop reference from uncharge_folio */ - css_put(&ug->memcg->css); + obj_cgroup_put(ug->objcg); } static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) { long nr_pages; - struct mem_cgroup *memcg; struct obj_cgroup *objcg; VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* * Nobody should be changing or seriously looking at - * folio memcg or objcg at this point, we have fully - * exclusive access to the folio. + * folio objcg at this point, we have fully exclusive + * access to the folio. */ - if (folio_memcg_kmem(folio)) { - objcg = __folio_objcg(folio); - /* - * This get matches the put at the end of the function and - * kmem pages do not hold memcg references anymore. 
- */ - memcg = get_mem_cgroup_from_objcg(objcg); - } else { - memcg = __folio_memcg(folio); - } - - if (!memcg) + objcg = folio_objcg(folio); + if (!objcg) return; - if (ug->memcg != memcg) { - if (ug->memcg) { + if (ug->objcg != objcg) { + if (ug->objcg) { uncharge_batch(ug); uncharge_gather_clear(ug); } - ug->memcg = memcg; + ug->objcg = objcg; ug->nid = folio_nid(folio); - /* pairs with css_put in uncharge_batch */ - css_get(&memcg->css); + /* pairs with obj_cgroup_put in uncharge_batch */ + obj_cgroup_get(objcg); } nr_pages = folio_nr_pages(folio); @@ -5123,20 +5157,17 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) if (folio_memcg_kmem(folio)) { ug->nr_memory += nr_pages; ug->nr_kmem += nr_pages; - - folio->memcg_data = 0; - obj_cgroup_put(objcg); } else { /* LRU pages aren't accounted at the root level */ - if (!mem_cgroup_is_root(memcg)) + if (!obj_cgroup_is_root(objcg)) ug->nr_memory += nr_pages; ug->pgpgout++; WARN_ON_ONCE(folio_unqueue_deferred_split(folio)); - folio->memcg_data = 0; } - css_put(&memcg->css); + folio->memcg_data = 0; + obj_cgroup_put(objcg); } void __mem_cgroup_uncharge(struct folio *folio) @@ -5160,7 +5191,7 @@ void __mem_cgroup_uncharge_folios(struct folio_batch *folios) uncharge_gather_clear(&ug); for (i = 0; i < folios->nr; i++) uncharge_folio(folios->folios[i], &ug); - if (ug.memcg) + if (ug.objcg) uncharge_batch(&ug); } @@ -5177,6 +5208,7 @@ void __mem_cgroup_uncharge_folios(struct folio_batch *folios) void mem_cgroup_replace_folio(struct folio *old, struct folio *new) { struct mem_cgroup *memcg; + struct obj_cgroup *objcg; long nr_pages = folio_nr_pages(new); VM_BUG_ON_FOLIO(!folio_test_locked(old), old); @@ -5191,21 +5223,24 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new) if (folio_memcg_charged(new)) return; - memcg = folio_memcg(old); - VM_WARN_ON_ONCE_FOLIO(!memcg, old); - if (!memcg) + objcg = folio_objcg(old); + VM_WARN_ON_ONCE_FOLIO(!objcg, old); + if (!objcg) return; + rcu_read_lock(); + memcg = obj_cgroup_memcg(objcg); /* Force-charge the new page. The old one will be freed soon */ - if (!mem_cgroup_is_root(memcg)) { + if (!obj_cgroup_is_root(objcg)) { page_counter_charge(&memcg->memory, nr_pages); if (do_memsw_account()) page_counter_charge(&memcg->memsw, nr_pages); } - css_get(&memcg->css); - commit_charge(new, memcg); + obj_cgroup_get(objcg); + commit_charge(new, objcg); memcg1_commit_charge(new, memcg); + rcu_read_unlock(); } /** @@ -5221,7 +5256,7 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new) */ void mem_cgroup_migrate(struct folio *old, struct folio *new) { - struct mem_cgroup *memcg; + struct obj_cgroup *objcg; VM_BUG_ON_FOLIO(!folio_test_locked(old), old); VM_BUG_ON_FOLIO(!folio_test_locked(new), new); @@ -5232,18 +5267,18 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new) if (mem_cgroup_disabled()) return; - memcg = folio_memcg(old); + objcg = folio_objcg(old); /* - * Note that it is normal to see !memcg for a hugetlb folio. + * Note that it is normal to see !objcg for a hugetlb folio. * For e.g, it could have been allocated when memory_hugetlb_accounting * was not selected. 
*/ - VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !memcg, old); - if (!memcg) + VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !objcg, old); + if (!objcg) return; - /* Transfer the charge and the css ref */ - commit_charge(new, memcg); + /* Transfer the charge and the objcg ref */ + commit_charge(new, objcg); /* Warning should never happen, so don't worry about refcount non-0 */ WARN_ON_ONCE(folio_unqueue_deferred_split(old)); @@ -5426,22 +5461,27 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) unsigned int nr_pages = folio_nr_pages(folio); struct page_counter *counter; struct mem_cgroup *memcg; + struct obj_cgroup *objcg; if (do_memsw_account()) return 0; - memcg = folio_memcg(folio); - - VM_WARN_ON_ONCE_FOLIO(!memcg, folio); - if (!memcg) + objcg = folio_objcg(folio); + VM_WARN_ON_ONCE_FOLIO(!objcg, folio); + if (!objcg) return 0; + rcu_read_lock(); + memcg = obj_cgroup_memcg(objcg); if (!entry.val) { memcg_memory_event(memcg, MEMCG_SWAP_FAIL); + rcu_read_unlock(); return 0; } memcg = mem_cgroup_private_id_get_online(memcg, nr_pages); + /* memcg is pined by memcg ID. */ + rcu_read_unlock(); if (!mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { -- cgit v1.2.3 From 0a98e13963424d7f1f50211c692f46a3b1e8d03f Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 5 Mar 2026 19:52:51 +0800 Subject: mm: lru: add VM_WARN_ON_ONCE_FOLIO to lru maintenance helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We must ensure the folio is deleted from or added to the correct lruvec list. So, add VM_WARN_ON_ONCE_FOLIO() to catch invalid users. The VM_BUG_ON_PAGE() in move_pages_to_lru() can be removed as add_page_to_lru_list() will perform the necessary check. 
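For reference only (not part of this patch), the condition the new warnings assert is folio_matches_lruvec(), i.e. that the lruvec really is the one for the folio's node and memory cgroup. A sketch, assuming the helper keeps its current shape in include/linux/mm_inline.h:

	static inline bool folio_matches_lruvec(struct folio *folio,
						struct lruvec *lruvec)
	{
		/* The folio must belong to this lruvec's node and memcg. */
		return lruvec_pgdat(lruvec) == folio_pgdat(folio) &&
		       lruvec_memcg(lruvec) == folio_memcg(folio);
	}
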
Link: https://lore.kernel.org/2c90fc006d9d730331a3caeef96f7e5dabe2036d.1772711148.git.zhengqi.arch@bytedance.com Signed-off-by: Muchun Song Signed-off-by: Qi Zheng Acked-by: Roman Gushchin Acked-by: Johannes Weiner Acked-by: Shakeel Butt Cc: Allen Pais Cc: Axel Rasmussen Cc: Baoquan He Cc: Chengming Zhou Cc: Chen Ridong Cc: David Hildenbrand Cc: Hamza Mahfooz Cc: Harry Yoo Cc: Hugh Dickins Cc: Imran Khan Cc: Kamalesh Babulal Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Suren Baghdasaryan Cc: Usama Arif Cc: Vlastimil Babka Cc: Wei Xu Cc: Yosry Ahmed Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 6 ++++++ mm/vmscan.c | 1 - 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 7fc2ced00f8f..a171070e15f0 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -348,6 +348,8 @@ void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); + VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); + if (lru_gen_add_folio(lruvec, folio, false)) return; @@ -362,6 +364,8 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); + VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); + if (lru_gen_add_folio(lruvec, folio, true)) return; @@ -376,6 +380,8 @@ void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); + VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); + if (lru_gen_del_folio(lruvec, folio, false)) return; diff --git a/mm/vmscan.c b/mm/vmscan.c index 1ac4f959ec1c..fd120e898c70 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1916,7 +1916,6 @@ static unsigned int move_folios_to_lru(struct list_head *list) continue; } - VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); lruvec_add_folio(lruvec, folio); nr_pages = folio_nr_pages(folio); nr_moved += nr_pages; -- cgit v1.2.3 From 85358bad68f5d72a8cff3d79d46e4c38a91afe06 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 27 Mar 2026 18:16:29 +0800 Subject: mm: memcontrol: change val type to long in __mod_memcg_{lruvec_}state() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The __mod_memcg_state() and __mod_memcg_lruvec_state() functions are also used to reparent non-hierarchical stats. In this scenario, the values passed to them are accumulated statistics that might be extremely large and exceed the upper limit of a 32-bit integer. Change the val parameter type from int to long in these functions and their corresponding tracepoints (memcg_rstat_stats) to prevent potential overflow issues. After that, in memcg_state_val_in_pages(), if the passed val is negative, the expression val * unit / PAGE_SIZE could be implicitly converted to a massive positive number when compared with 1UL in the max() macro. This leads to returning an incorrect massive positive value. Fix this by using abs(val) to calculate the magnitude first, and then restoring the sign of the value before returning the result. Additionally, use mult_frac() to prevent potential overflow during the multiplication of val and unit. 
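As an illustration only (a stand-alone user-space program, not the kernel code), the following reproduces the signed/unsigned pitfall described above and the magnitude-then-sign approach used by the fix:

	#include <stdio.h>

	#define PAGE_SIZE 4096UL
	#define max(a, b) ((a) > (b) ? (a) : (b))

	int main(void)
	{
		int val = -8192;	/* a negative (uncharge) delta in bytes */
		int unit = 1;

		/*
		 * Old formula: val * unit is a negative int, but dividing by the
		 * unsigned long PAGE_SIZE and comparing with 1UL converts it to
		 * a huge positive unsigned long.
		 */
		unsigned long old = max(val * unit / PAGE_SIZE, 1UL);

		/* Sketch of the fix: take the magnitude first, restore the sign last. */
		long mag = (val < 0 ? -(long)val : val) * unit / (long)PAGE_SIZE;
		long fixed = (mag ? mag : 1) * (val < 0 ? -1 : 1);

		printf("old = %lu, fixed = %ld\n", old, fixed);	/* old is bogus, fixed is -2 */
		return 0;
	}
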
Link: https://lore.kernel.org/70a9440e49c464b4dca88bcabc6b491bd335c9f0.1774604356.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reported-by: Harry Yoo (Oracle) Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Harry Yoo (Oracle) Cc: Allen Pais Cc: Axel Rasmussen Cc: Baoquan He Cc: David Hildenbrand Cc: Hamza Mahfooz Cc: Hugh Dickins Cc: Imran Khan Cc: Johannes Weiner Cc: Kamalesh Babulal Cc: Lance Yang Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Usama Arif Cc: Wei Xu Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/trace/events/memcg.h | 10 +++++----- mm/memcontrol.c | 18 ++++++++++++------ 2 files changed, 17 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/trace/events/memcg.h b/include/trace/events/memcg.h index dfe2f51019b4..51b62c5931fc 100644 --- a/include/trace/events/memcg.h +++ b/include/trace/events/memcg.h @@ -11,14 +11,14 @@ DECLARE_EVENT_CLASS(memcg_rstat_stats, - TP_PROTO(struct mem_cgroup *memcg, int item, int val), + TP_PROTO(struct mem_cgroup *memcg, int item, long val), TP_ARGS(memcg, item, val), TP_STRUCT__entry( __field(u64, id) __field(int, item) - __field(int, val) + __field(long, val) ), TP_fast_assign( @@ -27,20 +27,20 @@ DECLARE_EVENT_CLASS(memcg_rstat_stats, __entry->val = val; ), - TP_printk("memcg_id=%llu item=%d val=%d", + TP_printk("memcg_id=%llu item=%d val=%ld", __entry->id, __entry->item, __entry->val) ); DEFINE_EVENT(memcg_rstat_stats, mod_memcg_state, - TP_PROTO(struct mem_cgroup *memcg, int item, int val), + TP_PROTO(struct mem_cgroup *memcg, int item, long val), TP_ARGS(memcg, item, val) ); DEFINE_EVENT(memcg_rstat_stats, mod_memcg_lruvec_state, - TP_PROTO(struct mem_cgroup *memcg, int item, int val), + TP_PROTO(struct mem_cgroup *memcg, int item, long val), TP_ARGS(memcg, item, val) ); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4ee668c20fa6..685e6dd48ce5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -527,7 +527,7 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec, #ifdef CONFIG_MEMCG_V1 static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn, - enum node_stat_item idx, int val); + enum node_stat_item idx, long val); void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent, int idx) @@ -784,14 +784,20 @@ static int memcg_page_state_unit(int item); * Normalize the value passed into memcg_rstat_updated() to be in pages. Round * up non-zero sub-page updates to 1 page as zero page updates are ignored. */ -static int memcg_state_val_in_pages(int idx, int val) +static long memcg_state_val_in_pages(int idx, long val) { int unit = memcg_page_state_unit(idx); + long res; if (!val || unit == PAGE_SIZE) return val; - else - return max(val * unit / PAGE_SIZE, 1UL); + + /* Get the absolute value of (val * unit / PAGE_SIZE). */ + res = mult_frac(abs(val), unit, PAGE_SIZE); + /* Round up zero values. */ + res = res ? : 1; + + return val < 0 ? 
-res : res; } #ifdef CONFIG_MEMCG_V1 @@ -831,7 +837,7 @@ static inline void get_non_dying_memcg_end(void) #endif static void __mod_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx, int val) + enum memcg_stat_item idx, long val) { int i = memcg_stats_index(idx); int cpu; @@ -896,7 +902,7 @@ void reparent_memcg_state_local(struct mem_cgroup *memcg, #endif static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn, - enum node_stat_item idx, int val) + enum node_stat_item idx, long val) { struct mem_cgroup *memcg = pn->memcg; int i = memcg_stats_index(idx); -- cgit v1.2.3 From 1c514a2c6e4c3bf2016a1dbbddc36d19fdf52ce5 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 27 Mar 2026 18:16:30 +0800 Subject: mm: memcontrol: correct the nr_pages parameter type of mem_cgroup_update_lru_size() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The nr_pages parameter of mem_cgroup_update_lru_size() represents a page count. During the reparenting of LRU folios, the value passed to it can potentially exceed the maximum value of a 32-bit integer. It should be declared as long instead of int to match the types used in lruvec size accounting and to prevent possible overflow. Update the parameter type to long to ensure correctness. Link: https://lore.kernel.org/fd4140de44fa0a3978e4e2426731187fe8625f0b.1774604356.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Harry Yoo (Oracle) Cc: Allen Pais Cc: Axel Rasmussen Cc: Baoquan He Cc: David Hildenbrand Cc: Hamza Mahfooz Cc: Hugh Dickins Cc: Imran Khan Cc: Johannes Weiner Cc: Kamalesh Babulal Cc: Lance Yang Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Usama Arif Cc: Wei Xu Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 +- mm/memcontrol.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 086158969529..dc3fa687759b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -878,7 +878,7 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg) } void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, - int zid, int nr_pages); + int zid, long nr_pages); static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 685e6dd48ce5..c3d98ab41f1f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1472,7 +1472,7 @@ retry: * to or just after a page is removed from an lru list. */ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, - int zid, int nr_pages) + int zid, long nr_pages) { struct mem_cgroup_per_node *mz; unsigned long *lru_size; @@ -1489,7 +1489,7 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, size = *lru_size; if (WARN_ONCE(size < 0, - "%s(%p, %d, %d): lru_size %ld\n", + "%s(%p, %d, %ld): lru_size %ld\n", __func__, lruvec, lru, nr_pages, size)) { VM_BUG_ON(1); *lru_size = 0; -- cgit v1.2.3 From d9e4142e7635f6f7173854667c0695ce5b836bbc Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 16 Mar 2026 04:54:31 -0700 Subject: kho: add size parameter to kho_add_subtree() Patch series "kho: history: track previous kernel version and kexec boot count", v9. 
Use Kexec Handover (KHO) to pass the previous kernel's version string and the number of kexec reboots since the last cold boot to the next kernel, and print it at boot time. Example ======= [ 0.000000] Linux version 6.19.0-rc3-upstream-00047-ge5d992347849 ... [ 0.000000] KHO: exec from: 6.19.0-rc4-next-20260107upstream-00004-g3071b0dc4498 (count 1) Motivation ========== Bugs that only reproduce when kexecing from specific kernel versions are difficult to diagnose. These issues occur when a buggy kernel kexecs into a new kernel, with the bug manifesting only in the second kernel. Recent examples include: * eb2266312507 ("x86/boot: Fix page table access in 5-level to 4-level paging transition") * 77d48d39e991 ("efistub/tpm: Use ACPI reclaim memory for event log to avoid corruption") * 64b45dd46e15 ("x86/efi: skip memattr table on kexec boot") As kexec-based reboots become more common, these version-dependent bugs are appearing more frequently. At scale, correlating crashes to the previous kernel version is challenging, especially when issues only occur in specific transition scenarios. Some bugs manifest only after multiple consecutive kexec reboots. Tracking the kexec count helps identify these cases (this metric is already used by live update sub-system). KHO provides a reliable mechanism to pass information between kernels. By carrying the previous kernel's release string and kexec count forward, we can print this context at boot time to aid debugging. The goal of this feature is to have this information being printed in early boot, so, users can trace back kernel releases in kexec. Systemd is not helpful because we cannot assume that the previous kernel has systemd or even write access to the disk (common when using Linux as bootloaders) This patch (of 6): kho_add_subtree() assumes the fdt argument is always an FDT and calls fdt_totalsize() on it in the debugfs code path. This assumption will break if a caller passes arbitrary data instead of an FDT. When CONFIG_KEXEC_HANDOVER_DEBUGFS is enabled, kho_debugfs_fdt_add() calls __kho_debugfs_fdt_add(), which executes: f->wrapper.size = fdt_totalsize(fdt); Fix this by adding an explicit size parameter to kho_add_subtree() so callers specify the blob size. This allows subtrees to contain arbitrary data formats, not just FDTs. Update all callers: - memblock.c: use fdt_totalsize(fdt) - luo_core.c: use fdt_totalsize(fdt_out) - test_kho.c: use fdt_totalsize() - kexec_handover.c (root fdt): use fdt_totalsize(kho_out.fdt) Also update __kho_debugfs_fdt_add() to receive the size explicitly instead of computing it internally via fdt_totalsize(). In kho_in_debugfs_init(), pass fdt_totalsize() for the root FDT and sub-blobs since all current users are FDTs. A subsequent patch will persist the size in the KHO FDT so the incoming side can handle non-FDT blobs correctly. Link: https://lore.kernel.org/20260323110747.193569-1-duanchenghao@kylinos.cn Link: https://lore.kernel.org/20260316-kho-v9-1-ed6dcd951988@debian.org Signed-off-by: Breno Leitao Suggested-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: David Hildenbrand Cc: Jonathan Corbet Cc: "Liam R. 
Howlett" Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Pasha Tatashin Cc: SeongJae Park Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kexec_handover.h | 4 ++-- kernel/liveupdate/kexec_handover.c | 8 +++++--- kernel/liveupdate/kexec_handover_debugfs.c | 15 +++++++++------ kernel/liveupdate/kexec_handover_internal.h | 5 +++-- kernel/liveupdate/luo_core.c | 3 ++- lib/test_kho.c | 3 ++- mm/memblock.c | 2 +- 7 files changed, 24 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index ac4129d1d741..abb1d324f42d 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -32,7 +32,7 @@ void kho_restore_free(void *mem); struct folio *kho_restore_folio(phys_addr_t phys); struct page *kho_restore_pages(phys_addr_t phys, unsigned long nr_pages); void *kho_restore_vmalloc(const struct kho_vmalloc *preservation); -int kho_add_subtree(const char *name, void *fdt); +int kho_add_subtree(const char *name, void *fdt, size_t size); void kho_remove_subtree(void *fdt); int kho_retrieve_subtree(const char *name, phys_addr_t *phys); @@ -97,7 +97,7 @@ static inline void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) return NULL; } -static inline int kho_add_subtree(const char *name, void *fdt) +static inline int kho_add_subtree(const char *name, void *fdt, size_t size) { return -EOPNOTSUPP; } diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 532f455c5d4f..8cc25e29ff91 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -727,6 +727,7 @@ err_disable_kho: * kho_add_subtree - record the physical address of a sub FDT in KHO root tree. * @name: name of the sub tree. * @fdt: the sub tree blob. + * @size: size of the blob in bytes. * * Creates a new child node named @name in KHO root FDT and records * the physical address of @fdt. 
The pages of @fdt must also be preserved @@ -738,7 +739,7 @@ err_disable_kho: * * Return: 0 on success, error code on failure */ -int kho_add_subtree(const char *name, void *fdt) +int kho_add_subtree(const char *name, void *fdt, size_t size) { phys_addr_t phys = virt_to_phys(fdt); void *root_fdt = kho_out.fdt; @@ -763,7 +764,7 @@ int kho_add_subtree(const char *name, void *fdt) if (err < 0) goto out_pack; - WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, false)); + WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, size, false)); out_pack: fdt_pack(root_fdt); @@ -1431,7 +1432,8 @@ static __init int kho_init(void) } WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, "fdt", - kho_out.fdt, true)); + kho_out.fdt, + fdt_totalsize(kho_out.fdt), true)); return 0; diff --git a/kernel/liveupdate/kexec_handover_debugfs.c b/kernel/liveupdate/kexec_handover_debugfs.c index acf368222682..ca0153736af1 100644 --- a/kernel/liveupdate/kexec_handover_debugfs.c +++ b/kernel/liveupdate/kexec_handover_debugfs.c @@ -25,7 +25,7 @@ struct fdt_debugfs { }; static int __kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, - const char *name, const void *fdt) + const char *name, const void *fdt, size_t size) { struct fdt_debugfs *f; struct dentry *file; @@ -35,7 +35,7 @@ static int __kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, return -ENOMEM; f->wrapper.data = (void *)fdt; - f->wrapper.size = fdt_totalsize(fdt); + f->wrapper.size = size; file = debugfs_create_blob(name, 0400, dir, &f->wrapper); if (IS_ERR(file)) { @@ -50,7 +50,7 @@ static int __kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, } int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, - const void *fdt, bool root) + const void *fdt, size_t size, bool root) { struct dentry *dir; @@ -59,7 +59,7 @@ int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, else dir = dbg->sub_fdt_dir; - return __kho_debugfs_fdt_add(&dbg->fdt_list, dir, name, fdt); + return __kho_debugfs_fdt_add(&dbg->fdt_list, dir, name, fdt, size); } void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt) @@ -113,7 +113,8 @@ __init void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt) goto err_rmdir; } - err = __kho_debugfs_fdt_add(&dbg->fdt_list, dir, "fdt", fdt); + err = __kho_debugfs_fdt_add(&dbg->fdt_list, dir, "fdt", fdt, + fdt_totalsize(fdt)); if (err) goto err_rmdir; @@ -121,6 +122,7 @@ __init void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt) int len = 0; const char *name = fdt_get_name(fdt, child, NULL); const u64 *fdt_phys; + void *sub_fdt; fdt_phys = fdt_getprop(fdt, child, KHO_FDT_SUB_TREE_PROP_NAME, &len); if (!fdt_phys) @@ -130,8 +132,9 @@ __init void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt) name, len); continue; } + sub_fdt = phys_to_virt(*fdt_phys); err = __kho_debugfs_fdt_add(&dbg->fdt_list, sub_fdt_dir, name, - phys_to_virt(*fdt_phys)); + sub_fdt, fdt_totalsize(sub_fdt)); if (err) { pr_warn("failed to add fdt %s to debugfs: %pe\n", name, ERR_PTR(err)); diff --git a/kernel/liveupdate/kexec_handover_internal.h b/kernel/liveupdate/kexec_handover_internal.h index 9a832a35254c..2a28cb8db9b0 100644 --- a/kernel/liveupdate/kexec_handover_internal.h +++ b/kernel/liveupdate/kexec_handover_internal.h @@ -27,7 +27,7 @@ int kho_debugfs_init(void); void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt); int kho_out_debugfs_init(struct kho_debugfs *dbg); int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, - const void 
*fdt, bool root); + const void *fdt, size_t size, bool root); void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt); #else static inline int kho_debugfs_init(void) { return 0; } @@ -35,7 +35,8 @@ static inline void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt) { } static inline int kho_out_debugfs_init(struct kho_debugfs *dbg) { return 0; } static inline int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, - const void *fdt, bool root) { return 0; } + const void *fdt, size_t size, + bool root) { return 0; } static inline void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt) { } #endif /* CONFIG_KEXEC_HANDOVER_DEBUGFS */ diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c index 84ac728d63ba..04d06a0906c0 100644 --- a/kernel/liveupdate/luo_core.c +++ b/kernel/liveupdate/luo_core.c @@ -172,7 +172,8 @@ static int __init luo_fdt_setup(void) if (err) goto exit_free; - err = kho_add_subtree(LUO_FDT_KHO_ENTRY_NAME, fdt_out); + err = kho_add_subtree(LUO_FDT_KHO_ENTRY_NAME, fdt_out, + fdt_totalsize(fdt_out)); if (err) goto exit_free; luo_global.fdt_out = fdt_out; diff --git a/lib/test_kho.c b/lib/test_kho.c index 7ef9e4061869..263182437315 100644 --- a/lib/test_kho.c +++ b/lib/test_kho.c @@ -143,7 +143,8 @@ static int kho_test_preserve(struct kho_test_state *state) if (err) goto err_unpreserve_data; - err = kho_add_subtree(KHO_TEST_FDT, folio_address(state->fdt)); + err = kho_add_subtree(KHO_TEST_FDT, folio_address(state->fdt), + fdt_totalsize(folio_address(state->fdt))); if (err) goto err_unpreserve_data; diff --git a/mm/memblock.c b/mm/memblock.c index b3ddfdec7a80..91d4162eec63 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2510,7 +2510,7 @@ static int __init prepare_kho_fdt(void) if (err) goto err_unpreserve_fdt; - err = kho_add_subtree(MEMBLOCK_KHO_FDT, fdt); + err = kho_add_subtree(MEMBLOCK_KHO_FDT, fdt, fdt_totalsize(fdt)); if (err) goto err_unpreserve_fdt; -- cgit v1.2.3 From 4916ae386760ad666eafa8afc075957bf479afbc Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 16 Mar 2026 04:54:32 -0700 Subject: kho: rename fdt parameter to blob in kho_add/remove_subtree() Since kho_add_subtree() now accepts arbitrary data blobs (not just FDTs), rename the parameter from 'fdt' to 'blob' to better reflect its purpose. Apply the same rename to kho_remove_subtree() for consistency. Also rename kho_debugfs_fdt_add() and kho_debugfs_fdt_remove() to kho_debugfs_blob_add() and kho_debugfs_blob_remove() respectively, with the same parameter rename from 'fdt' to 'blob'. Link: https://lore.kernel.org/20260316-kho-v9-2-ed6dcd951988@debian.org Signed-off-by: Breno Leitao Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: David Hildenbrand Cc: Jonathan Corbet Cc: "Liam R. 
Howlett" Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Pasha Tatashin Cc: SeongJae Park Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/kho.rst | 2 +- include/linux/kexec_handover.h | 8 +++---- kernel/liveupdate/kexec_handover.c | 33 +++++++++++++++-------------- kernel/liveupdate/kexec_handover_debugfs.c | 25 +++++++++++----------- kernel/liveupdate/kexec_handover_internal.h | 16 +++++++------- 5 files changed, 43 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/Documentation/admin-guide/mm/kho.rst b/Documentation/admin-guide/mm/kho.rst index cb9a20f64920..6a4ddf344046 100644 --- a/Documentation/admin-guide/mm/kho.rst +++ b/Documentation/admin-guide/mm/kho.rst @@ -80,5 +80,5 @@ stabilized. it finished to interpret their metadata. ``/sys/kernel/debug/kho/in/sub_fdts/`` - Similar to ``kho/out/sub_fdts/``, but contains sub FDT blobs + Similar to ``kho/out/sub_fdts/``, but contains sub blobs of KHO producers passed from the old kernel. diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index abb1d324f42d..0666cf298c7f 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -32,8 +32,8 @@ void kho_restore_free(void *mem); struct folio *kho_restore_folio(phys_addr_t phys); struct page *kho_restore_pages(phys_addr_t phys, unsigned long nr_pages); void *kho_restore_vmalloc(const struct kho_vmalloc *preservation); -int kho_add_subtree(const char *name, void *fdt, size_t size); -void kho_remove_subtree(void *fdt); +int kho_add_subtree(const char *name, void *blob, size_t size); +void kho_remove_subtree(void *blob); int kho_retrieve_subtree(const char *name, phys_addr_t *phys); void kho_memory_init(void); @@ -97,12 +97,12 @@ static inline void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) return NULL; } -static inline int kho_add_subtree(const char *name, void *fdt, size_t size) +static inline int kho_add_subtree(const char *name, void *blob, size_t size) { return -EOPNOTSUPP; } -static inline void kho_remove_subtree(void *fdt) { } +static inline void kho_remove_subtree(void *blob) { } static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys) { diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 8cc25e29ff91..711b6c3376e7 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -724,13 +724,13 @@ err_disable_kho: } /** - * kho_add_subtree - record the physical address of a sub FDT in KHO root tree. + * kho_add_subtree - record the physical address of a sub blob in KHO root tree. * @name: name of the sub tree. - * @fdt: the sub tree blob. + * @blob: the sub tree blob. * @size: size of the blob in bytes. * * Creates a new child node named @name in KHO root FDT and records - * the physical address of @fdt. The pages of @fdt must also be preserved + * the physical address of @blob. The pages of @blob must also be preserved * by KHO for the new kernel to retrieve it after kexec. 
* * A debugfs blob entry is also created at @@ -739,9 +739,9 @@ err_disable_kho: * * Return: 0 on success, error code on failure */ -int kho_add_subtree(const char *name, void *fdt, size_t size) +int kho_add_subtree(const char *name, void *blob, size_t size) { - phys_addr_t phys = virt_to_phys(fdt); + phys_addr_t phys = virt_to_phys(blob); void *root_fdt = kho_out.fdt; int err = -ENOMEM; int off, fdt_err; @@ -764,7 +764,8 @@ int kho_add_subtree(const char *name, void *fdt, size_t size) if (err < 0) goto out_pack; - WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, size, false)); + WARN_ON_ONCE(kho_debugfs_blob_add(&kho_out.dbg, name, blob, + size, false)); out_pack: fdt_pack(root_fdt); @@ -773,9 +774,9 @@ out_pack: } EXPORT_SYMBOL_GPL(kho_add_subtree); -void kho_remove_subtree(void *fdt) +void kho_remove_subtree(void *blob) { - phys_addr_t target_phys = virt_to_phys(fdt); + phys_addr_t target_phys = virt_to_phys(blob); void *root_fdt = kho_out.fdt; int off; int err; @@ -797,7 +798,7 @@ void kho_remove_subtree(void *fdt) if ((phys_addr_t)*val == target_phys) { fdt_del_node(root_fdt, off); - kho_debugfs_fdt_remove(&kho_out.dbg, fdt); + kho_debugfs_blob_remove(&kho_out.dbg, blob); break; } } @@ -1293,11 +1294,11 @@ bool is_kho_boot(void) EXPORT_SYMBOL_GPL(is_kho_boot); /** - * kho_retrieve_subtree - retrieve a preserved sub FDT by its name. - * @name: the name of the sub FDT passed to kho_add_subtree(). - * @phys: if found, the physical address of the sub FDT is stored in @phys. + * kho_retrieve_subtree - retrieve a preserved sub blob by its name. + * @name: the name of the sub blob passed to kho_add_subtree(). + * @phys: if found, the physical address of the sub blob is stored in @phys. * - * Retrieve a preserved sub FDT named @name and store its physical + * Retrieve a preserved sub blob named @name and store its physical * address in @phys. 
* * Return: 0 on success, error code on failure @@ -1431,9 +1432,9 @@ static __init int kho_init(void) init_cma_reserved_pageblock(pfn_to_page(pfn)); } - WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, "fdt", - kho_out.fdt, - fdt_totalsize(kho_out.fdt), true)); + WARN_ON_ONCE(kho_debugfs_blob_add(&kho_out.dbg, "fdt", + kho_out.fdt, + fdt_totalsize(kho_out.fdt), true)); return 0; diff --git a/kernel/liveupdate/kexec_handover_debugfs.c b/kernel/liveupdate/kexec_handover_debugfs.c index ca0153736af1..cab923e4f5c8 100644 --- a/kernel/liveupdate/kexec_handover_debugfs.c +++ b/kernel/liveupdate/kexec_handover_debugfs.c @@ -24,8 +24,9 @@ struct fdt_debugfs { struct dentry *file; }; -static int __kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, - const char *name, const void *fdt, size_t size) +static int __kho_debugfs_blob_add(struct list_head *list, struct dentry *dir, + const char *name, const void *blob, + size_t size) { struct fdt_debugfs *f; struct dentry *file; @@ -34,7 +35,7 @@ static int __kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, if (!f) return -ENOMEM; - f->wrapper.data = (void *)fdt; + f->wrapper.data = (void *)blob; f->wrapper.size = size; file = debugfs_create_blob(name, 0400, dir, &f->wrapper); @@ -49,8 +50,8 @@ static int __kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, return 0; } -int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, - const void *fdt, size_t size, bool root) +int kho_debugfs_blob_add(struct kho_debugfs *dbg, const char *name, + const void *blob, size_t size, bool root) { struct dentry *dir; @@ -59,15 +60,15 @@ int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, else dir = dbg->sub_fdt_dir; - return __kho_debugfs_fdt_add(&dbg->fdt_list, dir, name, fdt, size); + return __kho_debugfs_blob_add(&dbg->fdt_list, dir, name, blob, size); } -void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt) +void kho_debugfs_blob_remove(struct kho_debugfs *dbg, void *blob) { struct fdt_debugfs *ff; list_for_each_entry(ff, &dbg->fdt_list, list) { - if (ff->wrapper.data == fdt) { + if (ff->wrapper.data == blob) { debugfs_remove(ff->file); list_del(&ff->list); kfree(ff); @@ -113,8 +114,8 @@ __init void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt) goto err_rmdir; } - err = __kho_debugfs_fdt_add(&dbg->fdt_list, dir, "fdt", fdt, - fdt_totalsize(fdt)); + err = __kho_debugfs_blob_add(&dbg->fdt_list, dir, "fdt", fdt, + fdt_totalsize(fdt)); if (err) goto err_rmdir; @@ -133,8 +134,8 @@ __init void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt) continue; } sub_fdt = phys_to_virt(*fdt_phys); - err = __kho_debugfs_fdt_add(&dbg->fdt_list, sub_fdt_dir, name, - sub_fdt, fdt_totalsize(sub_fdt)); + err = __kho_debugfs_blob_add(&dbg->fdt_list, sub_fdt_dir, name, + sub_fdt, fdt_totalsize(sub_fdt)); if (err) { pr_warn("failed to add fdt %s to debugfs: %pe\n", name, ERR_PTR(err)); diff --git a/kernel/liveupdate/kexec_handover_internal.h b/kernel/liveupdate/kexec_handover_internal.h index 2a28cb8db9b0..0399ff107775 100644 --- a/kernel/liveupdate/kexec_handover_internal.h +++ b/kernel/liveupdate/kexec_handover_internal.h @@ -26,19 +26,19 @@ extern unsigned int kho_scratch_cnt; int kho_debugfs_init(void); void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt); int kho_out_debugfs_init(struct kho_debugfs *dbg); -int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, - const void *fdt, size_t size, bool root); -void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void 
*fdt); +int kho_debugfs_blob_add(struct kho_debugfs *dbg, const char *name, + const void *blob, size_t size, bool root); +void kho_debugfs_blob_remove(struct kho_debugfs *dbg, void *blob); #else static inline int kho_debugfs_init(void) { return 0; } static inline void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt) { } static inline int kho_out_debugfs_init(struct kho_debugfs *dbg) { return 0; } -static inline int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, - const void *fdt, size_t size, - bool root) { return 0; } -static inline void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, - void *fdt) { } +static inline int kho_debugfs_blob_add(struct kho_debugfs *dbg, + const char *name, const void *blob, + size_t size, bool root) { return 0; } +static inline void kho_debugfs_blob_remove(struct kho_debugfs *dbg, + void *blob) { } #endif /* CONFIG_KEXEC_HANDOVER_DEBUGFS */ #ifdef CONFIG_KEXEC_HANDOVER_DEBUG -- cgit v1.2.3 From 85e41392820fcf0f7a3f9784cea907905f921358 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 16 Mar 2026 04:54:33 -0700 Subject: kho: persist blob size in KHO FDT kho_add_subtree() accepts a size parameter but only forwards it to debugfs. The size is not persisted in the KHO FDT, so it is lost across kexec. This makes it impossible for the incoming kernel to determine the blob size without understanding the blob format. Store the blob size as a "blob-size" property in the KHO FDT alongside the "preserved-data" physical address. This allows the receiving kernel to recover the size for any blob regardless of format. Also extend kho_retrieve_subtree() with an optional size output parameter so callers can learn the blob size without needing to understand the blob format. Update all callers to pass NULL for the new parameter. Link: https://lore.kernel.org/20260316-kho-v9-3-ed6dcd951988@debian.org Signed-off-by: Breno Leitao Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: David Hildenbrand Cc: Jonathan Corbet Cc: "Liam R. 
Howlett" Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Pasha Tatashin Cc: SeongJae Park Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kexec_handover.h | 5 +++-- include/linux/kho/abi/kexec_handover.h | 20 ++++++++++++++++---- kernel/liveupdate/kexec_handover.c | 27 ++++++++++++++++++++++----- kernel/liveupdate/kexec_handover_debugfs.c | 3 ++- kernel/liveupdate/luo_core.c | 2 +- lib/test_kho.c | 2 +- mm/memblock.c | 2 +- 7 files changed, 46 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index 0666cf298c7f..8968c56d2d73 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -34,7 +34,7 @@ struct page *kho_restore_pages(phys_addr_t phys, unsigned long nr_pages); void *kho_restore_vmalloc(const struct kho_vmalloc *preservation); int kho_add_subtree(const char *name, void *blob, size_t size); void kho_remove_subtree(void *blob); -int kho_retrieve_subtree(const char *name, phys_addr_t *phys); +int kho_retrieve_subtree(const char *name, phys_addr_t *phys, size_t *size); void kho_memory_init(void); @@ -104,7 +104,8 @@ static inline int kho_add_subtree(const char *name, void *blob, size_t size) static inline void kho_remove_subtree(void *blob) { } -static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys) +static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys, + size_t *size) { return -EOPNOTSUPP; } diff --git a/include/linux/kho/abi/kexec_handover.h b/include/linux/kho/abi/kexec_handover.h index 6b7d8ef550f9..7e847a2339b0 100644 --- a/include/linux/kho/abi/kexec_handover.h +++ b/include/linux/kho/abi/kexec_handover.h @@ -41,25 +41,28 @@ * restore the preserved data.:: * * / { - * compatible = "kho-v2"; + * compatible = "kho-v3"; * * preserved-memory-map = <0x...>; * * { * preserved-data = <0x...>; + * blob-size = <0x...>; * }; * * { * preserved-data = <0x...>; + * blob-size = <0x...>; * }; * ... ... * { * preserved-data = <0x...>; + * blob-size = <0x...>; * }; * }; * * Root KHO Node (/): - * - compatible: "kho-v2" + * - compatible: "kho-v3" * * Indentifies the overall KHO ABI version. * @@ -78,16 +81,25 @@ * * Physical address pointing to a subnode data blob that is also * being preserved. + * + * - blob-size: u64 + * + * Size in bytes of the preserved data blob. This is needed because + * blobs may use arbitrary formats (not just FDT), so the size + * cannot be determined from the blob content alone. */ /* The compatible string for the KHO FDT root node. */ -#define KHO_FDT_COMPATIBLE "kho-v2" +#define KHO_FDT_COMPATIBLE "kho-v3" /* The FDT property for the preserved memory map. */ #define KHO_FDT_MEMORY_MAP_PROP_NAME "preserved-memory-map" /* The FDT property for preserved data blobs. */ -#define KHO_FDT_SUB_TREE_PROP_NAME "preserved-data" +#define KHO_SUB_TREE_PROP_NAME "preserved-data" + +/* The FDT property for the size of preserved data blobs. 
*/ +#define KHO_SUB_TREE_SIZE_PROP_NAME "blob-size" /** * DOC: Kexec Handover ABI for vmalloc Preservation diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 711b6c3376e7..adf6541f70f9 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -743,6 +743,7 @@ int kho_add_subtree(const char *name, void *blob, size_t size) { phys_addr_t phys = virt_to_phys(blob); void *root_fdt = kho_out.fdt; + u64 size_u64 = size; int err = -ENOMEM; int off, fdt_err; @@ -759,11 +760,16 @@ int kho_add_subtree(const char *name, void *blob, size_t size) goto out_pack; } - err = fdt_setprop(root_fdt, off, KHO_FDT_SUB_TREE_PROP_NAME, + err = fdt_setprop(root_fdt, off, KHO_SUB_TREE_PROP_NAME, &phys, sizeof(phys)); if (err < 0) goto out_pack; + err = fdt_setprop(root_fdt, off, KHO_SUB_TREE_SIZE_PROP_NAME, + &size_u64, sizeof(size_u64)); + if (err < 0) + goto out_pack; + WARN_ON_ONCE(kho_debugfs_blob_add(&kho_out.dbg, name, blob, size, false)); @@ -792,7 +798,7 @@ void kho_remove_subtree(void *blob) const u64 *val; int len; - val = fdt_getprop(root_fdt, off, KHO_FDT_SUB_TREE_PROP_NAME, &len); + val = fdt_getprop(root_fdt, off, KHO_SUB_TREE_PROP_NAME, &len); if (!val || len != sizeof(phys_addr_t)) continue; @@ -1297,13 +1303,14 @@ EXPORT_SYMBOL_GPL(is_kho_boot); * kho_retrieve_subtree - retrieve a preserved sub blob by its name. * @name: the name of the sub blob passed to kho_add_subtree(). * @phys: if found, the physical address of the sub blob is stored in @phys. + * @size: if not NULL and found, the size of the sub blob is stored in @size. * * Retrieve a preserved sub blob named @name and store its physical - * address in @phys. + * address in @phys and optionally its size in @size. * * Return: 0 on success, error code on failure */ -int kho_retrieve_subtree(const char *name, phys_addr_t *phys) +int kho_retrieve_subtree(const char *name, phys_addr_t *phys, size_t *size) { const void *fdt = kho_get_fdt(); const u64 *val; @@ -1319,12 +1326,22 @@ int kho_retrieve_subtree(const char *name, phys_addr_t *phys) if (offset < 0) return -ENOENT; - val = fdt_getprop(fdt, offset, KHO_FDT_SUB_TREE_PROP_NAME, &len); + val = fdt_getprop(fdt, offset, KHO_SUB_TREE_PROP_NAME, &len); if (!val || len != sizeof(*val)) return -EINVAL; *phys = (phys_addr_t)*val; + val = fdt_getprop(fdt, offset, KHO_SUB_TREE_SIZE_PROP_NAME, &len); + if (!val || len != sizeof(*val)) { + pr_warn("broken KHO subnode '%s': missing or invalid blob-size property\n", + name); + return -EINVAL; + } + + if (size) + *size = (size_t)*val; + return 0; } EXPORT_SYMBOL_GPL(kho_retrieve_subtree); diff --git a/kernel/liveupdate/kexec_handover_debugfs.c b/kernel/liveupdate/kexec_handover_debugfs.c index cab923e4f5c8..b416846810d7 100644 --- a/kernel/liveupdate/kexec_handover_debugfs.c +++ b/kernel/liveupdate/kexec_handover_debugfs.c @@ -125,7 +125,8 @@ __init void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt) const u64 *fdt_phys; void *sub_fdt; - fdt_phys = fdt_getprop(fdt, child, KHO_FDT_SUB_TREE_PROP_NAME, &len); + fdt_phys = fdt_getprop(fdt, child, + KHO_SUB_TREE_PROP_NAME, &len); if (!fdt_phys) continue; if (len != sizeof(*fdt_phys)) { diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c index 04d06a0906c0..48b25c9abeda 100644 --- a/kernel/liveupdate/luo_core.c +++ b/kernel/liveupdate/luo_core.c @@ -88,7 +88,7 @@ static int __init luo_early_startup(void) } /* Retrieve LUO subtree, and verify its format. 
*/ - err = kho_retrieve_subtree(LUO_FDT_KHO_ENTRY_NAME, &fdt_phys); + err = kho_retrieve_subtree(LUO_FDT_KHO_ENTRY_NAME, &fdt_phys, NULL); if (err) { if (err != -ENOENT) { pr_err("failed to retrieve FDT '%s' from KHO: %pe\n", diff --git a/lib/test_kho.c b/lib/test_kho.c index 263182437315..aa6a0956bb8b 100644 --- a/lib/test_kho.c +++ b/lib/test_kho.c @@ -319,7 +319,7 @@ static int __init kho_test_init(void) if (!kho_is_enabled()) return 0; - err = kho_retrieve_subtree(KHO_TEST_FDT, &fdt_phys); + err = kho_retrieve_subtree(KHO_TEST_FDT, &fdt_phys, NULL); if (!err) { err = kho_test_restore(fdt_phys); if (err) diff --git a/mm/memblock.c b/mm/memblock.c index 91d4162eec63..a1c6dd0f6fad 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2555,7 +2555,7 @@ static void *__init reserve_mem_kho_retrieve_fdt(void) if (fdt) return fdt; - err = kho_retrieve_subtree(MEMBLOCK_KHO_FDT, &fdt_phys); + err = kho_retrieve_subtree(MEMBLOCK_KHO_FDT, &fdt_phys, NULL); if (err) { if (err != -ENOENT) pr_warn("failed to retrieve FDT '%s' from KHO: %d\n", -- cgit v1.2.3 From 76aa46b9e4049247858309c6e3527d477da2b2fe Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 16 Mar 2026 04:54:35 -0700 Subject: kho: kexec-metadata: track previous kernel chain Use Kexec Handover (KHO) to pass the previous kernel's version string and the number of kexec reboots since the last cold boot to the next kernel, and print it at boot time. Example output: [ 0.000000] KHO: exec from: 6.19.0-rc4-next-20260107 (count 1) Motivation ========== Bugs that only reproduce when kexecing from specific kernel versions are difficult to diagnose. These issues occur when a buggy kernel kexecs into a new kernel, with the bug manifesting only in the second kernel. Recent examples include the following commits: * commit eb2266312507 ("x86/boot: Fix page table access in 5-level to 4-level paging transition") * commit 77d48d39e991 ("efistub/tpm: Use ACPI reclaim memory for event log to avoid corruption") * commit 64b45dd46e15 ("x86/efi: skip memattr table on kexec boot") As kexec-based reboots become more common, these version-dependent bugs are appearing more frequently. At scale, correlating crashes to the previous kernel version is challenging, especially when issues only occur in specific transition scenarios. Implementation ============== The kexec metadata is stored as a plain C struct (struct kho_kexec_metadata) rather than FDT format, for simplicity and direct field access. It is registered via kho_add_subtree() as a separate subtree, keeping it independent from the core KHO ABI. This design choice: - Keeps the core KHO ABI minimal and stable - Allows the metadata format to evolve independently - Avoids requiring version bumps for all KHO consumers (LUO, etc.) when the metadata format changes The struct kho_kexec_metadata contains two fields: - previous_release: The kernel version that initiated the kexec - kexec_count: Number of kexec boots since last cold boot On cold boot, kexec_count starts at 0 and increments with each kexec. The count helps identify issues that only manifest after multiple consecutive kexec reboots. 
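For illustration, the consumer side of this metadata can be distilled into the sketch below. It is not the literal kernel/liveupdate/kexec_handover.c code (the function name is a placeholder), but it only relies on interfaces this series itself provides: kho_retrieve_subtree() with the new size parameter, phys_to_virt(), and the KHO_METADATA_NODE_NAME / KHO_KEXEC_METADATA_VERSION constants from the new ABI header.

	#include <linux/io.h>
	#include <linux/kexec_handover.h>
	#include <linux/kho/abi/kexec_metadata.h>
	#include <linux/printk.h>

	static void __init report_kexec_chain(void)
	{
		struct kho_kexec_metadata *md;
		phys_addr_t phys;
		size_t size;

		/* Absent on cold boot, or if the old kernel exported nothing. */
		if (kho_retrieve_subtree(KHO_METADATA_NODE_NAME, &phys, &size))
			return;

		/* At minimum the leading version field must be present. */
		if (size < sizeof(u32))
			return;

		md = phys_to_virt(phys);
		if (md->version != KHO_KEXEC_METADATA_VERSION ||
		    size < sizeof(*md))
			return;

		pr_info("exec from: %s (count %u)\n",
			md->previous_release, md->kexec_count);
	}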
[leitao@debian.org: call kho_kexec_metadata_init() for both boot paths] Link: https://lore.kernel.org/all/20260309-kho-v8-5-c3abcf4ac750@debian.org/ [1] Link: https://lore.kernel.org/20260409-kho_fix_merge_issue-v1-1-710c84ceaa85@debian.org Link: https://lore.kernel.org/20260316-kho-v9-5-ed6dcd951988@debian.org Signed-off-by: Breno Leitao Acked-by: SeongJae Park Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: David Hildenbrand Cc: Jonathan Corbet Cc: "Liam R. Howlett" Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Pasha Tatashin Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kho/abi/kexec_metadata.h | 46 ++++++++++++++++ kernel/liveupdate/kexec_handover.c | 98 ++++++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+) create mode 100644 include/linux/kho/abi/kexec_metadata.h (limited to 'include') diff --git a/include/linux/kho/abi/kexec_metadata.h b/include/linux/kho/abi/kexec_metadata.h new file mode 100644 index 000000000000..e9e3f7e38a7c --- /dev/null +++ b/include/linux/kho/abi/kexec_metadata.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +/** + * DOC: Kexec Metadata ABI + * + * The "kexec-metadata" subtree stores optional metadata about the kexec chain. + * It is registered via kho_add_subtree(), keeping it independent from the core + * KHO ABI. This allows the metadata format to evolve without affecting other + * KHO consumers. + * + * The metadata is stored as a plain C struct rather than FDT format for + * simplicity and direct field access. + * + * Copyright (c) 2026 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2026 Breno Leitao + */ + +#ifndef _LINUX_KHO_ABI_KEXEC_METADATA_H +#define _LINUX_KHO_ABI_KEXEC_METADATA_H + +#include +#include + +#define KHO_KEXEC_METADATA_VERSION 1 + +/** + * struct kho_kexec_metadata - Kexec metadata passed between kernels + * @version: ABI version of this struct (must be first field) + * @previous_release: Kernel version string that initiated the kexec + * @kexec_count: Number of kexec boots since last cold boot + * + * This structure is preserved across kexec and allows the new kernel to + * identify which kernel it was booted from and how many kexec reboots + * have occurred. + * + * __NEW_UTS_LEN is part of uABI, so it safe to use it in here. 
+ */ +struct kho_kexec_metadata { + u32 version; + char previous_release[__NEW_UTS_LEN + 1]; + u32 kexec_count; +} __packed; + +#define KHO_METADATA_NODE_NAME "kexec-metadata" + +#endif /* _LINUX_KHO_ABI_KEXEC_METADATA_H */ diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index adf6541f70f9..94762de1fe5f 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -18,7 +18,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -1268,6 +1270,8 @@ EXPORT_SYMBOL_GPL(kho_restore_free); struct kho_in { phys_addr_t fdt_phys; phys_addr_t scratch_phys; + char previous_release[__NEW_UTS_LEN + 1]; + u32 kexec_count; struct kho_debugfs dbg; }; @@ -1392,6 +1396,96 @@ static __init int kho_out_fdt_setup(void) return err; } +static void __init kho_in_kexec_metadata(void) +{ + struct kho_kexec_metadata *metadata; + phys_addr_t metadata_phys; + size_t blob_size; + int err; + + err = kho_retrieve_subtree(KHO_METADATA_NODE_NAME, &metadata_phys, + &blob_size); + if (err) + /* This is fine, previous kernel didn't export metadata */ + return; + + /* Check that, at least, "version" is present */ + if (blob_size < sizeof(u32)) { + pr_warn("kexec-metadata blob too small (%zu bytes)\n", + blob_size); + return; + } + + metadata = phys_to_virt(metadata_phys); + + if (metadata->version != KHO_KEXEC_METADATA_VERSION) { + pr_warn("kexec-metadata version %u not supported (expected %u)\n", + metadata->version, KHO_KEXEC_METADATA_VERSION); + return; + } + + if (blob_size < sizeof(*metadata)) { + pr_warn("kexec-metadata blob too small for v%u (%zu < %zu)\n", + metadata->version, blob_size, sizeof(*metadata)); + return; + } + + /* + * Copy data to the kernel structure that will persist during + * kernel lifetime. + */ + kho_in.kexec_count = metadata->kexec_count; + strscpy(kho_in.previous_release, metadata->previous_release, + sizeof(kho_in.previous_release)); + + pr_info("exec from: %s (count %u)\n", + kho_in.previous_release, kho_in.kexec_count); +} + +/* + * Create kexec metadata to pass kernel version and boot count to the + * next kernel. This keeps the core KHO ABI minimal and allows the + * metadata format to evolve independently. 
+ */ +static __init int kho_out_kexec_metadata(void) +{ + struct kho_kexec_metadata *metadata; + int err; + + metadata = kho_alloc_preserve(sizeof(*metadata)); + if (IS_ERR(metadata)) + return PTR_ERR(metadata); + + metadata->version = KHO_KEXEC_METADATA_VERSION; + strscpy(metadata->previous_release, init_uts_ns.name.release, + sizeof(metadata->previous_release)); + /* kho_in.kexec_count is set to 0 on cold boot */ + metadata->kexec_count = kho_in.kexec_count + 1; + + err = kho_add_subtree(KHO_METADATA_NODE_NAME, metadata, + sizeof(*metadata)); + if (err) + kho_unpreserve_free(metadata); + + return err; +} + +static int __init kho_kexec_metadata_init(const void *fdt) +{ + int err; + + if (fdt) + kho_in_kexec_metadata(); + + /* Populate kexec metadata for the possible next kexec */ + err = kho_out_kexec_metadata(); + if (err) + pr_warn("failed to initialize kexec-metadata subtree: %d\n", + err); + + return err; +} + static __init int kho_init(void) { struct kho_radix_tree *tree = &kho_out.radix_tree; @@ -1425,6 +1519,10 @@ static __init int kho_init(void) if (err) goto err_free_fdt; + err = kho_kexec_metadata_init(fdt); + if (err) + goto err_free_fdt; + if (fdt) { kho_in_debugfs_init(&kho_in.dbg, fdt); return 0; -- cgit v1.2.3 From 00d0b372374f2528394aabf7b1f53f8dafe294de Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 26 Mar 2026 16:39:41 +0000 Subject: liveupdate: prevent double management of files Patch series "liveupdate: prevent double preservation", v4. Currently, LUO does not prevent the same file from being managed twice across different active sessions. Because LUO preserves files of absolutely different types: memfd, and upcoming vfiofd [1], iommufd [2], guestmefd (and possible kvmfd/cpufd). There is no common private data or guarantee on how to prevent that the same file is not preserved twice beside using inode or some slower and expensive method like hashtables. This patch (of 4) Currently, LUO does not prevent the same file from being managed twice across different active sessions. Use a global xarray luo_preserved_files to keep track of file identifiers being preserved by LUO. Update luo_preserve_file() to check and insert the file identifier into this xarray when it is preserved, and erase it in luo_file_unpreserve_files() when it is released. To allow handlers to define what constitutes a "unique" file (e.g., different struct file objects pointing to the same hardware resource), add a get_id() callback to struct liveupdate_file_ops. If not provided, the default identifier is the struct file pointer itself. This ensures that the same file (or resource) cannot be managed by multiple sessions. If another session attempts to preserve an already managed file, it will now fail with -EBUSY. 
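As an illustration of the new hook, a handler whose struct file objects are per-open views of a single hardware resource could key the xarray on the device instead of the file. This is a sketch only: the driver, its private data layout and all "mydev" names are hypothetical and not part of this series; only .get_id and .owner of struct liveupdate_file_ops are filled in.

	static unsigned long mydev_luo_get_id(struct file *file)
	{
		/* assumed: the driver stashes its device in private_data */
		struct mydev *dev = file->private_data;

		/* every file opened against the same device maps to one id */
		return (unsigned long)dev;
	}

	static struct liveupdate_file_ops mydev_luo_ops = {
		/* .can_preserve, .preserve, .unpreserve, .retrieve, ... */
		.get_id	= mydev_luo_get_id,
		.owner	= THIS_MODULE,
	};

Without a .get_id callback the identifier defaults to the struct file pointer, so preserving the same file from a second session still hits the xa_insert() collision and luo_preserve_file() returns -EBUSY.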
Link: https://lore.kernel.org/20260326163943.574070-1-pasha.tatashin@soleen.com Link: https://lore.kernel.org/20260326163943.574070-2-pasha.tatashin@soleen.com Link: https://lore.kernel.org/all/20260129212510.967611-1-dmatlack@google.com [1] Link: https://lore.kernel.org/all/20260203220948.2176157-1-skhawaja@google.com [2] Signed-off-by: Pasha Tatashin Reviewed-by: Samiullah Khawaja Reviewed-by: Mike Rapoport (Microsoft) Cc: David Matlack Cc: Pratyush Yadav Cc: Shuah Khan Cc: Christian Brauner Signed-off-by: Andrew Morton --- include/linux/liveupdate.h | 2 ++ kernel/liveupdate/luo_file.c | 32 ++++++++++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h index dd11fdc76a5f..61325ad26526 100644 --- a/include/linux/liveupdate.h +++ b/include/linux/liveupdate.h @@ -63,6 +63,7 @@ struct liveupdate_file_op_args { * finish, in order to do successful finish calls for all * resources in the session. * @finish: Required. Final cleanup in the new kernel. + * @get_id: Optional. Returns a unique identifier for the file. * @owner: Module reference * * All operations (except can_preserve) receive a pointer to a @@ -78,6 +79,7 @@ struct liveupdate_file_ops { int (*retrieve)(struct liveupdate_file_op_args *args); bool (*can_finish)(struct liveupdate_file_op_args *args); void (*finish)(struct liveupdate_file_op_args *args); + unsigned long (*get_id)(struct file *file); struct module *owner; }; diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c index 5acee4174bf0..09103cf81107 100644 --- a/kernel/liveupdate/luo_file.c +++ b/kernel/liveupdate/luo_file.c @@ -108,12 +108,16 @@ #include #include #include +#include #include #include #include "luo_internal.h" static LIST_HEAD(luo_file_handler_list); +/* Keep track of files being preserved by LUO */ +static DEFINE_XARRAY(luo_preserved_files); + /* 2 4K pages, give space for 128 files per file_set */ #define LUO_FILE_PGCNT 2ul #define LUO_FILE_MAX \ @@ -203,6 +207,12 @@ static void luo_free_files_mem(struct luo_file_set *file_set) file_set->files = NULL; } +static unsigned long luo_get_id(struct liveupdate_file_handler *fh, + struct file *file) +{ + return fh->ops->get_id ? fh->ops->get_id(file) : (unsigned long)file; +} + static bool luo_token_is_used(struct luo_file_set *file_set, u64 token) { struct luo_file *iter; @@ -248,6 +258,7 @@ static bool luo_token_is_used(struct luo_file_set *file_set, u64 token) * Context: Can be called from an ioctl handler during normal system operation. * Return: 0 on success. Returns a negative errno on failure: * -EEXIST if the token is already used. + * -EBUSY if the file descriptor is already preserved by another session. * -EBADF if the file descriptor is invalid. * -ENOSPC if the file_set is full. * -ENOENT if no compatible handler is found. 
@@ -288,10 +299,15 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd) if (err) goto err_free_files_mem; - err = luo_flb_file_preserve(fh); + err = xa_insert(&luo_preserved_files, luo_get_id(fh, file), + file, GFP_KERNEL); if (err) goto err_free_files_mem; + err = luo_flb_file_preserve(fh); + if (err) + goto err_erase_xa; + luo_file = kzalloc_obj(*luo_file); if (!luo_file) { err = -ENOMEM; @@ -320,6 +336,8 @@ err_kfree: kfree(luo_file); err_flb_unpreserve: luo_flb_file_unpreserve(fh); +err_erase_xa: + xa_erase(&luo_preserved_files, luo_get_id(fh, file)); err_free_files_mem: luo_free_files_mem(file_set); err_fput: @@ -363,6 +381,8 @@ void luo_file_unpreserve_files(struct luo_file_set *file_set) luo_file->fh->ops->unpreserve(&args); luo_flb_file_unpreserve(luo_file->fh); + xa_erase(&luo_preserved_files, + luo_get_id(luo_file->fh, luo_file->file)); list_del(&luo_file->list); file_set->count--; @@ -606,6 +626,11 @@ int luo_retrieve_file(struct luo_file_set *file_set, u64 token, luo_file->file = args.file; /* Get reference so we can keep this file in LUO until finish */ get_file(luo_file->file); + + WARN_ON(xa_insert(&luo_preserved_files, + luo_get_id(luo_file->fh, luo_file->file), + luo_file->file, GFP_KERNEL)); + *filep = luo_file->file; luo_file->retrieve_status = 1; @@ -701,8 +726,11 @@ int luo_file_finish(struct luo_file_set *file_set) luo_file_finish_one(file_set, luo_file); - if (luo_file->file) + if (luo_file->file) { + xa_erase(&luo_preserved_files, + luo_get_id(luo_file->fh, luo_file->file)); fput(luo_file->file); + } list_del(&luo_file->list); file_set->count--; mutex_destroy(&luo_file->mutex); -- cgit v1.2.3 From 6b2b22f7c8cf1596490beaac96a989cbafdfea57 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 27 Mar 2026 03:33:28 +0000 Subject: liveupdate: protect FLB lists with luo_register_rwlock Because liveupdate FLB objects will soon drop their persistent module references when registered, list traversals must be protected against concurrent module unloading. To provide this protection, utilize the global luo_register_rwlock. It protects the global registry of FLBs and the handler's specific list of FLB dependencies. Read locks are used during concurrent list traversals (e.g., during preservation and serialization). Write locks are taken during registration and unregistration. 
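Schematically, using the names from luo_flb.c (this is not the literal code, which on some paths uses the guard(rwsem_read)() cleanup helper instead of explicit lock/unlock calls), the split looks like:

	/* traversal paths: file preservation, finish, serialization */
	down_read(&luo_register_rwlock);
	list_for_each_entry(iter, flb_list, list)
		handle_one(iter->flb);		/* placeholder */
	up_read(&luo_register_rwlock);

	/* registration / unregistration paths */
	down_write(&luo_register_rwlock);
	list_add_tail(&link->list, flb_list);
	up_write(&luo_register_rwlock);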
Link: https://lore.kernel.org/20260327033335.696621-5-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Pratyush Yadav (Google) Cc: David Matlack Cc: Mike Rapoport Cc: Samiullah Khawaja Signed-off-by: Andrew Morton --- include/linux/liveupdate.h | 1 + kernel/liveupdate/luo_flb.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h index 61325ad26526..9c761d9bacf8 100644 --- a/include/linux/liveupdate.h +++ b/include/linux/liveupdate.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include diff --git a/kernel/liveupdate/luo_flb.c b/kernel/liveupdate/luo_flb.c index cf4a8f854c83..fdb274410e8f 100644 --- a/kernel/liveupdate/luo_flb.c +++ b/kernel/liveupdate/luo_flb.c @@ -245,17 +245,20 @@ int luo_flb_file_preserve(struct liveupdate_file_handler *fh) struct luo_flb_link *iter; int err = 0; + down_read(&luo_register_rwlock); list_for_each_entry(iter, flb_list, list) { err = luo_flb_file_preserve_one(iter->flb); if (err) goto exit_err; } + up_read(&luo_register_rwlock); return 0; exit_err: list_for_each_entry_continue_reverse(iter, flb_list, list) luo_flb_file_unpreserve_one(iter->flb); + up_read(&luo_register_rwlock); return err; } @@ -277,6 +280,7 @@ void luo_flb_file_unpreserve(struct liveupdate_file_handler *fh) struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list); struct luo_flb_link *iter; + guard(rwsem_read)(&luo_register_rwlock); list_for_each_entry_reverse(iter, flb_list, list) luo_flb_file_unpreserve_one(iter->flb); } @@ -297,6 +301,7 @@ void luo_flb_file_finish(struct liveupdate_file_handler *fh) struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list); struct luo_flb_link *iter; + guard(rwsem_read)(&luo_register_rwlock); list_for_each_entry_reverse(iter, flb_list, list) luo_flb_file_finish_one(iter->flb); } @@ -360,6 +365,8 @@ int liveupdate_register_flb(struct liveupdate_file_handler *fh, if (!luo_session_quiesce()) return -EBUSY; + down_write(&luo_register_rwlock); + /* Check that this FLB is not already linked to this file handler */ err = -EEXIST; list_for_each_entry(iter, flb_list, list) { @@ -401,11 +408,13 @@ int liveupdate_register_flb(struct liveupdate_file_handler *fh, private->users++; link->flb = flb; list_add_tail(&no_free_ptr(link)->list, flb_list); + up_write(&luo_register_rwlock); luo_session_resume(); return 0; err_resume: + up_write(&luo_register_rwlock); luo_session_resume(); return err; } @@ -449,6 +458,8 @@ int liveupdate_unregister_flb(struct liveupdate_file_handler *fh, if (!luo_session_quiesce()) return -EBUSY; + down_write(&luo_register_rwlock); + /* Find and remove the link from the file handler's list */ list_for_each_entry(iter, flb_list, list) { if (iter->flb == flb) { @@ -473,11 +484,13 @@ int liveupdate_unregister_flb(struct liveupdate_file_handler *fh, module_put(flb->ops->owner); } + up_write(&luo_register_rwlock); luo_session_resume(); return 0; err_resume: + up_write(&luo_register_rwlock); luo_session_resume(); return err; } @@ -643,6 +656,7 @@ void luo_flb_serialize(void) struct liveupdate_flb *gflb; int i = 0; + guard(rwsem_read)(&luo_register_rwlock); list_private_for_each_entry(gflb, &luo_flb_global.list, private.list) { struct luo_flb_private *private = luo_flb_get_private(gflb); -- cgit v1.2.3 From 2ab7207e7ec6cd5af1912d9be5174f114633286b Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 27 Mar 2026 03:33:33 +0000 Subject: liveupdate: make unregister functions return void Change 
liveupdate_unregister_file_handler and liveupdate_unregister_flb to return void instead of an error code. This follows the design principle that unregistration during module unload should not fail, as the unload cannot be stopped at that point. Link: https://lore.kernel.org/20260327033335.696621-10-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Pratyush Yadav (Google) Cc: David Matlack Cc: Mike Rapoport Cc: Samiullah Khawaja Signed-off-by: Andrew Morton --- include/linux/liveupdate.h | 14 ++++++-------- kernel/liveupdate/luo_file.c | 14 ++------------ kernel/liveupdate/luo_flb.c | 11 +++-------- 3 files changed, 11 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h index 9c761d9bacf8..30c5a39ff9e9 100644 --- a/include/linux/liveupdate.h +++ b/include/linux/liveupdate.h @@ -231,12 +231,12 @@ bool liveupdate_enabled(void); int liveupdate_reboot(void); int liveupdate_register_file_handler(struct liveupdate_file_handler *fh); -int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh); +void liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh); int liveupdate_register_flb(struct liveupdate_file_handler *fh, struct liveupdate_flb *flb); -int liveupdate_unregister_flb(struct liveupdate_file_handler *fh, - struct liveupdate_flb *flb); +void liveupdate_unregister_flb(struct liveupdate_file_handler *fh, + struct liveupdate_flb *flb); int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, void **objp); int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb, void **objp); @@ -258,9 +258,8 @@ static inline int liveupdate_register_file_handler(struct liveupdate_file_handle return -EOPNOTSUPP; } -static inline int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh) +static inline void liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh) { - return -EOPNOTSUPP; } static inline int liveupdate_register_flb(struct liveupdate_file_handler *fh, @@ -269,10 +268,9 @@ static inline int liveupdate_register_flb(struct liveupdate_file_handler *fh, return -EOPNOTSUPP; } -static inline int liveupdate_unregister_flb(struct liveupdate_file_handler *fh, - struct liveupdate_flb *flb) +static inline void liveupdate_unregister_flb(struct liveupdate_file_handler *fh, + struct liveupdate_flb *flb) { - return -EOPNOTSUPP; } static inline int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c index 4060b6064248..0730865711c1 100644 --- a/kernel/liveupdate/luo_file.c +++ b/kernel/liveupdate/luo_file.c @@ -912,25 +912,15 @@ err_unlock: * * Unregisters the file handler from the liveupdate core. This function * reverses the operations of liveupdate_register_file_handler(). - * - * It ensures safe removal by checking that: - * No FLB registered with this file handler. - * - * If the unregistration fails, the internal test state is reverted. - * - * Return: 0 Success. -EOPNOTSUPP when live update is not enabled. -EBUSY A live - * update is in progress, FLB is registred with this file handler. 
*/ -int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh) +void liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh) { if (!liveupdate_enabled()) - return -EOPNOTSUPP; + return; guard(rwsem_write)(&luo_register_rwlock); luo_flb_unregister_all(fh); list_del(&ACCESS_PRIVATE(fh, list)); module_put(fh->ops->owner); - - return 0; } diff --git a/kernel/liveupdate/luo_flb.c b/kernel/liveupdate/luo_flb.c index e069d694163e..00f5494812c4 100644 --- a/kernel/liveupdate/luo_flb.c +++ b/kernel/liveupdate/luo_flb.c @@ -475,21 +475,16 @@ int liveupdate_register_flb(struct liveupdate_file_handler *fh, * owner module (acquired during registration) is released. * * Context: It is typically called from a subsystem's module exit function. - * Return: 0 on success. - * -EOPNOTSUPP if live update is disabled. - * -ENOENT if the FLB was not found in the file handler's list. */ -int liveupdate_unregister_flb(struct liveupdate_file_handler *fh, - struct liveupdate_flb *flb) +void liveupdate_unregister_flb(struct liveupdate_file_handler *fh, + struct liveupdate_flb *flb) { if (!liveupdate_enabled()) - return -EOPNOTSUPP; + return; guard(rwsem_write)(&luo_register_rwlock); luo_flb_unregister_one(fh, flb); - - return 0; } /** -- cgit v1.2.3 From 6b1842775a460245e97d36d3a67d0cfba7c4ff79 Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Tue, 31 Mar 2026 16:13:12 +0800 Subject: mm/alloc_tag: clear codetag for pages allocated before page_ext initialization Due to initialization ordering, page_ext is allocated and initialized relatively late during boot. Some pages have already been allocated and freed before page_ext becomes available, leaving their codetag uninitialized. A clear example is in init_section_page_ext(): alloc_page_ext() calls kmemleak_alloc(). If the slab cache has no free objects, it falls back to the buddy allocator to allocate memory. However, at this point page_ext is not yet fully initialized, so these newly allocated pages have no codetag set. These pages may later be reclaimed by KASAN, which causes the warning to trigger when they are freed because their codetag ref is still empty. Use a global array to track pages allocated before page_ext is fully initialized. The array size is fixed at 8192 entries, and will emit a warning if this limit is exceeded. When page_ext initialization completes, set their codetag to empty to avoid warnings when they are freed later. 
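(For scale, under the layout this patch uses the tracking array is 8192 entries of unsigned long, i.e. 8192 * 8 = 64 KiB on a 64-bit build; it is marked __initdata, so the memory is returned once boot completes.)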
This warning is only observed with CONFIG_MEM_ALLOC_PROFILING_DEBUG=Y and mem_profiling_compressed disabled: [ 9.582133] ------------[ cut here ]------------ [ 9.582137] alloc_tag was not set [ 9.582139] WARNING: ./include/linux/alloc_tag.h:164 at __pgalloc_tag_sub+0x40f/0x550, CPU#5: systemd/1 [ 9.582190] CPU: 5 UID: 0 PID: 1 Comm: systemd Not tainted 7.0.0-rc4 #1 PREEMPT(lazy) [ 9.582192] Hardware name: Red Hat KVM, BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 [ 9.582194] RIP: 0010:__pgalloc_tag_sub+0x40f/0x550 [ 9.582196] Code: 00 00 4c 29 e5 48 8b 05 1f 88 56 05 48 8d 4c ad 00 48 8d 2c c8 e9 87 fd ff ff 0f 0b 0f 0b e9 f3 fe ff ff 48 8d 3d 61 2f ed 03 <67> 48 0f b9 3a e9 b3 fd ff ff 0f 0b eb e4 e8 5e cd 14 02 4c 89 c7 [ 9.582197] RSP: 0018:ffffc9000001f940 EFLAGS: 00010246 [ 9.582200] RAX: dffffc0000000000 RBX: 1ffff92000003f2b RCX: 1ffff110200d806c [ 9.582201] RDX: ffff8881006c0360 RSI: 0000000000000004 RDI: ffffffff9bc7b460 [ 9.582202] RBP: 0000000000000000 R08: 0000000000000000 R09: fffffbfff3a62324 [ 9.582203] R10: ffffffff9d311923 R11: 0000000000000000 R12: ffffea0004001b00 [ 9.582204] R13: 0000000000002000 R14: ffffea0000000000 R15: ffff8881006c0360 [ 9.582206] FS: 00007ffbbcf2d940(0000) GS:ffff888450479000(0000) knlGS:0000000000000000 [ 9.582208] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 9.582210] CR2: 000055ee3aa260d0 CR3: 0000000148b67005 CR4: 0000000000770ef0 [ 9.582211] PKRU: 55555554 [ 9.582212] Call Trace: [ 9.582213] [ 9.582214] ? __pfx___pgalloc_tag_sub+0x10/0x10 [ 9.582216] ? check_bytes_and_report+0x68/0x140 [ 9.582219] __free_frozen_pages+0x2e4/0x1150 [ 9.582221] ? __free_slab+0xc2/0x2b0 [ 9.582224] qlist_free_all+0x4c/0xf0 [ 9.582227] kasan_quarantine_reduce+0x15d/0x180 [ 9.582229] __kasan_slab_alloc+0x69/0x90 [ 9.582232] kmem_cache_alloc_noprof+0x14a/0x500 [ 9.582234] do_getname+0x96/0x310 [ 9.582237] do_readlinkat+0x91/0x2f0 [ 9.582239] ? __pfx_do_readlinkat+0x10/0x10 [ 9.582240] ? get_random_bytes_user+0x1df/0x2c0 [ 9.582244] __x64_sys_readlinkat+0x96/0x100 [ 9.582246] do_syscall_64+0xce/0x650 [ 9.582250] ? __x64_sys_getrandom+0x13a/0x1e0 [ 9.582252] ? __pfx___x64_sys_getrandom+0x10/0x10 [ 9.582254] ? do_syscall_64+0x114/0x650 [ 9.582255] ? ksys_read+0xfc/0x1d0 [ 9.582258] ? __pfx_ksys_read+0x10/0x10 [ 9.582260] ? do_syscall_64+0x114/0x650 [ 9.582262] ? do_syscall_64+0x114/0x650 [ 9.582264] ? __pfx_fput_close_sync+0x10/0x10 [ 9.582266] ? file_close_fd_locked+0x178/0x2a0 [ 9.582268] ? __x64_sys_faccessat2+0x96/0x100 [ 9.582269] ? __x64_sys_close+0x7d/0xd0 [ 9.582271] ? do_syscall_64+0x114/0x650 [ 9.582273] ? do_syscall_64+0x114/0x650 [ 9.582275] ? clear_bhb_loop+0x50/0xa0 [ 9.582277] ? 
clear_bhb_loop+0x50/0xa0 [ 9.582279] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 9.582280] RIP: 0033:0x7ffbbda345ee [ 9.582282] Code: 0f 1f 40 00 48 8b 15 29 38 0d 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff c3 0f 1f 40 00 f3 0f 1e fa 49 89 ca b8 0b 01 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d fa 37 0d 00 f7 d8 64 89 01 48 [ 9.582284] RSP: 002b:00007ffe2ad8de58 EFLAGS: 00000202 ORIG_RAX: 000000000000010b [ 9.582286] RAX: ffffffffffffffda RBX: 000055ee3aa25570 RCX: 00007ffbbda345ee [ 9.582287] RDX: 000055ee3aa25570 RSI: 00007ffe2ad8dee0 RDI: 00000000ffffff9c [ 9.582288] RBP: 0000000000001000 R08: 0000000000000003 R09: 0000000000001001 [ 9.582289] R10: 0000000000001000 R11: 0000000000000202 R12: 0000000000000033 [ 9.582290] R13: 00007ffe2ad8dee0 R14: 00000000ffffff9c R15: 00007ffe2ad8deb0 [ 9.582292] [ 9.582293] ---[ end trace 0000000000000000 ]--- Link: https://lore.kernel.org/20260331081312.123719-1-hao.ge@linux.dev Fixes: dcfe378c81f72 ("lib: introduce support for page allocation tagging") Signed-off-by: Hao Ge Suggested-by: Suren Baghdasaryan Acked-by: Suren Baghdasaryan Cc: Kent Overstreet Cc: Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 2 + include/linux/pgalloc_tag.h | 2 +- lib/alloc_tag.c | 109 ++++++++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 10 +++- 4 files changed, 121 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index d40ac39bfbe8..02de2ede560f 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -163,9 +163,11 @@ static inline void alloc_tag_sub_check(union codetag_ref *ref) { WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n"); } +void alloc_tag_add_early_pfn(unsigned long pfn); #else static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) {} static inline void alloc_tag_sub_check(union codetag_ref *ref) {} +static inline void alloc_tag_add_early_pfn(unsigned long pfn) {} #endif /* Caller should verify both ref and tag to be valid */ diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 38a82d65e58e..951d33362268 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -181,7 +181,7 @@ static inline struct alloc_tag *__pgalloc_tag_get(struct page *page) if (get_page_tag_ref(page, &ref, &handle)) { alloc_tag_sub_check(&ref); - if (ref.ct) + if (ref.ct && !is_codetag_empty(&ref)) tag = ct_to_alloc_tag(ref.ct); put_page_tag_ref(handle); } diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 58991ab09d84..ed1bdcf1f8ab 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -6,7 +6,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -758,8 +760,115 @@ static __init bool need_page_alloc_tagging(void) return mem_profiling_support; } +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG +/* + * Track page allocations before page_ext is initialized. + * Some pages are allocated before page_ext becomes available, leaving + * their codetag uninitialized. Track these early PFNs so we can clear + * their codetag refs later to avoid warnings when they are freed. + * + * Early allocations include: + * - Base allocations independent of CPU count + * - Per-CPU allocations (e.g., CPU hotplug callbacks during smp_init, + * such as trace ring buffers, scheduler per-cpu data) + * + * For simplicity, we fix the size to 8192. + * If insufficient, a warning will be triggered to alert the user. 
+ * + * TODO: Replace fixed-size array with dynamic allocation using + * a GFP flag similar to ___GFP_NO_OBJ_EXT to avoid recursion. + */ +#define EARLY_ALLOC_PFN_MAX 8192 + +static unsigned long early_pfns[EARLY_ALLOC_PFN_MAX] __initdata; +static atomic_t early_pfn_count __initdata = ATOMIC_INIT(0); + +static void __init __alloc_tag_add_early_pfn(unsigned long pfn) +{ + int old_idx, new_idx; + + do { + old_idx = atomic_read(&early_pfn_count); + if (old_idx >= EARLY_ALLOC_PFN_MAX) { + pr_warn_once("Early page allocations before page_ext init exceeded EARLY_ALLOC_PFN_MAX (%d)\n", + EARLY_ALLOC_PFN_MAX); + return; + } + new_idx = old_idx + 1; + } while (!atomic_try_cmpxchg(&early_pfn_count, &old_idx, new_idx)); + + early_pfns[old_idx] = pfn; +} + +typedef void alloc_tag_add_func(unsigned long pfn); +static alloc_tag_add_func __rcu *alloc_tag_add_early_pfn_ptr __refdata = + RCU_INITIALIZER(__alloc_tag_add_early_pfn); + +void alloc_tag_add_early_pfn(unsigned long pfn) +{ + alloc_tag_add_func *alloc_tag_add; + + if (static_key_enabled(&mem_profiling_compressed)) + return; + + rcu_read_lock(); + alloc_tag_add = rcu_dereference(alloc_tag_add_early_pfn_ptr); + if (alloc_tag_add) + alloc_tag_add(pfn); + rcu_read_unlock(); +} + +static void __init clear_early_alloc_pfn_tag_refs(void) +{ + unsigned int i; + + if (static_key_enabled(&mem_profiling_compressed)) + return; + + rcu_assign_pointer(alloc_tag_add_early_pfn_ptr, NULL); + /* Make sure we are not racing with __alloc_tag_add_early_pfn() */ + synchronize_rcu(); + + for (i = 0; i < atomic_read(&early_pfn_count); i++) { + unsigned long pfn = early_pfns[i]; + + if (pfn_valid(pfn)) { + struct page *page = pfn_to_page(pfn); + union pgtag_ref_handle handle; + union codetag_ref ref; + + if (get_page_tag_ref(page, &ref, &handle)) { + /* + * An early-allocated page could be freed and reallocated + * after its page_ext is initialized but before we clear it. + * In that case, it already has a valid tag set. + * We should not overwrite that valid tag with CODETAG_EMPTY. + * + * Note: there is still a small race window between checking + * ref.ct and calling set_codetag_empty(). We accept this + * race as it's unlikely and the extra complexity of atomic + * cmpxchg is not worth it for this debug-only code path. + */ + if (ref.ct) { + put_page_tag_ref(handle); + continue; + } + + set_codetag_empty(&ref); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); + } + } + + } +} +#else /* !CONFIG_MEM_ALLOC_PROFILING_DEBUG */ +static inline void __init clear_early_alloc_pfn_tag_refs(void) {} +#endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ + static __init void init_page_alloc_tagging(void) { + clear_early_alloc_pfn_tag_refs(); } struct page_ext_operations page_alloc_tagging_ops = { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 111b54df8a3c..b1c5430cad4e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1252,10 +1252,18 @@ void __pgalloc_tag_add(struct page *page, struct task_struct *task, union pgtag_ref_handle handle; union codetag_ref ref; - if (get_page_tag_ref(page, &ref, &handle)) { + if (likely(get_page_tag_ref(page, &ref, &handle))) { alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr); update_page_tag_ref(handle, &ref); put_page_tag_ref(handle); + } else { + /* + * page_ext is not available yet, record the pfn so we can + * clear the tag ref later when page_ext is initialized. 
+ */ + alloc_tag_add_early_pfn(page_to_pfn(page)); + if (task->alloc_tag) + alloc_tag_set_inaccurate(task->alloc_tag); } } -- cgit v1.2.3 From 55da81663b9642dd046b26dd6f1baddbcf337c1e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 27 Mar 2026 16:33:14 -0700 Subject: mm/damon/core: fix damon_call() vs kdamond_fn() exit race Patch series "mm/damon/core: fix damon_call()/damos_walk() vs kdmond exit race". damon_call() and damos_walk() can leak memory and/or deadlock when they race with kdamond terminations. Fix those. This patch (of 2); When kdamond_fn() main loop is finished, the function cancels all remaining damon_call() requests and unset the damon_ctx->kdamond so that API callers and API functions themselves can know the context is terminated. damon_call() adds the caller's request to the queue first. After that, it shows if the kdamond of the damon_ctx is still running (damon_ctx->kdamond is set). Only if the kdamond is running, damon_call() starts waiting for the kdamond's handling of the newly added request. The damon_call() requests registration and damon_ctx->kdamond unset are protected by different mutexes, though. Hence, damon_call() could race with damon_ctx->kdamond unset, and result in deadlocks. For example, let's suppose kdamond successfully finished the damon_call() requests cancelling. Right after that, damon_call() is called for the context. It registers the new request, and shows the context is still running, because damon_ctx->kdamond unset is not yet done. Hence the damon_call() caller starts waiting for the handling of the request. However, the kdamond is already on the termination steps, so it never handles the new request. As a result, the damon_call() caller threads infinitely waits. Fix this by introducing another damon_ctx field, namely call_controls_obsolete. It is protected by the damon_ctx->call_controls_lock, which protects damon_call() requests registration. Initialize (unset) it in kdamond_fn() before letting damon_start() returns and set it just before the cancelling of remaining damon_call() requests is executed. damon_call() reads the obsolete field under the lock and avoids adding a new request. After this change, only requests that are guaranteed to be handled or cancelled are registered. Hence the after-registration DAMON context termination check is no longer needed. Remove it together. Note that the deadlock will not happen when damon_call() is called for repeat mode request. In tis case, damon_call() returns instead of waiting for the handling when the request registration succeeds and it shows the kdamond is running. However, if the request also has dealloc_on_cancel, the request memory would be leaked. The issue is found by sashiko [1]. 
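From a caller's perspective the visible change is the new -ECANCELED return from damon_call(). A minimal sketch of the expected handling follows; the wrapper function is hypothetical, the damon_call_control setup (callback, ->repeat, ...) is unchanged by this patch and elided, and only fields this series already references (->repeat, ->return_code) are used:

	static int my_damon_call(struct damon_ctx *ctx,
				 struct damon_call_control *control)
	{
		int err;

		/* only valid once damon_start() for @ctx has succeeded */
		err = damon_call(ctx, control);
		if (err == -ECANCELED)
			/* kdamond is terminating; request was never queued */
			return err;
		if (err || control->repeat)
			return err;

		/* completed synchronously; the callback's result is saved here */
		return control->return_code;
	}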
Link: https://lore.kernel.org/20260327233319.3528-1-sj@kernel.org Link: https://lore.kernel.org/20260327233319.3528-2-sj@kernel.org Link: https://lore.kernel.org/20260325141956.87144-1-sj@kernel.org [1] Fixes: 42b7491af14c ("mm/damon/core: introduce damon_call()") Signed-off-by: SeongJae Park Cc: # 6.14.x Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 + mm/damon/core.c | 45 ++++++++++++++------------------------------- 2 files changed, 15 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index d9a3babbafc1..5129de70e7b7 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -818,6 +818,7 @@ struct damon_ctx { /* lists of &struct damon_call_control */ struct list_head call_controls; + bool call_controls_obsolete; struct mutex call_controls_lock; struct damos_walk_control *walk_control; diff --git a/mm/damon/core.c b/mm/damon/core.c index db6c67e52d2b..9bcda2765ac9 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1573,35 +1573,6 @@ int damon_kdamond_pid(struct damon_ctx *ctx) return pid; } -/* - * damon_call_handle_inactive_ctx() - handle DAMON call request that added to - * an inactive context. - * @ctx: The inactive DAMON context. - * @control: Control variable of the call request. - * - * This function is called in a case that @control is added to @ctx but @ctx is - * not running (inactive). See if @ctx handled @control or not, and cleanup - * @control if it was not handled. - * - * Returns 0 if @control was handled by @ctx, negative error code otherwise. - */ -static int damon_call_handle_inactive_ctx( - struct damon_ctx *ctx, struct damon_call_control *control) -{ - struct damon_call_control *c; - - mutex_lock(&ctx->call_controls_lock); - list_for_each_entry(c, &ctx->call_controls, list) { - if (c == control) { - list_del(&control->list); - mutex_unlock(&ctx->call_controls_lock); - return -EINVAL; - } - } - mutex_unlock(&ctx->call_controls_lock); - return 0; -} - /** * damon_call() - Invoke a given function on DAMON worker thread (kdamond). * @ctx: DAMON context to call the function for. @@ -1619,6 +1590,10 @@ static int damon_call_handle_inactive_ctx( * synchronization. The return value of the function will be saved in * &damon_call_control->return_code. * + * Note that this function should be called only after damon_start() with the + * @ctx has succeeded. Otherwise, this function could fall into an indefinite + * wait. + * * Return: 0 on success, negative error code otherwise. 
*/ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control) @@ -1629,10 +1604,12 @@ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control) INIT_LIST_HEAD(&control->list); mutex_lock(&ctx->call_controls_lock); + if (ctx->call_controls_obsolete) { + mutex_unlock(&ctx->call_controls_lock); + return -ECANCELED; + } list_add_tail(&control->list, &ctx->call_controls); mutex_unlock(&ctx->call_controls_lock); - if (!damon_is_running(ctx)) - return damon_call_handle_inactive_ctx(ctx, control); if (control->repeat) return 0; wait_for_completion(&control->completion); @@ -2952,6 +2929,9 @@ static int kdamond_fn(void *data) pr_debug("kdamond (%d) starts\n", current->pid); + mutex_lock(&ctx->call_controls_lock); + ctx->call_controls_obsolete = false; + mutex_unlock(&ctx->call_controls_lock); complete(&ctx->kdamond_started); kdamond_init_ctx(ctx); @@ -3062,6 +3042,9 @@ done: damon_destroy_targets(ctx); kfree(ctx->regions_score_histogram); + mutex_lock(&ctx->call_controls_lock); + ctx->call_controls_obsolete = true; + mutex_unlock(&ctx->call_controls_lock); kdamond_call(ctx, true); damos_walk_cancel(ctx); -- cgit v1.2.3 From 33c3f6c2b48cd84b441dba1ee3e62290e53930f4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 27 Mar 2026 16:33:15 -0700 Subject: mm/damon/core: fix damos_walk() vs kdamond_fn() exit race When kdamond_fn() main loop is finished, the function cancels remaining damos_walk() request and unset the damon_ctx->kdamond so that API callers and API functions themselves can show the context is terminated. damos_walk() adds the caller's request to the queue first. After that, it shows if the kdamond of the damon_ctx is still running (damon_ctx->kdamond is set). Only if the kdamond is running, damos_walk() starts waiting for the kdamond's handling of the newly added request. The damos_walk() requests registration and damon_ctx->kdamond unset are protected by different mutexes, though. Hence, damos_walk() could race with damon_ctx->kdamond unset, and result in deadlocks. For example, let's suppose kdamond successfully finished the damow_walk() request cancelling. Right after that, damos_walk() is called for the context. It registers the new request, and shows the context is still running, because damon_ctx->kdamond unset is not yet done. Hence the damos_walk() caller starts waiting for the handling of the request. However, the kdamond is already on the termination steps, so it never handles the new request. As a result, the damos_walk() caller thread infinitely waits. Fix this by introducing another damon_ctx field, namely walk_control_obsolete. It is protected by the damon_ctx->walk_control_lock, which protects damos_walk() request registration. Initialize (unset) it in kdamond_fn() before letting damon_start() returns and set it just before the cancelling of the remaining damos_walk() request is executed. damos_walk() reads the obsolete field under the lock and avoids adding a new request. After this change, only requests that are guaranteed to be handled or cancelled are registered. Hence the after-registration DAMON context termination check is no longer needed. Remove it together. The issue is found by sashiko [1]. 
Link: https://lore.kernel.org/20260327233319.3528-3-sj@kernel.org Link: https://lore.kernel.org/20260325141956.87144-1-sj@kernel.org [1] Fixes: bf0eaba0ff9c ("mm/damon/core: implement damos_walk()") Signed-off-by: SeongJae Park Cc: # 6.14.x Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 + mm/damon/core.c | 21 ++++++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 5129de70e7b7..f2cdb7c3f5e6 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -822,6 +822,7 @@ struct damon_ctx { struct mutex call_controls_lock; struct damos_walk_control *walk_control; + bool walk_control_obsolete; struct mutex walk_control_lock; /* diff --git a/mm/damon/core.c b/mm/damon/core.c index 9bcda2765ac9..ddabb93f2377 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1637,6 +1637,10 @@ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control) * passed at least one &damos->apply_interval_us, kdamond marks the request as * completed so that damos_walk() can wakeup and return. * + * Note that this function should be called only after damon_start() with the + * @ctx has succeeded. Otherwise, this function could fall into an indefinite + * wait. + * * Return: 0 on success, negative error code otherwise. */ int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control) @@ -1644,19 +1648,16 @@ int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control) init_completion(&control->completion); control->canceled = false; mutex_lock(&ctx->walk_control_lock); + if (ctx->walk_control_obsolete) { + mutex_unlock(&ctx->walk_control_lock); + return -ECANCELED; + } if (ctx->walk_control) { mutex_unlock(&ctx->walk_control_lock); return -EBUSY; } ctx->walk_control = control; mutex_unlock(&ctx->walk_control_lock); - if (!damon_is_running(ctx)) { - mutex_lock(&ctx->walk_control_lock); - if (ctx->walk_control == control) - ctx->walk_control = NULL; - mutex_unlock(&ctx->walk_control_lock); - return -EINVAL; - } wait_for_completion(&control->completion); if (control->canceled) return -ECANCELED; @@ -2932,6 +2933,9 @@ static int kdamond_fn(void *data) mutex_lock(&ctx->call_controls_lock); ctx->call_controls_obsolete = false; mutex_unlock(&ctx->call_controls_lock); + mutex_lock(&ctx->walk_control_lock); + ctx->walk_control_obsolete = false; + mutex_unlock(&ctx->walk_control_lock); complete(&ctx->kdamond_started); kdamond_init_ctx(ctx); @@ -3046,6 +3050,9 @@ done: ctx->call_controls_obsolete = true; mutex_unlock(&ctx->call_controls_lock); kdamond_call(ctx, true); + mutex_lock(&ctx->walk_control_lock); + ctx->walk_control_obsolete = true; + mutex_unlock(&ctx->walk_control_lock); damos_walk_cancel(ctx); pr_debug("kdamond (%d) finishes\n", current->pid); -- cgit v1.2.3 From a5bb8669872b6b8463b8777a7a259a8305060016 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 2 Apr 2026 07:11:47 +0300 Subject: userfaultfd: move vma_can_userfault out of line vma_can_userfault() has grown pretty big and it's not called on performance critical path. Move it out of line. No functional changes. Link: https://lore.kernel.org/20260402041156.1377214-7-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: David Hildenbrand (Red Hat) Reviewed-by: Liam R. 
Howlett Cc: Andrea Arcangeli Cc: Andrei Vagin Cc: Axel Rasmussen Cc: Baolin Wang Cc: Harry Yoo Cc: Harry Yoo (Oracle) Cc: Hugh Dickins Cc: James Houghton Cc: Lorenzo Stoakes (Oracle) Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Nikita Kalyazin Cc: Oscar Salvador Cc: Paolo Bonzini Cc: Peter Xu Cc: Sean Christopherson Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: David Carlier Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 35 ++--------------------------------- mm/userfaultfd.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index d83e349900a3..ce0201c3dd82 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -211,39 +211,8 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) return vma->vm_flags & __VM_UFFD_FLAGS; } -static inline bool vma_can_userfault(struct vm_area_struct *vma, - vm_flags_t vm_flags, - bool wp_async) -{ - vm_flags &= __VM_UFFD_FLAGS; - - if (vma->vm_flags & VM_DROPPABLE) - return false; - - if ((vm_flags & VM_UFFD_MINOR) && - (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) - return false; - - /* - * If wp async enabled, and WP is the only mode enabled, allow any - * memory type. - */ - if (wp_async && (vm_flags == VM_UFFD_WP)) - return true; - - /* - * If user requested uffd-wp but not enabled pte markers for - * uffd-wp, then shmem & hugetlbfs are not supported but only - * anonymous. - */ - if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) && - !vma_is_anonymous(vma)) - return false; - - /* By default, allow any of anon|shmem|hugetlb */ - return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || - vma_is_shmem(vma); -} +bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags, + bool wp_async); static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma) { diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 4857be5a7fa2..ebdc6e24a2c7 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -2018,6 +2018,39 @@ out: return moved ? moved : err; } +bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags, + bool wp_async) +{ + vm_flags &= __VM_UFFD_FLAGS; + + if (vma->vm_flags & VM_DROPPABLE) + return false; + + if ((vm_flags & VM_UFFD_MINOR) && + (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) + return false; + + /* + * If wp async enabled, and WP is the only mode enabled, allow any + * memory type. + */ + if (wp_async && (vm_flags == VM_UFFD_WP)) + return true; + + /* + * If user requested uffd-wp but not enabled pte markers for + * uffd-wp, then shmem & hugetlbfs are not supported but only + * anonymous. + */ + if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) && + !vma_is_anonymous(vma)) + return false; + + /* By default, allow any of anon|shmem|hugetlb */ + return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || + vma_is_shmem(vma); +} + static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, vm_flags_t vm_flags) { -- cgit v1.2.3 From 0f48947c4232c934885711dde0b49066f9d8ee87 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 2 Apr 2026 07:11:48 +0300 Subject: userfaultfd: introduce vm_uffd_ops Current userfaultfd implementation works only with memory managed by core MM: anonymous, shmem and hugetlb. 
First, there is no fundamental reason to limit userfaultfd support only to the core memory types, and userfaults can be handled similarly to regular page faults provided a VMA owner implements appropriate callbacks. Second, historically various code paths were conditioned on vma_is_anonymous(), vma_is_shmem() and is_vm_hugetlb_page(), and some of these conditions can be expressed as operations implemented by a particular memory type. Introduce a vm_uffd_ops extension to vm_operations_struct that will delegate memory-type-specific operations to a VMA owner. Operations for anonymous memory are handled internally in userfaultfd using anon_uffd_ops, which is implicitly assigned to anonymous VMAs. Start with a single operation, ->can_userfault(), which verifies that a VMA meets the requirements for userfaultfd support at registration time. Implement that method for anonymous, shmem and hugetlb and move the relevant parts of vma_can_userfault() into the new callbacks. [rppt@kernel.org: relocate VM_DROPPABLE test, per Tal] Link: https://lore.kernel.org/adffgfM5ANxtPIEF@kernel.org Link: https://lore.kernel.org/20260402041156.1377214-8-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Andrea Arcangeli Cc: Andrei Vagin Cc: Axel Rasmussen Cc: Baolin Wang Cc: David Hildenbrand (Arm) Cc: Harry Yoo Cc: Harry Yoo (Oracle) Cc: Hugh Dickins Cc: James Houghton Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Nikita Kalyazin Cc: Oscar Salvador Cc: Paolo Bonzini Cc: Peter Xu Cc: Sean Christopherson Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: David Carlier Cc: Tal Zussman Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +++++ include/linux/userfaultfd_k.h | 6 ++++++ mm/hugetlb.c | 15 +++++++++++++++ mm/shmem.c | 15 +++++++++++++++ mm/userfaultfd.c | 38 ++++++++++++++++++++++++++++---------- 5 files changed, 69 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8260e28205e9..633bbf9a184a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -758,6 +758,8 @@ struct vm_fault { */ }; +struct vm_uffd_ops; + /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer @@ -865,6 +867,9 @@ struct vm_operations_struct { struct page *(*find_normal_page)(struct vm_area_struct *vma, unsigned long addr); #endif /* CONFIG_FIND_NORMAL_PAGE */ +#ifdef CONFIG_USERFAULTFD + const struct vm_uffd_ops *uffd_ops; +#endif }; #ifdef CONFIG_NUMA_BALANCING diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index ce0201c3dd82..6d445dbfe8ff 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -83,6 +83,12 @@ struct userfaultfd_ctx { extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); +/* VMA userfaultfd operations */ +struct vm_uffd_ops { + /* Checks if a VMA can support userfaultfd */ + bool (*can_userfault)(struct vm_area_struct *vma, vm_flags_t vm_flags); +}; + /* A combined operation mode + behavior flags.
*/ typedef unsigned int __bitwise uffd_flags_t; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a786034ac95c..88009cd2a846 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4792,6 +4792,18 @@ static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) return 0; } +#ifdef CONFIG_USERFAULTFD +static bool hugetlb_can_userfault(struct vm_area_struct *vma, + vm_flags_t vm_flags) +{ + return true; +} + +static const struct vm_uffd_ops hugetlb_uffd_ops = { + .can_userfault = hugetlb_can_userfault, +}; +#endif + /* * When a new function is introduced to vm_operations_struct and added * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops. @@ -4805,6 +4817,9 @@ const struct vm_operations_struct hugetlb_vm_ops = { .close = hugetlb_vm_op_close, .may_split = hugetlb_vm_op_split, .pagesize = hugetlb_vm_op_pagesize, +#ifdef CONFIG_USERFAULTFD + .uffd_ops = &hugetlb_uffd_ops, +#endif }; static pte_t make_huge_pte(struct vm_area_struct *vma, struct folio *folio, diff --git a/mm/shmem.c b/mm/shmem.c index 6fa1e8340c93..389b2d76396e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3288,6 +3288,15 @@ out_unacct_blocks: shmem_inode_unacct_blocks(inode, 1); return ret; } + +static bool shmem_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags) +{ + return true; +} + +static const struct vm_uffd_ops shmem_uffd_ops = { + .can_userfault = shmem_can_userfault, +}; #endif /* CONFIG_USERFAULTFD */ #ifdef CONFIG_TMPFS @@ -5307,6 +5316,9 @@ static const struct vm_operations_struct shmem_vm_ops = { .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif +#ifdef CONFIG_USERFAULTFD + .uffd_ops = &shmem_uffd_ops, +#endif }; static const struct vm_operations_struct shmem_anon_vm_ops = { @@ -5316,6 +5328,9 @@ static const struct vm_operations_struct shmem_anon_vm_ops = { .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif +#ifdef CONFIG_USERFAULTFD + .uffd_ops = &shmem_uffd_ops, +#endif }; int shmem_init_fs_context(struct fs_context *fc) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index ebdc6e24a2c7..3a824e034a09 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -34,6 +34,25 @@ struct mfill_state { pmd_t *pmd; }; +static bool anon_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags) +{ + /* anonymous memory does not support MINOR mode */ + if (vm_flags & VM_UFFD_MINOR) + return false; + return true; +} + +static const struct vm_uffd_ops anon_uffd_ops = { + .can_userfault = anon_can_userfault, +}; + +static const struct vm_uffd_ops *vma_uffd_ops(struct vm_area_struct *vma) +{ + if (vma_is_anonymous(vma)) + return &anon_uffd_ops; + return vma->vm_ops ? vma->vm_ops->uffd_ops : NULL; +} + static __always_inline bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end) { @@ -2021,34 +2040,33 @@ out: bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags, bool wp_async) { - vm_flags &= __VM_UFFD_FLAGS; + const struct vm_uffd_ops *ops = vma_uffd_ops(vma); if (vma->vm_flags & VM_DROPPABLE) return false; - if ((vm_flags & VM_UFFD_MINOR) && - (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) - return false; + vm_flags &= __VM_UFFD_FLAGS; /* - * If wp async enabled, and WP is the only mode enabled, allow any + * If WP is the only mode enabled and context is wp async, allow any * memory type. 
*/ if (wp_async && (vm_flags == VM_UFFD_WP)) return true; + /* For any other mode reject VMAs that don't implement vm_uffd_ops */ + if (!ops) + return false; + /* * If user requested uffd-wp but not enabled pte markers for - * uffd-wp, then shmem & hugetlbfs are not supported but only - * anonymous. + * uffd-wp, then only anonymous memory is supported */ if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma)) return false; - /* By default, allow any of anon|shmem|hugetlb */ - return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || - vma_is_shmem(vma); + return ops->can_userfault(vma, vm_flags); } static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, -- cgit v1.2.3 From dfc4d771820a171bd701d06252fcf920d0ede25c Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 2 Apr 2026 07:11:49 +0300 Subject: shmem, userfaultfd: use a VMA callback to handle UFFDIO_CONTINUE When userspace resolves a page fault in a shmem VMA with UFFDIO_CONTINUE it needs to get a folio that already exists in the pagecache backing that VMA. Instead of using shmem_get_folio() for that, add a get_folio_noalloc() method to 'struct vm_uffd_ops' that will return a folio if it exists in the VMA's pagecache at given pgoff. Implement get_folio_noalloc() method for shmem and slightly refactor userfaultfd's mfill_get_vma() and mfill_atomic_pte_continue() to support this new API. Link: https://lore.kernel.org/20260402041156.1377214-9-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: James Houghton Cc: Andrea Arcangeli Cc: Andrei Vagin Cc: Axel Rasmussen Cc: Baolin Wang Cc: David Hildenbrand (Arm) Cc: Harry Yoo Cc: Harry Yoo (Oracle) Cc: Hugh Dickins Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Nikita Kalyazin Cc: Oscar Salvador Cc: Paolo Bonzini Cc: Peter Xu Cc: Sean Christopherson Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: David Carlier Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 7 +++++++ mm/shmem.c | 15 ++++++++++++++- mm/userfaultfd.c | 34 ++++++++++++++++++---------------- 3 files changed, 39 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 6d445dbfe8ff..4bda632dae88 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -87,6 +87,13 @@ extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); struct vm_uffd_ops { /* Checks if a VMA can support userfaultfd */ bool (*can_userfault)(struct vm_area_struct *vma, vm_flags_t vm_flags); + /* + * Called to resolve UFFDIO_CONTINUE request. + * Should return the folio found at pgoff in the VMA's pagecache if it + * exists or ERR_PTR otherwise. + * The returned folio is locked and with reference held. + */ + struct folio *(*get_folio_noalloc)(struct inode *inode, pgoff_t pgoff); }; /* A combined operation mode + behavior flags. 
*/ diff --git a/mm/shmem.c b/mm/shmem.c index 389b2d76396e..ed07d0c03312 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3289,13 +3289,26 @@ out_unacct_blocks: return ret; } +static struct folio *shmem_get_folio_noalloc(struct inode *inode, pgoff_t pgoff) +{ + struct folio *folio; + int err; + + err = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC); + if (err) + return ERR_PTR(err); + + return folio; +} + static bool shmem_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags) { return true; } static const struct vm_uffd_ops shmem_uffd_ops = { - .can_userfault = shmem_can_userfault, + .can_userfault = shmem_can_userfault, + .get_folio_noalloc = shmem_get_folio_noalloc, }; #endif /* CONFIG_USERFAULTFD */ diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 3a824e034a09..5b204c3ec986 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -191,6 +191,7 @@ static int mfill_get_vma(struct mfill_state *state) struct userfaultfd_ctx *ctx = state->ctx; uffd_flags_t flags = state->flags; struct vm_area_struct *dst_vma; + const struct vm_uffd_ops *ops; int err; /* @@ -232,10 +233,12 @@ static int mfill_get_vma(struct mfill_state *state) if (is_vm_hugetlb_page(dst_vma)) return 0; - if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) + ops = vma_uffd_ops(dst_vma); + if (!ops) goto out_unlock; - if (!vma_is_shmem(dst_vma) && - uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) + + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) && + !ops->get_folio_noalloc) goto out_unlock; return 0; @@ -575,6 +578,7 @@ out: static int mfill_atomic_pte_continue(struct mfill_state *state) { struct vm_area_struct *dst_vma = state->vma; + const struct vm_uffd_ops *ops = vma_uffd_ops(dst_vma); unsigned long dst_addr = state->dst_addr; pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); struct inode *inode = file_inode(dst_vma->vm_file); @@ -584,17 +588,16 @@ static int mfill_atomic_pte_continue(struct mfill_state *state) struct page *page; int ret; - ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC); - /* Our caller expects us to return -EFAULT if we failed to find folio */ - if (ret == -ENOENT) - ret = -EFAULT; - if (ret) - goto out; - if (!folio) { - ret = -EFAULT; - goto out; + if (!ops) { + VM_WARN_ONCE(1, "UFFDIO_CONTINUE for unsupported VMA"); + return -EOPNOTSUPP; } + folio = ops->get_folio_noalloc(inode, pgoff); + /* Our caller expects us to return -EFAULT if we failed to find folio */ + if (IS_ERR_OR_NULL(folio)) + return -EFAULT; + page = folio_file_page(folio, pgoff); if (PageHWPoison(page)) { ret = -EIO; @@ -607,13 +610,12 @@ static int mfill_atomic_pte_continue(struct mfill_state *state) goto out_release; folio_unlock(folio); - ret = 0; -out: - return ret; + return 0; + out_release: folio_unlock(folio); folio_put(folio); - goto out; + return ret; } /* Handles UFFDIO_POISON for all non-hugetlb VMAs. */ -- cgit v1.2.3 From ad9ac3081332e955bc4b513018a1e0e86683bfb5 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 2 Apr 2026 07:11:50 +0300 Subject: userfaultfd: introduce vm_uffd_ops->alloc_folio() and use it to refactor mfill_atomic_pte_zeroed_folio() and mfill_atomic_pte_copy(). mfill_atomic_pte_zeroed_folio() and mfill_atomic_pte_copy() perform almost identical actions: * allocate a folio * update folio contents (either copy from userspace of fill with zeros) * update page tables with the new folio Split a __mfill_atomic_pte() helper that handles both cases and uses newly introduced vm_uffd_ops->alloc_folio() to allocate the folio. 
Pass the ops structure from the callers to __mfill_atomic_pte() to later allow using anon_uffd_ops for MAP_PRIVATE mappings of file-backed VMAs. Note, that the new ops method is called alloc_folio() rather than folio_alloc() to avoid clash with alloc_tag macro folio_alloc(). Link: https://lore.kernel.org/20260402041156.1377214-10-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: James Houghton Cc: Andrea Arcangeli Cc: Andrei Vagin Cc: Axel Rasmussen Cc: Baolin Wang Cc: David Hildenbrand (Arm) Cc: Harry Yoo Cc: Harry Yoo (Oracle) Cc: Hugh Dickins Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Nikita Kalyazin Cc: Oscar Salvador Cc: Paolo Bonzini Cc: Peter Xu Cc: Sean Christopherson Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: David Carlier Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 6 +++ mm/userfaultfd.c | 92 ++++++++++++++++++++++--------------------- 2 files changed, 54 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 4bda632dae88..0f508c752741 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -94,6 +94,12 @@ struct vm_uffd_ops { * The returned folio is locked and with reference held. */ struct folio *(*get_folio_noalloc)(struct inode *inode, pgoff_t pgoff); + /* + * Called during resolution of UFFDIO_COPY request. + * Should allocate and return a folio or NULL if allocation fails. + */ + struct folio *(*alloc_folio)(struct vm_area_struct *vma, + unsigned long addr); }; /* A combined operation mode + behavior flags. */ diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 5b204c3ec986..dd191703b320 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -42,8 +42,26 @@ static bool anon_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags) return true; } +static struct folio *anon_alloc_folio(struct vm_area_struct *vma, + unsigned long addr) +{ + struct folio *folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, + addr); + + if (!folio) + return NULL; + + if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) { + folio_put(folio); + return NULL; + } + + return folio; +} + static const struct vm_uffd_ops anon_uffd_ops = { .can_userfault = anon_can_userfault, + .alloc_folio = anon_alloc_folio, }; static const struct vm_uffd_ops *vma_uffd_ops(struct vm_area_struct *vma) @@ -456,7 +474,8 @@ static int mfill_copy_folio_retry(struct mfill_state *state, struct folio *folio return 0; } -static int mfill_atomic_pte_copy(struct mfill_state *state) +static int __mfill_atomic_pte(struct mfill_state *state, + const struct vm_uffd_ops *ops) { unsigned long dst_addr = state->dst_addr; unsigned long src_addr = state->src_addr; @@ -464,16 +483,12 @@ static int mfill_atomic_pte_copy(struct mfill_state *state) struct folio *folio; int ret; - folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, state->vma, dst_addr); + folio = ops->alloc_folio(state->vma, state->dst_addr); if (!folio) return -ENOMEM; - ret = -ENOMEM; - if (mem_cgroup_charge(folio, state->vma->vm_mm, GFP_KERNEL)) - goto out_release; - - ret = mfill_copy_folio_locked(folio, src_addr); - if (unlikely(ret)) { + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) { + ret = mfill_copy_folio_locked(folio, src_addr); /* * Fallback to copy_from_user outside mmap_lock. 
* If retry is successful, mfill_copy_folio_locked() returns @@ -481,9 +496,15 @@ static int mfill_atomic_pte_copy(struct mfill_state *state) * If there was an error, we must mfill_put_vma() anyway and it * will take care of unlocking if needed. */ - ret = mfill_copy_folio_retry(state, folio); - if (ret) - goto out_release; + if (unlikely(ret)) { + ret = mfill_copy_folio_retry(state, folio); + if (ret) + goto err_folio_put; + } + } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) { + clear_user_highpage(&folio->page, state->dst_addr); + } else { + VM_WARN_ONCE(1, "Unknown UFFDIO operation, flags: %x", flags); } /* @@ -496,47 +517,30 @@ static int mfill_atomic_pte_copy(struct mfill_state *state) ret = mfill_atomic_install_pte(state->pmd, state->vma, dst_addr, &folio->page, true, flags); if (ret) - goto out_release; -out: - return ret; -out_release: + goto err_folio_put; + + return 0; + +err_folio_put: + folio_put(folio); /* Don't return -ENOENT so that our caller won't retry */ if (ret == -ENOENT) ret = -EFAULT; - folio_put(folio); - goto out; + return ret; } -static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr) +static int mfill_atomic_pte_copy(struct mfill_state *state) { - struct folio *folio; - int ret = -ENOMEM; - - folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr); - if (!folio) - return ret; - - if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL)) - goto out_put; + const struct vm_uffd_ops *ops = vma_uffd_ops(state->vma); - /* - * The memory barrier inside __folio_mark_uptodate makes sure that - * zeroing out the folio become visible before mapping the page - * using set_pte_at(). See do_anonymous_page(). - */ - __folio_mark_uptodate(folio); + return __mfill_atomic_pte(state, ops); +} - ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, - &folio->page, true, 0); - if (ret) - goto out_put; +static int mfill_atomic_pte_zeroed_folio(struct mfill_state *state) +{ + const struct vm_uffd_ops *ops = vma_uffd_ops(state->vma); - return 0; -out_put: - folio_put(folio); - return ret; + return __mfill_atomic_pte(state, ops); } static int mfill_atomic_pte_zeropage(struct mfill_state *state) @@ -549,7 +553,7 @@ static int mfill_atomic_pte_zeropage(struct mfill_state *state) int ret; if (mm_forbids_zeropage(dst_vma->vm_mm)) - return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr); + return mfill_atomic_pte_zeroed_folio(state); _dst_pte = pte_mkspecial(pfn_pte(zero_pfn(dst_addr), dst_vma->vm_page_prot)); -- cgit v1.2.3 From f74991b4e3836dd38f3adb41b146994b283942a1 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 2 Apr 2026 07:11:51 +0300 Subject: shmem, userfaultfd: implement shmem uffd operations using vm_uffd_ops Add filemap_add() and filemap_remove() methods to vm_uffd_ops and use them in __mfill_atomic_pte() to add shmem folios to page cache and remove them in case of error. Implement these methods in shmem along with vm_uffd_ops->alloc_folio() and drop shmem_mfill_atomic_pte(). Since userfaultfd now does not reference any functions from shmem, drop include if linux/shmem_fs.h from mm/userfaultfd.c mfill_atomic_install_pte() is not used anywhere outside of mm/userfaultfd, make it static. 
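For a compact overview, the shmem wiring that results from this conversion, gathered here from the hunks in this and the preceding patches (condensed; the helpers are defined in mm/shmem.c):

        static const struct vm_uffd_ops shmem_uffd_ops = {
                .can_userfault          = shmem_can_userfault,
                .get_folio_noalloc      = shmem_get_folio_noalloc,
                .alloc_folio            = shmem_mfill_folio_alloc,
                .filemap_add            = shmem_mfill_filemap_add,
                .filemap_remove         = shmem_mfill_filemap_remove,
        };

        static const struct vm_operations_struct shmem_vm_ops = {
                /* ... existing methods ... */
        #ifdef CONFIG_USERFAULTFD
                .uffd_ops = &shmem_uffd_ops,
        #endif
        };

Other VMA owners can opt in the same way by populating vm_ops->uffd_ops; anonymous VMAs keep using the built-in anon_uffd_ops.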
Link: https://lore.kernel.org/20260402041156.1377214-11-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: James Houghton Cc: Andrea Arcangeli Cc: Andrei Vagin Cc: Axel Rasmussen Cc: Baolin Wang Cc: David Hildenbrand (Arm) Cc: Harry Yoo Cc: Harry Yoo (Oracle) Cc: Hugh Dickins Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Nikita Kalyazin Cc: Oscar Salvador Cc: Paolo Bonzini Cc: Peter Xu Cc: Sean Christopherson Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: David Carlier Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 14 ---- include/linux/userfaultfd_k.h | 19 ++++-- mm/shmem.c | 148 +++++++++++++++--------------------------- mm/userfaultfd.c | 80 +++++++++++------------ 4 files changed, 106 insertions(+), 155 deletions(-) (limited to 'include') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a8273b32e041..1a345142af7d 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -221,20 +221,6 @@ static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof) extern bool shmem_charge(struct inode *inode, long pages); -#ifdef CONFIG_USERFAULTFD -#ifdef CONFIG_SHMEM -extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - uffd_flags_t flags, - struct folio **foliop); -#else /* !CONFIG_SHMEM */ -#define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \ - src_addr, flags, foliop) ({ BUG(); 0; }) -#endif /* CONFIG_SHMEM */ -#endif /* CONFIG_USERFAULTFD */ - /* * Used space is stored as unsigned 64-bit value in bytes but * quota core supports only signed 64-bit values so use that diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 0f508c752741..d2920f98ab86 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -100,6 +100,20 @@ struct vm_uffd_ops { */ struct folio *(*alloc_folio)(struct vm_area_struct *vma, unsigned long addr); + /* + * Called during resolution of UFFDIO_COPY request. + * Should only be called with a folio returned by alloc_folio() above. + * The folio will be set to locked. + * Returns 0 on success, error code on failure. + */ + int (*filemap_add)(struct folio *folio, struct vm_area_struct *vma, + unsigned long addr); + /* + * Called during resolution of UFFDIO_COPY request on the error + * handling path. + * Should revert the operation of ->filemap_add(). + */ + void (*filemap_remove)(struct folio *folio, struct vm_area_struct *vma); }; /* A combined operation mode + behavior flags. */ @@ -133,11 +147,6 @@ static inline uffd_flags_t uffd_flags_set_mode(uffd_flags_t flags, enum mfill_at /* Flags controlling behavior. These behavior changes are mode-independent. 
*/ #define MFILL_ATOMIC_WP MFILL_ATOMIC_FLAG(0) -extern int mfill_atomic_install_pte(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, struct page *page, - bool newly_allocated, uffd_flags_t flags); - extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, uffd_flags_t flags); diff --git a/mm/shmem.c b/mm/shmem.c index ed07d0c03312..5aa43657886c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3175,118 +3175,73 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, #endif /* CONFIG_TMPFS_QUOTA */ #ifdef CONFIG_USERFAULTFD -int shmem_mfill_atomic_pte(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - uffd_flags_t flags, - struct folio **foliop) -{ - struct inode *inode = file_inode(dst_vma->vm_file); - struct shmem_inode_info *info = SHMEM_I(inode); +static struct folio *shmem_mfill_folio_alloc(struct vm_area_struct *vma, + unsigned long addr) +{ + struct inode *inode = file_inode(vma->vm_file); struct address_space *mapping = inode->i_mapping; + struct shmem_inode_info *info = SHMEM_I(inode); + pgoff_t pgoff = linear_page_index(vma, addr); gfp_t gfp = mapping_gfp_mask(mapping); - pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); - void *page_kaddr; struct folio *folio; - int ret; - pgoff_t max_off; - - if (shmem_inode_acct_blocks(inode, 1)) { - /* - * We may have got a page, returned -ENOENT triggering a retry, - * and now we find ourselves with -ENOMEM. Release the page, to - * avoid a BUG_ON in our caller. - */ - if (unlikely(*foliop)) { - folio_put(*foliop); - *foliop = NULL; - } - return -ENOMEM; - } - if (!*foliop) { - ret = -ENOMEM; - folio = shmem_alloc_folio(gfp, 0, info, pgoff); - if (!folio) - goto out_unacct_blocks; + if (unlikely(pgoff >= DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) + return NULL; - if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) { - page_kaddr = kmap_local_folio(folio, 0); - /* - * The read mmap_lock is held here. Despite the - * mmap_lock being read recursive a deadlock is still - * possible if a writer has taken a lock. For example: - * - * process A thread 1 takes read lock on own mmap_lock - * process A thread 2 calls mmap, blocks taking write lock - * process B thread 1 takes page fault, read lock on own mmap lock - * process B thread 2 calls mmap, blocks taking write lock - * process A thread 1 blocks taking read lock on process B - * process B thread 1 blocks taking read lock on process A - * - * Disable page faults to prevent potential deadlock - * and retry the copy outside the mmap_lock. 
- */ - pagefault_disable(); - ret = copy_from_user(page_kaddr, - (const void __user *)src_addr, - PAGE_SIZE); - pagefault_enable(); - kunmap_local(page_kaddr); - - /* fallback to copy_from_user outside mmap_lock */ - if (unlikely(ret)) { - *foliop = folio; - ret = -ENOENT; - /* don't free the page */ - goto out_unacct_blocks; - } + folio = shmem_alloc_folio(gfp, 0, info, pgoff); + if (!folio) + return NULL; - flush_dcache_folio(folio); - } else { /* ZEROPAGE */ - clear_user_highpage(&folio->page, dst_addr); - } - } else { - folio = *foliop; - VM_BUG_ON_FOLIO(folio_test_large(folio), folio); - *foliop = NULL; + if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) { + folio_put(folio); + return NULL; } - VM_BUG_ON(folio_test_locked(folio)); - VM_BUG_ON(folio_test_swapbacked(folio)); + return folio; +} + +static int shmem_mfill_filemap_add(struct folio *folio, + struct vm_area_struct *vma, + unsigned long addr) +{ + struct inode *inode = file_inode(vma->vm_file); + struct address_space *mapping = inode->i_mapping; + pgoff_t pgoff = linear_page_index(vma, addr); + gfp_t gfp = mapping_gfp_mask(mapping); + int err; + __folio_set_locked(folio); __folio_set_swapbacked(folio); - __folio_mark_uptodate(folio); - - ret = -EFAULT; - max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); - if (unlikely(pgoff >= max_off)) - goto out_release; - ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp); - if (ret) - goto out_release; - ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp); - if (ret) - goto out_release; + err = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp); + if (err) + goto err_unlock; - ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, - &folio->page, true, flags); - if (ret) - goto out_delete_from_cache; + if (shmem_inode_acct_blocks(inode, 1)) { + err = -ENOMEM; + goto err_delete_from_cache; + } + folio_add_lru(folio); shmem_recalc_inode(inode, 1, 0); - folio_unlock(folio); + return 0; -out_delete_from_cache: + +err_delete_from_cache: filemap_remove_folio(folio); -out_release: +err_unlock: + folio_unlock(folio); + return err; +} + +static void shmem_mfill_filemap_remove(struct folio *folio, + struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(vma->vm_file); + + filemap_remove_folio(folio); + shmem_recalc_inode(inode, 0, 0); folio_unlock(folio); - folio_put(folio); -out_unacct_blocks: - shmem_inode_unacct_blocks(inode, 1); - return ret; } static struct folio *shmem_get_folio_noalloc(struct inode *inode, pgoff_t pgoff) @@ -3309,6 +3264,9 @@ static bool shmem_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags) static const struct vm_uffd_ops shmem_uffd_ops = { .can_userfault = shmem_can_userfault, .get_folio_noalloc = shmem_get_folio_noalloc, + .alloc_folio = shmem_mfill_folio_alloc, + .filemap_add = shmem_mfill_filemap_add, + .filemap_remove = shmem_mfill_filemap_remove, }; #endif /* CONFIG_USERFAULTFD */ diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index dd191703b320..8a023d9326c2 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include "internal.h" @@ -338,10 +337,10 @@ static bool mfill_file_over_size(struct vm_area_struct *dst_vma, * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem * and anon, and for both shared and private VMAs. 
*/ -int mfill_atomic_install_pte(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, struct page *page, - bool newly_allocated, uffd_flags_t flags) +static int mfill_atomic_install_pte(pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, struct page *page, + uffd_flags_t flags) { int ret; struct mm_struct *dst_mm = dst_vma->vm_mm; @@ -385,9 +384,6 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, goto out_unlock; if (page_in_cache) { - /* Usually, cache pages are already added to LRU */ - if (newly_allocated) - folio_add_lru(folio); folio_add_file_rmap_pte(folio, page, dst_vma); } else { folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE); @@ -402,6 +398,9 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + if (page_in_cache) + folio_unlock(folio); + /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); ret = 0; @@ -514,13 +513,22 @@ static int __mfill_atomic_pte(struct mfill_state *state, */ __folio_mark_uptodate(folio); + if (ops->filemap_add) { + ret = ops->filemap_add(folio, state->vma, state->dst_addr); + if (ret) + goto err_folio_put; + } + ret = mfill_atomic_install_pte(state->pmd, state->vma, dst_addr, - &folio->page, true, flags); + &folio->page, flags); if (ret) - goto err_folio_put; + goto err_filemap_remove; return 0; +err_filemap_remove: + if (ops->filemap_remove) + ops->filemap_remove(folio, state->vma); err_folio_put: folio_put(folio); /* Don't return -ENOENT so that our caller won't retry */ @@ -533,6 +541,18 @@ static int mfill_atomic_pte_copy(struct mfill_state *state) { const struct vm_uffd_ops *ops = vma_uffd_ops(state->vma); + /* + * The normal page fault path for a MAP_PRIVATE mapping in a + * file-backed VMA will invoke the fault, fill the hole in the file and + * COW it right away. The result generates plain anonymous memory. + * So when we are asked to fill a hole in a MAP_PRIVATE mapping, we'll + * generate anonymous memory directly without actually filling the + * hole. For the MAP_PRIVATE case the robustness check only happens in + * the pagetable (to verify it's still none) and not in the page cache. 
+ */ + if (!(state->vma->vm_flags & VM_SHARED)) + ops = &anon_uffd_ops; + return __mfill_atomic_pte(state, ops); } @@ -552,7 +572,8 @@ static int mfill_atomic_pte_zeropage(struct mfill_state *state) spinlock_t *ptl; int ret; - if (mm_forbids_zeropage(dst_vma->vm_mm)) + if (mm_forbids_zeropage(dst_vma->vm_mm) || + (dst_vma->vm_flags & VM_SHARED)) return mfill_atomic_pte_zeroed_folio(state); _dst_pte = pte_mkspecial(pfn_pte(zero_pfn(dst_addr), @@ -609,11 +630,10 @@ static int mfill_atomic_pte_continue(struct mfill_state *state) } ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, - page, false, flags); + page, flags); if (ret) goto out_release; - folio_unlock(folio); return 0; out_release: @@ -836,41 +856,19 @@ extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx, static __always_inline ssize_t mfill_atomic_pte(struct mfill_state *state) { - struct vm_area_struct *dst_vma = state->vma; - unsigned long src_addr = state->src_addr; - unsigned long dst_addr = state->dst_addr; - struct folio **foliop = &state->folio; uffd_flags_t flags = state->flags; - pmd_t *dst_pmd = state->pmd; - ssize_t err; if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) return mfill_atomic_pte_continue(state); if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) return mfill_atomic_pte_poison(state); + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) + return mfill_atomic_pte_copy(state); + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) + return mfill_atomic_pte_zeropage(state); - /* - * The normal page fault path for a shmem will invoke the - * fault, fill the hole in the file and COW it right away. The - * result generates plain anonymous memory. So when we are - * asked to fill an hole in a MAP_PRIVATE shmem mapping, we'll - * generate anonymous memory directly without actually filling - * the hole. For the MAP_PRIVATE case the robustness check - * only happens in the pagetable (to verify it's still none) - * and not in the radix tree. - */ - if (!(dst_vma->vm_flags & VM_SHARED)) { - if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) - err = mfill_atomic_pte_copy(state); - else - err = mfill_atomic_pte_zeropage(state); - } else { - err = shmem_mfill_atomic_pte(dst_pmd, dst_vma, - dst_addr, src_addr, - flags, foliop); - } - - return err; + VM_WARN_ONCE(1, "Unknown UFFDIO operation, flags: %x", flags); + return -EOPNOTSUPP; } static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, -- cgit v1.2.3 From 77c368f057e17b59b23899a1907ee9d4f4d7a532 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 2 Apr 2026 18:23:20 +0800 Subject: mm/sparse: fix comment for section map alignment The comment in mmzone.h currently details exhaustive per-architecture bit-width lists and explains alignment using min(PAGE_SHIFT, PFN_SECTION_SHIFT). Such details risk falling out of date over time and may inadvertently be left un-updated. We always expect a single section to cover full pages. Therefore, we can safely assume that PFN_SECTION_SHIFT is large enough to accommodate SECTION_MAP_LAST_BIT. We use BUILD_BUG_ON() to ensure this. Update the comment to accurately reflect this consensus, making it clear that we rely on a single section covering full pages. 
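The compile-time check referred to above takes roughly the following shape (illustrative only; the exact expression used in the tree may differ):

        /*
         * The flag bits stolen from the low bits of the encoded mem_map
         * pointer must fit within the alignment that section_nr_to_pfn()
         * guarantees, i.e. within PFN_SECTION_SHIFT bits.
         */
        BUILD_BUG_ON(SECTION_MAP_LAST_BIT > PFN_SECTION_SHIFT);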
Link: https://lore.kernel.org/20260402102320.3617578-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Petr Tesarik Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 20f920dede65..07f501a62d67 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -2068,21 +2068,16 @@ static inline struct mem_section *__nr_to_section(unsigned long nr) extern size_t mem_section_usage_size(void); /* - * We use the lower bits of the mem_map pointer to store - * a little bit of information. The pointer is calculated - * as mem_map - section_nr_to_pfn(pnum). The result is - * aligned to the minimum alignment of the two values: - * 1. All mem_map arrays are page-aligned. - * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT - * lowest bits. PFN_SECTION_SHIFT is arch-specific - * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the - * worst combination is powerpc with 256k pages, - * which results in PFN_SECTION_SHIFT equal 6. - * To sum it up, at least 6 bits are available on all architectures. - * However, we can exceed 6 bits on some other architectures except - * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available - * with the worst case of 64K pages on arm64) if we make sure the - * exceeded bit is not applicable to powerpc. + * We use the lower bits of the mem_map pointer to store a little bit of + * information. The pointer is calculated as mem_map - section_nr_to_pfn(). + * The result is aligned to the minimum alignment of the two values: + * + * 1. All mem_map arrays are page-aligned. + * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT lowest bits. + * + * We always expect a single section to cover full pages. Therefore, + * we can safely assume that PFN_SECTION_SHIFT is large enough to + * accommodate SECTION_MAP_LAST_BIT. We use BUILD_BUG_ON() to ensure this. */ enum { SECTION_MARKED_PRESENT_BIT, -- cgit v1.2.3