diff options
| author | Johannes Weiner <hannes@cmpxchg.org> | 2026-05-27 23:45:16 +0300 |
|---|---|---|
| committer | Andrew Morton <akpm@linux-foundation.org> | 2026-06-09 04:21:25 +0300 |
| commit | fafaeceb89a5e2e856ff04c2cacb6cae4a2ecb67 (patch) | |
| tree | 0920f260bf1dfb4b8551e321e7d9cdcc6e73947b /include | |
| parent | 65180e9663c782e45ed1c76276dc64d96615da9d (diff) | |
| download | linux-fafaeceb89a5e2e856ff04c2cacb6cae4a2ecb67.tar.xz | |
mm: switch deferred split shrinker to list_lru
The deferred split queue handles cgroups in a suboptimal fashion. The
queue is per-NUMA node or per-cgroup, not the intersection. That means on
a cgrouped system, a node-restricted allocation entering reclaim can end
up splitting large pages on other nodes:
alloc/unmap
deferred_split_folio()
list_add_tail(memcg->split_queue)
set_shrinker_bit(memcg, node, deferred_shrinker_id)
for_each_zone_zonelist_nodemask(restricted_nodes)
mem_cgroup_iter()
shrink_slab(node, memcg)
shrink_slab_memcg(node, memcg)
if test_shrinker_bit(memcg, node, deferred_shrinker_id)
deferred_split_scan()
walks memcg->split_queue
The shrinker bit adds an imperfect guard rail. As soon as the cgroup has
a single large page on the node of interest, all large pages owned by that
memcg, including those on other nodes, will be split.
list_lru properly sets up per-node, per-cgroup lists. As a bonus, it
streamlines a lot of the list operations and reclaim walks. It's used
widely by other major shrinkers already. Convert the deferred split queue
as well.
The list_lru per-memcg heads are instantiated on demand when the first
object of interest is allocated for a cgroup, by calling
folio_memcg_alloc_deferred(). Add calls to where splittable pages are
created: anon faults, swapin faults, khugepaged collapse.
These calls create all possible node heads for the cgroup at once, so the
migration code (between nodes) doesn't need any special care.
[akpm@linux-foundation.org: fix build with CONFIG_TRANSPARENT_HUGEPAGE=n]
Link: https://lore.kernel.org/202605281620.lc3rtkBm-lkp@intel.com
[hannes@cmpxchg.org: fix cgroup.memory=nokmem handling]
Link: https://lore.kernel.org/ah9PGv12mqai84ES@cmpxchg.org
Link: https://lore.kernel.org/20260527204757.2544958-10-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Mikhail Zaslonko <zaslonko@linux.ibm.com>
Tested-by: Mikhail Zaslonko <zaslonko@linux.ibm.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
Acked-by: Usama Arif <usama.arif@linux.dev>
Reviewed-by: Kairui Song <kasong@tencent.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: David Hildenbrand (Arm) <david@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam R. Howlett <liam@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nico Pache <npache@redhat.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vlastimil Babka <vbabka@kernel.org>
Cc: Zi Yan <ziy@nvidia.com>
Cc: kernel test robot <lkp@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'include')
| -rw-r--r-- | include/linux/huge_mm.h | 17 | ||||
| -rw-r--r-- | include/linux/memcontrol.h | 4 | ||||
| -rw-r--r-- | include/linux/mmzone.h | 12 |
3 files changed, 12 insertions, 21 deletions
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 58382e97a66d..c0d223d0c556 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -439,10 +439,10 @@ static inline int split_huge_page(struct page *page) { return split_huge_page_to_list_to_order(page, NULL, 0); } + +int folio_memcg_alloc_deferred(struct folio *folio); + void deferred_split_folio(struct folio *folio, bool partially_mapped); -#ifdef CONFIG_MEMCG -void reparent_deferred_split_queue(struct mem_cgroup *memcg); -#endif void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze); @@ -679,8 +679,15 @@ static inline int try_folio_split_to_order(struct folio *folio, return -EINVAL; } -static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {} -static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {} +static inline int folio_memcg_alloc_deferred(struct folio *folio) +{ + return 0; +} + +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) +{ +} + #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 8f2662db166b..e1f46a0016fc 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -278,10 +278,6 @@ struct mem_cgroup { struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT]; #endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - struct deferred_split deferred_split_queue; -#endif - #ifdef CONFIG_LRU_GEN_WALKS_MMU /* per-memcg mm_struct list */ struct lru_gen_mm_list mm_list; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1331a7b93f33..8e449f524f26 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1431,14 +1431,6 @@ struct zonelist { */ extern struct page *mem_map; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -struct deferred_split { - spinlock_t split_queue_lock; - struct list_head split_queue; - unsigned long split_queue_len; -}; -#endif - #ifdef CONFIG_MEMORY_FAILURE /* * Per NUMA node memory failure handling statistics. @@ -1564,10 +1556,6 @@ typedef struct pglist_data { unsigned long first_deferred_pfn; #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - struct deferred_split deferred_split_queue; -#endif - #ifdef CONFIG_NUMA_BALANCING /* start time in ms of current promote rate limit period */ unsigned int nbp_rl_start; |
