summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorJohannes Weiner <hannes@cmpxchg.org>2026-05-27 23:45:16 +0300
committerAndrew Morton <akpm@linux-foundation.org>2026-06-09 04:21:25 +0300
commitfafaeceb89a5e2e856ff04c2cacb6cae4a2ecb67 (patch)
tree0920f260bf1dfb4b8551e321e7d9cdcc6e73947b /include
parent65180e9663c782e45ed1c76276dc64d96615da9d (diff)
downloadlinux-fafaeceb89a5e2e856ff04c2cacb6cae4a2ecb67.tar.xz
mm: switch deferred split shrinker to list_lru
The deferred split queue handles cgroups in a suboptimal fashion. The queue is per-NUMA node or per-cgroup, not the intersection. That means on a cgrouped system, a node-restricted allocation entering reclaim can end up splitting large pages on other nodes: alloc/unmap deferred_split_folio() list_add_tail(memcg->split_queue) set_shrinker_bit(memcg, node, deferred_shrinker_id) for_each_zone_zonelist_nodemask(restricted_nodes) mem_cgroup_iter() shrink_slab(node, memcg) shrink_slab_memcg(node, memcg) if test_shrinker_bit(memcg, node, deferred_shrinker_id) deferred_split_scan() walks memcg->split_queue The shrinker bit adds an imperfect guard rail. As soon as the cgroup has a single large page on the node of interest, all large pages owned by that memcg, including those on other nodes, will be split. list_lru properly sets up per-node, per-cgroup lists. As a bonus, it streamlines a lot of the list operations and reclaim walks. It's used widely by other major shrinkers already. Convert the deferred split queue as well. The list_lru per-memcg heads are instantiated on demand when the first object of interest is allocated for a cgroup, by calling folio_memcg_alloc_deferred(). Add calls to where splittable pages are created: anon faults, swapin faults, khugepaged collapse. These calls create all possible node heads for the cgroup at once, so the migration code (between nodes) doesn't need any special care. [akpm@linux-foundation.org: fix build with CONFIG_TRANSPARENT_HUGEPAGE=n] Link: https://lore.kernel.org/202605281620.lc3rtkBm-lkp@intel.com [hannes@cmpxchg.org: fix cgroup.memory=nokmem handling] Link: https://lore.kernel.org/ah9PGv12mqai84ES@cmpxchg.org Link: https://lore.kernel.org/20260527204757.2544958-10-hannes@cmpxchg.org Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> Reported-by: Mikhail Zaslonko <zaslonko@linux.ibm.com> Tested-by: Mikhail Zaslonko <zaslonko@linux.ibm.com> Acked-by: Shakeel Butt <shakeel.butt@linux.dev> Reviewed-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org> Acked-by: Usama Arif <usama.arif@linux.dev> Reviewed-by: Kairui Song <kasong@tencent.com> Cc: Baolin Wang <baolin.wang@linux.alibaba.com> Cc: Barry Song <baohua@kernel.org> Cc: Dave Chinner <david@fromorbit.com> Cc: David Hildenbrand (Arm) <david@kernel.org> Cc: Dev Jain <dev.jain@arm.com> Cc: Lance Yang <lance.yang@linux.dev> Cc: Liam R. Howlett <liam@infradead.org> Cc: Michal Hocko <mhocko@kernel.org> Cc: Muchun Song <muchun.song@linux.dev> Cc: Nico Pache <npache@redhat.com> Cc: Roman Gushchin <roman.gushchin@linux.dev> Cc: Ryan Roberts <ryan.roberts@arm.com> Cc: Vasily Gorbik <gor@linux.ibm.com> Cc: Vlastimil Babka <vbabka@kernel.org> Cc: Zi Yan <ziy@nvidia.com> Cc: kernel test robot <lkp@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'include')
-rw-r--r--include/linux/huge_mm.h17
-rw-r--r--include/linux/memcontrol.h4
-rw-r--r--include/linux/mmzone.h12
3 files changed, 12 insertions, 21 deletions
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 58382e97a66d..c0d223d0c556 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -439,10 +439,10 @@ static inline int split_huge_page(struct page *page)
{
return split_huge_page_to_list_to_order(page, NULL, 0);
}
+
+int folio_memcg_alloc_deferred(struct folio *folio);
+
void deferred_split_folio(struct folio *folio, bool partially_mapped);
-#ifdef CONFIG_MEMCG
-void reparent_deferred_split_queue(struct mem_cgroup *memcg);
-#endif
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long address, bool freeze);
@@ -679,8 +679,15 @@ static inline int try_folio_split_to_order(struct folio *folio,
return -EINVAL;
}
-static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
-static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {}
+static inline int folio_memcg_alloc_deferred(struct folio *folio)
+{
+ return 0;
+}
+
+static inline void deferred_split_folio(struct folio *folio, bool partially_mapped)
+{
+}
+
#define split_huge_pmd(__vma, __pmd, __address) \
do { } while (0)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8f2662db166b..e1f46a0016fc 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -278,10 +278,6 @@ struct mem_cgroup {
struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- struct deferred_split deferred_split_queue;
-#endif
-
#ifdef CONFIG_LRU_GEN_WALKS_MMU
/* per-memcg mm_struct list */
struct lru_gen_mm_list mm_list;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 1331a7b93f33..8e449f524f26 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1431,14 +1431,6 @@ struct zonelist {
*/
extern struct page *mem_map;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-struct deferred_split {
- spinlock_t split_queue_lock;
- struct list_head split_queue;
- unsigned long split_queue_len;
-};
-#endif
-
#ifdef CONFIG_MEMORY_FAILURE
/*
* Per NUMA node memory failure handling statistics.
@@ -1564,10 +1556,6 @@ typedef struct pglist_data {
unsigned long first_deferred_pfn;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- struct deferred_split deferred_split_queue;
-#endif
-
#ifdef CONFIG_NUMA_BALANCING
/* start time in ms of current promote rate limit period */
unsigned int nbp_rl_start;