author		Leno Hou <lenohou@gmail.com>	2026-03-18 19:30:49 +0300
committer	Andrew Morton <akpm@linux-foundation.org>	2026-04-05 23:53:33 +0300
commit		a6a8c087dce00eac0c6d03e560b0fa3d529afa5f (patch)
tree		5bdfd90cf5d3e8e49b365b989826d6930ad1ff55
parent		dc711106a0bc76a30e0fbd16ed4d348171547d9a (diff)
download	linux-a6a8c087dce00eac0c6d03e560b0fa3d529afa5f.tar.xz
mm/mglru: fix cgroup OOM during MGLRU state switching
When the Multi-Gen LRU (MGLRU) state is toggled dynamically, a race
condition exists between the state switching and the memory reclaim
path. This can lead to unexpected cgroup OOM kills even when plenty of
reclaimable memory is available.

Problem Description
===================

The issue arises from a "reclaim vacuum" during the transition:

1. When disabling MGLRU, lru_gen_change_state() sets lrugen->enabled to
   false before the pages are drained from the MGLRU lists back to the
   traditional LRU lists.

2. Concurrent reclaimers in shrink_lruvec() see lrugen->enabled as
   false and skip the MGLRU path.

3. However, these pages might not have reached the traditional LRU
   lists yet, or the changes are not yet visible to all CPUs due to a
   lack of synchronization.

4. get_scan_count() subsequently finds the traditional LRU lists empty,
   concludes there is no reclaimable memory, and triggers an OOM kill.

A similar race can occur during enablement, where the reclaimer sees
the new state but the MGLRU lists have not been populated via
fill_evictable() yet.

Solution
========

Introduce a 'switching' state (`lru_switch`) to bridge the transition.
While a transition is in progress, the system enters this intermediate
state, in which reclaimers are forced to attempt both the MGLRU and the
traditional reclaim paths sequentially. This ensures that folios remain
visible to at least one reclaim mechanism until the transition is fully
materialized across all CPUs.

Race & Mitigation
=================

A race window still exists between checking the 'switching' state and
performing the actual list operations. For instance, a reclaimer might
observe the switching state as false just before it changes, leading to
a suboptimal reclaim path decision. This impact is effectively
mitigated by the kernel's reclaim retry mechanism (e.g., in
do_try_to_free_pages()): if one reclaim pass fails to find eligible
folios due to the transition race, subsequent retries in the loop will
observe the updated state and correctly direct the scan to the
appropriate LRU lists, so the transient inconsistency does not escalate
into a terminal OOM kill. This effectively reduces the race window that
previously triggered OOMs under high memory pressure.

This fix has been verified on v7.0.0-rc1; dynamic toggling of MGLRU
functions correctly without triggering unexpected OOM kills.

Link: https://lkml.kernel.org/r/20260319-b4-switch-mglru-v2-v5-1-8898491e5f17@gmail.com
Signed-off-by: Leno Hou <lenohou@gmail.com>
Acked-by: Yafang Shao <laoar.shao@gmail.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Reviewed-by: Axel Rasmussen <axelrasmussen@google.com>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Wei Xu <weixugc@google.com>
Cc: Jialing Wang <wjl.linux@gmail.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Kairui Song <ryncsn@gmail.com>
Cc: Bingfang Guo <bfguo@icloud.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
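To make the shape of the fix concrete before reading the diff, below is
a minimal userspace model of the switching window. The names and the
C11 atomics are illustrative only; the kernel uses a static key
(lru_switch) and the lru_gen_caps array, not atomic flags:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Model state: lru_gen_caps[LRU_GEN_CORE] and lru_switch are
 * approximated here by two atomic flags. */
static atomic_bool mglru_enabled = true;
static atomic_bool lru_switching = false;

/* Reclaim dispatch as shaped by the patch: during the switching
 * window, scan the MGLRU lists first, then fall through to the
 * traditional active/inactive lists as well. */
static void shrink(void)
{
	bool switching = atomic_load(&lru_switching);

	if (atomic_load(&mglru_enabled) || switching) {
		puts("scan MGLRU generation lists");
		if (!switching)
			return;		/* steady state: MGLRU only */
	}
	puts("scan traditional active/inactive lists");
}

/* Mirrors the flow of lru_gen_change_state(): open the window, flip
 * the state, drain/fill the folio lists, then close the window. */
static void change_state(bool enable)
{
	atomic_store(&lru_switching, true);
	atomic_store(&mglru_enabled, enable);
	shrink();	/* a reclaimer racing the drain still sees both paths */
	atomic_store(&lru_switching, false);
}

int main(void)
{
	shrink();		/* MGLRU only */
	change_state(false);	/* both paths while the window is open */
	shrink();		/* traditional only */
	return 0;
}

In the actual patch the window is a static key, so the steady-state
checks stay essentially free; the model only shows why a reclaimer can
never observe an empty world mid-transition.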
-rw-r--r--	include/linux/mm_inline.h	11
-rw-r--r--	mm/rmap.c			7
-rw-r--r--	mm/vmscan.c			33
3 files changed, 41 insertions(+), 10 deletions(-)
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index fa2d6ba811b5..2aedcff6a2c1 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -102,6 +102,12 @@ static __always_inline enum lru_list folio_lru_list(const struct folio *folio)
#ifdef CONFIG_LRU_GEN
+static inline bool lru_gen_switching(void)
+{
+ DECLARE_STATIC_KEY_FALSE(lru_switch);
+
+ return static_branch_unlikely(&lru_switch);
+}
#ifdef CONFIG_LRU_GEN_ENABLED
static inline bool lru_gen_enabled(void)
{
@@ -316,6 +322,11 @@ static inline bool lru_gen_enabled(void)
return false;
}
+static inline bool lru_gen_switching(void)
+{
+ return false;
+}
+
static inline bool lru_gen_in_fault(void)
{
return false;
diff --git a/mm/rmap.c b/mm/rmap.c
index abe4712a220c..78b7fb5f367c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -973,7 +973,12 @@ static bool folio_referenced_one(struct folio *folio,
nr = folio_pte_batch(folio, pvmw.pte, pteval, max_nr);
}
- if (lru_gen_enabled() && pvmw.pte) {
+ /*
+ * When LRU is switching, we don't know where the surrounding folios
+ * are - they could be on active/inactive lists or on MGLRU. So the
+ * simplest approach is to disable this look-around optimization.
+ */
+ if (lru_gen_enabled() && !lru_gen_switching() && pvmw.pte) {
if (lru_gen_look_around(&pvmw, nr))
referenced++;
} else if (pvmw.pte) {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 641a6063f375..42f834c508bc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -905,7 +905,7 @@ static enum folio_references folio_check_references(struct folio *folio,
if (referenced_ptes == -1)
return FOLIOREF_KEEP;
- if (lru_gen_enabled()) {
+ if (lru_gen_enabled() && !lru_gen_switching()) {
if (!referenced_ptes)
return FOLIOREF_RECLAIM;
@@ -2308,7 +2308,7 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
unsigned long file;
struct lruvec *target_lruvec;
- if (lru_gen_enabled())
+ if (lru_gen_enabled() && !lru_gen_switching())
return;
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
@@ -2647,6 +2647,7 @@ static bool can_age_anon_pages(struct lruvec *lruvec,
#ifdef CONFIG_LRU_GEN
+DEFINE_STATIC_KEY_FALSE(lru_switch);
#ifdef CONFIG_LRU_GEN_ENABLED
DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
#define get_cap(cap) static_branch_likely(&lru_gen_caps[cap])
@@ -5181,6 +5182,8 @@ static void lru_gen_change_state(bool enabled)
if (enabled == lru_gen_enabled())
goto unlock;
+ static_branch_enable_cpuslocked(&lru_switch);
+
if (enabled)
static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
else
@@ -5211,6 +5214,9 @@ static void lru_gen_change_state(bool enabled)
cond_resched();
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+
+ static_branch_disable_cpuslocked(&lru_switch);
+
unlock:
mutex_unlock(&state_mutex);
put_online_mems();
@@ -5783,9 +5789,12 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
bool proportional_reclaim;
struct blk_plug plug;
- if (lru_gen_enabled() && !root_reclaim(sc)) {
+ if ((lru_gen_enabled() || lru_gen_switching()) && !root_reclaim(sc)) {
lru_gen_shrink_lruvec(lruvec, sc);
- return;
+
+ if (!lru_gen_switching())
+ return;
+
}
get_scan_count(lruvec, sc, nr);
@@ -6045,10 +6054,13 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
struct lruvec *target_lruvec;
bool reclaimable = false;
- if (lru_gen_enabled() && root_reclaim(sc)) {
+ if ((lru_gen_enabled() || lru_gen_switching()) && root_reclaim(sc)) {
memset(&sc->nr, 0, sizeof(sc->nr));
lru_gen_shrink_node(pgdat, sc);
- return;
+
+ if (!lru_gen_switching())
+ return;
+
}
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
@@ -6318,7 +6330,7 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
struct lruvec *target_lruvec;
unsigned long refaults;
- if (lru_gen_enabled())
+ if (lru_gen_enabled() && !lru_gen_switching())
return;
target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
@@ -6708,9 +6720,12 @@ static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
struct mem_cgroup *memcg;
struct lruvec *lruvec;
- if (lru_gen_enabled()) {
+ if (lru_gen_enabled() || lru_gen_switching()) {
lru_gen_age_node(pgdat, sc);
- return;
+
+ if (!lru_gen_switching())
+ return;
+
}
lruvec = mem_cgroup_lruvec(NULL, pgdat);
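
For completeness, a hypothetical reproducer sketch (not part of the
patch): it flips MGLRU on and off through the documented sysfs knob
/sys/kernel/mm/lru_gen/enabled. Running it while something else applies
memory pressure (e.g., an allocator loop inside a memcg with a tight
limit) exercises the transition window that previously triggered the
OOMs. Requires root and CONFIG_LRU_GEN=y.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static void write_knob(const char *val)
{
	int fd = open("/sys/kernel/mm/lru_gen/enabled", O_WRONLY);

	if (fd < 0) {
		perror("open /sys/kernel/mm/lru_gen/enabled");
		exit(EXIT_FAILURE);
	}
	if (write(fd, val, strlen(val)) < 0)
		perror("write");
	close(fd);
}

int main(void)
{
	for (int i = 0; i < 1000; i++) {
		write_knob("n");	/* disable: drain MGLRU lists */
		write_knob("y");	/* enable: refill MGLRU lists */
		usleep(10 * 1000);	/* let reclaimers race the toggle */
	}
	return 0;
}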