From df0d98475954d655571979aa061ecb07d7e00392 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Wed, 1 Apr 2026 14:52:13 -0700 Subject: sched/cache: Introduce infrastructure for cache-aware load balancing Adds infrastructure to enable cache-aware load balancing, which improves cache locality by grouping tasks that share resources within the same cache domain. This reduces cache misses and improves overall data access efficiency. In this initial implementation, threads belonging to the same process are treated as entities that likely share working sets. The mechanism tracks per-process CPU occupancy across cache domains and attempts to migrate threads toward cache-hot domains where their process already has active threads, thereby enhancing locality. This provides a basic model for cache affinity. While the current code targets the last-level cache (LLC), the approach could be extended to other domain types such as clusters (L2) or node-internal groupings. At present, the mechanism selects the CPU within an LLC that has the highest recent runtime. Subsequent patches in this series will use this information in the load-balancing path to guide task placement toward preferred LLCs. In the future, more advanced policies could be integrated through NUMA balancing -- for example, migrating a task to its preferred LLC when spare capacity exists, or swapping tasks across LLCs to improve cache affinity. Grouping of tasks could also be generalized from that of a process to be that of a NUMA group, or be user-configurable. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Chen Yu Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/6269a53221b9439b9ca00d18a9d1946fb64d8cff.1775065312.git.tim.c.chen@linux.intel.com --- include/linux/mm_types.h | 32 ++++++++++++++++++++++++++++++++ include/linux/sched.h | 24 ++++++++++++++++++++++++ 2 files changed, 56 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3cc8ae722886..67b2dfcc71ea 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1173,6 +1173,8 @@ struct mm_struct { /* MM CID related storage */ struct mm_mm_cid mm_cid; + /* sched_cache related statistics */ + struct sched_cache_stat sc_stat; #ifdef CONFIG_MMU atomic_long_t pgtables_bytes; /* size of all page tables */ #endif @@ -1575,6 +1577,36 @@ static inline unsigned int mm_cid_size(void) # define MM_CID_STATIC_SIZE 0 #endif /* CONFIG_SCHED_MM_CID */ +#ifdef CONFIG_SCHED_CACHE +void mm_init_sched(struct mm_struct *mm, + struct sched_cache_time __percpu *pcpu_sched); + +static inline int mm_alloc_sched_noprof(struct mm_struct *mm) +{ + struct sched_cache_time __percpu *pcpu_sched = + alloc_percpu_noprof(struct sched_cache_time); + + if (!pcpu_sched) + return -ENOMEM; + + mm_init_sched(mm, pcpu_sched); + return 0; +} + +#define mm_alloc_sched(...) 
alloc_hooks(mm_alloc_sched_noprof(__VA_ARGS__)) + +static inline void mm_destroy_sched(struct mm_struct *mm) +{ + free_percpu(mm->sc_stat.pcpu_sched); + mm->sc_stat.pcpu_sched = NULL; +} +#else /* !CONFIG_SCHED_CACHE */ + +static inline int mm_alloc_sched(struct mm_struct *mm) { return 0; } +static inline void mm_destroy_sched(struct mm_struct *mm) { } + +#endif /* CONFIG_SCHED_CACHE */ + struct mmu_gather; extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); diff --git a/include/linux/sched.h b/include/linux/sched.h index 8ec3b6d7d718..2bf261bcd7b6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1407,6 +1407,10 @@ struct task_struct { unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ +#ifdef CONFIG_SCHED_CACHE + struct callback_head cache_work; +#endif + struct rseq_data rseq; struct sched_mm_cid mm_cid; @@ -2407,6 +2411,26 @@ static __always_inline int task_mm_cid(struct task_struct *t) } #endif +#ifdef CONFIG_SCHED_CACHE + +struct sched_cache_time { + u64 runtime; + unsigned long epoch; +}; + +struct sched_cache_stat { + struct sched_cache_time __percpu *pcpu_sched; + raw_spinlock_t lock; + unsigned long epoch; + int cpu; +} ____cacheline_aligned_in_smp; + +#else + +struct sched_cache_stat { }; + +#endif + #ifndef MODULE #ifndef COMPILE_OFFSETS -- cgit v1.2.3 From f025ef275388742643a2c33f00a0d9c0af3112ee Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 1 Apr 2026 14:52:15 -0700 Subject: sched/cache: Record per LLC utilization to guide cache aware scheduling decisions When a system becomes busy and a process's preferred LLC is saturated with too many threads, tasks within that LLC migrate frequently. These in LLC migrations introduce latency and degrade performance. To avoid this, task aggregation should be suppressed when the preferred LLC is overloaded, which requires a metric to indicate LLC utilization. 
Record per LLC utilization/cpu capacity during periodic load balancing. These statistics will be used in later patches to decide whether tasks should be aggregated into their preferred LLC. Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/a48151b3d57f2a42a5971aaead1b7f81e69229f4.1775065312.git.tim.c.chen@linux.intel.com --- include/linux/sched/topology.h | 4 +++ kernel/sched/fair.c | 70 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 36553e14866d..159716fa0d3a 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -68,6 +68,10 @@ struct sched_domain_shared { atomic_t nr_busy_cpus; int has_idle_cores; int nr_idle_scan; +#ifdef CONFIG_SCHED_CACHE + unsigned long util_avg; + unsigned long capacity; +#endif }; struct sched_domain { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a55ada22e40c..6647d465b59e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9992,6 +9992,28 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_ return 0; } +#ifdef CONFIG_SCHED_CACHE +static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util, + unsigned long *cap) +{ + struct sched_domain_shared *sd_share; + + sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); + if (!sd_share) + return false; + + *util = READ_ONCE(sd_share->util_avg); + *cap = READ_ONCE(sd_share->capacity); + + return true; +} +#else +static inline bool get_llc_stats(int cpu, unsigned long *util, + unsigned long *cap) +{ + return false; +} +#endif /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 
*/ @@ -10948,6 +10970,53 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) return check_cpu_capacity(rq, sd); } +#ifdef CONFIG_SCHED_CACHE +/* + * Record the statistics for this scheduler group for later + * use. These values guide load balancing on aggregating tasks + * to a LLC. + */ +static void record_sg_llc_stats(struct lb_env *env, + struct sg_lb_stats *sgs, + struct sched_group *group) +{ + struct sched_domain_shared *sd_share; + int cpu; + + if (!sched_cache_enabled() || env->idle == CPU_NEWLY_IDLE) + return; + + /* Only care about sched domain spanning multiple LLCs */ + if (env->sd->child != rcu_dereference_all(per_cpu(sd_llc, env->dst_cpu))) + return; + + /* + * At this point we know this group spans a LLC domain. + * Record the statistic of this group in its corresponding + * shared LLC domain. + * Note: sd_share cannot be obtained via sd->child->shared, + * because the latter refers to the domain that covers the + * local group. Instead, sd_share should be located using + * the first CPU of the LLC group. + */ + cpu = cpumask_first(sched_group_span(group)); + sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); + if (!sd_share) + return; + + if (READ_ONCE(sd_share->util_avg) != sgs->group_util) + WRITE_ONCE(sd_share->util_avg, sgs->group_util); + + if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity)) + WRITE_ONCE(sd_share->capacity, sgs->group_capacity); +} +#else +static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_group *group) +{ +} +#endif + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. 
@@ -11035,6 +11104,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + record_sg_llc_stats(env, sgs, group); /* Computing avg_load makes sense only when group is overloaded */ if (sgs->group_type == group_overloaded) sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / -- cgit v1.2.3 From 47d8696b95f7397fe7cad2d194d550ffe82efc15 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 1 Apr 2026 14:52:18 -0700 Subject: sched/cache: Assign preferred LLC ID to processes With cache-aware scheduling enabled, each task is assigned a preferred LLC ID. This allows quick identification of the LLC domain where the task prefers to run, similar to numa_preferred_nid in NUMA balancing. Co-developed-by: Chen Yu Signed-off-by: Chen Yu Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/f2ceecba5858680349ad4ce9303a2121f0bb7272.1775065312.git.tim.c.chen@linux.intel.com --- include/linux/sched.h | 1 + init/init_task.c | 3 +++ kernel/sched/fair.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2bf261bcd7b6..d2010483cd77 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1409,6 +1409,7 @@ struct task_struct { #ifdef CONFIG_SCHED_CACHE struct callback_head cache_work; + int preferred_llc; #endif struct rseq_data rseq; diff --git a/init/init_task.c b/init/init_task.c index b5f48ebdc2b6..5d90db4ff1f8 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -215,6 +215,9 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { .numa_group = NULL, .numa_faults = NULL, #endif +#ifdef CONFIG_SCHED_CACHE + .preferred_llc = -1, +#endif #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) .kasan_depth = 1, #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7860c5bc12d7..6e78ecfb560e 100644 --- 
a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1459,11 +1459,43 @@ static unsigned long fraction_mm_sched(struct rq *rq, return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1); } +static int get_pref_llc(struct task_struct *p, struct mm_struct *mm) +{ + int mm_sched_llc = -1; + + if (!mm) + return -1; + + if (mm->sc_stat.cpu != -1) { + mm_sched_llc = llc_id(mm->sc_stat.cpu); + +#ifdef CONFIG_NUMA_BALANCING + /* + * Don't assign preferred LLC if it + * conflicts with NUMA balancing. + * This can happen when sched_setnuma() gets + * called, however it is not much of an issue + * because we expect account_mm_sched() to get + * called fairly regularly -- at a higher rate + * than sched_setnuma() at least -- and thus the + * conflict only exists for a short period of time. + */ + if (static_branch_likely(&sched_numa_balancing) && + p->numa_preferred_nid >= 0 && + cpu_to_node(mm->sc_stat.cpu) != p->numa_preferred_nid) + mm_sched_llc = -1; +#endif + } + + return mm_sched_llc; +} + static inline void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) { struct sched_cache_time *pcpu_sched; struct mm_struct *mm = p->mm; + int mm_sched_llc = -1; unsigned long epoch; if (!sched_cache_enabled()) @@ -1495,6 +1527,11 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) if (mm->sc_stat.cpu != -1) mm->sc_stat.cpu = -1; } + + mm_sched_llc = get_pref_llc(p, mm); + + if (READ_ONCE(p->preferred_llc) != mm_sched_llc) + WRITE_ONCE(p->preferred_llc, mm_sched_llc); } static void task_tick_cache(struct rq *rq, struct task_struct *p) @@ -1671,6 +1708,12 @@ void init_sched_mm(struct task_struct *p) { } static void task_tick_cache(struct rq *rq, struct task_struct *p) { } +static inline int get_pref_llc(struct task_struct *p, + struct mm_struct *mm) +{ + return -1; +} + #endif /* CONFIG_SCHED_CACHE */ /* -- cgit v1.2.3 From a8d0ca0b7f2f7b53565d1e30e509d3d74d1f5460 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 1 Apr 2026 
14:52:20 -0700 Subject: sched/cache: Introduce per CPU's tasks LLC preference counter The lowest level of sched domain for each CPU is assigned an array where each element tracks the number of tasks preferring a given LLC, indexed from 0 to max_lid. Since each CPU has its dedicated sd, this implies that each CPU will have a dedicated task LLC preference counter. For example, sd->llc_counts[3] = 2 signifies that there are 2 tasks on this runqueue which prefer to run within LLC3. The load balancer can use this information to identify busy runqueues and migrate tasks to their preferred LLC domains. This array will be reallocated at runtime during sched domain rebuild. Introduce the buffer allocation mechanism, and the statistics will be calculated in the subsequent patch. Note: the LLC preference statistics of each CPU are reset on sched domain rebuild and may undercount temporarily, until the CPU becomes idle and the count is cleared. This is a trade-off to avoid complex data synchronization across sched domain builds. 
Suggested-by: Peter Zijlstra (Intel) Suggested-by: K Prateek Nayak Co-developed-by: Chen Yu Signed-off-by: Chen Yu Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/42e79eceb8cd6be8a032401d481d101913bc5703.1775065312.git.tim.c.chen@linux.intel.com --- include/linux/sched/topology.h | 5 ++++ kernel/sched/topology.c | 62 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 66 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 159716fa0d3a..0036d6b4bd67 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -103,6 +103,11 @@ struct sched_domain { u64 max_newidle_lb_cost; unsigned long last_decay_max_lb_cost; +#ifdef CONFIG_SCHED_CACHE + unsigned int llc_max; + unsigned int *llc_counts __counted_by_ptr(llc_max); +#endif + #ifdef CONFIG_SCHEDSTATS /* sched_balance_rq() stats */ unsigned int lb_count[CPU_MAX_IDLE_TYPES]; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 1200670969bb..8954bf7900ff 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -634,6 +634,11 @@ static void destroy_sched_domain(struct sched_domain *sd) if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) kfree(sd->shared); + +#ifdef CONFIG_SCHED_CACHE + /* only the bottom sd has llc_counts array */ + kfree(sd->llc_counts); +#endif kfree(sd); } @@ -763,10 +768,18 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) if (sd && sd_degenerate(sd)) { tmp = sd; sd = sd->parent; - destroy_sched_domain(tmp); + if (sd) { struct sched_group *sg = sd->groups; +#ifdef CONFIG_SCHED_CACHE + /* move buffer to parent as child is being destroyed */ + sd->llc_counts = tmp->llc_counts; + sd->llc_max = tmp->llc_max; + /* make sure destroy_sched_domain() does not free it */ + tmp->llc_counts = NULL; + tmp->llc_max = 0; +#endif /* * sched groups hold the flags of the child sched * domain for 
convenience. Clear such flags since @@ -778,6 +791,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) sd->child = NULL; } + + destroy_sched_domain(tmp); } sched_domain_debug(sd, cpu); @@ -805,6 +820,49 @@ enum s_alloc { sa_none, }; +#ifdef CONFIG_SCHED_CACHE +static bool alloc_sd_llc(const struct cpumask *cpu_map, + struct s_data *d) +{ + struct sched_domain *sd; + unsigned int *p; + int i; + + for_each_cpu(i, cpu_map) { + sd = *per_cpu_ptr(d->sd, i); + if (!sd) + goto err; + + p = kcalloc_node(max_lid + 1, sizeof(unsigned int), + GFP_KERNEL, cpu_to_node(i)); + if (!p) + goto err; + + sd->llc_max = max_lid + 1; + sd->llc_counts = p; + } + + return true; +err: + for_each_cpu(i, cpu_map) { + sd = *per_cpu_ptr(d->sd, i); + if (sd) { + kfree(sd->llc_counts); + sd->llc_counts = NULL; + sd->llc_max = 0; + } + } + + return false; +} +#else +static bool alloc_sd_llc(const struct cpumask *cpu_map, + struct s_data *d) +{ + return false; +} +#endif + /* * Return the canonical balance CPU for this group, this is the first CPU * of this group that's also in the balance mask. @@ -2828,6 +2886,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att init_sched_groups_capacity(i, sd); } + alloc_sd_llc(cpu_map, &d); + /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { -- cgit v1.2.3 From a2b4cf39d9d333bfeb9262dbaafe3d24d405a5c0 Mon Sep 17 00:00:00 2001 From: Jianyong Wu Date: Wed, 13 May 2026 13:39:12 -0700 Subject: sched/cache: Allow only 1 thread of the process to calculate the LLC occupancy Scanning online CPUs to calculate the occupancy might be time-consuming. Only allow 1 thread of the process to scan the CPUs at the same time, which is similar to what NUMA balance does in task_numa_work(). 
Signed-off-by: Jianyong Wu Signed-off-by: Chen Yu Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/5672b52e588b855b01e5a1a17822f7c6c7237a3d.1778703694.git.tim.c.chen@linux.intel.com --- include/linux/sched.h | 1 + kernel/sched/fair.c | 11 +++++++++++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index d2010483cd77..6d883f109ba3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2423,6 +2423,7 @@ struct sched_cache_stat { struct sched_cache_time __percpu *pcpu_sched; raw_spinlock_t lock; unsigned long epoch; + unsigned long next_scan; int cpu; } ____cacheline_aligned_in_smp; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5f22e5a097cf..a759ea669d74 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1451,6 +1451,7 @@ void mm_init_sched(struct mm_struct *mm, raw_spin_lock_init(&mm->sc_stat.lock); mm->sc_stat.epoch = epoch; mm->sc_stat.cpu = -1; + mm->sc_stat.next_scan = jiffies; /* * The update to mm->sc_stat should not be reordered @@ -1661,6 +1662,7 @@ out: static void task_cache_work(struct callback_head *work) { + unsigned long next_scan, now = jiffies; struct task_struct *p = current; struct mm_struct *mm = p->mm; unsigned long m_a_occ = 0; @@ -1675,6 +1677,15 @@ static void task_cache_work(struct callback_head *work) if (p->flags & PF_EXITING) return; + next_scan = READ_ONCE(mm->sc_stat.next_scan); + if (time_before(now, next_scan)) + return; + + /* only 1 thread is allowed to scan */ + if (!try_cmpxchg(&mm->sc_stat.next_scan, &next_scan, + now + EPOCH_PERIOD)) + return; + if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) return; -- cgit v1.2.3 From deee5e27d5b608323c04dc99979e55f944016a13 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 13 May 2026 13:39:13 -0700 Subject: sched/cache: Disable cache aware scheduling for processes with high thread counts A performance regression was observed by Prateek when 
running hackbench with many threads per process (high fd count). To avoid this, processes with a large number of active threads are excluded from cache-aware scheduling. With sched_cache enabled, record the number of active threads in each process during the periodic task_cache_work(). While iterating over CPUs, if the currently running task belongs to the same process as the task that launched task_cache_work(), increment the active thread count. If the number of active threads within the process exceeds the number of cores in the LLC (the LLC size divided by the number of SMT threads per core), do not enable cache-aware scheduling. However, on systems with a smaller number of CPUs within 1 LLC, like Power10/Power11 with SMT4 and an LLC size of 4, this check effectively disables cache-aware scheduling for any process. One possible solution suggested by Peter is to use an LLC-mask instead of a single LLC value for preference. Once there are a 'few' LLCs as preference, this constraint becomes a little easier. It could be an enhancement in the future. For users who wish to perform task aggregation regardless, a debugfs knob is provided for tuning in a subsequent change. 
Suggested-by: K Prateek Nayak Suggested-by: Aaron Lu Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Tested-by: Tingyin Duan Link: https://patch.msgid.link/d076cd21a8e6c6341d1e2d927e118db770ebb650.1778703694.git.tim.c.chen@linux.intel.com --- include/linux/sched.h | 1 + kernel/sched/fair.c | 48 +++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 44 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6d883f109ba3..6701911eaaf7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2423,6 +2423,7 @@ struct sched_cache_stat { struct sched_cache_time __percpu *pcpu_sched; raw_spinlock_t lock; unsigned long epoch; + u64 nr_running_avg; unsigned long next_scan; int cpu; } ____cacheline_aligned_in_smp; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a759ea669d74..808f614fc2d2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1384,6 +1384,12 @@ static int llc_id(int cpu) return per_cpu(sd_llc_id, cpu); } +static bool invalid_llc_nr(struct mm_struct *mm, int cpu) +{ + return !fits_capacity((mm->sc_stat.nr_running_avg * cpu_smt_num_threads), + per_cpu(sd_llc_size, cpu)); +} + static void account_llc_enqueue(struct rq *rq, struct task_struct *p) { struct sched_domain *sd; @@ -1452,7 +1458,7 @@ void mm_init_sched(struct mm_struct *mm, mm->sc_stat.epoch = epoch; mm->sc_stat.cpu = -1; mm->sc_stat.next_scan = jiffies; - + mm->sc_stat.nr_running_avg = 0; /* * The update to mm->sc_stat should not be reordered * before initialization to mm's other fields, in case @@ -1574,7 +1580,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) * If this process hasn't hit task_cache_work() for a while invalidate * its preferred state. 
*/ - if (epoch - READ_ONCE(mm->sc_stat.epoch) > EPOCH_LLC_AFFINITY_TIMEOUT) { + if (epoch - READ_ONCE(mm->sc_stat.epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || + invalid_llc_nr(mm, cpu_of(rq))) { if (mm->sc_stat.cpu != -1) mm->sc_stat.cpu = -1; } @@ -1660,14 +1667,32 @@ out: cpumask_copy(cpus, cpu_online_mask); } +static inline void update_avg_scale(u64 *avg, u64 sample) +{ + int factor = per_cpu(sd_llc_size, raw_smp_processor_id()); + s64 diff = sample - *avg; + u32 divisor; + + /* + * Scale the divisor based on the number of CPUs contained + * in the LLC. This scaling ensures smaller LLC domains use + * a smaller divisor to achieve more precise sensitivity to + * changes in nr_running, while larger LLC domains are capped + * at a maximum divisor of 8 which is the default smoothing + * factor of EWMA in update_avg(). + */ + divisor = clamp_t(u32, (factor >> 2), 2, 8); + *avg += div64_s64(diff, divisor); +} + static void task_cache_work(struct callback_head *work) { unsigned long next_scan, now = jiffies; - struct task_struct *p = current; + struct task_struct *p = current, *cur; + int cpu, m_a_cpu = -1, nr_running = 0; + unsigned long curr_m_a_occ = 0; struct mm_struct *mm = p->mm; unsigned long m_a_occ = 0; - unsigned long curr_m_a_occ = 0; - int cpu, m_a_cpu = -1; cpumask_var_t cpus; WARN_ON_ONCE(work != &p->cache_work); @@ -1711,6 +1736,11 @@ static void task_cache_work(struct callback_head *work) m_occ = occ; m_cpu = i; } + + cur = rcu_dereference_all(cpu_rq(i)->curr); + if (cur && !(cur->flags & (PF_EXITING | PF_KTHREAD)) && + cur->mm == mm) + nr_running++; } /* @@ -1754,6 +1784,7 @@ static void task_cache_work(struct callback_head *work) mm->sc_stat.cpu = m_a_cpu; } + update_avg_scale(&mm->sc_stat.nr_running_avg, nr_running); free_cpumask_var(cpus); } @@ -10294,6 +10325,13 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu)) return mig_unrestricted; + /* skip cache aware load balance for too many 
threads */ + if (invalid_llc_nr(mm, dst_cpu)) { + if (mm->sc_stat.cpu != -1) + mm->sc_stat.cpu = -1; + return mig_unrestricted; + } + if (cpus_share_cache(dst_cpu, cpu)) to_pref = true; else if (cpus_share_cache(src_cpu, cpu)) -- cgit v1.2.3 From 7030513a08776b2ca70fccd5dfddf7bb5c5c88ba Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 13 May 2026 13:39:15 -0700 Subject: sched/cache: Calculate the LLC size and store it in sched_domain Cache aware scheduling needs to know the LLC size that a process can use, so as to avoid memory-intensive tasks from being over-aggregated on a single LLC. Introduce a preparation patch to add get_effective_llc_bytes() to get the LLC size that a CPU can use. The function can be further enhanced by subtracting the LLC cache ways reserved by resctrl (CAT in Intel RDT, etc). Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Tested-by: Tingyin Duan Link: https://patch.msgid.link/37afee09ff608034da0ce149e72d33b6f4698edf.1778703694.git.tim.c.chen@linux.intel.com --- drivers/base/cacheinfo.c | 23 ++++++++++ include/linux/cacheinfo.h | 1 + include/linux/sched/topology.h | 7 +++ kernel/sched/topology.c | 98 ++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 126 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c index 391ac5e3d2f5..70701d3bc81c 100644 --- a/drivers/base/cacheinfo.c +++ b/drivers/base/cacheinfo.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -68,6 +69,24 @@ bool last_level_cache_is_valid(unsigned int cpu) } +/* + * Get the cacheinfo of the LLC associated with @cpu. + * Derived from update_per_cpu_data_slice_size_cpu(). 
+ */ +struct cacheinfo *get_cpu_cacheinfo_llc(unsigned int cpu) +{ + struct cacheinfo *llc; + + if (!last_level_cache_is_valid(cpu)) + return NULL; + + llc = per_cpu_cacheinfo_idx(cpu, cache_leaves(cpu) - 1); + if (llc->type != CACHE_TYPE_DATA && llc->type != CACHE_TYPE_UNIFIED) + return NULL; + + return llc; +} + bool last_level_cache_is_shared(unsigned int cpu_x, unsigned int cpu_y) { struct cacheinfo *llc_x, *llc_y; @@ -1018,6 +1037,7 @@ static int cacheinfo_cpu_online(unsigned int cpu) goto err; if (cpu_map_shared_cache(true, cpu, &cpu_map)) update_per_cpu_data_slice_size(true, cpu, cpu_map); + sched_update_llc_bytes(cpu); return 0; err: free_cache_attributes(cpu); @@ -1036,6 +1056,9 @@ static int cacheinfo_cpu_pre_down(unsigned int cpu) free_cache_attributes(cpu); if (nr_shared > 1) update_per_cpu_data_slice_size(false, cpu, cpu_map); + + sched_update_llc_bytes(cpu); + return 0; } diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index c8f4f0a0b874..fc879ac4cc4f 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -89,6 +89,7 @@ int populate_cache_leaves(unsigned int cpu); int cache_setup_acpi(unsigned int cpu); bool last_level_cache_is_valid(unsigned int cpu); bool last_level_cache_is_shared(unsigned int cpu_x, unsigned int cpu_y); +struct cacheinfo *get_cpu_cacheinfo_llc(unsigned int cpu); int fetch_cache_info(unsigned int cpu); int detect_cache_attributes(unsigned int cpu); #ifndef CONFIG_ACPI_PPTT diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 0036d6b4bd67..fe09d3268bc9 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -106,6 +106,7 @@ struct sched_domain { #ifdef CONFIG_SCHED_CACHE unsigned int llc_max; unsigned int *llc_counts __counted_by_ptr(llc_max); + unsigned long llc_bytes; #endif #ifdef CONFIG_SCHEDSTATS @@ -265,4 +266,10 @@ static inline int task_node(const struct task_struct *p) return cpu_to_node(task_cpu(p)); } +#ifdef CONFIG_SCHED_CACHE 
+extern void sched_update_llc_bytes(unsigned int cpu); +#else +static inline void sched_update_llc_bytes(unsigned int cpu) { } +#endif + #endif /* _LINUX_SCHED_TOPOLOGY_H */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 9fc99346ef4f..7248a7279abe 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -776,9 +776,11 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) /* move buffer to parent as child is being destroyed */ sd->llc_counts = tmp->llc_counts; sd->llc_max = tmp->llc_max; + sd->llc_bytes = tmp->llc_bytes; /* make sure destroy_sched_domain() does not free it */ tmp->llc_counts = NULL; tmp->llc_max = 0; + tmp->llc_bytes = 0; #endif /* * sched groups hold the flags of the child sched @@ -831,10 +833,42 @@ DEFINE_STATIC_KEY_FALSE(sched_cache_active); /* user wants cache aware scheduling [0 or 1] */ int sysctl_sched_cache_user = 1; +/* + * Get the effective LLC size in bytes that @cpu's bottom sched_domain + * can use. A CPU within a cpuset partition can only use a proportion + * of the physical LLC, scaled by the ratio of the partition's span + * weight to the hardware LLC sharing weight. @sd should be the + * topmost domain with SD_SHARE_LLC. + * + * Returns 0 if cacheinfo is not yet populated. This happens during + * early boot when build_sched_domains() runs before the generic + * cacheinfo framework has been initialized (cacheinfo_cpu_online() + * is a device_initcall cpuhp callback). In that case, + * cacheinfo_cpu_online() will later call sched_update_llc_bytes() + * to fill in the bottom domain's llc_bytes once the cache attributes + * are available. 
+ */ +static unsigned long get_effective_llc_bytes(int cpu, + struct sched_domain *sd) +{ + struct cacheinfo *ci; + unsigned int hw_weight; + + ci = get_cpu_cacheinfo_llc(cpu); + if (!ci) + return 0; + + hw_weight = cpumask_weight(&ci->shared_cpu_map); + if (!hw_weight) + return 0; + + return div_u64((u64)ci->size * sd->span_weight, hw_weight); +} + static bool alloc_sd_llc(const struct cpumask *cpu_map, struct s_data *d) { - struct sched_domain *sd; + struct sched_domain *sd, *top_llc, *parent; unsigned int *p; int i; @@ -848,8 +882,24 @@ static bool alloc_sd_llc(const struct cpumask *cpu_map, if (!p) goto err; - sd->llc_max = max_lid + 1; - sd->llc_counts = p; + top_llc = sd; + /* + * Find the topmost SD_SHARE_LLC domain. + * Not yet attached to the CPU, so per_cpu(sd_llc, i) + * can not be used. + */ + while ((parent = rcu_dereference_protected(top_llc->parent, true)) && + (parent->flags & SD_SHARE_LLC)) + top_llc = parent; + + if (top_llc->flags & SD_SHARE_LLC) { + sd->llc_max = max_lid + 1; + sd->llc_counts = p; + sd->llc_bytes = get_effective_llc_bytes(i, top_llc); + } else { + /* avoid memory leak */ + kfree(p); + } } return true; @@ -860,6 +910,7 @@ err: kfree(sd->llc_counts); sd->llc_counts = NULL; sd->llc_max = 0; + sd->llc_bytes = 0; } } @@ -919,6 +970,47 @@ void sched_cache_active_set_unlocked(void) { return sched_cache_active_set(false); } + +/* + * Update the bottom sched_domain's llc_bytes for @cpu and all its + * LLC siblings. Called from cacheinfo_cpu_online() or + * cacheinfo_cpu_pre_down() with cpu hotplug lock held. + * + * Note: get_effective_llc_bytes() returns 0 on PowerPC. + * thus cache aware scheduling is disabled on PowerPC for + * now. PowerPC does not use the generic cacheinfo framework -- + * it has its own cacheinfo with a separate struct cache hierarchy + * and does not populates the per-CPU struct cpu_cacheinfo array + * that get_cpu_cacheinfo_llc() reads. 
+ */ +void sched_update_llc_bytes(unsigned int cpu) +{ + struct sched_domain *sd, *sdp; + unsigned int i; + + sched_domains_mutex_lock(); + + sdp = rcu_dereference_sched_domain(per_cpu(sd_llc, cpu)); + if (!sdp) + goto unlock; + + /* + * ci->shared_cpu_map is built incrementally as CPUs come + * online, so the first CPU in an LLC initially sees + * hw_weight == 1 and computes an inflated llc_bytes in + * get_effective_llc_bytes(). Re-evaluating every LLC + * sibling on each online event corrects this once the full + * shared_cpu_map is known. + */ + for_each_cpu(i, sched_domain_span(sdp)) { + sd = rcu_dereference_sched_domain(cpu_rq(i)->sd); + if (sd) + sd->llc_bytes = get_effective_llc_bytes(i, sdp); + } + +unlock: + sched_domains_mutex_unlock(); +} #else static bool alloc_sd_llc(const struct cpumask *cpu_map, struct s_data *d) -- cgit v1.2.3 From 808915f982c2a52f5d148510ecfab52284de67cf Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 13 May 2026 13:39:16 -0700 Subject: sched/cache: Avoid cache-aware scheduling for memory-heavy processes Prateek and Tingyin reported that memory-intensive workloads (such as stream) can saturate memory bandwidth and caches on the preferred LLC when sched_cache aggregates too many threads. To mitigate this, estimate a process's memory footprint by comparing its NUMA balancing fault statistics to the size of the LLC. If the footprint exceeds the LLC size, skip cache-aware scheduling. Note that footprint is only an approximation of the memory footprint, since the kernel lacks suitable metrics to estimate the real working set. If a user-provided hint is available in the future, it would be more accurate. A later patch will allow users to provide a hint to adjust this threshold. 
Suggested-by: K Prateek Nayak Suggested-by: Vern Hao Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Tested-by: Tingyin Duan Link: https://patch.msgid.link/95cf64a385bcc12f18dcebe9d59e8d3ba8bb318f.1778703694.git.tim.c.chen@linux.intel.com --- include/linux/sched.h | 1 + kernel/exit.c | 29 ++++++++++++++++++++++++ kernel/sched/fair.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 89 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6701911eaaf7..95729670929c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2425,6 +2425,7 @@ struct sched_cache_stat { unsigned long epoch; u64 nr_running_avg; unsigned long next_scan; + unsigned long footprint; int cpu; } ____cacheline_aligned_in_smp; diff --git a/kernel/exit.c b/kernel/exit.c index ede3117fa7d4..77275c26a2a1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -543,6 +543,32 @@ void mm_update_next_owner(struct mm_struct *mm) } #endif /* CONFIG_MEMCG */ +#if defined(CONFIG_SCHED_CACHE) && defined(CONFIG_NUMA_BALANCING) +/* + * Subtract the memory footprint of the current task from + * mm. + */ +static void exit_mm_sched_cache(struct mm_struct *mm) +{ + unsigned long fp, sub; + + if (!current->total_numa_faults) + return; + /* + * No lock protection due to performance considerations. + * Make sure mm->sc_stat.footprint does not become + * negative. + */ + fp = READ_ONCE(mm->sc_stat.footprint); + sub = min(fp, current->total_numa_faults); + WRITE_ONCE(mm->sc_stat.footprint, fp - sub); +} +#else +static inline void exit_mm_sched_cache(struct mm_struct *mm) +{ +} +#endif /* CONFIG_SCHED_CACHE CONFIG_NUMA_BALANCING */ + /* * Turn us into a lazy TLB process if we * aren't already.. 
@@ -554,6 +580,9 @@ static void exit_mm(void) exit_mm_release(current, mm); if (!mm) return; + + exit_mm_sched_cache(mm); + mmap_read_lock(mm); mmgrab_lazy_tlb(mm); BUG_ON(mm != current->active_mm); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index df21366ba1ca..a10116ffe0d1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1384,6 +1384,32 @@ static int llc_id(int cpu) return per_cpu(sd_llc_id, cpu); } +static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) +{ +#ifdef CONFIG_NUMA_BALANCING + unsigned long llc, footprint; + struct sched_domain *sd; + + guard(rcu)(); + + sd = rcu_dereference_sched_domain(cpu_rq(cpu)->sd); + if (!sd) + return true; + + if (static_branch_likely(&sched_numa_balancing)) { + /* + * TBD: RDT exclusive LLC ways reserved should be + * excluded. + */ + llc = sd->llc_bytes; + footprint = READ_ONCE(mm->sc_stat.footprint); + + return (llc < (footprint * PAGE_SIZE)); + } +#endif + return false; +} + static bool invalid_llc_nr(struct mm_struct *mm, struct task_struct *p, int cpu) { @@ -1463,6 +1489,7 @@ void mm_init_sched(struct mm_struct *mm, mm->sc_stat.cpu = -1; mm->sc_stat.next_scan = jiffies; mm->sc_stat.nr_running_avg = 0; + mm->sc_stat.footprint = 0; /* * The update to mm->sc_stat should not be reordered * before initialization to mm's other fields, in case @@ -1585,7 +1612,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) * its preferred state. 
*/ if (epoch - READ_ONCE(mm->sc_stat.epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || - invalid_llc_nr(mm, p, cpu_of(rq))) { + invalid_llc_nr(mm, p, cpu_of(rq)) || + exceed_llc_capacity(mm, cpu_of(rq))) { if (mm->sc_stat.cpu != -1) mm->sc_stat.cpu = -1; } @@ -1716,7 +1744,8 @@ static void task_cache_work(struct callback_head *work) return; curr_cpu = task_cpu(p); - if (invalid_llc_nr(mm, p, curr_cpu)) { + if (invalid_llc_nr(mm, p, curr_cpu) || + exceed_llc_capacity(mm, curr_cpu)) { if (mm->sc_stat.cpu != -1) mm->sc_stat.cpu = -1; @@ -3515,6 +3544,7 @@ static void task_numa_placement(struct task_struct *p) unsigned long total_faults; u64 runtime, period; spinlock_t *group_lock = NULL; + long __maybe_unused new_fp; struct numa_group *ng; /* @@ -3589,6 +3619,31 @@ static void task_numa_placement(struct task_struct *p) ng->total_faults += diff; group_faults += ng->faults[mem_idx]; } +#ifdef CONFIG_SCHED_CACHE + /* + * Per task p->numa_faults[mem_idx] converges, + * so the accumulation of each task's faults + * converges too - Given the number of threads, + * it cannot overflow an unsigned long. + * Racy with concurrent updates from other threads + * sharing this mm. Acceptable since footprint is a + * heuristic and occasional lost updates are tolerable. + * + * If a task exits, its corresponding footprint must + * be subtracted from the mm->sc_stat.footprint, otherwise + * the mm->sc_stat.footprint will not converge: + * the exiting thread's footprint remains unchanged/undecayed + * in mm->sc_stat.footprint. See exit_mm(). + * + * Lost updates and unsynchronized subtraction + * in exit_mm() can cause footprint + diff to + * go negative. Clamp to zero to prevent the + * unsigned footprint from wrapping. 
+ */ + new_fp = (long)READ_ONCE(p->mm->sc_stat.footprint) + diff; + WRITE_ONCE(p->mm->sc_stat.footprint, + max(new_fp, 0L)); +#endif } if (!ng) { @@ -10338,7 +10393,8 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, return mig_unrestricted; /* skip cache aware load balance for too many threads */ - if (invalid_llc_nr(mm, p, dst_cpu)) { + if (invalid_llc_nr(mm, p, dst_cpu) || + exceed_llc_capacity(mm, dst_cpu)) { if (mm->sc_stat.cpu != -1) mm->sc_stat.cpu = -1; return mig_unrestricted; -- cgit v1.2.3 From 03755348b8e74421f92ffed9da159175a698290b Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 13 May 2026 13:39:21 -0700 Subject: sched/cache: Fix unpaired account_llc_enqueue/dequeue There is a race condition that, after a task is enqueued on a runqueue, task_llc(p) may change due to CPU hotplug, because the llc_id is dynamically allocated and adjusted at runtime. Therefore, checking task_llc(p) to determine whether the task is being dequeued from its preferred LLC is unreliable and can cause inconsistent values. To fix this problem, record whether p is enqueued on its preferred LLC, in order to pair with account_llc_dequeue() to maintain a consistent nr_pref_llc_running per runqueue. This bug was reported by sashiko, and the solution was once suggested by Prateek. 
Fixes: 46afe3af7ead ("sched/cache: Track LLC-preferred tasks per runqueue") Suggested-by: K Prateek Nayak Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/0c8c6a1571d66792a4d2ff0103ba3cc13e059046.1778703694.git.tim.c.chen@linux.intel.com --- include/linux/sched.h | 2 ++ init/init_task.c | 1 + kernel/sched/fair.c | 31 ++++++++++++++++++++++++++++--- 3 files changed, 31 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 95729670929c..2c9e8e2edde1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1410,6 +1410,8 @@ struct task_struct { #ifdef CONFIG_SCHED_CACHE struct callback_head cache_work; int preferred_llc; + /* 1: task was enqueued to its preferred LLC, 0 otherwise */ + int pref_llc_queued; #endif struct rseq_data rseq; diff --git a/init/init_task.c b/init/init_task.c index 5d90db4ff1f8..3ecd66fbd563 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -217,6 +217,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { #endif #ifdef CONFIG_SCHED_CACHE .preferred_llc = -1, + .pref_llc_queued = 0, #endif #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) .kasan_depth = 1, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 087445ea6bc9..96c61ce366c2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1472,15 +1472,32 @@ static bool invalid_llc_nr(struct mm_struct *mm, struct task_struct *p, static void account_llc_enqueue(struct rq *rq, struct task_struct *p) { + int pref_llc, pref_llc_queued; struct sched_domain *sd; - int pref_llc; pref_llc = p->preferred_llc; if (pref_llc < 0) return; + pref_llc_queued = (pref_llc == task_llc(p)); rq->nr_llc_running++; - rq->nr_pref_llc_running += (pref_llc == task_llc(p)); + rq->nr_pref_llc_running += pref_llc_queued; + + /* + * Record whether p is enqueued on its preferred + * LLC, in order to pair 
with account_llc_dequeue() + * to maintain a consistent nr_pref_llc_running per + * runqueue. + * This is necessary because a race condition exists: + * after a task is enqueued on a runqueue, task_llc(p) + * may change due to CPU hotplug. Therefore, checking + * task_llc(p) to determine whether the task is being + * dequeued from its preferred LLC is unreliable and + * can cause inconsistent values - checking the + * p->pref_llc_queued in account_llc_dequeue() would + * be reliable. + */ + p->pref_llc_queued = pref_llc_queued; sd = rcu_dereference_all(rq->sd); if (sd && (unsigned int)pref_llc < sd->llc_max) @@ -1497,7 +1514,15 @@ static void account_llc_dequeue(struct rq *rq, struct task_struct *p) return; rq->nr_llc_running--; - rq->nr_pref_llc_running -= (pref_llc == task_llc(p)); + if (p->pref_llc_queued) { + rq->nr_pref_llc_running--; + /* + * Update the status in case + * other logic might query + * this. + */ + p->pref_llc_queued = 0; + } sd = rcu_dereference_all(rq->sd); if (sd && (unsigned int)pref_llc < sd->llc_max) { -- cgit v1.2.3 From ea19506013ad13685573e4674fbeddb790e27906 Mon Sep 17 00:00:00 2001 From: Yiyang Chen Date: Fri, 15 May 2026 00:05:05 +0800 Subject: sched/clock: Provide !HAVE_UNSTABLE_SCHED_CLOCK stub for sched_clock_stable() When CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is disabled, sched_clock() is already assumed to provide stable semantics, but the public header doesn't provide a sched_clock_stable() stub for that case. Add a header stub that always returns true and clean up the duplicate local stub in ring_buffer.c, so callers can use sched_clock_stable() unconditionally. 
Signed-off-by: Yiyang Chen Signed-off-by: Peter Zijlstra (Intel) Acked-by: Steven Rostedt Link: https://patch.msgid.link/56e45338858946cd9581b75c8bd45dd37dba52c5.1778773587.git.cyyzero16@gmail.com --- include/linux/sched/clock.h | 5 +++++ kernel/trace/ring_buffer.c | 7 ------- 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h index 196f0ca351a2..39f0a7f94bfc 100644 --- a/include/linux/sched/clock.h +++ b/include/linux/sched/clock.h @@ -33,6 +33,11 @@ extern u64 sched_clock_cpu(int cpu); extern void sched_clock_init(void); #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline int sched_clock_stable(void) +{ + return 1; +} + static inline void sched_clock_tick(void) { } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 5326924615a4..02691c3c6dd6 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3769,13 +3769,6 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, return skip_time_extend(event); } -#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -static inline bool sched_clock_stable(void) -{ - return true; -} -#endif - static void rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info) -- cgit v1.2.3 From 815c5cb76a3e5dad4fc3911b9073591dc3a29282 Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Fri, 15 May 2026 22:54:53 +0530 Subject: topology: Introduce cpu_smt_mask for CONFIG_SCHED_SMT=n Define cpu_smt_mask in case of CONFIG_SCHED_SMT=n as cpumask_of that CPU. With that config, it is expected that the kernel treats each CPU as an individual core. Using cpumask_of(cpu) reflects that. This would help to get rid of the ifdeffery that is spread across the codebase since cpu_smt_mask is defined only in case of CONFIG_SCHED_SMT=y. Note: There is no arch today which defines cpu_smt_mask unconditionally. So likely defining the cpu_smt_mask shouldn't lead to redefinition errors. 
Signed-off-by: Shrikanth Hegde Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Reviewed-by: Valentin Schneider Tested-by: K Prateek Nayak Link: https://patch.msgid.link/20260515172456.542799-2-sshegde@linux.ibm.com --- include/linux/topology.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/topology.h b/include/linux/topology.h index 6575af39fd10..709a2dcf4c73 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -230,11 +230,24 @@ static inline int cpu_to_mem(int cpu) #define topology_drawer_cpumask(cpu) cpumask_of(cpu) #endif -#if defined(CONFIG_SCHED_SMT) && !defined(cpu_smt_mask) +/* + * Defining cpu_smt_mask as cpumask_of that CPU helps to get + * rid of a lot of ifdeffery all around the codebase in case of + * CONFIG_SCHED_SMT=n. It just means there are no other siblings, which + * is what is expected. + */ +#if defined(CONFIG_SCHED_SMT) +# if !defined(cpu_smt_mask) static inline const struct cpumask *cpu_smt_mask(int cpu) { return topology_sibling_cpumask(cpu); } +# endif +#else /* !CONFIG_SCHED_SMT */ +static inline const struct cpumask *cpu_smt_mask(int cpu) +{ + return cpumask_of(cpu); +} #endif #ifndef topology_is_primary_thread -- cgit v1.2.3 From 5bc6ab2d42e545f816def21cfcdb4ba35cc74bf6 Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Fri, 15 May 2026 22:54:54 +0530 Subject: sched: Simplify ifdeffery around cpu_smt_mask Now that cpu_smt_mask is defined as cpumask_of(cpu) for CONFIG_SCHED_SMT=n, it is possible to get rid of the ifdeffery. Effectively, - This makes sched_smt_present always defined - cpumask_weight(cpumask_of(cpu)) == 1. So sched_smt_present_inc/dec will never enable the sched_smt_present. Which is expected. - Paths that were compile-time eliminated become runtime guarded using static keys. 
- Defines set_idle_cores, test_idle_cores, etc which could likely benefit the CONFIG_SCHED_SMT=n systems to use the same optimizations within the LLC at wakeups. - This will expose sched_smt_present symbol for CONFIG_SCHED_SMT=n. Likely not a concern. - There is a bloat of code CONFIG_SCHED_SMT=n. (NR_CPUS=2048) add/remove: 24/18 grow/shrink: 26/28 up/down: 6396/-3188 (3208) Total: Before=30629880, After=30633088, chg +0.01% - No code bloat for CONFIG_SCHED_SMT=y, which is expected. - Add comments around stop_core_cpuslocked on why ifdefs are not removed. - This leaves the remaining uses of CONFIG_SCHED_SMT mainly for topology building bits which has a policy based decision. Signed-off-by: Shrikanth Hegde Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Reviewed-by: Valentin Schneider Acked-by: Tejun Heo Tested-by: K Prateek Nayak Link: https://patch.msgid.link/20260515172456.542799-3-sshegde@linux.ibm.com --- include/linux/sched/smt.h | 4 ---- kernel/sched/core.c | 6 ------ kernel/sched/ext_idle.c | 6 ------ kernel/sched/fair.c | 35 ----------------------------------- kernel/sched/sched.h | 6 ------ kernel/sched/topology.c | 2 -- kernel/stop_machine.c | 5 +++++ kernel/workqueue.c | 4 ---- 8 files changed, 5 insertions(+), 63 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h index 166b19af956f..cde6679c0278 100644 --- a/include/linux/sched/smt.h +++ b/include/linux/sched/smt.h @@ -4,16 +4,12 @@ #include -#ifdef CONFIG_SCHED_SMT extern struct static_key_false sched_smt_present; static __always_inline bool sched_smt_active(void) { return static_branch_likely(&sched_smt_present); } -#else -static __always_inline bool sched_smt_active(void) { return false; } -#endif void arch_smt_update(void); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b905805bbcbe..3ae5f19c1b7e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8612,18 +8612,14 @@ static void 
cpuset_cpu_inactive(unsigned int cpu) static inline void sched_smt_present_inc(int cpu) { -#ifdef CONFIG_SCHED_SMT if (cpumask_weight(cpu_smt_mask(cpu)) == 2) static_branch_inc_cpuslocked(&sched_smt_present); -#endif } static inline void sched_smt_present_dec(int cpu) { -#ifdef CONFIG_SCHED_SMT if (cpumask_weight(cpu_smt_mask(cpu)) == 2) static_branch_dec_cpuslocked(&sched_smt_present); -#endif } int sched_cpu_activate(unsigned int cpu) @@ -8711,9 +8707,7 @@ int sched_cpu_deactivate(unsigned int cpu) */ sched_smt_present_dec(cpu); -#ifdef CONFIG_SCHED_SMT sched_core_cpu_deactivate(cpu); -#endif if (!sched_smp_initialized) return 0; diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 7468560a6d80..2bcf58e99c9b 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -79,7 +79,6 @@ static bool scx_idle_test_and_clear_cpu(int cpu) int node = scx_cpu_node_if_enabled(cpu); struct cpumask *idle_cpus = idle_cpumask(node)->cpu; -#ifdef CONFIG_SCHED_SMT /* * SMT mask should be cleared whether we can claim @cpu or not. The SMT * cluster is not wholly idle either way. This also prevents @@ -104,7 +103,6 @@ static bool scx_idle_test_and_clear_cpu(int cpu) else if (cpumask_test_cpu(cpu, idle_smts)) __cpumask_clear_cpu(cpu, idle_smts); } -#endif return cpumask_test_and_clear_cpu(cpu, idle_cpus); } @@ -622,7 +620,6 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, goto out_unlock; } -#ifdef CONFIG_SCHED_SMT /* * Use @prev_cpu's sibling if it's idle. */ @@ -634,7 +631,6 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, goto out_unlock; } } -#endif /* * Search for any idle CPU in the same LLC domain. 
@@ -714,7 +710,6 @@ static void update_builtin_idle(int cpu, bool idle) assign_cpu(cpu, idle_cpus, idle); -#ifdef CONFIG_SCHED_SMT if (sched_smt_active()) { const struct cpumask *smt = cpu_smt_mask(cpu); struct cpumask *idle_smts = idle_cpumask(node)->smt; @@ -731,7 +726,6 @@ static void update_builtin_idle(int cpu, bool idle) cpumask_andnot(idle_smts, idle_smts, smt); } } -#endif } /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 09d3acd2d2bc..233bd2ebbb73 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1555,7 +1555,6 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) static inline bool is_core_idle(int cpu) { -#ifdef CONFIG_SCHED_SMT int sibling; for_each_cpu(sibling, cpu_smt_mask(cpu)) { @@ -1565,7 +1564,6 @@ static inline bool is_core_idle(int cpu) if (!idle_cpu(sibling)) return false; } -#endif return true; } @@ -2248,7 +2246,6 @@ numa_type numa_classify(unsigned int imbalance_pct, return node_fully_busy; } -#ifdef CONFIG_SCHED_SMT /* Forward declarations of select_idle_sibling helpers */ static inline bool test_idle_cores(int cpu); static inline int numa_idle_core(int idle_core, int cpu) @@ -2266,12 +2263,6 @@ static inline int numa_idle_core(int idle_core, int cpu) return idle_core; } -#else /* !CONFIG_SCHED_SMT: */ -static inline int numa_idle_core(int idle_core, int cpu) -{ - return idle_core; -} -#endif /* !CONFIG_SCHED_SMT */ /* * Gather all necessary information to make NUMA balancing placement @@ -7778,7 +7769,6 @@ static inline int __select_idle_cpu(int cpu, struct task_struct *p) return -1; } -#ifdef CONFIG_SCHED_SMT DEFINE_STATIC_KEY_FALSE(sched_smt_present); EXPORT_SYMBOL_GPL(sched_smt_present); @@ -7888,29 +7878,6 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t return -1; } -#else /* !CONFIG_SCHED_SMT: */ - -static inline void set_idle_cores(int cpu, int val) -{ -} - -static inline bool test_idle_cores(int cpu) -{ - return false; -} - -static inline int 
select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) -{ - return __select_idle_cpu(core, p); -} - -static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) -{ - return -1; -} - -#endif /* !CONFIG_SCHED_SMT */ - /* * Scan the LLC domain for idle CPUs; this is dynamically regulated by * comparing the average scan cost (tracked in sd->avg_scan_cost) against the @@ -12002,9 +11969,7 @@ static int should_we_balance(struct lb_env *env) * idle has been found, then its not needed to check other * SMT siblings for idleness: */ -#ifdef CONFIG_SCHED_SMT cpumask_andnot(swb_cpus, swb_cpus, cpu_smt_mask(cpu)); -#endif continue; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9f63b15d309d..e476623a0c2a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1667,7 +1667,6 @@ do { \ flags = _raw_spin_rq_lock_irqsave(rq); \ } while (0) -#ifdef CONFIG_SCHED_SMT extern void __update_idle_core(struct rq *rq); static inline void update_idle_core(struct rq *rq) @@ -1676,12 +1675,7 @@ static inline void update_idle_core(struct rq *rq) __update_idle_core(rq); } -#else /* !CONFIG_SCHED_SMT: */ -static inline void update_idle_core(struct rq *rq) { } -#endif /* !CONFIG_SCHED_SMT */ - #ifdef CONFIG_FAIR_GROUP_SCHED - static inline struct task_struct *task_of(struct sched_entity *se) { WARN_ON_ONCE(!entity_is_task(se)); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 5847b83d9d55..a1f46e3f4ede 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1310,9 +1310,7 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) cpumask_copy(mask, sched_group_span(sg)); for_each_cpu(cpu, mask) { cores++; -#ifdef CONFIG_SCHED_SMT cpumask_andnot(mask, mask, cpu_smt_mask(cpu)); -#endif } sg->cores = cores; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 3fe6b0c99f3d..773d8e9ae30c 100644 --- a/kernel/stop_machine.c +++ 
b/kernel/stop_machine.c @@ -633,6 +633,11 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) EXPORT_SYMBOL_GPL(stop_machine); #ifdef CONFIG_SCHED_SMT +/* + * INTEL_IFS is the only user of this API. That selftest can + * only be compiled if SMP=y. On x86 it selects SCHED_SMT. + * Keep the ifdefs for now. + */ int stop_core_cpuslocked(unsigned int cpu, cpu_stop_fn_t fn, void *data) { const struct cpumask *smt_mask = cpu_smt_mask(cpu); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5f747f241a5f..99ef412f02a6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -8187,11 +8187,7 @@ static bool __init cpus_dont_share(int cpu0, int cpu1) static bool __init cpus_share_smt(int cpu0, int cpu1) { -#ifdef CONFIG_SCHED_SMT return cpumask_test_cpu(cpu0, cpu_smt_mask(cpu1)); -#else - return false; -#endif } static bool __init cpus_share_numa(int cpu0, int cpu1) -- cgit v1.2.3 From 9e005ed21152d4a4bb0ceea71045ff8a642a6feb Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Tue, 19 May 2026 05:14:23 +0000 Subject: sched/topology: Allow multiple domains to claim sched_domain_shared Recent optimizations of sd->shared assignment moved to allocating a single instance of per-CPU sched_domain_shared objects per s_data. Recent optimizations to select_idle_capacity() moved the sd->shared assignments to "sd_asym" domain when ASYM_CPUCAPACITY is detected but cache-aware scheduling mandates the presence of "sd_llc_shared" to compute and cache per-LLC statistics. Use an "alloc_flags" union in sched_domain_shared to claim a sched_domain_shared object per sched_domain. Allocation starts searching for an available / matching sched_domain_shared instance from the first CPU of sched_domain_span(sd) (sd can be sd_llc, or sd_asym). If the shared object is claimed by another domain, the instance corresponding to next CPU in the domain span is explored until a matching / available instance is found. 
In case of a single CPU in sched_domain_span(), the domain will be degenerated and a temporary overlap of ->shared objects across different domains is acceptable. "alloc_flags" forms a union with "nr_idle_scan" and the stale flags are left as is when the sd->shared is published. The expectation is for the first load balancing instance to correct the value just like the current behavior, except the initial value is no longer 0. Originally-by: Peter Zijlstra Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Tested-by: Andrea Righi --- include/linux/sched/topology.h | 16 ++++++++++- kernel/sched/topology.c | 63 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 69 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index fe09d3268bc9..b5d9d7c2b8ad 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -67,7 +67,21 @@ struct sched_domain_shared { atomic_t ref; atomic_t nr_busy_cpus; int has_idle_cores; - int nr_idle_scan; + union { + int nr_idle_scan; + /* + * Used during allocation to claim the sched_domain_shared + * object at multiple levels. + * + * Note: between build and the first periodic LB tick, which + * rewrites the union via update_idle_cpu_scan(), readers of + * nr_idle_scan may observe the transient SD_* flag value as + * the scan bound. The flag bits are small positive integers, + * so the effect is just a slightly relaxed scan bound for one + * window and self-heals on the first tick. 
+ */ + int alloc_flags; + }; #ifdef CONFIG_SCHED_CACHE unsigned long util_avg; unsigned long capacity; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index dbfd9657f897..df2ceb54c970 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -623,6 +623,12 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc) } while (sg != first); } +static void free_sched_domain_shared(struct sched_domain_shared *sds) +{ + if (sds && atomic_dec_and_test(&sds->ref)) + kfree(sds); +} + static void destroy_sched_domain(struct sched_domain *sd) { /* @@ -631,9 +637,7 @@ static void destroy_sched_domain(struct sched_domain *sd) * dropping group/capacity references, freeing where none remain. */ free_sched_groups(sd->groups, 1); - - if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) - kfree(sd->shared); + free_sched_domain_shared(sd->shared); #ifdef CONFIG_SCHED_CACHE /* only the bottom sd has llc_counts array */ @@ -755,7 +759,14 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) /* Pick reference to parent->shared. */ if (parent->shared) { - WARN_ON_ONCE(tmp->shared); + /* + * It is safe to free a sd->shared that + * has not been published yet. If a + * sd->shared was published, the refcount + * will end up being non-zero and it will + * not be freed here. 
+ */ + free_sched_domain_shared(tmp->shared); tmp->shared = parent->shared; parent->shared = NULL; } @@ -2916,11 +2927,45 @@ static void adjust_numa_imbalance(struct sched_domain *sd_llc) } } -static void init_sched_domain_shared(struct s_data *d, struct sched_domain *sd) +static void +init_sched_domain_shared(struct s_data *d, struct sched_domain *sd, int flags) { - int sd_id = cpumask_first(sched_domain_span(sd)); + struct sched_domain_shared *sds = NULL; + int cpu; + + /* + * Multiple domains can try to claim a shared object like + * SD_ASYM_CPUCAPACITY and SD_SHARE_LLC which can alias to + * same cpumask_first(sched_domain_span(sd)) CPU and can + * cause "nr_idle_scan" to be populated incorrectly during + * load balancing. + * + * Find the first CPU in sched_domain_span(sd) with an + * unclaimed domain (!alloc_flags) or where the alloc_flag + * matches the requested flag (SD_* flag) + * + * If the domain only has single CPU, allow temporary overlap + * in allocation since the domains will be degenerated later. + */ + for_each_cpu(cpu, sched_domain_span(sd)) { + sds = *per_cpu_ptr(d->sds, cpu); + + if (!sds->alloc_flags || + sd->span_weight == 1 || + sds->alloc_flags == flags) { + sds->alloc_flags = flags; + sd->shared = sds; + break; + } + } + + /* + * Use the sd_shared corresponding to the last + * CPU in the span if none are available. 
+ */ + if (WARN_ON_ONCE(!sd->shared)) + sd->shared = sds; - sd->shared = *per_cpu_ptr(d->sds, sd_id); /* * nr_busy_cpus is consumed only by the NOHZ kick path via * sd_balance_shared; on the asym-capacity path it is initialized but @@ -2960,7 +3005,7 @@ static bool claim_asym_sched_domain_shared(struct s_data *d, int cpu) if (!sd_asym || (sd_asym->flags & SD_NUMA)) return false; - init_sched_domain_shared(d, sd_asym); + init_sched_domain_shared(d, sd_asym, SD_ASYM_CPUCAPACITY); return true; } @@ -3115,7 +3160,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att sd = sd->parent; if (sd->flags & SD_SHARE_LLC) { - init_sched_domain_shared(&d, sd); + init_sched_domain_shared(&d, sd, SD_SHARE_LLC); /* * In presence of higher domains, adjust the -- cgit v1.2.3 From e7b63427fdb4977621d69085a97272c8856644fe Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Tue, 26 May 2026 18:42:48 +0200 Subject: sched_ext: Auto-register/unregister dl_server reservations Commit cd959a3562050d ("sched_ext: Add a DL server for sched_ext tasks") introduced an ext_server deadline server to protect sched_ext tasks from fair/RT starvation, mirroring the existing fair_server. Currently, both servers reserve their 50ms/1000ms bandwidth at boot, regardless of whether a BPF scheduler is loaded. Unused bandwidth is still reclaimed at runtime by other classes, but the static reservation prevents the RT class from implicitly using that headroom when one of the two classes is guaranteed to be empty. A sysadmin can work around this by writing /sys/kernel/debug/sched/{fair,ext}_server/cpu*/runtime, but that requires manual action and not all systems expose debugfs. 
A better approach is to make server bandwidth reservations dynamic: only the scheduling policy that is currently active should register its reservation, while the inactive one should not artificially hold capacity (keeping both reservations only when the BPF scheduler is running in partial mode): +---------------------------------------------+-------------+------------+ | BPF scheduler state | fair server | ext server | +---------------------------------------------+-------------+------------+ | not loaded (default boot) | reserved | none | | loaded full mode (!SCX_OPS_SWITCH_PARTIAL) | none | reserved | | loaded partial mode (SCX_OPS_SWITCH_PARTIAL)| reserved | reserved | +---------------------------------------------+-------------+------------+ To achieve this, introduce an "attached/detached" state for each deadline server, so the kernel can decide whether a server's bandwidth should be accounted in global bandwidth tracking. At boot, the system starts with only the fair server contributing to bandwidth accounting. When a BPF scheduler is enabled, the ext server is attached and may replace or complement the fair server depending on whether full or partial mode is used. When sched_ext is disabled, the system restores the previous deadline bandwidth values and behavior. The transition logic ensures that switching between scheduling modes is consistent and reversible, without losing runtime configuration or requiring manual intervention. 
Signed-off-by: Andrea Righi Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Link: https://patch.msgid.link/20260526164420.638711-2-arighi@nvidia.com --- include/linux/sched.h | 6 ++ kernel/sched/deadline.c | 204 ++++++++++++++++++++++++++++++++++++++++++++++-- kernel/sched/ext.c | 71 +++++++++++++++++ kernel/sched/sched.h | 4 + 4 files changed, 278 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index da6a0907a78c..8130d13850fc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -702,6 +702,11 @@ struct sched_dl_entity { * running, skipping the defer phase. * * @dl_defer_idle tracks idle state + * + * @dl_bw_attached tells if this server's bandwidth currently + * contributes to the root domain's total_bw. Only meaningful for server + * entities (@dl_server == 1). Allows toggling the reservation on/off + * without losing the configured @dl_runtime/@dl_period. */ unsigned int dl_throttled : 1; unsigned int dl_yielded : 1; @@ -713,6 +718,7 @@ struct sched_dl_entity { unsigned int dl_defer_armed : 1; unsigned int dl_defer_running : 1; unsigned int dl_defer_idle : 1; + unsigned int dl_bw_attached : 1; /* * Bandwidth enforcement timer. Each -deadline task has its diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index b60e2df8ff9d..f9e62ed08d77 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1797,7 +1797,8 @@ void dl_server_start(struct sched_dl_entity *dl_se) struct rq *rq = dl_se->rq; dl_se->dl_defer_idle = 0; - if (!dl_server(dl_se) || dl_se->dl_server_active || !dl_se->dl_runtime) + if (!dl_server(dl_se) || dl_se->dl_server_active || !dl_se->dl_runtime || + !dl_se->dl_bw_attached) return; /* @@ -1872,6 +1873,13 @@ void sched_init_dl_servers(void) dl_se->dl_server = 1; dl_se->dl_defer = 1; setup_new_dl_entity(dl_se); + + /* + * No BPF scheduler is loaded at boot, so the ext_server has no + * tasks to protect. 
Detach its bandwidth reservation, it will + * be attached when a BPF scheduler is loaded. + */ + dl_server_detach_bw(dl_se); #endif } } @@ -1882,6 +1890,9 @@ void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq) int cpu = cpu_of(rq); struct dl_bw *dl_b; + if (!dl_se->dl_bw_attached) + return; + dl_b = dl_bw_of(cpu_of(rq)); guard(raw_spinlock)(&dl_b->lock); @@ -1893,7 +1904,8 @@ void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq) int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init) { - u64 old_bw = init ? 0 : to_ratio(dl_se->dl_period, dl_se->dl_runtime); + u64 old_bw = (init || !dl_se->dl_bw_attached) ? 0 : + to_ratio(dl_se->dl_period, dl_se->dl_runtime); u64 new_bw = to_ratio(period, runtime); struct rq *rq = dl_se->rq; int cpu = cpu_of(rq); @@ -1913,7 +1925,8 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio if (init) { __add_rq_bw(new_bw, &rq->dl); __dl_add(dl_b, new_bw, cpus); - } else { + dl_se->dl_bw_attached = 1; + } else if (dl_se->dl_bw_attached) { __dl_sub(dl_b, dl_se->dl_bw, cpus); __dl_add(dl_b, new_bw, cpus); @@ -1933,6 +1946,181 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio return 0; } +/* + * Add @dl_se's bw to the root-domain accounting. + * + * Return -EBUSY if attaching would overflow root domain capacity. + */ +static int __dl_server_attach_bw_locked(struct sched_dl_entity *dl_se, + struct dl_bw *dl_b, int cpus) +{ + struct rq *rq = dl_se->rq; + unsigned long cap; + + /* + * Always update @rq->dl.this_bw, but only update @dl_b->total_bw + * (and run the overflow check it gates) while this CPU is active. + * + * This mirrors dl_server_add_bw() during root-domain rebuilds, which + * only publishes bandwidth from active CPUs into @dl_b. 
+ */ + if (cpu_active(cpu_of(rq))) { + cap = dl_bw_capacity(cpu_of(rq)); + if (__dl_overflow(dl_b, cap, 0, dl_se->dl_bw)) + return -EBUSY; + __dl_add(dl_b, dl_se->dl_bw, cpus); + } + __add_rq_bw(dl_se->dl_bw, &rq->dl); + dl_se->dl_bw_attached = 1; + + return 0; +} + +/* + * Drain @dl_se and remove its bw from the root-domain accounting. + */ +static void __dl_server_detach_bw_locked(struct sched_dl_entity *dl_se, + struct dl_bw *dl_b, int cpus) +{ + struct rq *rq = dl_se->rq; + + /* + * If the server is still active (on_rq), dequeue it via + * dl_server_stop(); task_non_contending() will either subtract + * @dl_bw from running_bw immediately (0-lag passed) or set + * dl_non_contending and arm the inactive_timer. + */ + if (dl_se->dl_server_active) + dl_server_stop(dl_se); + + /* + * Drop @dl_se's contribution from this rq's bandwidth accounting, + * mirroring the __add_rq_bw() done at attach time. + */ + dl_rq_change_utilization(rq, dl_se, 0); + + /* + * Update @dl_b only while this CPU is active, matching + * dl_server_add_bw() during root-domain rebuilds. + * + * If this CPU is inactive, its bandwidth is not currently accounted in + * @dl_b->total_bw: either attach skipped adding it, or a rebuild + * already dropped it while re-publishing active CPUs only. + * + * In that case there is nothing to subtract from @dl_b. Just clear + * @dl_se->dl_bw_attached; if the CPU becomes active again, the next + * rebuild will re-publish its bandwidth. + */ + if (cpu_active(cpu_of(rq))) + __dl_sub(dl_b, dl_se->dl_bw, cpus); + dl_se->dl_bw_attached = 0; +} + +/* + * Attach @dl_se's bandwidth to the root domain's total_bw accounting. + * + * Use to dynamically register a dl_server's bandwidth reservation while + * preserving its configured @dl_runtime / @dl_period. No-op if @dl_se is + * already attached. + * + * Returns -EBUSY if attaching would overflow the root domain capacity. 
+ */ +int dl_server_attach_bw(struct sched_dl_entity *dl_se) +{ + struct rq *rq = dl_se->rq; + int cpu = cpu_of(rq); + struct dl_bw *dl_b; + int cpus, ret; + + if (dl_se->dl_bw_attached) + return 0; + + scoped_guard (raw_spinlock, &dl_bw_of(cpu)->lock) { + dl_b = dl_bw_of(cpu); + cpus = dl_bw_cpus(cpu); + ret = __dl_server_attach_bw_locked(dl_se, dl_b, cpus); + } + if (ret) + return ret; + + /* + * The natural 0->nr_running transition that triggers dl_server_start() + * may have happened while @dl_se was still detached (e.g., between + * scx_bypass(false) and the scx_enable() re-balance loop), so kick a + * start here. + * + * dl_server_start() bails out cleanly if there's nothing to schedule or + * it's already active. Skip if @cpu is offline; the server will be + * started naturally on the first enqueue once @cpu comes back. + */ + if (cpu_online(cpu)) + dl_server_start(dl_se); + + return 0; +} + +/* + * Detach @dl_se's bandwidth from the root domain's total_bw accounting. + * + * Use to dynamically unregister a dl_server's bandwidth reservation while + * preserving its configured @dl_runtime / @dl_period. No-op if @dl_se is + * not currently attached. + */ +void dl_server_detach_bw(struct sched_dl_entity *dl_se) +{ + int cpu = cpu_of(dl_se->rq); + struct dl_bw *dl_b; + int cpus; + + if (!dl_se->dl_bw_attached) + return; + + dl_b = dl_bw_of(cpu); + guard(raw_spinlock)(&dl_b->lock); + cpus = dl_bw_cpus(cpu); + __dl_server_detach_bw_locked(dl_se, dl_b, cpus); +} + +/* + * Atomically detach @detach_se and attach @attach_se on the same rq, holding + * @dl_b->lock across both operations so a concurrent sched_setattr() cannot + * steal the bandwidth freed by the detach before the attach can claim it. + * + * Both entities must live on the same rq (same root domain). Returns the + * result of the attach: -EBUSY if attaching @attach_se would overflow root + * domain capacity (in which case both servers end up detached). 
+ */ +int dl_server_swap_bw(struct sched_dl_entity *detach_se, + struct sched_dl_entity *attach_se) +{ + struct rq *rq = detach_se->rq; + int cpu = cpu_of(rq); + struct dl_bw *dl_b; + int cpus, ret; + + WARN_ON_ONCE(attach_se->rq != rq); + + scoped_guard (raw_spinlock, &dl_bw_of(cpu)->lock) { + dl_b = dl_bw_of(cpu); + cpus = dl_bw_cpus(cpu); + + if (detach_se->dl_bw_attached) + __dl_server_detach_bw_locked(detach_se, dl_b, cpus); + + if (attach_se->dl_bw_attached) + ret = 0; + else + ret = __dl_server_attach_bw_locked(attach_se, dl_b, cpus); + } + if (ret) + return ret; + + if (cpu_online(cpu)) + dl_server_start(attach_se); + + return 0; +} + /* * Update the current task's runtime statistics (provided it is still * a -deadline task and has not been removed from the dl_rq). @@ -3233,12 +3421,12 @@ static void dl_server_add_bw(struct root_domain *rd, int cpu) struct sched_dl_entity *dl_se; dl_se = &cpu_rq(cpu)->fair_server; - if (dl_server(dl_se) && cpu_active(cpu)) + if (dl_server(dl_se) && dl_se->dl_bw_attached && cpu_active(cpu)) __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu)); #ifdef CONFIG_SCHED_CLASS_EXT dl_se = &cpu_rq(cpu)->ext_server; - if (dl_server(dl_se) && cpu_active(cpu)) + if (dl_server(dl_se) && dl_se->dl_bw_attached && cpu_active(cpu)) __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu)); #endif } @@ -3247,11 +3435,13 @@ static u64 dl_server_read_bw(int cpu) { u64 dl_bw = 0; - if (cpu_rq(cpu)->fair_server.dl_server) + if (cpu_rq(cpu)->fair_server.dl_server && + cpu_rq(cpu)->fair_server.dl_bw_attached) dl_bw += cpu_rq(cpu)->fair_server.dl_bw; #ifdef CONFIG_SCHED_CLASS_EXT - if (cpu_rq(cpu)->ext_server.dl_server) + if (cpu_rq(cpu)->ext_server.dl_server && + cpu_rq(cpu)->ext_server.dl_bw_attached) dl_bw += cpu_rq(cpu)->ext_server.dl_bw; #endif diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 345aa11b84b2..f412c4bb21c3 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5814,6 +5814,7 @@ static void scx_root_disable(struct scx_sched 
*sch) struct scx_exit_info *ei = sch->exit_info; struct scx_task_iter sti; struct task_struct *p; + bool was_switched_all; int cpu; /* guarantee forward progress and wait for descendants to be disabled */ @@ -5840,6 +5841,8 @@ static void scx_root_disable(struct scx_sched *sch) */ mutex_lock(&scx_enable_mutex); + was_switched_all = scx_switched_all(); + static_branch_disable(&__scx_switched_all); WRITE_ONCE(scx_switching_all, false); @@ -5889,10 +5892,34 @@ static void scx_root_disable(struct scx_sched *sch) /* * Invalidate all the rq clocks to prevent getting outdated * rq clocks from a previous scx scheduler. + * + * Also re-balance the dl_server bandwidth reservations: detach + * ext_server (no more sched_ext tasks) and reinstate fair_server if it + * was previously detached because we were running in full mode. + * + * Unlike the enable path, this runs on a recovery path that cannot + * fail, so we use dl_server_swap_bw() to atomically free ext_server's + * bandwidth and reclaim it for fair_server under the same dl_b lock. + * + * The swap can still fail with -EBUSY if someone bumped ext_server's + * runtime via debugfs between enable and disable; in that narrow case + * both servers end up detached and we just WARN. */ for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); + scx_rq_clock_invalidate(rq); + + scoped_guard(rq_lock_irqsave, rq) { + update_rq_clock(rq); + if (was_switched_all) { + if (WARN_ON_ONCE(dl_server_swap_bw(&rq->ext_server, + &rq->fair_server))) + pr_warn("failed to re-attach fair_server on CPU %d\n", cpu); + } else { + dl_server_detach_bw(&rq->ext_server); + } + } } /* no task is on scx, turn off all the switches and flush in-progress calls */ @@ -6810,6 +6837,31 @@ static void scx_root_enable_workfn(struct kthread_work *work) if (ret) goto err_disable; + /* + * Attach the ext_server bandwidth reservation before anything is + * committed so that we can fail the enable if the root domain cannot + * accommodate it. 
The matching fair_server detach is deferred to the + * tail of this function, after the switch is fully committed and can no + * longer fail. + * + * On failure, err_disable funnels into scx_root_disable() which + * detaches ext_server, so partially-attached state is cleaned up + * automatically. + */ + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + scoped_guard(rq_lock_irqsave, rq) { + update_rq_clock(rq); + ret = dl_server_attach_bw(&rq->ext_server); + } + if (ret) { + pr_warn("sched_ext: failed to attach ext_server on CPU %d (%d)\n", + cpu, ret); + goto err_disable; + } + } + /* * Once __scx_enabled is set, %current can be switched to SCX anytime. * This can lead to stalls as some BPF schedulers (e.g. userspace @@ -6926,6 +6978,25 @@ static void scx_root_enable_workfn(struct kthread_work *work) if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL)) static_branch_enable(&__scx_switched_all); + /* + * Detach the fair_server bandwidth reservation now that the switch + * is fully committed. In full mode (!SCX_OPS_SWITCH_PARTIAL) no + * task will ever run in the fair class, so give that bandwidth + * back to the RT class. The matching ext_server attach already + * happened earlier; this only releases bandwidth and cannot fail. + * + * In partial mode keep fair_server attached. + */ + if (scx_switched_all()) { + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + guard(rq_lock_irqsave)(rq); + update_rq_clock(rq); + dl_server_detach_bw(&rq->fair_server); + } + } + pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n", sch->ops.name, scx_switched_all() ? 
"" : " (partial)"); kobject_uevent(&sch->kobj, KOBJ_ADD); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6b48bb3074fe..332ecf8930b4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -421,6 +421,10 @@ extern void ext_server_init(struct rq *rq); extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq); extern int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init); +extern int dl_server_attach_bw(struct sched_dl_entity *dl_se); +extern void dl_server_detach_bw(struct sched_dl_entity *dl_se); +extern int dl_server_swap_bw(struct sched_dl_entity *detach_se, + struct sched_dl_entity *attach_se); static inline bool dl_server_active(struct sched_dl_entity *dl_se) { -- cgit v1.2.3 From f13beb010e4ab0735c9e46802cbcc820a8bd6467 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 12 May 2026 02:56:15 +0000 Subject: sched: Have try_to_wake_up() handle return-migration for PROXY_WAKING case This patch adds logic so try_to_wake_up() will notice if we are waking a task where blocked_on == PROXY_WAKING, and if necessary dequeue the task so the wakeup will naturally return-migrate the donor task back to a cpu it can run on. This helps performance as we do the dequeue and wakeup under the locks normally taken in the try_to_wake_up() and avoids having to do proxy_force_return() from __schedule(), which has to re-take similar locks and then force a pick again loop. This was split out from the larger proxy patch, and significantly reworked. 
Credits for the original patch go to: Peter Zijlstra (Intel) Juri Lelli Valentin Schneider Connor O'Brien Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260512025635.2840817-6-jstultz@google.com --- include/linux/sched.h | 2 +- kernel/sched/core.c | 195 +++++++++++++++++++++++++------------------------- 2 files changed, 97 insertions(+), 100 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8130d13850fc..5dea5b10ac99 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -161,7 +161,7 @@ struct user_event_mm; */ #define is_special_task_state(state) \ ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | \ - TASK_DEAD | TASK_FROZEN)) + TASK_DEAD | TASK_WAKING | TASK_FROZEN)) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP # define debug_normal_state_change(state_value) \ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2f1e85d09b94..3f71dd9c1063 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3735,6 +3735,53 @@ void update_rq_avg_idle(struct rq *rq) rq->idle_stamp = 0; } +#ifdef CONFIG_SCHED_PROXY_EXEC +static void zap_balance_callbacks(struct rq *rq); + +static inline void proxy_reset_donor(struct rq *rq) +{ + WARN_ON_ONCE(rq->donor == rq->curr); + + put_prev_set_next_task(rq, rq->donor, rq->curr); + rq_set_donor(rq, rq->curr); + zap_balance_callbacks(rq); + resched_curr(rq); +} + +/* + * Checks to see if task p has been proxy-migrated to another rq + * and needs to be returned. 
If so, we deactivate the task here + * so that it can be properly woken up on the p->wake_cpu + * (or whichever cpu select_task_rq() picks at the bottom of + * try_to_wake_up() + */ +static inline bool proxy_needs_return(struct rq *rq, struct task_struct *p) +{ + if (!task_is_blocked(p)) + return false; + + scoped_guard(raw_spinlock, &p->blocked_lock) { + /* Task is waking up; clear any blocked_on relationship */ + __clear_task_blocked_on(p, NULL); + + /* If already current, don't need to return migrate */ + if (task_current(rq, p)) + return false; + + /* If we're return migrating the rq->donor, switch it out for idle */ + if (task_current_donor(rq, p)) + proxy_reset_donor(rq); + } + block_task(rq, p, TASK_WAKING); + return true; +} +#else /* !CONFIG_SCHED_PROXY_EXEC */ +static inline bool proxy_needs_return(struct rq *rq, struct task_struct *p) +{ + return false; +} +#endif /* CONFIG_SCHED_PROXY_EXEC */ + static void ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, struct rq_flags *rf) @@ -3799,28 +3846,26 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, */ static int ttwu_runnable(struct task_struct *p, int wake_flags) { - struct rq_flags rf; - struct rq *rq; - int ret = 0; + ACQUIRE(__task_rq_lock, guard)(p); + struct rq *rq = guard.rq; - rq = __task_rq_lock(p, &rf); - if (task_on_rq_queued(p)) { - update_rq_clock(rq); - if (p->se.sched_delayed) - enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED); - if (!task_on_cpu(rq, p)) { - /* - * When on_rq && !on_cpu the task is preempted, see if - * it should preempt the task that is current now. 
- */ - wakeup_preempt(rq, p, wake_flags); - } - ttwu_do_wakeup(p); - ret = 1; - } - __task_rq_unlock(rq, p, &rf); + if (!task_on_rq_queued(p)) + return 0; - return ret; + update_rq_clock(rq); + if (p->se.sched_delayed) + enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED); + if (proxy_needs_return(rq, p)) + return 0; + if (!task_on_cpu(rq, p)) { + /* + * When on_rq && !on_cpu the task is preempted, see if + * it should preempt the task that is current now. + */ + wakeup_preempt(rq, p, wake_flags); + } + ttwu_do_wakeup(p); + return 1; } void sched_ttwu_pending(void *arg) @@ -4207,6 +4252,8 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * it disabling IRQs (this allows not taking ->pi_lock). */ WARN_ON_ONCE(p->se.sched_delayed); + /* If p is current, we know we can run here, so clear blocked_on */ + clear_task_blocked_on(p, NULL); if (!ttwu_state_match(p, state, &success)) goto out; @@ -4223,6 +4270,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { smp_mb__after_spinlock(); + if (!ttwu_state_match(p, state, &success)) break; @@ -4287,6 +4335,14 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ WRITE_ONCE(p->__state, TASK_WAKING); + /* + * We never clear the blocked_on relation on proxy_deactivate. + * If we don't clear it here, we have TASK_RUNNING + p->blocked_on + * when waking up. Since this is a fully blocked, off CPU task + * waking up, it should be safe to clear the blocked_on relation. 
+ */ + if (task_is_blocked(p)) + clear_task_blocked_on(p, NULL); /* * If the owning (remote) CPU is still in the middle of schedule() with * this task as prev, considering queueing p on the remote CPUs wake_list @@ -4331,6 +4387,16 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) wake_flags |= WF_MIGRATED; psi_ttwu_dequeue(p); set_task_cpu(p, cpu); + } else if (cpu != p->wake_cpu) { + /* + * If we were proxy-migrated to cpu, then + * select_task_rq() picks cpu instead of wake_cpu + * to return to, we won't call set_task_cpu(), + * leaving a stale wake_cpu pointing to where we + * proxy-migrated from. So just fixup wake_cpu here + * if its not correct + */ + p->wake_cpu = cpu; } ttwu_queue(p, cpu, wake_flags); @@ -6612,7 +6678,7 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p, if (signal_pending_state(task_state, p)) { WRITE_ONCE(p->__state, TASK_RUNNING); *task_state_p = TASK_RUNNING; - set_task_blocked_on_waking(p, NULL); + clear_task_blocked_on(p, NULL); return false; } @@ -6656,13 +6722,11 @@ static inline struct task_struct *proxy_resched_idle(struct rq *rq) return rq->idle; } -static bool proxy_deactivate(struct rq *rq, struct task_struct *donor) +static void proxy_deactivate(struct rq *rq, struct task_struct *donor) { unsigned long state = READ_ONCE(donor->__state); - /* Don't deactivate if the state has been changed to TASK_RUNNING */ - if (state == TASK_RUNNING) - return false; + WARN_ON_ONCE(state == TASK_RUNNING); /* * Because we got donor from pick_next_task(), it is *crucial* * that we call proxy_resched_idle() before we deactivate it. @@ -6673,7 +6737,7 @@ static bool proxy_deactivate(struct rq *rq, struct task_struct *donor) * need to be changed from next *before* we deactivate. 
*/ proxy_resched_idle(rq); - return try_to_block_task(rq, donor, &state, true); + block_task(rq, donor, state); } static inline void proxy_release_rq_lock(struct rq *rq, struct rq_flags *rf) @@ -6747,71 +6811,6 @@ static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf, proxy_reacquire_rq_lock(rq, rf); } -static void proxy_force_return(struct rq *rq, struct rq_flags *rf, - struct task_struct *p) - __must_hold(__rq_lockp(rq)) -{ - struct rq *task_rq, *target_rq = NULL; - int cpu, wake_flag = WF_TTWU; - - lockdep_assert_rq_held(rq); - WARN_ON(p == rq->curr); - - if (p == rq->donor) - proxy_resched_idle(rq); - - proxy_release_rq_lock(rq, rf); - /* - * We drop the rq lock, and re-grab task_rq_lock to get - * the pi_lock (needed for select_task_rq) as well. - */ - scoped_guard (task_rq_lock, p) { - task_rq = scope.rq; - - /* - * Since we let go of the rq lock, the task may have been - * woken or migrated to another rq before we got the - * task_rq_lock. So re-check we're on the same RQ. If - * not, the task has already been migrated and that CPU - * will handle any futher migrations. - */ - if (task_rq != rq) - break; - - /* - * Similarly, if we've been dequeued, someone else will - * wake us - */ - if (!task_on_rq_queued(p)) - break; - - /* - * Since we should only be calling here from __schedule() - * -> find_proxy_task(), no one else should have - * assigned current out from under us. But check and warn - * if we see this, then bail. 
- */ - if (task_current(task_rq, p) || task_on_cpu(task_rq, p)) { - WARN_ONCE(1, "%s rq: %i current/on_cpu task %s %d on_cpu: %i\n", - __func__, cpu_of(task_rq), - p->comm, p->pid, p->on_cpu); - break; - } - - update_rq_clock(task_rq); - deactivate_task(task_rq, p, DEQUEUE_NOCLOCK); - cpu = select_task_rq(p, p->wake_cpu, &wake_flag); - set_task_cpu(p, cpu); - target_rq = cpu_rq(cpu); - clear_task_blocked_on(p, NULL); - } - - if (target_rq) - attach_one_task(target_rq, p); - - proxy_reacquire_rq_lock(rq, rf); -} - /* * Find runnable lock owner to proxy for mutex blocked donor * @@ -6847,7 +6846,7 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) clear_task_blocked_on(p, PROXY_WAKING); return p; } - goto force_return; + goto deactivate; } /* @@ -6882,7 +6881,7 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) __clear_task_blocked_on(p, NULL); return p; } - goto force_return; + goto deactivate; } if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) { @@ -6961,12 +6960,7 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) return owner; deactivate: - if (proxy_deactivate(rq, donor)) - return NULL; - /* If deactivate fails, force return */ - p = donor; -force_return: - proxy_force_return(rq, rf, p); + proxy_deactivate(rq, p); return NULL; migrate_task: proxy_migrate_task(rq, rf, p, owner_cpu); @@ -7113,6 +7107,9 @@ pick_again: if (sched_proxy_exec()) { struct task_struct *prev_donor = rq->donor; + if (!prev_state && prev->blocked_on) + clear_task_blocked_on(prev, NULL); + rq_set_donor(rq, next); if (unlikely(next->blocked_on)) { next = find_proxy_task(rq, next, &rf); -- cgit v1.2.3 From 4c2a20413d7fb3fc3dd7adf233a4f82bb203fb58 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 12 May 2026 02:56:16 +0000 Subject: sched: Add is_blocked task flag Add a new is_blocked flag to the task struct. 
This flag is set by try_to_block_task() and cleared by ttwu_do_wakeup() and tracks if the task is blocked. Traditionally this would mirror !p->on_rq, however due to things like DELAY_DEQUEUE and PROXY_EXEC, this can diverge, so it's useful to manage separately. Additionally with this, we might be able to get rid of the p->se.sched_delayed (ab)use in the core code (eventually). Taken whole cloth from Peter's email: https://lore.kernel.org/lkml/20260501132143.GC1026330@noisy.programming.kicks-ass.net/ With a few additional p->is_blocked = 0 in a few cases where we return current if blocked_on gets zeroed or there is no owner. This may hint that these current special cases might be dropped eventually. This change also helps resolve wait-queue stalls seen with proxy-execution. See previous patch attempts for details: https://lore.kernel.org/lkml/20260430215103.2978955-2-jstultz@google.com/ Reported-by: Vineeth Pillai Suggested-by: Peter Zijlstra Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260512025635.2840817-7-jstultz@google.com --- include/linux/sched.h | 7 +++++-- kernel/sched/core.c | 16 +++++++++++++++- 2 files changed, 20 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5dea5b10ac99..ec170663f99b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -852,7 +852,11 @@ struct task_struct { struct alloc_tag *alloc_tag; #endif - int on_cpu; + u8 on_cpu; + u8 on_rq; + u8 is_blocked; + u8 __pad; + struct __call_single_node wake_entry; unsigned int wakee_flips; unsigned long wakee_flip_decay_ts; @@ -867,7 +871,6 @@ struct task_struct { */ int recent_used_cpu; int wake_cpu; - int on_rq; int prio; int static_prio; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3f71dd9c1063..c7552869d5c4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -624,6 +624,12 @@ int task_llc(const struct task_struct *p) * [ The astute
reader will observe that it is possible for two tasks on one * CPU to have ->on_cpu = 1 at the same time. ] * + * p->is_blocked <- { 0, 1 }: + * + * is set by try_to_block_task() and cleared by ttwu_do_wakeup() and tracks + * if the task is blocked. Traditionally this would mirror !p->on_rq, however + * due to things like DELAY_DEQUEUE and PROXY_EXEC, this can diverge. + * * task_cpu(p): is changed by set_task_cpu(), the rules are: * * - Don't call set_task_cpu() on a blocked task: @@ -3719,6 +3725,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) */ static inline void ttwu_do_wakeup(struct task_struct *p) { + p->is_blocked = 0; WRITE_ONCE(p->__state, TASK_RUNNING); trace_sched_wakeup(p); } @@ -4252,6 +4259,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * it disabling IRQs (this allows not taking ->pi_lock). */ WARN_ON_ONCE(p->se.sched_delayed); + WARN_ON_ONCE(p->is_blocked); /* If p is current, we know we can run here, so clear blocked_on */ clear_task_blocked_on(p, NULL); if (!ttwu_state_match(p, state, &success)) @@ -4563,6 +4571,7 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p) /* A delayed task cannot be in clone(). */ WARN_ON_ONCE(p->se.sched_delayed); + WARN_ON_ONCE(p->is_blocked); #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; @@ -6676,6 +6685,7 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p, unsigned long task_state = *task_state_p; if (signal_pending_state(task_state, p)) { + p->is_blocked = 0; WRITE_ONCE(p->__state, TASK_RUNNING); *task_state_p = TASK_RUNNING; clear_task_blocked_on(p, NULL); @@ -6683,6 +6693,8 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p, return false; } + p->is_blocked = 1; + /* * We check should_block after signal_pending because we * will want to wake the task in that case.
But if @@ -6843,6 +6855,7 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) /* if its PROXY_WAKING, do return migration or run if current */ if (mutex == PROXY_WAKING) { if (task_current(rq, p)) { + p->is_blocked = 0; clear_task_blocked_on(p, PROXY_WAKING); return p; } @@ -6878,6 +6891,7 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) * just run on this rq), or return-migrate the task. */ if (task_current(rq, p)) { + p->is_blocked = 0; __clear_task_blocked_on(p, NULL); return p; } @@ -7111,7 +7125,7 @@ pick_again: clear_task_blocked_on(prev, NULL); rq_set_donor(rq, next); - if (unlikely(next->blocked_on)) { + if (unlikely(next->is_blocked && next->blocked_on)) { next = find_proxy_task(rq, next, &rf); if (!next) { zap_balance_callbacks(rq); -- cgit v1.2.3 From 1628b25248d0742b2ce9c7cfa59cd183e35f37e1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 May 2026 02:56:17 +0000 Subject: sched: Add blocked_donor link to task for smarter mutex handoffs Add link to the task this task is proxying for, and use it so the mutex owner can do an intelligent hand-off of the mutex to the task that the owner is running on behalf. 
[jstultz: This patch was split out from larger proxy patch] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Juri Lelli Signed-off-by: Valentin Schneider Signed-off-by: Connor O'Brien Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260512025635.2840817-8-jstultz@google.com --- include/linux/sched.h | 7 ++++++ init/init_task.c | 1 + kernel/fork.c | 1 + kernel/locking/mutex.c | 60 ++++++++++++++++++++++++++++++++++++++++++++------ kernel/sched/core.c | 14 +++++++++++- 5 files changed, 75 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index ec170663f99b..e2f127a7ca0d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1250,6 +1250,13 @@ struct task_struct { struct mutex *blocked_on; /* lock we're blocked on */ raw_spinlock_t blocked_lock; + /* + * The task that is boosting this task; a back link for the current + * donor stack. Set in schedule() -> find_proxy_task() and only stable + * under preempt_disable(). 
+ */ + struct task_struct *blocked_donor; + #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER /* * Encoded lock address causing task block (lower 2 bits = type from diff --git a/init/init_task.c b/init/init_task.c index 3ecd66fbd563..674d174e2e6a 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -200,6 +200,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { .mems_allowed_seq = SEQCNT_SPINLOCK_ZERO(init_task.mems_allowed_seq, &init_task.alloc_lock), #endif + .blocked_donor = NULL, #ifdef CONFIG_RT_MUTEXES .pi_waiters = RB_ROOT_CACHED, .pi_top_task = NULL, diff --git a/kernel/fork.c b/kernel/fork.c index a679b2448234..6fcca1db0af3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2224,6 +2224,7 @@ __latent_entropy struct task_struct *copy_process( lockdep_init_task(p); p->blocked_on = NULL; /* not blocked yet */ + p->blocked_donor = NULL; /* nobody is boosting p yet */ #ifdef CONFIG_BCACHE p->sequential_io = 0; diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index a93d4c6bee1a..28677165785f 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -981,15 +981,22 @@ EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible); static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip) __releases(lock) { - struct task_struct *next = NULL; + struct task_struct *donor, *next = NULL; struct mutex_waiter *waiter; - DEFINE_WAKE_Q(wake_q); unsigned long owner; unsigned long flags; mutex_release(&lock->dep_map, ip); __release(lock); + /* + * Ensures the proxy donor stack is stable across unlock and handoff. + * Specifically, it avoids the case where current->blocked_donor is + * NULL when it is inspected while doing the unlock, but a preemption + * before taking the wake_lock would make it set and a hand-off is + * missed. + */ + guard(preempt)(); /* * Release the lock before (potentially) taking the spinlock such that * other contenders can get on with things ASAP. 
@@ -1002,6 +1009,12 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne MUTEX_WARN_ON(__owner_task(owner) != current); MUTEX_WARN_ON(owner & MUTEX_FLAG_PICKUP); + if (sched_proxy_exec() && current->blocked_donor) { + /* force handoff if we have a blocked_donor */ + owner = MUTEX_FLAG_HANDOFF; + break; + } + if (owner & MUTEX_FLAG_HANDOFF) break; @@ -1014,20 +1027,53 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne } raw_spin_lock_irqsave(&lock->wait_lock, flags); + raw_spin_lock(&current->blocked_lock); debug_mutex_unlock(lock); + + if (sched_proxy_exec()) { + /* + * If we have a task boosting current, and that task was boosting + * current through this lock, hand the lock to that task, as that + * is the highest waiter, as selected by the scheduling function. + */ + donor = current->blocked_donor; + if (donor) { + struct mutex *next_lock; + + raw_spin_lock_nested(&donor->blocked_lock, SINGLE_DEPTH_NESTING); + next_lock = __get_task_blocked_on(donor); + if (next_lock == lock) { + next = get_task_struct(donor); + __set_task_blocked_on_waking(donor, next_lock); + current->blocked_donor = NULL; + } + raw_spin_unlock(&donor->blocked_lock); + } + } + + /* + * Failing that, pick first on the wait list.
+ */ waiter = lock->first_waiter; - if (waiter) { - next = waiter->task; + if (!next && waiter) { + next = get_task_struct(waiter->task); + raw_spin_lock_nested(&next->blocked_lock, SINGLE_DEPTH_NESTING); debug_mutex_wake_waiter(lock, waiter); - set_task_blocked_on_waking(next, lock); - wake_q_add(&wake_q, next); + __set_task_blocked_on_waking(next, lock); + raw_spin_unlock(&next->blocked_lock); + } if (owner & MUTEX_FLAG_HANDOFF) __mutex_handoff(lock, next); - raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); + raw_spin_unlock(&current->blocked_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + if (next) { + wake_up_process(next); + put_task_struct(next); + } } #ifndef CONFIG_DEBUG_LOCK_ALLOC diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c7552869d5c4..4c6ceff3855e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6827,7 +6827,17 @@ static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf, * Find runnable lock owner to proxy for mutex blocked donor * * Follow the blocked-on relation: - * task->blocked_on -> mutex->owner -> task... + * + * ,-> task + * | | blocked-on + * | v + * blocked_donor | mutex + * | | owner + * | v + * `-- task + * + * and set the blocked_donor relation, this latter is used by the mutex + * code to find which (blocked) task to hand-off to. * * Lock order: * @@ -6969,6 +6979,7 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) * rq, therefore holding @rq->lock is sufficient to * guarantee its existence, as per ttwu_remote().
*/ + owner->blocked_donor = p; } WARN_ON_ONCE(owner && !owner->on_rq); return owner; @@ -7125,6 +7136,7 @@ pick_again: clear_task_blocked_on(prev, NULL); rq_set_donor(rq, next); + next->blocked_donor = NULL; if (unlikely(next->is_blocked && next->blocked_on)) { next = find_proxy_task(rq, next, &rf); if (!next) { -- cgit v1.2.3 From ec9d4f1c424134bbf30965075df78d02a5d021dc Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Tue, 26 May 2026 11:43:02 +0200 Subject: sched/proxy: Remove PROXY_WAKING Now that the proxy path uses ->is_blocked, use the '->is_blocked && !->blocked_on' state instead of PROXY_WAKING. Notably, this is where a blocked_on relation is broken but the donor task might still need a return migration. Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260526113322.596522894%40infradead.org --- include/linux/sched.h | 50 ++--------------------------------------------- kernel/locking/mutex.c | 4 ++-- kernel/locking/ww_mutex.h | 4 ++-- kernel/sched/core.c | 2 +- 4 files changed, 7 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index e2f127a7ca0d..35e6183ef615 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2205,19 +2205,10 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock) __must_hold(lock); #ifndef CONFIG_PREEMPT_RT -/* - * With proxy exec, if a task has been proxy-migrated, it may be a donor - * on a cpu that it can't actually run on. Thus we need a special state - * to denote that the task is being woken, but that it needs to be - * evaluated for return-migration before it is run. So if the task is - * blocked_on PROXY_WAKING, return migrate it before running it. - */ -#define PROXY_WAKING ((struct mutex *)(-1L)) - static inline struct mutex *__get_task_blocked_on(struct task_struct *p) { lockdep_assert_held_once(&p->blocked_lock); - return p->blocked_on == PROXY_WAKING ? 
NULL : p->blocked_on; + return p->blocked_on; } static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m) @@ -2245,7 +2236,7 @@ static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex * * blocked_on relationships, but make sure we are not * clearing the relationship with a different lock. */ - WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m && p->blocked_on != PROXY_WAKING); + WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m); p->blocked_on = NULL; } @@ -2254,35 +2245,6 @@ static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m) guard(raw_spinlock_irqsave)(&p->blocked_lock); __clear_task_blocked_on(p, m); } - -static inline void __set_task_blocked_on_waking(struct task_struct *p, struct mutex *m) -{ - /* Currently we serialize blocked_on under the task::blocked_lock */ - lockdep_assert_held_once(&p->blocked_lock); - - if (!sched_proxy_exec()) { - __clear_task_blocked_on(p, m); - return; - } - - /* Don't set PROXY_WAKING if blocked_on was already cleared */ - if (!p->blocked_on) - return; - /* - * There may be cases where we set PROXY_WAKING on tasks that were - * already set to waking, but make sure we are not changing - * the relationship with a different lock. 
- */ - WARN_ON_ONCE(m && p->blocked_on != m && p->blocked_on != PROXY_WAKING); - p->blocked_on = PROXY_WAKING; -} - -static inline void set_task_blocked_on_waking(struct task_struct *p, struct mutex *m) -{ - guard(raw_spinlock_irqsave)(&p->blocked_lock); - __set_task_blocked_on_waking(p, m); -} - #else static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) { @@ -2291,14 +2253,6 @@ static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mute static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) { } - -static inline void __set_task_blocked_on_waking(struct task_struct *p, struct rt_mutex *m) -{ -} - -static inline void set_task_blocked_on_waking(struct task_struct *p, struct rt_mutex *m) -{ -} #endif /* !CONFIG_PREEMPT_RT */ static __always_inline bool need_resched(void) diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 28677165785f..89d01f788973 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -1044,7 +1044,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne next_lock = __get_task_blocked_on(donor); if (next_lock == lock) { next = get_task_struct(donor); - __set_task_blocked_on_waking(donor, next_lock); + __clear_task_blocked_on(next, lock); current->blocked_donor = NULL; } raw_spin_unlock(&donor->blocked_lock); @@ -1060,7 +1060,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne raw_spin_lock_nested(&next->blocked_lock, SINGLE_DEPTH_NESTING); debug_mutex_wake_waiter(lock, waiter); - __set_task_blocked_on_waking(next, lock); + __clear_task_blocked_on(next, lock); raw_spin_unlock(&next->blocked_lock); } diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 6c12452097e1..d62b49b53ec3 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -324,7 +324,7 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter, * blocked_on to PROXY_WAKING. 
Otherwise we can see * circular blocked_on relationships that can't resolve. */ - set_task_blocked_on_waking(waiter->task, lock); + clear_task_blocked_on(waiter->task, lock); wake_q_add(wake_q, waiter->task); } @@ -383,7 +383,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock, * are waking the mutex owner, who may be currently * blocked on a different mutex. */ - set_task_blocked_on_waking(owner, NULL); + clear_task_blocked_on(owner, NULL); wake_q_add(wake_q, owner); } return true; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9b710313dfb3..cec2c164fab1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6872,7 +6872,7 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) for (p = donor; p->is_blocked; p = owner) { /* if its PROXY_WAKING, do return migration or run if current */ struct mutex *mutex = p->blocked_on; - if (!mutex || mutex == PROXY_WAKING) { + if (!mutex) { clear_task_blocked_on(p, mutex); if (task_current(rq, p)) { p->is_blocked = 0; -- cgit v1.2.3