From 9fe89f022c05d99c052d6bc088b82d4ff83bf463 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 27 Jan 2026 16:17:48 +0100
Subject: sched/fair: More complex proportional newidle balance

It turns out that a few workloads (easyWave, fio) have a fairly low
success rate on newidle balance, but still benefit greatly from having
it anyway.

Luckily these workloads have a fairly low newidle rate, so the cost of
doing the newidle balance is relatively low, even if unsuccessful.

Add a simple rate-based part to the newidle ratio computation, such
that low-rate newidle will still have a high newidle ratio. This cures
the easyWave and fio workloads while not affecting the schbench numbers
(which have a very high newidle rate).

Reported-by: Mario Roy
Reported-by: "Mohamed Abuelfotoh, Hazem"
Signed-off-by: Peter Zijlstra (Intel)
Tested-by: Mario Roy
Tested-by: "Mohamed Abuelfotoh, Hazem"
Link: https://patch.msgid.link/20260127151748.GA1079264@noisy.programming.kicks-ass.net
---
 include/linux/sched/topology.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 45c0022b91ce..a1e1032426dc 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -95,6 +95,7 @@ struct sched_domain {
 	unsigned int newidle_call;
 	unsigned int newidle_success;
 	unsigned int newidle_ratio;
+	u64 newidle_stamp;
 	u64 max_newidle_lb_cost;
 	unsigned long last_decay_max_lb_cost;
--
cgit v1.2.3
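
The hunk above only adds the newidle_stamp field; the computation that
uses it lives in kernel/sched/fair.c, which this include/linux-filtered
view does not show. A minimal sketch of what a rate-based blend could
look like follows; the helper name and the 1ms scale are assumptions,
not taken from the patch:

    /*
     * Illustrative sketch only -- not the patch's actual fair.c code.
     * Idea: stamp each newidle attempt, derive an attempt rate from the
     * gap, and let a long gap keep the ratio high even when the success
     * rate is low.
     */
    static unsigned int newidle_ratio_sketch(struct sched_domain *sd, u64 now)
    {
    	u64 gap = now - sd->newidle_stamp;	/* time since last attempt */
    	unsigned int success_ratio, rate_bonus;

    	sd->newidle_stamp = now;

    	/* Classic success-based ratio, scaled to [0, 100]. */
    	success_ratio = sd->newidle_call ?
    		(100 * sd->newidle_success) / sd->newidle_call : 100;

    	/*
    	 * Rate-based term: a long gap between attempts means even a
    	 * failed newidle balance is cheap relative to the idle time, so
    	 * keep the ratio high for low-rate workloads (easyWave, fio).
    	 * The 1ms normalization here is an assumed constant.
    	 */
    	rate_bonus = min_t(u64, gap / NSEC_PER_MSEC, 100);

    	return max(success_ratio, rate_bonus);
    }
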
From 49b76317592ecbaefd0969d51d02019966cc994b Mon Sep 17 00:00:00 2001
From: Randy Dunlap
Date: Sun, 1 Mar 2026 16:52:37 -0800
Subject: sched/wait: correct kernel-doc descriptions

Use the correct function name and function parameter name to avoid
these kernel-doc warnings:

Warning: include/linux/wait_bit.h:424 expecting prototype for wait_var_event_killable(). Prototype was for wait_var_event_interruptible() instead
Warning: include/linux/wait_bit.h:508 function parameter 'lock' not described in 'wait_var_event_mutex'

Signed-off-by: Randy Dunlap
Signed-off-by: Peter Zijlstra (Intel)
Link: https://patch.msgid.link/20260302005237.3473095-1-rdunlap@infradead.org
---
 include/linux/wait_bit.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
index 9e29d79fc790..ace7379d627d 100644
--- a/include/linux/wait_bit.h
+++ b/include/linux/wait_bit.h
@@ -406,7 +406,7 @@ do { \
 	    schedule())

 /**
- * wait_var_event_killable - wait for a variable to be updated and notified
+ * wait_var_event_interruptible - wait for a variable to be updated and notified
  * @var: the address of variable being waited on
  * @condition: the condition to wait for
  *
@@ -492,7 +492,7 @@ do { \
  * wait_var_event_mutex - wait for a variable to be updated under a mutex
  * @var: the address of the variable being waited on
  * @condition: condition to wait for
- * @mutex: the mutex which protects updates to the variable
+ * @lock: the mutex which protects updates to the variable
  *
  * Wait for a condition which can only be reliably tested while holding
  * a mutex. The variables assessed in the condition will normal be
--
cgit v1.2.3
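
For context on the two warnings fixed above: kernel-doc requires the
name after the comment opener and every @param tag to match the
prototype exactly. A small illustration with a hypothetical function
(wait_for_flag is not a real kernel API):

    /**
     * wait_for_flag - wait until @flag becomes true under @lock
     * @flag: the address of the flag being waited on
     * @lock: the mutex protecting @flag; writing "@mutex:" here would
     *        trigger "function parameter 'lock' not described"
     *
     * The name on the first line must match the function below it,
     * otherwise kernel-doc emits "expecting prototype for ..." as in
     * the first warning this patch fixes.
     */
    static int wait_for_flag(bool *flag, struct mutex *lock);
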
From 10febd397591d93f42adb743c2c664041e7f1bcb Mon Sep 17 00:00:00 2001
From: K Prateek Nayak
Date: Thu, 12 Mar 2026 04:44:30 +0000
Subject: sched/topology: Remove sched_domain_shared allocation with sd_data

Now that "sd->shared" assignments are using the sched_domain_shared
objects allocated with s_data, remove the sd_data based allocations.

Signed-off-by: K Prateek Nayak
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Valentin Schneider
Reviewed-by: Dietmar Eggemann
Tested-by: Dietmar Eggemann
Link: https://patch.msgid.link/20260312044434.1974-6-kprateek.nayak@amd.com
---
 include/linux/sched/topology.h |  1 -
 kernel/sched/topology.c        | 19 -------------------
 2 files changed, 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index a1e1032426dc..51c29581f15e 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -172,7 +172,6 @@ typedef int (*sched_domain_flags_f)(void);

 struct sd_data {
 	struct sched_domain *__percpu *sd;
-	struct sched_domain_shared *__percpu *sds;
 	struct sched_group *__percpu *sg;
 	struct sched_group_capacity *__percpu *sgc;
 };

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index b19d84f44669..43150591914b 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1609,9 +1609,6 @@ static void claim_allocations(int cpu, struct s_data *d)
 	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
 	*per_cpu_ptr(sdd->sd, cpu) = NULL;

-	if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
-		*per_cpu_ptr(sdd->sds, cpu) = NULL;
-
 	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
 		*per_cpu_ptr(sdd->sg, cpu) = NULL;

@@ -2390,10 +2387,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 		if (!sdd->sd)
 			return -ENOMEM;

-		sdd->sds = alloc_percpu(struct sched_domain_shared *);
-		if (!sdd->sds)
-			return -ENOMEM;
-
 		sdd->sg = alloc_percpu(struct sched_group *);
 		if (!sdd->sg)
 			return -ENOMEM;

@@ -2404,7 +2397,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map)

 		for_each_cpu(j, cpu_map) {
 			struct sched_domain *sd;
-			struct sched_domain_shared *sds;
 			struct sched_group *sg;
 			struct sched_group_capacity *sgc;

@@ -2415,13 +2407,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map)

 			*per_cpu_ptr(sdd->sd, j) = sd;

-			sds = kzalloc_node(sizeof(struct sched_domain_shared),
-					GFP_KERNEL, cpu_to_node(j));
-			if (!sds)
-				return -ENOMEM;
-
-			*per_cpu_ptr(sdd->sds, j) = sds;
-
 			sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
					GFP_KERNEL, cpu_to_node(j));
 			if (!sg)

@@ -2463,8 +2448,6 @@ static void __sdt_free(const struct cpumask *cpu_map)
 				kfree(*per_cpu_ptr(sdd->sd, j));
 			}

-			if (sdd->sds)
-				kfree(*per_cpu_ptr(sdd->sds, j));
 			if (sdd->sg)
 				kfree(*per_cpu_ptr(sdd->sg, j));
 			if (sdd->sgc)

@@ -2472,8 +2455,6 @@ static void __sdt_free(const struct cpumask *cpu_map)
 	}
 	free_percpu(sdd->sd);
 	sdd->sd = NULL;
-	free_percpu(sdd->sds);
-	sdd->sds = NULL;
 	free_percpu(sdd->sg);
 	sdd->sg = NULL;
 	free_percpu(sdd->sgc);
--
cgit v1.2.3
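
The allocation pattern being trimmed here, distilled into a sketch (not
a verbatim kernel excerpt): each sd_data member is a per-CPU array of
pointers, each slot gets a node-local object, and claim_allocations()
later NULLs the slots whose objects were taken into use so that
__sdt_free() only frees the leftovers.

    /* Sketch of the surviving sd_data pattern, using the sg member. */
    static int sdt_alloc_pattern(struct sd_data *sdd, const struct cpumask *cpu_map)
    {
    	int j;

    	sdd->sg = alloc_percpu(struct sched_group *);
    	if (!sdd->sg)
    		return -ENOMEM;	/* caller unwinds via __sdt_free() */

    	for_each_cpu(j, cpu_map) {
    		struct sched_group *sg;

    		/* Node-local allocation, trailing space for the cpumask. */
    		sg = kzalloc_node(sizeof(*sg) + cpumask_size(),
    				  GFP_KERNEL, cpu_to_node(j));
    		if (!sg)
    			return -ENOMEM;

    		*per_cpu_ptr(sdd->sg, j) = sg;
    	}
    	return 0;
    }

This patch simply deletes the sds instance of that pattern now that the
sched_domain_shared objects come from elsewhere.
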
From 8ca12326f592f7554acf2788ecb1c5c954dcf31c Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann
Date: Mon, 16 Mar 2026 00:36:22 +0100
Subject: PM: EM: Switch to rcu_dereference_all() in wakeup path

em_cpu_energy() is part of the EAS (Fair) task wakeup path. Now that
rcu_read_{,un}lock() have been removed from find_energy_efficient_cpu(),
switch to rcu_dereference_all() and check for rcu_read_lock_any_held()
in em_cpu_energy() as well.

The EAS (Fair) task wakeup path is a preempt/IRQ-disabled region, so
rcu_read_{,un}lock() can be removed.

Signed-off-by: Dietmar Eggemann
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: K Prateek Nayak
Link: https://patch.msgid.link/5b1228b7-5949-4a45-9f62-e8ce936de694@arm.com
---
 include/linux/energy_model.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index e7497f804644..c909a8ba22e8 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -248,7 +248,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 	struct em_perf_state *ps;
 	int i;

-	WARN_ONCE(!rcu_read_lock_held(), "EM: rcu read lock needed\n");
+	lockdep_assert(rcu_read_lock_any_held());

 	if (!sum_util)
 		return 0;

@@ -267,7 +267,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 	 * Find the lowest performance state of the Energy Model above the
 	 * requested performance.
 	 */
-	em_table = rcu_dereference(pd->em_table);
+	em_table = rcu_dereference_all(pd->em_table);
 	i = em_pd_get_efficient_state(em_table->state, pd, max_util);
 	ps = &em_table->state[i];
--
cgit v1.2.3
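
The point of the switch: rcu_read_lock_any_held() is satisfied by any
RCU-protected context, including preempt- or IRQ-disabled regions, so a
reader on the wakeup path no longer needs an explicit rcu_read_lock().
A sketch of the resulting reader pattern, mirroring the calls the patch
itself uses (the helper name is invented for illustration):

    /* Caller runs with preemption/IRQs disabled, e.g. the wakeup path. */
    static unsigned long read_em_cost_sketch(struct em_perf_domain *pd)
    {
    	struct em_perf_table *em_table;

    	/* Holds for rcu_read_lock(), preempt-off and IRQ-off alike. */
    	lockdep_assert(rcu_read_lock_any_held());

    	/* The _all variant accepts all RCU flavors, not just rcu_read_lock(). */
    	em_table = rcu_dereference_all(pd->em_table);

    	return em_table->state[0].cost;
    }
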
From e379dce8af11d8d6040b4348316a499bfd174bfb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 23 Mar 2026 10:36:27 +0100
Subject: sched/topology: Fix sched_domain_span()

Commit 8e8e23dea43e ("sched/topology: Compute sd_weight considering
cpuset partitions") ends up relying on the fact that structure
initialization should not touch the flexible array.

However, the official GCC specification for "Arrays of Length Zero" [*]
says:

  Although the size of a zero-length array is zero, an array member of
  this kind may increase the size of the enclosing type as a result of
  tail padding.

Additionally, structure initialization will zero tail padding. The end
result is that, since offsetof(*type, member) < sizeof(*type),
structure initialization will clobber the flex array.

Luckily, the way flexible array sizes are calculated is:

  sizeof(*type) + count * sizeof(*type->member)

This means we have the complete size of the flex array *outside* of
sizeof(*type), so use that instead of relying on the broken flex array
definition.

[*] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html

Fixes: 8e8e23dea43e ("sched/topology: Compute sd_weight considering cpuset partitions")
Reported-by: Nathan Chancellor
Debugged-by: K Prateek Nayak
Signed-off-by: Peter Zijlstra (Intel)
Tested-by: Jon Hunter
Tested-by: Chen Yu
Tested-by: K Prateek Nayak
Tested-by: Nathan Chancellor
Link: https://patch.msgid.link/20260323093627.GY3738010@noisy.programming.kicks-ass.net
---
 include/linux/sched/topology.h | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 51c29581f15e..36553e14866d 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -142,18 +142,30 @@ struct sched_domain {
 	unsigned int span_weight;

 	/*
-	 * Span of all CPUs in this domain.
+	 * See sched_domain_span(), on why flex arrays are broken.
 	 *
-	 * NOTE: this field is variable length. (Allocated dynamically
-	 * by attaching extra space to the end of the structure,
-	 * depending on how many CPUs the kernel has booted up with)
-	 */
-	unsigned long span[];
+	 */
 };

 static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
 {
-	return to_cpumask(sd->span);
+	/*
+	 * Turns out that C flexible arrays are fundamentally broken since it
+	 * is allowed for offsetof(*sd, span) < sizeof(*sd), this means that
+	 * structure initialization *sd = { ... }; which writes every byte
+	 * inside sizeof(*type), will over-write the start of the flexible
+	 * array.
+	 *
+	 * Luckily, the way we allocate sched_domain is by:
+	 *
+	 *   sizeof(*sd) + cpumask_size()
+	 *
+	 * this means that we have sufficient space for the whole flex array
+	 * *outside* of sizeof(*sd). So use that, and avoid using sd->span.
+	 */
+	unsigned long *bitmap = (void *)sd + sizeof(*sd);
+	return to_cpumask(bitmap);
 }

 extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
--
cgit v1.2.3
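
The tail-padding overlap described above is easy to reproduce outside
the kernel. A standalone userspace demonstration (illustrative, GCC on
an LP64 target assumed; whether struct assignment zeroes padding is
compiler behavior, but GCC documents that it does):

    #include <stdio.h>
    #include <stdlib.h>
    #include <stddef.h>

    struct S {
    	long a;		/* forces 8-byte struct alignment */
    	int b;		/* occupies bytes 8..12 */
    	short flex[];	/* 2-byte alignment: may land at offset 12 */
    };

    int main(void)
    {
    	struct S *s = malloc(sizeof(struct S) + 4 * sizeof(short));

    	s->flex[0] = 7;			/* lives inside the tail padding */
    	*s = (struct S){ .a = 1, .b = 2 };	/* zeroes padding too */
    	printf("offsetof=%zu sizeof=%zu flex[0]=%d\n",
    	       offsetof(struct S, flex), sizeof(struct S), s->flex[0]);
    	/* GCC/x86-64 typically prints: offsetof=12 sizeof=16 flex[0]=0
    	 * -- the struct assignment clobbered the flex array. */
    	free(s);
    	return 0;
    }

Placing the bitmap at (void *)sd + sizeof(*sd), as the patch does,
sidesteps the question of where span[] lands relative to sizeof(*sd).
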
From fa4a1ff8ab235a308d8c983827657a69649185fd Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Tue, 24 Mar 2026 19:13:19 +0000
Subject: locking: Add task::blocked_lock to serialize blocked_on state

So far, we have been able to utilize the mutex::wait_lock for
serializing the blocked_on state, but when we move to proxying across
runqueues, we will need to add more state and a way to serialize
changes to this state in contexts where we don't hold the
mutex::wait_lock.

So introduce the task::blocked_lock, which nests under the
mutex::wait_lock in the locking order, and rework the locking to use it.

Signed-off-by: John Stultz
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: K Prateek Nayak
Link: https://patch.msgid.link/20260324191337.1841376-5-jstultz@google.com
---
 include/linux/sched.h        | 48 ++++++++++++++++----------------------------
 init/init_task.c             |  1 +
 kernel/fork.c                |  1 +
 kernel/locking/mutex-debug.c |  4 ++--
 kernel/locking/mutex.c       | 40 +++++++++++++++++++++++-------------
 kernel/locking/mutex.h       |  6 ++++++
 kernel/locking/ww_mutex.h    |  4 ++--
 kernel/sched/core.c          |  4 +++-
 8 files changed, 58 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5a5d3dbc9cdf..2eef9bc6daaa 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1238,6 +1238,7 @@ struct task_struct {
 #endif

 	struct mutex			*blocked_on;	/* lock we're blocked on */
+	raw_spinlock_t			blocked_lock;

 #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
 	/*
@@ -2181,57 +2182,42 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock) __must_hold(lock);
 #ifndef CONFIG_PREEMPT_RT
 static inline struct mutex *__get_task_blocked_on(struct task_struct *p)
 {
-	struct mutex *m = p->blocked_on;
-
-	if (m)
-		lockdep_assert_held_once(&m->wait_lock);
-	return m;
+	lockdep_assert_held_once(&p->blocked_lock);
+	return p->blocked_on;
 }

 static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m)
 {
-	struct mutex *blocked_on = READ_ONCE(p->blocked_on);
-
 	WARN_ON_ONCE(!m);
 	/* The task should only be setting itself as blocked */
 	WARN_ON_ONCE(p != current);
-	/* Currently we serialize blocked_on under the mutex::wait_lock */
-	lockdep_assert_held_once(&m->wait_lock);
+	/* Currently we serialize blocked_on under the task::blocked_lock */
+	lockdep_assert_held_once(&p->blocked_lock);
 	/*
 	 * Check ensure we don't overwrite existing mutex value
 	 * with a different mutex. Note, setting it to the same
 	 * lock repeatedly is ok.
 	 */
-	WARN_ON_ONCE(blocked_on && blocked_on != m);
-	WRITE_ONCE(p->blocked_on, m);
-}
-
-static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m)
-{
-	guard(raw_spinlock_irqsave)(&m->wait_lock);
-	__set_task_blocked_on(p, m);
+	WARN_ON_ONCE(p->blocked_on && p->blocked_on != m);
+	p->blocked_on = m;
 }

 static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m)
 {
-	if (m) {
-		struct mutex *blocked_on = READ_ONCE(p->blocked_on);
-
-		/* Currently we serialize blocked_on under the mutex::wait_lock */
-		lockdep_assert_held_once(&m->wait_lock);
-		/*
-		 * There may be cases where we re-clear already cleared
-		 * blocked_on relationships, but make sure we are not
-		 * clearing the relationship with a different lock.
-		 */
-		WARN_ON_ONCE(blocked_on && blocked_on != m);
-	}
-	WRITE_ONCE(p->blocked_on, NULL);
+	/* Currently we serialize blocked_on under the task::blocked_lock */
+	lockdep_assert_held_once(&p->blocked_lock);
+	/*
+	 * There may be cases where we re-clear already cleared
+	 * blocked_on relationships, but make sure we are not
+	 * clearing the relationship with a different lock.
+	 */
+	WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m);
+	p->blocked_on = NULL;
 }

 static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m)
 {
-	guard(raw_spinlock_irqsave)(&m->wait_lock);
+	guard(raw_spinlock_irqsave)(&p->blocked_lock);
 	__clear_task_blocked_on(p, m);
 }
 #else

diff --git a/init/init_task.c b/init/init_task.c
index 5c838757fc10..b5f48ebdc2b6 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -169,6 +169,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
 	.journal_info	= NULL,
 	INIT_CPU_TIMERS(init_task)
 	.pi_lock	= __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock),
+	.blocked_lock	= __RAW_SPIN_LOCK_UNLOCKED(init_task.blocked_lock),
 	.timer_slack_ns	= 50000, /* 50 usec default slack */
 	.thread_pid	= &init_struct_pid,
 	.thread_node	= LIST_HEAD_INIT(init_signals.thread_head),

diff --git a/kernel/fork.c b/kernel/fork.c
index bc2bf58b93b6..079802cb6100 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2076,6 +2076,7 @@ __latent_entropy struct task_struct *copy_process(
 	ftrace_graph_init_task(p);

 	rt_mutex_init_task(p);
+	raw_spin_lock_init(&p->blocked_lock);

 	lockdep_assert_irqs_enabled();
 #ifdef CONFIG_PROVE_LOCKING

diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 2c6b02d4699b..cc6aa9c6e981 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -54,13 +54,13 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
 	lockdep_assert_held(&lock->wait_lock);

 	/* Current thread can't be already blocked (since it's executing!) */
-	DEBUG_LOCKS_WARN_ON(__get_task_blocked_on(task));
+	DEBUG_LOCKS_WARN_ON(get_task_blocked_on(task));
 }

 void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
 			 struct task_struct *task)
 {
-	struct mutex *blocked_on = __get_task_blocked_on(task);
+	struct mutex *blocked_on = get_task_blocked_on(task);

 	DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
 	DEBUG_LOCKS_WARN_ON(waiter->task != task);

diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 2a1d165b3167..4aa79bcab08c 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -656,6 +656,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 		goto err_early_kill;
 	}

+	raw_spin_lock(&current->blocked_lock);
 	__set_task_blocked_on(current, lock);
 	set_current_state(state);
 	trace_contention_begin(lock, LCB_F_MUTEX);
@@ -669,8 +670,9 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 		 * the handoff.
 		 */
 		if (__mutex_trylock(lock))
-			goto acquired;
+			break;

+		raw_spin_unlock(&current->blocked_lock);
 		/*
 		 * Check for signals and kill conditions while holding
 		 * wait_lock. This ensures the lock cancellation is ordered
@@ -693,12 +695,14 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas

 		first = __mutex_waiter_is_first(lock, &waiter);

+		raw_spin_lock_irqsave(&lock->wait_lock, flags);
+		raw_spin_lock(&current->blocked_lock);
 		/*
 		 * As we likely have been woken up by task
 		 * that has cleared our blocked_on state, re-set
 		 * it to the lock we are trying to acquire.
 		 */
-		set_task_blocked_on(current, lock);
+		__set_task_blocked_on(current, lock);
 		set_current_state(state);
 		/*
 		 * Here we order against unlock; we must either see it change
@@ -709,25 +713,33 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 			break;

 		if (first) {
-			trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
+			bool opt_acquired;
+
 			/*
 			 * mutex_optimistic_spin() can call schedule(), so
-			 * clear blocked on so we don't become unselectable
+			 * we need to release these locks before calling it,
+			 * and clear blocked on so we don't become unselectable
 			 * to run.
 			 */
-			clear_task_blocked_on(current, lock);
-			if (mutex_optimistic_spin(lock, ww_ctx, &waiter))
+			__clear_task_blocked_on(current, lock);
+			raw_spin_unlock(&current->blocked_lock);
+			raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+			trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
+			opt_acquired = mutex_optimistic_spin(lock, ww_ctx, &waiter);
+
+			raw_spin_lock_irqsave(&lock->wait_lock, flags);
+			raw_spin_lock(&current->blocked_lock);
+			__set_task_blocked_on(current, lock);
+
+			if (opt_acquired)
 				break;
-			set_task_blocked_on(current, lock);
 			trace_contention_begin(lock, LCB_F_MUTEX);
 		}
-
-		raw_spin_lock_irqsave(&lock->wait_lock, flags);
 	}
-	raw_spin_lock_irqsave(&lock->wait_lock, flags);
-acquired:
 	__clear_task_blocked_on(current, lock);
 	__set_current_state(TASK_RUNNING);
+	raw_spin_unlock(&current->blocked_lock);

 	if (ww_ctx) {
 		/*
@@ -756,11 +768,11 @@ skip_wait:
 	return 0;

 err:
-	__clear_task_blocked_on(current, lock);
+	clear_task_blocked_on(current, lock);
 	__set_current_state(TASK_RUNNING);
 	__mutex_remove_waiter(lock, &waiter);
 err_early_kill:
-	WARN_ON(__get_task_blocked_on(current));
+	WARN_ON(get_task_blocked_on(current));
 	trace_contention_end(lock, ret);
 	raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
 	debug_mutex_free_waiter(&waiter);
@@ -971,7 +983,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
 		next = waiter->task;

 		debug_mutex_wake_waiter(lock, waiter);
-		__clear_task_blocked_on(next, lock);
+		clear_task_blocked_on(next, lock);
 		wake_q_add(&wake_q, next);
 	}

diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 9ad4da8cea00..7a8ba13fee94 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -47,6 +47,12 @@ static inline struct task_struct *__mutex_owner(struct mutex *lock)
 	return (struct task_struct *)(atomic_long_read(&lock->owner) & ~MUTEX_FLAGS);
 }

+static inline struct mutex *get_task_blocked_on(struct task_struct *p)
+{
+	guard(raw_spinlock_irqsave)(&p->blocked_lock);
+	return __get_task_blocked_on(p);
+}
+
 #ifdef CONFIG_DEBUG_MUTEXES
 extern void debug_mutex_lock_common(struct mutex *lock,
 				    struct mutex_waiter *waiter);

diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
index 31a785afee6c..e4a81790ea7d 100644
--- a/kernel/locking/ww_mutex.h
+++ b/kernel/locking/ww_mutex.h
@@ -289,7 +289,7 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
 	 * blocked_on pointer. Otherwise we can see circular
 	 * blocked_on relationships that can't resolve.
 	 */
-	__clear_task_blocked_on(waiter->task, lock);
+	clear_task_blocked_on(waiter->task, lock);
 	wake_q_add(wake_q, waiter->task);
 }

@@ -347,7 +347,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock,
 	 * are waking the mutex owner, who may be currently
 	 * blocked on a different mutex.
 	 */
-	__clear_task_blocked_on(owner, NULL);
+	clear_task_blocked_on(owner, NULL);
 	wake_q_add(wake_q, owner);
 }
 return true;

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5b7f378af042..1913dbc68eb9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6584,6 +6584,7 @@ static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *d
  *   p->pi_lock
  *     rq->lock
  *       mutex->wait_lock
+ *         p->blocked_lock
  *
  * Returns the task that is going to be used as execution context (the one
  * that is actually going to be run on cpu_of(rq)).
@@ -6603,8 +6604,9 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
 		 * and ensure @owner sticks around.
 		 */
 		guard(raw_spinlock)(&mutex->wait_lock);
+		guard(raw_spinlock)(&p->blocked_lock);

-		/* Check again that p is blocked with wait_lock held */
+		/* Check again that p is blocked with blocked_lock held */
 		if (mutex != __get_task_blocked_on(p)) {
 			/*
 			 * Something changed in the blocked_on chain and
--
cgit v1.2.3
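
The locking rule this patch establishes, distilled into a sketch (not a
verbatim kernel excerpt): blocked_lock nests under mutex::wait_lock, so
any path needing both takes them in that order and releases in reverse.

    /* Sketch of the nesting order introduced above. */
    static void blocked_on_update_sketch(struct mutex *lock)
    {
    	unsigned long flags;

    	raw_spin_lock_irqsave(&lock->wait_lock, flags);	/* outer lock */
    	raw_spin_lock(&current->blocked_lock);		/* inner lock */

    	__set_task_blocked_on(current, lock);

    	raw_spin_unlock(&current->blocked_lock);
    	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
    }

Paths that only touch blocked_on state can now take just blocked_lock
(as clear_task_blocked_on() does), which is exactly what the upcoming
cross-runqueue proxying needs in contexts without the wait_lock.
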
From 2d7622669836dcbbb449741b4e6c503ffe005c25 Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Tue, 24 Mar 2026 19:13:21 +0000
Subject: sched/locking: Add special p->blocked_on==PROXY_WAKING value for proxy return-migration

As we add functionality to proxy execution, we may migrate a donor
task to a runqueue where it can't run due to cpu affinity. Thus, we
must be careful to ensure we return-migrate the task back to a cpu
in its cpumask when it becomes unblocked.

Peter helpfully provided the following example with pictures:

"Suppose we have a ww_mutex cycle:

         ,-+-* Mutex-1 <-.
  Task-A ---'             |
     |               ,-- Task-B
     `-> Mutex-2 *-+-'

Where Task-A holds Mutex-1 and tries to acquire Mutex-2, and where
Task-B holds Mutex-2 and tries to acquire Mutex-1. Then the
blocked_on->owner chain will go in circles.

  Task-A -> Mutex-2
     ^          |
     |          v
  Mutex-1 <- Task-B

We need two things:

 - find_proxy_task() to stop iterating the circle;

 - the woken task to 'unblock' and run, such that it can back-off
   and re-try the transaction.

Now, the current code [without this patch] does:

  __clear_task_blocked_on();
  wake_q_add();

And surely clearing ->blocked_on is sufficient to break the cycle.
Suppose it is Task-B that is made to back-off, then we have:

  Task-A -> Mutex-2 -> Task-B (no further blocked_on)

and it would attempt to run Task-B. Or worse, it could directly pick
Task-B and run it, without ever getting into find_proxy_task().

Now, here is a problem because Task-B might not be runnable on the
CPU it is currently on; and because !task_is_blocked() we don't get
into the proxy paths, so nobody is going to fix this up.

Ideally we would have dequeued Task-B alongside of clearing
->blocked_on, but alas, [the lock ordering prevents us from getting
the task_rq_lock() and] spoils things."

Thus we need more than just a binary concept of the task being blocked
on a mutex or not. So allow setting blocked_on to PROXY_WAKING as a
special value which specifies the task is no longer blocked, but needs
to be evaluated for return migration *before* it can be run.

This will then be used in a later patch to handle proxy
return-migration.

Signed-off-by: John Stultz
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: K Prateek Nayak
Link: https://patch.msgid.link/20260324191337.1841376-7-jstultz@google.com
---
 include/linux/sched.h     | 51 +++++++++++++++++++++++++++++++++++++++++++++--
 kernel/locking/mutex.c    |  2 +-
 kernel/locking/ww_mutex.h | 16 ++++++++--------
 kernel/sched/core.c       | 16 ++++++++++++++++
 4 files changed, 74 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2eef9bc6daaa..8ec3b6d7d718 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2180,10 +2180,20 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock) __must_hold(lock);
 })

 #ifndef CONFIG_PREEMPT_RT
+
+/*
+ * With proxy exec, if a task has been proxy-migrated, it may be a donor
+ * on a cpu that it can't actually run on. Thus we need a special state
+ * to denote that the task is being woken, but that it needs to be
+ * evaluated for return-migration before it is run. So if the task is
+ * blocked_on PROXY_WAKING, return migrate it before running it.
+ */
+#define PROXY_WAKING ((struct mutex *)(-1L))
+
 static inline struct mutex *__get_task_blocked_on(struct task_struct *p)
 {
 	lockdep_assert_held_once(&p->blocked_lock);
-	return p->blocked_on;
+	return p->blocked_on == PROXY_WAKING ? NULL : p->blocked_on;
 }

 static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m)
@@ -2211,7 +2221,7 @@ static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *
 	 * blocked_on relationships, but make sure we are not
 	 * clearing the relationship with a different lock.
 	 */
-	WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m);
+	WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m && p->blocked_on != PROXY_WAKING);
 	p->blocked_on = NULL;
 }

@@ -2220,6 +2230,35 @@ static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m)
 	guard(raw_spinlock_irqsave)(&p->blocked_lock);
 	__clear_task_blocked_on(p, m);
 }
+
+static inline void __set_task_blocked_on_waking(struct task_struct *p, struct mutex *m)
+{
+	/* Currently we serialize blocked_on under the task::blocked_lock */
+	lockdep_assert_held_once(&p->blocked_lock);
+
+	if (!sched_proxy_exec()) {
+		__clear_task_blocked_on(p, m);
+		return;
+	}
+
+	/* Don't set PROXY_WAKING if blocked_on was already cleared */
+	if (!p->blocked_on)
+		return;
+	/*
+	 * There may be cases where we set PROXY_WAKING on tasks that were
+	 * already set to waking, but make sure we are not changing
+	 * the relationship with a different lock.
+	 */
+	WARN_ON_ONCE(m && p->blocked_on != m && p->blocked_on != PROXY_WAKING);
+	p->blocked_on = PROXY_WAKING;
+}
+
+static inline void set_task_blocked_on_waking(struct task_struct *p, struct mutex *m)
+{
+	guard(raw_spinlock_irqsave)(&p->blocked_lock);
+	__set_task_blocked_on_waking(p, m);
+}
+
 #else
 static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
 {
@@ -2228,6 +2267,14 @@ static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mute
 static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
 {
 }
+
+static inline void __set_task_blocked_on_waking(struct task_struct *p, struct rt_mutex *m)
+{
+}
+
+static inline void set_task_blocked_on_waking(struct task_struct *p, struct rt_mutex *m)
+{
+}
 #endif /* !CONFIG_PREEMPT_RT */

 static __always_inline bool need_resched(void)

diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 4aa79bcab08c..7d359647156d 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -983,7 +983,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
 		next = waiter->task;

 		debug_mutex_wake_waiter(lock, waiter);
-		clear_task_blocked_on(next, lock);
+		set_task_blocked_on_waking(next, lock);
 		wake_q_add(&wake_q, next);
 	}

diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
index e4a81790ea7d..5cd9dfa4b31e 100644
--- a/kernel/locking/ww_mutex.h
+++ b/kernel/locking/ww_mutex.h
@@ -285,11 +285,11 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
 	debug_mutex_wake_waiter(lock, waiter);
 #endif
 	/*
-	 * When waking up the task to die, be sure to clear the
-	 * blocked_on pointer. Otherwise we can see circular
-	 * blocked_on relationships that can't resolve.
+	 * When waking up the task to die, be sure to set the
+	 * blocked_on to PROXY_WAKING. Otherwise we can see
+	 * circular blocked_on relationships that can't resolve.
 	 */
-	clear_task_blocked_on(waiter->task, lock);
+	set_task_blocked_on_waking(waiter->task, lock);
 	wake_q_add(wake_q, waiter->task);
 }

@@ -339,15 +339,15 @@ static bool __ww_mutex_wound(struct MUTEX *lock,
 	 */
 	if (owner != current) {
 		/*
-		 * When waking up the task to wound, be sure to clear the
-		 * blocked_on pointer. Otherwise we can see circular
-		 * blocked_on relationships that can't resolve.
+		 * When waking up the task to wound, be sure to set the
+		 * blocked_on to PROXY_WAKING. Otherwise we can see
+		 * circular blocked_on relationships that can't resolve.
 		 *
 		 * NOTE: We pass NULL here instead of lock, because we
 		 * are waking the mutex owner, who may be currently
 		 * blocked on a different mutex.
 		 */
-		clear_task_blocked_on(owner, NULL);
+		set_task_blocked_on_waking(owner, NULL);
 		wake_q_add(wake_q, owner);
 	}
 	return true;

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bf4338f71667..c997d516441d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4239,6 +4239,13 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 		ttwu_queue(p, cpu, wake_flags);
 	}
 out:
+	/*
+	 * For now, if we've been woken up, clear the task->blocked_on
+	 * regardless of whether it was set to a mutex or PROXY_WAKING,
+	 * so the task can run. We will need to be more careful later
+	 * when properly handling proxy migration.
+	 */
+	clear_task_blocked_on(p, NULL);
 	if (success)
 		ttwu_stat(p, task_cpu(p), wake_flags);

@@ -6600,6 +6607,10 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
 	/* Follow blocked_on chain. */
 	for (p = donor; (mutex = p->blocked_on); p = owner) {
+		/* if it's PROXY_WAKING, resched_idle so ttwu can complete */
+		if (mutex == PROXY_WAKING)
+			return proxy_resched_idle(rq);
+
 		/*
 		 * By taking mutex->wait_lock we hold off concurrent mutex_unlock()
 		 * and ensure @owner sticks around.
@@ -6620,6 +6631,11 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
 		owner = __mutex_owner(mutex);
 		if (!owner) {
+			/*
+			 * If there is no owner, clear blocked_on
+			 * and return p so it can run and try to
+			 * acquire the lock.
+			 */
 			__clear_task_blocked_on(p, mutex);
 			return p;
 		}
--
cgit v1.2.3
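
How a later return-migration step might consume the sentinel, sketched
under stated assumptions (task_needs_return_migration() is an invented
helper; the actual handling lands in a follow-up patch of this series):

    /* Sketch: does this just-woken task need return-migration first? */
    static bool task_needs_return_migration(struct task_struct *p)
    {
    	guard(raw_spinlock_irqsave)(&p->blocked_lock);

    	/*
    	 * Compare the raw pointer: __get_task_blocked_on() deliberately
    	 * filters PROXY_WAKING out and reports the task as unblocked.
    	 * PROXY_WAKING means "no longer blocked, but possibly proxy-
    	 * migrated to a CPU outside its affinity mask", so check the
    	 * current CPU against the mask before letting it run.
    	 */
    	return p->blocked_on == PROXY_WAKING &&
    	       !cpumask_test_cpu(task_cpu(p), p->cpus_ptr);
    }

Note the design split: __get_task_blocked_on() keeps treating a
PROXY_WAKING task as unblocked for the lock-chain walkers, while
find_proxy_task() checks the raw p->blocked_on value so it can stop at
the sentinel and let ttwu complete.
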