diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-05-06 01:22:04 +0300 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-05-06 01:22:04 +0300 |
| commit | de95ad90fb19e4b7778a0c27115a4639c7c8b186 (patch) | |
| tree | 1d74306e2964932ccd0b787ea743dc74587683f5 | |
| parent | 50fb0bcc9d7da23e0f0fd5359b4f9ceb0aa337d2 (diff) | |
| parent | b34c82777a2c0648ee053595f4b290fd5249b093 (diff) | |
| download | linux-de95ad90fb19e4b7778a0c27115a4639c7c8b186.tar.xz | |
Merge tag 'sched_ext-for-7.1-rc2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext
Pull sched_ext fixes from Tejun Heo:
- Fix idle CPU selection so it can no longer return a prev_cpu that lies
outside the task's cpus_ptr when the allowed mask passed in by the BPF
caller was wider than the task's own mask. Marked for stable backport.
- Two opposite-direction gaps in scx_task_iter's cgroup-scoped mode
versus the global mode:
- Tasks past exit_signals() are filtered out by the cgroup walk but
kept by the global one, so aborting a sub-scheduler enable leaked
__scx_init_task() state. Add a CSS_TASK_ITER_WITH_DEAD flag to cgroup's
task iterator (scx_task_iter is its only user) and use it.
- Tasks past sched_ext_dead() are still returned, tripping
WARN_ON_ONCE() in callers or making them touch torn-down state.
Mark and skip under the per-task rq lock.
* tag 'sched_ext-for-7.1-rc2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
sched_ext: idle: Recheck prev_cpu after narrowing allowed mask
sched_ext: Skip past-sched_ext_dead() tasks in scx_task_iter_next_locked()
cgroup, sched_ext: Include exiting tasks in cgroup iter
| -rw-r--r-- | include/linux/cgroup.h | 1 | ||||
| -rw-r--r-- | include/linux/sched/ext.h | 1 | ||||
| -rw-r--r-- | kernel/cgroup/cgroup.c | 8 | ||||
| -rw-r--r-- | kernel/sched/ext.c | 39 | ||||
| -rw-r--r-- | kernel/sched/ext_idle.c | 12 |
5 files changed, 42 insertions, 19 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e52160e85af4..f6d037a30fd8 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -53,6 +53,7 @@ struct kernel_clone_args; enum css_task_iter_flags { CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ + CSS_TASK_ITER_WITH_DEAD = (1U << 2), /* include exiting tasks */ CSS_TASK_ITER_SKIPPED = (1U << 16), /* internal flags */ }; diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 1a3af2ea2a79..adb9a4de068a 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -101,6 +101,7 @@ enum scx_ent_flags { SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ SCX_TASK_SUB_INIT = 1 << 4, /* task being initialized for a sub sched */ SCX_TASK_IMMED = 1 << 5, /* task is on local DSQ with %SCX_ENQ_IMMED */ + SCX_TASK_OFF_TASKS = 1 << 6, /* removed from scx_tasks by sched_ext_dead() */ /* * Bits 8 and 9 are used to carry task state: diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 45c0b1ed687a..383af2b233b3 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5067,10 +5067,12 @@ repeat: task = list_entry(it->task_pos, struct task_struct, cg_list); /* - * Hide tasks that are exiting but not yet removed. Keep zombie - * leaders with live threads visible. + * Hide tasks that are exiting but not yet removed by default. Keep + * zombie leaders with live threads visible. Usages that need to walk + * every existing task can opt out via CSS_TASK_ITER_WITH_DEAD. 
*/ - if ((task->flags & PF_EXITING) && !atomic_read(&task->signal->live)) + if (!(it->flags & CSS_TASK_ITER_WITH_DEAD) && + (task->flags & PF_EXITING) && !atomic_read(&task->signal->live)) goto repeat; if (it->flags & CSS_TASK_ITER_PROCS) { diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 345aa11b84b2..38d90baf78cf 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -766,7 +766,8 @@ static void scx_task_iter_start(struct scx_task_iter *iter, struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); iter->cgrp = cgrp; iter->css_pos = css_next_descendant_pre(NULL, &iter->cgrp->self); - css_task_iter_start(iter->css_pos, 0, &iter->css_iter); + css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD, + &iter->css_iter); return; } #endif @@ -866,7 +867,8 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) iter->css_pos = css_next_descendant_pre(iter->css_pos, &iter->cgrp->self); if (iter->css_pos) - css_task_iter_start(iter->css_pos, 0, &iter->css_iter); + css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD, + &iter->css_iter); } return NULL; } @@ -926,16 +928,27 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) * * Test for idle_sched_class as only init_tasks are on it. */ - if (p->sched_class != &idle_sched_class) - break; - } - if (!p) - return NULL; + if (p->sched_class == &idle_sched_class) + continue; - iter->rq = task_rq_lock(p, &iter->rf); - iter->locked_task = p; + iter->rq = task_rq_lock(p, &iter->rf); + iter->locked_task = p; - return p; + /* + * cgroup_task_dead() removes the dead tasks from cset->tasks + * after sched_ext_dead() and cgroup iteration may see tasks + * which already finished sched_ext_dead(). %SCX_TASK_OFF_TASKS + * is set by sched_ext_dead() under @p's rq lock. Test it to + * avoid visiting tasks which are already dead from SCX POV. 
+ */ + if (p->scx.flags & SCX_TASK_OFF_TASKS) { + __scx_task_iter_rq_unlock(iter); + continue; + } + + return p; + } + return NULL; } /** @@ -3848,6 +3861,11 @@ void sched_ext_dead(struct task_struct *p) /* * @p is off scx_tasks and wholly ours. scx_root_enable()'s READY -> * ENABLED transitions can't race us. Disable ops for @p. + * + * %SCX_TASK_OFF_TASKS synchronizes against cgroup task iteration - see + * scx_task_iter_next_locked(). NONE tasks need no marking: cgroup + * iteration is only used from sub-sched paths, which require root + * enabled. Root enable transitions every live task to at least READY. */ if (scx_get_task_state(p) != SCX_TASK_NONE) { struct rq_flags rf; @@ -3855,6 +3873,7 @@ void sched_ext_dead(struct task_struct *p) rq = task_rq_lock(p, &rf); scx_disable_and_exit_task(scx_task_sched(p), p); + p->scx.flags |= SCX_TASK_OFF_TASKS; task_rq_unlock(rq, p, &rf); } } diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 7468560a6d80..6e1980763270 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -466,12 +466,6 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, preempt_disable(); /* - * Check whether @prev_cpu is still within the allowed set. If not, - * we can still try selecting a nearby CPU. - */ - is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed); - - /* * Determine the subset of CPUs usable by @p within @cpus_allowed. */ if (allowed != p->cpus_ptr) { @@ -488,6 +482,12 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, } /* + * Check whether @prev_cpu is still within the allowed set. If not, + * we can still try selecting a nearby CPU. + */ + is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed); + + /* * This is necessary to protect llc_cpus. */ rcu_read_lock(); |
