diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-04-15 20:54:24 +0300 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-04-15 20:54:24 +0300 |
| commit | 5bdb4078e1efba9650c03753616866192d680718 (patch) | |
| tree | 4031e1be6f7c80b885adaf93eaca6e46c12a7a1b /include | |
| parent | 7de6b4a246330fe29fa2fd144b4724ca35d60d6c (diff) | |
| parent | 7e311bafb9ad3a4711c08c00b09fb7839ada37f0 (diff) | |
| download | linux-5bdb4078e1efba9650c03753616866192d680718.tar.xz | |
Merge tag 'sched_ext-for-7.1' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext
Pull sched_ext updates from Tejun Heo:
- cgroup sub-scheduler groundwork
Multiple BPF schedulers can be attached to cgroups and the dispatch
path is made hierarchical. This involves substantial restructuring of
the core dispatch, bypass, watchdog, and dump paths to be
per-scheduler, along with new infrastructure for scheduler ownership
enforcement, lifecycle management, and cgroup subtree iteration.
The enqueue path is not yet updated and will follow in a later cycle.
- scx_bpf_dsq_reenq() generalized to support any DSQ including remote
local DSQs and user DSQs.
Built on top of this, SCX_ENQ_IMMED guarantees that tasks dispatched
to local DSQs either run immediately or get reenqueued back through
ops.enqueue(), giving schedulers tighter control over queueing
latency.
This is also useful for opportunistic CPU sharing across sub-schedulers.
- ops.dequeue() was only invoked when the core knew a task was in BPF
data structures, missing scheduling property change events and
skipping callbacks for non-local DSQ dispatches from ops.select_cpu().
This is fixed to guarantee exactly one ops.dequeue() call when a task
leaves BPF scheduler custody.
- Kfunc access validation moved from runtime to BPF verifier time,
removing runtime mask enforcement
- Idle SMT sibling prioritization in the idle CPU selection path
- Documentation, selftest, and tooling updates. Misc bug fixes and
cleanups.
* tag 'sched_ext-for-7.1' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext: (134 commits)
tools/sched_ext: Add explicit cast from void* in RESIZE_ARRAY()
sched_ext: Make string params of __ENUM_set() const
tools/sched_ext: Kick home CPU for stranded tasks in scx_qmap
sched_ext: Drop spurious warning on kick during scheduler disable
sched_ext: Warn on task-based SCX op recursion
sched_ext: Rename scx_kf_allowed_on_arg_tasks() to scx_kf_arg_task_ok()
sched_ext: Remove runtime kfunc mask enforcement
sched_ext: Add verifier-time kfunc context filter
sched_ext: Drop redundant rq-locked check from scx_bpf_task_cgroup()
sched_ext: Decouple kfunc unlocked-context check from kf_mask
sched_ext: Fix ops.cgroup_move() invocation kf_mask and rq tracking
sched_ext: Track @p's rq lock across set_cpus_allowed_scx -> ops.set_cpumask
sched_ext: Add select_cpu kfuncs to scx_kfunc_ids_unlocked
sched_ext: Drop TRACING access to select_cpu kfuncs
selftests/sched_ext: Fix wrong DSQ ID in peek_dsq error message
sched_ext: Documentation: improve accuracy of task lifecycle pseudo-code
selftests/sched_ext: Improve runner error reporting for invalid arguments
sched_ext: Documentation: Fix scx_bpf_move_to_local kfunc name
sched_ext: Documentation: Add ops.dequeue() to task lifecycle
tools/sched_ext: Fix off-by-one in scx_sdt payload zeroing
...
Diffstat (limited to 'include')
| -rw-r--r-- | include/linux/cgroup-defs.h | 4 | ||||
| -rw-r--r-- | include/linux/sched/ext.h | 109 |
2 files changed, 68 insertions, 45 deletions
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index f197ca104737..f42563739d2e 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -17,6 +17,7 @@ #include <linux/refcount.h> #include <linux/percpu-refcount.h> #include <linux/percpu-rwsem.h> +#include <linux/sched.h> #include <linux/u64_stats_sync.h> #include <linux/workqueue.h> #include <linux/bpf-cgroup-defs.h> @@ -628,6 +629,9 @@ struct cgroup { #ifdef CONFIG_BPF_SYSCALL struct bpf_local_storage __rcu *bpf_cgrp_storage; #endif +#ifdef CONFIG_EXT_SUB_SCHED + struct scx_sched __rcu *scx_sched; +#endif /* All ancestors including self */ union { diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index bcb962d5ee7d..1a3af2ea2a79 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -62,6 +62,16 @@ enum scx_dsq_id_flags { SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, }; +struct scx_deferred_reenq_user { + struct list_head node; + u64 flags; +}; + +struct scx_dsq_pcpu { + struct scx_dispatch_q *dsq; + struct scx_deferred_reenq_user deferred_reenq_user; +}; + /* * A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered * queue. A built-in DSQ is always a FIFO. 
The built-in local DSQs are used to @@ -78,30 +88,58 @@ struct scx_dispatch_q { u64 id; struct rhash_head hash_node; struct llist_node free_node; + struct scx_sched *sched; + struct scx_dsq_pcpu __percpu *pcpu; struct rcu_head rcu; }; -/* scx_entity.flags */ +/* sched_ext_entity.flags */ enum scx_ent_flags { SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ + SCX_TASK_IN_CUSTODY = 1 << 1, /* in custody, needs ops.dequeue() when leaving */ SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */ SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ + SCX_TASK_SUB_INIT = 1 << 4, /* task being initialized for a sub sched */ + SCX_TASK_IMMED = 1 << 5, /* task is on local DSQ with %SCX_ENQ_IMMED */ - SCX_TASK_STATE_SHIFT = 8, /* bit 8 and 9 are used to carry scx_task_state */ + /* + * Bits 8 and 9 are used to carry task state: + * + * NONE ops.init_task() not called yet + * INIT ops.init_task() succeeded, but task can be cancelled + * READY fully initialized, but not in sched_ext + * ENABLED fully initialized and in sched_ext + */ + SCX_TASK_STATE_SHIFT = 8, /* bits 8 and 9 are used to carry task state */ SCX_TASK_STATE_BITS = 2, SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, - SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */ -}; + SCX_TASK_NONE = 0 << SCX_TASK_STATE_SHIFT, + SCX_TASK_INIT = 1 << SCX_TASK_STATE_SHIFT, + SCX_TASK_READY = 2 << SCX_TASK_STATE_SHIFT, + SCX_TASK_ENABLED = 3 << SCX_TASK_STATE_SHIFT, -/* scx_entity.flags & SCX_TASK_STATE_MASK */ -enum scx_task_state { - SCX_TASK_NONE, /* ops.init_task() not called yet */ - SCX_TASK_INIT, /* ops.init_task() succeeded, but task can be cancelled */ - SCX_TASK_READY, /* fully initialized, but not in sched_ext */ - SCX_TASK_ENABLED, /* fully initialized and in sched_ext */ + /* + * Bits 12 and 13 are used to carry reenqueue reason. 
In addition to + * %SCX_ENQ_REENQ flag, ops.enqueue() can also test for + * %SCX_TASK_REENQ_REASON_NONE to distinguish reenqueues. + * + * NONE not being reenqueued + * KFUNC reenqueued by scx_bpf_dsq_reenq() and friends + * IMMED reenqueued due to failed ENQ_IMMED + * PREEMPTED preempted while running + */ + SCX_TASK_REENQ_REASON_SHIFT = 12, + SCX_TASK_REENQ_REASON_BITS = 2, + SCX_TASK_REENQ_REASON_MASK = ((1 << SCX_TASK_REENQ_REASON_BITS) - 1) << SCX_TASK_REENQ_REASON_SHIFT, + + SCX_TASK_REENQ_NONE = 0 << SCX_TASK_REENQ_REASON_SHIFT, + SCX_TASK_REENQ_KFUNC = 1 << SCX_TASK_REENQ_REASON_SHIFT, + SCX_TASK_REENQ_IMMED = 2 << SCX_TASK_REENQ_REASON_SHIFT, + SCX_TASK_REENQ_PREEMPTED = 3 << SCX_TASK_REENQ_REASON_SHIFT, - SCX_TASK_NR_STATES, + /* iteration cursor, not a task */ + SCX_TASK_CURSOR = 1 << 31, }; /* scx_entity.dsq_flags */ @@ -109,33 +147,6 @@ enum scx_ent_dsq_flags { SCX_TASK_DSQ_ON_PRIQ = 1 << 0, /* task is queued on the priority queue of a dsq */ }; -/* - * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from - * everywhere and the following bits track which kfunc sets are currently - * allowed for %current. This simple per-task tracking works because SCX ops - * nest in a limited way. BPF will likely implement a way to allow and disallow - * kfuncs depending on the calling context which will replace this manual - * mechanism. See scx_kf_allow(). - */ -enum scx_kf_mask { - SCX_KF_UNLOCKED = 0, /* sleepable and not rq locked */ - /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */ - SCX_KF_CPU_RELEASE = 1 << 0, /* ops.cpu_release() */ - /* - * ops.dispatch() may release rq lock temporarily and thus ENQUEUE and - * SELECT_CPU may be nested inside. ops.dequeue (in REST) may also be - * nested inside DISPATCH. 
- */ - SCX_KF_DISPATCH = 1 << 1, /* ops.dispatch() */ - SCX_KF_ENQUEUE = 1 << 2, /* ops.enqueue() and ops.select_cpu() */ - SCX_KF_SELECT_CPU = 1 << 3, /* ops.select_cpu() */ - SCX_KF_REST = 1 << 4, /* other rq-locked operations */ - - __SCX_KF_RQ_LOCKED = SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH | - SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, - __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, -}; - enum scx_dsq_lnode_flags { SCX_DSQ_LNODE_ITER_CURSOR = 1 << 0, @@ -149,19 +160,31 @@ struct scx_dsq_list_node { u32 priv; /* can be used by iter cursor */ }; -#define INIT_DSQ_LIST_CURSOR(__node, __flags, __priv) \ +#define INIT_DSQ_LIST_CURSOR(__cursor, __dsq, __flags) \ (struct scx_dsq_list_node) { \ - .node = LIST_HEAD_INIT((__node).node), \ + .node = LIST_HEAD_INIT((__cursor).node), \ .flags = SCX_DSQ_LNODE_ITER_CURSOR | (__flags), \ - .priv = (__priv), \ + .priv = READ_ONCE((__dsq)->seq), \ } +struct scx_sched; + /* * The following is embedded in task_struct and contains all fields necessary * for a task to be scheduled by SCX. */ struct sched_ext_entity { +#ifdef CONFIG_CGROUPS + /* + * Associated scx_sched. Updated either during fork or while holding + * both p->pi_lock and rq lock. 
+ */ + struct scx_sched __rcu *sched; +#endif struct scx_dispatch_q *dsq; + atomic_long_t ops_state; + u64 ddsp_dsq_id; + u64 ddsp_enq_flags; struct scx_dsq_list_node dsq_list; /* dispatch order */ struct rb_node dsq_priq; /* p->scx.dsq_vtime order */ u32 dsq_seq; @@ -171,9 +194,7 @@ struct sched_ext_entity { s32 sticky_cpu; s32 holding_cpu; s32 selected_cpu; - u32 kf_mask; /* see scx_kf_mask above */ struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */ - atomic_long_t ops_state; struct list_head runnable_node; /* rq->scx.runnable_list */ unsigned long runnable_at; @@ -181,8 +202,6 @@ struct sched_ext_entity { #ifdef CONFIG_SCHED_CORE u64 core_sched_at; /* see scx_prio_less() */ #endif - u64 ddsp_dsq_id; - u64 ddsp_enq_flags; /* BPF scheduler modifiable fields */ |
