summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2026-06-17 14:10:11 +0300
committer Linus Torvalds <torvalds@linux-foundation.org> 2026-06-17 14:10:11 +0300
commit 5b33fc6492a7b7a62359157db0f92f5b6e9af690 (patch)
tree 0c89f2906b33a19cad3f12e9b33e9cfa150ee1d6
parent 83476cc97bc635a3ff502bd194c79bfb1f1ae050 (diff)
parent 2e05f2fd0dd72aa8aa56cf355e1e39a3f565b4ca (diff)
download linux-5b33fc6492a7b7a62359157db0f92f5b6e9af690.tar.xz
Merge tag 'sched_ext-for-7.2' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext
Pull sched_ext updates from Tejun Heo:
 "Most of this continues the in-development sub-scheduler support, which
  lets a root BPF scheduler delegate to nested sub-schedulers. The
  dispatch-path building blocks landed in 7.1. A follow-up patchset in
  development will complete enqueue-path support for hierarchical
  scheduling.

  This cycle adds most of that infrastructure:

   - Topological CPU IDs (cids): a dense, topology-ordered CPU numbering
     where the CPUs of a core, LLC, or NUMA node form contiguous ranges,
     so a topology unit becomes a (start, length) slice. Raw CPU numbers
     are sparse and don't track topological closeness, which makes them
     clumsy for sharding work across sub-schedulers and awkward in BPF.

   - cmask: bitmaps windowed over a slice of cid space, so a
     sub-scheduler can track, for example, the idle cids of its shard
     without a full NR_CPUS cpumask.

   - A struct_ops variant that cid-form sub-schedulers register with,
     along with the cid-form kfuncs they call.

   - BPF arena integration, which sub-scheduler support is built on. The
     bpf-next additions let the kernel read and write the BPF scheduler's
     arena directly, turning it into a real kernel/BPF shared-memory
     channel. Shared state like the per-CPU cmask now lives there.

   - scx_qmap is reworked to exercise the new arena and cid interfaces.

  Additionally:

   - Exit-dump improvements: dump the faulting CPU first, expose the exit
     CPU to BPF and userspace, and normalize the dump header.

   - Misc kfuncs and cleanups: a task-ID lookup kfunc, __printf checking
     on the error and dump formatters, header reorganization, and
     assorted fixes"

* tag 'sched_ext-for-7.2' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext: (59 commits)
  sched_ext: Add scx_arena_to_kaddr() / scx_kaddr_to_arena()
  sched_ext: Make scx_bpf_kick_cid() return s32
  sched_ext: Add scx_cmask_test() and scx_cmask_for_each_cid()
  tools/sched_ext: Order single-cid cmask helpers as (cid, mask)
  sched_ext: Order single-cid cmask helpers as (cid, mask)
  selftests/sched_ext: Fix dsq_move_to_local check
  sched_ext: Guard BPF arena helper calls to fix 32-bit build
  sched_ext: idle: Fix errno loss in scx_idle_init()
  sched_ext: Convert ops.set_cmask() to arena-resident cmask
  sched_ext: Sub-allocator over kernel-claimed BPF arena pages
  sched_ext: Require an arena for cid-form schedulers
  sched_ext: Add cmask mask ops
  sched_ext: Track bits[] storage size in struct scx_cmask
  sched_ext: Rename scx_cmask.nr_bits to nr_cids
  tools/sched_ext: scx_qmap: Fix qa arena placement
  sched_ext: Mark !CONFIG_EXT_SUB_SCHED dummy stubs static inline
  sched_ext: Replace tryget_task_struct() with get_task_struct()
  sched_ext: Add scx_task_iter_relock() and use it in scx_root_enable_workfn()
  sched_ext: Fix ops_cid layout assert
  sched_ext: Use offsetofend on both sides of the ops_cid layout assert
  ...
-rw-r--r-- Documentation/scheduler/sched-ext.rst | 7
-rw-r--r-- include/linux/sched/ext.h | 9
-rw-r--r-- kernel/sched/build_policy.c | 9
-rw-r--r-- kernel/sched/ext.c | 1261
-rw-r--r-- kernel/sched/ext_arena.c | 131
-rw-r--r-- kernel/sched/ext_arena.h | 18
-rw-r--r-- kernel/sched/ext_cid.c | 707
-rw-r--r-- kernel/sched/ext_cid.h | 271
-rw-r--r-- kernel/sched/ext_idle.c | 23
-rw-r--r-- kernel/sched/ext_internal.h | 280
-rw-r--r-- kernel/sched/ext_types.h | 144
-rw-r--r-- tools/sched_ext/README.md | 6
-rw-r--r-- tools/sched_ext/include/scx/cid.bpf.h | 678
-rw-r--r-- tools/sched_ext/include/scx/common.bpf.h | 28
-rw-r--r-- tools/sched_ext/include/scx/compat.bpf.h | 30
-rw-r--r-- tools/sched_ext/include/scx/compat.h | 23
-rw-r--r-- tools/sched_ext/include/scx/user_exit_info.bpf.h | 3
-rw-r--r-- tools/sched_ext/include/scx/user_exit_info.h | 2
-rw-r--r-- tools/sched_ext/include/scx/user_exit_info_common.h | 5
-rw-r--r-- tools/sched_ext/scx_central.bpf.c | 10
-rw-r--r-- tools/sched_ext/scx_cpu0.bpf.c | 2
-rw-r--r-- tools/sched_ext/scx_cpu0.c | 2
-rw-r--r-- tools/sched_ext/scx_flatcg.c | 12
-rw-r--r-- tools/sched_ext/scx_qmap.bpf.c | 890
-rw-r--r-- tools/sched_ext/scx_qmap.c | 107
-rw-r--r-- tools/sched_ext/scx_qmap.h | 73
-rw-r--r-- tools/testing/selftests/sched_ext/non_scx_kfunc_deny.bpf.c | 7
-rw-r--r-- tools/testing/selftests/sched_ext/peek_dsq.bpf.c | 2
-rw-r--r-- tools/testing/selftests/sched_ext/select_cpu_dfl.c | 54
29 files changed, 4090 insertions, 704 deletions
diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst
index 03d595d178ea..c4f59c08d8a4 100644
--- a/Documentation/scheduler/sched-ext.rst
+++ b/Documentation/scheduler/sched-ext.rst
@@ -339,6 +339,11 @@ The following briefly shows how a waking task is scheduled and executed.
leaves (e.g., when ``ops.dispatch()`` moves it to a terminal DSQ, or
on property change / sleep).
+ Note that ``ops.enqueue()`` can be called multiple times in a row without
+ an intervening call to ``ops.dequeue()``. This can happen, for example,
+ when a task on a user-created DSQ is re-enqueued using
+ ``scx_bpf_dsq_reenq()``. The task stays in BPF custody the entire time.
+
When a task leaves BPF scheduler custody, ``ops.dequeue()`` is invoked.
The dequeue can happen for different reasons, distinguished by flags:
@@ -503,7 +508,7 @@ Where to Look
custom DSQ.
* ``scx_qmap[.bpf].c``: A multi-level FIFO scheduler supporting five
- levels of priority implemented with ``BPF_MAP_TYPE_QUEUE``.
+ levels of priority implemented with arena-backed doubly-linked lists.
* ``scx_central[.bpf].c``: A central FIFO scheduler where all scheduling
decisions are made on one CPU, demonstrating ``LOCAL_ON`` dispatching,
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 2129e18ada58..20b2343aa344 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -207,6 +207,15 @@ struct sched_ext_entity {
u64 core_sched_at; /* see scx_prio_less() */
#endif
+ /*
+ * Unique non-zero task ID assigned at fork. Persists across exec and
+ * is never reused. Lets BPF schedulers identify tasks without storing
+ * kernel pointers - arena-backed schedulers being one example. See
+ * scx_bpf_tid_to_task().
+ */
+ u64 tid;
+ struct rhash_head tid_hash_node; /* see SCX_OPS_TID_TO_TASK */
+
/* BPF scheduler modifiable fields */
/*
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index 755883faf751..067979a7b69e 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -58,8 +58,17 @@
#include "deadline.c"
#ifdef CONFIG_SCHED_CLASS_EXT
+# include <linux/btf_ids.h>
+# include <linux/find.h>
+# include <linux/genalloc.h>
+# include "ext_types.h"
# include "ext_internal.h"
+# include "ext_cid.h"
+# include "ext_arena.h"
+# include "ext_idle.h"
# include "ext.c"
+# include "ext_cid.c"
+# include "ext_arena.c"
# include "ext_idle.c"
#endif
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index f5a3233ead1a..0db6fa2daea3 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -6,8 +6,6 @@
* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
*/
-#include <linux/btf_ids.h>
-#include "ext_idle.h"
static DEFINE_RAW_SPINLOCK(scx_sched_lock);
@@ -38,6 +36,15 @@ static const struct rhashtable_params scx_sched_hash_params = {
static struct rhashtable scx_sched_hash;
#endif
+/* see SCX_OPS_TID_TO_TASK */
+static const struct rhashtable_params scx_tid_hash_params = {
+ .key_len = sizeof_field(struct sched_ext_entity, tid),
+ .key_offset = offsetof(struct sched_ext_entity, tid),
+ .head_offset = offsetof(struct sched_ext_entity, tid_hash_node),
+ .insecure_elasticity = true, /* inserted/removed under scx_tasks_lock */
+};
+static struct rhashtable scx_tid_hash;
+
/*
* During exit, a task may schedule after losing its PIDs. When disabling the
* BPF scheduler, we need to be able to iterate tasks in every state to
@@ -56,10 +63,25 @@ static DEFINE_RAW_SPINLOCK(scx_bypass_lock);
static bool scx_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
+static DEFINE_STATIC_KEY_FALSE(__scx_tid_to_task_enabled);
+
+/*
+ * True once SCX_OPS_TID_TO_TASK has been negotiated with the root scheduler
+ * and the tid->task table is live. Wraps the static key so callers don't
+ * take the address, and hints "likely enabled" for the common case where
+ * the feature is in use.
+ */
+static inline bool scx_tid_to_task_enabled(void)
+{
+ return static_branch_likely(&__scx_tid_to_task_enabled);
+}
static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
+/* Global cursor for the per-CPU tid allocator. Starts at 1; tid 0 is reserved. */
+static atomic64_t scx_tid_cursor = ATOMIC64_INIT(1);
+
#ifdef CONFIG_EXT_SUB_SCHED
/*
* The sub sched being enabled. Used by scx_disable_and_exit_task() to exit
@@ -109,6 +131,17 @@ struct scx_kick_syncs {
static DEFINE_PER_CPU(struct scx_kick_syncs __rcu *, scx_kick_syncs);
/*
+ * Per-CPU buffered allocator state for p->scx.tid. Each CPU pulls a chunk of
+ * SCX_TID_CHUNK ids from scx_tid_cursor and hands them out locally without
+ * further synchronization. See scx_alloc_tid().
+ */
+struct scx_tid_alloc {
+ u64 next;
+ u64 end;
+};
+static DEFINE_PER_CPU(struct scx_tid_alloc, scx_tid_alloc);
+
+/*
* Direct dispatch marker.
*
* Non-NULL values are used for direct dispatch from enqueue path. A valid
@@ -198,26 +231,21 @@ static void run_deferred(struct rq *rq);
static bool task_dead_and_done(struct task_struct *p);
static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind);
-static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
- s64 exit_code, const char *fmt, va_list args);
-static __printf(4, 5) bool scx_exit(struct scx_sched *sch,
- enum scx_exit_kind kind, s64 exit_code,
- const char *fmt, ...)
+__printf(5, 6) bool __scx_exit(struct scx_sched *sch,
+ enum scx_exit_kind kind, s64 exit_code,
+ s32 exit_cpu, const char *fmt, ...)
{
va_list args;
bool ret;
va_start(args, fmt);
- ret = scx_vexit(sch, kind, exit_code, fmt, args);
+ ret = scx_vexit(sch, kind, exit_code, exit_cpu, fmt, args);
va_end(args);
return ret;
}
-#define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args)
-#define scx_verror(sch, fmt, args) scx_vexit((sch), SCX_EXIT_ERROR, 0, fmt, args)
-
#define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op)
static long jiffies_delta_msecs(unsigned long at, unsigned long now)
@@ -295,9 +323,9 @@ static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch)
rcu_assign_pointer(p->scx.sched, sch);
}
#else /* CONFIG_EXT_SUB_SCHED */
-static struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; }
-static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; }
-static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {}
+static inline struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; }
+static inline struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; }
+static inline void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {}
#endif /* CONFIG_EXT_SUB_SCHED */
/**
@@ -484,6 +512,33 @@ do { \
update_locked_rq(__prev_locked_rq); \
} while (0)
+/*
+ * Flipped on enable per sch->is_cid_type. Declared in ext_internal.h so
+ * subsystem inlines can read it.
+ */
+DEFINE_STATIC_KEY_FALSE(__scx_is_cid_type);
+
+/*
+ * scx_cpu_arg() wraps a cpu arg being handed to an SCX op. For cid-form
+ * schedulers it resolves to the matching cid; for cpu-form it passes @cpu
+ * through. scx_cpu_ret() is the inverse for a cpu/cid returned from an op
+ * (currently only ops.select_cpu); it validates the BPF-supplied cid and
+ * triggers scx_error() on @sch if invalid.
+ */
+static s32 scx_cpu_arg(s32 cpu)
+{
+ if (scx_is_cid_type())
+ return __scx_cpu_to_cid(cpu);
+ return cpu;
+}
+
+static s32 scx_cpu_ret(struct scx_sched *sch, s32 cpu_or_cid)
+{
+ if (cpu_or_cid < 0 || !scx_is_cid_type())
+ return cpu_or_cid;
+ return scx_cid_to_cpu(sch, cpu_or_cid);
+}
+
#define SCX_CALL_OP_RET(sch, op, locked_rq, args...) \
({ \
struct rq *__prev_locked_rq; \
@@ -545,6 +600,44 @@ do { \
__ret; \
})
+/**
+ * scx_call_op_set_cpumask - invoke ops.set_cpumask / ops_cid.set_cmask for @task
+ * @sch: scx_sched being invoked
+ * @rq: rq to update as the currently-locked rq, or NULL
+ * @task: task whose affinity is changing
+ * @cpumask: new cpumask
+ *
+ * For cid-form schedulers, translate @cpumask to a cmask via the per-cpu
+ * scratch in ext_cid.c and dispatch through the ops_cid union view. Caller
+ * must hold @rq's rq lock so this_cpu_ptr is stable across the call.
+ */
+static inline void scx_call_op_set_cpumask(struct scx_sched *sch, struct rq *rq,
+ struct task_struct *task,
+ const struct cpumask *cpumask)
+{
+ WARN_ON_ONCE(current->scx.kf_tasks[0]);
+ current->scx.kf_tasks[0] = task;
+ if (rq)
+ update_locked_rq(rq);
+
+ if (scx_is_cid_type()) {
+ struct scx_cmask *kern_va = *this_cpu_ptr(sch->set_cmask_scratch);
+ /*
+ * Build the per-CPU arena cmask and hand BPF its arena address.
+ * Caller holds the rq lock with IRQs disabled, which makes us
+ * the sole user of the scratch area.
+ */
+ scx_cpumask_to_cmask(cpumask, kern_va);
+ sch->ops_cid.set_cmask(task, scx_kaddr_to_arena(sch, kern_va));
+ } else {
+ sch->ops.set_cpumask(task, cpumask);
+ }
+
+ if (rq)
+ update_locked_rq(NULL);
+ current->scx.kf_tasks[0] = NULL;
+}
+
/* see SCX_CALL_OP_TASK() */
static __always_inline bool scx_kf_arg_task_ok(struct scx_sched *sch,
struct task_struct *p)
@@ -858,6 +951,24 @@ static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter)
}
/**
+ * scx_task_iter_relock - Re-acquire scx_tasks_lock and, optionally, @p's rq
+ * @iter: iterator to relock
+ * @p: task whose rq to lock, or %NULL for scx_tasks_lock only
+ *
+ * Counterpart to scx_task_iter_unlock(). Locking @p's rq is optional. Once
+ * re-acquired, both locks are managed by the iterator from here on.
+ */
+static void scx_task_iter_relock(struct scx_task_iter *iter,
+ struct task_struct *p)
+{
+ __scx_task_iter_maybe_relock(iter);
+ if (p) {
+ iter->rq = task_rq_lock(p, &iter->rf);
+ iter->locked_task = p;
+ }
+}
+
+/**
* scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock
* @iter: iterator to exit
*
@@ -1086,7 +1197,7 @@ static inline bool __cpu_valid(s32 cpu)
}
/**
- * ops_cpu_valid - Verify a cpu number, to be used on ops input args
+ * scx_cpu_valid - Verify a cpu number, to be used on ops input args
* @sch: scx_sched to abort on error
* @cpu: cpu number which came from a BPF ops
* @where: extra information reported on error
@@ -1095,7 +1206,7 @@ static inline bool __cpu_valid(s32 cpu)
* Verify that it is in range and one of the possible cpus. If invalid, trigger
* an ops error.
*/
-static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where)
+bool scx_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where)
{
if (__cpu_valid(cpu)) {
return true;
@@ -1742,9 +1853,9 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch,
return &rq->scx.local_dsq;
if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
- s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
+ s32 cpu = scx_cpu_ret(sch, dsq_id & SCX_DSQ_LOCAL_CPU_MASK);
- if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
+ if (!scx_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
return find_global_dsq(sch, tcpu);
return &cpu_rq(cpu)->scx.local_dsq;
@@ -2837,11 +2948,13 @@ scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
dspc->nr_tasks = 0;
if (nested) {
- SCX_CALL_OP(sch, dispatch, rq, cpu, prev_on_sch ? prev : NULL);
+ SCX_CALL_OP(sch, dispatch, rq, scx_cpu_arg(cpu),
+ prev_on_sch ? prev : NULL);
} else {
/* stash @prev so that nested invocations can access it */
rq->scx.sub_dispatch_prev = prev;
- SCX_CALL_OP(sch, dispatch, rq, cpu, prev_on_sch ? prev : NULL);
+ SCX_CALL_OP(sch, dispatch, rq, scx_cpu_arg(cpu),
+ prev_on_sch ? prev : NULL);
rq->scx.sub_dispatch_prev = NULL;
}
@@ -2899,7 +3012,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* core. This callback complements ->cpu_release(), which is
* emitted in switch_class().
*/
- if (SCX_HAS_OP(sch, cpu_acquire))
+ if (sch->ops.cpu_acquire)
SCX_CALL_OP(sch, cpu_acquire, rq, cpu, NULL);
rq->scx.cpu_released = false;
}
@@ -3045,7 +3158,7 @@ static void switch_class(struct rq *rq, struct task_struct *next)
* next time that balance_one() is invoked.
*/
if (!rq->scx.cpu_released) {
- if (SCX_HAS_OP(sch, cpu_release)) {
+ if (sch->ops.cpu_release) {
struct scx_cpu_release_args args = {
.reason = preempt_reason_from_class(next_class),
.task = next,
@@ -3336,11 +3449,13 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
*ddsp_taskp = p;
this_rq()->scx.in_select_cpu = true;
- cpu = SCX_CALL_OP_TASK_RET(sch, select_cpu, NULL, p, prev_cpu, wake_flags);
+ cpu = SCX_CALL_OP_TASK_RET(sch, select_cpu, NULL, p,
+ scx_cpu_arg(prev_cpu), wake_flags);
+ cpu = scx_cpu_ret(sch, cpu);
this_rq()->scx.in_select_cpu = false;
p->scx.selected_cpu = cpu;
*ddsp_taskp = NULL;
- if (ops_cpu_valid(sch, cpu, "from ops.select_cpu()"))
+ if (scx_cpu_valid(sch, cpu, "from ops.select_cpu()"))
return cpu;
else
return prev_cpu;
@@ -3386,7 +3501,7 @@ static void set_cpus_allowed_scx(struct task_struct *p,
* designation pointless. Cast it away when calling the operation.
*/
if (SCX_HAS_OP(sch, set_cpumask))
- SCX_CALL_OP_TASK(sch, set_cpumask, task_rq(p), p, (struct cpumask *)p->cpus_ptr);
+ scx_call_op_set_cpumask(sch, task_rq(p), p, (struct cpumask *)p->cpus_ptr);
}
static void handle_hotplug(struct rq *rq, bool online)
@@ -3408,9 +3523,9 @@ static void handle_hotplug(struct rq *rq, bool online)
scx_idle_update_selcpu_topology(&sch->ops);
if (online && SCX_HAS_OP(sch, cpu_online))
- SCX_CALL_OP(sch, cpu_online, NULL, cpu);
+ SCX_CALL_OP(sch, cpu_online, NULL, scx_cpu_arg(cpu));
else if (!online && SCX_HAS_OP(sch, cpu_offline))
- SCX_CALL_OP(sch, cpu_offline, NULL, cpu);
+ SCX_CALL_OP(sch, cpu_offline, NULL, scx_cpu_arg(cpu));
else
scx_exit(sch, SCX_EXIT_UNREG_KERN,
SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
@@ -3458,9 +3573,10 @@ static bool check_rq_for_timeouts(struct rq *rq)
last_runnable + READ_ONCE(sch->watchdog_timeout)))) {
u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
- scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
- "%s[%d] failed to run for %u.%03us",
- p->comm, p->pid, dur_ms / 1000, dur_ms % 1000);
+ __scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, cpu_of(rq),
+ "%s[%d] failed to run for %u.%03us",
+ p->comm, p->pid, dur_ms / 1000,
+ dur_ms % 1000);
timed_out = true;
break;
}
@@ -3748,6 +3864,33 @@ void init_scx_entity(struct sched_ext_entity *scx)
scx->slice = SCX_SLICE_DFL;
}
+/* See scx_tid_alloc / scx_tid_cursor. */
+static u64 scx_alloc_tid(void)
+{
+ struct scx_tid_alloc *ta;
+
+ guard(preempt)();
+ ta = this_cpu_ptr(&scx_tid_alloc);
+
+ if (unlikely(ta->next >= ta->end)) {
+ ta->next = atomic64_fetch_add(SCX_TID_CHUNK, &scx_tid_cursor);
+ ta->end = ta->next + SCX_TID_CHUNK;
+ }
+ return ta->next++;
+}
+
+static void scx_tid_hash_insert(struct task_struct *p)
+{
+ int ret;
+
+ lockdep_assert_held(&scx_tasks_lock);
+
+ ret = rhashtable_lookup_insert_fast(&scx_tid_hash,
+ &p->scx.tid_hash_node,
+ scx_tid_hash_params);
+ WARN_ON_ONCE(ret);
+}
+
void scx_pre_fork(struct task_struct *p)
{
/*
@@ -3765,6 +3908,8 @@ int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs)
percpu_rwsem_assert_held(&scx_fork_rwsem);
+ p->scx.tid = scx_alloc_tid();
+
if (scx_init_task_enabled) {
#ifdef CONFIG_EXT_SUB_SCHED
struct scx_sched *sch = kargs->cset->dfl_cgrp->scx_sched;
@@ -3804,9 +3949,11 @@ void scx_post_fork(struct task_struct *p)
}
}
- raw_spin_lock_irq(&scx_tasks_lock);
- list_add_tail(&p->scx.tasks_node, &scx_tasks);
- raw_spin_unlock_irq(&scx_tasks_lock);
+ scoped_guard(raw_spinlock_irq, &scx_tasks_lock) {
+ list_add_tail(&p->scx.tasks_node, &scx_tasks);
+ if (scx_tid_to_task_enabled())
+ scx_tid_hash_insert(p);
+ }
percpu_up_read(&scx_fork_rwsem);
}
@@ -3857,17 +4004,19 @@ static bool task_dead_and_done(struct task_struct *p)
void sched_ext_dead(struct task_struct *p)
{
- unsigned long flags;
-
/*
* By the time control reaches here, @p has %TASK_DEAD set, switched out
* for the last time and then dropped the rq lock - task_dead_and_done()
* should be returning %true nullifying the straggling sched_class ops.
* Remove from scx_tasks and exit @p.
*/
- raw_spin_lock_irqsave(&scx_tasks_lock, flags);
- list_del_init(&p->scx.tasks_node);
- raw_spin_unlock_irqrestore(&scx_tasks_lock, flags);
+ scoped_guard(raw_spinlock_irqsave, &scx_tasks_lock) {
+ list_del_init(&p->scx.tasks_node);
+ if (scx_tid_to_task_enabled())
+ rhashtable_remove_fast(&scx_tid_hash,
+ &p->scx.tid_hash_node,
+ scx_tid_hash_params);
+ }
/*
* @p is off scx_tasks and wholly ours. scx_root_enable()'s READY ->
@@ -3927,7 +4076,7 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
* different scheduler class. Keep the BPF scheduler up-to-date.
*/
if (SCX_HAS_OP(sch, set_cpumask))
- SCX_CALL_OP_TASK(sch, set_cpumask, rq, p, (struct cpumask *)p->cpus_ptr);
+ scx_call_op_set_cpumask(sch, rq, p, (struct cpumask *)p->cpus_ptr);
}
static void switched_from_scx(struct rq *rq, struct task_struct *p)
@@ -4510,9 +4659,9 @@ static void scx_cgroup_unlock(void)
#endif
}
#else /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */
-static struct cgroup *root_cgroup(void) { return NULL; }
-static void scx_cgroup_lock(void) {}
-static void scx_cgroup_unlock(void) {}
+static inline struct cgroup *root_cgroup(void) { return NULL; }
+static inline void scx_cgroup_lock(void) {}
+static inline void scx_cgroup_unlock(void) {}
#endif /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */
#ifdef CONFIG_EXT_SUB_SCHED
@@ -4531,8 +4680,8 @@ static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch)
rcu_assign_pointer(pos->scx_sched, sch);
}
#else /* CONFIG_EXT_SUB_SCHED */
-static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; }
-static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {}
+static inline struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; }
+static inline void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {}
#endif /* CONFIG_EXT_SUB_SCHED */
/*
@@ -4818,6 +4967,48 @@ static const struct attribute_group scx_global_attr_group = {
static void free_pnode(struct scx_sched_pnode *pnode);
static void free_exit_info(struct scx_exit_info *ei);
+static s32 scx_set_cmask_scratch_alloc(struct scx_sched *sch)
+{
+ size_t size = struct_size_t(struct scx_cmask, bits,
+ SCX_CMASK_NR_WORDS(num_possible_cpus()));
+ int cpu;
+
+ if (!sch->is_cid_type || !sch->arena_pool)
+ return 0;
+
+ sch->set_cmask_scratch = alloc_percpu(struct scx_cmask *);
+ if (!sch->set_cmask_scratch)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu);
+
+ *slot = scx_arena_alloc(sch, size);
+ if (!*slot)
+ return -ENOMEM;
+ scx_cmask_init(*slot, 0, num_possible_cpus());
+ }
+ return 0;
+}
+
+static void scx_set_cmask_scratch_free(struct scx_sched *sch)
+{
+ size_t size = struct_size_t(struct scx_cmask, bits,
+ SCX_CMASK_NR_WORDS(num_possible_cpus()));
+ int cpu;
+
+ if (!sch->set_cmask_scratch)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu);
+
+ scx_arena_free(sch, *slot, size);
+ }
+ free_percpu(sch->set_cmask_scratch);
+ sch->set_cmask_scratch = NULL;
+}
+
static void scx_sched_free_rcu_work(struct work_struct *work)
{
struct rcu_work *rcu_work = to_rcu_work(work);
@@ -4872,6 +5063,10 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
free_exit_info(sch->exit_info);
+ scx_set_cmask_scratch_free(sch);
+ scx_arena_pool_destroy(sch);
+ if (sch->arena_map)
+ bpf_map_put(sch->arena_map);
kfree(sch);
}
@@ -5563,6 +5758,7 @@ static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len)
if (!ei)
return NULL;
+ ei->exit_cpu = -1;
ei->bt = kzalloc_objs(ei->bt[0], SCX_EXIT_BT_LEN);
ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL);
ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL);
@@ -5709,6 +5905,26 @@ static void scx_disable_dump(struct scx_sched *sch)
sch->dump_disabled = true;
}
+static void scx_log_sched_disable(struct scx_sched *sch)
+{
+ struct scx_exit_info *ei = sch->exit_info;
+ const char *type = scx_parent(sch) ? "sub-scheduler" : "scheduler";
+
+ if (ei->kind >= SCX_EXIT_ERROR) {
+ pr_err("sched_ext: BPF %s \"%s\" disabled (%s)\n", type,
+ sch->ops.name, ei->reason);
+
+ if (ei->msg[0] != '\0')
+ pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg);
+#ifdef CONFIG_STACKTRACE
+ stack_trace_print(ei->bt, ei->bt_len, 2);
+#endif
+ } else {
+ pr_info("sched_ext: BPF %s \"%s\" disabled (%s)\n", type,
+ sch->ops.name, ei->reason);
+ }
+}
+
#ifdef CONFIG_EXT_SUB_SCHED
static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq);
@@ -5795,14 +6011,11 @@ static void scx_sub_disable(struct scx_sched *sch)
WARN_ON_ONCE(!scx_task_on_sched(sch, p));
/*
- * If $p is about to be freed, nothing prevents $sch from
- * unloading before $p reaches sched_ext_free(). Disable and
- * exit $p right away.
+ * @p is pinned by the iter: css_task_iter_next() takes a
+ * reference and holds it until the next iter_next() call, so
+ * @p->usage is guaranteed > 0.
*/
- if (!tryget_task_struct(p)) {
- scx_disable_and_exit_task(sch, p);
- continue;
- }
+ get_task_struct(p);
scx_task_iter_unlock(&sti);
@@ -5895,6 +6108,8 @@ static void scx_sub_disable(struct scx_sched *sch)
&sub_detach_args);
}
+ scx_log_sched_disable(sch);
+
if (sch->ops.exit)
SCX_CALL_OP(sch, exit, NULL, sch->exit_info);
if (sch->sub_kset)
@@ -5902,13 +6117,12 @@ static void scx_sub_disable(struct scx_sched *sch)
kobject_del(&sch->kobj);
}
#else /* CONFIG_EXT_SUB_SCHED */
-static void drain_descendants(struct scx_sched *sch) { }
-static void scx_sub_disable(struct scx_sched *sch) { }
+static inline void drain_descendants(struct scx_sched *sch) { }
+static inline void scx_sub_disable(struct scx_sched *sch) { }
#endif /* CONFIG_EXT_SUB_SCHED */
static void scx_root_disable(struct scx_sched *sch)
{
- struct scx_exit_info *ei = sch->exit_info;
struct scx_task_iter sti;
struct task_struct *p;
bool was_switched_all;
@@ -6021,26 +6235,19 @@ static void scx_root_disable(struct scx_sched *sch)
/* no task is on scx, turn off all the switches and flush in-progress calls */
static_branch_disable(&__scx_enabled);
+ static_branch_disable(&__scx_is_cid_type);
+ if (sch->ops.flags & SCX_OPS_TID_TO_TASK)
+ static_branch_disable(&__scx_tid_to_task_enabled);
bitmap_zero(sch->has_op, SCX_OPI_END);
scx_idle_disable();
synchronize_rcu();
+ if (sch->ops.flags & SCX_OPS_TID_TO_TASK)
+ rhashtable_free_and_destroy(&scx_tid_hash, NULL, NULL);
- if (ei->kind >= SCX_EXIT_ERROR) {
- pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
- sch->ops.name, ei->reason);
-
- if (ei->msg[0] != '\0')
- pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg);
-#ifdef CONFIG_STACKTRACE
- stack_trace_print(ei->bt, ei->bt_len, 2);
-#endif
- } else {
- pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
- sch->ops.name, ei->reason);
- }
+ scx_log_sched_disable(sch);
if (sch->ops.exit)
- SCX_CALL_OP(sch, exit, NULL, ei);
+ SCX_CALL_OP(sch, exit, NULL, sch->exit_info);
scx_unlink_sched(sch);
@@ -6338,6 +6545,94 @@ static void scx_dump_task(struct scx_sched *sch, struct seq_buf *s, struct scx_d
}
}
+static void scx_dump_cpu(struct scx_sched *sch, struct seq_buf *s,
+ struct scx_dump_ctx *dctx, int cpu,
+ bool dump_all_tasks)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+ struct task_struct *p;
+ struct seq_buf ns;
+ size_t avail, used;
+ char *buf;
+ bool idle;
+
+ rq_lock_irqsave(rq, &rf);
+
+ idle = list_empty(&rq->scx.runnable_list) &&
+ rq->curr->sched_class == &idle_sched_class;
+
+ if (idle && !SCX_HAS_OP(sch, dump_cpu))
+ goto next;
+
+ /*
+ * We don't yet know whether ops.dump_cpu() will produce output
+ * and we may want to skip the default CPU dump if it doesn't.
+ * Use a nested seq_buf to generate the standard dump so that we
+ * can decide whether to commit later.
+ */
+ avail = seq_buf_get_buf(s, &buf);
+ seq_buf_init(&ns, buf, avail);
+
+ dump_newline(&ns);
+ dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu ksync=%lu",
+ cpu, rq->scx.nr_running, rq->scx.flags,
+ rq->scx.cpu_released, rq->scx.ops_qseq,
+ rq->scx.kick_sync);
+ dump_line(&ns, " curr=%s[%d] class=%ps",
+ rq->curr->comm, rq->curr->pid,
+ rq->curr->sched_class);
+ if (!cpumask_empty(rq->scx.cpus_to_kick))
+ dump_line(&ns, " cpus_to_kick : %*pb",
+ cpumask_pr_args(rq->scx.cpus_to_kick));
+ if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle))
+ dump_line(&ns, " idle_to_kick : %*pb",
+ cpumask_pr_args(rq->scx.cpus_to_kick_if_idle));
+ if (!cpumask_empty(rq->scx.cpus_to_preempt))
+ dump_line(&ns, " cpus_to_preempt: %*pb",
+ cpumask_pr_args(rq->scx.cpus_to_preempt));
+ if (!cpumask_empty(rq->scx.cpus_to_wait))
+ dump_line(&ns, " cpus_to_wait : %*pb",
+ cpumask_pr_args(rq->scx.cpus_to_wait));
+ if (!cpumask_empty(rq->scx.cpus_to_sync))
+ dump_line(&ns, " cpus_to_sync : %*pb",
+ cpumask_pr_args(rq->scx.cpus_to_sync));
+
+ used = seq_buf_used(&ns);
+ if (SCX_HAS_OP(sch, dump_cpu)) {
+ ops_dump_init(&ns, " ");
+ SCX_CALL_OP(sch, dump_cpu, rq, dctx, scx_cpu_arg(cpu), idle);
+ ops_dump_exit();
+ }
+
+ /*
+ * If idle && nothing generated by ops.dump_cpu(), there's
+ * nothing interesting. Skip.
+ */
+ if (idle && used == seq_buf_used(&ns))
+ goto next;
+
+ /*
+ * $s may already have overflowed when $ns was created. If so,
+ * calling commit on it will trigger BUG.
+ */
+ if (avail) {
+ seq_buf_commit(s, seq_buf_used(&ns));
+ if (seq_buf_has_overflowed(&ns))
+ seq_buf_set_overflow(s);
+ }
+
+ if (rq->curr->sched_class == &ext_sched_class &&
+ (dump_all_tasks || scx_task_on_sched(sch, rq->curr)))
+ scx_dump_task(sch, s, dctx, rq, rq->curr, '*');
+
+ list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node)
+ if (dump_all_tasks || scx_task_on_sched(sch, p))
+ scx_dump_task(sch, s, dctx, rq, p, ' ');
+next:
+ rq_unlock_irqrestore(rq, &rf);
+}
+
/*
* Dump scheduler state. If @dump_all_tasks is true, dump all tasks regardless
* of which scheduler they belong to. If false, only dump tasks owned by @sch.
@@ -6358,7 +6653,6 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei,
};
struct seq_buf s;
struct scx_event_stats events;
- char *buf;
int cpu;
guard(raw_spinlock_irqsave)(&scx_dump_lock);
@@ -6379,8 +6673,13 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei,
if (ei->kind == SCX_EXIT_NONE) {
dump_line(&s, "Debug dump triggered by %s", ei->reason);
} else {
- dump_line(&s, "%s[%d] triggered exit kind %d:",
- current->comm, current->pid, ei->kind);
+ if (ei->exit_cpu >= 0)
+ dump_line(&s, "%s[%d] triggered exit kind %d on CPU %d:",
+ current->comm, current->pid, ei->kind,
+ ei->exit_cpu);
+ else
+ dump_line(&s, "%s[%d] triggered exit kind %d:",
+ current->comm, current->pid, ei->kind);
dump_line(&s, " %s (%s)", ei->reason, ei->msg);
dump_newline(&s);
dump_line(&s, "Backtrace:");
@@ -6397,88 +6696,15 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei,
dump_line(&s, "CPU states");
dump_line(&s, "----------");
+ /*
+ * Dump the exit CPU first so it isn't lost to dump truncation, then
+ * walk the rest in order, skipping the one already dumped.
+ */
+ if (ei->exit_cpu >= 0)
+ scx_dump_cpu(sch, &s, &dctx, ei->exit_cpu, dump_all_tasks);
for_each_possible_cpu(cpu) {
- struct rq *rq = cpu_rq(cpu);
- struct rq_flags rf;
- struct task_struct *p;
- struct seq_buf ns;
- size_t avail, used;
- bool idle;
-
- rq_lock_irqsave(rq, &rf);
-
- idle = list_empty(&rq->scx.runnable_list) &&
- rq->curr->sched_class == &idle_sched_class;
-
- if (idle && !SCX_HAS_OP(sch, dump_cpu))
- goto next;
-
- /*
- * We don't yet know whether ops.dump_cpu() will produce output
- * and we may want to skip the default CPU dump if it doesn't.
- * Use a nested seq_buf to generate the standard dump so that we
- * can decide whether to commit later.
- */
- avail = seq_buf_get_buf(&s, &buf);
- seq_buf_init(&ns, buf, avail);
-
- dump_newline(&ns);
- dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu ksync=%lu",
- cpu, rq->scx.nr_running, rq->scx.flags,
- rq->scx.cpu_released, rq->scx.ops_qseq,
- rq->scx.kick_sync);
- dump_line(&ns, " curr=%s[%d] class=%ps",
- rq->curr->comm, rq->curr->pid,
- rq->curr->sched_class);
- if (!cpumask_empty(rq->scx.cpus_to_kick))
- dump_line(&ns, " cpus_to_kick : %*pb",
- cpumask_pr_args(rq->scx.cpus_to_kick));
- if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle))
- dump_line(&ns, " idle_to_kick : %*pb",
- cpumask_pr_args(rq->scx.cpus_to_kick_if_idle));
- if (!cpumask_empty(rq->scx.cpus_to_preempt))
- dump_line(&ns, " cpus_to_preempt: %*pb",
- cpumask_pr_args(rq->scx.cpus_to_preempt));
- if (!cpumask_empty(rq->scx.cpus_to_wait))
- dump_line(&ns, " cpus_to_wait : %*pb",
- cpumask_pr_args(rq->scx.cpus_to_wait));
- if (!cpumask_empty(rq->scx.cpus_to_sync))
- dump_line(&ns, " cpus_to_sync : %*pb",
- cpumask_pr_args(rq->scx.cpus_to_sync));
-
- used = seq_buf_used(&ns);
- if (SCX_HAS_OP(sch, dump_cpu)) {
- ops_dump_init(&ns, " ");
- SCX_CALL_OP(sch, dump_cpu, rq, &dctx, cpu, idle);
- ops_dump_exit();
- }
-
- /*
- * If idle && nothing generated by ops.dump_cpu(), there's
- * nothing interesting. Skip.
- */
- if (idle && used == seq_buf_used(&ns))
- goto next;
-
- /*
- * $s may already have overflowed when $ns was created. If so,
- * calling commit on it will trigger BUG.
- */
- if (avail) {
- seq_buf_commit(&s, seq_buf_used(&ns));
- if (seq_buf_has_overflowed(&ns))
- seq_buf_set_overflow(&s);
- }
-
- if (rq->curr->sched_class == &ext_sched_class &&
- (dump_all_tasks || scx_task_on_sched(sch, rq->curr)))
- scx_dump_task(sch, &s, &dctx, rq, rq->curr, '*');
-
- list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node)
- if (dump_all_tasks || scx_task_on_sched(sch, p))
- scx_dump_task(sch, &s, &dctx, rq, p, ' ');
- next:
- rq_unlock_irqrestore(rq, &rf);
+ if (cpu != ei->exit_cpu)
+ scx_dump_cpu(sch, &s, &dctx, cpu, dump_all_tasks);
}
dump_newline(&s);
@@ -6516,9 +6742,9 @@ static void scx_disable_irq_workfn(struct irq_work *irq_work)
kthread_queue_work(sch->helper, &sch->disable_work);
}
-static bool scx_vexit(struct scx_sched *sch,
- enum scx_exit_kind kind, s64 exit_code,
- const char *fmt, va_list args)
+bool scx_vexit(struct scx_sched *sch,
+ enum scx_exit_kind kind, s64 exit_code, s32 exit_cpu,
+ const char *fmt, va_list args)
{
struct scx_exit_info *ei = sch->exit_info;
@@ -6540,6 +6766,7 @@ static bool scx_vexit(struct scx_sched *sch,
*/
ei->kind = kind;
ei->reason = scx_exit_reason(ei->kind);
+ ei->exit_cpu = exit_cpu;
irq_work_queue(&sch->disable_irq_work);
return true;
@@ -6597,13 +6824,32 @@ static struct scx_sched_pnode *alloc_pnode(struct scx_sched *sch, int node)
}
/*
+ * scx_enable() is offloaded to a dedicated system-wide RT kthread to avoid
+ * starvation. During the READY -> ENABLED task switching loop, the calling
+ * thread's sched_class gets switched from fair to ext. As fair has higher
+ * priority than ext, the calling thread can be indefinitely starved under
+ * fair-class saturation, leading to a system hang.
+ *
+ * scx_enable() queues @work on the helper kthread, flushes it and returns
+ * @ret. @ops and @ops_cid are two views of the kdata handed to the reg hook;
+ * @is_cid_type tells scx_alloc_and_add_sched() which view to copy into the
+ * new scx_sched.
+ */
+struct scx_enable_cmd {
+ struct kthread_work work; /* runs the root or sub enable workfn */
+ union {
+ struct sched_ext_ops *ops; /* cpu-form view */
+ struct sched_ext_ops_cid *ops_cid; /* cid-form view */
+ };
+ bool is_cid_type; /* true when @ops_cid is the valid view */
+ struct bpf_map *arena_map; /* arena ref to transfer to sch; NULLed once sch owns it */
+ int ret; /* result reported back to the scx_enable() caller */
+};
+
+/*
* Allocate and initialize a new scx_sched. @cgrp's reference is always
* consumed whether the function succeeds or fails.
*/
-static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
+static struct scx_sched *scx_alloc_and_add_sched(struct scx_enable_cmd *cmd,
struct cgroup *cgrp,
struct scx_sched *parent)
{
+ struct sched_ext_ops *ops = cmd->ops;
struct scx_sched *sch;
s32 level = parent ? parent->level + 1 : 0;
s32 node, cpu, ret, bypass_fail_cpu = nr_cpu_ids;
@@ -6695,7 +6941,18 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
ret = -ENOMEM;
goto err_free_lb_cpumask;
}
- sch->ops = *ops;
+ /*
+ * Copy ops through the right union view. For cid-form the source is
+ * struct sched_ext_ops_cid which lacks the trailing cpu_acquire/
+ * cpu_release; those stay zero from kzalloc.
+ */
+ if (cmd->is_cid_type) {
+ sch->ops_cid = *cmd->ops_cid;
+ sch->is_cid_type = true;
+ } else {
+ sch->ops = *cmd->ops;
+ }
+
rcu_assign_pointer(ops->priv, sch);
sch->kobj.kset = scx_kset;
@@ -6748,6 +7005,20 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
return ERR_PTR(ret);
}
#endif /* CONFIG_EXT_SUB_SCHED */
+
+ /*
+ * Consume the arena_map ref bpf_scx_reg_cid() took. Defer to here so
+ * earlier failure paths leave cmd->arena_map set and bpf_scx_reg_cid
+ * drops the ref. After this point, sch owns the ref and any cleanup
+ * runs through scx_sched_free_rcu_work() which puts it.
+ */
+ sch->arena_map = cmd->arena_map;
+ /* BPF arena is only available on MMU && 64BIT */
+#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
+ if (sch->arena_map)
+ sch->arena_kern_base = bpf_arena_map_kern_vm_start(sch->arena_map);
+#endif
+ cmd->arena_map = NULL;
return sch;
#ifdef CONFIG_EXT_SUB_SCHED
@@ -6819,6 +7090,17 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
}
/*
+ * SCX_OPS_TID_TO_TASK is enabled by the root scheduler. A sub-sched
+ * may set it to declare a dependency; reject if the root hasn't
+ * enabled it.
+ */
+ if ((ops->flags & SCX_OPS_TID_TO_TASK) && scx_parent(sch) &&
+ !(scx_root->ops.flags & SCX_OPS_TID_TO_TASK)) {
+ scx_error(sch, "SCX_OPS_TID_TO_TASK requires root scheduler to enable it");
+ return -EINVAL;
+ }
+
+ /*
* SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle
* selection policy to be enabled.
*/
@@ -6828,25 +7110,34 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
return -EINVAL;
}
- if (ops->cpu_acquire || ops->cpu_release)
+ /*
+ * cid-form's struct is shorter and doesn't include the cpu_acquire /
+ * cpu_release tail; reading those fields off a cid-form @ops would
+ * run past the BPF allocation. Skip for cid-form.
+ */
+ if (!sch->is_cid_type && (ops->cpu_acquire || ops->cpu_release))
pr_warn("ops->cpu_acquire/release() are deprecated, use sched_switch TP instead\n");
+ /*
+ * Sub-scheduler support is tied to the cid-form struct_ops. A sub-sched
+ * attaches through a cid-form-only interface (sub_attach/sub_detach),
+ * and a root that accepts sub-scheds must expose cid-form state to
+ * them. Reject cpu-form schedulers on either side.
+ */
+ if (!sch->is_cid_type) {
+ if (scx_parent(sch)) {
+ scx_error(sch, "sub-sched requires cid-form struct_ops");
+ return -EINVAL;
+ }
+ if (ops->sub_attach || ops->sub_detach) {
+ scx_error(sch, "sub_attach/sub_detach requires cid-form struct_ops");
+ return -EINVAL;
+ }
+ }
+
return 0;
}
-/*
- * scx_enable() is offloaded to a dedicated system-wide RT kthread to avoid
- * starvation. During the READY -> ENABLED task switching loop, the calling
- * thread's sched_class gets switched from fair to ext. As fair has higher
- * priority than ext, the calling thread can be indefinitely starved under
- * fair-class saturation, leading to a system hang.
- */
-struct scx_enable_cmd {
- struct kthread_work work;
- struct sched_ext_ops *ops;
- int ret;
-};
-
static void scx_root_enable_workfn(struct kthread_work *work)
{
struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work);
@@ -6881,15 +7172,24 @@ static void scx_root_enable_workfn(struct kthread_work *work)
if (ret)
goto err_unlock;
+ if (ops->flags & SCX_OPS_TID_TO_TASK) {
+ ret = rhashtable_init(&scx_tid_hash, &scx_tid_hash_params);
+ if (ret)
+ goto err_free_ksyncs;
+ }
+
#ifdef CONFIG_EXT_SUB_SCHED
cgroup_get(cgrp);
#endif
- sch = scx_alloc_and_add_sched(ops, cgrp, NULL);
+ sch = scx_alloc_and_add_sched(cmd, cgrp, NULL);
if (IS_ERR(sch)) {
ret = PTR_ERR(sch);
- goto err_free_ksyncs;
+ goto err_free_tid_hash;
}
+ if (sch->is_cid_type)
+ static_branch_enable(&__scx_is_cid_type);
+
/*
* Transition to ENABLING and clear exit info to arm the disable path.
* Failure triggers full disabling from here on.
@@ -6913,6 +7213,18 @@ static void scx_root_enable_workfn(struct kthread_work *work)
cpus_read_lock();
/*
+ * Build the cid mapping before publishing scx_root. The cid kfuncs
+ * dereference the cid arrays unconditionally once scx_prog_sched()
+ * returns non-NULL; the rcu_assign_pointer() below pairs with their
+ * rcu_dereference() to make the populated arrays visible.
+ */
+ ret = scx_cid_init(sch);
+ if (ret) {
+ cpus_read_unlock();
+ goto err_disable;
+ }
+
+ /*
* Make the scheduler instance visible. Must be inside cpus_read_lock().
* See handle_hotplug().
*/
@@ -6937,6 +7249,18 @@ static void scx_root_enable_workfn(struct kthread_work *work)
sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
}
+ ret = scx_arena_pool_init(sch);
+ if (ret) {
+ cpus_read_unlock();
+ goto err_disable;
+ }
+
+ ret = scx_set_cmask_scratch_alloc(sch);
+ if (ret) {
+ cpus_read_unlock();
+ goto err_disable;
+ }
+
for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
if (((void (**)(void))ops)[i])
set_bit(i, sch->has_op);
@@ -7003,6 +7327,10 @@ static void scx_root_enable_workfn(struct kthread_work *work)
WARN_ON_ONCE(scx_init_task_enabled);
scx_init_task_enabled = true;
+ /* flip under fork_rwsem; the iter below covers existing tasks */
+ if (ops->flags & SCX_OPS_TID_TO_TASK)
+ static_branch_enable(&__scx_tid_to_task_enabled);
+
/*
* Enable ops for every task. Fork is excluded by scx_fork_rwsem
* preventing new tasks from being added. No need to exclude tasks
@@ -7024,16 +7352,14 @@ static void scx_root_enable_workfn(struct kthread_work *work)
scx_task_iter_start(&sti, NULL);
while ((p = scx_task_iter_next_locked(&sti))) {
- struct rq_flags rf;
- struct rq *rq;
-
/*
- * @p may already be dead, have lost all its usages counts and
- * be waiting for RCU grace period before being freed. @p can't
- * be initialized for SCX in such cases and should be ignored.
+ * @p is in scx_tasks under scx_tasks_lock, and SCX_TASK_DEAD
+ * tasks are filtered by scx_task_iter_next_locked().
+ * sched_ext_dead() removes @p from scx_tasks under the same
+ * lock before put_task_struct_rcu_user() runs, so @p->usage
+ * is guaranteed > 0 here.
*/
- if (!tryget_task_struct(p))
- continue;
+ get_task_struct(p);
/*
* Set %INIT_BEGIN under the iter's rq lock so that a concurrent
@@ -7049,12 +7375,11 @@ static void scx_root_enable_workfn(struct kthread_work *work)
ret = __scx_init_task(sch, p, false);
- rq = task_rq_lock(p, &rf);
+ scx_task_iter_relock(&sti, p);
if (unlikely(ret)) {
if (scx_get_task_state(p) != SCX_TASK_DEAD)
scx_set_task_state(p, SCX_TASK_NONE);
- task_rq_unlock(rq, p, &rf);
scx_task_iter_stop(&sti);
scx_error(sch, "ops.init_task() failed (%d) for %s[%d]",
ret, p->comm, p->pid);
@@ -7075,7 +7400,14 @@ static void scx_root_enable_workfn(struct kthread_work *work)
scx_set_task_state(p, SCX_TASK_READY);
}
- task_rq_unlock(rq, p, &rf);
+ /*
+ * Insert into the tid hash. scx_tasks_lock is held by the iter;
+ * list_empty() guards against sched_ext_dead() having taken @p
+ * off the list while init ran unlocked.
+ */
+ if (scx_tid_to_task_enabled() && !list_empty(&p->scx.tasks_node))
+ scx_tid_hash_insert(p);
+
put_task_struct(p);
}
scx_task_iter_stop(&sti);
@@ -7154,6 +7486,9 @@ static void scx_root_enable_workfn(struct kthread_work *work)
cmd->ret = 0;
return;
+err_free_tid_hash:
+ if (ops->flags & SCX_OPS_TID_TO_TASK)
+ rhashtable_free_and_destroy(&scx_tid_hash, NULL, NULL);
err_free_ksyncs:
free_kick_syncs();
err_unlock:
@@ -7261,7 +7596,7 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
raw_spin_unlock_irq(&scx_sched_lock);
/* scx_alloc_and_add_sched() consumes @cgrp whether it succeeds or not */
- sch = scx_alloc_and_add_sched(ops, cgrp, parent);
+ sch = scx_alloc_and_add_sched(cmd, cgrp, parent);
kobject_put(&parent->kobj);
if (IS_ERR(sch)) {
ret = PTR_ERR(sch);
@@ -7288,6 +7623,14 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
}
+ ret = scx_arena_pool_init(sch);
+ if (ret)
+ goto err_disable;
+
+ ret = scx_set_cmask_scratch_alloc(sch);
+ if (ret)
+ goto err_disable;
+
if (validate_ops(sch, ops))
goto err_disable;
@@ -7350,9 +7693,8 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
if (p->scx.flags & SCX_TASK_SUB_INIT)
continue;
- /* see scx_root_enable() */
- if (!tryget_task_struct(p))
- continue;
+ /* @p is pinned by the iter; see scx_sub_disable() */
+ get_task_struct(p);
if (!assert_task_ready_or_enabled(p)) {
ret = -EINVAL;
@@ -7515,11 +7857,10 @@ static s32 __init scx_cgroup_lifetime_notifier_init(void)
core_initcall(scx_cgroup_lifetime_notifier_init);
#endif /* CONFIG_EXT_SUB_SCHED */
-static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
+static s32 scx_enable(struct scx_enable_cmd *cmd, struct bpf_link *link)
{
static struct kthread_worker *helper;
static DEFINE_MUTEX(helper_mutex);
- struct scx_enable_cmd cmd;
if (housekeeping_enabled(HK_TYPE_DOMAIN_BOOT)) {
pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
@@ -7542,16 +7883,15 @@ static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
}
#ifdef CONFIG_EXT_SUB_SCHED
- if (ops->sub_cgroup_id > 1)
- kthread_init_work(&cmd.work, scx_sub_enable_workfn);
+ if (cmd->ops->sub_cgroup_id > 1)
+ kthread_init_work(&cmd->work, scx_sub_enable_workfn);
else
#endif /* CONFIG_EXT_SUB_SCHED */
- kthread_init_work(&cmd.work, scx_root_enable_workfn);
- cmd.ops = ops;
+ kthread_init_work(&cmd->work, scx_root_enable_workfn);
- kthread_queue_work(READ_ONCE(helper), &cmd.work);
- kthread_flush_work(&cmd.work);
- return cmd.ret;
+ kthread_queue_work(READ_ONCE(helper), &cmd->work);
+ kthread_flush_work(&cmd->work);
+ return cmd->ret;
}
@@ -7723,7 +8063,62 @@ static int bpf_scx_check_member(const struct btf_type *t,
static int bpf_scx_reg(void *kdata, struct bpf_link *link)
{
- return scx_enable(kdata, link);
+ struct scx_enable_cmd cmd = { .ops = kdata };
+
+ return scx_enable(&cmd, link);
+}
+
+struct scx_arena_scan {
+ struct bpf_map *arena; /* arena found so far; every later prog must match it */
+ int err; /* set to -EINVAL when two progs use different arenas */
+};
+
+/*
+ * The verifier enforces one arena per BPF program, so each struct_ops
+ * member prog contributes at most one arena via bpf_prog_arena().
+ * Require all non-NULL contributions to match.
+ *
+ * Used as the bpf_struct_ops_for_each_prog() callback with @data pointing at
+ * a struct scx_arena_scan. Returns 0 when @prog has no arena or its arena was
+ * recorded / matches the recorded one. On a mismatch, sets ->err to -EINVAL
+ * and returns 1.
+ * NOTE(review): the non-zero return presumably stops the walk - confirm
+ * against bpf_struct_ops_for_each_prog().
+ */
+static int scx_arena_scan_prog(struct bpf_prog *prog, void *data)
+{
+ struct scx_arena_scan *s = data;
+ struct bpf_map *arena = NULL; /* stays NULL when the lookup below is compiled out */
+
+ /* arena.o, which defines these, is built only on MMU && 64BIT */
+#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
+ arena = bpf_prog_arena(prog);
+#endif
+ if (!arena)
+ return 0;
+ if (s->arena && s->arena != arena) {
+ s->err = -EINVAL;
+ return 1;
+ }
+ s->arena = arena;
+ return 0;
+}
+
+static int bpf_scx_reg_cid(void *kdata, struct bpf_link *link) /* .reg hook of the cid-form struct_ops */
+{
+ struct scx_enable_cmd cmd = { .ops_cid = kdata, .is_cid_type = true };
+ struct scx_arena_scan scan = {};
+ int ret;
+
+ bpf_struct_ops_for_each_prog(kdata, scx_arena_scan_prog, &scan); /* find the arena the member progs use */
+ if (scan.err) {
+ pr_err("sched_ext: cid-form scheduler uses multiple arena maps\n");
+ return scan.err;
+ }
+ if (!scan.arena) {
+ pr_err("sched_ext: cid-form scheduler must use a BPF arena map\n");
+ return -EINVAL;
+ }
+
+ bpf_map_inc(scan.arena); /* this ref travels to the scheduler in cmd.arena_map */
+ cmd.arena_map = scan.arena;
+ ret = scx_enable(&cmd, link);
+ if (cmd.arena_map) /* not consumed by scx_alloc_and_add_sched() */
+ bpf_map_put(cmd.arena_map);
+ return ret;
}
static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
@@ -7857,6 +8252,73 @@ static struct bpf_struct_ops bpf_sched_ext_ops = {
.cfi_stubs = &__bpf_ops_sched_ext_ops
};
+/*
+ * cid-form cfi stubs. Stubs whose signatures match the cpu-form (param types
+ * identical, only param names differ across structs) are reused; only
+ * set_cmask needs a fresh stub since the second argument type differs.
+ *
+ * The renamed members select_cid, cid_online, cid_offline and dump_cid are
+ * served by the cpu-form select_cpu, cpu_online, cpu_offline and dump_cpu
+ * stubs respectively.
+ */
+static void sched_ext_ops_cid__set_cmask(struct task_struct *p,
+ const struct scx_cmask *cmask) {}
+
+static struct sched_ext_ops_cid __bpf_ops_sched_ext_ops_cid = {
+ .select_cid = sched_ext_ops__select_cpu, /* cpu-form stub reused */
+ .enqueue = sched_ext_ops__enqueue,
+ .dequeue = sched_ext_ops__dequeue,
+ .dispatch = sched_ext_ops__dispatch,
+ .tick = sched_ext_ops__tick,
+ .runnable = sched_ext_ops__runnable,
+ .running = sched_ext_ops__running,
+ .stopping = sched_ext_ops__stopping,
+ .quiescent = sched_ext_ops__quiescent,
+ .yield = sched_ext_ops__yield,
+ .core_sched_before = sched_ext_ops__core_sched_before,
+ .set_weight = sched_ext_ops__set_weight,
+ .set_cmask = sched_ext_ops_cid__set_cmask, /* the one cid-specific stub */
+ .update_idle = sched_ext_ops__update_idle,
+ .init_task = sched_ext_ops__init_task,
+ .exit_task = sched_ext_ops__exit_task,
+ .enable = sched_ext_ops__enable,
+ .disable = sched_ext_ops__disable,
+#ifdef CONFIG_EXT_GROUP_SCHED
+ .cgroup_init = sched_ext_ops__cgroup_init,
+ .cgroup_exit = sched_ext_ops__cgroup_exit,
+ .cgroup_prep_move = sched_ext_ops__cgroup_prep_move,
+ .cgroup_move = sched_ext_ops__cgroup_move,
+ .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move,
+ .cgroup_set_weight = sched_ext_ops__cgroup_set_weight,
+ .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth,
+ .cgroup_set_idle = sched_ext_ops__cgroup_set_idle,
+#endif
+ .sub_attach = sched_ext_ops__sub_attach,
+ .sub_detach = sched_ext_ops__sub_detach,
+ .cid_online = sched_ext_ops__cpu_online, /* cpu-form stub reused */
+ .cid_offline = sched_ext_ops__cpu_offline, /* cpu-form stub reused */
+ .init = sched_ext_ops__init,
+ .exit = sched_ext_ops__exit,
+ .dump = sched_ext_ops__dump,
+ .dump_cid = sched_ext_ops__dump_cpu, /* cpu-form stub reused */
+ .dump_task = sched_ext_ops__dump_task,
+};
+
+/*
+ * The cid-form struct_ops shares its bpf_struct_ops hooks with the cpu form,
+ * with the exception of .reg: bpf_scx_reg_cid() additionally locates the
+ * arena map used by the member progs before calling scx_enable().
+ * init_member, check_member, unreg, etc. process kdata as the byte block
+ * verified to match by the BUILD_BUG_ON checks in scx_init().
+ */
+static struct bpf_struct_ops bpf_sched_ext_ops_cid = {
+ .verifier_ops = &bpf_scx_verifier_ops,
+ .reg = bpf_scx_reg_cid,
+ .unreg = bpf_scx_unreg,
+ .check_member = bpf_scx_check_member,
+ .init_member = bpf_scx_init_member,
+ .init = bpf_scx_init,
+ .update = bpf_scx_update,
+ .validate = bpf_scx_validate,
+ .name = "sched_ext_ops_cid",
+ .owner = THIS_MODULE,
+ .cfi_stubs = &__bpf_ops_sched_ext_ops_cid
+};
+
/********************************************************************************
* System integration and init.
@@ -7866,13 +8328,11 @@ static void sysrq_handle_sched_ext_reset(u8 key)
{
struct scx_sched *sch;
- rcu_read_lock();
sch = rcu_dereference(scx_root);
if (likely(sch))
scx_disable(sch, SCX_EXIT_SYSRQ);
else
pr_info("sched_ext: BPF schedulers not loaded\n");
- rcu_read_unlock();
}
static const struct sysrq_key_op sysrq_sched_ext_reset_op = {
@@ -7884,7 +8344,11 @@ static const struct sysrq_key_op sysrq_sched_ext_reset_op = {
static void sysrq_handle_sched_ext_dump(u8 key)
{
- struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" };
+ struct scx_exit_info ei = {
+ .kind = SCX_EXIT_NONE,
+ .exit_cpu = -1,
+ .reason = "SysRq-D",
+ };
struct scx_sched *sch;
list_for_each_entry_rcu(sch, &scx_sched_all, all)
@@ -8954,9 +9418,6 @@ static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags)
struct rq *this_rq;
unsigned long irq_flags;
- if (!ops_cpu_valid(sch, cpu, NULL))
- return;
-
local_irq_save(irq_flags);
this_rq = this_rq();
@@ -9019,11 +9480,36 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags, const struct bpf_prog_aux
guard(rcu)();
sch = scx_prog_sched(aux);
- if (likely(sch))
+ if (likely(sch) && scx_cpu_valid(sch, cpu, NULL))
scx_kick_cpu(sch, cpu, flags);
}
/**
+ * scx_bpf_kick_cid - Trigger reschedule on the CPU mapped to @cid
+ * @cid: cid to kick
+ * @flags: %SCX_KICK_* flags
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * cid-addressed equivalent of scx_bpf_kick_cpu(). Return 0 on success,
+ * -errno otherwise: -ENODEV when scx_prog_sched() finds no scheduler for
+ * @aux, or the negative value returned by scx_cid_to_cpu() when @cid does not
+ * translate to a CPU.
+ */
+__bpf_kfunc s32 scx_bpf_kick_cid(s32 cid, u64 flags, const struct bpf_prog_aux *aux)
+{
+ struct scx_sched *sch;
+ s32 cpu;
+
+ guard(rcu)();
+ sch = scx_prog_sched(aux);
+ if (unlikely(!sch))
+ return -ENODEV;
+ cpu = scx_cid_to_cpu(sch, cid);
+ if (cpu < 0)
+ return cpu;
+ scx_kick_cpu(sch, cpu, flags); /* cpu comes from the cid lookup above */
+ return 0;
+}
+
+/**
* scx_bpf_dsq_nr_queued - Return the number of queued tasks
* @dsq_id: id of the DSQ
* @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
@@ -9049,9 +9535,9 @@ __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id, const struct bpf_prog_aux *aux
ret = READ_ONCE(this_rq()->scx.local_dsq.nr);
goto out;
} else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
- s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
+ s32 cpu = scx_cpu_ret(sch, dsq_id & SCX_DSQ_LOCAL_CPU_MASK);
- if (ops_cpu_valid(sch, cpu, NULL)) {
+ if (scx_cpu_valid(sch, cpu, NULL)) {
ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr);
goto out;
}
@@ -9269,6 +9755,7 @@ __bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux)
__bpf_kfunc_end_defs();
+__printf(5, 0)
static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf,
size_t line_size, char *fmt, unsigned long long *data,
u32 data__sz)
@@ -9306,6 +9793,7 @@ static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf,
return ret;
}
+__printf(3, 0)
static s32 bstr_format(struct scx_sched *sch, struct scx_bstr_buf *buf,
char *fmt, unsigned long long *data, u32 data__sz)
{
@@ -9326,6 +9814,7 @@ __bpf_kfunc_start_defs();
* Indicate that the BPF scheduler wants to exit gracefully, and initiate ops
* disabling.
*/
+__printf(2, 0)
__bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
unsigned long long *data, u32 data__sz,
const struct bpf_prog_aux *aux)
@@ -9351,6 +9840,7 @@ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
* Indicate that the BPF scheduler encountered a fatal error and initiate ops
* disabling.
*/
+__printf(1, 0)
__bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
u32 data__sz, const struct bpf_prog_aux *aux)
{
@@ -9378,6 +9868,7 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
* The extra dump may be multiple lines. A single line may be split over
* multiple calls. The last line is automatically terminated.
*/
+__printf(1, 0)
__bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
u32 data__sz, const struct bpf_prog_aux *aux)
{
@@ -9440,13 +9931,36 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu, const struct bpf_prog_aux *aux)
guard(rcu)();
sch = scx_prog_sched(aux);
- if (likely(sch) && ops_cpu_valid(sch, cpu, NULL))
+ if (likely(sch) && scx_cpu_valid(sch, cpu, NULL))
return arch_scale_cpu_capacity(cpu);
else
return SCX_CPUPERF_ONE;
}
/**
+ * scx_bpf_cidperf_cap - Query the maximum relative capacity of the CPU at @cid
+ * @cid: cid of the CPU to query
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * cid-addressed equivalent of scx_bpf_cpuperf_cap(). Returns
+ * arch_scale_cpu_capacity() of the translated CPU, or %SCX_CPUPERF_ONE when
+ * no scheduler is found for @aux or scx_cid_to_cpu() fails for @cid.
+ */
+__bpf_kfunc u32 scx_bpf_cidperf_cap(s32 cid, const struct bpf_prog_aux *aux)
+{
+ struct scx_sched *sch;
+ s32 cpu;
+
+ guard(rcu)();
+
+ sch = scx_prog_sched(aux);
+ if (unlikely(!sch))
+ return SCX_CPUPERF_ONE;
+ cpu = scx_cid_to_cpu(sch, cid);
+ if (cpu < 0)
+ return SCX_CPUPERF_ONE;
+ return arch_scale_cpu_capacity(cpu);
+}
+
+/**
* scx_bpf_cpuperf_cur - Query the current relative performance of a CPU
* @cpu: CPU of interest
* @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
@@ -9468,13 +9982,36 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu, const struct bpf_prog_aux *aux)
guard(rcu)();
sch = scx_prog_sched(aux);
- if (likely(sch) && ops_cpu_valid(sch, cpu, NULL))
+ if (likely(sch) && scx_cpu_valid(sch, cpu, NULL))
return arch_scale_freq_capacity(cpu);
else
return SCX_CPUPERF_ONE;
}
/**
+ * scx_bpf_cidperf_cur - Query the current performance of the CPU at @cid
+ * @cid: cid of the CPU to query
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * cid-addressed equivalent of scx_bpf_cpuperf_cur(). Returns
+ * arch_scale_freq_capacity() of the translated CPU, or %SCX_CPUPERF_ONE when
+ * no scheduler is found for @aux or scx_cid_to_cpu() fails for @cid.
+ */
+__bpf_kfunc u32 scx_bpf_cidperf_cur(s32 cid, const struct bpf_prog_aux *aux)
+{
+ struct scx_sched *sch;
+ s32 cpu;
+
+ guard(rcu)();
+
+ sch = scx_prog_sched(aux);
+ if (unlikely(!sch))
+ return SCX_CPUPERF_ONE;
+ cpu = scx_cid_to_cpu(sch, cid);
+ if (cpu < 0)
+ return SCX_CPUPERF_ONE;
+ return arch_scale_freq_capacity(cpu);
+}
+
+/**
* scx_bpf_cpuperf_set - Set the relative performance target of a CPU
* @cpu: CPU of interest
* @perf: target performance level [0, %SCX_CPUPERF_ONE]
@@ -9504,7 +10041,7 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf, const struct bpf_prog_au
return;
}
- if (ops_cpu_valid(sch, cpu, NULL)) {
+ if (scx_cpu_valid(sch, cpu, NULL)) {
struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq();
struct rq_flags rf;
@@ -9535,6 +10072,31 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf, const struct bpf_prog_au
}
/**
+ * scx_bpf_cidperf_set - Set the performance target of the CPU at @cid
+ * @cid: cid of the CPU to target
+ * @perf: target performance level [0, %SCX_CPUPERF_ONE]
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * cid-addressed equivalent of scx_bpf_cpuperf_set(). Translates @cid and
+ * forwards to scx_bpf_cpuperf_set() with the resulting CPU. Returns without
+ * doing anything when no scheduler is found for @aux or scx_cid_to_cpu()
+ * fails for @cid.
+ */
+__bpf_kfunc void scx_bpf_cidperf_set(s32 cid, u32 perf,
+ const struct bpf_prog_aux *aux)
+{
+ struct scx_sched *sch;
+ s32 cpu;
+
+ guard(rcu)();
+
+ sch = scx_prog_sched(aux);
+ if (unlikely(!sch))
+ return;
+ cpu = scx_cid_to_cpu(sch, cid);
+ if (cpu < 0)
+ return;
+ scx_bpf_cpuperf_set(cpu, perf, aux); /* cpu-form kfunc does the actual work */
+}
+
+/**
* scx_bpf_nr_node_ids - Return the number of possible node IDs
*
* All valid node IDs in the system are smaller than the returned value.
@@ -9555,6 +10117,47 @@ __bpf_kfunc u32 scx_bpf_nr_cpu_ids(void)
}
/**
+ * scx_bpf_nr_cids - Return the size of the cid space
+ *
+ * Equals num_possible_cpus(). All valid cids are in [0, return value).
+ * Takes no arguments and performs no scheduler lookup.
+ */
+__bpf_kfunc u32 scx_bpf_nr_cids(void)
+{
+ return num_possible_cpus();
+}
+
+/**
+ * scx_bpf_nr_online_cids - Return current count of online CPUs in cid space
+ *
+ * Returns num_online_cpus() as read at call time. The standard model restarts
+ * the scheduler on hotplug, which lets schedulers treat [0, nr_online_cids)
+ * as the online range. Schedulers that prefer to handle hotplug without a
+ * restart should install a custom mapping via scx_bpf_cid_override() and
+ * track onlining through the ops.cid_online / ops.cid_offline callbacks.
+ */
+__bpf_kfunc u32 scx_bpf_nr_online_cids(void)
+{
+ return num_online_cpus();
+}
+
+/**
+ * scx_bpf_this_cid - Return the cid of the CPU this program is running on
+ *
+ * cid-addressed equivalent of bpf_get_smp_processor_id() for scx programs.
+ * The current cpu is trivially valid, so this is just a table lookup. Return
+ * -EINVAL if called from a non-SCX program before any scheduler has ever
+ * been enabled (the cid table is still unallocated at that point).
+ */
+__bpf_kfunc s32 scx_bpf_this_cid(void)
+{
+ s16 *tbl = READ_ONCE(scx_cpu_to_cid_tbl); /* NULL while the table is unallocated */
+
+ if (!tbl)
+ return -EINVAL;
+ return tbl[raw_smp_processor_id()]; /* s16 entry widened to the s32 return */
+}
+
+/**
* scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void)
@@ -9603,6 +10206,23 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p)
}
/**
+ * scx_bpf_task_cid - cid a task is currently associated with
+ * @p: task of interest
+ *
+ * cid-addressed equivalent of scx_bpf_task_cpu(). task_cpu(p) is always a
+ * valid cpu, so this is just a table lookup. Return -EINVAL if called from
+ * a non-SCX program before any scheduler has ever been enabled.
+ */
+__bpf_kfunc s32 scx_bpf_task_cid(const struct task_struct *p)
+{
+ s16 *tbl = READ_ONCE(scx_cpu_to_cid_tbl); /* NULL while the table is unallocated */
+
+ if (!tbl)
+ return -EINVAL;
+ return tbl[task_cpu(p)]; /* s16 entry widened to the s32 return */
+}
+
+/**
* scx_bpf_cpu_rq - Fetch the rq of a CPU
* @cpu: CPU of the rq
* @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
@@ -9617,7 +10237,7 @@ __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu, const struct bpf_prog_aux *aux)
if (unlikely(!sch))
return NULL;
- if (!ops_cpu_valid(sch, cpu, NULL))
+ if (!scx_cpu_valid(sch, cpu, NULL))
return NULL;
if (!sch->warned_deprecated_rq) {
@@ -9674,13 +10294,65 @@ __bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu, const struct bpf_prog_
if (unlikely(!sch))
return NULL;
- if (!ops_cpu_valid(sch, cpu, NULL))
+ if (!scx_cpu_valid(sch, cpu, NULL))
return NULL;
return rcu_dereference(cpu_rq(cpu)->curr);
}
/**
+ * scx_bpf_cid_curr - Return the curr task on the CPU at @cid
+ * @cid: cid of interest
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * cid-addressed equivalent of scx_bpf_cpu_curr(). Registered as
+ * KF_RCU_PROTECTED | KF_RET_NULL: callers must be inside an RCU read-side
+ * section and must handle a NULL return, which is produced when no scheduler
+ * is found for @aux or scx_cid_to_cpu() fails for @cid.
+ */
+__bpf_kfunc struct task_struct *scx_bpf_cid_curr(s32 cid, const struct bpf_prog_aux *aux)
+{
+ struct scx_sched *sch;
+ s32 cpu;
+
+ guard(rcu)();
+
+ sch = scx_prog_sched(aux);
+ if (unlikely(!sch))
+ return NULL;
+ cpu = scx_cid_to_cpu(sch, cid);
+ if (cpu < 0)
+ return NULL;
+ return rcu_dereference(cpu_rq(cpu)->curr);
+}
+
+/**
+ * scx_bpf_tid_to_task - Look up a task by its scx tid
+ * @tid: task ID previously read from p->scx.tid
+ *
+ * Returns the task with the given tid, or NULL if no such task exists. The
+ * returned pointer is valid until the end of the current RCU read section
+ * (KF_RCU_PROTECTED). Requires SCX_OPS_TID_TO_TASK to be set on the root
+ * scheduler; otherwise an error is raised and NULL returned. The error is
+ * reported through scx_error() on scx_root; if there is no root scheduler
+ * the call only returns NULL.
+ */
+__bpf_kfunc struct task_struct *scx_bpf_tid_to_task(u64 tid)
+{
+ struct sched_ext_entity *scx;
+
+ if (!scx_tid_to_task_enabled()) {
+ struct scx_sched *sch = rcu_dereference(scx_root);
+
+ if (sch)
+ scx_error(sch, "scx_bpf_tid_to_task() called without SCX_OPS_TID_TO_TASK");
+ return NULL;
+ }
+
+ scx = rhashtable_lookup(&scx_tid_hash, &tid, scx_tid_hash_params); /* hash stores the embedded sched_ext_entity */
+ if (!scx)
+ return NULL;
+
+ return container_of(scx, struct task_struct, scx); /* map the entity back to its task */
+}
+
+/**
* scx_bpf_now - Returns a high-performance monotonically non-decreasing
* clock for the current CPU. The clock returned is in nanoseconds.
*
@@ -9839,6 +10511,7 @@ BTF_KFUNCS_START(scx_kfunc_ids_any)
BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_IMPLICIT_ARGS | KF_RCU);
BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_IMPLICIT_ARGS | KF_RCU);
BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_kick_cid, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, scx_bpf_destroy_dsq, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_IMPLICIT_ARGS | KF_RCU_PROTECTED | KF_RET_NULL)
@@ -9853,16 +10526,25 @@ BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_set, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_cidperf_cap, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_cidperf_cur, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_cidperf_set, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, scx_bpf_nr_node_ids)
BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
+BTF_ID_FLAGS(func, scx_bpf_nr_cids)
+BTF_ID_FLAGS(func, scx_bpf_nr_online_cids)
+BTF_ID_FLAGS(func, scx_bpf_this_cid)
BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_task_cid, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_cpu_rq, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_IMPLICIT_ARGS | KF_RET_NULL)
BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, scx_bpf_cid_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, scx_bpf_tid_to_task, KF_RET_NULL | KF_RCU_PROTECTED)
BTF_ID_FLAGS(func, scx_bpf_now)
BTF_ID_FLAGS(func, scx_bpf_events)
#ifdef CONFIG_CGROUP_SCHED
@@ -9877,6 +10559,47 @@ static const struct btf_kfunc_id_set scx_kfunc_set_any = {
};
/*
+ * cpu-form kfuncs that are forbidden from cid-form schedulers
+ * (bpf_sched_ext_ops_cid). Programs targeting the cid struct_ops type must
+ * use the cid-form alternative (cid/cmask kfuncs).
+ *
+ * Membership overlaps with scx_kfunc_ids_{any,idle,select_cpu}; the filter
+ * tests this set independently and rejects matches before the per-op
+ * allow-list check runs.
+ *
+ * pahole/resolve_btfids scans every BTF_ID_FLAGS() at build time and
+ * intersects flags across duplicate entries, so each entry must carry the
+ * same flags as the kfunc's primary declaration; otherwise the flags get
+ * dropped globally.
+ */
+BTF_KFUNCS_START(scx_kfunc_ids_cpu_only)
+BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_cpu_rq, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, scx_bpf_cpu_node, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_set, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_IMPLICIT_ARGS | KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_IMPLICIT_ARGS | KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_IMPLICIT_ARGS | KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_IMPLICIT_ARGS | KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE)
+BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_KFUNCS_END(scx_kfunc_ids_cpu_only)
+
+/*
* Per-op kfunc allow flags. Each bit corresponds to a context-sensitive kfunc
* group; an op may permit zero or more groups, with the union expressed in
* scx_kf_allow_flags[]. The verifier-time filter (scx_kfunc_context_filter())
@@ -9885,10 +10608,11 @@ static const struct btf_kfunc_id_set scx_kfunc_set_any = {
*/
enum scx_kf_allow_flags {
SCX_KF_ALLOW_UNLOCKED = 1 << 0,
- SCX_KF_ALLOW_CPU_RELEASE = 1 << 1,
- SCX_KF_ALLOW_DISPATCH = 1 << 2,
- SCX_KF_ALLOW_ENQUEUE = 1 << 3,
- SCX_KF_ALLOW_SELECT_CPU = 1 << 4,
+ SCX_KF_ALLOW_INIT = 1 << 1,
+ SCX_KF_ALLOW_CPU_RELEASE = 1 << 2,
+ SCX_KF_ALLOW_DISPATCH = 1 << 3,
+ SCX_KF_ALLOW_ENQUEUE = 1 << 4,
+ SCX_KF_ALLOW_SELECT_CPU = 1 << 5,
};
/*
@@ -9916,7 +10640,7 @@ static const u32 scx_kf_allow_flags[] = {
[SCX_OP_IDX(sub_detach)] = SCX_KF_ALLOW_UNLOCKED,
[SCX_OP_IDX(cpu_online)] = SCX_KF_ALLOW_UNLOCKED,
[SCX_OP_IDX(cpu_offline)] = SCX_KF_ALLOW_UNLOCKED,
- [SCX_OP_IDX(init)] = SCX_KF_ALLOW_UNLOCKED,
+ [SCX_OP_IDX(init)] = SCX_KF_ALLOW_UNLOCKED | SCX_KF_ALLOW_INIT,
[SCX_OP_IDX(exit)] = SCX_KF_ALLOW_UNLOCKED,
};
@@ -9931,16 +10655,18 @@ static const u32 scx_kf_allow_flags[] = {
int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id)
{
bool in_unlocked = btf_id_set8_contains(&scx_kfunc_ids_unlocked, kfunc_id);
+ bool in_init = btf_id_set8_contains(&scx_kfunc_ids_init, kfunc_id);
bool in_select_cpu = btf_id_set8_contains(&scx_kfunc_ids_select_cpu, kfunc_id);
bool in_enqueue = btf_id_set8_contains(&scx_kfunc_ids_enqueue_dispatch, kfunc_id);
bool in_dispatch = btf_id_set8_contains(&scx_kfunc_ids_dispatch, kfunc_id);
bool in_cpu_release = btf_id_set8_contains(&scx_kfunc_ids_cpu_release, kfunc_id);
bool in_idle = btf_id_set8_contains(&scx_kfunc_ids_idle, kfunc_id);
bool in_any = btf_id_set8_contains(&scx_kfunc_ids_any, kfunc_id);
+ bool in_cpu_only = btf_id_set8_contains(&scx_kfunc_ids_cpu_only, kfunc_id);
u32 moff, flags;
/* Not an SCX kfunc - allow. */
- if (!(in_unlocked || in_select_cpu || in_enqueue || in_dispatch ||
+ if (!(in_unlocked || in_init || in_select_cpu || in_enqueue || in_dispatch ||
in_cpu_release || in_idle || in_any))
return 0;
@@ -9963,8 +10689,24 @@ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id)
/*
* Non-SCX struct_ops: SCX kfuncs are not permitted.
+ *
+ * Both bpf_sched_ext_ops (cpu-form) and bpf_sched_ext_ops_cid
+ * (cid-form) are valid SCX struct_ops. Member offsets match between
+ * the two (verified by BUILD_BUG_ON in scx_init()), so the shared
+ * scx_kf_allow_flags[] table indexed by SCX_MOFF_IDX(moff) applies to
+ * both.
+ */
+ if (prog->aux->st_ops != &bpf_sched_ext_ops &&
+ prog->aux->st_ops != &bpf_sched_ext_ops_cid)
+ return -EACCES;
+
+ /*
+ * cid-form schedulers must use cid/cmask kfuncs. cid and cpu are both
+ * small s32s and trivially confused, so cpu-only kfuncs are rejected at
+ * load time. The reverse (cpu-form calling cid-form kfuncs) is
+ * intentionally permissive to ease gradual cpumask -> cid migration.
*/
- if (prog->aux->st_ops != &bpf_sched_ext_ops)
+ if (prog->aux->st_ops == &bpf_sched_ext_ops_cid && in_cpu_only)
return -EACCES;
/* SCX struct_ops: check the per-op allow list. */
@@ -9976,6 +10718,8 @@ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id)
if ((flags & SCX_KF_ALLOW_UNLOCKED) && in_unlocked)
return 0;
+ if ((flags & SCX_KF_ALLOW_INIT) && in_init)
+ return 0;
if ((flags & SCX_KF_ALLOW_CPU_RELEASE) && in_cpu_release)
return 0;
if ((flags & SCX_KF_ALLOW_DISPATCH) && in_dispatch)
@@ -9993,6 +10737,73 @@ static int __init scx_init(void)
int ret;
/*
+ * sched_ext_ops_cid mirrors sched_ext_ops up to and including @priv.
+ * Both bpf_scx_init_member() and bpf_scx_check_member() use offsets
+ * from struct sched_ext_ops; sched_ext_ops_cid relies on those offsets
+ * matching for the shared fields. Catch any drift at build time.
+ */
+#define CID_OFFSET_MATCH(cpu_field, cid_field) \
+ BUILD_BUG_ON(offsetof(struct sched_ext_ops, cpu_field) != \
+ offsetof(struct sched_ext_ops_cid, cid_field))
+ /* data fields used by bpf_scx_init_member() */
+ CID_OFFSET_MATCH(dispatch_max_batch, dispatch_max_batch);
+ CID_OFFSET_MATCH(flags, flags);
+ CID_OFFSET_MATCH(name, name);
+ CID_OFFSET_MATCH(timeout_ms, timeout_ms);
+ CID_OFFSET_MATCH(exit_dump_len, exit_dump_len);
+ CID_OFFSET_MATCH(hotplug_seq, hotplug_seq);
+ CID_OFFSET_MATCH(sub_cgroup_id, sub_cgroup_id);
+ /* shared callbacks: the union view requires byte-for-byte offset match */
+ CID_OFFSET_MATCH(enqueue, enqueue);
+ CID_OFFSET_MATCH(dequeue, dequeue);
+ CID_OFFSET_MATCH(dispatch, dispatch);
+ CID_OFFSET_MATCH(tick, tick);
+ CID_OFFSET_MATCH(runnable, runnable);
+ CID_OFFSET_MATCH(running, running);
+ CID_OFFSET_MATCH(stopping, stopping);
+ CID_OFFSET_MATCH(quiescent, quiescent);
+ CID_OFFSET_MATCH(yield, yield);
+ CID_OFFSET_MATCH(core_sched_before, core_sched_before);
+ CID_OFFSET_MATCH(set_weight, set_weight);
+ CID_OFFSET_MATCH(update_idle, update_idle);
+ CID_OFFSET_MATCH(init_task, init_task);
+ CID_OFFSET_MATCH(exit_task, exit_task);
+ CID_OFFSET_MATCH(enable, enable);
+ CID_OFFSET_MATCH(disable, disable);
+ CID_OFFSET_MATCH(dump, dump);
+ CID_OFFSET_MATCH(dump_task, dump_task);
+ CID_OFFSET_MATCH(sub_attach, sub_attach);
+ CID_OFFSET_MATCH(sub_detach, sub_detach);
+ CID_OFFSET_MATCH(init, init);
+ CID_OFFSET_MATCH(exit, exit);
+#ifdef CONFIG_EXT_GROUP_SCHED
+ CID_OFFSET_MATCH(cgroup_init, cgroup_init);
+ CID_OFFSET_MATCH(cgroup_exit, cgroup_exit);
+ CID_OFFSET_MATCH(cgroup_prep_move, cgroup_prep_move);
+ CID_OFFSET_MATCH(cgroup_move, cgroup_move);
+ CID_OFFSET_MATCH(cgroup_cancel_move, cgroup_cancel_move);
+ CID_OFFSET_MATCH(cgroup_set_weight, cgroup_set_weight);
+ CID_OFFSET_MATCH(cgroup_set_bandwidth, cgroup_set_bandwidth);
+ CID_OFFSET_MATCH(cgroup_set_idle, cgroup_set_idle);
+#endif
+ /* renamed callbacks must occupy the same slot as their cpu-form sibling */
+ CID_OFFSET_MATCH(select_cpu, select_cid);
+ CID_OFFSET_MATCH(set_cpumask, set_cmask);
+ CID_OFFSET_MATCH(cpu_online, cid_online);
+ CID_OFFSET_MATCH(cpu_offline, cid_offline);
+ CID_OFFSET_MATCH(dump_cpu, dump_cid);
+ /* @priv tail must align since both share the same data block */
+ CID_OFFSET_MATCH(priv, priv);
+ /*
+ * cid-form must end exactly at @priv - validate_ops() skips
+ * cpu_acquire/cpu_release for cid-form because reading those fields
+ * past the BPF allocation would be UB.
+ */
+ BUILD_BUG_ON(offsetof(struct sched_ext_ops_cid, __end) !=
+ offsetofend(struct sched_ext_ops, priv));
+#undef CID_OFFSET_MATCH
+
+ /*
* kfunc registration can't be done from init_sched_ext_class() as
* register_btf_kfunc_id_set() needs most of the system to be up.
*
@@ -10030,12 +10841,24 @@ static int __init scx_init(void)
return ret;
}
+ ret = scx_cid_kfunc_init();
+ if (ret) {
+ pr_err("sched_ext: Failed to register cid kfuncs (%d)\n", ret);
+ return ret;
+ }
+
ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops);
if (ret) {
pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret);
return ret;
}
+ ret = register_bpf_struct_ops(&bpf_sched_ext_ops_cid, sched_ext_ops_cid);
+ if (ret) {
+ pr_err("sched_ext: Failed to register cid struct_ops (%d)\n", ret);
+ return ret;
+ }
+
ret = register_pm_notifier(&scx_pm_notifier);
if (ret) {
pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret);
diff --git a/kernel/sched/ext_arena.c b/kernel/sched/ext_arena.c
new file mode 100644
index 000000000000..493c2424f842
--- /dev/null
+++ b/kernel/sched/ext_arena.c
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * scx_arena_pool: kernel-side sub-allocator over BPF-arena pages.
+ *
+ * Each chunk added to @sch->arena_pool comes from one
+ * bpf_arena_alloc_pages_sleepable() call and is registered at the
+ * kernel-side mapping address. Callers translate to the BPF-arena form
+ * themselves if needed.
+ *
+ * Allocations grow the pool on demand. Underlying arena pages are released
+ * when the arena map itself is torn down.
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
+ */
+
+enum scx_arena_consts {
+ SCX_ARENA_MIN_ORDER = 3, /* 8-byte minimum sub-allocation */
+ SCX_ARENA_GROW_PAGES = 4, /* per growth */
+};
+
+s32 scx_arena_pool_init(struct scx_sched *sch)
+{
+ if (!sch->arena_map)
+ return 0;
+
+ sch->arena_pool = gen_pool_create(SCX_ARENA_MIN_ORDER, NUMA_NO_NODE);
+ if (!sch->arena_pool)
+ return -ENOMEM;
+ return 0;
+}
+
+static void scx_arena_clear_chunk(struct gen_pool *pool, struct gen_pool_chunk *chunk,
+ void *data)
+{
+ int order = pool->min_alloc_order;
+ size_t chunk_sz = chunk->end_addr - chunk->start_addr + 1;
+ unsigned long end_bit = chunk_sz >> order;
+ unsigned long b, e;
+
+ for_each_set_bitrange(b, e, chunk->bits, end_bit)
+ gen_pool_free(pool, chunk->start_addr + (b << order),
+ (e - b) << order);
+}
+
+/*
+ * Tear down the pool. Outstanding gen_pool allocations are freed via
+ * scx_arena_clear_chunk() so gen_pool_destroy() doesn't BUG. The underlying
+ * arena pages are released when the arena map itself is torn down.
+ */
+void scx_arena_pool_destroy(struct scx_sched *sch)
+{
+ if (!sch->arena_pool)
+ return;
+ gen_pool_for_each_chunk(sch->arena_pool, scx_arena_clear_chunk, NULL);
+ gen_pool_destroy(sch->arena_pool);
+ sch->arena_pool = NULL;
+}
+
+/*
+ * Grow the pool by @page_cnt pages. bpf_arena_alloc_pages_sleepable() and
+ * gen_pool_add() (which calls vzalloc(GFP_KERNEL)) require a sleepable
+ * context.
+ */
+static int scx_arena_grow(struct scx_sched *sch, u32 page_cnt)
+{
+ u64 kern_vm_start;
+ u32 uaddr32;
+ void *p;
+ int ret;
+
+ if (!sch->arena_map || !sch->arena_pool)
+ return -EINVAL;
+
+ p = bpf_arena_alloc_pages_sleepable(sch->arena_map, NULL,
+ page_cnt, NUMA_NO_NODE, 0);
+ if (!p)
+ return -ENOMEM;
+
+ uaddr32 = (u32)(unsigned long)p;
+ /* arena.o, which defines these, is built only on MMU && 64BIT */
+#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
+ kern_vm_start = bpf_arena_map_kern_vm_start(sch->arena_map);
+#else
+ kern_vm_start = 0;
+#endif
+
+ ret = gen_pool_add(sch->arena_pool, kern_vm_start + uaddr32,
+ page_cnt * PAGE_SIZE, NUMA_NO_NODE);
+ if (ret) {
+ bpf_arena_free_pages_non_sleepable(sch->arena_map, p, page_cnt);
+ return ret;
+ }
+ return 0;
+}
+
+/*
+ * Allocate @size bytes from the arena pool. Returns kernel VA on success, NULL
+ * on failure or zero @size (which gen_pool_alloc() can never satisfy). May grow
+ * the pool via scx_arena_grow(), which sleeps; needs a GFP_KERNEL context.
+ */
+void *scx_arena_alloc(struct scx_sched *sch, size_t size)
+{
+ unsigned long kern_va;
+ u32 page_cnt;
+
+ might_sleep();
+
+ if (!sch->arena_pool || !size)
+ return NULL;
+
+ while (true) {
+ kern_va = gen_pool_alloc(sch->arena_pool, size);
+ if (kern_va)
+ break;
+ page_cnt = max_t(u32, SCX_ARENA_GROW_PAGES,
+ (size + PAGE_SIZE - 1) >> PAGE_SHIFT);
+ if (scx_arena_grow(sch, page_cnt))
+ return NULL;
+ }
+
+ return (void *)kern_va;
+}
+
+void scx_arena_free(struct scx_sched *sch, void *kern_va, size_t size)
+{
+ if (sch->arena_pool && kern_va)
+ gen_pool_free(sch->arena_pool, (unsigned long)kern_va, size);
+}
diff --git a/kernel/sched/ext_arena.h b/kernel/sched/ext_arena.h
new file mode 100644
index 000000000000..4f3610160102
--- /dev/null
+++ b/kernel/sched/ext_arena.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Tejun Heo <tj@kernel.org>
+ */
+#ifndef _KERNEL_SCHED_EXT_ARENA_H
+#define _KERNEL_SCHED_EXT_ARENA_H
+
+struct scx_sched;
+
+s32 scx_arena_pool_init(struct scx_sched *sch);
+void scx_arena_pool_destroy(struct scx_sched *sch);
+void *scx_arena_alloc(struct scx_sched *sch, size_t size);
+void scx_arena_free(struct scx_sched *sch, void *kern_va, size_t size);
+
+#endif /* _KERNEL_SCHED_EXT_ARENA_H */
diff --git a/kernel/sched/ext_cid.c b/kernel/sched/ext_cid.c
new file mode 100644
index 000000000000..66944a7ef79d
--- /dev/null
+++ b/kernel/sched/ext_cid.c
@@ -0,0 +1,707 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
+ */
+#include <linux/cacheinfo.h>
+
+/*
+ * cid tables.
+ *
+ * Pointers are published once on first enable and never revoked. The default
+ * mapping is populated before ops.init() runs; scx_bpf_cid_override() commits
+ * before it returns. As long as the BPF scheduler only uses the tables from
+ * those points onward, it sees a consistent view.
+ */
+s16 *scx_cid_to_cpu_tbl;
+s16 *scx_cpu_to_cid_tbl;
+struct scx_cid_topo *scx_cid_topo;
+
+#define SCX_CID_TOPO_NEG (struct scx_cid_topo) { \
+ .core_cid = -1, .core_idx = -1, .llc_cid = -1, .llc_idx = -1, \
+ .node_cid = -1, .node_idx = -1, \
+}
+
+/*
+ * Return @cpu's LLC shared_cpu_map. If cacheinfo isn't populated (offline or
+ * !present), record @cpu in @fallbacks and return its node mask instead - the
+ * worst that can happen is that the cpu's LLC becomes coarser than reality.
+ */
+static const struct cpumask *cpu_llc_mask(int cpu, struct cpumask *fallbacks)
+{
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu);
+
+ if (!ci || !ci->info_list || !ci->num_leaves) {
+ cpumask_set_cpu(cpu, fallbacks);
+ return cpumask_of_node(cpu_to_node(cpu));
+ }
+ return &ci->info_list[ci->num_leaves - 1].shared_cpu_map;
+}
+
+/* Allocate the cid tables once on first enable; never freed. */
+static s32 scx_cid_arrays_alloc(void)
+{
+ u32 npossible = num_possible_cpus();
+ s16 *cid_to_cpu, *cpu_to_cid;
+ struct scx_cid_topo *cid_topo;
+
+ if (scx_cid_to_cpu_tbl)
+ return 0;
+
+ cid_to_cpu = kzalloc_objs(*scx_cid_to_cpu_tbl, npossible, GFP_KERNEL);
+ cpu_to_cid = kzalloc_objs(*scx_cpu_to_cid_tbl, nr_cpu_ids, GFP_KERNEL);
+ cid_topo = kmalloc_objs(*scx_cid_topo, npossible, GFP_KERNEL);
+
+ if (!cid_to_cpu || !cpu_to_cid || !cid_topo) {
+ kfree(cid_to_cpu);
+ kfree(cpu_to_cid);
+ kfree(cid_topo);
+ return -ENOMEM;
+ }
+
+ WRITE_ONCE(scx_cid_to_cpu_tbl, cid_to_cpu);
+ WRITE_ONCE(scx_cpu_to_cid_tbl, cpu_to_cid);
+ WRITE_ONCE(scx_cid_topo, cid_topo);
+ return 0;
+}
+
+/**
+ * scx_cid_init - build the cid mapping
+ * @sch: the scx_sched being initialized; used as the scx_error() target
+ *
+ * See "Topological CPU IDs" in ext_cid.h for the model. Walk online cpus by
+ * intersection at each level (parent_scratch & this_level_mask), which keeps
+ * containment correct by construction and naturally splits a physical LLC
+ * straddling two NUMA nodes into two LLC units. The caller must hold
+ * cpus_read_lock.
+ */
+s32 scx_cid_init(struct scx_sched *sch)
+{
+ cpumask_var_t to_walk __free(free_cpumask_var) = CPUMASK_VAR_NULL;
+ cpumask_var_t node_scratch __free(free_cpumask_var) = CPUMASK_VAR_NULL;
+ cpumask_var_t llc_scratch __free(free_cpumask_var) = CPUMASK_VAR_NULL;
+ cpumask_var_t core_scratch __free(free_cpumask_var) = CPUMASK_VAR_NULL;
+ cpumask_var_t llc_fallback __free(free_cpumask_var) = CPUMASK_VAR_NULL;
+ cpumask_var_t online_no_topo __free(free_cpumask_var) = CPUMASK_VAR_NULL;
+ u32 next_cid = 0;
+ s32 next_node_idx = 0, next_llc_idx = 0, next_core_idx = 0;
+ s32 cpu, ret;
+
+ /* CMASK_MAX_WORDS in cid.bpf.h covers NR_CPUS up to 8192 */
+ BUILD_BUG_ON(NR_CPUS > 8192);
+
+ lockdep_assert_cpus_held();
+
+ ret = scx_cid_arrays_alloc();
+ if (ret)
+ return ret;
+
+ if (!zalloc_cpumask_var(&to_walk, GFP_KERNEL) ||
+ !zalloc_cpumask_var(&node_scratch, GFP_KERNEL) ||
+ !zalloc_cpumask_var(&llc_scratch, GFP_KERNEL) ||
+ !zalloc_cpumask_var(&core_scratch, GFP_KERNEL) ||
+ !zalloc_cpumask_var(&llc_fallback, GFP_KERNEL) ||
+ !zalloc_cpumask_var(&online_no_topo, GFP_KERNEL))
+ return -ENOMEM;
+
+ /* -1 sentinels for sparse-possible cpu id holes (0 is a valid cid) */
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++)
+ scx_cpu_to_cid_tbl[cpu] = -1;
+
+ cpumask_copy(to_walk, cpu_online_mask);
+
+ while (!cpumask_empty(to_walk)) {
+ s32 next_cpu = cpumask_first(to_walk);
+ s32 nid = cpu_to_node(next_cpu);
+ s32 node_cid = next_cid;
+ s32 node_idx;
+
+ /*
+ * No NUMA info: skip and let the tail loop assign a no-topo
+ * cid. cpumask_of_node(-1) is undefined.
+ */
+ if (nid < 0) {
+ cpumask_clear_cpu(next_cpu, to_walk);
+ continue;
+ }
+
+ node_idx = next_node_idx++;
+
+ /* node_scratch = to_walk & this node */
+ cpumask_and(node_scratch, to_walk, cpumask_of_node(nid));
+ if (WARN_ON_ONCE(!cpumask_test_cpu(next_cpu, node_scratch)))
+ return -EINVAL;
+
+ while (!cpumask_empty(node_scratch)) {
+ s32 ncpu = cpumask_first(node_scratch);
+ const struct cpumask *llc_mask = cpu_llc_mask(ncpu, llc_fallback);
+ s32 llc_cid = next_cid;
+ s32 llc_idx = next_llc_idx++;
+
+ /* llc_scratch = node_scratch & this llc */
+ cpumask_and(llc_scratch, node_scratch, llc_mask);
+ if (WARN_ON_ONCE(!cpumask_test_cpu(ncpu, llc_scratch)))
+ return -EINVAL;
+
+ while (!cpumask_empty(llc_scratch)) {
+ s32 lcpu = cpumask_first(llc_scratch);
+ const struct cpumask *sib = topology_sibling_cpumask(lcpu);
+ s32 core_cid = next_cid;
+ s32 core_idx = next_core_idx++;
+ s32 ccpu;
+
+ /* core_scratch = llc_scratch & this core */
+ cpumask_and(core_scratch, llc_scratch, sib);
+ if (WARN_ON_ONCE(!cpumask_test_cpu(lcpu, core_scratch)))
+ return -EINVAL;
+
+ for_each_cpu(ccpu, core_scratch) {
+ s32 cid = next_cid++;
+
+ scx_cid_to_cpu_tbl[cid] = ccpu;
+ scx_cpu_to_cid_tbl[ccpu] = cid;
+ scx_cid_topo[cid] = (struct scx_cid_topo){
+ .core_cid = core_cid,
+ .core_idx = core_idx,
+ .llc_cid = llc_cid,
+ .llc_idx = llc_idx,
+ .node_cid = node_cid,
+ .node_idx = node_idx,
+ };
+
+ cpumask_clear_cpu(ccpu, llc_scratch);
+ cpumask_clear_cpu(ccpu, node_scratch);
+ cpumask_clear_cpu(ccpu, to_walk);
+ }
+ }
+ }
+ }
+
+ /*
+ * No-topo section: any possible cpu without a cid - normally just the
+ * not-online ones. Collect any currently-online cpus that land here in
+ * @online_no_topo so we can warn about them at the end.
+ */
+ for_each_cpu(cpu, cpu_possible_mask) {
+ s32 cid;
+
+ if (__scx_cpu_to_cid(cpu) != -1)
+ continue;
+ if (cpu_online(cpu))
+ cpumask_set_cpu(cpu, online_no_topo);
+
+ cid = next_cid++;
+ scx_cid_to_cpu_tbl[cid] = cpu;
+ scx_cpu_to_cid_tbl[cpu] = cid;
+ scx_cid_topo[cid] = SCX_CID_TOPO_NEG;
+ }
+
+ if (!cpumask_empty(llc_fallback))
+ pr_warn("scx_cid: cpus without cacheinfo, using node mask as llc: %*pbl\n",
+ cpumask_pr_args(llc_fallback));
+ if (!cpumask_empty(online_no_topo))
+ pr_warn("scx_cid: online cpus with no usable topology: %*pbl\n",
+ cpumask_pr_args(online_no_topo));
+
+ return 0;
+}
+
+/**
+ * scx_cmask_clear - Zero every bit in @m's active range
+ * @m: cmask to clear
+ *
+ * Storage past the active range is left as is.
+ */
+void scx_cmask_clear(struct scx_cmask *m)
+{
+ u32 nr_words;
+
+ if (!m->nr_cids)
+ return;
+ nr_words = (m->base + m->nr_cids - 1) / 64 - m->base / 64 + 1;
+ memset(m->bits, 0, nr_words * sizeof(u64));
+}
+
+/**
+ * scx_cmask_fill - Set every bit in @m's active range
+ * @m: cmask to fill
+ *
+ * Counterpart to scx_cmask_clear(). Storage past the active range is left as is.
+ */
+void scx_cmask_fill(struct scx_cmask *m)
+{
+ u32 nr_words, head_bits, tail_bits;
+
+ if (!m->nr_cids)
+ return;
+ nr_words = (m->base + m->nr_cids - 1) / 64 - m->base / 64 + 1;
+ memset(m->bits, 0xff, nr_words * sizeof(u64));
+
+ /* clear word-0 bits below base */
+ head_bits = m->base & 63;
+ if (head_bits)
+ m->bits[0] &= ~((1ULL << head_bits) - 1);
+
+ /* clear last-word bits at or past base + nr_cids */
+ tail_bits = (m->base + m->nr_cids) & 63;
+ if (tail_bits)
+ m->bits[nr_words - 1] &= (1ULL << tail_bits) - 1;
+}
+
+/**
+ * scx_cpumask_to_cmask - Translate a kernel cpumask into a cmask
+ * @src: source cpumask
+ * @dst: cmask to write
+ *
+ * Clear @dst's active range and set the bit for each cid whose cpu is in
+ * @src and lies within that range. Out-of-range cids are silently ignored.
+ */
+void scx_cpumask_to_cmask(const struct cpumask *src, struct scx_cmask *dst)
+{
+ s32 cpu;
+
+ scx_cmask_clear(dst);
+ for_each_cpu(cpu, src) {
+ s32 cid = __scx_cpu_to_cid(cpu);
+
+ if (cid >= 0)
+ __scx_cmask_set(cid, dst);
+ }
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * scx_bpf_cid_override - Install an explicit cpu->cid mapping
+ * @cpu_to_cid: array of nr_cpu_ids s32 entries (cid for each cpu)
+ * @cpu_to_cid__sz: must be nr_cpu_ids * sizeof(s32) bytes
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * May only be called from ops.init() of the root scheduler. Replace the
+ * topology-probed cid mapping with the caller-provided one. Each possible cpu
+ * must map to a unique cid in [0, num_possible_cpus()). Topo info is cleared.
+ * On invalid input, trigger scx_error() to abort the scheduler.
+ */
+__bpf_kfunc void scx_bpf_cid_override(const s32 *cpu_to_cid, u32 cpu_to_cid__sz,
+ const struct bpf_prog_aux *aux)
+{
+ cpumask_var_t seen __free(free_cpumask_var) = CPUMASK_VAR_NULL;
+ struct scx_sched *sch;
+ bool alloced;
+ s32 cpu, cid;
+
+ /* GFP_KERNEL alloc must happen before the rcu read section */
+ alloced = zalloc_cpumask_var(&seen, GFP_KERNEL);
+
+ guard(rcu)();
+
+ sch = scx_prog_sched(aux);
+ if (unlikely(!sch))
+ return;
+
+ if (!alloced) {
+ scx_error(sch, "scx_bpf_cid_override: failed to allocate cpumask");
+ return;
+ }
+
+ if (scx_parent(sch)) {
+ scx_error(sch, "scx_bpf_cid_override() only allowed from root sched");
+ return;
+ }
+
+ if (cpu_to_cid__sz != nr_cpu_ids * sizeof(s32)) {
+ scx_error(sch, "scx_bpf_cid_override: expected %zu bytes, got %u",
+ nr_cpu_ids * sizeof(s32), cpu_to_cid__sz);
+ return;
+ }
+
+ for_each_possible_cpu(cpu) {
+ s32 c = cpu_to_cid[cpu];
+
+ if (!cid_valid(sch, c))
+ return;
+ if (cpumask_test_and_set_cpu(c, seen)) {
+ scx_error(sch, "cid %d assigned to multiple cpus", c);
+ return;
+ }
+ scx_cpu_to_cid_tbl[cpu] = c;
+ scx_cid_to_cpu_tbl[c] = cpu;
+ }
+
+ /* Invalidate stale topo info - the override carries no topology. */
+ for (cid = 0; cid < num_possible_cpus(); cid++)
+ scx_cid_topo[cid] = SCX_CID_TOPO_NEG;
+}
+
+/**
+ * scx_bpf_cid_to_cpu - Return the raw CPU id for @cid
+ * @cid: cid to look up
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * Return the raw CPU id for @cid. Trigger scx_error() and return -EINVAL if
+ * @cid is invalid. The cid<->cpu mapping is static for the lifetime of the
+ * loaded scheduler, so the BPF side can cache the result to avoid repeated
+ * kfunc invocations.
+ */
+__bpf_kfunc s32 scx_bpf_cid_to_cpu(s32 cid, const struct bpf_prog_aux *aux)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = scx_prog_sched(aux);
+ if (unlikely(!sch))
+ return -EINVAL;
+ return scx_cid_to_cpu(sch, cid);
+}
+
+/**
+ * scx_bpf_cpu_to_cid - Return the cid for @cpu
+ * @cpu: cpu to look up
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * Return the cid for @cpu. Trigger scx_error() and return -EINVAL if @cpu is
+ * invalid. The cid<->cpu mapping is static for the lifetime of the loaded
+ * scheduler, so the BPF side can cache the result to avoid repeated kfunc
+ * invocations.
+ */
+__bpf_kfunc s32 scx_bpf_cpu_to_cid(s32 cpu, const struct bpf_prog_aux *aux)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = scx_prog_sched(aux);
+ if (unlikely(!sch))
+ return -EINVAL;
+ return scx_cpu_to_cid(sch, cpu);
+}
+
+/*
+ * Set ops on cmasks. cmask_walk_op2() shares one walk across mutating
+ * (and/or/copy/andnot) and predicate (subset/intersects) two-cmask forms;
+ * cmask_walk_op1() does the same shape over a single cmask range. Every public
+ * entry passes a compile-time-constant @op; cmask_walk_op{1,2}() and
+ * cmask_word_op{1,2}() are __always_inline so the inner switch collapses to the
+ * selected op and cmask_op2_is_pred() folds the predicate early-exit out of
+ * mutating ops.
+ *
+ * Two-cmask ops only touch @dst bits inside the intersection of the two ranges;
+ * bits outside stay untouched. In particular, scx_cmask_copy() does NOT zero
+ * @dst bits that lie outside @src's range.
+ *
+ * The _RACY variants are otherwise identical to their non-racy counterpart but
+ * read @src word-by-word via data_race(). Memory ordering with concurrent
+ * writers is the caller's responsibility.
+ */
+enum cmask_op2 {
+ /* mutating */
+ CMASK_OP2_AND,
+ CMASK_OP2_OR,
+ CMASK_OP2_OR_RACY,
+ CMASK_OP2_COPY,
+ CMASK_OP2_COPY_RACY,
+ CMASK_OP2_ANDNOT,
+ /* predicates - short-circuit when the per-word result is true */
+ CMASK_OP2_SUBSET,
+ CMASK_OP2_INTERSECTS,
+};
+
+static __always_inline bool cmask_op2_is_pred(const enum cmask_op2 op)
+{
+ return op == CMASK_OP2_SUBSET || op == CMASK_OP2_INTERSECTS;
+}
+
+static __always_inline bool cmask_word_op2(u64 *av, const u64 *bp, u64 mask,
+ const enum cmask_op2 op)
+{
+ switch (op) {
+ case CMASK_OP2_AND:
+ *av &= ~mask | *bp;
+ return false;
+ case CMASK_OP2_OR:
+ *av |= *bp & mask;
+ return false;
+ case CMASK_OP2_OR_RACY:
+ *av |= data_race(*bp) & mask;
+ return false;
+ case CMASK_OP2_COPY:
+ *av = (*av & ~mask) | (*bp & mask);
+ return false;
+ case CMASK_OP2_COPY_RACY:
+ *av = (*av & ~mask) | (data_race(*bp) & mask);
+ return false;
+ case CMASK_OP2_ANDNOT:
+ *av &= ~(*bp & mask);
+ return false;
+ case CMASK_OP2_SUBSET:
+ /* stop on the first bit in @sub not set in @super */
+ return (*bp & ~*av) & mask;
+ case CMASK_OP2_INTERSECTS:
+ return (*av & *bp) & mask;
+ }
+ unreachable();
+}
+
+/*
+ * Walk the intersection of [@a_base, @a_base + @a_nr_cids) with [@b_base,
+ * @b_base + @b_nr_cids) word by word, applying @op. Mutating ops walk all words
+ * and return false; predicates return true on the first word whose per-word
+ * test is true. Empty intersection returns false (matches "no bits to consider"
+ * for both mutate and predicate).
+ *
+ * Base/nr_cids are taken as parameters so callers with snapshotted bounds can
+ * drive the walk with values independent of the cmask's header.
+ */
+static __always_inline bool cmask_walk_op2(u64 *a_bits, u32 a_base, u32 a_nr_cids,
+ const u64 *b_bits, u32 b_base, u32 b_nr_cids,
+ const enum cmask_op2 op)
+{
+ u32 lo = max(a_base, b_base);
+ u32 hi = min(a_base + a_nr_cids, b_base + b_nr_cids);
+ u32 a_word_off = a_base / 64;
+ u32 b_word_off = b_base / 64;
+ u32 lo_word = lo / 64;
+ u32 hi_word = (hi - 1) / 64;
+ u64 head_mask = GENMASK_U64(63, lo & 63);
+ u64 tail_mask = GENMASK_U64((hi - 1) & 63, 0);
+ u32 w;
+
+ if (lo >= hi)
+ return false;
+
+ if (lo_word == hi_word)
+ return cmask_word_op2(&a_bits[lo_word - a_word_off],
+ &b_bits[lo_word - b_word_off],
+ head_mask & tail_mask, op);
+
+ if (cmask_word_op2(&a_bits[lo_word - a_word_off],
+ &b_bits[lo_word - b_word_off], head_mask, op) &&
+ cmask_op2_is_pred(op))
+ return true;
+
+ for (w = lo_word + 1; w < hi_word; w++)
+ if (cmask_word_op2(&a_bits[w - a_word_off],
+ &b_bits[w - b_word_off], ~0ULL, op) &&
+ cmask_op2_is_pred(op))
+ return true;
+
+ return cmask_word_op2(&a_bits[hi_word - a_word_off],
+ &b_bits[hi_word - b_word_off], tail_mask, op);
+}
+
+enum cmask_op1 {
+ CMASK_OP1_ANY_SET,
+};
+
+static __always_inline bool cmask_word_op1(const u64 *ap, u64 mask,
+ const enum cmask_op1 op)
+{
+ switch (op) {
+ case CMASK_OP1_ANY_SET:
+ return *ap & mask;
+ }
+ unreachable();
+}
+
+/*
+ * Apply @op to each word of @a_bits covering [@a_base, @a_base + @a_nr_cids).
+ * @a_bits[0] is the word that holds @a_base. The first and last words are
+ * masked so that only bits inside the range are tested.
+ *
+ * Returns true as soon as one word's test is true, false if none is or if the
+ * range is empty. Every op1 that exists today short-circuits on a per-word
+ * true; a non-predicate op1 would need a cmask_op1_is_pred() guard like the
+ * cmask_op2_is_pred() one used by the two-operand walk.
+ */
+static __always_inline bool cmask_walk_op1(const u64 *a_bits, u32 a_base,
+					   u32 a_nr_cids,
+					   const enum cmask_op1 op)
+{
+	const u32 end = a_base + a_nr_cids;
+	u64 first_mask, last_mask;
+	u32 last_idx, idx;
+
+	if (a_base >= end)
+		return false;
+
+	/* index of the last word, relative to the word holding @a_base */
+	last_idx = (end - 1) / 64 - a_base / 64;
+	first_mask = GENMASK_U64(63, a_base & 63);
+	last_mask = GENMASK_U64((end - 1) & 63, 0);
+
+	/* whole range sits in a single word */
+	if (!last_idx)
+		return cmask_word_op1(&a_bits[0], first_mask & last_mask, op);
+
+	if (cmask_word_op1(&a_bits[0], first_mask, op))
+		return true;
+
+	/* fully covered middle words */
+	for (idx = 1; idx < last_idx; idx++) {
+		if (cmask_word_op1(&a_bits[idx], ~0ULL, op))
+			return true;
+	}
+
+	return cmask_word_op1(&a_bits[last_idx], last_mask, op);
+}
+
+/**
+ * scx_cmask_and - Run CMASK_OP2_AND over @dst and @src
+ * @dst: cmask passed as the first operand of the walk
+ * @src: cmask passed as the second operand of the walk
+ *
+ * Thin wrapper around cmask_walk_op2(); the walk's return value is discarded.
+ * NOTE(review): presumably computes dst &= src over the cids the walk visits -
+ * confirm against cmask_word_op2().
+ */
+void scx_cmask_and(struct scx_cmask *dst, const struct scx_cmask *src)
+{
+ cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
+ src->bits, src->base, src->nr_cids, CMASK_OP2_AND);
+}
+
+/**
+ * scx_cmask_or - Run CMASK_OP2_OR over @dst and @src
+ * @dst: cmask passed as the first operand of the walk
+ * @src: cmask passed as the second operand of the walk
+ *
+ * Thin wrapper around cmask_walk_op2(); the walk's return value is discarded.
+ * NOTE(review): presumably computes dst |= src over the cids the walk visits -
+ * confirm against cmask_word_op2().
+ */
+void scx_cmask_or(struct scx_cmask *dst, const struct scx_cmask *src)
+{
+ cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
+ src->bits, src->base, src->nr_cids, CMASK_OP2_OR);
+}
+
+/**
+ * scx_cmask_or_racy - OR @src into @dst, reading @src without locking
+ * @dst: cmask to OR into
+ * @src: cmask to read from, may be concurrently modified
+ *
+ * @src is read word-by-word through data_race(). Same per-bit independence
+ * rationale as scx_cmask_copy_racy(). Memory ordering with writers is the
+ * caller's responsibility.
+ *
+ * Implemented as a cmask_walk_op2() pass with CMASK_OP2_OR_RACY; the walk's
+ * return value is discarded.
+ */
+void scx_cmask_or_racy(struct scx_cmask *dst, const struct scx_cmask *src)
+{
+ cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
+ src->bits, src->base, src->nr_cids, CMASK_OP2_OR_RACY);
+}
+
+/**
+ * scx_cmask_copy - Run CMASK_OP2_COPY over @dst and @src
+ * @dst: cmask passed as the first operand of the walk
+ * @src: cmask passed as the second operand of the walk
+ *
+ * Thin wrapper around cmask_walk_op2(); the walk's return value is discarded.
+ * NOTE(review): presumably copies @src's bits into @dst for the cids the walk
+ * visits, leaving @dst's base and nr_cids untouched - confirm against
+ * cmask_word_op2().
+ */
+void scx_cmask_copy(struct scx_cmask *dst, const struct scx_cmask *src)
+{
+ cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
+ src->bits, src->base, src->nr_cids, CMASK_OP2_COPY);
+}
+
+/**
+ * scx_cmask_copy_racy - Snapshot @src into @dst without locking
+ * @dst: cmask that receives the snapshot
+ * @src: cmask to read from, may be concurrently modified
+ *
+ * @src is read word-by-word through data_race(). Head/tail masking matches
+ * scx_cmask_copy(). Each bit in a cmask is independent, so partial updates
+ * just leave some bits fresher than others. Memory ordering with writers is
+ * the caller's responsibility.
+ *
+ * Implemented as a cmask_walk_op2() pass with CMASK_OP2_COPY_RACY; the walk's
+ * return value is discarded.
+ */
+void scx_cmask_copy_racy(struct scx_cmask *dst, const struct scx_cmask *src)
+{
+ cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
+ src->bits, src->base, src->nr_cids, CMASK_OP2_COPY_RACY);
+}
+
+/**
+ * scx_cmask_andnot - Run CMASK_OP2_ANDNOT over @dst and @src
+ * @dst: cmask passed as the first operand of the walk
+ * @src: cmask passed as the second operand of the walk
+ *
+ * Thin wrapper around cmask_walk_op2(); the walk's return value is discarded.
+ * NOTE(review): presumably computes dst &= ~src over the cids the walk visits -
+ * confirm against cmask_word_op2().
+ */
+void scx_cmask_andnot(struct scx_cmask *dst, const struct scx_cmask *src)
+{
+ cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
+ src->bits, src->base, src->nr_cids, CMASK_OP2_ANDNOT);
+}
+
+/*
+ * Test whether any bit of @cm is set within [@lo, @hi). An empty range yields
+ * false. The caller is responsible for keeping [@lo, @hi) inside @cm's range;
+ * no bounds check is done here.
+ */
+static bool cmask_any_set_in_range(const struct scx_cmask *cm, u32 lo, u32 hi)
+{
+	u32 first_idx;
+
+	if (hi <= lo)
+		return false;
+
+	/* bits[] index of the word holding @lo; the walk starts from that word */
+	first_idx = lo / 64 - cm->base / 64;
+
+	return cmask_walk_op1(&cm->bits[first_idx], lo, hi - lo,
+			      CMASK_OP1_ANY_SET);
+}
+
+/**
+ * scx_cmask_subset - test whether @sub is a subset of @super
+ * @sub: cmask whose set bits are checked
+ * @super: cmask that has to contain them
+ *
+ * Return true iff every set bit of @sub is also set in @super.
+ */
+bool scx_cmask_subset(const struct scx_cmask *sub, const struct scx_cmask *super)
+{
+	u32 sub_lo = sub->base;
+	u32 sub_hi = sub_lo + sub->nr_cids;
+	u32 sup_lo = super->base;
+	u32 sup_hi = sup_lo + super->nr_cids;
+
+	/*
+	 * The op2 walk further down only looks at words that both ranges
+	 * share. A bit set in @sub below or above @super's range can never be
+	 * in @super, so scan those two overhangs separately first.
+	 */
+	if (sub_lo < sup_lo) {
+		if (cmask_any_set_in_range(sub, sub_lo, min(sup_lo, sub_hi)))
+			return false;
+	}
+	if (sub_hi > sup_hi) {
+		if (cmask_any_set_in_range(sub, max(sub_lo, sup_hi), sub_hi))
+			return false;
+	}
+
+	/* a true result from the walk means @sub is not a subset */
+	if (cmask_walk_op2((u64 *)super->bits, sup_lo, super->nr_cids,
+			   sub->bits, sub_lo, sub->nr_cids, CMASK_OP2_SUBSET))
+		return false;
+
+	return true;
+}
+
+/**
+ * scx_cmask_intersects - Run the CMASK_OP2_INTERSECTS walk over @a and @b
+ * @a: first cmask; const is cast away only to fit cmask_walk_op2()'s signature
+ * @b: second cmask
+ *
+ * Return the result of cmask_walk_op2() for CMASK_OP2_INTERSECTS.
+ * NOTE(review): presumably true iff some cid is set in both @a and @b, and
+ * neither mask is written - confirm against cmask_word_op2().
+ */
+bool scx_cmask_intersects(const struct scx_cmask *a, const struct scx_cmask *b)
+{
+ return cmask_walk_op2((u64 *)a->bits, a->base, a->nr_cids,
+ b->bits, b->base, b->nr_cids, CMASK_OP2_INTERSECTS);
+}
+
+/**
+ * scx_cmask_empty - Test whether @m has no bits set
+ * @m: cmask to inspect
+ *
+ * Return true iff no bit is set anywhere in @m's active range
+ * [@m->base, @m->base + @m->nr_cids).
+ */
+bool scx_cmask_empty(const struct scx_cmask *m)
+{
+	const u32 start = m->base;
+	const u32 end = start + m->nr_cids;
+
+	if (cmask_any_set_in_range(m, start, end))
+		return false;
+	return true;
+}
+
+/**
+ * scx_bpf_cid_topo - Copy out per-cid topology info
+ * @cid: cid to look up
+ * @out__uninit: destination for the topology info; written on every path
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * On success @out__uninit receives the scx_cid_topo entry for @cid. A valid
+ * @cid in the no-topo section yields all fields set to -1. If there is no
+ * scheduler for @aux, or @cid is out of range (which also triggers
+ * scx_error()), @out__uninit is set to SCX_CID_TOPO_NEG.
+ */
+__bpf_kfunc void scx_bpf_cid_topo(s32 cid, struct scx_cid_topo *out__uninit,
+				  const struct bpf_prog_aux *aux)
+{
+	struct scx_sched *sch;
+
+	guard(rcu)();
+
+	sch = scx_prog_sched(aux);
+	if (likely(sch) && cid_valid(sch, cid)) {
+		*out__uninit = READ_ONCE(scx_cid_topo)[cid];
+		return;
+	}
+
+	/* no scheduler or bad cid: still hand back a fully written struct */
+	*out__uninit = SCX_CID_TOPO_NEG;
+}
+
+__bpf_kfunc_end_defs();
+
+/*
+ * kfunc id set holding scx_bpf_cid_override(), flagged KF_IMPLICIT_ARGS and
+ * KF_SLEEPABLE. The set is filtered through scx_kfunc_context_filter() and is
+ * registered for BPF_PROG_TYPE_STRUCT_OPS only (see scx_cid_kfunc_init()).
+ */
+BTF_KFUNCS_START(scx_kfunc_ids_init)
+BTF_ID_FLAGS(func, scx_bpf_cid_override, KF_IMPLICIT_ARGS | KF_SLEEPABLE)
+BTF_KFUNCS_END(scx_kfunc_ids_init)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_init = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_init,
+ .filter = scx_kfunc_context_filter,
+};
+
+/*
+ * kfunc id set for the cid lookup kfuncs (cid<->cpu translation and per-cid
+ * topology). No filter callback is installed on this set; it is registered for
+ * struct_ops, tracing and syscall programs (see scx_cid_kfunc_init()).
+ */
+BTF_KFUNCS_START(scx_kfunc_ids_cid)
+BTF_ID_FLAGS(func, scx_bpf_cid_to_cpu, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_cpu_to_cid, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_cid_topo, KF_IMPLICIT_ARGS)
+BTF_KFUNCS_END(scx_kfunc_ids_cid)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_cid = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_cid,
+};
+
+/**
+ * scx_cid_kfunc_init - Register the cid kfunc id sets
+ *
+ * Registers scx_kfunc_set_init for struct_ops programs, then scx_kfunc_set_cid
+ * for struct_ops, tracing and syscall programs, in that order. Registration
+ * stops at the first failure.
+ *
+ * Return 0 on success, otherwise the first non-zero value returned by
+ * register_btf_kfunc_id_set().
+ */
+int scx_cid_kfunc_init(void)
+{
+	int ret;
+
+	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_init);
+	if (ret)
+		return ret;
+
+	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_cid);
+	if (ret)
+		return ret;
+
+	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_cid);
+	if (ret)
+		return ret;
+
+	return register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_cid);
+}
diff --git a/kernel/sched/ext_cid.h b/kernel/sched/ext_cid.h
new file mode 100644
index 000000000000..5745e5785e89
--- /dev/null
+++ b/kernel/sched/ext_cid.h
@@ -0,0 +1,271 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Topological CPU IDs (cids)
+ * --------------------------
+ *
+ * Raw cpu numbers are clumsy for sharding work and communication across
+ * topology units, especially from BPF: the space can be sparse, numerical
+ * closeness doesn't imply topological closeness (x86 hyperthreading often puts
+ * SMT siblings far apart), and a range of cpu ids doesn't mean anything.
+ * Sub-scheds make this acute - cpu allocation, revocation and other state are
+ * constantly communicated across sub-scheds, and passing whole cpumasks scales
+ * poorly with cpu count. cpumasks are also awkward in BPF: a variable-length
+ * kernel type sized for the maximum NR_CPUS (4k), with verbose helper sequences
+ * for every op.
+ *
+ * cids give every cpu a dense, topology-ordered id. CPUs sharing a core, LLC or
+ * NUMA node get contiguous cid ranges, so a topology unit becomes a (start,
+ * length) slice of cid space. Communication can pass a slice instead of a
+ * cpumask, and BPF code can process, for example, a u64 word's worth of cids at
+ * a time.
+ *
+ * The mapping is built once at root scheduler enable time by walking the
+ * topology of online cpus only. Going by online cpus is out of necessity:
+ * depending on the arch, topology info isn't reliably available for offline
+ * cpus. The expected usage model is restarting the scheduler on hotplug events
+ * so the mapping is rebuilt against the new online set. A scheduler that wants
+ * to handle hotplug without a restart can provide its own cid and shard mapping
+ * through the override interface.
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
+ */
+#ifndef _KERNEL_SCHED_EXT_CID_H
+#define _KERNEL_SCHED_EXT_CID_H
+
+struct scx_sched;
+
+/*
+ * Cid space (total is always num_possible_cpus()) is laid out with
+ * topology-annotated cids first, then no-topo cids at the tail. The
+ * topology-annotated block covers the cpus that were online when scx_cid_init()
+ * ran and remains valid even after those cpus go offline. The tail block covers
+ * possible-but-not-online cpus and carries all-(-1) topo info (see
+ * scx_cid_topo); callers detect it via the -1 sentinels.
+ *
+ * See the comment above the table definitions in ext_cid.c for the
+ * memory-ordering and visibility contract.
+ */
+extern s16 *scx_cid_to_cpu_tbl;
+extern s16 *scx_cpu_to_cid_tbl;
+extern struct scx_cid_topo *scx_cid_topo;
+extern struct btf_id_set8 scx_kfunc_ids_init;
+
+/*
+ * cmask operations and cid setup entry points. These are declarations only;
+ * the definitions are not in this header.
+ */
+void scx_cmask_clear(struct scx_cmask *m);
+void scx_cmask_fill(struct scx_cmask *m);
+void scx_cmask_and(struct scx_cmask *dst, const struct scx_cmask *src);
+void scx_cmask_or(struct scx_cmask *dst, const struct scx_cmask *src);
+void scx_cmask_or_racy(struct scx_cmask *dst, const struct scx_cmask *src);
+void scx_cmask_copy(struct scx_cmask *dst, const struct scx_cmask *src);
+void scx_cmask_copy_racy(struct scx_cmask *dst, const struct scx_cmask *src);
+void scx_cmask_andnot(struct scx_cmask *dst, const struct scx_cmask *src);
+bool scx_cmask_subset(const struct scx_cmask *sub, const struct scx_cmask *super);
+bool scx_cmask_intersects(const struct scx_cmask *a, const struct scx_cmask *b);
+bool scx_cmask_empty(const struct scx_cmask *m);
+s32 scx_cid_init(struct scx_sched *sch);
+int scx_cid_kfunc_init(void);
+void scx_cpumask_to_cmask(const struct cpumask *src, struct scx_cmask *dst);
+
+/**
+ * cid_valid - Range-check a cid that arrived as an ops input argument
+ * @sch: scx_sched to abort when the check fails
+ * @cid: cid handed in by a BPF ops
+ *
+ * A cid is valid when it lies in [0, num_possible_cpus()). Return true for a
+ * valid @cid. Otherwise trigger scx_error() on @sch and return false.
+ */
+static inline bool cid_valid(struct scx_sched *sch, s32 cid)
+{
+	if (unlikely(cid < 0 || cid >= num_possible_cpus())) {
+		scx_error(sch, "invalid cid %d", cid);
+		return false;
+	}
+	return true;
+}
+
+/**
+ * __scx_cid_to_cpu - Unchecked cid->cpu table lookup
+ * @cid: cid to translate. Must be in [0, num_possible_cpus()).
+ *
+ * For callsites that have already validated @cid and hold a non-NULL @sch from
+ * scx_prog_sched(). A live sched implies the table has been allocated, which
+ * is why the table pointer is not NULL-checked here.
+ */
+static inline s32 __scx_cid_to_cpu(s32 cid)
+{
+	/* READ_ONCE pairs with WRITE_ONCE in scx_cid_arrays_alloc() */
+	const s16 *cid_to_cpu = READ_ONCE(scx_cid_to_cpu_tbl);
+
+	return cid_to_cpu[cid];
+}
+
+/**
+ * __scx_cpu_to_cid - Unchecked cpu->cid table lookup
+ * @cpu: cpu to translate. Must be a valid possible cpu id.
+ *
+ * Subject to the same usage constraints as __scx_cid_to_cpu(): the caller has
+ * validated @cpu and no NULL check is done on the table pointer.
+ */
+static inline s32 __scx_cpu_to_cid(s32 cpu)
+{
+	const s16 *cpu_to_cid = READ_ONCE(scx_cpu_to_cid_tbl);
+
+	return cpu_to_cid[cpu];
+}
+
+/**
+ * scx_cid_to_cpu - Translate @cid to its cpu
+ * @sch: scx_sched used for error reporting
+ * @cid: cid to translate
+ *
+ * Return the cpu mapped to @cid, or -EINVAL when @cid is invalid, in which
+ * case scx_error() has been triggered on @sch. The cid arrays are allocated on
+ * first scheduler enable and never freed, so the returned cpu stays stable for
+ * the lifetime of the loaded scheduler.
+ */
+static inline s32 scx_cid_to_cpu(struct scx_sched *sch, s32 cid)
+{
+	return cid_valid(sch, cid) ? __scx_cid_to_cpu(cid) : -EINVAL;
+}
+
+/**
+ * scx_cpu_to_cid - Translate @cpu to its cid
+ * @sch: scx_sched used for error reporting
+ * @cpu: cpu to translate
+ *
+ * Return the cid mapped to @cpu, or -EINVAL when scx_cpu_valid() rejects
+ * @cpu. An invalid cpu triggers scx_error() on @sch. The lifetime guarantee is
+ * the same as for scx_cid_to_cpu().
+ */
+static inline s32 scx_cpu_to_cid(struct scx_sched *sch, s32 cpu)
+{
+	return scx_cpu_valid(sch, cpu, NULL) ? __scx_cpu_to_cid(cpu) : -EINVAL;
+}
+
+/**
+ * scx_is_cid_type - Test whether the active scheduler hierarchy is cid-form
+ *
+ * Return true if the __scx_is_cid_type static key is enabled. The test is a
+ * static branch hinted as unlikely.
+ */
+static inline bool scx_is_cid_type(void)
+{
+ return static_branch_unlikely(&__scx_is_cid_type);
+}
+
+/* True if @cid lies in @m's active range [base, base + nr_cids). */
+static inline bool __scx_cmask_contains(u32 cid, const struct scx_cmask *m)
+{
+	const u32 start = m->base;
+	const u32 end = start + m->nr_cids;
+
+	return likely(cid >= start && cid < end);
+}
+
+/*
+ * Return a pointer to the bits[] word that covers @cid. bits[0] is the word
+ * holding @m->base, so the index is the distance in words between the two.
+ * @cid must satisfy __scx_cmask_contains(). The const on @m is cast away.
+ */
+static inline u64 *__scx_cmask_word(u32 cid, const struct scx_cmask *m)
+{
+	const u32 idx = cid / 64 - m->base / 64;
+
+	return (u64 *)&m->bits[idx];
+}
+
+/**
+ * __scx_cmask_init - Initialize @m with explicit storage capacity
+ * @m: cmask to set up
+ * @base: first cid of the active range
+ * @nr_cids: number of cids in the active range
+ * @alloc_cids: storage capacity in cids, at least @nr_cids
+ *
+ * For storage that is sized larger than the initial active range. If
+ * @alloc_cids is smaller than @nr_cids, a one-time warning fires and @nr_cids
+ * is clamped to @alloc_cids. All of bits[] is zeroed.
+ */
+static inline void __scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids,
+				    u32 alloc_cids)
+{
+	/* the active range can never exceed the storage */
+	if (WARN_ON_ONCE(alloc_cids < nr_cids))
+		nr_cids = alloc_cids;
+
+	/* size and clear the storage first, then describe the active range */
+	m->alloc_words = SCX_CMASK_NR_WORDS(alloc_cids);
+	memset(m->bits, 0, m->alloc_words * sizeof(u64));
+
+	m->base = base;
+	m->nr_cids = nr_cids;
+}
+
+/**
+ * scx_cmask_init - Initialize @m on tight storage
+ * @m: cmask to initialize
+ * @base: first cid of the active range
+ * @nr_cids: number of cids in the active range
+ *
+ * Equivalent to __scx_cmask_init() with the storage capacity set to @nr_cids.
+ * All of bits[] is zeroed.
+ */
+static inline void scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids)
+{
+ __scx_cmask_init(m, base, nr_cids, nr_cids);
+}
+
+/**
+ * scx_cmask_reframe - Reshape @m's active range without resizing storage
+ * @m: cmask to reframe
+ * @base: new active range base
+ * @nr_cids: new active range length, must fit within @m->alloc_words
+ *
+ * If SCX_CMASK_NR_WORDS(@nr_cids) exceeds @m->alloc_words, warn once and leave
+ * @m untouched. Otherwise only the first and last words of the new range are
+ * zeroed, which keeps the padding invariant; the bits in between are left as
+ * they were and must be treated as garbage.
+ */
+static inline void scx_cmask_reframe(struct scx_cmask *m, u32 base, u32 nr_cids)
+{
+	if (WARN_ON_ONCE(SCX_CMASK_NR_WORDS(nr_cids) > m->alloc_words))
+		return;
+
+	if (nr_cids > 0) {
+		/* bit span measured from the start of the word holding @base */
+		const u32 span_bits = (base & 63) + nr_cids;
+
+		m->bits[0] = 0;
+		m->bits[(span_bits - 1) / 64] = 0;
+	}
+
+	m->base = base;
+	m->nr_cids = nr_cids;
+}
+
+/*
+ * Set @cid's bit in @m with a plain read-modify-write. A @cid outside @m's
+ * active range is silently ignored.
+ */
+static inline void __scx_cmask_set(u32 cid, struct scx_cmask *m)
+{
+	if (__scx_cmask_contains(cid, m))
+		*__scx_cmask_word(cid, m) |= BIT_U64(cid & 63);
+}
+
+/**
+ * scx_cmask_test - test whether @cid is set in @m
+ * @cid: cid to look up
+ * @m: cmask to look in
+ *
+ * Return %false when @cid falls outside @m's active range, otherwise the value
+ * of @cid's bit. The word is loaded with READ_ONCE so that callers may race
+ * against set/clear writers.
+ */
+static inline bool scx_cmask_test(u32 cid, const struct scx_cmask *m)
+{
+	u64 word;
+
+	if (!__scx_cmask_contains(cid, m))
+		return false;
+
+	word = READ_ONCE(*__scx_cmask_word(cid, m));
+	return (word & BIT_U64(cid & 63)) != 0;
+}
+
+/*
+ * Number of bits[] words that @m's active range touches, or 0 when the range
+ * is empty. This can be smaller than what SCX_CMASK_NR_WORDS() reserves, as
+ * that sizes storage for the worst-case base alignment.
+ */
+static inline u32 scx_cmask_nr_used_words(const struct scx_cmask *m)
+{
+	return m->nr_cids ? ((m->base & 63) + m->nr_cids - 1) / 64 + 1 : 0;
+}
+
+/**
+ * scx_cmask_for_each_cid - iterate set cids in @m
+ * @cid: s32 loop var that receives each set cid in turn
+ * @m: cmask to iterate
+ *
+ * Visits set bits within @m's active range in ascending order. Scans only the
+ * words the active range spans, where head and tail padding is kept zero, so
+ * no per-cid range check is needed.
+ *
+ * The outer loop walks word indices [0, scx_cmask_nr_used_words(@m)) with
+ * __bs holding @m->base rounded down to a multiple of 64. The inner loop takes
+ * one READ_ONCE() snapshot of the word and peels off its lowest set bit each
+ * iteration (__w &= __w - 1), so bits set in a word after its snapshot are not
+ * seen.
+ *
+ * The macro expands to two nested for loops: a ``break`` in the body leaves
+ * only the inner loop and iteration resumes with the next word. It also
+ * declares the locals __bs, __wi, __nw and __w, and evaluates @m more than
+ * once.
+ */
+#define scx_cmask_for_each_cid(cid, m) \
+ for (u64 __bs = (m)->base & ~63u, __wi = 0, \
+ __nw = scx_cmask_nr_used_words(m); \
+ __wi < __nw; __wi++) \
+ for (u64 __w = READ_ONCE((m)->bits[__wi]); \
+ __w && ((cid) = __bs + __wi * 64 + __ffs64(__w), true); \
+ __w &= __w - 1)
+
+#endif /* _KERNEL_SCHED_EXT_CID_H */
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 9f5ad6b071f9..2077373d8da3 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -9,7 +9,6 @@
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
* Copyright (c) 2024 Andrea Righi <arighi@nvidia.com>
*/
-#include "ext_idle.h"
/* Enable/disable built-in idle CPU selection policy */
static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
@@ -783,7 +782,7 @@ void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
*/
if (SCX_HAS_OP(sch, update_idle) && do_notify &&
!scx_bypassing(sch, cpu_of(rq)))
- SCX_CALL_OP(sch, update_idle, rq, cpu_of(rq), idle);
+ SCX_CALL_OP(sch, update_idle, rq, scx_cpu_arg(cpu_of(rq)), idle);
}
static void reset_idle_masks(struct sched_ext_ops *ops)
@@ -911,7 +910,7 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p,
bool we_locked = false;
s32 cpu;
- if (!ops_cpu_valid(sch, prev_cpu, NULL))
+ if (!scx_cpu_valid(sch, prev_cpu, NULL))
return -EINVAL;
if (!check_builtin_idle_enabled(sch))
@@ -984,7 +983,7 @@ __bpf_kfunc s32 scx_bpf_cpu_node(s32 cpu, const struct bpf_prog_aux *aux)
guard(rcu)();
sch = scx_prog_sched(aux);
- if (unlikely(!sch) || !ops_cpu_valid(sch, cpu, NULL))
+ if (unlikely(!sch) || !scx_cpu_valid(sch, cpu, NULL))
return NUMA_NO_NODE;
return cpu_to_node(cpu);
}
@@ -1266,7 +1265,7 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu, const struct bpf_prog_
if (!check_builtin_idle_enabled(sch))
return false;
- if (!ops_cpu_valid(sch, cpu, NULL))
+ if (!scx_cpu_valid(sch, cpu, NULL))
return false;
return scx_idle_test_and_clear_cpu(cpu);
@@ -1504,13 +1503,9 @@ static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
int scx_idle_init(void)
{
- int ret;
-
- ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) ||
- register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) ||
- register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle) ||
- register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) ||
- register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_select_cpu);
-
- return ret;
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) ?:
+ register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) ?:
+ register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle) ?:
+ register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) ?:
+ register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_select_cpu);
}
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index a075732d4430..b04701190b23 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -8,35 +8,6 @@
#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
#define SCX_MOFF_IDX(moff) ((moff) / sizeof(void (*)(void)))
-enum scx_consts {
- SCX_DSP_DFL_MAX_BATCH = 32,
- SCX_DSP_MAX_LOOPS = 32,
- SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ,
-
- SCX_EXIT_BT_LEN = 64,
- SCX_EXIT_MSG_LEN = 1024,
- SCX_EXIT_DUMP_DFL_LEN = 32768,
-
- SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE,
-
- /*
- * Iterating all tasks may take a while. Periodically drop
- * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
- */
- SCX_TASK_ITER_BATCH = 32,
-
- SCX_BYPASS_HOST_NTH = 2,
-
- SCX_BYPASS_LB_DFL_INTV_US = 500 * USEC_PER_MSEC,
- SCX_BYPASS_LB_DONOR_PCT = 125,
- SCX_BYPASS_LB_MIN_DELTA_DIV = 4,
- SCX_BYPASS_LB_BATCH = 256,
-
- SCX_REENQ_LOCAL_MAX_REPEAT = 256,
-
- SCX_SUB_MAX_DEPTH = 4,
-};
-
enum scx_exit_kind {
SCX_EXIT_NONE,
SCX_EXIT_DONE,
@@ -94,6 +65,12 @@ struct scx_exit_info {
/* %SCX_EXIT_* - broad category of the exit reason */
enum scx_exit_kind kind;
+ /*
+ * CPU that initiated the exit, valid once @kind has been set.
+ * Negative if the exit path didn't identify a CPU.
+ */
+ s32 exit_cpu;
+
/* exit code if gracefully exiting */
s64 exit_code;
@@ -138,7 +115,8 @@ enum scx_ops_flags {
* To mask this problem, by default, unhashed tasks are automatically
* dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't
* depend on pid lookups and wants to handle these tasks directly, the
- * following flag can be used.
+ * following flag can be used. With %SCX_OPS_TID_TO_TASK,
+ * scx_bpf_tid_to_task() can find exiting tasks reliably.
*/
SCX_OPS_ENQ_EXITING = 1LLU << 2,
@@ -189,6 +167,17 @@ enum scx_ops_flags {
*/
SCX_OPS_ALWAYS_ENQ_IMMED = 1LLU << 7,
+ /*
+ * Maintain a mapping from p->scx.tid to task_struct so the BPF
+ * scheduler can recover task pointers from stored tids via
+ * scx_bpf_tid_to_task().
+ *
+ * Only the root scheduler turns this on. A sub-sched may set the flag
+ * to declare a dependency on the lookup; if the root scheduler hasn't
+ * enabled it, attaching the sub-sched is rejected.
+ */
+ SCX_OPS_TID_TO_TASK = 1LLU << 8,
+
SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
SCX_OPS_ENQ_LAST |
SCX_OPS_ENQ_EXITING |
@@ -196,7 +185,8 @@ enum scx_ops_flags {
SCX_OPS_ALLOW_QUEUED_WAKEUP |
SCX_OPS_SWITCH_PARTIAL |
SCX_OPS_BUILTIN_IDLE_PER_NODE |
- SCX_OPS_ALWAYS_ENQ_IMMED,
+ SCX_OPS_ALWAYS_ENQ_IMMED |
+ SCX_OPS_TID_TO_TASK,
/* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */
__SCX_OPS_INTERNAL_MASK = 0xffLLU << 56,
@@ -540,28 +530,6 @@ struct sched_ext_ops {
void (*update_idle)(s32 cpu, bool idle);
/**
- * @cpu_acquire: A CPU is becoming available to the BPF scheduler
- * @cpu: The CPU being acquired by the BPF scheduler.
- * @args: Acquire arguments, see the struct definition.
- *
- * A CPU that was previously released from the BPF scheduler is now once
- * again under its control.
- */
- void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
-
- /**
- * @cpu_release: A CPU is taken away from the BPF scheduler
- * @cpu: The CPU being released by the BPF scheduler.
- * @args: Release arguments, see the struct definition.
- *
- * The specified CPU is no longer under the control of the BPF
- * scheduler. This could be because it was preempted by a higher
- * priority sched_class, though there may be other reasons as well. The
- * caller should consult @args->reason to determine the cause.
- */
- void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
-
- /**
* @init_task: Initialize a task to run in a BPF scheduler
* @p: task to initialize for BPF scheduling
* @args: init arguments, see the struct definition
@@ -851,6 +819,128 @@ struct sched_ext_ops {
/* internal use only, must be NULL */
void __rcu *priv;
+
+ /*
+ * Deprecated callbacks. Kept at the end of the struct so the cid-form
+ * struct (sched_ext_ops_cid) can omit them without affecting the
+ * shared field offsets. Use SCX_ENQ_IMMED instead. Sitting past
+ * SCX_OPI_END means has_op doesn't cover them, so SCX_HAS_OP() cannot
+ * be used; callers must test sch->ops.cpu_acquire / cpu_release
+ * directly.
+ */
+
+ /**
+ * @cpu_acquire: A CPU is becoming available to the BPF scheduler
+ * @cpu: The CPU being acquired by the BPF scheduler.
+ * @args: Acquire arguments, see the struct definition.
+ *
+ * A CPU that was previously released from the BPF scheduler is now once
+ * again under its control. Deprecated; use SCX_ENQ_IMMED instead.
+ */
+ void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
+
+ /**
+ * @cpu_release: A CPU is taken away from the BPF scheduler
+ * @cpu: The CPU being released by the BPF scheduler.
+ * @args: Release arguments, see the struct definition.
+ *
+ * The specified CPU is no longer under the control of the BPF
+ * scheduler. This could be because it was preempted by a higher
+ * priority sched_class, though there may be other reasons as well. The
+ * caller should consult @args->reason to determine the cause.
+ * Deprecated; use SCX_ENQ_IMMED instead.
+ */
+ void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
+};
+
+/**
+ * struct sched_ext_ops_cid - cid-form alternative to struct sched_ext_ops
+ *
+ * Mirrors struct sched_ext_ops with cpu/cpumask substituted with cid/cmask
+ * where applicable. Layout up to and including @priv matches sched_ext_ops
+ * byte-for-byte (verified by BUILD_BUG_ON checks at scx_init() time) so
+ * shared field offsets work for both struct types in bpf_scx_init_member()
+ * and bpf_scx_check_member(). The deprecated cpu_acquire/cpu_release
+ * callbacks at the tail of sched_ext_ops are omitted here entirely.
+ *
+ * Differences from sched_ext_ops:
+ * - select_cpu -> select_cid (returns cid)
+ * - dispatch -> dispatch (cpu arg is now cid)
+ * - update_idle -> update_idle (cpu arg is now cid)
+ * - set_cpumask -> set_cmask (cmask instead of cpumask)
+ * - cpu_online -> cid_online
+ * - cpu_offline -> cid_offline
+ * - dump_cpu -> dump_cid
+ * - cpu_acquire/cpu_release -> not present (deprecated in sched_ext_ops)
+ *
+ * BPF schedulers using this type cannot call cpu-form scx_bpf_* kfuncs;
+ * use the cid-form variants instead. Enforced at BPF verifier time via
+ * scx_kfunc_context_filter() branching on prog->aux->st_ops.
+ *
+ * Because the layout is shared, members must not be added, removed or
+ * reordered here without making the matching change in sched_ext_ops.
+ *
+ * See sched_ext_ops for callback documentation.
+ */
+struct sched_ext_ops_cid {
+ /* cid-form counterpart of sched_ext_ops.select_cpu; returns a cid */
+ s32 (*select_cid)(struct task_struct *p, s32 prev_cid, u64 wake_flags);
+ void (*enqueue)(struct task_struct *p, u64 enq_flags);
+ void (*dequeue)(struct task_struct *p, u64 deq_flags);
+ void (*dispatch)(s32 cid, struct task_struct *prev);
+ void (*tick)(struct task_struct *p);
+ void (*runnable)(struct task_struct *p, u64 enq_flags);
+ void (*running)(struct task_struct *p);
+ void (*stopping)(struct task_struct *p, bool runnable);
+ void (*quiescent)(struct task_struct *p, u64 deq_flags);
+ bool (*yield)(struct task_struct *from, struct task_struct *to);
+ bool (*core_sched_before)(struct task_struct *a,
+ struct task_struct *b);
+ void (*set_weight)(struct task_struct *p, u32 weight);
+ /* cid-form counterpart of sched_ext_ops.set_cpumask */
+ void (*set_cmask)(struct task_struct *p,
+ const struct scx_cmask *cmask);
+ void (*update_idle)(s32 cid, bool idle);
+ s32 (*init_task)(struct task_struct *p,
+ struct scx_init_task_args *args);
+ void (*exit_task)(struct task_struct *p,
+ struct scx_exit_task_args *args);
+ void (*enable)(struct task_struct *p);
+ void (*disable)(struct task_struct *p);
+ void (*dump)(struct scx_dump_ctx *ctx);
+ /* cid-form counterpart of sched_ext_ops.dump_cpu */
+ void (*dump_cid)(struct scx_dump_ctx *ctx, s32 cid, bool idle);
+ void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p);
+#ifdef CONFIG_EXT_GROUP_SCHED
+ s32 (*cgroup_init)(struct cgroup *cgrp,
+ struct scx_cgroup_init_args *args);
+ void (*cgroup_exit)(struct cgroup *cgrp);
+ s32 (*cgroup_prep_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+ void (*cgroup_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+ void (*cgroup_cancel_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+ void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
+ void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
+ u64 period_us, u64 quota_us, u64 burst_us);
+ void (*cgroup_set_idle)(struct cgroup *cgrp, bool idle);
+#endif /* CONFIG_EXT_GROUP_SCHED */
+ s32 (*sub_attach)(struct scx_sub_attach_args *args);
+ void (*sub_detach)(struct scx_sub_detach_args *args);
+ /* cid-form counterparts of sched_ext_ops.cpu_online / cpu_offline */
+ void (*cid_online)(s32 cid);
+ void (*cid_offline)(s32 cid);
+ s32 (*init)(void);
+ void (*exit)(struct scx_exit_info *info);
+
+ /* Data fields - must match sched_ext_ops layout exactly */
+ u32 dispatch_max_batch;
+ u64 flags;
+ u32 timeout_ms;
+ u32 exit_dump_len;
+ u64 hotplug_seq;
+ u64 sub_cgroup_id;
+ char name[SCX_OPS_NAME_LEN];
+
+ /* internal use only, must be NULL */
+ void __rcu *priv;
+
+ /* layout end anchor for the BUILD_BUG_ON in scx_init(); keep last */
+ char __end[0];
};
enum scx_opi {
@@ -1009,7 +1099,40 @@ struct scx_sched_pnode {
};
struct scx_sched {
- struct sched_ext_ops ops;
+ /*
+ * cpu-form and cid-form ops share field offsets up to .priv (verified
+ * by BUILD_BUG_ON in scx_init()). The anonymous union lets the kernel
+ * access either view of the same storage without function-pointer
+ * casts: use .ops for cpu-form and shared fields, .ops_cid for the
+ * cid-renamed callbacks (set_cmask, select_cid, cid_online, ...).
+ */
+ union {
+ struct sched_ext_ops ops;
+ struct sched_ext_ops_cid ops_cid;
+ };
+ bool is_cid_type; /* true if registered via bpf_sched_ext_ops_cid */
+
+ /*
+ * Arena map auto-discovered from member progs at struct_ops attach.
+ * cid-form schedulers must use exactly one arena across all member
+ * progs. NULL on cpu-form.
+ *
+ * @arena_pool sub-allocates @arena_map. Each gen_pool chunk is added
+ * at the kernel-side mapping address. @arena_kern_base is the start
+ * of the arena's kern_vm range. See scx_arena_to_kaddr() and
+ * scx_kaddr_to_arena().
+ */
+ struct bpf_map *arena_map;
+ struct gen_pool *arena_pool;
+ uintptr_t arena_kern_base;
+
+ /*
+ * Per-CPU arena cmask used by scx_call_op_set_cpumask() to hand a cmask
+ * to ops_cid.set_cmask(). The kernel writes through the stored kern_va
+ * and hands BPF its arena pointer via scx_kaddr_to_arena().
+ */
+ struct scx_cmask * __percpu *set_cmask_scratch;
+
DECLARE_BITMAP(has_op, SCX_OPI_END);
/*
@@ -1083,6 +1206,31 @@ struct scx_sched {
struct scx_sched *ancestors[];
};
+/**
+ * scx_arena_to_kaddr - Translate a BPF-arena pointer to its kernel address
+ * @sch: scheduler whose arena hosts @bpf_ptr
+ * @bpf_ptr: BPF-arena pointer, only the low 32 bits are used
+ *
+ * Truncating @bpf_ptr to u32 normalizes any input into the arena's 4 GiB
+ * kern_vm range. Together with scratch-page fault recovery this makes the
+ * returned pointer safe to dereference up to GUARD_SZ / 2 past the intended
+ * object. Accesses larger than GUARD_SZ / 2 must be explicitly bounds-checked.
+ *
+ * Return @sch->arena_kern_base plus the truncated offset.
+ */
+static inline void *scx_arena_to_kaddr(struct scx_sched *sch, const void *bpf_ptr)
+{
+	const u32 arena_off = (u32)(uintptr_t)bpf_ptr;
+
+	return (void *)(sch->arena_kern_base + arena_off);
+}
+
+/**
+ * scx_kaddr_to_arena - Translate a kernel arena address to its BPF form
+ * @sch: scheduler whose arena hosts @kaddr
+ * @kaddr: kernel-side arena address, supplied by trusted kernel code
+ *
+ * Return @kaddr with @sch->arena_kern_base subtracted. @kaddr is not
+ * validated here.
+ */
+static inline void *scx_kaddr_to_arena(struct scx_sched *sch, const void *kaddr)
+{
+	const uintptr_t kern_va = (uintptr_t)kaddr;
+
+	return (void *)(kern_va - sch->arena_kern_base);
+}
+
enum scx_wake_flags {
/* expose select WF_* flags as enums */
SCX_WAKE_FORK = WF_FORK,
@@ -1366,8 +1514,30 @@ enum scx_ops_state {
extern struct scx_sched __rcu *scx_root;
DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
+/*
+ * True when the currently loaded scheduler hierarchy is cid-form. All scheds
+ * in a hierarchy share one form, so this single key tells callsites which
+ * view to use without per-sch dereferences. Use scx_is_cid_type() to test.
+ */
+DECLARE_STATIC_KEY_FALSE(__scx_is_cid_type);
+
int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id);
+bool scx_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where);
+
+__printf(5, 0) bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
+ s64 exit_code, s32 exit_cpu, const char *fmt,
+ va_list args);
+__printf(5, 6) bool __scx_exit(struct scx_sched *sch, enum scx_exit_kind kind,
+ s64 exit_code, s32 exit_cpu, const char *fmt, ...);
+
+/*
+ * Convenience wrappers around __scx_exit() / scx_vexit() that pass
+ * raw_smp_processor_id() as the exit CPU:
+ *
+ * scx_exit() - exit with the given @kind and @exit_code
+ * scx_error() - scx_exit() with SCX_EXIT_ERROR and exit code 0
+ * scx_verror() - va_list variant of scx_error(), built on scx_vexit()
+ */
+#define scx_exit(sch, kind, exit_code, fmt, args...) \
+ __scx_exit(sch, kind, exit_code, raw_smp_processor_id(), fmt, ##args)
+#define scx_error(sch, fmt, args...) \
+ scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args)
+#define scx_verror(sch, fmt, args) \
+ scx_vexit((sch), SCX_EXIT_ERROR, 0, raw_smp_processor_id(), fmt, args)
+
/*
* Return the rq currently locked from an scx callback, or NULL if no rq is
* locked.
@@ -1476,7 +1646,7 @@ static inline bool scx_task_on_sched(struct scx_sched *sch,
return true;
}
-static struct scx_sched *scx_prog_sched(const struct bpf_prog_aux *aux)
+static inline struct scx_sched *scx_prog_sched(const struct bpf_prog_aux *aux)
{
return rcu_dereference_all(scx_root);
}
diff --git a/kernel/sched/ext_types.h b/kernel/sched/ext_types.h
new file mode 100644
index 000000000000..8b3527e21fca
--- /dev/null
+++ b/kernel/sched/ext_types.h
@@ -0,0 +1,144 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Early sched_ext type definitions.
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
+ */
+#ifndef _KERNEL_SCHED_EXT_TYPES_H
+#define _KERNEL_SCHED_EXT_TYPES_H
+
+/* Assorted sched_ext compile-time constants. */
+enum scx_consts {
+ SCX_DSP_DFL_MAX_BATCH = 32,
+ SCX_DSP_MAX_LOOPS = 32,
+ /* 30 seconds expressed in jiffies */
+ SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ,
+
+ /* per-CPU chunk size for p->scx.tid allocation, see scx_alloc_tid() */
+ SCX_TID_CHUNK = 1024,
+
+ SCX_EXIT_BT_LEN = 64,
+ SCX_EXIT_MSG_LEN = 1024,
+ SCX_EXIT_DUMP_DFL_LEN = 32768,
+
+ SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE,
+
+ /*
+ * Iterating all tasks may take a while. Periodically drop
+ * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
+ */
+ SCX_TASK_ITER_BATCH = 32,
+
+ SCX_BYPASS_HOST_NTH = 2,
+
+ /* 500ms expressed in microseconds */
+ SCX_BYPASS_LB_DFL_INTV_US = 500 * USEC_PER_MSEC,
+ SCX_BYPASS_LB_DONOR_PCT = 125,
+ SCX_BYPASS_LB_MIN_DELTA_DIV = 4,
+ SCX_BYPASS_LB_BATCH = 256,
+
+ SCX_REENQ_LOCAL_MAX_REPEAT = 256,
+
+ SCX_SUB_MAX_DEPTH = 4,
+};
+
+/*
+ * Per-cid topology info. For each topology level (core, LLC, node), records
+ * the first cid in the unit and its global index. Global indices are
+ * consecutive integers assigned in cid-walk order, so e.g. core_idx ranges
+ * over [0, nr_cores_at_init) with no gaps. No-topo cids have all fields set
+ * to -1.
+ *
+ * @core_cid: first cid of this cid's core (smt-sibling group)
+ * @core_idx: global index of that core, in [0, nr_cores_at_init)
+ * @llc_cid: first cid of this cid's LLC
+ * @llc_idx: global index of that LLC, in [0, nr_llcs_at_init)
+ * @node_cid: first cid of this cid's NUMA node
+ * @node_idx: global index of that node, in [0, nr_nodes_at_init)
+ */
+struct scx_cid_topo {
+ s32 core_cid;
+ s32 core_idx;
+ s32 llc_cid;
+ s32 llc_idx;
+ s32 node_cid;
+ s32 node_idx;
+};
+
+/*
+ * cmask: variable-length, base-windowed bitmap over cid space
+ * -----------------------------------------------------------
+ *
+ * A cmask covers the cid range [base, base + nr_cids). bits[] is aligned to the
+ * global 64-cid grid: bits[0] spans [base & ~63, (base & ~63) + 64), so the
+ * first (base & 63) bits of bits[0] are head padding and the trailing bits of
+ * the last active word past base + nr_cids are tail padding. Both stay zero;
+ * all mutating helpers preserve that. Words past the last active word are not
+ * read by any helper and have no constraint.
+ *
+ * Grid alignment means two cmasks always address bits[] against the same global
+ * 64-cid windows, so cross-cmask word ops (AND, OR, ...) reduce to
+ *
+ * dst->bits[i] OP= src->bits[i - delta]
+ *
+ * with no bit-shifting, regardless of how the two bases relate mod 64.
+ */
+struct scx_cmask {
+ /* first cid of the covered range */
+ u32 base;
+ /* number of cids covered, i.e. the range is [base, base + nr_cids) */
+ u32 nr_cids;
+ /* number of u64 words available in bits[] */
+ u32 alloc_words;
+ /* bit storage, aligned to the global 64-cid grid */
+ u64 bits[] __counted_by(alloc_words);
+};
+
+/*
+ * Worst-case number of u64 words of bits[] needed to hold @nr_cids bits at any
+ * base alignment: ceil(@nr_cids / 64) words for the bits themselves plus one
+ * more for the up-to-63 bits of head padding that a non-64-aligned base
+ * introduces. Unconditionally adding the extra word is simpler than branching
+ * on base. The sum is computed in u64 so that the +63 cannot wrap when
+ * @nr_cids is near U32_MAX; callers that bounds-check the result against
+ * @alloc_words then see a large value rather than a deceptively small one.
+ */
+#define SCX_CMASK_NR_WORDS(nr_cids) \
+	((u32)(1 + ((u64)(nr_cids) + 63) / 64))
+
+/**
+ * __SCX_CMASK_DEFINE - Define an on-stack cmask with explicit storage capacity
+ * @NAME: variable name to define
+ * @BASE: first cid of the active range
+ * @NR_CIDS: active range length
+ * @ALLOC_CIDS: storage capacity in cids, at least @NR_CIDS
+ *
+ * @NAME aliases zero-initialized storage with the active range set to
+ * [BASE, BASE + NR_CIDS). Use scx_cmask_reframe() to reshape later, up to
+ * @ALLOC_CIDS.
+ */
+#define __SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS, ALLOC_CIDS) \
+ _DEFINE_FLEX(struct scx_cmask, NAME, bits, SCX_CMASK_NR_WORDS(ALLOC_CIDS), \
+ = { .base = (BASE), \
+ .nr_cids = (NR_CIDS), \
+ .alloc_words = SCX_CMASK_NR_WORDS(ALLOC_CIDS) })
+
+/**
+ * SCX_CMASK_DEFINE - Define an on-stack cmask on tight storage
+ * @NAME: variable name to define
+ * @BASE: first cid of the active range
+ * @NR_CIDS: active range length, also storage capacity
+ *
+ * @NAME aliases zero-initialized storage with the active range and storage
+ * both [BASE, BASE + NR_CIDS).
+ */
+#define SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS) \
+ __SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS, NR_CIDS)
+
+/**
+ * SCX_CMASK_DEFINE_SHARD - Define an on-stack cmask sized to one shard
+ * @NAME: variable name to define
+ * @BASE: first cid of the active range
+ * @NR_CIDS: active range length, must be <= SCX_CID_SHARD_MAX_CPUS
+ *
+ * Storage is fixed at SCX_CID_SHARD_MAX_CPUS, active range framed by
+ * (BASE, NR_CIDS). Passing NR_CIDS > SCX_CID_SHARD_MAX_CPUS leaves the
+ * cmask claiming more bits than storage holds and subsequent cmask
+ * operations will overrun.
+ */
+#define SCX_CMASK_DEFINE_SHARD(NAME, BASE, NR_CIDS) \
+ __SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS, SCX_CID_SHARD_MAX_CPUS)
+
+#endif /* _KERNEL_SCHED_EXT_TYPES_H */
diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md
index 6e282bce453c..0ee5a3d997e5 100644
--- a/tools/sched_ext/README.md
+++ b/tools/sched_ext/README.md
@@ -168,9 +168,9 @@ well on single-socket systems with a unified L3 cache.
Another simple, yet slightly more complex scheduler that provides an example of
a basic weighted FIFO queuing policy. It also provides examples of some common
-useful BPF features, such as sleepable per-task storage allocation in the
-`ops.prep_enable()` callback, and using the `BPF_MAP_TYPE_QUEUE` map type to
-enqueue tasks. It also illustrates how core-sched support could be implemented.
+useful BPF features, such as arena-backed doubly-linked lists threaded through
+per-task context and `bpf_res_spin_lock` for per-queue synchronization. It also
+illustrates how core-sched support could be implemented.
## scx_central
diff --git a/tools/sched_ext/include/scx/cid.bpf.h b/tools/sched_ext/include/scx/cid.bpf.h
new file mode 100644
index 000000000000..9d89bb57e201
--- /dev/null
+++ b/tools/sched_ext/include/scx/cid.bpf.h
@@ -0,0 +1,678 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF-side helpers for cids and cmasks. See kernel/sched/ext_cid.h for the
+ * authoritative layout and semantics. The BPF-side helpers use the cmask_*
+ * naming (no scx_ prefix); cmask is the SCX bitmap type so the prefix is
+ * redundant in BPF code. Atomics use __sync_val_compare_and_swap and every
+ * helper is inline (no .c counterpart).
+ *
+ * Included by scx/common.bpf.h; don't include directly.
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
+ */
+#ifndef __SCX_CID_BPF_H
+#define __SCX_CID_BPF_H
+
+#include "bpf_arena_common.bpf.h"
+
+/*
+ * Fallback definitions, used only when the including environment has not
+ * already provided them.
+ *
+ * BIT_U64(nr): u64 with only bit @nr set.
+ * GENMASK_U64(h, l): u64 with bits @l through @h (inclusive) set. Both
+ * arguments must be in [0, 63] with @l <= @h, otherwise the shifts are out of
+ * range.
+ */
+#ifndef BIT_U64
+#define BIT_U64(nr) (1ULL << (nr))
+#endif
+#ifndef GENMASK_U64
+#define GENMASK_U64(h, l) ((~0ULL << (l)) & (~0ULL >> (63 - (h))))
+#endif
+
+/*
+ * Storage cap for bounded loops over bits[]. Sized to cover NR_CPUS=8192 with
+ * one extra word for head-misalignment. Increase if deployment targets larger
+ * NR_CPUS.
+ */
+#ifndef CMASK_MAX_WORDS
+#define CMASK_MAX_WORDS 129
+#endif
+
+/*
+ * Mirrors SCX_CMASK_NR_WORDS in kernel/sched/ext_types.h. The u64 cast keeps
+ * the +63 from wrapping when @nr_cids is near U32_MAX, so cmask_reframe()
+ * bounds-checking the result against alloc_words catches the overflow instead
+ * of seeing a small value.
+ */
+#define CMASK_NR_WORDS(nr_cids) ((u32)(((u64)(nr_cids) + 63) / 64 + 1))
+
+/* True if @cid lies inside @m's active range [base, base + nr_cids). */
+static __always_inline bool __cmask_contains(u32 cid, const struct scx_cmask __arena *m)
+{
+	u32 first = m->base;
+	u32 end = first + m->nr_cids;
+
+	if (cid < first)
+		return false;
+	return cid < end;
+}
+
+/*
+ * Address of the bits[] word that holds @cid. The index is the distance, in
+ * 64-cid grid words, between @cid's word and the word containing @m->base.
+ * No range check is done here.
+ */
+static __always_inline u64 __arena *__cmask_word(u32 cid, const struct scx_cmask __arena *m)
+{
+	u32 wi = cid / 64 - m->base / 64;
+
+	return (u64 __arena *)&m->bits[wi];
+}
+
+/**
+ * __cmask_init - Initialize @m with explicit storage capacity
+ * @m: cmask to initialize
+ * @base: first cid of the active range
+ * @nr_cids: number of cids in the active range
+ * @alloc_cids: storage capacity in cids, at least @nr_cids
+ *
+ * Use when storage is sized larger than the initial active range. All of
+ * bits[] is zeroed.
+ *
+ * Triggers scx_bpf_error() and leaves @m untouched if @nr_cids exceeds
+ * @alloc_cids, or if the storage would need more than CMASK_MAX_WORDS words.
+ * The bounded loops in this header never walk past CMASK_MAX_WORDS, so a
+ * larger cmask would only be partially zeroed here and silently truncated by
+ * the other word-walking helpers.
+ */
+static __always_inline void __cmask_init(struct scx_cmask __arena *m, u32 base,
+					 u32 nr_cids, u32 alloc_cids)
+{
+	u32 alloc_words, i;
+
+	if (unlikely(nr_cids > alloc_cids)) {
+		scx_bpf_error("__cmask_init: nr_cids=%u exceeds alloc_cids=%u",
+			      nr_cids, alloc_cids);
+		return;
+	}
+	alloc_words = CMASK_NR_WORDS(alloc_cids);
+
+	/* refuse storage the bounded loops below and elsewhere cannot cover */
+	if (unlikely(alloc_words > CMASK_MAX_WORDS)) {
+		scx_bpf_error("__cmask_init: alloc_cids=%u needs %u words, exceeds CMASK_MAX_WORDS=%u",
+			      alloc_cids, alloc_words, (u32)CMASK_MAX_WORDS);
+		return;
+	}
+
+	m->base = base;
+	m->nr_cids = nr_cids;
+	m->alloc_words = alloc_words;
+
+	/* constant loop bound with an early break, zeroing every storage word */
+	bpf_for(i, 0, CMASK_MAX_WORDS) {
+		if (i >= alloc_words)
+			break;
+		m->bits[i] = 0;
+	}
+}
+
+/**
+ * cmask_init - Initialize @m on tight storage
+ * @m: cmask to initialize
+ * @base: first cid of the active range
+ * @nr_cids: number of cids in the active range
+ *
+ * All of bits[] is zeroed.
+ */
+static __always_inline void cmask_init(struct scx_cmask __arena *m, u32 base, u32 nr_cids)
+{
+ __cmask_init(m, base, nr_cids, nr_cids);
+}
+
+/**
+ * cmask_reframe - Reshape @m's active range without resizing storage
+ * @m: cmask to reframe
+ * @base: new active range base
+ * @nr_cids: new active range length, must fit within @m->alloc_words
+ *
+ * Only the first and last words of the new active range are cleared, which
+ * keeps the head/tail padding invariant. Every other word keeps whatever it
+ * held before, so the body bits of the new range are garbage until the caller
+ * fills them in. Triggers scx_bpf_error() and leaves @m unchanged when the new
+ * length does not fit the storage.
+ */
+static __always_inline void cmask_reframe(struct scx_cmask __arena *m, u32 base, u32 nr_cids)
+{
+	u32 need_words = CMASK_NR_WORDS(nr_cids);
+
+	if (need_words > m->alloc_words) {
+		scx_bpf_error("cmask_reframe: nr_cids=%u exceeds alloc_words=%u",
+			      nr_cids, m->alloc_words);
+		return;
+	}
+
+	/* an empty range has no head or tail word to scrub */
+	if (nr_cids != 0) {
+		u32 tail_wi = ((base & 63) + nr_cids - 1) / 64;
+
+		m->bits[0] = 0;
+		m->bits[tail_wi] = 0;
+	}
+
+	m->base = base;
+	m->nr_cids = nr_cids;
+}
+
+/* True if @cid is within @m's active range and its bit is set. */
+static __always_inline bool cmask_test(u32 cid, const struct scx_cmask __arena *m)
+{
+	u64 word;
+
+	if (!__cmask_contains(cid, m))
+		return false;
+
+	word = *__cmask_word(cid, m);
+	return word & BIT_U64(cid & 63);
+}
+
+/*
+ * x86 BPF JIT rejects BPF_OR | BPF_FETCH and BPF_AND | BPF_FETCH on arena
+ * pointers (see bpf_jit_supports_insn() in arch/x86/net/bpf_jit_comp.c). Only
+ * BPF_CMPXCHG / BPF_XCHG / BPF_ADD with FETCH are allowed. Implement
+ * test_and_{set,clear} and the atomic set/clear via a cmpxchg loop.
+ *
+ * CMASK_CAS_TRIES is sized so exhausting it means seconds of real spinning
+ * on one word - past any plausible contention. Abort hard.
+ */
+#define CMASK_CAS_TRIES (1U << 23)
+
+/*
+ * Atomically set @cid's bit in @m. A @cid outside the active range is
+ * silently ignored. Triggers scx_bpf_error() if the compare-and-swap keeps
+ * failing for CMASK_CAS_TRIES attempts.
+ */
+static __always_inline void cmask_set(u32 cid, struct scx_cmask __arena *m)
+{
+ u64 __arena *w;
+ u64 bit, old, new;
+ u32 i;
+
+ if (!__cmask_contains(cid, m))
+ return;
+ w = __cmask_word(cid, m);
+ bit = BIT_U64(cid & 63);
+ bpf_for(i, 0, CMASK_CAS_TRIES) {
+ old = *w;
+ /* already set, nothing to write */
+ if (old & bit)
+ return;
+ new = old | bit;
+ /* succeeds only if the word still holds the value we read */
+ if (__sync_val_compare_and_swap(w, old, new) == old)
+ return;
+ }
+ scx_bpf_error("cmask_set CAS exhausted at cid %u", cid);
+}
+
+/*
+ * Atomically clear @cid's bit in @m. A @cid outside the active range is
+ * silently ignored. Triggers scx_bpf_error() if the compare-and-swap keeps
+ * failing for CMASK_CAS_TRIES attempts.
+ */
+static __always_inline void cmask_clear(u32 cid, struct scx_cmask __arena *m)
+{
+ u64 __arena *w;
+ u64 bit, old, new;
+ u32 i;
+
+ if (!__cmask_contains(cid, m))
+ return;
+ w = __cmask_word(cid, m);
+ bit = BIT_U64(cid & 63);
+ bpf_for(i, 0, CMASK_CAS_TRIES) {
+ old = *w;
+ /* already clear, nothing to write */
+ if (!(old & bit))
+ return;
+ new = old & ~bit;
+ /* succeeds only if the word still holds the value we read */
+ if (__sync_val_compare_and_swap(w, old, new) == old)
+ return;
+ }
+ scx_bpf_error("cmask_clear CAS exhausted at cid %u", cid);
+}
+
+/*
+ * Atomically set @cid's bit in @m and return whether it was already set.
+ * Returns false without modifying anything when @cid is outside the active
+ * range. If the compare-and-swap keeps failing for CMASK_CAS_TRIES attempts,
+ * triggers scx_bpf_error() and returns false.
+ */
+static __always_inline bool cmask_test_and_set(u32 cid, struct scx_cmask __arena *m)
+{
+ u64 __arena *w;
+ u64 bit, old, new;
+ u32 i;
+
+ if (!__cmask_contains(cid, m))
+ return false;
+ w = __cmask_word(cid, m);
+ bit = BIT_U64(cid & 63);
+ bpf_for(i, 0, CMASK_CAS_TRIES) {
+ old = *w;
+ /* bit was set before we got here */
+ if (old & bit)
+ return true;
+ new = old | bit;
+ /* we performed the 0 -> 1 transition, so the previous value was clear */
+ if (__sync_val_compare_and_swap(w, old, new) == old)
+ return false;
+ }
+ scx_bpf_error("cmask_test_and_set CAS exhausted at cid %u", cid);
+ return false;
+}
+
+/*
+ * Atomically clear @cid's bit in @m and return whether it was set beforehand.
+ * Returns false without modifying anything when @cid is outside the active
+ * range. If the compare-and-swap keeps failing for CMASK_CAS_TRIES attempts,
+ * triggers scx_bpf_error() and returns false.
+ */
+static __always_inline bool cmask_test_and_clear(u32 cid, struct scx_cmask __arena *m)
+{
+ u64 __arena *w;
+ u64 bit, old, new;
+ u32 i;
+
+ if (!__cmask_contains(cid, m))
+ return false;
+ w = __cmask_word(cid, m);
+ bit = BIT_U64(cid & 63);
+ bpf_for(i, 0, CMASK_CAS_TRIES) {
+ old = *w;
+ /* bit was already clear */
+ if (!(old & bit))
+ return false;
+ new = old & ~bit;
+ /* we performed the 1 -> 0 transition, so the previous value was set */
+ if (__sync_val_compare_and_swap(w, old, new) == old)
+ return true;
+ }
+ scx_bpf_error("cmask_test_and_clear CAS exhausted at cid %u", cid);
+ return false;
+}
+
+/* Non-atomic set: plain read-modify-write. An out-of-range @cid is ignored. */
+static __always_inline void __cmask_set(u32 cid, struct scx_cmask __arena *m)
+{
+	u64 __arena *w;
+
+	if (__cmask_contains(cid, m)) {
+		w = __cmask_word(cid, m);
+		*w |= BIT_U64(cid & 63);
+	}
+}
+
+/* Non-atomic clear: plain read-modify-write. An out-of-range @cid is ignored. */
+static __always_inline void __cmask_clear(u32 cid, struct scx_cmask __arena *m)
+{
+	u64 __arena *w;
+
+	if (__cmask_contains(cid, m)) {
+		w = __cmask_word(cid, m);
+		*w &= ~BIT_U64(cid & 63);
+	}
+}
+
+/*
+ * Non-atomic test-and-set. Returns whether @cid's bit was set before the
+ * call; an out-of-range @cid returns false and modifies nothing.
+ */
+static __always_inline bool __cmask_test_and_set(u32 cid, struct scx_cmask __arena *m)
+{
+	u64 __arena *w;
+	u64 mask, was_set;
+
+	if (!__cmask_contains(cid, m))
+		return false;
+
+	mask = BIT_U64(cid & 63);
+	w = __cmask_word(cid, m);
+	was_set = *w & mask;
+	*w |= mask;
+	return was_set;
+}
+
+/*
+ * Non-atomic test-and-clear. Returns whether @cid's bit was set before the
+ * call; an out-of-range @cid returns false and modifies nothing.
+ */
+static __always_inline bool __cmask_test_and_clear(u32 cid, struct scx_cmask __arena *m)
+{
+	u64 __arena *w;
+	u64 mask, was_set;
+
+	if (!__cmask_contains(cid, m))
+		return false;
+
+	mask = BIT_U64(cid & 63);
+	w = __cmask_word(cid, m);
+	was_set = *w & mask;
+	*w &= ~mask;
+	return was_set;
+}
+
+/*
+ * Clear the first CMASK_NR_WORDS(@m->nr_cids) words of @m->bits[], which
+ * covers every word the active range can touch. The walk is capped at
+ * CMASK_MAX_WORDS words.
+ */
+static __always_inline void cmask_zero(struct scx_cmask __arena *m)
+{
+ u32 nr_words = CMASK_NR_WORDS(m->nr_cids), i;
+
+ /* constant loop bound with an early break at the real word count */
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ if (i >= nr_words)
+ break;
+ m->bits[i] = 0;
+ }
+}
+
+/*
+ * BPF_-prefixed to avoid colliding with the kernel's anonymous CMASK_OP_*
+ * enum in ext_cid.c, which is exported via BTF and reachable through
+ * vmlinux.h.
+ */
+enum {
+ BPF_CMASK_OP_AND,
+ BPF_CMASK_OP_OR,
+ BPF_CMASK_OP_COPY,
+ BPF_CMASK_OP_ANDNOT,
+};
+
+/*
+ * Apply @op to one word pair: combine @dst->bits[@di] with @src->bits[@si]
+ * and store the result back into @dst->bits[@di], but only in the bit
+ * positions selected by @mask. Bits of the destination word outside @mask are
+ * preserved. Any @op other than AND, OR or ANDNOT is treated as a copy.
+ */
+static __always_inline void cmask_op_word(struct scx_cmask __arena *dst,
+					  const struct scx_cmask __arena *src,
+					  u32 di, u32 si, u64 mask, int op)
+{
+	u64 cur = dst->bits[di];
+	u64 in = src->bits[si];
+	u64 res;
+
+	switch (op) {
+	case BPF_CMASK_OP_AND:
+		res = cur & in;
+		break;
+	case BPF_CMASK_OP_OR:
+		res = cur | in;
+		break;
+	case BPF_CMASK_OP_ANDNOT:
+		res = cur & ~in;
+		break;
+	default:
+		res = in;
+		break;
+	}
+
+	dst->bits[di] = (res & mask) | (cur & ~mask);
+}
+
+/*
+ * Apply @op word by word over the intersection of @dst's and @src's active
+ * ranges. Words are addressed on the shared 64-cid grid, so the same absolute
+ * word index is translated into each mask's bits[] by subtracting that mask's
+ * base word. Does nothing when the ranges do not overlap. At most
+ * CMASK_MAX_WORDS words are processed.
+ */
+static __always_inline void cmask_op(struct scx_cmask __arena *dst,
+ const struct scx_cmask __arena *src, int op)
+{
+ u32 d_end = dst->base + dst->nr_cids;
+ u32 s_end = src->base + src->nr_cids;
+ /* [lo, hi) is the intersection of the two active ranges */
+ u32 lo = dst->base > src->base ? dst->base : src->base;
+ u32 hi = d_end < s_end ? d_end : s_end;
+ u32 d_base = dst->base / 64;
+ u32 s_base = src->base / 64;
+ u32 lo_word, hi_word, i;
+ u64 head_mask, tail_mask;
+
+ if (lo >= hi)
+ return;
+
+ /* absolute grid indices of the first and last words touched */
+ lo_word = lo / 64;
+ hi_word = (hi - 1) / 64;
+ /* bits at or above lo in the first word, at or below hi - 1 in the last */
+ head_mask = GENMASK_U64(63, lo & 63);
+ tail_mask = GENMASK_U64((hi - 1) & 63, 0);
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ u32 w = lo_word + i;
+ u64 m;
+
+ if (w > hi_word)
+ break;
+
+ /* full word by default; both masks apply when lo_word == hi_word */
+ m = GENMASK_U64(63, 0);
+ if (w == lo_word)
+ m &= head_mask;
+ if (w == hi_word)
+ m &= tail_mask;
+
+ cmask_op_word(dst, src, w - d_base, w - s_base, m, op);
+ }
+}
+
+/*
+ * cmask_and/or/copy/andnot only modify @dst bits that lie in the intersection
+ * of [@dst->base, @dst->base + @dst->nr_cids) and [@src->base,
+ * @src->base + @src->nr_cids). Bits in @dst outside that window
+ * keep their prior values - in particular, cmask_copy() does NOT zero @dst
+ * bits that lie outside @src's range.
+ */
+/* @dst &= @src */
+static __always_inline void cmask_and(struct scx_cmask __arena *dst,
+ const struct scx_cmask __arena *src)
+{
+ cmask_op(dst, src, BPF_CMASK_OP_AND);
+}
+
+/* @dst |= @src */
+static __always_inline void cmask_or(struct scx_cmask __arena *dst,
+ const struct scx_cmask __arena *src)
+{
+ cmask_op(dst, src, BPF_CMASK_OP_OR);
+}
+
+/* @dst = @src */
+static __always_inline void cmask_copy(struct scx_cmask __arena *dst,
+ const struct scx_cmask __arena *src)
+{
+ cmask_op(dst, src, BPF_CMASK_OP_COPY);
+}
+
+/* @dst &= ~@src */
+static __always_inline void cmask_andnot(struct scx_cmask __arena *dst,
+ const struct scx_cmask __arena *src)
+{
+ cmask_op(dst, src, BPF_CMASK_OP_ANDNOT);
+}
+
+/*
+ * True iff @a and @b have the same shape (base and nr_cids) and identical
+ * bits over that range. Callers are expected to pass same-shape cmasks;
+ * differing shapes always compare unequal.
+ *
+ * Only the active words - those overlapping the active range - are compared.
+ * Padding inside them is kept zero by the mutating helpers, while words past
+ * the last active word carry no guarantee (cmask_reframe() leaves them
+ * untouched) and must not influence the result.
+ */
+static __always_inline bool cmask_equal(const struct scx_cmask __arena *a,
+					const struct scx_cmask __arena *b)
+{
+	u32 nr_cids = a->nr_cids;
+	u32 nr_words = 0, i;
+
+	if (a->base != b->base || nr_cids != b->nr_cids)
+		return false;
+
+	/* u64 math keeps a near-U32_MAX nr_cids from wrapping the word count */
+	if (nr_cids)
+		nr_words = (u32)(((u64)(a->base & 63) + nr_cids - 1) / 64 + 1);
+
+	bpf_for(i, 0, CMASK_MAX_WORDS) {
+		if (i >= nr_words)
+			break;
+		if (a->bits[i] != b->bits[i])
+			return false;
+	}
+	return true;
+}
+
+/*
+ * True iff @a's active range lies entirely within @b's active range and
+ * every bit set in @a is also set in @b. The range check is made on the
+ * ranges themselves, not on the bits: if any part of @a's range falls outside
+ * @b's, the test fails even when no bit is set there.
+ *
+ * Only @a's active words are walked. Because @a's range is contained in
+ * @b's, each of them maps onto an active word of @b. Words past the last
+ * active word carry no guarantee (cmask_reframe() leaves them untouched) and
+ * the matching index in @b could fall outside @b's storage, so they are not
+ * read.
+ */
+static __always_inline bool cmask_subset(const struct scx_cmask __arena *a,
+					 const struct scx_cmask __arena *b)
+{
+	u32 a_end = a->base + a->nr_cids;
+	u32 b_end = b->base + b->nr_cids;
+	u32 a_wbase = a->base / 64;
+	u32 b_wbase = b->base / 64;
+	u32 nr_words = 0, i;
+
+	/* @a's range must be contained in @b's range */
+	if (a->base < b->base || a_end > b_end)
+		return false;
+
+	/* u64 math keeps a near-U32_MAX nr_cids from wrapping the word count */
+	if (a->nr_cids)
+		nr_words = (u32)(((u64)(a->base & 63) + a->nr_cids - 1) / 64 + 1);
+
+	bpf_for(i, 0, CMASK_MAX_WORDS) {
+		u32 wi_b;
+
+		if (i >= nr_words)
+			break;
+		/* same absolute grid word, re-based onto @b's bits[] */
+		wi_b = a_wbase + i - b_wbase;
+		if (a->bits[i] & ~b->bits[wi_b])
+			return false;
+	}
+	return true;
+}
+
+/**
+ * cmask_next_set - find the first set bit at or after @cid
+ * @m: cmask to search
+ * @cid: starting cid (clamped to @m->base if below)
+ *
+ * Returns the smallest set cid in [@cid, @m->base + @m->nr_cids), or
+ * @m->base + @m->nr_cids if none (the out-of-range sentinel matches the
+ * termination condition used by cmask_for_each()).
+ */
+static __always_inline u32 cmask_next_set(const struct scx_cmask __arena *m, u32 cid)
+{
+ u32 end = m->base + m->nr_cids;
+ u32 base = m->base / 64;
+ /*
+ * Index of the last active word in bits[]. Not meaningful when nr_cids is
+ * 0, but in that case the cid >= end check below returns before it is used.
+ */
+ u32 last_wi = (end - 1) / 64 - base;
+ u32 start_wi, start_bit, i;
+
+ if (cid < m->base)
+ cid = m->base;
+ if (cid >= end)
+ return end;
+
+ start_wi = cid / 64 - base;
+ start_bit = cid & 63;
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ u32 wi = start_wi + i;
+ u64 word;
+ u32 found;
+
+ if (wi > last_wi)
+ break;
+
+ word = m->bits[wi];
+ /* in the starting word, ignore bits below the starting cid */
+ if (i == 0)
+ word &= GENMASK_U64(63, start_bit);
+ if (!word)
+ continue;
+
+ /* convert (word index, lowest set bit) back to an absolute cid */
+ found = (base + wi) * 64 + ctzll(word);
+ if (found >= end)
+ return end;
+ return found;
+ }
+ return end;
+}
+
+static __always_inline u32 cmask_first_set(const struct scx_cmask __arena *m)
+{
+ return cmask_next_set(m, m->base);
+}
+
+#define cmask_for_each(cid, m) \
+ for ((cid) = cmask_first_set(m); \
+ (cid) < (m)->base + (m)->nr_cids; \
+ (cid) = cmask_next_set((m), (cid) + 1))
+
+/*
+ * Population count over [base, base + nr_cids).
+ *
+ * Only the active words - those overlapping the active range - are walked.
+ * Head and tail padding inside those words is kept zero by the mutating
+ * helpers, so a flat popcount over them is exact. Words past the last active
+ * word carry no such guarantee (cmask_reframe() leaves them untouched), so
+ * they are deliberately not counted.
+ */
+static __always_inline u32 cmask_weight(const struct scx_cmask __arena *m)
+{
+	u32 nr_cids = m->nr_cids;
+	u32 nr_words = 0, count = 0, i;
+
+	/* u64 math keeps a near-U32_MAX nr_cids from wrapping the word count */
+	if (nr_cids)
+		nr_words = (u32)(((u64)(m->base & 63) + nr_cids - 1) / 64 + 1);
+
+	bpf_for(i, 0, CMASK_MAX_WORDS) {
+		if (i >= nr_words)
+			break;
+		count += __builtin_popcountll(m->bits[i]);
+	}
+	return count;
+}
+
+/*
+ * True if @a and @b share any set bit. Walk only the intersection of their
+ * ranges, matching the semantics of cmask_and(). Returns false when the
+ * ranges do not overlap. At most CMASK_MAX_WORDS words are examined.
+ */
+static __always_inline bool cmask_intersects(const struct scx_cmask __arena *a,
+ const struct scx_cmask __arena *b)
+{
+ u32 a_end = a->base + a->nr_cids;
+ u32 b_end = b->base + b->nr_cids;
+ /* [lo, hi) is the intersection of the two active ranges */
+ u32 lo = a->base > b->base ? a->base : b->base;
+ u32 hi = a_end < b_end ? a_end : b_end;
+ u32 a_base = a->base / 64;
+ u32 b_base = b->base / 64;
+ u32 lo_word, hi_word, i;
+ u64 head_mask, tail_mask;
+
+ if (lo >= hi)
+ return false;
+
+ /* absolute grid indices of the first and last words to compare */
+ lo_word = lo / 64;
+ hi_word = (hi - 1) / 64;
+ head_mask = GENMASK_U64(63, lo & 63);
+ tail_mask = GENMASK_U64((hi - 1) & 63, 0);
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ u32 w = lo_word + i;
+ u64 mask, av, bv;
+
+ if (w > hi_word)
+ break;
+
+ /* restrict the first and last words to the intersection */
+ mask = GENMASK_U64(63, 0);
+ if (w == lo_word)
+ mask &= head_mask;
+ if (w == hi_word)
+ mask &= tail_mask;
+
+ /* same absolute word, re-based onto each mask's bits[] */
+ av = a->bits[w - a_base] & mask;
+ bv = b->bits[w - b_base] & mask;
+ if (av & bv)
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Find the next cid set in both @a and @b at or after @start, bounded by the
+ * intersection of the two ranges. Return a->base + a->nr_cids if none found.
+ *
+ * Building block for cmask_next_and_set_wrap(). Callers that want a bounded
+ * scan without wrap call this directly.
+ *
+ * Note that the "none found" sentinel is always @a's end, even though the
+ * scan itself stops at the end of the intersection.
+ */
+static __always_inline u32 cmask_next_and_set(const struct scx_cmask __arena *a,
+ const struct scx_cmask __arena *b,
+ u32 start)
+{
+ u32 a_end = a->base + a->nr_cids;
+ u32 b_end = b->base + b->nr_cids;
+ u32 a_wbase = a->base / 64;
+ u32 b_wbase = b->base / 64;
+ /* [lo, hi) is the intersection of the two active ranges */
+ u32 lo = a->base > b->base ? a->base : b->base;
+ u32 hi = a_end < b_end ? a_end : b_end;
+ u32 last_wi, start_wi, start_bit, i;
+
+ if (lo >= hi)
+ return a_end;
+ /* clamp the starting point into the intersection */
+ if (start < lo)
+ start = lo;
+ if (start >= hi)
+ return a_end;
+
+ /* absolute grid word indices, re-based per mask inside the loop */
+ last_wi = (hi - 1) / 64;
+ start_wi = start / 64;
+ start_bit = start & 63;
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ u32 abs_wi = start_wi + i;
+ u64 word;
+ u32 found;
+
+ if (abs_wi > last_wi)
+ break;
+
+ word = a->bits[abs_wi - a_wbase] & b->bits[abs_wi - b_wbase];
+ /* in the starting word, ignore bits below @start */
+ if (i == 0)
+ word &= GENMASK_U64(63, start_bit);
+ if (!word)
+ continue;
+
+ found = abs_wi * 64 + ctzll(word);
+ /* a hit at or past hi is outside the intersection */
+ if (found >= hi)
+ return a_end;
+ return found;
+ }
+ return a_end;
+}
+
+/*
+ * Like cmask_next_set(), but if nothing is set in [@start, end) the search
+ * wraps around and continues from @m->base up to (but excluding) @start.
+ * Returns m->base + m->nr_cids when no bit is set anywhere in @m.
+ *
+ * Passing (last_cid + 1) as @start gives round-robin iteration.
+ */
+static __always_inline u32 cmask_next_set_wrap(const struct scx_cmask __arena *m,
+					       u32 start)
+{
+	u32 end = m->base + m->nr_cids;
+	u32 hit = cmask_next_set(m, start);
+
+	/* found something in the forward half */
+	if (hit < end)
+		return hit;
+	/* the forward scan already began at the base, nothing left to wrap to */
+	if (start <= m->base)
+		return hit;
+
+	hit = cmask_next_set(m, m->base);
+	if (hit < start)
+		return hit;
+	return end;
+}
+
+/*
+ * Like cmask_next_and_set(), but if no common bit is found at or after
+ * @start the search wraps around and continues from @a->base up to (but
+ * excluding) @start. Returns a->base + a->nr_cids when @a and @b have no
+ * common set bit.
+ *
+ * Passing (last_cid + 1) as @start gives round-robin iteration.
+ */
+static __always_inline u32 cmask_next_and_set_wrap(const struct scx_cmask __arena *a,
+						   const struct scx_cmask __arena *b,
+						   u32 start)
+{
+	u32 a_end = a->base + a->nr_cids;
+	u32 hit = cmask_next_and_set(a, b, start);
+
+	/* found something in the forward half */
+	if (hit < a_end)
+		return hit;
+	/* the forward scan already began at @a's base, nothing left to wrap to */
+	if (start <= a->base)
+		return hit;
+
+	hit = cmask_next_and_set(a, b, a->base);
+	if (hit < start)
+		return hit;
+	return a_end;
+}
+
+/**
+ * cmask_from_cpumask - translate a kernel cpumask to a cid-space cmask
+ * @m: cmask to fill. It is zeroed first, then the cid of every cpu set in
+ *     @cpumask is set. cids outside [@m->base, @m->base + @m->nr_cids) are
+ *     ignored, as are cpus for which scx_bpf_cpu_to_cid() returns a negative
+ *     value.
+ * @cpumask: kernel cpumask to translate
+ *
+ * Bits are set with the non-atomic __cmask_set(). Caller must ensure
+ * @cpumask stays stable across the call (e.g. RCU read lock for
+ * task->cpus_ptr).
+ */
+static __always_inline void cmask_from_cpumask(struct scx_cmask __arena *m,
+					       const struct cpumask *cpumask)
+{
+	u32 nr_cpus = scx_bpf_nr_cpu_ids();
+	s32 cpu;
+
+	cmask_zero(m);
+
+	bpf_for(cpu, 0, nr_cpus) {
+		if (bpf_cpumask_test_cpu(cpu, cpumask)) {
+			s32 cid = scx_bpf_cpu_to_cid(cpu);
+
+			if (cid >= 0)
+				__cmask_set(cid, m);
+		}
+	}
+}
+
+#endif /* __SCX_CID_BPF_H */
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 19459dedde41..9591a6e778ce 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -99,8 +99,21 @@ s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
struct rq *scx_bpf_locked_rq(void) __ksym;
struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak;
+struct task_struct *scx_bpf_tid_to_task(u64 tid) __ksym __weak;
u64 scx_bpf_now(void) __ksym __weak;
void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak;
+s32 scx_bpf_cpu_to_cid(s32 cpu) __ksym __weak;
+s32 scx_bpf_cid_to_cpu(s32 cid) __ksym __weak;
+void scx_bpf_cid_topo(s32 cid, struct scx_cid_topo *out) __ksym __weak;
+s32 scx_bpf_kick_cid(s32 cid, u64 flags) __ksym __weak;
+s32 scx_bpf_task_cid(const struct task_struct *p) __ksym __weak;
+s32 scx_bpf_this_cid(void) __ksym __weak;
+struct task_struct *scx_bpf_cid_curr(s32 cid) __ksym __weak;
+u32 scx_bpf_nr_cids(void) __ksym __weak;
+u32 scx_bpf_nr_online_cids(void) __ksym __weak;
+u32 scx_bpf_cidperf_cap(s32 cid) __ksym __weak;
+u32 scx_bpf_cidperf_cur(s32 cid) __ksym __weak;
+void scx_bpf_cidperf_set(s32 cid, u32 perf) __ksym __weak;
/*
* Use the following as @it__iter when calling scx_bpf_dsq_move[_vtime]() from
@@ -526,6 +539,10 @@ static inline bool is_migration_disabled(const struct task_struct *p)
void bpf_rcu_read_lock(void) __ksym;
void bpf_rcu_read_unlock(void) __ksym;
+/* resilient qspinlock */
+int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) __ksym __weak;
+void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock) __ksym __weak;
+
/*
* Time helpers, most of which are from jiffies.h.
*/
@@ -1035,7 +1052,18 @@ static inline u64 scx_clock_irq(u32 cpu)
return irqt ? BPF_CORE_READ(irqt, total) : 0;
}
+/* Abbreviated forms of <linux/overflow.h>'s struct_size() family. */
+#define flex_array_size(p, member, count) \
+ ((count) * sizeof(*(p)->member))
+
+#define struct_size(p, member, count) \
+ (offsetof(typeof(*(p)), member) + flex_array_size(p, member, count))
+
+#define struct_size_t(type, member, count) \
+ struct_size((type *)NULL, member, count)
+
#include "compat.bpf.h"
#include "enums.bpf.h"
+#include "cid.bpf.h"
#endif /* __SCX_COMMON_BPF_H */
diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h
index 8977b5a2caa1..87f15f296234 100644
--- a/tools/sched_ext/include/scx/compat.bpf.h
+++ b/tools/sched_ext/include/scx/compat.bpf.h
@@ -121,6 +121,18 @@ static inline bool scx_bpf_sub_dispatch(u64 cgroup_id)
return false;
}
+/*
+ * v7.2: scx_bpf_cid_override() for explicit cpu->cid mapping. Ignore if
+ * missing.
+ */
+void scx_bpf_cid_override___compat(const s32 *cpu_to_cid, u32 cpu_to_cid__sz) __ksym __weak;
+
+static inline void scx_bpf_cid_override(const s32 *cpu_to_cid, u32 cpu_to_cid__sz)
+{
+	/* silently a no-op on kernels that do not provide the kfunc */
+	if (bpf_ksym_exists(scx_bpf_cid_override___compat))
+		scx_bpf_cid_override___compat(cpu_to_cid, cpu_to_cid__sz);
+}
+
/**
* __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on
* in a compatible way. We will preserve this __COMPAT helper until v6.16.
@@ -423,8 +435,10 @@ static inline void scx_bpf_dsq_reenq(u64 dsq_id, u64 reenq_flags)
}
/*
- * Define sched_ext_ops. This may be expanded to define multiple variants for
- * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().
+ * Define sched_ext_ops. See compat.h::SCX_OPS_OPEN() for how backward
+ * compatibility is handled (this macro can be expanded to emit multiple
+ * variants for incompatible op changes; SCX_OPS_OPEN() handles purely
+ * additive changes at load time).
*/
#define SCX_OPS_DEFINE(__name, ...) \
SEC(".struct_ops.link") \
@@ -432,4 +446,16 @@ static inline void scx_bpf_dsq_reenq(u64 dsq_id, u64 reenq_flags)
__VA_ARGS__, \
};
+/*
+ * Define a cid-form sched_ext_ops. Programs targeting this struct_ops type
+ * use cid-form callback signatures (select_cid, set_cmask, cid_online/offline,
+ * dispatch with cid arg, etc.) and may only call the cid-form scx_bpf_*
+ * kfuncs (kick_cid, task_cid, this_cid, ...).
+ */
+#define SCX_OPS_CID_DEFINE(__name, ...) \
+ SEC(".struct_ops.link") \
+ struct sched_ext_ops_cid __name = { \
+ __VA_ARGS__, \
+ };
+
#endif /* __SCX_COMPAT_BPF_H */
diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h
index 039854c490d5..602f07061ee3 100644
--- a/tools/sched_ext/include/scx/compat.h
+++ b/tools/sched_ext/include/scx/compat.h
@@ -149,10 +149,24 @@ static inline long scx_hotplug_seq(void)
}
/*
- * struct sched_ext_ops can change over time. If compat.bpf.h::SCX_OPS_DEFINE()
- * is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load
- * and attach it, backward compatibility is automatically maintained where
- * reasonable.
+ * Open the sched_ext_ops skeleton.
+ *
+ * struct sched_ext_ops can change over time. Two complementary mechanisms
+ * keep BPF schedulers built against newer headers running on older kernels:
+ *
+ * 1. Load-time fix-up (this macro). For each optional ops callback or field
+ * added to struct sched_ext_ops, an explicit stanza below probes the
+ * running kernel's BTF via __COMPAT_struct_has_field() and, if the field
+ * is missing, clears it in the in-memory struct_ops (with a warning to
+ * stderr) before load. Handles additive changes - a new stanza must be
+ * added here for each new optional field.
+ *
+ * 2. Multi-variant struct_ops via compat.bpf.h::SCX_OPS_DEFINE(). That
+ * macro can be expanded to emit several variants of struct sched_ext_ops,
+ * and SCX_OPS_LOAD()/ATTACH() can pick the right one based on what the
+ * kernel supports. Needed when an existing operation has to change
+ * incompatibly (e.g. a callback signature changes); the load-time
+ * fix-up above only handles purely additive changes.
*
* ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is
* the current minimum required kernel version.
@@ -225,6 +239,7 @@ static inline void __scx_ops_assoc_prog(struct bpf_program *prog,
}
#endif
+/* See SCX_OPS_OPEN() above for backward-compatibility handling. */
#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({ \
struct bpf_program *__prog; \
UEI_SET_SIZE(__skel, __ops_name, __uei_name); \
diff --git a/tools/sched_ext/include/scx/user_exit_info.bpf.h b/tools/sched_ext/include/scx/user_exit_info.bpf.h
index e7ac6611a990..98cab643c8d9 100644
--- a/tools/sched_ext/include/scx/user_exit_info.bpf.h
+++ b/tools/sched_ext/include/scx/user_exit_info.bpf.h
@@ -32,6 +32,9 @@
__uei_name##_dump_len, (__ei)->dump); \
if (bpf_core_field_exists((__ei)->exit_code)) \
__uei_name.exit_code = (__ei)->exit_code; \
+ __uei_name.exit_cpu = -1; \
+ if (bpf_core_field_exists((__ei)->exit_cpu)) \
+ __uei_name.exit_cpu = (__ei)->exit_cpu; \
/* use __sync to force memory barrier */ \
__sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \
(__ei)->kind); \
diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h
index 399697fa372f..56a02b549aef 100644
--- a/tools/sched_ext/include/scx/user_exit_info.h
+++ b/tools/sched_ext/include/scx/user_exit_info.h
@@ -39,6 +39,8 @@
fprintf(stderr, "EXIT: %s", __uei->reason); \
if (__uei->msg[0] != '\0') \
fprintf(stderr, " (%s)", __uei->msg); \
+ if (__uei->exit_cpu >= 0) \
+ fprintf(stderr, " on CPU %d", __uei->exit_cpu); \
fputs("\n", stderr); \
__uei->exit_code; \
})
diff --git a/tools/sched_ext/include/scx/user_exit_info_common.h b/tools/sched_ext/include/scx/user_exit_info_common.h
index 2d0981aedd89..76e2a055eb4b 100644
--- a/tools/sched_ext/include/scx/user_exit_info_common.h
+++ b/tools/sched_ext/include/scx/user_exit_info_common.h
@@ -22,6 +22,11 @@ enum uei_sizes {
struct user_exit_info {
int kind;
+ /*
+ * CPU that triggered the exit, or -1 if unset (e.g. running on an
+ * older kernel that does not expose this field).
+ */
+ s32 exit_cpu;
s64 exit_code;
char reason[UEI_REASON_LEN];
char msg[UEI_MSG_LEN];
diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c
index 4efcce099bd5..64dd60b3e922 100644
--- a/tools/sched_ext/scx_central.bpf.c
+++ b/tools/sched_ext/scx_central.bpf.c
@@ -149,10 +149,14 @@ static bool dispatch_to_cpu(s32 cpu)
}
/*
- * If we can't run the task at the top, do the dumb thing and
- * bounce it to the fallback dsq.
+ * If we can't run the task at the top for whatever reason,
+ * bounce it to the fallback dsq. Also check
+ * is_migration_disabled() explicitly as p->cpus_ptr may not
+ * reflect the migration-disabled state yet if
+ * migrate_disable_switch() hasn't run.
*/
- if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) {
+ if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr) ||
+ (is_migration_disabled(p) && scx_bpf_task_cpu(p) != cpu)) {
__sync_fetch_and_add(&nr_mismatches, 1);
scx_bpf_dsq_insert(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0);
bpf_task_release(p);
diff --git a/tools/sched_ext/scx_cpu0.bpf.c b/tools/sched_ext/scx_cpu0.bpf.c
index 0b1a7ce879b0..909d1be1bfe3 100644
--- a/tools/sched_ext/scx_cpu0.bpf.c
+++ b/tools/sched_ext/scx_cpu0.bpf.c
@@ -18,8 +18,6 @@
char _license[] SEC("license") = "GPL";
-const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */
-
UEI_DEFINE(uei);
/*
diff --git a/tools/sched_ext/scx_cpu0.c b/tools/sched_ext/scx_cpu0.c
index a6fba9978b9c..4966e3d4c724 100644
--- a/tools/sched_ext/scx_cpu0.c
+++ b/tools/sched_ext/scx_cpu0.c
@@ -72,8 +72,6 @@ restart:
optind = 1;
skel = SCX_OPS_OPEN(cpu0_ops, scx_cpu0);
- skel->rodata->nr_cpus = libbpf_num_possible_cpus();
-
while ((opt = getopt(argc, argv, "vh")) != -1) {
switch (opt) {
case 'v':
diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c
index d865c381589b..de2bef86d64d 100644
--- a/tools/sched_ext/scx_flatcg.c
+++ b/tools/sched_ext/scx_flatcg.c
@@ -130,7 +130,6 @@ int main(int argc, char **argv)
struct scx_flatcg *skel;
struct bpf_link *link;
struct timespec intv_ts = { .tv_sec = 2, .tv_nsec = 0 };
- bool dump_cgrps = false;
__u64 last_cpu_sum = 0, last_cpu_idle = 0;
__u64 last_stats[FCG_NR_STATS] = {};
unsigned long seq = 0;
@@ -148,7 +147,7 @@ restart:
assert(skel->rodata->nr_cpus > 0);
skel->rodata->cgrp_slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
- while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) {
+ while ((opt = getopt(argc, argv, "s:i:fvh")) != -1) {
double v;
switch (opt) {
@@ -161,9 +160,6 @@ restart:
intv_ts.tv_sec = v;
intv_ts.tv_nsec = (v - (float)intv_ts.tv_sec) * 1000000000;
break;
- case 'd':
- dump_cgrps = true;
- break;
case 'f':
skel->rodata->fifo_sched = true;
break;
@@ -177,10 +173,10 @@ restart:
}
}
- printf("slice=%.1lfms intv=%.1lfs dump_cgrps=%d",
+ printf("slice=%.1lfms intv=%.1lfs",
(double)skel->rodata->cgrp_slice_ns / 1000000.0,
- (double)intv_ts.tv_sec + (double)intv_ts.tv_nsec / 1000000000.0,
- dump_cgrps);
+ (double)intv_ts.tv_sec + (double)intv_ts.tv_nsec / 1000000000.0);
+
SCX_OPS_LOAD(skel, flatcg_ops, scx_flatcg, uei);
link = SCX_OPS_ATTACH(skel, flatcg_ops, scx_flatcg);
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index aad698fe294b..fd9a82a67627 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -2,15 +2,16 @@
/*
* A simple five-level FIFO queue scheduler.
*
- * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
- * assigned to one depending on its compound weight. Each CPU round robins
- * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
- * queue0, 2 from queue1, 4 from queue2 and so on.
+ * There are five FIFOs implemented as arena-backed doubly-linked lists
+ * threaded through per-task context. A task gets assigned to one depending on
+ * its compound weight. Each CPU round robins through the FIFOs and dispatches
+ * more from FIFOs with higher indices - 1 from queue0, 2 from queue1, 4 from
+ * queue2 and so on.
*
* This scheduler demonstrates:
*
- * - BPF-side queueing using PIDs.
- * - Sleepable per-task storage allocation using ops.prep_enable().
+ * - BPF-side queueing using TIDs.
+ * - BPF arena for scheduler state.
* - Core-sched support.
*
* This scheduler is primarily for demonstration and testing of sched_ext
@@ -22,6 +23,8 @@
*/
#include <scx/common.bpf.h>
+#include "scx_qmap.h"
+
enum consts {
ONE_SEC_IN_NS = 1000000000,
ONE_MSEC_IN_NS = 1000000,
@@ -47,40 +50,72 @@ const volatile s32 disallow_tgid;
const volatile bool suppress_dump;
const volatile bool always_enq_immed;
const volatile u32 immed_stress_nth;
+const volatile u32 max_tasks;
-u64 nr_highpri_queued;
-u32 test_error_cnt;
-
-#define MAX_SUB_SCHEDS 8
-u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];
+/*
+ * Optional cid-override test harness. When cid_override_mode is non-zero,
+ * qmap_init() calls scx_bpf_cid_override() with the caller-supplied
+ * cpu_to_cid array to exercise the kfunc's acceptance and error paths.
+ *
+ * 0 = disabled
+ * 1 = valid reverse mapping
+ * 2 = invalid: duplicate cid assignment
+ * 3 = invalid: out-of-range cid
+ */
+const volatile u32 cid_override_mode;
+/*
+ * Array lives in bss (writable) because scx_bpf_cid_override()'s BPF
+ * verifier signature treats its len-paired pointer as read/write - rodata
+ * fails verification with "write into map forbidden". Userspace populates
+ * it before SCX_OPS_LOAD, same as rodata, and nothing writes it after.
+ */
+s32 cid_override_cpu_to_cid[SCX_QMAP_MAX_CPUS];
UEI_DEFINE(uei);
-struct qmap {
- __uint(type, BPF_MAP_TYPE_QUEUE);
- __uint(max_entries, 4096);
- __type(value, u32);
-} queue0 SEC(".maps"),
- queue1 SEC(".maps"),
- queue2 SEC(".maps"),
- queue3 SEC(".maps"),
- queue4 SEC(".maps"),
- dump_store SEC(".maps");
-
+/*
+ * All scheduler state - per-cpu context, stats counters, core-sched sequence
+ * numbers, sub-sched cgroup ids - lives in this single BPF arena map. Userspace
+ * reaches it via skel->arena->qa.
+ */
struct {
- __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
- __uint(max_entries, 5);
- __type(key, int);
- __array(values, struct qmap);
-} queue_arr SEC(".maps") = {
- .values = {
- [0] = &queue0,
- [1] = &queue1,
- [2] = &queue2,
- [3] = &queue3,
- [4] = &queue4,
- },
-};
+ __uint(type, BPF_MAP_TYPE_ARENA);
+ __uint(map_flags, BPF_F_MMAPABLE);
+ __uint(max_entries, 1 << 16); /* upper bound in pages */
+#if defined(__TARGET_ARCH_arm64) || defined(__aarch64__)
+ __ulong(map_extra, 0x1ull << 32); /* user/BPF mmap base */
+#else
+ __ulong(map_extra, 0x1ull << 44);
+#endif
+} arena SEC(".maps");
+
+struct qmap_arena __arena_global qa;
+
+/*
+ * Global idle-cid tracking, maintained via update_idle / cpu_offline and
+ * scanned by the direct-dispatch path. Allocated in qmap_init() from one
+ * arena page, sized to the full cid space.
+ */
+struct scx_cmask __arena *qa_idle_cids;
+
+/* Per-queue locks. Each in its own .data section as bpf_res_spin_lock requires. */
+__hidden struct bpf_res_spin_lock qa_q_lock0 SEC(".data.qa_q_lock0");
+__hidden struct bpf_res_spin_lock qa_q_lock1 SEC(".data.qa_q_lock1");
+__hidden struct bpf_res_spin_lock qa_q_lock2 SEC(".data.qa_q_lock2");
+__hidden struct bpf_res_spin_lock qa_q_lock3 SEC(".data.qa_q_lock3");
+__hidden struct bpf_res_spin_lock qa_q_lock4 SEC(".data.qa_q_lock4");
+
+static struct bpf_res_spin_lock *qa_q_lock(s32 qid)
+{
+ switch (qid) {
+ case 0: return &qa_q_lock0;
+ case 1: return &qa_q_lock1;
+ case 2: return &qa_q_lock2;
+ case 3: return &qa_q_lock3;
+ case 4: return &qa_q_lock4;
+ default: return NULL;
+ }
+}
/*
* If enabled, CPU performance target is set according to the queue index
@@ -102,85 +137,214 @@ static const u32 qidx_to_cpuperf_target[] = {
* task's seq and the associated queue's head seq is called the queue distance
* and used when comparing two tasks for ordering. See qmap_core_sched_before().
*/
-static u64 core_sched_head_seqs[5];
-static u64 core_sched_tail_seqs[5];
-/* Per-task scheduling context */
+/*
+ * Per-task scheduling context. Allocated from the qa.task_ctxs[] slab in
+ * arena. While the task is alive the entry is referenced from task_ctx_stor;
+ * while it's free the entry sits on the free list singly-linked through
+ * @next_free.
+ *
+ * When the task is queued on one of the five priority FIFOs, @fifo points to
+ * that queue and @q_next/@q_prev link it in the queue's doubly-linked list.
+ * @fifo is NULL when the task isn't on any queue.
+ */
struct task_ctx {
- bool force_local; /* Dispatch directly to local_dsq */
- bool highpri;
- u64 core_sched_seq;
+ struct task_ctx __arena *next_free; /* only valid on free list */
+ struct task_ctx __arena *q_next; /* queue link, NULL if tail */
+ struct task_ctx __arena *q_prev; /* queue link, NULL if head */
+ struct qmap_fifo __arena *fifo; /* queue we're on, NULL if not queued */
+ u64 tid;
+ s32 pid; /* for dump only */
+ bool force_local; /* Dispatch directly to local_dsq */
+ bool highpri;
+ u64 core_sched_seq;
+ struct scx_cmask cpus_allowed; /* per-task affinity in cid space */
+};
+
+/*
+ * Slab stride for task_ctx. cpus_allowed's flex array bits[] overlaps the
+ * tail bytes appended per entry; struct_size_t() gives the actual per-entry
+ * footprint.
+ */
+#define TASK_CTX_STRIDE \
+ struct_size_t(struct task_ctx, cpus_allowed.bits, \
+ CMASK_NR_WORDS(SCX_QMAP_MAX_CPUS))
+
+/* All task_ctx pointers are arena pointers. */
+typedef struct task_ctx __arena task_ctx_t;
+
+/* Holds an arena pointer to the task's slab entry. */
+struct task_ctx_stor_val {
+ task_ctx_t *taskc;
};
struct {
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
- __type(value, struct task_ctx);
+ __type(value, struct task_ctx_stor_val);
} task_ctx_stor SEC(".maps");
-struct cpu_ctx {
- u64 dsp_idx; /* dispatch index */
- u64 dsp_cnt; /* remaining count */
- u32 avg_weight;
- u32 cpuperf_target;
-};
+/* Protects the task_ctx slab free list. */
+__hidden struct bpf_res_spin_lock qa_task_lock SEC(".data.qa_task_lock");
-struct {
- __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
- __uint(max_entries, 1);
- __type(key, u32);
- __type(value, struct cpu_ctx);
-} cpu_ctx_stor SEC(".maps");
+static int qmap_spin_lock(struct bpf_res_spin_lock *lock)
+{
+ if (bpf_res_spin_lock(lock)) {
+ scx_bpf_error("res_spin_lock failed");
+ return -EBUSY;
+ }
+ return 0;
+}
-/* Statistics */
-u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0, nr_dequeued, nr_ddsp_from_enq;
-u64 nr_core_sched_execed;
-u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
-u32 cpuperf_min, cpuperf_avg, cpuperf_max;
-u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
+/*
+ * Try prev_cid, then scan taskc->cpus_allowed AND qa_idle_cids round-robin
+ * from prev_cid + 1. Atomic claim retries on race; bounded by
+ * IDLE_PICK_RETRIES to keep the verifier's insn budget in check.
+ */
+#define IDLE_PICK_RETRIES 16
-static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
+static s32 pick_direct_dispatch_cid(struct task_struct *p, s32 prev_cid,
+ task_ctx_t *taskc)
{
- s32 cpu;
+ u32 nr_cids = scx_bpf_nr_cids();
+ s32 cid;
+ u32 i;
if (!always_enq_immed && p->nr_cpus_allowed == 1)
- return prev_cpu;
+ return prev_cid;
+
+ if (cmask_test_and_clear(prev_cid, qa_idle_cids))
+ return prev_cid;
+
+ cid = prev_cid;
+ bpf_for(i, 0, IDLE_PICK_RETRIES) {
+ cid = cmask_next_and_set_wrap(&taskc->cpus_allowed,
+ qa_idle_cids, cid + 1);
+ barrier_var(cid);
+ if (cid >= nr_cids)
+ return -1;
+ if (cmask_test_and_clear(cid, qa_idle_cids))
+ return cid;
+ }
+ return -1;
+}
- if (scx_bpf_test_and_clear_cpu_idle(prev_cpu))
- return prev_cpu;
+/*
+ * Force a reference to the arena map. The verifier associates an arena with
+ * a program by finding an LD_IMM64 instruction that loads the arena's BPF
+ * map; programs that only use arena pointers returned from task-local
+ * storage (like qmap_select_cpu) never reference @arena directly. Without
+ * this, the verifier rejects addr_space_cast with "addr_space_cast insn
+ * can only be used in a program that has an associated arena".
+ */
+#define QMAP_TOUCH_ARENA() do { asm volatile("" :: "r"(&arena)); } while (0)
- cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
- if (cpu >= 0)
- return cpu;
+static task_ctx_t *lookup_task_ctx(struct task_struct *p)
+{
+ struct task_ctx_stor_val *v;
- return -1;
+ QMAP_TOUCH_ARENA();
+
+ v = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+ if (!v || !v->taskc)
+ return NULL;
+ return v->taskc;
}
-static struct task_ctx *lookup_task_ctx(struct task_struct *p)
+/* Append @taskc to the tail of @fifo. Must not already be queued. */
+static void qmap_fifo_enqueue(struct qmap_fifo __arena *fifo, task_ctx_t *taskc)
{
- return bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+ struct bpf_res_spin_lock *lock = qa_q_lock(fifo->idx);
+
+ if (!lock || qmap_spin_lock(lock))
+ return;
+ taskc->fifo = fifo;
+ taskc->q_next = NULL;
+ taskc->q_prev = fifo->tail;
+ if (fifo->tail)
+ fifo->tail->q_next = taskc;
+ else
+ fifo->head = taskc;
+ fifo->tail = taskc;
+ bpf_res_spin_unlock(lock);
+}
+
+/* Pop the head of @fifo. Returns NULL if empty. */
+static task_ctx_t *qmap_fifo_pop(struct qmap_fifo __arena *fifo)
+{
+ struct bpf_res_spin_lock *lock = qa_q_lock(fifo->idx);
+ task_ctx_t *taskc;
+
+ if (!lock || qmap_spin_lock(lock))
+ return NULL;
+ taskc = fifo->head;
+ if (taskc) {
+ fifo->head = taskc->q_next;
+ if (taskc->q_next)
+ taskc->q_next->q_prev = NULL;
+ else
+ fifo->tail = NULL;
+ taskc->q_next = NULL;
+ taskc->q_prev = NULL;
+ taskc->fifo = NULL;
+ }
+ bpf_res_spin_unlock(lock);
+ return taskc;
+}
+
+/* Remove @taskc from its fifo. No-op if not queued. */
+static void qmap_fifo_remove(task_ctx_t *taskc)
+{
+ struct qmap_fifo __arena *fifo = taskc->fifo;
+ struct bpf_res_spin_lock *lock;
+
+ if (!fifo)
+ return;
+
+ lock = qa_q_lock(fifo->idx);
+ if (!lock || qmap_spin_lock(lock))
+ return;
+
+ /* Re-check under lock — a concurrent pop may have cleared fifo. */
+ if (taskc->fifo != fifo) {
+ bpf_res_spin_unlock(lock);
+ return;
+ }
+
+ if (taskc->q_next)
+ taskc->q_next->q_prev = taskc->q_prev;
+ else
+ fifo->tail = taskc->q_prev;
+ if (taskc->q_prev)
+ taskc->q_prev->q_next = taskc->q_next;
+ else
+ fifo->head = taskc->q_next;
+ taskc->q_next = NULL;
+ taskc->q_prev = NULL;
+ taskc->fifo = NULL;
+ bpf_res_spin_unlock(lock);
}
-s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
- s32 prev_cpu, u64 wake_flags)
+s32 BPF_STRUCT_OPS(qmap_select_cid, struct task_struct *p,
+ s32 prev_cid, u64 wake_flags)
{
- struct task_ctx *tctx;
- s32 cpu;
+ task_ctx_t *taskc;
+ s32 cid;
- if (!(tctx = lookup_task_ctx(p)))
- return prev_cpu;
+ if (!(taskc = lookup_task_ctx(p)))
+ return prev_cid;
if (p->scx.weight < 2 && !(p->flags & PF_KTHREAD))
- return prev_cpu;
+ return prev_cid;
- cpu = pick_direct_dispatch_cpu(p, prev_cpu);
+ cid = pick_direct_dispatch_cid(p, prev_cid, taskc);
- if (cpu >= 0) {
- tctx->force_local = true;
- return cpu;
+ if (cid >= 0) {
+ taskc->force_local = true;
+ return cid;
} else {
- return prev_cpu;
+ return prev_cid;
}
}
@@ -202,16 +366,14 @@ static int weight_to_idx(u32 weight)
void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
static u32 user_cnt, kernel_cnt;
- struct task_ctx *tctx;
- u32 pid = p->pid;
+ task_ctx_t *taskc;
int idx = weight_to_idx(p->scx.weight);
- void *ring;
- s32 cpu;
+ s32 cid;
if (enq_flags & SCX_ENQ_REENQ) {
- __sync_fetch_and_add(&nr_reenqueued, 1);
- if (scx_bpf_task_cpu(p) == 0)
- __sync_fetch_and_add(&nr_reenqueued_cpu0, 1);
+ __sync_fetch_and_add(&qa.nr_reenqueued, 1);
+ if (scx_bpf_task_cid(p) == 0)
+ __sync_fetch_and_add(&qa.nr_reenqueued_cid0, 1);
}
if (p->flags & PF_KTHREAD) {
@@ -222,17 +384,17 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
return;
}
- if (test_error_cnt && !--test_error_cnt)
+ if (qa.test_error_cnt && !--qa.test_error_cnt)
scx_bpf_error("test triggering error");
- if (!(tctx = lookup_task_ctx(p)))
+ if (!(taskc = lookup_task_ctx(p)))
return;
/*
* All enqueued tasks must have their core_sched_seq updated for correct
* core-sched ordering. Also, take a look at the end of qmap_dispatch().
*/
- tctx->core_sched_seq = core_sched_tail_seqs[idx]++;
+ taskc->core_sched_seq = qa.core_sched_tail_seqs[idx]++;
/*
* IMMED stress testing: Every immed_stress_nth'th enqueue, dispatch
@@ -243,19 +405,19 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
static u32 immed_stress_cnt;
if (!(++immed_stress_cnt % immed_stress_nth)) {
- tctx->force_local = false;
- scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | scx_bpf_task_cpu(p),
+ taskc->force_local = false;
+ scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | scx_bpf_task_cid(p),
slice_ns, enq_flags);
return;
}
}
/*
- * If qmap_select_cpu() is telling us to or this is the last runnable
+ * If qmap_select_cid() is telling us to or this is the last runnable
* task on the CPU, enqueue locally.
*/
- if (tctx->force_local) {
- tctx->force_local = false;
+ if (taskc->force_local) {
+ taskc->force_local = false;
scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
return;
}
@@ -267,11 +429,11 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
return;
}
- /* if select_cpu() wasn't called, try direct dispatch */
+ /* if select_cid() wasn't called, try direct dispatch */
if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
- (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
- __sync_fetch_and_add(&nr_ddsp_from_enq, 1);
- scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
+ (cid = pick_direct_dispatch_cid(p, scx_bpf_task_cid(p), taskc)) >= 0) {
+ __sync_fetch_and_add(&qa.nr_ddsp_from_enq, 1);
+ scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cid, slice_ns, enq_flags);
return;
}
@@ -279,55 +441,52 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
* If the task was re-enqueued due to the CPU being preempted by a
* higher priority scheduling class, just re-enqueue the task directly
* on the global DSQ. As we want another CPU to pick it up, find and
- * kick an idle CPU.
+ * kick an idle cid.
*/
if (enq_flags & SCX_ENQ_REENQ) {
- s32 cpu;
+ s32 cid;
scx_bpf_dsq_insert(p, SHARED_DSQ, 0, enq_flags);
- cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
- if (cpu >= 0)
- scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
+ cid = cmask_next_and_set_wrap(&taskc->cpus_allowed,
+ qa_idle_cids, 0);
+ if (cid < scx_bpf_nr_cids())
+ scx_bpf_kick_cid(cid, SCX_KICK_IDLE);
return;
}
- ring = bpf_map_lookup_elem(&queue_arr, &idx);
- if (!ring) {
- scx_bpf_error("failed to find ring %d", idx);
- return;
- }
-
- /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
- if (bpf_map_push_elem(ring, &pid, 0)) {
- scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
- return;
- }
+ /* Queue on the selected FIFO. */
+ qmap_fifo_enqueue(&qa.fifos[idx], taskc);
if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
- tctx->highpri = true;
- __sync_fetch_and_add(&nr_highpri_queued, 1);
+ taskc->highpri = true;
+ __sync_fetch_and_add(&qa.nr_highpri_queued, 1);
}
- __sync_fetch_and_add(&nr_enqueued, 1);
+ __sync_fetch_and_add(&qa.nr_enqueued, 1);
}
-/*
- * The BPF queue map doesn't support removal and sched_ext can handle spurious
- * dispatches. qmap_dequeue() is only used to collect statistics.
- */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
- __sync_fetch_and_add(&nr_dequeued, 1);
+ task_ctx_t *taskc;
+
+ __sync_fetch_and_add(&qa.nr_dequeued, 1);
if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
- __sync_fetch_and_add(&nr_core_sched_execed, 1);
+ __sync_fetch_and_add(&qa.nr_core_sched_execed, 1);
+
+ taskc = lookup_task_ctx(p);
+ if (taskc && taskc->fifo) {
+ if (taskc->highpri)
+ __sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
+ qmap_fifo_remove(taskc);
+ }
}
static void update_core_sched_head_seq(struct task_struct *p)
{
int idx = weight_to_idx(p->scx.weight);
- struct task_ctx *tctx;
+ task_ctx_t *taskc;
- if ((tctx = lookup_task_ctx(p)))
- core_sched_head_seqs[idx] = tctx->core_sched_seq;
+ if ((taskc = lookup_task_ctx(p)))
+ qa.core_sched_head_seqs[idx] = taskc->core_sched_seq;
}
/*
@@ -343,17 +502,18 @@ static void update_core_sched_head_seq(struct task_struct *p)
static bool dispatch_highpri(bool from_timer)
{
struct task_struct *p;
- s32 this_cpu = bpf_get_smp_processor_id();
+ s32 this_cid = scx_bpf_this_cid();
+ u32 nr_cids = scx_bpf_nr_cids();
/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
static u64 highpri_seq;
- struct task_ctx *tctx;
+ task_ctx_t *taskc;
- if (!(tctx = lookup_task_ctx(p)))
+ if (!(taskc = lookup_task_ctx(p)))
return false;
- if (tctx->highpri) {
+ if (taskc->highpri) {
/* exercise the set_*() and vtime interface too */
scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
@@ -362,30 +522,38 @@ static bool dispatch_highpri(bool from_timer)
}
/*
- * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
- * is found.
+ * Scan HIGHPRI_DSQ and dispatch until a task that can run here is
+ * found. Prefer this_cid if the task allows it; otherwise RR-scan the
+ * task's cpus_allowed starting after this_cid.
*/
bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
+ task_ctx_t *taskc;
bool dispatched = false;
- s32 cpu;
+ s32 cid;
+
+ if (!(taskc = lookup_task_ctx(p)))
+ return false;
- if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
- cpu = this_cpu;
+ if (cmask_test(this_cid, &taskc->cpus_allowed))
+ cid = this_cid;
else
- cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
+ cid = cmask_next_set_wrap(&taskc->cpus_allowed,
+ this_cid + 1);
+ if (cid >= nr_cids)
+ continue;
- if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cpu,
+ if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cid,
SCX_ENQ_PREEMPT)) {
- if (cpu == this_cpu) {
+ if (cid == this_cid) {
dispatched = true;
- __sync_fetch_and_add(&nr_expedited_local, 1);
+ __sync_fetch_and_add(&qa.nr_expedited_local, 1);
} else {
- __sync_fetch_and_add(&nr_expedited_remote, 1);
+ __sync_fetch_and_add(&qa.nr_expedited_remote, 1);
}
if (from_timer)
- __sync_fetch_and_add(&nr_expedited_from_timer, 1);
+ __sync_fetch_and_add(&qa.nr_expedited_from_timer, 1);
} else {
- __sync_fetch_and_add(&nr_expedited_lost, 1);
+ __sync_fetch_and_add(&qa.nr_expedited_lost, 1);
}
if (dispatched)
@@ -395,22 +563,21 @@ static bool dispatch_highpri(bool from_timer)
return false;
}
-void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
+void BPF_STRUCT_OPS(qmap_dispatch, s32 cid, struct task_struct *prev)
{
struct task_struct *p;
- struct cpu_ctx *cpuc;
- struct task_ctx *tctx;
- u32 zero = 0, batch = dsp_batch ?: 1;
- void *fifo;
- s32 i, pid;
+ struct cpu_ctx __arena *cpuc;
+ task_ctx_t *taskc;
+ u32 batch = dsp_batch ?: 1;
+ s32 i;
if (dispatch_highpri(false))
return;
- if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0))
+ if (!qa.nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0))
return;
- if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
+ if (dsp_inf_loop_after && qa.nr_dispatched > dsp_inf_loop_after) {
/*
* PID 2 should be kthreadd which should mostly be idle and off
* the scheduler. Let's keep dispatching it to force the kernel
@@ -424,10 +591,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
}
}
- if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
- scx_bpf_error("failed to look up cpu_ctx");
- return;
- }
+ cpuc = &qa.cpu_ctxs[scx_bpf_this_cid()];
for (i = 0; i < 5; i++) {
/* Advance the dispatch cursor and pick the fifo. */
@@ -436,33 +600,23 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
}
- fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
- if (!fifo) {
- scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
- return;
- }
-
/* Dispatch or advance. */
bpf_repeat(BPF_MAX_LOOPS) {
- struct task_ctx *tctx;
+ task_ctx_t *taskc;
- if (bpf_map_pop_elem(fifo, &pid))
+ taskc = qmap_fifo_pop(&qa.fifos[cpuc->dsp_idx]);
+ if (!taskc)
break;
- p = bpf_task_from_pid(pid);
+ p = scx_bpf_tid_to_task(taskc->tid);
if (!p)
continue;
- if (!(tctx = lookup_task_ctx(p))) {
- bpf_task_release(p);
- return;
- }
-
- if (tctx->highpri)
- __sync_fetch_and_sub(&nr_highpri_queued, 1);
+ if (taskc->highpri)
+ __sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
update_core_sched_head_seq(p);
- __sync_fetch_and_add(&nr_dispatched, 1);
+ __sync_fetch_and_add(&qa.nr_dispatched, 1);
scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);
@@ -502,10 +656,8 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
* document this class of issue -- other schedulers
* seeing similar warnings can use this as a reference.
*/
- if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
- scx_bpf_kick_cpu(scx_bpf_task_cpu(p), 0);
-
- bpf_task_release(p);
+ if (!cmask_test(cid, &taskc->cpus_allowed))
+ scx_bpf_kick_cid(scx_bpf_task_cid(p), 0);
batch--;
cpuc->dsp_cnt--;
@@ -523,8 +675,8 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
}
for (i = 0; i < MAX_SUB_SCHEDS; i++) {
- if (sub_sched_cgroup_ids[i] &&
- scx_bpf_sub_dispatch(sub_sched_cgroup_ids[i]))
+ if (qa.sub_sched_cgroup_ids[i] &&
+ scx_bpf_sub_dispatch(qa.sub_sched_cgroup_ids[i]))
return;
}
@@ -533,24 +685,20 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
* if the task were enqueued and dispatched immediately.
*/
if (prev) {
- tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
- if (tctx)
- tctx->core_sched_seq =
- core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
+ taskc = lookup_task_ctx(prev);
+ if (!taskc)
+ return;
+
+ taskc->core_sched_seq =
+ qa.core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
}
}
void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
{
- struct cpu_ctx *cpuc;
- u32 zero = 0;
+ struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[scx_bpf_this_cid()];
int idx;
- if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
- scx_bpf_error("failed to look up cpu_ctx");
- return;
- }
-
/*
* Use the running avg of weights to select the target cpuperf level.
* This is a demonstration of the cpuperf feature rather than a
@@ -560,7 +708,7 @@ void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
idx = weight_to_idx(cpuc->avg_weight);
cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];
- scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
+ scx_bpf_cidperf_set(scx_bpf_task_cid(p), cpuc->cpuperf_target);
}
/*
@@ -570,14 +718,14 @@ void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
static s64 task_qdist(struct task_struct *p)
{
int idx = weight_to_idx(p->scx.weight);
- struct task_ctx *tctx;
+ task_ctx_t *taskc;
s64 qdist;
- tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
- if (!tctx)
+ taskc = lookup_task_ctx(p);
+ if (!taskc)
return 0;
- qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];
+ qdist = taskc->core_sched_seq - qa.core_sched_head_seqs[idx];
/*
* As queue index increments, the priority doubles. The queue w/ index 3
@@ -610,70 +758,110 @@ bool BPF_STRUCT_OPS(qmap_core_sched_before,
* tasks when a higher-priority scheduling class takes the CPU.
*/
-s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
- struct scx_init_task_args *args)
+s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init_task, struct task_struct *p,
+ struct scx_init_task_args *args)
{
+ struct task_ctx_stor_val *v;
+ task_ctx_t *taskc;
+
if (p->tgid == disallow_tgid)
p->scx.disallow = true;
- /*
- * @p is new. Let's ensure that its task_ctx is available. We can sleep
- * in this function and the following will automatically use GFP_KERNEL.
- */
- if (bpf_task_storage_get(&task_ctx_stor, p, 0,
- BPF_LOCAL_STORAGE_GET_F_CREATE))
- return 0;
- else
+ /* pop a slab entry off the free list */
+ if (qmap_spin_lock(&qa_task_lock))
+ return -EBUSY;
+ taskc = qa.task_free_head;
+ if (taskc)
+ qa.task_free_head = taskc->next_free;
+ bpf_res_spin_unlock(&qa_task_lock);
+ if (!taskc) {
+ scx_bpf_error("task_ctx slab exhausted (max_tasks=%u)", max_tasks);
+ return -ENOMEM;
+ }
+
+ taskc->next_free = NULL;
+ taskc->q_next = NULL;
+ taskc->q_prev = NULL;
+ taskc->fifo = NULL;
+ taskc->tid = p->scx.tid;
+ taskc->pid = p->pid;
+ taskc->force_local = false;
+ taskc->highpri = false;
+ taskc->core_sched_seq = 0;
+ cmask_init(&taskc->cpus_allowed, 0, scx_bpf_nr_cids());
+ bpf_rcu_read_lock();
+ cmask_from_cpumask(&taskc->cpus_allowed, p->cpus_ptr);
+ bpf_rcu_read_unlock();
+
+ v = bpf_task_storage_get(&task_ctx_stor, p, NULL,
+ BPF_LOCAL_STORAGE_GET_F_CREATE);
+ if (!v) {
+ /* push back to the free list */
+ if (!qmap_spin_lock(&qa_task_lock)) {
+ taskc->next_free = qa.task_free_head;
+ qa.task_free_head = taskc;
+ bpf_res_spin_unlock(&qa_task_lock);
+ }
return -ENOMEM;
+ }
+ v->taskc = taskc;
+ return 0;
}
-void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
+void BPF_STRUCT_OPS(qmap_exit_task, struct task_struct *p,
+ struct scx_exit_task_args *args)
{
- s32 i, pid;
+ struct task_ctx_stor_val *v;
+ task_ctx_t *taskc;
- if (suppress_dump)
+ v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
+ if (!v || !v->taskc)
return;
+ taskc = v->taskc;
+ v->taskc = NULL;
- bpf_for(i, 0, 5) {
- void *fifo;
+ if (qmap_spin_lock(&qa_task_lock))
+ return;
+ taskc->next_free = qa.task_free_head;
+ qa.task_free_head = taskc;
+ bpf_res_spin_unlock(&qa_task_lock);
+}
- if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
- return;
+void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
+{
+ task_ctx_t *taskc;
+ s32 i;
- scx_bpf_dump("QMAP FIFO[%d]:", i);
+ QMAP_TOUCH_ARENA();
- /*
- * Dump can be invoked anytime and there is no way to iterate in
- * a non-destructive way. Pop and store in dump_store and then
- * restore afterwards. If racing against new enqueues, ordering
- * can get mixed up.
- */
- bpf_repeat(4096) {
- if (bpf_map_pop_elem(fifo, &pid))
- break;
- bpf_map_push_elem(&dump_store, &pid, 0);
- scx_bpf_dump(" %d", pid);
- }
+ if (suppress_dump)
+ return;
+ /*
+ * Walk the queue lists without locking - kfunc calls (scx_bpf_dump)
+ * aren't in the verifier's kfunc_spin_allowed() list so we can't hold
+ * a lock and dump. Best-effort; racing may print stale tids but the
+ * walk is bounded by bpf_repeat() so it always terminates.
+ */
+ bpf_for(i, 0, 5) {
+ scx_bpf_dump("QMAP FIFO[%d]:", i);
+ taskc = qa.fifos[i].head;
bpf_repeat(4096) {
- if (bpf_map_pop_elem(&dump_store, &pid))
+ if (!taskc)
break;
- bpf_map_push_elem(fifo, &pid, 0);
+ scx_bpf_dump(" %d:%llu", taskc->pid, taskc->tid);
+ taskc = taskc->q_next;
}
-
scx_bpf_dump("\n");
}
}
-void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
+void BPF_STRUCT_OPS(qmap_dump_cid, struct scx_dump_ctx *dctx, s32 cid, bool idle)
{
- u32 zero = 0;
- struct cpu_ctx *cpuc;
+ struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[cid];
if (suppress_dump || idle)
return;
- if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
- return;
scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
@@ -682,12 +870,17 @@ void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle
void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
{
- struct task_ctx *taskc;
+ struct task_ctx_stor_val *v;
+ task_ctx_t *taskc;
+
+ QMAP_TOUCH_ARENA();
if (suppress_dump)
return;
- if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
+ v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
+ if (!v || !v->taskc)
return;
+ taskc = v->taskc;
scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
taskc->force_local, taskc->core_sched_seq);
@@ -716,61 +909,25 @@ void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp,
cgrp->kn->id, period_us, quota_us, burst_us);
}
-/*
- * Print out the online and possible CPU map using bpf_printk() as a
- * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
- */
-static void print_cpus(void)
+void BPF_STRUCT_OPS(qmap_update_idle, s32 cid, bool idle)
{
- const struct cpumask *possible, *online;
- s32 cpu;
- char buf[128] = "", *p;
- int idx;
-
- possible = scx_bpf_get_possible_cpumask();
- online = scx_bpf_get_online_cpumask();
-
- idx = 0;
- bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) {
- if (!(p = MEMBER_VPTR(buf, [idx++])))
- break;
- if (bpf_cpumask_test_cpu(cpu, online))
- *p++ = 'O';
- else if (bpf_cpumask_test_cpu(cpu, possible))
- *p++ = 'X';
- else
- *p++ = ' ';
-
- if ((cpu & 7) == 7) {
- if (!(p = MEMBER_VPTR(buf, [idx++])))
- break;
- *p++ = '|';
- }
- }
- buf[sizeof(buf) - 1] = '\0';
-
- scx_bpf_put_cpumask(online);
- scx_bpf_put_cpumask(possible);
-
- bpf_printk("CPUS: |%s", buf);
+ QMAP_TOUCH_ARENA();
+ if (idle)
+ cmask_set(cid, qa_idle_cids);
+ else
+ cmask_clear(cid, qa_idle_cids);
}
-void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
+void BPF_STRUCT_OPS(qmap_set_cmask, struct task_struct *p,
+ const struct scx_cmask *cmask_in)
{
- if (print_msgs) {
- bpf_printk("CPU %d coming online", cpu);
- /* @cpu is already online at this point */
- print_cpus();
- }
-}
+ struct scx_cmask __arena *cmask = (struct scx_cmask __arena *)(long)cmask_in;
+ task_ctx_t *taskc;
-void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
-{
- if (print_msgs) {
- bpf_printk("CPU %d going offline", cpu);
- /* @cpu is still online at this point */
- print_cpus();
- }
+ taskc = lookup_task_ctx(p);
+ if (!taskc)
+ return;
+ cmask_copy(&taskc->cpus_allowed, cmask);
}
struct monitor_timer {
@@ -785,64 +942,49 @@ struct {
} monitor_timer SEC(".maps");
/*
- * Print out the min, avg and max performance levels of CPUs every second to
- * demonstrate the cpuperf interface.
+ * Aggregate cidperf across the first nr_online_cids cids. Post-hotplug
+ * the first-N-are-online invariant drifts, so some cap/cur values may
+ * be stale. For this demo monitor that's fine; the scheduler exits on
+ * the enable-time hotplug_seq mismatch and userspace restarts, which
+ * rebuilds the layout.
*/
static void monitor_cpuperf(void)
{
- u32 zero = 0, nr_cpu_ids;
+ u32 nr_online = scx_bpf_nr_online_cids();
u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
- const struct cpumask *online;
- int i, nr_online_cpus = 0;
-
- nr_cpu_ids = scx_bpf_nr_cpu_ids();
- online = scx_bpf_get_online_cpumask();
+ s32 cid;
- bpf_for(i, 0, nr_cpu_ids) {
- struct cpu_ctx *cpuc;
- u32 cap, cur;
+ QMAP_TOUCH_ARENA();
- if (!bpf_cpumask_test_cpu(i, online))
- continue;
- nr_online_cpus++;
-
- /* collect the capacity and current cpuperf */
- cap = scx_bpf_cpuperf_cap(i);
- cur = scx_bpf_cpuperf_cur(i);
+ bpf_for(cid, 0, nr_online) {
+ struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[cid];
+ u32 cap = scx_bpf_cidperf_cap(cid);
+ u32 cur = scx_bpf_cidperf_cur(cid);
+ u32 target;
cur_min = cur < cur_min ? cur : cur_min;
cur_max = cur > cur_max ? cur : cur_max;
- /*
- * $cur is relative to $cap. Scale it down accordingly so that
- * it's in the same scale as other CPUs and $cur_sum/$cap_sum
- * makes sense.
- */
- cur_sum += cur * cap / SCX_CPUPERF_ONE;
+ cur_sum += (u64)cur * cap / SCX_CPUPERF_ONE;
cap_sum += cap;
- if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
- scx_bpf_error("failed to look up cpu_ctx");
- goto out;
- }
-
- /* collect target */
- cur = cpuc->cpuperf_target;
- target_sum += cur;
- target_min = cur < target_min ? cur : target_min;
- target_max = cur > target_max ? cur : target_max;
+ target = cpuc->cpuperf_target;
+ target_sum += target;
+ target_min = target < target_min ? target : target_min;
+ target_max = target > target_max ? target : target_max;
}
- cpuperf_min = cur_min;
- cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
- cpuperf_max = cur_max;
+ if (!nr_online || !cap_sum)
+ return;
+
+ qa.cpuperf_min = cur_min;
+ qa.cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
+ qa.cpuperf_max = cur_max;
- cpuperf_target_min = target_min;
- cpuperf_target_avg = target_sum / nr_online_cpus;
- cpuperf_target_max = target_max;
-out:
- scx_bpf_put_cpumask(online);
+ qa.cpuperf_target_min = target_min;
+ qa.cpuperf_target_avg = target_sum / nr_online;
+ qa.cpuperf_target_max = target_max;
}
/*
@@ -927,12 +1069,76 @@ static int lowpri_timerfn(void *map, int *key, struct bpf_timer *timer)
s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
- u32 key = 0;
+ u8 __arena *slab;
+ u32 nr_pages, key = 0, i;
+ u32 nr_cids, nr_cpu_ids;
struct bpf_timer *timer;
s32 ret;
- if (print_msgs && !sub_cgroup_id)
- print_cpus();
+ nr_cids = scx_bpf_nr_cids();
+ nr_cpu_ids = scx_bpf_nr_cpu_ids();
+
+ if (nr_cids > SCX_QMAP_MAX_CPUS) {
+ scx_bpf_error("nr_cids=%u exceeds SCX_QMAP_MAX_CPUS=%d",
+ nr_cids, SCX_QMAP_MAX_CPUS);
+ return -EINVAL;
+ }
+ if (nr_cpu_ids > SCX_QMAP_MAX_CPUS) {
+ scx_bpf_error("nr_cpu_ids=%u exceeds SCX_QMAP_MAX_CPUS=%d",
+ nr_cpu_ids, SCX_QMAP_MAX_CPUS);
+ return -EINVAL;
+ }
+
+ /*
+ * cid-override test hook. Must run before anything that reads the
+ * cid space (scx_bpf_nr_cids, cmask_init, etc.). On invalid input,
+ * the kfunc calls scx_error() which aborts the scheduler.
+ */
+ if (cid_override_mode) {
+ scx_bpf_cid_override((const s32 *)cid_override_cpu_to_cid,
+ nr_cpu_ids * sizeof(s32));
+ }
+
+ /*
+ * Allocate the task_ctx slab in arena and thread the entire slab onto
+ * the free list. max_tasks is set by userspace before load. Each entry
+ * is TASK_CTX_STRIDE bytes - task_ctx's trailing cpus_allowed flex
+ * array extends into the stride tail.
+ */
+ if (!max_tasks) {
+ scx_bpf_error("max_tasks must be > 0");
+ return -EINVAL;
+ }
+
+ nr_pages = (max_tasks * TASK_CTX_STRIDE + PAGE_SIZE - 1) / PAGE_SIZE;
+ slab = bpf_arena_alloc_pages(&arena, NULL, nr_pages, NUMA_NO_NODE, 0);
+ if (!slab) {
+ scx_bpf_error("failed to allocate task_ctx slab");
+ return -ENOMEM;
+ }
+ qa.task_ctxs = (task_ctx_t *)slab;
+
+ bpf_for(i, 0, 5)
+ qa.fifos[i].idx = i;
+
+ bpf_for(i, 0, max_tasks) {
+ task_ctx_t *cur = (task_ctx_t *)(slab + i * TASK_CTX_STRIDE);
+ task_ctx_t *next = (i + 1 < max_tasks) ?
+ (task_ctx_t *)(slab + (i + 1) * TASK_CTX_STRIDE) : NULL;
+ cur->next_free = next;
+ }
+ qa.task_free_head = (task_ctx_t *)slab;
+
+ /*
+ * Allocate and initialize the idle cmask. Starts empty - update_idle
+ * fills it as cpus enter idle.
+ */
+ qa_idle_cids = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
+ if (!qa_idle_cids) {
+ scx_bpf_error("failed to allocate idle cmask");
+ return -ENOMEM;
+ }
+ cmask_init(qa_idle_cids, 0, nr_cids);
ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
if (ret) {
@@ -984,8 +1190,8 @@ s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args)
s32 i;
for (i = 0; i < MAX_SUB_SCHEDS; i++) {
- if (!sub_sched_cgroup_ids[i]) {
- sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
+ if (!qa.sub_sched_cgroup_ids[i]) {
+ qa.sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
bpf_printk("attaching sub-sched[%d] on %s",
i, args->cgroup_path);
return 0;
@@ -1000,8 +1206,8 @@ void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args)
s32 i;
for (i = 0; i < MAX_SUB_SCHEDS; i++) {
- if (sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
- sub_sched_cgroup_ids[i] = 0;
+ if (qa.sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
+ qa.sub_sched_cgroup_ids[i] = 0;
bpf_printk("detaching sub-sched[%d] on %s",
i, args->cgroup_path);
break;
@@ -1009,24 +1215,26 @@ void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args)
}
}
-SCX_OPS_DEFINE(qmap_ops,
- .select_cpu = (void *)qmap_select_cpu,
+SCX_OPS_CID_DEFINE(qmap_ops,
+ .flags = SCX_OPS_ENQ_EXITING | SCX_OPS_TID_TO_TASK,
+ .select_cid = (void *)qmap_select_cid,
.enqueue = (void *)qmap_enqueue,
.dequeue = (void *)qmap_dequeue,
.dispatch = (void *)qmap_dispatch,
.tick = (void *)qmap_tick,
.core_sched_before = (void *)qmap_core_sched_before,
+ .set_cmask = (void *)qmap_set_cmask,
+ .update_idle = (void *)qmap_update_idle,
.init_task = (void *)qmap_init_task,
+ .exit_task = (void *)qmap_exit_task,
.dump = (void *)qmap_dump,
- .dump_cpu = (void *)qmap_dump_cpu,
+ .dump_cid = (void *)qmap_dump_cid,
.dump_task = (void *)qmap_dump_task,
.cgroup_init = (void *)qmap_cgroup_init,
.cgroup_set_weight = (void *)qmap_cgroup_set_weight,
.cgroup_set_bandwidth = (void *)qmap_cgroup_set_bandwidth,
.sub_attach = (void *)qmap_sub_attach,
.sub_detach = (void *)qmap_sub_detach,
- .cpu_online = (void *)qmap_cpu_online,
- .cpu_offline = (void *)qmap_cpu_offline,
.init = (void *)qmap_init,
.exit = (void *)qmap_exit,
.timeout_ms = 5000U,
diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
index e7c89a2bc3d8..67ddd483a4c7 100644
--- a/tools/sched_ext/scx_qmap.c
+++ b/tools/sched_ext/scx_qmap.c
@@ -10,9 +10,11 @@
#include <inttypes.h>
#include <signal.h>
#include <libgen.h>
+#include <sys/mman.h>
#include <sys/stat.h>
#include <bpf/bpf.h>
#include <scx/common.h>
+#include "scx_qmap.h"
#include "scx_qmap.bpf.skel.h"
const char help_fmt[] =
@@ -21,23 +23,27 @@ const char help_fmt[] =
"See the top-level comment in .bpf.c for more details.\n"
"\n"
"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n"
-" [-P] [-M] [-H] [-d PID] [-D LEN] [-S] [-p] [-I] [-F COUNT] [-v]\n"
+" [-N COUNT] [-P] [-M] [-H] [-c CG_PATH] [-d PID] [-D LEN] [-S] [-p] [-I]\n"
+" [-F COUNT] [-v]\n"
"\n"
" -s SLICE_US Override slice duration\n"
" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n"
" -t COUNT Stall every COUNT'th user thread\n"
" -T COUNT Stall every COUNT'th kernel thread\n"
+" -N COUNT Size of the task_ctx arena slab (default 16384)\n"
" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n"
" -b COUNT Dispatch upto COUNT tasks together\n"
" -P Print out DSQ content and event counters to trace_pipe every second\n"
" -M Print out debug messages to trace_pipe\n"
" -H Boost nice -20 tasks in SHARED_DSQ, use with -b\n"
+" -c CG_PATH Cgroup path to attach as sub-scheduler, must run parent scheduler first\n"
" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n"
" -D LEN Set scx_exit_info.dump buffer length\n"
" -S Suppress qmap-specific debug dump\n"
" -p Switch only tasks on SCHED_EXT policy instead of all\n"
" -I Turn on SCX_OPS_ALWAYS_ENQ_IMMED\n"
" -F COUNT IMMED stress: force every COUNT'th enqueue to a busy local DSQ (use with -I)\n"
+" -C MODE cid-override test (shuffle|bad-dup|bad-range)\n"
" -v Print libbpf debug messages\n"
" -h Display this help and exit\n";
@@ -60,23 +66,36 @@ int main(int argc, char **argv)
{
struct scx_qmap *skel;
struct bpf_link *link;
+ struct qmap_arena *qa;
+ __u32 test_error_cnt = 0;
+ __u64 ecode;
int opt;
libbpf_set_print(libbpf_print_fn);
signal(SIGINT, sigint_handler);
signal(SIGTERM, sigint_handler);
+ if (libbpf_num_possible_cpus() > SCX_QMAP_MAX_CPUS) {
+ fprintf(stderr,
+ "scx_qmap: %d possible CPUs exceeds compile-time cap %d; "
+ "rebuild with larger SCX_QMAP_MAX_CPUS\n",
+ libbpf_num_possible_cpus(), SCX_QMAP_MAX_CPUS);
+ return 1;
+ }
+restart:
+ optind = 1;
skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);
skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
+ skel->rodata->max_tasks = 16384;
- while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PMHc:d:D:SpIF:vh")) != -1) {
+ while ((opt = getopt(argc, argv, "s:e:t:T:l:b:N:PMHc:d:D:SpIF:C:vh")) != -1) {
switch (opt) {
case 's':
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
break;
case 'e':
- skel->bss->test_error_cnt = strtoul(optarg, NULL, 0);
+ test_error_cnt = strtoul(optarg, NULL, 0);
break;
case 't':
skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0);
@@ -90,6 +109,9 @@ int main(int argc, char **argv)
case 'b':
skel->rodata->dsp_batch = strtoul(optarg, NULL, 0);
break;
+ case 'N':
+ skel->rodata->max_tasks = strtoul(optarg, NULL, 0);
+ break;
case 'P':
skel->rodata->print_dsqs_and_events = true;
break;
@@ -130,6 +152,35 @@ int main(int argc, char **argv)
case 'F':
skel->rodata->immed_stress_nth = strtoul(optarg, NULL, 0);
break;
+ case 'C': {
+ u32 nr_cpus = libbpf_num_possible_cpus();
+ u32 mode, i;
+
+ if (!strcmp(optarg, "shuffle"))
+ mode = 1;
+ else if (!strcmp(optarg, "bad-dup"))
+ mode = 2;
+ else if (!strcmp(optarg, "bad-range"))
+ mode = 3;
+ else {
+ fprintf(stderr, "unknown cid-override mode '%s'\n", optarg);
+ return 1;
+ }
+ skel->rodata->cid_override_mode = mode;
+
+ /* shuffle: reversed cpu_to_cid; bad-dup: identity but cpu 1 reuses cid 0; bad-range: identity but cpu 0 gets cid nr_cpus */
+ for (i = 0; i < nr_cpus; i++) {
+ if (mode == 1)
+ skel->bss->cid_override_cpu_to_cid[i] = nr_cpus - 1 - i;
+ else
+ skel->bss->cid_override_cpu_to_cid[i] = i;
+ }
+ if (mode == 2 && nr_cpus >= 2)
+ skel->bss->cid_override_cpu_to_cid[1] = 0;
+ if (mode == 3)
+ skel->bss->cid_override_cpu_to_cid[0] = (s32)nr_cpus;
+ break;
+ }
case 'v':
verbose = true;
break;
@@ -142,39 +193,41 @@ int main(int argc, char **argv)
SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei);
link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap);
+ qa = &skel->arena->qa;
+ qa->test_error_cnt = test_error_cnt;
+
while (!exit_req && !UEI_EXITED(skel, uei)) {
- long nr_enqueued = skel->bss->nr_enqueued;
- long nr_dispatched = skel->bss->nr_dispatched;
+ long nr_enqueued = qa->nr_enqueued;
+ long nr_dispatched = qa->nr_dispatched;
- printf("stats : enq=%lu dsp=%lu delta=%ld reenq/cpu0=%"PRIu64"/%"PRIu64" deq=%"PRIu64" core=%"PRIu64" enq_ddsp=%"PRIu64"\n",
+ printf("stats : enq=%lu dsp=%lu delta=%ld reenq/cid0=%llu/%llu deq=%llu core=%llu enq_ddsp=%llu\n",
nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched,
- skel->bss->nr_reenqueued, skel->bss->nr_reenqueued_cpu0,
- skel->bss->nr_dequeued,
- skel->bss->nr_core_sched_execed,
- skel->bss->nr_ddsp_from_enq);
- printf(" exp_local=%"PRIu64" exp_remote=%"PRIu64" exp_timer=%"PRIu64" exp_lost=%"PRIu64"\n",
- skel->bss->nr_expedited_local,
- skel->bss->nr_expedited_remote,
- skel->bss->nr_expedited_from_timer,
- skel->bss->nr_expedited_lost);
- if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur"))
+ qa->nr_reenqueued, qa->nr_reenqueued_cid0,
+ qa->nr_dequeued,
+ qa->nr_core_sched_execed,
+ qa->nr_ddsp_from_enq);
+ printf(" exp_local=%llu exp_remote=%llu exp_timer=%llu exp_lost=%llu\n",
+ qa->nr_expedited_local,
+ qa->nr_expedited_remote,
+ qa->nr_expedited_from_timer,
+ qa->nr_expedited_lost);
+ if (__COMPAT_has_ksym("scx_bpf_cidperf_cur"))
printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n",
- skel->bss->cpuperf_min,
- skel->bss->cpuperf_avg,
- skel->bss->cpuperf_max,
- skel->bss->cpuperf_target_min,
- skel->bss->cpuperf_target_avg,
- skel->bss->cpuperf_target_max);
+ qa->cpuperf_min,
+ qa->cpuperf_avg,
+ qa->cpuperf_max,
+ qa->cpuperf_target_min,
+ qa->cpuperf_target_avg,
+ qa->cpuperf_target_max);
fflush(stdout);
sleep(1);
}
bpf_link__destroy(link);
- UEI_REPORT(skel, uei);
+ ecode = UEI_REPORT(skel, uei);
scx_qmap__destroy(skel);
- /*
- * scx_qmap implements ops.cpu_on/offline() and doesn't need to restart
- * on CPU hotplug events.
- */
+
+ if (UEI_ECODE_RESTART(ecode))
+ goto restart;
return 0;
}
diff --git a/tools/sched_ext/scx_qmap.h b/tools/sched_ext/scx_qmap.h
new file mode 100644
index 000000000000..d15a705d5ac5
--- /dev/null
+++ b/tools/sched_ext/scx_qmap.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared definitions between scx_qmap.bpf.c and scx_qmap.c.
+ *
+ * The scheduler keeps all state in a single BPF arena map. struct
+ * qmap_arena is the one object that lives at the base of the arena and is
+ * mmap'd into userspace so the loader can read counters directly.
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
+ */
+#ifndef __SCX_QMAP_H
+#define __SCX_QMAP_H
+
+#ifdef __BPF__
+#include <scx/bpf_arena_common.bpf.h>
+#else
+#include <linux/types.h>
+#include <scx/bpf_arena_common.h>
+#endif
+
+#define MAX_SUB_SCHEDS 8
+
+/*
+ * cpu_ctxs[] is sized to a fixed cap so the layout is shared between BPF and
+ * userspace. Keep this in sync with NR_CPUS used by the BPF side.
+ */
+#define SCX_QMAP_MAX_CPUS 1024
+
+struct cpu_ctx {
+ __u64 dsp_idx; /* dispatch index */
+ __u64 dsp_cnt; /* remaining count */
+ __u32 avg_weight;
+ __u32 cpuperf_target;
+};
+
+/* Opaque to userspace; defined in scx_qmap.bpf.c. */
+struct task_ctx;
+
+struct qmap_fifo {
+ struct task_ctx __arena *head;
+ struct task_ctx __arena *tail;
+ __s32 idx;
+};
+
+struct qmap_arena {
+ /* userspace-visible stats */
+ __u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cid0;
+ __u64 nr_dequeued, nr_ddsp_from_enq;
+ __u64 nr_core_sched_execed;
+ __u64 nr_expedited_local, nr_expedited_remote;
+ __u64 nr_expedited_lost, nr_expedited_from_timer;
+ __u64 nr_highpri_queued;
+ __u32 test_error_cnt;
+ __u32 cpuperf_min, cpuperf_avg, cpuperf_max;
+ __u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
+
+ /* kernel-side runtime state */
+ __u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];
+ __u64 core_sched_head_seqs[5];
+ __u64 core_sched_tail_seqs[5];
+
+ struct cpu_ctx cpu_ctxs[SCX_QMAP_MAX_CPUS];
+
+ /* task_ctx slab; allocated and threaded by qmap_init() */
+ struct task_ctx __arena *task_ctxs;
+ struct task_ctx __arena *task_free_head;
+
+ /* five priority FIFOs, each a doubly-linked list through task_ctx */
+ struct qmap_fifo fifos[5];
+};
+
+#endif /* __SCX_QMAP_H */
diff --git a/tools/testing/selftests/sched_ext/non_scx_kfunc_deny.bpf.c b/tools/testing/selftests/sched_ext/non_scx_kfunc_deny.bpf.c
index 9f16d39255e7..0d6fcc8e5eb6 100644
--- a/tools/testing/selftests/sched_ext/non_scx_kfunc_deny.bpf.c
+++ b/tools/testing/selftests/sched_ext/non_scx_kfunc_deny.bpf.c
@@ -9,12 +9,7 @@
* Copyright (C) 2026 Cheng-Yang Chou <yphbchou0911@gmail.com>
*/
-#include <vmlinux.h>
-#include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
-
-/* SCX kfunc from scx_kfunc_ids_any set */
-void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
+#include <scx/common.bpf.h>
SEC("struct_ops/ssthresh")
__u32 BPF_PROG(tcp_ca_ssthresh, struct sock *sk)
diff --git a/tools/testing/selftests/sched_ext/peek_dsq.bpf.c b/tools/testing/selftests/sched_ext/peek_dsq.bpf.c
index 7f23fb17b1e0..9e802b52b29e 100644
--- a/tools/testing/selftests/sched_ext/peek_dsq.bpf.c
+++ b/tools/testing/selftests/sched_ext/peek_dsq.bpf.c
@@ -95,7 +95,7 @@ static int scan_dsq_pool(void)
record_peek_result(task->pid);
/* Try to move this task to local */
- if (!moved && scx_bpf_dsq_move_to_local(dsq_id, 0) == 0) {
+ if (!moved && scx_bpf_dsq_move_to_local(dsq_id, 0)) {
moved = 1;
break;
}
diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.c
index 5b6e045e1109..7e342c0cec65 100644
--- a/tools/testing/selftests/sched_ext/select_cpu_dfl.c
+++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.c
@@ -6,6 +6,7 @@
*/
#include <bpf/bpf.h>
#include <scx/common.h>
+#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>
#include "select_cpu_dfl.bpf.skel.h"
@@ -13,29 +14,44 @@
#define NUM_CHILDREN 1028
+struct select_cpu_dfl_ctx {
+ struct select_cpu_dfl *skel;
+ struct bpf_link *link;
+};
+
static enum scx_test_status setup(void **ctx)
{
- struct select_cpu_dfl *skel;
+ struct select_cpu_dfl_ctx *tctx;
+
+ tctx = malloc(sizeof(*tctx));
+ SCX_FAIL_IF(!tctx, "Failed to allocate test context");
+ tctx->link = NULL;
- skel = select_cpu_dfl__open();
- SCX_FAIL_IF(!skel, "Failed to open");
- SCX_ENUM_INIT(skel);
- SCX_FAIL_IF(select_cpu_dfl__load(skel), "Failed to load skel");
+ tctx->skel = select_cpu_dfl__open();
+ if (!tctx->skel) {
+ free(tctx);
+ SCX_FAIL("Failed to open");
+ }
+ SCX_ENUM_INIT(tctx->skel);
+ if (select_cpu_dfl__load(tctx->skel)) {
+ select_cpu_dfl__destroy(tctx->skel);
+ free(tctx);
+ SCX_FAIL("Failed to load skel");
+ }
- *ctx = skel;
+ *ctx = tctx;
return SCX_TEST_PASS;
}
static enum scx_test_status run(void *ctx)
{
- struct select_cpu_dfl *skel = ctx;
- struct bpf_link *link;
+ struct select_cpu_dfl_ctx *tctx = ctx;
pid_t pids[NUM_CHILDREN];
- int i, status;
+ int i, status, nforked = 0;
- link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_ops);
- SCX_FAIL_IF(!link, "Failed to attach scheduler");
+ tctx->link = bpf_map__attach_struct_ops(tctx->skel->maps.select_cpu_dfl_ops);
+ SCX_FAIL_IF(!tctx->link, "Failed to attach scheduler");
for (i = 0; i < NUM_CHILDREN; i++) {
pids[i] = fork();
@@ -43,25 +59,31 @@ static enum scx_test_status run(void *ctx)
sleep(1);
exit(0);
}
+ if (pids[i] > 0)
+ nforked++;
}
for (i = 0; i < NUM_CHILDREN; i++) {
+ if (pids[i] <= 0)
+ continue;
SCX_EQ(waitpid(pids[i], &status, 0), pids[i]);
SCX_EQ(status, 0);
}
- SCX_ASSERT(!skel->bss->saw_local);
-
- bpf_link__destroy(link);
+ SCX_GT(nforked, 0);
+ SCX_ASSERT(!tctx->skel->bss->saw_local);
return SCX_TEST_PASS;
}
static void cleanup(void *ctx)
{
- struct select_cpu_dfl *skel = ctx;
+ struct select_cpu_dfl_ctx *tctx = ctx;
- select_cpu_dfl__destroy(skel);
+ if (tctx->link)
+ bpf_link__destroy(tctx->link);
+ select_cpu_dfl__destroy(tctx->skel);
+ free(tctx);
}
struct scx_test select_cpu_dfl = {