diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-06-17 14:10:11 +0300 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-06-17 14:10:11 +0300 |
| commit | 5b33fc6492a7b7a62359157db0f92f5b6e9af690 (patch) | |
| tree | 0c89f2906b33a19cad3f12e9b33e9cfa150ee1d6 | |
| parent | 83476cc97bc635a3ff502bd194c79bfb1f1ae050 (diff) | |
| parent | 2e05f2fd0dd72aa8aa56cf355e1e39a3f565b4ca (diff) | |
| download | linux-5b33fc6492a7b7a62359157db0f92f5b6e9af690.tar.xz | |
Merge tag 'sched_ext-for-7.2' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext
Pull sched_ext updates from Tejun Heo:
"Most of this continues the in-development sub-scheduler support, which
lets a root BPF scheduler delegate to nested sub-schedulers. The
dispatch-path building blocks landed in 7.1. A follow-up patchset in
development will complete enqueue-path support for hierarchical
scheduling. This cycle adds most of that infrastructure:
- Topological CPU IDs (cids): a dense, topology-ordered CPU numbering
where the CPUs of a core, LLC, or NUMA node form contiguous ranges,
so a topology unit becomes a (start, length) slice. Raw CPU numbers
are sparse and don't track topological closeness, which makes them
clumsy for sharding work across sub-schedulers and awkward in BPF.
- cmask: bitmaps windowed over a slice of cid space, so a
sub-scheduler can track, for example, the idle cids of its shard
without a full NR_CPUS cpumask.
- A struct_ops variant that cid-form sub-schedulers register with,
along with the cid-form kfuncs they call.
- BPF arena integration, which sub-scheduler support is built on. The
bpf-next additions let the kernel read and write the BPF
scheduler's arena directly, turning it into a real kernel/BPF
shared-memory channel. Shared state like the per-CPU cmask now
lives there.
- scx_qmap is reworked to exercise the new arena and cid interfaces.
Additionally:
- Exit-dump improvements: dump the faulting CPU first, expose the
exit CPU to BPF and userspace, and normalize the dump header.
- Misc kfuncs and cleanups: a task-ID lookup kfunc, __printf checking
on the error and dump formatters, header reorganization, and
assorted fixes"
* tag 'sched_ext-for-7.2' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext: (59 commits)
sched_ext: Add scx_arena_to_kaddr() / scx_kaddr_to_arena()
sched_ext: Make scx_bpf_kick_cid() return s32
sched_ext: Add scx_cmask_test() and scx_cmask_for_each_cid()
tools/sched_ext: Order single-cid cmask helpers as (cid, mask)
sched_ext: Order single-cid cmask helpers as (cid, mask)
selftests/sched_ext: Fix dsq_move_to_local check
sched_ext: Guard BPF arena helper calls to fix 32-bit build
sched_ext: idle: Fix errno loss in scx_idle_init()
sched_ext: Convert ops.set_cmask() to arena-resident cmask
sched_ext: Sub-allocator over kernel-claimed BPF arena pages
sched_ext: Require an arena for cid-form schedulers
sched_ext: Add cmask mask ops
sched_ext: Track bits[] storage size in struct scx_cmask
sched_ext: Rename scx_cmask.nr_bits to nr_cids
tools/sched_ext: scx_qmap: Fix qa arena placement
sched_ext: Mark !CONFIG_EXT_SUB_SCHED dummy stubs static inline
sched_ext: Replace tryget_task_struct() with get_task_struct()
sched_ext: Add scx_task_iter_relock() and use it in scx_root_enable_workfn()
sched_ext: Fix ops_cid layout assert
sched_ext: Use offsetofend on both sides of the ops_cid layout assert
...
29 files changed, 4090 insertions, 704 deletions
diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst index 03d595d178ea..c4f59c08d8a4 100644 --- a/Documentation/scheduler/sched-ext.rst +++ b/Documentation/scheduler/sched-ext.rst @@ -339,6 +339,11 @@ The following briefly shows how a waking task is scheduled and executed. leaves (e.g., when ``ops.dispatch()`` moves it to a terminal DSQ, or on property change / sleep). + Note that ``ops.enqueue()`` can be called multiple times in a row without + an intervening call to ``ops.dequeue()``. This can happen, for example, + when a task on a user-created DSQ is re-enqueued using + ``scx_bpf_dsq_reenq()``. The task stays in BPF custody the entire time. + When a task leaves BPF scheduler custody, ``ops.dequeue()`` is invoked. The dequeue can happen for different reasons, distinguished by flags: @@ -503,7 +508,7 @@ Where to Look custom DSQ. * ``scx_qmap[.bpf].c``: A multi-level FIFO scheduler supporting five - levels of priority implemented with ``BPF_MAP_TYPE_QUEUE``. + levels of priority implemented with arena-backed doubly-linked lists. * ``scx_central[.bpf].c``: A central FIFO scheduler where all scheduling decisions are made on one CPU, demonstrating ``LOCAL_ON`` dispatching, diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 2129e18ada58..20b2343aa344 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -207,6 +207,15 @@ struct sched_ext_entity { u64 core_sched_at; /* see scx_prio_less() */ #endif + /* + * Unique non-zero task ID assigned at fork. Persists across exec and + * is never reused. Lets BPF schedulers identify tasks without storing + * kernel pointers - arena-backed schedulers being one example. See + * scx_bpf_tid_to_task(). 
+ */ + u64 tid; + struct rhash_head tid_hash_node; /* see SCX_OPS_TID_TO_TASK */ + /* BPF scheduler modifiable fields */ /* diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index 755883faf751..067979a7b69e 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -58,8 +58,17 @@ #include "deadline.c" #ifdef CONFIG_SCHED_CLASS_EXT +# include <linux/btf_ids.h> +# include <linux/find.h> +# include <linux/genalloc.h> +# include "ext_types.h" # include "ext_internal.h" +# include "ext_cid.h" +# include "ext_arena.h" +# include "ext_idle.h" # include "ext.c" +# include "ext_cid.c" +# include "ext_arena.c" # include "ext_idle.c" #endif diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index f5a3233ead1a..0db6fa2daea3 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6,8 +6,6 @@ * Copyright (c) 2022 Tejun Heo <tj@kernel.org> * Copyright (c) 2022 David Vernet <dvernet@meta.com> */ -#include <linux/btf_ids.h> -#include "ext_idle.h" static DEFINE_RAW_SPINLOCK(scx_sched_lock); @@ -38,6 +36,15 @@ static const struct rhashtable_params scx_sched_hash_params = { static struct rhashtable scx_sched_hash; #endif +/* see SCX_OPS_TID_TO_TASK */ +static const struct rhashtable_params scx_tid_hash_params = { + .key_len = sizeof_field(struct sched_ext_entity, tid), + .key_offset = offsetof(struct sched_ext_entity, tid), + .head_offset = offsetof(struct sched_ext_entity, tid_hash_node), + .insecure_elasticity = true, /* inserted/removed under scx_tasks_lock */ +}; +static struct rhashtable scx_tid_hash; + /* * During exit, a task may schedule after losing its PIDs. 
When disabling the * BPF scheduler, we need to be able to iterate tasks in every state to @@ -56,10 +63,25 @@ static DEFINE_RAW_SPINLOCK(scx_bypass_lock); static bool scx_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); +static DEFINE_STATIC_KEY_FALSE(__scx_tid_to_task_enabled); + +/* + * True once SCX_OPS_TID_TO_TASK has been negotiated with the root scheduler + * and the tid->task table is live. Wraps the static key so callers don't + * take the address, and hints "likely enabled" for the common case where + * the feature is in use. + */ +static inline bool scx_tid_to_task_enabled(void) +{ + return static_branch_likely(&__scx_tid_to_task_enabled); +} static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); +/* Global cursor for the per-CPU tid allocator. Starts at 1; tid 0 is reserved. */ +static atomic64_t scx_tid_cursor = ATOMIC64_INIT(1); + #ifdef CONFIG_EXT_SUB_SCHED /* * The sub sched being enabled. Used by scx_disable_and_exit_task() to exit @@ -109,6 +131,17 @@ struct scx_kick_syncs { static DEFINE_PER_CPU(struct scx_kick_syncs __rcu *, scx_kick_syncs); /* + * Per-CPU buffered allocator state for p->scx.tid. Each CPU pulls a chunk of + * SCX_TID_CHUNK ids from scx_tid_cursor and hands them out locally without + * further synchronization. See scx_alloc_tid(). + */ +struct scx_tid_alloc { + u64 next; + u64 end; +}; +static DEFINE_PER_CPU(struct scx_tid_alloc, scx_tid_alloc); + +/* * Direct dispatch marker. * * Non-NULL values are used for direct dispatch from enqueue path. 
A valid @@ -198,26 +231,21 @@ static void run_deferred(struct rq *rq); static bool task_dead_and_done(struct task_struct *p); static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags); static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind); -static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, - s64 exit_code, const char *fmt, va_list args); -static __printf(4, 5) bool scx_exit(struct scx_sched *sch, - enum scx_exit_kind kind, s64 exit_code, - const char *fmt, ...) +__printf(5, 6) bool __scx_exit(struct scx_sched *sch, + enum scx_exit_kind kind, s64 exit_code, + s32 exit_cpu, const char *fmt, ...) { va_list args; bool ret; va_start(args, fmt); - ret = scx_vexit(sch, kind, exit_code, fmt, args); + ret = scx_vexit(sch, kind, exit_code, exit_cpu, fmt, args); va_end(args); return ret; } -#define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args) -#define scx_verror(sch, fmt, args) scx_vexit((sch), SCX_EXIT_ERROR, 0, fmt, args) - #define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op) static long jiffies_delta_msecs(unsigned long at, unsigned long now) @@ -295,9 +323,9 @@ static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) rcu_assign_pointer(p->scx.sched, sch); } #else /* CONFIG_EXT_SUB_SCHED */ -static struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; } -static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; } -static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {} +static inline struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; } +static inline struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? 
NULL : root; } +static inline void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {} #endif /* CONFIG_EXT_SUB_SCHED */ /** @@ -484,6 +512,33 @@ do { \ update_locked_rq(__prev_locked_rq); \ } while (0) +/* + * Flipped on enable per sch->is_cid_type. Declared in ext_internal.h so + * subsystem inlines can read it. + */ +DEFINE_STATIC_KEY_FALSE(__scx_is_cid_type); + +/* + * scx_cpu_arg() wraps a cpu arg being handed to an SCX op. For cid-form + * schedulers it resolves to the matching cid; for cpu-form it passes @cpu + * through. scx_cpu_ret() is the inverse for a cpu/cid returned from an op + * (currently only ops.select_cpu); it validates the BPF-supplied cid and + * triggers scx_error() on @sch if invalid. + */ +static s32 scx_cpu_arg(s32 cpu) +{ + if (scx_is_cid_type()) + return __scx_cpu_to_cid(cpu); + return cpu; +} + +static s32 scx_cpu_ret(struct scx_sched *sch, s32 cpu_or_cid) +{ + if (cpu_or_cid < 0 || !scx_is_cid_type()) + return cpu_or_cid; + return scx_cid_to_cpu(sch, cpu_or_cid); +} + #define SCX_CALL_OP_RET(sch, op, locked_rq, args...) \ ({ \ struct rq *__prev_locked_rq; \ @@ -545,6 +600,44 @@ do { \ __ret; \ }) +/** + * scx_call_op_set_cpumask - invoke ops.set_cpumask / ops_cid.set_cmask for @task + * @sch: scx_sched being invoked + * @rq: rq to update as the currently-locked rq, or NULL + * @task: task whose affinity is changing + * @cpumask: new cpumask + * + * For cid-form schedulers, translate @cpumask to a cmask via the per-cpu + * scratch in ext_cid.c and dispatch through the ops_cid union view. Caller + * must hold @rq's rq lock so this_cpu_ptr is stable across the call. 
+ */ +static inline void scx_call_op_set_cpumask(struct scx_sched *sch, struct rq *rq, + struct task_struct *task, + const struct cpumask *cpumask) +{ + WARN_ON_ONCE(current->scx.kf_tasks[0]); + current->scx.kf_tasks[0] = task; + if (rq) + update_locked_rq(rq); + + if (scx_is_cid_type()) { + struct scx_cmask *kern_va = *this_cpu_ptr(sch->set_cmask_scratch); + /* + * Build the per-CPU arena cmask and hand BPF its arena address. + * Caller holds the rq lock with IRQs disabled, which makes us + * the sole user of the scratch area. + */ + scx_cpumask_to_cmask(cpumask, kern_va); + sch->ops_cid.set_cmask(task, scx_kaddr_to_arena(sch, kern_va)); + } else { + sch->ops.set_cpumask(task, cpumask); + } + + if (rq) + update_locked_rq(NULL); + current->scx.kf_tasks[0] = NULL; +} + /* see SCX_CALL_OP_TASK() */ static __always_inline bool scx_kf_arg_task_ok(struct scx_sched *sch, struct task_struct *p) @@ -858,6 +951,24 @@ static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter) } /** + * scx_task_iter_relock - Re-acquire scx_tasks_lock and, optionally, @p's rq + * @iter: iterator to relock + * @p: task whose rq to lock, or %NULL for scx_tasks_lock only + * + * Counterpart to scx_task_iter_unlock(). Locking @p's rq is optional. Once + * re-acquired, both locks are managed by the iterator from here on. 
+ */ +static void scx_task_iter_relock(struct scx_task_iter *iter, + struct task_struct *p) +{ + __scx_task_iter_maybe_relock(iter); + if (p) { + iter->rq = task_rq_lock(p, &iter->rf); + iter->locked_task = p; + } +} + +/** * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock * @iter: iterator to exit * @@ -1086,7 +1197,7 @@ static inline bool __cpu_valid(s32 cpu) } /** - * ops_cpu_valid - Verify a cpu number, to be used on ops input args + * scx_cpu_valid - Verify a cpu number, to be used on ops input args * @sch: scx_sched to abort on error * @cpu: cpu number which came from a BPF ops * @where: extra information reported on error @@ -1095,7 +1206,7 @@ static inline bool __cpu_valid(s32 cpu) * Verify that it is in range and one of the possible cpus. If invalid, trigger * an ops error. */ -static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where) +bool scx_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where) { if (__cpu_valid(cpu)) { return true; @@ -1742,9 +1853,9 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch, return &rq->scx.local_dsq; if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { - s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; + s32 cpu = scx_cpu_ret(sch, dsq_id & SCX_DSQ_LOCAL_CPU_MASK); - if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) + if (!scx_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) return find_global_dsq(sch, tcpu); return &cpu_rq(cpu)->scx.local_dsq; @@ -2837,11 +2948,13 @@ scx_dispatch_sched(struct scx_sched *sch, struct rq *rq, dspc->nr_tasks = 0; if (nested) { - SCX_CALL_OP(sch, dispatch, rq, cpu, prev_on_sch ? prev : NULL); + SCX_CALL_OP(sch, dispatch, rq, scx_cpu_arg(cpu), + prev_on_sch ? prev : NULL); } else { /* stash @prev so that nested invocations can access it */ rq->scx.sub_dispatch_prev = prev; - SCX_CALL_OP(sch, dispatch, rq, cpu, prev_on_sch ? 
prev : NULL); + SCX_CALL_OP(sch, dispatch, rq, scx_cpu_arg(cpu), + prev_on_sch ? prev : NULL); rq->scx.sub_dispatch_prev = NULL; } @@ -2899,7 +3012,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) * core. This callback complements ->cpu_release(), which is * emitted in switch_class(). */ - if (SCX_HAS_OP(sch, cpu_acquire)) + if (sch->ops.cpu_acquire) SCX_CALL_OP(sch, cpu_acquire, rq, cpu, NULL); rq->scx.cpu_released = false; } @@ -3045,7 +3158,7 @@ static void switch_class(struct rq *rq, struct task_struct *next) * next time that balance_one() is invoked. */ if (!rq->scx.cpu_released) { - if (SCX_HAS_OP(sch, cpu_release)) { + if (sch->ops.cpu_release) { struct scx_cpu_release_args args = { .reason = preempt_reason_from_class(next_class), .task = next, @@ -3336,11 +3449,13 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag *ddsp_taskp = p; this_rq()->scx.in_select_cpu = true; - cpu = SCX_CALL_OP_TASK_RET(sch, select_cpu, NULL, p, prev_cpu, wake_flags); + cpu = SCX_CALL_OP_TASK_RET(sch, select_cpu, NULL, p, + scx_cpu_arg(prev_cpu), wake_flags); + cpu = scx_cpu_ret(sch, cpu); this_rq()->scx.in_select_cpu = false; p->scx.selected_cpu = cpu; *ddsp_taskp = NULL; - if (ops_cpu_valid(sch, cpu, "from ops.select_cpu()")) + if (scx_cpu_valid(sch, cpu, "from ops.select_cpu()")) return cpu; else return prev_cpu; @@ -3386,7 +3501,7 @@ static void set_cpus_allowed_scx(struct task_struct *p, * designation pointless. Cast it away when calling the operation. 
*/ if (SCX_HAS_OP(sch, set_cpumask)) - SCX_CALL_OP_TASK(sch, set_cpumask, task_rq(p), p, (struct cpumask *)p->cpus_ptr); + scx_call_op_set_cpumask(sch, task_rq(p), p, (struct cpumask *)p->cpus_ptr); } static void handle_hotplug(struct rq *rq, bool online) @@ -3408,9 +3523,9 @@ static void handle_hotplug(struct rq *rq, bool online) scx_idle_update_selcpu_topology(&sch->ops); if (online && SCX_HAS_OP(sch, cpu_online)) - SCX_CALL_OP(sch, cpu_online, NULL, cpu); + SCX_CALL_OP(sch, cpu_online, NULL, scx_cpu_arg(cpu)); else if (!online && SCX_HAS_OP(sch, cpu_offline)) - SCX_CALL_OP(sch, cpu_offline, NULL, cpu); + SCX_CALL_OP(sch, cpu_offline, NULL, scx_cpu_arg(cpu)); else scx_exit(sch, SCX_EXIT_UNREG_KERN, SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, @@ -3458,9 +3573,10 @@ static bool check_rq_for_timeouts(struct rq *rq) last_runnable + READ_ONCE(sch->watchdog_timeout)))) { u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); - scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, - "%s[%d] failed to run for %u.%03us", - p->comm, p->pid, dur_ms / 1000, dur_ms % 1000); + __scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, cpu_of(rq), + "%s[%d] failed to run for %u.%03us", + p->comm, p->pid, dur_ms / 1000, + dur_ms % 1000); timed_out = true; break; } @@ -3748,6 +3864,33 @@ void init_scx_entity(struct sched_ext_entity *scx) scx->slice = SCX_SLICE_DFL; } +/* See scx_tid_alloc / scx_tid_cursor. 
*/ +static u64 scx_alloc_tid(void) +{ + struct scx_tid_alloc *ta; + + guard(preempt)(); + ta = this_cpu_ptr(&scx_tid_alloc); + + if (unlikely(ta->next >= ta->end)) { + ta->next = atomic64_fetch_add(SCX_TID_CHUNK, &scx_tid_cursor); + ta->end = ta->next + SCX_TID_CHUNK; + } + return ta->next++; +} + +static void scx_tid_hash_insert(struct task_struct *p) +{ + int ret; + + lockdep_assert_held(&scx_tasks_lock); + + ret = rhashtable_lookup_insert_fast(&scx_tid_hash, + &p->scx.tid_hash_node, + scx_tid_hash_params); + WARN_ON_ONCE(ret); +} + void scx_pre_fork(struct task_struct *p) { /* @@ -3765,6 +3908,8 @@ int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs) percpu_rwsem_assert_held(&scx_fork_rwsem); + p->scx.tid = scx_alloc_tid(); + if (scx_init_task_enabled) { #ifdef CONFIG_EXT_SUB_SCHED struct scx_sched *sch = kargs->cset->dfl_cgrp->scx_sched; @@ -3804,9 +3949,11 @@ void scx_post_fork(struct task_struct *p) } } - raw_spin_lock_irq(&scx_tasks_lock); - list_add_tail(&p->scx.tasks_node, &scx_tasks); - raw_spin_unlock_irq(&scx_tasks_lock); + scoped_guard(raw_spinlock_irq, &scx_tasks_lock) { + list_add_tail(&p->scx.tasks_node, &scx_tasks); + if (scx_tid_to_task_enabled()) + scx_tid_hash_insert(p); + } percpu_up_read(&scx_fork_rwsem); } @@ -3857,17 +4004,19 @@ static bool task_dead_and_done(struct task_struct *p) void sched_ext_dead(struct task_struct *p) { - unsigned long flags; - /* * By the time control reaches here, @p has %TASK_DEAD set, switched out * for the last time and then dropped the rq lock - task_dead_and_done() * should be returning %true nullifying the straggling sched_class ops. * Remove from scx_tasks and exit @p. 
*/ - raw_spin_lock_irqsave(&scx_tasks_lock, flags); - list_del_init(&p->scx.tasks_node); - raw_spin_unlock_irqrestore(&scx_tasks_lock, flags); + scoped_guard(raw_spinlock_irqsave, &scx_tasks_lock) { + list_del_init(&p->scx.tasks_node); + if (scx_tid_to_task_enabled()) + rhashtable_remove_fast(&scx_tid_hash, + &p->scx.tid_hash_node, + scx_tid_hash_params); + } /* * @p is off scx_tasks and wholly ours. scx_root_enable()'s READY -> @@ -3927,7 +4076,7 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p) * different scheduler class. Keep the BPF scheduler up-to-date. */ if (SCX_HAS_OP(sch, set_cpumask)) - SCX_CALL_OP_TASK(sch, set_cpumask, rq, p, (struct cpumask *)p->cpus_ptr); + scx_call_op_set_cpumask(sch, rq, p, (struct cpumask *)p->cpus_ptr); } static void switched_from_scx(struct rq *rq, struct task_struct *p) @@ -4510,9 +4659,9 @@ static void scx_cgroup_unlock(void) #endif } #else /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ -static struct cgroup *root_cgroup(void) { return NULL; } -static void scx_cgroup_lock(void) {} -static void scx_cgroup_unlock(void) {} +static inline struct cgroup *root_cgroup(void) { return NULL; } +static inline void scx_cgroup_lock(void) {} +static inline void scx_cgroup_unlock(void) {} #endif /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ #ifdef CONFIG_EXT_SUB_SCHED @@ -4531,8 +4680,8 @@ static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) rcu_assign_pointer(pos->scx_sched, sch); } #else /* CONFIG_EXT_SUB_SCHED */ -static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; } -static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {} +static inline struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; } +static inline void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {} #endif /* CONFIG_EXT_SUB_SCHED */ /* @@ -4818,6 +4967,48 @@ static const struct attribute_group scx_global_attr_group = { static void free_pnode(struct 
scx_sched_pnode *pnode); static void free_exit_info(struct scx_exit_info *ei); +static s32 scx_set_cmask_scratch_alloc(struct scx_sched *sch) +{ + size_t size = struct_size_t(struct scx_cmask, bits, + SCX_CMASK_NR_WORDS(num_possible_cpus())); + int cpu; + + if (!sch->is_cid_type || !sch->arena_pool) + return 0; + + sch->set_cmask_scratch = alloc_percpu(struct scx_cmask *); + if (!sch->set_cmask_scratch) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu); + + *slot = scx_arena_alloc(sch, size); + if (!*slot) + return -ENOMEM; + scx_cmask_init(*slot, 0, num_possible_cpus()); + } + return 0; +} + +static void scx_set_cmask_scratch_free(struct scx_sched *sch) +{ + size_t size = struct_size_t(struct scx_cmask, bits, + SCX_CMASK_NR_WORDS(num_possible_cpus())); + int cpu; + + if (!sch->set_cmask_scratch) + return; + + for_each_possible_cpu(cpu) { + struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu); + + scx_arena_free(sch, *slot, size); + } + free_percpu(sch->set_cmask_scratch); + sch->set_cmask_scratch = NULL; +} + static void scx_sched_free_rcu_work(struct work_struct *work) { struct rcu_work *rcu_work = to_rcu_work(work); @@ -4872,6 +5063,10 @@ static void scx_sched_free_rcu_work(struct work_struct *work) rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL); free_exit_info(sch->exit_info); + scx_set_cmask_scratch_free(sch); + scx_arena_pool_destroy(sch); + if (sch->arena_map) + bpf_map_put(sch->arena_map); kfree(sch); } @@ -5563,6 +5758,7 @@ static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len) if (!ei) return NULL; + ei->exit_cpu = -1; ei->bt = kzalloc_objs(ei->bt[0], SCX_EXIT_BT_LEN); ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL); @@ -5709,6 +5905,26 @@ static void scx_disable_dump(struct scx_sched *sch) sch->dump_disabled = true; } +static void scx_log_sched_disable(struct scx_sched *sch) +{ + struct scx_exit_info *ei 
= sch->exit_info; + const char *type = scx_parent(sch) ? "sub-scheduler" : "scheduler"; + + if (ei->kind >= SCX_EXIT_ERROR) { + pr_err("sched_ext: BPF %s \"%s\" disabled (%s)\n", type, + sch->ops.name, ei->reason); + + if (ei->msg[0] != '\0') + pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg); +#ifdef CONFIG_STACKTRACE + stack_trace_print(ei->bt, ei->bt_len, 2); +#endif + } else { + pr_info("sched_ext: BPF %s \"%s\" disabled (%s)\n", type, + sch->ops.name, ei->reason); + } +} + #ifdef CONFIG_EXT_SUB_SCHED static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq); @@ -5795,14 +6011,11 @@ static void scx_sub_disable(struct scx_sched *sch) WARN_ON_ONCE(!scx_task_on_sched(sch, p)); /* - * If $p is about to be freed, nothing prevents $sch from - * unloading before $p reaches sched_ext_free(). Disable and - * exit $p right away. + * @p is pinned by the iter: css_task_iter_next() takes a + * reference and holds it until the next iter_next() call, so + * @p->usage is guaranteed > 0. */ - if (!tryget_task_struct(p)) { - scx_disable_and_exit_task(sch, p); - continue; - } + get_task_struct(p); scx_task_iter_unlock(&sti); @@ -5895,6 +6108,8 @@ static void scx_sub_disable(struct scx_sched *sch) &sub_detach_args); } + scx_log_sched_disable(sch); + if (sch->ops.exit) SCX_CALL_OP(sch, exit, NULL, sch->exit_info); if (sch->sub_kset) @@ -5902,13 +6117,12 @@ static void scx_sub_disable(struct scx_sched *sch) kobject_del(&sch->kobj); } #else /* CONFIG_EXT_SUB_SCHED */ -static void drain_descendants(struct scx_sched *sch) { } -static void scx_sub_disable(struct scx_sched *sch) { } +static inline void drain_descendants(struct scx_sched *sch) { } +static inline void scx_sub_disable(struct scx_sched *sch) { } #endif /* CONFIG_EXT_SUB_SCHED */ static void scx_root_disable(struct scx_sched *sch) { - struct scx_exit_info *ei = sch->exit_info; struct scx_task_iter sti; struct task_struct *p; bool was_switched_all; @@ -6021,26 +6235,19 @@ static void scx_root_disable(struct scx_sched *sch) /* no 
task is on scx, turn off all the switches and flush in-progress calls */ static_branch_disable(&__scx_enabled); + static_branch_disable(&__scx_is_cid_type); + if (sch->ops.flags & SCX_OPS_TID_TO_TASK) + static_branch_disable(&__scx_tid_to_task_enabled); bitmap_zero(sch->has_op, SCX_OPI_END); scx_idle_disable(); synchronize_rcu(); + if (sch->ops.flags & SCX_OPS_TID_TO_TASK) + rhashtable_free_and_destroy(&scx_tid_hash, NULL, NULL); - if (ei->kind >= SCX_EXIT_ERROR) { - pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", - sch->ops.name, ei->reason); - - if (ei->msg[0] != '\0') - pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg); -#ifdef CONFIG_STACKTRACE - stack_trace_print(ei->bt, ei->bt_len, 2); -#endif - } else { - pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", - sch->ops.name, ei->reason); - } + scx_log_sched_disable(sch); if (sch->ops.exit) - SCX_CALL_OP(sch, exit, NULL, ei); + SCX_CALL_OP(sch, exit, NULL, sch->exit_info); scx_unlink_sched(sch); @@ -6338,6 +6545,94 @@ static void scx_dump_task(struct scx_sched *sch, struct seq_buf *s, struct scx_d } } +static void scx_dump_cpu(struct scx_sched *sch, struct seq_buf *s, + struct scx_dump_ctx *dctx, int cpu, + bool dump_all_tasks) +{ + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + struct task_struct *p; + struct seq_buf ns; + size_t avail, used; + char *buf; + bool idle; + + rq_lock_irqsave(rq, &rf); + + idle = list_empty(&rq->scx.runnable_list) && + rq->curr->sched_class == &idle_sched_class; + + if (idle && !SCX_HAS_OP(sch, dump_cpu)) + goto next; + + /* + * We don't yet know whether ops.dump_cpu() will produce output + * and we may want to skip the default CPU dump if it doesn't. + * Use a nested seq_buf to generate the standard dump so that we + * can decide whether to commit later. 
+ */ + avail = seq_buf_get_buf(s, &buf); + seq_buf_init(&ns, buf, avail); + + dump_newline(&ns); + dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu ksync=%lu", + cpu, rq->scx.nr_running, rq->scx.flags, + rq->scx.cpu_released, rq->scx.ops_qseq, + rq->scx.kick_sync); + dump_line(&ns, " curr=%s[%d] class=%ps", + rq->curr->comm, rq->curr->pid, + rq->curr->sched_class); + if (!cpumask_empty(rq->scx.cpus_to_kick)) + dump_line(&ns, " cpus_to_kick : %*pb", + cpumask_pr_args(rq->scx.cpus_to_kick)); + if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle)) + dump_line(&ns, " idle_to_kick : %*pb", + cpumask_pr_args(rq->scx.cpus_to_kick_if_idle)); + if (!cpumask_empty(rq->scx.cpus_to_preempt)) + dump_line(&ns, " cpus_to_preempt: %*pb", + cpumask_pr_args(rq->scx.cpus_to_preempt)); + if (!cpumask_empty(rq->scx.cpus_to_wait)) + dump_line(&ns, " cpus_to_wait : %*pb", + cpumask_pr_args(rq->scx.cpus_to_wait)); + if (!cpumask_empty(rq->scx.cpus_to_sync)) + dump_line(&ns, " cpus_to_sync : %*pb", + cpumask_pr_args(rq->scx.cpus_to_sync)); + + used = seq_buf_used(&ns); + if (SCX_HAS_OP(sch, dump_cpu)) { + ops_dump_init(&ns, " "); + SCX_CALL_OP(sch, dump_cpu, rq, dctx, scx_cpu_arg(cpu), idle); + ops_dump_exit(); + } + + /* + * If idle && nothing generated by ops.dump_cpu(), there's + * nothing interesting. Skip. + */ + if (idle && used == seq_buf_used(&ns)) + goto next; + + /* + * $s may already have overflowed when $ns was created. If so, + * calling commit on it will trigger BUG. 
+ */ + if (avail) { + seq_buf_commit(s, seq_buf_used(&ns)); + if (seq_buf_has_overflowed(&ns)) + seq_buf_set_overflow(s); + } + + if (rq->curr->sched_class == &ext_sched_class && + (dump_all_tasks || scx_task_on_sched(sch, rq->curr))) + scx_dump_task(sch, s, dctx, rq, rq->curr, '*'); + + list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) + if (dump_all_tasks || scx_task_on_sched(sch, p)) + scx_dump_task(sch, s, dctx, rq, p, ' '); +next: + rq_unlock_irqrestore(rq, &rf); +} + /* * Dump scheduler state. If @dump_all_tasks is true, dump all tasks regardless * of which scheduler they belong to. If false, only dump tasks owned by @sch. @@ -6358,7 +6653,6 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei, }; struct seq_buf s; struct scx_event_stats events; - char *buf; int cpu; guard(raw_spinlock_irqsave)(&scx_dump_lock); @@ -6379,8 +6673,13 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei, if (ei->kind == SCX_EXIT_NONE) { dump_line(&s, "Debug dump triggered by %s", ei->reason); } else { - dump_line(&s, "%s[%d] triggered exit kind %d:", - current->comm, current->pid, ei->kind); + if (ei->exit_cpu >= 0) + dump_line(&s, "%s[%d] triggered exit kind %d on CPU %d:", + current->comm, current->pid, ei->kind, + ei->exit_cpu); + else + dump_line(&s, "%s[%d] triggered exit kind %d:", + current->comm, current->pid, ei->kind); dump_line(&s, " %s (%s)", ei->reason, ei->msg); dump_newline(&s); dump_line(&s, "Backtrace:"); @@ -6397,88 +6696,15 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei, dump_line(&s, "CPU states"); dump_line(&s, "----------"); + /* + * Dump the exit CPU first so it isn't lost to dump truncation, then + * walk the rest in order, skipping the one already dumped. 
+ */ + if (ei->exit_cpu >= 0) + scx_dump_cpu(sch, &s, &dctx, ei->exit_cpu, dump_all_tasks); for_each_possible_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); - struct rq_flags rf; - struct task_struct *p; - struct seq_buf ns; - size_t avail, used; - bool idle; - - rq_lock_irqsave(rq, &rf); - - idle = list_empty(&rq->scx.runnable_list) && - rq->curr->sched_class == &idle_sched_class; - - if (idle && !SCX_HAS_OP(sch, dump_cpu)) - goto next; - - /* - * We don't yet know whether ops.dump_cpu() will produce output - * and we may want to skip the default CPU dump if it doesn't. - * Use a nested seq_buf to generate the standard dump so that we - * can decide whether to commit later. - */ - avail = seq_buf_get_buf(&s, &buf); - seq_buf_init(&ns, buf, avail); - - dump_newline(&ns); - dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu ksync=%lu", - cpu, rq->scx.nr_running, rq->scx.flags, - rq->scx.cpu_released, rq->scx.ops_qseq, - rq->scx.kick_sync); - dump_line(&ns, " curr=%s[%d] class=%ps", - rq->curr->comm, rq->curr->pid, - rq->curr->sched_class); - if (!cpumask_empty(rq->scx.cpus_to_kick)) - dump_line(&ns, " cpus_to_kick : %*pb", - cpumask_pr_args(rq->scx.cpus_to_kick)); - if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle)) - dump_line(&ns, " idle_to_kick : %*pb", - cpumask_pr_args(rq->scx.cpus_to_kick_if_idle)); - if (!cpumask_empty(rq->scx.cpus_to_preempt)) - dump_line(&ns, " cpus_to_preempt: %*pb", - cpumask_pr_args(rq->scx.cpus_to_preempt)); - if (!cpumask_empty(rq->scx.cpus_to_wait)) - dump_line(&ns, " cpus_to_wait : %*pb", - cpumask_pr_args(rq->scx.cpus_to_wait)); - if (!cpumask_empty(rq->scx.cpus_to_sync)) - dump_line(&ns, " cpus_to_sync : %*pb", - cpumask_pr_args(rq->scx.cpus_to_sync)); - - used = seq_buf_used(&ns); - if (SCX_HAS_OP(sch, dump_cpu)) { - ops_dump_init(&ns, " "); - SCX_CALL_OP(sch, dump_cpu, rq, &dctx, cpu, idle); - ops_dump_exit(); - } - - /* - * If idle && nothing generated by ops.dump_cpu(), there's - * nothing interesting. Skip. 
- */ - if (idle && used == seq_buf_used(&ns)) - goto next; - - /* - * $s may already have overflowed when $ns was created. If so, - * calling commit on it will trigger BUG. - */ - if (avail) { - seq_buf_commit(&s, seq_buf_used(&ns)); - if (seq_buf_has_overflowed(&ns)) - seq_buf_set_overflow(&s); - } - - if (rq->curr->sched_class == &ext_sched_class && - (dump_all_tasks || scx_task_on_sched(sch, rq->curr))) - scx_dump_task(sch, &s, &dctx, rq, rq->curr, '*'); - - list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) - if (dump_all_tasks || scx_task_on_sched(sch, p)) - scx_dump_task(sch, &s, &dctx, rq, p, ' '); - next: - rq_unlock_irqrestore(rq, &rf); + if (cpu != ei->exit_cpu) + scx_dump_cpu(sch, &s, &dctx, cpu, dump_all_tasks); } dump_newline(&s); @@ -6516,9 +6742,9 @@ static void scx_disable_irq_workfn(struct irq_work *irq_work) kthread_queue_work(sch->helper, &sch->disable_work); } -static bool scx_vexit(struct scx_sched *sch, - enum scx_exit_kind kind, s64 exit_code, - const char *fmt, va_list args) +bool scx_vexit(struct scx_sched *sch, + enum scx_exit_kind kind, s64 exit_code, s32 exit_cpu, + const char *fmt, va_list args) { struct scx_exit_info *ei = sch->exit_info; @@ -6540,6 +6766,7 @@ static bool scx_vexit(struct scx_sched *sch, */ ei->kind = kind; ei->reason = scx_exit_reason(ei->kind); + ei->exit_cpu = exit_cpu; irq_work_queue(&sch->disable_irq_work); return true; @@ -6597,13 +6824,32 @@ static struct scx_sched_pnode *alloc_pnode(struct scx_sched *sch, int node) } /* + * scx_enable() is offloaded to a dedicated system-wide RT kthread to avoid + * starvation. During the READY -> ENABLED task switching loop, the calling + * thread's sched_class gets switched from fair to ext. As fair has higher + * priority than ext, the calling thread can be indefinitely starved under + * fair-class saturation, leading to a system hang. 
+ */ +struct scx_enable_cmd { + struct kthread_work work; + union { + struct sched_ext_ops *ops; + struct sched_ext_ops_cid *ops_cid; + }; + bool is_cid_type; + struct bpf_map *arena_map; /* arena ref to transfer to sch */ + int ret; +}; + +/* * Allocate and initialize a new scx_sched. @cgrp's reference is always * consumed whether the function succeeds or fails. */ -static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, +static struct scx_sched *scx_alloc_and_add_sched(struct scx_enable_cmd *cmd, struct cgroup *cgrp, struct scx_sched *parent) { + struct sched_ext_ops *ops = cmd->ops; struct scx_sched *sch; s32 level = parent ? parent->level + 1 : 0; s32 node, cpu, ret, bypass_fail_cpu = nr_cpu_ids; @@ -6695,7 +6941,18 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, ret = -ENOMEM; goto err_free_lb_cpumask; } - sch->ops = *ops; + /* + * Copy ops through the right union view. For cid-form the source is + * struct sched_ext_ops_cid which lacks the trailing cpu_acquire/ + * cpu_release; those stay zero from kzalloc. + */ + if (cmd->is_cid_type) { + sch->ops_cid = *cmd->ops_cid; + sch->is_cid_type = true; + } else { + sch->ops = *cmd->ops; + } + rcu_assign_pointer(ops->priv, sch); sch->kobj.kset = scx_kset; @@ -6748,6 +7005,20 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, return ERR_PTR(ret); } #endif /* CONFIG_EXT_SUB_SCHED */ + + /* + * Consume the arena_map ref bpf_scx_reg_cid() took. Defer to here so + * earlier failure paths leave cmd->arena_map set and bpf_scx_reg_cid + * drops the ref. After this point, sch owns the ref and any cleanup + * runs through scx_sched_free_rcu_work() which puts it. 
+ */ + sch->arena_map = cmd->arena_map; + /* BPF arena is only available on MMU && 64BIT */ +#if defined(CONFIG_MMU) && defined(CONFIG_64BIT) + if (sch->arena_map) + sch->arena_kern_base = bpf_arena_map_kern_vm_start(sch->arena_map); +#endif + cmd->arena_map = NULL; return sch; #ifdef CONFIG_EXT_SUB_SCHED @@ -6819,6 +7090,17 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops) } /* + * SCX_OPS_TID_TO_TASK is enabled by the root scheduler. A sub-sched + * may set it to declare a dependency; reject if the root hasn't + * enabled it. + */ + if ((ops->flags & SCX_OPS_TID_TO_TASK) && scx_parent(sch) && + !(scx_root->ops.flags & SCX_OPS_TID_TO_TASK)) { + scx_error(sch, "SCX_OPS_TID_TO_TASK requires root scheduler to enable it"); + return -EINVAL; + } + + /* * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle * selection policy to be enabled. */ @@ -6828,25 +7110,34 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops) return -EINVAL; } - if (ops->cpu_acquire || ops->cpu_release) + /* + * cid-form's struct is shorter and doesn't include the cpu_acquire / + * cpu_release tail; reading those fields off a cid-form @ops would + * run past the BPF allocation. Skip for cid-form. + */ + if (!sch->is_cid_type && (ops->cpu_acquire || ops->cpu_release)) pr_warn("ops->cpu_acquire/release() are deprecated, use sched_switch TP instead\n"); + /* + * Sub-scheduler support is tied to the cid-form struct_ops. A sub-sched + * attaches through a cid-form-only interface (sub_attach/sub_detach), + * and a root that accepts sub-scheds must expose cid-form state to + * them. Reject cpu-form schedulers on either side. 
+ */ + if (!sch->is_cid_type) { + if (scx_parent(sch)) { + scx_error(sch, "sub-sched requires cid-form struct_ops"); + return -EINVAL; + } + if (ops->sub_attach || ops->sub_detach) { + scx_error(sch, "sub_attach/sub_detach requires cid-form struct_ops"); + return -EINVAL; + } + } + return 0; } -/* - * scx_enable() is offloaded to a dedicated system-wide RT kthread to avoid - * starvation. During the READY -> ENABLED task switching loop, the calling - * thread's sched_class gets switched from fair to ext. As fair has higher - * priority than ext, the calling thread can be indefinitely starved under - * fair-class saturation, leading to a system hang. - */ -struct scx_enable_cmd { - struct kthread_work work; - struct sched_ext_ops *ops; - int ret; -}; - static void scx_root_enable_workfn(struct kthread_work *work) { struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work); @@ -6881,15 +7172,24 @@ static void scx_root_enable_workfn(struct kthread_work *work) if (ret) goto err_unlock; + if (ops->flags & SCX_OPS_TID_TO_TASK) { + ret = rhashtable_init(&scx_tid_hash, &scx_tid_hash_params); + if (ret) + goto err_free_ksyncs; + } + #ifdef CONFIG_EXT_SUB_SCHED cgroup_get(cgrp); #endif - sch = scx_alloc_and_add_sched(ops, cgrp, NULL); + sch = scx_alloc_and_add_sched(cmd, cgrp, NULL); if (IS_ERR(sch)) { ret = PTR_ERR(sch); - goto err_free_ksyncs; + goto err_free_tid_hash; } + if (sch->is_cid_type) + static_branch_enable(&__scx_is_cid_type); + /* * Transition to ENABLING and clear exit info to arm the disable path. * Failure triggers full disabling from here on. @@ -6913,6 +7213,18 @@ static void scx_root_enable_workfn(struct kthread_work *work) cpus_read_lock(); /* + * Build the cid mapping before publishing scx_root. The cid kfuncs + * dereference the cid arrays unconditionally once scx_prog_sched() + * returns non-NULL; the rcu_assign_pointer() below pairs with their + * rcu_dereference() to make the populated arrays visible. 
+ */ + ret = scx_cid_init(sch); + if (ret) { + cpus_read_unlock(); + goto err_disable; + } + + /* * Make the scheduler instance visible. Must be inside cpus_read_lock(). * See handle_hotplug(). */ @@ -6937,6 +7249,18 @@ static void scx_root_enable_workfn(struct kthread_work *work) sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; } + ret = scx_arena_pool_init(sch); + if (ret) { + cpus_read_unlock(); + goto err_disable; + } + + ret = scx_set_cmask_scratch_alloc(sch); + if (ret) { + cpus_read_unlock(); + goto err_disable; + } + for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) if (((void (**)(void))ops)[i]) set_bit(i, sch->has_op); @@ -7003,6 +7327,10 @@ static void scx_root_enable_workfn(struct kthread_work *work) WARN_ON_ONCE(scx_init_task_enabled); scx_init_task_enabled = true; + /* flip under fork_rwsem; the iter below covers existing tasks */ + if (ops->flags & SCX_OPS_TID_TO_TASK) + static_branch_enable(&__scx_tid_to_task_enabled); + /* * Enable ops for every task. Fork is excluded by scx_fork_rwsem * preventing new tasks from being added. No need to exclude tasks @@ -7024,16 +7352,14 @@ static void scx_root_enable_workfn(struct kthread_work *work) scx_task_iter_start(&sti, NULL); while ((p = scx_task_iter_next_locked(&sti))) { - struct rq_flags rf; - struct rq *rq; - /* - * @p may already be dead, have lost all its usages counts and - * be waiting for RCU grace period before being freed. @p can't - * be initialized for SCX in such cases and should be ignored. + * @p is in scx_tasks under scx_tasks_lock, and SCX_TASK_DEAD + * tasks are filtered by scx_task_iter_next_locked(). + * sched_ext_dead() removes @p from scx_tasks under the same + * lock before put_task_struct_rcu_user() runs, so @p->usage + * is guaranteed > 0 here. 
*/ - if (!tryget_task_struct(p)) - continue; + get_task_struct(p); /* * Set %INIT_BEGIN under the iter's rq lock so that a concurrent @@ -7049,12 +7375,11 @@ static void scx_root_enable_workfn(struct kthread_work *work) ret = __scx_init_task(sch, p, false); - rq = task_rq_lock(p, &rf); + scx_task_iter_relock(&sti, p); if (unlikely(ret)) { if (scx_get_task_state(p) != SCX_TASK_DEAD) scx_set_task_state(p, SCX_TASK_NONE); - task_rq_unlock(rq, p, &rf); scx_task_iter_stop(&sti); scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", ret, p->comm, p->pid); @@ -7075,7 +7400,14 @@ static void scx_root_enable_workfn(struct kthread_work *work) scx_set_task_state(p, SCX_TASK_READY); } - task_rq_unlock(rq, p, &rf); + /* + * Insert into the tid hash. scx_tasks_lock is held by the iter; + * list_empty() guards against sched_ext_dead() having taken @p + * off the list while init ran unlocked. + */ + if (scx_tid_to_task_enabled() && !list_empty(&p->scx.tasks_node)) + scx_tid_hash_insert(p); + put_task_struct(p); } scx_task_iter_stop(&sti); @@ -7154,6 +7486,9 @@ static void scx_root_enable_workfn(struct kthread_work *work) cmd->ret = 0; return; +err_free_tid_hash: + if (ops->flags & SCX_OPS_TID_TO_TASK) + rhashtable_free_and_destroy(&scx_tid_hash, NULL, NULL); err_free_ksyncs: free_kick_syncs(); err_unlock: @@ -7261,7 +7596,7 @@ static void scx_sub_enable_workfn(struct kthread_work *work) raw_spin_unlock_irq(&scx_sched_lock); /* scx_alloc_and_add_sched() consumes @cgrp whether it succeeds or not */ - sch = scx_alloc_and_add_sched(ops, cgrp, parent); + sch = scx_alloc_and_add_sched(cmd, cgrp, parent); kobject_put(&parent->kobj); if (IS_ERR(sch)) { ret = PTR_ERR(sch); @@ -7288,6 +7623,14 @@ static void scx_sub_enable_workfn(struct kthread_work *work) sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; } + ret = scx_arena_pool_init(sch); + if (ret) + goto err_disable; + + ret = scx_set_cmask_scratch_alloc(sch); + if (ret) + goto err_disable; + if (validate_ops(sch, ops)) goto 
err_disable; @@ -7350,9 +7693,8 @@ static void scx_sub_enable_workfn(struct kthread_work *work) if (p->scx.flags & SCX_TASK_SUB_INIT) continue; - /* see scx_root_enable() */ - if (!tryget_task_struct(p)) - continue; + /* @p is pinned by the iter; see scx_sub_disable() */ + get_task_struct(p); if (!assert_task_ready_or_enabled(p)) { ret = -EINVAL; @@ -7515,11 +7857,10 @@ static s32 __init scx_cgroup_lifetime_notifier_init(void) core_initcall(scx_cgroup_lifetime_notifier_init); #endif /* CONFIG_EXT_SUB_SCHED */ -static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) +static s32 scx_enable(struct scx_enable_cmd *cmd, struct bpf_link *link) { static struct kthread_worker *helper; static DEFINE_MUTEX(helper_mutex); - struct scx_enable_cmd cmd; if (housekeeping_enabled(HK_TYPE_DOMAIN_BOOT)) { pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); @@ -7542,16 +7883,15 @@ static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) } #ifdef CONFIG_EXT_SUB_SCHED - if (ops->sub_cgroup_id > 1) - kthread_init_work(&cmd.work, scx_sub_enable_workfn); + if (cmd->ops->sub_cgroup_id > 1) + kthread_init_work(&cmd->work, scx_sub_enable_workfn); else #endif /* CONFIG_EXT_SUB_SCHED */ - kthread_init_work(&cmd.work, scx_root_enable_workfn); - cmd.ops = ops; + kthread_init_work(&cmd->work, scx_root_enable_workfn); - kthread_queue_work(READ_ONCE(helper), &cmd.work); - kthread_flush_work(&cmd.work); - return cmd.ret; + kthread_queue_work(READ_ONCE(helper), &cmd->work); + kthread_flush_work(&cmd->work); + return cmd->ret; } @@ -7723,7 +8063,62 @@ static int bpf_scx_check_member(const struct btf_type *t, static int bpf_scx_reg(void *kdata, struct bpf_link *link) { - return scx_enable(kdata, link); + struct scx_enable_cmd cmd = { .ops = kdata }; + + return scx_enable(&cmd, link); +} + +struct scx_arena_scan { + struct bpf_map *arena; + int err; +}; + +/* + * The verifier enforces one arena per BPF program, so each struct_ops + * member prog 
contributes at most one arena via bpf_prog_arena(). + * Require all non-NULL contributions to match. + */ +static int scx_arena_scan_prog(struct bpf_prog *prog, void *data) +{ + struct scx_arena_scan *s = data; + struct bpf_map *arena = NULL; + + /* arena.o, which defines these, is built only on MMU && 64BIT */ +#if defined(CONFIG_MMU) && defined(CONFIG_64BIT) + arena = bpf_prog_arena(prog); +#endif + if (!arena) + return 0; + if (s->arena && s->arena != arena) { + s->err = -EINVAL; + return 1; + } + s->arena = arena; + return 0; +} + +static int bpf_scx_reg_cid(void *kdata, struct bpf_link *link) +{ + struct scx_enable_cmd cmd = { .ops_cid = kdata, .is_cid_type = true }; + struct scx_arena_scan scan = {}; + int ret; + + bpf_struct_ops_for_each_prog(kdata, scx_arena_scan_prog, &scan); + if (scan.err) { + pr_err("sched_ext: cid-form scheduler uses multiple arena maps\n"); + return scan.err; + } + if (!scan.arena) { + pr_err("sched_ext: cid-form scheduler must use a BPF arena map\n"); + return -EINVAL; + } + + bpf_map_inc(scan.arena); + cmd.arena_map = scan.arena; + ret = scx_enable(&cmd, link); + if (cmd.arena_map) /* not consumed by scx_alloc_and_add_sched() */ + bpf_map_put(cmd.arena_map); + return ret; } static void bpf_scx_unreg(void *kdata, struct bpf_link *link) @@ -7857,6 +8252,73 @@ static struct bpf_struct_ops bpf_sched_ext_ops = { .cfi_stubs = &__bpf_ops_sched_ext_ops }; +/* + * cid-form cfi stubs. Stubs whose signatures match the cpu-form (param types + * identical, only param names differ across structs) are reused; only + * set_cmask needs a fresh stub since the second argument type differs. 
+ */ +static void sched_ext_ops_cid__set_cmask(struct task_struct *p, + const struct scx_cmask *cmask) {} + +static struct sched_ext_ops_cid __bpf_ops_sched_ext_ops_cid = { + .select_cid = sched_ext_ops__select_cpu, + .enqueue = sched_ext_ops__enqueue, + .dequeue = sched_ext_ops__dequeue, + .dispatch = sched_ext_ops__dispatch, + .tick = sched_ext_ops__tick, + .runnable = sched_ext_ops__runnable, + .running = sched_ext_ops__running, + .stopping = sched_ext_ops__stopping, + .quiescent = sched_ext_ops__quiescent, + .yield = sched_ext_ops__yield, + .core_sched_before = sched_ext_ops__core_sched_before, + .set_weight = sched_ext_ops__set_weight, + .set_cmask = sched_ext_ops_cid__set_cmask, + .update_idle = sched_ext_ops__update_idle, + .init_task = sched_ext_ops__init_task, + .exit_task = sched_ext_ops__exit_task, + .enable = sched_ext_ops__enable, + .disable = sched_ext_ops__disable, +#ifdef CONFIG_EXT_GROUP_SCHED + .cgroup_init = sched_ext_ops__cgroup_init, + .cgroup_exit = sched_ext_ops__cgroup_exit, + .cgroup_prep_move = sched_ext_ops__cgroup_prep_move, + .cgroup_move = sched_ext_ops__cgroup_move, + .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move, + .cgroup_set_weight = sched_ext_ops__cgroup_set_weight, + .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth, + .cgroup_set_idle = sched_ext_ops__cgroup_set_idle, +#endif + .sub_attach = sched_ext_ops__sub_attach, + .sub_detach = sched_ext_ops__sub_detach, + .cid_online = sched_ext_ops__cpu_online, + .cid_offline = sched_ext_ops__cpu_offline, + .init = sched_ext_ops__init, + .exit = sched_ext_ops__exit, + .dump = sched_ext_ops__dump, + .dump_cid = sched_ext_ops__dump_cpu, + .dump_task = sched_ext_ops__dump_task, +}; + +/* + * The cid-form struct_ops shares all bpf_struct_ops hooks with the cpu form. + * init_member, check_member, reg, unreg, etc. process kdata as the byte block + * verified to match by the BUILD_BUG_ON checks in scx_init(). 
+ */ +static struct bpf_struct_ops bpf_sched_ext_ops_cid = { + .verifier_ops = &bpf_scx_verifier_ops, + .reg = bpf_scx_reg_cid, + .unreg = bpf_scx_unreg, + .check_member = bpf_scx_check_member, + .init_member = bpf_scx_init_member, + .init = bpf_scx_init, + .update = bpf_scx_update, + .validate = bpf_scx_validate, + .name = "sched_ext_ops_cid", + .owner = THIS_MODULE, + .cfi_stubs = &__bpf_ops_sched_ext_ops_cid +}; + /******************************************************************************** * System integration and init. @@ -7866,13 +8328,11 @@ static void sysrq_handle_sched_ext_reset(u8 key) { struct scx_sched *sch; - rcu_read_lock(); sch = rcu_dereference(scx_root); if (likely(sch)) scx_disable(sch, SCX_EXIT_SYSRQ); else pr_info("sched_ext: BPF schedulers not loaded\n"); - rcu_read_unlock(); } static const struct sysrq_key_op sysrq_sched_ext_reset_op = { @@ -7884,7 +8344,11 @@ static const struct sysrq_key_op sysrq_sched_ext_reset_op = { static void sysrq_handle_sched_ext_dump(u8 key) { - struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" }; + struct scx_exit_info ei = { + .kind = SCX_EXIT_NONE, + .exit_cpu = -1, + .reason = "SysRq-D", + }; struct scx_sched *sch; list_for_each_entry_rcu(sch, &scx_sched_all, all) @@ -8954,9 +9418,6 @@ static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags) struct rq *this_rq; unsigned long irq_flags; - if (!ops_cpu_valid(sch, cpu, NULL)) - return; - local_irq_save(irq_flags); this_rq = this_rq(); @@ -9019,11 +9480,36 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags, const struct bpf_prog_aux guard(rcu)(); sch = scx_prog_sched(aux); - if (likely(sch)) + if (likely(sch) && scx_cpu_valid(sch, cpu, NULL)) scx_kick_cpu(sch, cpu, flags); } /** + * scx_bpf_kick_cid - Trigger reschedule on the CPU mapped to @cid + * @cid: cid to kick + * @flags: %SCX_KICK_* flags + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs + * + * cid-addressed equivalent of scx_bpf_kick_cpu(). 
Return 0 on success, + * -errno otherwise. + */ +__bpf_kfunc s32 scx_bpf_kick_cid(s32 cid, u64 flags, const struct bpf_prog_aux *aux) +{ + struct scx_sched *sch; + s32 cpu; + + guard(rcu)(); + sch = scx_prog_sched(aux); + if (unlikely(!sch)) + return -ENODEV; + cpu = scx_cid_to_cpu(sch, cid); + if (cpu < 0) + return cpu; + scx_kick_cpu(sch, cpu, flags); + return 0; +} + +/** * scx_bpf_dsq_nr_queued - Return the number of queued tasks * @dsq_id: id of the DSQ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs @@ -9049,9 +9535,9 @@ __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id, const struct bpf_prog_aux *aux ret = READ_ONCE(this_rq()->scx.local_dsq.nr); goto out; } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { - s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; + s32 cpu = scx_cpu_ret(sch, dsq_id & SCX_DSQ_LOCAL_CPU_MASK); - if (ops_cpu_valid(sch, cpu, NULL)) { + if (scx_cpu_valid(sch, cpu, NULL)) { ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr); goto out; } @@ -9269,6 +9755,7 @@ __bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux) __bpf_kfunc_end_defs(); +__printf(5, 0) static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, size_t line_size, char *fmt, unsigned long long *data, u32 data__sz) @@ -9306,6 +9793,7 @@ static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, return ret; } +__printf(3, 0) static s32 bstr_format(struct scx_sched *sch, struct scx_bstr_buf *buf, char *fmt, unsigned long long *data, u32 data__sz) { @@ -9326,6 +9814,7 @@ __bpf_kfunc_start_defs(); * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops * disabling. 
*/ +__printf(2, 0) __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz, const struct bpf_prog_aux *aux) @@ -9351,6 +9840,7 @@ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, * Indicate that the BPF scheduler encountered a fatal error and initiate ops * disabling. */ +__printf(1, 0) __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data__sz, const struct bpf_prog_aux *aux) { @@ -9378,6 +9868,7 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, * The extra dump may be multiple lines. A single line may be split over * multiple calls. The last line is automatically terminated. */ +__printf(1, 0) __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data__sz, const struct bpf_prog_aux *aux) { @@ -9440,13 +9931,36 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu, const struct bpf_prog_aux *aux) guard(rcu)(); sch = scx_prog_sched(aux); - if (likely(sch) && ops_cpu_valid(sch, cpu, NULL)) + if (likely(sch) && scx_cpu_valid(sch, cpu, NULL)) return arch_scale_cpu_capacity(cpu); else return SCX_CPUPERF_ONE; } /** + * scx_bpf_cidperf_cap - Query the maximum relative capacity of the CPU at @cid + * @cid: cid of the CPU to query + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs + * + * cid-addressed equivalent of scx_bpf_cpuperf_cap(). 
+ */ +__bpf_kfunc u32 scx_bpf_cidperf_cap(s32 cid, const struct bpf_prog_aux *aux) +{ + struct scx_sched *sch; + s32 cpu; + + guard(rcu)(); + + sch = scx_prog_sched(aux); + if (unlikely(!sch)) + return SCX_CPUPERF_ONE; + cpu = scx_cid_to_cpu(sch, cid); + if (cpu < 0) + return SCX_CPUPERF_ONE; + return arch_scale_cpu_capacity(cpu); +} + +/** * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU * @cpu: CPU of interest * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs @@ -9468,13 +9982,36 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu, const struct bpf_prog_aux *aux) guard(rcu)(); sch = scx_prog_sched(aux); - if (likely(sch) && ops_cpu_valid(sch, cpu, NULL)) + if (likely(sch) && scx_cpu_valid(sch, cpu, NULL)) return arch_scale_freq_capacity(cpu); else return SCX_CPUPERF_ONE; } /** + * scx_bpf_cidperf_cur - Query the current performance of the CPU at @cid + * @cid: cid of the CPU to query + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs + * + * cid-addressed equivalent of scx_bpf_cpuperf_cur(). 
+ */ +__bpf_kfunc u32 scx_bpf_cidperf_cur(s32 cid, const struct bpf_prog_aux *aux) +{ + struct scx_sched *sch; + s32 cpu; + + guard(rcu)(); + + sch = scx_prog_sched(aux); + if (unlikely(!sch)) + return SCX_CPUPERF_ONE; + cpu = scx_cid_to_cpu(sch, cid); + if (cpu < 0) + return SCX_CPUPERF_ONE; + return arch_scale_freq_capacity(cpu); +} + +/** * scx_bpf_cpuperf_set - Set the relative performance target of a CPU * @cpu: CPU of interest * @perf: target performance level [0, %SCX_CPUPERF_ONE] @@ -9504,7 +10041,7 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf, const struct bpf_prog_au return; } - if (ops_cpu_valid(sch, cpu, NULL)) { + if (scx_cpu_valid(sch, cpu, NULL)) { struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq(); struct rq_flags rf; @@ -9535,6 +10072,31 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf, const struct bpf_prog_au } /** + * scx_bpf_cidperf_set - Set the performance target of the CPU at @cid + * @cid: cid of the CPU to target + * @perf: target performance level [0, %SCX_CPUPERF_ONE] + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs + * + * cid-addressed equivalent of scx_bpf_cpuperf_set(). + */ +__bpf_kfunc void scx_bpf_cidperf_set(s32 cid, u32 perf, + const struct bpf_prog_aux *aux) +{ + struct scx_sched *sch; + s32 cpu; + + guard(rcu)(); + + sch = scx_prog_sched(aux); + if (unlikely(!sch)) + return; + cpu = scx_cid_to_cpu(sch, cid); + if (cpu < 0) + return; + scx_bpf_cpuperf_set(cpu, perf, aux); +} + +/** * scx_bpf_nr_node_ids - Return the number of possible node IDs * * All valid node IDs in the system are smaller than the returned value. @@ -9555,6 +10117,47 @@ __bpf_kfunc u32 scx_bpf_nr_cpu_ids(void) } /** + * scx_bpf_nr_cids - Return the size of the cid space + * + * Equals num_possible_cpus(). All valid cids are in [0, return value). 
+ */ +__bpf_kfunc u32 scx_bpf_nr_cids(void) +{ + return num_possible_cpus(); +} + +/** + * scx_bpf_nr_online_cids - Return current count of online CPUs in cid space + * + * Return num_online_cpus(). The standard model restarts the scheduler on + * hotplug, which lets schedulers treat [0, nr_online_cids) as the online + * range. Schedulers that prefer to handle hotplug without a restart should + * install a custom mapping via scx_bpf_cid_override() and track onlining + * through the ops.cid_online / ops.cid_offline callbacks. + */ +__bpf_kfunc u32 scx_bpf_nr_online_cids(void) +{ + return num_online_cpus(); +} + +/** + * scx_bpf_this_cid - Return the cid of the CPU this program is running on + * + * cid-addressed equivalent of bpf_get_smp_processor_id() for scx programs. + * The current cpu is trivially valid, so this is just a table lookup. Return + * -EINVAL if called from a non-SCX program before any scheduler has ever + * been enabled (the cid table is still unallocated at that point). + */ +__bpf_kfunc s32 scx_bpf_this_cid(void) +{ + s16 *tbl = READ_ONCE(scx_cpu_to_cid_tbl); + + if (!tbl) + return -EINVAL; + return tbl[raw_smp_processor_id()]; +} + +/** * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask */ __bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void) @@ -9603,6 +10206,23 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) } /** + * scx_bpf_task_cid - cid a task is currently associated with + * @p: task of interest + * + * cid-addressed equivalent of scx_bpf_task_cpu(). task_cpu(p) is always a + * valid cpu, so this is just a table lookup. Return -EINVAL if called from + * a non-SCX program before any scheduler has ever been enabled. 
+ */ +__bpf_kfunc s32 scx_bpf_task_cid(const struct task_struct *p) +{ + s16 *tbl = READ_ONCE(scx_cpu_to_cid_tbl); + + if (!tbl) + return -EINVAL; + return tbl[task_cpu(p)]; +} + +/** * scx_bpf_cpu_rq - Fetch the rq of a CPU * @cpu: CPU of the rq * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs @@ -9617,7 +10237,7 @@ __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu, const struct bpf_prog_aux *aux) if (unlikely(!sch)) return NULL; - if (!ops_cpu_valid(sch, cpu, NULL)) + if (!scx_cpu_valid(sch, cpu, NULL)) return NULL; if (!sch->warned_deprecated_rq) { @@ -9674,13 +10294,65 @@ __bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu, const struct bpf_prog_ if (unlikely(!sch)) return NULL; - if (!ops_cpu_valid(sch, cpu, NULL)) + if (!scx_cpu_valid(sch, cpu, NULL)) return NULL; return rcu_dereference(cpu_rq(cpu)->curr); } /** + * scx_bpf_cid_curr - Return the curr task on the CPU at @cid + * @cid: cid of interest + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs + * + * cid-addressed equivalent of scx_bpf_cpu_curr(). Callers must hold RCU + * read lock (KF_RCU). + */ +__bpf_kfunc struct task_struct *scx_bpf_cid_curr(s32 cid, const struct bpf_prog_aux *aux) +{ + struct scx_sched *sch; + s32 cpu; + + guard(rcu)(); + + sch = scx_prog_sched(aux); + if (unlikely(!sch)) + return NULL; + cpu = scx_cid_to_cpu(sch, cid); + if (cpu < 0) + return NULL; + return rcu_dereference(cpu_rq(cpu)->curr); +} + +/** + * scx_bpf_tid_to_task - Look up a task by its scx tid + * @tid: task ID previously read from p->scx.tid + * + * Returns the task with the given tid, or NULL if no such task exists. The + * returned pointer is valid until the end of the current RCU read section + * (KF_RCU_PROTECTED). Requires SCX_OPS_TID_TO_TASK to be set on the root + * scheduler; otherwise an error is raised and NULL returned. 
+ */ +__bpf_kfunc struct task_struct *scx_bpf_tid_to_task(u64 tid) +{ + struct sched_ext_entity *scx; + + if (!scx_tid_to_task_enabled()) { + struct scx_sched *sch = rcu_dereference(scx_root); + + if (sch) + scx_error(sch, "scx_bpf_tid_to_task() called without SCX_OPS_TID_TO_TASK"); + return NULL; + } + + scx = rhashtable_lookup(&scx_tid_hash, &tid, scx_tid_hash_params); + if (!scx) + return NULL; + + return container_of(scx, struct task_struct, scx); +} + +/** * scx_bpf_now - Returns a high-performance monotonically non-decreasing * clock for the current CPU. The clock returned is in nanoseconds. * @@ -9839,6 +10511,7 @@ BTF_KFUNCS_START(scx_kfunc_ids_any) BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_IMPLICIT_ARGS | KF_RCU); BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_IMPLICIT_ARGS | KF_RCU); BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_kick_cid, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_destroy_dsq, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_IMPLICIT_ARGS | KF_RCU_PROTECTED | KF_RET_NULL) @@ -9853,16 +10526,25 @@ BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_cpuperf_set, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cidperf_cap, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cidperf_cur, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cidperf_set, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_nr_node_ids) BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) +BTF_ID_FLAGS(func, scx_bpf_nr_cids) +BTF_ID_FLAGS(func, scx_bpf_nr_online_cids) +BTF_ID_FLAGS(func, scx_bpf_this_cid) BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) BTF_ID_FLAGS(func, 
scx_bpf_task_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_task_cid, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_cpu_rq, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_IMPLICIT_ARGS | KF_RET_NULL) BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, scx_bpf_cid_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, scx_bpf_tid_to_task, KF_RET_NULL | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, scx_bpf_now) BTF_ID_FLAGS(func, scx_bpf_events) #ifdef CONFIG_CGROUP_SCHED @@ -9877,6 +10559,47 @@ static const struct btf_kfunc_id_set scx_kfunc_set_any = { }; /* + * cpu-form kfuncs that are forbidden from cid-form schedulers + * (bpf_sched_ext_ops_cid). Programs targeting the cid struct_ops type must + * use the cid-form alternative (cid/cmask kfuncs). + * + * Membership overlaps with scx_kfunc_ids_{any,idle,select_cpu}; the filter + * tests this set independently and rejects matches before the per-op + * allow-list check runs. + * + * pahole/resolve_btfids scans every BTF_ID_FLAGS() at build time and + * intersects flags across duplicate entries, so each entry must carry the + * same flags as the kfunc's primary declaration; otherwise the flags get + * dropped globally. 
+ */ +BTF_KFUNCS_START(scx_kfunc_ids_cpu_only) +BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_cpu_rq, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, scx_bpf_cpu_node, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_set, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_IMPLICIT_ARGS | KF_RCU) +BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_IMPLICIT_ARGS | KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_IMPLICIT_ARGS | KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_IMPLICIT_ARGS | KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_IMPLICIT_ARGS | KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_IMPLICIT_ARGS | KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) +BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_IMPLICIT_ARGS | KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_IMPLICIT_ARGS | KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_IMPLICIT_ARGS | KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_IMPLICIT_ARGS | KF_RCU) +BTF_KFUNCS_END(scx_kfunc_ids_cpu_only) + +/* * Per-op kfunc allow flags. Each bit corresponds to a context-sensitive kfunc * group; an op may permit zero or more groups, with the union expressed in * scx_kf_allow_flags[]. 
The verifier-time filter (scx_kfunc_context_filter()) @@ -9885,10 +10608,11 @@ static const struct btf_kfunc_id_set scx_kfunc_set_any = { */ enum scx_kf_allow_flags { SCX_KF_ALLOW_UNLOCKED = 1 << 0, - SCX_KF_ALLOW_CPU_RELEASE = 1 << 1, - SCX_KF_ALLOW_DISPATCH = 1 << 2, - SCX_KF_ALLOW_ENQUEUE = 1 << 3, - SCX_KF_ALLOW_SELECT_CPU = 1 << 4, + SCX_KF_ALLOW_INIT = 1 << 1, + SCX_KF_ALLOW_CPU_RELEASE = 1 << 2, + SCX_KF_ALLOW_DISPATCH = 1 << 3, + SCX_KF_ALLOW_ENQUEUE = 1 << 4, + SCX_KF_ALLOW_SELECT_CPU = 1 << 5, }; /* @@ -9916,7 +10640,7 @@ static const u32 scx_kf_allow_flags[] = { [SCX_OP_IDX(sub_detach)] = SCX_KF_ALLOW_UNLOCKED, [SCX_OP_IDX(cpu_online)] = SCX_KF_ALLOW_UNLOCKED, [SCX_OP_IDX(cpu_offline)] = SCX_KF_ALLOW_UNLOCKED, - [SCX_OP_IDX(init)] = SCX_KF_ALLOW_UNLOCKED, + [SCX_OP_IDX(init)] = SCX_KF_ALLOW_UNLOCKED | SCX_KF_ALLOW_INIT, [SCX_OP_IDX(exit)] = SCX_KF_ALLOW_UNLOCKED, }; @@ -9931,16 +10655,18 @@ static const u32 scx_kf_allow_flags[] = { int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id) { bool in_unlocked = btf_id_set8_contains(&scx_kfunc_ids_unlocked, kfunc_id); + bool in_init = btf_id_set8_contains(&scx_kfunc_ids_init, kfunc_id); bool in_select_cpu = btf_id_set8_contains(&scx_kfunc_ids_select_cpu, kfunc_id); bool in_enqueue = btf_id_set8_contains(&scx_kfunc_ids_enqueue_dispatch, kfunc_id); bool in_dispatch = btf_id_set8_contains(&scx_kfunc_ids_dispatch, kfunc_id); bool in_cpu_release = btf_id_set8_contains(&scx_kfunc_ids_cpu_release, kfunc_id); bool in_idle = btf_id_set8_contains(&scx_kfunc_ids_idle, kfunc_id); bool in_any = btf_id_set8_contains(&scx_kfunc_ids_any, kfunc_id); + bool in_cpu_only = btf_id_set8_contains(&scx_kfunc_ids_cpu_only, kfunc_id); u32 moff, flags; /* Not an SCX kfunc - allow. 
*/ - if (!(in_unlocked || in_select_cpu || in_enqueue || in_dispatch || + if (!(in_unlocked || in_init || in_select_cpu || in_enqueue || in_dispatch || in_cpu_release || in_idle || in_any)) return 0; @@ -9963,8 +10689,24 @@ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id) /* * Non-SCX struct_ops: SCX kfuncs are not permitted. + * + * Both bpf_sched_ext_ops (cpu-form) and bpf_sched_ext_ops_cid + * (cid-form) are valid SCX struct_ops. Member offsets match between + * the two (verified by BUILD_BUG_ON in scx_init()), so the shared + * scx_kf_allow_flags[] table indexed by SCX_MOFF_IDX(moff) applies to + * both. + */ + if (prog->aux->st_ops != &bpf_sched_ext_ops && + prog->aux->st_ops != &bpf_sched_ext_ops_cid) + return -EACCES; + + /* + * cid-form schedulers must use cid/cmask kfuncs. cid and cpu are both + * small s32s and trivially confused, so cpu-only kfuncs are rejected at + * load time. The reverse (cpu-form calling cid-form kfuncs) is + * intentionally permissive to ease gradual cpumask -> cid migration. */ - if (prog->aux->st_ops != &bpf_sched_ext_ops) + if (prog->aux->st_ops == &bpf_sched_ext_ops_cid && in_cpu_only) return -EACCES; /* SCX struct_ops: check the per-op allow list. */ @@ -9976,6 +10718,8 @@ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id) if ((flags & SCX_KF_ALLOW_UNLOCKED) && in_unlocked) return 0; + if ((flags & SCX_KF_ALLOW_INIT) && in_init) + return 0; if ((flags & SCX_KF_ALLOW_CPU_RELEASE) && in_cpu_release) return 0; if ((flags & SCX_KF_ALLOW_DISPATCH) && in_dispatch) @@ -9993,6 +10737,73 @@ static int __init scx_init(void) int ret; /* + * sched_ext_ops_cid mirrors sched_ext_ops up to and including @priv. + * Both bpf_scx_init_member() and bpf_scx_check_member() use offsets + * from struct sched_ext_ops; sched_ext_ops_cid relies on those offsets + * matching for the shared fields. Catch any drift at build time. 
+ */ +#define CID_OFFSET_MATCH(cpu_field, cid_field) \ + BUILD_BUG_ON(offsetof(struct sched_ext_ops, cpu_field) != \ + offsetof(struct sched_ext_ops_cid, cid_field)) + /* data fields used by bpf_scx_init_member() */ + CID_OFFSET_MATCH(dispatch_max_batch, dispatch_max_batch); + CID_OFFSET_MATCH(flags, flags); + CID_OFFSET_MATCH(name, name); + CID_OFFSET_MATCH(timeout_ms, timeout_ms); + CID_OFFSET_MATCH(exit_dump_len, exit_dump_len); + CID_OFFSET_MATCH(hotplug_seq, hotplug_seq); + CID_OFFSET_MATCH(sub_cgroup_id, sub_cgroup_id); + /* shared callbacks: the union view requires byte-for-byte offset match */ + CID_OFFSET_MATCH(enqueue, enqueue); + CID_OFFSET_MATCH(dequeue, dequeue); + CID_OFFSET_MATCH(dispatch, dispatch); + CID_OFFSET_MATCH(tick, tick); + CID_OFFSET_MATCH(runnable, runnable); + CID_OFFSET_MATCH(running, running); + CID_OFFSET_MATCH(stopping, stopping); + CID_OFFSET_MATCH(quiescent, quiescent); + CID_OFFSET_MATCH(yield, yield); + CID_OFFSET_MATCH(core_sched_before, core_sched_before); + CID_OFFSET_MATCH(set_weight, set_weight); + CID_OFFSET_MATCH(update_idle, update_idle); + CID_OFFSET_MATCH(init_task, init_task); + CID_OFFSET_MATCH(exit_task, exit_task); + CID_OFFSET_MATCH(enable, enable); + CID_OFFSET_MATCH(disable, disable); + CID_OFFSET_MATCH(dump, dump); + CID_OFFSET_MATCH(dump_task, dump_task); + CID_OFFSET_MATCH(sub_attach, sub_attach); + CID_OFFSET_MATCH(sub_detach, sub_detach); + CID_OFFSET_MATCH(init, init); + CID_OFFSET_MATCH(exit, exit); +#ifdef CONFIG_EXT_GROUP_SCHED + CID_OFFSET_MATCH(cgroup_init, cgroup_init); + CID_OFFSET_MATCH(cgroup_exit, cgroup_exit); + CID_OFFSET_MATCH(cgroup_prep_move, cgroup_prep_move); + CID_OFFSET_MATCH(cgroup_move, cgroup_move); + CID_OFFSET_MATCH(cgroup_cancel_move, cgroup_cancel_move); + CID_OFFSET_MATCH(cgroup_set_weight, cgroup_set_weight); + CID_OFFSET_MATCH(cgroup_set_bandwidth, cgroup_set_bandwidth); + CID_OFFSET_MATCH(cgroup_set_idle, cgroup_set_idle); +#endif + /* renamed callbacks must occupy the same 
slot as their cpu-form sibling */ + CID_OFFSET_MATCH(select_cpu, select_cid); + CID_OFFSET_MATCH(set_cpumask, set_cmask); + CID_OFFSET_MATCH(cpu_online, cid_online); + CID_OFFSET_MATCH(cpu_offline, cid_offline); + CID_OFFSET_MATCH(dump_cpu, dump_cid); + /* @priv tail must align since both share the same data block */ + CID_OFFSET_MATCH(priv, priv); + /* + * cid-form must end exactly at @priv - validate_ops() skips + * cpu_acquire/cpu_release for cid-form because reading those fields + * past the BPF allocation would be UB. + */ + BUILD_BUG_ON(offsetof(struct sched_ext_ops_cid, __end) != + offsetofend(struct sched_ext_ops, priv)); +#undef CID_OFFSET_MATCH + + /* * kfunc registration can't be done from init_sched_ext_class() as * register_btf_kfunc_id_set() needs most of the system to be up. * @@ -10030,12 +10841,24 @@ static int __init scx_init(void) return ret; } + ret = scx_cid_kfunc_init(); + if (ret) { + pr_err("sched_ext: Failed to register cid kfuncs (%d)\n", ret); + return ret; + } + ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops); if (ret) { pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret); return ret; } + ret = register_bpf_struct_ops(&bpf_sched_ext_ops_cid, sched_ext_ops_cid); + if (ret) { + pr_err("sched_ext: Failed to register cid struct_ops (%d)\n", ret); + return ret; + } + ret = register_pm_notifier(&scx_pm_notifier); if (ret) { pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret); diff --git a/kernel/sched/ext_arena.c b/kernel/sched/ext_arena.c new file mode 100644 index 000000000000..493c2424f842 --- /dev/null +++ b/kernel/sched/ext_arena.c @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * scx_arena_pool: kernel-side sub-allocator over BPF-arena pages. + * + * Each chunk added to @sch->arena_pool comes from one + * bpf_arena_alloc_pages_sleepable() call and is registered at the + * kernel-side mapping address. 
Callers translate to the BPF-arena form + * themselves if needed. + * + * Allocations grow the pool on demand. Underlying arena pages are released + * when the arena map itself is torn down. + * + * Copyright (c) 2026 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2026 Tejun Heo <tj@kernel.org> + */ + +enum scx_arena_consts { + SCX_ARENA_MIN_ORDER = 3, /* 8-byte minimum sub-allocation */ + SCX_ARENA_GROW_PAGES = 4, /* per growth */ +}; + +s32 scx_arena_pool_init(struct scx_sched *sch) +{ + if (!sch->arena_map) + return 0; + + sch->arena_pool = gen_pool_create(SCX_ARENA_MIN_ORDER, NUMA_NO_NODE); + if (!sch->arena_pool) + return -ENOMEM; + return 0; +} + +static void scx_arena_clear_chunk(struct gen_pool *pool, struct gen_pool_chunk *chunk, + void *data) +{ + int order = pool->min_alloc_order; + size_t chunk_sz = chunk->end_addr - chunk->start_addr + 1; + unsigned long end_bit = chunk_sz >> order; + unsigned long b, e; + + for_each_set_bitrange(b, e, chunk->bits, end_bit) + gen_pool_free(pool, chunk->start_addr + (b << order), + (e - b) << order); +} + +/* + * Tear down the pool. Outstanding gen_pool allocations are freed via + * scx_arena_clear_chunk() so gen_pool_destroy() doesn't BUG. The underlying + * arena pages are released when the arena map itself is torn down. + */ +void scx_arena_pool_destroy(struct scx_sched *sch) +{ + if (!sch->arena_pool) + return; + gen_pool_for_each_chunk(sch->arena_pool, scx_arena_clear_chunk, NULL); + gen_pool_destroy(sch->arena_pool); + sch->arena_pool = NULL; +} + +/* + * Grow the pool by @page_cnt pages. bpf_arena_alloc_pages_sleepable() and + * gen_pool_add() (which calls vzalloc(GFP_KERNEL)) require a sleepable + * context. 
+ */ +static int scx_arena_grow(struct scx_sched *sch, u32 page_cnt) +{ + u64 kern_vm_start; + u32 uaddr32; + void *p; + int ret; + + if (!sch->arena_map || !sch->arena_pool) + return -EINVAL; + + p = bpf_arena_alloc_pages_sleepable(sch->arena_map, NULL, + page_cnt, NUMA_NO_NODE, 0); + if (!p) + return -ENOMEM; + + uaddr32 = (u32)(unsigned long)p; + /* arena.o, which defines these, is built only on MMU && 64BIT */ +#if defined(CONFIG_MMU) && defined(CONFIG_64BIT) + kern_vm_start = bpf_arena_map_kern_vm_start(sch->arena_map); +#else + kern_vm_start = 0; +#endif + + ret = gen_pool_add(sch->arena_pool, kern_vm_start + uaddr32, + page_cnt * PAGE_SIZE, NUMA_NO_NODE); + if (ret) { + bpf_arena_free_pages_non_sleepable(sch->arena_map, p, page_cnt); + return ret; + } + return 0; +} + +/* + * Allocate @size bytes from the arena pool. Returns kernel VA on success, NULL + * on failure. May grow the pool via scx_arena_grow() which sleeps. Caller must + * be in a GFP_KERNEL context. + */ +void *scx_arena_alloc(struct scx_sched *sch, size_t size) +{ + unsigned long kern_va; + u32 page_cnt; + + might_sleep(); + + if (!sch->arena_pool) + return NULL; + + while (true) { + kern_va = gen_pool_alloc(sch->arena_pool, size); + if (kern_va) + break; + page_cnt = max_t(u32, SCX_ARENA_GROW_PAGES, + (size + PAGE_SIZE - 1) >> PAGE_SHIFT); + if (scx_arena_grow(sch, page_cnt)) + return NULL; + } + + return (void *)kern_va; +} + +void scx_arena_free(struct scx_sched *sch, void *kern_va, size_t size) +{ + if (sch->arena_pool && kern_va) + gen_pool_free(sch->arena_pool, (unsigned long)kern_va, size); +} diff --git a/kernel/sched/ext_arena.h b/kernel/sched/ext_arena.h new file mode 100644 index 000000000000..4f3610160102 --- /dev/null +++ b/kernel/sched/ext_arena.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2025 Tejun Heo <tj@kernel.org> + */ +#ifndef _KERNEL_SCHED_EXT_ARENA_H +#define _KERNEL_SCHED_EXT_ARENA_H + +struct scx_sched; + +s32 scx_arena_pool_init(struct scx_sched *sch); +void scx_arena_pool_destroy(struct scx_sched *sch); +void *scx_arena_alloc(struct scx_sched *sch, size_t size); +void scx_arena_free(struct scx_sched *sch, void *kern_va, size_t size); + +#endif /* _KERNEL_SCHED_EXT_ARENA_H */ diff --git a/kernel/sched/ext_cid.c b/kernel/sched/ext_cid.c new file mode 100644 index 000000000000..66944a7ef79d --- /dev/null +++ b/kernel/sched/ext_cid.c @@ -0,0 +1,707 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2026 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2026 Tejun Heo <tj@kernel.org> + */ +#include <linux/cacheinfo.h> + +/* + * cid tables. + * + * Pointers are published once on first enable and never revoked. The default + * mapping is populated before ops.init() runs; scx_bpf_cid_override() commits + * before it returns. As long as the BPF scheduler only uses the tables from + * those points onward, it sees a consistent view. + */ +s16 *scx_cid_to_cpu_tbl; +s16 *scx_cpu_to_cid_tbl; +struct scx_cid_topo *scx_cid_topo; + +#define SCX_CID_TOPO_NEG (struct scx_cid_topo) { \ + .core_cid = -1, .core_idx = -1, .llc_cid = -1, .llc_idx = -1, \ + .node_cid = -1, .node_idx = -1, \ +} + +/* + * Return @cpu's LLC shared_cpu_map. If cacheinfo isn't populated (offline or + * !present), record @cpu in @fallbacks and return its node mask instead - the + * worst that can happen is that the cpu's LLC becomes coarser than reality. 
+ */ +static const struct cpumask *cpu_llc_mask(int cpu, struct cpumask *fallbacks) +{ + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); + + if (!ci || !ci->info_list || !ci->num_leaves) { + cpumask_set_cpu(cpu, fallbacks); + return cpumask_of_node(cpu_to_node(cpu)); + } + return &ci->info_list[ci->num_leaves - 1].shared_cpu_map; +} + +/* Allocate the cid tables once on first enable; never freed. */ +static s32 scx_cid_arrays_alloc(void) +{ + u32 npossible = num_possible_cpus(); + s16 *cid_to_cpu, *cpu_to_cid; + struct scx_cid_topo *cid_topo; + + if (scx_cid_to_cpu_tbl) + return 0; + + cid_to_cpu = kzalloc_objs(*scx_cid_to_cpu_tbl, npossible, GFP_KERNEL); + cpu_to_cid = kzalloc_objs(*scx_cpu_to_cid_tbl, nr_cpu_ids, GFP_KERNEL); + cid_topo = kmalloc_objs(*scx_cid_topo, npossible, GFP_KERNEL); + + if (!cid_to_cpu || !cpu_to_cid || !cid_topo) { + kfree(cid_to_cpu); + kfree(cpu_to_cid); + kfree(cid_topo); + return -ENOMEM; + } + + WRITE_ONCE(scx_cid_to_cpu_tbl, cid_to_cpu); + WRITE_ONCE(scx_cpu_to_cid_tbl, cpu_to_cid); + WRITE_ONCE(scx_cid_topo, cid_topo); + return 0; +} + +/** + * scx_cid_init - build the cid mapping + * @sch: the scx_sched being initialized; used as the scx_error() target + * + * See "Topological CPU IDs" in ext_cid.h for the model. Walk online cpus by + * intersection at each level (parent_scratch & this_level_mask), which keeps + * containment correct by construction and naturally splits a physical LLC + * straddling two NUMA nodes into two LLC units. The caller must hold + * cpus_read_lock. 
+ */ +s32 scx_cid_init(struct scx_sched *sch) +{ + cpumask_var_t to_walk __free(free_cpumask_var) = CPUMASK_VAR_NULL; + cpumask_var_t node_scratch __free(free_cpumask_var) = CPUMASK_VAR_NULL; + cpumask_var_t llc_scratch __free(free_cpumask_var) = CPUMASK_VAR_NULL; + cpumask_var_t core_scratch __free(free_cpumask_var) = CPUMASK_VAR_NULL; + cpumask_var_t llc_fallback __free(free_cpumask_var) = CPUMASK_VAR_NULL; + cpumask_var_t online_no_topo __free(free_cpumask_var) = CPUMASK_VAR_NULL; + u32 next_cid = 0; + s32 next_node_idx = 0, next_llc_idx = 0, next_core_idx = 0; + s32 cpu, ret; + + /* CMASK_MAX_WORDS in cid.bpf.h covers NR_CPUS up to 8192 */ + BUILD_BUG_ON(NR_CPUS > 8192); + + lockdep_assert_cpus_held(); + + ret = scx_cid_arrays_alloc(); + if (ret) + return ret; + + if (!zalloc_cpumask_var(&to_walk, GFP_KERNEL) || + !zalloc_cpumask_var(&node_scratch, GFP_KERNEL) || + !zalloc_cpumask_var(&llc_scratch, GFP_KERNEL) || + !zalloc_cpumask_var(&core_scratch, GFP_KERNEL) || + !zalloc_cpumask_var(&llc_fallback, GFP_KERNEL) || + !zalloc_cpumask_var(&online_no_topo, GFP_KERNEL)) + return -ENOMEM; + + /* -1 sentinels for sparse-possible cpu id holes (0 is a valid cid) */ + for (cpu = 0; cpu < nr_cpu_ids; cpu++) + scx_cpu_to_cid_tbl[cpu] = -1; + + cpumask_copy(to_walk, cpu_online_mask); + + while (!cpumask_empty(to_walk)) { + s32 next_cpu = cpumask_first(to_walk); + s32 nid = cpu_to_node(next_cpu); + s32 node_cid = next_cid; + s32 node_idx; + + /* + * No NUMA info: skip and let the tail loop assign a no-topo + * cid. cpumask_of_node(-1) is undefined. 
+ */ + if (nid < 0) { + cpumask_clear_cpu(next_cpu, to_walk); + continue; + } + + node_idx = next_node_idx++; + + /* node_scratch = to_walk & this node */ + cpumask_and(node_scratch, to_walk, cpumask_of_node(nid)); + if (WARN_ON_ONCE(!cpumask_test_cpu(next_cpu, node_scratch))) + return -EINVAL; + + while (!cpumask_empty(node_scratch)) { + s32 ncpu = cpumask_first(node_scratch); + const struct cpumask *llc_mask = cpu_llc_mask(ncpu, llc_fallback); + s32 llc_cid = next_cid; + s32 llc_idx = next_llc_idx++; + + /* llc_scratch = node_scratch & this llc */ + cpumask_and(llc_scratch, node_scratch, llc_mask); + if (WARN_ON_ONCE(!cpumask_test_cpu(ncpu, llc_scratch))) + return -EINVAL; + + while (!cpumask_empty(llc_scratch)) { + s32 lcpu = cpumask_first(llc_scratch); + const struct cpumask *sib = topology_sibling_cpumask(lcpu); + s32 core_cid = next_cid; + s32 core_idx = next_core_idx++; + s32 ccpu; + + /* core_scratch = llc_scratch & this core */ + cpumask_and(core_scratch, llc_scratch, sib); + if (WARN_ON_ONCE(!cpumask_test_cpu(lcpu, core_scratch))) + return -EINVAL; + + for_each_cpu(ccpu, core_scratch) { + s32 cid = next_cid++; + + scx_cid_to_cpu_tbl[cid] = ccpu; + scx_cpu_to_cid_tbl[ccpu] = cid; + scx_cid_topo[cid] = (struct scx_cid_topo){ + .core_cid = core_cid, + .core_idx = core_idx, + .llc_cid = llc_cid, + .llc_idx = llc_idx, + .node_cid = node_cid, + .node_idx = node_idx, + }; + + cpumask_clear_cpu(ccpu, llc_scratch); + cpumask_clear_cpu(ccpu, node_scratch); + cpumask_clear_cpu(ccpu, to_walk); + } + } + } + } + + /* + * No-topo section: any possible cpu without a cid - normally just the + * not-online ones. Collect any currently-online cpus that land here in + * @online_no_topo so we can warn about them at the end. 
+ */ + for_each_cpu(cpu, cpu_possible_mask) { + s32 cid; + + if (__scx_cpu_to_cid(cpu) != -1) + continue; + if (cpu_online(cpu)) + cpumask_set_cpu(cpu, online_no_topo); + + cid = next_cid++; + scx_cid_to_cpu_tbl[cid] = cpu; + scx_cpu_to_cid_tbl[cpu] = cid; + scx_cid_topo[cid] = SCX_CID_TOPO_NEG; + } + + if (!cpumask_empty(llc_fallback)) + pr_warn("scx_cid: cpus without cacheinfo, using node mask as llc: %*pbl\n", + cpumask_pr_args(llc_fallback)); + if (!cpumask_empty(online_no_topo)) + pr_warn("scx_cid: online cpus with no usable topology: %*pbl\n", + cpumask_pr_args(online_no_topo)); + + return 0; +} + +/** + * scx_cmask_clear - Zero every bit in @m's active range + * @m: cmask to clear + * + * Storage past the active range is left as is. + */ +void scx_cmask_clear(struct scx_cmask *m) +{ + u32 nr_words; + + if (!m->nr_cids) + return; + nr_words = (m->base + m->nr_cids - 1) / 64 - m->base / 64 + 1; + memset(m->bits, 0, nr_words * sizeof(u64)); +} + +/** + * scx_cmask_fill - Set every bit in @m's active range + * @m: cmask to fill + * + * Counterpart to scx_cmask_clear(). Storage past the active range is left as is. + */ +void scx_cmask_fill(struct scx_cmask *m) +{ + u32 nr_words, head_bits, tail_bits; + + if (!m->nr_cids) + return; + nr_words = (m->base + m->nr_cids - 1) / 64 - m->base / 64 + 1; + memset(m->bits, 0xff, nr_words * sizeof(u64)); + + /* clear word-0 bits below base */ + head_bits = m->base & 63; + if (head_bits) + m->bits[0] &= ~((1ULL << head_bits) - 1); + + /* clear last-word bits at or past base + nr_cids */ + tail_bits = (m->base + m->nr_cids) & 63; + if (tail_bits) + m->bits[nr_words - 1] &= (1ULL << tail_bits) - 1; +} + +/** + * scx_cpumask_to_cmask - Translate a kernel cpumask into a cmask + * @src: source cpumask + * @dst: cmask to write + * + * Clear @dst's active range and set the bit for each cid whose cpu is in + * @src and lies within that range. Out-of-range cids are silently ignored. 
+ */ +void scx_cpumask_to_cmask(const struct cpumask *src, struct scx_cmask *dst) +{ + s32 cpu; + + scx_cmask_clear(dst); + for_each_cpu(cpu, src) { + s32 cid = __scx_cpu_to_cid(cpu); + + if (cid >= 0) + __scx_cmask_set(cid, dst); + } +} + +__bpf_kfunc_start_defs(); + +/** + * scx_bpf_cid_override - Install an explicit cpu->cid mapping + * @cpu_to_cid: array of nr_cpu_ids s32 entries (cid for each cpu) + * @cpu_to_cid__sz: must be nr_cpu_ids * sizeof(s32) bytes + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs + * + * May only be called from ops.init() of the root scheduler. Replace the + * topology-probed cid mapping with the caller-provided one. Each possible cpu + * must map to a unique cid in [0, num_possible_cpus()). Topo info is cleared. + * On invalid input, trigger scx_error() to abort the scheduler. + */ +__bpf_kfunc void scx_bpf_cid_override(const s32 *cpu_to_cid, u32 cpu_to_cid__sz, + const struct bpf_prog_aux *aux) +{ + cpumask_var_t seen __free(free_cpumask_var) = CPUMASK_VAR_NULL; + struct scx_sched *sch; + bool alloced; + s32 cpu, cid; + + /* GFP_KERNEL alloc must happen before the rcu read section */ + alloced = zalloc_cpumask_var(&seen, GFP_KERNEL); + + guard(rcu)(); + + sch = scx_prog_sched(aux); + if (unlikely(!sch)) + return; + + if (!alloced) { + scx_error(sch, "scx_bpf_cid_override: failed to allocate cpumask"); + return; + } + + if (scx_parent(sch)) { + scx_error(sch, "scx_bpf_cid_override() only allowed from root sched"); + return; + } + + if (cpu_to_cid__sz != nr_cpu_ids * sizeof(s32)) { + scx_error(sch, "scx_bpf_cid_override: expected %zu bytes, got %u", + nr_cpu_ids * sizeof(s32), cpu_to_cid__sz); + return; + } + + for_each_possible_cpu(cpu) { + s32 c = cpu_to_cid[cpu]; + + if (!cid_valid(sch, c)) + return; + if (cpumask_test_and_set_cpu(c, seen)) { + scx_error(sch, "cid %d assigned to multiple cpus", c); + return; + } + scx_cpu_to_cid_tbl[cpu] = c; + scx_cid_to_cpu_tbl[c] = cpu; + } + + /* Invalidate stale topo 
info - the override carries no topology. */ + for (cid = 0; cid < num_possible_cpus(); cid++) + scx_cid_topo[cid] = SCX_CID_TOPO_NEG; +} + +/** + * scx_bpf_cid_to_cpu - Return the raw CPU id for @cid + * @cid: cid to look up + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs + * + * Return the raw CPU id for @cid. Trigger scx_error() and return -EINVAL if + * @cid is invalid. The cid<->cpu mapping is static for the lifetime of the + * loaded scheduler, so the BPF side can cache the result to avoid repeated + * kfunc invocations. + */ +__bpf_kfunc s32 scx_bpf_cid_to_cpu(s32 cid, const struct bpf_prog_aux *aux) +{ + struct scx_sched *sch; + + guard(rcu)(); + + sch = scx_prog_sched(aux); + if (unlikely(!sch)) + return -EINVAL; + return scx_cid_to_cpu(sch, cid); +} + +/** + * scx_bpf_cpu_to_cid - Return the cid for @cpu + * @cpu: cpu to look up + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs + * + * Return the cid for @cpu. Trigger scx_error() and return -EINVAL if @cpu is + * invalid. The cid<->cpu mapping is static for the lifetime of the loaded + * scheduler, so the BPF side can cache the result to avoid repeated kfunc + * invocations. + */ +__bpf_kfunc s32 scx_bpf_cpu_to_cid(s32 cpu, const struct bpf_prog_aux *aux) +{ + struct scx_sched *sch; + + guard(rcu)(); + + sch = scx_prog_sched(aux); + if (unlikely(!sch)) + return -EINVAL; + return scx_cpu_to_cid(sch, cpu); +} + +/* + * Set ops on cmasks. cmask_walk_op2() shares one walk across mutating + * (and/or/copy/andnot) and predicate (subset/intersects) two-cmask forms; + * cmask_walk_op1() does the same shape over a single cmask range. Every public + * entry passes a compile-time-constant @op; cmask_walk_op{1,2}() and + * cmask_word_op{1,2}() are __always_inline so the inner switch collapses to the + * selected op and cmask_op2_is_pred() folds the predicate early-exit out of + * mutating ops. 
+ * + * Two-cmask ops only touch @dst bits inside the intersection of the two ranges; + * bits outside stay untouched. In particular, scx_cmask_copy() does NOT zero + * @dst bits that lie outside @src's range. + * + * The _RACY variants are otherwise identical to their non-racy counterpart but + * read @src word-by-word via data_race(). Memory ordering with concurrent + * writers is the caller's responsibility. + */ +enum cmask_op2 { + /* mutating */ + CMASK_OP2_AND, + CMASK_OP2_OR, + CMASK_OP2_OR_RACY, + CMASK_OP2_COPY, + CMASK_OP2_COPY_RACY, + CMASK_OP2_ANDNOT, + /* predicates - short-circuit when the per-word result is true */ + CMASK_OP2_SUBSET, + CMASK_OP2_INTERSECTS, +}; + +static __always_inline bool cmask_op2_is_pred(const enum cmask_op2 op) +{ + return op == CMASK_OP2_SUBSET || op == CMASK_OP2_INTERSECTS; +} + +static __always_inline bool cmask_word_op2(u64 *av, const u64 *bp, u64 mask, + const enum cmask_op2 op) +{ + switch (op) { + case CMASK_OP2_AND: + *av &= ~mask | *bp; + return false; + case CMASK_OP2_OR: + *av |= *bp & mask; + return false; + case CMASK_OP2_OR_RACY: + *av |= data_race(*bp) & mask; + return false; + case CMASK_OP2_COPY: + *av = (*av & ~mask) | (*bp & mask); + return false; + case CMASK_OP2_COPY_RACY: + *av = (*av & ~mask) | (data_race(*bp) & mask); + return false; + case CMASK_OP2_ANDNOT: + *av &= ~(*bp & mask); + return false; + case CMASK_OP2_SUBSET: + /* stop on the first bit in @sub not set in @super */ + return (*bp & ~*av) & mask; + case CMASK_OP2_INTERSECTS: + return (*av & *bp) & mask; + } + unreachable(); +} + +/* + * Walk the intersection of [@a_base, @a_base + @a_nr_cids) with [@b_base, + * @b_base + @b_nr_cids) word by word, applying @op. Mutating ops walk all words + * and return false; predicates return true on the first word whose per-word + * test is true. Empty intersection returns false (matches "no bits to consider" + * for both mutate and predicate). 
+ * + * Base/nr_cids are taken as parameters so callers with snapshotted bounds can + * drive the walk with values independent of the cmask's header. + */ +static __always_inline bool cmask_walk_op2(u64 *a_bits, u32 a_base, u32 a_nr_cids, + const u64 *b_bits, u32 b_base, u32 b_nr_cids, + const enum cmask_op2 op) +{ + u32 lo = max(a_base, b_base); + u32 hi = min(a_base + a_nr_cids, b_base + b_nr_cids); + u32 a_word_off = a_base / 64; + u32 b_word_off = b_base / 64; + u32 lo_word = lo / 64; + u32 hi_word = (hi - 1) / 64; + u64 head_mask = GENMASK_U64(63, lo & 63); + u64 tail_mask = GENMASK_U64((hi - 1) & 63, 0); + u32 w; + + if (lo >= hi) + return false; + + if (lo_word == hi_word) + return cmask_word_op2(&a_bits[lo_word - a_word_off], + &b_bits[lo_word - b_word_off], + head_mask & tail_mask, op); + + if (cmask_word_op2(&a_bits[lo_word - a_word_off], + &b_bits[lo_word - b_word_off], head_mask, op) && + cmask_op2_is_pred(op)) + return true; + + for (w = lo_word + 1; w < hi_word; w++) + if (cmask_word_op2(&a_bits[w - a_word_off], + &b_bits[w - b_word_off], ~0ULL, op) && + cmask_op2_is_pred(op)) + return true; + + return cmask_word_op2(&a_bits[hi_word - a_word_off], + &b_bits[hi_word - b_word_off], tail_mask, op); +} + +enum cmask_op1 { + CMASK_OP1_ANY_SET, +}; + +static __always_inline bool cmask_word_op1(const u64 *ap, u64 mask, + const enum cmask_op1 op) +{ + switch (op) { + case CMASK_OP1_ANY_SET: + return *ap & mask; + } + unreachable(); +} + +/* + * Walk [@a_base, @a_base + @a_nr_cids) of @a_bits word by word, applying @op. + * Returns true on the first word whose per-word test is true; returns false if + * no word matches or the range is empty. All current op1s short-circuit on + * per-word true; if a non-predicate op1 lands here, add a cmask_op1_is_pred() + * guard analogous to cmask_op2_is_pred(). 
+ */ +static __always_inline bool cmask_walk_op1(const u64 *a_bits, u32 a_base, + u32 a_nr_cids, + const enum cmask_op1 op) +{ + u32 lo = a_base; + u32 hi = a_base + a_nr_cids; + u32 a_word_off = a_base / 64; + u32 lo_word = lo / 64; + u32 hi_word = (hi - 1) / 64; + u64 head_mask = GENMASK_U64(63, lo & 63); + u64 tail_mask = GENMASK_U64((hi - 1) & 63, 0); + u32 w; + + if (lo >= hi) + return false; + + if (lo_word == hi_word) + return cmask_word_op1(&a_bits[lo_word - a_word_off], + head_mask & tail_mask, op); + + if (cmask_word_op1(&a_bits[lo_word - a_word_off], head_mask, op)) + return true; + for (w = lo_word + 1; w < hi_word; w++) + if (cmask_word_op1(&a_bits[w - a_word_off], ~0ULL, op)) + return true; + return cmask_word_op1(&a_bits[hi_word - a_word_off], tail_mask, op); +} + +void scx_cmask_and(struct scx_cmask *dst, const struct scx_cmask *src) +{ + cmask_walk_op2(dst->bits, dst->base, dst->nr_cids, + src->bits, src->base, src->nr_cids, CMASK_OP2_AND); +} + +void scx_cmask_or(struct scx_cmask *dst, const struct scx_cmask *src) +{ + cmask_walk_op2(dst->bits, dst->base, dst->nr_cids, + src->bits, src->base, src->nr_cids, CMASK_OP2_OR); +} + +/** + * scx_cmask_or_racy - OR @src into @dst, reading @src without locking + * + * @src is read word-by-word through data_race(). Same per-bit independence + * rationale as scx_cmask_copy_racy(). Memory ordering with writers is the + * caller's responsibility. + */ +void scx_cmask_or_racy(struct scx_cmask *dst, const struct scx_cmask *src) +{ + cmask_walk_op2(dst->bits, dst->base, dst->nr_cids, + src->bits, src->base, src->nr_cids, CMASK_OP2_OR_RACY); +} + +void scx_cmask_copy(struct scx_cmask *dst, const struct scx_cmask *src) +{ + cmask_walk_op2(dst->bits, dst->base, dst->nr_cids, + src->bits, src->base, src->nr_cids, CMASK_OP2_COPY); +} + +/** + * scx_cmask_copy_racy - Snapshot @src into @dst without locking + * + * @src is read word-by-word through data_race(). Head/tail masking matches + * scx_cmask_copy(). 
Each bit in a cmask is independent, so partial updates + * just leave some bits fresher than others. Memory ordering with writers is + * the caller's responsibility. + */ +void scx_cmask_copy_racy(struct scx_cmask *dst, const struct scx_cmask *src) +{ + cmask_walk_op2(dst->bits, dst->base, dst->nr_cids, + src->bits, src->base, src->nr_cids, CMASK_OP2_COPY_RACY); +} + +void scx_cmask_andnot(struct scx_cmask *dst, const struct scx_cmask *src) +{ + cmask_walk_op2(dst->bits, dst->base, dst->nr_cids, + src->bits, src->base, src->nr_cids, CMASK_OP2_ANDNOT); +} + +/* + * Return true if @cm has any bit set in [@lo, @hi). Caller must ensure + * [@lo, @hi) is contained in @cm's range. + */ +static bool cmask_any_set_in_range(const struct scx_cmask *cm, u32 lo, u32 hi) +{ + if (lo >= hi) + return false; + return cmask_walk_op1(&cm->bits[lo / 64 - cm->base / 64], lo, hi - lo, + CMASK_OP1_ANY_SET); +} + +/** + * scx_cmask_subset - test whether @sub is a subset of @super + * @sub: cmask to test + * @super: cmask to test against + * + * Return true iff every set bit of @sub is also set in @super. + */ +bool scx_cmask_subset(const struct scx_cmask *sub, const struct scx_cmask *super) +{ + u32 super_end = super->base + super->nr_cids; + u32 sub_end = sub->base + sub->nr_cids; + + /* + * Set bits in @sub outside @super's range can't be in @super, so any + * such bit means not a subset. The walk below only visits words + * common to both ranges, so these need a separate scan. 
+ */ + if (sub->base < super->base && + cmask_any_set_in_range(sub, sub->base, min(super->base, sub_end))) + return false; + if (sub_end > super_end && + cmask_any_set_in_range(sub, max(sub->base, super_end), sub_end)) + return false; + + return !cmask_walk_op2((u64 *)super->bits, super->base, super->nr_cids, + sub->bits, sub->base, sub->nr_cids, CMASK_OP2_SUBSET); +} + +bool scx_cmask_intersects(const struct scx_cmask *a, const struct scx_cmask *b) +{ + return cmask_walk_op2((u64 *)a->bits, a->base, a->nr_cids, + b->bits, b->base, b->nr_cids, CMASK_OP2_INTERSECTS); +} + +/** + * scx_cmask_empty - Test whether @m has no bits set + * @m: cmask to test + * + * Return true iff @m's active range has no bits set. + */ +bool scx_cmask_empty(const struct scx_cmask *m) +{ + return !cmask_any_set_in_range(m, m->base, m->base + m->nr_cids); +} + +/** + * scx_bpf_cid_topo - Copy out per-cid topology info + * @cid: cid to look up + * @out__uninit: where to copy the topology info; fully written by this call + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs + * + * Fill @out__uninit with the topology info for @cid. Trigger scx_error() if + * @cid is out of range. If @cid is valid but in the no-topo section, all fields + * are set to -1. 
+ */ +__bpf_kfunc void scx_bpf_cid_topo(s32 cid, struct scx_cid_topo *out__uninit, + const struct bpf_prog_aux *aux) +{ + struct scx_sched *sch; + + guard(rcu)(); + + sch = scx_prog_sched(aux); + if (unlikely(!sch) || !cid_valid(sch, cid)) { + *out__uninit = SCX_CID_TOPO_NEG; + return; + } + + *out__uninit = READ_ONCE(scx_cid_topo)[cid]; +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_init) +BTF_ID_FLAGS(func, scx_bpf_cid_override, KF_IMPLICIT_ARGS | KF_SLEEPABLE) +BTF_KFUNCS_END(scx_kfunc_ids_init) + +static const struct btf_kfunc_id_set scx_kfunc_set_init = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_init, + .filter = scx_kfunc_context_filter, +}; + +BTF_KFUNCS_START(scx_kfunc_ids_cid) +BTF_ID_FLAGS(func, scx_bpf_cid_to_cpu, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cpu_to_cid, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cid_topo, KF_IMPLICIT_ARGS) +BTF_KFUNCS_END(scx_kfunc_ids_cid) + +static const struct btf_kfunc_id_set scx_kfunc_set_cid = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_cid, +}; + +int scx_cid_kfunc_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_init) ?: + register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_cid) ?: + register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_cid) ?: + register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_cid); +} diff --git a/kernel/sched/ext_cid.h b/kernel/sched/ext_cid.h new file mode 100644 index 000000000000..5745e5785e89 --- /dev/null +++ b/kernel/sched/ext_cid.h @@ -0,0 +1,271 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Topological CPU IDs (cids) + * -------------------------- + * + * Raw cpu numbers are clumsy for sharding work and communication across + * topology units, especially from BPF: the space can be sparse, numerical + * closeness doesn't imply topological closeness (x86 hyperthreading often puts + * SMT siblings far apart), and a range of cpu ids doesn't mean anything. 
+ * Sub-scheds make this acute - cpu allocation, revocation and other state are + * constantly communicated across sub-scheds, and passing whole cpumasks scales + * poorly with cpu count. cpumasks are also awkward in BPF: a variable-length + * kernel type sized for the maximum NR_CPUS (4k), with verbose helper sequences + * for every op. + * + * cids give every cpu a dense, topology-ordered id. CPUs sharing a core, LLC or + * NUMA node get contiguous cid ranges, so a topology unit becomes a (start, + * length) slice of cid space. Communication can pass a slice instead of a + * cpumask, and BPF code can process, for example, a u64 word's worth of cids at + * a time. + * + * The mapping is built once at root scheduler enable time by walking the + * topology of online cpus only. Going by online cpus is out of necessity: + * depending on the arch, topology info isn't reliably available for offline + * cpus. The expected usage model is restarting the scheduler on hotplug events + * so the mapping is rebuilt against the new online set. A scheduler that wants + * to handle hotplug without a restart can provide its own cid and shard mapping + * through the override interface. + * + * Copyright (c) 2026 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2026 Tejun Heo <tj@kernel.org> + */ +#ifndef _KERNEL_SCHED_EXT_CID_H +#define _KERNEL_SCHED_EXT_CID_H + +struct scx_sched; + +/* + * Cid space (total is always num_possible_cpus()) is laid out with + * topology-annotated cids first, then no-topo cids at the tail. The + * topology-annotated block covers the cpus that were online when scx_cid_init() + * ran and remains valid even after those cpus go offline. The tail block covers + * possible-but-not-online cpus and carries all-(-1) topo info (see + * scx_cid_topo); callers detect it via the -1 sentinels. + * + * See the comment above the table definitions in ext_cid.c for the + * memory-ordering and visibility contract. 
+ */ +extern s16 *scx_cid_to_cpu_tbl; +extern s16 *scx_cpu_to_cid_tbl; +extern struct scx_cid_topo *scx_cid_topo; +extern struct btf_id_set8 scx_kfunc_ids_init; + +void scx_cmask_clear(struct scx_cmask *m); +void scx_cmask_fill(struct scx_cmask *m); +void scx_cmask_and(struct scx_cmask *dst, const struct scx_cmask *src); +void scx_cmask_or(struct scx_cmask *dst, const struct scx_cmask *src); +void scx_cmask_or_racy(struct scx_cmask *dst, const struct scx_cmask *src); +void scx_cmask_copy(struct scx_cmask *dst, const struct scx_cmask *src); +void scx_cmask_copy_racy(struct scx_cmask *dst, const struct scx_cmask *src); +void scx_cmask_andnot(struct scx_cmask *dst, const struct scx_cmask *src); +bool scx_cmask_subset(const struct scx_cmask *sub, const struct scx_cmask *super); +bool scx_cmask_intersects(const struct scx_cmask *a, const struct scx_cmask *b); +bool scx_cmask_empty(const struct scx_cmask *m); +s32 scx_cid_init(struct scx_sched *sch); +int scx_cid_kfunc_init(void); +void scx_cpumask_to_cmask(const struct cpumask *src, struct scx_cmask *dst); + +/** + * cid_valid - Verify a cid value, to be used on ops input args + * @sch: scx_sched to abort on error + * @cid: cid which came from a BPF ops + * + * Return true if @cid is in [0, num_possible_cpus()). On failure, trigger + * scx_error() and return false. + */ +static inline bool cid_valid(struct scx_sched *sch, s32 cid) +{ + if (likely(cid >= 0 && cid < num_possible_cpus())) + return true; + scx_error(sch, "invalid cid %d", cid); + return false; +} + +/** + * __scx_cid_to_cpu - Unchecked cid->cpu table lookup + * @cid: cid to look up. Must be in [0, num_possible_cpus()). + * + * Intended for callsites that have already validated @cid and that hold a + * non-NULL @sch from scx_prog_sched() - a live sched implies the table has + * been allocated, so no NULL check is needed here. 
+ */ +static inline s32 __scx_cid_to_cpu(s32 cid) +{ + /* READ_ONCE pairs with WRITE_ONCE in scx_cid_arrays_alloc() */ + return READ_ONCE(scx_cid_to_cpu_tbl)[cid]; +} + +/** + * __scx_cpu_to_cid - Unchecked cpu->cid table lookup + * @cpu: cpu to look up. Must be a valid possible cpu id. + * + * Same usage constraints as __scx_cid_to_cpu(). + */ +static inline s32 __scx_cpu_to_cid(s32 cpu) +{ + return READ_ONCE(scx_cpu_to_cid_tbl)[cpu]; +} + +/** + * scx_cid_to_cpu - Translate @cid to its cpu + * @sch: scx_sched for error reporting + * @cid: cid to look up + * + * Return the cpu for @cid or a negative errno on failure. Invalid cid triggers + * scx_error() on @sch. The cid arrays are allocated on first scheduler enable + * and never freed, so the returned cpu is stable for the lifetime of the loaded + * scheduler. + */ +static inline s32 scx_cid_to_cpu(struct scx_sched *sch, s32 cid) +{ + if (!cid_valid(sch, cid)) + return -EINVAL; + return __scx_cid_to_cpu(cid); +} + +/** + * scx_cpu_to_cid - Translate @cpu to its cid + * @sch: scx_sched for error reporting + * @cpu: cpu to look up + * + * Return the cid for @cpu or a negative errno on failure. Invalid cpu triggers + * scx_error() on @sch. Same lifetime guarantee as scx_cid_to_cpu(). + */ +static inline s32 scx_cpu_to_cid(struct scx_sched *sch, s32 cpu) +{ + if (!scx_cpu_valid(sch, cpu, NULL)) + return -EINVAL; + return __scx_cpu_to_cid(cpu); +} + +/** + * scx_is_cid_type - Test whether the active scheduler hierarchy is cid-form + */ +static inline bool scx_is_cid_type(void) +{ + return static_branch_unlikely(&__scx_is_cid_type); +} + +static inline bool __scx_cmask_contains(u32 cid, const struct scx_cmask *m) +{ + return likely(cid >= m->base && cid < m->base + m->nr_cids); +} + +/* Word in bits[] covering @cid. @cid must satisfy __scx_cmask_contains(). 
*/ +static inline u64 *__scx_cmask_word(u32 cid, const struct scx_cmask *m) +{ + return (u64 *)&m->bits[cid / 64 - m->base / 64]; +} + +/** + * __scx_cmask_init - Initialize @m with explicit storage capacity + * @m: cmask to initialize + * @base: first cid of the active range + * @nr_cids: number of cids in the active range + * @alloc_cids: storage capacity in cids, at least @nr_cids + * + * Use when storage is sized larger than the initial active range. All of + * bits[] is zeroed. + */ +static inline void __scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids, + u32 alloc_cids) +{ + if (WARN_ON_ONCE(alloc_cids < nr_cids)) + nr_cids = alloc_cids; + + m->base = base; + m->nr_cids = nr_cids; + m->alloc_words = SCX_CMASK_NR_WORDS(alloc_cids); + memset(m->bits, 0, m->alloc_words * sizeof(u64)); +} + +/** + * scx_cmask_init - Initialize @m on tight storage + * @m: cmask to initialize + * @base: first cid of the active range + * @nr_cids: number of cids in the active range + * + * All of bits[] is zeroed. + */ +static inline void scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids) +{ + __scx_cmask_init(m, base, nr_cids, nr_cids); +} + +/** + * scx_cmask_reframe - Reshape @m's active range without resizing storage + * @m: cmask to reframe + * @base: new active range base + * @nr_cids: new active range length, must fit within @m->alloc_words + * + * Body bits within the new range become garbage - only the head and tail + * words are zeroed to keep the padding invariant. 
+ */ +static inline void scx_cmask_reframe(struct scx_cmask *m, u32 base, u32 nr_cids) +{ + if (WARN_ON_ONCE(SCX_CMASK_NR_WORDS(nr_cids) > m->alloc_words)) + return; + + if (nr_cids) { + u32 last_word = ((base & 63) + nr_cids - 1) / 64; + + m->bits[0] = 0; + m->bits[last_word] = 0; + } + + m->base = base; + m->nr_cids = nr_cids; +} + +static inline void __scx_cmask_set(u32 cid, struct scx_cmask *m) +{ + if (!__scx_cmask_contains(cid, m)) + return; + *__scx_cmask_word(cid, m) |= BIT_U64(cid & 63); +} + +/** + * scx_cmask_test - test whether @cid is set in @m + * @cid: cid to test + * @m: cmask to test + * + * Return %false if @cid is outside @m's active range. Otherwise return the + * bit's value. Read via READ_ONCE so callers can race set/clear writers. + */ +static inline bool scx_cmask_test(u32 cid, const struct scx_cmask *m) +{ + if (!__scx_cmask_contains(cid, m)) + return false; + return READ_ONCE(*__scx_cmask_word(cid, m)) & BIT_U64(cid & 63); +} + +/* + * Words of bits[] the active range spans, 0 if empty. Tighter than the storage + * SCX_CMASK_NR_WORDS() sizes for the worst-case base alignment. + */ +static inline u32 scx_cmask_nr_used_words(const struct scx_cmask *m) +{ + if (!m->nr_cids) + return 0; + return ((m->base & 63) + m->nr_cids - 1) / 64 + 1; +} + +/** + * scx_cmask_for_each_cid - iterate set cids in @m + * @cid: s32 loop var that receives each set cid in turn + * @m: cmask to iterate + * + * Visits set bits within @m's active range in ascending order. Scans only the + * words the active range spans, where head and tail padding is kept zero, so + * no per-cid range check is needed. 
+ */ +#define scx_cmask_for_each_cid(cid, m) \ + for (u64 __bs = (m)->base & ~63u, __wi = 0, \ + __nw = scx_cmask_nr_used_words(m); \ + __wi < __nw; __wi++) \ + for (u64 __w = READ_ONCE((m)->bits[__wi]); \ + __w && ((cid) = __bs + __wi * 64 + __ffs64(__w), true); \ + __w &= __w - 1) + +#endif /* _KERNEL_SCHED_EXT_CID_H */ diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 9f5ad6b071f9..2077373d8da3 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -9,7 +9,6 @@ * Copyright (c) 2022 David Vernet <dvernet@meta.com> * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com> */ -#include "ext_idle.h" /* Enable/disable built-in idle CPU selection policy */ static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); @@ -783,7 +782,7 @@ void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) */ if (SCX_HAS_OP(sch, update_idle) && do_notify && !scx_bypassing(sch, cpu_of(rq))) - SCX_CALL_OP(sch, update_idle, rq, cpu_of(rq), idle); + SCX_CALL_OP(sch, update_idle, rq, scx_cpu_arg(cpu_of(rq)), idle); } static void reset_idle_masks(struct sched_ext_ops *ops) @@ -911,7 +910,7 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, bool we_locked = false; s32 cpu; - if (!ops_cpu_valid(sch, prev_cpu, NULL)) + if (!scx_cpu_valid(sch, prev_cpu, NULL)) return -EINVAL; if (!check_builtin_idle_enabled(sch)) @@ -984,7 +983,7 @@ __bpf_kfunc s32 scx_bpf_cpu_node(s32 cpu, const struct bpf_prog_aux *aux) guard(rcu)(); sch = scx_prog_sched(aux); - if (unlikely(!sch) || !ops_cpu_valid(sch, cpu, NULL)) + if (unlikely(!sch) || !scx_cpu_valid(sch, cpu, NULL)) return NUMA_NO_NODE; return cpu_to_node(cpu); } @@ -1266,7 +1265,7 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu, const struct bpf_prog_ if (!check_builtin_idle_enabled(sch)) return false; - if (!ops_cpu_valid(sch, cpu, NULL)) + if (!scx_cpu_valid(sch, cpu, NULL)) return false; return scx_idle_test_and_clear_cpu(cpu); @@ -1504,13 +1503,9 @@ static const struct 
btf_kfunc_id_set scx_kfunc_set_select_cpu = { int scx_idle_init(void) { - int ret; - - ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) || - register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) || - register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle) || - register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) || - register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_select_cpu); - - return ret; + return register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) ?: + register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) ?: + register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle) ?: + register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) ?: + register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_select_cpu); } diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index a075732d4430..b04701190b23 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -8,35 +8,6 @@ #define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) #define SCX_MOFF_IDX(moff) ((moff) / sizeof(void (*)(void))) -enum scx_consts { - SCX_DSP_DFL_MAX_BATCH = 32, - SCX_DSP_MAX_LOOPS = 32, - SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, - - SCX_EXIT_BT_LEN = 64, - SCX_EXIT_MSG_LEN = 1024, - SCX_EXIT_DUMP_DFL_LEN = 32768, - - SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, - - /* - * Iterating all tasks may take a while. Periodically drop - * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. 
- */ - SCX_TASK_ITER_BATCH = 32, - - SCX_BYPASS_HOST_NTH = 2, - - SCX_BYPASS_LB_DFL_INTV_US = 500 * USEC_PER_MSEC, - SCX_BYPASS_LB_DONOR_PCT = 125, - SCX_BYPASS_LB_MIN_DELTA_DIV = 4, - SCX_BYPASS_LB_BATCH = 256, - - SCX_REENQ_LOCAL_MAX_REPEAT = 256, - - SCX_SUB_MAX_DEPTH = 4, -}; - enum scx_exit_kind { SCX_EXIT_NONE, SCX_EXIT_DONE, @@ -94,6 +65,12 @@ struct scx_exit_info { /* %SCX_EXIT_* - broad category of the exit reason */ enum scx_exit_kind kind; + /* + * CPU that initiated the exit, valid once @kind has been set. + * Negative if the exit path didn't identify a CPU. + */ + s32 exit_cpu; + /* exit code if gracefully exiting */ s64 exit_code; @@ -138,7 +115,8 @@ enum scx_ops_flags { * To mask this problem, by default, unhashed tasks are automatically * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't * depend on pid lookups and wants to handle these tasks directly, the - * following flag can be used. + * following flag can be used. With %SCX_OPS_TID_TO_TASK, + * scx_bpf_tid_to_task() can find exiting tasks reliably. */ SCX_OPS_ENQ_EXITING = 1LLU << 2, @@ -189,6 +167,17 @@ enum scx_ops_flags { */ SCX_OPS_ALWAYS_ENQ_IMMED = 1LLU << 7, + /* + * Maintain a mapping from p->scx.tid to task_struct so the BPF + * scheduler can recover task pointers from stored tids via + * scx_bpf_tid_to_task(). + * + * Only the root scheduler turns this on. A sub-sched may set the flag + * to declare a dependency on the lookup; if the root scheduler hasn't + * enabled it, attaching the sub-sched is rejected. 
+ */ + SCX_OPS_TID_TO_TASK = 1LLU << 8, + SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | SCX_OPS_ENQ_LAST | SCX_OPS_ENQ_EXITING | @@ -196,7 +185,8 @@ enum scx_ops_flags { SCX_OPS_ALLOW_QUEUED_WAKEUP | SCX_OPS_SWITCH_PARTIAL | SCX_OPS_BUILTIN_IDLE_PER_NODE | - SCX_OPS_ALWAYS_ENQ_IMMED, + SCX_OPS_ALWAYS_ENQ_IMMED | + SCX_OPS_TID_TO_TASK, /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */ __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56, @@ -540,28 +530,6 @@ struct sched_ext_ops { void (*update_idle)(s32 cpu, bool idle); /** - * @cpu_acquire: A CPU is becoming available to the BPF scheduler - * @cpu: The CPU being acquired by the BPF scheduler. - * @args: Acquire arguments, see the struct definition. - * - * A CPU that was previously released from the BPF scheduler is now once - * again under its control. - */ - void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); - - /** - * @cpu_release: A CPU is taken away from the BPF scheduler - * @cpu: The CPU being released by the BPF scheduler. - * @args: Release arguments, see the struct definition. - * - * The specified CPU is no longer under the control of the BPF - * scheduler. This could be because it was preempted by a higher - * priority sched_class, though there may be other reasons as well. The - * caller should consult @args->reason to determine the cause. - */ - void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); - - /** * @init_task: Initialize a task to run in a BPF scheduler * @p: task to initialize for BPF scheduling * @args: init arguments, see the struct definition @@ -851,6 +819,128 @@ struct sched_ext_ops { /* internal use only, must be NULL */ void __rcu *priv; + + /* + * Deprecated callbacks. Kept at the end of the struct so the cid-form + * struct (sched_ext_ops_cid) can omit them without affecting the + * shared field offsets. Use SCX_ENQ_IMMED instead. 
Sitting past + * SCX_OPI_END means has_op doesn't cover them, so SCX_HAS_OP() cannot + * be used; callers must test sch->ops.cpu_acquire / cpu_release + * directly. + */ + + /** + * @cpu_acquire: A CPU is becoming available to the BPF scheduler + * @cpu: The CPU being acquired by the BPF scheduler. + * @args: Acquire arguments, see the struct definition. + * + * A CPU that was previously released from the BPF scheduler is now once + * again under its control. Deprecated; use SCX_ENQ_IMMED instead. + */ + void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); + + /** + * @cpu_release: A CPU is taken away from the BPF scheduler + * @cpu: The CPU being released by the BPF scheduler. + * @args: Release arguments, see the struct definition. + * + * The specified CPU is no longer under the control of the BPF + * scheduler. This could be because it was preempted by a higher + * priority sched_class, though there may be other reasons as well. The + * caller should consult @args->reason to determine the cause. + * Deprecated; use SCX_ENQ_IMMED instead. + */ + void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); +}; + +/** + * struct sched_ext_ops_cid - cid-form alternative to struct sched_ext_ops + * + * Mirrors struct sched_ext_ops with cpu/cpumask substituted with cid/cmask + * where applicable. Layout up to and including @priv matches sched_ext_ops + * byte-for-byte (verified by BUILD_BUG_ON checks at scx_init() time) so + * shared field offsets work for both struct types in bpf_scx_init_member() + * and bpf_scx_check_member(). The deprecated cpu_acquire/cpu_release + * callbacks at the tail of sched_ext_ops are omitted here entirely. 
+ * + * Differences from sched_ext_ops: + * - select_cpu -> select_cid (returns cid) + * - dispatch -> dispatch (cpu arg is now cid) + * - update_idle -> update_idle (cpu arg is now cid) + * - set_cpumask -> set_cmask (cmask instead of cpumask) + * - cpu_online -> cid_online + * - cpu_offline -> cid_offline + * - dump_cpu -> dump_cid + * - cpu_acquire/cpu_release -> not present (deprecated in sched_ext_ops) + * + * BPF schedulers using this type cannot call cpu-form scx_bpf_* kfuncs; + * use the cid-form variants instead. Enforced at BPF verifier time via + * scx_kfunc_context_filter() branching on prog->aux->st_ops. + * + * See sched_ext_ops for callback documentation. + */ +struct sched_ext_ops_cid { + s32 (*select_cid)(struct task_struct *p, s32 prev_cid, u64 wake_flags); + void (*enqueue)(struct task_struct *p, u64 enq_flags); + void (*dequeue)(struct task_struct *p, u64 deq_flags); + void (*dispatch)(s32 cid, struct task_struct *prev); + void (*tick)(struct task_struct *p); + void (*runnable)(struct task_struct *p, u64 enq_flags); + void (*running)(struct task_struct *p); + void (*stopping)(struct task_struct *p, bool runnable); + void (*quiescent)(struct task_struct *p, u64 deq_flags); + bool (*yield)(struct task_struct *from, struct task_struct *to); + bool (*core_sched_before)(struct task_struct *a, + struct task_struct *b); + void (*set_weight)(struct task_struct *p, u32 weight); + void (*set_cmask)(struct task_struct *p, + const struct scx_cmask *cmask); + void (*update_idle)(s32 cid, bool idle); + s32 (*init_task)(struct task_struct *p, + struct scx_init_task_args *args); + void (*exit_task)(struct task_struct *p, + struct scx_exit_task_args *args); + void (*enable)(struct task_struct *p); + void (*disable)(struct task_struct *p); + void (*dump)(struct scx_dump_ctx *ctx); + void (*dump_cid)(struct scx_dump_ctx *ctx, s32 cid, bool idle); + void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); +#ifdef CONFIG_EXT_GROUP_SCHED + s32 
(*cgroup_init)(struct cgroup *cgrp, + struct scx_cgroup_init_args *args); + void (*cgroup_exit)(struct cgroup *cgrp); + s32 (*cgroup_prep_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + void (*cgroup_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + void (*cgroup_cancel_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); + void (*cgroup_set_bandwidth)(struct cgroup *cgrp, + u64 period_us, u64 quota_us, u64 burst_us); + void (*cgroup_set_idle)(struct cgroup *cgrp, bool idle); +#endif /* CONFIG_EXT_GROUP_SCHED */ + s32 (*sub_attach)(struct scx_sub_attach_args *args); + void (*sub_detach)(struct scx_sub_detach_args *args); + void (*cid_online)(s32 cid); + void (*cid_offline)(s32 cid); + s32 (*init)(void); + void (*exit)(struct scx_exit_info *info); + + /* Data fields - must match sched_ext_ops layout exactly */ + u32 dispatch_max_batch; + u64 flags; + u32 timeout_ms; + u32 exit_dump_len; + u64 hotplug_seq; + u64 sub_cgroup_id; + char name[SCX_OPS_NAME_LEN]; + + /* internal use only, must be NULL */ + void __rcu *priv; + + /* layout end anchor for the BUILD_BUG_ON in scx_init(); keep last */ + char __end[0]; }; enum scx_opi { @@ -1009,7 +1099,40 @@ struct scx_sched_pnode { }; struct scx_sched { - struct sched_ext_ops ops; + /* + * cpu-form and cid-form ops share field offsets up to .priv (verified + * by BUILD_BUG_ON in scx_init()). The anonymous union lets the kernel + * access either view of the same storage without function-pointer + * casts: use .ops for cpu-form and shared fields, .ops_cid for the + * cid-renamed callbacks (set_cmask, select_cid, cid_online, ...). + */ + union { + struct sched_ext_ops ops; + struct sched_ext_ops_cid ops_cid; + }; + bool is_cid_type; /* true if registered via bpf_sched_ext_ops_cid */ + + /* + * Arena map auto-discovered from member progs at struct_ops attach. 
+ * cid-form schedulers must use exactly one arena across all member + * progs. NULL on cpu-form. + * + * @arena_pool sub-allocates @arena_map. Each gen_pool chunk is added + * at the kernel-side mapping address. @arena_kern_base is the start + * of the arena's kern_vm range. See scx_arena_to_kaddr() and + * scx_kaddr_to_arena(). + */ + struct bpf_map *arena_map; + struct gen_pool *arena_pool; + uintptr_t arena_kern_base; + + /* + * Per-CPU arena cmask used by scx_call_op_set_cpumask() to hand a cmask + * to ops_cid.set_cmask(). The kernel writes through the stored kern_va + * and hands BPF its arena pointer via scx_kaddr_to_arena(). + */ + struct scx_cmask * __percpu *set_cmask_scratch; + DECLARE_BITMAP(has_op, SCX_OPI_END); /* @@ -1083,6 +1206,31 @@ struct scx_sched { struct scx_sched *ancestors[]; }; +/** + * scx_arena_to_kaddr - Translate a BPF-arena pointer to its kernel address + * @sch: scheduler whose arena hosts @bpf_ptr + * @bpf_ptr: BPF-arena pointer, only the low 32 bits are used + * + * The (u32) cast normalizes any input into the arena's 4 GiB kern_vm range, + * which combined with scratch-page fault recovery makes the returned pointer + * safe to dereference up to GUARD_SZ / 2 past the intended object. Accesses + * larger than GUARD_SZ / 2 must be explicitly bounds-checked. 
+ */ +static inline void *scx_arena_to_kaddr(struct scx_sched *sch, const void *bpf_ptr) +{ + return (void *)(sch->arena_kern_base + (u32)(uintptr_t)bpf_ptr); +} + +/** + * scx_kaddr_to_arena - Translate a kernel arena address to its BPF form + * @sch: scheduler whose arena hosts @kaddr + * @kaddr: kernel-side arena address, supplied by trusted kernel code + */ +static inline void *scx_kaddr_to_arena(struct scx_sched *sch, const void *kaddr) +{ + return (void *)((uintptr_t)kaddr - sch->arena_kern_base); +} + enum scx_wake_flags { /* expose select WF_* flags as enums */ SCX_WAKE_FORK = WF_FORK, @@ -1366,8 +1514,30 @@ enum scx_ops_state { extern struct scx_sched __rcu *scx_root; DECLARE_PER_CPU(struct rq *, scx_locked_rq_state); +/* + * True when the currently loaded scheduler hierarchy is cid-form. All scheds + * in a hierarchy share one form, so this single key tells callsites which + * view to use without per-sch dereferences. Use scx_is_cid_type() to test. + */ +DECLARE_STATIC_KEY_FALSE(__scx_is_cid_type); + int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id); +bool scx_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where); + +__printf(5, 0) bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, + s64 exit_code, s32 exit_cpu, const char *fmt, + va_list args); +__printf(5, 6) bool __scx_exit(struct scx_sched *sch, enum scx_exit_kind kind, + s64 exit_code, s32 exit_cpu, const char *fmt, ...); + +#define scx_exit(sch, kind, exit_code, fmt, args...) \ + __scx_exit(sch, kind, exit_code, raw_smp_processor_id(), fmt, ##args) +#define scx_error(sch, fmt, args...) \ + scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args) +#define scx_verror(sch, fmt, args) \ + scx_vexit((sch), SCX_EXIT_ERROR, 0, raw_smp_processor_id(), fmt, args) + /* * Return the rq currently locked from an scx callback, or NULL if no rq is * locked. 
@@ -1476,7 +1646,7 @@ static inline bool scx_task_on_sched(struct scx_sched *sch, return true; } -static struct scx_sched *scx_prog_sched(const struct bpf_prog_aux *aux) +static inline struct scx_sched *scx_prog_sched(const struct bpf_prog_aux *aux) { return rcu_dereference_all(scx_root); } diff --git a/kernel/sched/ext_types.h b/kernel/sched/ext_types.h new file mode 100644 index 000000000000..8b3527e21fca --- /dev/null +++ b/kernel/sched/ext_types.h @@ -0,0 +1,144 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Early sched_ext type definitions. + * + * Copyright (c) 2026 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2026 Tejun Heo <tj@kernel.org> + */ +#ifndef _KERNEL_SCHED_EXT_TYPES_H +#define _KERNEL_SCHED_EXT_TYPES_H + +enum scx_consts { + SCX_DSP_DFL_MAX_BATCH = 32, + SCX_DSP_MAX_LOOPS = 32, + SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, + + /* per-CPU chunk size for p->scx.tid allocation, see scx_alloc_tid() */ + SCX_TID_CHUNK = 1024, + + SCX_EXIT_BT_LEN = 64, + SCX_EXIT_MSG_LEN = 1024, + SCX_EXIT_DUMP_DFL_LEN = 32768, + + SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, + + /* + * Iterating all tasks may take a while. Periodically drop + * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. + */ + SCX_TASK_ITER_BATCH = 32, + + SCX_BYPASS_HOST_NTH = 2, + + SCX_BYPASS_LB_DFL_INTV_US = 500 * USEC_PER_MSEC, + SCX_BYPASS_LB_DONOR_PCT = 125, + SCX_BYPASS_LB_MIN_DELTA_DIV = 4, + SCX_BYPASS_LB_BATCH = 256, + + SCX_REENQ_LOCAL_MAX_REPEAT = 256, + + SCX_SUB_MAX_DEPTH = 4, +}; + +/* + * Per-cid topology info. For each topology level (core, LLC, node), records + * the first cid in the unit and its global index. Global indices are + * consecutive integers assigned in cid-walk order, so e.g. core_idx ranges + * over [0, nr_cores_at_init) with no gaps. No-topo cids have all fields set + * to -1. 
+ * + * @core_cid: first cid of this cid's core (smt-sibling group) + * @core_idx: global index of that core, in [0, nr_cores_at_init) + * @llc_cid: first cid of this cid's LLC + * @llc_idx: global index of that LLC, in [0, nr_llcs_at_init) + * @node_cid: first cid of this cid's NUMA node + * @node_idx: global index of that node, in [0, nr_nodes_at_init) + */ +struct scx_cid_topo { + s32 core_cid; + s32 core_idx; + s32 llc_cid; + s32 llc_idx; + s32 node_cid; + s32 node_idx; +}; + +/* + * cmask: variable-length, base-windowed bitmap over cid space + * ----------------------------------------------------------- + * + * A cmask covers the cid range [base, base + nr_cids). bits[] is aligned to the + * global 64-cid grid: bits[0] spans [base & ~63, (base & ~63) + 64), so the + * first (base & 63) bits of bits[0] are head padding and the trailing bits of + * the last active word past base + nr_cids are tail padding. Both stay zero; + * all mutating helpers preserve that. Words past the last active word are not + * read by any helper and have no constraint. + * + * Grid alignment means two cmasks always address bits[] against the same global + * 64-cid windows, so cross-cmask word ops (AND, OR, ...) reduce to + * + * dst->bits[i] OP= src->bits[i - delta] + * + * with no bit-shifting, regardless of how the two bases relate mod 64. + */ +struct scx_cmask { + u32 base; + u32 nr_cids; + u32 alloc_words; + u64 bits[] __counted_by(alloc_words); +}; + +/* + * Number of u64 words of bits[] storage that covers @nr_cids regardless of base + * alignment. The +1 absorbs up to 63 bits of head padding when base is not + * 64-aligned - always allocating one extra word beats branching on base or + * splitting the compute. The u64 cast keeps the +63 from wrapping when @nr_cids + * is near U32_MAX, so callers bounds-checking the result against @alloc_words + * catch the overflow instead of seeing a small value. 
+ */ +#define SCX_CMASK_NR_WORDS(nr_cids) ((u32)(((u64)(nr_cids) + 63) / 64 + 1)) + +/** + * __SCX_CMASK_DEFINE - Define an on-stack cmask with explicit storage capacity + * @NAME: variable name to define + * @BASE: first cid of the active range + * @NR_CIDS: active range length + * @ALLOC_CIDS: storage capacity in cids, at least @NR_CIDS + * + * @NAME aliases zero-initialized storage with the active range set to + * [BASE, BASE + NR_CIDS). Use scx_cmask_reframe() to reshape later, up to + * @ALLOC_CIDS. + */ +#define __SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS, ALLOC_CIDS) \ + _DEFINE_FLEX(struct scx_cmask, NAME, bits, SCX_CMASK_NR_WORDS(ALLOC_CIDS), \ + = { .base = (BASE), \ + .nr_cids = (NR_CIDS), \ + .alloc_words = SCX_CMASK_NR_WORDS(ALLOC_CIDS) }) + +/** + * SCX_CMASK_DEFINE - Define an on-stack cmask on tight storage + * @NAME: variable name to define + * @BASE: first cid of the active range + * @NR_CIDS: active range length, also storage capacity + * + * @NAME aliases zero-initialized storage with the active range and storage + * both [BASE, BASE + NR_CIDS). + */ +#define SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS) \ + __SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS, NR_CIDS) + +/** + * SCX_CMASK_DEFINE_SHARD - Define an on-stack cmask sized to one shard + * @NAME: variable name to define + * @BASE: first cid of the active range + * @NR_CIDS: active range length, must be <= SCX_CID_SHARD_MAX_CPUS + * + * Storage is fixed at SCX_CID_SHARD_MAX_CPUS, active range framed by + * (BASE, NR_CIDS). Passing NR_CIDS > SCX_CID_SHARD_MAX_CPUS leaves the + * cmask claiming more bits than storage holds and subsequent cmask + * operations will overrun. 
+ */ +#define SCX_CMASK_DEFINE_SHARD(NAME, BASE, NR_CIDS) \ + __SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS, SCX_CID_SHARD_MAX_CPUS) + +#endif /* _KERNEL_SCHED_EXT_TYPES_H */ diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md index 6e282bce453c..0ee5a3d997e5 100644 --- a/tools/sched_ext/README.md +++ b/tools/sched_ext/README.md @@ -168,9 +168,9 @@ well on single-socket systems with a unified L3 cache. Another simple, yet slightly more complex scheduler that provides an example of a basic weighted FIFO queuing policy. It also provides examples of some common -useful BPF features, such as sleepable per-task storage allocation in the -`ops.prep_enable()` callback, and using the `BPF_MAP_TYPE_QUEUE` map type to -enqueue tasks. It also illustrates how core-sched support could be implemented. +useful BPF features, such as arena-backed doubly-linked lists threaded through +per-task context and `bpf_res_spin_lock` for per-queue synchronization. It also +illustrates how core-sched support could be implemented. ## scx_central diff --git a/tools/sched_ext/include/scx/cid.bpf.h b/tools/sched_ext/include/scx/cid.bpf.h new file mode 100644 index 000000000000..9d89bb57e201 --- /dev/null +++ b/tools/sched_ext/include/scx/cid.bpf.h @@ -0,0 +1,678 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF-side helpers for cids and cmasks. See kernel/sched/ext_cid.h for the + * authoritative layout and semantics. The BPF-side helpers use the cmask_* + * naming (no scx_ prefix); cmask is the SCX bitmap type so the prefix is + * redundant in BPF code. Atomics use __sync_val_compare_and_swap and every + * helper is inline (no .c counterpart). + * + * Included by scx/common.bpf.h; don't include directly. + * + * Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2026 Tejun Heo <tj@kernel.org> + */ +#ifndef __SCX_CID_BPF_H +#define __SCX_CID_BPF_H + +#include "bpf_arena_common.bpf.h" + +#ifndef BIT_U64 +#define BIT_U64(nr) (1ULL << (nr)) +#endif +#ifndef GENMASK_U64 +#define GENMASK_U64(h, l) ((~0ULL << (l)) & (~0ULL >> (63 - (h)))) +#endif + +/* + * Storage cap for bounded loops over bits[]. Sized to cover NR_CPUS=8192 with + * one extra word for head-misalignment. Increase if deployment targets larger + * NR_CPUS. + */ +#ifndef CMASK_MAX_WORDS +#define CMASK_MAX_WORDS 129 +#endif + +/* + * Mirrors SCX_CMASK_NR_WORDS in kernel/sched/ext_types.h. The u64 cast keeps + * the +63 from wrapping when @nr_cids is near U32_MAX, so cmask_reframe() + * bounds-checking the result against alloc_words catches the overflow instead + * of seeing a small value. + */ +#define CMASK_NR_WORDS(nr_cids) ((u32)(((u64)(nr_cids) + 63) / 64 + 1)) + +static __always_inline bool __cmask_contains(u32 cid, const struct scx_cmask __arena *m) +{ + return cid >= m->base && cid < m->base + m->nr_cids; +} + +static __always_inline u64 __arena *__cmask_word(u32 cid, const struct scx_cmask __arena *m) +{ + return (u64 __arena *)&m->bits[cid / 64 - m->base / 64]; +} + +/** + * __cmask_init - Initialize @m with explicit storage capacity + * @m: cmask to initialize + * @base: first cid of the active range + * @nr_cids: number of cids in the active range + * @alloc_cids: storage capacity in cids, at least @nr_cids + * + * Use when storage is sized larger than the initial active range. All of + * bits[] is zeroed. 
+ */ +static __always_inline void __cmask_init(struct scx_cmask __arena *m, u32 base, + u32 nr_cids, u32 alloc_cids) +{ + u32 alloc_words, i; + + if (unlikely(nr_cids > alloc_cids)) { + scx_bpf_error("__cmask_init: nr_cids=%u exceeds alloc_cids=%u", + nr_cids, alloc_cids); + return; + } + alloc_words = CMASK_NR_WORDS(alloc_cids); + + m->base = base; + m->nr_cids = nr_cids; + m->alloc_words = alloc_words; + + bpf_for(i, 0, CMASK_MAX_WORDS) { + if (i >= alloc_words) + break; + m->bits[i] = 0; + } +} + +/** + * cmask_init - Initialize @m on tight storage + * @m: cmask to initialize + * @base: first cid of the active range + * @nr_cids: number of cids in the active range + * + * All of bits[] is zeroed. + */ +static __always_inline void cmask_init(struct scx_cmask __arena *m, u32 base, u32 nr_cids) +{ + __cmask_init(m, base, nr_cids, nr_cids); +} + +/** + * cmask_reframe - Reshape @m's active range without resizing storage + * @m: cmask to reframe + * @base: new active range base + * @nr_cids: new active range length, must fit within @m->alloc_words + * + * Body bits within the new range become garbage - only the head and tail + * words are zeroed to keep the padding invariant. + */ +static __always_inline void cmask_reframe(struct scx_cmask __arena *m, u32 base, u32 nr_cids) +{ + if (CMASK_NR_WORDS(nr_cids) > m->alloc_words) { + scx_bpf_error("cmask_reframe: nr_cids=%u exceeds alloc_words=%u", + nr_cids, m->alloc_words); + return; + } + if (nr_cids) { + u32 last_word = ((base & 63) + nr_cids - 1) / 64; + + m->bits[0] = 0; + m->bits[last_word] = 0; + } + m->base = base; + m->nr_cids = nr_cids; +} + +static __always_inline bool cmask_test(u32 cid, const struct scx_cmask __arena *m) +{ + if (!__cmask_contains(cid, m)) + return false; + return *__cmask_word(cid, m) & BIT_U64(cid & 63); +} + +/* + * x86 BPF JIT rejects BPF_OR | BPF_FETCH and BPF_AND | BPF_FETCH on arena + * pointers (see bpf_jit_supports_insn() in arch/x86/net/bpf_jit_comp.c). 
Only + * BPF_CMPXCHG / BPF_XCHG / BPF_ADD with FETCH are allowed. Implement + * test_and_{set,clear} and the atomic set/clear via a cmpxchg loop. + * + * CMASK_CAS_TRIES is sized so exhausting it means seconds of real spinning + * on one word - past any plausible contention. Abort hard. + */ +#define CMASK_CAS_TRIES (1U << 23) + +static __always_inline void cmask_set(u32 cid, struct scx_cmask __arena *m) +{ + u64 __arena *w; + u64 bit, old, new; + u32 i; + + if (!__cmask_contains(cid, m)) + return; + w = __cmask_word(cid, m); + bit = BIT_U64(cid & 63); + bpf_for(i, 0, CMASK_CAS_TRIES) { + old = *w; + if (old & bit) + return; + new = old | bit; + if (__sync_val_compare_and_swap(w, old, new) == old) + return; + } + scx_bpf_error("cmask_set CAS exhausted at cid %u", cid); +} + +static __always_inline void cmask_clear(u32 cid, struct scx_cmask __arena *m) +{ + u64 __arena *w; + u64 bit, old, new; + u32 i; + + if (!__cmask_contains(cid, m)) + return; + w = __cmask_word(cid, m); + bit = BIT_U64(cid & 63); + bpf_for(i, 0, CMASK_CAS_TRIES) { + old = *w; + if (!(old & bit)) + return; + new = old & ~bit; + if (__sync_val_compare_and_swap(w, old, new) == old) + return; + } + scx_bpf_error("cmask_clear CAS exhausted at cid %u", cid); +} + +static __always_inline bool cmask_test_and_set(u32 cid, struct scx_cmask __arena *m) +{ + u64 __arena *w; + u64 bit, old, new; + u32 i; + + if (!__cmask_contains(cid, m)) + return false; + w = __cmask_word(cid, m); + bit = BIT_U64(cid & 63); + bpf_for(i, 0, CMASK_CAS_TRIES) { + old = *w; + if (old & bit) + return true; + new = old | bit; + if (__sync_val_compare_and_swap(w, old, new) == old) + return false; + } + scx_bpf_error("cmask_test_and_set CAS exhausted at cid %u", cid); + return false; +} + +static __always_inline bool cmask_test_and_clear(u32 cid, struct scx_cmask __arena *m) +{ + u64 __arena *w; + u64 bit, old, new; + u32 i; + + if (!__cmask_contains(cid, m)) + return false; + w = __cmask_word(cid, m); + bit = BIT_U64(cid & 63); + 
bpf_for(i, 0, CMASK_CAS_TRIES) { + old = *w; + if (!(old & bit)) + return false; + new = old & ~bit; + if (__sync_val_compare_and_swap(w, old, new) == old) + return true; + } + scx_bpf_error("cmask_test_and_clear CAS exhausted at cid %u", cid); + return false; +} + +static __always_inline void __cmask_set(u32 cid, struct scx_cmask __arena *m) +{ + if (!__cmask_contains(cid, m)) + return; + *__cmask_word(cid, m) |= BIT_U64(cid & 63); +} + +static __always_inline void __cmask_clear(u32 cid, struct scx_cmask __arena *m) +{ + if (!__cmask_contains(cid, m)) + return; + *__cmask_word(cid, m) &= ~BIT_U64(cid & 63); +} + +static __always_inline bool __cmask_test_and_set(u32 cid, struct scx_cmask __arena *m) +{ + u64 bit = BIT_U64(cid & 63); + u64 __arena *w; + u64 prev; + + if (!__cmask_contains(cid, m)) + return false; + w = __cmask_word(cid, m); + prev = *w & bit; + *w |= bit; + return prev; +} + +static __always_inline bool __cmask_test_and_clear(u32 cid, struct scx_cmask __arena *m) +{ + u64 bit = BIT_U64(cid & 63); + u64 __arena *w; + u64 prev; + + if (!__cmask_contains(cid, m)) + return false; + w = __cmask_word(cid, m); + prev = *w & bit; + *w &= ~bit; + return prev; +} + +static __always_inline void cmask_zero(struct scx_cmask __arena *m) +{ + u32 nr_words = CMASK_NR_WORDS(m->nr_cids), i; + + bpf_for(i, 0, CMASK_MAX_WORDS) { + if (i >= nr_words) + break; + m->bits[i] = 0; + } +} + +/* + * BPF_-prefixed to avoid colliding with the kernel's anonymous CMASK_OP_* + * enum in ext_cid.c, which is exported via BTF and reachable through + * vmlinux.h. 
+ */ +enum { + BPF_CMASK_OP_AND, + BPF_CMASK_OP_OR, + BPF_CMASK_OP_COPY, + BPF_CMASK_OP_ANDNOT, +}; + +static __always_inline void cmask_op_word(struct scx_cmask __arena *dst, + const struct scx_cmask __arena *src, + u32 di, u32 si, u64 mask, int op) +{ + u64 dv = dst->bits[di]; + u64 sv = src->bits[si]; + u64 rv; + + if (op == BPF_CMASK_OP_AND) + rv = dv & sv; + else if (op == BPF_CMASK_OP_OR) + rv = dv | sv; + else if (op == BPF_CMASK_OP_ANDNOT) + rv = dv & ~sv; + else + rv = sv; + + dst->bits[di] = (dv & ~mask) | (rv & mask); +} + +static __always_inline void cmask_op(struct scx_cmask __arena *dst, + const struct scx_cmask __arena *src, int op) +{ + u32 d_end = dst->base + dst->nr_cids; + u32 s_end = src->base + src->nr_cids; + u32 lo = dst->base > src->base ? dst->base : src->base; + u32 hi = d_end < s_end ? d_end : s_end; + u32 d_base = dst->base / 64; + u32 s_base = src->base / 64; + u32 lo_word, hi_word, i; + u64 head_mask, tail_mask; + + if (lo >= hi) + return; + + lo_word = lo / 64; + hi_word = (hi - 1) / 64; + head_mask = GENMASK_U64(63, lo & 63); + tail_mask = GENMASK_U64((hi - 1) & 63, 0); + + bpf_for(i, 0, CMASK_MAX_WORDS) { + u32 w = lo_word + i; + u64 m; + + if (w > hi_word) + break; + + m = GENMASK_U64(63, 0); + if (w == lo_word) + m &= head_mask; + if (w == hi_word) + m &= tail_mask; + + cmask_op_word(dst, src, w - d_base, w - s_base, m, op); + } +} + +/* + * cmask_and/or/copy only modify @dst bits that lie in the intersection of + * [@dst->base, @dst->base + @dst->nr_cids) and [@src->base, + * @src->base + @src->nr_cids). Bits in @dst outside that window + * keep their prior values - in particular, cmask_copy() does NOT zero @dst + * bits that lie outside @src's range. 
+ */ +static __always_inline void cmask_and(struct scx_cmask __arena *dst, + const struct scx_cmask __arena *src) +{ + cmask_op(dst, src, BPF_CMASK_OP_AND); +} + +static __always_inline void cmask_or(struct scx_cmask __arena *dst, + const struct scx_cmask __arena *src) +{ + cmask_op(dst, src, BPF_CMASK_OP_OR); +} + +static __always_inline void cmask_copy(struct scx_cmask __arena *dst, + const struct scx_cmask __arena *src) +{ + cmask_op(dst, src, BPF_CMASK_OP_COPY); +} + +static __always_inline void cmask_andnot(struct scx_cmask __arena *dst, + const struct scx_cmask __arena *src) +{ + cmask_op(dst, src, BPF_CMASK_OP_ANDNOT); +} + +/* + * True iff @a and @b have identical bits over their (assumed equal) range. + * Callers are expected to pass same-shape cmasks; differing shapes always + * compare unequal. + * + * Only words that overlap the active range are compared. CMASK_NR_WORDS() is + * a storage bound with one spare word; when the range does not reach that + * word it carries no invariant (e.g. stale bits left by cmask_reframe()), so + * it must not influence the result. + */ +static __always_inline bool cmask_equal(const struct scx_cmask __arena *a, + const struct scx_cmask __arena *b) +{ + u32 nr_words = 0, i; + + if (a->base != b->base || a->nr_cids != b->nr_cids) + return false; + /* last active word index + 1, same math as cmask_reframe()'s last_word */ + if (a->nr_cids) + nr_words = ((a->base & 63) + a->nr_cids - 1) / 64 + 1; + + bpf_for(i, 0, CMASK_MAX_WORDS) { + if (i >= nr_words) + break; + if (a->bits[i] != b->bits[i]) + return false; + } + return true; +} + +/* + * True iff @a's range lies within @b's range and every bit set in @a is also + * set in @b. If @a's range extends past @b's on either side the test fails, + * whether or not @a has bits set there. 
+ */ +static __always_inline bool cmask_subset(const struct scx_cmask __arena *a, + const struct scx_cmask __arena *b) +{ + u32 a_end = a->base + a->nr_cids; + u32 b_end = b->base + b->nr_cids; + u32 a_wbase = a->base / 64; + u32 b_wbase = b->base / 64; + u32 nr_words = 0, i; + + /* @a's range must sit entirely inside @b's range */ + if (a->base < b->base || a_end > b_end) + return false; + + /* + * Walk only the words that overlap @a's active range (last active word + * index + 1, same math as cmask_reframe()'s last_word). CMASK_NR_WORDS() + * is a storage bound with a spare word: reading it could see stale bits + * in @a and index past the words @b actually covers. + */ + if (a->nr_cids) + nr_words = ((a->base & 63) + a->nr_cids - 1) / 64 + 1; + + bpf_for(i, 0, CMASK_MAX_WORDS) { + u32 wi_b; + + if (i >= nr_words) + break; + /* both bits[] are on the global 64-cid grid, so words line up */ + wi_b = a_wbase + i - b_wbase; + if (a->bits[i] & ~b->bits[wi_b]) + return false; + } + return true; +} + +/** + * cmask_next_set - find the first set bit at or after @cid + * @m: cmask to search + * @cid: starting cid (clamped to @m->base if below) + * + * Returns the smallest set cid in [@cid, @m->base + @m->nr_cids), or + * @m->base + @m->nr_cids if none (the out-of-range sentinel matches the + * termination condition used by cmask_for_each()). + */ +static __always_inline u32 cmask_next_set(const struct scx_cmask __arena *m, u32 cid) +{ + u32 end = m->base + m->nr_cids; + u32 base = m->base / 64; + u32 last_wi = (end - 1) / 64 - base; + u32 start_wi, start_bit, i; + + if (cid < m->base) + cid = m->base; + if (cid >= end) + return end; + + start_wi = cid / 64 - base; + start_bit = cid & 63; + + bpf_for(i, 0, CMASK_MAX_WORDS) { + u32 wi = start_wi + i; + u64 word; + u32 found; + + if (wi > last_wi) + break; + + word = m->bits[wi]; + if (i == 0) + word &= GENMASK_U64(63, start_bit); + if (!word) + continue; + + found = (base + wi) * 64 + ctzll(word); + if (found >= end) + return end; + return found; + } + return end; +} + +static __always_inline u32 cmask_first_set(const struct scx_cmask __arena *m) +{ + return cmask_next_set(m, m->base); +} + +#define cmask_for_each(cid, m) \ + for ((cid) = cmask_first_set(m); \ + (cid) < (m)->base + (m)->nr_cids; \ + (cid) = cmask_next_set((m), (cid) + 1)) + +/* + * Population count over [base, base + nr_cids). 
Padding bits in the head/tail + * words are guaranteed zero by the mutating helpers, so a flat popcount over + * the active words is correct. Words past the last active word carry no + * invariant (e.g. stale bits left by cmask_reframe()) and are not counted. + */ +static __always_inline u32 cmask_weight(const struct scx_cmask __arena *m) +{ + u32 nr_words = 0, i; + u32 count = 0; + + /* + * Last active word index + 1, same math as cmask_reframe()'s last_word. + * CMASK_NR_WORDS() is a storage bound that includes a spare word and + * would pull unconstrained bits into the count. + */ + if (m->nr_cids) + nr_words = ((m->base & 63) + m->nr_cids - 1) / 64 + 1; + + bpf_for(i, 0, CMASK_MAX_WORDS) { + if (i >= nr_words) + break; + count += __builtin_popcountll(m->bits[i]); + } + return count; +} + +/* + * True if @a and @b share any set bit. Walk only the intersection of their + * ranges, matching the semantics of cmask_and(). + */ +static __always_inline bool cmask_intersects(const struct scx_cmask __arena *a, + const struct scx_cmask __arena *b) +{ + u32 a_end = a->base + a->nr_cids; + u32 b_end = b->base + b->nr_cids; + u32 lo = a->base > b->base ? a->base : b->base; + u32 hi = a_end < b_end ? a_end : b_end; + u32 a_base = a->base / 64; + u32 b_base = b->base / 64; + u32 lo_word, hi_word, i; + u64 head_mask, tail_mask; + + if (lo >= hi) + return false; + + lo_word = lo / 64; + hi_word = (hi - 1) / 64; + head_mask = GENMASK_U64(63, lo & 63); + tail_mask = GENMASK_U64((hi - 1) & 63, 0); + + bpf_for(i, 0, CMASK_MAX_WORDS) { + u32 w = lo_word + i; + u64 mask, av, bv; + + if (w > hi_word) + break; + + mask = GENMASK_U64(63, 0); + if (w == lo_word) + mask &= head_mask; + if (w == hi_word) + mask &= tail_mask; + + av = a->bits[w - a_base] & mask; + bv = b->bits[w - b_base] & mask; + if (av & bv) + return true; + } + return false; +} + +/* + * Find the next cid set in both @a and @b at or after @start, bounded by the + * intersection of the two ranges. Return a->base + a->nr_cids if none found. + * + * Building block for cmask_next_and_set_wrap(). Callers that want a bounded + * scan without wrap call this directly. 
+ */ +static __always_inline u32 cmask_next_and_set(const struct scx_cmask __arena *a, + const struct scx_cmask __arena *b, + u32 start) +{ + u32 a_end = a->base + a->nr_cids; + u32 b_end = b->base + b->nr_cids; + u32 a_wbase = a->base / 64; + u32 b_wbase = b->base / 64; + u32 lo = a->base > b->base ? a->base : b->base; + u32 hi = a_end < b_end ? a_end : b_end; + u32 last_wi, start_wi, start_bit, i; + + if (lo >= hi) + return a_end; + if (start < lo) + start = lo; + if (start >= hi) + return a_end; + + last_wi = (hi - 1) / 64; + start_wi = start / 64; + start_bit = start & 63; + + bpf_for(i, 0, CMASK_MAX_WORDS) { + u32 abs_wi = start_wi + i; + u64 word; + u32 found; + + if (abs_wi > last_wi) + break; + + word = a->bits[abs_wi - a_wbase] & b->bits[abs_wi - b_wbase]; + if (i == 0) + word &= GENMASK_U64(63, start_bit); + if (!word) + continue; + + found = abs_wi * 64 + ctzll(word); + if (found >= hi) + return a_end; + return found; + } + return a_end; +} + +/* + * Find the next set cid in @m at or after @start, wrapping to @m->base if no + * set bit is found in [start, m->base + m->nr_cids). Return m->base + + * m->nr_cids if @m is empty. + * + * Callers do round-robin distribution by passing (last_cid + 1) as @start. + */ +static __always_inline u32 cmask_next_set_wrap(const struct scx_cmask __arena *m, + u32 start) +{ + u32 end = m->base + m->nr_cids; + u32 found; + + found = cmask_next_set(m, start); + if (found < end || start <= m->base) + return found; + + found = cmask_next_set(m, m->base); + return found < start ? found : end; +} + +/* + * Find the next cid set in both @a and @b at or after @start, wrapping to + * @a->base if none found in the forward half. Return a->base + a->nr_cids + * if the intersection is empty. + * + * Callers do round-robin distribution by passing (last_cid + 1) as @start. 
+ */ +static __always_inline u32 cmask_next_and_set_wrap(const struct scx_cmask __arena *a, + const struct scx_cmask __arena *b, + u32 start) +{ + u32 a_end = a->base + a->nr_cids; + u32 found; + + found = cmask_next_and_set(a, b, start); + if (found < a_end || start <= a->base) + return found; + + found = cmask_next_and_set(a, b, a->base); + return found < start ? found : a_end; +} + +/** + * cmask_from_cpumask - translate a kernel cpumask to a cid-space cmask + * @m: cmask to fill. Zeroed first; only bits within [@m->base, @m->base + + * @m->nr_cids) are updated - cpus mapping to cids outside that range + * are ignored. + * @cpumask: kernel cpumask to translate + * + * For each cpu in @cpumask, set the cpu's cid in @m. Caller must ensure + * @cpumask stays stable across the call (e.g. RCU read lock for + * task->cpus_ptr). + */ +static __always_inline void cmask_from_cpumask(struct scx_cmask __arena *m, + const struct cpumask *cpumask) +{ + u32 nr_cpu_ids = scx_bpf_nr_cpu_ids(); + s32 cpu; + + cmask_zero(m); + bpf_for(cpu, 0, nr_cpu_ids) { + s32 cid; + + if (!bpf_cpumask_test_cpu(cpu, cpumask)) + continue; + cid = scx_bpf_cpu_to_cid(cpu); + if (cid >= 0) + __cmask_set(cid, m); + } +} + +#endif /* __SCX_CID_BPF_H */ diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 19459dedde41..9591a6e778ce 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -99,8 +99,21 @@ s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; struct rq *scx_bpf_locked_rq(void) __ksym; struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak; +struct task_struct *scx_bpf_tid_to_task(u64 tid) __ksym __weak; u64 scx_bpf_now(void) __ksym __weak; void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak; +s32 scx_bpf_cpu_to_cid(s32 cpu) __ksym __weak; +s32 scx_bpf_cid_to_cpu(s32 cid) __ksym __weak; +void 
scx_bpf_cid_topo(s32 cid, struct scx_cid_topo *out) __ksym __weak; +s32 scx_bpf_kick_cid(s32 cid, u64 flags) __ksym __weak; +s32 scx_bpf_task_cid(const struct task_struct *p) __ksym __weak; +s32 scx_bpf_this_cid(void) __ksym __weak; +struct task_struct *scx_bpf_cid_curr(s32 cid) __ksym __weak; +u32 scx_bpf_nr_cids(void) __ksym __weak; +u32 scx_bpf_nr_online_cids(void) __ksym __weak; +u32 scx_bpf_cidperf_cap(s32 cid) __ksym __weak; +u32 scx_bpf_cidperf_cur(s32 cid) __ksym __weak; +void scx_bpf_cidperf_set(s32 cid, u32 perf) __ksym __weak; /* * Use the following as @it__iter when calling scx_bpf_dsq_move[_vtime]() from @@ -526,6 +539,10 @@ static inline bool is_migration_disabled(const struct task_struct *p) void bpf_rcu_read_lock(void) __ksym; void bpf_rcu_read_unlock(void) __ksym; +/* resilient qspinlock */ +int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) __ksym __weak; +void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock) __ksym __weak; + /* * Time helpers, most of which are from jiffies.h. */ @@ -1035,7 +1052,18 @@ static inline u64 scx_clock_irq(u32 cpu) return irqt ? BPF_CORE_READ(irqt, total) : 0; } +/* Abbreviated forms of <linux/overflow.h>'s struct_size() family. */ +#define flex_array_size(p, member, count) \ + ((count) * sizeof(*(p)->member)) + +#define struct_size(p, member, count) \ + (offsetof(typeof(*(p)), member) + flex_array_size(p, member, count)) + +#define struct_size_t(type, member, count) \ + struct_size((type *)NULL, member, count) + #include "compat.bpf.h" #include "enums.bpf.h" +#include "cid.bpf.h" #endif /* __SCX_COMMON_BPF_H */ diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h index 8977b5a2caa1..87f15f296234 100644 --- a/tools/sched_ext/include/scx/compat.bpf.h +++ b/tools/sched_ext/include/scx/compat.bpf.h @@ -121,6 +121,18 @@ static inline bool scx_bpf_sub_dispatch(u64 cgroup_id) return false; } +/* + * v7.2: scx_bpf_cid_override() for explicit cpu->cid mapping. 
Ignore if + * missing. + */ +void scx_bpf_cid_override___compat(const s32 *cpu_to_cid, u32 cpu_to_cid__sz) __ksym __weak; + +static inline void scx_bpf_cid_override(const s32 *cpu_to_cid, u32 cpu_to_cid__sz) +{ + if (bpf_ksym_exists(scx_bpf_cid_override___compat)) + return scx_bpf_cid_override___compat(cpu_to_cid, cpu_to_cid__sz); +} + /** * __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on * in a compatible way. We will preserve this __COMPAT helper until v6.16. @@ -423,8 +435,10 @@ static inline void scx_bpf_dsq_reenq(u64 dsq_id, u64 reenq_flags) } /* - * Define sched_ext_ops. This may be expanded to define multiple variants for - * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). + * Define sched_ext_ops. See compat.h::SCX_OPS_OPEN() for how backward + * compatibility is handled (this macro can be expanded to emit multiple + * variants for incompatible op changes; SCX_OPS_OPEN() handles purely + * additive changes at load time). */ #define SCX_OPS_DEFINE(__name, ...) \ SEC(".struct_ops.link") \ @@ -432,4 +446,16 @@ static inline void scx_bpf_dsq_reenq(u64 dsq_id, u64 reenq_flags) __VA_ARGS__, \ }; +/* + * Define a cid-form sched_ext_ops. Programs targeting this struct_ops type + * use cid-form callback signatures (select_cid, set_cmask, cid_online/offline, + * dispatch with cid arg, etc.) and may only call the cid-form scx_bpf_* + * kfuncs (kick_cid, task_cid, this_cid, ...). + */ +#define SCX_OPS_CID_DEFINE(__name, ...) \ + SEC(".struct_ops.link") \ + struct sched_ext_ops_cid __name = { \ + __VA_ARGS__, \ + }; + #endif /* __SCX_COMPAT_BPF_H */ diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h index 039854c490d5..602f07061ee3 100644 --- a/tools/sched_ext/include/scx/compat.h +++ b/tools/sched_ext/include/scx/compat.h @@ -149,10 +149,24 @@ static inline long scx_hotplug_seq(void) } /* - * struct sched_ext_ops can change over time. 
If compat.bpf.h::SCX_OPS_DEFINE() - * is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load - * and attach it, backward compatibility is automatically maintained where - * reasonable. + * Open the sched_ext_ops skeleton. + * + * struct sched_ext_ops can change over time. Two complementary mechanisms + * keep BPF schedulers built against newer headers running on older kernels: + * + * 1. Load-time fix-up (this macro). For each optional ops callback or field + * added to struct sched_ext_ops, an explicit stanza below probes the + * running kernel's BTF via __COMPAT_struct_has_field() and, if the field + * is missing, clears it in the in-memory struct_ops (with a warning to + * stderr) before load. Handles additive changes - a new stanza must be + * added here for each new optional field. + * + * 2. Multi-variant struct_ops via compat.bpf.h::SCX_OPS_DEFINE(). That + * macro can be expanded to emit several variants of struct sched_ext_ops, + * and SCX_OPS_LOAD()/ATTACH() can pick the right one based on what the + * kernel supports. Needed when an existing operation has to change + * incompatibly (e.g. a callback signature changes); the load-time + * fix-up above only handles purely additive changes. * * ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is * the current minimum required kernel version. @@ -225,6 +239,7 @@ static inline void __scx_ops_assoc_prog(struct bpf_program *prog, } #endif +/* See SCX_OPS_OPEN() above for backward-compatibility handling. 
*/ #define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({ \ struct bpf_program *__prog; \ UEI_SET_SIZE(__skel, __ops_name, __uei_name); \ diff --git a/tools/sched_ext/include/scx/user_exit_info.bpf.h b/tools/sched_ext/include/scx/user_exit_info.bpf.h index e7ac6611a990..98cab643c8d9 100644 --- a/tools/sched_ext/include/scx/user_exit_info.bpf.h +++ b/tools/sched_ext/include/scx/user_exit_info.bpf.h @@ -32,6 +32,9 @@ __uei_name##_dump_len, (__ei)->dump); \ if (bpf_core_field_exists((__ei)->exit_code)) \ __uei_name.exit_code = (__ei)->exit_code; \ + __uei_name.exit_cpu = -1; \ + if (bpf_core_field_exists((__ei)->exit_cpu)) \ + __uei_name.exit_cpu = (__ei)->exit_cpu; \ /* use __sync to force memory barrier */ \ __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \ (__ei)->kind); \ diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h index 399697fa372f..56a02b549aef 100644 --- a/tools/sched_ext/include/scx/user_exit_info.h +++ b/tools/sched_ext/include/scx/user_exit_info.h @@ -39,6 +39,8 @@ fprintf(stderr, "EXIT: %s", __uei->reason); \ if (__uei->msg[0] != '\0') \ fprintf(stderr, " (%s)", __uei->msg); \ + if (__uei->exit_cpu >= 0) \ + fprintf(stderr, " on CPU %d", __uei->exit_cpu); \ fputs("\n", stderr); \ __uei->exit_code; \ }) diff --git a/tools/sched_ext/include/scx/user_exit_info_common.h b/tools/sched_ext/include/scx/user_exit_info_common.h index 2d0981aedd89..76e2a055eb4b 100644 --- a/tools/sched_ext/include/scx/user_exit_info_common.h +++ b/tools/sched_ext/include/scx/user_exit_info_common.h @@ -22,6 +22,11 @@ enum uei_sizes { struct user_exit_info { int kind; + /* + * CPU that triggered the exit, or -1 if unset (e.g. running on an + * older kernel that does not expose this field). 
+ */ + s32 exit_cpu; s64 exit_code; char reason[UEI_REASON_LEN]; char msg[UEI_MSG_LEN]; diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index 4efcce099bd5..64dd60b3e922 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -149,10 +149,14 @@ static bool dispatch_to_cpu(s32 cpu) } /* - * If we can't run the task at the top, do the dumb thing and - * bounce it to the fallback dsq. + * If we can't run the task at the top for whatever reason, + * bounce it to the fallback dsq. Also check + * is_migration_disabled() explicitly as p->cpus_ptr may not + * reflect the migration-disabled state yet if + * migrate_disable_switch() hasn't run. */ - if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { + if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr) || + (is_migration_disabled(p) && scx_bpf_task_cpu(p) != cpu)) { __sync_fetch_and_add(&nr_mismatches, 1); scx_bpf_dsq_insert(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0); bpf_task_release(p); diff --git a/tools/sched_ext/scx_cpu0.bpf.c b/tools/sched_ext/scx_cpu0.bpf.c index 0b1a7ce879b0..909d1be1bfe3 100644 --- a/tools/sched_ext/scx_cpu0.bpf.c +++ b/tools/sched_ext/scx_cpu0.bpf.c @@ -18,8 +18,6 @@ char _license[] SEC("license") = "GPL"; -const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */ - UEI_DEFINE(uei); /* diff --git a/tools/sched_ext/scx_cpu0.c b/tools/sched_ext/scx_cpu0.c index a6fba9978b9c..4966e3d4c724 100644 --- a/tools/sched_ext/scx_cpu0.c +++ b/tools/sched_ext/scx_cpu0.c @@ -72,8 +72,6 @@ restart: optind = 1; skel = SCX_OPS_OPEN(cpu0_ops, scx_cpu0); - skel->rodata->nr_cpus = libbpf_num_possible_cpus(); - while ((opt = getopt(argc, argv, "vh")) != -1) { switch (opt) { case 'v': diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c index d865c381589b..de2bef86d64d 100644 --- a/tools/sched_ext/scx_flatcg.c +++ b/tools/sched_ext/scx_flatcg.c @@ -130,7 +130,6 @@ int main(int argc, char **argv) struct scx_flatcg *skel; struct bpf_link 
*link; struct timespec intv_ts = { .tv_sec = 2, .tv_nsec = 0 }; - bool dump_cgrps = false; __u64 last_cpu_sum = 0, last_cpu_idle = 0; __u64 last_stats[FCG_NR_STATS] = {}; unsigned long seq = 0; @@ -148,7 +147,7 @@ restart: assert(skel->rodata->nr_cpus > 0); skel->rodata->cgrp_slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); - while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) { + while ((opt = getopt(argc, argv, "s:i:fvh")) != -1) { double v; switch (opt) { @@ -161,9 +160,6 @@ restart: intv_ts.tv_sec = v; intv_ts.tv_nsec = (v - (float)intv_ts.tv_sec) * 1000000000; break; - case 'd': - dump_cgrps = true; - break; case 'f': skel->rodata->fifo_sched = true; break; @@ -177,10 +173,10 @@ restart: } } - printf("slice=%.1lfms intv=%.1lfs dump_cgrps=%d", + printf("slice=%.1lfms intv=%.1lfs", (double)skel->rodata->cgrp_slice_ns / 1000000.0, - (double)intv_ts.tv_sec + (double)intv_ts.tv_nsec / 1000000000.0, - dump_cgrps); + (double)intv_ts.tv_sec + (double)intv_ts.tv_nsec / 1000000000.0); + SCX_OPS_LOAD(skel, flatcg_ops, scx_flatcg, uei); link = SCX_OPS_ATTACH(skel, flatcg_ops, scx_flatcg); diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index aad698fe294b..fd9a82a67627 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -2,15 +2,16 @@ /* * A simple five-level FIFO queue scheduler. * - * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets - * assigned to one depending on its compound weight. Each CPU round robins - * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from - * queue0, 2 from queue1, 4 from queue2 and so on. + * There are five FIFOs implemented as arena-backed doubly-linked lists + * threaded through per-task context. A task gets assigned to one depending on + * its compound weight. Each CPU round robins through the FIFOs and dispatches + * more from FIFOs with higher indices - 1 from queue0, 2 from queue1, 4 from + * queue2 and so on. 
* * This scheduler demonstrates: * - * - BPF-side queueing using PIDs. - * - Sleepable per-task storage allocation using ops.prep_enable(). + * - BPF-side queueing using TIDs. + * - BPF arena for scheduler state. * - Core-sched support. * * This scheduler is primarily for demonstration and testing of sched_ext @@ -22,6 +23,8 @@ */ #include <scx/common.bpf.h> +#include "scx_qmap.h" + enum consts { ONE_SEC_IN_NS = 1000000000, ONE_MSEC_IN_NS = 1000000, @@ -47,40 +50,72 @@ const volatile s32 disallow_tgid; const volatile bool suppress_dump; const volatile bool always_enq_immed; const volatile u32 immed_stress_nth; +const volatile u32 max_tasks; -u64 nr_highpri_queued; -u32 test_error_cnt; - -#define MAX_SUB_SCHEDS 8 -u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS]; +/* + * Optional cid-override test harness. When cid_override_mode is non-zero, + * qmap_init() calls scx_bpf_cid_override() with the caller-supplied + * cpu_to_cid array to exercise the kfunc's acceptance and error paths. + * + * 0 = disabled + * 1 = valid reverse mapping + * 2 = invalid: duplicate cid assignment + * 3 = invalid: out-of-range cid + */ +const volatile u32 cid_override_mode; +/* + * Array lives in bss (writable) because scx_bpf_cid_override()'s BPF + * verifier signature treats its len-paired pointer as read/write - rodata + * fails verification with "write into map forbidden". Userspace populates + * it before SCX_OPS_LOAD, same as rodata, and nothing writes it after. + */ +s32 cid_override_cpu_to_cid[SCX_QMAP_MAX_CPUS]; UEI_DEFINE(uei); -struct qmap { - __uint(type, BPF_MAP_TYPE_QUEUE); - __uint(max_entries, 4096); - __type(value, u32); -} queue0 SEC(".maps"), - queue1 SEC(".maps"), - queue2 SEC(".maps"), - queue3 SEC(".maps"), - queue4 SEC(".maps"), - dump_store SEC(".maps"); - +/* + * All scheduler state - per-cpu context, stats counters, core-sched sequence + * numbers, sub-sched cgroup ids - lives in this single BPF arena map. Userspace + * reaches it via skel->arena->qa. 
+ */ struct { - __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); - __uint(max_entries, 5); - __type(key, int); - __array(values, struct qmap); -} queue_arr SEC(".maps") = { - .values = { - [0] = &queue0, - [1] = &queue1, - [2] = &queue2, - [3] = &queue3, - [4] = &queue4, - }, -}; + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, 1 << 16); /* upper bound in pages */ +#if defined(__TARGET_ARCH_arm64) || defined(__aarch64__) + __ulong(map_extra, 0x1ull << 32); /* user/BPF mmap base */ +#else + __ulong(map_extra, 0x1ull << 44); +#endif +} arena SEC(".maps"); + +struct qmap_arena __arena_global qa; + +/* + * Global idle-cid tracking, maintained via update_idle / cpu_offline and + * scanned by the direct-dispatch path. Allocated in qmap_init() from one + * arena page, sized to the full cid space. + */ +struct scx_cmask __arena *qa_idle_cids; + +/* Per-queue locks. Each in its own .data section as bpf_res_spin_lock requires. */ +__hidden struct bpf_res_spin_lock qa_q_lock0 SEC(".data.qa_q_lock0"); +__hidden struct bpf_res_spin_lock qa_q_lock1 SEC(".data.qa_q_lock1"); +__hidden struct bpf_res_spin_lock qa_q_lock2 SEC(".data.qa_q_lock2"); +__hidden struct bpf_res_spin_lock qa_q_lock3 SEC(".data.qa_q_lock3"); +__hidden struct bpf_res_spin_lock qa_q_lock4 SEC(".data.qa_q_lock4"); + +static struct bpf_res_spin_lock *qa_q_lock(s32 qid) +{ + switch (qid) { + case 0: return &qa_q_lock0; + case 1: return &qa_q_lock1; + case 2: return &qa_q_lock2; + case 3: return &qa_q_lock3; + case 4: return &qa_q_lock4; + default: return NULL; + } +} /* * If enabled, CPU performance target is set according to the queue index @@ -102,85 +137,214 @@ static const u32 qidx_to_cpuperf_target[] = { * task's seq and the associated queue's head seq is called the queue distance * and used when comparing two tasks for ordering. See qmap_core_sched_before(). 
*/ -static u64 core_sched_head_seqs[5]; -static u64 core_sched_tail_seqs[5]; -/* Per-task scheduling context */ +/* + * Per-task scheduling context. Allocated from the qa.task_ctxs[] slab in + * arena. While the task is alive the entry is referenced from task_ctx_stor; + * while it's free the entry sits on the free list singly-linked through + * @next_free. + * + * When the task is queued on one of the five priority FIFOs, @q_idx is the + * queue index and @q_next/@q_prev link it in the queue's doubly-linked list. + * @q_idx is -1 when the task isn't on any queue. + */ struct task_ctx { - bool force_local; /* Dispatch directly to local_dsq */ - bool highpri; - u64 core_sched_seq; + struct task_ctx __arena *next_free; /* only valid on free list */ + struct task_ctx __arena *q_next; /* queue link, NULL if tail */ + struct task_ctx __arena *q_prev; /* queue link, NULL if head */ + struct qmap_fifo __arena *fifo; /* queue we're on, NULL if not queued */ + u64 tid; + s32 pid; /* for dump only */ + bool force_local; /* Dispatch directly to local_dsq */ + bool highpri; + u64 core_sched_seq; + struct scx_cmask cpus_allowed; /* per-task affinity in cid space */ +}; + +/* + * Slab stride for task_ctx. cpus_allowed's flex array bits[] overlaps the + * tail bytes appended per entry; struct_size() gives the actual per-entry + * footprint. + */ +#define TASK_CTX_STRIDE \ + struct_size_t(struct task_ctx, cpus_allowed.bits, \ + CMASK_NR_WORDS(SCX_QMAP_MAX_CPUS)) + +/* All task_ctx pointers are arena pointers. */ +typedef struct task_ctx __arena task_ctx_t; + +/* Holds an arena pointer to the task's slab entry. 
*/ +struct task_ctx_stor_val { + task_ctx_t *taskc; }; struct { __uint(type, BPF_MAP_TYPE_TASK_STORAGE); __uint(map_flags, BPF_F_NO_PREALLOC); __type(key, int); - __type(value, struct task_ctx); + __type(value, struct task_ctx_stor_val); } task_ctx_stor SEC(".maps"); -struct cpu_ctx { - u64 dsp_idx; /* dispatch index */ - u64 dsp_cnt; /* remaining count */ - u32 avg_weight; - u32 cpuperf_target; -}; +/* Protects the task_ctx slab free list. */ +__hidden struct bpf_res_spin_lock qa_task_lock SEC(".data.qa_task_lock"); -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __uint(max_entries, 1); - __type(key, u32); - __type(value, struct cpu_ctx); -} cpu_ctx_stor SEC(".maps"); +static int qmap_spin_lock(struct bpf_res_spin_lock *lock) +{ + if (bpf_res_spin_lock(lock)) { + scx_bpf_error("res_spin_lock failed"); + return -EBUSY; + } + return 0; +} -/* Statistics */ -u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0, nr_dequeued, nr_ddsp_from_enq; -u64 nr_core_sched_execed; -u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer; -u32 cpuperf_min, cpuperf_avg, cpuperf_max; -u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max; +/* + * Try prev_cid, then scan taskc->cpus_allowed AND qa_idle_cids round-robin + * from prev_cid + 1. Atomic claim retries on race; bounded by + * IDLE_PICK_RETRIES to keep the verifier's insn budget in check. 
+ */ +#define IDLE_PICK_RETRIES 16 -static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu) +static s32 pick_direct_dispatch_cid(struct task_struct *p, s32 prev_cid, + task_ctx_t *taskc) { - s32 cpu; + u32 nr_cids = scx_bpf_nr_cids(); + s32 cid; + u32 i; if (!always_enq_immed && p->nr_cpus_allowed == 1) - return prev_cpu; + return prev_cid; + + if (cmask_test_and_clear(prev_cid, qa_idle_cids)) + return prev_cid; + + cid = prev_cid; + bpf_for(i, 0, IDLE_PICK_RETRIES) { + cid = cmask_next_and_set_wrap(&taskc->cpus_allowed, + qa_idle_cids, cid + 1); + barrier_var(cid); + if (cid >= nr_cids) + return -1; + if (cmask_test_and_clear(cid, qa_idle_cids)) + return cid; + } + return -1; +} - if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) - return prev_cpu; +/* + * Force a reference to the arena map. The verifier associates an arena with + * a program by finding an LD_IMM64 instruction that loads the arena's BPF + * map; programs that only use arena pointers returned from task-local + * storage (like qmap_select_cpu) never reference @arena directly. Without + * this, the verifier rejects addr_space_cast with "addr_space_cast insn + * can only be used in a program that has an associated arena". + */ +#define QMAP_TOUCH_ARENA() do { asm volatile("" :: "r"(&arena)); } while (0) - cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); - if (cpu >= 0) - return cpu; +static task_ctx_t *lookup_task_ctx(struct task_struct *p) +{ + struct task_ctx_stor_val *v; - return -1; + QMAP_TOUCH_ARENA(); + + v = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!v || !v->taskc) + return NULL; + return v->taskc; } -static struct task_ctx *lookup_task_ctx(struct task_struct *p) +/* Append @taskc to the tail of @fifo. Must not already be queued. 
*/ +static void qmap_fifo_enqueue(struct qmap_fifo __arena *fifo, task_ctx_t *taskc) { - return bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + struct bpf_res_spin_lock *lock = qa_q_lock(fifo->idx); + + if (!lock || qmap_spin_lock(lock)) + return; + taskc->fifo = fifo; + taskc->q_next = NULL; + taskc->q_prev = fifo->tail; + if (fifo->tail) + fifo->tail->q_next = taskc; + else + fifo->head = taskc; + fifo->tail = taskc; + bpf_res_spin_unlock(lock); +} + +/* Pop the head of @fifo. Returns NULL if empty. */ +static task_ctx_t *qmap_fifo_pop(struct qmap_fifo __arena *fifo) +{ + struct bpf_res_spin_lock *lock = qa_q_lock(fifo->idx); + task_ctx_t *taskc; + + if (!lock || qmap_spin_lock(lock)) + return NULL; + taskc = fifo->head; + if (taskc) { + fifo->head = taskc->q_next; + if (taskc->q_next) + taskc->q_next->q_prev = NULL; + else + fifo->tail = NULL; + taskc->q_next = NULL; + taskc->q_prev = NULL; + taskc->fifo = NULL; + } + bpf_res_spin_unlock(lock); + return taskc; +} + +/* Remove @taskc from its fifo. No-op if not queued. */ +static void qmap_fifo_remove(task_ctx_t *taskc) +{ + struct qmap_fifo __arena *fifo = taskc->fifo; + struct bpf_res_spin_lock *lock; + + if (!fifo) + return; + + lock = qa_q_lock(fifo->idx); + if (!lock || qmap_spin_lock(lock)) + return; + + /* Re-check under lock — a concurrent pop may have cleared fifo. 
*/ + if (taskc->fifo != fifo) { + bpf_res_spin_unlock(lock); + return; + } + + if (taskc->q_next) + taskc->q_next->q_prev = taskc->q_prev; + else + fifo->tail = taskc->q_prev; + if (taskc->q_prev) + taskc->q_prev->q_next = taskc->q_next; + else + fifo->head = taskc->q_next; + taskc->q_next = NULL; + taskc->q_prev = NULL; + taskc->fifo = NULL; + bpf_res_spin_unlock(lock); } -s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, - s32 prev_cpu, u64 wake_flags) +s32 BPF_STRUCT_OPS(qmap_select_cid, struct task_struct *p, + s32 prev_cid, u64 wake_flags) { - struct task_ctx *tctx; - s32 cpu; + task_ctx_t *taskc; + s32 cid; - if (!(tctx = lookup_task_ctx(p))) - return prev_cpu; + if (!(taskc = lookup_task_ctx(p))) + return prev_cid; if (p->scx.weight < 2 && !(p->flags & PF_KTHREAD)) - return prev_cpu; + return prev_cid; - cpu = pick_direct_dispatch_cpu(p, prev_cpu); + cid = pick_direct_dispatch_cid(p, prev_cid, taskc); - if (cpu >= 0) { - tctx->force_local = true; - return cpu; + if (cid >= 0) { + taskc->force_local = true; + return cid; } else { - return prev_cpu; + return prev_cid; } } @@ -202,16 +366,14 @@ static int weight_to_idx(u32 weight) void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) { static u32 user_cnt, kernel_cnt; - struct task_ctx *tctx; - u32 pid = p->pid; + task_ctx_t *taskc; int idx = weight_to_idx(p->scx.weight); - void *ring; - s32 cpu; + s32 cid; if (enq_flags & SCX_ENQ_REENQ) { - __sync_fetch_and_add(&nr_reenqueued, 1); - if (scx_bpf_task_cpu(p) == 0) - __sync_fetch_and_add(&nr_reenqueued_cpu0, 1); + __sync_fetch_and_add(&qa.nr_reenqueued, 1); + if (scx_bpf_task_cid(p) == 0) + __sync_fetch_and_add(&qa.nr_reenqueued_cid0, 1); } if (p->flags & PF_KTHREAD) { @@ -222,17 +384,17 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) return; } - if (test_error_cnt && !--test_error_cnt) + if (qa.test_error_cnt && !--qa.test_error_cnt) scx_bpf_error("test triggering error"); - if (!(tctx = lookup_task_ctx(p))) 
+ if (!(taskc = lookup_task_ctx(p))) return; /* * All enqueued tasks must have their core_sched_seq updated for correct * core-sched ordering. Also, take a look at the end of qmap_dispatch(). */ - tctx->core_sched_seq = core_sched_tail_seqs[idx]++; + taskc->core_sched_seq = qa.core_sched_tail_seqs[idx]++; /* * IMMED stress testing: Every immed_stress_nth'th enqueue, dispatch @@ -243,19 +405,19 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) static u32 immed_stress_cnt; if (!(++immed_stress_cnt % immed_stress_nth)) { - tctx->force_local = false; - scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | scx_bpf_task_cpu(p), + taskc->force_local = false; + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | scx_bpf_task_cid(p), slice_ns, enq_flags); return; } } /* - * If qmap_select_cpu() is telling us to or this is the last runnable + * If qmap_select_cid() is telling us to or this is the last runnable * task on the CPU, enqueue locally. */ - if (tctx->force_local) { - tctx->force_local = false; + if (taskc->force_local) { + taskc->force_local = false; scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); return; } @@ -267,11 +429,11 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) return; } - /* if select_cpu() wasn't called, try direct dispatch */ + /* if select_cid() wasn't called, try direct dispatch */ if (!__COMPAT_is_enq_cpu_selected(enq_flags) && - (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) { - __sync_fetch_and_add(&nr_ddsp_from_enq, 1); - scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags); + (cid = pick_direct_dispatch_cid(p, scx_bpf_task_cid(p), taskc)) >= 0) { + __sync_fetch_and_add(&qa.nr_ddsp_from_enq, 1); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cid, slice_ns, enq_flags); return; } @@ -279,55 +441,52 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) * If the task was re-enqueued due to the CPU being preempted by a * higher priority scheduling class, just 
re-enqueue the task directly * on the global DSQ. As we want another CPU to pick it up, find and - * kick an idle CPU. + * kick an idle cid. */ if (enq_flags & SCX_ENQ_REENQ) { - s32 cpu; + s32 cid; scx_bpf_dsq_insert(p, SHARED_DSQ, 0, enq_flags); - cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); - if (cpu >= 0) - scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); + cid = cmask_next_and_set_wrap(&taskc->cpus_allowed, + qa_idle_cids, 0); + if (cid < scx_bpf_nr_cids()) + scx_bpf_kick_cid(cid, SCX_KICK_IDLE); return; } - ring = bpf_map_lookup_elem(&queue_arr, &idx); - if (!ring) { - scx_bpf_error("failed to find ring %d", idx); - return; - } - - /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */ - if (bpf_map_push_elem(ring, &pid, 0)) { - scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags); - return; - } + /* Queue on the selected FIFO. */ + qmap_fifo_enqueue(&qa.fifos[idx], taskc); if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) { - tctx->highpri = true; - __sync_fetch_and_add(&nr_highpri_queued, 1); + taskc->highpri = true; + __sync_fetch_and_add(&qa.nr_highpri_queued, 1); } - __sync_fetch_and_add(&nr_enqueued, 1); + __sync_fetch_and_add(&qa.nr_enqueued, 1); } -/* - * The BPF queue map doesn't support removal and sched_ext can handle spurious - * dispatches. qmap_dequeue() is only used to collect statistics. 
- */ void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags) { - __sync_fetch_and_add(&nr_dequeued, 1); + task_ctx_t *taskc; + + __sync_fetch_and_add(&qa.nr_dequeued, 1); if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC) - __sync_fetch_and_add(&nr_core_sched_execed, 1); + __sync_fetch_and_add(&qa.nr_core_sched_execed, 1); + + taskc = lookup_task_ctx(p); + if (taskc && taskc->fifo) { + if (taskc->highpri) + __sync_fetch_and_sub(&qa.nr_highpri_queued, 1); + qmap_fifo_remove(taskc); + } } static void update_core_sched_head_seq(struct task_struct *p) { int idx = weight_to_idx(p->scx.weight); - struct task_ctx *tctx; + task_ctx_t *taskc; - if ((tctx = lookup_task_ctx(p))) - core_sched_head_seqs[idx] = tctx->core_sched_seq; + if ((taskc = lookup_task_ctx(p))) + qa.core_sched_head_seqs[idx] = taskc->core_sched_seq; } /* @@ -343,17 +502,18 @@ static void update_core_sched_head_seq(struct task_struct *p) static bool dispatch_highpri(bool from_timer) { struct task_struct *p; - s32 this_cpu = bpf_get_smp_processor_id(); + s32 this_cid = scx_bpf_this_cid(); + u32 nr_cids = scx_bpf_nr_cids(); /* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */ bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) { static u64 highpri_seq; - struct task_ctx *tctx; + task_ctx_t *taskc; - if (!(tctx = lookup_task_ctx(p))) + if (!(taskc = lookup_task_ctx(p))) return false; - if (tctx->highpri) { + if (taskc->highpri) { /* exercise the set_*() and vtime interface too */ scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2); scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++); @@ -362,30 +522,38 @@ static bool dispatch_highpri(bool from_timer) } /* - * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU - * is found. + * Scan HIGHPRI_DSQ and dispatch until a task that can run here is + * found. Prefer this_cid if the task allows it; otherwise RR-scan the + * task's cpus_allowed starting after this_cid. 
*/ bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) { + task_ctx_t *taskc; bool dispatched = false; - s32 cpu; + s32 cid; + + if (!(taskc = lookup_task_ctx(p))) + return false; - if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr)) - cpu = this_cpu; + if (cmask_test(this_cid, &taskc->cpus_allowed)) + cid = this_cid; else - cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0); + cid = cmask_next_set_wrap(&taskc->cpus_allowed, + this_cid + 1); + if (cid >= nr_cids) + continue; - if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cpu, + if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cid, SCX_ENQ_PREEMPT)) { - if (cpu == this_cpu) { + if (cid == this_cid) { dispatched = true; - __sync_fetch_and_add(&nr_expedited_local, 1); + __sync_fetch_and_add(&qa.nr_expedited_local, 1); } else { - __sync_fetch_and_add(&nr_expedited_remote, 1); + __sync_fetch_and_add(&qa.nr_expedited_remote, 1); } if (from_timer) - __sync_fetch_and_add(&nr_expedited_from_timer, 1); + __sync_fetch_and_add(&qa.nr_expedited_from_timer, 1); } else { - __sync_fetch_and_add(&nr_expedited_lost, 1); + __sync_fetch_and_add(&qa.nr_expedited_lost, 1); } if (dispatched) @@ -395,22 +563,21 @@ static bool dispatch_highpri(bool from_timer) return false; } -void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) +void BPF_STRUCT_OPS(qmap_dispatch, s32 cid, struct task_struct *prev) { struct task_struct *p; - struct cpu_ctx *cpuc; - struct task_ctx *tctx; - u32 zero = 0, batch = dsp_batch ?: 1; - void *fifo; - s32 i, pid; + struct cpu_ctx __arena *cpuc; + task_ctx_t *taskc; + u32 batch = dsp_batch ?: 1; + s32 i; if (dispatch_highpri(false)) return; - if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0)) + if (!qa.nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0)) return; - if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) { + if (dsp_inf_loop_after && qa.nr_dispatched > dsp_inf_loop_after) { /* * PID 2 should be kthreadd which should mostly be idle and off * the 
scheduler. Let's keep dispatching it to force the kernel @@ -424,10 +591,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) } } - if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) { - scx_bpf_error("failed to look up cpu_ctx"); - return; - } + cpuc = &qa.cpu_ctxs[scx_bpf_this_cid()]; for (i = 0; i < 5; i++) { /* Advance the dispatch cursor and pick the fifo. */ @@ -436,33 +600,23 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) cpuc->dsp_cnt = 1 << cpuc->dsp_idx; } - fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx); - if (!fifo) { - scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx); - return; - } - /* Dispatch or advance. */ bpf_repeat(BPF_MAX_LOOPS) { - struct task_ctx *tctx; + task_ctx_t *taskc; - if (bpf_map_pop_elem(fifo, &pid)) + taskc = qmap_fifo_pop(&qa.fifos[cpuc->dsp_idx]); + if (!taskc) break; - p = bpf_task_from_pid(pid); + p = scx_bpf_tid_to_task(taskc->tid); if (!p) continue; - if (!(tctx = lookup_task_ctx(p))) { - bpf_task_release(p); - return; - } - - if (tctx->highpri) - __sync_fetch_and_sub(&nr_highpri_queued, 1); + if (taskc->highpri) + __sync_fetch_and_sub(&qa.nr_highpri_queued, 1); update_core_sched_head_seq(p); - __sync_fetch_and_add(&nr_dispatched, 1); + __sync_fetch_and_add(&qa.nr_dispatched, 1); scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0); @@ -502,10 +656,8 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) * document this class of issue -- other schedulers * seeing similar warnings can use this as a reference. 
*/ - if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) - scx_bpf_kick_cpu(scx_bpf_task_cpu(p), 0); - - bpf_task_release(p); + if (!cmask_test(cid, &taskc->cpus_allowed)) + scx_bpf_kick_cid(scx_bpf_task_cid(p), 0); batch--; cpuc->dsp_cnt--; @@ -523,8 +675,8 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) } for (i = 0; i < MAX_SUB_SCHEDS; i++) { - if (sub_sched_cgroup_ids[i] && - scx_bpf_sub_dispatch(sub_sched_cgroup_ids[i])) + if (qa.sub_sched_cgroup_ids[i] && + scx_bpf_sub_dispatch(qa.sub_sched_cgroup_ids[i])) return; } @@ -533,24 +685,20 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) * if the task were enqueued and dispatched immediately. */ if (prev) { - tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0); - if (tctx) - tctx->core_sched_seq = - core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++; + taskc = lookup_task_ctx(prev); + if (!taskc) + return; + + taskc->core_sched_seq = + qa.core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++; } } void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p) { - struct cpu_ctx *cpuc; - u32 zero = 0; + struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[scx_bpf_this_cid()]; int idx; - if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) { - scx_bpf_error("failed to look up cpu_ctx"); - return; - } - /* * Use the running avg of weights to select the target cpuperf level. 
* This is a demonstration of the cpuperf feature rather than a @@ -560,7 +708,7 @@ void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p) idx = weight_to_idx(cpuc->avg_weight); cpuc->cpuperf_target = qidx_to_cpuperf_target[idx]; - scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target); + scx_bpf_cidperf_set(scx_bpf_task_cid(p), cpuc->cpuperf_target); } /* @@ -570,14 +718,14 @@ void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p) static s64 task_qdist(struct task_struct *p) { int idx = weight_to_idx(p->scx.weight); - struct task_ctx *tctx; + task_ctx_t *taskc; s64 qdist; - tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); - if (!tctx) + taskc = lookup_task_ctx(p); + if (!taskc) return 0; - qdist = tctx->core_sched_seq - core_sched_head_seqs[idx]; + qdist = taskc->core_sched_seq - qa.core_sched_head_seqs[idx]; /* * As queue index increments, the priority doubles. The queue w/ index 3 @@ -610,70 +758,110 @@ bool BPF_STRUCT_OPS(qmap_core_sched_before, * tasks when a higher-priority scheduling class takes the CPU. */ -s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p, - struct scx_init_task_args *args) +s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init_task, struct task_struct *p, + struct scx_init_task_args *args) { + struct task_ctx_stor_val *v; + task_ctx_t *taskc; + if (p->tgid == disallow_tgid) p->scx.disallow = true; - /* - * @p is new. Let's ensure that its task_ctx is available. We can sleep - * in this function and the following will automatically use GFP_KERNEL. 
- */ - if (bpf_task_storage_get(&task_ctx_stor, p, 0, - BPF_LOCAL_STORAGE_GET_F_CREATE)) - return 0; - else + /* pop a slab entry off the free list */ + if (qmap_spin_lock(&qa_task_lock)) + return -EBUSY; + taskc = qa.task_free_head; + if (taskc) + qa.task_free_head = taskc->next_free; + bpf_res_spin_unlock(&qa_task_lock); + if (!taskc) { + scx_bpf_error("task_ctx slab exhausted (max_tasks=%u)", max_tasks); + return -ENOMEM; + } + + taskc->next_free = NULL; + taskc->q_next = NULL; + taskc->q_prev = NULL; + taskc->fifo = NULL; + taskc->tid = p->scx.tid; + taskc->pid = p->pid; + taskc->force_local = false; + taskc->highpri = false; + taskc->core_sched_seq = 0; + cmask_init(&taskc->cpus_allowed, 0, scx_bpf_nr_cids()); + bpf_rcu_read_lock(); + cmask_from_cpumask(&taskc->cpus_allowed, p->cpus_ptr); + bpf_rcu_read_unlock(); + + v = bpf_task_storage_get(&task_ctx_stor, p, NULL, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!v) { + /* push back to the free list */ + if (!qmap_spin_lock(&qa_task_lock)) { + taskc->next_free = qa.task_free_head; + qa.task_free_head = taskc; + bpf_res_spin_unlock(&qa_task_lock); + } return -ENOMEM; + } + v->taskc = taskc; + return 0; } -void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx) +void BPF_STRUCT_OPS(qmap_exit_task, struct task_struct *p, + struct scx_exit_task_args *args) { - s32 i, pid; + struct task_ctx_stor_val *v; + task_ctx_t *taskc; - if (suppress_dump) + v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0); + if (!v || !v->taskc) return; + taskc = v->taskc; + v->taskc = NULL; - bpf_for(i, 0, 5) { - void *fifo; + if (qmap_spin_lock(&qa_task_lock)) + return; + taskc->next_free = qa.task_free_head; + qa.task_free_head = taskc; + bpf_res_spin_unlock(&qa_task_lock); +} - if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i))) - return; +void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx) +{ + task_ctx_t *taskc; + s32 i; - scx_bpf_dump("QMAP FIFO[%d]:", i); + QMAP_TOUCH_ARENA(); - /* - * Dump can be invoked anytime and there is no 
way to iterate in - * a non-destructive way. Pop and store in dump_store and then - * restore afterwards. If racing against new enqueues, ordering - * can get mixed up. - */ - bpf_repeat(4096) { - if (bpf_map_pop_elem(fifo, &pid)) - break; - bpf_map_push_elem(&dump_store, &pid, 0); - scx_bpf_dump(" %d", pid); - } + if (suppress_dump) + return; + /* + * Walk the queue lists without locking - kfunc calls (scx_bpf_dump) + * aren't in the verifier's kfunc_spin_allowed() list so we can't hold + * a lock and dump. Best-effort; racing may print stale tids but the + * walk is bounded by bpf_repeat() so it always terminates. + */ + bpf_for(i, 0, 5) { + scx_bpf_dump("QMAP FIFO[%d]:", i); + taskc = qa.fifos[i].head; bpf_repeat(4096) { - if (bpf_map_pop_elem(&dump_store, &pid)) + if (!taskc) break; - bpf_map_push_elem(fifo, &pid, 0); + scx_bpf_dump(" %d:%llu", taskc->pid, taskc->tid); + taskc = taskc->q_next; } - scx_bpf_dump("\n"); } } -void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle) +void BPF_STRUCT_OPS(qmap_dump_cid, struct scx_dump_ctx *dctx, s32 cid, bool idle) { - u32 zero = 0; - struct cpu_ctx *cpuc; + struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[cid]; if (suppress_dump || idle) return; - if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu))) - return; scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u", cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight, @@ -682,12 +870,17 @@ void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p) { - struct task_ctx *taskc; + struct task_ctx_stor_val *v; + task_ctx_t *taskc; + + QMAP_TOUCH_ARENA(); if (suppress_dump) return; - if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) + v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0); + if (!v || !v->taskc) return; + taskc = v->taskc; scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu", 
taskc->force_local, taskc->core_sched_seq); @@ -716,61 +909,25 @@ void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp, cgrp->kn->id, period_us, quota_us, burst_us); } -/* - * Print out the online and possible CPU map using bpf_printk() as a - * demonstration of using the cpumask kfuncs and ops.cpu_on/offline(). - */ -static void print_cpus(void) +void BPF_STRUCT_OPS(qmap_update_idle, s32 cid, bool idle) { - const struct cpumask *possible, *online; - s32 cpu; - char buf[128] = "", *p; - int idx; - - possible = scx_bpf_get_possible_cpumask(); - online = scx_bpf_get_online_cpumask(); - - idx = 0; - bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) { - if (!(p = MEMBER_VPTR(buf, [idx++]))) - break; - if (bpf_cpumask_test_cpu(cpu, online)) - *p++ = 'O'; - else if (bpf_cpumask_test_cpu(cpu, possible)) - *p++ = 'X'; - else - *p++ = ' '; - - if ((cpu & 7) == 7) { - if (!(p = MEMBER_VPTR(buf, [idx++]))) - break; - *p++ = '|'; - } - } - buf[sizeof(buf) - 1] = '\0'; - - scx_bpf_put_cpumask(online); - scx_bpf_put_cpumask(possible); - - bpf_printk("CPUS: |%s", buf); + QMAP_TOUCH_ARENA(); + if (idle) + cmask_set(cid, qa_idle_cids); + else + cmask_clear(cid, qa_idle_cids); } -void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu) +void BPF_STRUCT_OPS(qmap_set_cmask, struct task_struct *p, + const struct scx_cmask *cmask_in) { - if (print_msgs) { - bpf_printk("CPU %d coming online", cpu); - /* @cpu is already online at this point */ - print_cpus(); - } -} + struct scx_cmask __arena *cmask = (struct scx_cmask __arena *)(long)cmask_in; + task_ctx_t *taskc; -void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu) -{ - if (print_msgs) { - bpf_printk("CPU %d going offline", cpu); - /* @cpu is still online at this point */ - print_cpus(); - } + taskc = lookup_task_ctx(p); + if (!taskc) + return; + cmask_copy(&taskc->cpus_allowed, cmask); } struct monitor_timer { @@ -785,64 +942,49 @@ struct { } monitor_timer SEC(".maps"); /* - * Print out the min, avg and max performance levels of CPUs every second to 
- * demonstrate the cpuperf interface. + * Aggregate cidperf across the first nr_online_cids cids. Post-hotplug + * the first-N-are-online invariant drifts, so some cap/cur values may + * be stale. For this demo monitor that's fine; the scheduler exits on + * the enable-time hotplug_seq mismatch and userspace restarts, which + * rebuilds the layout. */ static void monitor_cpuperf(void) { - u32 zero = 0, nr_cpu_ids; + u32 nr_online = scx_bpf_nr_online_cids(); u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0; u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0; - const struct cpumask *online; - int i, nr_online_cpus = 0; - - nr_cpu_ids = scx_bpf_nr_cpu_ids(); - online = scx_bpf_get_online_cpumask(); + s32 cid; - bpf_for(i, 0, nr_cpu_ids) { - struct cpu_ctx *cpuc; - u32 cap, cur; + QMAP_TOUCH_ARENA(); - if (!bpf_cpumask_test_cpu(i, online)) - continue; - nr_online_cpus++; - - /* collect the capacity and current cpuperf */ - cap = scx_bpf_cpuperf_cap(i); - cur = scx_bpf_cpuperf_cur(i); + bpf_for(cid, 0, nr_online) { + struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[cid]; + u32 cap = scx_bpf_cidperf_cap(cid); + u32 cur = scx_bpf_cidperf_cur(cid); + u32 target; cur_min = cur < cur_min ? cur : cur_min; cur_max = cur > cur_max ? cur : cur_max; - /* - * $cur is relative to $cap. Scale it down accordingly so that - * it's in the same scale as other CPUs and $cur_sum/$cap_sum - * makes sense. - */ - cur_sum += cur * cap / SCX_CPUPERF_ONE; + cur_sum += (u64)cur * cap / SCX_CPUPERF_ONE; cap_sum += cap; - if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) { - scx_bpf_error("failed to look up cpu_ctx"); - goto out; - } - - /* collect target */ - cur = cpuc->cpuperf_target; - target_sum += cur; - target_min = cur < target_min ? cur : target_min; - target_max = cur > target_max ? cur : target_max; + target = cpuc->cpuperf_target; + target_sum += target; + target_min = target < target_min ? 
target : target_min; + target_max = target > target_max ? target : target_max; } - cpuperf_min = cur_min; - cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum; - cpuperf_max = cur_max; + if (!nr_online || !cap_sum) + return; + + qa.cpuperf_min = cur_min; + qa.cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum; + qa.cpuperf_max = cur_max; - cpuperf_target_min = target_min; - cpuperf_target_avg = target_sum / nr_online_cpus; - cpuperf_target_max = target_max; -out: - scx_bpf_put_cpumask(online); + qa.cpuperf_target_min = target_min; + qa.cpuperf_target_avg = target_sum / nr_online; + qa.cpuperf_target_max = target_max; } /* @@ -927,12 +1069,76 @@ static int lowpri_timerfn(void *map, int *key, struct bpf_timer *timer) s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) { - u32 key = 0; + u8 __arena *slab; + u32 nr_pages, key = 0, i; + u32 nr_cids, nr_cpu_ids; struct bpf_timer *timer; s32 ret; - if (print_msgs && !sub_cgroup_id) - print_cpus(); + nr_cids = scx_bpf_nr_cids(); + nr_cpu_ids = scx_bpf_nr_cpu_ids(); + + if (nr_cids > SCX_QMAP_MAX_CPUS) { + scx_bpf_error("nr_cids=%u exceeds SCX_QMAP_MAX_CPUS=%d", + nr_cids, SCX_QMAP_MAX_CPUS); + return -EINVAL; + } + if (nr_cpu_ids > SCX_QMAP_MAX_CPUS) { + scx_bpf_error("nr_cpu_ids=%u exceeds SCX_QMAP_MAX_CPUS=%d", + nr_cpu_ids, SCX_QMAP_MAX_CPUS); + return -EINVAL; + } + + /* + * cid-override test hook. Must run before anything that reads the + * cid space (scx_bpf_nr_cids, cmask_init, etc.). On invalid input, + * the kfunc calls scx_error() which aborts the scheduler. + */ + if (cid_override_mode) { + scx_bpf_cid_override((const s32 *)cid_override_cpu_to_cid, + nr_cpu_ids * sizeof(s32)); + } + + /* + * Allocate the task_ctx slab in arena and thread the entire slab onto + * the free list. max_tasks is set by userspace before load. Each entry + * is TASK_CTX_STRIDE bytes - task_ctx's trailing cpus_allowed flex + * array extends into the stride tail. 
+ */ + if (!max_tasks) { + scx_bpf_error("max_tasks must be > 0"); + return -EINVAL; + } + + nr_pages = (max_tasks * TASK_CTX_STRIDE + PAGE_SIZE - 1) / PAGE_SIZE; + slab = bpf_arena_alloc_pages(&arena, NULL, nr_pages, NUMA_NO_NODE, 0); + if (!slab) { + scx_bpf_error("failed to allocate task_ctx slab"); + return -ENOMEM; + } + qa.task_ctxs = (task_ctx_t *)slab; + + bpf_for(i, 0, 5) + qa.fifos[i].idx = i; + + bpf_for(i, 0, max_tasks) { + task_ctx_t *cur = (task_ctx_t *)(slab + i * TASK_CTX_STRIDE); + task_ctx_t *next = (i + 1 < max_tasks) ? + (task_ctx_t *)(slab + (i + 1) * TASK_CTX_STRIDE) : NULL; + cur->next_free = next; + } + qa.task_free_head = (task_ctx_t *)slab; + + /* + * Allocate and initialize the idle cmask. Starts empty - update_idle + * fills it as cpus enter idle. + */ + qa_idle_cids = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!qa_idle_cids) { + scx_bpf_error("failed to allocate idle cmask"); + return -ENOMEM; + } + cmask_init(qa_idle_cids, 0, nr_cids); ret = scx_bpf_create_dsq(SHARED_DSQ, -1); if (ret) { @@ -984,8 +1190,8 @@ s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args) s32 i; for (i = 0; i < MAX_SUB_SCHEDS; i++) { - if (!sub_sched_cgroup_ids[i]) { - sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id; + if (!qa.sub_sched_cgroup_ids[i]) { + qa.sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id; bpf_printk("attaching sub-sched[%d] on %s", i, args->cgroup_path); return 0; @@ -1000,8 +1206,8 @@ void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args) s32 i; for (i = 0; i < MAX_SUB_SCHEDS; i++) { - if (sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) { - sub_sched_cgroup_ids[i] = 0; + if (qa.sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) { + qa.sub_sched_cgroup_ids[i] = 0; bpf_printk("detaching sub-sched[%d] on %s", i, args->cgroup_path); break; @@ -1009,24 +1215,26 @@ void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args) } } -SCX_OPS_DEFINE(qmap_ops, - .select_cpu = (void 
*)qmap_select_cpu, +SCX_OPS_CID_DEFINE(qmap_ops, + .flags = SCX_OPS_ENQ_EXITING | SCX_OPS_TID_TO_TASK, + .select_cid = (void *)qmap_select_cid, .enqueue = (void *)qmap_enqueue, .dequeue = (void *)qmap_dequeue, .dispatch = (void *)qmap_dispatch, .tick = (void *)qmap_tick, .core_sched_before = (void *)qmap_core_sched_before, + .set_cmask = (void *)qmap_set_cmask, + .update_idle = (void *)qmap_update_idle, .init_task = (void *)qmap_init_task, + .exit_task = (void *)qmap_exit_task, .dump = (void *)qmap_dump, - .dump_cpu = (void *)qmap_dump_cpu, + .dump_cid = (void *)qmap_dump_cid, .dump_task = (void *)qmap_dump_task, .cgroup_init = (void *)qmap_cgroup_init, .cgroup_set_weight = (void *)qmap_cgroup_set_weight, .cgroup_set_bandwidth = (void *)qmap_cgroup_set_bandwidth, .sub_attach = (void *)qmap_sub_attach, .sub_detach = (void *)qmap_sub_detach, - .cpu_online = (void *)qmap_cpu_online, - .cpu_offline = (void *)qmap_cpu_offline, .init = (void *)qmap_init, .exit = (void *)qmap_exit, .timeout_ms = 5000U, diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index e7c89a2bc3d8..67ddd483a4c7 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -10,9 +10,11 @@ #include <inttypes.h> #include <signal.h> #include <libgen.h> +#include <sys/mman.h> #include <sys/stat.h> #include <bpf/bpf.h> #include <scx/common.h> +#include "scx_qmap.h" #include "scx_qmap.bpf.skel.h" const char help_fmt[] = @@ -21,23 +23,27 @@ const char help_fmt[] = "See the top-level comment in .bpf.c for more details.\n" "\n" "Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n" -" [-P] [-M] [-H] [-d PID] [-D LEN] [-S] [-p] [-I] [-F COUNT] [-v]\n" +" [-N COUNT] [-P] [-M] [-H] [-c CG_PATH] [-d PID] [-D LEN] [-S] [-p] [-I]\n" +" [-F COUNT] [-v]\n" "\n" " -s SLICE_US Override slice duration\n" " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" " -t COUNT Stall every COUNT'th user thread\n" " -T COUNT Stall every COUNT'th kernel thread\n" +" 
-N COUNT Size of the task_ctx arena slab (default 16384)\n" " -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n" " -b COUNT Dispatch upto COUNT tasks together\n" " -P Print out DSQ content and event counters to trace_pipe every second\n" " -M Print out debug messages to trace_pipe\n" " -H Boost nice -20 tasks in SHARED_DSQ, use with -b\n" +" -c CG_PATH Cgroup path to attach as sub-scheduler, must run parent scheduler first\n" " -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" " -D LEN Set scx_exit_info.dump buffer length\n" " -S Suppress qmap-specific debug dump\n" " -p Switch only tasks on SCHED_EXT policy instead of all\n" " -I Turn on SCX_OPS_ALWAYS_ENQ_IMMED\n" " -F COUNT IMMED stress: force every COUNT'th enqueue to a busy local DSQ (use with -I)\n" +" -C MODE cid-override test (shuffle|bad-dup|bad-range)\n" " -v Print libbpf debug messages\n" " -h Display this help and exit\n"; @@ -60,23 +66,36 @@ int main(int argc, char **argv) { struct scx_qmap *skel; struct bpf_link *link; + struct qmap_arena *qa; + __u32 test_error_cnt = 0; + __u64 ecode; int opt; libbpf_set_print(libbpf_print_fn); signal(SIGINT, sigint_handler); signal(SIGTERM, sigint_handler); + if (libbpf_num_possible_cpus() > SCX_QMAP_MAX_CPUS) { + fprintf(stderr, + "scx_qmap: %d possible CPUs exceeds compile-time cap %d; " + "rebuild with larger SCX_QMAP_MAX_CPUS\n", + libbpf_num_possible_cpus(), SCX_QMAP_MAX_CPUS); + return 1; + } +restart: + optind = 1; skel = SCX_OPS_OPEN(qmap_ops, scx_qmap); skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); + skel->rodata->max_tasks = 16384; - while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PMHc:d:D:SpIF:vh")) != -1) { + while ((opt = getopt(argc, argv, "s:e:t:T:l:b:N:PMHc:d:D:SpIF:C:vh")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; break; case 'e': - skel->bss->test_error_cnt = strtoul(optarg, NULL, 0); + test_error_cnt = strtoul(optarg, 
NULL, 0); break; case 't': skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0); @@ -90,6 +109,9 @@ int main(int argc, char **argv) case 'b': skel->rodata->dsp_batch = strtoul(optarg, NULL, 0); break; + case 'N': + skel->rodata->max_tasks = strtoul(optarg, NULL, 0); + break; case 'P': skel->rodata->print_dsqs_and_events = true; break; @@ -130,6 +152,35 @@ int main(int argc, char **argv) case 'F': skel->rodata->immed_stress_nth = strtoul(optarg, NULL, 0); break; + case 'C': { + u32 nr_cpus = libbpf_num_possible_cpus(); + u32 mode, i; + + if (!strcmp(optarg, "shuffle")) + mode = 1; + else if (!strcmp(optarg, "bad-dup")) + mode = 2; + else if (!strcmp(optarg, "bad-range")) + mode = 3; + else { + fprintf(stderr, "unknown cid-override mode '%s'\n", optarg); + return 1; + } + skel->rodata->cid_override_mode = mode; + + /* shuffle: reversed cpu_to_cid, bad-dup: dup cid 0, bad-range: identity */ + for (i = 0; i < nr_cpus; i++) { + if (mode == 1) + skel->bss->cid_override_cpu_to_cid[i] = nr_cpus - 1 - i; + else + skel->bss->cid_override_cpu_to_cid[i] = i; + } + if (mode == 2 && nr_cpus >= 2) + skel->bss->cid_override_cpu_to_cid[1] = 0; + if (mode == 3) + skel->bss->cid_override_cpu_to_cid[0] = (s32)nr_cpus; + break; + } case 'v': verbose = true; break; @@ -142,39 +193,41 @@ int main(int argc, char **argv) SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei); link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap); + qa = &skel->arena->qa; + qa->test_error_cnt = test_error_cnt; + while (!exit_req && !UEI_EXITED(skel, uei)) { - long nr_enqueued = skel->bss->nr_enqueued; - long nr_dispatched = skel->bss->nr_dispatched; + long nr_enqueued = qa->nr_enqueued; + long nr_dispatched = qa->nr_dispatched; - printf("stats : enq=%lu dsp=%lu delta=%ld reenq/cpu0=%"PRIu64"/%"PRIu64" deq=%"PRIu64" core=%"PRIu64" enq_ddsp=%"PRIu64"\n", + printf("stats : enq=%lu dsp=%lu delta=%ld reenq/cid0=%llu/%llu deq=%llu core=%llu enq_ddsp=%llu\n", nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched, - 
skel->bss->nr_reenqueued, skel->bss->nr_reenqueued_cpu0, - skel->bss->nr_dequeued, - skel->bss->nr_core_sched_execed, - skel->bss->nr_ddsp_from_enq); - printf(" exp_local=%"PRIu64" exp_remote=%"PRIu64" exp_timer=%"PRIu64" exp_lost=%"PRIu64"\n", - skel->bss->nr_expedited_local, - skel->bss->nr_expedited_remote, - skel->bss->nr_expedited_from_timer, - skel->bss->nr_expedited_lost); - if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur")) + qa->nr_reenqueued, qa->nr_reenqueued_cid0, + qa->nr_dequeued, + qa->nr_core_sched_execed, + qa->nr_ddsp_from_enq); + printf(" exp_local=%llu exp_remote=%llu exp_timer=%llu exp_lost=%llu\n", + qa->nr_expedited_local, + qa->nr_expedited_remote, + qa->nr_expedited_from_timer, + qa->nr_expedited_lost); + if (__COMPAT_has_ksym("scx_bpf_cidperf_cur")) printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n", - skel->bss->cpuperf_min, - skel->bss->cpuperf_avg, - skel->bss->cpuperf_max, - skel->bss->cpuperf_target_min, - skel->bss->cpuperf_target_avg, - skel->bss->cpuperf_target_max); + qa->cpuperf_min, + qa->cpuperf_avg, + qa->cpuperf_max, + qa->cpuperf_target_min, + qa->cpuperf_target_avg, + qa->cpuperf_target_max); fflush(stdout); sleep(1); } bpf_link__destroy(link); - UEI_REPORT(skel, uei); + ecode = UEI_REPORT(skel, uei); scx_qmap__destroy(skel); - /* - * scx_qmap implements ops.cpu_on/offline() and doesn't need to restart - * on CPU hotplug events. - */ + + if (UEI_ECODE_RESTART(ecode)) + goto restart; return 0; } diff --git a/tools/sched_ext/scx_qmap.h b/tools/sched_ext/scx_qmap.h new file mode 100644 index 000000000000..d15a705d5ac5 --- /dev/null +++ b/tools/sched_ext/scx_qmap.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared definitions between scx_qmap.bpf.c and scx_qmap.c. + * + * The scheduler keeps all state in a single BPF arena map. struct + * qmap_arena is the one object that lives at the base of the arena and is + * mmap'd into userspace so the loader can read counters directly. 
+ * + * Copyright (c) 2026 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2026 Tejun Heo <tj@kernel.org> + */ +#ifndef __SCX_QMAP_H +#define __SCX_QMAP_H + +#ifdef __BPF__ +#include <scx/bpf_arena_common.bpf.h> +#else +#include <linux/types.h> +#include <scx/bpf_arena_common.h> +#endif + +#define MAX_SUB_SCHEDS 8 + +/* + * cpu_ctxs[] is sized to a fixed cap so the layout is shared between BPF and + * userspace. Keep this in sync with NR_CPUS used by the BPF side. + */ +#define SCX_QMAP_MAX_CPUS 1024 + +struct cpu_ctx { + __u64 dsp_idx; /* dispatch index */ + __u64 dsp_cnt; /* remaining count */ + __u32 avg_weight; + __u32 cpuperf_target; +}; + +/* Opaque to userspace; defined in scx_qmap.bpf.c. */ +struct task_ctx; + +struct qmap_fifo { + struct task_ctx __arena *head; + struct task_ctx __arena *tail; + __s32 idx; +}; + +struct qmap_arena { + /* userspace-visible stats */ + __u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cid0; + __u64 nr_dequeued, nr_ddsp_from_enq; + __u64 nr_core_sched_execed; + __u64 nr_expedited_local, nr_expedited_remote; + __u64 nr_expedited_lost, nr_expedited_from_timer; + __u64 nr_highpri_queued; + __u32 test_error_cnt; + __u32 cpuperf_min, cpuperf_avg, cpuperf_max; + __u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max; + + /* kernel-side runtime state */ + __u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS]; + __u64 core_sched_head_seqs[5]; + __u64 core_sched_tail_seqs[5]; + + struct cpu_ctx cpu_ctxs[SCX_QMAP_MAX_CPUS]; + + /* task_ctx slab; allocated and threaded by qmap_init() */ + struct task_ctx __arena *task_ctxs; + struct task_ctx __arena *task_free_head; + + /* five priority FIFOs, each a doubly-linked list through task_ctx */ + struct qmap_fifo fifos[5]; +}; + +#endif /* __SCX_QMAP_H */ diff --git a/tools/testing/selftests/sched_ext/non_scx_kfunc_deny.bpf.c b/tools/testing/selftests/sched_ext/non_scx_kfunc_deny.bpf.c index 9f16d39255e7..0d6fcc8e5eb6 100644 --- 
a/tools/testing/selftests/sched_ext/non_scx_kfunc_deny.bpf.c +++ b/tools/testing/selftests/sched_ext/non_scx_kfunc_deny.bpf.c @@ -9,12 +9,7 @@ * Copyright (C) 2026 Cheng-Yang Chou <yphbchou0911@gmail.com> */ -#include <vmlinux.h> -#include <bpf/bpf_helpers.h> -#include <bpf/bpf_tracing.h> - -/* SCX kfunc from scx_kfunc_ids_any set */ -void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; +#include <scx/common.bpf.h> SEC("struct_ops/ssthresh") __u32 BPF_PROG(tcp_ca_ssthresh, struct sock *sk) diff --git a/tools/testing/selftests/sched_ext/peek_dsq.bpf.c b/tools/testing/selftests/sched_ext/peek_dsq.bpf.c index 7f23fb17b1e0..9e802b52b29e 100644 --- a/tools/testing/selftests/sched_ext/peek_dsq.bpf.c +++ b/tools/testing/selftests/sched_ext/peek_dsq.bpf.c @@ -95,7 +95,7 @@ static int scan_dsq_pool(void) record_peek_result(task->pid); /* Try to move this task to local */ - if (!moved && scx_bpf_dsq_move_to_local(dsq_id, 0) == 0) { + if (!moved && scx_bpf_dsq_move_to_local(dsq_id, 0)) { moved = 1; break; } diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.c index 5b6e045e1109..7e342c0cec65 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dfl.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.c @@ -6,6 +6,7 @@ */ #include <bpf/bpf.h> #include <scx/common.h> +#include <stdlib.h> #include <sys/wait.h> #include <unistd.h> #include "select_cpu_dfl.bpf.skel.h" @@ -13,29 +14,44 @@ #define NUM_CHILDREN 1028 +struct select_cpu_dfl_ctx { + struct select_cpu_dfl *skel; + struct bpf_link *link; +}; + static enum scx_test_status setup(void **ctx) { - struct select_cpu_dfl *skel; + struct select_cpu_dfl_ctx *tctx; + + tctx = malloc(sizeof(*tctx)); + SCX_FAIL_IF(!tctx, "Failed to allocate test context"); + tctx->link = NULL; - skel = select_cpu_dfl__open(); - SCX_FAIL_IF(!skel, "Failed to open"); - SCX_ENUM_INIT(skel); - SCX_FAIL_IF(select_cpu_dfl__load(skel), "Failed to load skel"); + tctx->skel = 
select_cpu_dfl__open(); + if (!tctx->skel) { + free(tctx); + SCX_FAIL("Failed to open"); + } + SCX_ENUM_INIT(tctx->skel); + if (select_cpu_dfl__load(tctx->skel)) { + select_cpu_dfl__destroy(tctx->skel); + free(tctx); + SCX_FAIL("Failed to load skel"); + } - *ctx = skel; + *ctx = tctx; return SCX_TEST_PASS; } static enum scx_test_status run(void *ctx) { - struct select_cpu_dfl *skel = ctx; - struct bpf_link *link; + struct select_cpu_dfl_ctx *tctx = ctx; pid_t pids[NUM_CHILDREN]; - int i, status; + int i, status, nforked = 0; - link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_ops); - SCX_FAIL_IF(!link, "Failed to attach scheduler"); + tctx->link = bpf_map__attach_struct_ops(tctx->skel->maps.select_cpu_dfl_ops); + SCX_FAIL_IF(!tctx->link, "Failed to attach scheduler"); for (i = 0; i < NUM_CHILDREN; i++) { pids[i] = fork(); @@ -43,25 +59,31 @@ static enum scx_test_status run(void *ctx) sleep(1); exit(0); } + if (pids[i] > 0) + nforked++; } for (i = 0; i < NUM_CHILDREN; i++) { + if (pids[i] <= 0) + continue; SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); SCX_EQ(status, 0); } - SCX_ASSERT(!skel->bss->saw_local); - - bpf_link__destroy(link); + SCX_GT(nforked, 0); + SCX_ASSERT(!tctx->skel->bss->saw_local); return SCX_TEST_PASS; } static void cleanup(void *ctx) { - struct select_cpu_dfl *skel = ctx; + struct select_cpu_dfl_ctx *tctx = ctx; - select_cpu_dfl__destroy(skel); + if (tctx->link) + bpf_link__destroy(tctx->link); + select_cpu_dfl__destroy(tctx->skel); + free(tctx); } struct scx_test select_cpu_dfl = { |
