diff options
Diffstat (limited to 'tools/testing/selftests')
-rw-r--r--  tools/testing/selftests/sched_ext/Makefile      |   1
-rw-r--r--  tools/testing/selftests/sched_ext/dequeue.bpf.c | 389
-rw-r--r--  tools/testing/selftests/sched_ext/dequeue.c     | 274
3 files changed, 664 insertions, 0 deletions
diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile index 2c601a7eaff5..2815a875bde2 100644 --- a/tools/testing/selftests/sched_ext/Makefile +++ b/tools/testing/selftests/sched_ext/Makefile @@ -161,6 +161,7 @@ all_test_bpfprogs := $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$(patsubs auto-test-targets := \ create_dsq \ + dequeue \ enq_last_no_enq_fails \ ddsp_bogus_dsq_fail \ ddsp_vtimelocal_fail \ diff --git a/tools/testing/selftests/sched_ext/dequeue.bpf.c b/tools/testing/selftests/sched_ext/dequeue.bpf.c new file mode 100644 index 000000000000..597b88563d7d --- /dev/null +++ b/tools/testing/selftests/sched_ext/dequeue.bpf.c @@ -0,0 +1,389 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A scheduler that validates ops.dequeue() is called correctly: + * - Tasks dispatched to terminal DSQs (local, global) bypass the BPF + * scheduler entirely: no ops.dequeue() should be called + * - Tasks dispatched to user DSQs from ops.enqueue() enter BPF custody: + * ops.dequeue() must be called when they leave custody + * - Every ops.enqueue() dispatch to non-terminal DSQs is followed by + * exactly one ops.dequeue() (validate 1:1 pairing and state machine) + * + * Copyright (c) 2026 NVIDIA Corporation. + */ + +#include <scx/common.bpf.h> + +#define SHARED_DSQ 0 + +/* + * BPF internal queue. + * + * Tasks are stored here and consumed from ops.dispatch(), validating that + * tasks on BPF internal structures still get ops.dequeue() when they + * leave. 
+ */ +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, 32768); + __type(value, s32); +} global_queue SEC(".maps"); + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +/* + * Counters to track the lifecycle of tasks: + * - enqueue_cnt: Number of times ops.enqueue() was called + * - dequeue_cnt: Number of times ops.dequeue() was called (any type) + * - dispatch_dequeue_cnt: Number of regular dispatch dequeues (no flag) + * - change_dequeue_cnt: Number of property change dequeues + * - bpf_queue_full: Number of times the BPF internal queue was full + */ +u64 enqueue_cnt, dequeue_cnt, dispatch_dequeue_cnt, change_dequeue_cnt, bpf_queue_full; + +/* + * Test scenarios: + * 0) Dispatch to local DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF + * scheduler, no dequeue callbacks) + * 1) Dispatch to global DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF + * scheduler, no dequeue callbacks) + * 2) Dispatch to shared user DSQ from ops.select_cpu() (enters BPF scheduler, + * dequeue callbacks expected) + * 3) Dispatch to local DSQ from ops.enqueue() (terminal DSQ, bypasses BPF + * scheduler, no dequeue callbacks) + * 4) Dispatch to global DSQ from ops.enqueue() (terminal DSQ, bypasses BPF + * scheduler, no dequeue callbacks) + * 5) Dispatch to shared user DSQ from ops.enqueue() (enters BPF scheduler, + * dequeue callbacks expected) + * 6) BPF internal queue from ops.enqueue(): store task PIDs in ops.enqueue(), + * consume in ops.dispatch() and dispatch to local DSQ (validates dequeue + * for tasks stored in internal BPF data structures) + */ +u32 test_scenario; + +/* + * Per-task state to track lifecycle and validate workflow semantics. 
+ * State transitions: + * NONE -> ENQUEUED (on enqueue) + * NONE -> DISPATCHED (on direct dispatch to terminal DSQ) + * ENQUEUED -> DISPATCHED (on dispatch dequeue) + * DISPATCHED -> NONE (on property change dequeue or re-enqueue) + * ENQUEUED -> NONE (on property change dequeue before dispatch) + */ +enum task_state { + TASK_NONE = 0, + TASK_ENQUEUED, + TASK_DISPATCHED, +}; + +struct task_ctx { + enum task_state state; /* Current state in the workflow */ + u64 enqueue_seq; /* Sequence number for debugging */ +}; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct task_ctx); +} task_ctx_stor SEC(".maps"); + +static struct task_ctx *try_lookup_task_ctx(struct task_struct *p) +{ + return bpf_task_storage_get(&task_ctx_stor, p, 0, 0); +} + +s32 BPF_STRUCT_OPS(dequeue_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + struct task_ctx *tctx; + + tctx = try_lookup_task_ctx(p); + if (!tctx) + return prev_cpu; + + switch (test_scenario) { + case 0: + /* + * Direct dispatch to the local DSQ. + * + * Task bypasses BPF scheduler entirely: no enqueue + * tracking, no ops.dequeue() callbacks. + */ + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); + tctx->state = TASK_DISPATCHED; + break; + case 1: + /* + * Direct dispatch to the global DSQ. + * + * Task bypasses BPF scheduler entirely: no enqueue + * tracking, no ops.dequeue() callbacks. + */ + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + tctx->state = TASK_DISPATCHED; + break; + case 2: + /* + * Dispatch to a shared user DSQ. + * + * Task enters BPF scheduler management: track + * enqueue/dequeue lifecycle and validate state + * transitions. 
+ */ + if (tctx->state == TASK_ENQUEUED) + scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu", + p->pid, p->comm, tctx->enqueue_seq); + + scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, 0); + + __sync_fetch_and_add(&enqueue_cnt, 1); + + tctx->state = TASK_ENQUEUED; + tctx->enqueue_seq++; + break; + } + + return prev_cpu; +} + +void BPF_STRUCT_OPS(dequeue_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct task_ctx *tctx; + s32 pid = p->pid; + + tctx = try_lookup_task_ctx(p); + if (!tctx) + return; + + switch (test_scenario) { + case 3: + /* + * Direct dispatch to the local DSQ. + * + * Task bypasses BPF scheduler entirely: no enqueue + * tracking, no ops.dequeue() callbacks. + */ + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); + tctx->state = TASK_DISPATCHED; + break; + case 4: + /* + * Direct dispatch to the global DSQ. + * + * Task bypasses BPF scheduler entirely: no enqueue + * tracking, no ops.dequeue() callbacks. + */ + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + tctx->state = TASK_DISPATCHED; + break; + case 5: + /* + * Dispatch to shared user DSQ. + * + * Task enters BPF scheduler management: track + * enqueue/dequeue lifecycle and validate state + * transitions. + */ + if (tctx->state == TASK_ENQUEUED) + scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu", + p->pid, p->comm, tctx->enqueue_seq); + + scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags); + + __sync_fetch_and_add(&enqueue_cnt, 1); + + tctx->state = TASK_ENQUEUED; + tctx->enqueue_seq++; + break; + case 6: + /* + * Store task in BPF internal queue. + * + * Task enters BPF scheduler management: track + * enqueue/dequeue lifecycle and validate state + * transitions. 
+ */ + if (tctx->state == TASK_ENQUEUED) + scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu", + p->pid, p->comm, tctx->enqueue_seq); + + if (bpf_map_push_elem(&global_queue, &pid, 0)) { + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + __sync_fetch_and_add(&bpf_queue_full, 1); + + tctx->state = TASK_DISPATCHED; + } else { + __sync_fetch_and_add(&enqueue_cnt, 1); + + tctx->state = TASK_ENQUEUED; + tctx->enqueue_seq++; + } + break; + default: + /* For all other scenarios, dispatch to the global DSQ */ + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + tctx->state = TASK_DISPATCHED; + break; + } + + scx_bpf_kick_cpu(scx_bpf_task_cpu(p), SCX_KICK_IDLE); +} + +void BPF_STRUCT_OPS(dequeue_dequeue, struct task_struct *p, u64 deq_flags) +{ + struct task_ctx *tctx; + + __sync_fetch_and_add(&dequeue_cnt, 1); + + tctx = try_lookup_task_ctx(p); + if (!tctx) + return; + + /* + * For scenarios 0, 1, 3, and 4 (terminal DSQs: local and global), + * ops.dequeue() should never be called because tasks bypass the + * BPF scheduler entirely. If we get here, it's a kernel bug. + */ + if (test_scenario == 0 || test_scenario == 3) { + scx_bpf_error("%d (%s): dequeue called for local DSQ scenario", + p->pid, p->comm); + return; + } + + if (test_scenario == 1 || test_scenario == 4) { + scx_bpf_error("%d (%s): dequeue called for global DSQ scenario", + p->pid, p->comm); + return; + } + + if (deq_flags & SCX_DEQ_SCHED_CHANGE) { + /* + * Property change interrupting the workflow. Valid from + * both ENQUEUED and DISPATCHED states. Transitions task + * back to NONE state. + */ + __sync_fetch_and_add(&change_dequeue_cnt, 1); + + /* Validate state transition */ + if (tctx->state != TASK_ENQUEUED && tctx->state != TASK_DISPATCHED) + scx_bpf_error("%d (%s): invalid property change dequeue state=%d seq=%llu", + p->pid, p->comm, tctx->state, tctx->enqueue_seq); + + /* + * Transition back to NONE: task outside scheduler control. 
+ * + * Scenario 6: dispatch() checks tctx->state after popping a + * PID, if the task is in state NONE, it was dequeued by + * property change and must not be dispatched (this + * prevents "target CPU not allowed"). + */ + tctx->state = TASK_NONE; + } else { + /* + * Regular dispatch dequeue: kernel is moving the task from + * BPF custody to a terminal DSQ. Normally we come from + * ENQUEUED state. We can also see TASK_NONE if the task + * was dequeued by property change (SCX_DEQ_SCHED_CHANGE) + * while it was already on a DSQ (dispatched but not yet + * consumed); in that case we just leave state as NONE. + */ + __sync_fetch_and_add(&dispatch_dequeue_cnt, 1); + + /* + * Must be ENQUEUED (normal path) or NONE (already dequeued + * by property change while on a DSQ). + */ + if (tctx->state != TASK_ENQUEUED && tctx->state != TASK_NONE) + scx_bpf_error("%d (%s): dispatch dequeue from state %d seq=%llu", + p->pid, p->comm, tctx->state, tctx->enqueue_seq); + + if (tctx->state == TASK_ENQUEUED) + tctx->state = TASK_DISPATCHED; + + /* NONE: leave as-is, task was already property-change dequeued */ + } +} + +void BPF_STRUCT_OPS(dequeue_dispatch, s32 cpu, struct task_struct *prev) +{ + if (test_scenario == 6) { + struct task_ctx *tctx; + struct task_struct *p; + s32 pid; + + if (bpf_map_pop_elem(&global_queue, &pid)) + return; + + p = bpf_task_from_pid(pid); + if (!p) + return; + + /* + * If the task was dequeued by property change + * (ops.dequeue() set tctx->state = TASK_NONE), skip + * dispatch. + */ + tctx = try_lookup_task_ctx(p); + if (!tctx || tctx->state == TASK_NONE) { + bpf_task_release(p); + return; + } + + /* + * Dispatch to this CPU's local DSQ if allowed, otherwise + * fallback to the global DSQ. 
+ */ + if (bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0); + else + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + + bpf_task_release(p); + } else { + scx_bpf_dsq_move_to_local(SHARED_DSQ); + } +} + +s32 BPF_STRUCT_OPS(dequeue_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + struct task_ctx *tctx; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!tctx) + return -ENOMEM; + + return 0; +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(dequeue_init) +{ + s32 ret; + + ret = scx_bpf_create_dsq(SHARED_DSQ, -1); + if (ret) + return ret; + + return 0; +} + +void BPF_STRUCT_OPS(dequeue_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops dequeue_ops = { + .select_cpu = (void *)dequeue_select_cpu, + .enqueue = (void *)dequeue_enqueue, + .dequeue = (void *)dequeue_dequeue, + .dispatch = (void *)dequeue_dispatch, + .init_task = (void *)dequeue_init_task, + .init = (void *)dequeue_init, + .exit = (void *)dequeue_exit, + .flags = SCX_OPS_ENQ_LAST, + .name = "dequeue_test", +}; diff --git a/tools/testing/selftests/sched_ext/dequeue.c b/tools/testing/selftests/sched_ext/dequeue.c new file mode 100644 index 000000000000..4e93262703ca --- /dev/null +++ b/tools/testing/selftests/sched_ext/dequeue.c @@ -0,0 +1,274 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2025 NVIDIA Corporation. + */ +#define _GNU_SOURCE +#include <stdio.h> +#include <unistd.h> +#include <signal.h> +#include <time.h> +#include <bpf/bpf.h> +#include <scx/common.h> +#include <sys/wait.h> +#include <sched.h> +#include <pthread.h> +#include "scx_test.h" +#include "dequeue.bpf.skel.h" + +#define NUM_WORKERS 8 +#define AFFINITY_HAMMER_MS 500 + +/* + * Worker function that creates enqueue/dequeue events via CPU work and + * sleep. 
+ */ +static void worker_fn(int id) +{ + int i; + volatile int sum = 0; + + for (i = 0; i < 1000; i++) { + volatile int j; + + /* Do some work to trigger scheduling events */ + for (j = 0; j < 10000; j++) + sum += j; + + /* Sleep to trigger dequeue */ + usleep(1000 + (id * 100)); + } + + exit(0); +} + +/* + * This thread changes workers' affinity from outside so that some changes + * hit tasks while they are still in the scheduler's queue and trigger + * property-change dequeues. + */ +static void *affinity_hammer_fn(void *arg) +{ + pid_t *pids = arg; + cpu_set_t cpuset; + int i = 0, n = NUM_WORKERS; + struct timespec start, now; + + clock_gettime(CLOCK_MONOTONIC, &start); + while (1) { + int w = i % n; + int cpu = (i / n) % 4; + + CPU_ZERO(&cpuset); + CPU_SET(cpu, &cpuset); + sched_setaffinity(pids[w], sizeof(cpuset), &cpuset); + i++; + + /* Check elapsed time every 256 iterations to limit gettime cost */ + if ((i & 255) == 0) { + long long elapsed_ms; + + clock_gettime(CLOCK_MONOTONIC, &now); + elapsed_ms = (now.tv_sec - start.tv_sec) * 1000LL + + (now.tv_nsec - start.tv_nsec) / 1000000; + if (elapsed_ms >= AFFINITY_HAMMER_MS) + break; + } + } + return NULL; +} + +static enum scx_test_status run_scenario(struct dequeue *skel, u32 scenario, + const char *scenario_name) +{ + struct bpf_link *link; + pid_t pids[NUM_WORKERS]; + pthread_t hammer; + + int i, status; + u64 enq_start, deq_start, + dispatch_deq_start, change_deq_start, bpf_queue_full_start; + u64 enq_delta, deq_delta, + dispatch_deq_delta, change_deq_delta, bpf_queue_full_delta; + + /* Set the test scenario */ + skel->bss->test_scenario = scenario; + + /* Record starting counts */ + enq_start = skel->bss->enqueue_cnt; + deq_start = skel->bss->dequeue_cnt; + dispatch_deq_start = skel->bss->dispatch_dequeue_cnt; + change_deq_start = skel->bss->change_dequeue_cnt; + bpf_queue_full_start = skel->bss->bpf_queue_full; + + link = bpf_map__attach_struct_ops(skel->maps.dequeue_ops); + SCX_FAIL_IF(!link, "Failed to 
attach struct_ops for scenario %s", scenario_name); + + /* Fork worker processes to generate enqueue/dequeue events */ + for (i = 0; i < NUM_WORKERS; i++) { + pids[i] = fork(); + SCX_FAIL_IF(pids[i] < 0, "Failed to fork worker %d", i); + + if (pids[i] == 0) { + worker_fn(i); + /* Should not reach here */ + exit(1); + } + } + + /* + * Run an "affinity hammer" so that some property changes hit tasks + * while they are still in BPF custody (e.g., in user DSQ or BPF + * queue), triggering SCX_DEQ_SCHED_CHANGE dequeues. + */ + SCX_FAIL_IF(pthread_create(&hammer, NULL, affinity_hammer_fn, pids) != 0, + "Failed to create affinity hammer thread"); + pthread_join(hammer, NULL); + + /* Wait for all workers to complete */ + for (i = 0; i < NUM_WORKERS; i++) { + SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], + "Failed to wait for worker %d", i); + SCX_FAIL_IF(status != 0, "Worker %d exited with status %d", i, status); + } + + bpf_link__destroy(link); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG)); + + /* Calculate deltas */ + enq_delta = skel->bss->enqueue_cnt - enq_start; + deq_delta = skel->bss->dequeue_cnt - deq_start; + dispatch_deq_delta = skel->bss->dispatch_dequeue_cnt - dispatch_deq_start; + change_deq_delta = skel->bss->change_dequeue_cnt - change_deq_start; + bpf_queue_full_delta = skel->bss->bpf_queue_full - bpf_queue_full_start; + + printf("%s:\n", scenario_name); + printf(" enqueues: %lu\n", (unsigned long)enq_delta); + printf(" dequeues: %lu (dispatch: %lu, property_change: %lu)\n", + (unsigned long)deq_delta, + (unsigned long)dispatch_deq_delta, + (unsigned long)change_deq_delta); + printf(" BPF queue full: %lu\n", (unsigned long)bpf_queue_full_delta); + + /* + * Validate enqueue/dequeue lifecycle tracking. + * + * For scenarios 0, 1, 3, 4 (local and global DSQs from + * ops.select_cpu() and ops.enqueue()), both enqueues and dequeues + * should be 0 because tasks bypass the BPF scheduler entirely: + * tasks never enter BPF scheduler's custody. 
+ * + * For scenarios 2, 5, 6 (user DSQ or BPF internal queue) we expect + * both enqueues and dequeues. + * + * The BPF code does strict state machine validation with + * scx_bpf_error() to ensure the workflow semantics are correct. + * + * If we reach this point without errors, the semantics are + * validated correctly. + */ + if (scenario == 0 || scenario == 1 || + scenario == 3 || scenario == 4) { + /* Tasks bypass BPF scheduler completely */ + SCX_EQ(enq_delta, 0); + SCX_EQ(deq_delta, 0); + SCX_EQ(dispatch_deq_delta, 0); + SCX_EQ(change_deq_delta, 0); + } else { + /* + * User DSQ from ops.enqueue() or ops.select_cpu(): tasks + * enter BPF scheduler's custody. + * + * Also validate 1:1 enqueue/dequeue pairing. + */ + SCX_GT(enq_delta, 0); + SCX_GT(deq_delta, 0); + SCX_EQ(enq_delta, deq_delta); + } + + return SCX_TEST_PASS; +} + +static enum scx_test_status setup(void **ctx) +{ + struct dequeue *skel; + + skel = dequeue__open(); + SCX_FAIL_IF(!skel, "Failed to open skel"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(dequeue__load(skel), "Failed to load skel"); + + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct dequeue *skel = ctx; + enum scx_test_status status; + + status = run_scenario(skel, 0, "Scenario 0: Local DSQ from ops.select_cpu()"); + if (status != SCX_TEST_PASS) + return status; + + status = run_scenario(skel, 1, "Scenario 1: Global DSQ from ops.select_cpu()"); + if (status != SCX_TEST_PASS) + return status; + + status = run_scenario(skel, 2, "Scenario 2: User DSQ from ops.select_cpu()"); + if (status != SCX_TEST_PASS) + return status; + + status = run_scenario(skel, 3, "Scenario 3: Local DSQ from ops.enqueue()"); + if (status != SCX_TEST_PASS) + return status; + + status = run_scenario(skel, 4, "Scenario 4: Global DSQ from ops.enqueue()"); + if (status != SCX_TEST_PASS) + return status; + + status = run_scenario(skel, 5, "Scenario 5: User DSQ from ops.enqueue()"); + if (status != SCX_TEST_PASS) + return 
status; + + status = run_scenario(skel, 6, "Scenario 6: BPF queue from ops.enqueue()"); + if (status != SCX_TEST_PASS) + return status; + + printf("\n=== Summary ===\n"); + printf("Total enqueues: %lu\n", (unsigned long)skel->bss->enqueue_cnt); + printf("Total dequeues: %lu\n", (unsigned long)skel->bss->dequeue_cnt); + printf(" Dispatch dequeues: %lu (no flag, normal workflow)\n", + (unsigned long)skel->bss->dispatch_dequeue_cnt); + printf(" Property change dequeues: %lu (SCX_DEQ_SCHED_CHANGE flag)\n", + (unsigned long)skel->bss->change_dequeue_cnt); + printf(" BPF queue full: %lu\n", + (unsigned long)skel->bss->bpf_queue_full); + printf("\nAll scenarios passed - no state machine violations detected\n"); + printf("-> Validated: Local DSQ dispatch bypasses BPF scheduler\n"); + printf("-> Validated: Global DSQ dispatch bypasses BPF scheduler\n"); + printf("-> Validated: User DSQ dispatch triggers ops.dequeue() callbacks\n"); + printf("-> Validated: Dispatch dequeues have no flags (normal workflow)\n"); + printf("-> Validated: Property change dequeues have SCX_DEQ_SCHED_CHANGE flag\n"); + printf("-> Validated: No duplicate enqueues or invalid state transitions\n"); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct dequeue *skel = ctx; + + dequeue__destroy(skel); +} + +struct scx_test dequeue_test = { + .name = "dequeue", + .description = "Verify ops.dequeue() semantics", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; + +REGISTER_SCX_TEST(&dequeue_test) |