From 33a165f9c2c1b9ddceaaccc356ce841baf1a08a2 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 3 Dec 2025 15:37:46 -0800 Subject: selftests/bpf: Test BPF_PROG_ASSOC_STRUCT_OPS command Test BPF_PROG_ASSOC_STRUCT_OPS command that associates a BPF program with a struct_ops. The test follows the same logic in commit ba7000f1c360 ("selftests/bpf: Test multi_st_ops and calling kfuncs from different programs"), but instead of using map id to identify a specific struct_ops, this test uses the new BPF command to associate a struct_ops with a program. The test consists of two sets of almost identical struct_ops maps and BPF programs associated with the map. Their only difference is the unique value returned by bpf_testmod_multi_st_ops::test_1(). The test first loads the programs and associates them with struct_ops maps. Then, it exercises the BPF programs. They will in turn call kfunc bpf_kfunc_multi_st_ops_test_1_prog_arg() to trigger test_1() of the associated struct_ops map, and then check if the right unique value is returned. Signed-off-by: Amery Hung Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20251203233748.668365-5-ameryhung@gmail.com --- .../bpf/prog_tests/test_struct_ops_assoc.c | 72 ++++++++++++++ .../testing/selftests/bpf/progs/struct_ops_assoc.c | 105 +++++++++++++++++++++ .../testing/selftests/bpf/test_kmods/bpf_testmod.c | 17 ++++ .../selftests/bpf/test_kmods/bpf_testmod_kfunc.h | 1 + 4 files changed, 195 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_assoc.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c new file mode 100644 index 000000000000..1e24a4915524 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "struct_ops_assoc.skel.h" + +static void test_st_ops_assoc(void) +{ + struct struct_ops_assoc *skel = NULL; + int err, pid; + + skel = struct_ops_assoc__open_and_load(); + if (!ASSERT_OK_PTR(skel, "struct_ops_assoc__open")) + goto out; + + /* cannot explicitly associate struct_ops program */ + err = bpf_program__assoc_struct_ops(skel->progs.test_1_a, + skel->maps.st_ops_map_a, NULL); + ASSERT_ERR(err, "bpf_program__assoc_struct_ops(test_1_a, st_ops_map_a)"); + + err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog_a, + skel->maps.st_ops_map_a, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops(syscall_prog_a, st_ops_map_a)"); + + err = bpf_program__assoc_struct_ops(skel->progs.sys_enter_prog_a, + skel->maps.st_ops_map_a, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops(sys_enter_prog_a, st_ops_map_a)"); + + err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog_b, + skel->maps.st_ops_map_b, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops(syscall_prog_b, st_ops_map_b)"); + + err = bpf_program__assoc_struct_ops(skel->progs.sys_enter_prog_b, + skel->maps.st_ops_map_b, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops(sys_enter_prog_b, st_ops_map_b)"); + + /* sys_enter_prog_a already associated with map_a */ + err = bpf_program__assoc_struct_ops(skel->progs.sys_enter_prog_a, + skel->maps.st_ops_map_b, NULL); + ASSERT_ERR(err, "bpf_program__assoc_struct_ops(sys_enter_prog_a, st_ops_map_b)"); + + err = struct_ops_assoc__attach(skel); + if (!ASSERT_OK(err, "struct_ops_assoc__attach")) + goto out; + + /* run tracing prog that calls .test_1 and checks return */ + pid = getpid(); + skel->bss->test_pid = pid; + sys_gettid(); + skel->bss->test_pid = 0; + + ASSERT_EQ(skel->bss->test_err_a, 0, "skel->bss->test_err_a"); + ASSERT_EQ(skel->bss->test_err_b, 0, "skel->bss->test_err_b"); + + /* run syscall_prog that calls .test_1 and checks return */ + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog_a), NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog_b), NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + ASSERT_EQ(skel->bss->test_err_a, 0, "skel->bss->test_err_a"); + ASSERT_EQ(skel->bss->test_err_b, 0, "skel->bss->test_err_b"); + +out: + struct_ops_assoc__destroy(skel); +} + +void test_struct_ops_assoc(void) +{ + if (test__start_subtest("st_ops_assoc")) + test_st_ops_assoc(); +} diff --git a/tools/testing/selftests/bpf/progs/struct_ops_assoc.c b/tools/testing/selftests/bpf/progs/struct_ops_assoc.c new file mode 100644 index 000000000000..8f1097903e22 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/struct_ops_assoc.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" +#include "../test_kmods/bpf_testmod.h" +#include "../test_kmods/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +int test_pid; + +/* Programs associated with st_ops_map_a */ + +#define MAP_A_MAGIC 1234 +int test_err_a; + +SEC("struct_ops") +int BPF_PROG(test_1_a, struct st_ops_args *args) +{ + return MAP_A_MAGIC; +} + +SEC("tp_btf/sys_enter") +int BPF_PROG(sys_enter_prog_a, struct pt_regs *regs, long id) +{ + struct st_ops_args args = {}; + struct task_struct *task; + int ret; + + task = bpf_get_current_task_btf(); + if (!test_pid || task->pid != test_pid) + return 0; + + ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + if (ret != MAP_A_MAGIC) + test_err_a++; + + return 0; +} + +SEC("syscall") +int syscall_prog_a(void *ctx) +{ + struct st_ops_args args = {}; + int ret; + + ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + if (ret != MAP_A_MAGIC) + test_err_a++; + + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_multi_st_ops st_ops_map_a = { + .test_1 = (void *)test_1_a, +}; + +/* Programs associated with st_ops_map_b */ + +#define MAP_B_MAGIC 5678 +int test_err_b; + +SEC("struct_ops") +int BPF_PROG(test_1_b, struct st_ops_args *args) +{ + return MAP_B_MAGIC; +} + +SEC("tp_btf/sys_enter") +int BPF_PROG(sys_enter_prog_b, struct pt_regs *regs, long id) +{ + struct st_ops_args args = {}; + struct task_struct *task; + int ret; + + task = bpf_get_current_task_btf(); + if (!test_pid || task->pid != test_pid) + return 0; + + ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + if (ret != MAP_B_MAGIC) + test_err_b++; + + return 0; +} + +SEC("syscall") +int syscall_prog_b(void *ctx) +{ + struct st_ops_args args = {}; + int ret; + + ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + if (ret != MAP_B_MAGIC) + test_err_b++; + + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_multi_st_ops st_ops_map_b = { + .test_1 = (void *)test_1_b, +}; diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 1669a7eeda26..90c4b1a51de6 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -1134,6 +1134,7 @@ __bpf_kfunc int bpf_kfunc_st_ops_inc10(struct st_ops_args *args) } __bpf_kfunc int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id); +__bpf_kfunc int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux_prog); BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc) @@ -1176,6 +1177,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABL BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_impl, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) @@ -1637,6 +1639,7 @@ static struct bpf_testmod_multi_st_ops *multi_st_ops_find_nolock(u32 id) return NULL; } +/* Call test_1() of the struct_ops map identified by the id */ int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) { struct bpf_testmod_multi_st_ops *st_ops; @@ -1652,6 +1655,20 @@ int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) return ret; } +/* Call test_1() of the associated struct_ops map */ +int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux__prog) +{ + struct bpf_prog_aux *prog_aux = (struct bpf_prog_aux *)aux__prog; + struct bpf_testmod_multi_st_ops *st_ops; + int ret = -1; + + st_ops = (struct bpf_testmod_multi_st_ops *)bpf_prog_get_assoc_struct_ops(prog_aux); + if (st_ops) + ret = st_ops->test_1(args); + + return ret; +} + static int multi_st_ops_reg(void *kdata, struct bpf_link *link) { struct bpf_testmod_multi_st_ops *st_ops = diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h index 4df6fa6a92cb..2357a0340ffe 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h @@ -162,5 +162,6 @@ struct task_struct *bpf_kfunc_ret_rcu_test(void) __ksym; int *bpf_kfunc_ret_rcu_test_nostruct(int rdonly_buf_size) __ksym; int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) __ksym; +int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux__prog) __ksym; #endif /* _BPF_TESTMOD_KFUNC_H */ -- cgit v1.2.3 From 04fd12df4e05dd6fd3017b637f6fbc9da10b4e65 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 3 Dec 2025 15:37:47 -0800 Subject: selftests/bpf: Test ambiguous associated struct_ops Add a test to make sure implicit struct_ops association does not break backward compatibility nor return incorrect struct_ops. struct_ops programs should still be allowed to be reused in different struct_ops map. The associated struct_ops map set implicitly however will be poisoned. Trying to read it through the helper bpf_prog_get_assoc_struct_ops() should result in a NULL pointer. While recursion of test_1() cannot happen due to the associated struct_ops being ambiguois, explicitly check for it to prevent stack overflow if the test regresses. Signed-off-by: Amery Hung Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20251203233748.668365-6-ameryhung@gmail.com --- .../bpf/prog_tests/test_struct_ops_assoc.c | 38 +++++++++++ .../selftests/bpf/progs/struct_ops_assoc_reuse.c | 75 ++++++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c index 1e24a4915524..02173504f675 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c @@ -2,6 +2,7 @@ #include #include "struct_ops_assoc.skel.h" +#include "struct_ops_assoc_reuse.skel.h" static void test_st_ops_assoc(void) { @@ -65,8 +66,45 @@ out: struct_ops_assoc__destroy(skel); } +static void test_st_ops_assoc_reuse(void) +{ + struct struct_ops_assoc_reuse *skel = NULL; + int err; + + skel = struct_ops_assoc_reuse__open_and_load(); + if (!ASSERT_OK_PTR(skel, "struct_ops_assoc_reuse__open")) + goto out; + + err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog_a, + skel->maps.st_ops_map_a, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops(syscall_prog_a, st_ops_map_a)"); + + err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog_b, + skel->maps.st_ops_map_b, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops(syscall_prog_b, st_ops_map_b)"); + + err = struct_ops_assoc_reuse__attach(skel); + if (!ASSERT_OK(err, "struct_ops_assoc__attach")) + goto out; + + /* run syscall_prog that calls .test_1 and checks return */ + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog_a), NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog_b), NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + ASSERT_EQ(skel->bss->test_err_a, 0, "skel->bss->test_err_a"); + ASSERT_EQ(skel->bss->test_err_b, 0, "skel->bss->test_err_b"); + +out: + struct_ops_assoc_reuse__destroy(skel); +} + void test_struct_ops_assoc(void) { if (test__start_subtest("st_ops_assoc")) test_st_ops_assoc(); + if (test__start_subtest("st_ops_assoc_reuse")) + test_st_ops_assoc_reuse(); } diff --git a/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c b/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c new file mode 100644 index 000000000000..5bb6ebf5eed4 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" +#include "../test_kmods/bpf_testmod.h" +#include "../test_kmods/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +#define MAP_A_MAGIC 1234 +int test_err_a; +int recur; + +/* + * test_1_a is reused. The kfunc should not be able to get the associated + * struct_ops and call test_1 recursively as it is ambiguous. + */ +SEC("struct_ops") +int BPF_PROG(test_1_a, struct st_ops_args *args) +{ + int ret; + + if (!recur) { + recur++; + ret = bpf_kfunc_multi_st_ops_test_1_impl(args, NULL); + if (ret != -1) + test_err_a++; + recur--; + } + + return MAP_A_MAGIC; +} + +/* Programs associated with st_ops_map_a */ + +SEC("syscall") +int syscall_prog_a(void *ctx) +{ + struct st_ops_args args = {}; + int ret; + + ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + if (ret != MAP_A_MAGIC) + test_err_a++; + + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_multi_st_ops st_ops_map_a = { + .test_1 = (void *)test_1_a, +}; + +/* Programs associated with st_ops_map_b */ + +int test_err_b; + +SEC("syscall") +int syscall_prog_b(void *ctx) +{ + struct st_ops_args args = {}; + int ret; + + ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + if (ret != MAP_A_MAGIC) + test_err_b++; + + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_multi_st_ops st_ops_map_b = { + .test_1 = (void *)test_1_a, +}; -- cgit v1.2.3 From 0e841d19263ab6e1ca2b280109832f57624e48d1 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 3 Dec 2025 15:37:48 -0800 Subject: selftests/bpf: Test getting associated struct_ops in timer callback Make sure 1) a timer callback can also reference the associated struct_ops, and then make sure 2) the timer callback cannot get a dangled pointer to the struct_ops when the map is freed. The test schedules a timer callback from a struct_ops program since struct_ops programs do not pin the map. It is possible for the timer callback to run after the map is freed. The timer callback calls a kfunc that runs .test_1() of the associated struct_ops, which should return MAP_MAGIC when the map is still alive or -1 when the map is gone. The first subtest added in this patch schedules the timer callback to run immediately, while the map is still alive. The second subtest added schedules the callback to run 500ms after syscall_prog runs and then frees the map right after syscall_prog runs. Both subtests then wait until the callback runs to check the return of the kfunc. Signed-off-by: Amery Hung Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20251203233748.668365-7-ameryhung@gmail.com --- .../bpf/prog_tests/test_struct_ops_assoc.c | 81 ++++++++++++++++++++++ .../bpf/progs/struct_ops_assoc_in_timer.c | 77 ++++++++++++++++++++ 2 files changed, 158 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c index 02173504f675..461ded722351 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c @@ -3,6 +3,7 @@ #include #include "struct_ops_assoc.skel.h" #include "struct_ops_assoc_reuse.skel.h" +#include "struct_ops_assoc_in_timer.skel.h" static void test_st_ops_assoc(void) { @@ -101,10 +102,90 @@ out: struct_ops_assoc_reuse__destroy(skel); } +static void test_st_ops_assoc_in_timer(void) +{ + struct struct_ops_assoc_in_timer *skel = NULL; + int err; + + skel = struct_ops_assoc_in_timer__open_and_load(); + if (!ASSERT_OK_PTR(skel, "struct_ops_assoc_in_timer__open")) + goto out; + + err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog, + skel->maps.st_ops_map, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops"); + + err = struct_ops_assoc_in_timer__attach(skel); + if (!ASSERT_OK(err, "struct_ops_assoc__attach")) + goto out; + + /* + * Run .test_1 by calling kfunc bpf_kfunc_multi_st_ops_test_1_prog_arg() and checks + * the return value. .test_1 will also schedule timer_cb that runs .test_1 again + * immediately. + */ + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog), NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + /* Check the return of the kfunc after timer_cb runs */ + while (!READ_ONCE(skel->bss->timer_cb_run)) + sched_yield(); + ASSERT_EQ(skel->bss->timer_test_1_ret, 1234, "skel->bss->timer_test_1_ret"); + ASSERT_EQ(skel->bss->test_err, 0, "skel->bss->test_err_a"); +out: + struct_ops_assoc_in_timer__destroy(skel); +} + +static void test_st_ops_assoc_in_timer_no_uref(void) +{ + struct struct_ops_assoc_in_timer *skel = NULL; + struct bpf_link *link; + int err; + + skel = struct_ops_assoc_in_timer__open_and_load(); + if (!ASSERT_OK_PTR(skel, "struct_ops_assoc_in_timer__open")) + goto out; + + err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog, + skel->maps.st_ops_map, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops"); + + link = bpf_map__attach_struct_ops(skel->maps.st_ops_map); + if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops")) + goto out; + + /* + * Run .test_1 by calling kfunc bpf_kfunc_multi_st_ops_test_1_prog_arg() and checks + * the return value. .test_1 will also schedule timer_cb that runs .test_1 again. + * timer_cb will run 500ms after syscall_prog runs, when the user space no longer + * holds a reference to st_ops_map. + */ + skel->bss->timer_ns = 500000000; + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog), NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + /* Detach and close struct_ops map to cause it to be freed */ + bpf_link__destroy(link); + close(bpf_program__fd(skel->progs.syscall_prog)); + close(bpf_map__fd(skel->maps.st_ops_map)); + + /* Check the return of the kfunc after timer_cb runs */ + while (!READ_ONCE(skel->bss->timer_cb_run)) + sched_yield(); + ASSERT_EQ(skel->bss->timer_test_1_ret, -1, "skel->bss->timer_test_1_ret"); + ASSERT_EQ(skel->bss->test_err, 0, "skel->bss->test_err_a"); +out: + struct_ops_assoc_in_timer__destroy(skel); +} + void test_struct_ops_assoc(void) { if (test__start_subtest("st_ops_assoc")) test_st_ops_assoc(); if (test__start_subtest("st_ops_assoc_reuse")) test_st_ops_assoc_reuse(); + if (test__start_subtest("st_ops_assoc_in_timer")) + test_st_ops_assoc_in_timer(); + if (test__start_subtest("st_ops_assoc_in_timer_no_uref")) + test_st_ops_assoc_in_timer_no_uref(); } diff --git a/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c b/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c new file mode 100644 index 000000000000..d5a2ea934284 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" +#include "../test_kmods/bpf_testmod.h" +#include "../test_kmods/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +struct elem { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} array_map SEC(".maps"); + +#define MAP_MAGIC 1234 +int recur; +int test_err; +int timer_ns; +int timer_test_1_ret; +int timer_cb_run; + +__noinline static int timer_cb(void *map, int *key, struct bpf_timer *timer) +{ + struct st_ops_args args = {}; + + recur++; + timer_test_1_ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + recur--; + + timer_cb_run++; + + return 0; +} + +SEC("struct_ops") +int BPF_PROG(test_1, struct st_ops_args *args) +{ + struct bpf_timer *timer; + int key = 0; + + if (!recur) { + timer = bpf_map_lookup_elem(&array_map, &key); + if (!timer) + return 0; + + bpf_timer_init(timer, &array_map, 1); + bpf_timer_set_callback(timer, timer_cb); + bpf_timer_start(timer, timer_ns, 0); + } + + return MAP_MAGIC; +} + +SEC("syscall") +int syscall_prog(void *ctx) +{ + struct st_ops_args args = {}; + int ret; + + ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + if (ret != MAP_MAGIC) + test_err++; + + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_multi_st_ops st_ops_map = { + .test_1 = (void *)test_1, +}; -- cgit v1.2.3 From 311ead1be05d2348e89f873337c8375e856e1abb Mon Sep 17 00:00:00 2001 From: Guopeng Zhang Date: Wed, 3 Dec 2025 19:56:29 +0800 Subject: selftests: cgroup: Add cg_read_key_long_poll() to poll a cgroup key with retries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a new helper function `cg_read_key_long_poll()` in cgroup_util.h. This function polls the specified key in a cgroup file until it matches the expected value or the retry limit is reached, with configurable wait intervals between retries. This helper is particularly useful for handling asynchronously updated cgroup statistics (e.g., memory.stat), where immediate reads may observe stale values, especially on busy systems. It allows tests and other utilities to handle such cases more flexibly. Signed-off-by: Guopeng Zhang Suggested-by: Michal Koutný Reviewed-by: Shakeel Butt Acked-by: Michal Koutný Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/lib/cgroup_util.c | 21 +++++++++++++++++++++ .../selftests/cgroup/lib/include/cgroup_util.h | 5 +++++ 2 files changed, 26 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/cgroup/lib/cgroup_util.c b/tools/testing/selftests/cgroup/lib/cgroup_util.c index 44c52f620fda..ce6c2642fd9b 100644 --- a/tools/testing/selftests/cgroup/lib/cgroup_util.c +++ b/tools/testing/selftests/cgroup/lib/cgroup_util.c @@ -168,6 +168,27 @@ long cg_read_key_long(const char *cgroup, const char *control, const char *key) return atol(ptr + strlen(key)); } +long cg_read_key_long_poll(const char *cgroup, const char *control, + const char *key, long expected, int retries, + useconds_t wait_interval_us) +{ + long val = -1; + int i; + + for (i = 0; i < retries; i++) { + val = cg_read_key_long(cgroup, control, key); + if (val < 0) + return val; + + if (val == expected) + break; + + usleep(wait_interval_us); + } + + return val; +} + long cg_read_lc(const char *cgroup, const char *control) { char buf[PAGE_SIZE]; diff --git a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h index 7ab2824ed7b5..77f386dab5e8 100644 --- a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h +++ b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h @@ -17,6 +17,8 @@ #define CG_NAMED_NAME "selftest" #define CG_PATH_FORMAT (!cg_test_v1_named ? "0::%s" : (":name=" CG_NAMED_NAME ":%s")) +#define DEFAULT_WAIT_INTERVAL_US (100 * 1000) /* 100 ms */ + /* * Checks if two given values differ by less than err% of their sum. */ @@ -64,6 +66,9 @@ extern int cg_read_strstr(const char *cgroup, const char *control, extern long cg_read_long(const char *cgroup, const char *control); extern long cg_read_long_fd(int fd); long cg_read_key_long(const char *cgroup, const char *control, const char *key); +long cg_read_key_long_poll(const char *cgroup, const char *control, + const char *key, long expected, int retries, + useconds_t wait_interval_us); extern long cg_read_lc(const char *cgroup, const char *control); extern int cg_write(const char *cgroup, const char *control, char *buf); extern int cg_open(const char *cgroup, const char *control, int flags); -- cgit v1.2.3 From 6360d444ae32871c6a048ac880ef3b871a439bad Mon Sep 17 00:00:00 2001 From: Guopeng Zhang Date: Wed, 3 Dec 2025 19:56:30 +0800 Subject: selftests: cgroup: make test_memcg_sock robust against delayed sock stats MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_memcg_sock() currently requires that memory.stat's "sock " counter is exactly zero immediately after the TCP server exits. On a busy system this assumption is too strict: - Socket memory may be freed with a small delay (e.g. RCU callbacks). - memcg statistics are updated asynchronously via the rstat flushing worker, so the "sock " value in memory.stat can stay non-zero for a short period of time even after all socket memory has been uncharged. As a result, test_memcg_sock() can intermittently fail even though socket memory accounting is working correctly. Make the test more robust by polling memory.stat for the "sock " counter and allowing it some time to drop to zero instead of checking it only once. The timeout is set to 3 seconds to cover the periodic rstat flush interval (FLUSH_TIME = 2*HZ by default) plus some scheduling slack. If the counter does not become zero within the timeout, the test still fails as before. On my test system, running test_memcontrol 50 times produced: - Before this patch: 6/50 runs passed. - After this patch: 50/50 runs passed. Signed-off-by: Guopeng Zhang Suggested-by: Lance Yang Reviewed-by: Shakeel Butt Acked-by: Michal Koutný Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_memcontrol.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index 4e1647568c5b..2fb096a2a9f9 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -21,6 +21,8 @@ #include "kselftest.h" #include "cgroup_util.h" +#define MEMCG_SOCKSTAT_WAIT_RETRIES 30 + static bool has_localevents; static bool has_recursiveprot; @@ -1384,6 +1386,7 @@ static int test_memcg_sock(const char *root) int bind_retries = 5, ret = KSFT_FAIL, pid, err; unsigned short port; char *memcg; + long sock_post = -1; memcg = cg_name(root, "memcg_test"); if (!memcg) @@ -1432,7 +1435,22 @@ static int test_memcg_sock(const char *root) if (cg_read_long(memcg, "memory.current") < 0) goto cleanup; - if (cg_read_key_long(memcg, "memory.stat", "sock ")) + /* + * memory.stat is updated asynchronously via the memcg rstat + * flushing worker, which runs periodically (every 2 seconds, + * see FLUSH_TIME). On a busy system, the "sock " counter may + * stay non-zero for a short period of time after the TCP + * connection is closed and all socket memory has been + * uncharged. + * + * Poll memory.stat for up to 3 seconds (~FLUSH_TIME plus some + * scheduling slack) and require that the "sock " counter + * eventually drops to zero. + */ + sock_post = cg_read_key_long_poll(memcg, "memory.stat", "sock ", 0, + MEMCG_SOCKSTAT_WAIT_RETRIES, + DEFAULT_WAIT_INTERVAL_US); + if (sock_post) goto cleanup; ret = KSFT_PASS; -- cgit v1.2.3 From 50133c09d189a26f4cc6e78e382864fd599a1dc4 Mon Sep 17 00:00:00 2001 From: Guopeng Zhang Date: Wed, 3 Dec 2025 19:56:31 +0800 Subject: selftests: cgroup: Replace sleep with cg_read_key_long_poll() for waiting on nr_dying_descendants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the manual sleep-and-retry logic in test_kmem_dead_cgroups() with the new helper `cg_read_key_long_poll()`. This change improves the robustness of the test by polling the "nr_dying_descendants" counter in `cgroup.stat` until it reaches 0 or the timeout is exceeded. Additionally, increase the retry timeout to 8 seconds (from 5 seconds) based on testing results: - With 5-second timeout: 4/20 runs passed. - With 8-second timeout: 20/20 runs passed. The 8 second timeout is based on stress testing of test_kmem_dead_cgroups() under load: 5 seconds was occasionally not enough for reclaim of dying descendants to complete, whereas 8 seconds consistently covered the observed latencies. This value is intended as a generous upper bound for the asynchronous reclaim and is not tied to any specific kernel constant, so it can be adjusted in the future if reclaim behavior changes. Signed-off-by: Guopeng Zhang Reviewed-by: Shakeel Butt Acked-by: Michal Koutný Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_kmem.c | 33 ++++++++++++++---------------- 1 file changed, 15 insertions(+), 18 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c index ca38525484e3..eeabd34bf083 100644 --- a/tools/testing/selftests/cgroup/test_kmem.c +++ b/tools/testing/selftests/cgroup/test_kmem.c @@ -26,6 +26,7 @@ */ #define MAX_VMSTAT_ERROR (4096 * 64 * get_nprocs()) +#define KMEM_DEAD_WAIT_RETRIES 80 static int alloc_dcache(const char *cgroup, void *arg) { @@ -306,9 +307,7 @@ static int test_kmem_dead_cgroups(const char *root) { int ret = KSFT_FAIL; char *parent; - long dead; - int i; - int max_time = 20; + long dead = -1; parent = cg_name(root, "kmem_dead_cgroups_test"); if (!parent) @@ -323,21 +322,19 @@ static int test_kmem_dead_cgroups(const char *root) if (cg_run_in_subcgroups(parent, alloc_dcache, (void *)100, 30)) goto cleanup; - for (i = 0; i < max_time; i++) { - dead = cg_read_key_long(parent, "cgroup.stat", - "nr_dying_descendants "); - if (dead == 0) { - ret = KSFT_PASS; - break; - } - /* - * Reclaiming cgroups might take some time, - * let's wait a bit and repeat. - */ - sleep(1); - if (i > 5) - printf("Waiting time longer than 5s; wait: %ds (dead: %ld)\n", i, dead); - } + /* + * Allow up to ~8s for reclaim of dying descendants to complete. + * This is a generous upper bound derived from stress testing, not + * from a specific kernel constant, and can be adjusted if reclaim + * behavior changes in the future. + */ + dead = cg_read_key_long_poll(parent, "cgroup.stat", + "nr_dying_descendants ", 0, KMEM_DEAD_WAIT_RETRIES, + DEFAULT_WAIT_INTERVAL_US); + if (dead) + goto cleanup; + + ret = KSFT_PASS; cleanup: cg_destroy(parent); -- cgit v1.2.3 From 18352f8fae91d23bbd7165a7b2a1f15c4f5beff8 Mon Sep 17 00:00:00 2001 From: Kohei Enju Date: Mon, 8 Dec 2025 22:14:32 +0900 Subject: selftests/bpf: add tests for attaching invalid fd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add test cases for situations where adding the following types of file descriptors to a cpumap entry should fail: - Non-BPF file descriptor (expect -EINVAL) - Nonexistent file descriptor (expect -EBADF) Also tighten the assertion for the expected error when adding a non-BPF_XDP_CPUMAP program to a cpumap entry. Signed-off-by: Kohei Enju Reviewed-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20251208131449.73036-3-enjuk@amazon.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/xdp_cpumap_attach.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c index df27535995af..ad56e4370ce3 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c @@ -18,7 +18,7 @@ static void test_xdp_with_cpumap_helpers(void) struct bpf_cpumap_val val = { .qsize = 192, }; - int err, prog_fd, prog_redir_fd, map_fd; + int err, prog_fd, prog_redir_fd, map_fd, bad_fd; struct nstoken *nstoken = NULL; __u32 idx = 0; @@ -79,7 +79,22 @@ static void test_xdp_with_cpumap_helpers(void) val.qsize = 192; val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_dummy_prog); err = bpf_map_update_elem(map_fd, &idx, &val, 0); - ASSERT_NEQ(err, 0, "Add non-BPF_XDP_CPUMAP program to cpumap entry"); + ASSERT_EQ(err, -EINVAL, "Add non-BPF_XDP_CPUMAP program to cpumap entry"); + + /* Try to attach non-BPF file descriptor */ + bad_fd = open("/dev/null", O_RDONLY); + ASSERT_GE(bad_fd, 0, "Open /dev/null for non-BPF fd"); + + val.bpf_prog.fd = bad_fd; + err = bpf_map_update_elem(map_fd, &idx, &val, 0); + ASSERT_EQ(err, -EINVAL, "Add non-BPF fd to cpumap entry"); + + /* Try to attach nonexistent file descriptor */ + err = close(bad_fd); + ASSERT_EQ(err, 0, "Close non-BPF fd for nonexistent fd"); + + err = bpf_map_update_elem(map_fd, &idx, &val, 0); + ASSERT_EQ(err, -EBADF, "Add nonexistent fd to cpumap entry"); /* Try to attach BPF_XDP program with frags to cpumap when we have * already loaded a BPF_XDP program on the map -- cgit v1.2.3 From a5b4867fad18e72fd5fc442c16be83723776283b Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Tue, 2 Dec 2025 18:02:20 +0000 Subject: selftests/bpf: add verifier sign extension bound computation tests. This commit adds 3 tests to verify a common compiler generated pattern for sign extension (r1 <<= 32; r1 s>>= 32). The tests make sure the register bounds are correctly computed both for positive and negative register values. Signed-off-by: Cupertino Miranda Signed-off-by: Andrew Pinski Acked-by: Eduard Zingerman Cc: David Faust Cc: Jose Marchesi Cc: Elena Zannoni Link: https://lore.kernel.org/r/20251202180220.11128-3-cupertino.miranda@oracle.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/progs/verifier_subreg.c | 68 ++++++++++++++++++++++ 1 file changed, 68 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/progs/verifier_subreg.c b/tools/testing/selftests/bpf/progs/verifier_subreg.c index 8613ea160dcd..b3e1c3eef9ae 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subreg.c +++ b/tools/testing/selftests/bpf/progs/verifier_subreg.c @@ -531,6 +531,74 @@ __naked void arsh32_imm_zero_extend_check(void) : __clobber_all); } +SEC("socket") +__description("arsh32 imm sign positive extend check") +__success __retval(0) +__log_level(2) +__msg("2: (57) r6 &= 4095 ; R6=scalar(smin=smin32=0,smax=umax=smax32=umax32=4095,var_off=(0x0; 0xfff))") +__msg("3: (67) r6 <<= 32 ; R6=scalar(smin=smin32=0,smax=umax=0xfff00000000,smax32=umax32=0,var_off=(0x0; 0xfff00000000))") +__msg("4: (c7) r6 s>>= 32 ; R6=scalar(smin=smin32=0,smax=umax=smax32=umax32=4095,var_off=(0x0; 0xfff))") +__naked void arsh32_imm_sign_extend_positive_check(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r6 = r0; \ + r6 &= 4095; \ + r6 <<= 32; \ + r6 s>>= 32; \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("arsh32 imm sign negative extend check") +__success __retval(0) +__log_level(2) +__msg("3: (17) r6 -= 4095 ; R6=scalar(smin=smin32=-4095,smax=smax32=0)") +__msg("4: (67) r6 <<= 32 ; R6=scalar(smin=0xfffff00100000000,smax=smax32=umax32=0,umax=0xffffffff00000000,smin32=0,var_off=(0x0; 0xffffffff00000000))") +__msg("5: (c7) r6 s>>= 32 ; R6=scalar(smin=smin32=-4095,smax=smax32=0)") +__naked void arsh32_imm_sign_extend_negative_check(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r6 = r0; \ + r6 &= 4095; \ + r6 -= 4095; \ + r6 <<= 32; \ + r6 s>>= 32; \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("arsh32 imm sign extend check") +__success __retval(0) +__log_level(2) +__msg("3: (17) r6 -= 2047 ; R6=scalar(smin=smin32=-2047,smax=smax32=2048)") +__msg("4: (67) r6 <<= 32 ; R6=scalar(smin=0xfffff80100000000,smax=0x80000000000,umax=0xffffffff00000000,smin32=0,smax32=umax32=0,var_off=(0x0; 0xffffffff00000000))") +__msg("5: (c7) r6 s>>= 32 ; R6=scalar(smin=smin32=-2047,smax=smax32=2048)") +__naked void arsh32_imm_sign_extend_check(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r6 = r0; \ + r6 &= 4095; \ + r6 -= 2047; \ + r6 <<= 32; \ + r6 s>>= 32; \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + SEC("socket") __description("end16 (to_le) reg zero extend check") __success __success_unpriv __retval(0) -- cgit v1.2.3 From 0c82fdbbbfbee878a0a5e884ea5f31dd16138f0d Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Sat, 29 Nov 2025 14:41:22 +0530 Subject: selftests: statmount: tests for STATMOUNT_BY_FD Add tests for STATMOUNT_BY_FD flag, which adds support for passing a file descriptors to statmount(). The fd can also be on a "unmounted" mount (mount unmounted with MNT_DETACH), we also include tests for that. Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Signed-off-by: Bhavik Sachdev Link: https://patch.msgid.link/20251129091455.757724-4-b.sachdev1904@gmail.com Signed-off-by: Christian Brauner --- .../selftests/filesystems/statmount/statmount.h | 15 +- .../filesystems/statmount/statmount_test.c | 261 +++++++++++++++++++-- .../filesystems/statmount/statmount_test_ns.c | 101 +++++++- 3 files changed, 354 insertions(+), 23 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/filesystems/statmount/statmount.h b/tools/testing/selftests/filesystems/statmount/statmount.h index 99e5ad082fb1..e1cba4bfd8d9 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount.h +++ b/tools/testing/selftests/filesystems/statmount/statmount.h @@ -43,19 +43,24 @@ #endif #endif -static inline int statmount(uint64_t mnt_id, uint64_t mnt_ns_id, uint64_t mask, - struct statmount *buf, size_t bufsize, +static inline int statmount(uint64_t mnt_id, uint64_t mnt_ns_id, uint32_t fd, + uint64_t mask, struct statmount *buf, size_t bufsize, unsigned int flags) { struct mnt_id_req req = { .size = MNT_ID_REQ_SIZE_VER0, - .mnt_id = mnt_id, .param = mask, }; - if (mnt_ns_id) { + if (flags & STATMOUNT_BY_FD) { req.size = MNT_ID_REQ_SIZE_VER1; - req.mnt_ns_id = mnt_ns_id; + req.mnt_fd = fd; + } else { + req.mnt_id = mnt_id; + if (mnt_ns_id) { + req.size = MNT_ID_REQ_SIZE_VER1; + req.mnt_ns_id = mnt_ns_id; + } } return syscall(__NR_statmount, &req, buf, bufsize, flags); diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test.c b/tools/testing/selftests/filesystems/statmount/statmount_test.c index 6e53430423d2..a04bcaace126 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount_test.c +++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c @@ -33,15 +33,24 @@ static const char *const known_fs[] = { "sysv", "tmpfs", "tracefs", "ubifs", "udf", "ufs", "v7", "vboxsf", "vfat", "virtiofs", "vxfs", "xenfs", "xfs", "zonefs", NULL }; -static struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mask, unsigned int flags) +static struct statmount *statmount_alloc(uint64_t mnt_id, int fd, uint64_t mask, unsigned int flags) { size_t bufsize = 1 << 15; - struct statmount *buf = NULL, *tmp = alloca(bufsize); + struct statmount *buf = NULL, *tmp = NULL; int tofree = 0; int ret; + if (flags & STATMOUNT_BY_FD && fd < 0) + return NULL; + + tmp = alloca(bufsize); + for (;;) { - ret = statmount(mnt_id, 0, mask, tmp, bufsize, flags); + if (flags & STATMOUNT_BY_FD) + ret = statmount(0, 0, (uint32_t) fd, mask, tmp, bufsize, flags); + else + ret = statmount(mnt_id, 0, 0, mask, tmp, bufsize, flags); + if (ret != -1) break; if (tofree) @@ -237,7 +246,7 @@ static void test_statmount_zero_mask(void) struct statmount sm; int ret; - ret = statmount(root_id, 0, 0, &sm, sizeof(sm), 0); + ret = statmount(root_id, 0, 0, 0, &sm, sizeof(sm), 0); if (ret == -1) { ksft_test_result_fail("statmount zero mask: %s\n", strerror(errno)); @@ -263,7 +272,7 @@ static void test_statmount_mnt_basic(void) int ret; uint64_t mask = STATMOUNT_MNT_BASIC; - ret = statmount(root_id, 0, mask, &sm, sizeof(sm), 0); + ret = statmount(root_id, 0, 0, mask, &sm, sizeof(sm), 0); if (ret == -1) { ksft_test_result_fail("statmount mnt basic: %s\n", strerror(errno)); @@ -323,7 +332,7 @@ static void test_statmount_sb_basic(void) struct statx sx; struct statfs sf; - ret = statmount(root_id, 0, mask, &sm, sizeof(sm), 0); + ret = statmount(root_id, 0, 0, mask, &sm, sizeof(sm), 0); if (ret == -1) { ksft_test_result_fail("statmount sb basic: %s\n", strerror(errno)); @@ -375,7 +384,7 @@ static void test_statmount_mnt_point(void) { struct statmount *sm; - sm = statmount_alloc(root_id, STATMOUNT_MNT_POINT, 0); + sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_POINT, 0); if (!sm) { ksft_test_result_fail("statmount mount point: %s\n", strerror(errno)); @@ -405,7 +414,7 @@ static void test_statmount_mnt_root(void) assert(last_dir); last_dir++; - sm = statmount_alloc(root_id, STATMOUNT_MNT_ROOT, 0); + sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_ROOT, 0); if (!sm) { ksft_test_result_fail("statmount mount root: %s\n", strerror(errno)); @@ -438,7 +447,7 @@ static void test_statmount_fs_type(void) const char *fs_type; const char *const *s; - sm = statmount_alloc(root_id, STATMOUNT_FS_TYPE, 0); + sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE, 0); if (!sm) { ksft_test_result_fail("statmount fs type: %s\n", strerror(errno)); @@ -467,7 +476,7 @@ static void test_statmount_mnt_opts(void) char *line = NULL; size_t len = 0; - sm = statmount_alloc(root_id, STATMOUNT_MNT_BASIC | STATMOUNT_MNT_OPTS, + sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_BASIC | STATMOUNT_MNT_OPTS, 0); if (!sm) { ksft_test_result_fail("statmount mnt opts: %s\n", @@ -557,7 +566,7 @@ static void test_statmount_string(uint64_t mask, size_t off, const char *name) uint32_t start, i; int ret; - sm = statmount_alloc(root_id, mask, 0); + sm = statmount_alloc(root_id, 0, mask, 0); if (!sm) { ksft_test_result_fail("statmount %s: %s\n", name, strerror(errno)); @@ -586,14 +595,14 @@ static void test_statmount_string(uint64_t mask, size_t off, const char *name) exactsize = sm->size; shortsize = sizeof(*sm) + i; - ret = statmount(root_id, 0, mask, sm, exactsize, 0); + ret = statmount(root_id, 0, 0, mask, sm, exactsize, 0); if (ret == -1) { ksft_test_result_fail("statmount exact size: %s\n", strerror(errno)); goto out; } errno = 0; - ret = statmount(root_id, 0, mask, sm, shortsize, 0); + ret = statmount(root_id, 0, 0, mask, sm, shortsize, 0); if (ret != -1 || errno != EOVERFLOW) { ksft_test_result_fail("should have failed with EOVERFLOW: %s\n", strerror(errno)); @@ -658,6 +667,226 @@ static void test_listmount_tree(void) ksft_test_result_pass("listmount tree\n"); } +static void test_statmount_by_fd(void) +{ + struct statmount *sm = NULL; + char tmpdir[] = "/statmount.fd.XXXXXX"; + const char root[] = "/test"; + char subdir[PATH_MAX], tmproot[PATH_MAX]; + int fd; + + if (!mkdtemp(tmpdir)) { + ksft_perror("mkdtemp"); + return; + } + + if (mount("statmount.test", tmpdir, "tmpfs", 0, NULL)) { + ksft_perror("mount"); + rmdir(tmpdir); + return; + } + + snprintf(subdir, PATH_MAX, "%s%s", tmpdir, root); + snprintf(tmproot, PATH_MAX, "%s/%s", tmpdir, "chroot"); + + if (mkdir(subdir, 0755)) { + ksft_perror("mkdir"); + goto err_tmpdir; + } + + if (mount(subdir, subdir, NULL, MS_BIND, 0)) { + ksft_perror("mount"); + goto err_subdir; + } + + if (mkdir(tmproot, 0755)) { + ksft_perror("mkdir"); + goto err_subdir; + } + + fd = open(subdir, O_PATH); + if (fd < 0) { + ksft_perror("open"); + goto err_tmproot; + } + + if (chroot(tmproot)) { + ksft_perror("chroot"); + goto err_fd; + } + + sm = statmount_alloc(0, fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT, STATMOUNT_BY_FD); + if (!sm) { + ksft_test_result_fail("statmount by fd failed: %s\n", strerror(errno)); + goto err_chroot; + } + + if (sm->size < sizeof(*sm)) { + ksft_test_result_fail("unexpected size: %u < %u\n", + sm->size, (uint32_t) sizeof(*sm)); + goto err_chroot; + } + + if (sm->mask & STATMOUNT_MNT_POINT) { + ksft_test_result_fail("STATMOUNT_MNT_POINT unexpectedly set in statmount\n"); + goto err_chroot; + } + + if (!(sm->mask & STATMOUNT_MNT_ROOT)) { + ksft_test_result_fail("STATMOUNT_MNT_ROOT not set in statmount\n"); + goto err_chroot; + } + + if (strcmp(root, sm->str + sm->mnt_root) != 0) { + ksft_test_result_fail("statmount returned incorrect mnt_root," + "statmount mnt_root: %s != %s\n", + sm->str + sm->mnt_root, root); + goto err_chroot; + } + + if (chroot(".")) { + ksft_perror("chroot"); + goto out; + } + + free(sm); + sm = statmount_alloc(0, fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT, STATMOUNT_BY_FD); + if (!sm) { + ksft_test_result_fail("statmount by fd failed: %s\n", strerror(errno)); + goto err_fd; + } + + if (sm->size < sizeof(*sm)) { + ksft_test_result_fail("unexpected size: %u < %u\n", + sm->size, (uint32_t) sizeof(*sm)); + goto out; + } + + if (!(sm->mask & STATMOUNT_MNT_POINT)) { + ksft_test_result_fail("STATMOUNT_MNT_POINT not set in statmount\n"); + goto out; + } + + if (!(sm->mask & STATMOUNT_MNT_ROOT)) { + ksft_test_result_fail("STATMOUNT_MNT_ROOT not set in statmount\n"); + goto out; + } + + if (strcmp(subdir, sm->str + sm->mnt_point) != 0) { + ksft_test_result_fail("statmount returned incorrect mnt_point," + "statmount mnt_point: %s != %s\n", sm->str + sm->mnt_point, subdir); + goto out; + } + + if (strcmp(root, sm->str + sm->mnt_root) != 0) { + ksft_test_result_fail("statmount returned incorrect mnt_root," + "statmount mnt_root: %s != %s\n", sm->str + sm->mnt_root, root); + goto out; + } + + ksft_test_result_pass("statmount by fd\n"); + goto out; +err_chroot: + chroot("."); +out: + free(sm); +err_fd: + close(fd); +err_tmproot: + rmdir(tmproot); +err_subdir: + umount2(subdir, MNT_DETACH); + rmdir(subdir); +err_tmpdir: + umount2(tmpdir, MNT_DETACH); + rmdir(tmpdir); +} + +static void test_statmount_by_fd_unmounted(void) +{ + const char root[] = "/test.unmounted"; + char tmpdir[] = "/statmount.fd.XXXXXX"; + char subdir[PATH_MAX]; + int fd; + struct statmount *sm = NULL; + + if (!mkdtemp(tmpdir)) { + ksft_perror("mkdtemp"); + return; + } + + if (mount("statmount.test", tmpdir, "tmpfs", 0, NULL)) { + ksft_perror("mount"); + rmdir(tmpdir); + return; + } + + snprintf(subdir, PATH_MAX, "%s%s", tmpdir, root); + + if (mkdir(subdir, 0755)) { + ksft_perror("mkdir"); + goto err_tmpdir; + } + + if (mount(subdir, subdir, 0, MS_BIND, NULL)) { + ksft_perror("mount"); + goto err_subdir; + } + + fd = open(subdir, O_PATH); + if (fd < 0) { + ksft_perror("open"); + goto err_subdir; + } + + if (umount2(tmpdir, MNT_DETACH)) { + ksft_perror("umount2"); + goto err_fd; + } + + sm = statmount_alloc(0, fd, STATMOUNT_MNT_POINT | STATMOUNT_MNT_ROOT, STATMOUNT_BY_FD); + if (!sm) { + ksft_test_result_fail("statmount by fd unmounted: %s\n", + strerror(errno)); + goto err_sm; + } + + if (sm->size < sizeof(*sm)) { + ksft_test_result_fail("unexpected size: %u < %u\n", + sm->size, (uint32_t) sizeof(*sm)); + goto err_sm; + } + + if (sm->mask & STATMOUNT_MNT_POINT) { + ksft_test_result_fail("STATMOUNT_MNT_POINT unexpectedly set in mask\n"); + goto err_sm; + } + + if (!(sm->mask & STATMOUNT_MNT_ROOT)) { + ksft_test_result_fail("STATMOUNT_MNT_ROOT not set in mask\n"); + goto err_sm; + } + + if (strcmp(sm->str + sm->mnt_root, root) != 0) { + ksft_test_result_fail("statmount returned incorrect mnt_root," + "statmount mnt_root: %s != %s\n", + sm->str + sm->mnt_root, root); + goto err_sm; + } + + ksft_test_result_pass("statmount by fd on unmounted mount\n"); +err_sm: + free(sm); +err_fd: + close(fd); +err_subdir: + umount2(subdir, MNT_DETACH); + rmdir(subdir); +err_tmpdir: + umount2(tmpdir, MNT_DETACH); + rmdir(tmpdir); +} + #define str_off(memb) (offsetof(struct statmount, memb) / sizeof(uint32_t)) int main(void) @@ -669,14 +898,14 @@ int main(void) ksft_print_header(); - ret = statmount(0, 0, 0, NULL, 0, 0); + ret = statmount(0, 0, 0, 0, NULL, 0, 0); assert(ret == -1); if (errno == ENOSYS) ksft_exit_skip("statmount() syscall not supported\n"); setup_namespace(); - ksft_set_plan(15); + ksft_set_plan(17); test_listmount_empty_root(); test_statmount_zero_mask(); test_statmount_mnt_basic(); @@ -693,6 +922,8 @@ int main(void) test_statmount_string(all_mask, str_off(fs_type), "fs type & all"); test_listmount_tree(); + test_statmount_by_fd_unmounted(); + test_statmount_by_fd(); if (ksft_get_fail_cnt() + ksft_get_error_cnt() > 0) diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c index d56d4103182f..063d9de46431 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c +++ b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c @@ -102,7 +102,7 @@ static int _test_statmount_mnt_ns_id(void) if (!root_id) return NSID_ERROR; - ret = statmount(root_id, 0, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), 0); + ret = statmount(root_id, 0, 0, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), 0); if (ret == -1) { ksft_print_msg("statmount mnt ns id: %s\n", strerror(errno)); return NSID_ERROR; @@ -128,6 +128,98 @@ static int _test_statmount_mnt_ns_id(void) return NSID_PASS; } +static int _test_statmount_mnt_ns_id_by_fd(void) +{ + struct statmount sm; + uint64_t mnt_ns_id; + int ret, fd, mounted = 1, status = NSID_ERROR; + char mnt[] = "/statmount.fd.XXXXXX"; + + ret = get_mnt_ns_id("/proc/self/ns/mnt", &mnt_ns_id); + if (ret != NSID_PASS) + return ret; + + if (!mkdtemp(mnt)) { + ksft_print_msg("statmount by fd mnt ns id mkdtemp: %s\n", strerror(errno)); + return NSID_ERROR; + } + + if (mount(mnt, mnt, NULL, MS_BIND, 0)) { + ksft_print_msg("statmount by fd mnt ns id mount: %s\n", strerror(errno)); + status = NSID_ERROR; + goto err; + } + + fd = open(mnt, O_PATH); + if (fd < 0) { + ksft_print_msg("statmount by fd mnt ns id open: %s\n", strerror(errno)); + goto err; + } + + ret = statmount(0, 0, fd, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), STATMOUNT_BY_FD); + if (ret == -1) { + ksft_print_msg("statmount mnt ns id statmount: %s\n", strerror(errno)); + status = NSID_ERROR; + goto out; + } + + if (sm.size != sizeof(sm)) { + ksft_print_msg("unexpected size: %u != %u\n", sm.size, + (uint32_t)sizeof(sm)); + status = NSID_FAIL; + goto out; + } + if (sm.mask != STATMOUNT_MNT_NS_ID) { + ksft_print_msg("statmount mnt ns id unavailable\n"); + status = NSID_SKIP; + goto out; + } + + if (sm.mnt_ns_id != mnt_ns_id) { + ksft_print_msg("unexpected mnt ns ID: 0x%llx != 0x%llx\n", + (unsigned long long)sm.mnt_ns_id, + (unsigned long long)mnt_ns_id); + status = NSID_FAIL; + goto out; + } + + mounted = 0; + if (umount2(mnt, MNT_DETACH)) { + ksft_print_msg("statmount by fd mnt ns id umount2: %s\n", strerror(errno)); + goto out; + } + + ret = statmount(0, 0, fd, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), STATMOUNT_BY_FD); + if (ret == -1) { + ksft_print_msg("statmount mnt ns id statmount: %s\n", strerror(errno)); + status = NSID_ERROR; + goto out; + } + + if (sm.size != sizeof(sm)) { + ksft_print_msg("unexpected size: %u != %u\n", sm.size, + (uint32_t)sizeof(sm)); + status = NSID_FAIL; + goto out; + } + + if (sm.mask == STATMOUNT_MNT_NS_ID) { + ksft_print_msg("unexpected STATMOUNT_MNT_NS_ID in mask\n"); + status = NSID_FAIL; + goto out; + } + + status = NSID_PASS; +out: + close(fd); + if (mounted) + umount2(mnt, MNT_DETACH); +err: + rmdir(mnt); + return status; +} + + static void test_statmount_mnt_ns_id(void) { pid_t pid; @@ -148,6 +240,9 @@ static void test_statmount_mnt_ns_id(void) if (ret != NSID_PASS) exit(ret); ret = _test_statmount_mnt_ns_id(); + if (ret != NSID_PASS) + exit(ret); + ret = _test_statmount_mnt_ns_id_by_fd(); exit(ret); } @@ -179,7 +274,7 @@ static int validate_external_listmount(pid_t pid, uint64_t child_nr_mounts) for (int i = 0; i < nr_mounts; i++) { struct statmount sm; - ret = statmount(list[i], mnt_ns_id, STATMOUNT_MNT_NS_ID, &sm, + ret = statmount(list[i], mnt_ns_id, 0, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), 0); if (ret < 0) { ksft_print_msg("statmount mnt ns id: %s\n", strerror(errno)); @@ -275,7 +370,7 @@ int main(void) int ret; ksft_print_header(); - ret = statmount(0, 0, 0, NULL, 0, 0); + ret = statmount(0, 0, 0, 0, NULL, 0, 0); assert(ret == -1); if (errno == ENOSYS) ksft_exit_skip("statmount() syscall not supported\n"); -- cgit v1.2.3 From 0355911ac021a424b18d2d746536d70b879cdeab Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 16 Dec 2025 12:33:21 -0500 Subject: selftests/bpf: Explicitly account for globals in verifier_arena_large The big_alloc1 test in verifier_arena_large assumes that the arena base and the first page allocated by bpf_arena_alloc_pages are identical. This is not the case, because the first page in the arena is populated by global arena data. The test still passes because the code makes the tacit assumption that the first page is on offset PAGE_SIZE instead of 0. Make this distinction explicit in the code, and adjust the page offsets requested during the test to count from the beginning of the arena instead of using the address of the first allocated page. Signed-off-by: Emil Tsalapatis Signed-off-by: Andrii Nakryiko Reviewed-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20251216173325.98465-2-emil@etsalapatis.com --- tools/testing/selftests/bpf/progs/verifier_arena_large.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_large.c b/tools/testing/selftests/bpf/progs/verifier_arena_large.c index f19e15400b3e..bd430a34c3ab 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena_large.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena_large.c @@ -23,18 +23,25 @@ int big_alloc1(void *ctx) { #if defined(__BPF_FEATURE_ADDR_SPACE_CAST) volatile char __arena *page1, *page2, *no_page, *page3; - void __arena *base; + u64 base; - page1 = base = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + base = (u64)arena_base(&arena); + + page1 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); if (!page1) return 1; + + /* Account for global arena data. */ + if ((u64)page1 != base + PAGE_SIZE) + return 15; + *page1 = 1; - page2 = bpf_arena_alloc_pages(&arena, base + ARENA_SIZE - PAGE_SIZE * 2, + page2 = bpf_arena_alloc_pages(&arena, (void __arena *)(ARENA_SIZE - PAGE_SIZE), 1, NUMA_NO_NODE, 0); if (!page2) return 2; *page2 = 2; - no_page = bpf_arena_alloc_pages(&arena, base + ARENA_SIZE - PAGE_SIZE, + no_page = bpf_arena_alloc_pages(&arena, (void __arena *)ARENA_SIZE, 1, NUMA_NO_NODE, 0); if (no_page) return 3; -- cgit v1.2.3 From 12a1fe6e12dbad39f2f0dad1a385625f0298eff4 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 16 Dec 2025 12:33:22 -0500 Subject: bpf/verifier: Do not limit maximum direct offset into arena map The verifier currently limits direct offsets into a map to 512MiB to avoid overflow during pointer arithmetic. However, this prevents arena maps from using direct addressing instructions to access data at the end of > 512MiB arena maps. This is necessary when moving arena globals to the end of the arena instead of the front. Refactor the verifier code to remove the offset calculation during direct value access calculations. This is possible because the only two map types that implement .map_direct_value_addr() are arrays and arenas, and they both do their own internal checks to ensure the offset is within bounds. Adjust selftests that expect the old error. These tests still fail because the verifier identifies the access as out of bounds for the map, so change them to expect an "invalid access to map value pointer" error instead. Signed-off-by: Emil Tsalapatis Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20251216173325.98465-3-emil@etsalapatis.com --- kernel/bpf/verifier.c | 5 ----- tools/testing/selftests/bpf/verifier/direct_value_access.c | 4 ++-- 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'tools/testing') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a31c032b2dd6..d6b8a77fbe3b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21132,11 +21132,6 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) } else { u32 off = insn[1].imm; - if (off >= BPF_MAX_VAR_OFF) { - verbose(env, "direct value offset of %u is not allowed\n", off); - return -EINVAL; - } - if (!map->ops->map_direct_value_addr) { verbose(env, "no direct value access support for this map type\n"); return -EINVAL; diff --git a/tools/testing/selftests/bpf/verifier/direct_value_access.c b/tools/testing/selftests/bpf/verifier/direct_value_access.c index c0648dc009b5..e569d119fb60 100644 --- a/tools/testing/selftests/bpf/verifier/direct_value_access.c +++ b/tools/testing/selftests/bpf/verifier/direct_value_access.c @@ -81,7 +81,7 @@ }, .fixup_map_array_48b = { 1 }, .result = REJECT, - .errstr = "direct value offset of 4294967295 is not allowed", + .errstr = "invalid access to map value pointer, value_size=48 off=4294967295", }, { "direct map access, write test 8", @@ -141,7 +141,7 @@ }, .fixup_map_array_48b = { 1 }, .result = REJECT, - .errstr = "direct value offset of 536870912 is not allowed", + .errstr = "invalid access to map value pointer, value_size=48 off=536870912", }, { "direct map access, write test 13", -- cgit v1.2.3 From c1f61171d44b19834cf24def2cf832f2688e83df Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 16 Dec 2025 12:33:24 -0500 Subject: libbpf: Move arena globals to the end of the arena Arena globals are currently placed at the beginning of the arena by libbpf. This is convenient, but prevents users from reserving guard pages in the beginning of the arena to identify NULL pointer dereferences. Adjust the load logic to place the globals at the end of the arena instead. Also modify bpftool to set the arena pointer in the program's BPF skeleton to point to the globals. Users now call bpf_map__initial_value() to find the beginning of the arena mapping and use the arena pointer in the skeleton to determine which part of the mapping holds the arena globals and which part is free. Suggested-by: Andrii Nakryiko Signed-off-by: Emil Tsalapatis Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20251216173325.98465-5-emil@etsalapatis.com --- tools/lib/bpf/libbpf.c | 17 +++++++++++++---- .../testing/selftests/bpf/progs/verifier_arena_large.c | 12 +++++++++--- 2 files changed, 22 insertions(+), 7 deletions(-) (limited to 'tools/testing') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 4d4badb64824..6fba879492a8 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -757,6 +757,7 @@ struct bpf_object { int arena_map_idx; void *arena_data; size_t arena_data_sz; + size_t arena_data_off; void *jumptables_data; size_t jumptables_data_sz; @@ -2991,10 +2992,11 @@ static int init_arena_map_data(struct bpf_object *obj, struct bpf_map *map, void *data, size_t data_sz) { const long page_sz = sysconf(_SC_PAGE_SIZE); + const size_t data_alloc_sz = roundup(data_sz, page_sz); size_t mmap_sz; mmap_sz = bpf_map_mmap_sz(map); - if (roundup(data_sz, page_sz) > mmap_sz) { + if (data_alloc_sz > mmap_sz) { pr_warn("elf: sec '%s': declared ARENA map size (%zu) is too small to hold global __arena variables of size %zu\n", sec_name, mmap_sz, data_sz); return -E2BIG; @@ -3006,6 +3008,9 @@ static int init_arena_map_data(struct bpf_object *obj, struct bpf_map *map, memcpy(obj->arena_data, data, data_sz); obj->arena_data_sz = data_sz; + /* place globals at the end of the arena */ + obj->arena_data_off = mmap_sz - data_alloc_sz; + /* make bpf_map__init_value() work for ARENA maps */ map->mmaped = obj->arena_data; @@ -4663,7 +4668,7 @@ static int bpf_program__record_reloc(struct bpf_program *prog, reloc_desc->type = RELO_DATA; reloc_desc->insn_idx = insn_idx; reloc_desc->map_idx = obj->arena_map_idx; - reloc_desc->sym_off = sym->st_value; + reloc_desc->sym_off = sym->st_value + obj->arena_data_off; map = &obj->maps[obj->arena_map_idx]; pr_debug("prog '%s': found arena map %d (%s, sec %d, off %zu) for insn %u\n", @@ -5624,7 +5629,8 @@ retry: return err; } if (obj->arena_data) { - memcpy(map->mmaped, obj->arena_data, obj->arena_data_sz); + memcpy(map->mmaped + obj->arena_data_off, obj->arena_data, + obj->arena_data_sz); zfree(&obj->arena_data); } } @@ -14429,7 +14435,10 @@ int bpf_object__load_skeleton(struct bpf_object_skeleton *s) if (!map_skel->mmaped) continue; - *map_skel->mmaped = map->mmaped; + if (map->def.type == BPF_MAP_TYPE_ARENA) + *map_skel->mmaped = map->mmaped + map->obj->arena_data_off; + else + *map_skel->mmaped = map->mmaped; } return 0; diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_large.c b/tools/testing/selftests/bpf/progs/verifier_arena_large.c index bd430a34c3ab..2b8cf2a4d880 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena_large.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena_large.c @@ -31,16 +31,22 @@ int big_alloc1(void *ctx) if (!page1) return 1; - /* Account for global arena data. */ - if ((u64)page1 != base + PAGE_SIZE) + if ((u64)page1 != base) return 15; *page1 = 1; - page2 = bpf_arena_alloc_pages(&arena, (void __arena *)(ARENA_SIZE - PAGE_SIZE), + page2 = bpf_arena_alloc_pages(&arena, (void __arena *)(ARENA_SIZE - 2 * PAGE_SIZE), 1, NUMA_NO_NODE, 0); if (!page2) return 2; *page2 = 2; + + /* Test for the guard region at the end of the arena. */ + no_page = bpf_arena_alloc_pages(&arena, (void __arena *)ARENA_SIZE - PAGE_SIZE, + 1, NUMA_NO_NODE, 0); + if (no_page) + return 16; + no_page = bpf_arena_alloc_pages(&arena, (void __arena *)ARENA_SIZE, 1, NUMA_NO_NODE, 0); if (no_page) -- cgit v1.2.3 From 19f12431b6c339416e656c794a26ff0ebb2dba56 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 16 Dec 2025 12:33:25 -0500 Subject: selftests/bpf: Add tests for the arena offset of globals Add tests for the new libbpf globals arena offset logic. The tests cover the case of globals being as large as the arena itself, and being smaller than the arena. In that case, the data is placed at the end of the arena, and the beginning of the arena is free. Signed-off-by: Emil Tsalapatis Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20251216173325.98465-6-emil@etsalapatis.com --- tools/testing/selftests/bpf/prog_tests/verifier.c | 4 + .../selftests/bpf/progs/verifier_arena_globals1.c | 87 ++++++++++++++++++++++ .../selftests/bpf/progs/verifier_arena_globals2.c | 49 ++++++++++++ 3 files changed, 140 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_arena_globals1.c create mode 100644 tools/testing/selftests/bpf/progs/verifier_arena_globals2.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 4b4b081b46cc..5829ffd70f8f 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -6,6 +6,8 @@ #include "verifier_and.skel.h" #include "verifier_arena.skel.h" #include "verifier_arena_large.skel.h" +#include "verifier_arena_globals1.skel.h" +#include "verifier_arena_globals2.skel.h" #include "verifier_array_access.skel.h" #include "verifier_async_cb_context.skel.h" #include "verifier_basic_stack.skel.h" @@ -147,6 +149,8 @@ static void run_tests_aux(const char *skel_name, void test_verifier_and(void) { RUN(verifier_and); } void test_verifier_arena(void) { RUN(verifier_arena); } void test_verifier_arena_large(void) { RUN(verifier_arena_large); } +void test_verifier_arena_globals1(void) { RUN(verifier_arena_globals1); } +void test_verifier_arena_globals2(void) { RUN(verifier_arena_globals2); } void test_verifier_basic_stack(void) { RUN(verifier_basic_stack); } void test_verifier_bitfield_write(void) { RUN(verifier_bitfield_write); } void test_verifier_bounds(void) { RUN(verifier_bounds); } diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c b/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c new file mode 100644 index 000000000000..14afef3d6442 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#define BPF_NO_KFUNC_PROTOTYPES +#include +#include +#include +#include "bpf_experimental.h" +#include "bpf_arena_common.h" +#include "bpf_misc.h" + +#define ARENA_PAGES (1UL<< (32 - 12)) +#define GLOBAL_PAGES (16) + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, ARENA_PAGES); +#ifdef __TARGET_ARCH_arm64 + __ulong(map_extra, (1ull << 32) | (~0u - __PAGE_SIZE * ARENA_PAGES + 1)); +#else + __ulong(map_extra, (1ull << 44) | (~0u - __PAGE_SIZE * ARENA_PAGES + 1)); +#endif +} arena SEC(".maps"); + +/* + * Global data, to be placed at the end of the arena. + */ +volatile char __arena global_data[GLOBAL_PAGES][PAGE_SIZE]; + +SEC("syscall") +__success __retval(0) +int check_reserve1(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + const u8 magic = 0x5a; + __u8 __arena *guard, *globals; + volatile char __arena *ptr; + int i; + int ret; + + guard = (void __arena *)arena_base(&arena); + globals = (void __arena *)(arena_base(&arena) + (ARENA_PAGES - GLOBAL_PAGES) * PAGE_SIZE); + + /* Reserve the region we've offset the globals by. */ + ret = bpf_arena_reserve_pages(&arena, guard, ARENA_PAGES - GLOBAL_PAGES); + if (ret) + return 1; + + /* Make sure the globals are in the expected offset. */ + ret = bpf_arena_reserve_pages(&arena, globals, 1); + if (!ret) + return 2; + + /* Verify globals are properly mapped in by libbpf. */ + for (i = 0; i < GLOBAL_PAGES; i++) { + ptr = &global_data[i][PAGE_SIZE / 2]; + + *ptr = magic; + if (*ptr != magic) + return i + 3; + } +#endif + return 0; +} + +/* + * Relocation check by reading directly into the global data w/o using symbols. + */ +SEC("syscall") +__success __retval(0) +int check_relocation(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + const u8 magic = 0xfa; + u8 __arena *ptr; + + global_data[GLOBAL_PAGES - 1][PAGE_SIZE / 2] = magic; + ptr = (u8 __arena *)((u64)(ARENA_PAGES * PAGE_SIZE - PAGE_SIZE / 2)); + if (*ptr != magic) + return 1; + +#endif + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_globals2.c b/tools/testing/selftests/bpf/progs/verifier_arena_globals2.c new file mode 100644 index 000000000000..e6bd7b61f9f1 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_arena_globals2.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#define BPF_NO_KFUNC_PROTOTYPES +#include +#include +#include +#include "bpf_misc.h" +#include "bpf_experimental.h" +#include "bpf_arena_common.h" + +#define ARENA_PAGES (32) + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, ARENA_PAGES); +#ifdef __TARGET_ARCH_arm64 + __ulong(map_extra, (1ull << 32) | (~0u - __PAGE_SIZE * ARENA_PAGES + 1)); +#else + __ulong(map_extra, (1ull << 44) | (~0u - __PAGE_SIZE * ARENA_PAGES + 1)); +#endif +} arena SEC(".maps"); + +/* + * Fill the entire arena with global data. + * The offset into the arena should be 0. + */ +char __arena global_data[ARENA_PAGES][PAGE_SIZE]; + +SEC("syscall") +__success __retval(0) +int check_reserve2(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + void __arena *guard; + int ret; + + guard = (void __arena *)arena_base(&arena); + + /* Make sure the data at offset 0 case is properly handled. */ + ret = bpf_arena_reserve_pages(&arena, guard, 1); + if (!ret) + return 1; +#endif + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 014e1cdb5fad8c6034feb3a97468a91edf23d3d0 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Fri, 19 Dec 2025 10:18:24 -0800 Subject: selftests/bpf: Run resolve_btfids only for relevant .test.o objects A selftest targeting resolve_btfids functionality relies on a resolved .BTF_ids section to be available in the TRUNNER_BINARY. The underlying BTF data is taken from a special BPF program (btf_data.c), and so resolve_btfids is executed as a part of a TRUNNER_BINARY build recipe on the final binary. Subsequent patches in this series allow resolve_btfids to modify BTF before resolving the symbols, which means that the test needs access to that modified BTF [1]. Currently the test simply reads in btf_data.bpf.o on the assumption that BTF hasn't changed. Implement resolve_btfids call only for particular test objects (just resolve_btfids.test.o for now). The test objects are linked into the TRUNNER_BINARY, and so .BTF_ids section will be available there. This will make it trivial for the resolve_btfids test to access BTF modified by resolve_btfids. [1] https://lore.kernel.org/bpf/CAErzpmvsgSDe-QcWH8SFFErL6y3p3zrqNri5-UHJ9iK2ChyiBw@mail.gmail.com/ Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Tested-by: Alan Maguire Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20251219181825.1289460-2-ihor.solodrai@linux.dev --- tools/testing/selftests/bpf/Makefile | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 4aa60e83ff19..ffd0a4c354c7 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -643,6 +643,9 @@ $(TRUNNER_TESTS_HDR): $(TRUNNER_TESTS_DIR)/*.c ) > $$@) endif +$(TRUNNER_OUTPUT)/resolve_btfids.test.o: $(RESOLVE_BTFIDS) $(TRUNNER_OUTPUT)/btf_data.bpf.o +$(TRUNNER_OUTPUT)/resolve_btfids.test.o: private TEST_NEEDS_BTFIDS = 1 + # compile individual test files # Note: we cd into output directory to ensure embedded BPF object is found $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ @@ -650,6 +653,9 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ | $(TRUNNER_OUTPUT)/%.test.d $$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@) $(Q)cd $$(@D) && $$(CC) -I. $$(CFLAGS) -MMD -MT $$@ -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) + $$(if $$(TEST_NEEDS_BTFIDS), \ + $$(call msg,BTFIDS,$(TRUNNER_BINARY),$$@) \ + $(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@) $(TRUNNER_TEST_OBJS:.o=.d): $(TRUNNER_OUTPUT)/%.test.d: \ $(TRUNNER_TESTS_DIR)/%.c \ @@ -695,13 +701,11 @@ $(OUTPUT)/$(TRUNNER_BINARY): | $(TRUNNER_BPF_OBJS) $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \ $(TRUNNER_EXTRA_OBJS) $$(BPFOBJ) \ $(TRUNNER_LIB_OBJS) \ - $(RESOLVE_BTFIDS) \ $(TRUNNER_BPFTOOL) \ $(OUTPUT)/veristat \ | $(TRUNNER_BINARY)-extras $$(call msg,BINARY,,$$@) $(Q)$$(CC) $$(CFLAGS) $$(filter %.a %.o,$$^) $$(LDLIBS) $$(LLVM_LDLIBS) $$(LDFLAGS) $$(LLVM_LDFLAGS) -o $$@ - $(Q)$(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@ $(Q)ln -sf $(if $2,..,.)/tools/build/bpftool/$(USE_BOOTSTRAP)bpftool \ $(OUTPUT)/$(if $2,$2/)bpftool -- cgit v1.2.3 From 522397d05e7d4a7c30b91841492360336b24f833 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Fri, 19 Dec 2025 10:18:25 -0800 Subject: resolve_btfids: Change in-place update with raw binary output Currently resolve_btfids updates .BTF_ids section of an ELF file in-place, based on the contents of provided BTF, usually within the same input file, and optionally a BTF base. Change resolve_btfids behavior to enable BTF transformations as part of its main operation. To achieve this, in-place ELF write in resolve_btfids is replaced with generation of the following binaries: * ${1}.BTF with .BTF section data * ${1}.BTF_ids with .BTF_ids section data if it existed in ${1} * ${1}.BTF.base with .BTF.base section data for out-of-tree modules The execution of resolve_btfids and consumption of its output is orchestrated by scripts/gen-btf.sh introduced in this patch. The motivation for emitting binary data is that it allows simplifying resolve_btfids implementation by delegating ELF update to the $OBJCOPY tool [1], which is already widely used across the codebase. There are two distinct paths for BTF generation and resolve_btfids application in the kernel build: for vmlinux and for kernel modules. For the vmlinux binary a .BTF section is added in a roundabout way to ensure correct linking. The patch doesn't change this approach, only the implementation is a little different. Before this patch it worked as follows: * pahole consumed .tmp_vmlinux1 [2] and added .BTF section with llvm-objcopy [3] to it * then everything except the .BTF section was stripped from .tmp_vmlinux1 into a .tmp_vmlinux1.bpf.o object [2], later linked into vmlinux * resolve_btfids was executed later on vmlinux.unstripped [4], updating it in-place After this patch gen-btf.sh implements the following: * pahole consumes .tmp_vmlinux1 and produces a *detached* file with raw BTF data * resolve_btfids consumes .tmp_vmlinux1 and detached BTF to produce (potentially modified) .BTF, and .BTF_ids sections data * a .tmp_vmlinux1.bpf.o object is then produced with objcopy copying BTF output of resolve_btfids * .BTF_ids data gets embedded into vmlinux.unstripped in link-vmlinux.sh by objcopy --update-section For kernel modules, creating a special .bpf.o file is not necessary, and so embedding of sections data produced by resolve_btfids is straightforward with objcopy. With this patch an ELF file becomes effectively read-only within resolve_btfids, which allows deleting elf_update() call and satellite code (like compressed_section_fix [5]). Endianness handling of .BTF_ids data is also changed. Previously the "flags" part of the section was bswapped in sets_patch() [6], and then Elf_Type was modified before elf_update() to signal to libelf that bswap may be necessary. With this patch we explicitly bswap entire data buffer on load and on dump. [1] https://lore.kernel.org/bpf/131b4190-9c49-4f79-a99d-c00fac97fa44@linux.dev/ [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/scripts/link-vmlinux.sh?h=v6.18#n110 [3] https://git.kernel.org/pub/scm/devel/pahole/pahole.git/tree/btf_encoder.c?h=v1.31#n1803 [4] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/scripts/link-vmlinux.sh?h=v6.18#n284 [5] https://lore.kernel.org/bpf/20200819092342.259004-1-jolsa@kernel.org/ [6] https://lore.kernel.org/bpf/cover.1707223196.git.vmalik@redhat.com/ Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Tested-by: Alan Maguire Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20251219181825.1289460-3-ihor.solodrai@linux.dev --- MAINTAINERS | 1 + scripts/Makefile.btf | 12 +- scripts/Makefile.modfinal | 5 +- scripts/Makefile.vmlinux | 2 +- scripts/gen-btf.sh | 157 +++++++++++++++ scripts/link-vmlinux.sh | 42 +--- tools/bpf/resolve_btfids/main.c | 224 +++++++++++++-------- tools/testing/selftests/bpf/.gitignore | 3 + tools/testing/selftests/bpf/Makefile | 9 +- .../selftests/bpf/prog_tests/resolve_btfids.c | 4 +- 10 files changed, 328 insertions(+), 131 deletions(-) create mode 100755 scripts/gen-btf.sh (limited to 'tools/testing') diff --git a/MAINTAINERS b/MAINTAINERS index 5b11839cba9d..cb1898a85b05 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4766,6 +4766,7 @@ F: net/sched/act_bpf.c F: net/sched/cls_bpf.c F: samples/bpf/ F: scripts/bpf_doc.py +F: scripts/gen-btf.sh F: scripts/Makefile.btf F: scripts/pahole-version.sh F: tools/bpf/ diff --git a/scripts/Makefile.btf b/scripts/Makefile.btf index 840a55de42da..562a04b40e06 100644 --- a/scripts/Makefile.btf +++ b/scripts/Makefile.btf @@ -18,13 +18,15 @@ pahole-flags-$(call test-ge, $(pahole-ver), 126) = -j$(JOBS) --btf_features=enc pahole-flags-$(call test-ge, $(pahole-ver), 130) += --btf_features=attributes -ifneq ($(KBUILD_EXTMOD),) -module-pahole-flags-$(call test-ge, $(pahole-ver), 128) += --btf_features=distilled_base -endif - endif pahole-flags-$(CONFIG_PAHOLE_HAS_LANG_EXCLUDE) += --lang_exclude=rust export PAHOLE_FLAGS := $(pahole-flags-y) -export MODULE_PAHOLE_FLAGS := $(module-pahole-flags-y) + +resolve-btfids-flags-y := +resolve-btfids-flags-$(CONFIG_WERROR) += --fatal_warnings +resolve-btfids-flags-$(if $(KBUILD_EXTMOD),y) += --distill_base +resolve-btfids-flags-$(if $(KBUILD_VERBOSE),y) += --verbose + +export RESOLVE_BTFIDS_FLAGS := $(resolve-btfids-flags-y) diff --git a/scripts/Makefile.modfinal b/scripts/Makefile.modfinal index 149e12ff5700..422c56dc878e 100644 --- a/scripts/Makefile.modfinal +++ b/scripts/Makefile.modfinal @@ -42,9 +42,8 @@ quiet_cmd_btf_ko = BTF [M] $@ cmd_btf_ko = \ if [ ! -f $(objtree)/vmlinux ]; then \ printf "Skipping BTF generation for %s due to unavailability of vmlinux\n" $@ 1>&2; \ - else \ - LLVM_OBJCOPY="$(OBJCOPY)" $(PAHOLE) -J $(PAHOLE_FLAGS) $(MODULE_PAHOLE_FLAGS) --btf_base $(objtree)/vmlinux $@; \ - $(RESOLVE_BTFIDS) -b $(objtree)/vmlinux $@; \ + else \ + $(srctree)/scripts/gen-btf.sh --btf_base $(objtree)/vmlinux $@; \ fi; # Same as newer-prereqs, but allows to exclude specified extra dependencies diff --git a/scripts/Makefile.vmlinux b/scripts/Makefile.vmlinux index cd788cac9d91..20a988f4fe0c 100644 --- a/scripts/Makefile.vmlinux +++ b/scripts/Makefile.vmlinux @@ -71,7 +71,7 @@ targets += vmlinux.unstripped .vmlinux.export.o vmlinux.unstripped: scripts/link-vmlinux.sh vmlinux.o .vmlinux.export.o $(KBUILD_LDS) FORCE +$(call if_changed_dep,link_vmlinux) ifdef CONFIG_DEBUG_INFO_BTF -vmlinux.unstripped: $(RESOLVE_BTFIDS) +vmlinux.unstripped: $(RESOLVE_BTFIDS) $(srctree)/scripts/gen-btf.sh endif ifdef CONFIG_BUILDTIME_TABLE_SORT diff --git a/scripts/gen-btf.sh b/scripts/gen-btf.sh new file mode 100755 index 000000000000..06c6d8becaa2 --- /dev/null +++ b/scripts/gen-btf.sh @@ -0,0 +1,157 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2025 Meta Platforms, Inc. and affiliates. +# +# This script generates BTF data for the provided ELF file. +# +# Kernel BTF generation involves these conceptual steps: +# 1. pahole generates BTF from DWARF data +# 2. resolve_btfids applies kernel-specific btf2btf +# transformations and computes data for .BTF_ids section +# 3. the result gets linked/objcopied into the target binary +# +# How step (3) should be done differs between vmlinux, and +# kernel modules, which is the primary reason for the existence +# of this script. +# +# For modules the script expects vmlinux passed in as --btf_base. +# Generated .BTF, .BTF.base and .BTF_ids sections become embedded +# into the input ELF file with objcopy. +# +# For vmlinux the input file remains unchanged and two files are produced: +# - ${1}.btf.o ready for linking into vmlinux +# - ${1}.BTF_ids with .BTF_ids data blob +# This output is consumed by scripts/link-vmlinux.sh + +set -e + +usage() +{ + echo "Usage: $0 [--btf_base ] " + exit 1 +} + +BTF_BASE="" + +while [ $# -gt 0 ]; do + case "$1" in + --btf_base) + BTF_BASE="$2" + shift 2 + ;; + -*) + echo "Unknown option: $1" >&2 + usage + ;; + *) + break + ;; + esac +done + +if [ $# -ne 1 ]; then + usage +fi + +ELF_FILE="$1" +shift + +is_enabled() { + grep -q "^$1=y" ${objtree}/include/config/auto.conf +} + +info() +{ + printf " %-7s %s\n" "${1}" "${2}" +} + +case "${KBUILD_VERBOSE}" in +*1*) + set -x + ;; +esac + + +gen_btf_data() +{ + info BTF "${ELF_FILE}" + btf1="${ELF_FILE}.BTF.1" + ${PAHOLE} -J ${PAHOLE_FLAGS} \ + ${BTF_BASE:+--btf_base ${BTF_BASE}} \ + --btf_encode_detached=${btf1} \ + "${ELF_FILE}" + + info BTFIDS "${ELF_FILE}" + ${RESOLVE_BTFIDS} ${RESOLVE_BTFIDS_FLAGS} \ + ${BTF_BASE:+--btf_base ${BTF_BASE}} \ + --btf ${btf1} "${ELF_FILE}" +} + +gen_btf_o() +{ + local btf_data=${ELF_FILE}.btf.o + + # Create ${btf_data} which contains just .BTF section but no symbols. Add + # SHF_ALLOC because .BTF will be part of the vmlinux image. --strip-all + # deletes all symbols including __start_BTF and __stop_BTF, which will + # be redefined in the linker script. + info OBJCOPY "${btf_data}" + echo "" | ${CC} ${CLANG_FLAGS} -c -x c -o ${btf_data} - + ${OBJCOPY} --add-section .BTF=${ELF_FILE}.BTF \ + --set-section-flags .BTF=alloc,readonly ${btf_data} + ${OBJCOPY} --only-section=.BTF --strip-all ${btf_data} + + # Change e_type to ET_REL so that it can be used to link final vmlinux. + # GNU ld 2.35+ and lld do not allow an ET_EXEC input. + if is_enabled CONFIG_CPU_BIG_ENDIAN; then + et_rel='\0\1' + else + et_rel='\1\0' + fi + printf "${et_rel}" | dd of="${btf_data}" conv=notrunc bs=1 seek=16 status=none +} + +embed_btf_data() +{ + info OBJCOPY "${ELF_FILE}.BTF" + ${OBJCOPY} --add-section .BTF=${ELF_FILE}.BTF ${ELF_FILE} + + # a module might not have a .BTF_ids or .BTF.base section + local btf_base="${ELF_FILE}.BTF.base" + if [ -f "${btf_base}" ]; then + ${OBJCOPY} --add-section .BTF.base=${btf_base} ${ELF_FILE} + fi + local btf_ids="${ELF_FILE}.BTF_ids" + if [ -f "${btf_ids}" ]; then + ${OBJCOPY} --update-section .BTF_ids=${btf_ids} ${ELF_FILE} + fi +} + +cleanup() +{ + rm -f "${ELF_FILE}.BTF.1" + rm -f "${ELF_FILE}.BTF" + if [ "${BTFGEN_MODE}" = "module" ]; then + rm -f "${ELF_FILE}.BTF.base" + rm -f "${ELF_FILE}.BTF_ids" + fi +} +trap cleanup EXIT + +BTFGEN_MODE="vmlinux" +if [ -n "${BTF_BASE}" ]; then + BTFGEN_MODE="module" +fi + +gen_btf_data + +case "${BTFGEN_MODE}" in +vmlinux) + gen_btf_o + ;; +module) + embed_btf_data + ;; +esac + +exit 0 diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 4ab44c73da4d..e2207e612ac3 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -106,34 +106,6 @@ vmlinux_link() ${kallsymso} ${btf_vmlinux_bin_o} ${arch_vmlinux_o} ${ldlibs} } -# generate .BTF typeinfo from DWARF debuginfo -# ${1} - vmlinux image -gen_btf() -{ - local btf_data=${1}.btf.o - - info BTF "${btf_data}" - LLVM_OBJCOPY="${OBJCOPY}" ${PAHOLE} -J ${PAHOLE_FLAGS} ${1} - - # Create ${btf_data} which contains just .BTF section but no symbols. Add - # SHF_ALLOC because .BTF will be part of the vmlinux image. --strip-all - # deletes all symbols including __start_BTF and __stop_BTF, which will - # be redefined in the linker script. Add 2>/dev/null to suppress GNU - # objcopy warnings: "empty loadable segment detected at ..." - ${OBJCOPY} --only-section=.BTF --set-section-flags .BTF=alloc,readonly \ - --strip-all ${1} "${btf_data}" 2>/dev/null - # Change e_type to ET_REL so that it can be used to link final vmlinux. - # GNU ld 2.35+ and lld do not allow an ET_EXEC input. - if is_enabled CONFIG_CPU_BIG_ENDIAN; then - et_rel='\0\1' - else - et_rel='\1\0' - fi - printf "${et_rel}" | dd of="${btf_data}" conv=notrunc bs=1 seek=16 status=none - - btf_vmlinux_bin_o=${btf_data} -} - # Create ${2}.o file with all symbols from the ${1} object file kallsyms() { @@ -205,6 +177,7 @@ if is_enabled CONFIG_ARCH_WANTS_PRE_LINK_VMLINUX; then fi btf_vmlinux_bin_o= +btfids_vmlinux= kallsymso= strip_debug= generate_map= @@ -232,11 +205,13 @@ if is_enabled CONFIG_KALLSYMS || is_enabled CONFIG_DEBUG_INFO_BTF; then fi if is_enabled CONFIG_DEBUG_INFO_BTF; then - if ! gen_btf .tmp_vmlinux1; then + if ! ${srctree}/scripts/gen-btf.sh .tmp_vmlinux1; then echo >&2 "Failed to generate BTF for vmlinux" echo >&2 "Try to disable CONFIG_DEBUG_INFO_BTF" exit 1 fi + btf_vmlinux_bin_o=.tmp_vmlinux1.btf.o + btfids_vmlinux=.tmp_vmlinux1.BTF_ids fi if is_enabled CONFIG_KALLSYMS; then @@ -289,14 +264,9 @@ fi vmlinux_link "${VMLINUX}" -# fill in BTF IDs if is_enabled CONFIG_DEBUG_INFO_BTF; then - info BTFIDS "${VMLINUX}" - RESOLVE_BTFIDS_ARGS="" - if is_enabled CONFIG_WERROR; then - RESOLVE_BTFIDS_ARGS=" --fatal_warnings " - fi - ${RESOLVE_BTFIDS} ${RESOLVE_BTFIDS_ARGS} "${VMLINUX}" + info OBJCOPY ${btfids_vmlinux} + ${OBJCOPY} --update-section .BTF_ids=${btfids_vmlinux} ${VMLINUX} fi mksysmap "${VMLINUX}" System.map diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index e721e20a2bbd..2cbc252259be 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -71,9 +71,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -124,6 +126,7 @@ struct object { struct btf *btf; struct btf *base_btf; + bool distill_base; struct { int fd; @@ -324,42 +327,16 @@ static struct btf_id *add_symbol(struct rb_root *root, char *name, size_t size) return btf_id__add(root, id, BTF_ID_KIND_SYM); } -/* Older libelf.h and glibc elf.h might not yet define the ELF compression types. */ -#ifndef SHF_COMPRESSED -#define SHF_COMPRESSED (1 << 11) /* Section with compressed data. */ -#endif - -/* - * The data of compressed section should be aligned to 4 - * (for 32bit) or 8 (for 64 bit) bytes. The binutils ld - * sets sh_addralign to 1, which makes libelf fail with - * misaligned section error during the update: - * FAILED elf_update(WRITE): invalid section alignment - * - * While waiting for ld fix, we fix the compressed sections - * sh_addralign value manualy. - */ -static int compressed_section_fix(Elf *elf, Elf_Scn *scn, GElf_Shdr *sh) +static void bswap_32_data(void *data, u32 nr_bytes) { - int expected = gelf_getclass(elf) == ELFCLASS32 ? 4 : 8; - - if (!(sh->sh_flags & SHF_COMPRESSED)) - return 0; - - if (sh->sh_addralign == expected) - return 0; + u32 cnt, i; + u32 *ptr; - pr_debug2(" - fixing wrong alignment sh_addralign %u, expected %u\n", - sh->sh_addralign, expected); + cnt = nr_bytes / sizeof(u32); + ptr = data; - sh->sh_addralign = expected; - - if (gelf_update_shdr(scn, sh) == 0) { - pr_err("FAILED cannot update section header: %s\n", - elf_errmsg(-1)); - return -1; - } - return 0; + for (i = 0; i < cnt; i++) + ptr[i] = bswap_32(ptr[i]); } static int elf_collect(struct object *obj) @@ -380,7 +357,7 @@ static int elf_collect(struct object *obj) elf_version(EV_CURRENT); - elf = elf_begin(fd, ELF_C_RDWR_MMAP, NULL); + elf = elf_begin(fd, ELF_C_READ_MMAP_PRIVATE, NULL); if (!elf) { close(fd); pr_err("FAILED cannot create ELF descriptor: %s\n", @@ -443,21 +420,20 @@ static int elf_collect(struct object *obj) obj->efile.symbols_shndx = idx; obj->efile.strtabidx = sh.sh_link; } else if (!strcmp(name, BTF_IDS_SECTION)) { + /* + * If target endianness differs from host, we need to bswap32 + * the .BTF_ids section data on load, because .BTF_ids has + * Elf_Type = ELF_T_BYTE, and so libelf returns data buffer in + * the target endianness. We repeat this on dump. + */ + if (obj->efile.encoding != ELFDATANATIVE) { + pr_debug("bswap_32 .BTF_ids data from target to host endianness\n"); + bswap_32_data(data->d_buf, data->d_size); + } obj->efile.idlist = data; obj->efile.idlist_shndx = idx; obj->efile.idlist_addr = sh.sh_addr; - } else if (!strcmp(name, BTF_BASE_ELF_SEC)) { - /* If a .BTF.base section is found, do not resolve - * BTF ids relative to vmlinux; resolve relative - * to the .BTF.base section instead. btf__parse_split() - * will take care of this once the base BTF it is - * passed is NULL. - */ - obj->base_btf_path = NULL; } - - if (compressed_section_fix(elf, scn, &sh)) - return -1; } return 0; @@ -587,11 +563,26 @@ static int load_btf(struct object *obj) obj->base_btf = base_btf; obj->btf = btf; + if (obj->base_btf && obj->distill_base) { + err = btf__distill_base(obj->btf, &base_btf, &btf); + if (err) { + pr_err("FAILED to distill base BTF: %s\n", strerror(errno)); + goto out_err; + } + + btf__free(obj->base_btf); + btf__free(obj->btf); + obj->base_btf = base_btf; + obj->btf = btf; + } + return 0; out_err: btf__free(base_btf); btf__free(btf); + obj->base_btf = NULL; + obj->btf = NULL; return err; } @@ -760,24 +751,6 @@ static int sets_patch(struct object *obj) */ BUILD_BUG_ON((u32 *)set8->pairs != &set8->pairs[0].id); qsort(set8->pairs, set8->cnt, sizeof(set8->pairs[0]), cmp_id); - - /* - * When ELF endianness does not match endianness of the - * host, libelf will do the translation when updating - * the ELF. This, however, corrupts SET8 flags which are - * already in the target endianness. So, let's bswap - * them to the host endianness and libelf will then - * correctly translate everything. - */ - if (obj->efile.encoding != ELFDATANATIVE) { - int i; - - set8->flags = bswap_32(set8->flags); - for (i = 0; i < set8->cnt; i++) { - set8->pairs[i].flags = - bswap_32(set8->pairs[i].flags); - } - } break; default: pr_err("Unexpected btf_id_kind %d for set '%s'\n", id->kind, id->name); @@ -793,8 +766,6 @@ static int sets_patch(struct object *obj) static int symbols_patch(struct object *obj) { - off_t err; - if (__symbols_patch(obj, &obj->structs) || __symbols_patch(obj, &obj->unions) || __symbols_patch(obj, &obj->typedefs) || @@ -805,20 +776,90 @@ static int symbols_patch(struct object *obj) if (sets_patch(obj)) return -1; - /* Set type to ensure endian translation occurs. */ - obj->efile.idlist->d_type = ELF_T_WORD; + return 0; +} - elf_flagdata(obj->efile.idlist, ELF_C_SET, ELF_F_DIRTY); +static int dump_raw_data(const char *out_path, const void *data, u32 size) +{ + size_t written; + FILE *file; - err = elf_update(obj->efile.elf, ELF_C_WRITE); - if (err < 0) { - pr_err("FAILED elf_update(WRITE): %s\n", - elf_errmsg(-1)); + file = fopen(out_path, "wb"); + if (!file) { + pr_err("Couldn't open %s for writing\n", out_path); + return -1; + } + + written = fwrite(data, 1, size, file); + if (written != size) { + pr_err("Failed to write data to %s\n", out_path); + fclose(file); + unlink(out_path); + return -1; + } + + fclose(file); + pr_debug("Dumped %lu bytes of data to %s\n", size, out_path); + + return 0; +} + +static int dump_raw_btf_ids(struct object *obj, const char *out_path) +{ + Elf_Data *data = obj->efile.idlist; + int err; + + if (!data || !data->d_buf) { + pr_debug("%s has no BTF_ids data to dump\n", obj->path); + return 0; + } + + /* + * If target endianness differs from host, we need to bswap32 the + * .BTF_ids section data before dumping so that the output is in + * target endianness. + */ + if (obj->efile.encoding != ELFDATANATIVE) { + pr_debug("bswap_32 .BTF_ids data from host to target endianness\n"); + bswap_32_data(data->d_buf, data->d_size); + } + + err = dump_raw_data(out_path, data->d_buf, data->d_size); + if (err) + return -1; + + return 0; +} + +static int dump_raw_btf(struct btf *btf, const char *out_path) +{ + const void *raw_btf_data; + u32 raw_btf_size; + int err; + + raw_btf_data = btf__raw_data(btf, &raw_btf_size); + if (!raw_btf_data) { + pr_err("btf__raw_data() failed\n"); + return -1; } - pr_debug("update %s for %s\n", - err >= 0 ? "ok" : "failed", obj->path); - return err < 0 ? -1 : 0; + err = dump_raw_data(out_path, raw_btf_data, raw_btf_size); + if (err) + return -1; + + return 0; +} + +static inline int make_out_path(char *buf, u32 buf_sz, const char *in_path, const char *suffix) +{ + int len = snprintf(buf, buf_sz, "%s%s", in_path, suffix); + + if (len < 0 || len >= buf_sz) { + pr_err("Output path is too long: %s%s\n", in_path, suffix); + return -E2BIG; + } + + return 0; } static const char * const resolve_btfids_usage[] = { @@ -840,6 +881,8 @@ int main(int argc, const char **argv) .sets = RB_ROOT, }; bool fatal_warnings = false; + char out_path[PATH_MAX]; + struct option btfid_options[] = { OPT_INCR('v', "verbose", &verbose, "be more verbose (show errors, etc)"), @@ -849,6 +892,8 @@ int main(int argc, const char **argv) "path of file providing base BTF"), OPT_BOOLEAN(0, "fatal_warnings", &fatal_warnings, "turn warnings into errors"), + OPT_BOOLEAN(0, "distill_base", &obj.distill_base, + "distill --btf_base and emit .BTF.base section data"), OPT_END() }; int err = -1; @@ -860,6 +905,9 @@ int main(int argc, const char **argv) obj.path = argv[0]; + if (load_btf(&obj)) + goto out; + if (elf_collect(&obj)) goto out; @@ -869,23 +917,37 @@ int main(int argc, const char **argv) */ if (obj.efile.idlist_shndx == -1 || obj.efile.symbols_shndx == -1) { - pr_debug("Cannot find .BTF_ids or symbols sections, nothing to do\n"); - err = 0; - goto out; + pr_debug("Cannot find .BTF_ids or symbols sections, skip symbols resolution\n"); + goto dump_btf; } if (symbols_collect(&obj)) goto out; - if (load_btf(&obj)) - goto out; - if (symbols_resolve(&obj)) goto out; if (symbols_patch(&obj)) goto out; + err = make_out_path(out_path, sizeof(out_path), obj.path, BTF_IDS_SECTION); + err = err ?: dump_raw_btf_ids(&obj, out_path); + if (err) + goto out; + +dump_btf: + err = make_out_path(out_path, sizeof(out_path), obj.path, BTF_ELF_SEC); + err = err ?: dump_raw_btf(obj.btf, out_path); + if (err) + goto out; + + if (obj.base_btf && obj.distill_base) { + err = make_out_path(out_path, sizeof(out_path), obj.path, BTF_BASE_ELF_SEC); + err = err ?: dump_raw_btf(obj.base_btf, out_path); + if (err) + goto out; + } + if (!(fatal_warnings && warnings)) err = 0; out: diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 19c1638e312a..b8bf51b7a0b0 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -45,3 +45,6 @@ xdp_synproxy xdp_hw_metadata xdp_features verification_cert.h +*.BTF +*.BTF_ids +*.BTF.base diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index ffd0a4c354c7..f28a32b16ff0 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -4,6 +4,7 @@ include ../../../scripts/Makefile.arch include ../../../scripts/Makefile.include CXX ?= $(CROSS_COMPILE)g++ +OBJCOPY ?= $(CROSS_COMPILE)objcopy CURDIR := $(abspath .) TOOLSDIR := $(abspath ../../..) @@ -653,9 +654,10 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ | $(TRUNNER_OUTPUT)/%.test.d $$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@) $(Q)cd $$(@D) && $$(CC) -I. $$(CFLAGS) -MMD -MT $$@ -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) - $$(if $$(TEST_NEEDS_BTFIDS), \ - $$(call msg,BTFIDS,$(TRUNNER_BINARY),$$@) \ - $(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@) + $$(if $$(TEST_NEEDS_BTFIDS), \ + $$(call msg,BTFIDS,$(TRUNNER_BINARY),$$@) \ + $(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@; \ + $(OBJCOPY) --update-section .BTF_ids=$$@.BTF_ids $$@) $(TRUNNER_TEST_OBJS:.o=.d): $(TRUNNER_OUTPUT)/%.test.d: \ $(TRUNNER_TESTS_DIR)/%.c \ @@ -894,6 +896,7 @@ EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ feature bpftool $(TEST_KMOD_TARGETS) \ $(addprefix $(OUTPUT)/,*.o *.d *.skel.h *.lskel.h *.subskel.h \ + *.BTF *.BTF_ids *.BTF.base \ no_alu32 cpuv4 bpf_gcc \ liburandom_read.so) \ $(OUTPUT)/FEATURE-DUMP.selftests \ diff --git a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c index 51544372f52e..41dfaaabb73f 100644 --- a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c +++ b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c @@ -101,9 +101,9 @@ static int resolve_symbols(void) int type_id; __u32 nr; - btf = btf__parse_elf("btf_data.bpf.o", NULL); + btf = btf__parse_raw("resolve_btfids.test.o.BTF"); if (CHECK(libbpf_get_error(btf), "resolve", - "Failed to load BTF from btf_data.bpf.o\n")) + "Failed to load BTF from resolve_btfids.test.o.BTF\n")) return -1; nr = btf__type_cnt(btf); -- cgit v1.2.3 From d2749ae85aec685e52e0474f445f6a8552363eb0 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Tue, 16 Dec 2025 13:30:00 +0000 Subject: selftests/bpf: add test case for BPF LSM hook bpf_lsm_mmap_file Add a trivial test case asserting that the BPF verifier enforces PTR_MAYBE_NULL semantics on the struct file pointer argument of BPF LSM hook bpf_lsm_mmap_file(). Dereferencing the struct file pointer passed into bpf_lsm_mmap_file() without explicitly performing a NULL check first should not be permitted by the BPF verifier as it can lead to NULL pointer dereferences and a kernel crash. Signed-off-by: Matt Bobrowski Acked-by: Song Liu Link: https://lore.kernel.org/r/20251216133000.3690723-2-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_lsm.c | 31 +++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/progs/verifier_lsm.c b/tools/testing/selftests/bpf/progs/verifier_lsm.c index 6af9100a37ff..38e8e9176862 100644 --- a/tools/testing/selftests/bpf/progs/verifier_lsm.c +++ b/tools/testing/selftests/bpf/progs/verifier_lsm.c @@ -1,7 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include +#include #include "bpf_misc.h" SEC("lsm/file_permission") @@ -159,4 +160,32 @@ __naked int disabled_hook_test3(void *ctx) ::: __clobber_all); } +SEC("lsm/mmap_file") +__description("not null checking nullable pointer in bpf_lsm_mmap_file") +__failure __msg("R1 invalid mem access 'trusted_ptr_or_null_'") +int BPF_PROG(no_null_check, struct file *file) +{ + struct inode *inode; + + inode = file->f_inode; + __sink(inode); + + return 0; +} + +SEC("lsm/mmap_file") +__description("null checking nullable pointer in bpf_lsm_mmap_file") +__success +int BPF_PROG(null_check, struct file *file) +{ + struct inode *inode; + + if (file) { + inode = file->f_inode; + __sink(inode); + } + + return 0; +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 6bce6ddbe634bbc6d21672b5bfdbb5ad0409bd8d Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Mon, 22 Dec 2025 20:41:55 -0800 Subject: bpf: selftests: selftests for memcg stat kfuncs Add test coverage for the kfuncs that fetch memcg stats. Using some common stats, test scenarios ensuring that the given stat increases by some arbitrary amount. The stats selected cover the three categories represented by the enums: node_stat_item, memcg_stat_item, vm_event_item. Since only a subset of all stats are queried, use a static struct made up of fields for each stat. Write to the struct with the fetched values when the bpf program is invoked and read the fields in the user mode program for verification. Signed-off-by: JP Kobryn Signed-off-by: Roman Gushchin Link: https://lore.kernel.org/r/20251223044156.208250-6-roman.gushchin@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/cgroup_iter_memcg.h | 18 ++ .../selftests/bpf/prog_tests/cgroup_iter_memcg.c | 223 +++++++++++++++++++++ .../selftests/bpf/progs/cgroup_iter_memcg.c | 39 ++++ 3 files changed, 280 insertions(+) create mode 100644 tools/testing/selftests/bpf/cgroup_iter_memcg.h create mode 100644 tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c create mode 100644 tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/cgroup_iter_memcg.h b/tools/testing/selftests/bpf/cgroup_iter_memcg.h new file mode 100644 index 000000000000..3f59b127943b --- /dev/null +++ b/tools/testing/selftests/bpf/cgroup_iter_memcg.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#ifndef __CGROUP_ITER_MEMCG_H +#define __CGROUP_ITER_MEMCG_H + +struct memcg_query { + /* some node_stat_item's */ + unsigned long nr_anon_mapped; + unsigned long nr_shmem; + unsigned long nr_file_pages; + unsigned long nr_file_mapped; + /* some memcg_stat_item */ + unsigned long memcg_kmem; + /* some vm_event_item */ + unsigned long pgfault; +}; + +#endif /* __CGROUP_ITER_MEMCG_H */ diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c b/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c new file mode 100644 index 000000000000..a5afd16705f0 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include +#include +#include +#include +#include "cgroup_helpers.h" +#include "cgroup_iter_memcg.h" +#include "cgroup_iter_memcg.skel.h" + +static int read_stats(struct bpf_link *link) +{ + int fd, ret = 0; + ssize_t bytes; + + fd = bpf_iter_create(bpf_link__fd(link)); + if (!ASSERT_OK_FD(fd, "bpf_iter_create")) + return 1; + + /* + * Invoke iter program by reading from its fd. We're not expecting any + * data to be written by the bpf program so the result should be zero. + * Results will be read directly through the custom data section + * accessible through skel->data_query.memcg_query. + */ + bytes = read(fd, NULL, 0); + if (!ASSERT_EQ(bytes, 0, "read fd")) + ret = 1; + + close(fd); + return ret; +} + +static void test_anon(struct bpf_link *link, struct memcg_query *memcg_query) +{ + void *map; + size_t len; + + len = sysconf(_SC_PAGESIZE) * 1024; + + /* + * Increase memcg anon usage by mapping and writing + * to a new anon region. + */ + map = mmap(NULL, len, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (!ASSERT_NEQ(map, MAP_FAILED, "mmap anon")) + return; + + memset(map, 1, len); + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup; + + ASSERT_GT(memcg_query->nr_anon_mapped, 0, "final anon mapped val"); + +cleanup: + munmap(map, len); +} + +static void test_file(struct bpf_link *link, struct memcg_query *memcg_query) +{ + void *map; + size_t len; + char *path; + int fd; + + len = sysconf(_SC_PAGESIZE) * 1024; + path = "/tmp/test_cgroup_iter_memcg"; + + /* + * Increase memcg file usage by creating and writing + * to a mapped file. + */ + fd = open(path, O_CREAT | O_RDWR, 0644); + if (!ASSERT_OK_FD(fd, "open fd")) + return; + if (!ASSERT_OK(ftruncate(fd, len), "ftruncate")) + goto cleanup_fd; + + map = mmap(NULL, len, PROT_WRITE, MAP_SHARED, fd, 0); + if (!ASSERT_NEQ(map, MAP_FAILED, "mmap file")) + goto cleanup_fd; + + memset(map, 1, len); + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup_map; + + ASSERT_GT(memcg_query->nr_file_pages, 0, "final file value"); + ASSERT_GT(memcg_query->nr_file_mapped, 0, "final file mapped value"); + +cleanup_map: + munmap(map, len); +cleanup_fd: + close(fd); + unlink(path); +} + +static void test_shmem(struct bpf_link *link, struct memcg_query *memcg_query) +{ + size_t len; + int fd; + + len = sysconf(_SC_PAGESIZE) * 1024; + + /* + * Increase memcg shmem usage by creating and writing + * to a shmem object. + */ + fd = shm_open("/tmp_shmem", O_CREAT | O_RDWR, 0644); + if (!ASSERT_OK_FD(fd, "shm_open")) + return; + + if (!ASSERT_OK(fallocate(fd, 0, 0, len), "fallocate")) + goto cleanup; + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup; + + ASSERT_GT(memcg_query->nr_shmem, 0, "final shmem value"); + +cleanup: + close(fd); + shm_unlink("/tmp_shmem"); +} + +#define NR_PIPES 64 +static void test_kmem(struct bpf_link *link, struct memcg_query *memcg_query) +{ + int fds[NR_PIPES][2], i; + + /* + * Increase kmem value by creating pipes which will allocate some + * kernel buffers. + */ + for (i = 0; i < NR_PIPES; i++) { + if (!ASSERT_OK(pipe(fds[i]), "pipe")) + goto cleanup; + } + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup; + + ASSERT_GT(memcg_query->memcg_kmem, 0, "kmem value"); + +cleanup: + for (i = i - 1; i >= 0; i--) { + close(fds[i][0]); + close(fds[i][1]); + } +} + +static void test_pgfault(struct bpf_link *link, struct memcg_query *memcg_query) +{ + void *map; + size_t len; + + len = sysconf(_SC_PAGESIZE) * 1024; + + /* Create region to use for triggering a page fault. */ + map = mmap(NULL, len, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (!ASSERT_NEQ(map, MAP_FAILED, "mmap anon")) + return; + + /* Trigger page fault. */ + memset(map, 1, len); + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup; + + ASSERT_GT(memcg_query->pgfault, 0, "final pgfault val"); + +cleanup: + munmap(map, len); +} + +void test_cgroup_iter_memcg(void) +{ + char *cgroup_rel_path = "/cgroup_iter_memcg_test"; + struct cgroup_iter_memcg *skel; + struct bpf_link *link; + int cgroup_fd; + + cgroup_fd = cgroup_setup_and_join(cgroup_rel_path); + if (!ASSERT_OK_FD(cgroup_fd, "cgroup_setup_and_join")) + return; + + skel = cgroup_iter_memcg__open_and_load(); + if (!ASSERT_OK_PTR(skel, "cgroup_iter_memcg__open_and_load")) + goto cleanup_cgroup_fd; + + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); + union bpf_iter_link_info linfo = { + .cgroup.cgroup_fd = cgroup_fd, + .cgroup.order = BPF_CGROUP_ITER_SELF_ONLY, + }; + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + + link = bpf_program__attach_iter(skel->progs.cgroup_memcg_query, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_iter")) + goto cleanup_skel; + + if (test__start_subtest("cgroup_iter_memcg__anon")) + test_anon(link, &skel->data_query->memcg_query); + if (test__start_subtest("cgroup_iter_memcg__shmem")) + test_shmem(link, &skel->data_query->memcg_query); + if (test__start_subtest("cgroup_iter_memcg__file")) + test_file(link, &skel->data_query->memcg_query); + if (test__start_subtest("cgroup_iter_memcg__kmem")) + test_kmem(link, &skel->data_query->memcg_query); + if (test__start_subtest("cgroup_iter_memcg__pgfault")) + test_pgfault(link, &skel->data_query->memcg_query); + + bpf_link__destroy(link); +cleanup_skel: + cgroup_iter_memcg__destroy(skel); +cleanup_cgroup_fd: + close(cgroup_fd); + cleanup_cgroup_environment(); +} diff --git a/tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c b/tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c new file mode 100644 index 000000000000..59fb70a3cc50 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include "cgroup_iter_memcg.h" + +char _license[] SEC("license") = "GPL"; + +/* The latest values read are stored here. */ +struct memcg_query memcg_query SEC(".data.query"); + +SEC("iter.s/cgroup") +int cgroup_memcg_query(struct bpf_iter__cgroup *ctx) +{ + struct cgroup *cgrp = ctx->cgroup; + struct cgroup_subsys_state *css; + struct mem_cgroup *memcg; + + if (!cgrp) + return 1; + + css = &cgrp->self; + memcg = bpf_get_mem_cgroup(css); + if (!memcg) + return 1; + + bpf_mem_cgroup_flush_stats(memcg); + + memcg_query.nr_anon_mapped = bpf_mem_cgroup_page_state(memcg, NR_ANON_MAPPED); + memcg_query.nr_shmem = bpf_mem_cgroup_page_state(memcg, NR_SHMEM); + memcg_query.nr_file_pages = bpf_mem_cgroup_page_state(memcg, NR_FILE_PAGES); + memcg_query.nr_file_mapped = bpf_mem_cgroup_page_state(memcg, NR_FILE_MAPPED); + memcg_query.memcg_kmem = bpf_mem_cgroup_page_state(memcg, MEMCG_KMEM); + memcg_query.pgfault = bpf_mem_cgroup_vm_events(memcg, PGFAULT); + + bpf_put_mem_cgroup(memcg); + + return 0; +} -- cgit v1.2.3 From 83dd46ecb68ecc03cff23e68490ded5d40d79f66 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 22 Dec 2025 05:32:46 -0800 Subject: selftests: bpf: fix tests with raw_tp calling kfuncs As the previous commit allowed raw_tp programs to call kfuncs, so of the selftests that were expected to fail will now succeed. Signed-off-by: Puranjay Mohan Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20251222133250.1890587-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/dynptr_fail.c | 2 +- tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index dda6a8dada82..8f2ae9640886 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -1465,7 +1465,7 @@ int xdp_invalid_data_slice2(struct xdp_md *xdp) } /* Only supported prog type can create skb-type dynptrs */ -SEC("?raw_tp") +SEC("?xdp") __failure __msg("calling kernel function bpf_dynptr_from_skb is not allowed") int skb_invalid_ctx(void *ctx) { diff --git a/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c b/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c index a509cad97e69..1fce7a7e8d03 100644 --- a/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c +++ b/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c @@ -32,7 +32,7 @@ static void task_kfunc_load_test(void) } SEC("raw_tp") -__failure __msg("calling kernel function") +__success int BPF_PROG(task_kfunc_raw_tp) { task_kfunc_load_test(); @@ -86,7 +86,7 @@ static void cgrp_kfunc_load_test(void) } SEC("raw_tp") -__failure __msg("calling kernel function") +__success int BPF_PROG(cgrp_kfunc_raw_tp) { cgrp_kfunc_load_test(); @@ -138,7 +138,7 @@ static void cpumask_kfunc_load_test(void) } SEC("raw_tp") -__failure __msg("calling kernel function") +__success int BPF_PROG(cpumask_kfunc_raw_tp) { cpumask_kfunc_load_test(); -- cgit v1.2.3 From efecc9e825f4aa3fe616236152604a066a3e776d Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 22 Dec 2025 11:50:19 -0800 Subject: selftests: bpf: test non-sleepable arena allocations As arena kfuncs can now be called from non-sleepable contexts, test this by adding non-sleepable copies of tests in verifier_arena, this is done by using a socket program instead of syscall. Add a new test case in verifier_arena_large to check that the bpf_arena_alloc_pages() works for more than 1024 pages. 1024 * sizeof(struct page *) is the upper limit of kmalloc_nolock() but bpf_arena_alloc_pages() should still succeed because it re-uses this array in a loop. Augment the arena_list selftest to also run in non-sleepable context by taking rcu_read_lock. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20251222195022.431211-5-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/arena_list.c | 20 ++- tools/testing/selftests/bpf/progs/arena_list.c | 11 ++ tools/testing/selftests/bpf/progs/verifier_arena.c | 185 +++++++++++++++++++++ .../selftests/bpf/progs/verifier_arena_large.c | 29 ++++ 4 files changed, 240 insertions(+), 5 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/arena_list.c b/tools/testing/selftests/bpf/prog_tests/arena_list.c index d15867cddde0..4f2866a615ce 100644 --- a/tools/testing/selftests/bpf/prog_tests/arena_list.c +++ b/tools/testing/selftests/bpf/prog_tests/arena_list.c @@ -27,17 +27,23 @@ static int list_sum(struct arena_list_head *head) return sum; } -static void test_arena_list_add_del(int cnt) +static void test_arena_list_add_del(int cnt, bool nonsleepable) { LIBBPF_OPTS(bpf_test_run_opts, opts); struct arena_list *skel; int expected_sum = (u64)cnt * (cnt - 1) / 2; int ret, sum; - skel = arena_list__open_and_load(); - if (!ASSERT_OK_PTR(skel, "arena_list__open_and_load")) + skel = arena_list__open(); + if (!ASSERT_OK_PTR(skel, "arena_list__open")) return; + skel->rodata->nonsleepable = nonsleepable; + + ret = arena_list__load(skel); + if (!ASSERT_OK(ret, "arena_list__load")) + goto out; + skel->bss->cnt = cnt; ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_list_add), &opts); ASSERT_OK(ret, "ret_add"); @@ -65,7 +71,11 @@ out: void test_arena_list(void) { if (test__start_subtest("arena_list_1")) - test_arena_list_add_del(1); + test_arena_list_add_del(1, false); if (test__start_subtest("arena_list_1000")) - test_arena_list_add_del(1000); + test_arena_list_add_del(1000, false); + if (test__start_subtest("arena_list_1_nonsleepable")) + test_arena_list_add_del(1, true); + if (test__start_subtest("arena_list_1000_nonsleepable")) + test_arena_list_add_del(1000, true); } diff --git a/tools/testing/selftests/bpf/progs/arena_list.c b/tools/testing/selftests/bpf/progs/arena_list.c index 3a2ddcacbea6..235d8cc95bdd 100644 --- a/tools/testing/selftests/bpf/progs/arena_list.c +++ b/tools/testing/selftests/bpf/progs/arena_list.c @@ -30,6 +30,7 @@ struct arena_list_head __arena *list_head; int list_sum; int cnt; bool skip = false; +const volatile bool nonsleepable = false; #ifdef __BPF_FEATURE_ADDR_SPACE_CAST long __arena arena_sum; @@ -42,6 +43,9 @@ int test_val SEC(".addr_space.1"); int zero; +void bpf_rcu_read_lock(void) __ksym; +void bpf_rcu_read_unlock(void) __ksym; + SEC("syscall") int arena_list_add(void *ctx) { @@ -71,6 +75,10 @@ int arena_list_del(void *ctx) struct elem __arena *n; int sum = 0; + /* Take rcu_read_lock to test non-sleepable context */ + if (nonsleepable) + bpf_rcu_read_lock(); + arena_sum = 0; list_for_each_entry(n, list_head, node) { sum += n->value; @@ -79,6 +87,9 @@ int arena_list_del(void *ctx) bpf_free(n); } list_sum = sum; + + if (nonsleepable) + bpf_rcu_read_unlock(); #else skip = true; #endif diff --git a/tools/testing/selftests/bpf/progs/verifier_arena.c b/tools/testing/selftests/bpf/progs/verifier_arena.c index 7f4827eede3c..4a9d96344813 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena.c @@ -21,6 +21,37 @@ struct { #endif } arena SEC(".maps"); +SEC("socket") +__success __retval(0) +int basic_alloc1_nosleep(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + volatile int __arena *page1, *page2, *no_page; + + page1 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!page1) + return 1; + *page1 = 1; + page2 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!page2) + return 2; + *page2 = 2; + no_page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (no_page) + return 3; + if (*page1 != 1) + return 4; + if (*page2 != 2) + return 5; + bpf_arena_free_pages(&arena, (void __arena *)page2, 1); + if (*page1 != 1) + return 6; + if (*page2 != 0 && *page2 != 2) /* use-after-free should return 0 or the stored value */ + return 7; +#endif + return 0; +} + SEC("syscall") __success __retval(0) int basic_alloc1(void *ctx) @@ -60,6 +91,44 @@ int basic_alloc1(void *ctx) return 0; } +SEC("socket") +__success __retval(0) +int basic_alloc2_nosleep(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + volatile char __arena *page1, *page2, *page3, *page4; + + page1 = bpf_arena_alloc_pages(&arena, NULL, 2, NUMA_NO_NODE, 0); + if (!page1) + return 1; + page2 = page1 + __PAGE_SIZE; + page3 = page1 + __PAGE_SIZE * 2; + page4 = page1 - __PAGE_SIZE; + *page1 = 1; + *page2 = 2; + *page3 = 3; + *page4 = 4; + if (*page1 != 1) + return 1; + if (*page2 != 2) + return 2; + if (*page3 != 0) + return 3; + if (*page4 != 0) + return 4; + bpf_arena_free_pages(&arena, (void __arena *)page1, 2); + if (*page1 != 0 && *page1 != 1) + return 5; + if (*page2 != 0 && *page2 != 2) + return 6; + if (*page3 != 0) + return 7; + if (*page4 != 0) + return 8; +#endif + return 0; +} + SEC("syscall") __success __retval(0) int basic_alloc2(void *ctx) @@ -102,6 +171,19 @@ struct bpf_arena___l { struct bpf_map map; } __attribute__((preserve_access_index)); +SEC("socket") +__success __retval(0) __log_level(2) +int basic_alloc3_nosleep(void *ctx) +{ + struct bpf_arena___l *ar = (struct bpf_arena___l *)&arena; + volatile char __arena *pages; + + pages = bpf_arena_alloc_pages(&ar->map, NULL, ar->map.max_entries, NUMA_NO_NODE, 0); + if (!pages) + return 1; + return 0; +} + SEC("syscall") __success __retval(0) __log_level(2) int basic_alloc3(void *ctx) @@ -115,6 +197,38 @@ int basic_alloc3(void *ctx) return 0; } +SEC("socket") +__success __retval(0) +int basic_reserve1_nosleep(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + char __arena *page; + int ret; + + page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!page) + return 1; + + page += __PAGE_SIZE; + + /* Reserve the second page */ + ret = bpf_arena_reserve_pages(&arena, page, 1); + if (ret) + return 2; + + /* Try to explicitly allocate the reserved page. */ + page = bpf_arena_alloc_pages(&arena, page, 1, NUMA_NO_NODE, 0); + if (page) + return 3; + + /* Try to implicitly allocate the page (since there's only 2 of them). */ + page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (page) + return 4; +#endif + return 0; +} + SEC("syscall") __success __retval(0) int basic_reserve1(void *ctx) @@ -147,6 +261,26 @@ int basic_reserve1(void *ctx) return 0; } +SEC("socket") +__success __retval(0) +int basic_reserve2_nosleep(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + char __arena *page; + int ret; + + page = arena_base(&arena); + ret = bpf_arena_reserve_pages(&arena, page, 1); + if (ret) + return 1; + + page = bpf_arena_alloc_pages(&arena, page, 1, NUMA_NO_NODE, 0); + if ((u64)page) + return 2; +#endif + return 0; +} + SEC("syscall") __success __retval(0) int basic_reserve2(void *ctx) @@ -168,6 +302,27 @@ int basic_reserve2(void *ctx) } /* Reserve the same page twice, should return -EBUSY. */ +SEC("socket") +__success __retval(0) +int reserve_twice_nosleep(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + char __arena *page; + int ret; + + page = arena_base(&arena); + + ret = bpf_arena_reserve_pages(&arena, page, 1); + if (ret) + return 1; + + ret = bpf_arena_reserve_pages(&arena, page, 1); + if (ret != -EBUSY) + return 2; +#endif + return 0; +} + SEC("syscall") __success __retval(0) int reserve_twice(void *ctx) @@ -190,6 +345,36 @@ int reserve_twice(void *ctx) } /* Try to reserve past the end of the arena. */ +SEC("socket") +__success __retval(0) +int reserve_invalid_region_nosleep(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + char __arena *page; + int ret; + + /* Try a NULL pointer. */ + ret = bpf_arena_reserve_pages(&arena, NULL, 3); + if (ret != -EINVAL) + return 1; + + page = arena_base(&arena); + + ret = bpf_arena_reserve_pages(&arena, page, 3); + if (ret != -EINVAL) + return 2; + + ret = bpf_arena_reserve_pages(&arena, page, 4096); + if (ret != -EINVAL) + return 3; + + ret = bpf_arena_reserve_pages(&arena, page, (1ULL << 32) - 1); + if (ret != -EINVAL) + return 4; +#endif + return 0; +} + SEC("syscall") __success __retval(0) int reserve_invalid_region(void *ctx) diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_large.c b/tools/testing/selftests/bpf/progs/verifier_arena_large.c index 2b8cf2a4d880..4ca491cbe8d1 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena_large.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena_large.c @@ -283,5 +283,34 @@ int big_alloc2(void *ctx) return 9; return 0; } + +SEC("socket") +__success __retval(0) +int big_alloc3(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + char __arena *pages; + u64 i; + + /* + * Allocate 2051 pages in one go to check how kmalloc_nolock() handles large requests. + * Since kmalloc_nolock() can allocate up to 1024 struct page * at a time, this call should + * result in three batches: two batches of 1024 pages each, followed by a final batch of 3 + * pages. + */ + pages = bpf_arena_alloc_pages(&arena, NULL, 2051, NUMA_NO_NODE, 0); + if (!pages) + return -1; + + bpf_for(i, 0, 2051) + pages[i * PAGE_SIZE] = 123; + bpf_for(i, 0, 2051) + if (pages[i * PAGE_SIZE] != 123) + return i; + + bpf_arena_free_pages(&arena, pages, 2051); +#endif + return 0; +} #endif char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From bd09d9a05cf04028f639e209b416bacaeffd4909 Mon Sep 17 00:00:00 2001 From: Matthieu Buffet Date: Mon, 27 Oct 2025 20:07:24 +0100 Subject: selftests/landlock: Fix TCP bind(AF_UNSPEC) test case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The nominal error code for bind(AF_UNSPEC) on an IPv6 socket is -EAFNOSUPPORT, not -EINVAL. -EINVAL is only returned when the supplied address struct is too short, which happens to be the case in current selftests because they treat AF_UNSPEC like IPv4 sockets do: as an alias for AF_INET (which is a 16-byte struct instead of the 24 bytes required by IPv6 sockets). Make the union large enough for any address (by adding struct sockaddr_storage to the union), and make AF_UNSPEC addresses large enough for any family. Test for -EAFNOSUPPORT instead, and add a dedicated test case for truncated inputs with -EINVAL. Fixes: a549d055a22e ("selftests/landlock: Add network tests") Signed-off-by: Matthieu Buffet Link: https://lore.kernel.org/r/20251027190726.626244-2-matthieu@buffet.re Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/common.h | 1 + tools/testing/selftests/landlock/net_test.c | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/landlock/common.h b/tools/testing/selftests/landlock/common.h index 230b75f6015b..90551650299c 100644 --- a/tools/testing/selftests/landlock/common.h +++ b/tools/testing/selftests/landlock/common.h @@ -237,6 +237,7 @@ struct service_fixture { struct sockaddr_un unix_addr; socklen_t unix_addr_len; }; + struct sockaddr_storage _largest; }; }; diff --git a/tools/testing/selftests/landlock/net_test.c b/tools/testing/selftests/landlock/net_test.c index 2a45208551e6..3bbc0508420b 100644 --- a/tools/testing/selftests/landlock/net_test.c +++ b/tools/testing/selftests/landlock/net_test.c @@ -121,6 +121,10 @@ static socklen_t get_addrlen(const struct service_fixture *const srv, { switch (srv->protocol.domain) { case AF_UNSPEC: + if (minimal) + return sizeof(sa_family_t); + return sizeof(struct sockaddr_storage); + case AF_INET: return sizeof(srv->ipv4_addr); @@ -758,6 +762,11 @@ TEST_F(protocol, bind_unspec) bind_fd = socket_variant(&self->srv0); ASSERT_LE(0, bind_fd); + /* Tries to bind with too small addrlen. */ + EXPECT_EQ(-EINVAL, bind_variant_addrlen( + bind_fd, &self->unspec_any0, + get_addrlen(&self->unspec_any0, true) - 1)); + /* Allowed bind on AF_UNSPEC/INADDR_ANY. */ ret = bind_variant(bind_fd, &self->unspec_any0); if (variant->prot.domain == AF_INET) { @@ -766,6 +775,8 @@ TEST_F(protocol, bind_unspec) TH_LOG("Failed to bind to unspec/any socket: %s", strerror(errno)); } + } else if (variant->prot.domain == AF_INET6) { + EXPECT_EQ(-EAFNOSUPPORT, ret); } else { EXPECT_EQ(-EINVAL, ret); } @@ -792,6 +803,8 @@ TEST_F(protocol, bind_unspec) } else { EXPECT_EQ(0, ret); } + } else if (variant->prot.domain == AF_INET6) { + EXPECT_EQ(-EAFNOSUPPORT, ret); } else { EXPECT_EQ(-EINVAL, ret); } @@ -801,7 +814,8 @@ TEST_F(protocol, bind_unspec) bind_fd = socket_variant(&self->srv0); ASSERT_LE(0, bind_fd); ret = bind_variant(bind_fd, &self->unspec_srv0); - if (variant->prot.domain == AF_INET) { + if (variant->prot.domain == AF_INET || + variant->prot.domain == AF_INET6) { EXPECT_EQ(-EAFNOSUPPORT, ret); } else { EXPECT_EQ(-EINVAL, ret) -- cgit v1.2.3 From 6685201ebfacff0c889bcd569181fa6e8af5575e Mon Sep 17 00:00:00 2001 From: Matthieu Buffet Date: Mon, 27 Oct 2025 20:07:25 +0100 Subject: selftests/landlock: Add missing connect(minimal AF_UNSPEC) test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit connect_variant(unspec_any0) is called twice. Both calls end up in connect_variant_addrlen() with an address length of get_addrlen(minimal=false). However, the connect() syscall and its variants (e.g. iouring/compat) accept much shorter addresses of 4 bytes and that behaviour was not tested. Replace one of these calls with one using a minimal address length (just a bare sa_family=AF_UNSPEC field with no actual address). Also add a call using a truncated address for good measure. Signed-off-by: Matthieu Buffet Link: https://lore.kernel.org/r/20251027190726.626244-3-matthieu@buffet.re Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/net_test.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/landlock/net_test.c b/tools/testing/selftests/landlock/net_test.c index 3bbc0508420b..b34b139b3f89 100644 --- a/tools/testing/selftests/landlock/net_test.c +++ b/tools/testing/selftests/landlock/net_test.c @@ -906,7 +906,19 @@ TEST_F(protocol, connect_unspec) EXPECT_EQ(0, close(ruleset_fd)); } - ret = connect_variant(connect_fd, &self->unspec_any0); + /* Try to re-disconnect with a truncated address struct. */ + EXPECT_EQ(-EINVAL, + connect_variant_addrlen( + connect_fd, &self->unspec_any0, + get_addrlen(&self->unspec_any0, true) - 1)); + + /* + * Re-disconnect, with a minimal sockaddr struct (just a + * bare af_family=AF_UNSPEC field). + */ + ret = connect_variant_addrlen(connect_fd, &self->unspec_any0, + get_addrlen(&self->unspec_any0, + true)); if (self->srv0.protocol.domain == AF_UNIX && self->srv0.protocol.type == SOCK_STREAM) { EXPECT_EQ(-EINVAL, ret); -- cgit v1.2.3 From e1a57c33590a50a6639798e60a597af4a23b0340 Mon Sep 17 00:00:00 2001 From: Matthieu Buffet Date: Mon, 1 Dec 2025 01:36:31 +0100 Subject: selftests/landlock: Remove invalid unix socket bind() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove bind() call on a client socket that doesn't make sense. Since strlen(cli_un.sun_path) returns a random value depending on stack garbage, that many uninitialized bytes are read from the stack as an unix socket address. This creates random test failures due to the bind address being invalid or already in use if the same stack value comes up twice. Fixes: f83d51a5bdfe ("selftests/landlock: Check IOCTL restrictions for named UNIX domain sockets") Signed-off-by: Matthieu Buffet Reviewed-by: Günther Noack Link: https://lore.kernel.org/r/20251201003631.190817-1-matthieu@buffet.re Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/fs_test.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index eee814e09dd7..7d378bdf3bce 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -4391,9 +4391,6 @@ TEST_F_FORK(layout1, named_unix_domain_socket_ioctl) cli_fd = socket(AF_UNIX, SOCK_STREAM, 0); ASSERT_LE(0, cli_fd); - size = offsetof(struct sockaddr_un, sun_path) + strlen(cli_un.sun_path); - ASSERT_EQ(0, bind(cli_fd, (struct sockaddr *)&cli_un, size)); - bzero(&cli_un, sizeof(cli_un)); cli_un.sun_family = AF_UNIX; strncpy(cli_un.sun_path, path, sizeof(cli_un.sun_path)); -- cgit v1.2.3 From e4aa4461d4acb922ef45785581232f0588a6eea8 Mon Sep 17 00:00:00 2001 From: Matthieu Buffet Date: Tue, 2 Dec 2025 22:51:41 +0100 Subject: selftests/landlock: NULL-terminate unix pathname addresses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The size of Unix pathname addresses is computed in selftests using offsetof(struct sockaddr_un, sun_path) + strlen(xxx). It should have been that +1, which makes addresses passed to the libc and kernel non-NULL-terminated. unix_mkname_bsd() fixes that in Linux so there is no harm, but just using sizeof(the address struct) should improve readability. Signed-off-by: Matthieu Buffet Reviewed-by: Günther Noack Link: https://lore.kernel.org/r/20251202215141.689986-1-matthieu@buffet.re Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/fs_test.c | 24 ++++++++++------------ .../selftests/landlock/scoped_abstract_unix_test.c | 21 ++++++++----------- 2 files changed, 20 insertions(+), 25 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index 7d378bdf3bce..76491ba54dce 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -4362,22 +4362,24 @@ TEST_F_FORK(layout1, named_unix_domain_socket_ioctl) { const char *const path = file1_s1d1; int srv_fd, cli_fd, ruleset_fd; - socklen_t size; - struct sockaddr_un srv_un, cli_un; + struct sockaddr_un srv_un = { + .sun_family = AF_UNIX, + }; + struct sockaddr_un cli_un = { + .sun_family = AF_UNIX, + }; const struct landlock_ruleset_attr attr = { .handled_access_fs = LANDLOCK_ACCESS_FS_IOCTL_DEV, }; /* Sets up a server */ - srv_un.sun_family = AF_UNIX; - strncpy(srv_un.sun_path, path, sizeof(srv_un.sun_path)); - ASSERT_EQ(0, unlink(path)); srv_fd = socket(AF_UNIX, SOCK_STREAM, 0); ASSERT_LE(0, srv_fd); - size = offsetof(struct sockaddr_un, sun_path) + strlen(srv_un.sun_path); - ASSERT_EQ(0, bind(srv_fd, (struct sockaddr *)&srv_un, size)); + strncpy(srv_un.sun_path, path, sizeof(srv_un.sun_path)); + ASSERT_EQ(0, bind(srv_fd, (struct sockaddr *)&srv_un, sizeof(srv_un))); + ASSERT_EQ(0, listen(srv_fd, 10 /* qlen */)); /* Enables Landlock. */ @@ -4387,16 +4389,12 @@ TEST_F_FORK(layout1, named_unix_domain_socket_ioctl) ASSERT_EQ(0, close(ruleset_fd)); /* Sets up a client connection to it */ - cli_un.sun_family = AF_UNIX; cli_fd = socket(AF_UNIX, SOCK_STREAM, 0); ASSERT_LE(0, cli_fd); - bzero(&cli_un, sizeof(cli_un)); - cli_un.sun_family = AF_UNIX; strncpy(cli_un.sun_path, path, sizeof(cli_un.sun_path)); - size = offsetof(struct sockaddr_un, sun_path) + strlen(cli_un.sun_path); - - ASSERT_EQ(0, connect(cli_fd, (struct sockaddr *)&cli_un, size)); + ASSERT_EQ(0, + connect(cli_fd, (struct sockaddr *)&cli_un, sizeof(cli_un))); /* FIONREAD and other IOCTLs should not be forbidden. */ EXPECT_EQ(0, test_fionread_ioctl(cli_fd)); diff --git a/tools/testing/selftests/landlock/scoped_abstract_unix_test.c b/tools/testing/selftests/landlock/scoped_abstract_unix_test.c index 6825082c079c..2cdf1ba07016 100644 --- a/tools/testing/selftests/landlock/scoped_abstract_unix_test.c +++ b/tools/testing/selftests/landlock/scoped_abstract_unix_test.c @@ -779,7 +779,6 @@ FIXTURE_TEARDOWN(various_address_sockets) TEST_F(various_address_sockets, scoped_pathname_sockets) { - socklen_t size_stream, size_dgram; pid_t child; int status; char buf_child, buf_parent; @@ -798,12 +797,8 @@ TEST_F(various_address_sockets, scoped_pathname_sockets) /* Pathname address. */ snprintf(stream_pathname_addr.sun_path, sizeof(stream_pathname_addr.sun_path), "%s", stream_path); - size_stream = offsetof(struct sockaddr_un, sun_path) + - strlen(stream_pathname_addr.sun_path); snprintf(dgram_pathname_addr.sun_path, sizeof(dgram_pathname_addr.sun_path), "%s", dgram_path); - size_dgram = offsetof(struct sockaddr_un, sun_path) + - strlen(dgram_pathname_addr.sun_path); /* Abstract address. */ memset(&stream_abstract_addr, 0, sizeof(stream_abstract_addr)); @@ -841,8 +836,9 @@ TEST_F(various_address_sockets, scoped_pathname_sockets) /* Connects with pathname sockets. */ stream_pathname_socket = socket(AF_UNIX, SOCK_STREAM, 0); ASSERT_LE(0, stream_pathname_socket); - ASSERT_EQ(0, connect(stream_pathname_socket, - &stream_pathname_addr, size_stream)); + ASSERT_EQ(0, + connect(stream_pathname_socket, &stream_pathname_addr, + sizeof(stream_pathname_addr))); ASSERT_EQ(1, write(stream_pathname_socket, "b", 1)); EXPECT_EQ(0, close(stream_pathname_socket)); @@ -850,12 +846,13 @@ TEST_F(various_address_sockets, scoped_pathname_sockets) dgram_pathname_socket = socket(AF_UNIX, SOCK_DGRAM, 0); ASSERT_LE(0, dgram_pathname_socket); err = sendto(dgram_pathname_socket, "c", 1, 0, - &dgram_pathname_addr, size_dgram); + &dgram_pathname_addr, sizeof(dgram_pathname_addr)); EXPECT_EQ(1, err); /* Sends with connection. */ - ASSERT_EQ(0, connect(dgram_pathname_socket, - &dgram_pathname_addr, size_dgram)); + ASSERT_EQ(0, + connect(dgram_pathname_socket, &dgram_pathname_addr, + sizeof(dgram_pathname_addr))); ASSERT_EQ(1, write(dgram_pathname_socket, "d", 1)); EXPECT_EQ(0, close(dgram_pathname_socket)); @@ -910,13 +907,13 @@ TEST_F(various_address_sockets, scoped_pathname_sockets) stream_pathname_socket = socket(AF_UNIX, SOCK_STREAM, 0); ASSERT_LE(0, stream_pathname_socket); ASSERT_EQ(0, bind(stream_pathname_socket, &stream_pathname_addr, - size_stream)); + sizeof(stream_pathname_addr))); ASSERT_EQ(0, listen(stream_pathname_socket, backlog)); dgram_pathname_socket = socket(AF_UNIX, SOCK_DGRAM, 0); ASSERT_LE(0, dgram_pathname_socket); ASSERT_EQ(0, bind(dgram_pathname_socket, &dgram_pathname_addr, - size_dgram)); + sizeof(dgram_pathname_addr))); /* Sets up abstract servers. */ stream_abstract_socket = socket(AF_UNIX, SOCK_STREAM, 0); -- cgit v1.2.3 From 14c00e30d3a29a7fb6053fcaa54aeb6c07fb1055 Mon Sep 17 00:00:00 2001 From: Tingmao Wang Date: Sun, 28 Dec 2025 01:27:31 +0000 Subject: selftests/landlock: Fix typo in fs_test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Tingmao Wang Link: https://lore.kernel.org/r/62d18e06eeb26f62bc49d24c4467b3793c5ba32b.1766885035.git.m@maowtm.org Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/fs_test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index 76491ba54dce..37a5a3df712e 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -7069,8 +7069,8 @@ static int matches_log_fs_extra(struct __test_metadata *const _metadata, return -E2BIG; /* - * It is assume that absolute_path does not contain control characters nor - * spaces, see audit_string_contains_control(). + * It is assumed that absolute_path does not contain control + * characters nor spaces, see audit_string_contains_control(). */ absolute_path = realpath(path, NULL); if (!absolute_path) -- cgit v1.2.3 From 7aa593d8fb64b884bf00c13e01387b0733f3d786 Mon Sep 17 00:00:00 2001 From: Tingmao Wang Date: Sun, 28 Dec 2025 01:27:32 +0000 Subject: selftests/landlock: Fix missing semicolon MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add missing semicolon after EXPECT_EQ(0, close(stream_server_child)) in the scoped_vs_unscoped test. I suspect currently it's just not executing the close statement after the line, but this causes no observable difference. Fixes: fefcf0f7cf47 ("selftests/landlock: Test abstract UNIX socket scoping") Cc: Tahera Fahimi Signed-off-by: Tingmao Wang Link: https://lore.kernel.org/r/d9e968e4cd4ecc9bf487593d7b7220bffbb3b5f5.1766885035.git.m@maowtm.org Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/scoped_abstract_unix_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/landlock/scoped_abstract_unix_test.c b/tools/testing/selftests/landlock/scoped_abstract_unix_test.c index 2cdf1ba07016..72f97648d4a7 100644 --- a/tools/testing/selftests/landlock/scoped_abstract_unix_test.c +++ b/tools/testing/selftests/landlock/scoped_abstract_unix_test.c @@ -543,7 +543,7 @@ TEST_F(scoped_vs_unscoped, unix_scoping) ASSERT_EQ(1, write(pipe_child[1], ".", 1)); ASSERT_EQ(grand_child, waitpid(grand_child, &status, 0)); - EXPECT_EQ(0, close(stream_server_child)) + EXPECT_EQ(0, close(stream_server_child)); EXPECT_EQ(0, close(dgram_server_child)); return; } -- cgit v1.2.3 From 55dc93a7c2717311d48ca0a47c5f8c1b0755a115 Mon Sep 17 00:00:00 2001 From: Tingmao Wang Date: Sun, 28 Dec 2025 01:27:34 +0000 Subject: selftests/landlock: Use scoped_base_variants.h for ptrace_test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ptrace_test.c currently contains a duplicated version of the scoped_domains fixture variants. This patch removes that and make it use the shared scoped_base_variants.h instead, like in scoped_abstract_unix_test and scoped_signal_test. This required renaming the hierarchy fixture to scoped_domains, but the test is otherwise the same. Cc: Tahera Fahimi Signed-off-by: Tingmao Wang Link: https://lore.kernel.org/r/48148f0134f95f819a25277486a875a6fd88ecf9.1766885035.git.m@maowtm.org Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/ptrace_test.c | 154 +-------------------- .../selftests/landlock/scoped_base_variants.h | 9 +- 2 files changed, 12 insertions(+), 151 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/landlock/ptrace_test.c b/tools/testing/selftests/landlock/ptrace_test.c index 4e356334ecb7..4f64c90583cd 100644 --- a/tools/testing/selftests/landlock/ptrace_test.c +++ b/tools/testing/selftests/landlock/ptrace_test.c @@ -86,16 +86,9 @@ static int get_yama_ptrace_scope(void) } /* clang-format off */ -FIXTURE(hierarchy) {}; +FIXTURE(scoped_domains) {}; /* clang-format on */ -FIXTURE_VARIANT(hierarchy) -{ - const bool domain_both; - const bool domain_parent; - const bool domain_child; -}; - /* * Test multiple tracing combinations between a parent process P1 and a child * process P2. @@ -104,155 +97,18 @@ FIXTURE_VARIANT(hierarchy) * restriction is enforced in addition to any Landlock check, which means that * all P2 requests to trace P1 would be denied. */ +#include "scoped_base_variants.h" -/* - * No domain - * - * P1-. P1 -> P2 : allow - * \ P2 -> P1 : allow - * 'P2 - */ -/* clang-format off */ -FIXTURE_VARIANT_ADD(hierarchy, allow_without_domain) { - /* clang-format on */ - .domain_both = false, - .domain_parent = false, - .domain_child = false, -}; - -/* - * Child domain - * - * P1--. P1 -> P2 : allow - * \ P2 -> P1 : deny - * .'-----. - * | P2 | - * '------' - */ -/* clang-format off */ -FIXTURE_VARIANT_ADD(hierarchy, allow_with_one_domain) { - /* clang-format on */ - .domain_both = false, - .domain_parent = false, - .domain_child = true, -}; - -/* - * Parent domain - * .------. - * | P1 --. P1 -> P2 : deny - * '------' \ P2 -> P1 : allow - * ' - * P2 - */ -/* clang-format off */ -FIXTURE_VARIANT_ADD(hierarchy, deny_with_parent_domain) { - /* clang-format on */ - .domain_both = false, - .domain_parent = true, - .domain_child = false, -}; - -/* - * Parent + child domain (siblings) - * .------. - * | P1 ---. P1 -> P2 : deny - * '------' \ P2 -> P1 : deny - * .---'--. - * | P2 | - * '------' - */ -/* clang-format off */ -FIXTURE_VARIANT_ADD(hierarchy, deny_with_sibling_domain) { - /* clang-format on */ - .domain_both = false, - .domain_parent = true, - .domain_child = true, -}; - -/* - * Same domain (inherited) - * .-------------. - * | P1----. | P1 -> P2 : allow - * | \ | P2 -> P1 : allow - * | ' | - * | P2 | - * '-------------' - */ -/* clang-format off */ -FIXTURE_VARIANT_ADD(hierarchy, allow_sibling_domain) { - /* clang-format on */ - .domain_both = true, - .domain_parent = false, - .domain_child = false, -}; - -/* - * Inherited + child domain - * .-----------------. - * | P1----. | P1 -> P2 : allow - * | \ | P2 -> P1 : deny - * | .-'----. | - * | | P2 | | - * | '------' | - * '-----------------' - */ -/* clang-format off */ -FIXTURE_VARIANT_ADD(hierarchy, allow_with_nested_domain) { - /* clang-format on */ - .domain_both = true, - .domain_parent = false, - .domain_child = true, -}; - -/* - * Inherited + parent domain - * .-----------------. - * |.------. | P1 -> P2 : deny - * || P1 ----. | P2 -> P1 : allow - * |'------' \ | - * | ' | - * | P2 | - * '-----------------' - */ -/* clang-format off */ -FIXTURE_VARIANT_ADD(hierarchy, deny_with_nested_and_parent_domain) { - /* clang-format on */ - .domain_both = true, - .domain_parent = true, - .domain_child = false, -}; - -/* - * Inherited + parent and child domain (siblings) - * .-----------------. - * | .------. | P1 -> P2 : deny - * | | P1 . | P2 -> P1 : deny - * | '------'\ | - * | \ | - * | .--'---. | - * | | P2 | | - * | '------' | - * '-----------------' - */ -/* clang-format off */ -FIXTURE_VARIANT_ADD(hierarchy, deny_with_forked_domain) { - /* clang-format on */ - .domain_both = true, - .domain_parent = true, - .domain_child = true, -}; - -FIXTURE_SETUP(hierarchy) +FIXTURE_SETUP(scoped_domains) { } -FIXTURE_TEARDOWN(hierarchy) +FIXTURE_TEARDOWN(scoped_domains) { } /* Test PTRACE_TRACEME and PTRACE_ATTACH for parent and child. */ -TEST_F(hierarchy, trace) +TEST_F(scoped_domains, trace) { pid_t child, parent; int status, err_proc_read; diff --git a/tools/testing/selftests/landlock/scoped_base_variants.h b/tools/testing/selftests/landlock/scoped_base_variants.h index d3b1fa8a584e..7116728ebc68 100644 --- a/tools/testing/selftests/landlock/scoped_base_variants.h +++ b/tools/testing/selftests/landlock/scoped_base_variants.h @@ -1,8 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Landlock scoped_domains variants + * Landlock scoped_domains test variant definition. * - * See the hierarchy variants from ptrace_test.c + * This file defines a fixture variant "scoped_domains" that has all + * permutations of parent/child process being in separate or shared + * Landlock domain, or not being in a Landlock domain at all. + * + * Scoped access tests can include this file to avoid repeating these + * combinations. * * Copyright © 2017-2020 Mickaël Salaün * Copyright © 2019-2020 ANSSI -- cgit v1.2.3 From 317a5df78f24bd77fb770a26eb85bf39620592e0 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Tue, 30 Dec 2025 11:51:32 -0800 Subject: selftests/bpf: Fix verifier_arena_large/big_alloc3 test The big_alloc3() test tries to allocate 2051 pages at once in non-sleepable context and this can fail sporadically on resource contrained systems, so skip this test in case of such failures. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20251230195134.599463-1-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_arena_large.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_large.c b/tools/testing/selftests/bpf/progs/verifier_arena_large.c index 4ca491cbe8d1..5f7e7afee169 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena_large.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena_large.c @@ -300,7 +300,7 @@ int big_alloc3(void *ctx) */ pages = bpf_arena_alloc_pages(&arena, NULL, 2051, NUMA_NO_NODE, 0); if (!pages) - return -1; + return 0; bpf_for(i, 0, 2051) pages[i * PAGE_SIZE] = 123; -- cgit v1.2.3 From e6f2612f0e7c23ce991d3094b5387caf1a52a4fe Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 29 Dec 2025 23:13:08 -0800 Subject: selftests/bpf: test cases for bpf_loop SCC and state graph backedges Test for state graph backedges accumulation for SCCs formed by bpf_loop(). Equivalent to the following C program: int main(void) { 1: fp[-8] = bpf_get_prandom_u32(); 2: fp[-16] = -32; // used in a memory access below 3: bpf_loop(7, loop_cb4, fp, 0); 4: return 0; } int loop_cb4(int i, void *ctx) { 5: if (unlikely(ctx[-8] > bpf_get_prandom_u32())) 6: *(u64 *)(fp + ctx[-16]) = 42; // aligned access expected 7: if (unlikely(fp[-8] > bpf_get_prandom_u32())) 8: ctx[-16] = -31; // makes said access unaligned 9: return 0; } If state graph backedges are not accumulated properly at the SCC formed by loop_cb4() call from bpf_loop(), the state {ctx[-16]=-32} injected at instruction 9 on verification path 1,2,3,5,7,9,4 would be considered fully verified and would lack precision mark for ctx[-16]. This would lead to early pruning of verification path 1,2,3,5,7,8,9 in state {ctx[-16]=-31}, which in turn leads to the incorrect assumption that the above program is safe. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251229-scc-for-callbacks-v1-2-ceadfe679900@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/iters.c | 75 +++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c index 7dd92a303bf6..69061f030957 100644 --- a/tools/testing/selftests/bpf/progs/iters.c +++ b/tools/testing/selftests/bpf/progs/iters.c @@ -1926,4 +1926,79 @@ static int loop1_wrapper(void) ); } +/* + * This is similar to a test case absent_mark_in_the_middle_state(), + * but adapted for use with bpf_loop(). + */ +SEC("raw_tp") +__flag(BPF_F_TEST_STATE_FREQ) +__failure __msg("math between fp pointer and register with unbounded min value is not allowed") +__naked void absent_mark_in_the_middle_state4(void) +{ + /* + * Equivalent to a C program below: + * + * int main(void) { + * fp[-8] = bpf_get_prandom_u32(); + * fp[-16] = -32; // used in a memory access below + * bpf_loop(7, loop_cb4, fp, 0); + * return 0; + * } + * + * int loop_cb4(int i, void *ctx) { + * if (unlikely(ctx[-8] > bpf_get_prandom_u32())) + * *(u64 *)(fp + ctx[-16]) = 42; // aligned access expected + * if (unlikely(fp[-8] > bpf_get_prandom_u32())) + * ctx[-16] = -31; // makes said access unaligned + * return 0; + * } + */ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "r8 = r0;" + "*(u64 *)(r10 - 8) = r0;" + "*(u64 *)(r10 - 16) = -32;" + "r1 = 7;" + "r2 = loop_cb4 ll;" + "r3 = r10;" + "r4 = 0;" + "call %[bpf_loop];" + "r0 = 0;" + "exit;" + : + : __imm(bpf_loop), + __imm(bpf_get_prandom_u32) + : __clobber_all + ); +} + +__used __naked +static void loop_cb4(void) +{ + asm volatile ( + "r9 = r2;" + "r8 = *(u64 *)(r9 - 8);" + "r6 = *(u64 *)(r9 - 16);" + "call %[bpf_get_prandom_u32];" + "if r0 > r8 goto use_fp16_%=;" + "1:" + "call %[bpf_get_prandom_u32];" + "if r0 > r8 goto update_fp16_%=;" + "2:" + "r0 = 0;" + "exit;" + "use_fp16_%=:" + "r1 = r10;" + "r1 += r6;" + "*(u64 *)(r1 + 0) = 42;" + "goto 1b;" + "update_fp16_%=:" + "*(u64 *)(r9 - 16) = -31;" + "goto 2b;" + : + : __imm(bpf_get_prandom_u32) + : __clobber_all + ); +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 4fd99103eef347174b3c9b6071428324a3cf9a60 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 30 Dec 2025 21:36:04 -0800 Subject: selftests/bpf: iterator based loop and STACK_MISC states pruning The test case first initializes 9 stack slots as STACK_MISC, then conditionally updates each of them to SCALAR spill inside an iterator based loop. This leads to 2**9 combinations of MISC/SPILL marks for these slots at the iterator next call. The loop converges only if the verifier treats such states as equivalent, otherwise visited states are evicted from the states cache too quickly. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251230-loop-stack-misc-pruning-v1-2-585cfd6cec51@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/iters.c | 65 +++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c index 69061f030957..7f27b517d5d5 100644 --- a/tools/testing/selftests/bpf/progs/iters.c +++ b/tools/testing/selftests/bpf/progs/iters.c @@ -1997,6 +1997,71 @@ static void loop_cb4(void) "goto 2b;" : : __imm(bpf_get_prandom_u32) + ); +} + +SEC("raw_tp") +__success +__naked int stack_misc_vs_scalar_in_a_loop(void) +{ + asm volatile( + "*(u8 *)(r10 - 15) = 1;" /* This marks stack slot fp[-16] as STACK_MISC. */ + "*(u8 *)(r10 - 23) = 1;" + "*(u8 *)(r10 - 31) = 1;" + "*(u8 *)(r10 - 39) = 1;" + "*(u8 *)(r10 - 47) = 1;" + "*(u8 *)(r10 - 55) = 1;" + "*(u8 *)(r10 - 63) = 1;" + "*(u8 *)(r10 - 71) = 1;" + "*(u8 *)(r10 - 79) = 1;" + "r1 = r10;" + "r1 += -8;" + "r2 = 0;" + "r3 = 10;" + "call %[bpf_iter_num_new];" + "loop_%=:" + "r1 = r10;" + "r1 += -8;" + "call %[bpf_iter_num_next];" + "if r0 == 0 goto loop_end_%=;" + +#define maybe_change_stack_slot(off) \ + "call %[bpf_get_prandom_u32];" \ + "if r0 == 42 goto +1;" \ + "goto +1;" \ + "*(u64 *)(r10 " #off ") = r0;" + + /* + * When comparing verifier states fp[-16] will be + * either STACK_MISC or SCALAR. Pruning logic should + * consider old STACK_MISC equivalent to current SCALAR + * to avoid states explosion. + */ + maybe_change_stack_slot(-16) + maybe_change_stack_slot(-24) + maybe_change_stack_slot(-32) + maybe_change_stack_slot(-40) + maybe_change_stack_slot(-48) + maybe_change_stack_slot(-56) + maybe_change_stack_slot(-64) + maybe_change_stack_slot(-72) + maybe_change_stack_slot(-80) + +#undef maybe_change_stack_slot + + "goto loop_%=;" + "loop_end_%=:" + "r1 = r10;" + "r1 += -8;" + "call %[bpf_iter_num_destroy];" + "r0 = 0;" + "exit;" + : + : __imm(bpf_get_prandom_u32), + __imm(bpf_iter_num_new), + __imm(bpf_iter_num_next), + __imm(bpf_iter_num_destroy), + __imm_addr(amap) : __clobber_all ); } -- cgit v1.2.3 From 1a8fa7faf4890d201aad4f5d4943f74d840cd0ba Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 30 Dec 2025 17:25:57 -0800 Subject: resolve_btfids: Implement --patch_btfids Recent changes in BTF generation [1] rely on ${OBJCOPY} command to update .BTF_ids section data in target ELF files. This exposed a bug in llvm-objcopy --update-section code path, that may lead to corruption of a target ELF file. Specifically, because of the bug st_shndx of some symbols may be (incorrectly) set to 0xffff (SHN_XINDEX) [2][3]. While there is a pending fix for LLVM, it'll take some time before it lands (likely in 22.x). And the kernel build must keep working with older LLVM toolchains in the foreseeable future. Using GNU objcopy for .BTF_ids update would work, but it would require changes to LLVM-based build process, likely breaking existing build environments as discussed in [2]. To work around llvm-objcopy bug, implement --patch_btfids code path in resolve_btfids as a drop-in replacement for: ${OBJCOPY} --update-section .BTF_ids=${btf_ids} ${elf} Which works specifically for .BTF_ids section: ${RESOLVE_BTFIDS} --patch_btfids ${btf_ids} ${elf} This feature in resolve_btfids can be removed at some point in the future, when llvm-objcopy with a relevant bugfix becomes common. [1] https://lore.kernel.org/bpf/20251219181321.1283664-1-ihor.solodrai@linux.dev/ [2] https://lore.kernel.org/bpf/20251224005752.201911-1-ihor.solodrai@linux.dev/ [3] https://github.com/llvm/llvm-project/issues/168060#issuecomment-3533552952 Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20251231012558.1699758-1-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- scripts/gen-btf.sh | 2 +- scripts/link-vmlinux.sh | 2 +- tools/bpf/resolve_btfids/main.c | 117 +++++++++++++++++++++++++++++++++++ tools/testing/selftests/bpf/Makefile | 2 +- 4 files changed, 120 insertions(+), 3 deletions(-) (limited to 'tools/testing') diff --git a/scripts/gen-btf.sh b/scripts/gen-btf.sh index 12244dbe097c..0aec86615416 100755 --- a/scripts/gen-btf.sh +++ b/scripts/gen-btf.sh @@ -123,7 +123,7 @@ embed_btf_data() fi local btf_ids="${ELF_FILE}.BTF_ids" if [ -f "${btf_ids}" ]; then - ${OBJCOPY} --update-section .BTF_ids=${btf_ids} ${ELF_FILE} + ${RESOLVE_BTFIDS} --patch_btfids ${btf_ids} ${ELF_FILE} fi } diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index e2207e612ac3..1915adf3249b 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -266,7 +266,7 @@ vmlinux_link "${VMLINUX}" if is_enabled CONFIG_DEBUG_INFO_BTF; then info OBJCOPY ${btfids_vmlinux} - ${OBJCOPY} --update-section .BTF_ids=${btfids_vmlinux} ${VMLINUX} + ${RESOLVE_BTFIDS} --patch_btfids ${btfids_vmlinux} ${VMLINUX} fi mksysmap "${VMLINUX}" System.map diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index 2cbc252259be..df39982f51df 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -862,8 +862,119 @@ static inline int make_out_path(char *buf, u32 buf_sz, const char *in_path, cons return 0; } +/* + * Patch the .BTF_ids section of an ELF file with data from provided file. + * Equivalent to: objcopy --update-section .BTF_ids= + * + * 1. Find .BTF_ids section in the ELF + * 2. Verify that blob file size matches section size + * 3. Update section data buffer with blob data + * 4. Write the ELF file + */ +static int patch_btfids(const char *btfids_path, const char *elf_path) +{ + Elf_Scn *scn = NULL; + FILE *btfids_file; + size_t shdrstrndx; + int fd, err = -1; + Elf_Data *data; + struct stat st; + GElf_Shdr sh; + char *name; + Elf *elf; + + elf_version(EV_CURRENT); + + fd = open(elf_path, O_RDWR, 0666); + if (fd < 0) { + pr_err("FAILED to open %s: %s\n", elf_path, strerror(errno)); + return -1; + } + + elf = elf_begin(fd, ELF_C_RDWR_MMAP, NULL); + if (!elf) { + close(fd); + pr_err("FAILED cannot create ELF descriptor: %s\n", elf_errmsg(-1)); + return -1; + } + + elf_flagelf(elf, ELF_C_SET, ELF_F_LAYOUT); + + if (elf_getshdrstrndx(elf, &shdrstrndx) != 0) { + pr_err("FAILED cannot get shdr str ndx\n"); + goto out; + } + + while ((scn = elf_nextscn(elf, scn)) != NULL) { + + if (gelf_getshdr(scn, &sh) != &sh) { + pr_err("FAILED to get section header\n"); + goto out; + } + + name = elf_strptr(elf, shdrstrndx, sh.sh_name); + if (!name) + continue; + + if (strcmp(name, BTF_IDS_SECTION) == 0) + break; + } + + if (!scn) { + pr_err("FAILED: section %s not found in %s\n", BTF_IDS_SECTION, elf_path); + goto out; + } + + data = elf_getdata(scn, NULL); + if (!data) { + pr_err("FAILED to get %s section data from %s\n", BTF_IDS_SECTION, elf_path); + goto out; + } + + if (stat(btfids_path, &st) < 0) { + pr_err("FAILED to stat %s: %s\n", btfids_path, strerror(errno)); + goto out; + } + + if ((size_t)st.st_size != data->d_size) { + pr_err("FAILED: size mismatch - %s section in %s is %zu bytes, %s is %zu bytes\n", + BTF_IDS_SECTION, elf_path, data->d_size, btfids_path, (size_t)st.st_size); + goto out; + } + + btfids_file = fopen(btfids_path, "rb"); + if (!btfids_file) { + pr_err("FAILED to open %s: %s\n", btfids_path, strerror(errno)); + goto out; + } + + pr_debug("Copying data from %s to %s section of %s (%zu bytes)\n", + btfids_path, BTF_IDS_SECTION, elf_path, data->d_size); + + if (fread(data->d_buf, data->d_size, 1, btfids_file) != 1) { + pr_err("FAILED to read %s\n", btfids_path); + fclose(btfids_file); + goto out; + } + fclose(btfids_file); + + elf_flagdata(data, ELF_C_SET, ELF_F_DIRTY); + if (elf_update(elf, ELF_C_WRITE) < 0) { + pr_err("FAILED to update ELF file %s\n", elf_path); + goto out; + } + + err = 0; +out: + elf_end(elf); + close(fd); + + return err; +} + static const char * const resolve_btfids_usage[] = { "resolve_btfids [] ", + "resolve_btfids --patch_btfids <.BTF_ids file> ", NULL }; @@ -880,6 +991,7 @@ int main(int argc, const char **argv) .funcs = RB_ROOT, .sets = RB_ROOT, }; + const char *btfids_path = NULL; bool fatal_warnings = false; char out_path[PATH_MAX]; @@ -894,6 +1006,8 @@ int main(int argc, const char **argv) "turn warnings into errors"), OPT_BOOLEAN(0, "distill_base", &obj.distill_base, "distill --btf_base and emit .BTF.base section data"), + OPT_STRING(0, "patch_btfids", &btfids_path, "file", + "path to .BTF_ids section data blob to patch into ELF file"), OPT_END() }; int err = -1; @@ -905,6 +1019,9 @@ int main(int argc, const char **argv) obj.path = argv[0]; + if (btfids_path) + return patch_btfids(btfids_path, obj.path); + if (load_btf(&obj)) goto out; diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index f28a32b16ff0..9488d076c740 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -657,7 +657,7 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ $$(if $$(TEST_NEEDS_BTFIDS), \ $$(call msg,BTFIDS,$(TRUNNER_BINARY),$$@) \ $(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@; \ - $(OBJCOPY) --update-section .BTF_ids=$$@.BTF_ids $$@) + $(RESOLVE_BTFIDS) --patch_btfids $$@.BTF_ids $$@) $(TRUNNER_TEST_OBJS:.o=.d): $(TRUNNER_OUTPUT)/%.test.d: \ $(TRUNNER_TESTS_DIR)/%.c \ -- cgit v1.2.3 From 673a55cc49dafe47defb9ad76a73987fe89e5d70 Mon Sep 17 00:00:00 2001 From: Clint George Date: Mon, 15 Dec 2025 14:17:37 +0530 Subject: kselftest/coredump: use __builtin_trap() instead of null pointer Use __builtin_trap() to truly crash the program instead of dereferencing null pointer which may be optimized by the compiler preventing the crash from occurring [] Testing: The diff between before and after of running the kselftest test of the module shows no regression on system with x86 architecture [] Error log: ~/Desktop/kernel-dev/linux-v1/tools/testing/selftests/coredump$ make LLVM=1 W=1 CC stackdump_test coredump_test_helpers.c:59:6: warning: indirection of non-volatile null pointer will be deleted, not trap [-Wnull-dereference] 59 | i = *(int *)NULL; | ^~~~~~~~~~~~ coredump_test_helpers.c:59:6: note: consider using __builtin_trap() or qualifying pointer with 'volatile' 1 warning generated. CC coredump_socket_test coredump_test_helpers.c:59:6: warning: indirection of non-volatile null pointer will be deleted, not trap [-Wnull-dereference] 59 | i = *(int *)NULL; | ^~~~~~~~~~~~ coredump_test_helpers.c:59:6: note: consider using __builtin_trap() or qualifying pointer with 'volatile' 1 warning generated. CC coredump_socket_protocol_test coredump_test_helpers.c:59:6: warning: indirection of non-volatile null pointer will be deleted, not trap [-Wnull-dereference] 59 | i = *(int *)NULL; | ^~~~~~~~~~~~ coredump_test_helpers.c:59:6: note: consider using __builtin_trap() or qualifying pointer with 'volatile' 1 warning generated. Link: https://lore.kernel.org/r/20251215084737.7504-1-clintbgeorge@gmail.com Signed-off-by: Clint George Signed-off-by: Shuah Khan --- tools/testing/selftests/coredump/coredump_test_helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/coredump/coredump_test_helpers.c b/tools/testing/selftests/coredump/coredump_test_helpers.c index a6f6d5f2ae07..5c8adee63641 100644 --- a/tools/testing/selftests/coredump/coredump_test_helpers.c +++ b/tools/testing/selftests/coredump/coredump_test_helpers.c @@ -56,7 +56,7 @@ void crashing_child(void) pthread_create(&thread, NULL, do_nothing, NULL); /* crash on purpose */ - i = *(int *)NULL; + __builtin_trap(); } int create_detached_tmpfs(void) -- cgit v1.2.3 From 0aaff7b109037c0a45def1bed7b76ffaf253f7d0 Mon Sep 17 00:00:00 2001 From: Clint George Date: Mon, 15 Dec 2025 14:19:00 +0530 Subject: kselftest/anon_inode: replace null pointers with empty arrays Make use of empty (NULL-terminated) array instead of NULL pointer to avoid compiler errors while maintaining the behavior of the function intact [] Testing: The diff between before and after of running the kselftest test of the module shows no regression on system with x86 architecture [] Error log: ~/Desktop/kernel-dev/linux-v1/tools/testing/selftests/filesystems$ make LLVM=1 W=1 CC devpts_pts CC file_stressor CC anon_inode_test anon_inode_test.c:45:37: warning: null passed to a callee that requires a non-null argument [-Wnonnull] 45 | ASSERT_LT(execveat(fd_context, "", NULL, NULL, AT_EMPTY_PATH), 0); | ^~~~ /usr/lib/llvm-18/lib/clang/18/include/__stddef_null.h:26:14: note: expanded from macro 'NULL' 26 | #define NULL ((void*)0) | ^~~~~~~~~~ ../Desktop/kernel-dev/linux-v1/tools/testing/selftests/../../../tools/testing/selftests/kselftest_harness.h:535:11: note: expanded from macro 'ASSERT_LT' 535 | __EXPECT(expected, #expected, seen, #seen, <, 1) | ^~~~~~~~ ../Desktop/kernel-dev/linux-v1/tools/testing/selftests/../../../tools/testing/selftests/kselftest_harness.h:758:33: note: expanded from macro '__EXPECT' 758 | __typeof__(_expected) __exp = (_expected); \ | ^~~~~~~~~ 1 warning generated. Link: https://lore.kernel.org/r/20251215084900.7590-1-clintbgeorge@gmail.com Signed-off-by: Clint George Signed-off-by: Shuah Khan --- tools/testing/selftests/filesystems/anon_inode_test.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/filesystems/anon_inode_test.c b/tools/testing/selftests/filesystems/anon_inode_test.c index 94c6c81c2301..2c4c50500116 100644 --- a/tools/testing/selftests/filesystems/anon_inode_test.c +++ b/tools/testing/selftests/filesystems/anon_inode_test.c @@ -42,7 +42,10 @@ TEST(anon_inode_no_exec) fd_context = sys_fsopen("tmpfs", 0); ASSERT_GE(fd_context, 0); - ASSERT_LT(execveat(fd_context, "", NULL, NULL, AT_EMPTY_PATH), 0); + char *const empty_argv[] = {NULL}; + char *const empty_envp[] = {NULL}; + + ASSERT_LT(execveat(fd_context, "", empty_argv, empty_envp, AT_EMPTY_PATH), 0); ASSERT_EQ(errno, EACCES); EXPECT_EQ(close(fd_context), 0); -- cgit v1.2.3 From 3e6ad272bb8b3199bad952e7b077102af2d8df03 Mon Sep 17 00:00:00 2001 From: Clint George Date: Mon, 15 Dec 2025 14:20:22 +0530 Subject: kselftest/kublk: include message in _Static_assert for C11 compatibility Add descriptive message in the _Static_assert to comply with the C11 standard requirement to prevent compiler from throwing out error. The compiler throws an error when _Static_assert is used without a message as that is a C23 extension. [] Testing: The diff between before and after of running the kselftest test of the module shows no regression on system with x86 architecture [] Error log: ~/Desktop/kernel-dev/linux-v1/tools/testing/selftests/ublk$ make LLVM=1 W=1 CC kublk In file included from kublk.c:6: ./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions] 220 | _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7); | ^ | , "" 1 error generated. In file included from null.c:3: ./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions] 220 | _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7); | ^ | , "" 1 error generated. In file included from file_backed.c:3: ./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions] 220 | _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7); | ^ | , "" 1 error generated. In file included from common.c:3: ./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions] 220 | _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7); | ^ | , "" 1 error generated. In file included from stripe.c:3: ./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions] 220 | _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7); | ^ | , "" 1 error generated. In file included from fault_inject.c:11: ./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions] 220 | _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7); | ^ | , "" 1 error generated. make: *** [../lib.mk:225: ~/Desktop/kernel-dev/linux-v1/tools/testing/selftests/ublk/kublk] Error 1 Link: https://lore.kernel.org/r/20251215085022.7642-1-clintbgeorge@gmail.com Signed-off-by: Clint George Reviewed-by: Ming Lei Signed-off-by: Shuah Khan --- tools/testing/selftests/ublk/kublk.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index fe42705c6d42..e5eb5f762c1c 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -217,7 +217,7 @@ static inline __u64 build_user_data(unsigned tag, unsigned op, unsigned tgt_data, unsigned q_id, unsigned is_target_io) { /* we only have 7 bits to encode q_id */ - _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7); + _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7, "UBLK_MAX_QUEUES_SHIFT must be <= 7"); assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7)); return tag | (op << 16) | (tgt_data << 24) | -- cgit v1.2.3 From c286e7e9d1f1f3d90ad11c37e896f582b02d19c4 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Wed, 31 Dec 2025 14:10:50 -0800 Subject: selftests/bpf: veristat: fix printing order in output_stats() The order of the variables in the printf() doesn't match the text and therefore veristat prints something like this: Done. Processed 24 files, 0 programs. Skipped 62 files, 0 programs. When it should print: Done. Processed 24 files, 62 programs. Skipped 0 files, 0 programs. Fix the order of variables in the printf() call. Fixes: 518fee8bfaf2 ("selftests/bpf: make veristat skip non-BPF and failing-to-open BPF objects") Tested-by: Eduard Zingerman Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20251231221052.759396-1-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/veristat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index e962f133250c..1be1e353d40a 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -2580,7 +2580,7 @@ static void output_stats(const struct verif_stats *s, enum resfmt fmt, bool last if (last && fmt == RESFMT_TABLE) { output_header_underlines(); printf("Done. Processed %d files, %d programs. Skipped %d files, %d programs.\n", - env.files_processed, env.files_skipped, env.progs_processed, env.progs_skipped); + env.files_processed, env.progs_processed, env.files_skipped, env.progs_skipped); } } -- cgit v1.2.3 From a73fc3dcc60b6d7a2075e2fbdca64fd53600f855 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 29 Dec 2025 11:10:58 -0800 Subject: rcu: Clean up after the SRCU-fastification of RCU Tasks Trace Now that RCU Tasks Trace has been re-implemented in terms of SRCU-fast, the ->trc_ipi_to_cpu, ->trc_blkd_cpu, ->trc_blkd_node, ->trc_holdout_list, and ->trc_reader_special task_struct fields are no longer used. In addition, the rcu_tasks_trace_qs(), rcu_tasks_trace_qs_blkd(), exit_tasks_rcu_finish_trace(), and rcu_spawn_tasks_trace_kthread(), show_rcu_tasks_trace_gp_kthread(), rcu_tasks_trace_get_gp_data(), rcu_tasks_trace_torture_stats_print(), and get_rcu_tasks_trace_gp_kthread() functions and all the other functions that they invoke are no longer used. Also, the TRC_NEED_QS and TRC_NEED_QS_CHECKED CPP macros are no longer used. Neither are the rcu_tasks_trace_lazy_ms and rcu_task_ipi_delay rcupdate module parameters and the TASKS_TRACE_RCU_READ_MB Kconfig option. This commit therefore removes all of them. [ paulmck: Apply Alexei Starovoitov feedback. ] Signed-off-by: Paul E. McKenney Cc: Andrii Nakryiko Cc: Alexei Starovoitov Cc: Peter Zijlstra Cc: bpf@vger.kernel.org Reviewed-by: Joel Fernandes Signed-off-by: Boqun Feng --- Documentation/admin-guide/kernel-parameters.txt | 15 ---- include/linux/rcupdate.h | 31 +-------- include/linux/rcupdate_trace.h | 2 - include/linux/sched.h | 5 -- init/init_task.c | 3 - kernel/fork.c | 3 - kernel/rcu/Kconfig | 18 ----- kernel/rcu/rcu.h | 9 --- kernel/rcu/rcuscale.c | 7 -- kernel/rcu/rcutorture.c | 2 - kernel/rcu/tasks.h | 79 +--------------------- .../selftests/rcutorture/configs/rcu/TRACE01 | 1 - .../selftests/rcutorture/configs/rcu/TRACE02 | 1 - 13 files changed, 2 insertions(+), 174 deletions(-) (limited to 'tools/testing') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a8d0afde7f85..1b8e5cadbecb 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6249,13 +6249,6 @@ Kernel parameters dynamically) adjusted. This parameter is intended for use in testing. - rcupdate.rcu_task_ipi_delay= [KNL] - Set time in jiffies during which RCU tasks will - avoid sending IPIs, starting with the beginning - of a given grace period. Setting a large - number avoids disturbing real-time workloads, - but lengthens grace periods. - rcupdate.rcu_task_lazy_lim= [KNL] Number of callbacks on a given CPU that will cancel laziness on that CPU. Use -1 to disable @@ -6299,14 +6292,6 @@ Kernel parameters of zero will disable batching. Batching is always disabled for synchronize_rcu_tasks(). - rcupdate.rcu_tasks_trace_lazy_ms= [KNL] - Set timeout in milliseconds RCU Tasks - Trace asynchronous callback batching for - call_rcu_tasks_trace(). A negative value - will take the default. A value of zero will - disable batching. Batching is always disabled - for synchronize_rcu_tasks_trace(). - rcupdate.rcu_self_test= [KNL] Run the RCU early boot self tests diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index c5b30054cd01..bd5a420cf09a 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -175,36 +175,7 @@ void rcu_tasks_torture_stats_print(char *tt, char *tf); # define synchronize_rcu_tasks synchronize_rcu # endif -# ifdef CONFIG_TASKS_TRACE_RCU -// Bits for ->trc_reader_special.b.need_qs field. -#define TRC_NEED_QS 0x1 // Task needs a quiescent state. -#define TRC_NEED_QS_CHECKED 0x2 // Task has been checked for needing quiescent state. - -u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new); -void rcu_tasks_trace_qs_blkd(struct task_struct *t); - -# define rcu_tasks_trace_qs(t) \ - do { \ - int ___rttq_nesting = READ_ONCE((t)->trc_reader_nesting); \ - \ - if (unlikely(READ_ONCE((t)->trc_reader_special.b.need_qs) == TRC_NEED_QS) && \ - likely(!___rttq_nesting)) { \ - rcu_trc_cmpxchg_need_qs((t), TRC_NEED_QS, TRC_NEED_QS_CHECKED); \ - } else if (___rttq_nesting && ___rttq_nesting != INT_MIN && \ - !READ_ONCE((t)->trc_reader_special.b.blocked)) { \ - rcu_tasks_trace_qs_blkd(t); \ - } \ - } while (0) -void rcu_tasks_trace_torture_stats_print(char *tt, char *tf); -# else -# define rcu_tasks_trace_qs(t) do { } while (0) -# endif - -#define rcu_tasks_qs(t, preempt) \ -do { \ - rcu_tasks_classic_qs((t), (preempt)); \ - rcu_tasks_trace_qs(t); \ -} while (0) +#define rcu_tasks_qs(t, preempt) rcu_tasks_classic_qs((t), (preempt)) # ifdef CONFIG_TASKS_RUDE_RCU void synchronize_rcu_tasks_rude(void); diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h index 3f46cbe67000..0bd47f12ecd1 100644 --- a/include/linux/rcupdate_trace.h +++ b/include/linux/rcupdate_trace.h @@ -136,9 +136,7 @@ static inline void rcu_barrier_tasks_trace(void) } // Placeholders to enable stepwise transition. -void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq); void __init rcu_tasks_trace_suppress_unused(void); -struct task_struct *get_rcu_tasks_trace_gp_kthread(void); #else /* diff --git a/include/linux/sched.h b/include/linux/sched.h index fe39d422b37d..56156643ccac 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -946,11 +946,6 @@ struct task_struct { #ifdef CONFIG_TASKS_TRACE_RCU int trc_reader_nesting; struct srcu_ctr __percpu *trc_reader_scp; - int trc_ipi_to_cpu; - union rcu_special trc_reader_special; - struct list_head trc_holdout_list; - struct list_head trc_blkd_node; - int trc_blkd_cpu; #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ struct sched_info sched_info; diff --git a/init/init_task.c b/init/init_task.c index 49b13d7c3985..db92c404d59a 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -195,9 +195,6 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { #endif #ifdef CONFIG_TASKS_TRACE_RCU .trc_reader_nesting = 0, - .trc_reader_special.s = 0, - .trc_holdout_list = LIST_HEAD_INIT(init_task.trc_holdout_list), - .trc_blkd_node = LIST_HEAD_INIT(init_task.trc_blkd_node), #endif #ifdef CONFIG_CPUSETS .mems_allowed_seq = SEQCNT_SPINLOCK_ZERO(init_task.mems_allowed_seq, diff --git a/kernel/fork.c b/kernel/fork.c index b1f3915d5f8e..d7ed107cbb47 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1828,9 +1828,6 @@ static inline void rcu_copy_process(struct task_struct *p) #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU p->trc_reader_nesting = 0; - p->trc_reader_special.s = 0; - INIT_LIST_HEAD(&p->trc_holdout_list); - INIT_LIST_HEAD(&p->trc_blkd_node); #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 4d9b21f69eaa..8d5a1ecb7d56 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -313,24 +313,6 @@ config RCU_NOCB_CPU_CB_BOOST Say Y here if you want to set RT priority for offloading kthreads. Say N here if you are building a !PREEMPT_RT kernel and are unsure. -config TASKS_TRACE_RCU_READ_MB - bool "Tasks Trace RCU readers use memory barriers in user and idle" - depends on RCU_EXPERT && TASKS_TRACE_RCU - default PREEMPT_RT || NR_CPUS < 8 - help - Use this option to further reduce the number of IPIs sent - to CPUs executing in userspace or idle during tasks trace - RCU grace periods. Given that a reasonable setting of - the rcupdate.rcu_task_ipi_delay kernel boot parameter - eliminates such IPIs for many workloads, proper setting - of this Kconfig option is important mostly for aggressive - real-time installations and for battery-powered devices, - hence the default chosen above. - - Say Y here if you hate IPIs. - Say N here if you hate read-side memory barriers. - Take the default if you are unsure. - config RCU_LAZY bool "RCU callback lazy invocation functionality" depends on RCU_NOCB_CPU diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 9cf01832a6c3..dc5d614b372c 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -544,10 +544,6 @@ struct task_struct *get_rcu_tasks_rude_gp_kthread(void); void rcu_tasks_rude_get_gp_data(int *flags, unsigned long *gp_seq); #endif // # ifdef CONFIG_TASKS_RUDE_RCU -#ifdef CONFIG_TASKS_TRACE_RCU -void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq); -#endif - #ifdef CONFIG_TASKS_RCU_GENERIC void tasks_cblist_init_generic(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ @@ -673,11 +669,6 @@ void show_rcu_tasks_rude_gp_kthread(void); #else static inline void show_rcu_tasks_rude_gp_kthread(void) {} #endif -#if !defined(CONFIG_TINY_RCU) && defined(CONFIG_TASKS_TRACE_RCU) -void show_rcu_tasks_trace_gp_kthread(void); -#else -static inline void show_rcu_tasks_trace_gp_kthread(void) {} -#endif #ifdef CONFIG_TINY_RCU static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; } diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 7484d8ad5767..1c50f89fbd6f 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -400,11 +400,6 @@ static void tasks_trace_scale_read_unlock(int idx) rcu_read_unlock_trace(); } -static void rcu_tasks_trace_scale_stats(void) -{ - rcu_tasks_trace_torture_stats_print(scale_type, SCALE_FLAG); -} - static struct rcu_scale_ops tasks_tracing_ops = { .ptype = RCU_TASKS_FLAVOR, .init = rcu_sync_scale_init, @@ -416,8 +411,6 @@ static struct rcu_scale_ops tasks_tracing_ops = { .gp_barrier = rcu_barrier_tasks_trace, .sync = synchronize_rcu_tasks_trace, .exp_sync = synchronize_rcu_tasks_trace, - .rso_gp_kthread = get_rcu_tasks_trace_gp_kthread, - .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_trace_scale_stats, .name = "tasks-tracing" }; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 07e51974b06b..78a6ebe77d35 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1180,8 +1180,6 @@ static struct rcu_torture_ops tasks_tracing_ops = { .exp_sync = synchronize_rcu_tasks_trace, .call = call_rcu_tasks_trace, .cb_barrier = rcu_barrier_tasks_trace, - .gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread, - .get_gp_data = rcu_tasks_trace_get_gp_data, .cbflood_max = 50000, .irq_capable = 1, .slow_gps = 1, diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 1fe789c99f36..1249b47f0a8d 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -161,11 +161,6 @@ static void tasks_rcu_exit_srcu_stall(struct timer_list *unused); static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall); #endif -/* Avoid IPIing CPUs early in the grace period. */ -#define RCU_TASK_IPI_DELAY (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) ? HZ / 2 : 0) -static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY; -module_param(rcu_task_ipi_delay, int, 0644); - /* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ #define RCU_TASK_BOOT_STALL_TIMEOUT (HZ * 30) #define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) @@ -800,8 +795,6 @@ static void rcu_tasks_torture_stats_print_generic(struct rcu_tasks *rtp, char *t #endif // #ifndef CONFIG_TINY_RCU -static void exit_tasks_rcu_finish_trace(struct task_struct *t); - #if defined(CONFIG_TASKS_RCU) //////////////////////////////////////////////////////////////////////// @@ -1321,13 +1314,11 @@ void exit_tasks_rcu_finish(void) raw_spin_lock_irqsave_rcu_node(rtpcp, flags); list_del_init(&t->rcu_tasks_exit_list); raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); - - exit_tasks_rcu_finish_trace(t); } #else /* #ifdef CONFIG_TASKS_RCU */ void exit_tasks_rcu_start(void) { } -void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } +void exit_tasks_rcu_finish(void) { } #endif /* #else #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_RUDE_RCU @@ -1475,69 +1466,6 @@ void __init rcu_tasks_trace_suppress_unused(void) #endif // #ifndef CONFIG_TINY_RCU } -/* - * Do a cmpxchg() on ->trc_reader_special.b.need_qs, allowing for - * the four-byte operand-size restriction of some platforms. - * - * Returns the old value, which is often ignored. - */ -u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new) -{ - return cmpxchg(&t->trc_reader_special.b.need_qs, old, new); -} -EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs); - -/* Add a newly blocked reader task to its CPU's list. */ -void rcu_tasks_trace_qs_blkd(struct task_struct *t) -{ -} -EXPORT_SYMBOL_GPL(rcu_tasks_trace_qs_blkd); - -/* Communicate task state back to the RCU tasks trace stall warning request. */ -struct trc_stall_chk_rdr { - int nesting; - int ipi_to_cpu; - u8 needqs; -}; - -/* Report any needed quiescent state for this exiting task. */ -static void exit_tasks_rcu_finish_trace(struct task_struct *t) -{ -} - -int rcu_tasks_trace_lazy_ms = -1; -module_param(rcu_tasks_trace_lazy_ms, int, 0444); - -static int __init rcu_spawn_tasks_trace_kthread(void) -{ - return 0; -} - -#if !defined(CONFIG_TINY_RCU) -void show_rcu_tasks_trace_gp_kthread(void) -{ -} -EXPORT_SYMBOL_GPL(show_rcu_tasks_trace_gp_kthread); - -void rcu_tasks_trace_torture_stats_print(char *tt, char *tf) -{ -} -EXPORT_SYMBOL_GPL(rcu_tasks_trace_torture_stats_print); -#endif // !defined(CONFIG_TINY_RCU) - -struct task_struct *get_rcu_tasks_trace_gp_kthread(void) -{ - return NULL; -} -EXPORT_SYMBOL_GPL(get_rcu_tasks_trace_gp_kthread); - -void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq) -{ -} -EXPORT_SYMBOL_GPL(rcu_tasks_trace_get_gp_data); - -#else /* #ifdef CONFIG_TASKS_TRACE_RCU */ -static void exit_tasks_rcu_finish_trace(struct task_struct *t) { } #endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ #ifndef CONFIG_TINY_RCU @@ -1545,7 +1473,6 @@ void show_rcu_tasks_gp_kthreads(void) { show_rcu_tasks_classic_gp_kthread(); show_rcu_tasks_rude_gp_kthread(); - show_rcu_tasks_trace_gp_kthread(); } #endif /* #ifndef CONFIG_TINY_RCU */ @@ -1684,10 +1611,6 @@ static int __init rcu_init_tasks_generic(void) rcu_spawn_tasks_rude_kthread(); #endif -#ifdef CONFIG_TASKS_TRACE_RCU - rcu_spawn_tasks_trace_kthread(); -#endif - // Run the self-tests. rcu_tasks_initiate_self_tests(); diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 index 85b407467454..18efab346381 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 @@ -10,5 +10,4 @@ CONFIG_PROVE_LOCKING=n #CHECK#CONFIG_PROVE_RCU=n CONFIG_FORCE_TASKS_TRACE_RCU=y #CHECK#CONFIG_TASKS_TRACE_RCU=y -CONFIG_TASKS_TRACE_RCU_READ_MB=y CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 index 9003c56cd764..8da390e82829 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 @@ -9,6 +9,5 @@ CONFIG_PROVE_LOCKING=y #CHECK#CONFIG_PROVE_RCU=y CONFIG_FORCE_TASKS_TRACE_RCU=y #CHECK#CONFIG_TASKS_TRACE_RCU=y -CONFIG_TASKS_TRACE_RCU_READ_MB=n CONFIG_RCU_EXPERT=y CONFIG_DEBUG_OBJECTS=y -- cgit v1.2.3 From 3ce40539cc0055b2df49303979c5b6a4a8321be4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 29 Dec 2025 11:13:54 -0800 Subject: torture: Parallelize kvm-series.sh guest-OS execution Currently, kvm-series.sh builds and runs serially, which makes for long execution times. This commit changes its logic to build all of the needed kernels serially, but then run the corresponding guest OSes concurrently in batches using the entire machine. On large systems, this results in order-of-magnitude speedups of the guest-OS execution portion of the runtime. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- .../testing/selftests/rcutorture/bin/kvm-series.sh | 174 ++++++++++++++++++--- 1 file changed, 153 insertions(+), 21 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/rcutorture/bin/kvm-series.sh b/tools/testing/selftests/rcutorture/bin/kvm-series.sh index 2ff905a1853b..6729687861f2 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-series.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-series.sh @@ -15,7 +15,7 @@ # This script is intended to replace kvm-check-branches.sh by providing # ease of use and faster execution. -T="`mktemp -d ${TMPDIR-/tmp}/kvm-series.sh.XXXXXX`" +T="`mktemp -d ${TMPDIR-/tmp}/kvm-series.sh.XXXXXX`"; export T trap 'rm -rf $T' 0 scriptname=$0 @@ -53,40 +53,62 @@ shift RCUTORTURE="`pwd`/tools/testing/selftests/rcutorture"; export RCUTORTURE PATH=${RCUTORTURE}/bin:$PATH; export PATH +RES="${RCUTORTURE}/res"; export RES . functions.sh ret=0 -nfail=0 +nbuildfail=0 +nrunfail=0 nsuccess=0 -faillist= +ncpus=0 +buildfaillist= +runfaillist= successlist= cursha1="`git rev-parse --abbrev-ref HEAD`" ds="`date +%Y.%m.%d-%H.%M.%S`-series" +DS="${RES}/${ds}"; export DS startdate="`date`" starttime="`get_starttime`" echo " --- " $scriptname $args | tee -a $T/log echo " --- Results directory: " $ds | tee -a $T/log +# Do all builds. Iterate through commits within a given scenario +# because builds normally go faster from one commit to the next within a +# given scenario. In contrast, switching scenarios on each rebuild will +# often force a full rebuild due to Kconfig differences, for example, +# turning preemption on and off. Defer actual runs in order to run +# lots of them concurrently on large systems. +touch $T/torunlist for config in ${config_list} do sha_n=0 for sha in ${sha1_list} do sha1=${sha_n}.${sha} # Enable "sort -k1nr" to list commits in order. + echo echo Starting ${config}/${sha1} at `date` | tee -a $T/log - git checkout "${sha}" - time tools/testing/selftests/rcutorture/bin/kvm.sh --configs "$config" --datestamp "$ds/${config}/${sha1}" --duration 1 "$@" + git checkout --detach "${sha}" + tools/testing/selftests/rcutorture/bin/kvm.sh --configs "$config" --datestamp "$ds/${config}/${sha1}" --duration 1 --build-only --trust-make "$@" curret=$? if test "${curret}" -ne 0 then - nfail=$((nfail+1)) - faillist="$faillist ${config}/${sha1}(${curret})" + nbuildfail=$((nbuildfail+1)) + buildfaillist="$buildfaillist ${config}/${sha1}(${curret})" else - nsuccess=$((nsuccess+1)) - successlist="$successlist ${config}/${sha1}" - # Successful run, so remove large files. - rm -f ${RCUTORTURE}/$ds/${config}/${sha1}/{vmlinux,bzImage,System.map,Module.symvers} + batchncpus="`grep -v "^# cpus=" "${DS}/${config}/${sha1}/batches" | awk '{ sum += $3 } END { print sum }'`" + echo run_one_qemu ${sha_n} ${config}/${sha1} ${batchncpus} >> $T/torunlist + if test "${ncpus}" -eq 0 + then + ncpus="`grep "^# cpus=" "${DS}/${config}/${sha1}/batches" | sed -e 's/^# cpus=//'`" + case "${ncpus}" in + ^[0-9]*$) + ;; + *) + ncpus=0 + ;; + esac + fi fi if test "${ret}" -eq 0 then @@ -95,22 +117,132 @@ do sha_n=$((sha_n+1)) done done + +# If the user did not specify the number of CPUs, use them all. +if test "${ncpus}" -eq 0 +then + ncpus="`identify_qemu_vcpus`" +fi + +cpusused=0 +touch $T/successlistfile +touch $T/faillistfile + +# do_run_one_qemu ds resultsdir qemu_curout +# +# Start the specified qemu run and record its success or failure. +do_run_one_qemu () { + local ret + local ds="$1" + local resultsdir="$2" + local qemu_curout="$3" + + tools/testing/selftests/rcutorture/bin/kvm-again.sh "${DS}/${resultsdir}" --link inplace-force > ${qemu_curout} 2>&1 + ret=$? + if test "${ret}" -eq 0 + then + echo ${resultsdir} >> $T/successlistfile + # Successful run, so remove large files. + rm -f ${DS}/${resultsdir}/{vmlinux,bzImage,System.map,Module.symvers} + else + echo "${resultsdir}(${ret})" >> $T/faillistfile + fi +} + +# cleanup_qemu_batch batchncpus +# +# Update success and failure lists, files, and counts at the end of +# a batch. +cleanup_qemu_batch () { + local batchncpus="$1" + + echo Waiting, cpusused=${cpusused}, ncpus=${ncpus} `date` | tee -a $T/log + wait + cpusused="${batchncpus}" + nsuccessbatch="`wc -l $T/successlistfile | awk '{ print $1 }'`" + nsuccess=$((nsuccess+nsuccessbatch)) + successlist="$successlist `cat $T/successlistfile`" + rm $T/successlistfile + touch $T/successlistfile + nfailbatch="`wc -l $T/faillistfile | awk '{ print $1 }'`" + nrunfail=$((nrunfail+nfailbatch)) + runfaillist="$runfaillist `cat $T/faillistfile`" + rm $T/faillistfile + touch $T/faillistfile +} + +# run_one_qemu sha_n config/sha1 batchncpus +# +# Launch into the background the sha_n-th qemu job whose results directory +# is config/sha1 and which uses batchncpus CPUs. Once we reach a job that +# would overflow the number of available CPUs, wait for the previous jobs +# to complete and record their results. +run_one_qemu () { + local sha_n="$1" + local config_sha1="$2" + local batchncpus="$3" + local qemu_curout + + cpusused=$((cpusused+batchncpus)) + if test "${cpusused}" -gt $ncpus + then + cleanup_qemu_batch "${batchncpus}" + fi + echo Starting ${config_sha1} using ${batchncpus} CPUs `date` + qemu_curout="${DS}/${config_sha1}/qemu-series" + do_run_one_qemu "$ds" "${config_sha1}" ${qemu_curout} & +} + +# Re-ordering the runs will mess up the affinity chosen at build time +# (among other things, over-using CPU 0), so suppress it. +TORTURE_NO_AFFINITY="no-affinity"; export TORTURE_NO_AFFINITY + +# Run the kernels (if any) that built correctly. +echo | tee -a $T/log # Put a blank line between build and run messages. +. $T/torunlist +cleanup_qemu_batch "${batchncpus}" + +# Get back to initial checkout/SHA-1. git checkout "${cursha1}" -echo ${nsuccess} SUCCESSES: | tee -a $T/log -echo ${successlist} | fmt | tee -a $T/log -echo | tee -a $T/log -echo ${nfail} FAILURES: | tee -a $T/log -echo ${faillist} | fmt | tee -a $T/log -if test -n "${faillist}" +# Throw away leading and trailing space characters for fmt. +successlist="`echo ${successlist} | sed -e 's/^ *//' -e 's/ *$//'`" +buildfaillist="`echo ${buildfaillist} | sed -e 's/^ *//' -e 's/ *$//'`" +runfaillist="`echo ${runfaillist} | sed -e 's/^ *//' -e 's/ *$//'`" + +# Print lists of successes, build failures, and run failures, if any. +if test "${nsuccess}" -gt 0 +then + echo | tee -a $T/log + echo ${nsuccess} SUCCESSES: | tee -a $T/log + echo ${successlist} | fmt | tee -a $T/log +fi +if test "${nbuildfail}" -gt 0 then echo | tee -a $T/log - echo Failures across commits: | tee -a $T/log - echo ${faillist} | tr ' ' '\012' | sed -e 's,^[^/]*/,,' -e 's/([0-9]*)//' | + echo ${nbuildfail} BUILD FAILURES: | tee -a $T/log + echo ${buildfaillist} | fmt | tee -a $T/log +fi +if test "${nrunfail}" -gt 0 +then + echo | tee -a $T/log + echo ${nrunfail} RUN FAILURES: | tee -a $T/log + echo ${runfaillist} | fmt | tee -a $T/log +fi + +# If there were build or runtime failures, map them to commits. +if test "${nbuildfail}" -gt 0 || test "${nrunfail}" -gt 0 +then + echo | tee -a $T/log + echo Build failures across commits: | tee -a $T/log + echo ${buildfaillist} | tr ' ' '\012' | sed -e 's,^[^/]*/,,' -e 's/([0-9]*)//' | sort | uniq -c | sort -k2n | tee -a $T/log fi + +# Print run summary. +echo | tee -a $T/log echo Started at $startdate, ended at `date`, duration `get_starttime_duration $starttime`. | tee -a $T/log -echo Summary: Successes: ${nsuccess} Failures: ${nfail} | tee -a $T/log -cp $T/log tools/testing/selftests/rcutorture/res/${ds} +echo Summary: Successes: ${nsuccess} " "Build Failures: ${nbuildfail} " "Runtime Failures: ${nrunfail}| tee -a $T/log +cp $T/log ${DS} exit "${ret}" -- cgit v1.2.3 From 672621773f7df8cda2ff5635edac4aa5339d097f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 29 Dec 2025 11:13:55 -0800 Subject: torture: Make kvm-series.sh give build numbers and totals The kvm-series.sh script can easily be convinced to do on the order of 1,000 builds, so some sort of progress indicator would be helpful. This commit therefore updates the "Starting" output lines to read as in the following example, adding the ("2 of 4"): Starting TREE01/1.7e0ad1b49057 (2 of 4) at Sat Nov 8 10:08:21 PM PST 2025 Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- tools/testing/selftests/rcutorture/bin/kvm-series.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/rcutorture/bin/kvm-series.sh b/tools/testing/selftests/rcutorture/bin/kvm-series.sh index 6729687861f2..a00d2e96f6cc 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-series.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-series.sh @@ -32,6 +32,7 @@ then echo "$0: Repetition ('*') not allowed in config list." exit 1 fi +config_list_len="`echo ${config_list} | wc -w | awk '{ print $1; }'`" commit_list="${2}" if test -z "${commit_list}" @@ -47,6 +48,7 @@ then exit 2 fi sha1_list=`cat $T/commits` +sha1_list_len="`echo ${sha1_list} | wc -w | awk '{ print $1; }'`" shift shift @@ -80,6 +82,8 @@ echo " --- Results directory: " $ds | tee -a $T/log # turning preemption on and off. Defer actual runs in order to run # lots of them concurrently on large systems. touch $T/torunlist +n2build="$((config_list_len*sha1_list_len))" +nbuilt=0 for config in ${config_list} do sha_n=0 @@ -87,7 +91,7 @@ do do sha1=${sha_n}.${sha} # Enable "sort -k1nr" to list commits in order. echo - echo Starting ${config}/${sha1} at `date` | tee -a $T/log + echo Starting ${config}/${sha1} "($((nbuilt+1)) of ${n2build})" at `date` | tee -a $T/log git checkout --detach "${sha}" tools/testing/selftests/rcutorture/bin/kvm.sh --configs "$config" --datestamp "$ds/${config}/${sha1}" --duration 1 --build-only --trust-make "$@" curret=$? @@ -115,6 +119,7 @@ do ret=${curret} fi sha_n=$((sha_n+1)) + nbuilt=$((nbuilt+1)) done done -- cgit v1.2.3 From 3d69b6beb8ba932911096138394b5cd3e21b3b92 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 29 Dec 2025 11:13:56 -0800 Subject: torture: Make kvm-series.sh give run numbers and totals The kvm-series.sh script can easily be convinced to run on the order of 1,000 guest OSes, so some sort of progress indicator would be helpful. This commit therefore updates the "Starting" output lines to read as in the following example, adding the ("3 of 4"): Starting TREE02/1.7e0ad1b49057 using 8 CPUs (4 of 4) Sat Nov 8 10:51:06 PM PST 2025 Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- tools/testing/selftests/rcutorture/bin/kvm-series.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/rcutorture/bin/kvm-series.sh b/tools/testing/selftests/rcutorture/bin/kvm-series.sh index a00d2e96f6cc..c4ee5f910931 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-series.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-series.sh @@ -132,6 +132,8 @@ fi cpusused=0 touch $T/successlistfile touch $T/faillistfile +n2run="`wc -l $T/torunlist | awk '{ print $1; }'`" +nrun=0 # do_run_one_qemu ds resultsdir qemu_curout # @@ -193,9 +195,10 @@ run_one_qemu () { then cleanup_qemu_batch "${batchncpus}" fi - echo Starting ${config_sha1} using ${batchncpus} CPUs `date` + echo Starting ${config_sha1} using ${batchncpus} CPUs "($((nrun+1)) of ${n2run})" `date` qemu_curout="${DS}/${config_sha1}/qemu-series" do_run_one_qemu "$ds" "${config_sha1}" ${qemu_curout} & + nrun="$((nrun+1))" } # Re-ordering the runs will mess up the affinity chosen at build time -- cgit v1.2.3 From dcd6067322ba6750995fbc5486d7c9ada88489ff Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 29 Dec 2025 11:13:57 -0800 Subject: torture: Make config2csv.sh properly handle comments in .boot files As in strip the "#" and everything after it and *then* tokenize. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- tools/testing/selftests/rcutorture/bin/config2csv.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/rcutorture/bin/config2csv.sh b/tools/testing/selftests/rcutorture/bin/config2csv.sh index 0cf55f1bf654..aeab4d6f11ad 100755 --- a/tools/testing/selftests/rcutorture/bin/config2csv.sh +++ b/tools/testing/selftests/rcutorture/bin/config2csv.sh @@ -42,7 +42,7 @@ do grep -v '^#' < $i | grep -v '^ *$' > $T/p if test -r $i.boot then - tr -s ' ' '\012' < $i.boot | grep -v '^#' >> $T/p + sed -e 's/#.*$//' < $i.boot | tr -s ' ' '\012' >> $T/p fi sed -e 's/^[^=]*$/&=?/' < $T/p | sed -e 's/^\([^=]*\)=\(.*\)$/\tp["\1:'"$i"'"] = "\2";\n\tc["\1"] = 1;/' >> $T/p.awk -- cgit v1.2.3 From c89474b9b2ab8ab2c0d2cddadbed781c0f5e8f0c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 29 Dec 2025 11:13:58 -0800 Subject: torture: Include commit discription in testid.txt Currently, the testid.txt file in the top-level directory of the rcutorture results contains the output of "git rev-parse HEAD", which just gives the full SHA-1 of the current commit. This is followed by the output of "git status", which is further followed by the output of "git diff". This works, but is less than helpful to human readers scanning a list of commits. This commit therefore instead uses "git show --oneline --no-patch HEAD", which provides a short SHA-1, but also the names of any branches and the commit's title. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- tools/testing/selftests/rcutorture/bin/mktestid.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/rcutorture/bin/mktestid.sh b/tools/testing/selftests/rcutorture/bin/mktestid.sh index 16f9907a4dae..24f6261dab6a 100755 --- a/tools/testing/selftests/rcutorture/bin/mktestid.sh +++ b/tools/testing/selftests/rcutorture/bin/mktestid.sh @@ -18,7 +18,7 @@ fi echo Build directory: `pwd` > ${resdir}/testid.txt if test -d .git then - echo Current commit: `git rev-parse HEAD` >> ${resdir}/testid.txt + echo Current commit: `git show --oneline --no-patch HEAD` >> ${resdir}/testid.txt echo >> ${resdir}/testid.txt echo ' ---' Output of "'"git status"'": >> ${resdir}/testid.txt git status >> ${resdir}/testid.txt -- cgit v1.2.3 From 7646c7afd9a95db0b0cb4ad066ed90f6024da67d Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 10:00:28 -0800 Subject: bpf: Remove redundant KF_TRUSTED_ARGS flag from all kfuncs Now that KF_TRUSTED_ARGS is the default for all kfuncs, remove the explicit KF_TRUSTED_ARGS flag from all kfunc definitions and remove the flag itself. Acked-by: Eduard Zingerman Reviewed-by: Emil Tsalapatis Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102180038.2708325-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- fs/bpf_fs_kfuncs.c | 13 ++++++------- fs/verity/measure.c | 2 +- include/linux/bpf.h | 2 +- include/linux/btf.h | 3 +-- kernel/bpf/arena.c | 6 +++--- kernel/bpf/cpumask.c | 2 +- kernel/bpf/helpers.c | 20 ++++++++++---------- kernel/bpf/map_iter.c | 2 +- kernel/bpf/verifier.c | 2 +- kernel/sched/ext.c | 8 ++++---- mm/bpf_memcontrol.c | 10 +++++----- net/core/filter.c | 10 +++++----- net/core/xdp.c | 2 +- net/netfilter/nf_conntrack_bpf.c | 8 ++++---- net/netfilter/nf_flow_table_bpf.c | 2 +- net/netfilter/nf_nat_bpf.c | 2 +- net/sched/bpf_qdisc.c | 12 ++++++------ tools/testing/selftests/bpf/progs/cpumask_failure.c | 2 +- tools/testing/selftests/bpf/test_kmods/bpf_testmod.c | 20 ++++++++++---------- 19 files changed, 63 insertions(+), 65 deletions(-) (limited to 'tools/testing') diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c index abd5eaa4892e..e4e51a1d0de2 100644 --- a/fs/bpf_fs_kfuncs.c +++ b/fs/bpf_fs_kfuncs.c @@ -356,14 +356,13 @@ __bpf_kfunc int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__s __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_fs_kfunc_set_ids) -BTF_ID_FLAGS(func, bpf_get_task_exe_file, - KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_get_task_exe_file, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_put_file, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_path_d_path, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_get_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_set_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_remove_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_path_d_path) +BTF_ID_FLAGS(func, bpf_get_dentry_xattr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_set_dentry_xattr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_remove_dentry_xattr, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_fs_kfunc_set_ids) static int bpf_fs_kfuncs_filter(const struct bpf_prog *prog, u32 kfunc_id) diff --git a/fs/verity/measure.c b/fs/verity/measure.c index 388734132f01..6a35623ebdf0 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -162,7 +162,7 @@ __bpf_kfunc int bpf_get_fsverity_digest(struct file *file, struct bpf_dynptr *di __bpf_kfunc_end_defs(); BTF_KFUNCS_START(fsverity_set_ids) -BTF_ID_FLAGS(func, bpf_get_fsverity_digest, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_get_fsverity_digest) BTF_KFUNCS_END(fsverity_set_ids) static int bpf_get_fsverity_digest_filter(const struct bpf_prog *prog, u32 kfunc_id) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4e7d72dfbcd4..9efb2ddf331c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -753,7 +753,7 @@ enum bpf_type_flag { MEM_ALLOC = BIT(11 + BPF_BASE_TYPE_BITS), /* PTR was passed from the kernel in a trusted context, and may be - * passed to KF_TRUSTED_ARGS kfuncs or BPF helper functions. + * passed to kfuncs or BPF helper functions. * Confusingly, this is _not_ the opposite of PTR_UNTRUSTED above. * PTR_UNTRUSTED refers to a kptr that was read directly from a map * without invoking bpf_kptr_xchg(). What we really need to know is diff --git a/include/linux/btf.h b/include/linux/btf.h index f06976ffb63f..691f09784933 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -34,7 +34,7 @@ * * And the following kfunc: * - * BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) + * BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE) * * All invocations to the kfunc must pass the unmodified, unwalked task: * @@ -66,7 +66,6 @@ * return 0; * } */ -#define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ #define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ #define KF_RCU (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */ diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 456ac989269d..2274319a95e6 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -890,9 +890,9 @@ __bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_c __bpf_kfunc_end_defs(); BTF_KFUNCS_START(arena_kfuncs) -BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_ARENA_RET | KF_ARENA_ARG2) -BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_ARENA_ARG2) -BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_TRUSTED_ARGS | KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_ARENA_RET | KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_ARENA_ARG2) BTF_KFUNCS_END(arena_kfuncs) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index 9876c5fe6c2a..b8c805b4b06a 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -477,7 +477,7 @@ __bpf_kfunc_end_defs(); BTF_KFUNCS_START(cpumask_kfunc_btf_ids) BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_first_and, KF_RCU) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index db72b96f9c8c..2c15f77c74db 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4427,7 +4427,7 @@ BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_throw) #ifdef CONFIG_BPF_EVENTS -BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_send_signal_task) #endif #ifdef CONFIG_KEYS BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) @@ -4467,14 +4467,14 @@ BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU) BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY) #ifdef CONFIG_CGROUPS -BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY) -BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY) #endif -BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_dynptr_adjust) @@ -4510,8 +4510,8 @@ BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr) BTF_ID_FLAGS(func, bpf_probe_read_kernel_str_dynptr) BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE) #endif #ifdef CONFIG_DMA_SHARED_BUFFER BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE) @@ -4536,10 +4536,10 @@ BTF_ID_FLAGS(func, bpf_strncasestr); #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif -BTF_ID_FLAGS(func, bpf_stream_vprintk_impl, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_dynptr_from_file, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_stream_vprintk_impl) +BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl) +BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl) +BTF_ID_FLAGS(func, bpf_dynptr_from_file) BTF_ID_FLAGS(func, bpf_dynptr_file_discard) BTF_KFUNCS_END(common_btf_ids) diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 9575314f40a6..261a03ea73d3 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -214,7 +214,7 @@ __bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map) __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_map_iter_kfunc_ids) -BTF_ID_FLAGS(func, bpf_map_sum_elem_count, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_map_sum_elem_count) BTF_KFUNCS_END(bpf_map_iter_kfunc_ids) static const struct btf_kfunc_id_set bpf_map_iter_kfunc_set = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 359a962d69a1..c9da70dd3e72 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12619,7 +12619,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, /* Enforce strict type matching for calls to kfuncs that are acquiring * or releasing a reference, or are no-cast aliases. We do _not_ - * enforce strict matching for plain KF_TRUSTED_ARGS kfuncs by default, + * enforce strict matching for kfuncs by default, * as we want to enable BPF programs to pass types that are bitwise * equivalent without forcing them to explicitly cast with something * like bpf_cast_to_kern_ctx(). diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 94164f2dec6d..fd5423428dde 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -7229,9 +7229,9 @@ BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) -BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_exit_bstr) +BTF_ID_FLAGS(func, scx_bpf_error_bstr) +BTF_ID_FLAGS(func, scx_bpf_dump_bstr) BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) @@ -7250,7 +7250,7 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) #endif BTF_ID_FLAGS(func, scx_bpf_now) -BTF_ID_FLAGS(func, scx_bpf_events, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_events) BTF_KFUNCS_END(scx_kfunc_ids_any) static const struct btf_kfunc_id_set scx_kfunc_set_any = { diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c index e8fa7f5855f9..716df49d7647 100644 --- a/mm/bpf_memcontrol.c +++ b/mm/bpf_memcontrol.c @@ -166,11 +166,11 @@ BTF_ID_FLAGS(func, bpf_get_root_mem_cgroup, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_get_mem_cgroup, KF_ACQUIRE | KF_RET_NULL | KF_RCU) BTF_ID_FLAGS(func, bpf_put_mem_cgroup, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_mem_cgroup_vm_events, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_mem_cgroup_memory_events, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_mem_cgroup_usage, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_mem_cgroup_page_state, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_mem_cgroup_flush_stats, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_mem_cgroup_vm_events) +BTF_ID_FLAGS(func, bpf_mem_cgroup_memory_events) +BTF_ID_FLAGS(func, bpf_mem_cgroup_usage) +BTF_ID_FLAGS(func, bpf_mem_cgroup_page_state) +BTF_ID_FLAGS(func, bpf_mem_cgroup_flush_stats, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_memcontrol_kfuncs) diff --git a/net/core/filter.c b/net/core/filter.c index 616e0520a0bb..d43df98e1ded 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -12438,11 +12438,11 @@ int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, } BTF_KFUNCS_START(bpf_kfunc_check_set_skb) -BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb) BTF_KFUNCS_END(bpf_kfunc_check_set_skb) BTF_KFUNCS_START(bpf_kfunc_check_set_skb_meta) -BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta) BTF_KFUNCS_END(bpf_kfunc_check_set_skb_meta) BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) @@ -12455,11 +12455,11 @@ BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path) BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr) BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk) -BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk) BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk) BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops) -BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp) BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops) static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { @@ -12554,7 +12554,7 @@ __bpf_kfunc int bpf_sock_destroy(struct sock_common *sock) __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids) -BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_sock_destroy) BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids) static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id) diff --git a/net/core/xdp.c b/net/core/xdp.c index 9100e160113a..fee6d080ee85 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -964,7 +964,7 @@ __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx, __bpf_kfunc_end_defs(); BTF_KFUNCS_START(xdp_metadata_kfunc_ids) -#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS) +#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name) XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC BTF_KFUNCS_END(xdp_metadata_kfunc_ids) diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index 4a136fc3a9c0..a630139bd0c3 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -516,10 +516,10 @@ BTF_ID_FLAGS(func, bpf_skb_ct_alloc, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_skb_ct_lookup, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_ct_insert_entry, KF_ACQUIRE | KF_RET_NULL | KF_RELEASE) BTF_ID_FLAGS(func, bpf_ct_release, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_ct_set_timeout) +BTF_ID_FLAGS(func, bpf_ct_change_timeout) +BTF_ID_FLAGS(func, bpf_ct_set_status) +BTF_ID_FLAGS(func, bpf_ct_change_status) BTF_KFUNCS_END(nf_ct_kfunc_set) static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = { diff --git a/net/netfilter/nf_flow_table_bpf.c b/net/netfilter/nf_flow_table_bpf.c index 4a5f5195f2d2..cbd5b97a6329 100644 --- a/net/netfilter/nf_flow_table_bpf.c +++ b/net/netfilter/nf_flow_table_bpf.c @@ -105,7 +105,7 @@ __diag_pop() __bpf_kfunc_end_defs(); BTF_KFUNCS_START(nf_ft_kfunc_set) -BTF_ID_FLAGS(func, bpf_xdp_flow_lookup, KF_TRUSTED_ARGS | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_xdp_flow_lookup, KF_RET_NULL) BTF_KFUNCS_END(nf_ft_kfunc_set) static const struct btf_kfunc_id_set nf_flow_kfunc_set = { diff --git a/net/netfilter/nf_nat_bpf.c b/net/netfilter/nf_nat_bpf.c index 481be15609b1..f9dd85ccea01 100644 --- a/net/netfilter/nf_nat_bpf.c +++ b/net/netfilter/nf_nat_bpf.c @@ -55,7 +55,7 @@ __bpf_kfunc int bpf_ct_set_nat_info(struct nf_conn___init *nfct, __bpf_kfunc_end_defs(); BTF_KFUNCS_START(nf_nat_kfunc_set) -BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_ct_set_nat_info) BTF_KFUNCS_END(nf_nat_kfunc_set) static const struct btf_kfunc_id_set nf_bpf_nat_kfunc_set = { diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c index adcb618a2bfc..b9771788b9b3 100644 --- a/net/sched/bpf_qdisc.c +++ b/net/sched/bpf_qdisc.c @@ -271,14 +271,14 @@ __bpf_kfunc void bpf_qdisc_bstats_update(struct Qdisc *sch, const struct sk_buff __bpf_kfunc_end_defs(); BTF_KFUNCS_START(qdisc_kfunc_ids) -BTF_ID_FLAGS(func, bpf_skb_get_hash, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_skb_get_hash) BTF_ID_FLAGS(func, bpf_kfree_skb, KF_RELEASE) BTF_ID_FLAGS(func, bpf_qdisc_skb_drop, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_qdisc_watchdog_schedule, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_qdisc_init_prologue, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_qdisc_reset_destroy_epilogue, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_qdisc_bstats_update, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb) +BTF_ID_FLAGS(func, bpf_qdisc_watchdog_schedule) +BTF_ID_FLAGS(func, bpf_qdisc_init_prologue) +BTF_ID_FLAGS(func, bpf_qdisc_reset_destroy_epilogue) +BTF_ID_FLAGS(func, bpf_qdisc_bstats_update) BTF_KFUNCS_END(qdisc_kfunc_ids) BTF_SET_START(qdisc_common_kfunc_set) diff --git a/tools/testing/selftests/bpf/progs/cpumask_failure.c b/tools/testing/selftests/bpf/progs/cpumask_failure.c index 8a2fd596c8a3..61c32e91e8c3 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_failure.c +++ b/tools/testing/selftests/bpf/progs/cpumask_failure.c @@ -110,7 +110,7 @@ SEC("tp_btf/task_newtask") __failure __msg("NULL pointer passed to trusted arg0") int BPF_PROG(test_cpumask_null, struct task_struct *task, u64 clone_flags) { - /* NULL passed to KF_TRUSTED_ARGS kfunc. */ + /* NULL passed to kfunc. */ bpf_cpumask_empty(NULL); return 0; diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 90c4b1a51de6..1c41d03bd5a1 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -693,9 +693,9 @@ BTF_ID_FLAGS(func, bpf_kfunc_dynptr_test) BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_nonzero_offset_test, KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_zero_offset_test, KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_kfunc_nested_release_test, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_kfunc_trusted_vma_test, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_kfunc_trusted_task_test, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_kfunc_trusted_num_test, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_vma_test) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_task_test) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_num_test) BTF_ID_FLAGS(func, bpf_kfunc_rcu_task_test, KF_RCU) BTF_ID_FLAGS(func, bpf_kfunc_ret_rcu_test, KF_RET_NULL | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_kfunc_ret_rcu_test_nostruct, KF_RET_NULL | KF_RCU_PROTECTED) @@ -1158,7 +1158,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_pass2) BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail1) BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail2) BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail3) -BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_RCU) BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE) BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg) BTF_ID_FLAGS(func, bpf_kfunc_call_test_offset) @@ -1172,12 +1172,12 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_sendmsg, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_sock_sendmsg, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getsockname, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getpeername, KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_prologue, KF_TRUSTED_ARGS | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_impl, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_prologue, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10) +BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1) +BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_impl) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) -- cgit v1.2.3 From df5004579bbdfe4c734f2cbbf3f44fe3fac440a3 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 10:00:32 -0800 Subject: selftests: bpf: Update kfunc_param_nullable test for new error message With trusted args now being the default, the NULL pointer check runs before type-specific validation. Update test3 to expect the new error message "Possibly NULL pointer passed to trusted arg0" instead of the old dynptr-specific error message. Acked-by: Eduard Zingerman Reviewed-by: Emil Tsalapatis Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102180038.2708325-7-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c b/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c index 0ad1bf1ede8d..967081bbcfe1 100644 --- a/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c +++ b/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c @@ -29,7 +29,7 @@ int kfunc_dynptr_nullable_test2(struct __sk_buff *skb) } SEC("tc") -__failure __msg("expected pointer to stack or const struct bpf_dynptr") +__failure __msg("Possibly NULL pointer passed to trusted arg0") int kfunc_dynptr_nullable_test3(struct __sk_buff *skb) { struct bpf_dynptr data; -- cgit v1.2.3 From 03cc77b10e009ce87f1a8e93454aadf2912a4c15 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 10:00:33 -0800 Subject: selftests: bpf: Update failure message for rbtree_fail The rbtree_api_use_unchecked_remove_retval() selftest passes a pointer received from bpf_rbtree_remove() to bpf_rbtree_add() without checking for NULL, this was earlier caught by __check_ptr_off_reg() in the verifier. Now the verifier assumes every kfunc only takes trusted pointer arguments, so it catches this NULL pointer earlier in the path and provides a more accurate failure message. Acked-by: Eduard Zingerman Reviewed-by: Emil Tsalapatis Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102180038.2708325-8-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/rbtree_fail.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/progs/rbtree_fail.c b/tools/testing/selftests/bpf/progs/rbtree_fail.c index 4acb6af2dfe3..70b7baf9304b 100644 --- a/tools/testing/selftests/bpf/progs/rbtree_fail.c +++ b/tools/testing/selftests/bpf/progs/rbtree_fail.c @@ -153,7 +153,7 @@ long rbtree_api_add_to_multiple_trees(void *ctx) } SEC("?tc") -__failure __msg("dereference of modified ptr_or_null_ ptr R2 off=16 disallowed") +__failure __msg("Possibly NULL pointer passed to trusted arg1") long rbtree_api_use_unchecked_remove_retval(void *ctx) { struct bpf_rb_node *res; -- cgit v1.2.3 From 230b0118e416583a53fc0ad5d1fecb37f496fe34 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 10:00:34 -0800 Subject: selftests: bpf: fix test_kfunc_dynptr_param As verifier now assumes that all kfuncs only takes trusted pointer arguments, passing 0 (NULL) to a kfunc that doesn't mark the argument as __nullable or __opt will be rejected with a failure message of: Possibly NULL pointer passed to trusted arg Pass a non-null value to the kfunc to test the expected failure mode. Acked-by: Eduard Zingerman Reviewed-by: Emil Tsalapatis Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102180038.2708325-9-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c index 061befb004c2..d249113ed657 100644 --- a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c +++ b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c @@ -48,10 +48,9 @@ SEC("?lsm.s/bpf") __failure __msg("arg#0 expected pointer to stack or const struct bpf_dynptr") int BPF_PROG(not_ptr_to_stack, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { - unsigned long val = 0; + static struct bpf_dynptr val; - return bpf_verify_pkcs7_signature((struct bpf_dynptr *)val, - (struct bpf_dynptr *)val, NULL); + return bpf_verify_pkcs7_signature(&val, &val, NULL); } SEC("lsm.s/bpf") -- cgit v1.2.3 From cf82580c86a91de2aa979260985cadcb39ed28d2 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 10:00:35 -0800 Subject: selftests: bpf: fix cgroup_hierarchical_stats The cgroup_hierarchical_stats selftests uses an fentry program attached to cgroup_attach_task and then passes the received &dst_cgrp->self to the css_rstat_updated() kfunc. The verifier now assumes that all kfuncs only takes trusted pointer arguments, and pointers received by fentry are not marked trustes by default. Use a tp_btf program in place for fentry for this test, pointers received by tp_btf programs are marked trusted by the verifier. Acked-by: Eduard Zingerman Reviewed-by: Emil Tsalapatis Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102180038.2708325-10-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c b/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c index ff189a736ad8..8fc38592a87b 100644 --- a/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c +++ b/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c @@ -62,9 +62,9 @@ static int create_attach_counter(__u64 cg_id, __u64 state, __u64 pending) &init, BPF_NOEXIST); } -SEC("fentry/cgroup_attach_task") -int BPF_PROG(counter, struct cgroup *dst_cgrp, struct task_struct *leader, - bool threadgroup) +SEC("tp_btf/cgroup_attach_task") +int BPF_PROG(counter, struct cgroup *dst_cgrp, const char *path, + struct task_struct *task, bool threadgroup) { __u64 cg_id = cgroup_id(dst_cgrp); struct percpu_attach_counter *pcpu_counter = bpf_map_lookup_elem( -- cgit v1.2.3 From cf503eb2c6c38bf449063f33790a96218a067718 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 10:00:36 -0800 Subject: selftests: bpf: Fix test_bpf_nf for trusted args becoming default With trusted args now being the default, passing NULL to kfunc parameters that are pointers causes verifier rejection rather than a runtime error. The test_bpf_nf test was failing because it attempted to pass NULL to bpf_xdp_ct_lookup() to verify runtime error handling. Since the NULL check now happens at verification time, remove the runtime test case that passed NULL to the bpf_tuple parameter and instead add verification-time tests to ensure the verifier correctly rejects programs that pass NULL to trusted arguments. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102180038.2708325-11-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/bpf_nf.c | 5 +- tools/testing/selftests/bpf/progs/test_bpf_nf.c | 7 --- .../testing/selftests/bpf/progs/test_bpf_nf_fail.c | 57 ++++++++++++++++++++++ 3 files changed, 61 insertions(+), 8 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c index dd6512fa652b..215878ea04de 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c @@ -19,6 +19,10 @@ struct { { "change_timeout_after_alloc", "kernel function bpf_ct_change_timeout args#0 expected pointer to STRUCT nf_conn but" }, { "change_status_after_alloc", "kernel function bpf_ct_change_status args#0 expected pointer to STRUCT nf_conn but" }, { "write_not_allowlisted_field", "no write support to nf_conn at off" }, + { "lookup_null_bpf_tuple", "Possibly NULL pointer passed to trusted arg1" }, + { "lookup_null_bpf_opts", "Possibly NULL pointer passed to trusted arg3" }, + { "xdp_lookup_null_bpf_tuple", "Possibly NULL pointer passed to trusted arg1" }, + { "xdp_lookup_null_bpf_opts", "Possibly NULL pointer passed to trusted arg3" }, }; enum { @@ -111,7 +115,6 @@ static void test_bpf_nf_ct(int mode) if (!ASSERT_OK(err, "bpf_prog_test_run")) goto end; - ASSERT_EQ(skel->bss->test_einval_bpf_tuple, -EINVAL, "Test EINVAL for NULL bpf_tuple"); ASSERT_EQ(skel->bss->test_einval_reserved, -EINVAL, "Test EINVAL for reserved not set to 0"); ASSERT_EQ(skel->bss->test_einval_reserved_new, -EINVAL, "Test EINVAL for reserved in new struct not set to 0"); ASSERT_EQ(skel->bss->test_einval_netns_id, -EINVAL, "Test EINVAL for netns_id < -1"); diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf.c b/tools/testing/selftests/bpf/progs/test_bpf_nf.c index f7b330ddd007..076fbf03a126 100644 --- a/tools/testing/selftests/bpf/progs/test_bpf_nf.c +++ b/tools/testing/selftests/bpf/progs/test_bpf_nf.c @@ -15,7 +15,6 @@ extern unsigned long CONFIG_HZ __kconfig; -int test_einval_bpf_tuple = 0; int test_einval_reserved = 0; int test_einval_reserved_new = 0; int test_einval_netns_id = 0; @@ -99,12 +98,6 @@ nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32, __builtin_memset(&bpf_tuple, 0, sizeof(bpf_tuple.ipv4)); - ct = lookup_fn(ctx, NULL, 0, &opts_def, sizeof(opts_def)); - if (ct) - bpf_ct_release(ct); - else - test_einval_bpf_tuple = opts_def.error; - opts_def.reserved[0] = 1; ct = lookup_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def)); diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c b/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c index a586f087ffeb..2c156cd166af 100644 --- a/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c +++ b/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c @@ -4,6 +4,7 @@ #include #include #include +#include "bpf_misc.h" struct nf_conn; @@ -18,6 +19,10 @@ struct nf_conn *bpf_skb_ct_alloc(struct __sk_buff *, struct bpf_sock_tuple *, u3 struct bpf_ct_opts___local *, u32) __ksym; struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *, struct bpf_sock_tuple *, u32, struct bpf_ct_opts___local *, u32) __ksym; +struct nf_conn *bpf_xdp_ct_alloc(struct xdp_md *, struct bpf_sock_tuple *, u32, + struct bpf_ct_opts___local *, u32) __ksym; +struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *, struct bpf_sock_tuple *, u32, + struct bpf_ct_opts___local *, u32) __ksym; struct nf_conn *bpf_ct_insert_entry(struct nf_conn *) __ksym; void bpf_ct_release(struct nf_conn *) __ksym; void bpf_ct_set_timeout(struct nf_conn *, u32) __ksym; @@ -146,4 +151,56 @@ int change_status_after_alloc(struct __sk_buff *ctx) return 0; } +SEC("?tc") +__failure __msg("Possibly NULL pointer passed to trusted arg1") +int lookup_null_bpf_tuple(struct __sk_buff *ctx) +{ + struct bpf_ct_opts___local opts = {}; + struct nf_conn *ct; + + ct = bpf_skb_ct_lookup(ctx, NULL, 0, &opts, sizeof(opts)); + if (ct) + bpf_ct_release(ct); + return 0; +} + +SEC("?tc") +__failure __msg("Possibly NULL pointer passed to trusted arg3") +int lookup_null_bpf_opts(struct __sk_buff *ctx) +{ + struct bpf_sock_tuple tup = {}; + struct nf_conn *ct; + + ct = bpf_skb_ct_lookup(ctx, &tup, sizeof(tup.ipv4), NULL, sizeof(struct bpf_ct_opts___local)); + if (ct) + bpf_ct_release(ct); + return 0; +} + +SEC("?xdp") +__failure __msg("Possibly NULL pointer passed to trusted arg1") +int xdp_lookup_null_bpf_tuple(struct xdp_md *ctx) +{ + struct bpf_ct_opts___local opts = {}; + struct nf_conn *ct; + + ct = bpf_xdp_ct_lookup(ctx, NULL, 0, &opts, sizeof(opts)); + if (ct) + bpf_ct_release(ct); + return 0; +} + +SEC("?xdp") +__failure __msg("Possibly NULL pointer passed to trusted arg3") +int xdp_lookup_null_bpf_opts(struct xdp_md *ctx) +{ + struct bpf_sock_tuple tup = {}; + struct nf_conn *ct; + + ct = bpf_xdp_ct_lookup(ctx, &tup, sizeof(tup.ipv4), NULL, sizeof(struct bpf_ct_opts___local)); + if (ct) + bpf_ct_release(ct); + return 0; +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From ec4bb8e8dfa060c699b548f62e4d56133aafbbec Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 24 Sep 2025 16:20:58 +0200 Subject: tools/nolibc: add ptrace support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add ptrace support, as it will be useful in UML. Signed-off-by: Benjamin Berg [Thomas: drop va_args usage and linux/uio.h inclusion] Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/Makefile | 1 + tools/include/nolibc/nolibc.h | 1 + tools/include/nolibc/sys/ptrace.h | 33 ++++++++++++++++++++++++++++ tools/testing/selftests/nolibc/nolibc-test.c | 2 ++ 4 files changed, 37 insertions(+) create mode 100644 tools/include/nolibc/sys/ptrace.h (limited to 'tools/testing') diff --git a/tools/include/nolibc/Makefile b/tools/include/nolibc/Makefile index 8118e22844f1..8b883a6fe580 100644 --- a/tools/include/nolibc/Makefile +++ b/tools/include/nolibc/Makefile @@ -54,6 +54,7 @@ all_files := \ sys/mman.h \ sys/mount.h \ sys/prctl.h \ + sys/ptrace.h \ sys/random.h \ sys/reboot.h \ sys/resource.h \ diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index 272dfc961158..9c7f43b9218b 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -101,6 +101,7 @@ #include "sys/mman.h" #include "sys/mount.h" #include "sys/prctl.h" +#include "sys/ptrace.h" #include "sys/random.h" #include "sys/reboot.h" #include "sys/resource.h" diff --git a/tools/include/nolibc/sys/ptrace.h b/tools/include/nolibc/sys/ptrace.h new file mode 100644 index 000000000000..72ca28541633 --- /dev/null +++ b/tools/include/nolibc/sys/ptrace.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * ptrace for NOLIBC + * Copyright (C) 2017-2021 Willy Tarreau + * Copyright (C) 2025 Intel Corporation + */ + +/* make sure to include all global symbols */ +#include "../nolibc.h" + +#ifndef _NOLIBC_SYS_PTRACE_H +#define _NOLIBC_SYS_PTRACE_H + +#include "../sys.h" + +#include + +/* + * long ptrace(int op, pid_t pid, void *addr, void *data); + */ +static __attribute__((unused)) +long sys_ptrace(int op, pid_t pid, void *addr, void *data) +{ + return my_syscall4(__NR_ptrace, op, pid, addr, data); +} + +static __attribute__((unused)) +ssize_t ptrace(int op, pid_t pid, void *addr, void *data) +{ + return __sysret(sys_ptrace(op, pid, addr, data)); +} + +#endif /* _NOLIBC_SYS_PTRACE_H */ diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 3c5a226dad3a..6888b20af259 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -1406,6 +1407,7 @@ int run_syscall(int min, int max) CASE_TEST(readv_zero); EXPECT_SYSZR(1, readv(1, NULL, 0)); break; CASE_TEST(writev_badf); EXPECT_SYSER(1, writev(-1, &iov_one, 1), -1, EBADF); break; CASE_TEST(writev_zero); EXPECT_SYSZR(1, writev(1, NULL, 0)); break; + CASE_TEST(ptrace); EXPECT_SYSER(1, ptrace(PTRACE_CONT, getpid(), NULL, NULL), -1, ESRCH); break; CASE_TEST(syscall_noargs); EXPECT_SYSEQ(1, syscall(__NR_getpid), getpid()); break; CASE_TEST(syscall_args); EXPECT_SYSER(1, syscall(__NR_statx, 0, NULL, 0, 0, NULL), -1, EFAULT); break; CASE_TEST(namespace); EXPECT_SYSZR(euid0 && proc, test_namespace()); break; -- cgit v1.2.3 From a590a79d19046d3e0f2089f83046f3b87f880359 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 1 Jan 2026 11:34:16 -0500 Subject: rcutorture: Prevent concurrent kvm.sh runs on same source tree Add flock-based locking to kvm.sh to prevent multiple instances from running concurrently on the same source tree. This prevents build failures caused by one instance's "make clean" deleting generated files while another instance is building causing build failures. The lock file is placed in the rcutorture directory and added to .gitignore. Signed-off-by: Joel Fernandes Tested-by: Paul E. McKenney Signed-off-by: Boqun Feng --- tools/testing/selftests/rcutorture/.gitignore | 1 + tools/testing/selftests/rcutorture/bin/kvm.sh | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/rcutorture/.gitignore b/tools/testing/selftests/rcutorture/.gitignore index f6cbce77460b..b8fd42547a6e 100644 --- a/tools/testing/selftests/rcutorture/.gitignore +++ b/tools/testing/selftests/rcutorture/.gitignore @@ -3,3 +3,4 @@ initrd b[0-9]* res *.swp +.kvm.sh.lock diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index fff15821c44c..d1fbd092e22a 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -275,6 +275,23 @@ do shift done +# Prevent concurrent kvm.sh runs on the same source tree. The flock +# is automatically released when the script exits, even if killed. +TORTURE_LOCK="$RCUTORTURE/.kvm.sh.lock" +if test -z "$dryrun" +then + # Create a file descriptor and flock it, so that when kvm.sh (and its + # children) exit, the flock is released by the kernel automatically. + exec 9>"$TORTURE_LOCK" + if ! flock -n 9 + then + echo "ERROR: Another kvm.sh instance is already running on this tree." + echo " Lock file: $TORTURE_LOCK" + echo " To run kvm.sh, kill all existing kvm.sh runs first." + exit 1 + fi +fi + if test -n "$dryrun" || test -z "$TORTURE_INITRD" || tools/testing/selftests/rcutorture/bin/mkinitrd.sh then : -- cgit v1.2.3 From cf587c6ff2d09866eb53a4620bc1aa561fb0c000 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 1 Jan 2026 11:34:17 -0500 Subject: rcutorture: Add --kill-previous option to terminate previous kvm.sh runs When kvm.sh is killed, its child processes (make, gcc, qemu, etc.) may continue running. This prevents new kvm.sh instances from starting even though the parent is gone. Add a --kill-previous option that uses fuser(1) to terminate all processes holding the flock file before attempting to acquire it. This provides a clean way to recover from stale/zombie kvm.sh runs which sometimes may have lots of qemu and compiler processes still disturbing. Signed-off-by: Joel Fernandes Tested-by: Paul E. McKenney Signed-off-by: Boqun Feng --- tools/testing/selftests/rcutorture/bin/kvm.sh | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index d1fbd092e22a..65b04b832733 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -80,6 +80,7 @@ usage () { echo " --kasan" echo " --kconfig Kconfig-options" echo " --kcsan" + echo " --kill-previous" echo " --kmake-arg kernel-make-arguments" echo " --mac nn:nn:nn:nn:nn:nn" echo " --memory megabytes|nnnG" @@ -206,6 +207,9 @@ do --kcsan) TORTURE_KCONFIG_KCSAN_ARG="$debuginfo CONFIG_KCSAN=y CONFIG_KCSAN_STRICT=y CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y"; export TORTURE_KCONFIG_KCSAN_ARG ;; + --kill-previous) + TORTURE_KILL_PREVIOUS=1 + ;; --kmake-arg|--kmake-args) checkarg --kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$' TORTURE_KMAKE_ARG="`echo "$TORTURE_KMAKE_ARG $2" | sed -e 's/^ *//' -e 's/ *$//'`" @@ -278,6 +282,25 @@ done # Prevent concurrent kvm.sh runs on the same source tree. The flock # is automatically released when the script exits, even if killed. TORTURE_LOCK="$RCUTORTURE/.kvm.sh.lock" + +# Terminate any processes holding the lock file, if requested. +if test -n "$TORTURE_KILL_PREVIOUS" +then + if test -e "$TORTURE_LOCK" + then + echo "Killing processes holding $TORTURE_LOCK..." + if fuser -k "$TORTURE_LOCK" >/dev/null 2>&1 + then + sleep 2 + echo "Previous kvm.sh processes killed." + else + echo "No processes were holding the lock." + fi + else + echo "No lock file exists, nothing to kill." + fi +fi + if test -z "$dryrun" then # Create a file descriptor and flock it, so that when kvm.sh (and its @@ -287,7 +310,7 @@ then then echo "ERROR: Another kvm.sh instance is already running on this tree." echo " Lock file: $TORTURE_LOCK" - echo " To run kvm.sh, kill all existing kvm.sh runs first." + echo " To run kvm.sh, kill all existing kvm.sh runs first (--kill-previous)." exit 1 fi fi -- cgit v1.2.3 From f2546eba53bbe38c4bb950f78625ccf4b1a2cbc8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 15 Dec 2025 16:56:15 -0800 Subject: cxl/mem: Drop @host argument to devm_cxl_add_memdev() In all cases the device that created the 'struct cxl_dev_state' instance is also the device to host the devm cleanup of devm_cxl_add_memdev(). This simplifies the function prototype, and limits a degree of freedom of the API. Cc: Smita Koralahalli Reviewed-by: Jonathan Cameron Reviewed-by: Alison Schofield Reviewed-by: Dave Jiang Reviewed-by: Ben Cheatham Tested-by: Alejandro Lucero Link: https://patch.msgid.link/20251216005616.3090129-6-dan.j.williams@intel.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/cxl/core/memdev.c | 3 +-- drivers/cxl/cxlmem.h | 6 ++---- drivers/cxl/mem.c | 9 +++++---- drivers/cxl/pci.c | 2 +- tools/testing/cxl/test/mem.c | 2 +- 5 files changed, 10 insertions(+), 12 deletions(-) (limited to 'tools/testing') diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index 92aea95859fb..935a163f1527 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -1093,8 +1093,7 @@ static struct cxl_memdev *cxl_memdev_autoremove(struct cxl_memdev *cxlmd) * Core helper for devm_cxl_add_memdev() that wants to both create a device and * assert to the caller that upon return cxl_mem::probe() has been invoked. */ -struct cxl_memdev *__devm_cxl_add_memdev(struct device *host, - struct cxl_dev_state *cxlds) +struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds) { struct device *dev; int rc; diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 012e68acad34..9db31c7993c4 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -95,10 +95,8 @@ static inline bool is_cxl_endpoint(struct cxl_port *port) return is_cxl_memdev(port->uport_dev); } -struct cxl_memdev *__devm_cxl_add_memdev(struct device *host, - struct cxl_dev_state *cxlds); -struct cxl_memdev *devm_cxl_add_memdev(struct device *host, - struct cxl_dev_state *cxlds); +struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds); +struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds); int devm_cxl_sanitize_setup_notifier(struct device *host, struct cxl_memdev *cxlmd); struct cxl_memdev_state; diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index d62931526fd4..677996c65272 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -165,17 +165,18 @@ static int cxl_mem_probe(struct device *dev) /** * devm_cxl_add_memdev - Add a CXL memory device - * @host: devres alloc/release context and parent for the memdev * @cxlds: CXL device state to associate with the memdev * * Upon return the device will have had a chance to attach to the * cxl_mem driver, but may fail if the CXL topology is not ready * (hardware CXL link down, or software platform CXL root not attached) + * + * The parent of the resulting device and the devm context for allocations is + * @cxlds->dev. */ -struct cxl_memdev *devm_cxl_add_memdev(struct device *host, - struct cxl_dev_state *cxlds) +struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds) { - return __devm_cxl_add_memdev(host, cxlds); + return __devm_cxl_add_memdev(cxlds); } EXPORT_SYMBOL_NS_GPL(devm_cxl_add_memdev, "CXL"); diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 0be4e508affe..1c6fc5334806 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -1006,7 +1006,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (rc) dev_dbg(&pdev->dev, "No CXL Features discovered\n"); - cxlmd = devm_cxl_add_memdev(&pdev->dev, cxlds); + cxlmd = devm_cxl_add_memdev(cxlds); if (IS_ERR(cxlmd)) return PTR_ERR(cxlmd); diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index 176dcde570cd..8a22b7601627 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -1767,7 +1767,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) cxl_mock_add_event_logs(&mdata->mes); - cxlmd = devm_cxl_add_memdev(&pdev->dev, cxlds); + cxlmd = devm_cxl_add_memdev(cxlds); if (IS_ERR(cxlmd)) return PTR_ERR(cxlmd); -- cgit v1.2.3 From 29317f8dc6ed601ec54575689c2cd55cc470bcce Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 15 Dec 2025 16:56:16 -0800 Subject: cxl/mem: Introduce cxl_memdev_attach for CXL-dependent operation Unlike the cxl_pci class driver that opportunistically enables memory expansion with no other dependent functionality, CXL accelerator drivers have distinct PCIe-only and CXL-enhanced operation states. If CXL is available some additional coherent memory/cache operations can be enabled, otherwise traditional DMA+MMIO over PCIe/CXL.io is a fallback. This constitutes a new mode of operation where the caller of devm_cxl_add_memdev() wants to make a "go/no-go" decision about running in CXL accelerated mode or falling back to PCIe-only operation. Part of that decision making process likely also includes additional CXL-acceleration-specific resource setup. Encapsulate both of those requirements into 'struct cxl_memdev_attach' that provides a ->probe() callback. The probe callback runs in cxl_mem_probe() context, after the port topology is successfully attached for the given memdev. It supports a contract where, upon successful return from devm_cxl_add_memdev(), everything needed for CXL accelerated operation has been enabled. Additionally the presence of @cxlmd->attach indicates that the accelerator driver be detached when CXL operation ends. This conceptually makes a CXL link loss event mirror a PCIe link loss event which results in triggering the ->remove() callback of affected devices+drivers. A driver can re-attach to recover back to PCIe-only operation. Live recovery, i.e. without a ->remove()/->probe() cycle, is left as a future consideration. [ dj: Repalce with updated commit log from Dan ] Cc: Smita Koralahalli Reviewed-by: Ben Cheatham Reviewed-by: Dave Jiang Tested-by: Alejandro Lucero Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251216005616.3090129-7-dan.j.williams@intel.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/cxl/core/memdev.c | 33 +++++++++++++++++++++++++++++---- drivers/cxl/cxlmem.h | 12 ++++++++++-- drivers/cxl/mem.c | 20 ++++++++++++++++---- drivers/cxl/pci.c | 2 +- tools/testing/cxl/test/mem.c | 2 +- 5 files changed, 57 insertions(+), 12 deletions(-) (limited to 'tools/testing') diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index 935a163f1527..af3d0cc65138 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -641,14 +641,24 @@ static void detach_memdev(struct work_struct *work) struct cxl_memdev *cxlmd; cxlmd = container_of(work, typeof(*cxlmd), detach_work); - device_release_driver(&cxlmd->dev); + + /* + * When the creator of @cxlmd sets ->attach it indicates CXL operation + * is required. In that case, @cxlmd detach escalates to parent device + * detach. + */ + if (cxlmd->attach) + device_release_driver(cxlmd->dev.parent); + else + device_release_driver(&cxlmd->dev); put_device(&cxlmd->dev); } static struct lock_class_key cxl_memdev_key; static struct cxl_memdev *cxl_memdev_alloc(struct cxl_dev_state *cxlds, - const struct file_operations *fops) + const struct file_operations *fops, + const struct cxl_memdev_attach *attach) { struct cxl_memdev *cxlmd; struct device *dev; @@ -664,6 +674,8 @@ static struct cxl_memdev *cxl_memdev_alloc(struct cxl_dev_state *cxlds, goto err; cxlmd->id = rc; cxlmd->depth = -1; + cxlmd->attach = attach; + cxlmd->endpoint = ERR_PTR(-ENXIO); dev = &cxlmd->dev; device_initialize(dev); @@ -1081,6 +1093,18 @@ static struct cxl_memdev *cxl_memdev_autoremove(struct cxl_memdev *cxlmd) { int rc; + /* + * If @attach is provided fail if the driver is not attached upon + * return. Note that failure here could be the result of a race to + * teardown the CXL port topology. I.e. cxl_mem_probe() could have + * succeeded and then cxl_mem unbound before the lock is acquired. + */ + guard(device)(&cxlmd->dev); + if (cxlmd->attach && !cxlmd->dev.driver) { + cxl_memdev_unregister(cxlmd); + return ERR_PTR(-ENXIO); + } + rc = devm_add_action_or_reset(cxlmd->cxlds->dev, cxl_memdev_unregister, cxlmd); if (rc) @@ -1093,13 +1117,14 @@ static struct cxl_memdev *cxl_memdev_autoremove(struct cxl_memdev *cxlmd) * Core helper for devm_cxl_add_memdev() that wants to both create a device and * assert to the caller that upon return cxl_mem::probe() has been invoked. */ -struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds) +struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds, + const struct cxl_memdev_attach *attach) { struct device *dev; int rc; struct cxl_memdev *cxlmd __free(put_cxlmd) = - cxl_memdev_alloc(cxlds, &cxl_memdev_fops); + cxl_memdev_alloc(cxlds, &cxl_memdev_fops, attach); if (IS_ERR(cxlmd)) return cxlmd; diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 9db31c7993c4..ef202b34e5ea 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -34,6 +34,10 @@ (FIELD_GET(CXLMDEV_RESET_NEEDED_MASK, status) != \ CXLMDEV_RESET_NEEDED_NOT) +struct cxl_memdev_attach { + int (*probe)(struct cxl_memdev *cxlmd); +}; + /** * struct cxl_memdev - CXL bus object representing a Type-3 Memory Device * @dev: driver core device object @@ -43,6 +47,7 @@ * @cxl_nvb: coordinate removal of @cxl_nvd if present * @cxl_nvd: optional bridge to an nvdimm if the device supports pmem * @endpoint: connection to the CXL port topology for this memory device + * @attach: creator of this memdev depends on CXL link attach to operate * @id: id number of this memdev instance. * @depth: endpoint port depth * @scrub_cycle: current scrub cycle set for this device @@ -59,6 +64,7 @@ struct cxl_memdev { struct cxl_nvdimm_bridge *cxl_nvb; struct cxl_nvdimm *cxl_nvd; struct cxl_port *endpoint; + const struct cxl_memdev_attach *attach; int id; int depth; u8 scrub_cycle; @@ -95,8 +101,10 @@ static inline bool is_cxl_endpoint(struct cxl_port *port) return is_cxl_memdev(port->uport_dev); } -struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds); -struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds); +struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds, + const struct cxl_memdev_attach *attach); +struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds, + const struct cxl_memdev_attach *attach); int devm_cxl_sanitize_setup_notifier(struct device *host, struct cxl_memdev *cxlmd); struct cxl_memdev_state; diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index 677996c65272..333c366b69e7 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -142,6 +142,12 @@ static int cxl_mem_probe(struct device *dev) return rc; } + if (cxlmd->attach) { + rc = cxlmd->attach->probe(cxlmd); + if (rc) + return rc; + } + rc = devm_cxl_memdev_edac_register(cxlmd); if (rc) dev_dbg(dev, "CXL memdev EDAC registration failed rc=%d\n", rc); @@ -166,17 +172,23 @@ static int cxl_mem_probe(struct device *dev) /** * devm_cxl_add_memdev - Add a CXL memory device * @cxlds: CXL device state to associate with the memdev + * @attach: Caller depends on CXL topology attachment * * Upon return the device will have had a chance to attach to the - * cxl_mem driver, but may fail if the CXL topology is not ready - * (hardware CXL link down, or software platform CXL root not attached) + * cxl_mem driver, but may fail to attach if the CXL topology is not ready + * (hardware CXL link down, or software platform CXL root not attached). + * + * When @attach is NULL it indicates the caller wants the memdev to remain + * registered even if it does not immediately attach to the CXL hierarchy. When + * @attach is provided a cxl_mem_probe() failure leads to failure of this routine. * * The parent of the resulting device and the devm context for allocations is * @cxlds->dev. */ -struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds) +struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds, + const struct cxl_memdev_attach *attach) { - return __devm_cxl_add_memdev(cxlds); + return __devm_cxl_add_memdev(cxlds, attach); } EXPORT_SYMBOL_NS_GPL(devm_cxl_add_memdev, "CXL"); diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 1c6fc5334806..549368a9c868 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -1006,7 +1006,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (rc) dev_dbg(&pdev->dev, "No CXL Features discovered\n"); - cxlmd = devm_cxl_add_memdev(cxlds); + cxlmd = devm_cxl_add_memdev(cxlds, NULL); if (IS_ERR(cxlmd)) return PTR_ERR(cxlmd); diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index 8a22b7601627..cb87e8c0e63c 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -1767,7 +1767,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) cxl_mock_add_event_logs(&mdata->mes); - cxlmd = devm_cxl_add_memdev(cxlds); + cxlmd = devm_cxl_add_memdev(cxlds, NULL); if (IS_ERR(cxlmd)) return PTR_ERR(cxlmd); -- cgit v1.2.3 From fb36d71308a770268c771d6697f22615e5ddbd6e Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 19 Dec 2025 15:29:42 +0000 Subject: kselftest/arm64: Support FORCE_TARGETS The top level kselftest Makefile supports an option FORCE_TARGETS which causes any failures during the build to be propagated to the exit status of the top level make, useful during build testing. Currently the recursion done by the arm64 selftests ignores this option, meaning arm64 failures are not reported via this mechanism. Add the logic to implement FORCE_TARGETS so that it works for arm64. Signed-off-by: Mark Brown Acked-by: Shuah Khan Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/arm64/Makefile b/tools/testing/selftests/arm64/Makefile index c4c72ee2ef55..e456f3b62fa1 100644 --- a/tools/testing/selftests/arm64/Makefile +++ b/tools/testing/selftests/arm64/Makefile @@ -30,13 +30,15 @@ all: @for DIR in $(ARM64_SUBTARGETS); do \ BUILD_TARGET=$(OUTPUT)/$$DIR; \ mkdir -p $$BUILD_TARGET; \ - make OUTPUT=$$BUILD_TARGET -C $$DIR $@; \ + make OUTPUT=$$BUILD_TARGET -C $$DIR $@ \ + $(if $(FORCE_TARGETS),|| exit); \ done install: all @for DIR in $(ARM64_SUBTARGETS); do \ BUILD_TARGET=$(OUTPUT)/$$DIR; \ - make OUTPUT=$$BUILD_TARGET -C $$DIR $@; \ + make OUTPUT=$$BUILD_TARGET -C $$DIR $@ \ + $(if $(FORCE_TARGETS),|| exit); \ done run_tests: all -- cgit v1.2.3 From 5c7a4741431b0a938dcbd22b90a4dc9a2903fc00 Mon Sep 17 00:00:00 2001 From: Ryota Sakamoto Date: Tue, 6 Jan 2026 01:41:01 +0900 Subject: kunit: respect KBUILD_OUTPUT env variable by default Currently, kunit.py ignores the KBUILD_OUTPUT env variable and always defaults to .kunit in the working directory. This behavior is inconsistent with standard Kbuild behavior, where KBUILD_OUTPUT defines the build artifact location. This patch modifies kunit.py to respect KBUILD_OUTPUT if set. A .kunit subdirectory is created inside KBUILD_OUTPUT to avoid polluting the build directory. Link: https://lore.kernel.org/r/20260106-kunit-kbuild_output-v2-1-582281797343@gmail.com Reviewed-by: David Gow Signed-off-by: Ryota Sakamoto Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit.py | 7 ++++++- tools/testing/kunit/kunit_tool_test.py | 19 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py index cd99c1956331..e3d82a038f93 100755 --- a/tools/testing/kunit/kunit.py +++ b/tools/testing/kunit/kunit.py @@ -323,11 +323,16 @@ def get_default_jobs() -> int: return ncpu raise RuntimeError("os.cpu_count() returned None") +def get_default_build_dir() -> str: + if 'KBUILD_OUTPUT' in os.environ: + return os.path.join(os.environ['KBUILD_OUTPUT'], '.kunit') + return '.kunit' + def add_common_opts(parser: argparse.ArgumentParser) -> None: parser.add_argument('--build_dir', help='As in the make command, it specifies the build ' 'directory.', - type=str, default='.kunit', metavar='DIR') + type=str, default=get_default_build_dir(), metavar='DIR') parser.add_argument('--make_options', help='X=Y make option, can be repeated.', action='append', metavar='X=Y') diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index bbba921e0eac..a55b5085310d 100755 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -601,6 +601,7 @@ class KUnitMainTest(unittest.TestCase): all_passed_log = file.readlines() self.print_mock = mock.patch('kunit_printer.Printer.print').start() + mock.patch.dict(os.environ, clear=True).start() self.addCleanup(mock.patch.stopall) self.mock_linux_init = mock.patch.object(kunit_kernel, 'LinuxSourceTree').start() @@ -723,6 +724,24 @@ class KUnitMainTest(unittest.TestCase): args=None, build_dir=build_dir, filter_glob='', filter='', filter_action=None, timeout=300) self.print_mock.assert_any_call(StrContains('Testing complete.')) + @mock.patch.dict(os.environ, {'KBUILD_OUTPUT': '/tmp'}) + def test_run_builddir_from_env(self): + build_dir = '/tmp/.kunit' + kunit.main(['run']) + self.assertEqual(self.linux_source_mock.build_reconfig.call_count, 1) + self.linux_source_mock.run_kernel.assert_called_once_with( + args=None, build_dir=build_dir, filter_glob='', filter='', filter_action=None, timeout=300) + self.print_mock.assert_any_call(StrContains('Testing complete.')) + + @mock.patch.dict(os.environ, {'KBUILD_OUTPUT': '/tmp'}) + def test_run_builddir_override(self): + build_dir = '.kunit' + kunit.main(['run', '--build_dir=.kunit']) + self.assertEqual(self.linux_source_mock.build_reconfig.call_count, 1) + self.linux_source_mock.run_kernel.assert_called_once_with( + args=None, build_dir=build_dir, filter_glob='', filter='', filter_action=None, timeout=300) + self.print_mock.assert_any_call(StrContains('Testing complete.')) + def test_config_builddir(self): build_dir = '.kunit' kunit.main(['config', '--build_dir', build_dir]) -- cgit v1.2.3 From 0c5b86c67fb6898d02c8f92de884186297fd302f Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 30 Dec 2025 13:26:35 +0100 Subject: kunit: tool: Add test for nested test result reporting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently there is a lack of tests validating the result reporting from nested tests. Add one, it will also be used to validate upcoming changes to the nested test parsing. Link: https://lore.kernel.org/r/20251230-kunit-nested-failure-v1-1-98cfbeb87823@linutronix.de Signed-off-by: Thomas Weißschuh Reviewed-by: Rae Moar Reviewed-by: David Gow Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit_tool_test.py | 10 ++++++++++ .../kunit/test_data/test_is_test_passed-failure-nested.log | 7 +++++++ 2 files changed, 17 insertions(+) create mode 100644 tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log (limited to 'tools/testing') diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index a55b5085310d..81a0996edef4 100755 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -165,6 +165,16 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(kunit_parser.TestStatus.FAILURE, result.status) self.assertEqual(result.counts.errors, 0) + def test_parse_failed_nested_tests_log(self): + nested_log = test_data_path('test_is_test_passed-failure-nested.log') + with open(nested_log) as file: + result = kunit_parser.parse_run_tests(file.readlines(), stdout) + self.assertEqual(kunit_parser.TestStatus.FAILURE, result.status) + self.assertEqual(result.counts.failed, 2) + self.assertEqual(kunit_parser.TestStatus.FAILURE, result.subtests[0].status) + self.assertEqual(kunit_parser.TestStatus.FAILURE, result.subtests[1].status) + self.assertEqual(kunit_parser.TestStatus.FAILURE, result.subtests[1].subtests[0].status) + def test_no_header(self): empty_log = test_data_path('test_is_test_passed-no_tests_run_no_header.log') with open(empty_log) as file: diff --git a/tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log b/tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log new file mode 100644 index 000000000000..2e528da39ab5 --- /dev/null +++ b/tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log @@ -0,0 +1,7 @@ +KTAP version 1 +1..2 +not ok 1 subtest 1 + KTAP version 1 + 1..1 + not ok 1 subsubtest 1 +not ok 2 subtest 2 -- cgit v1.2.3 From 85aff81b0dba7c42d226d9f7c11c4d30a7906878 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 30 Dec 2025 13:26:36 +0100 Subject: kunit: tool: Don't overwrite test status based on subtest counts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a subtest itself reports success, but the outer testcase fails, the whole testcase should be reported as a failure. However the status is recalculated based on the test counts, overwriting the outer test result. Synthesize a failed test in this case to make sure the failure is not swallowed. Link: https://lore.kernel.org/r/20251230-kunit-nested-failure-v1-2-98cfbeb87823@linutronix.de Signed-off-by: Thomas Weißschuh Reviewed-by: David Gow Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit_parser.py | 3 +++ tools/testing/kunit/kunit_tool_test.py | 1 + tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log | 3 +++ 3 files changed, 7 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/kunit/kunit_parser.py b/tools/testing/kunit/kunit_parser.py index 333cd3a4a56b..5338489dcbe4 100644 --- a/tools/testing/kunit/kunit_parser.py +++ b/tools/testing/kunit/kunit_parser.py @@ -689,6 +689,9 @@ def bubble_up_test_results(test: Test) -> None: elif test.counts.get_status() == TestStatus.TEST_CRASHED: test.status = TestStatus.TEST_CRASHED + if status == TestStatus.FAILURE and test.counts.get_status() == TestStatus.SUCCESS: + counts.add_status(status) + def parse_test(lines: LineStream, expected_num: int, log: List[str], is_subtest: bool, printer: Printer) -> Test: """ Finds next test to parse in LineStream, creates new Test object, diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index 81a0996edef4..bdc51b5c7b10 100755 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -172,6 +172,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(kunit_parser.TestStatus.FAILURE, result.status) self.assertEqual(result.counts.failed, 2) self.assertEqual(kunit_parser.TestStatus.FAILURE, result.subtests[0].status) + self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.subtests[0].subtests[0].status) self.assertEqual(kunit_parser.TestStatus.FAILURE, result.subtests[1].status) self.assertEqual(kunit_parser.TestStatus.FAILURE, result.subtests[1].subtests[0].status) diff --git a/tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log b/tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log index 2e528da39ab5..5498dfd0b0db 100644 --- a/tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log +++ b/tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log @@ -1,5 +1,8 @@ KTAP version 1 1..2 + KTAP version 1 + 1..1 + ok 1 test 1 not ok 1 subtest 1 KTAP version 1 1..1 -- cgit v1.2.3 From ab150c2bbafe9425759eca1e45e08d4ad1456818 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Fri, 2 Jan 2026 08:20:39 +0100 Subject: kunit: qemu_configs: Add 32-bit big endian ARM configuration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a basic config to run kunit tests on 32-bit big endian ARM. Link: https://lore.kernel.org/r/20260102-kunit-armeb-v1-1-e8e5475d735c@linutronix.de Signed-off-by: Thomas Weißschuh Reviewed-by: David Gow Signed-off-by: Shuah Khan --- tools/testing/kunit/qemu_configs/armeb.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 tools/testing/kunit/qemu_configs/armeb.py (limited to 'tools/testing') diff --git a/tools/testing/kunit/qemu_configs/armeb.py b/tools/testing/kunit/qemu_configs/armeb.py new file mode 100644 index 000000000000..86d326651490 --- /dev/null +++ b/tools/testing/kunit/qemu_configs/armeb.py @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0 + +from ..qemu_config import QemuArchParams + +QEMU_ARCH = QemuArchParams(linux_arch='arm', + kconfig=''' +CONFIG_CPU_BIG_ENDIAN=y +CONFIG_ARCH_VIRT=y +CONFIG_SERIAL_AMBA_PL010=y +CONFIG_SERIAL_AMBA_PL010_CONSOLE=y +CONFIG_SERIAL_AMBA_PL011=y +CONFIG_SERIAL_AMBA_PL011_CONSOLE=y''', + qemu_arch='arm', + kernel_path='arch/arm/boot/zImage', + kernel_command_line='console=ttyAMA0', + extra_qemu_params=['-machine', 'virt']) -- cgit v1.2.3 From ca7206b6ad029d2c35e64f1ea81dba385496e630 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sat, 20 Dec 2025 14:55:54 +0100 Subject: selftests/nolibc: test compatibility of nolibc and kernel time types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Keeping 'struct timespec' and 'struct __kernel_timespec' compatible allows the source code to stay simple. Validate that the types stay compatible. The test is specific to nolibc and does not compile on other libcs, so skip it there. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Reviewed-by: Arnd Bergmann Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-10-c662992f75d7@weissschuh.net --- tools/testing/selftests/nolibc/nolibc-test.c | 29 ++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 6888b20af259..3986d55a6ff6 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -1430,6 +1430,34 @@ int test_difftime(void) return 0; } +int test_time_types(void) +{ +#ifdef NOLIBC + struct __kernel_timespec kts; + struct timespec ts; + + if (!__builtin_types_compatible_p(time_t, __kernel_time64_t)) + return 1; + + if (sizeof(ts) != sizeof(kts)) + return 1; + + if (!__builtin_types_compatible_p(__typeof__(ts.tv_sec), __typeof__(kts.tv_sec))) + return 1; + + if (!__builtin_types_compatible_p(__typeof__(ts.tv_nsec), __typeof__(kts.tv_nsec))) + return 1; + + if (offsetof(__typeof__(ts), tv_sec) != offsetof(__typeof__(kts), tv_sec)) + return 1; + + if (offsetof(__typeof__(ts), tv_nsec) != offsetof(__typeof__(kts), tv_nsec)) + return 1; +#endif /* NOLIBC */ + + return 0; +} + int run_stdlib(int min, int max) { int test; @@ -1555,6 +1583,7 @@ int run_stdlib(int min, int max) CASE_TEST(difftime); EXPECT_ZR(1, test_difftime()); break; CASE_TEST(memchr_foobar6_o); EXPECT_STREQ(1, memchr("foobar", 'o', 6), "oobar"); break; CASE_TEST(memchr_foobar3_b); EXPECT_STRZR(1, memchr("foobar", 'b', 3)); break; + CASE_TEST(time_types); EXPECT_ZR(is_nolibc, test_time_types()); break; case __LINE__: return ret; /* must be last */ -- cgit v1.2.3 From 03139924859f7930cd1667a278a560b5d4c6a672 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sun, 4 Jan 2026 15:57:51 +0100 Subject: selftests/nolibc: drop NOLIBC_SYSROOT=0 logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This logic was added in commit 850fad7de827 ("selftests/nolibc: allow test -include /path/to/nolibc.h") to allow the testing of -include /path/to/nolibc.h. As it requires as special variable to activate, this code is nearly never used. Furthermore it complicates the logic a bit. Since commit a6a054c8ad32 ("tools/nolibc: add target to check header usability") and commit 443c6467fcd6 ("selftests/nolibc: always run nolibc header check") the usability of -include /path/to/nolibc.h is always checked anyways, making NOLIBC_SYSROOT=0 pointless. Drop the special logic. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Link: https://patch.msgid.link/20260104-nolibc-nolibc_sysroot-v1-1-98025ad99add@weissschuh.net --- tools/testing/selftests/nolibc/Makefile.nolibc | 6 ------ 1 file changed, 6 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/nolibc/Makefile.nolibc b/tools/testing/selftests/nolibc/Makefile.nolibc index f9d43cbdc894..b17ba2f8fb46 100644 --- a/tools/testing/selftests/nolibc/Makefile.nolibc +++ b/tools/testing/selftests/nolibc/Makefile.nolibc @@ -302,15 +302,9 @@ sysroot/$(ARCH)/include: $(Q)$(MAKE) -C $(srctree)/tools/include/nolibc ARCH=$(ARCH) OUTPUT=$(CURDIR)/sysroot/ headers_standalone headers_check $(Q)mv sysroot/sysroot sysroot/$(ARCH) -ifneq ($(NOLIBC_SYSROOT),0) nolibc-test: nolibc-test.c nolibc-test-linkage.c sysroot/$(ARCH)/include $(QUIET_CC)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ \ -nostdlib -nostdinc -static -Isysroot/$(ARCH)/include nolibc-test.c nolibc-test-linkage.c $(LIBGCC) -else -nolibc-test: nolibc-test.c nolibc-test-linkage.c - $(QUIET_CC)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ \ - -nostdlib -static -include $(srctree)/tools/include/nolibc/nolibc.h nolibc-test.c nolibc-test-linkage.c $(LIBGCC) -endif libc-test: nolibc-test.c nolibc-test-linkage.c $(QUIET_CC)$(HOSTCC) -o $@ nolibc-test.c nolibc-test-linkage.c -- cgit v1.2.3 From 6b6dbf3e4ecfd7d1086bd7cd8b31ca8e45d4dc1f Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 6 Jan 2026 11:39:55 +0100 Subject: selftests/nolibc: always build sparc32 tests with -mcpu=v8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since LLVM commit 39e30508a7f6 ("[Driver][Sparc] Default to -mcpu=v9 for 32-bit Linux/sparc64 (#109278)"), clang defaults to -mcpu=v9 for 32-bit SPARC builds. -mcpu=v9 generates instructions which are not recognized by qemu-sparc and qemu-system-sparc. Explicitly enforce -mcpu=v8 to generate compatible code. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Link: https://patch.msgid.link/20260106-nolibc-sparc32-fix-v2-1-7c5cd6b175c2@weissschuh.net --- tools/testing/selftests/nolibc/Makefile.nolibc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/nolibc/Makefile.nolibc b/tools/testing/selftests/nolibc/Makefile.nolibc index b17ba2f8fb46..f5704193038f 100644 --- a/tools/testing/selftests/nolibc/Makefile.nolibc +++ b/tools/testing/selftests/nolibc/Makefile.nolibc @@ -226,7 +226,7 @@ CFLAGS_mipsn32be = -EB -mabi=n32 -march=mips64r6 CFLAGS_mips64le = -EL -mabi=64 -march=mips64r6 CFLAGS_mips64be = -EB -mabi=64 -march=mips64r2 CFLAGS_loongarch = $(if $(LLVM),-fuse-ld=lld) -CFLAGS_sparc32 = $(call cc-option,-m32) +CFLAGS_sparc32 = $(call cc-option,-m32) -mcpu=v8 CFLAGS_sh4 = -ml -m4 ifeq ($(origin XARCH),command line) CFLAGS_XARCH = $(CFLAGS_$(XARCH)) -- cgit v1.2.3 From ab86d0bf01f6d0e37fd67761bb62918321b64efc Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Mon, 5 Jan 2026 12:47:46 +0100 Subject: selftests/bpf: Update xdp_context_test_run test to check maximum metadata size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the selftest to check that the metadata size check takes the xdp_frame size into account in bpf_prog_test_run. The original check (for meta size 256) was broken because the data frame supplied was smaller than this, triggering a different EINVAL return. So supply a larger data frame for this test to make sure we actually exercise the check we think we are. Signed-off-by: Toke Høiland-Jørgensen Reviewed-by: Amery Hung Link: https://lore.kernel.org/r/20260105114747.1358750-2-toke@redhat.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/xdp_context_test_run.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c index ee94c281888a..26159e0499c7 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c @@ -47,6 +47,7 @@ void test_xdp_context_test_run(void) struct test_xdp_context_test_run *skel = NULL; char data[sizeof(pkt_v4) + sizeof(__u32)]; char bad_ctx[sizeof(struct xdp_md) + 1]; + char large_data[256]; struct xdp_md ctx_in, ctx_out; DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts, .data_in = &data, @@ -94,9 +95,6 @@ void test_xdp_context_test_run(void) test_xdp_context_error(prog_fd, opts, 4, sizeof(__u32), sizeof(data), 0, 0, 0); - /* Meta data must be 255 bytes or smaller */ - test_xdp_context_error(prog_fd, opts, 0, 256, sizeof(data), 0, 0, 0); - /* Total size of data must be data_end - data_meta or larger */ test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32), sizeof(data) + 1, 0, 0, 0); @@ -116,6 +114,16 @@ void test_xdp_context_test_run(void) test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32), sizeof(data), 0, 0, 1); + /* Meta data must be 216 bytes or smaller (256 - sizeof(struct + * xdp_frame)). Test both nearest invalid size and nearest invalid + * 4-byte-aligned size, and make sure data_in is large enough that we + * actually hit the check on metadata length + */ + opts.data_in = large_data; + opts.data_size_in = sizeof(large_data); + test_xdp_context_error(prog_fd, opts, 0, 217, sizeof(large_data), 0, 0, 0); + test_xdp_context_error(prog_fd, opts, 0, 220, sizeof(large_data), 0, 0, 0); + test_xdp_context_test_run__destroy(skel); } -- cgit v1.2.3 From cb3de96eea66f5e4a580086c6a1be46e765f97f4 Mon Sep 17 00:00:00 2001 From: Yumei Huang Date: Sun, 4 Jan 2026 11:23:57 +0800 Subject: ipv6: preserve insertion order for same-scope addresses IPv6 addresses with the same scope are returned in reverse insertion order, unlike IPv4. For example, when adding a -> b -> c, the list is reported as c -> b -> a, while IPv4 preserves the original order. This behavior causes: a. When using `ip -6 a save` and `ip -6 a restore`, addresses are restored in the opposite order from which they were saved. See example below showing addresses added as 1::1, 1::2, 1::3 but displayed and saved in reverse order. # ip -6 a a 1::1 dev x # ip -6 a a 1::2 dev x # ip -6 a a 1::3 dev x # ip -6 a s dev x 2: x: mtu 1500 qdisc noop state DOWN group default qlen 1000 inet6 1::3/128 scope global tentative valid_lft forever preferred_lft forever inet6 1::2/128 scope global tentative valid_lft forever preferred_lft forever inet6 1::1/128 scope global tentative valid_lft forever preferred_lft forever # ip -6 a save > dump # ip -6 a d 1::1 dev x # ip -6 a d 1::2 dev x # ip -6 a d 1::3 dev x # ip a d ::1 dev lo # ip a restore < dump # ip -6 a s dev x 2: x: mtu 1500 qdisc noop state DOWN group default qlen 1000 inet6 1::1/128 scope global tentative valid_lft forever preferred_lft forever inet6 1::2/128 scope global tentative valid_lft forever preferred_lft forever inet6 1::3/128 scope global tentative valid_lft forever preferred_lft forever # ip a showdump < dump if1: inet6 ::1/128 scope host proto kernel_lo valid_lft forever preferred_lft forever if2: inet6 1::3/128 scope global tentative valid_lft forever preferred_lft forever if2: inet6 1::2/128 scope global tentative valid_lft forever preferred_lft forever if2: inet6 1::1/128 scope global tentative valid_lft forever preferred_lft forever b. Addresses in pasta to appear in reversed order compared to host addresses. The ipv6 addresses were added in reverse order by commit e55ffac60117 ("[IPV6]: order addresses by scope"), then it was changed by commit 502a2ffd7376 ("ipv6: convert idev_list to list macros"), and restored by commit b54c9b98bbfb ("ipv6: Preserve pervious behavior in ipv6_link_dev_addr()."). However, this reverse ordering within the same scope causes inconsistency with IPv4 and the issues described above. This patch aligns IPv6 address ordering with IPv4 for consistency by changing the comparison from >= to > when inserting addresses into the address list. Also updates the ioam6 selftest to reflect the new address ordering behavior. Combine these two changes into one patch for bisectability. Link: https://bugs.passt.top/show_bug.cgi?id=175 Suggested-by: Stefano Brivio Signed-off-by: Yumei Huang Acked-by: Justin Iurman Reviewed-by: David Ahern Link: https://patch.msgid.link/20260104032357.38555-1-yuhuang@redhat.com Signed-off-by: Jakub Kicinski --- net/ipv6/addrconf.c | 2 +- tools/testing/selftests/net/ioam6.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index b66217d1b2f8..fe64988a23ab 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1013,7 +1013,7 @@ ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) list_for_each(p, &idev->addr_list) { struct inet6_ifaddr *ifa = list_entry(p, struct inet6_ifaddr, if_list); - if (ifp_scope >= ipv6_addr_src_scope(&ifa->addr)) + if (ifp_scope > ipv6_addr_src_scope(&ifa->addr)) break; } diff --git a/tools/testing/selftests/net/ioam6.sh b/tools/testing/selftests/net/ioam6.sh index 845c26dd01a9..b2b99889942f 100755 --- a/tools/testing/selftests/net/ioam6.sh +++ b/tools/testing/selftests/net/ioam6.sh @@ -273,8 +273,8 @@ setup() ip -netns $ioam_node_beta link set ioam-veth-betaR name veth1 &>/dev/null ip -netns $ioam_node_gamma link set ioam-veth-gamma name veth0 &>/dev/null - ip -netns $ioam_node_alpha addr add 2001:db8:1::50/64 dev veth0 &>/dev/null ip -netns $ioam_node_alpha addr add 2001:db8:1::2/64 dev veth0 &>/dev/null + ip -netns $ioam_node_alpha addr add 2001:db8:1::50/64 dev veth0 &>/dev/null ip -netns $ioam_node_alpha link set veth0 up &>/dev/null ip -netns $ioam_node_alpha link set lo up &>/dev/null ip -netns $ioam_node_alpha route add 2001:db8:2::/64 \ -- cgit v1.2.3 From c4df070a57def243dcf5773428dd1b51bc8337ef Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 4 Jan 2026 10:46:00 -0800 Subject: selftests: hw-net: rss-input-xfrm: try to enable the xfrm at the start The test currently SKIPs if the symmetric RSS xfrm is not enabled by default. This leads to spurious SKIPs in the Intel CI reporting results to NIPA. Testing on CX7: # ./drivers/net/hw/rss_input_xfrm.py TAP version 13 1..2 ok 1 rss_input_xfrm.test_rss_input_xfrm_ipv4 # SKIP Test requires IPv4 connectivity # Sym input xfrm already enabled: {'sym-or-xor'} ok 2 rss_input_xfrm.test_rss_input_xfrm_ipv6 # Totals: pass:1 fail:0 xfail:0 xpass:0 skip:1 error:0 # ethtool -X eth0 xfrm none # ./drivers/net/hw/rss_input_xfrm.py TAP version 13 1..2 ok 1 rss_input_xfrm.test_rss_input_xfrm_ipv4 # SKIP Test requires IPv4 connectivity # Sym input xfrm configured: {'sym-or-xor'} ok 2 rss_input_xfrm.test_rss_input_xfrm_ipv6 # Totals: pass:1 fail:0 xfail:0 xpass:0 skip:1 error:0 Link: https://patch.msgid.link/20260104184600.795280-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- .../selftests/drivers/net/hw/rss_input_xfrm.py | 44 +++++++++++++++++++--- 1 file changed, 38 insertions(+), 6 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py b/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py index 72880e388478..503f1a2a2872 100755 --- a/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py +++ b/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py @@ -5,9 +5,9 @@ import multiprocessing import socket from lib.py import ksft_run, ksft_exit, ksft_eq, ksft_ge, cmd, fd_read_timeout from lib.py import NetDrvEpEnv -from lib.py import EthtoolFamily, NetdevFamily +from lib.py import EthtoolFamily, NetdevFamily, NlError from lib.py import KsftSkipEx, KsftFailEx -from lib.py import rand_port +from lib.py import defer, ksft_pr, rand_port def traffic(cfg, local_port, remote_port, ipver): @@ -21,6 +21,40 @@ def traffic(cfg, local_port, remote_port, ipver): return sock.getsockopt(socket.SOL_SOCKET, socket.SO_INCOMING_CPU) +def _rss_input_xfrm_try_enable(cfg): + """ + Check if symmetric input-xfrm is already enabled, if not try to enable it + and register a cleanup. + """ + rss = cfg.ethnl.rss_get({'header': {'dev-name': cfg.ifname}}) + orig_xfrm = rss.get('input-xfrm', set()) + sym_xfrm = set(filter(lambda x: 'sym' in x, orig_xfrm)) + + if sym_xfrm: + ksft_pr("Sym input xfrm already enabled:", sym_xfrm) + return sym_xfrm + + for xfrm in cfg.ethnl.consts["input-xfrm"].entries: + # Skip non-symmetric transforms + if "sym" not in xfrm: + continue + + try_xfrm = {xfrm} | orig_xfrm + try: + cfg.ethnl.rss_set({"header": {"dev-index": cfg.ifindex}, + "input-xfrm": try_xfrm}) + except NlError: + continue + + ksft_pr("Sym input xfrm configured:", try_xfrm) + defer(cfg.ethnl.rss_set, + {"header": {"dev-index": cfg.ifindex}, + "input-xfrm": orig_xfrm}) + return {xfrm} + + return set() + + def test_rss_input_xfrm(cfg, ipver): """ Test symmetric input_xfrm. @@ -37,12 +71,10 @@ def test_rss_input_xfrm(cfg, ipver): if not hasattr(socket, "SO_INCOMING_CPU"): raise KsftSkipEx("socket.SO_INCOMING_CPU was added in Python 3.11") - rss = cfg.ethnl.rss_get({'header': {'dev-name': cfg.ifname}}) - input_xfrm = set(filter(lambda x: 'sym' in x, rss.get('input-xfrm', {}))) - # Check for symmetric xor/or-xor + input_xfrm = _rss_input_xfrm_try_enable(cfg) if not input_xfrm: - raise KsftSkipEx("Symmetric RSS hash not requested") + raise KsftSkipEx("Symmetric RSS hash not supported by device") cpus = set() successful = 0 -- cgit v1.2.3 From 3b7a108c4197f9fd0b593c6b4b0de457d9ed4c87 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 5 Jan 2026 12:25:02 -0500 Subject: selftests/net: packetdrill: add minimal client and server tests Introduce minimal tests. These can serve as simple illustrative examples, and as templates when writing new tests. When adding new cases, it can be easier to extend an existing base test rather than start from scratch. The existing tests all focus on real, often non-trivial, features. It is not obvious which to take as starting point, and arguably none really qualify. Add two tests - the client test performs the active open and initial close - the server test implements the passive open and final close Signed-off-by: Willem de Bruijn Link: https://patch.msgid.link/20260105172529.3514786-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- .../selftests/net/packetdrill/tcp_basic_client.pkt | 24 +++++++++++++++ .../selftests/net/packetdrill/tcp_basic_server.pkt | 35 ++++++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 tools/testing/selftests/net/packetdrill/tcp_basic_client.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_basic_server.pkt (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/packetdrill/tcp_basic_client.pkt b/tools/testing/selftests/net/packetdrill/tcp_basic_client.pkt new file mode 100644 index 000000000000..319f81dd717d --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_basic_client.pkt @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +// +// Minimal active open. +// First to close connection. + +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 + + // Connect to server: active open: three-way handshake + +0...0 connect(4, ..., ...) = 0 + +0 > S 0:0(0) + +0 < S. 0:0(0) ack 1 win 65535 + +0 > . 1:1(0) ack 1 + + // Send data + +0 send(4, ..., 1000, 0) = 1000 + +0 > P. 1:1001(1000) ack 1 + +0 < . 1:1(0) ack 1001 win 257 + + +0 close(4) = 0 + +0 > F. 1001:1001(0) ack 1 + +0 < F. 1:1(0) ack 1002 win 257 + +0 > . 1002:1002(0) ack 2 diff --git a/tools/testing/selftests/net/packetdrill/tcp_basic_server.pkt b/tools/testing/selftests/net/packetdrill/tcp_basic_server.pkt new file mode 100644 index 000000000000..e72a291b666e --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_basic_server.pkt @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 +// +// Minimal passive open. +// Peer is first to close. + +`./defaults.sh` + + // Open listener socket + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + // Incoming connection: passive open: three-way handshake + +0 < S 0:0(0) win 65535 + +0 > S. 0:0(0) ack 1 + +0 < . 1:1(0) ack 1 win 257 + + // Open connection socket and close listener socket + +0 accept(3, ..., ...) = 4 + +0 close(3) = 0 + + // Peer sends data: acknowledge and receive + +0 < P. 1:1001(1000) ack 1 win 257 + +0 > . 1:1(0) ack 1001 + +0 recv(4, ..., 1000, 0) = 1000 + + // Peer initiates connection close + +0 < F. 1001:1001(0) ack 1 win 257 + +.04 > . 1:1(0) ack 1002 + + // Local socket also closes its side + +0 close(4) = 0 + +0 > F. 1:1(0) ack 1002 + +0 < . 1002:1002(0) ack 2 win 257 -- cgit v1.2.3 From b81d5e9d965e0af2c1f21fc392a23e598171f9d6 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 6 Jan 2026 18:36:45 -0500 Subject: selftests/bpf: add tests for arena kfuncs under lock Add selftests to ensure the verifier permits calling the arena kfunc API while holding a lock. Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260106-arena-under-lock-v2-3-378e9eab3066@etsalapatis.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_arena.c | 38 ++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/progs/verifier_arena.c b/tools/testing/selftests/bpf/progs/verifier_arena.c index 4a9d96344813..c4b8daac4388 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena.c @@ -10,6 +10,8 @@ #include "bpf_experimental.h" #include "bpf_arena_common.h" +#define private(name) SEC(".bss." #name) __hidden __attribute__((aligned(8))) + struct { __uint(type, BPF_MAP_TYPE_ARENA); __uint(map_flags, BPF_F_MMAPABLE); @@ -439,4 +441,40 @@ int iter_maps3(struct bpf_iter__bpf_map *ctx) return 0; } +private(ARENA_TESTS) struct bpf_spin_lock arena_bpf_test_lock; + +/* Use the arena kfunc API while under a BPF lock. */ +SEC("syscall") +__success __retval(0) +int arena_kfuncs_under_bpf_lock(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + char __arena *page; + int ret; + + bpf_spin_lock(&arena_bpf_test_lock); + + /* Get a separate region of the arena. */ + page = arena_base(&arena); + ret = bpf_arena_reserve_pages(&arena, page, 1); + if (ret) { + bpf_spin_unlock(&arena_bpf_test_lock); + return 1; + } + + bpf_arena_free_pages(&arena, page, 1); + + page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!page) { + bpf_spin_unlock(&arena_bpf_test_lock); + return 2; + } + + bpf_arena_free_pages(&arena, page, 1); + + bpf_spin_unlock(&arena_bpf_test_lock); +#endif + + return 0; +} char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 07bf7aa58e5e7fb27b8addcc33052400a7d9ce32 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 7 Jan 2026 10:20:22 +0800 Subject: selftests/bpf: Add cases to test BPF_F_CPU and BPF_F_ALL_CPUS flags Add test coverage for the new BPF_F_CPU and BPF_F_ALL_CPUS flags support in percpu maps. The following APIs are exercised: * bpf_map_update_batch() * bpf_map_lookup_batch() * bpf_map_update_elem() * bpf_map__update_elem() * bpf_map_lookup_elem_flags() * bpf_map__lookup_elem() For lru_percpu_hash map, set max_entries to 'libbpf_num_possible_cpus() + 1' and only use the first 'libbpf_num_possible_cpus()' entries. This ensures a spare entry is always available in the LRU free list, avoiding eviction. When updating an existing key in lru_percpu_hash map: 1. l_new = prealloc_lru_pop(); /* Borrow from free list */ 2. l_old = lookup_elem_raw(); /* Found, key exists */ 3. pcpu_copy_value(); /* In-place update */ 4. bpf_lru_push_free(); /* Return l_new to free list */ Also add negative tests to verify that non-percpu array and hash maps reject the BPF_F_CPU and BPF_F_ALL_CPUS flags. Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260107022022.12843-8-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/percpu_alloc.c | 328 +++++++++++++++++++++ .../selftests/bpf/progs/percpu_alloc_array.c | 32 ++ 2 files changed, 360 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c b/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c index 343da65864d6..c1d0949f093f 100644 --- a/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c +++ b/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include "cgroup_helpers.h" #include "percpu_alloc_array.skel.h" #include "percpu_alloc_cgrp_local_storage.skel.h" #include "percpu_alloc_fail.skel.h" @@ -115,6 +116,321 @@ static void test_failure(void) { RUN_TESTS(percpu_alloc_fail); } +static void test_percpu_map_op_cpu_flag(struct bpf_map *map, void *keys, size_t key_sz, u32 entries, + int nr_cpus, bool test_batch) +{ + size_t value_sz = sizeof(u32), value_sz_cpus, value_sz_total; + u32 *values = NULL, *values_percpu = NULL; + const u32 value = 0xDEADC0DE; + int i, j, cpu, map_fd, err; + u64 batch = 0, flags; + void *values_row; + u32 count, v; + LIBBPF_OPTS(bpf_map_batch_opts, batch_opts); + + value_sz_cpus = value_sz * nr_cpus; + values = calloc(entries, value_sz_cpus); + if (!ASSERT_OK_PTR(values, "calloc values")) + return; + + values_percpu = calloc(entries, roundup(value_sz, 8) * nr_cpus); + if (!ASSERT_OK_PTR(values_percpu, "calloc values_percpu")) { + free(values); + return; + } + + value_sz_total = value_sz_cpus * entries; + memset(values, 0, value_sz_total); + + map_fd = bpf_map__fd(map); + flags = BPF_F_CPU | BPF_F_ALL_CPUS; + err = bpf_map_lookup_elem_flags(map_fd, keys, values, flags); + if (!ASSERT_ERR(err, "bpf_map_lookup_elem_flags cpu|all_cpus")) + goto out; + + err = bpf_map_update_elem(map_fd, keys, values, flags); + if (!ASSERT_ERR(err, "bpf_map_update_elem cpu|all_cpus")) + goto out; + + flags = BPF_F_ALL_CPUS; + err = bpf_map_lookup_elem_flags(map_fd, keys, values, flags); + if (!ASSERT_ERR(err, "bpf_map_lookup_elem_flags all_cpus")) + goto out; + + flags = BPF_F_LOCK | BPF_F_CPU; + err = bpf_map_lookup_elem_flags(map_fd, keys, values, flags); + if (!ASSERT_ERR(err, "bpf_map_lookup_elem_flags BPF_F_LOCK")) + goto out; + + flags = BPF_F_LOCK | BPF_F_ALL_CPUS; + err = bpf_map_update_elem(map_fd, keys, values, flags); + if (!ASSERT_ERR(err, "bpf_map_update_elem BPF_F_LOCK")) + goto out; + + flags = (u64)nr_cpus << 32 | BPF_F_CPU; + err = bpf_map_update_elem(map_fd, keys, values, flags); + if (!ASSERT_EQ(err, -ERANGE, "bpf_map_update_elem -ERANGE")) + goto out; + + err = bpf_map__update_elem(map, keys, key_sz, values, value_sz, flags); + if (!ASSERT_EQ(err, -ERANGE, "bpf_map__update_elem -ERANGE")) + goto out; + + err = bpf_map_lookup_elem_flags(map_fd, keys, values, flags); + if (!ASSERT_EQ(err, -ERANGE, "bpf_map_lookup_elem_flags -ERANGE")) + goto out; + + err = bpf_map__lookup_elem(map, keys, key_sz, values, value_sz, flags); + if (!ASSERT_EQ(err, -ERANGE, "bpf_map__lookup_elem -ERANGE")) + goto out; + + for (cpu = 0; cpu < nr_cpus; cpu++) { + /* clear value on all cpus */ + values[0] = 0; + flags = BPF_F_ALL_CPUS; + for (i = 0; i < entries; i++) { + err = bpf_map__update_elem(map, keys + i * key_sz, key_sz, values, + value_sz, flags); + if (!ASSERT_OK(err, "bpf_map__update_elem all_cpus")) + goto out; + } + + /* update value on specified cpu */ + for (i = 0; i < entries; i++) { + values[0] = value; + flags = (u64)cpu << 32 | BPF_F_CPU; + err = bpf_map__update_elem(map, keys + i * key_sz, key_sz, values, + value_sz, flags); + if (!ASSERT_OK(err, "bpf_map__update_elem specified cpu")) + goto out; + + /* lookup then check value on CPUs */ + for (j = 0; j < nr_cpus; j++) { + flags = (u64)j << 32 | BPF_F_CPU; + err = bpf_map__lookup_elem(map, keys + i * key_sz, key_sz, values, + value_sz, flags); + if (!ASSERT_OK(err, "bpf_map__lookup_elem specified cpu")) + goto out; + if (!ASSERT_EQ(values[0], j != cpu ? 0 : value, + "bpf_map__lookup_elem value on specified cpu")) + goto out; + } + } + } + + if (!test_batch) + goto out; + + count = entries; + batch_opts.elem_flags = (u64)nr_cpus << 32 | BPF_F_CPU; + err = bpf_map_update_batch(map_fd, keys, values, &count, &batch_opts); + if (!ASSERT_EQ(err, -ERANGE, "bpf_map_update_batch -ERANGE")) + goto out; + + for (cpu = 0; cpu < nr_cpus; cpu++) { + memset(values, 0, value_sz_total); + + /* clear values across all CPUs */ + count = entries; + batch_opts.elem_flags = BPF_F_ALL_CPUS; + err = bpf_map_update_batch(map_fd, keys, values, &count, &batch_opts); + if (!ASSERT_OK(err, "bpf_map_update_batch all_cpus")) + goto out; + + /* update values on specified CPU */ + for (i = 0; i < entries; i++) + values[i] = value; + + count = entries; + batch_opts.elem_flags = (u64)cpu << 32 | BPF_F_CPU; + err = bpf_map_update_batch(map_fd, keys, values, &count, &batch_opts); + if (!ASSERT_OK(err, "bpf_map_update_batch specified cpu")) + goto out; + + /* lookup values on specified CPU */ + batch = 0; + count = entries; + memset(values, 0, entries * value_sz); + err = bpf_map_lookup_batch(map_fd, NULL, &batch, keys, values, &count, &batch_opts); + if (!ASSERT_TRUE(!err || err == -ENOENT, "bpf_map_lookup_batch specified cpu")) + goto out; + + for (i = 0; i < entries; i++) + if (!ASSERT_EQ(values[i], value, + "bpf_map_lookup_batch value on specified cpu")) + goto out; + + /* lookup values from all CPUs */ + batch = 0; + count = entries; + batch_opts.elem_flags = 0; + memset(values_percpu, 0, roundup(value_sz, 8) * nr_cpus * entries); + err = bpf_map_lookup_batch(map_fd, NULL, &batch, keys, values_percpu, &count, + &batch_opts); + if (!ASSERT_TRUE(!err || err == -ENOENT, "bpf_map_lookup_batch all_cpus")) + goto out; + + for (i = 0; i < entries; i++) { + values_row = (void *) values_percpu + + roundup(value_sz, 8) * i * nr_cpus; + for (j = 0; j < nr_cpus; j++) { + v = *(u32 *) (values_row + roundup(value_sz, 8) * j); + if (!ASSERT_EQ(v, j != cpu ? 0 : value, + "bpf_map_lookup_batch value all_cpus")) + goto out; + } + } + } + +out: + free(values_percpu); + free(values); +} + + +static void test_percpu_map_cpu_flag(enum bpf_map_type map_type) +{ + struct percpu_alloc_array *skel; + size_t key_sz = sizeof(int); + int *keys, nr_cpus, i, err; + struct bpf_map *map; + u32 max_entries; + + nr_cpus = libbpf_num_possible_cpus(); + if (!ASSERT_GT(nr_cpus, 0, "libbpf_num_possible_cpus")) + return; + + max_entries = nr_cpus + 1; + keys = calloc(max_entries, key_sz); + if (!ASSERT_OK_PTR(keys, "calloc keys")) + return; + + for (i = 0; i < max_entries; i++) + keys[i] = i; + + skel = percpu_alloc_array__open(); + if (!ASSERT_OK_PTR(skel, "percpu_alloc_array__open")) { + free(keys); + return; + } + + map = skel->maps.percpu; + bpf_map__set_type(map, map_type); + bpf_map__set_max_entries(map, max_entries); + + err = percpu_alloc_array__load(skel); + if (!ASSERT_OK(err, "test_percpu_alloc__load")) + goto out; + + test_percpu_map_op_cpu_flag(map, keys, key_sz, max_entries - 1, nr_cpus, true); +out: + percpu_alloc_array__destroy(skel); + free(keys); +} + +static void test_percpu_array_cpu_flag(void) +{ + test_percpu_map_cpu_flag(BPF_MAP_TYPE_PERCPU_ARRAY); +} + +static void test_percpu_hash_cpu_flag(void) +{ + test_percpu_map_cpu_flag(BPF_MAP_TYPE_PERCPU_HASH); +} + +static void test_lru_percpu_hash_cpu_flag(void) +{ + test_percpu_map_cpu_flag(BPF_MAP_TYPE_LRU_PERCPU_HASH); +} + +static void test_percpu_cgroup_storage_cpu_flag(void) +{ + struct percpu_alloc_array *skel = NULL; + struct bpf_cgroup_storage_key key; + int cgroup, prog_fd, nr_cpus, err; + struct bpf_map *map; + + nr_cpus = libbpf_num_possible_cpus(); + if (!ASSERT_GT(nr_cpus, 0, "libbpf_num_possible_cpus")) + return; + + err = setup_cgroup_environment(); + if (!ASSERT_OK(err, "setup_cgroup_environment")) + return; + + cgroup = create_and_get_cgroup("/cg_percpu"); + if (!ASSERT_GE(cgroup, 0, "create_and_get_cgroup")) { + cleanup_cgroup_environment(); + return; + } + + err = join_cgroup("/cg_percpu"); + if (!ASSERT_OK(err, "join_cgroup")) + goto out; + + skel = percpu_alloc_array__open_and_load(); + if (!ASSERT_OK_PTR(skel, "percpu_alloc_array__open_and_load")) + goto out; + + prog_fd = bpf_program__fd(skel->progs.cgroup_egress); + err = bpf_prog_attach(prog_fd, cgroup, BPF_CGROUP_INET_EGRESS, 0); + if (!ASSERT_OK(err, "bpf_prog_attach")) + goto out; + + map = skel->maps.percpu_cgroup_storage; + err = bpf_map_get_next_key(bpf_map__fd(map), NULL, &key); + if (!ASSERT_OK(err, "bpf_map_get_next_key")) + goto out; + + test_percpu_map_op_cpu_flag(map, &key, sizeof(key), 1, nr_cpus, false); +out: + bpf_prog_detach2(-1, cgroup, BPF_CGROUP_INET_EGRESS); + close(cgroup); + cleanup_cgroup_environment(); + percpu_alloc_array__destroy(skel); +} + +static void test_map_op_cpu_flag(enum bpf_map_type map_type) +{ + u32 max_entries = 1, count = max_entries; + u64 flags, batch = 0, val = 0; + int err, map_fd, key = 0; + LIBBPF_OPTS(bpf_map_batch_opts, batch_opts); + + map_fd = bpf_map_create(map_type, "test_cpu_flag", sizeof(int), sizeof(u64), max_entries, + NULL); + if (!ASSERT_GE(map_fd, 0, "bpf_map_create")) + return; + + flags = BPF_F_ALL_CPUS; + err = bpf_map_update_elem(map_fd, &key, &val, flags); + ASSERT_ERR(err, "bpf_map_update_elem all_cpus"); + + batch_opts.elem_flags = BPF_F_ALL_CPUS; + err = bpf_map_update_batch(map_fd, &key, &val, &count, &batch_opts); + ASSERT_ERR(err, "bpf_map_update_batch all_cpus"); + + flags = BPF_F_CPU; + err = bpf_map_lookup_elem_flags(map_fd, &key, &val, flags); + ASSERT_ERR(err, "bpf_map_lookup_elem_flags cpu"); + + batch_opts.elem_flags = BPF_F_CPU; + err = bpf_map_lookup_batch(map_fd, NULL, &batch, &key, &val, &count, &batch_opts); + ASSERT_ERR(err, "bpf_map_lookup_batch cpu"); + + close(map_fd); +} + +static void test_array_cpu_flag(void) +{ + test_map_op_cpu_flag(BPF_MAP_TYPE_ARRAY); +} + +static void test_hash_cpu_flag(void) +{ + test_map_op_cpu_flag(BPF_MAP_TYPE_HASH); +} + void test_percpu_alloc(void) { if (test__start_subtest("array")) @@ -125,4 +441,16 @@ void test_percpu_alloc(void) test_cgrp_local_storage(); if (test__start_subtest("failure_tests")) test_failure(); + if (test__start_subtest("cpu_flag_percpu_array")) + test_percpu_array_cpu_flag(); + if (test__start_subtest("cpu_flag_percpu_hash")) + test_percpu_hash_cpu_flag(); + if (test__start_subtest("cpu_flag_lru_percpu_hash")) + test_lru_percpu_hash_cpu_flag(); + if (test__start_subtest("cpu_flag_percpu_cgroup_storage")) + test_percpu_cgroup_storage_cpu_flag(); + if (test__start_subtest("cpu_flag_array")) + test_array_cpu_flag(); + if (test__start_subtest("cpu_flag_hash")) + test_hash_cpu_flag(); } diff --git a/tools/testing/selftests/bpf/progs/percpu_alloc_array.c b/tools/testing/selftests/bpf/progs/percpu_alloc_array.c index 37c2d2608ec0..ed6a2a93d5a5 100644 --- a/tools/testing/selftests/bpf/progs/percpu_alloc_array.c +++ b/tools/testing/selftests/bpf/progs/percpu_alloc_array.c @@ -187,4 +187,36 @@ out: return 0; } +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 2); + __type(key, int); + __type(value, u32); +} percpu SEC(".maps"); + +SEC("?fentry/bpf_fentry_test1") +int BPF_PROG(test_percpu_array, int x) +{ + u64 value = 0xDEADC0DE; + int key = 0; + + bpf_map_update_elem(&percpu, &key, &value, BPF_ANY); + return 0; +} + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); + __type(key, struct bpf_cgroup_storage_key); + __type(value, u32); +} percpu_cgroup_storage SEC(".maps"); + +SEC("cgroup_skb/egress") +int cgroup_egress(struct __sk_buff *skb) +{ + u32 *val = bpf_get_local_storage(&percpu_cgroup_storage, 0); + + *val = 1; + return 1; +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 97fb54d86d2194ea8a4cbe6cf074e6ba47b054ea Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Tue, 6 Jan 2026 18:36:49 +0100 Subject: bpf: adapt selftests to GCC 16 -Wunused-but-set-variable GCC 16 has changed the semantics of -Wunused-but-set-variable, as well as introducing new options -Wunused-but-set-variable={0,1,2,3} to adjust the level of support. One of the changes is that GCC now treats 'sum += 1' and 'sum++' as non-usage, whereas clang (and GCC < 16) considers the first as usage and the second as non-usage, which is sort of inconsistent. The GCC 16 -Wunused-but-set-variable=2 option implements the previous semantics of -Wunused-but-set-variable, but since it is a new option, it cannot be used unconditionally for forward-compatibility, just for backwards-compatibility. So this patch adds pragmas to the two self-tests impacted by this, progs/free_timer.c and progs/rcu_read_lock.c, to make gcc to ignore -Wunused-but-set-variable warnings when compiling them with GCC > 15. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=44677#c25 for details on why this regression got introduced in GCC upstream. Signed-off-by: Jose E. Marchesi Cc: david.faust@oracle.com Cc: cupertino.miranda@oracle.com Cc: Eduard Zingerman Cc: Yonghong Song Link: https://lore.kernel.org/r/20260106173650.18191-2-jose.marchesi@oracle.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/free_timer.c | 10 ++++++++++ tools/testing/selftests/bpf/progs/rcu_read_lock.c | 10 ++++++++++ 2 files changed, 20 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/progs/free_timer.c b/tools/testing/selftests/bpf/progs/free_timer.c index 4501ae8fc414..eccb2d47db43 100644 --- a/tools/testing/selftests/bpf/progs/free_timer.c +++ b/tools/testing/selftests/bpf/progs/free_timer.c @@ -7,6 +7,16 @@ #define MAX_ENTRIES 8 +/* clang considers 'sum += 1' as usage but 'sum++' as non-usage. GCC + * is more consistent and considers both 'sum += 1' and 'sum++' as + * non-usage. This triggers warnings in the functions below. + * + * Starting with GCC 16 -Wunused-but-set-variable=2 can be used to + * mimic clang's behavior. */ +#if !defined(__clang__) && __GNUC__ > 15 +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" +#endif + struct map_value { struct bpf_timer timer; }; diff --git a/tools/testing/selftests/bpf/progs/rcu_read_lock.c b/tools/testing/selftests/bpf/progs/rcu_read_lock.c index d70c28824bbe..b4e073168fb1 100644 --- a/tools/testing/selftests/bpf/progs/rcu_read_lock.c +++ b/tools/testing/selftests/bpf/progs/rcu_read_lock.c @@ -7,6 +7,16 @@ #include "bpf_tracing_net.h" #include "bpf_misc.h" +/* clang considers 'sum += 1' as usage but 'sum++' as non-usage. GCC + * is more consistent and considers both 'sum += 1' and 'sum++' as + * non-usage. This triggers warnings in the functions below. + * + * Starting with GCC 16 -Wunused-but-set-variable=2 can be used to + * mimic clang's behavior. */ +#if !defined(__clang__) && __GNUC__ > 15 +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" +#endif + char _license[] SEC("license") = "GPL"; struct { -- cgit v1.2.3 From 681600647c59050546939da5e490c736e567fe91 Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Tue, 6 Jan 2026 18:36:50 +0100 Subject: bpf: GCC requires function attributes before the declarator GCC insists in placing attributes before the declarators in function declarations. Now that GCC supports btf_decl_tag and therefore __tag1 and __tag2 expand to actual attributes, the compiler is complaining about it for static __noinline int foo(int x __tag1 __tag2) __tag1 __tag2 progs/test_btf_decl_tag.c:36:1: error: attributes should be specified \ before the declarator in a function definition This patch simply places the tags before the declarator. Signed-off-by: Jose E. Marchesi Cc: david.faust@oracle.com Cc: cupertino.miranda@oracle.com Cc: Eduard Zingerman Cc: Yonghong Song Link: https://lore.kernel.org/r/20260106173650.18191-3-jose.marchesi@oracle.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/test_btf_decl_tag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/progs/test_btf_decl_tag.c b/tools/testing/selftests/bpf/progs/test_btf_decl_tag.c index c88ccc53529a..0c3df19626cb 100644 --- a/tools/testing/selftests/bpf/progs/test_btf_decl_tag.c +++ b/tools/testing/selftests/bpf/progs/test_btf_decl_tag.c @@ -33,7 +33,7 @@ struct { } hashmap1 SEC(".maps"); -static __noinline int foo(int x __tag1 __tag2) __tag1 __tag2 +static __noinline __tag1 __tag2 int foo(int x __tag1 __tag2) { struct key_t key; value_t val = {}; -- cgit v1.2.3 From 1cabad3a00ab2e3d6bf19c5ab8fc9212d0b81e18 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Jan 2026 09:59:33 +0800 Subject: kunit: tool: test: Rename test_data_path() to _test_data_path() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Running the KUnit testsuite through pytest fails, as the function test_data_path() is recognized as a test function. Its execution fails as pytest tries to resolve the 'path' argument as a fixture which does not exist. Rename the function, so the helper function is not incorrectly recognized as a test function. Link: https://lore.kernel.org/r/20260107015936.2316047-1-davidgow@google.com Signed-off-by: Thomas Weißschuh Reviewed-by: David Gow Signed-off-by: David Gow Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit_tool_test.py | 56 +++++++++++++++++----------------- 1 file changed, 28 insertions(+), 28 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index bdc51b5c7b10..30ac1cb6c8ed 100755 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -36,7 +36,7 @@ def setUpModule(): def tearDownModule(): shutil.rmtree(test_tmpdir) -def test_data_path(path): +def _test_data_path(path): return os.path.join(abs_test_data_dir, path) class KconfigTest(unittest.TestCase): @@ -52,7 +52,7 @@ class KconfigTest(unittest.TestCase): self.assertFalse(kconfig1.is_subset_of(kconfig0)) def test_read_from_file(self): - kconfig_path = test_data_path('test_read_from_file.kconfig') + kconfig_path = _test_data_path('test_read_from_file.kconfig') kconfig = kunit_config.parse_file(kconfig_path) @@ -98,7 +98,7 @@ class KUnitParserTest(unittest.TestCase): raise AssertionError(f'"{needle}" not found in {list(backup)}!') def test_output_isolated_correctly(self): - log_path = test_data_path('test_output_isolated_correctly.log') + log_path = _test_data_path('test_output_isolated_correctly.log') with open(log_path) as file: result = kunit_parser.extract_tap_lines(file.readlines()) self.assertContains('TAP version 14', result) @@ -109,7 +109,7 @@ class KUnitParserTest(unittest.TestCase): self.assertContains('ok 1 - example', result) def test_output_with_prefix_isolated_correctly(self): - log_path = test_data_path('test_pound_sign.log') + log_path = _test_data_path('test_pound_sign.log') with open(log_path) as file: result = kunit_parser.extract_tap_lines(file.readlines()) self.assertContains('TAP version 14', result) @@ -138,35 +138,35 @@ class KUnitParserTest(unittest.TestCase): self.assertContains('ok 3 - string-stream-test', result) def test_parse_successful_test_log(self): - all_passed_log = test_data_path('test_is_test_passed-all_passed.log') + all_passed_log = _test_data_path('test_is_test_passed-all_passed.log') with open(all_passed_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) self.assertEqual(result.counts.errors, 0) def test_parse_successful_nested_tests_log(self): - all_passed_log = test_data_path('test_is_test_passed-all_passed_nested.log') + all_passed_log = _test_data_path('test_is_test_passed-all_passed_nested.log') with open(all_passed_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) self.assertEqual(result.counts.errors, 0) def test_kselftest_nested(self): - kselftest_log = test_data_path('test_is_test_passed-kselftest.log') + kselftest_log = _test_data_path('test_is_test_passed-kselftest.log') with open(kselftest_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) self.assertEqual(result.counts.errors, 0) def test_parse_failed_test_log(self): - failed_log = test_data_path('test_is_test_passed-failure.log') + failed_log = _test_data_path('test_is_test_passed-failure.log') with open(failed_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.FAILURE, result.status) self.assertEqual(result.counts.errors, 0) def test_parse_failed_nested_tests_log(self): - nested_log = test_data_path('test_is_test_passed-failure-nested.log') + nested_log = _test_data_path('test_is_test_passed-failure-nested.log') with open(nested_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.FAILURE, result.status) @@ -177,7 +177,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(kunit_parser.TestStatus.FAILURE, result.subtests[1].subtests[0].status) def test_no_header(self): - empty_log = test_data_path('test_is_test_passed-no_tests_run_no_header.log') + empty_log = _test_data_path('test_is_test_passed-no_tests_run_no_header.log') with open(empty_log) as file: result = kunit_parser.parse_run_tests( kunit_parser.extract_tap_lines(file.readlines()), stdout) @@ -186,7 +186,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(result.counts.errors, 1) def test_missing_test_plan(self): - missing_plan_log = test_data_path('test_is_test_passed-' + missing_plan_log = _test_data_path('test_is_test_passed-' 'missing_plan.log') with open(missing_plan_log) as file: result = kunit_parser.parse_run_tests( @@ -197,7 +197,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) def test_no_tests(self): - header_log = test_data_path('test_is_test_passed-no_tests_run_with_header.log') + header_log = _test_data_path('test_is_test_passed-no_tests_run_with_header.log') with open(header_log) as file: result = kunit_parser.parse_run_tests( kunit_parser.extract_tap_lines(file.readlines()), stdout) @@ -206,7 +206,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(result.counts.errors, 1) def test_no_tests_no_plan(self): - no_plan_log = test_data_path('test_is_test_passed-no_tests_no_plan.log') + no_plan_log = _test_data_path('test_is_test_passed-no_tests_no_plan.log') with open(no_plan_log) as file: result = kunit_parser.parse_run_tests( kunit_parser.extract_tap_lines(file.readlines()), stdout) @@ -218,7 +218,7 @@ class KUnitParserTest(unittest.TestCase): def test_no_kunit_output(self): - crash_log = test_data_path('test_insufficient_memory.log') + crash_log = _test_data_path('test_insufficient_memory.log') print_mock = mock.patch('kunit_printer.Printer.print').start() with open(crash_log) as file: result = kunit_parser.parse_run_tests( @@ -229,7 +229,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(result.counts.errors, 1) def test_skipped_test(self): - skipped_log = test_data_path('test_skip_tests.log') + skipped_log = _test_data_path('test_skip_tests.log') with open(skipped_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) @@ -238,7 +238,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(result.counts, kunit_parser.TestCounts(passed=4, skipped=1)) def test_skipped_all_tests(self): - skipped_log = test_data_path('test_skip_all_tests.log') + skipped_log = _test_data_path('test_skip_all_tests.log') with open(skipped_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) @@ -246,7 +246,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(result.counts, kunit_parser.TestCounts(skipped=5)) def test_ignores_hyphen(self): - hyphen_log = test_data_path('test_strip_hyphen.log') + hyphen_log = _test_data_path('test_strip_hyphen.log') with open(hyphen_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) @@ -260,7 +260,7 @@ class KUnitParserTest(unittest.TestCase): result.subtests[1].name) def test_ignores_prefix_printk_time(self): - prefix_log = test_data_path('test_config_printk_time.log') + prefix_log = _test_data_path('test_config_printk_time.log') with open(prefix_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) @@ -268,7 +268,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(result.counts.errors, 0) def test_ignores_multiple_prefixes(self): - prefix_log = test_data_path('test_multiple_prefixes.log') + prefix_log = _test_data_path('test_multiple_prefixes.log') with open(prefix_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) @@ -276,7 +276,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(result.counts.errors, 0) def test_prefix_mixed_kernel_output(self): - mixed_prefix_log = test_data_path('test_interrupted_tap_output.log') + mixed_prefix_log = _test_data_path('test_interrupted_tap_output.log') with open(mixed_prefix_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) @@ -284,7 +284,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(result.counts.errors, 0) def test_prefix_poundsign(self): - pound_log = test_data_path('test_pound_sign.log') + pound_log = _test_data_path('test_pound_sign.log') with open(pound_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) @@ -292,7 +292,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(result.counts.errors, 0) def test_kernel_panic_end(self): - panic_log = test_data_path('test_kernel_panic_interrupt.log') + panic_log = _test_data_path('test_kernel_panic_interrupt.log') with open(panic_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.TEST_CRASHED, result.status) @@ -300,7 +300,7 @@ class KUnitParserTest(unittest.TestCase): self.assertGreaterEqual(result.counts.errors, 1) def test_pound_no_prefix(self): - pound_log = test_data_path('test_pound_no_prefix.log') + pound_log = _test_data_path('test_pound_no_prefix.log') with open(pound_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) @@ -329,7 +329,7 @@ class KUnitParserTest(unittest.TestCase): 'Failures: all_failed_suite, some_failed_suite.test2') def test_ktap_format(self): - ktap_log = test_data_path('test_parse_ktap_output.log') + ktap_log = _test_data_path('test_parse_ktap_output.log') with open(ktap_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(result.counts, kunit_parser.TestCounts(passed=3)) @@ -338,13 +338,13 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual('case_2', result.subtests[0].subtests[1].name) def test_parse_subtest_header(self): - ktap_log = test_data_path('test_parse_subtest_header.log') + ktap_log = _test_data_path('test_parse_subtest_header.log') with open(ktap_log) as file: kunit_parser.parse_run_tests(file.readlines(), stdout) self.print_mock.assert_any_call(StrContains('suite (1 subtest)')) def test_parse_attributes(self): - ktap_log = test_data_path('test_parse_attributes.log') + ktap_log = _test_data_path('test_parse_attributes.log') with open(ktap_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) @@ -566,7 +566,7 @@ class KUnitJsonTest(unittest.TestCase): self.addCleanup(mock.patch.stopall) def _json_for(self, log_file): - with open(test_data_path(log_file)) as file: + with open(_test_data_path(log_file)) as file: test_result = kunit_parser.parse_run_tests(file, stdout) json_obj = kunit_json.get_json_result( test=test_result, @@ -607,7 +607,7 @@ class StrContains(str): class KUnitMainTest(unittest.TestCase): def setUp(self): - path = test_data_path('test_is_test_passed-all_passed.log') + path = _test_data_path('test_is_test_passed-all_passed.log') with open(path) as file: all_passed_log = file.readlines() -- cgit v1.2.3 From f126d688193b4dd6d0044c19771469724c03f8f8 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 7 Jan 2026 09:59:34 +0800 Subject: kunit: tool: test: Don't rely on implicit working directory change MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If no kunitconfig_paths are passed to LinuxSourceTree() it falls back to DEFAULT_KUNITCONFIG_PATH. This resolution only works when the current working directory is the root of the source tree. This works by chance when running the full testsuite through the default unittest runner, as some tests will change the current working directory as a side-effect of 'kunit.main()'. When running a single testcase or using pytest, which resets the working directory for each test, this assumption breaks. Explicitly specify an empty kunitconfig for the affected tests. Link: https://lore.kernel.org/r/20260107015936.2316047-2-davidgow@google.com Signed-off-by: Thomas Weißschuh Reviewed-by: David Gow Signed-off-by: David Gow Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit_tool_test.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index 30ac1cb6c8ed..238a31a5cc29 100755 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -477,7 +477,8 @@ class LinuxSourceTreeTest(unittest.TestCase): want_kconfig = kunit_config.Kconfig() want_kconfig.add_entry('NOT_REAL', 'y') - tree = kunit_kernel.LinuxSourceTree('', kconfig_add=['CONFIG_NOT_REAL=y']) + tree = kunit_kernel.LinuxSourceTree('', kunitconfig_paths=[os.devnull], + kconfig_add=['CONFIG_NOT_REAL=y']) self.assertTrue(want_kconfig.is_subset_of(tree._kconfig), msg=tree._kconfig) def test_invalid_arch(self): @@ -489,7 +490,7 @@ class LinuxSourceTreeTest(unittest.TestCase): return subprocess.Popen(['echo "hi\nbye"'], shell=True, text=True, stdout=subprocess.PIPE) with tempfile.TemporaryDirectory('') as build_dir: - tree = kunit_kernel.LinuxSourceTree(build_dir) + tree = kunit_kernel.LinuxSourceTree(build_dir, kunitconfig_paths=[os.devnull]) mock.patch.object(tree._ops, 'start', side_effect=fake_start).start() with self.assertRaises(ValueError): -- cgit v1.2.3 From 8190b9ea30fef5b9067825b91fb3ec6d678ee5e3 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 18 Dec 2025 14:25:59 -0800 Subject: thermal: intel: selftests: workload_hint: Support slow workload hints Add option to enable slow workload type hints. User can specify "slow" as the command line argument to enable slow workload type hints. There are two slow workload type hints: "power" and "performance". Signed-off-by: Srinivas Pandruvada Link: https://patch.msgid.link/20251218222559.4110027-3-srinivas.pandruvada@linux.intel.com Signed-off-by: Rafael J. Wysocki --- .../intel/workload_hint/workload_hint_test.c | 74 +++++++++++++++------- 1 file changed, 52 insertions(+), 22 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c b/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c index ca2bd03154e4..569d44f22835 100644 --- a/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c +++ b/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c @@ -12,6 +12,7 @@ #define WORKLOAD_NOTIFICATION_DELAY_ATTRIBUTE "/sys/bus/pci/devices/0000:00:04.0/workload_hint/notification_delay_ms" #define WORKLOAD_ENABLE_ATTRIBUTE "/sys/bus/pci/devices/0000:00:04.0/workload_hint/workload_hint_enable" +#define WORKLOAD_SLOW_ENABLE_ATTRIBUTE "/sys/bus/pci/devices/0000:00:04.0/workload_hint/workload_slow_hint_enable" #define WORKLOAD_TYPE_INDEX_ATTRIBUTE "/sys/bus/pci/devices/0000:00:04.0/workload_hint/workload_type_index" static const char * const workload_types[] = { @@ -22,6 +23,9 @@ static const char * const workload_types[] = { NULL }; +static int wlt_slow; +static char *wlt_enable_attr; + #define WORKLOAD_TYPE_MAX_INDEX 3 void workload_hint_exit(int signum) @@ -30,7 +34,7 @@ void workload_hint_exit(int signum) /* Disable feature via sysfs knob */ - fd = open(WORKLOAD_ENABLE_ATTRIBUTE, O_RDWR); + fd = open(wlt_enable_attr, O_RDWR); if (fd < 0) { perror("Unable to open workload type feature enable file"); exit(1); @@ -46,6 +50,26 @@ void workload_hint_exit(int signum) close(fd); } +static void update_delay(char *delay_str) +{ + int fd; + + printf("Setting notification delay in ms to %s\n", delay_str); + + fd = open(WORKLOAD_NOTIFICATION_DELAY_ATTRIBUTE, O_RDWR); + if (fd < 0) { + perror("Unable to open workload notification delay"); + exit(1); + } + + if (write(fd, delay_str, strlen(delay_str)) < 0) { + perror("Can't set delay"); + exit(1); + } + + close(fd); +} + int main(int argc, char **argv) { struct pollfd ufd; @@ -54,32 +78,26 @@ int main(int argc, char **argv) char delay_str[64]; int delay = 0; - printf("Usage: workload_hint_test [notification delay in milli seconds]\n"); + printf("Usage: workload_hint_test [notification delay in milli seconds][slow]\n"); if (argc > 1) { - ret = sscanf(argv[1], "%d", &delay); - if (ret < 0) { - printf("Invalid delay\n"); - exit(1); - } + int i; - printf("Setting notification delay to %d ms\n", delay); - if (delay < 0) - exit(1); + for (i = 1; i < argc; ++i) { + if (!strcmp(argv[i], "slow")) { + wlt_slow = 1; + continue; + } - sprintf(delay_str, "%s\n", argv[1]); - fd = open(WORKLOAD_NOTIFICATION_DELAY_ATTRIBUTE, O_RDWR); - if (fd < 0) { - perror("Unable to open workload notification delay"); - exit(1); - } + ret = sscanf(argv[1], "%d", &delay); + if (ret < 0) { + printf("Invalid delay\n"); + exit(1); + } - if (write(fd, delay_str, strlen(delay_str)) < 0) { - perror("Can't set delay"); - exit(1); + sprintf(delay_str, "%s\n", argv[1]); + update_delay(delay_str); } - - close(fd); } if (signal(SIGINT, workload_hint_exit) == SIG_IGN) @@ -89,8 +107,13 @@ int main(int argc, char **argv) if (signal(SIGTERM, workload_hint_exit) == SIG_IGN) signal(SIGTERM, SIG_IGN); + if (wlt_slow) + wlt_enable_attr = WORKLOAD_SLOW_ENABLE_ATTRIBUTE; + else + wlt_enable_attr = WORKLOAD_ENABLE_ATTRIBUTE; + /* Enable feature via sysfs knob */ - fd = open(WORKLOAD_ENABLE_ATTRIBUTE, O_RDWR); + fd = open(wlt_enable_attr, O_RDWR); if (fd < 0) { perror("Unable to open workload type feature enable file"); exit(1); @@ -145,6 +168,13 @@ int main(int argc, char **argv) if (ret < 0) break; + if (wlt_slow) { + if (index & 0x10) + printf("workload type slow:%s\n", "power"); + else + printf("workload type slow:%s\n", "performance"); + } + index &= 0x0f; if (index > WORKLOAD_TYPE_MAX_INDEX) printf("Invalid workload type index\n"); -- cgit v1.2.3 From 0b28194c4c8e3a6c2552bfa6451f71b1879dd61f Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Fri, 5 Dec 2025 14:49:37 -0800 Subject: KVM: selftests: Test TPR / CR8 sync and interrupt masking Add a few extra TPR / CR8 tests to x86's xapic_state_test to see if: * TPR is 0 on reset, * TPR, PPR and CR8 are equal inside the guest, * TPR and CR8 read equal by the host after a VMExit * TPR borderline values set by the host correctly mask interrupts in the guest. These hopefully will catch the most obvious cases of improper TPR sync or interrupt masking. Do these tests both in x2APIC and xAPIC modes. The x2APIC mode uses SELF_IPI register to trigger interrupts to give it a bit of exercise too. Signed-off-by: Maciej S. Szmigiero Acked-by: Naveen N Rao (AMD) [sean: put code in separate test] Link: https://patch.msgid.link/20251205224937.428122-1-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 1 + tools/testing/selftests/kvm/include/x86/apic.h | 3 + tools/testing/selftests/kvm/x86/xapic_tpr_test.c | 276 +++++++++++++++++++++++ 3 files changed, 280 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86/xapic_tpr_test.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index ba5c2b643efa..3789890421bd 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -124,6 +124,7 @@ TEST_GEN_PROGS_x86 += x86/vmx_set_nested_state_test TEST_GEN_PROGS_x86 += x86/apic_bus_clock_test TEST_GEN_PROGS_x86 += x86/xapic_ipi_test TEST_GEN_PROGS_x86 += x86/xapic_state_test +TEST_GEN_PROGS_x86 += x86/xapic_tpr_test TEST_GEN_PROGS_x86 += x86/xcr0_cpuid_test TEST_GEN_PROGS_x86 += x86/xss_msr_test TEST_GEN_PROGS_x86 += x86/debug_regs diff --git a/tools/testing/selftests/kvm/include/x86/apic.h b/tools/testing/selftests/kvm/include/x86/apic.h index 80fe9f69b38d..e9b9aebaac97 100644 --- a/tools/testing/selftests/kvm/include/x86/apic.h +++ b/tools/testing/selftests/kvm/include/x86/apic.h @@ -28,6 +28,8 @@ #define GET_APIC_ID_FIELD(x) (((x) >> 24) & 0xFF) #define APIC_TASKPRI 0x80 #define APIC_PROCPRI 0xA0 +#define GET_APIC_PRI(x) (((x) & GENMASK(7, 4)) >> 4) +#define SET_APIC_PRI(x, y) (((x) & ~GENMASK(7, 4)) | (y << 4)) #define APIC_EOI 0xB0 #define APIC_SPIV 0xF0 #define APIC_SPIV_FOCUS_DISABLED (1 << 9) @@ -67,6 +69,7 @@ #define APIC_TMICT 0x380 #define APIC_TMCCT 0x390 #define APIC_TDCR 0x3E0 +#define APIC_SELF_IPI 0x3F0 void apic_disable(void); void xapic_enable(void); diff --git a/tools/testing/selftests/kvm/x86/xapic_tpr_test.c b/tools/testing/selftests/kvm/x86/xapic_tpr_test.c new file mode 100644 index 000000000000..3862134d9d40 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/xapic_tpr_test.c @@ -0,0 +1,276 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include + +#include "apic.h" +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" + +static bool is_x2apic; + +#define IRQ_VECTOR 0x20 + +/* See also the comment at similar assertion in memslot_perf_test.c */ +static_assert(ATOMIC_INT_LOCK_FREE == 2, "atomic int is not lockless"); + +static atomic_uint tpr_guest_irq_sync_val; + +static void tpr_guest_irq_sync_flag_reset(void) +{ + atomic_store_explicit(&tpr_guest_irq_sync_val, 0, + memory_order_release); +} + +static unsigned int tpr_guest_irq_sync_val_get(void) +{ + return atomic_load_explicit(&tpr_guest_irq_sync_val, + memory_order_acquire); +} + +static void tpr_guest_irq_sync_val_inc(void) +{ + atomic_fetch_add_explicit(&tpr_guest_irq_sync_val, 1, + memory_order_acq_rel); +} + +static void tpr_guest_irq_handler_xapic(struct ex_regs *regs) +{ + tpr_guest_irq_sync_val_inc(); + + xapic_write_reg(APIC_EOI, 0); +} + +static void tpr_guest_irq_handler_x2apic(struct ex_regs *regs) +{ + tpr_guest_irq_sync_val_inc(); + + x2apic_write_reg(APIC_EOI, 0); +} + +static void tpr_guest_irq_queue(void) +{ + if (is_x2apic) { + x2apic_write_reg(APIC_SELF_IPI, IRQ_VECTOR); + } else { + uint32_t icr, icr2; + + icr = APIC_DEST_SELF | APIC_DEST_PHYSICAL | APIC_DM_FIXED | + IRQ_VECTOR; + icr2 = 0; + + xapic_write_reg(APIC_ICR2, icr2); + xapic_write_reg(APIC_ICR, icr); + } +} + +static uint8_t tpr_guest_tpr_get(void) +{ + uint32_t taskpri; + + if (is_x2apic) + taskpri = x2apic_read_reg(APIC_TASKPRI); + else + taskpri = xapic_read_reg(APIC_TASKPRI); + + return GET_APIC_PRI(taskpri); +} + +static uint8_t tpr_guest_ppr_get(void) +{ + uint32_t procpri; + + if (is_x2apic) + procpri = x2apic_read_reg(APIC_PROCPRI); + else + procpri = xapic_read_reg(APIC_PROCPRI); + + return GET_APIC_PRI(procpri); +} + +static uint8_t tpr_guest_cr8_get(void) +{ + uint64_t cr8; + + asm volatile ("mov %%cr8, %[cr8]\n\t" : [cr8] "=r"(cr8)); + + return cr8 & GENMASK(3, 0); +} + +static void tpr_guest_check_tpr_ppr_cr8_equal(void) +{ + uint8_t tpr; + + tpr = tpr_guest_tpr_get(); + + GUEST_ASSERT_EQ(tpr_guest_ppr_get(), tpr); + GUEST_ASSERT_EQ(tpr_guest_cr8_get(), tpr); +} + +static void tpr_guest_code(void) +{ + cli(); + + if (is_x2apic) + x2apic_enable(); + else + xapic_enable(); + + GUEST_ASSERT_EQ(tpr_guest_tpr_get(), 0); + tpr_guest_check_tpr_ppr_cr8_equal(); + + tpr_guest_irq_queue(); + + /* TPR = 0 but IRQ masked by IF=0, should not fire */ + udelay(1000); + GUEST_ASSERT_EQ(tpr_guest_irq_sync_val_get(), 0); + + sti(); + + /* IF=1 now, IRQ should fire */ + while (tpr_guest_irq_sync_val_get() == 0) + cpu_relax(); + GUEST_ASSERT_EQ(tpr_guest_irq_sync_val_get(), 1); + + GUEST_SYNC(true); + tpr_guest_check_tpr_ppr_cr8_equal(); + + tpr_guest_irq_queue(); + + /* IRQ masked by barely high enough TPR now, should not fire */ + udelay(1000); + GUEST_ASSERT_EQ(tpr_guest_irq_sync_val_get(), 1); + + GUEST_SYNC(false); + tpr_guest_check_tpr_ppr_cr8_equal(); + + /* TPR barely low enough now to unmask IRQ, should fire */ + while (tpr_guest_irq_sync_val_get() == 1) + cpu_relax(); + GUEST_ASSERT_EQ(tpr_guest_irq_sync_val_get(), 2); + + GUEST_DONE(); +} + +static uint8_t lapic_tpr_get(struct kvm_lapic_state *xapic) +{ + return GET_APIC_PRI(*((u32 *)&xapic->regs[APIC_TASKPRI])); +} + +static void lapic_tpr_set(struct kvm_lapic_state *xapic, uint8_t val) +{ + u32 *taskpri = (u32 *)&xapic->regs[APIC_TASKPRI]; + + *taskpri = SET_APIC_PRI(*taskpri, val); +} + +static uint8_t sregs_tpr(struct kvm_sregs *sregs) +{ + return sregs->cr8 & GENMASK(3, 0); +} + +static void test_tpr_check_tpr_zero(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic_state xapic; + + vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic); + + TEST_ASSERT_EQ(lapic_tpr_get(&xapic), 0); +} + +static void test_tpr_check_tpr_cr8_equal(struct kvm_vcpu *vcpu) +{ + struct kvm_sregs sregs; + struct kvm_lapic_state xapic; + + vcpu_sregs_get(vcpu, &sregs); + vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic); + + TEST_ASSERT_EQ(sregs_tpr(&sregs), lapic_tpr_get(&xapic)); +} + +static void test_tpr_set_tpr_for_irq(struct kvm_vcpu *vcpu, bool mask) +{ + struct kvm_lapic_state xapic; + uint8_t tpr; + + static_assert(IRQ_VECTOR >= 16, "invalid IRQ vector number"); + tpr = IRQ_VECTOR / 16; + if (!mask) + tpr--; + + vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic); + lapic_tpr_set(&xapic, tpr); + vcpu_ioctl(vcpu, KVM_SET_LAPIC, &xapic); +} + +static void test_tpr(bool __is_x2apic) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + bool done = false; + + is_x2apic = __is_x2apic; + + vm = vm_create_with_one_vcpu(&vcpu, tpr_guest_code); + if (is_x2apic) { + vm_install_exception_handler(vm, IRQ_VECTOR, + tpr_guest_irq_handler_x2apic); + } else { + vm_install_exception_handler(vm, IRQ_VECTOR, + tpr_guest_irq_handler_xapic); + vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_X2APIC); + virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); + } + + sync_global_to_guest(vcpu->vm, is_x2apic); + + /* According to the SDM/APM the TPR value at reset is 0 */ + test_tpr_check_tpr_zero(vcpu); + test_tpr_check_tpr_cr8_equal(vcpu); + + tpr_guest_irq_sync_flag_reset(); + sync_global_to_guest(vcpu->vm, tpr_guest_irq_sync_val); + + while (!done) { + struct ucall uc; + + alarm(2); + vcpu_run(vcpu); + alarm(0); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + test_tpr_check_tpr_cr8_equal(vcpu); + done = true; + break; + case UCALL_SYNC: + test_tpr_check_tpr_cr8_equal(vcpu); + test_tpr_set_tpr_for_irq(vcpu, uc.args[1]); + break; + default: + TEST_FAIL("Unknown ucall result 0x%lx", uc.cmd); + break; + } + } + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + /* + * Use separate VMs for the xAPIC and x2APIC tests so that x2APIC can + * be fully hidden from the guest. KVM disallows changing CPUID after + * KVM_RUN and AVIC is disabled if _any_ vCPU is allowed to use x2APIC. + */ + test_tpr(false); + test_tpr(true); +} -- cgit v1.2.3 From 7fe9f5366bd5d7ee3cfd9f66868a4410d6e4792d Mon Sep 17 00:00:00 2001 From: MJ Pooladkhay Date: Mon, 22 Dec 2025 17:42:07 +0000 Subject: KVM: selftests: Fix sign extension bug in get_desc64_base() The function get_desc64_base() performs a series of bitwise left shifts on fields of various sizes. More specifically, when performing '<< 24' on 'desc->base2' (which is a u8), 'base2' is promoted to a signed integer before shifting. In a scenario where base2 >= 0x80, the shift places a 1 into bit 31, causing the 32-bit intermediate value to become negative. When this result is cast to uint64_t or ORed into the return value, sign extension occurs, corrupting the upper 32 bits of the address (base3). Example: Given: base0 = 0x5000 base1 = 0xd6 base2 = 0xf8 base3 = 0xfffffe7c Expected return: 0xfffffe7cf8d65000 Actual return: 0xfffffffff8d65000 Fix this by explicitly casting the fields to 'uint64_t' before shifting to prevent sign extension. Signed-off-by: MJ Pooladkhay Link: https://patch.msgid.link/20251222174207.107331-1-mj@pooladkhay.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86/processor.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 57d62a425109..26a91bb73c93 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -436,8 +436,10 @@ struct kvm_x86_state { static inline uint64_t get_desc64_base(const struct desc64 *desc) { - return ((uint64_t)desc->base3 << 32) | - (desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24)); + return (uint64_t)desc->base3 << 32 | + (uint64_t)desc->base2 << 24 | + (uint64_t)desc->base1 << 16 | + (uint64_t)desc->base0; } static inline uint64_t rdtsc(void) -- cgit v1.2.3 From 69e81ed5e6a59c12c0c6756c3f0524e2ddb023f4 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:30 -0800 Subject: KVM: selftests: Make __vm_get_page_table_entry() static The function is only used in processor.c, drop the declaration in processor.h and make it static. No functional change intended. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-2-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86/processor.h | 2 -- tools/testing/selftests/kvm/lib/x86/processor.c | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 26a91bb73c93..1cb5b4c46b99 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1369,8 +1369,6 @@ static inline bool kvm_is_ignore_msrs(void) return get_kvm_param_bool("ignore_msrs"); } -uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, - int *level); uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr); uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index 36104d27f3d9..c14bf2b5f28f 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -306,8 +306,8 @@ static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level) return *level == current_level; } -uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, - int *level) +static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, + int *level) { int va_width = 12 + (vm->pgtable_levels) * 9; uint64_t *pte = &vm->pgd; -- cgit v1.2.3 From 97dfbdfea405a0820ccfcf00afdda4c0f47c3df8 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:31 -0800 Subject: KVM: selftests: Stop passing a memslot to nested_map_memslot() On x86, KVM selftests use memslot 0 for all the default regions used by the test infrastructure. This is an implementation detail. nested_map_memslot() is currently used to map the default regions by explicitly passing slot 0, which leaks the library implementation into the caller. Rename the function to a very verbose nested_identity_map_default_memslots() to reflect what it actually does. Add an assertion that only memslot 0 is being used so that the implementation does not change from under us. No functional change intended. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-3-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86/vmx.h | 4 ++-- tools/testing/selftests/kvm/lib/x86/vmx.c | 12 ++++++++---- tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c | 2 +- 3 files changed, 11 insertions(+), 7 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/include/x86/vmx.h b/tools/testing/selftests/kvm/include/x86/vmx.h index 96e2b4c630a9..91916b8aa94b 100644 --- a/tools/testing/selftests/kvm/include/x86/vmx.h +++ b/tools/testing/selftests/kvm/include/x86/vmx.h @@ -563,8 +563,8 @@ void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr); void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size); -void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t memslot); +void nested_identity_map_default_memslots(struct vmx_pages *vmx, + struct kvm_vm *vm); void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t addr, uint64_t size); bool kvm_cpu_has_ept(void); diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index 29b082a58daa..eec33ec63811 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -494,12 +494,16 @@ void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, /* Prepare an identity extended page table that maps all the * physical pages in VM. */ -void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t memslot) +void nested_identity_map_default_memslots(struct vmx_pages *vmx, + struct kvm_vm *vm) { + uint32_t s, memslot = 0; sparsebit_idx_t i, last; - struct userspace_mem_region *region = - memslot2region(vm, memslot); + struct userspace_mem_region *region = memslot2region(vm, memslot); + + /* Only memslot 0 is mapped here, ensure it's the only one being used */ + for (s = 0; s < NR_MEM_REGIONS; s++) + TEST_ASSERT_EQ(vm->memslots[s], 0); i = (region->region.guest_phys_addr >> vm->page_shift) - 1; last = i + (region->region.memory_size >> vm->page_shift); diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c index 98cb6bdab3e6..aab7333aaef0 100644 --- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c @@ -121,7 +121,7 @@ static void test_vmx_dirty_log(bool enable_ept) */ if (enable_ept) { prepare_eptp(vmx, vm); - nested_map_memslot(vmx, vm, 0); + nested_identity_map_default_memslots(vmx, vm); nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); } -- cgit v1.2.3 From 60de423781ad9967bfdd3a2f02ba0a31787b0d2c Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:32 -0800 Subject: KVM: selftests: Rename nested TDP mapping functions Rename the functions from nested_* to tdp_* to make their purpose clearer. No functional change intended. Suggested-by: Sean Christopherson Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-4-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86/vmx.h | 16 +++---- tools/testing/selftests/kvm/lib/x86/memstress.c | 4 +- tools/testing/selftests/kvm/lib/x86/vmx.c | 50 +++++++++++----------- .../testing/selftests/kvm/x86/vmx_dirty_log_test.c | 6 +-- 4 files changed, 37 insertions(+), 39 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/include/x86/vmx.h b/tools/testing/selftests/kvm/include/x86/vmx.h index 91916b8aa94b..04b8231d032a 100644 --- a/tools/testing/selftests/kvm/include/x86/vmx.h +++ b/tools/testing/selftests/kvm/include/x86/vmx.h @@ -559,14 +559,14 @@ bool load_vmcs(struct vmx_pages *vmx); bool ept_1g_pages_supported(void); -void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr); -void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint64_t size); -void nested_identity_map_default_memslots(struct vmx_pages *vmx, - struct kvm_vm *vm); -void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t addr, uint64_t size); +void tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t nested_paddr, + uint64_t paddr); +void tdp_map(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t nested_paddr, + uint64_t paddr, uint64_t size); +void tdp_identity_map_default_memslots(struct vmx_pages *vmx, + struct kvm_vm *vm); +void tdp_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t addr, uint64_t size); bool kvm_cpu_has_ept(void); void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm); void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm); diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c index 0b1f288ad556..1928b00bde51 100644 --- a/tools/testing/selftests/kvm/lib/x86/memstress.c +++ b/tools/testing/selftests/kvm/lib/x86/memstress.c @@ -70,11 +70,11 @@ void memstress_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm) * KVM can shadow the EPT12 with the maximum huge page size supported * by the backing source. */ - nested_identity_map_1g(vmx, vm, 0, 0x100000000ULL); + tdp_identity_map_1g(vmx, vm, 0, 0x100000000ULL); start = align_down(memstress_args.gpa, PG_SIZE_1G); end = align_up(memstress_args.gpa + memstress_args.size, PG_SIZE_1G); - nested_identity_map_1g(vmx, vm, start, end - start); + tdp_identity_map_1g(vmx, vm, start, end - start); } void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]) diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index eec33ec63811..1954ccdfc353 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -362,12 +362,12 @@ void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp) init_vmcs_guest_state(guest_rip, guest_rsp); } -static void nested_create_pte(struct kvm_vm *vm, - struct eptPageTableEntry *pte, - uint64_t nested_paddr, - uint64_t paddr, - int current_level, - int target_level) +static void tdp_create_pte(struct kvm_vm *vm, + struct eptPageTableEntry *pte, + uint64_t nested_paddr, + uint64_t paddr, + int current_level, + int target_level) { if (!pte->readable) { pte->writable = true; @@ -394,8 +394,8 @@ static void nested_create_pte(struct kvm_vm *vm, } -void __nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, int target_level) +void __tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, int target_level) { const uint64_t page_size = PG_LEVEL_SIZE(target_level); struct eptPageTableEntry *pt = vmx->eptp_hva, *pte; @@ -428,7 +428,7 @@ void __nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, index = (nested_paddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu; pte = &pt[index]; - nested_create_pte(vm, pte, nested_paddr, paddr, level, target_level); + tdp_create_pte(vm, pte, nested_paddr, paddr, level, target_level); if (pte->page_size) break; @@ -445,10 +445,10 @@ void __nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, } -void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr) +void tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr) { - __nested_pg_map(vmx, vm, nested_paddr, paddr, PG_LEVEL_4K); + __tdp_pg_map(vmx, vm, nested_paddr, paddr, PG_LEVEL_4K); } /* @@ -468,8 +468,8 @@ void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, * Within the VM given by vm, creates a nested guest translation for the * page range starting at nested_paddr to the page range starting at paddr. */ -void __nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint64_t size, +void __tdp_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, uint64_t size, int level) { size_t page_size = PG_LEVEL_SIZE(level); @@ -479,23 +479,23 @@ void __nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); while (npages--) { - __nested_pg_map(vmx, vm, nested_paddr, paddr, level); + __tdp_pg_map(vmx, vm, nested_paddr, paddr, level); nested_paddr += page_size; paddr += page_size; } } -void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint64_t size) +void tdp_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, uint64_t size) { - __nested_map(vmx, vm, nested_paddr, paddr, size, PG_LEVEL_4K); + __tdp_map(vmx, vm, nested_paddr, paddr, size, PG_LEVEL_4K); } /* Prepare an identity extended page table that maps all the * physical pages in VM. */ -void nested_identity_map_default_memslots(struct vmx_pages *vmx, - struct kvm_vm *vm) +void tdp_identity_map_default_memslots(struct vmx_pages *vmx, + struct kvm_vm *vm) { uint32_t s, memslot = 0; sparsebit_idx_t i, last; @@ -512,18 +512,16 @@ void nested_identity_map_default_memslots(struct vmx_pages *vmx, if (i > last) break; - nested_map(vmx, vm, - (uint64_t)i << vm->page_shift, - (uint64_t)i << vm->page_shift, - 1 << vm->page_shift); + tdp_map(vmx, vm, (uint64_t)i << vm->page_shift, + (uint64_t)i << vm->page_shift, 1 << vm->page_shift); } } /* Identity map a region with 1GiB Pages. */ -void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, +void tdp_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t addr, uint64_t size) { - __nested_map(vmx, vm, addr, addr, size, PG_LEVEL_1G); + __tdp_map(vmx, vm, addr, addr, size, PG_LEVEL_1G); } bool kvm_cpu_has_ept(void) diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c index aab7333aaef0..e7d0c08ba29d 100644 --- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c @@ -121,9 +121,9 @@ static void test_vmx_dirty_log(bool enable_ept) */ if (enable_ept) { prepare_eptp(vmx, vm); - nested_identity_map_default_memslots(vmx, vm); - nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); - nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); + tdp_identity_map_default_memslots(vmx, vm); + tdp_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); + tdp_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); } bmap = bitmap_zalloc(TEST_MEM_PAGES); -- cgit v1.2.3 From b320c03d685704df51cf0774edd799e96c505c74 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:33 -0800 Subject: KVM: selftests: Kill eptPageTablePointer Replace the struct overlay with explicit bitmasks, which is clearer and less error-prone. See commit f18b4aebe107 ("kvm: selftests: do not use bitfields larger than 32-bits for PTEs") for an example of why bitfields are not preferable. Remove the unused PAGE_SHIFT_4K definition while at it. No functional change intended. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-5-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/x86/vmx.c | 37 ++++++++++++++----------------- 1 file changed, 17 insertions(+), 20 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index 1954ccdfc353..85043bb1ec4d 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -10,10 +10,16 @@ #include "processor.h" #include "vmx.h" -#define PAGE_SHIFT_4K 12 - #define KVM_EPT_PAGE_TABLE_MIN_PADDR 0x1c0000 +#define EPTP_MT_SHIFT 0 /* EPTP memtype bits 2:0 */ +#define EPTP_PWL_SHIFT 3 /* EPTP page walk length bits 5:3 */ +#define EPTP_AD_ENABLED_SHIFT 6 /* EPTP AD enabled bit 6 */ + +#define EPTP_WB (X86_MEMTYPE_WB << EPTP_MT_SHIFT) +#define EPTP_PWL_4 (3ULL << EPTP_PWL_SHIFT) /* PWL is (levels - 1) */ +#define EPTP_AD_ENABLED (1ULL << EPTP_AD_ENABLED_SHIFT) + bool enable_evmcs; struct hv_enlightened_vmcs *current_evmcs; @@ -34,14 +40,6 @@ struct eptPageTableEntry { uint64_t suppress_ve:1; }; -struct eptPageTablePointer { - uint64_t memory_type:3; - uint64_t page_walk_length:3; - uint64_t ad_enabled:1; - uint64_t reserved_11_07:5; - uint64_t address:40; - uint64_t reserved_63_52:12; -}; int vcpu_enable_evmcs(struct kvm_vcpu *vcpu) { uint16_t evmcs_ver; @@ -196,16 +194,15 @@ static inline void init_vmcs_control_fields(struct vmx_pages *vmx) vmwrite(PIN_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PINBASED_CTLS)); if (vmx->eptp_gpa) { - uint64_t ept_paddr; - struct eptPageTablePointer eptp = { - .memory_type = X86_MEMTYPE_WB, - .page_walk_length = 3, /* + 1 */ - .ad_enabled = ept_vpid_cap_supported(VMX_EPT_VPID_CAP_AD_BITS), - .address = vmx->eptp_gpa >> PAGE_SHIFT_4K, - }; - - memcpy(&ept_paddr, &eptp, sizeof(ept_paddr)); - vmwrite(EPT_POINTER, ept_paddr); + uint64_t eptp = vmx->eptp_gpa | EPTP_WB | EPTP_PWL_4; + + TEST_ASSERT((vmx->eptp_gpa & ~PHYSICAL_PAGE_MASK) == 0, + "Illegal bits set in vmx->eptp_gpa"); + + if (ept_vpid_cap_supported(VMX_EPT_VPID_CAP_AD_BITS)) + eptp |= EPTP_AD_ENABLED; + + vmwrite(EPT_POINTER, eptp); sec_exec_ctl |= SECONDARY_EXEC_ENABLE_EPT; } -- cgit v1.2.3 From 3cd5002807bebf504cbb6645e73d01204324e54a Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:34 -0800 Subject: KVM: selftests: Stop setting A/D bits when creating EPT PTEs Stop setting Accessed/Dirty bits when creating EPT entries for L2 so that the stage-1 and stage-2 (a.k.a. TDP) page table APIs can use common code without bleeding the EPT hack into the common APIs. While commit 094444204570 ("selftests: kvm: add test for dirty logging inside nested guests") is _very_ light on details, the most likely explanation is that vmx_dirty_log_test was attempting to avoid taking an EPT Violation on the first _write_ from L2. static void l2_guest_code(u64 *a, u64 *b) { READ_ONCE(*a); WRITE_ONCE(*a, 1); <=== GUEST_SYNC(true); ... } When handling read faults in the shadow MMU, KVM opportunistically creates a writable SPTE if the mapping can be writable *and* the gPTE is dirty (or doesn't support the Dirty bit), i.e. if KVM doesn't need to intercept writes in order to emulate Dirty-bit updates. By setting A/D bits in the test's EPT entries, the above READ+WRITE will fault only on the read, and in theory expose the bug fixed by KVM commit 1f4e5fc83a42 ("KVM: x86: fix nested guest live migration with PML"). If the Dirty bit is NOT set, the test will get a false pass due; though again, in theory. However, the test is flawed (and always was, at least in the versions posted publicly), as KVM (correctly) marks the corresponding L1 GFN as dirty (in the dirty bitmap) when creating the writable SPTE. I.e. without a check on the dirty bitmap after the READ_ONCE(), the check after the first WRITE_ONCE() will get a false pass due to the dirty bitmap/log having been updated by the read fault, not by PML. Furthermore, the subsequent behavior in the test's l2_guest_code() effectively hides the flawed test behavior, as the straight writes to a new L2 GPA fault also trigger the KVM bug, and so the test will still detect the failure due to lack of isolation between the two testcases (Read=>Write vs. Write=>Write). WRITE_ONCE(*b, 1); GUEST_SYNC(true); WRITE_ONCE(*b, 1); GUEST_SYNC(true); GUEST_SYNC(false); Punt on fixing vmx_dirty_log_test for the moment as it will be easier to properly fix the test once the TDP code uses the common MMU APIs, at which point it will be trivially easy for the test to retrieve the EPT PTE and set the Dirty bit as needed. Signed-off-by: Yosry Ahmed [sean: rewrite changelog to explain the situation] Link: https://patch.msgid.link/20251230230150.4150236-6-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/x86/vmx.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index 85043bb1ec4d..a3e2eae981da 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -432,14 +432,6 @@ void __tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, pt = addr_gpa2hva(vm, pte->address * vm->page_size); } - - /* - * For now mark these as accessed and dirty because the only - * testcase we have needs that. Can be reconsidered later. - */ - pte->accessed = true; - pte->dirty = true; - } void tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, -- cgit v1.2.3 From 9f073ac25b4c4cf3b3ea13b155035108c54148bb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 15:01:35 -0800 Subject: KVM: selftests: Add "struct kvm_mmu" to track a given MMU instance Add a "struct kvm_mmu" to track a given MMU instance, e.g. a VM's stage-1 MMU versus a VM's stage-2 MMU, so that x86 can share MMU functionality for both stage-1 and stage-2 MMUs, without creating the potential for subtle bugs, e.g. due to consuming on vm->pgtable_levels when operating a stage-2 MMU. Encapsulate the existing de facto MMU in "struct kvm_vm", e.g instead of burying the MMU details in "struct kvm_vm_arch", to avoid more #ifdefs in ____vm_create(), and in the hopes that other architectures can utilize the formalized MMU structure if/when they too support stage-2 page tables. No functional change intended. Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-7-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/kvm_util.h | 11 +++++-- tools/testing/selftests/kvm/lib/arm64/processor.c | 38 +++++++++++----------- tools/testing/selftests/kvm/lib/kvm_util.c | 28 ++++++++-------- .../selftests/kvm/lib/loongarch/processor.c | 28 ++++++++-------- tools/testing/selftests/kvm/lib/riscv/processor.c | 31 +++++++++--------- tools/testing/selftests/kvm/lib/s390/processor.c | 16 ++++----- tools/testing/selftests/kvm/lib/x86/processor.c | 28 ++++++++-------- .../selftests/kvm/x86/vmx_nested_la57_state_test.c | 2 +- 8 files changed, 94 insertions(+), 88 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 81f4355ff28a..39558c05c0bf 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -88,12 +88,17 @@ enum kvm_mem_region_type { NR_MEM_REGIONS, }; +struct kvm_mmu { + bool pgd_created; + uint64_t pgd; + int pgtable_levels; +}; + struct kvm_vm { int mode; unsigned long type; int kvm_fd; int fd; - unsigned int pgtable_levels; unsigned int page_size; unsigned int page_shift; unsigned int pa_bits; @@ -104,13 +109,13 @@ struct kvm_vm { struct sparsebit *vpages_valid; struct sparsebit *vpages_mapped; bool has_irqchip; - bool pgd_created; vm_paddr_t ucall_mmio_addr; - vm_paddr_t pgd; vm_vaddr_t handlers; uint32_t dirty_ring_size; uint64_t gpa_tag_mask; + struct kvm_mmu mmu; + struct kvm_vm_arch arch; struct kvm_binary_stats stats; diff --git a/tools/testing/selftests/kvm/lib/arm64/processor.c b/tools/testing/selftests/kvm/lib/arm64/processor.c index d46e4b13b92c..c40f59d48311 100644 --- a/tools/testing/selftests/kvm/lib/arm64/processor.c +++ b/tools/testing/selftests/kvm/lib/arm64/processor.c @@ -28,7 +28,7 @@ static uint64_t page_align(struct kvm_vm *vm, uint64_t v) static uint64_t pgd_index(struct kvm_vm *vm, vm_vaddr_t gva) { - unsigned int shift = (vm->pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift; + unsigned int shift = (vm->mmu.pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift; uint64_t mask = (1UL << (vm->va_bits - shift)) - 1; return (gva >> shift) & mask; @@ -39,7 +39,7 @@ static uint64_t pud_index(struct kvm_vm *vm, vm_vaddr_t gva) unsigned int shift = 2 * (vm->page_shift - 3) + vm->page_shift; uint64_t mask = (1UL << (vm->page_shift - 3)) - 1; - TEST_ASSERT(vm->pgtable_levels == 4, + TEST_ASSERT(vm->mmu.pgtable_levels == 4, "Mode %d does not have 4 page table levels", vm->mode); return (gva >> shift) & mask; @@ -50,7 +50,7 @@ static uint64_t pmd_index(struct kvm_vm *vm, vm_vaddr_t gva) unsigned int shift = (vm->page_shift - 3) + vm->page_shift; uint64_t mask = (1UL << (vm->page_shift - 3)) - 1; - TEST_ASSERT(vm->pgtable_levels >= 3, + TEST_ASSERT(vm->mmu.pgtable_levels >= 3, "Mode %d does not have >= 3 page table levels", vm->mode); return (gva >> shift) & mask; @@ -104,7 +104,7 @@ static uint64_t pte_addr(struct kvm_vm *vm, uint64_t pte) static uint64_t ptrs_per_pgd(struct kvm_vm *vm) { - unsigned int shift = (vm->pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift; + unsigned int shift = (vm->mmu.pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift; return 1 << (vm->va_bits - shift); } @@ -117,13 +117,13 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) { size_t nr_pages = page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size; - if (vm->pgd_created) + if (vm->mmu.pgd_created) return; - vm->pgd = vm_phy_pages_alloc(vm, nr_pages, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, - vm->memslots[MEM_REGION_PT]); - vm->pgd_created = true; + vm->mmu.pgd = vm_phy_pages_alloc(vm, nr_pages, + KVM_GUEST_PAGE_TABLE_MIN_PADDR, + vm->memslots[MEM_REGION_PT]); + vm->mmu.pgd_created = true; } static void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, @@ -147,12 +147,12 @@ static void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", paddr, vm->max_gfn, vm->page_size); - ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, vaddr) * 8; + ptep = addr_gpa2hva(vm, vm->mmu.pgd) + pgd_index(vm, vaddr) * 8; if (!*ptep) *ptep = addr_pte(vm, vm_alloc_page_table(vm), PGD_TYPE_TABLE | PTE_VALID); - switch (vm->pgtable_levels) { + switch (vm->mmu.pgtable_levels) { case 4: ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, vaddr) * 8; if (!*ptep) @@ -190,16 +190,16 @@ uint64_t *virt_get_pte_hva_at_level(struct kvm_vm *vm, vm_vaddr_t gva, int level { uint64_t *ptep; - if (!vm->pgd_created) + if (!vm->mmu.pgd_created) goto unmapped_gva; - ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, gva) * 8; + ptep = addr_gpa2hva(vm, vm->mmu.pgd) + pgd_index(vm, gva) * 8; if (!ptep) goto unmapped_gva; if (level == 0) return ptep; - switch (vm->pgtable_levels) { + switch (vm->mmu.pgtable_levels) { case 4: ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, gva) * 8; if (!ptep) @@ -263,13 +263,13 @@ static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent, uint64_t p void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { - int level = 4 - (vm->pgtable_levels - 1); + int level = 4 - (vm->mmu.pgtable_levels - 1); uint64_t pgd, *ptep; - if (!vm->pgd_created) + if (!vm->mmu.pgd_created) return; - for (pgd = vm->pgd; pgd < vm->pgd + ptrs_per_pgd(vm) * 8; pgd += 8) { + for (pgd = vm->mmu.pgd; pgd < vm->mmu.pgd + ptrs_per_pgd(vm) * 8; pgd += 8) { ptep = addr_gpa2hva(vm, pgd); if (!*ptep) continue; @@ -350,7 +350,7 @@ void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); } - ttbr0_el1 = vm->pgd & GENMASK(47, vm->page_shift); + ttbr0_el1 = vm->mmu.pgd & GENMASK(47, vm->page_shift); /* Configure output size */ switch (vm->mode) { @@ -358,7 +358,7 @@ void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) case VM_MODE_P52V48_16K: case VM_MODE_P52V48_64K: tcr_el1 |= TCR_IPS_52_BITS; - ttbr0_el1 |= FIELD_GET(GENMASK(51, 48), vm->pgd) << 2; + ttbr0_el1 |= FIELD_GET(GENMASK(51, 48), vm->mmu.pgd) << 2; break; case VM_MODE_P48V48_4K: case VM_MODE_P48V48_16K: diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 8279b6ced8d2..65752daeed90 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -281,34 +281,34 @@ struct kvm_vm *____vm_create(struct vm_shape shape) /* Setup mode specific traits. */ switch (vm->mode) { case VM_MODE_P52V48_4K: - vm->pgtable_levels = 4; + vm->mmu.pgtable_levels = 4; break; case VM_MODE_P52V48_64K: - vm->pgtable_levels = 3; + vm->mmu.pgtable_levels = 3; break; case VM_MODE_P48V48_4K: - vm->pgtable_levels = 4; + vm->mmu.pgtable_levels = 4; break; case VM_MODE_P48V48_64K: - vm->pgtable_levels = 3; + vm->mmu.pgtable_levels = 3; break; case VM_MODE_P40V48_4K: case VM_MODE_P36V48_4K: - vm->pgtable_levels = 4; + vm->mmu.pgtable_levels = 4; break; case VM_MODE_P40V48_64K: case VM_MODE_P36V48_64K: - vm->pgtable_levels = 3; + vm->mmu.pgtable_levels = 3; break; case VM_MODE_P52V48_16K: case VM_MODE_P48V48_16K: case VM_MODE_P40V48_16K: case VM_MODE_P36V48_16K: - vm->pgtable_levels = 4; + vm->mmu.pgtable_levels = 4; break; case VM_MODE_P47V47_16K: case VM_MODE_P36V47_16K: - vm->pgtable_levels = 3; + vm->mmu.pgtable_levels = 3; break; case VM_MODE_PXXVYY_4K: #ifdef __x86_64__ @@ -321,22 +321,22 @@ struct kvm_vm *____vm_create(struct vm_shape shape) vm->va_bits); if (vm->va_bits == 57) { - vm->pgtable_levels = 5; + vm->mmu.pgtable_levels = 5; } else { TEST_ASSERT(vm->va_bits == 48, "Unexpected guest virtual address width: %d", vm->va_bits); - vm->pgtable_levels = 4; + vm->mmu.pgtable_levels = 4; } #else TEST_FAIL("VM_MODE_PXXVYY_4K not supported on non-x86 platforms"); #endif break; case VM_MODE_P47V64_4K: - vm->pgtable_levels = 5; + vm->mmu.pgtable_levels = 5; break; case VM_MODE_P44V64_4K: - vm->pgtable_levels = 5; + vm->mmu.pgtable_levels = 5; break; default: TEST_FAIL("Unknown guest mode: 0x%x", vm->mode); @@ -1956,8 +1956,8 @@ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) fprintf(stream, "%*sMapped Virtual Pages:\n", indent, ""); sparsebit_dump(stream, vm->vpages_mapped, indent + 2); fprintf(stream, "%*spgd_created: %u\n", indent, "", - vm->pgd_created); - if (vm->pgd_created) { + vm->mmu.pgd_created); + if (vm->mmu.pgd_created) { fprintf(stream, "%*sVirtual Translation Tables:\n", indent + 2, ""); virt_dump(stream, vm, indent + 4); diff --git a/tools/testing/selftests/kvm/lib/loongarch/processor.c b/tools/testing/selftests/kvm/lib/loongarch/processor.c index 07c103369ddb..17aa55a2047a 100644 --- a/tools/testing/selftests/kvm/lib/loongarch/processor.c +++ b/tools/testing/selftests/kvm/lib/loongarch/processor.c @@ -50,11 +50,11 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) int i; vm_paddr_t child, table; - if (vm->pgd_created) + if (vm->mmu.pgd_created) return; child = table = 0; - for (i = 0; i < vm->pgtable_levels; i++) { + for (i = 0; i < vm->mmu.pgtable_levels; i++) { invalid_pgtable[i] = child; table = vm_phy_page_alloc(vm, LOONGARCH_PAGE_TABLE_PHYS_MIN, vm->memslots[MEM_REGION_PT]); @@ -62,8 +62,8 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) virt_set_pgtable(vm, table, child); child = table; } - vm->pgd = table; - vm->pgd_created = true; + vm->mmu.pgd = table; + vm->mmu.pgd_created = true; } static int virt_pte_none(uint64_t *ptep, int level) @@ -77,11 +77,11 @@ static uint64_t *virt_populate_pte(struct kvm_vm *vm, vm_vaddr_t gva, int alloc) uint64_t *ptep; vm_paddr_t child; - if (!vm->pgd_created) + if (!vm->mmu.pgd_created) goto unmapped_gva; - child = vm->pgd; - level = vm->pgtable_levels - 1; + child = vm->mmu.pgd; + level = vm->mmu.pgtable_levels - 1; while (level > 0) { ptep = addr_gpa2hva(vm, child) + virt_pte_index(vm, gva, level) * 8; if (virt_pte_none(ptep, level)) { @@ -161,11 +161,11 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { int level; - if (!vm->pgd_created) + if (!vm->mmu.pgd_created) return; - level = vm->pgtable_levels - 1; - pte_dump(stream, vm, indent, vm->pgd, level); + level = vm->mmu.pgtable_levels - 1; + pte_dump(stream, vm, indent, vm->mmu.pgd, level); } void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent) @@ -297,7 +297,7 @@ static void loongarch_vcpu_setup(struct kvm_vcpu *vcpu) width = vm->page_shift - 3; - switch (vm->pgtable_levels) { + switch (vm->mmu.pgtable_levels) { case 4: /* pud page shift and width */ val = (vm->page_shift + width * 2) << 20 | (width << 25); @@ -309,15 +309,15 @@ static void loongarch_vcpu_setup(struct kvm_vcpu *vcpu) val |= vm->page_shift | width << 5; break; default: - TEST_FAIL("Got %u page table levels, expected 3 or 4", vm->pgtable_levels); + TEST_FAIL("Got %u page table levels, expected 3 or 4", vm->mmu.pgtable_levels); } loongarch_set_csr(vcpu, LOONGARCH_CSR_PWCTL0, val); /* PGD page shift and width */ - val = (vm->page_shift + width * (vm->pgtable_levels - 1)) | width << 6; + val = (vm->page_shift + width * (vm->mmu.pgtable_levels - 1)) | width << 6; loongarch_set_csr(vcpu, LOONGARCH_CSR_PWCTL1, val); - loongarch_set_csr(vcpu, LOONGARCH_CSR_PGDL, vm->pgd); + loongarch_set_csr(vcpu, LOONGARCH_CSR_PGDL, vm->mmu.pgd); /* * Refill exception runs on real mode diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c index 2eac7d4b59e9..e6ec7c224fc3 100644 --- a/tools/testing/selftests/kvm/lib/riscv/processor.c +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -60,7 +60,7 @@ static uint64_t pte_index(struct kvm_vm *vm, vm_vaddr_t gva, int level) { TEST_ASSERT(level > -1, "Negative page table level (%d) not possible", level); - TEST_ASSERT(level < vm->pgtable_levels, + TEST_ASSERT(level < vm->mmu.pgtable_levels, "Invalid page table level (%d)", level); return (gva & pte_index_mask[level]) >> pte_index_shift[level]; @@ -70,19 +70,19 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) { size_t nr_pages = page_align(vm, ptrs_per_pte(vm) * 8) / vm->page_size; - if (vm->pgd_created) + if (vm->mmu.pgd_created) return; - vm->pgd = vm_phy_pages_alloc(vm, nr_pages, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, - vm->memslots[MEM_REGION_PT]); - vm->pgd_created = true; + vm->mmu.pgd = vm_phy_pages_alloc(vm, nr_pages, + KVM_GUEST_PAGE_TABLE_MIN_PADDR, + vm->memslots[MEM_REGION_PT]); + vm->mmu.pgd_created = true; } void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) { uint64_t *ptep, next_ppn; - int level = vm->pgtable_levels - 1; + int level = vm->mmu.pgtable_levels - 1; TEST_ASSERT((vaddr % vm->page_size) == 0, "Virtual address not on page boundary,\n" @@ -98,7 +98,7 @@ void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", paddr, vm->max_gfn, vm->page_size); - ptep = addr_gpa2hva(vm, vm->pgd) + pte_index(vm, vaddr, level) * 8; + ptep = addr_gpa2hva(vm, vm->mmu.pgd) + pte_index(vm, vaddr, level) * 8; if (!*ptep) { next_ppn = vm_alloc_page_table(vm) >> PGTBL_PAGE_SIZE_SHIFT; *ptep = (next_ppn << PGTBL_PTE_ADDR_SHIFT) | @@ -126,12 +126,12 @@ void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) { uint64_t *ptep; - int level = vm->pgtable_levels - 1; + int level = vm->mmu.pgtable_levels - 1; - if (!vm->pgd_created) + if (!vm->mmu.pgd_created) goto unmapped_gva; - ptep = addr_gpa2hva(vm, vm->pgd) + pte_index(vm, gva, level) * 8; + ptep = addr_gpa2hva(vm, vm->mmu.pgd) + pte_index(vm, gva, level) * 8; if (!ptep) goto unmapped_gva; level--; @@ -176,13 +176,14 @@ static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent, void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { - int level = vm->pgtable_levels - 1; + struct kvm_mmu *mmu = &vm->mmu; + int level = mmu->pgtable_levels - 1; uint64_t pgd, *ptep; - if (!vm->pgd_created) + if (!mmu->pgd_created) return; - for (pgd = vm->pgd; pgd < vm->pgd + ptrs_per_pte(vm) * 8; pgd += 8) { + for (pgd = mmu->pgd; pgd < mmu->pgd + ptrs_per_pte(vm) * 8; pgd += 8) { ptep = addr_gpa2hva(vm, pgd); if (!*ptep) continue; @@ -211,7 +212,7 @@ void riscv_vcpu_mmu_setup(struct kvm_vcpu *vcpu) TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); } - satp = (vm->pgd >> PGTBL_PAGE_SIZE_SHIFT) & SATP_PPN; + satp = (vm->mmu.pgd >> PGTBL_PAGE_SIZE_SHIFT) & SATP_PPN; satp |= SATP_MODE_48; vcpu_set_reg(vcpu, RISCV_GENERAL_CSR_REG(satp), satp); diff --git a/tools/testing/selftests/kvm/lib/s390/processor.c b/tools/testing/selftests/kvm/lib/s390/processor.c index 8ceeb17c819a..6a9a660413a7 100644 --- a/tools/testing/selftests/kvm/lib/s390/processor.c +++ b/tools/testing/selftests/kvm/lib/s390/processor.c @@ -17,7 +17,7 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x", vm->page_size); - if (vm->pgd_created) + if (vm->mmu.pgd_created) return; paddr = vm_phy_pages_alloc(vm, PAGES_PER_REGION, @@ -25,8 +25,8 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) vm->memslots[MEM_REGION_PT]); memset(addr_gpa2hva(vm, paddr), 0xff, PAGES_PER_REGION * vm->page_size); - vm->pgd = paddr; - vm->pgd_created = true; + vm->mmu.pgd = paddr; + vm->mmu.pgd_created = true; } /* @@ -70,7 +70,7 @@ void virt_arch_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa) gva, vm->max_gfn, vm->page_size); /* Walk through region and segment tables */ - entry = addr_gpa2hva(vm, vm->pgd); + entry = addr_gpa2hva(vm, vm->mmu.pgd); for (ri = 1; ri <= 4; ri++) { idx = (gva >> (64 - 11 * ri)) & 0x7ffu; if (entry[idx] & REGION_ENTRY_INVALID) @@ -94,7 +94,7 @@ vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x", vm->page_size); - entry = addr_gpa2hva(vm, vm->pgd); + entry = addr_gpa2hva(vm, vm->mmu.pgd); for (ri = 1; ri <= 4; ri++) { idx = (gva >> (64 - 11 * ri)) & 0x7ffu; TEST_ASSERT(!(entry[idx] & REGION_ENTRY_INVALID), @@ -149,10 +149,10 @@ static void virt_dump_region(FILE *stream, struct kvm_vm *vm, uint8_t indent, void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { - if (!vm->pgd_created) + if (!vm->mmu.pgd_created) return; - virt_dump_region(stream, vm, indent, vm->pgd); + virt_dump_region(stream, vm, indent, vm->mmu.pgd); } void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code) @@ -184,7 +184,7 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) vcpu_sregs_get(vcpu, &sregs); sregs.crs[0] |= 0x00040000; /* Enable floating point regs */ - sregs.crs[1] = vm->pgd | 0xf; /* Primary region table */ + sregs.crs[1] = vm->mmu.pgd | 0xf; /* Primary region table */ vcpu_sregs_set(vcpu, &sregs); vcpu->run->psw_mask = 0x0400000180000000ULL; /* DAT enabled + 64 bit mode */ diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index c14bf2b5f28f..f027f86d1535 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -162,9 +162,9 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) "Unknown or unsupported guest mode: 0x%x", vm->mode); /* If needed, create the top-level page table. */ - if (!vm->pgd_created) { - vm->pgd = vm_alloc_page_table(vm); - vm->pgd_created = true; + if (!vm->mmu.pgd_created) { + vm->mmu.pgd = vm_alloc_page_table(vm); + vm->mmu.pgd_created = true; } } @@ -175,7 +175,7 @@ static void *virt_get_pte(struct kvm_vm *vm, uint64_t *parent_pte, uint64_t *page_table = addr_gpa2hva(vm, pt_gpa); int index = (vaddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu; - TEST_ASSERT((*parent_pte & PTE_PRESENT_MASK) || parent_pte == &vm->pgd, + TEST_ASSERT((*parent_pte & PTE_PRESENT_MASK) || parent_pte == &vm->mmu.pgd, "Parent PTE (level %d) not PRESENT for gva: 0x%08lx", level + 1, vaddr); @@ -218,7 +218,7 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) { const uint64_t pg_size = PG_LEVEL_SIZE(level); - uint64_t *pte = &vm->pgd; + uint64_t *pte = &vm->mmu.pgd; int current_level; TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, @@ -243,7 +243,7 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) * Allocate upper level page tables, if not already present. Return * early if a hugepage was created. */ - for (current_level = vm->pgtable_levels; + for (current_level = vm->mmu.pgtable_levels; current_level > PG_LEVEL_4K; current_level--) { pte = virt_create_upper_pte(vm, pte, vaddr, paddr, @@ -309,14 +309,14 @@ static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level) static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, int *level) { - int va_width = 12 + (vm->pgtable_levels) * 9; - uint64_t *pte = &vm->pgd; + int va_width = 12 + (vm->mmu.pgtable_levels) * 9; + uint64_t *pte = &vm->mmu.pgd; int current_level; TEST_ASSERT(!vm->arch.is_pt_protected, "Walking page tables of protected guests is impossible"); - TEST_ASSERT(*level >= PG_LEVEL_NONE && *level <= vm->pgtable_levels, + TEST_ASSERT(*level >= PG_LEVEL_NONE && *level <= vm->mmu.pgtable_levels, "Invalid PG_LEVEL_* '%d'", *level); TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, @@ -332,7 +332,7 @@ static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, (((int64_t)vaddr << (64 - va_width) >> (64 - va_width))), "Canonical check failed. The virtual address is invalid."); - for (current_level = vm->pgtable_levels; + for (current_level = vm->mmu.pgtable_levels; current_level > PG_LEVEL_4K; current_level--) { pte = virt_get_pte(vm, pte, vaddr, current_level); @@ -357,7 +357,7 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) uint64_t *pde, *pde_start; uint64_t *pte, *pte_start; - if (!vm->pgd_created) + if (!vm->mmu.pgd_created) return; fprintf(stream, "%*s " @@ -365,7 +365,7 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) fprintf(stream, "%*s index hvaddr gpaddr " "addr w exec dirty\n", indent, ""); - pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->pgd); + pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->mmu.pgd); for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) { pml4e = &pml4e_start[n1]; if (!(*pml4e & PTE_PRESENT_MASK)) @@ -538,7 +538,7 @@ static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR; if (kvm_cpu_has(X86_FEATURE_XSAVE)) sregs.cr4 |= X86_CR4_OSXSAVE; - if (vm->pgtable_levels == 5) + if (vm->mmu.pgtable_levels == 5) sregs.cr4 |= X86_CR4_LA57; sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); @@ -549,7 +549,7 @@ static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) kvm_seg_set_kernel_data_64bit(&sregs.gs); kvm_seg_set_tss_64bit(vm->arch.tss, &sregs.tr); - sregs.cr3 = vm->pgd; + sregs.cr3 = vm->mmu.pgd; vcpu_sregs_set(vcpu, &sregs); } diff --git a/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c b/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c index cf1d2d1f2a8f..915c42001dba 100644 --- a/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c @@ -90,7 +90,7 @@ int main(int argc, char *argv[]) * L1 needs to read its own PML5 table to set up L2. Identity map * the PML5 table to facilitate this. */ - virt_map(vm, vm->pgd, vm->pgd, 1); + virt_map(vm, vm->mmu.pgd, vm->mmu.pgd, 1); vcpu_alloc_vmx(vm, &vmx_pages_gva); vcpu_args_set(vcpu, 1, vmx_pages_gva); -- cgit v1.2.3 From 11825209f5494098da6cab666d8a767650c1c0cb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 15:01:36 -0800 Subject: KVM: selftests: Plumb "struct kvm_mmu" into x86's MMU APIs In preparation for generalizing the x86 virt mapping APIs to work with TDP (stage-2) page tables, plumb "struct kvm_mmu" into all of the helper functions instead of operating on vm->mmu directly. Opportunistically swap the order of the check in virt_get_pte() to first assert that the parent is the PGD, and then check that the PTE is present, as it makes more sense to check if the parent PTE is the PGD/root (i.e. not a PTE) before checking that the PTE is PRESENT. No functional change intended. Suggested-by: Sean Christopherson Signed-off-by: Yosry Ahmed [sean: rebase on common kvm_mmu structure, rewrite changelog] Link: https://patch.msgid.link/20251230230150.4150236-8-seanjc@google.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/include/x86/processor.h | 3 +- tools/testing/selftests/kvm/lib/x86/processor.c | 64 +++++++++++++--------- 2 files changed, 39 insertions(+), 28 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 1cb5b4c46b99..43970a96baed 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1451,7 +1451,8 @@ enum pg_level { #define PG_SIZE_2M PG_LEVEL_SIZE(PG_LEVEL_2M) #define PG_SIZE_1G PG_LEVEL_SIZE(PG_LEVEL_1G) -void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level); +void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, + uint64_t paddr, int level); void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, uint64_t nr_bytes, int level); diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index f027f86d1535..f25742a804b0 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -156,26 +156,31 @@ bool kvm_is_tdp_enabled(void) return get_kvm_amd_param_bool("npt"); } +static void virt_mmu_init(struct kvm_vm *vm, struct kvm_mmu *mmu) +{ + /* If needed, create the top-level page table. */ + if (!mmu->pgd_created) { + mmu->pgd = vm_alloc_page_table(vm); + mmu->pgd_created = true; + } +} + void virt_arch_pgd_alloc(struct kvm_vm *vm) { TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, "Unknown or unsupported guest mode: 0x%x", vm->mode); - /* If needed, create the top-level page table. */ - if (!vm->mmu.pgd_created) { - vm->mmu.pgd = vm_alloc_page_table(vm); - vm->mmu.pgd_created = true; - } + virt_mmu_init(vm, &vm->mmu); } -static void *virt_get_pte(struct kvm_vm *vm, uint64_t *parent_pte, - uint64_t vaddr, int level) +static void *virt_get_pte(struct kvm_vm *vm, struct kvm_mmu *mmu, + uint64_t *parent_pte, uint64_t vaddr, int level) { uint64_t pt_gpa = PTE_GET_PA(*parent_pte); uint64_t *page_table = addr_gpa2hva(vm, pt_gpa); int index = (vaddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu; - TEST_ASSERT((*parent_pte & PTE_PRESENT_MASK) || parent_pte == &vm->mmu.pgd, + TEST_ASSERT((*parent_pte == mmu->pgd) || (*parent_pte & PTE_PRESENT_MASK), "Parent PTE (level %d) not PRESENT for gva: 0x%08lx", level + 1, vaddr); @@ -183,13 +188,14 @@ static void *virt_get_pte(struct kvm_vm *vm, uint64_t *parent_pte, } static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, + struct kvm_mmu *mmu, uint64_t *parent_pte, uint64_t vaddr, uint64_t paddr, int current_level, int target_level) { - uint64_t *pte = virt_get_pte(vm, parent_pte, vaddr, current_level); + uint64_t *pte = virt_get_pte(vm, mmu, parent_pte, vaddr, current_level); paddr = vm_untag_gpa(vm, paddr); @@ -215,10 +221,11 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, return pte; } -void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) +void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, + uint64_t paddr, int level) { const uint64_t pg_size = PG_LEVEL_SIZE(level); - uint64_t *pte = &vm->mmu.pgd; + uint64_t *pte = &mmu->pgd; int current_level; TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, @@ -243,17 +250,17 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) * Allocate upper level page tables, if not already present. Return * early if a hugepage was created. */ - for (current_level = vm->mmu.pgtable_levels; + for (current_level = mmu->pgtable_levels; current_level > PG_LEVEL_4K; current_level--) { - pte = virt_create_upper_pte(vm, pte, vaddr, paddr, + pte = virt_create_upper_pte(vm, mmu, pte, vaddr, paddr, current_level, level); if (*pte & PTE_LARGE_MASK) return; } /* Fill in page table entry. */ - pte = virt_get_pte(vm, pte, vaddr, PG_LEVEL_4K); + pte = virt_get_pte(vm, mmu, pte, vaddr, PG_LEVEL_4K); TEST_ASSERT(!(*pte & PTE_PRESENT_MASK), "PTE already present for 4k page at vaddr: 0x%lx", vaddr); *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK); @@ -270,7 +277,7 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) { - __virt_pg_map(vm, vaddr, paddr, PG_LEVEL_4K); + __virt_pg_map(vm, &vm->mmu, vaddr, paddr, PG_LEVEL_4K); } void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, @@ -285,7 +292,7 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, nr_bytes, pg_size); for (i = 0; i < nr_pages; i++) { - __virt_pg_map(vm, vaddr, paddr, level); + __virt_pg_map(vm, &vm->mmu, vaddr, paddr, level); sparsebit_set_num(vm->vpages_mapped, vaddr >> vm->page_shift, nr_bytes / PAGE_SIZE); @@ -294,7 +301,8 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, } } -static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level) +static bool vm_is_target_pte(struct kvm_mmu *mmu, uint64_t *pte, + int *level, int current_level) { if (*pte & PTE_LARGE_MASK) { TEST_ASSERT(*level == PG_LEVEL_NONE || @@ -306,17 +314,19 @@ static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level) return *level == current_level; } -static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, +static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, + struct kvm_mmu *mmu, + uint64_t vaddr, int *level) { - int va_width = 12 + (vm->mmu.pgtable_levels) * 9; - uint64_t *pte = &vm->mmu.pgd; + int va_width = 12 + (mmu->pgtable_levels) * 9; + uint64_t *pte = &mmu->pgd; int current_level; TEST_ASSERT(!vm->arch.is_pt_protected, "Walking page tables of protected guests is impossible"); - TEST_ASSERT(*level >= PG_LEVEL_NONE && *level <= vm->mmu.pgtable_levels, + TEST_ASSERT(*level >= PG_LEVEL_NONE && *level <= mmu->pgtable_levels, "Invalid PG_LEVEL_* '%d'", *level); TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, @@ -332,22 +342,22 @@ static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, (((int64_t)vaddr << (64 - va_width) >> (64 - va_width))), "Canonical check failed. The virtual address is invalid."); - for (current_level = vm->mmu.pgtable_levels; + for (current_level = mmu->pgtable_levels; current_level > PG_LEVEL_4K; current_level--) { - pte = virt_get_pte(vm, pte, vaddr, current_level); - if (vm_is_target_pte(pte, level, current_level)) + pte = virt_get_pte(vm, mmu, pte, vaddr, current_level); + if (vm_is_target_pte(mmu, pte, level, current_level)) return pte; } - return virt_get_pte(vm, pte, vaddr, PG_LEVEL_4K); + return virt_get_pte(vm, mmu, pte, vaddr, PG_LEVEL_4K); } uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr) { int level = PG_LEVEL_4K; - return __vm_get_page_table_entry(vm, vaddr, &level); + return __vm_get_page_table_entry(vm, &vm->mmu, vaddr, &level); } void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) @@ -497,7 +507,7 @@ static void kvm_seg_set_kernel_data_64bit(struct kvm_segment *segp) vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) { int level = PG_LEVEL_NONE; - uint64_t *pte = __vm_get_page_table_entry(vm, gva, &level); + uint64_t *pte = __vm_get_page_table_entry(vm, &vm->mmu, gva, &level); TEST_ASSERT(*pte & PTE_PRESENT_MASK, "Leaf PTE not PRESENT for gva: 0x%08lx", gva); -- cgit v1.2.3 From 3d0e7595e81017558189d2252ae9453c57ab3436 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 15:01:37 -0800 Subject: KVM: selftests: Add a "struct kvm_mmu_arch arch" member to kvm_mmu Add an arch structure+field in "struct kvm_mmu" so that architectures can track arch-specific information for a given MMU. No functional change intended. Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-9-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/arm64/kvm_util_arch.h | 2 ++ tools/testing/selftests/kvm/include/kvm_util.h | 2 ++ tools/testing/selftests/kvm/include/loongarch/kvm_util_arch.h | 1 + tools/testing/selftests/kvm/include/riscv/kvm_util_arch.h | 1 + tools/testing/selftests/kvm/include/s390/kvm_util_arch.h | 1 + tools/testing/selftests/kvm/include/x86/kvm_util_arch.h | 2 ++ 6 files changed, 9 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/include/arm64/kvm_util_arch.h b/tools/testing/selftests/kvm/include/arm64/kvm_util_arch.h index b973bb2c64a6..4a2033708227 100644 --- a/tools/testing/selftests/kvm/include/arm64/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/arm64/kvm_util_arch.h @@ -2,6 +2,8 @@ #ifndef SELFTEST_KVM_UTIL_ARCH_H #define SELFTEST_KVM_UTIL_ARCH_H +struct kvm_mmu_arch {}; + struct kvm_vm_arch { bool has_gic; int gic_fd; diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 39558c05c0bf..c1497515fa6a 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -92,6 +92,8 @@ struct kvm_mmu { bool pgd_created; uint64_t pgd; int pgtable_levels; + + struct kvm_mmu_arch arch; }; struct kvm_vm { diff --git a/tools/testing/selftests/kvm/include/loongarch/kvm_util_arch.h b/tools/testing/selftests/kvm/include/loongarch/kvm_util_arch.h index e43a57d99b56..d5095900e442 100644 --- a/tools/testing/selftests/kvm/include/loongarch/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/loongarch/kvm_util_arch.h @@ -2,6 +2,7 @@ #ifndef SELFTEST_KVM_UTIL_ARCH_H #define SELFTEST_KVM_UTIL_ARCH_H +struct kvm_mmu_arch {}; struct kvm_vm_arch {}; #endif // SELFTEST_KVM_UTIL_ARCH_H diff --git a/tools/testing/selftests/kvm/include/riscv/kvm_util_arch.h b/tools/testing/selftests/kvm/include/riscv/kvm_util_arch.h index e43a57d99b56..d5095900e442 100644 --- a/tools/testing/selftests/kvm/include/riscv/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/riscv/kvm_util_arch.h @@ -2,6 +2,7 @@ #ifndef SELFTEST_KVM_UTIL_ARCH_H #define SELFTEST_KVM_UTIL_ARCH_H +struct kvm_mmu_arch {}; struct kvm_vm_arch {}; #endif // SELFTEST_KVM_UTIL_ARCH_H diff --git a/tools/testing/selftests/kvm/include/s390/kvm_util_arch.h b/tools/testing/selftests/kvm/include/s390/kvm_util_arch.h index e43a57d99b56..d5095900e442 100644 --- a/tools/testing/selftests/kvm/include/s390/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/s390/kvm_util_arch.h @@ -2,6 +2,7 @@ #ifndef SELFTEST_KVM_UTIL_ARCH_H #define SELFTEST_KVM_UTIL_ARCH_H +struct kvm_mmu_arch {}; struct kvm_vm_arch {}; #endif // SELFTEST_KVM_UTIL_ARCH_H diff --git a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h index 972bb1c4ab4c..456e5ca170df 100644 --- a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h @@ -10,6 +10,8 @@ extern bool is_forced_emulation_enabled; +struct kvm_mmu_arch {}; + struct kvm_vm_arch { vm_vaddr_t gdt; vm_vaddr_t tss; -- cgit v1.2.3 From 6dd70757213fc046e5f86901e4baf11ed894da6b Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:38 -0800 Subject: KVM: selftests: Move PTE bitmasks to kvm_mmu Move the PTE bitmasks into kvm_mmu to parameterize them for virt mapping functions. Introduce helpers to read/write different PTE bits given a kvm_mmu. Drop the 'global' bit definition as it's currently unused, but leave the 'user' bit as it will be used in coming changes. Opportunisitcally rename 'large' to 'huge' as it's more consistent with the kernel naming. Leave PHYSICAL_PAGE_MASK alone, it's fixed in all page table formats and a lot of other macros depend on it. It's tempting to move all the other macros to be per-struct instead, but it would be too much noise for little benefit. Keep c_bit and s_bit in vm->arch as they used before the MMU is initialized, through __vmcreate() -> vm_userspace_mem_region_add() -> vm_mem_add() -> vm_arch_has_protected_memory(). No functional change intended. Signed-off-by: Yosry Ahmed [sean: rename accessors to is__pte()] Link: https://patch.msgid.link/20251230230150.4150236-10-seanjc@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/include/x86/kvm_util_arch.h | 16 ++++- .../testing/selftests/kvm/include/x86/processor.h | 28 ++++++--- tools/testing/selftests/kvm/lib/x86/processor.c | 71 +++++++++++++--------- 3 files changed, 76 insertions(+), 39 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h index 456e5ca170df..bad381d63b6a 100644 --- a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h @@ -10,7 +10,21 @@ extern bool is_forced_emulation_enabled; -struct kvm_mmu_arch {}; +struct pte_masks { + uint64_t present; + uint64_t writable; + uint64_t user; + uint64_t accessed; + uint64_t dirty; + uint64_t huge; + uint64_t nx; + uint64_t c; + uint64_t s; +}; + +struct kvm_mmu_arch { + struct pte_masks pte_masks; +}; struct kvm_vm_arch { vm_vaddr_t gdt; diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 43970a96baed..55dac84cd4a7 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -362,16 +362,6 @@ static inline unsigned int x86_model(unsigned int eax) return ((eax >> 12) & 0xf0) | ((eax >> 4) & 0x0f); } -/* Page table bitfield declarations */ -#define PTE_PRESENT_MASK BIT_ULL(0) -#define PTE_WRITABLE_MASK BIT_ULL(1) -#define PTE_USER_MASK BIT_ULL(2) -#define PTE_ACCESSED_MASK BIT_ULL(5) -#define PTE_DIRTY_MASK BIT_ULL(6) -#define PTE_LARGE_MASK BIT_ULL(7) -#define PTE_GLOBAL_MASK BIT_ULL(8) -#define PTE_NX_MASK BIT_ULL(63) - #define PHYSICAL_PAGE_MASK GENMASK_ULL(51, 12) #define PAGE_SHIFT 12 @@ -1451,6 +1441,24 @@ enum pg_level { #define PG_SIZE_2M PG_LEVEL_SIZE(PG_LEVEL_2M) #define PG_SIZE_1G PG_LEVEL_SIZE(PG_LEVEL_1G) +#define PTE_PRESENT_MASK(mmu) ((mmu)->arch.pte_masks.present) +#define PTE_WRITABLE_MASK(mmu) ((mmu)->arch.pte_masks.writable) +#define PTE_USER_MASK(mmu) ((mmu)->arch.pte_masks.user) +#define PTE_ACCESSED_MASK(mmu) ((mmu)->arch.pte_masks.accessed) +#define PTE_DIRTY_MASK(mmu) ((mmu)->arch.pte_masks.dirty) +#define PTE_HUGE_MASK(mmu) ((mmu)->arch.pte_masks.huge) +#define PTE_NX_MASK(mmu) ((mmu)->arch.pte_masks.nx) +#define PTE_C_BIT_MASK(mmu) ((mmu)->arch.pte_masks.c) +#define PTE_S_BIT_MASK(mmu) ((mmu)->arch.pte_masks.s) + +#define is_present_pte(mmu, pte) (!!(*(pte) & PTE_PRESENT_MASK(mmu))) +#define is_writable_pte(mmu, pte) (!!(*(pte) & PTE_WRITABLE_MASK(mmu))) +#define is_user_pte(mmu, pte) (!!(*(pte) & PTE_USER_MASK(mmu))) +#define is_accessed_pte(mmu, pte) (!!(*(pte) & PTE_ACCESSED_MASK(mmu))) +#define is_dirty_pte(mmu, pte) (!!(*(pte) & PTE_DIRTY_MASK(mmu))) +#define is_huge_pte(mmu, pte) (!!(*(pte) & PTE_HUGE_MASK(mmu))) +#define is_nx_pte(mmu, pte) (!!(*(pte) & PTE_NX_MASK(mmu))) + void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, uint64_t paddr, int level); void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index f25742a804b0..3800f4ff6770 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -156,12 +156,14 @@ bool kvm_is_tdp_enabled(void) return get_kvm_amd_param_bool("npt"); } -static void virt_mmu_init(struct kvm_vm *vm, struct kvm_mmu *mmu) +static void virt_mmu_init(struct kvm_vm *vm, struct kvm_mmu *mmu, + struct pte_masks *pte_masks) { /* If needed, create the top-level page table. */ if (!mmu->pgd_created) { mmu->pgd = vm_alloc_page_table(vm); mmu->pgd_created = true; + mmu->arch.pte_masks = *pte_masks; } } @@ -170,7 +172,19 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, "Unknown or unsupported guest mode: 0x%x", vm->mode); - virt_mmu_init(vm, &vm->mmu); + struct pte_masks pte_masks = (struct pte_masks){ + .present = BIT_ULL(0), + .writable = BIT_ULL(1), + .user = BIT_ULL(2), + .accessed = BIT_ULL(5), + .dirty = BIT_ULL(6), + .huge = BIT_ULL(7), + .nx = BIT_ULL(63), + .c = vm->arch.c_bit, + .s = vm->arch.s_bit, + }; + + virt_mmu_init(vm, &vm->mmu, &pte_masks); } static void *virt_get_pte(struct kvm_vm *vm, struct kvm_mmu *mmu, @@ -180,7 +194,7 @@ static void *virt_get_pte(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t *page_table = addr_gpa2hva(vm, pt_gpa); int index = (vaddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu; - TEST_ASSERT((*parent_pte == mmu->pgd) || (*parent_pte & PTE_PRESENT_MASK), + TEST_ASSERT((*parent_pte == mmu->pgd) || is_present_pte(mmu, parent_pte), "Parent PTE (level %d) not PRESENT for gva: 0x%08lx", level + 1, vaddr); @@ -199,10 +213,10 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, paddr = vm_untag_gpa(vm, paddr); - if (!(*pte & PTE_PRESENT_MASK)) { - *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK; + if (!is_present_pte(mmu, pte)) { + *pte = PTE_PRESENT_MASK(mmu) | PTE_WRITABLE_MASK(mmu); if (current_level == target_level) - *pte |= PTE_LARGE_MASK | (paddr & PHYSICAL_PAGE_MASK); + *pte |= PTE_HUGE_MASK(mmu) | (paddr & PHYSICAL_PAGE_MASK); else *pte |= vm_alloc_page_table(vm) & PHYSICAL_PAGE_MASK; } else { @@ -214,7 +228,7 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, TEST_ASSERT(current_level != target_level, "Cannot create hugepage at level: %u, vaddr: 0x%lx", current_level, vaddr); - TEST_ASSERT(!(*pte & PTE_LARGE_MASK), + TEST_ASSERT(!is_huge_pte(mmu, pte), "Cannot create page table at level: %u, vaddr: 0x%lx", current_level, vaddr); } @@ -255,24 +269,24 @@ void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, current_level--) { pte = virt_create_upper_pte(vm, mmu, pte, vaddr, paddr, current_level, level); - if (*pte & PTE_LARGE_MASK) + if (is_huge_pte(mmu, pte)) return; } /* Fill in page table entry. */ pte = virt_get_pte(vm, mmu, pte, vaddr, PG_LEVEL_4K); - TEST_ASSERT(!(*pte & PTE_PRESENT_MASK), + TEST_ASSERT(!is_present_pte(mmu, pte), "PTE already present for 4k page at vaddr: 0x%lx", vaddr); - *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK); + *pte = PTE_PRESENT_MASK(mmu) | PTE_WRITABLE_MASK(mmu) | (paddr & PHYSICAL_PAGE_MASK); /* * Neither SEV nor TDX supports shared page tables, so only the final * leaf PTE needs manually set the C/S-bit. */ if (vm_is_gpa_protected(vm, paddr)) - *pte |= vm->arch.c_bit; + *pte |= PTE_C_BIT_MASK(mmu); else - *pte |= vm->arch.s_bit; + *pte |= PTE_S_BIT_MASK(mmu); } void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) @@ -304,7 +318,7 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, static bool vm_is_target_pte(struct kvm_mmu *mmu, uint64_t *pte, int *level, int current_level) { - if (*pte & PTE_LARGE_MASK) { + if (is_huge_pte(mmu, pte)) { TEST_ASSERT(*level == PG_LEVEL_NONE || *level == current_level, "Unexpected hugepage at level %d", current_level); @@ -362,12 +376,13 @@ uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr) void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { + struct kvm_mmu *mmu = &vm->mmu; uint64_t *pml4e, *pml4e_start; uint64_t *pdpe, *pdpe_start; uint64_t *pde, *pde_start; uint64_t *pte, *pte_start; - if (!vm->mmu.pgd_created) + if (!mmu->pgd_created) return; fprintf(stream, "%*s " @@ -375,47 +390,47 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) fprintf(stream, "%*s index hvaddr gpaddr " "addr w exec dirty\n", indent, ""); - pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->mmu.pgd); + pml4e_start = (uint64_t *) addr_gpa2hva(vm, mmu->pgd); for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) { pml4e = &pml4e_start[n1]; - if (!(*pml4e & PTE_PRESENT_MASK)) + if (!is_present_pte(mmu, pml4e)) continue; fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10llx %u " " %u\n", indent, "", pml4e - pml4e_start, pml4e, addr_hva2gpa(vm, pml4e), PTE_GET_PFN(*pml4e), - !!(*pml4e & PTE_WRITABLE_MASK), !!(*pml4e & PTE_NX_MASK)); + is_writable_pte(mmu, pml4e), is_nx_pte(mmu, pml4e)); pdpe_start = addr_gpa2hva(vm, *pml4e & PHYSICAL_PAGE_MASK); for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) { pdpe = &pdpe_start[n2]; - if (!(*pdpe & PTE_PRESENT_MASK)) + if (!is_present_pte(mmu, pdpe)) continue; fprintf(stream, "%*spdpe 0x%-3zx %p 0x%-12lx 0x%-10llx " "%u %u\n", indent, "", pdpe - pdpe_start, pdpe, addr_hva2gpa(vm, pdpe), - PTE_GET_PFN(*pdpe), !!(*pdpe & PTE_WRITABLE_MASK), - !!(*pdpe & PTE_NX_MASK)); + PTE_GET_PFN(*pdpe), is_writable_pte(mmu, pdpe), + is_nx_pte(mmu, pdpe)); pde_start = addr_gpa2hva(vm, *pdpe & PHYSICAL_PAGE_MASK); for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) { pde = &pde_start[n3]; - if (!(*pde & PTE_PRESENT_MASK)) + if (!is_present_pte(mmu, pde)) continue; fprintf(stream, "%*spde 0x%-3zx %p " "0x%-12lx 0x%-10llx %u %u\n", indent, "", pde - pde_start, pde, addr_hva2gpa(vm, pde), - PTE_GET_PFN(*pde), !!(*pde & PTE_WRITABLE_MASK), - !!(*pde & PTE_NX_MASK)); + PTE_GET_PFN(*pde), is_writable_pte(mmu, pde), + is_nx_pte(mmu, pde)); pte_start = addr_gpa2hva(vm, *pde & PHYSICAL_PAGE_MASK); for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) { pte = &pte_start[n4]; - if (!(*pte & PTE_PRESENT_MASK)) + if (!is_present_pte(mmu, pte)) continue; fprintf(stream, "%*spte 0x%-3zx %p " "0x%-12lx 0x%-10llx %u %u " @@ -424,9 +439,9 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) pte - pte_start, pte, addr_hva2gpa(vm, pte), PTE_GET_PFN(*pte), - !!(*pte & PTE_WRITABLE_MASK), - !!(*pte & PTE_NX_MASK), - !!(*pte & PTE_DIRTY_MASK), + is_writable_pte(mmu, pte), + is_nx_pte(mmu, pte), + is_dirty_pte(mmu, pte), ((uint64_t) n1 << 27) | ((uint64_t) n2 << 18) | ((uint64_t) n3 << 9) @@ -509,7 +524,7 @@ vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) int level = PG_LEVEL_NONE; uint64_t *pte = __vm_get_page_table_entry(vm, &vm->mmu, gva, &level); - TEST_ASSERT(*pte & PTE_PRESENT_MASK, + TEST_ASSERT(is_present_pte(&vm->mmu, pte), "Leaf PTE not PRESENT for gva: 0x%08lx", gva); /* -- cgit v1.2.3 From f00f519cebcd4280c4ce4fab8133ed2dcfa8f95a Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:39 -0800 Subject: KVM: selftests: Use a TDP MMU to share EPT page tables between vCPUs prepare_eptp() currently allocates new EPTs for each vCPU. memstress has its own hack to share the EPTs between vCPUs. Currently, there is no reason to have separate EPTs for each vCPU, and the complexity is significant. The only reason it doesn't matter now is because memstress is the only user with multiple vCPUs. Add vm_enable_ept() to allocate EPT page tables for an entire VM, and use it everywhere to replace prepare_eptp(). Drop 'eptp' and 'eptp_hva' from 'struct vmx_pages' as they serve no purpose (e.g. the EPTP can be built from the PGD), but keep 'eptp_gpa' so that the MMU structure doesn't need to be passed in along with vmx_pages. Dynamically allocate the TDP MMU structure to avoid a cyclical dependency between kvm_util_arch.h and kvm_util.h. Remove the workaround in memstress to copy the EPT root between vCPUs since that's now the default behavior. Name the MMU tdp_mmu instead of e.g. nested_mmu or nested.mmu to avoid recreating the same mess that KVM has with respect to "nested" MMUs, e.g. does nested refer to the stage-2 page tables created by L1, or the stage-1 page tables created by L2? Signed-off-by: Yosry Ahmed Co-developed-by: Sean Christopherson Link: https://patch.msgid.link/20251230230150.4150236-11-seanjc@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/include/x86/kvm_util_arch.h | 4 +++ .../testing/selftests/kvm/include/x86/processor.h | 3 +++ tools/testing/selftests/kvm/include/x86/vmx.h | 8 +++--- tools/testing/selftests/kvm/lib/x86/memstress.c | 19 +++++--------- tools/testing/selftests/kvm/lib/x86/processor.c | 9 +++++++ tools/testing/selftests/kvm/lib/x86/vmx.c | 30 ++++++++++++++-------- .../testing/selftests/kvm/x86/vmx_dirty_log_test.c | 7 +++-- 7 files changed, 48 insertions(+), 32 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h index bad381d63b6a..05a1fc1780f2 100644 --- a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h @@ -26,6 +26,8 @@ struct kvm_mmu_arch { struct pte_masks pte_masks; }; +struct kvm_mmu; + struct kvm_vm_arch { vm_vaddr_t gdt; vm_vaddr_t tss; @@ -35,6 +37,8 @@ struct kvm_vm_arch { uint64_t s_bit; int sev_fd; bool is_pt_protected; + + struct kvm_mmu *tdp_mmu; }; static inline bool __vm_arch_has_protected_memory(struct kvm_vm_arch *arch) diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 55dac84cd4a7..0164ef090787 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1459,6 +1459,9 @@ enum pg_level { #define is_huge_pte(mmu, pte) (!!(*(pte) & PTE_HUGE_MASK(mmu))) #define is_nx_pte(mmu, pte) (!!(*(pte) & PTE_NX_MASK(mmu))) +void tdp_mmu_init(struct kvm_vm *vm, int pgtable_levels, + struct pte_masks *pte_masks); + void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, uint64_t paddr, int level); void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, diff --git a/tools/testing/selftests/kvm/include/x86/vmx.h b/tools/testing/selftests/kvm/include/x86/vmx.h index 04b8231d032a..1fd83c23529a 100644 --- a/tools/testing/selftests/kvm/include/x86/vmx.h +++ b/tools/testing/selftests/kvm/include/x86/vmx.h @@ -520,13 +520,11 @@ struct vmx_pages { uint64_t vmwrite_gpa; void *vmwrite; - void *eptp_hva; - uint64_t eptp_gpa; - void *eptp; - void *apic_access_hva; uint64_t apic_access_gpa; void *apic_access; + + uint64_t eptp_gpa; }; union vmx_basic { @@ -568,7 +566,7 @@ void tdp_identity_map_default_memslots(struct vmx_pages *vmx, void tdp_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t addr, uint64_t size); bool kvm_cpu_has_ept(void); -void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm); +void vm_enable_ept(struct kvm_vm *vm); void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm); #endif /* SELFTEST_KVM_VMX_H */ diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c index 1928b00bde51..00f7f11e5f0e 100644 --- a/tools/testing/selftests/kvm/lib/x86/memstress.c +++ b/tools/testing/selftests/kvm/lib/x86/memstress.c @@ -59,12 +59,10 @@ uint64_t memstress_nested_pages(int nr_vcpus) return 513 + 10 * nr_vcpus; } -void memstress_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm) +static void memstress_setup_ept_mappings(struct vmx_pages *vmx, struct kvm_vm *vm) { uint64_t start, end; - prepare_eptp(vmx, vm); - /* * Identity map the first 4G and the test region with 1G pages so that * KVM can shadow the EPT12 with the maximum huge page size supported @@ -79,7 +77,7 @@ void memstress_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm) void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]) { - struct vmx_pages *vmx, *vmx0 = NULL; + struct vmx_pages *vmx; struct kvm_regs regs; vm_vaddr_t vmx_gva; int vcpu_id; @@ -87,18 +85,13 @@ void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vc TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); TEST_REQUIRE(kvm_cpu_has_ept()); + vm_enable_ept(vm); for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { vmx = vcpu_alloc_vmx(vm, &vmx_gva); - if (vcpu_id == 0) { - memstress_setup_ept(vmx, vm); - vmx0 = vmx; - } else { - /* Share the same EPT table across all vCPUs. */ - vmx->eptp = vmx0->eptp; - vmx->eptp_hva = vmx0->eptp_hva; - vmx->eptp_gpa = vmx0->eptp_gpa; - } + /* The EPTs are shared across vCPUs, setup the mappings once */ + if (vcpu_id == 0) + memstress_setup_ept_mappings(vmx, vm); /* * Override the vCPU to run memstress_l1_guest_code() which will diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index 3800f4ff6770..8a9298a72897 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -187,6 +187,15 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) virt_mmu_init(vm, &vm->mmu, &pte_masks); } +void tdp_mmu_init(struct kvm_vm *vm, int pgtable_levels, + struct pte_masks *pte_masks) +{ + TEST_ASSERT(!vm->arch.tdp_mmu, "TDP MMU already initialized"); + + vm->arch.tdp_mmu = calloc(1, sizeof(*vm->arch.tdp_mmu)); + virt_mmu_init(vm, vm->arch.tdp_mmu, pte_masks); +} + static void *virt_get_pte(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t *parent_pte, uint64_t vaddr, int level) { diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index a3e2eae981da..9d4e391fdf2c 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -56,6 +56,21 @@ int vcpu_enable_evmcs(struct kvm_vcpu *vcpu) return evmcs_ver; } +void vm_enable_ept(struct kvm_vm *vm) +{ + TEST_ASSERT(kvm_cpu_has_ept(), "KVM doesn't support nested EPT"); + if (vm->arch.tdp_mmu) + return; + + /* TODO: Drop eptPageTableEntry in favor of PTE masks. */ + struct pte_masks pte_masks = (struct pte_masks) { + + }; + + /* TODO: Add support for 5-level EPT. */ + tdp_mmu_init(vm, 4, &pte_masks); +} + /* Allocate memory regions for nested VMX tests. * * Input Args: @@ -105,6 +120,9 @@ vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva) vmx->vmwrite_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmwrite); memset(vmx->vmwrite_hva, 0, getpagesize()); + if (vm->arch.tdp_mmu) + vmx->eptp_gpa = vm->arch.tdp_mmu->pgd; + *p_vmx_gva = vmx_gva; return vmx; } @@ -395,7 +413,8 @@ void __tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, int target_level) { const uint64_t page_size = PG_LEVEL_SIZE(target_level); - struct eptPageTableEntry *pt = vmx->eptp_hva, *pte; + void *eptp_hva = addr_gpa2hva(vm, vm->arch.tdp_mmu->pgd); + struct eptPageTableEntry *pt = eptp_hva, *pte; uint16_t index; TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, @@ -525,15 +544,6 @@ bool kvm_cpu_has_ept(void) return ctrl & SECONDARY_EXEC_ENABLE_EPT; } -void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm) -{ - TEST_ASSERT(kvm_cpu_has_ept(), "KVM doesn't support nested EPT"); - - vmx->eptp = (void *)vm_vaddr_alloc_page(vm); - vmx->eptp_hva = addr_gva2hva(vm, (uintptr_t)vmx->eptp); - vmx->eptp_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->eptp); -} - void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm) { vmx->apic_access = (void *)vm_vaddr_alloc_page(vm); diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c index e7d0c08ba29d..5c8cf8ac42a2 100644 --- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c @@ -93,6 +93,9 @@ static void test_vmx_dirty_log(bool enable_ept) /* Create VM */ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + if (enable_ept) + vm_enable_ept(vm); + vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); vcpu_args_set(vcpu, 1, vmx_pages_gva); @@ -113,14 +116,10 @@ static void test_vmx_dirty_log(bool enable_ept) * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to * 0xc0000000. * - * Note that prepare_eptp should be called only L1's GPA map is done, - * meaning after the last call to virt_map. - * * When EPT is disabled, the L2 guest code will still access the same L1 * GPAs as the EPT enabled case. */ if (enable_ept) { - prepare_eptp(vmx, vm); tdp_identity_map_default_memslots(vmx, vm); tdp_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); tdp_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); -- cgit v1.2.3 From e40e72fec0dea9ac55aea84a0d76ccb7d7f32204 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:40 -0800 Subject: KVM: selftests: Stop passing VMX metadata to TDP mapping functions The root GPA is now retrieved from the nested MMU, stop passing VMX metadata. This is in preparation for making these functions work for NPTs as well. Opportunistically drop tdp_pg_map() since it's unused. No functional change intended. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-12-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86/vmx.h | 11 ++------ tools/testing/selftests/kvm/lib/x86/memstress.c | 11 ++++---- tools/testing/selftests/kvm/lib/x86/vmx.c | 33 ++++++++-------------- .../testing/selftests/kvm/x86/vmx_dirty_log_test.c | 9 +++--- 4 files changed, 24 insertions(+), 40 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/include/x86/vmx.h b/tools/testing/selftests/kvm/include/x86/vmx.h index 1fd83c23529a..4dd4c2094ee6 100644 --- a/tools/testing/selftests/kvm/include/x86/vmx.h +++ b/tools/testing/selftests/kvm/include/x86/vmx.h @@ -557,14 +557,9 @@ bool load_vmcs(struct vmx_pages *vmx); bool ept_1g_pages_supported(void); -void tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t nested_paddr, - uint64_t paddr); -void tdp_map(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t nested_paddr, - uint64_t paddr, uint64_t size); -void tdp_identity_map_default_memslots(struct vmx_pages *vmx, - struct kvm_vm *vm); -void tdp_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t addr, uint64_t size); +void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size); +void tdp_identity_map_default_memslots(struct kvm_vm *vm); +void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size); bool kvm_cpu_has_ept(void); void vm_enable_ept(struct kvm_vm *vm); void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm); diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c index 00f7f11e5f0e..3319cb57a78d 100644 --- a/tools/testing/selftests/kvm/lib/x86/memstress.c +++ b/tools/testing/selftests/kvm/lib/x86/memstress.c @@ -59,7 +59,7 @@ uint64_t memstress_nested_pages(int nr_vcpus) return 513 + 10 * nr_vcpus; } -static void memstress_setup_ept_mappings(struct vmx_pages *vmx, struct kvm_vm *vm) +static void memstress_setup_ept_mappings(struct kvm_vm *vm) { uint64_t start, end; @@ -68,16 +68,15 @@ static void memstress_setup_ept_mappings(struct vmx_pages *vmx, struct kvm_vm *v * KVM can shadow the EPT12 with the maximum huge page size supported * by the backing source. */ - tdp_identity_map_1g(vmx, vm, 0, 0x100000000ULL); + tdp_identity_map_1g(vm, 0, 0x100000000ULL); start = align_down(memstress_args.gpa, PG_SIZE_1G); end = align_up(memstress_args.gpa + memstress_args.size, PG_SIZE_1G); - tdp_identity_map_1g(vmx, vm, start, end - start); + tdp_identity_map_1g(vm, start, end - start); } void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]) { - struct vmx_pages *vmx; struct kvm_regs regs; vm_vaddr_t vmx_gva; int vcpu_id; @@ -87,11 +86,11 @@ void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vc vm_enable_ept(vm); for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { - vmx = vcpu_alloc_vmx(vm, &vmx_gva); + vcpu_alloc_vmx(vm, &vmx_gva); /* The EPTs are shared across vCPUs, setup the mappings once */ if (vcpu_id == 0) - memstress_setup_ept_mappings(vmx, vm); + memstress_setup_ept_mappings(vm); /* * Override the vCPU to run memstress_l1_guest_code() which will diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index 9d4e391fdf2c..ea1c09f9e8ab 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -409,8 +409,8 @@ static void tdp_create_pte(struct kvm_vm *vm, } -void __tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, int target_level) +void __tdp_pg_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, + int target_level) { const uint64_t page_size = PG_LEVEL_SIZE(target_level); void *eptp_hva = addr_gpa2hva(vm, vm->arch.tdp_mmu->pgd); @@ -453,12 +453,6 @@ void __tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, } } -void tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr) -{ - __tdp_pg_map(vmx, vm, nested_paddr, paddr, PG_LEVEL_4K); -} - /* * Map a range of EPT guest physical addresses to the VM's physical address * @@ -476,9 +470,8 @@ void tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, * Within the VM given by vm, creates a nested guest translation for the * page range starting at nested_paddr to the page range starting at paddr. */ -void __tdp_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint64_t size, - int level) +void __tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, + uint64_t size, int level) { size_t page_size = PG_LEVEL_SIZE(level); size_t npages = size / page_size; @@ -487,23 +480,22 @@ void __tdp_map(struct vmx_pages *vmx, struct kvm_vm *vm, TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); while (npages--) { - __tdp_pg_map(vmx, vm, nested_paddr, paddr, level); + __tdp_pg_map(vm, nested_paddr, paddr, level); nested_paddr += page_size; paddr += page_size; } } -void tdp_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint64_t size) +void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, + uint64_t size) { - __tdp_map(vmx, vm, nested_paddr, paddr, size, PG_LEVEL_4K); + __tdp_map(vm, nested_paddr, paddr, size, PG_LEVEL_4K); } /* Prepare an identity extended page table that maps all the * physical pages in VM. */ -void tdp_identity_map_default_memslots(struct vmx_pages *vmx, - struct kvm_vm *vm) +void tdp_identity_map_default_memslots(struct kvm_vm *vm) { uint32_t s, memslot = 0; sparsebit_idx_t i, last; @@ -520,16 +512,15 @@ void tdp_identity_map_default_memslots(struct vmx_pages *vmx, if (i > last) break; - tdp_map(vmx, vm, (uint64_t)i << vm->page_shift, + tdp_map(vm, (uint64_t)i << vm->page_shift, (uint64_t)i << vm->page_shift, 1 << vm->page_shift); } } /* Identity map a region with 1GiB Pages. */ -void tdp_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t addr, uint64_t size) +void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size) { - __tdp_map(vmx, vm, addr, addr, size, PG_LEVEL_1G); + __tdp_map(vm, addr, addr, size, PG_LEVEL_1G); } bool kvm_cpu_has_ept(void) diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c index 5c8cf8ac42a2..370f8d3117c2 100644 --- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c @@ -80,7 +80,6 @@ void l1_guest_code(struct vmx_pages *vmx) static void test_vmx_dirty_log(bool enable_ept) { vm_vaddr_t vmx_pages_gva = 0; - struct vmx_pages *vmx; unsigned long *bmap; uint64_t *host_test_mem; @@ -96,7 +95,7 @@ static void test_vmx_dirty_log(bool enable_ept) if (enable_ept) vm_enable_ept(vm); - vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_alloc_vmx(vm, &vmx_pages_gva); vcpu_args_set(vcpu, 1, vmx_pages_gva); /* Add an extra memory slot for testing dirty logging */ @@ -120,9 +119,9 @@ static void test_vmx_dirty_log(bool enable_ept) * GPAs as the EPT enabled case. */ if (enable_ept) { - tdp_identity_map_default_memslots(vmx, vm); - tdp_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); - tdp_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); + tdp_identity_map_default_memslots(vm); + tdp_map(vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); + tdp_map(vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); } bmap = bitmap_zalloc(TEST_MEM_PAGES); -- cgit v1.2.3 From 8296b16c0a2ba018c3235db5325d679e603899d6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 15:01:41 -0800 Subject: KVM: selftests: Add a stage-2 MMU instance to kvm_vm Add a stage-2 MMU instance so that architectures that support nested virtualization (more specifically, nested stage-2 page tables) can create and track stage-2 page tables for running L2 guests. Plumb the structure into common code to avoid cyclical dependencies, and to provide some line of sight to having common APIs for creating stage-2 mappings. As a bonus, putting the member in common code justifies using stage2_mmu instead of tdp_mmu for x86. Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-13-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/kvm_util.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index c1497515fa6a..371d55e0366e 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -116,7 +116,12 @@ struct kvm_vm { uint32_t dirty_ring_size; uint64_t gpa_tag_mask; + /* + * "mmu" is the guest's stage-1, with a short name because the vast + * majority of tests only care about the stage-1 MMU. + */ struct kvm_mmu mmu; + struct kvm_mmu stage2_mmu; struct kvm_vm_arch arch; -- cgit v1.2.3 From 508d1cc3ca0ac428b1d5d614519bc497868c2e9f Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:42 -0800 Subject: KVM: selftests: Reuse virt mapping functions for nested EPTs Rework tdp_map() and friends to use __virt_pg_map() and drop the custom EPT code in __tdp_pg_map() and tdp_create_pte(). The EPT code and __virt_pg_map() are practically identical, the main differences are: - EPT uses the EPT struct overlay instead of the PTE masks. - EPT always assumes 4-level EPTs. To reuse __virt_pg_map(), extend the PTE masks to work with EPT's RWX and X-only capabilities, and provide a tdp_mmu_init() API so that EPT can pass in the EPT PTE masks along with the root page level (which is currently hardcoded to '4'). Don't reuse KVM's insane overloading of the USER bit for EPT_R as there's no reason to multiplex bits in the selftests, e.g. selftests aren't trying to shadow guest PTEs and thus don't care about funnelling protections into a common permissions check. Another benefit of reusing the code is having separate handling for upper-level PTEs vs 4K PTEs, which avoids some quirks like setting the large bit on a 4K PTE in the EPTs. For all intents and purposes, no functional change intended. Suggested-by: Sean Christopherson Signed-off-by: Yosry Ahmed Co-developed-by: Sean Christopherson Link: https://patch.msgid.link/20251230230150.4150236-14-seanjc@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/include/x86/kvm_util_arch.h | 4 +- .../testing/selftests/kvm/include/x86/processor.h | 16 ++- tools/testing/selftests/kvm/lib/x86/processor.c | 21 +++- tools/testing/selftests/kvm/lib/x86/vmx.c | 119 ++++----------------- 4 files changed, 52 insertions(+), 108 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h index 05a1fc1780f2..1cf84b8212c6 100644 --- a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h @@ -14,6 +14,8 @@ struct pte_masks { uint64_t present; uint64_t writable; uint64_t user; + uint64_t readable; + uint64_t executable; uint64_t accessed; uint64_t dirty; uint64_t huge; @@ -37,8 +39,6 @@ struct kvm_vm_arch { uint64_t s_bit; int sev_fd; bool is_pt_protected; - - struct kvm_mmu *tdp_mmu; }; static inline bool __vm_arch_has_protected_memory(struct kvm_vm_arch *arch) diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 0164ef090787..e17cbbe71b8f 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1444,6 +1444,8 @@ enum pg_level { #define PTE_PRESENT_MASK(mmu) ((mmu)->arch.pte_masks.present) #define PTE_WRITABLE_MASK(mmu) ((mmu)->arch.pte_masks.writable) #define PTE_USER_MASK(mmu) ((mmu)->arch.pte_masks.user) +#define PTE_READABLE_MASK(mmu) ((mmu)->arch.pte_masks.readable) +#define PTE_EXECUTABLE_MASK(mmu) ((mmu)->arch.pte_masks.executable) #define PTE_ACCESSED_MASK(mmu) ((mmu)->arch.pte_masks.accessed) #define PTE_DIRTY_MASK(mmu) ((mmu)->arch.pte_masks.dirty) #define PTE_HUGE_MASK(mmu) ((mmu)->arch.pte_masks.huge) @@ -1451,13 +1453,23 @@ enum pg_level { #define PTE_C_BIT_MASK(mmu) ((mmu)->arch.pte_masks.c) #define PTE_S_BIT_MASK(mmu) ((mmu)->arch.pte_masks.s) -#define is_present_pte(mmu, pte) (!!(*(pte) & PTE_PRESENT_MASK(mmu))) +/* + * For PTEs without a PRESENT bit (i.e. EPT entries), treat the PTE as present + * if it's executable or readable, as EPT supports execute-only PTEs, but not + * write-only PTEs. + */ +#define is_present_pte(mmu, pte) \ + (PTE_PRESENT_MASK(mmu) ? \ + !!(*(pte) & PTE_PRESENT_MASK(mmu)) : \ + !!(*(pte) & (PTE_READABLE_MASK(mmu) | PTE_EXECUTABLE_MASK(mmu)))) +#define is_executable_pte(mmu, pte) \ + ((*(pte) & (PTE_EXECUTABLE_MASK(mmu) | PTE_NX_MASK(mmu))) == PTE_EXECUTABLE_MASK(mmu)) #define is_writable_pte(mmu, pte) (!!(*(pte) & PTE_WRITABLE_MASK(mmu))) #define is_user_pte(mmu, pte) (!!(*(pte) & PTE_USER_MASK(mmu))) #define is_accessed_pte(mmu, pte) (!!(*(pte) & PTE_ACCESSED_MASK(mmu))) #define is_dirty_pte(mmu, pte) (!!(*(pte) & PTE_DIRTY_MASK(mmu))) #define is_huge_pte(mmu, pte) (!!(*(pte) & PTE_HUGE_MASK(mmu))) -#define is_nx_pte(mmu, pte) (!!(*(pte) & PTE_NX_MASK(mmu))) +#define is_nx_pte(mmu, pte) (!is_executable_pte(mmu, pte)) void tdp_mmu_init(struct kvm_vm *vm, int pgtable_levels, struct pte_masks *pte_masks); diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index 8a9298a72897..41316cac94e0 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -165,6 +165,10 @@ static void virt_mmu_init(struct kvm_vm *vm, struct kvm_mmu *mmu, mmu->pgd_created = true; mmu->arch.pte_masks = *pte_masks; } + + TEST_ASSERT(mmu->pgtable_levels == 4 || mmu->pgtable_levels == 5, + "Selftests MMU only supports 4-level and 5-level paging, not %u-level paging", + mmu->pgtable_levels); } void virt_arch_pgd_alloc(struct kvm_vm *vm) @@ -180,6 +184,7 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) .dirty = BIT_ULL(6), .huge = BIT_ULL(7), .nx = BIT_ULL(63), + .executable = 0, .c = vm->arch.c_bit, .s = vm->arch.s_bit, }; @@ -190,10 +195,10 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) void tdp_mmu_init(struct kvm_vm *vm, int pgtable_levels, struct pte_masks *pte_masks) { - TEST_ASSERT(!vm->arch.tdp_mmu, "TDP MMU already initialized"); + TEST_ASSERT(!vm->stage2_mmu.pgtable_levels, "TDP MMU already initialized"); - vm->arch.tdp_mmu = calloc(1, sizeof(*vm->arch.tdp_mmu)); - virt_mmu_init(vm, vm->arch.tdp_mmu, pte_masks); + vm->stage2_mmu.pgtable_levels = pgtable_levels; + virt_mmu_init(vm, &vm->stage2_mmu, pte_masks); } static void *virt_get_pte(struct kvm_vm *vm, struct kvm_mmu *mmu, @@ -223,7 +228,8 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, paddr = vm_untag_gpa(vm, paddr); if (!is_present_pte(mmu, pte)) { - *pte = PTE_PRESENT_MASK(mmu) | PTE_WRITABLE_MASK(mmu); + *pte = PTE_PRESENT_MASK(mmu) | PTE_READABLE_MASK(mmu) | + PTE_WRITABLE_MASK(mmu) | PTE_EXECUTABLE_MASK(mmu); if (current_level == target_level) *pte |= PTE_HUGE_MASK(mmu) | (paddr & PHYSICAL_PAGE_MASK); else @@ -269,6 +275,9 @@ void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, TEST_ASSERT(vm_untag_gpa(vm, paddr) == paddr, "Unexpected bits in paddr: %lx", paddr); + TEST_ASSERT(!PTE_EXECUTABLE_MASK(mmu) || !PTE_NX_MASK(mmu), + "X and NX bit masks cannot be used simultaneously"); + /* * Allocate upper level page tables, if not already present. Return * early if a hugepage was created. @@ -286,7 +295,9 @@ void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, pte = virt_get_pte(vm, mmu, pte, vaddr, PG_LEVEL_4K); TEST_ASSERT(!is_present_pte(mmu, pte), "PTE already present for 4k page at vaddr: 0x%lx", vaddr); - *pte = PTE_PRESENT_MASK(mmu) | PTE_WRITABLE_MASK(mmu) | (paddr & PHYSICAL_PAGE_MASK); + *pte = PTE_PRESENT_MASK(mmu) | PTE_READABLE_MASK(mmu) | + PTE_WRITABLE_MASK(mmu) | PTE_EXECUTABLE_MASK(mmu) | + (paddr & PHYSICAL_PAGE_MASK); /* * Neither SEV nor TDX supports shared page tables, so only the final diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index ea1c09f9e8ab..e3737b3d9120 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -25,21 +25,6 @@ bool enable_evmcs; struct hv_enlightened_vmcs *current_evmcs; struct hv_vp_assist_page *current_vp_assist; -struct eptPageTableEntry { - uint64_t readable:1; - uint64_t writable:1; - uint64_t executable:1; - uint64_t memory_type:3; - uint64_t ignore_pat:1; - uint64_t page_size:1; - uint64_t accessed:1; - uint64_t dirty:1; - uint64_t ignored_11_10:2; - uint64_t address:40; - uint64_t ignored_62_52:11; - uint64_t suppress_ve:1; -}; - int vcpu_enable_evmcs(struct kvm_vcpu *vcpu) { uint16_t evmcs_ver; @@ -58,13 +43,24 @@ int vcpu_enable_evmcs(struct kvm_vcpu *vcpu) void vm_enable_ept(struct kvm_vm *vm) { - TEST_ASSERT(kvm_cpu_has_ept(), "KVM doesn't support nested EPT"); - if (vm->arch.tdp_mmu) - return; + struct pte_masks pte_masks; - /* TODO: Drop eptPageTableEntry in favor of PTE masks. */ - struct pte_masks pte_masks = (struct pte_masks) { + TEST_ASSERT(kvm_cpu_has_ept(), "KVM doesn't support nested EPT"); + /* + * EPTs do not have 'present' or 'user' bits, instead bit 0 is the + * 'readable' bit. + */ + pte_masks = (struct pte_masks) { + .present = 0, + .user = 0, + .readable = BIT_ULL(0), + .writable = BIT_ULL(1), + .executable = BIT_ULL(2), + .huge = BIT_ULL(7), + .accessed = BIT_ULL(8), + .dirty = BIT_ULL(9), + .nx = 0, }; /* TODO: Add support for 5-level EPT. */ @@ -120,8 +116,8 @@ vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva) vmx->vmwrite_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmwrite); memset(vmx->vmwrite_hva, 0, getpagesize()); - if (vm->arch.tdp_mmu) - vmx->eptp_gpa = vm->arch.tdp_mmu->pgd; + if (vm->stage2_mmu.pgd_created) + vmx->eptp_gpa = vm->stage2_mmu.pgd; *p_vmx_gva = vmx_gva; return vmx; @@ -377,82 +373,6 @@ void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp) init_vmcs_guest_state(guest_rip, guest_rsp); } -static void tdp_create_pte(struct kvm_vm *vm, - struct eptPageTableEntry *pte, - uint64_t nested_paddr, - uint64_t paddr, - int current_level, - int target_level) -{ - if (!pte->readable) { - pte->writable = true; - pte->readable = true; - pte->executable = true; - pte->page_size = (current_level == target_level); - if (pte->page_size) - pte->address = paddr >> vm->page_shift; - else - pte->address = vm_alloc_page_table(vm) >> vm->page_shift; - } else { - /* - * Entry already present. Assert that the caller doesn't want - * a hugepage at this level, and that there isn't a hugepage at - * this level. - */ - TEST_ASSERT(current_level != target_level, - "Cannot create hugepage at level: %u, nested_paddr: 0x%lx", - current_level, nested_paddr); - TEST_ASSERT(!pte->page_size, - "Cannot create page table at level: %u, nested_paddr: 0x%lx", - current_level, nested_paddr); - } -} - - -void __tdp_pg_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, - int target_level) -{ - const uint64_t page_size = PG_LEVEL_SIZE(target_level); - void *eptp_hva = addr_gpa2hva(vm, vm->arch.tdp_mmu->pgd); - struct eptPageTableEntry *pt = eptp_hva, *pte; - uint16_t index; - - TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, - "Unknown or unsupported guest mode: 0x%x", vm->mode); - - TEST_ASSERT((nested_paddr >> 48) == 0, - "Nested physical address 0x%lx is > 48-bits and requires 5-level EPT", - nested_paddr); - TEST_ASSERT((nested_paddr % page_size) == 0, - "Nested physical address not on page boundary,\n" - " nested_paddr: 0x%lx page_size: 0x%lx", - nested_paddr, page_size); - TEST_ASSERT((nested_paddr >> vm->page_shift) <= vm->max_gfn, - "Physical address beyond beyond maximum supported,\n" - " nested_paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", - paddr, vm->max_gfn, vm->page_size); - TEST_ASSERT((paddr % page_size) == 0, - "Physical address not on page boundary,\n" - " paddr: 0x%lx page_size: 0x%lx", - paddr, page_size); - TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn, - "Physical address beyond beyond maximum supported,\n" - " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", - paddr, vm->max_gfn, vm->page_size); - - for (int level = PG_LEVEL_512G; level >= PG_LEVEL_4K; level--) { - index = (nested_paddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu; - pte = &pt[index]; - - tdp_create_pte(vm, pte, nested_paddr, paddr, level, target_level); - - if (pte->page_size) - break; - - pt = addr_gpa2hva(vm, pte->address * vm->page_size); - } -} - /* * Map a range of EPT guest physical addresses to the VM's physical address * @@ -473,6 +393,7 @@ void __tdp_pg_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, void __tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size, int level) { + struct kvm_mmu *mmu = &vm->stage2_mmu; size_t page_size = PG_LEVEL_SIZE(level); size_t npages = size / page_size; @@ -480,7 +401,7 @@ void __tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); while (npages--) { - __tdp_pg_map(vm, nested_paddr, paddr, level); + __virt_pg_map(vm, mmu, nested_paddr, paddr, level); nested_paddr += page_size; paddr += page_size; } -- cgit v1.2.3 From 07676c04bd753f32a9fe2f247b0ba2d213dd7a99 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 15:01:43 -0800 Subject: KVM: selftests: Move TDP mapping functions outside of vmx.c Now that the functions are no longer VMX-specific, move them to processor.c. Do a minor comment tweak replacing 'EPT' with 'TDP'. No functional change intended. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-15-seanjc@google.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/include/x86/processor.h | 4 ++ tools/testing/selftests/kvm/include/x86/vmx.h | 3 - tools/testing/selftests/kvm/lib/x86/processor.c | 53 ++++++++++++++++ tools/testing/selftests/kvm/lib/x86/vmx.c | 71 ---------------------- 4 files changed, 57 insertions(+), 74 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index e17cbbe71b8f..461cf155b96e 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1479,6 +1479,10 @@ void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, uint64_t nr_bytes, int level); +void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size); +void tdp_identity_map_default_memslots(struct kvm_vm *vm); +void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size); + /* * Basic CPU control in CR0 */ diff --git a/tools/testing/selftests/kvm/include/x86/vmx.h b/tools/testing/selftests/kvm/include/x86/vmx.h index 4dd4c2094ee6..92b918700d24 100644 --- a/tools/testing/selftests/kvm/include/x86/vmx.h +++ b/tools/testing/selftests/kvm/include/x86/vmx.h @@ -557,9 +557,6 @@ bool load_vmcs(struct vmx_pages *vmx); bool ept_1g_pages_supported(void); -void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size); -void tdp_identity_map_default_memslots(struct kvm_vm *vm); -void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size); bool kvm_cpu_has_ept(void); void vm_enable_ept(struct kvm_vm *vm); void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm); diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index 41316cac94e0..29e7d172f945 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -472,6 +472,59 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) } } +void __tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, + uint64_t size, int level) +{ + size_t page_size = PG_LEVEL_SIZE(level); + size_t npages = size / page_size; + + TEST_ASSERT(nested_paddr + size > nested_paddr, "Vaddr overflow"); + TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); + + while (npages--) { + __virt_pg_map(vm, &vm->stage2_mmu, nested_paddr, paddr, level); + nested_paddr += page_size; + paddr += page_size; + } +} + +void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, + uint64_t size) +{ + __tdp_map(vm, nested_paddr, paddr, size, PG_LEVEL_4K); +} + +/* Prepare an identity extended page table that maps all the + * physical pages in VM. + */ +void tdp_identity_map_default_memslots(struct kvm_vm *vm) +{ + uint32_t s, memslot = 0; + sparsebit_idx_t i, last; + struct userspace_mem_region *region = memslot2region(vm, memslot); + + /* Only memslot 0 is mapped here, ensure it's the only one being used */ + for (s = 0; s < NR_MEM_REGIONS; s++) + TEST_ASSERT_EQ(vm->memslots[s], 0); + + i = (region->region.guest_phys_addr >> vm->page_shift) - 1; + last = i + (region->region.memory_size >> vm->page_shift); + for (;;) { + i = sparsebit_next_clear(region->unused_phy_pages, i); + if (i > last) + break; + + tdp_map(vm, (uint64_t)i << vm->page_shift, + (uint64_t)i << vm->page_shift, 1 << vm->page_shift); + } +} + +/* Identity map a region with 1GiB Pages. */ +void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size) +{ + __tdp_map(vm, addr, addr, size, PG_LEVEL_1G); +} + /* * Set Unusable Segment * diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index e3737b3d9120..448a63457467 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -373,77 +373,6 @@ void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp) init_vmcs_guest_state(guest_rip, guest_rsp); } -/* - * Map a range of EPT guest physical addresses to the VM's physical address - * - * Input Args: - * vm - Virtual Machine - * nested_paddr - Nested guest physical address to map - * paddr - VM Physical Address - * size - The size of the range to map - * level - The level at which to map the range - * - * Output Args: None - * - * Return: None - * - * Within the VM given by vm, creates a nested guest translation for the - * page range starting at nested_paddr to the page range starting at paddr. - */ -void __tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, - uint64_t size, int level) -{ - struct kvm_mmu *mmu = &vm->stage2_mmu; - size_t page_size = PG_LEVEL_SIZE(level); - size_t npages = size / page_size; - - TEST_ASSERT(nested_paddr + size > nested_paddr, "Vaddr overflow"); - TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); - - while (npages--) { - __virt_pg_map(vm, mmu, nested_paddr, paddr, level); - nested_paddr += page_size; - paddr += page_size; - } -} - -void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, - uint64_t size) -{ - __tdp_map(vm, nested_paddr, paddr, size, PG_LEVEL_4K); -} - -/* Prepare an identity extended page table that maps all the - * physical pages in VM. - */ -void tdp_identity_map_default_memslots(struct kvm_vm *vm) -{ - uint32_t s, memslot = 0; - sparsebit_idx_t i, last; - struct userspace_mem_region *region = memslot2region(vm, memslot); - - /* Only memslot 0 is mapped here, ensure it's the only one being used */ - for (s = 0; s < NR_MEM_REGIONS; s++) - TEST_ASSERT_EQ(vm->memslots[s], 0); - - i = (region->region.guest_phys_addr >> vm->page_shift) - 1; - last = i + (region->region.memory_size >> vm->page_shift); - for (;;) { - i = sparsebit_next_clear(region->unused_phy_pages, i); - if (i > last) - break; - - tdp_map(vm, (uint64_t)i << vm->page_shift, - (uint64_t)i << vm->page_shift, 1 << vm->page_shift); - } -} - -/* Identity map a region with 1GiB Pages. */ -void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size) -{ - __tdp_map(vm, addr, addr, size, PG_LEVEL_1G); -} - bool kvm_cpu_has_ept(void) { uint64_t ctrl; -- cgit v1.2.3 From 9cb1944f6bf09ecebcc7609f35178b85aa26f165 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:44 -0800 Subject: KVM: selftests: Allow kvm_cpu_has_ept() to be called on AMD CPUs In preparation for generalizing the nested dirty logging test, checking if either EPT or NPT is enabled will be needed. To avoid needing to gate the kvm_cpu_has_ept() call by the CPU type, make sure the function returns false if VMX is not available instead of trying to read VMX-only MSRs. No functional change intended. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-16-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/x86/vmx.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index 448a63457467..c87b340362a9 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -377,6 +377,9 @@ bool kvm_cpu_has_ept(void) { uint64_t ctrl; + if (!kvm_cpu_has(X86_FEATURE_VMX)) + return false; + ctrl = kvm_get_feature_msr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS) >> 32; if (!(ctrl & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) return false; -- cgit v1.2.3 From 753c0d5a507b939d6efc60c7b437d5330880cce3 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:45 -0800 Subject: KVM: selftests: Add support for nested NPTs Implement nCR3 and NPT initialization functions, similar to the EPT equivalents, and create common TDP helpers for enablement checking and initialization. Enable NPT for nested guests by default if the TDP MMU was initialized, similar to VMX. Reuse the PTE masks from the main MMU in the NPT MMU, except for the C and S bits related to confidential VMs. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-17-seanjc@google.com [sean: apply Yosry's fixup for ncr3_gpa] Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/include/x86/processor.h | 2 ++ tools/testing/selftests/kvm/include/x86/svm_util.h | 9 ++++++++ tools/testing/selftests/kvm/lib/x86/memstress.c | 4 ++-- tools/testing/selftests/kvm/lib/x86/processor.c | 15 ++++++++++++++ tools/testing/selftests/kvm/lib/x86/svm.c | 24 ++++++++++++++++++++++ .../testing/selftests/kvm/x86/vmx_dirty_log_test.c | 4 ++-- 6 files changed, 54 insertions(+), 4 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 461cf155b96e..115bec5eb1eb 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1479,6 +1479,8 @@ void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, uint64_t nr_bytes, int level); +void vm_enable_tdp(struct kvm_vm *vm); +bool kvm_cpu_has_tdp(void); void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size); void tdp_identity_map_default_memslots(struct kvm_vm *vm); void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size); diff --git a/tools/testing/selftests/kvm/include/x86/svm_util.h b/tools/testing/selftests/kvm/include/x86/svm_util.h index b74c6dcddcbd..5d7c42534bc4 100644 --- a/tools/testing/selftests/kvm/include/x86/svm_util.h +++ b/tools/testing/selftests/kvm/include/x86/svm_util.h @@ -27,6 +27,9 @@ struct svm_test_data { void *msr; /* gva */ void *msr_hva; uint64_t msr_gpa; + + /* NPT */ + uint64_t ncr3_gpa; }; static inline void vmmcall(void) @@ -57,6 +60,12 @@ struct svm_test_data *vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva); void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp); void run_guest(struct vmcb *vmcb, uint64_t vmcb_gpa); +static inline bool kvm_cpu_has_npt(void) +{ + return kvm_cpu_has(X86_FEATURE_NPT); +} +void vm_enable_npt(struct kvm_vm *vm); + int open_sev_dev_path_or_exit(void); #endif /* SELFTEST_KVM_SVM_UTILS_H */ diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c index 3319cb57a78d..407abfc34909 100644 --- a/tools/testing/selftests/kvm/lib/x86/memstress.c +++ b/tools/testing/selftests/kvm/lib/x86/memstress.c @@ -82,9 +82,9 @@ void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vc int vcpu_id; TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); - TEST_REQUIRE(kvm_cpu_has_ept()); + TEST_REQUIRE(kvm_cpu_has_tdp()); - vm_enable_ept(vm); + vm_enable_tdp(vm); for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { vcpu_alloc_vmx(vm, &vmx_gva); diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index 29e7d172f945..a3a4c9a4cbcb 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -8,7 +8,9 @@ #include "kvm_util.h" #include "pmu.h" #include "processor.h" +#include "svm_util.h" #include "sev.h" +#include "vmx.h" #ifndef NUM_INTERRUPTS #define NUM_INTERRUPTS 256 @@ -472,6 +474,19 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) } } +void vm_enable_tdp(struct kvm_vm *vm) +{ + if (kvm_cpu_has(X86_FEATURE_VMX)) + vm_enable_ept(vm); + else + vm_enable_npt(vm); +} + +bool kvm_cpu_has_tdp(void) +{ + return kvm_cpu_has_ept() || kvm_cpu_has_npt(); +} + void __tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size, int level) { diff --git a/tools/testing/selftests/kvm/lib/x86/svm.c b/tools/testing/selftests/kvm/lib/x86/svm.c index d239c2097391..a25a3471f5f6 100644 --- a/tools/testing/selftests/kvm/lib/x86/svm.c +++ b/tools/testing/selftests/kvm/lib/x86/svm.c @@ -46,6 +46,9 @@ vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva) svm->msr_gpa = addr_gva2gpa(vm, (uintptr_t)svm->msr); memset(svm->msr_hva, 0, getpagesize()); + if (vm->stage2_mmu.pgd_created) + svm->ncr3_gpa = vm->stage2_mmu.pgd; + *p_svm_gva = svm_gva; return svm; } @@ -59,6 +62,22 @@ static void vmcb_set_seg(struct vmcb_seg *seg, u16 selector, seg->base = base; } +void vm_enable_npt(struct kvm_vm *vm) +{ + struct pte_masks pte_masks; + + TEST_ASSERT(kvm_cpu_has_npt(), "KVM doesn't supported nested NPT"); + + /* + * NPTs use the same PTE format, but deliberately drop the C-bit as the + * per-VM shared vs. private information is only meant for stage-1. + */ + pte_masks = vm->mmu.arch.pte_masks; + pte_masks.c = 0; + + tdp_mmu_init(vm, vm->mmu.pgtable_levels, &pte_masks); +} + void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp) { struct vmcb *vmcb = svm->vmcb; @@ -102,6 +121,11 @@ void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_r vmcb->save.rip = (u64)guest_rip; vmcb->save.rsp = (u64)guest_rsp; guest_regs.rdi = (u64)svm; + + if (svm->ncr3_gpa) { + ctrl->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE; + ctrl->nested_cr3 = svm->ncr3_gpa; + } } /* diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c index 370f8d3117c2..032ab8bf60a4 100644 --- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c @@ -93,7 +93,7 @@ static void test_vmx_dirty_log(bool enable_ept) /* Create VM */ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); if (enable_ept) - vm_enable_ept(vm); + vm_enable_tdp(vm); vcpu_alloc_vmx(vm, &vmx_pages_gva); vcpu_args_set(vcpu, 1, vmx_pages_gva); @@ -170,7 +170,7 @@ int main(int argc, char *argv[]) test_vmx_dirty_log(/*enable_ept=*/false); - if (kvm_cpu_has_ept()) + if (kvm_cpu_has_tdp()) test_vmx_dirty_log(/*enable_ept=*/true); return 0; -- cgit v1.2.3 From 251e4849a79b258fd3e889ff095ba083ce301c13 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:46 -0800 Subject: KVM: selftests: Set the user bit on nested NPT PTEs According to the APM, NPT walks are treated as user accesses. In preparation for supporting NPT mappings, set the 'user' bit on NPTs by adding a mask of bits to always be set on PTEs in kvm_mmu. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-18-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86/kvm_util_arch.h | 2 ++ tools/testing/selftests/kvm/include/x86/processor.h | 1 + tools/testing/selftests/kvm/lib/x86/processor.c | 5 +++-- tools/testing/selftests/kvm/lib/x86/svm.c | 3 +++ 4 files changed, 9 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h index 1cf84b8212c6..be35d26bb320 100644 --- a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h @@ -22,6 +22,8 @@ struct pte_masks { uint64_t nx; uint64_t c; uint64_t s; + + uint64_t always_set; }; struct kvm_mmu_arch { diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 115bec5eb1eb..995277cae94e 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1452,6 +1452,7 @@ enum pg_level { #define PTE_NX_MASK(mmu) ((mmu)->arch.pte_masks.nx) #define PTE_C_BIT_MASK(mmu) ((mmu)->arch.pte_masks.c) #define PTE_S_BIT_MASK(mmu) ((mmu)->arch.pte_masks.s) +#define PTE_ALWAYS_SET_MASK(mmu) ((mmu)->arch.pte_masks.always_set) /* * For PTEs without a PRESENT bit (i.e. EPT entries), treat the PTE as present diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index a3a4c9a4cbcb..5a3385d48902 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -231,7 +231,8 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, if (!is_present_pte(mmu, pte)) { *pte = PTE_PRESENT_MASK(mmu) | PTE_READABLE_MASK(mmu) | - PTE_WRITABLE_MASK(mmu) | PTE_EXECUTABLE_MASK(mmu); + PTE_WRITABLE_MASK(mmu) | PTE_EXECUTABLE_MASK(mmu) | + PTE_ALWAYS_SET_MASK(mmu); if (current_level == target_level) *pte |= PTE_HUGE_MASK(mmu) | (paddr & PHYSICAL_PAGE_MASK); else @@ -299,7 +300,7 @@ void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, "PTE already present for 4k page at vaddr: 0x%lx", vaddr); *pte = PTE_PRESENT_MASK(mmu) | PTE_READABLE_MASK(mmu) | PTE_WRITABLE_MASK(mmu) | PTE_EXECUTABLE_MASK(mmu) | - (paddr & PHYSICAL_PAGE_MASK); + PTE_ALWAYS_SET_MASK(mmu) | (paddr & PHYSICAL_PAGE_MASK); /* * Neither SEV nor TDX supports shared page tables, so only the final diff --git a/tools/testing/selftests/kvm/lib/x86/svm.c b/tools/testing/selftests/kvm/lib/x86/svm.c index a25a3471f5f6..2e5c480c9afd 100644 --- a/tools/testing/selftests/kvm/lib/x86/svm.c +++ b/tools/testing/selftests/kvm/lib/x86/svm.c @@ -75,6 +75,9 @@ void vm_enable_npt(struct kvm_vm *vm) pte_masks = vm->mmu.arch.pte_masks; pte_masks.c = 0; + /* NPT walks are treated as user accesses, so set the 'user' bit. */ + pte_masks.always_set = pte_masks.user; + tdp_mmu_init(vm, vm->mmu.pgtable_levels, &pte_masks); } -- cgit v1.2.3 From 6794d916f87e0a6cd51a3d8c2c0e6ffd48fa7a79 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:47 -0800 Subject: KVM: selftests: Extend vmx_dirty_log_test to cover SVM Generalize the code in vmx_dirty_log_test.c by adding SVM-specific L1 code, doing some renaming (e.g. EPT -> TDP), and having setup code for both SVM and VMX in test_dirty_log(). Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-19-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 2 +- .../selftests/kvm/x86/nested_dirty_log_test.c | 210 +++++++++++++++++++++ .../testing/selftests/kvm/x86/vmx_dirty_log_test.c | 177 ----------------- 3 files changed, 211 insertions(+), 178 deletions(-) create mode 100644 tools/testing/selftests/kvm/x86/nested_dirty_log_test.c delete mode 100644 tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 3789890421bd..ffbf891b31d3 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -89,6 +89,7 @@ TEST_GEN_PROGS_x86 += x86/kvm_buslock_test TEST_GEN_PROGS_x86 += x86/monitor_mwait_test TEST_GEN_PROGS_x86 += x86/msrs_test TEST_GEN_PROGS_x86 += x86/nested_close_kvm_test +TEST_GEN_PROGS_x86 += x86/nested_dirty_log_test TEST_GEN_PROGS_x86 += x86/nested_emulation_test TEST_GEN_PROGS_x86 += x86/nested_exceptions_test TEST_GEN_PROGS_x86 += x86/nested_invalid_cr3_test @@ -115,7 +116,6 @@ TEST_GEN_PROGS_x86 += x86/ucna_injection_test TEST_GEN_PROGS_x86 += x86/userspace_io_test TEST_GEN_PROGS_x86 += x86/userspace_msr_exit_test TEST_GEN_PROGS_x86 += x86/vmx_apic_access_test -TEST_GEN_PROGS_x86 += x86/vmx_dirty_log_test TEST_GEN_PROGS_x86 += x86/vmx_exception_with_invalid_guest_state TEST_GEN_PROGS_x86 += x86/vmx_msrs_test TEST_GEN_PROGS_x86 += x86/vmx_invalid_nested_guest_state diff --git a/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c b/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c new file mode 100644 index 000000000000..89d2e86a0db9 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c @@ -0,0 +1,210 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM dirty page logging test + * + * Copyright (C) 2018, Red Hat, Inc. + */ +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "vmx.h" + +/* The memory slot index to track dirty pages */ +#define TEST_MEM_SLOT_INDEX 1 +#define TEST_MEM_PAGES 3 + +/* L1 guest test virtual memory offset */ +#define GUEST_TEST_MEM 0xc0000000 + +/* L2 guest test virtual memory offset */ +#define NESTED_TEST_MEM1 0xc0001000 +#define NESTED_TEST_MEM2 0xc0002000 + +#define L2_GUEST_STACK_SIZE 64 + +static void l2_guest_code(u64 *a, u64 *b) +{ + READ_ONCE(*a); + WRITE_ONCE(*a, 1); + GUEST_SYNC(true); + GUEST_SYNC(false); + + WRITE_ONCE(*b, 1); + GUEST_SYNC(true); + WRITE_ONCE(*b, 1); + GUEST_SYNC(true); + GUEST_SYNC(false); + + /* Exit to L1 and never come back. */ + vmcall(); +} + +static void l2_guest_code_tdp_enabled(void) +{ + l2_guest_code((u64 *)NESTED_TEST_MEM1, (u64 *)NESTED_TEST_MEM2); +} + +static void l2_guest_code_tdp_disabled(void) +{ + /* Access the same L1 GPAs as l2_guest_code_tdp_enabled() */ + l2_guest_code((u64 *)GUEST_TEST_MEM, (u64 *)GUEST_TEST_MEM); +} + +void l1_vmx_code(struct vmx_pages *vmx) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + void *l2_rip; + + GUEST_ASSERT(vmx->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx)); + GUEST_ASSERT(load_vmcs(vmx)); + + if (vmx->eptp_gpa) + l2_rip = l2_guest_code_tdp_enabled; + else + l2_rip = l2_guest_code_tdp_disabled; + + prepare_vmcs(vmx, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_SYNC(false); + GUEST_ASSERT(!vmlaunch()); + GUEST_SYNC(false); + GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_VMCALL); + GUEST_DONE(); +} + +static void l1_svm_code(struct svm_test_data *svm) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + void *l2_rip; + + if (svm->ncr3_gpa) + l2_rip = l2_guest_code_tdp_enabled; + else + l2_rip = l2_guest_code_tdp_disabled; + + generic_svm_setup(svm, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_SYNC(false); + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_SYNC(false); + GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL); + GUEST_DONE(); +} + +static void l1_guest_code(void *data) +{ + if (this_cpu_has(X86_FEATURE_VMX)) + l1_vmx_code(data); + else + l1_svm_code(data); +} + +static void test_dirty_log(bool nested_tdp) +{ + vm_vaddr_t nested_gva = 0; + unsigned long *bmap; + uint64_t *host_test_mem; + + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + bool done = false; + + pr_info("Nested TDP: %s\n", nested_tdp ? "enabled" : "disabled"); + + /* Create VM */ + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + if (nested_tdp) + vm_enable_tdp(vm); + + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_alloc_vmx(vm, &nested_gva); + else + vcpu_alloc_svm(vm, &nested_gva); + + vcpu_args_set(vcpu, 1, nested_gva); + + /* Add an extra memory slot for testing dirty logging */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + GUEST_TEST_MEM, + TEST_MEM_SLOT_INDEX, + TEST_MEM_PAGES, + KVM_MEM_LOG_DIRTY_PAGES); + + /* + * Add an identity map for GVA range [0xc0000000, 0xc0002000). This + * affects both L1 and L2. However... + */ + virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, TEST_MEM_PAGES); + + /* + * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to + * 0xc0000000. + * + * When TDP is disabled, the L2 guest code will still access the same L1 + * GPAs as the TDP enabled case. + */ + if (nested_tdp) { + tdp_identity_map_default_memslots(vm); + tdp_map(vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); + tdp_map(vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); + } + + bmap = bitmap_zalloc(TEST_MEM_PAGES); + host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM); + + while (!done) { + memset(host_test_mem, 0xaa, TEST_MEM_PAGES * PAGE_SIZE); + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_SYNC: + /* + * The nested guest wrote at offset 0x1000 in the memslot, but the + * dirty bitmap must be filled in according to L1 GPA, not L2. + */ + kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); + if (uc.args[1]) { + TEST_ASSERT(test_bit(0, bmap), "Page 0 incorrectly reported clean"); + TEST_ASSERT(host_test_mem[0] == 1, "Page 0 not written by guest"); + } else { + TEST_ASSERT(!test_bit(0, bmap), "Page 0 incorrectly reported dirty"); + TEST_ASSERT(host_test_mem[0] == 0xaaaaaaaaaaaaaaaaULL, "Page 0 written by guest"); + } + + TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty"); + TEST_ASSERT(host_test_mem[PAGE_SIZE / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest"); + TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty"); + TEST_ASSERT(host_test_mem[PAGE_SIZE*2 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest"); + break; + case UCALL_DONE: + done = true; + break; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) || kvm_cpu_has(X86_FEATURE_SVM)); + + test_dirty_log(/*nested_tdp=*/false); + + if (kvm_cpu_has_tdp()) + test_dirty_log(/*nested_tdp=*/true); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c deleted file mode 100644 index 032ab8bf60a4..000000000000 --- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c +++ /dev/null @@ -1,177 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * KVM dirty page logging test - * - * Copyright (C) 2018, Red Hat, Inc. - */ -#include -#include -#include -#include - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" -#include "vmx.h" - -/* The memory slot index to track dirty pages */ -#define TEST_MEM_SLOT_INDEX 1 -#define TEST_MEM_PAGES 3 - -/* L1 guest test virtual memory offset */ -#define GUEST_TEST_MEM 0xc0000000 - -/* L2 guest test virtual memory offset */ -#define NESTED_TEST_MEM1 0xc0001000 -#define NESTED_TEST_MEM2 0xc0002000 - -static void l2_guest_code(u64 *a, u64 *b) -{ - READ_ONCE(*a); - WRITE_ONCE(*a, 1); - GUEST_SYNC(true); - GUEST_SYNC(false); - - WRITE_ONCE(*b, 1); - GUEST_SYNC(true); - WRITE_ONCE(*b, 1); - GUEST_SYNC(true); - GUEST_SYNC(false); - - /* Exit to L1 and never come back. */ - vmcall(); -} - -static void l2_guest_code_ept_enabled(void) -{ - l2_guest_code((u64 *)NESTED_TEST_MEM1, (u64 *)NESTED_TEST_MEM2); -} - -static void l2_guest_code_ept_disabled(void) -{ - /* Access the same L1 GPAs as l2_guest_code_ept_enabled() */ - l2_guest_code((u64 *)GUEST_TEST_MEM, (u64 *)GUEST_TEST_MEM); -} - -void l1_guest_code(struct vmx_pages *vmx) -{ -#define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - void *l2_rip; - - GUEST_ASSERT(vmx->vmcs_gpa); - GUEST_ASSERT(prepare_for_vmx_operation(vmx)); - GUEST_ASSERT(load_vmcs(vmx)); - - if (vmx->eptp_gpa) - l2_rip = l2_guest_code_ept_enabled; - else - l2_rip = l2_guest_code_ept_disabled; - - prepare_vmcs(vmx, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]); - - GUEST_SYNC(false); - GUEST_ASSERT(!vmlaunch()); - GUEST_SYNC(false); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); - GUEST_DONE(); -} - -static void test_vmx_dirty_log(bool enable_ept) -{ - vm_vaddr_t vmx_pages_gva = 0; - unsigned long *bmap; - uint64_t *host_test_mem; - - struct kvm_vcpu *vcpu; - struct kvm_vm *vm; - struct ucall uc; - bool done = false; - - pr_info("Nested EPT: %s\n", enable_ept ? "enabled" : "disabled"); - - /* Create VM */ - vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - if (enable_ept) - vm_enable_tdp(vm); - - vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vcpu, 1, vmx_pages_gva); - - /* Add an extra memory slot for testing dirty logging */ - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - GUEST_TEST_MEM, - TEST_MEM_SLOT_INDEX, - TEST_MEM_PAGES, - KVM_MEM_LOG_DIRTY_PAGES); - - /* - * Add an identity map for GVA range [0xc0000000, 0xc0002000). This - * affects both L1 and L2. However... - */ - virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, TEST_MEM_PAGES); - - /* - * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to - * 0xc0000000. - * - * When EPT is disabled, the L2 guest code will still access the same L1 - * GPAs as the EPT enabled case. - */ - if (enable_ept) { - tdp_identity_map_default_memslots(vm); - tdp_map(vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); - tdp_map(vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); - } - - bmap = bitmap_zalloc(TEST_MEM_PAGES); - host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM); - - while (!done) { - memset(host_test_mem, 0xaa, TEST_MEM_PAGES * PAGE_SIZE); - vcpu_run(vcpu); - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - /* NOT REACHED */ - case UCALL_SYNC: - /* - * The nested guest wrote at offset 0x1000 in the memslot, but the - * dirty bitmap must be filled in according to L1 GPA, not L2. - */ - kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); - if (uc.args[1]) { - TEST_ASSERT(test_bit(0, bmap), "Page 0 incorrectly reported clean"); - TEST_ASSERT(host_test_mem[0] == 1, "Page 0 not written by guest"); - } else { - TEST_ASSERT(!test_bit(0, bmap), "Page 0 incorrectly reported dirty"); - TEST_ASSERT(host_test_mem[0] == 0xaaaaaaaaaaaaaaaaULL, "Page 0 written by guest"); - } - - TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty"); - TEST_ASSERT(host_test_mem[PAGE_SIZE / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest"); - TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty"); - TEST_ASSERT(host_test_mem[PAGE_SIZE*2 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest"); - break; - case UCALL_DONE: - done = true; - break; - default: - TEST_FAIL("Unknown ucall %lu", uc.cmd); - } - } -} - -int main(int argc, char *argv[]) -{ - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); - - test_vmx_dirty_log(/*enable_ept=*/false); - - if (kvm_cpu_has_tdp()) - test_vmx_dirty_log(/*enable_ept=*/true); - - return 0; -} -- cgit v1.2.3 From 59eef1a47b8c264d09ee84c909a1da60b4e70bd7 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:48 -0800 Subject: KVM: selftests: Extend memstress to run on nested SVM Add L1 SVM code and generalize the setup code to work for both VMX and SVM. This allows running 'dirty_log_perf_test -n' on AMD CPUs. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-20-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/x86/memstress.c | 42 ++++++++++++++++++++----- 1 file changed, 35 insertions(+), 7 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c index 407abfc34909..86f4c5e4c430 100644 --- a/tools/testing/selftests/kvm/lib/x86/memstress.c +++ b/tools/testing/selftests/kvm/lib/x86/memstress.c @@ -13,6 +13,7 @@ #include "kvm_util.h" #include "memstress.h" #include "processor.h" +#include "svm_util.h" #include "vmx.h" void memstress_l2_guest_code(uint64_t vcpu_id) @@ -29,9 +30,10 @@ __asm__( " ud2;" ); -static void memstress_l1_guest_code(struct vmx_pages *vmx, uint64_t vcpu_id) -{ #define L2_GUEST_STACK_SIZE 64 + +static void l1_vmx_code(struct vmx_pages *vmx, uint64_t vcpu_id) +{ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; unsigned long *rsp; @@ -45,10 +47,34 @@ static void memstress_l1_guest_code(struct vmx_pages *vmx, uint64_t vcpu_id) prepare_vmcs(vmx, memstress_l2_guest_entry, rsp); GUEST_ASSERT(!vmlaunch()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_VMCALL); + GUEST_DONE(); +} + +static void l1_svm_code(struct svm_test_data *svm, uint64_t vcpu_id) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + unsigned long *rsp; + + + rsp = &l2_guest_stack[L2_GUEST_STACK_SIZE - 1]; + *rsp = vcpu_id; + generic_svm_setup(svm, memstress_l2_guest_entry, rsp); + + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL); GUEST_DONE(); } + +static void memstress_l1_guest_code(void *data, uint64_t vcpu_id) +{ + if (this_cpu_has(X86_FEATURE_VMX)) + l1_vmx_code(data, vcpu_id); + else + l1_svm_code(data, vcpu_id); +} + uint64_t memstress_nested_pages(int nr_vcpus) { /* @@ -78,15 +104,17 @@ static void memstress_setup_ept_mappings(struct kvm_vm *vm) void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]) { struct kvm_regs regs; - vm_vaddr_t vmx_gva; + vm_vaddr_t nested_gva; int vcpu_id; - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); TEST_REQUIRE(kvm_cpu_has_tdp()); vm_enable_tdp(vm); for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { - vcpu_alloc_vmx(vm, &vmx_gva); + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_alloc_vmx(vm, &nested_gva); + else + vcpu_alloc_svm(vm, &nested_gva); /* The EPTs are shared across vCPUs, setup the mappings once */ if (vcpu_id == 0) @@ -99,6 +127,6 @@ void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vc vcpu_regs_get(vcpus[vcpu_id], ®s); regs.rip = (unsigned long) memstress_l1_guest_code; vcpu_regs_set(vcpus[vcpu_id], ®s); - vcpu_args_set(vcpus[vcpu_id], 2, vmx_gva, vcpu_id); + vcpu_args_set(vcpus[vcpu_id], 2, nested_gva, vcpu_id); } } -- cgit v1.2.3 From e353850499c717f1f984f7c208f49a8618beff2f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 15:01:49 -0800 Subject: KVM: selftests: Rename vm_get_page_table_entry() to vm_get_pte() Shorten the API to get a PTE as the "PTE" acronym is ubiquitous, and the "page table entry" makes it unnecessarily difficult to quickly understand what callers are doing. No functional change intended. Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-21-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86/processor.h | 2 +- tools/testing/selftests/kvm/lib/x86/processor.c | 2 +- tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c | 2 +- tools/testing/selftests/kvm/x86/smaller_maxphyaddr_emulation_test.c | 4 +--- 4 files changed, 4 insertions(+), 6 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 995277cae94e..8f130e7d7048 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1359,7 +1359,7 @@ static inline bool kvm_is_ignore_msrs(void) return get_kvm_param_bool("ignore_msrs"); } -uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr); +uint64_t *vm_get_pte(struct kvm_vm *vm, uint64_t vaddr); uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index 5a3385d48902..ab869a98bbdc 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -390,7 +390,7 @@ static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, return virt_get_pte(vm, mmu, pte, vaddr, PG_LEVEL_4K); } -uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr) +uint64_t *vm_get_pte(struct kvm_vm *vm, uint64_t vaddr) { int level = PG_LEVEL_4K; diff --git a/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c b/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c index a3b7ce155981..c542cc4762b1 100644 --- a/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c +++ b/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c @@ -619,7 +619,7 @@ int main(int argc, char *argv[]) */ gva = vm_vaddr_unused_gap(vm, NTEST_PAGES * PAGE_SIZE, KVM_UTIL_MIN_VADDR); for (i = 0; i < NTEST_PAGES; i++) { - pte = vm_get_page_table_entry(vm, data->test_pages + i * PAGE_SIZE); + pte = vm_get_pte(vm, data->test_pages + i * PAGE_SIZE); gpa = addr_hva2gpa(vm, pte); virt_pg_map(vm, gva + PAGE_SIZE * i, gpa & PAGE_MASK); data->test_pages_pte[i] = gva + (gpa & ~PAGE_MASK); diff --git a/tools/testing/selftests/kvm/x86/smaller_maxphyaddr_emulation_test.c b/tools/testing/selftests/kvm/x86/smaller_maxphyaddr_emulation_test.c index fabeeaddfb3a..0e8aec568010 100644 --- a/tools/testing/selftests/kvm/x86/smaller_maxphyaddr_emulation_test.c +++ b/tools/testing/selftests/kvm/x86/smaller_maxphyaddr_emulation_test.c @@ -47,7 +47,6 @@ int main(int argc, char *argv[]) struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct ucall uc; - uint64_t *pte; uint64_t *hva; uint64_t gpa; int rc; @@ -73,8 +72,7 @@ int main(int argc, char *argv[]) hva = addr_gpa2hva(vm, MEM_REGION_GPA); memset(hva, 0, PAGE_SIZE); - pte = vm_get_page_table_entry(vm, MEM_REGION_GVA); - *pte |= BIT_ULL(MAXPHYADDR); + *vm_get_pte(vm, MEM_REGION_GVA) |= BIT_ULL(MAXPHYADDR); vcpu_run(vcpu); -- cgit v1.2.3 From bda6ae6f29664b659671f872a2adda3c1c2f5dd6 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 21 Nov 2025 20:48:02 +0000 Subject: KVM: selftests: Use TEST_ASSERT_EQ() in test_vmx_nested_state() The assert messages do not add much value, so use TEST_ASSERT_EQ(), which also nicely displays the addresses in hex. While at it, also assert the values of state->flags. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251121204803.991707-4-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c index 67a62a5a8895..b59a8a17084d 100644 --- a/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c @@ -241,8 +241,10 @@ void test_vmx_nested_state(struct kvm_vcpu *vcpu) TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz, "Size must be between %ld and %d. The size returned was %d.", sizeof(*state), state_sz, state->size); - TEST_ASSERT(state->hdr.vmx.vmxon_pa == -1ull, "vmxon_pa must be -1ull."); - TEST_ASSERT(state->hdr.vmx.vmcs12_pa == -1ull, "vmcs_pa must be -1ull."); + + TEST_ASSERT_EQ(state->hdr.vmx.vmxon_pa, -1ull); + TEST_ASSERT_EQ(state->hdr.vmx.vmcs12_pa, -1ull); + TEST_ASSERT_EQ(state->flags, 0); free(state); } -- cgit v1.2.3 From ca2eccb953fd33ef38701e33e660b21f7e84aa14 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 21 Nov 2025 20:48:03 +0000 Subject: KVM: selftests: Extend vmx_set_nested_state_test to cover SVM Add test cases for the validation checks in svm_set_nested_state(), and allow the test to run with SVM as well as VMX. The SVM test also makes sure that KVM_SET_NESTED_STATE accepts GIF being set or cleared if EFER.SVME is cleared, verifying a recently fixed bug where GIF was incorrectly expected to always be set when EFER.SVME is cleared. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251121204803.991707-5-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 2 +- .../selftests/kvm/x86/nested_set_state_test.c | 406 +++++++++++++++++++++ .../selftests/kvm/x86/vmx_set_nested_state_test.c | 306 ---------------- 3 files changed, 407 insertions(+), 307 deletions(-) create mode 100644 tools/testing/selftests/kvm/x86/nested_set_state_test.c delete mode 100644 tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index ba5c2b643efa..4ddece4ee365 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -92,6 +92,7 @@ TEST_GEN_PROGS_x86 += x86/nested_close_kvm_test TEST_GEN_PROGS_x86 += x86/nested_emulation_test TEST_GEN_PROGS_x86 += x86/nested_exceptions_test TEST_GEN_PROGS_x86 += x86/nested_invalid_cr3_test +TEST_GEN_PROGS_x86 += x86/nested_set_state_test TEST_GEN_PROGS_x86 += x86/nested_tsc_adjust_test TEST_GEN_PROGS_x86 += x86/nested_tsc_scaling_test TEST_GEN_PROGS_x86 += x86/platform_info_test @@ -120,7 +121,6 @@ TEST_GEN_PROGS_x86 += x86/vmx_exception_with_invalid_guest_state TEST_GEN_PROGS_x86 += x86/vmx_msrs_test TEST_GEN_PROGS_x86 += x86/vmx_invalid_nested_guest_state TEST_GEN_PROGS_x86 += x86/vmx_nested_la57_state_test -TEST_GEN_PROGS_x86 += x86/vmx_set_nested_state_test TEST_GEN_PROGS_x86 += x86/apic_bus_clock_test TEST_GEN_PROGS_x86 += x86/xapic_ipi_test TEST_GEN_PROGS_x86 += x86/xapic_state_test diff --git a/tools/testing/selftests/kvm/x86/nested_set_state_test.c b/tools/testing/selftests/kvm/x86/nested_set_state_test.c new file mode 100644 index 000000000000..0f2102b43629 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/nested_set_state_test.c @@ -0,0 +1,406 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2019, Google LLC. + * + * This test verifies the integrity of calling the ioctl KVM_SET_NESTED_STATE. + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" +#include "svm_util.h" + +#include +#include +#include +#include +#include + +/* + * Mirror of VMCS12_REVISION in arch/x86/kvm/vmx/vmcs12.h. If that value + * changes this should be updated. + */ +#define VMCS12_REVISION 0x11e57ed0 + +bool have_evmcs; + +void test_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state *state) +{ + vcpu_nested_state_set(vcpu, state); +} + +void test_nested_state_expect_errno(struct kvm_vcpu *vcpu, + struct kvm_nested_state *state, + int expected_errno) +{ + int rv; + + rv = __vcpu_nested_state_set(vcpu, state); + TEST_ASSERT(rv == -1 && errno == expected_errno, + "Expected %s (%d) from vcpu_nested_state_set but got rv: %i errno: %s (%d)", + strerror(expected_errno), expected_errno, rv, strerror(errno), + errno); +} + +void test_nested_state_expect_einval(struct kvm_vcpu *vcpu, + struct kvm_nested_state *state) +{ + test_nested_state_expect_errno(vcpu, state, EINVAL); +} + +void test_nested_state_expect_efault(struct kvm_vcpu *vcpu, + struct kvm_nested_state *state) +{ + test_nested_state_expect_errno(vcpu, state, EFAULT); +} + +void set_revision_id_for_vmcs12(struct kvm_nested_state *state, + u32 vmcs12_revision) +{ + /* Set revision_id in vmcs12 to vmcs12_revision. */ + memcpy(&state->data, &vmcs12_revision, sizeof(u32)); +} + +void set_default_state(struct kvm_nested_state *state) +{ + memset(state, 0, sizeof(*state)); + state->flags = KVM_STATE_NESTED_RUN_PENDING | + KVM_STATE_NESTED_GUEST_MODE; + state->format = 0; + state->size = sizeof(*state); +} + +void set_default_vmx_state(struct kvm_nested_state *state, int size) +{ + memset(state, 0, size); + if (have_evmcs) + state->flags = KVM_STATE_NESTED_EVMCS; + state->format = 0; + state->size = size; + state->hdr.vmx.vmxon_pa = 0x1000; + state->hdr.vmx.vmcs12_pa = 0x2000; + state->hdr.vmx.smm.flags = 0; + set_revision_id_for_vmcs12(state, VMCS12_REVISION); +} + +void test_vmx_nested_state(struct kvm_vcpu *vcpu) +{ + /* Add a page for VMCS12. */ + const int state_sz = sizeof(struct kvm_nested_state) + getpagesize(); + struct kvm_nested_state *state = + (struct kvm_nested_state *)malloc(state_sz); + + /* The format must be set to 0. 0 for VMX, 1 for SVM. */ + set_default_vmx_state(state, state_sz); + state->format = 1; + test_nested_state_expect_einval(vcpu, state); + + /* + * We cannot virtualize anything if the guest does not have VMX + * enabled. + */ + set_default_vmx_state(state, state_sz); + test_nested_state_expect_einval(vcpu, state); + + /* + * We cannot virtualize anything if the guest does not have VMX + * enabled. We expect KVM_SET_NESTED_STATE to return 0 if vmxon_pa + * is set to -1ull, but the flags must be zero. + */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.vmxon_pa = -1ull; + test_nested_state_expect_einval(vcpu, state); + + state->hdr.vmx.vmcs12_pa = -1ull; + state->flags = KVM_STATE_NESTED_EVMCS; + test_nested_state_expect_einval(vcpu, state); + + state->flags = 0; + test_nested_state(vcpu, state); + + /* Enable VMX in the guest CPUID. */ + vcpu_set_cpuid_feature(vcpu, X86_FEATURE_VMX); + + /* + * Setting vmxon_pa == -1ull and vmcs_pa == -1ull exits early without + * setting the nested state. When the eVMCS flag is not set, the + * expected return value is '0'. + */ + set_default_vmx_state(state, state_sz); + state->flags = 0; + state->hdr.vmx.vmxon_pa = -1ull; + state->hdr.vmx.vmcs12_pa = -1ull; + test_nested_state(vcpu, state); + + /* + * When eVMCS is supported, the eVMCS flag can only be set if the + * enlightened VMCS capability has been enabled. + */ + if (have_evmcs) { + state->flags = KVM_STATE_NESTED_EVMCS; + test_nested_state_expect_einval(vcpu, state); + vcpu_enable_evmcs(vcpu); + test_nested_state(vcpu, state); + } + + /* It is invalid to have vmxon_pa == -1ull and SMM flags non-zero. */ + state->hdr.vmx.smm.flags = 1; + test_nested_state_expect_einval(vcpu, state); + + /* Invalid flags are rejected. */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.flags = ~0; + test_nested_state_expect_einval(vcpu, state); + + /* It is invalid to have vmxon_pa == -1ull and vmcs_pa != -1ull. */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.vmxon_pa = -1ull; + state->flags = 0; + test_nested_state_expect_einval(vcpu, state); + + /* It is invalid to have vmxon_pa set to a non-page aligned address. */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.vmxon_pa = 1; + test_nested_state_expect_einval(vcpu, state); + + /* + * It is invalid to have KVM_STATE_NESTED_SMM_GUEST_MODE and + * KVM_STATE_NESTED_GUEST_MODE set together. + */ + set_default_vmx_state(state, state_sz); + state->flags = KVM_STATE_NESTED_GUEST_MODE | + KVM_STATE_NESTED_RUN_PENDING; + state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE; + test_nested_state_expect_einval(vcpu, state); + + /* + * It is invalid to have any of the SMM flags set besides: + * KVM_STATE_NESTED_SMM_GUEST_MODE + * KVM_STATE_NESTED_SMM_VMXON + */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.smm.flags = ~(KVM_STATE_NESTED_SMM_GUEST_MODE | + KVM_STATE_NESTED_SMM_VMXON); + test_nested_state_expect_einval(vcpu, state); + + /* Outside SMM, SMM flags must be zero. */ + set_default_vmx_state(state, state_sz); + state->flags = 0; + state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE; + test_nested_state_expect_einval(vcpu, state); + + /* + * Size must be large enough to fit kvm_nested_state and vmcs12 + * if VMCS12 physical address is set + */ + set_default_vmx_state(state, state_sz); + state->size = sizeof(*state); + state->flags = 0; + test_nested_state_expect_einval(vcpu, state); + + set_default_vmx_state(state, state_sz); + state->size = sizeof(*state); + state->flags = 0; + state->hdr.vmx.vmcs12_pa = -1; + test_nested_state(vcpu, state); + + /* + * KVM_SET_NESTED_STATE succeeds with invalid VMCS + * contents but L2 not running. + */ + set_default_vmx_state(state, state_sz); + state->flags = 0; + test_nested_state(vcpu, state); + + /* Invalid flags are rejected, even if no VMCS loaded. */ + set_default_vmx_state(state, state_sz); + state->size = sizeof(*state); + state->flags = 0; + state->hdr.vmx.vmcs12_pa = -1; + state->hdr.vmx.flags = ~0; + test_nested_state_expect_einval(vcpu, state); + + /* vmxon_pa cannot be the same address as vmcs_pa. */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.vmxon_pa = 0; + state->hdr.vmx.vmcs12_pa = 0; + test_nested_state_expect_einval(vcpu, state); + + /* + * Test that if we leave nesting the state reflects that when we get + * it again. + */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.vmxon_pa = -1ull; + state->hdr.vmx.vmcs12_pa = -1ull; + state->flags = 0; + test_nested_state(vcpu, state); + vcpu_nested_state_get(vcpu, state); + TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz, + "Size must be between %ld and %d. The size returned was %d.", + sizeof(*state), state_sz, state->size); + + TEST_ASSERT_EQ(state->hdr.vmx.vmxon_pa, -1ull); + TEST_ASSERT_EQ(state->hdr.vmx.vmcs12_pa, -1ull); + TEST_ASSERT_EQ(state->flags, 0); + + free(state); +} + +static void vcpu_efer_enable_svm(struct kvm_vcpu *vcpu) +{ + uint64_t old_efer = vcpu_get_msr(vcpu, MSR_EFER); + + vcpu_set_msr(vcpu, MSR_EFER, old_efer | EFER_SVME); +} + +static void vcpu_efer_disable_svm(struct kvm_vcpu *vcpu) +{ + uint64_t old_efer = vcpu_get_msr(vcpu, MSR_EFER); + + vcpu_set_msr(vcpu, MSR_EFER, old_efer & ~EFER_SVME); +} + +void set_default_svm_state(struct kvm_nested_state *state, int size) +{ + memset(state, 0, size); + state->format = 1; + state->size = size; + state->hdr.svm.vmcb_pa = 0x3000; +} + +void test_svm_nested_state(struct kvm_vcpu *vcpu) +{ + /* Add a page for VMCB. */ + const int state_sz = sizeof(struct kvm_nested_state) + getpagesize(); + struct kvm_nested_state *state = + (struct kvm_nested_state *)malloc(state_sz); + + vcpu_set_cpuid_feature(vcpu, X86_FEATURE_SVM); + + /* The format must be set to 1. 0 for VMX, 1 for SVM. */ + set_default_svm_state(state, state_sz); + state->format = 0; + test_nested_state_expect_einval(vcpu, state); + + /* Invalid flags are rejected, KVM_STATE_NESTED_EVMCS is VMX-only */ + set_default_svm_state(state, state_sz); + state->flags = KVM_STATE_NESTED_EVMCS; + test_nested_state_expect_einval(vcpu, state); + + /* + * If EFER.SVME is clear, guest mode is disallowed and GIF can be set or + * cleared. + */ + vcpu_efer_disable_svm(vcpu); + + set_default_svm_state(state, state_sz); + state->flags = KVM_STATE_NESTED_GUEST_MODE; + test_nested_state_expect_einval(vcpu, state); + + state->flags = 0; + test_nested_state(vcpu, state); + + state->flags = KVM_STATE_NESTED_GIF_SET; + test_nested_state(vcpu, state); + + /* Enable SVM in the guest EFER. */ + vcpu_efer_enable_svm(vcpu); + + /* Setting vmcb_pa to a non-aligned address is only fine when not entering guest mode */ + set_default_svm_state(state, state_sz); + state->hdr.svm.vmcb_pa = -1ull; + state->flags = 0; + test_nested_state(vcpu, state); + state->flags = KVM_STATE_NESTED_GUEST_MODE; + test_nested_state_expect_einval(vcpu, state); + + /* + * Size must be large enough to fit kvm_nested_state and VMCB + * only when entering guest mode. + */ + set_default_svm_state(state, state_sz/2); + state->flags = 0; + test_nested_state(vcpu, state); + state->flags = KVM_STATE_NESTED_GUEST_MODE; + test_nested_state_expect_einval(vcpu, state); + + /* + * Test that if we leave nesting the state reflects that when we get it + * again, except for vmcb_pa, which is always returned as 0 when not in + * guest mode. + */ + set_default_svm_state(state, state_sz); + state->hdr.svm.vmcb_pa = -1ull; + state->flags = KVM_STATE_NESTED_GIF_SET; + test_nested_state(vcpu, state); + vcpu_nested_state_get(vcpu, state); + TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz, + "Size must be between %ld and %d. The size returned was %d.", + sizeof(*state), state_sz, state->size); + + TEST_ASSERT_EQ(state->hdr.svm.vmcb_pa, 0); + TEST_ASSERT_EQ(state->flags, KVM_STATE_NESTED_GIF_SET); + + free(state); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct kvm_nested_state state; + struct kvm_vcpu *vcpu; + + have_evmcs = kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS); + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) || + kvm_cpu_has(X86_FEATURE_SVM)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); + + vm = vm_create_with_one_vcpu(&vcpu, NULL); + + /* + * First run tests with VMX/SVM disabled to check error handling. + * test_{vmx/svm}_nested_state() will re-enable as needed. + */ + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_VMX); + else + vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_SVM); + + /* Passing a NULL kvm_nested_state causes a EFAULT. */ + test_nested_state_expect_efault(vcpu, NULL); + + /* 'size' cannot be smaller than sizeof(kvm_nested_state). */ + set_default_state(&state); + state.size = 0; + test_nested_state_expect_einval(vcpu, &state); + + /* + * Setting the flags 0xf fails the flags check. The only flags that + * can be used are: + * KVM_STATE_NESTED_GUEST_MODE + * KVM_STATE_NESTED_RUN_PENDING + * KVM_STATE_NESTED_EVMCS + */ + set_default_state(&state); + state.flags = 0xf; + test_nested_state_expect_einval(vcpu, &state); + + /* + * If KVM_STATE_NESTED_RUN_PENDING is set then + * KVM_STATE_NESTED_GUEST_MODE has to be set as well. + */ + set_default_state(&state); + state.flags = KVM_STATE_NESTED_RUN_PENDING; + test_nested_state_expect_einval(vcpu, &state); + + if (kvm_cpu_has(X86_FEATURE_VMX)) + test_vmx_nested_state(vcpu); + else + test_svm_nested_state(vcpu); + + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c deleted file mode 100644 index b59a8a17084d..000000000000 --- a/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c +++ /dev/null @@ -1,306 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * vmx_set_nested_state_test - * - * Copyright (C) 2019, Google LLC. - * - * This test verifies the integrity of calling the ioctl KVM_SET_NESTED_STATE. - */ - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" -#include "vmx.h" - -#include -#include -#include -#include -#include - -/* - * Mirror of VMCS12_REVISION in arch/x86/kvm/vmx/vmcs12.h. If that value - * changes this should be updated. - */ -#define VMCS12_REVISION 0x11e57ed0 - -bool have_evmcs; - -void test_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state *state) -{ - vcpu_nested_state_set(vcpu, state); -} - -void test_nested_state_expect_errno(struct kvm_vcpu *vcpu, - struct kvm_nested_state *state, - int expected_errno) -{ - int rv; - - rv = __vcpu_nested_state_set(vcpu, state); - TEST_ASSERT(rv == -1 && errno == expected_errno, - "Expected %s (%d) from vcpu_nested_state_set but got rv: %i errno: %s (%d)", - strerror(expected_errno), expected_errno, rv, strerror(errno), - errno); -} - -void test_nested_state_expect_einval(struct kvm_vcpu *vcpu, - struct kvm_nested_state *state) -{ - test_nested_state_expect_errno(vcpu, state, EINVAL); -} - -void test_nested_state_expect_efault(struct kvm_vcpu *vcpu, - struct kvm_nested_state *state) -{ - test_nested_state_expect_errno(vcpu, state, EFAULT); -} - -void set_revision_id_for_vmcs12(struct kvm_nested_state *state, - u32 vmcs12_revision) -{ - /* Set revision_id in vmcs12 to vmcs12_revision. */ - memcpy(&state->data, &vmcs12_revision, sizeof(u32)); -} - -void set_default_state(struct kvm_nested_state *state) -{ - memset(state, 0, sizeof(*state)); - state->flags = KVM_STATE_NESTED_RUN_PENDING | - KVM_STATE_NESTED_GUEST_MODE; - state->format = 0; - state->size = sizeof(*state); -} - -void set_default_vmx_state(struct kvm_nested_state *state, int size) -{ - memset(state, 0, size); - if (have_evmcs) - state->flags = KVM_STATE_NESTED_EVMCS; - state->format = 0; - state->size = size; - state->hdr.vmx.vmxon_pa = 0x1000; - state->hdr.vmx.vmcs12_pa = 0x2000; - state->hdr.vmx.smm.flags = 0; - set_revision_id_for_vmcs12(state, VMCS12_REVISION); -} - -void test_vmx_nested_state(struct kvm_vcpu *vcpu) -{ - /* Add a page for VMCS12. */ - const int state_sz = sizeof(struct kvm_nested_state) + getpagesize(); - struct kvm_nested_state *state = - (struct kvm_nested_state *)malloc(state_sz); - - /* The format must be set to 0. 0 for VMX, 1 for SVM. */ - set_default_vmx_state(state, state_sz); - state->format = 1; - test_nested_state_expect_einval(vcpu, state); - - /* - * We cannot virtualize anything if the guest does not have VMX - * enabled. - */ - set_default_vmx_state(state, state_sz); - test_nested_state_expect_einval(vcpu, state); - - /* - * We cannot virtualize anything if the guest does not have VMX - * enabled. We expect KVM_SET_NESTED_STATE to return 0 if vmxon_pa - * is set to -1ull, but the flags must be zero. - */ - set_default_vmx_state(state, state_sz); - state->hdr.vmx.vmxon_pa = -1ull; - test_nested_state_expect_einval(vcpu, state); - - state->hdr.vmx.vmcs12_pa = -1ull; - state->flags = KVM_STATE_NESTED_EVMCS; - test_nested_state_expect_einval(vcpu, state); - - state->flags = 0; - test_nested_state(vcpu, state); - - /* Enable VMX in the guest CPUID. */ - vcpu_set_cpuid_feature(vcpu, X86_FEATURE_VMX); - - /* - * Setting vmxon_pa == -1ull and vmcs_pa == -1ull exits early without - * setting the nested state. When the eVMCS flag is not set, the - * expected return value is '0'. - */ - set_default_vmx_state(state, state_sz); - state->flags = 0; - state->hdr.vmx.vmxon_pa = -1ull; - state->hdr.vmx.vmcs12_pa = -1ull; - test_nested_state(vcpu, state); - - /* - * When eVMCS is supported, the eVMCS flag can only be set if the - * enlightened VMCS capability has been enabled. - */ - if (have_evmcs) { - state->flags = KVM_STATE_NESTED_EVMCS; - test_nested_state_expect_einval(vcpu, state); - vcpu_enable_evmcs(vcpu); - test_nested_state(vcpu, state); - } - - /* It is invalid to have vmxon_pa == -1ull and SMM flags non-zero. */ - state->hdr.vmx.smm.flags = 1; - test_nested_state_expect_einval(vcpu, state); - - /* Invalid flags are rejected. */ - set_default_vmx_state(state, state_sz); - state->hdr.vmx.flags = ~0; - test_nested_state_expect_einval(vcpu, state); - - /* It is invalid to have vmxon_pa == -1ull and vmcs_pa != -1ull. */ - set_default_vmx_state(state, state_sz); - state->hdr.vmx.vmxon_pa = -1ull; - state->flags = 0; - test_nested_state_expect_einval(vcpu, state); - - /* It is invalid to have vmxon_pa set to a non-page aligned address. */ - set_default_vmx_state(state, state_sz); - state->hdr.vmx.vmxon_pa = 1; - test_nested_state_expect_einval(vcpu, state); - - /* - * It is invalid to have KVM_STATE_NESTED_SMM_GUEST_MODE and - * KVM_STATE_NESTED_GUEST_MODE set together. - */ - set_default_vmx_state(state, state_sz); - state->flags = KVM_STATE_NESTED_GUEST_MODE | - KVM_STATE_NESTED_RUN_PENDING; - state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE; - test_nested_state_expect_einval(vcpu, state); - - /* - * It is invalid to have any of the SMM flags set besides: - * KVM_STATE_NESTED_SMM_GUEST_MODE - * KVM_STATE_NESTED_SMM_VMXON - */ - set_default_vmx_state(state, state_sz); - state->hdr.vmx.smm.flags = ~(KVM_STATE_NESTED_SMM_GUEST_MODE | - KVM_STATE_NESTED_SMM_VMXON); - test_nested_state_expect_einval(vcpu, state); - - /* Outside SMM, SMM flags must be zero. */ - set_default_vmx_state(state, state_sz); - state->flags = 0; - state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE; - test_nested_state_expect_einval(vcpu, state); - - /* - * Size must be large enough to fit kvm_nested_state and vmcs12 - * if VMCS12 physical address is set - */ - set_default_vmx_state(state, state_sz); - state->size = sizeof(*state); - state->flags = 0; - test_nested_state_expect_einval(vcpu, state); - - set_default_vmx_state(state, state_sz); - state->size = sizeof(*state); - state->flags = 0; - state->hdr.vmx.vmcs12_pa = -1; - test_nested_state(vcpu, state); - - /* - * KVM_SET_NESTED_STATE succeeds with invalid VMCS - * contents but L2 not running. - */ - set_default_vmx_state(state, state_sz); - state->flags = 0; - test_nested_state(vcpu, state); - - /* Invalid flags are rejected, even if no VMCS loaded. */ - set_default_vmx_state(state, state_sz); - state->size = sizeof(*state); - state->flags = 0; - state->hdr.vmx.vmcs12_pa = -1; - state->hdr.vmx.flags = ~0; - test_nested_state_expect_einval(vcpu, state); - - /* vmxon_pa cannot be the same address as vmcs_pa. */ - set_default_vmx_state(state, state_sz); - state->hdr.vmx.vmxon_pa = 0; - state->hdr.vmx.vmcs12_pa = 0; - test_nested_state_expect_einval(vcpu, state); - - /* - * Test that if we leave nesting the state reflects that when we get - * it again. - */ - set_default_vmx_state(state, state_sz); - state->hdr.vmx.vmxon_pa = -1ull; - state->hdr.vmx.vmcs12_pa = -1ull; - state->flags = 0; - test_nested_state(vcpu, state); - vcpu_nested_state_get(vcpu, state); - TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz, - "Size must be between %ld and %d. The size returned was %d.", - sizeof(*state), state_sz, state->size); - - TEST_ASSERT_EQ(state->hdr.vmx.vmxon_pa, -1ull); - TEST_ASSERT_EQ(state->hdr.vmx.vmcs12_pa, -1ull); - TEST_ASSERT_EQ(state->flags, 0); - - free(state); -} - -int main(int argc, char *argv[]) -{ - struct kvm_vm *vm; - struct kvm_nested_state state; - struct kvm_vcpu *vcpu; - - have_evmcs = kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS); - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); - - /* - * AMD currently does not implement set_nested_state, so for now we - * just early out. - */ - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); - - vm = vm_create_with_one_vcpu(&vcpu, NULL); - - /* - * First run tests with VMX disabled to check error handling. - */ - vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_VMX); - - /* Passing a NULL kvm_nested_state causes a EFAULT. */ - test_nested_state_expect_efault(vcpu, NULL); - - /* 'size' cannot be smaller than sizeof(kvm_nested_state). */ - set_default_state(&state); - state.size = 0; - test_nested_state_expect_einval(vcpu, &state); - - /* - * Setting the flags 0xf fails the flags check. The only flags that - * can be used are: - * KVM_STATE_NESTED_GUEST_MODE - * KVM_STATE_NESTED_RUN_PENDING - * KVM_STATE_NESTED_EVMCS - */ - set_default_state(&state); - state.flags = 0xf; - test_nested_state_expect_einval(vcpu, &state); - - /* - * If KVM_STATE_NESTED_RUN_PENDING is set then - * KVM_STATE_NESTED_GUEST_MODE has to be set as well. - */ - set_default_state(&state); - state.flags = KVM_STATE_NESTED_RUN_PENDING; - test_nested_state_expect_einval(vcpu, &state); - - test_vmx_nested_state(vcpu); - - kvm_vm_free(vm); - return 0; -} -- cgit v1.2.3 From 58e3e5265484a1bf39569903630a45a924621aaa Mon Sep 17 00:00:00 2001 From: Shengming Hu Date: Mon, 29 Dec 2025 21:52:27 +0800 Subject: memblock: drop redundant 'struct page *' argument from memblock_free_pages() memblock_free_pages() currently takes both a struct page * and the corresponding PFN. The page pointer is always derived from the PFN at call sites (pfn_to_page(pfn)), making the parameter redundant and also allowing accidental mismatches between the two arguments. Simplify the interface by removing the struct page * argument and deriving the page locally from the PFN, after the deferred struct page initialization check. This keeps the behavior unchanged while making the helper harder to misuse. Signed-off-by: Shengming Hu Reviewed-by: David Hildenbrand (Red Hat) Link: https://patch.msgid.link/tencent_F741CE6ECC49EE099736685E60C0DBD4A209@qq.com Signed-off-by: Mike Rapoport (Microsoft) --- mm/internal.h | 3 +-- mm/memblock.c | 4 ++-- mm/mm_init.c | 5 +++-- tools/testing/memblock/internal.h | 3 +-- 4 files changed, 7 insertions(+), 8 deletions(-) (limited to 'tools/testing') diff --git a/mm/internal.h b/mm/internal.h index e430da900430..5f93ee1459d9 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -742,8 +742,7 @@ static inline void clear_zone_contiguous(struct zone *zone) extern int __isolate_free_page(struct page *page, unsigned int order); extern void __putback_isolated_page(struct page *page, unsigned int order, int mt); -extern void memblock_free_pages(struct page *page, unsigned long pfn, - unsigned int order); +extern void memblock_free_pages(unsigned long pfn, unsigned int order); extern void __free_pages_core(struct page *page, unsigned int order, enum meminit_context context); diff --git a/mm/memblock.c b/mm/memblock.c index 905d06b16348..6e11f81c4870 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1771,7 +1771,7 @@ void __init memblock_free_late(phys_addr_t base, phys_addr_t size) end = PFN_DOWN(base + size); for (; cursor < end; cursor++) { - memblock_free_pages(pfn_to_page(cursor), cursor, 0); + memblock_free_pages(cursor, 0); totalram_pages_inc(); } } @@ -2216,7 +2216,7 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end) while (start + (1UL << order) > end) order--; - memblock_free_pages(pfn_to_page(start), start, order); + memblock_free_pages(start, order); start += (1UL << order); } diff --git a/mm/mm_init.c b/mm/mm_init.c index fc2a6f1e518f..d5b91602ff2a 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2480,9 +2480,10 @@ void *__init alloc_large_system_hash(const char *tablename, return table; } -void __init memblock_free_pages(struct page *page, unsigned long pfn, - unsigned int order) +void __init memblock_free_pages(unsigned long pfn, unsigned int order) { + struct page *page = pfn_to_page(pfn); + if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) { int nid = early_pfn_to_nid(pfn); diff --git a/tools/testing/memblock/internal.h b/tools/testing/memblock/internal.h index 0ab4b53bb4f3..009b97bbdd22 100644 --- a/tools/testing/memblock/internal.h +++ b/tools/testing/memblock/internal.h @@ -15,8 +15,7 @@ bool mirrored_kernelcore = false; struct page {}; -void memblock_free_pages(struct page *page, unsigned long pfn, - unsigned int order) +void memblock_free_pages(unsigned long pfn, unsigned int order) { } -- cgit v1.2.3 From 15e8d739fda1084d81f7d3813e9600eba6e0f134 Mon Sep 17 00:00:00 2001 From: Günther Noack Date: Thu, 1 Jan 2026 14:40:58 +0100 Subject: selftests/landlock: Properly close a file descriptor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a missing close(srv_fd) call, and use EXPECT_EQ() to check the result. Signed-off-by: Günther Noack Fixes: f83d51a5bdfe ("selftests/landlock: Check IOCTL restrictions for named UNIX domain sockets") Link: https://lore.kernel.org/r/20260101134102.25938-2-gnoack3000@gmail.com [mic: Use EXPECT_EQ() and update commit message] Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/fs_test.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index 37a5a3df712e..968a91c927a4 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -4399,7 +4399,8 @@ TEST_F_FORK(layout1, named_unix_domain_socket_ioctl) /* FIONREAD and other IOCTLs should not be forbidden. */ EXPECT_EQ(0, test_fionread_ioctl(cli_fd)); - ASSERT_EQ(0, close(cli_fd)); + EXPECT_EQ(0, close(cli_fd)); + EXPECT_EQ(0, close(srv_fd)); } /* clang-format off */ -- cgit v1.2.3 From 671ef08d9455f5754d1fc96f5a14e357d6b80936 Mon Sep 17 00:00:00 2001 From: Xiaochen Shen Date: Wed, 17 Dec 2025 11:04:53 +0800 Subject: selftests/resctrl: Fix a division by zero error on Hygon Change to adjust effective L3 cache size with SNC enabled change introduced the snc_nodes_per_l3_cache() function to detect the Intel Sub-NUMA Clustering (SNC) feature by comparing #CPUs in node0 with #CPUs sharing LLC with CPU0. The function was designed to return: (1) >1: SNC mode is enabled. (2) 1: SNC mode is not enabled or not supported. However, on certain Hygon CPUs, #CPUs sharing LLC with CPU0 is actually less than #CPUs in node0. This results in snc_nodes_per_l3_cache() returning 0 (calculated as cache_cpus / node_cpus). This leads to a division by zero error in get_cache_size(): *cache_size /= snc_nodes_per_l3_cache(); Causing the resctrl selftest to fail with: "Floating point exception (core dumped)" Fix the issue by ensuring snc_nodes_per_l3_cache() returns 1 when SNC mode is not supported on the platform. Updated commit log to fix commit has issues: Shuah Khan Link: https://lore.kernel.org/r/20251217030456.3834956-2-shenxiaochen@open-hieco.net Fixes: a1cd99e700ec ("selftests/resctrl: Adjust effective L3 cache size with SNC enabled") Signed-off-by: Xiaochen Shen Reviewed-by: Reinette Chatre Reviewed-by: Fenghua Yu Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/resctrlfs.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c index 195f04c4d158..b9c1bfb6cc02 100644 --- a/tools/testing/selftests/resctrl/resctrlfs.c +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -243,6 +243,16 @@ int snc_nodes_per_l3_cache(void) } snc_mode = cache_cpus / node_cpus; + /* + * On some platforms (e.g. Hygon), + * cache_cpus < node_cpus, the calculated snc_mode is 0. + * + * Set snc_mode = 1 to indicate that SNC mode is not + * supported on the platform. + */ + if (!snc_mode) + snc_mode = 1; + if (snc_mode > 1) ksft_print_msg("SNC-%d mode discovered.\n", snc_mode); } -- cgit v1.2.3 From 4f4f01cc333e97b0e63b61ed1a65c928aa662f99 Mon Sep 17 00:00:00 2001 From: Xiaochen Shen Date: Wed, 17 Dec 2025 11:04:54 +0800 Subject: selftests/resctrl: Define CPU vendor IDs as bits to match usage The CPU vendor IDs are required to be unique bits because they're used for vendor_specific bitmask in the struct resctrl_test. Consider for example their usage in test_vendor_specific_check(): return get_vendor() & test->vendor_specific However, the definitions of CPU vendor IDs in file resctrl.h is quite subtle as a bitmask value: #define ARCH_INTEL 1 #define ARCH_AMD 2 A clearer and more maintainable approach is to define these CPU vendor IDs using BIT(). This ensures each vendor corresponds to a distinct bit and makes it obvious when adding new vendor IDs. Accordingly, update the return types of detect_vendor() and get_vendor() from 'int' to 'unsigned int' to align with their usage as bitmask values and to prevent potentially risky type conversions. Furthermore, introduce a bool flag 'initialized' to simplify the get_vendor() -> detect_vendor() logic. This ensures the vendor ID is detected only once and resolves the ambiguity of using the same variable 'vendor' both as a value and as a state. Link: https://lore.kernel.org/r/20251217030456.3834956-3-shenxiaochen@open-hieco.net Suggested-by: Reinette Chatre Suggested-by: Fenghua Yu Signed-off-by: Xiaochen Shen Reviewed-by: Reinette Chatre Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/resctrl.h | 7 ++++--- tools/testing/selftests/resctrl/resctrl_tests.c | 26 +++++++++++++++++-------- 2 files changed, 22 insertions(+), 11 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index 3c51bdac2dfa..4f9c7d04c98d 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -23,6 +23,7 @@ #include #include #include +#include #include "kselftest.h" #define MB (1024 * 1024) @@ -36,8 +37,8 @@ * Define as bits because they're used for vendor_specific bitmask in * the struct resctrl_test. */ -#define ARCH_INTEL 1 -#define ARCH_AMD 2 +#define ARCH_INTEL BIT(0) +#define ARCH_AMD BIT(1) #define END_OF_TESTS 1 @@ -163,7 +164,7 @@ extern int snc_unreliable; extern char llc_occup_path[1024]; int snc_nodes_per_l3_cache(void); -int get_vendor(void); +unsigned int get_vendor(void); bool check_resctrlfs_support(void); int filter_dmesg(void); int get_domain_id(const char *resource, int cpu_no, int *domain_id); diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index 5154ffd821c4..42605e2a3b66 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -23,16 +23,24 @@ static struct resctrl_test *resctrl_tests[] = { &l2_noncont_cat_test, }; -static int detect_vendor(void) +static unsigned int detect_vendor(void) { - FILE *inf = fopen("/proc/cpuinfo", "r"); - int vendor_id = 0; + static unsigned int vendor_id; + static bool initialized; char *s = NULL; + FILE *inf; char *res; - if (!inf) + if (initialized) return vendor_id; + inf = fopen("/proc/cpuinfo", "r"); + if (!inf) { + vendor_id = 0; + initialized = true; + return vendor_id; + } + res = fgrep(inf, "vendor_id"); if (res) @@ -45,15 +53,17 @@ static int detect_vendor(void) fclose(inf); free(res); + + initialized = true; return vendor_id; } -int get_vendor(void) +unsigned int get_vendor(void) { - static int vendor = -1; + unsigned int vendor; + + vendor = detect_vendor(); - if (vendor == -1) - vendor = detect_vendor(); if (vendor == 0) ksft_print_msg("Can not get vendor info...\n"); -- cgit v1.2.3 From 367f931e6476747edbde4e7c7b95fc5d5b724934 Mon Sep 17 00:00:00 2001 From: Xiaochen Shen Date: Wed, 17 Dec 2025 11:04:55 +0800 Subject: selftests/resctrl: Add CPU vendor detection for Hygon The resctrl selftest currently fails on Hygon CPUs that support Platform QoS features, printing the error: "# Can not get vendor info..." This occurs because vendor detection is missing for Hygon CPUs. Fix this by extending the CPU vendor detection logic to include Hygon's vendor ID. Link: https://lore.kernel.org/r/20251217030456.3834956-4-shenxiaochen@open-hieco.net Signed-off-by: Xiaochen Shen Reviewed-by: Reinette Chatre Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/resctrl.h | 1 + tools/testing/selftests/resctrl/resctrl_tests.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index 4f9c7d04c98d..afe635b6e48d 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -39,6 +39,7 @@ */ #define ARCH_INTEL BIT(0) #define ARCH_AMD BIT(1) +#define ARCH_HYGON BIT(2) #define END_OF_TESTS 1 diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index 42605e2a3b66..dbcd5eea9fbc 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -50,6 +50,8 @@ static unsigned int detect_vendor(void) vendor_id = ARCH_INTEL; else if (s && !strcmp(s, ": AuthenticAMD\n")) vendor_id = ARCH_AMD; + else if (s && !strcmp(s, ": HygonGenuine\n")) + vendor_id = ARCH_HYGON; fclose(inf); free(res); -- cgit v1.2.3 From 86063a2568b8f2eeb68da1411b320c0ff778f852 Mon Sep 17 00:00:00 2001 From: Xiaochen Shen Date: Wed, 17 Dec 2025 11:04:56 +0800 Subject: selftests/resctrl: Fix non-contiguous CBM check for Hygon The resctrl selftest currently fails on Hygon CPUs that always supports non-contiguous CBM, printing the error: "# Hardware and kernel differ on non-contiguous CBM support!" This occurs because the arch_supports_noncont_cat() function lacks vendor detection for Hygon CPUs, preventing proper identification of their non-contiguous CBM capability. Fix this by adding Hygon vendor ID detection to arch_supports_noncont_cat(). Link: https://lore.kernel.org/r/20251217030456.3834956-5-shenxiaochen@open-hieco.net Signed-off-by: Xiaochen Shen Reviewed-by: Reinette Chatre Reviewed-by: Fenghua Yu Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/cat_test.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index 94cfdba5308d..f00b622c1460 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -290,8 +290,10 @@ static int cat_run_test(const struct resctrl_test *test, const struct user_param static bool arch_supports_noncont_cat(const struct resctrl_test *test) { - /* AMD always supports non-contiguous CBM. */ - if (get_vendor() == ARCH_AMD) + unsigned int vendor_id = get_vendor(); + + /* AMD and Hygon always support non-contiguous CBM. */ + if (vendor_id == ARCH_AMD || vendor_id == ARCH_HYGON) return true; #if defined(__i386__) || defined(__x86_64__) /* arch */ -- cgit v1.2.3 From 96ea4fa60c4528d95bdbce7f4212c015ab3e8113 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 6 Jan 2026 12:02:05 -0800 Subject: selftests: tls: avoid flakiness in data_steal We see the following failure a few times a week: # RUN global.data_steal ... # tls.c:3280:data_steal:Expected recv(cfd, buf2, sizeof(buf2), MSG_DONTWAIT) (10000) == -1 (-1) # data_steal: Test failed # FAIL global.data_steal not ok 8 global.data_steal The 10000 bytes read suggests that the child process did a recv() of half of the data using the TLS ULP and we're now getting the remaining half. The intent of the test is to get the child to enter _TCP_ recvmsg handler, so it needs to enter the syscall before parent installed the TLS recvmsg with setsockopt(SOL_TLS). Instead of the 10msec sleep send 1 byte of data and wait for the child to consume it. Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20260106200205.1593915-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tls.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index a4d16a460fbe..9e2ccea13d70 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -3260,17 +3260,25 @@ TEST(data_steal) { ASSERT_EQ(setsockopt(cfd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls")), 0); /* Spawn a child and get it into the read wait path of the underlying - * TCP socket. + * TCP socket (before kernel .recvmsg is replaced with the TLS one). */ pid = fork(); ASSERT_GE(pid, 0); if (!pid) { - EXPECT_EQ(recv(cfd, buf, sizeof(buf) / 2, MSG_WAITALL), - sizeof(buf) / 2); + EXPECT_EQ(recv(cfd, buf, sizeof(buf) / 2 + 1, MSG_WAITALL), + sizeof(buf) / 2 + 1); exit(!__test_passed(_metadata)); } - usleep(10000); + /* Send a sync byte and poll until it's consumed to ensure + * the child is in recv() before we proceed to install TLS. + */ + ASSERT_EQ(send(fd, buf, 1, 0), 1); + do { + usleep(500); + } while (recv(cfd, buf, 1, MSG_PEEK | MSG_DONTWAIT) == 1); + EXPECT_EQ(errno, EAGAIN); + ASSERT_EQ(setsockopt(fd, SOL_TLS, TLS_TX, &tls, tls.len), 0); ASSERT_EQ(setsockopt(cfd, SOL_TLS, TLS_RX, &tls, tls.len), 0); -- cgit v1.2.3 From a0ac0ff382767a5dbde266fb5f7997a5d6b70e10 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 7 Jan 2026 15:25:57 -0800 Subject: selftests: drv-net: gro: increase the rcvbuf size The gro.py test (testing software GRO) is slightly flaky when running against fbnic. We see one flake per roughly 20 runs in NIPA, mostly in ipip.large, and always including some EAGAIN: # Shouldn't coalesce if exceed IP max pkt size: Test succeeded # Expected {65475 899 }, Total 2 packets # Received {65475 899 }, Total 2 packets. # Expected {64576 900 900 }, Total 3 packets # Received {64576 /home/virtme/testing/wt-24/tools/testing/selftests/drivers/net/gro: could not receive: Resource temporarily unavailable The test sends 2 large frames (64k + change). Looks like the default packet socket rcvbuf (~200kB) may not be large enough to hold them. Bump the rcvbuf to 1MB. Add a debug print showing socket statistics to make debugging this issue easier in the future. Without the rcvbuf increase we see: # Shouldn't coalesce if exceed IP max pkt size: Test succeeded # Expected {65475 899 }, Total 2 packets # Received {65475 899 }, Total 2 packets. # Expected {64576 900 900 }, Total 3 packets # Received {64576 Socket stats: packets=7, drops=3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # /home/virtme/testing/wt-24/tools/testing/selftests/drivers/net/gro: could not receive: Resource temporarily unavailable Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260107232557.2147760-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/gro.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/drivers/net/gro.c b/tools/testing/selftests/drivers/net/gro.c index e894037d2e3e..751a8103f408 100644 --- a/tools/testing/selftests/drivers/net/gro.c +++ b/tools/testing/selftests/drivers/net/gro.c @@ -926,6 +926,28 @@ static void set_timeout(int fd) error(1, errno, "cannot set timeout, setsockopt failed"); } +static void set_rcvbuf(int fd) +{ + int bufsize = 1 * 1024 * 1024; /* 1 MB */ + + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &bufsize, sizeof(bufsize))) + error(1, errno, "cannot set rcvbuf size, setsockopt failed"); +} + +static void recv_error(int fd, int rcv_errno) +{ + struct tpacket_stats stats; + socklen_t len; + + len = sizeof(stats); + if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &stats, &len)) + error(1, errno, "can't get stats"); + + fprintf(stderr, "Socket stats: packets=%u, drops=%u\n", + stats.tp_packets, stats.tp_drops); + error(1, rcv_errno, "could not receive"); +} + static void check_recv_pkts(int fd, int *correct_payload, int correct_num_pkts) { @@ -950,7 +972,7 @@ static void check_recv_pkts(int fd, int *correct_payload, ip_ext_len = 0; pkt_size = recv(fd, buffer, IP_MAXPACKET + ETH_HLEN + 1, 0); if (pkt_size < 0) - error(1, errno, "could not receive"); + recv_error(fd, errno); if (iph->version == 4) ip_ext_len = (iph->ihl - 5) * 4; @@ -1126,6 +1148,7 @@ static void gro_receiver(void) error(1, 0, "socket creation"); setup_sock_filter(rxfd); set_timeout(rxfd); + set_rcvbuf(rxfd); bind_packetsocket(rxfd); ksft_ready(); -- cgit v1.2.3 From 68ec2b9fc59e8053f17ee1d1a5b1959c43a19202 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 7 Jan 2026 06:53:19 -0800 Subject: selftests: forwarding: update PTP tcpdump patterns Recent version of tcpdump (tcpdump-4.99.6-1.fc43.x86_64) seems to have removed the spurious space after msg type in PTP info, e.g.: before: PTPv2, majorSdoId: 0x0, msg type : sync msg, length: 44 after: PTPv2, majorSdoId: 0x0, msg type: sync msg, length: 44 Update our patterns to match both. Reviewed-by: Alexander Sverdlin Link: https://patch.msgid.link/20260107145320.1837464-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- .../selftests/net/forwarding/local_termination.sh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index 892895659c7e..1f2bf6e81847 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -306,39 +306,39 @@ run_test() if [ $skip_ptp = false ]; then check_rcv $rcv_if_name "1588v2 over L2 transport, Sync" \ - "ethertype PTP (0x88f7).* PTPv2.* msg type : sync msg" \ + "ethertype PTP (0x88f7).* PTPv2.* msg type *: sync msg" \ true "$test_name" check_rcv $rcv_if_name "1588v2 over L2 transport, Follow-Up" \ - "ethertype PTP (0x88f7).* PTPv2.* msg type : follow up msg" \ + "ethertype PTP (0x88f7).* PTPv2.* msg type *: follow up msg" \ true "$test_name" check_rcv $rcv_if_name "1588v2 over L2 transport, Peer Delay Request" \ - "ethertype PTP (0x88f7).* PTPv2.* msg type : peer delay req msg" \ + "ethertype PTP (0x88f7).* PTPv2.* msg type *: peer delay req msg" \ true "$test_name" check_rcv $rcv_if_name "1588v2 over IPv4, Sync" \ - "ethertype IPv4 (0x0800).* PTPv2.* msg type : sync msg" \ + "ethertype IPv4 (0x0800).* PTPv2.* msg type *: sync msg" \ true "$test_name" check_rcv $rcv_if_name "1588v2 over IPv4, Follow-Up" \ - "ethertype IPv4 (0x0800).* PTPv2.* msg type : follow up msg" \ + "ethertype IPv4 (0x0800).* PTPv2.* msg type *: follow up msg" \ true "$test_name" check_rcv $rcv_if_name "1588v2 over IPv4, Peer Delay Request" \ - "ethertype IPv4 (0x0800).* PTPv2.* msg type : peer delay req msg" \ + "ethertype IPv4 (0x0800).* PTPv2.* msg type *: peer delay req msg" \ true "$test_name" check_rcv $rcv_if_name "1588v2 over IPv6, Sync" \ - "ethertype IPv6 (0x86dd).* PTPv2.* msg type : sync msg" \ + "ethertype IPv6 (0x86dd).* PTPv2.* msg type *: sync msg" \ true "$test_name" check_rcv $rcv_if_name "1588v2 over IPv6, Follow-Up" \ - "ethertype IPv6 (0x86dd).* PTPv2.* msg type : follow up msg" \ + "ethertype IPv6 (0x86dd).* PTPv2.* msg type *: follow up msg" \ true "$test_name" check_rcv $rcv_if_name "1588v2 over IPv6, Peer Delay Request" \ - "ethertype IPv6 (0x86dd).* PTPv2.* msg type : peer delay req msg" \ + "ethertype IPv6 (0x86dd).* PTPv2.* msg type *: peer delay req msg" \ true "$test_name" fi -- cgit v1.2.3 From a1025dcd377ef92d9a09af03b70ce80be281ee22 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 24 Dec 2025 00:44:49 +0100 Subject: selftests: kvm: replace numbered sync points with actions Rework the guest=>host syncs in the AMX test to use named actions instead of arbitrary, incrementing numbers. The "stage" of the test has no real meaning, what matters is what action the test wants the host to perform. The incrementing numbers are somewhat helpful for triaging failures, but fully debugging failures almost always requires a much deeper dive into the test (and KVM). Using named actions not only makes it easier to extend the test without having to shift all sync point numbers, it makes the code easier to read. [Commit message by Sean Christopherson] Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86/amx_test.c | 88 +++++++++++++++--------------- 1 file changed, 43 insertions(+), 45 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/x86/amx_test.c b/tools/testing/selftests/kvm/x86/amx_test.c index f4ce5a185a7d..3de4402ac17d 100644 --- a/tools/testing/selftests/kvm/x86/amx_test.c +++ b/tools/testing/selftests/kvm/x86/amx_test.c @@ -124,6 +124,14 @@ static void set_tilecfg(struct tile_config *cfg) } } +enum { + /* Check TMM0 against tiledata */ + TEST_COMPARE_TILEDATA = 1, + + /* Full VM save/restore */ + TEST_SAVE_RESTORE = 2, +}; + static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg, struct tile_data *tiledata, struct xstate *xstate) @@ -131,20 +139,20 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg, GUEST_ASSERT(this_cpu_has(X86_FEATURE_XSAVE) && this_cpu_has(X86_FEATURE_OSXSAVE)); check_xtile_info(); - GUEST_SYNC(1); + GUEST_SYNC(TEST_SAVE_RESTORE); /* xfd=0, enable amx */ wrmsr(MSR_IA32_XFD, 0); - GUEST_SYNC(2); + GUEST_SYNC(TEST_SAVE_RESTORE); GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == 0); set_tilecfg(amx_cfg); __ldtilecfg(amx_cfg); - GUEST_SYNC(3); + GUEST_SYNC(TEST_SAVE_RESTORE); /* Check save/restore when trap to userspace */ __tileloadd(tiledata); - GUEST_SYNC(4); + GUEST_SYNC(TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE); __tilerelease(); - GUEST_SYNC(5); + GUEST_SYNC(TEST_SAVE_RESTORE); /* * After XSAVEC, XTILEDATA is cleared in the xstate_bv but is set in * the xcomp_bv. @@ -154,6 +162,8 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg, GUEST_ASSERT(!(xstate->header.xstate_bv & XFEATURE_MASK_XTILE_DATA)); GUEST_ASSERT(xstate->header.xcomp_bv & XFEATURE_MASK_XTILE_DATA); + /* #NM test */ + /* xfd=0x40000, disable amx tiledata */ wrmsr(MSR_IA32_XFD, XFEATURE_MASK_XTILE_DATA); @@ -166,13 +176,13 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg, GUEST_ASSERT(!(xstate->header.xstate_bv & XFEATURE_MASK_XTILE_DATA)); GUEST_ASSERT((xstate->header.xcomp_bv & XFEATURE_MASK_XTILE_DATA)); - GUEST_SYNC(6); + GUEST_SYNC(TEST_SAVE_RESTORE); GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA); set_tilecfg(amx_cfg); __ldtilecfg(amx_cfg); /* Trigger #NM exception */ __tileloadd(tiledata); - GUEST_SYNC(10); + GUEST_SYNC(TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE); GUEST_DONE(); } @@ -180,18 +190,18 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg, void guest_nm_handler(struct ex_regs *regs) { /* Check if #NM is triggered by XFEATURE_MASK_XTILE_DATA */ - GUEST_SYNC(7); + GUEST_SYNC(TEST_SAVE_RESTORE); GUEST_ASSERT(!(get_cr0() & X86_CR0_TS)); GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILE_DATA); GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA); - GUEST_SYNC(8); + GUEST_SYNC(TEST_SAVE_RESTORE); GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILE_DATA); GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA); /* Clear xfd_err */ wrmsr(MSR_IA32_XFD_ERR, 0); /* xfd=0, enable amx */ wrmsr(MSR_IA32_XFD, 0); - GUEST_SYNC(9); + GUEST_SYNC(TEST_SAVE_RESTORE); } int main(int argc, char *argv[]) @@ -244,6 +254,7 @@ int main(int argc, char *argv[]) memset(addr_gva2hva(vm, xstate), 0, PAGE_SIZE * DIV_ROUND_UP(XSAVE_SIZE, PAGE_SIZE)); vcpu_args_set(vcpu, 3, amx_cfg, tiledata, xstate); + int iter = 0; for (;;) { vcpu_run(vcpu); TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); @@ -253,20 +264,9 @@ int main(int argc, char *argv[]) REPORT_GUEST_ASSERT(uc); /* NOT REACHED */ case UCALL_SYNC: - switch (uc.args[1]) { - case 1: - case 2: - case 3: - case 5: - case 6: - case 7: - case 8: - fprintf(stderr, "GUEST_SYNC(%ld)\n", uc.args[1]); - break; - case 4: - case 10: - fprintf(stderr, - "GUEST_SYNC(%ld), check save/restore status\n", uc.args[1]); + ++iter; + if (uc.args[1] & TEST_COMPARE_TILEDATA) { + fprintf(stderr, "GUEST_SYNC #%d, check TMM0 contents\n", iter); /* Compacted mode, get amx offset by xsave area * size subtract 8K amx size. @@ -279,11 +279,25 @@ int main(int argc, char *argv[]) ret = memcmp(amx_start, tiles_data, TILE_SIZE); TEST_ASSERT(ret == 0, "memcmp failed, ret=%d", ret); kvm_x86_state_cleanup(state); - break; - case 9: - fprintf(stderr, - "GUEST_SYNC(%ld), #NM exception and enable amx\n", uc.args[1]); - break; + } + if (uc.args[1] & TEST_SAVE_RESTORE) { + fprintf(stderr, "GUEST_SYNC #%d, save/restore VM state\n", iter); + state = vcpu_save_state(vcpu); + memset(®s1, 0, sizeof(regs1)); + vcpu_regs_get(vcpu, ®s1); + + kvm_vm_release(vm); + + /* Restore state in a new VM. */ + vcpu = vm_recreate_with_one_vcpu(vm); + vcpu_load_state(vcpu, state); + kvm_x86_state_cleanup(state); + + memset(®s2, 0, sizeof(regs2)); + vcpu_regs_get(vcpu, ®s2); + TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), + "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", + (ulong) regs2.rdi, (ulong) regs2.rsi); } break; case UCALL_DONE: @@ -293,22 +307,6 @@ int main(int argc, char *argv[]) TEST_FAIL("Unknown ucall %lu", uc.cmd); } - state = vcpu_save_state(vcpu); - memset(®s1, 0, sizeof(regs1)); - vcpu_regs_get(vcpu, ®s1); - - kvm_vm_release(vm); - - /* Restore state in a new VM. */ - vcpu = vm_recreate_with_one_vcpu(vm); - vcpu_load_state(vcpu, state); - kvm_x86_state_cleanup(state); - - memset(®s2, 0, sizeof(regs2)); - vcpu_regs_get(vcpu, ®s2); - TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), - "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", - (ulong) regs2.rdi, (ulong) regs2.rsi); } done: kvm_vm_free(vm); -- cgit v1.2.3 From 0383a8edef396cf0a6884b0be81d62bde60737b0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 31 Dec 2025 16:47:26 +0100 Subject: selftests: kvm: try getting XFD and XSAVE state out of sync The host is allowed to set FPU state that includes a disabled xstate component. Check that this does not cause bad effects. Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86/amx_test.c | 38 +++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 8 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/x86/amx_test.c b/tools/testing/selftests/kvm/x86/amx_test.c index 3de4402ac17d..bee56c1f7833 100644 --- a/tools/testing/selftests/kvm/x86/amx_test.c +++ b/tools/testing/selftests/kvm/x86/amx_test.c @@ -125,11 +125,17 @@ static void set_tilecfg(struct tile_config *cfg) } enum { + /* Retrieve TMM0 from guest, stash it for TEST_RESTORE_TILEDATA */ + TEST_SAVE_TILEDATA = 1, + /* Check TMM0 against tiledata */ - TEST_COMPARE_TILEDATA = 1, + TEST_COMPARE_TILEDATA = 2, + + /* Restore TMM0 from earlier save */ + TEST_RESTORE_TILEDATA = 4, /* Full VM save/restore */ - TEST_SAVE_RESTORE = 2, + TEST_SAVE_RESTORE = 8, }; static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg, @@ -150,7 +156,16 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg, GUEST_SYNC(TEST_SAVE_RESTORE); /* Check save/restore when trap to userspace */ __tileloadd(tiledata); - GUEST_SYNC(TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE); + GUEST_SYNC(TEST_SAVE_TILEDATA | TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE); + + /* xfd=0x40000, disable amx tiledata */ + wrmsr(MSR_IA32_XFD, XFEATURE_MASK_XTILE_DATA); + + /* host tries setting tiledata while guest XFD is set */ + GUEST_SYNC(TEST_RESTORE_TILEDATA); + GUEST_SYNC(TEST_SAVE_RESTORE); + + wrmsr(MSR_IA32_XFD, 0); __tilerelease(); GUEST_SYNC(TEST_SAVE_RESTORE); /* @@ -210,10 +225,10 @@ int main(int argc, char *argv[]) struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct kvm_x86_state *state; + struct kvm_x86_state *tile_state = NULL; int xsave_restore_size; vm_vaddr_t amx_cfg, tiledata, xstate; struct ucall uc; - u32 amx_offset; int ret; /* @@ -265,20 +280,27 @@ int main(int argc, char *argv[]) /* NOT REACHED */ case UCALL_SYNC: ++iter; + if (uc.args[1] & TEST_SAVE_TILEDATA) { + fprintf(stderr, "GUEST_SYNC #%d, save tiledata\n", iter); + tile_state = vcpu_save_state(vcpu); + } if (uc.args[1] & TEST_COMPARE_TILEDATA) { fprintf(stderr, "GUEST_SYNC #%d, check TMM0 contents\n", iter); /* Compacted mode, get amx offset by xsave area * size subtract 8K amx size. */ - amx_offset = xsave_restore_size - NUM_TILES*TILE_SIZE; - state = vcpu_save_state(vcpu); - void *amx_start = (void *)state->xsave + amx_offset; + u32 amx_offset = xsave_restore_size - NUM_TILES*TILE_SIZE; + void *amx_start = (void *)tile_state->xsave + amx_offset; void *tiles_data = (void *)addr_gva2hva(vm, tiledata); /* Only check TMM0 register, 1 tile */ ret = memcmp(amx_start, tiles_data, TILE_SIZE); TEST_ASSERT(ret == 0, "memcmp failed, ret=%d", ret); - kvm_x86_state_cleanup(state); + } + if (uc.args[1] & TEST_RESTORE_TILEDATA) { + fprintf(stderr, "GUEST_SYNC #%d, before KVM_SET_XSAVE\n", iter); + vcpu_xsave_set(vcpu, tile_state->xsave); + fprintf(stderr, "GUEST_SYNC #%d, after KVM_SET_XSAVE\n", iter); } if (uc.args[1] & TEST_SAVE_RESTORE) { fprintf(stderr, "GUEST_SYNC #%d, save/restore VM state\n", iter); -- cgit v1.2.3 From 3611ca7c12b740e250d83f8bbe3554b740c503b0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 29 Dec 2025 12:23:30 -0800 Subject: selftests: kvm: Verify TILELOADD actually #NM faults when XFD[18]=1 Rework the AMX test's #NM handling to use kvm_asm_safe() to verify an #NM actually occurs. As is, a completely missing #NM could go unnoticed. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86/amx_test.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/x86/amx_test.c b/tools/testing/selftests/kvm/x86/amx_test.c index bee56c1f7833..37b166260ee3 100644 --- a/tools/testing/selftests/kvm/x86/amx_test.c +++ b/tools/testing/selftests/kvm/x86/amx_test.c @@ -69,6 +69,12 @@ static inline void __tileloadd(void *tile) : : "a"(tile), "d"(0)); } +static inline int tileloadd_safe(void *tile) +{ + return kvm_asm_safe(".byte 0xc4,0xe2,0x7b,0x4b,0x04,0x10", + "a"(tile), "d"(0)); +} + static inline void __tilerelease(void) { asm volatile(".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0" ::); @@ -142,6 +148,8 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg, struct tile_data *tiledata, struct xstate *xstate) { + int vector; + GUEST_ASSERT(this_cpu_has(X86_FEATURE_XSAVE) && this_cpu_has(X86_FEATURE_OSXSAVE)); check_xtile_info(); @@ -195,17 +203,13 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg, GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA); set_tilecfg(amx_cfg); __ldtilecfg(amx_cfg); - /* Trigger #NM exception */ - __tileloadd(tiledata); - GUEST_SYNC(TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE); - GUEST_DONE(); -} + /* Trigger #NM exception */ + vector = tileloadd_safe(tiledata); + __GUEST_ASSERT(vector == NM_VECTOR, + "Wanted #NM on tileloadd with XFD[18]=1, got %s", + ex_str(vector)); -void guest_nm_handler(struct ex_regs *regs) -{ - /* Check if #NM is triggered by XFEATURE_MASK_XTILE_DATA */ - GUEST_SYNC(TEST_SAVE_RESTORE); GUEST_ASSERT(!(get_cr0() & X86_CR0_TS)); GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILE_DATA); GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA); @@ -217,6 +221,11 @@ void guest_nm_handler(struct ex_regs *regs) /* xfd=0, enable amx */ wrmsr(MSR_IA32_XFD, 0); GUEST_SYNC(TEST_SAVE_RESTORE); + + __tileloadd(tiledata); + GUEST_SYNC(TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE); + + GUEST_DONE(); } int main(int argc, char *argv[]) @@ -253,9 +262,6 @@ int main(int argc, char *argv[]) vcpu_regs_get(vcpu, ®s1); - /* Register #NM handler */ - vm_install_exception_handler(vm, NM_VECTOR, guest_nm_handler); - /* amx cfg for guest_code */ amx_cfg = vm_vaddr_alloc_page(vm); memset(addr_gva2hva(vm, amx_cfg), 0x0, getpagesize()); -- cgit v1.2.3 From c39a6a277e0e67ffff6a8efcbbf7e7e23ce9e38c Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 8 Jan 2026 12:44:19 +0100 Subject: vsock/test: add a final full barrier after run all tests If the last test fails, the other side still completes correctly, which could lead to false positives. Let's add a final barrier that ensures that the last test has finished correctly on both sides, but also that the two sides agree on the number of tests to be performed. Fixes: 2f65b44e199c ("VSOCK: add full barrier between test cases") Reviewed-by: Luigi Leonardi Signed-off-by: Stefano Garzarella Link: https://patch.msgid.link/20260108114419.52747-1-sgarzare@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/vsock/util.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c index d843643ced6b..9430ef5b8bc3 100644 --- a/tools/testing/vsock/util.c +++ b/tools/testing/vsock/util.c @@ -511,6 +511,18 @@ void run_tests(const struct test_case *test_cases, printf("ok\n"); } + + printf("All tests have been executed. Waiting other peer..."); + fflush(stdout); + + /* + * Final full barrier, to ensure that all tests have been run and + * that even the last one has been successful on both sides. + */ + control_writeln("COMPLETED"); + control_expectln("COMPLETED"); + + printf("ok\n"); } void list_tests(const struct test_case *test_cases) -- cgit v1.2.3 From 9086984ff52e703cd7ce47ae19f12d8d31914396 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Fri, 9 Jan 2026 13:08:51 +0200 Subject: selftests: drv-net: psp: Better control the used PSP dev The PSP responder fails when zero or multiple PSP devices are detected. There's an option to select the device id to use (-d) but it's currently not used from the PSP self test. It's also hard to use because the PSP test doesn't dump the PSP devices so can't choose one. When zero devices are detected, psp_responder fails which will cause the parent test to fail as well instead of skipping PSP tests. Fix both of these problems. Change psp_responder to: - not fail when no PSP devs are detected. - get an optional -i ifindex argument instead of -d. - select the correct PSP dev from the dump corresponding to ifindex or - select the first PSP dev when -i is not given. - fail when multiple devs are found and -i is not given. - warn and continue when the requested ifindex is not found. Also plumb the ifindex from the Python test. With these, when there are no PSP devs found or the wrong one is chosen, psp_responder opens the server socket, listens for control connections normally, and leaves the skipping of the various test cases which require a PSP device (~most, but not all of them) to the parent test. This results in output like: ok 1 psp.test_case # SKIP No PSP devices found [...] ok 12 psp.dev_get_device # SKIP No PSP devices found ok 13 psp.dev_get_device_bad ok 14 psp.dev_rotate # SKIP No PSP devices found [...] Signed-off-by: Cosmin Ratiu Reviewed-by: Carolina Jubran Link: https://patch.msgid.link/20260109110851.2952906-2-cratiu@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/lib/py/env.py | 1 + tools/testing/selftests/drivers/net/psp.py | 4 +- .../testing/selftests/drivers/net/psp_responder.c | 50 ++++++++++------------ 3 files changed, 26 insertions(+), 29 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index 8b644fd84ff2..63495376e654 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -170,6 +170,7 @@ class NetDrvEpEnv(NetDrvEnvBase): self.remote_ifname = self.resolve_remote_ifc() self.remote_dev = ip("-d link show dev " + self.remote_ifname, host=self.remote, json=True)[0] + self.remote_ifindex = self.remote_dev['ifindex'] self._required_cmd = {} diff --git a/tools/testing/selftests/drivers/net/psp.py b/tools/testing/selftests/drivers/net/psp.py index 52523bdad240..528a421ecf76 100755 --- a/tools/testing/selftests/drivers/net/psp.py +++ b/tools/testing/selftests/drivers/net/psp.py @@ -601,8 +601,8 @@ def main() -> None: cfg.comm_port = rand_port() srv = None try: - with bkg(responder + f" -p {cfg.comm_port}", host=cfg.remote, - exit_wait=True) as srv: + with bkg(responder + f" -p {cfg.comm_port} -i {cfg.remote_ifindex}", + host=cfg.remote, exit_wait=True) as srv: wait_port_listen(cfg.comm_port, host=cfg.remote) cfg.comm_sock = socket.create_connection((cfg.remote_addr, diff --git a/tools/testing/selftests/drivers/net/psp_responder.c b/tools/testing/selftests/drivers/net/psp_responder.c index f309e0d73cbf..a26e7628bbb1 100644 --- a/tools/testing/selftests/drivers/net/psp_responder.c +++ b/tools/testing/selftests/drivers/net/psp_responder.c @@ -22,7 +22,7 @@ static bool should_quit; struct opts { int port; - int devid; + int ifindex; bool verbose; }; @@ -360,7 +360,7 @@ static void usage(const char *name, const char *miss) if (miss) fprintf(stderr, "Missing argument: %s\n", miss); - fprintf(stderr, "Usage: %s -p port [-v] [-d psp-dev-id]\n", name); + fprintf(stderr, "Usage: %s -p port [-v] [-i ifindex]\n", name); exit(EXIT_FAILURE); } @@ -368,7 +368,7 @@ static void parse_cmd_opts(int argc, char **argv, struct opts *opts) { int opt; - while ((opt = getopt(argc, argv, "vp:d:")) != -1) { + while ((opt = getopt(argc, argv, "vp:i:")) != -1) { switch (opt) { case 'v': opts->verbose = 1; @@ -376,8 +376,8 @@ static void parse_cmd_opts(int argc, char **argv, struct opts *opts) case 'p': opts->port = atoi(optarg); break; - case 'd': - opts->devid = atoi(optarg); + case 'i': + opts->ifindex = atoi(optarg); break; default: usage(argv[0], NULL); @@ -410,12 +410,11 @@ static int psp_dev_set_ena(struct ynl_sock *ys, __u32 dev_id, __u32 versions) int main(int argc, char **argv) { struct psp_dev_get_list *dev_list; - bool devid_found = false; __u32 ver_ena, ver_cap; struct opts opts = {}; struct ynl_error yerr; struct ynl_sock *ys; - int first_id = 0; + int devid = -1; int ret; parse_cmd_opts(argc, argv, &opts); @@ -429,20 +428,19 @@ int main(int argc, char **argv) } dev_list = psp_dev_get_dump(ys); - if (ynl_dump_empty(dev_list)) { - if (ys->err.code) - goto err_close; - fprintf(stderr, "No PSP devices\n"); - goto err_close_silent; - } + if (ynl_dump_empty(dev_list) && ys->err.code) + goto err_close; ynl_dump_foreach(dev_list, d) { - if (opts.devid) { - devid_found = true; + if (opts.ifindex) { + if (d->ifindex != opts.ifindex) + continue; + devid = d->id; ver_ena = d->psp_versions_ena; ver_cap = d->psp_versions_cap; - } else if (!first_id) { - first_id = d->id; + break; + } else if (devid < 0) { + devid = d->id; ver_ena = d->psp_versions_ena; ver_cap = d->psp_versions_cap; } else { @@ -452,23 +450,21 @@ int main(int argc, char **argv) } psp_dev_get_list_free(dev_list); - if (opts.devid && !devid_found) { - fprintf(stderr, "PSP device %d requested on cmdline, not found\n", - opts.devid); - goto err_close_silent; - } else if (!opts.devid) { - opts.devid = first_id; - } + if (opts.ifindex && devid < 0) + fprintf(stderr, + "WARN: PSP device with ifindex %d requested on cmdline, not found\n", + opts.ifindex); - if (ver_ena != ver_cap) { - ret = psp_dev_set_ena(ys, opts.devid, ver_cap); + if (devid >= 0 && ver_ena != ver_cap) { + ret = psp_dev_set_ena(ys, devid, ver_cap); if (ret) goto err_close; } ret = run_responder(ys, &opts); - if (ver_ena != ver_cap && psp_dev_set_ena(ys, opts.devid, ver_ena)) + if (devid >= 0 && ver_ena != ver_cap && + psp_dev_set_ena(ys, devid, ver_ena)) fprintf(stderr, "WARN: failed to set the PSP versions back\n"); ynl_sock_destroy(ys); -- cgit v1.2.3 From de7c600e2d5b501c0c04bde8ebab89ac5888a69f Mon Sep 17 00:00:00 2001 From: David Wei Date: Thu, 8 Jan 2026 15:45:21 -0800 Subject: selftests/net: parametrise iou-zcrx.py with ksft_variants Use ksft_variants to parametrise tests in iou-zcrx.py to either use single queues or RSS contexts, reducing duplication. Signed-off-by: David Wei Link: https://patch.msgid.link/20260108234521.3619621-1-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/iou-zcrx.py | 162 ++++++++++----------- 1 file changed, 73 insertions(+), 89 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py index 712c806508b5..2c5acfb4f5dc 100755 --- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py @@ -3,132 +3,114 @@ import re from os import path -from lib.py import ksft_run, ksft_exit, KsftSkipEx +from lib.py import ksft_run, ksft_exit, KsftSkipEx, ksft_variants, KsftNamedVariant from lib.py import NetDrvEpEnv from lib.py import bkg, cmd, defer, ethtool, rand_port, wait_port_listen +from lib.py import EthtoolFamily -def _get_current_settings(cfg): - output = ethtool(f"-g {cfg.ifname}", json=True)[0] - return (output['rx'], output['hds-thresh']) - - -def _get_combined_channels(cfg): - output = ethtool(f"-l {cfg.ifname}").stdout - values = re.findall(r'Combined:\s+(\d+)', output) - return int(values[1]) - - -def _create_rss_ctx(cfg, chan): - output = ethtool(f"-X {cfg.ifname} context new start {chan} equal 1").stdout +def create_rss_ctx(cfg): + output = ethtool(f"-X {cfg.ifname} context new start {cfg.target} equal 1").stdout values = re.search(r'New RSS context is (\d+)', output).group(1) - ctx_id = int(values) - return (ctx_id, defer(ethtool, f"-X {cfg.ifname} delete context {ctx_id}")) + return int(values) -def _set_flow_rule(cfg, port, chan): - output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {port} action {chan}").stdout +def set_flow_rule(cfg): + output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} action {cfg.target}").stdout values = re.search(r'ID (\d+)', output).group(1) return int(values) -def _set_flow_rule_rss(cfg, port, ctx_id): - output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {port} context {ctx_id}").stdout +def set_flow_rule_rss(cfg, rss_ctx_id): + output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} context {rss_ctx_id}").stdout values = re.search(r'ID (\d+)', output).group(1) return int(values) -def test_zcrx(cfg) -> None: - cfg.require_ipver('6') - - combined_chans = _get_combined_channels(cfg) - if combined_chans < 2: - raise KsftSkipEx('at least 2 combined channels required') - (rx_ring, hds_thresh) = _get_current_settings(cfg) - port = rand_port() - - ethtool(f"-G {cfg.ifname} tcp-data-split on") - defer(ethtool, f"-G {cfg.ifname} tcp-data-split auto") +def single(cfg): + channels = cfg.ethnl.channels_get({'header': {'dev-index': cfg.ifindex}}) + channels = channels['combined-count'] + if channels < 2: + raise KsftSkipEx('Test requires NETIF with at least 2 combined channels') - ethtool(f"-G {cfg.ifname} hds-thresh 0") - defer(ethtool, f"-G {cfg.ifname} hds-thresh {hds_thresh}") + rings = cfg.ethnl.rings_get({'header': {'dev-index': cfg.ifindex}}) + rx_rings = rings['rx'] + hds_thresh = rings.get('hds-thresh', 0) - ethtool(f"-G {cfg.ifname} rx 64") - defer(ethtool, f"-G {cfg.ifname} rx {rx_ring}") + cfg.ethnl.rings_set({'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'enabled', + 'hds-thresh': 0, + 'rx': 64}) + defer(cfg.ethnl.rings_set, {'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'unknown', + 'hds-thresh': hds_thresh, + 'rx': rx_rings}) - ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}") + cfg.target = channels - 1 + ethtool(f"-X {cfg.ifname} equal {cfg.target}") defer(ethtool, f"-X {cfg.ifname} default") - flow_rule_id = _set_flow_rule(cfg, port, combined_chans - 1) + flow_rule_id = set_flow_rule(cfg) defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") - rx_cmd = f"{cfg.bin_local} -s -p {port} -i {cfg.ifname} -q {combined_chans - 1}" - tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {port} -l 12840" - with bkg(rx_cmd, exit_wait=True): - wait_port_listen(port, proto="tcp") - cmd(tx_cmd, host=cfg.remote) +def rss(cfg): + channels = cfg.ethnl.channels_get({'header': {'dev-index': cfg.ifindex}}) + channels = channels['combined-count'] + if channels < 2: + raise KsftSkipEx('Test requires NETIF with at least 2 combined channels') -def test_zcrx_oneshot(cfg) -> None: - cfg.require_ipver('6') + rings = cfg.ethnl.rings_get({'header': {'dev-index': cfg.ifindex}}) + rx_rings = rings['rx'] + hds_thresh = rings.get('hds-thresh', 0) - combined_chans = _get_combined_channels(cfg) - if combined_chans < 2: - raise KsftSkipEx('at least 2 combined channels required') - (rx_ring, hds_thresh) = _get_current_settings(cfg) - port = rand_port() + cfg.ethnl.rings_set({'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'enabled', + 'hds-thresh': 0, + 'rx': 64}) + defer(cfg.ethnl.rings_set, {'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'unknown', + 'hds-thresh': hds_thresh, + 'rx': rx_rings}) - ethtool(f"-G {cfg.ifname} tcp-data-split on") - defer(ethtool, f"-G {cfg.ifname} tcp-data-split auto") + cfg.target = channels - 1 + ethtool(f"-X {cfg.ifname} equal {cfg.target}") + defer(ethtool, f"-X {cfg.ifname} default") - ethtool(f"-G {cfg.ifname} hds-thresh 0") - defer(ethtool, f"-G {cfg.ifname} hds-thresh {hds_thresh}") + rss_ctx_id = create_rss_ctx(cfg) + defer(ethtool, f"-X {cfg.ifname} delete context {rss_ctx_id}") - ethtool(f"-G {cfg.ifname} rx 64") - defer(ethtool, f"-G {cfg.ifname} rx {rx_ring}") + flow_rule_id = set_flow_rule_rss(cfg, rss_ctx_id) + defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") - ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}") - defer(ethtool, f"-X {cfg.ifname} default") - flow_rule_id = _set_flow_rule(cfg, port, combined_chans - 1) - defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") +@ksft_variants([ + KsftNamedVariant("single", single), + KsftNamedVariant("rss", rss), +]) +def test_zcrx(cfg, setup) -> None: + cfg.require_ipver('6') - rx_cmd = f"{cfg.bin_local} -s -p {port} -i {cfg.ifname} -q {combined_chans - 1} -o 4" - tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {port} -l 4096 -z 16384" + setup(cfg) + rx_cmd = f"{cfg.bin_local} -s -p {cfg.port} -i {cfg.ifname} -q {cfg.target}" + tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {cfg.port} -l 12840" with bkg(rx_cmd, exit_wait=True): - wait_port_listen(port, proto="tcp") + wait_port_listen(cfg.port, proto="tcp") cmd(tx_cmd, host=cfg.remote) -def test_zcrx_rss(cfg) -> None: +@ksft_variants([ + KsftNamedVariant("single", single), + KsftNamedVariant("rss", rss), +]) +def test_zcrx_oneshot(cfg, setup) -> None: cfg.require_ipver('6') - combined_chans = _get_combined_channels(cfg) - if combined_chans < 2: - raise KsftSkipEx('at least 2 combined channels required') - (rx_ring, hds_thresh) = _get_current_settings(cfg) - port = rand_port() - - ethtool(f"-G {cfg.ifname} tcp-data-split on") - defer(ethtool, f"-G {cfg.ifname} tcp-data-split auto") - - ethtool(f"-G {cfg.ifname} hds-thresh 0") - defer(ethtool, f"-G {cfg.ifname} hds-thresh {hds_thresh}") - - ethtool(f"-G {cfg.ifname} rx 64") - defer(ethtool, f"-G {cfg.ifname} rx {rx_ring}") - - ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}") - defer(ethtool, f"-X {cfg.ifname} default") - - (ctx_id, delete_ctx) = _create_rss_ctx(cfg, combined_chans - 1) - flow_rule_id = _set_flow_rule_rss(cfg, port, ctx_id) - defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") - - rx_cmd = f"{cfg.bin_local} -s -p {port} -i {cfg.ifname} -q {combined_chans - 1}" - tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {port} -l 12840" + setup(cfg) + rx_cmd = f"{cfg.bin_local} -s -p {cfg.port} -i {cfg.ifname} -q {cfg.target} -o 4" + tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {cfg.port} -l 4096 -z 16384" with bkg(rx_cmd, exit_wait=True): - wait_port_listen(port, proto="tcp") + wait_port_listen(cfg.port, proto="tcp") cmd(tx_cmd, host=cfg.remote) @@ -137,7 +119,9 @@ def main() -> None: cfg.bin_local = path.abspath(path.dirname(__file__) + "/../../../drivers/net/hw/iou-zcrx") cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) - ksft_run(globs=globals(), case_pfx={"test_"}, args=(cfg, )) + cfg.ethnl = EthtoolFamily() + cfg.port = rand_port() + ksft_run(globs=globals(), cases=[test_zcrx, test_zcrx_oneshot], args=(cfg, )) ksft_exit() -- cgit v1.2.3 From 799a4912eea74c667da1c8167f93bf2d1508a89e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 8 Jan 2026 14:52:56 -0800 Subject: selftests: net: py: capitalize defer queue and improve import Import utils and refer to the global defer queue that way instead of importing the queue. This will make it possible to assign value to the global variable. While at it capitalize the name, to comply with the Python coding style. Reviewed-by: Petr Machata Link: https://patch.msgid.link/20260108225257.2684238-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/py/ksft.py | 8 ++++---- tools/testing/selftests/net/lib/py/utils.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py index 531e7fa1b3ea..248cd1a723a3 100644 --- a/tools/testing/selftests/net/lib/py/ksft.py +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -8,7 +8,7 @@ import time import traceback from collections import namedtuple from .consts import KSFT_MAIN_NAME -from .utils import global_defer_queue +from . import utils KSFT_RESULT = None KSFT_RESULT_ALL = True @@ -157,10 +157,10 @@ def ksft_flush_defer(): global KSFT_RESULT i = 0 - qlen_start = len(global_defer_queue) - while global_defer_queue: + qlen_start = len(utils.GLOBAL_DEFER_QUEUE) + while utils.GLOBAL_DEFER_QUEUE: i += 1 - entry = global_defer_queue.pop() + entry = utils.GLOBAL_DEFER_QUEUE.pop() try: entry.exec_only() except Exception: diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py index 106ee1f2df86..2dde34560d65 100644 --- a/tools/testing/selftests/net/lib/py/utils.py +++ b/tools/testing/selftests/net/lib/py/utils.py @@ -141,7 +141,7 @@ class bkg(cmd): return self.process(terminate=terminate, fail=self.check_fail) -global_defer_queue = [] +GLOBAL_DEFER_QUEUE = [] class defer: @@ -153,7 +153,7 @@ class defer: self.args = args self.kwargs = kwargs - self._queue = global_defer_queue + self._queue = GLOBAL_DEFER_QUEUE self._queue.append(self) def __enter__(self): -- cgit v1.2.3 From 7a1ff3545adeec5dc65c3063c2f084500d6f7014 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 8 Jan 2026 14:52:57 -0800 Subject: selftests: net: py: ensure defer() is only used within a test case I wasted a couple of hours recently after accidentally adding a defer() from within a function which itself was called as part of defer(). This leads to an infinite loop of defer(). Make sure this cannot happen and raise a helpful exception. I understand that the pair of _ksft_defer_arm() calls may not be the most Pythonic way to implement this, but it's easy enough to understand. Reviewed-by: Petr Machata Link: https://patch.msgid.link/20260108225257.2684238-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/py/ksft.py | 7 +++++++ tools/testing/selftests/net/lib/py/utils.py | 3 +++ 2 files changed, 10 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py index 248cd1a723a3..0a96f88bb60a 100644 --- a/tools/testing/selftests/net/lib/py/ksft.py +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -153,6 +153,11 @@ def ktap_result(ok, cnt=1, case_name="", comment=""): print(res, flush=True) +def _ksft_defer_arm(state): + """ Allow or disallow the use of defer() """ + utils.GLOBAL_DEFER_ARMED = state + + def ksft_flush_defer(): global KSFT_RESULT @@ -315,6 +320,7 @@ def ksft_run(cases=None, globs=None, case_pfx=None, args=()): comment = "" cnt_key = "" + _ksft_defer_arm(True) try: func(*args) except KsftSkipEx as e: @@ -332,6 +338,7 @@ def ksft_run(cases=None, globs=None, case_pfx=None, args=()): ksft_pr(f"Stopping tests due to {type(e).__name__}.") KSFT_RESULT = False cnt_key = 'fail' + _ksft_defer_arm(False) try: ksft_flush_defer() diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py index 2dde34560d65..824f039d384c 100644 --- a/tools/testing/selftests/net/lib/py/utils.py +++ b/tools/testing/selftests/net/lib/py/utils.py @@ -142,6 +142,7 @@ class bkg(cmd): GLOBAL_DEFER_QUEUE = [] +GLOBAL_DEFER_ARMED = False class defer: @@ -153,6 +154,8 @@ class defer: self.args = args self.kwargs = kwargs + if not GLOBAL_DEFER_ARMED: + raise Exception("defer queue not armed, did you use defer() outside of a test case?") self._queue = GLOBAL_DEFER_QUEUE self._queue.append(self) -- cgit v1.2.3 From 4203c6fb5e9d2e4fb9a48b421d92efd4429a4d55 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 6 Jan 2026 12:44:57 +0100 Subject: selftests/nolibc: try to read from stdin in readv_zero test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When stdout is redirected to a file this test fails. This happens when running through the kselftest runner since commit d9e6269e3303 ("selftests/run_kselftest.sh: exit with error if tests fail"). For consistency with other tests that read from a file descriptor, switch to stdin over stdout. The tests are still brittle against a redirected stdin, but at least they are now consistently so. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Link: https://patch.msgid.link/20260106-nolibc-selftests-v1-1-f82101c2c505@weissschuh.net --- tools/testing/selftests/nolibc/nolibc-test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 3986d55a6ff6..e83c1e7e2beb 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -1404,7 +1404,7 @@ int run_syscall(int min, int max) CASE_TEST(write_badf); EXPECT_SYSER(1, write(-1, &tmp, 1), -1, EBADF); break; CASE_TEST(write_zero); EXPECT_SYSZR(1, write(1, &tmp, 0)); break; CASE_TEST(readv_badf); EXPECT_SYSER(1, readv(-1, &iov_one, 1), -1, EBADF); break; - CASE_TEST(readv_zero); EXPECT_SYSZR(1, readv(1, NULL, 0)); break; + CASE_TEST(readv_zero); EXPECT_SYSZR(1, readv(0, NULL, 0)); break; CASE_TEST(writev_badf); EXPECT_SYSER(1, writev(-1, &iov_one, 1), -1, EBADF); break; CASE_TEST(writev_zero); EXPECT_SYSZR(1, writev(1, NULL, 0)); break; CASE_TEST(ptrace); EXPECT_SYSER(1, ptrace(PTRACE_CONT, getpid(), NULL, NULL), -1, ESRCH); break; -- cgit v1.2.3 From 20c72de1f8a9e338531579bd784371aba4b7dd2c Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 6 Jan 2026 12:44:58 +0100 Subject: selftests/nolibc: scope custom flags to the nolibc-test target MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A new target for 'libc-test' is going to be added which should not be affected by these options. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Link: https://patch.msgid.link/20260106-nolibc-selftests-v1-2-f82101c2c505@weissschuh.net --- tools/testing/selftests/nolibc/Makefile | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 40f5c2908dda..43f0b608c796 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -9,14 +9,10 @@ cc-option = $(call __cc-option, $(CC),,$(1),$(2)) include Makefile.include -CFLAGS = -nostdlib -nostdinc -static \ +$(OUTPUT)/nolibc-test: CFLAGS = -nostdlib -nostdinc -static \ -isystem $(top_srcdir)/tools/include/nolibc -isystem $(top_srcdir)/usr/include \ $(CFLAGS_NOLIBC_TEST) - -ifeq ($(LLVM),) -LDLIBS := -lgcc -endif - +$(OUTPUT)/nolibc-test: LDLIBS = $(if $(LLVM),,-lgcc) $(OUTPUT)/nolibc-test: nolibc-test.c nolibc-test-linkage.c | headers help: -- cgit v1.2.3 From 6fe8360b16acbfb50c703f52568cad46759be2ed Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 6 Jan 2026 12:44:59 +0100 Subject: selftests/nolibc: also test libc-test through regular selftest framework MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hook up libc-test to the regular selftest build to make sure nolibc-test.c stays compatible with a normal libc. As the pattern rule from lib.mk does not handle compiling a target from a differently named source file, add an explicit rule definition. Signed-off-by: Thomas Weißschuh Acked-by: Willy Tarreau Link: https://patch.msgid.link/20260106-nolibc-selftests-v1-3-f82101c2c505@weissschuh.net --- tools/testing/selftests/nolibc/Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 43f0b608c796..0370489d938b 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -TEST_GEN_PROGS := nolibc-test +TEST_GEN_PROGS := nolibc-test libc-test include ../lib.mk include $(top_srcdir)/scripts/Makefile.compiler @@ -15,6 +15,10 @@ $(OUTPUT)/nolibc-test: CFLAGS = -nostdlib -nostdinc -static \ $(OUTPUT)/nolibc-test: LDLIBS = $(if $(LLVM),,-lgcc) $(OUTPUT)/nolibc-test: nolibc-test.c nolibc-test-linkage.c | headers +$(OUTPUT)/libc-test: nolibc-test.c nolibc-test-linkage.c + $(call msg,CC,,$@) + $(Q)$(LINK.c) $^ -o $@ + help: @echo "For the custom nolibc testsuite use '$(MAKE) -f Makefile.nolibc'; available targets:" @$(MAKE) -f Makefile.nolibc help -- cgit v1.2.3 From a5f00be9b3b07d92c6689997403851a32e1874cc Mon Sep 17 00:00:00 2001 From: Daniel Palmer Date: Mon, 5 Jan 2026 11:36:29 +0900 Subject: tools/nolibc: Add a simple test for writing to a FILE and reading it back MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a test that exercises create->write->seek->read to check that using the stream functions (fwrite() etc) is not totally broken. The only edge cases this is testing for are: - Reading the file after writing but without rewinding reads nothing. - Trying to read more items than the file contains returns the count of fully read items. Signed-off-by: Daniel Palmer Link: https://patch.msgid.link/20260105023629.1502801-4-daniel@thingy.jp Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/nolibc-test.c | 53 ++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index e83c1e7e2beb..1b9d3b2e2491 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -878,6 +878,58 @@ int test_file_stream(void) return 0; } +int test_file_stream_wsr(void) +{ + const char dataout[] = "foo"; + const size_t datasz = sizeof(dataout); + char datain[datasz]; + int fd, r; + FILE *f; + + fd = open("/tmp", O_TMPFILE | O_RDWR, 0644); + if (fd == -1) + return -1; + + f = fdopen(fd, "w+"); + if (!f) + return -1; + + errno = 0; + r = fwrite(dataout, 1, datasz, f); + if (r != datasz) + return -1; + + /* Attempt to read from the file without rewinding, + * we should read 0 items. + */ + r = fread(datain, 1, datasz, f); + if (r) + return -1; + + /* Rewind the file to the start */ + r = fseek(f, 0, SEEK_SET); + if (r) + return -1; + + /* Attempt to read back more than was written to + * make sure we handle short reads properly. + * fread() should return the number of complete items. + */ + r = fread(datain, 1, datasz + 1, f); + if (r != datasz) + return -1; + + /* Data we read should match the data we just wrote */ + if (memcmp(datain, dataout, datasz) != 0) + return -1; + + r = fclose(f); + if (r) + return -1; + + return 0; +} + enum fork_type { FORK_STANDARD, FORK_VFORK, @@ -1352,6 +1404,7 @@ int run_syscall(int min, int max) CASE_TEST(fchdir_stdin); EXPECT_SYSER(1, fchdir(STDIN_FILENO), -1, ENOTDIR); break; CASE_TEST(fchdir_badfd); EXPECT_SYSER(1, fchdir(-1), -1, EBADF); break; CASE_TEST(file_stream); EXPECT_SYSZR(1, test_file_stream()); break; + CASE_TEST(file_stream_wsr); EXPECT_SYSZR(1, test_file_stream_wsr()); break; CASE_TEST(fork); EXPECT_SYSZR(1, test_fork(FORK_STANDARD)); break; CASE_TEST(getdents64_root); EXPECT_SYSNE(1, test_getdents64("/"), -1); break; CASE_TEST(getdents64_null); EXPECT_SYSER(1, test_getdents64("/dev/null"), -1, ENOTDIR); break; -- cgit v1.2.3 From c1d7c0f9cdf6690eff4518f1c17a37d5ee647cd1 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:40 -0700 Subject: selftests: ublk: display UBLK_F_INTEGRITY support Add support for printing the UBLK_F_INTEGRITY feature flag in the human-readable kublk features output. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 185ba553686a..261095f19c93 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -1454,6 +1454,7 @@ static int cmd_dev_get_features(void) FEAT_NAME(UBLK_F_QUIESCE), FEAT_NAME(UBLK_F_PER_IO_DAEMON), FEAT_NAME(UBLK_F_BUF_REG_OFF_DAEMON), + FEAT_NAME(UBLK_F_INTEGRITY), }; struct ublk_dev *dev; __u64 features = 0; -- cgit v1.2.3 From 261b67f4e34716e793b0b95d2722b2fe780ed5f4 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:41 -0700 Subject: selftests: ublk: add utility to get block device metadata size Some block device integrity parameters are available in sysfs, but others are only accessible using the FS_IOC_GETLBMD_CAP ioctl. Add a metadata_size utility program to print out the logical block metadata size, PI offset, and PI size within the metadata. Example output: $ metadata_size /dev/ublkb0 metadata_size: 64 pi_offset: 56 pi_tuple_size: 8 Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 5 ++-- tools/testing/selftests/ublk/metadata_size.c | 36 ++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/ublk/metadata_size.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 06ba6fde098d..351ac6438561 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -49,12 +49,13 @@ TEST_PROGS += test_stress_05.sh TEST_PROGS += test_stress_06.sh TEST_PROGS += test_stress_07.sh -TEST_GEN_PROGS_EXTENDED = kublk +TEST_GEN_PROGS_EXTENDED = kublk metadata_size +STANDALONE_UTILS := metadata_size.c LOCAL_HDRS += $(wildcard *.h) include ../lib.mk -$(TEST_GEN_PROGS_EXTENDED): $(wildcard *.c) +$(OUTPUT)/kublk: $(filter-out $(STANDALONE_UTILS),$(wildcard *.c)) check: shellcheck -x -f gcc *.sh diff --git a/tools/testing/selftests/ublk/metadata_size.c b/tools/testing/selftests/ublk/metadata_size.c new file mode 100644 index 000000000000..76ecddf04d25 --- /dev/null +++ b/tools/testing/selftests/ublk/metadata_size.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +int main(int argc, char **argv) +{ + struct logical_block_metadata_cap cap = {}; + const char *filename; + int fd; + int result; + + if (argc != 2) { + fprintf(stderr, "Usage: %s BLOCK_DEVICE\n", argv[0]); + return 1; + } + + filename = argv[1]; + fd = open(filename, O_RDONLY); + if (fd < 0) { + perror(filename); + return 1; + } + + result = ioctl(fd, FS_IOC_GETLBMD_CAP, &cap); + if (result < 0) { + perror("ioctl"); + return 1; + } + + printf("metadata_size: %u\n", cap.lbmd_size); + printf("pi_offset: %u\n", cap.lbmd_pi_offset); + printf("pi_tuple_size: %u\n", cap.lbmd_pi_size); + return 0; +} -- cgit v1.2.3 From 6ed6476c4aefa9ee3ba90f39bcc002dd034f6e03 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:42 -0700 Subject: selftests: ublk: add kublk support for integrity params Add integrity param command line arguments to kublk. Plumb these to struct ublk_params for the null and fault_inject targets, as they don't need to actually read or write the integrity data. Forbid the integrity params for loop or stripe until the integrity data copy is implemented. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/fault_inject.c | 1 + tools/testing/selftests/ublk/file_backed.c | 4 +++ tools/testing/selftests/ublk/kublk.c | 47 +++++++++++++++++++++++++++++ tools/testing/selftests/ublk/kublk.h | 21 +++++++++++++ tools/testing/selftests/ublk/null.c | 1 + tools/testing/selftests/ublk/stripe.c | 4 +++ 6 files changed, 78 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/fault_inject.c b/tools/testing/selftests/ublk/fault_inject.c index b227bd78b252..3b897f69c014 100644 --- a/tools/testing/selftests/ublk/fault_inject.c +++ b/tools/testing/selftests/ublk/fault_inject.c @@ -33,6 +33,7 @@ static int ublk_fault_inject_tgt_init(const struct dev_ctx *ctx, .dev_sectors = dev_size >> 9, }, }; + ublk_set_integrity_params(ctx, &dev->tgt.params); dev->private_data = (void *)(unsigned long)(ctx->fault_inject.delay_us * 1000); return 0; diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index 269d5f124e06..c14ce6608696 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -158,6 +158,10 @@ static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) ublk_err("%s: not support auto_zc_fallback\n", __func__); return -EINVAL; } + if (ctx->metadata_size) { + ublk_err("%s: integrity not supported\n", __func__); + return -EINVAL; + } ret = backing_file_tgt_init(dev); if (ret) diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 261095f19c93..48e1865b4875 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -3,6 +3,7 @@ * Description: uring_cmd based ublk */ +#include #include "kublk.h" #define MAX_NR_TGT_ARG 64 @@ -1550,6 +1551,8 @@ static void __cmd_create_help(char *exe, bool recovery) printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1] [-g] [-u]\n"); printf("\t[-e 0|1 ] [-i 0|1] [--no_ublk_fixed_fd]\n"); printf("\t[--nthreads threads] [--per_io_tasks]\n"); + printf("\t[--integrity_capable] [--integrity_reftag] [--metadata_size SIZE] " + "[--pi_offset OFFSET] [--csum_type ip|t10dif|nvme] [--tag_size SIZE]\n"); printf("\t[target options] [backfile1] [backfile2] ...\n"); printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n"); printf("\tdefault: nthreads=nr_queues"); @@ -1613,6 +1616,12 @@ int main(int argc, char *argv[]) { "nthreads", 1, NULL, 0 }, { "per_io_tasks", 0, NULL, 0 }, { "no_ublk_fixed_fd", 0, NULL, 0 }, + { "integrity_capable", 0, NULL, 0 }, + { "integrity_reftag", 0, NULL, 0 }, + { "metadata_size", 1, NULL, 0 }, + { "pi_offset", 1, NULL, 0 }, + { "csum_type", 1, NULL, 0 }, + { "tag_size", 1, NULL, 0 }, { 0, 0, 0, 0 } }; const struct ublk_tgt_ops *ops = NULL; @@ -1623,6 +1632,7 @@ int main(int argc, char *argv[]) .nr_hw_queues = 2, .dev_id = -1, .tgt_type = "unknown", + .csum_type = LBMD_PI_CSUM_NONE, }; int ret = -EINVAL, i; int tgt_argc = 1; @@ -1697,6 +1707,28 @@ int main(int argc, char *argv[]) ctx.per_io_tasks = 1; if (!strcmp(longopts[option_idx].name, "no_ublk_fixed_fd")) ctx.no_ublk_fixed_fd = 1; + if (!strcmp(longopts[option_idx].name, "integrity_capable")) + ctx.integrity_flags |= LBMD_PI_CAP_INTEGRITY; + if (!strcmp(longopts[option_idx].name, "integrity_reftag")) + ctx.integrity_flags |= LBMD_PI_CAP_REFTAG; + if (!strcmp(longopts[option_idx].name, "metadata_size")) + ctx.metadata_size = strtoul(optarg, NULL, 0); + if (!strcmp(longopts[option_idx].name, "pi_offset")) + ctx.pi_offset = strtoul(optarg, NULL, 0); + if (!strcmp(longopts[option_idx].name, "csum_type")) { + if (!strcmp(optarg, "ip")) { + ctx.csum_type = LBMD_PI_CSUM_IP; + } else if (!strcmp(optarg, "t10dif")) { + ctx.csum_type = LBMD_PI_CSUM_CRC16_T10DIF; + } else if (!strcmp(optarg, "nvme")) { + ctx.csum_type = LBMD_PI_CSUM_CRC64_NVME; + } else { + ublk_err("invalid csum_type: %s\n", optarg); + return -EINVAL; + } + } + if (!strcmp(longopts[option_idx].name, "tag_size")) + ctx.tag_size = strtoul(optarg, NULL, 0); break; case '?': /* @@ -1739,6 +1771,21 @@ int main(int argc, char *argv[]) return -EINVAL; } + if (ctx.metadata_size) { + if (!(ctx.flags & UBLK_F_USER_COPY)) { + ublk_err("integrity requires user_copy\n"); + return -EINVAL; + } + + ctx.flags |= UBLK_F_INTEGRITY; + } else if (ctx.integrity_flags || + ctx.pi_offset || + ctx.csum_type != LBMD_PI_CSUM_NONE || + ctx.tag_size) { + ublk_err("integrity parameters require metadata_size\n"); + return -EINVAL; + } + i = optind; while (i < argc && ctx.nr_files < MAX_BACK_FILES) { ctx.files[ctx.nr_files++] = argv[i++]; diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 8a83b90ec603..d00f2b465cdf 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -78,6 +78,11 @@ struct dev_ctx { unsigned int auto_zc_fallback:1; unsigned int per_io_tasks:1; unsigned int no_ublk_fixed_fd:1; + __u32 integrity_flags; + __u8 metadata_size; + __u8 pi_offset; + __u8 csum_type; + __u8 tag_size; int _evtfd; int _shmid; @@ -202,6 +207,22 @@ struct ublk_dev { extern int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io); +static inline void ublk_set_integrity_params(const struct dev_ctx *ctx, + struct ublk_params *params) +{ + if (!ctx->metadata_size) + return; + + params->types |= UBLK_PARAM_TYPE_INTEGRITY; + params->integrity = (struct ublk_param_integrity) { + .flags = ctx->integrity_flags, + .interval_exp = params->basic.logical_bs_shift, + .metadata_size = ctx->metadata_size, + .pi_offset = ctx->pi_offset, + .csum_type = ctx->csum_type, + .tag_size = ctx->tag_size, + }; +} static inline int ublk_io_auto_zc_fallback(const struct ublksrv_io_desc *iod) { diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c index 280043f6b689..3aa162f08476 100644 --- a/tools/testing/selftests/ublk/null.c +++ b/tools/testing/selftests/ublk/null.c @@ -36,6 +36,7 @@ static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) .max_segments = 32, }, }; + ublk_set_integrity_params(ctx, &dev->tgt.params); if (info->flags & UBLK_F_SUPPORT_ZERO_COPY) dev->tgt.sq_depth = dev->tgt.cq_depth = 2 * info->queue_depth; diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c index fd412e1f01c0..d4aaf3351d71 100644 --- a/tools/testing/selftests/ublk/stripe.c +++ b/tools/testing/selftests/ublk/stripe.c @@ -298,6 +298,10 @@ static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) ublk_err("%s: not support auto_zc_fallback\n", __func__); return -EINVAL; } + if (ctx->metadata_size) { + ublk_err("%s: integrity not supported\n", __func__); + return -EINVAL; + } if ((chunk_size & (chunk_size - 1)) || !chunk_size) { ublk_err("invalid chunk size %u\n", chunk_size); -- cgit v1.2.3 From 24f8a44b797f03dfadb455138930523599d3c22a Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:43 -0700 Subject: selftests: ublk: implement integrity user copy in kublk If integrity data is enabled for kublk, allocate an integrity buffer for each I/O. Extend ublk_user_copy() to copy the integrity data between the ublk request and the integrity buffer if the ublksrv_io_desc indicates that the request has integrity data. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 41 +++++++++++++++++++++++++++++++----- tools/testing/selftests/ublk/kublk.h | 14 ++++++++++++ 2 files changed, 50 insertions(+), 5 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 48e1865b4875..d95937dd6167 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -416,8 +416,10 @@ static void ublk_queue_deinit(struct ublk_queue *q) if (q->io_cmd_buf) munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q)); - for (i = 0; i < nr_ios; i++) + for (i = 0; i < nr_ios; i++) { free(q->ios[i].buf_addr); + free(q->ios[i].integrity_buf); + } } static void ublk_thread_deinit(struct ublk_thread *t) @@ -433,12 +435,13 @@ static void ublk_thread_deinit(struct ublk_thread *t) } } -static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags) +static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags, + __u8 metadata_size) { struct ublk_dev *dev = q->dev; int depth = dev->dev_info.queue_depth; int i; - int cmd_buf_size, io_buf_size; + int cmd_buf_size, io_buf_size, integrity_size; unsigned long off; q->tgt_ops = dev->tgt.ops; @@ -446,6 +449,7 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags) q->q_depth = depth; q->flags = dev->dev_info.flags; q->flags |= extra_flags; + q->metadata_size = metadata_size; /* Cache fd in queue for fast path access */ q->ublk_fd = dev->fds[0]; @@ -461,11 +465,23 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags) } io_buf_size = dev->dev_info.max_io_buf_bytes; + integrity_size = ublk_integrity_len(q, io_buf_size); for (i = 0; i < q->q_depth; i++) { q->ios[i].buf_addr = NULL; q->ios[i].flags = UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_FREE; q->ios[i].tag = i; + if (integrity_size) { + q->ios[i].integrity_buf = malloc(integrity_size); + if (!q->ios[i].integrity_buf) { + ublk_err("ublk dev %d queue %d io %d malloc(%d) failed: %m\n", + dev->dev_info.dev_id, q->q_id, i, + integrity_size); + goto fail; + } + } + + if (ublk_queue_no_buf(q)) continue; @@ -608,13 +624,13 @@ static void ublk_user_copy(const struct ublk_io *io, __u8 match_ublk_op) __u8 ublk_op = ublksrv_get_op(iod); __u32 len = iod->nr_sectors << 9; void *addr = io->buf_addr; + ssize_t copied; if (ublk_op != match_ublk_op) return; while (len) { __u32 copy_len = min(len, UBLK_USER_COPY_LEN); - ssize_t copied; if (ublk_op == UBLK_IO_OP_WRITE) copied = pread(q->ublk_fd, addr, copy_len, off); @@ -627,6 +643,20 @@ static void ublk_user_copy(const struct ublk_io *io, __u8 match_ublk_op) off += copy_len; len -= copy_len; } + + if (!(iod->op_flags & UBLK_IO_F_INTEGRITY)) + return; + + len = ublk_integrity_len(q, iod->nr_sectors << 9); + off = ublk_user_copy_offset(q->q_id, io->tag); + off |= UBLKSRV_IO_INTEGRITY_FLAG; + if (ublk_op == UBLK_IO_OP_WRITE) + copied = pread(q->ublk_fd, io->integrity_buf, len, off); + else if (ublk_op == UBLK_IO_OP_READ) + copied = pwrite(q->ublk_fd, io->integrity_buf, len, off); + else + assert(0); + assert(copied == (ssize_t)len); } int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io) @@ -1013,7 +1043,8 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) dev->q[i].dev = dev; dev->q[i].q_id = i; - ret = ublk_queue_init(&dev->q[i], extra_flags); + ret = ublk_queue_init(&dev->q[i], extra_flags, + ctx->metadata_size); if (ret) { ublk_err("ublk dev %d queue %d init queue failed\n", dinfo->dev_id, i); diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index d00f2b465cdf..830b49a7716a 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -112,6 +112,7 @@ struct ublk_ctrl_cmd_data { struct ublk_io { char *buf_addr; + void *integrity_buf; #define UBLKS_IO_NEED_FETCH_RQ (1UL << 0) #define UBLKS_IO_NEED_COMMIT_RQ_COMP (1UL << 1) @@ -175,6 +176,7 @@ struct ublk_queue { #define UBLKS_Q_NO_UBLK_FIXED_FD (1ULL << 62) __u64 flags; int ublk_fd; /* cached ublk char device fd */ + __u8 metadata_size; struct ublk_io ios[UBLK_QUEUE_DEPTH]; }; @@ -224,6 +226,18 @@ static inline void ublk_set_integrity_params(const struct dev_ctx *ctx, }; } +static inline size_t ublk_integrity_len(const struct ublk_queue *q, size_t len) +{ + /* All targets currently use interval_exp = logical_bs_shift = 9 */ + return (len >> 9) * q->metadata_size; +} + +static inline size_t +ublk_integrity_data_len(const struct ublk_queue *q, size_t integrity_len) +{ + return (integrity_len / q->metadata_size) << 9; +} + static inline int ublk_io_auto_zc_fallback(const struct ublksrv_io_desc *iod) { return !!(iod->op_flags & UBLK_IO_F_NEED_REG_BUF); -- cgit v1.2.3 From a1805442674b85ff9d626965f828e4fd71a82b28 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:44 -0700 Subject: selftests: ublk: support non-O_DIRECT backing files A subsequent commit will add support for using a backing file to store integrity data. Since integrity data is accessed in intervals of metadata_size, which may be much smaller than a logical block on the backing device, direct I/O cannot be used. Add an argument to backing_file_tgt_init() to specify the number of files to open for direct I/O. The remaining files will use buffered I/O. For now, continue to request direct I/O for all the files. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/common.c | 4 ++-- tools/testing/selftests/ublk/file_backed.c | 2 +- tools/testing/selftests/ublk/kublk.h | 2 +- tools/testing/selftests/ublk/stripe.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/common.c b/tools/testing/selftests/ublk/common.c index 01580a6f8519..d9873d4d50d0 100644 --- a/tools/testing/selftests/ublk/common.c +++ b/tools/testing/selftests/ublk/common.c @@ -12,7 +12,7 @@ void backing_file_tgt_deinit(struct ublk_dev *dev) } } -int backing_file_tgt_init(struct ublk_dev *dev) +int backing_file_tgt_init(struct ublk_dev *dev, unsigned int nr_direct) { int fd, i; @@ -25,7 +25,7 @@ int backing_file_tgt_init(struct ublk_dev *dev) ublk_dbg(UBLK_DBG_DEV, "%s: file %d: %s\n", __func__, i, file); - fd = open(file, O_RDWR | O_DIRECT); + fd = open(file, O_RDWR | (i < nr_direct ? O_DIRECT : 0)); if (fd < 0) { ublk_err("%s: backing file %s can't be opened: %s\n", __func__, file, strerror(errno)); diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index c14ce6608696..db4c176a4f28 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -163,7 +163,7 @@ static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) return -EINVAL; } - ret = backing_file_tgt_init(dev); + ret = backing_file_tgt_init(dev, 1); if (ret) return ret; diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 830b49a7716a..96c66b337bc0 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -462,6 +462,6 @@ extern const struct ublk_tgt_ops stripe_tgt_ops; extern const struct ublk_tgt_ops fault_inject_tgt_ops; void backing_file_tgt_deinit(struct ublk_dev *dev); -int backing_file_tgt_init(struct ublk_dev *dev); +int backing_file_tgt_init(struct ublk_dev *dev, unsigned int nr_direct); #endif diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c index d4aaf3351d71..2be1c36438e7 100644 --- a/tools/testing/selftests/ublk/stripe.c +++ b/tools/testing/selftests/ublk/stripe.c @@ -315,7 +315,7 @@ static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) chunk_shift = ilog2(chunk_size); - ret = backing_file_tgt_init(dev); + ret = backing_file_tgt_init(dev, dev->tgt.nr_backing_files); if (ret) return ret; -- cgit v1.2.3 From f48250dc5ba8368ccb587093eb20d1c7baecaacf Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:45 -0700 Subject: selftests: ublk: add integrity data support to loop target To perform and end-to-end test of integrity information through a ublk device, we need to actually store it somewhere and retrieve it. Add this support to kublk's loop target. It uses a second backing file for the integrity data corresponding to the data stored in the first file. The integrity file is initialized with byte 0xFF, which ensures the app and reference tags are set to the "escape" pattern to disable the bio-integrity-auto guard and reftag checks until the blocks are written. The integrity file is opened without O_DIRECT since it will be accessed at sub-block granularity. Each incoming read/write results in a pair of reads/writes, one to the data file, and one to the integrity file. If either backing I/O fails, the error is propagated to the ublk request. If both backing I/Os read/write some bytes, the ublk request is completed with the smaller of the number of blocks accessed by each I/O. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/file_backed.c | 92 ++++++++++++++++++++++++------ 1 file changed, 74 insertions(+), 18 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index db4c176a4f28..c3ce5ff72422 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -35,9 +35,23 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, unsigned auto_zc = ublk_queue_use_auto_zc(q); enum io_uring_op op = ublk_to_uring_op(iod, zc | auto_zc); struct ublk_io *io = ublk_get_io(q, tag); + __u64 offset = iod->start_sector << 9; + __u32 len = iod->nr_sectors << 9; struct io_uring_sqe *sqe[3]; void *addr = io->buf_addr; + if (iod->op_flags & UBLK_IO_F_INTEGRITY) { + ublk_io_alloc_sqes(t, sqe, 1); + /* Use second backing file for integrity data */ + io_uring_prep_rw(op, sqe[0], ublk_get_registered_fd(q, 2), + io->integrity_buf, + ublk_integrity_len(q, len), + ublk_integrity_len(q, offset)); + sqe[0]->flags = IOSQE_FIXED_FILE; + /* tgt_data = 1 indicates integrity I/O */ + sqe[0]->user_data = build_user_data(tag, ublk_op, 1, q->q_id, 1); + } + if (!zc || auto_zc) { ublk_io_alloc_sqes(t, sqe, 1); if (!sqe[0]) @@ -45,14 +59,14 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, io_uring_prep_rw(op, sqe[0], ublk_get_registered_fd(q, 1) /*fds[1]*/, addr, - iod->nr_sectors << 9, - iod->start_sector << 9); + len, + offset); if (auto_zc) sqe[0]->buf_index = tag; io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE); /* bit63 marks us as tgt io */ sqe[0]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1); - return 1; + return !!(iod->op_flags & UBLK_IO_F_INTEGRITY) + 1; } ublk_io_alloc_sqes(t, sqe, 3); @@ -63,8 +77,8 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); io_uring_prep_rw(op, sqe[1], ublk_get_registered_fd(q, 1) /*fds[1]*/, 0, - iod->nr_sectors << 9, - iod->start_sector << 9); + len, + offset); sqe[1]->buf_index = tag; sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK; sqe[1]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1); @@ -72,7 +86,7 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, io->buf_index); sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1); - return 2; + return !!(iod->op_flags & UBLK_IO_F_INTEGRITY) + 2; } static int loop_queue_tgt_io(struct ublk_thread *t, struct ublk_queue *q, int tag) @@ -119,12 +133,17 @@ static void ublk_loop_io_done(struct ublk_thread *t, struct ublk_queue *q, unsigned op = user_data_to_op(cqe->user_data); struct ublk_io *io = ublk_get_io(q, tag); - if (cqe->res < 0 || op != ublk_cmd_op_nr(UBLK_U_IO_UNREGISTER_IO_BUF)) { - if (!io->result) - io->result = cqe->res; - if (cqe->res < 0) - ublk_err("%s: io failed op %x user_data %lx\n", - __func__, op, cqe->user_data); + if (cqe->res < 0) { + io->result = cqe->res; + ublk_err("%s: io failed op %x user_data %lx\n", + __func__, op, cqe->user_data); + } else if (op != ublk_cmd_op_nr(UBLK_U_IO_UNREGISTER_IO_BUF)) { + __s32 data_len = user_data_to_tgt_data(cqe->user_data) + ? ublk_integrity_data_len(q, cqe->res) + : cqe->res; + + if (!io->result || data_len < io->result) + io->result = data_len; } /* buffer register op is IOSQE_CQE_SKIP_SUCCESS */ @@ -135,9 +154,30 @@ static void ublk_loop_io_done(struct ublk_thread *t, struct ublk_queue *q, ublk_complete_io(t, q, tag, io->result); } +static int ublk_loop_memset_file(int fd, __u8 byte, size_t len) +{ + off_t offset = 0; + __u8 buf[4096]; + + memset(buf, byte, sizeof(buf)); + while (len) { + int ret = pwrite(fd, buf, min(len, sizeof(buf)), offset); + + if (ret < 0) + return -errno; + if (!ret) + return -EIO; + + len -= ret; + offset += ret; + } + return 0; +} + static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) { unsigned long long bytes; + unsigned long blocks; int ret; struct ublk_params p = { .types = UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DMA_ALIGN, @@ -154,23 +194,39 @@ static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) }, }; + ublk_set_integrity_params(ctx, &p); if (ctx->auto_zc_fallback) { ublk_err("%s: not support auto_zc_fallback\n", __func__); return -EINVAL; } - if (ctx->metadata_size) { - ublk_err("%s: integrity not supported\n", __func__); - return -EINVAL; - } + /* Use O_DIRECT only for data file */ ret = backing_file_tgt_init(dev, 1); if (ret) return ret; - if (dev->tgt.nr_backing_files != 1) + /* Expect a second file for integrity data */ + if (dev->tgt.nr_backing_files != 1 + !!ctx->metadata_size) return -EINVAL; - bytes = dev->tgt.backing_file_size[0]; + blocks = dev->tgt.backing_file_size[0] >> p.basic.logical_bs_shift; + if (ctx->metadata_size) { + unsigned long metadata_blocks = + dev->tgt.backing_file_size[1] / ctx->metadata_size; + unsigned long integrity_len; + + /* Ensure both data and integrity data fit in backing files */ + blocks = min(blocks, metadata_blocks); + integrity_len = blocks * ctx->metadata_size; + /* + * Initialize PI app tag and ref tag to 0xFF + * to disable bio-integrity-auto checks + */ + ret = ublk_loop_memset_file(dev->fds[2], 0xFF, integrity_len); + if (ret) + return ret; + } + bytes = blocks << p.basic.logical_bs_shift; dev->tgt.dev_size = bytes; p.basic.dev_sectors = bytes >> 9; dev->tgt.params = p; -- cgit v1.2.3 From 9e9f635525b12f055558a7cfe2e54d109839d030 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:46 -0700 Subject: selftests: ublk: add integrity params test Add test case null_04 to exercise all the different integrity params. It creates 4 different ublk devices with different combinations of integrity arguments and verifies their integrity limits via sysfs and the metadata_size utility. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 1 + tools/testing/selftests/ublk/test_common.sh | 10 ++ tools/testing/selftests/ublk/test_null_04.sh | 166 +++++++++++++++++++++++++++ 3 files changed, 177 insertions(+) create mode 100755 tools/testing/selftests/ublk/test_null_04.sh (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 351ac6438561..239ad1c741ef 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -27,6 +27,7 @@ TEST_PROGS += test_generic_15.sh TEST_PROGS += test_null_01.sh TEST_PROGS += test_null_02.sh TEST_PROGS += test_null_03.sh +TEST_PROGS += test_null_04.sh TEST_PROGS += test_loop_01.sh TEST_PROGS += test_loop_02.sh TEST_PROGS += test_loop_03.sh diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index ea9a5f3eb70a..7ff6ce79d62c 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -384,6 +384,16 @@ _ublk_test_top_dir() cd "$(dirname "$0")" && pwd } +METADATA_SIZE_PROG="$(_ublk_test_top_dir)/metadata_size" + +_get_metadata_size() +{ + local dev_id=$1 + local field=$2 + + "$METADATA_SIZE_PROG" "/dev/ublkb$dev_id" | grep "$field" | grep -o "[0-9]*" +} + UBLK_PROG=$(_ublk_test_top_dir)/kublk UBLK_TEST_QUIET=1 UBLK_TEST_SHOW_RESULT=1 diff --git a/tools/testing/selftests/ublk/test_null_04.sh b/tools/testing/selftests/ublk/test_null_04.sh new file mode 100755 index 000000000000..0b0719ea33a3 --- /dev/null +++ b/tools/testing/selftests/ublk/test_null_04.sh @@ -0,0 +1,166 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID=null_04 + +_prep_test "null" "integrity params" + +dev_id=$(_add_ublk_dev -t null -u --metadata_size 8) +_check_add_dev $TID $? +metadata_size=$(_get_metadata_size "$dev_id" metadata_size) +if [ "$metadata_size" != 8 ]; then + echo "metadata_size $metadata_size != 8" + _show_result $TID 255 +fi +pi_offset=$(_get_metadata_size "$dev_id" pi_offset) +if [ "$pi_offset" != 0 ]; then + echo "pi_offset $pi_offset != 0" + _show_result $TID 255 +fi +pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size) +if [ "$pi_tuple_size" != 0 ]; then + echo "pi_tuple_size $pi_tuple_size != 0" + _show_result $TID 255 +fi +capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable") +if [ "$capable" != 0 ]; then + echo "device_is_integrity_capable $capable != 0" + _show_result $TID 255 +fi +format=$(cat "/sys/block/ublkb$dev_id/integrity/format") +if [ "$format" != nop ]; then + echo "format $format != nop" + _show_result $TID 255 +fi +protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes") +if [ "$protection_interval_bytes" != 512 ]; then + echo "protection_interval_bytes $protection_interval_bytes != 512" + _show_result $TID 255 +fi +tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size") +if [ "$tag_size" != 0 ]; then + echo "tag_size $tag_size != 0" + _show_result $TID 255 +fi +_cleanup_test + +dev_id=$(_add_ublk_dev -t null -u --integrity_capable --metadata_size 64 --pi_offset 56 --csum_type ip) +_check_add_dev $TID $? +metadata_size=$(_get_metadata_size "$dev_id" metadata_size) +if [ "$metadata_size" != 64 ]; then + echo "metadata_size $metadata_size != 64" + _show_result $TID 255 +fi +pi_offset=$(_get_metadata_size "$dev_id" pi_offset) +if [ "$pi_offset" != 56 ]; then + echo "pi_offset $pi_offset != 56" + _show_result $TID 255 +fi +pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size) +if [ "$pi_tuple_size" != 8 ]; then + echo "pi_tuple_size $pi_tuple_size != 8" + _show_result $TID 255 +fi +capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable") +if [ "$capable" != 1 ]; then + echo "device_is_integrity_capable $capable != 1" + _show_result $TID 255 +fi +format=$(cat "/sys/block/ublkb$dev_id/integrity/format") +if [ "$format" != T10-DIF-TYPE3-IP ]; then + echo "format $format != T10-DIF-TYPE3-IP" + _show_result $TID 255 +fi +protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes") +if [ "$protection_interval_bytes" != 512 ]; then + echo "protection_interval_bytes $protection_interval_bytes != 512" + _show_result $TID 255 +fi +tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size") +if [ "$tag_size" != 0 ]; then + echo "tag_size $tag_size != 0" + _show_result $TID 255 +fi +_cleanup_test + +dev_id=$(_add_ublk_dev -t null -u --integrity_reftag --metadata_size 8 --csum_type t10dif) +_check_add_dev $TID $? +metadata_size=$(_get_metadata_size "$dev_id" metadata_size) +if [ "$metadata_size" != 8 ]; then + echo "metadata_size $metadata_size != 8" + _show_result $TID 255 +fi +pi_offset=$(_get_metadata_size "$dev_id" pi_offset) +if [ "$pi_offset" != 0 ]; then + echo "pi_offset $pi_offset != 0" + _show_result $TID 255 +fi +pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size) +if [ "$pi_tuple_size" != 8 ]; then + echo "pi_tuple_size $pi_tuple_size != 8" + _show_result $TID 255 +fi +capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable") +if [ "$capable" != 0 ]; then + echo "device_is_integrity_capable $capable != 0" + _show_result $TID 255 +fi +format=$(cat "/sys/block/ublkb$dev_id/integrity/format") +if [ "$format" != T10-DIF-TYPE1-CRC ]; then + echo "format $format != T10-DIF-TYPE1-CRC" + _show_result $TID 255 +fi +protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes") +if [ "$protection_interval_bytes" != 512 ]; then + echo "protection_interval_bytes $protection_interval_bytes != 512" + _show_result $TID 255 +fi +tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size") +if [ "$tag_size" != 0 ]; then + echo "tag_size $tag_size != 0" + _show_result $TID 255 +fi +_cleanup_test + +dev_id=$(_add_ublk_dev -t null -u --metadata_size 16 --csum_type nvme --tag_size 8) +_check_add_dev $TID $? +metadata_size=$(_get_metadata_size "$dev_id" metadata_size) +if [ "$metadata_size" != 16 ]; then + echo "metadata_size $metadata_size != 16" + _show_result $TID 255 +fi +pi_offset=$(_get_metadata_size "$dev_id" pi_offset) +if [ "$pi_offset" != 0 ]; then + echo "pi_offset $pi_offset != 0" + _show_result $TID 255 +fi +pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size) +if [ "$pi_tuple_size" != 16 ]; then + echo "pi_tuple_size $pi_tuple_size != 16" + _show_result $TID 255 +fi +capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable") +if [ "$capable" != 0 ]; then + echo "device_is_integrity_capable $capable != 0" + _show_result $TID 255 +fi +format=$(cat "/sys/block/ublkb$dev_id/integrity/format") +if [ "$format" != EXT-DIF-TYPE3-CRC64 ]; then + echo "format $format != EXT-DIF-TYPE3-CRC64" + _show_result $TID 255 +fi +protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes") +if [ "$protection_interval_bytes" != 512 ]; then + echo "protection_interval_bytes $protection_interval_bytes != 512" + _show_result $TID 255 +fi +tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size") +if [ "$tag_size" != 8 ]; then + echo "tag_size $tag_size != 8" + _show_result $TID 255 +fi +_cleanup_test + +_show_result $TID 0 -- cgit v1.2.3 From 78796b6bae8684b753b658f431b5b1ee24300d64 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:47 -0700 Subject: selftests: ublk: add end-to-end integrity test Add test case loop_08 to verify the ublk integrity data flow. It uses the kublk loop target to create a ublk device with integrity on top of backing data and integrity files. It then writes to the whole device with fio configured to generate integrity data. Then it reads back the whole device with fio configured to verify the integrity data. It also verifies that injected guard, reftag, and apptag corruptions are correctly detected. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 1 + tools/testing/selftests/ublk/test_loop_08.sh | 111 +++++++++++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100755 tools/testing/selftests/ublk/test_loop_08.sh (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 239ad1c741ef..036a9f01b464 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -35,6 +35,7 @@ TEST_PROGS += test_loop_04.sh TEST_PROGS += test_loop_05.sh TEST_PROGS += test_loop_06.sh TEST_PROGS += test_loop_07.sh +TEST_PROGS += test_loop_08.sh TEST_PROGS += test_stripe_01.sh TEST_PROGS += test_stripe_02.sh TEST_PROGS += test_stripe_03.sh diff --git a/tools/testing/selftests/ublk/test_loop_08.sh b/tools/testing/selftests/ublk/test_loop_08.sh new file mode 100755 index 000000000000..ca289cfb2ad4 --- /dev/null +++ b/tools/testing/selftests/ublk/test_loop_08.sh @@ -0,0 +1,111 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +if ! _have_program fio; then + exit $UBLK_SKIP_CODE +fi + +fio_version=$(fio --version) +if [[ "$fio_version" =~ fio-[0-9]+\.[0-9]+$ ]]; then + echo "Requires development fio version with https://github.com/axboe/fio/pull/1992" + exit $UBLK_SKIP_CODE +fi + +TID=loop_08 + +_prep_test "loop" "end-to-end integrity" + +_create_backfile 0 256M +_create_backfile 1 32M # 256M * (64 integrity bytes / 512 data bytes) +integrity_params="--integrity_capable --integrity_reftag + --metadata_size 64 --pi_offset 56 --csum_type t10dif" +dev_id=$(_add_ublk_dev -t loop -u $integrity_params "${UBLK_BACKFILES[@]}") +_check_add_dev $TID $? + +# 1M * (64 integrity bytes / 512 data bytes) = 128K +fio_args="--ioengine io_uring --direct 1 --bsrange 512-1M --iodepth 32 + --md_per_io_size 128K --pi_act 0 --pi_chk GUARD,REFTAG,APPTAG + --filename /dev/ublkb$dev_id" +fio --name fill --rw randwrite $fio_args > /dev/null +err=$? +if [ $err != 0 ]; then + echo "fio fill failed" + _show_result $TID $err +fi + +fio --name verify --rw randread $fio_args > /dev/null +err=$? +if [ $err != 0 ]; then + echo "fio verify failed" + _show_result $TID $err +fi + +fio_err=$(mktemp fio_err_XXXXX) + +# Overwrite 4-byte reftag at offset 56 + 4 = 60 +dd_reftag_args="bs=1 seek=60 count=4 oflag=dsync conv=notrunc status=none" +dd if=/dev/urandom "of=${UBLK_BACKFILES[1]}" $dd_reftag_args +err=$? +if [ $err != 0 ]; then + echo "dd corrupted_reftag failed" + rm -f "$fio_err" + _show_result $TID $err +fi +if fio --name corrupted_reftag --rw randread $fio_args > /dev/null 2> "$fio_err"; then + echo "fio corrupted_reftag unexpectedly succeeded" + rm -f "$fio_err" + _show_result $TID 255 +fi +expected_err="REFTAG compare error: LBA: 0 Expected=0, Actual=" +if ! grep -q "$expected_err" "$fio_err"; then + echo "fio corrupted_reftag message not found: $expected_err" + rm -f "$fio_err" + _show_result $TID 255 +fi +# Reset to 0 +dd if=/dev/zero "of=${UBLK_BACKFILES[1]}" $dd_reftag_args +err=$? +if [ $err != 0 ]; then + echo "dd restore corrupted_reftag failed" + rm -f "$fio_err" + _show_result $TID $err +fi + +dd_data_args="bs=512 count=1 oflag=direct,dsync conv=notrunc status=none" +dd if=/dev/zero "of=${UBLK_BACKFILES[0]}" $dd_data_args +err=$? +if [ $err != 0 ]; then + echo "dd corrupted_data failed" + rm -f "$fio_err" + _show_result $TID $err +fi +if fio --name corrupted_data --rw randread $fio_args > /dev/null 2> "$fio_err"; then + echo "fio corrupted_data unexpectedly succeeded" + rm -f "$fio_err" + _show_result $TID 255 +fi +expected_err="Guard compare error: LBA: 0 Expected=0, Actual=" +if ! grep -q "$expected_err" "$fio_err"; then + echo "fio corrupted_data message not found: $expected_err" + rm -f "$fio_err" + _show_result $TID 255 +fi + +if fio --name bad_apptag --rw randread $fio_args --apptag 0x4321 > /dev/null 2> "$fio_err"; then + echo "fio bad_apptag unexpectedly succeeded" + rm -f "$fio_err" + _show_result $TID 255 +fi +expected_err="APPTAG compare error: LBA: [0-9]* Expected=4321, Actual=1234" +if ! grep -q "$expected_err" "$fio_err"; then + echo "fio bad_apptag message not found: $expected_err" + rm -f "$fio_err" + _show_result $TID 255 +fi + +rm -f "$fio_err" + +_cleanup_test +_show_result $TID 0 -- cgit v1.2.3 From 2a3602030d800b6600ef55c31e21bc54611f7770 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 12 Jan 2026 11:00:20 -0500 Subject: cgroup/cpuset: Don't invalidate sibling partitions on cpuset.cpus conflict Currently, when setting a cpuset's cpuset.cpus to a value that conflicts with the cpuset.cpus/cpuset.cpus.exclusive of a sibling partition, the sibling's partition state becomes invalid. This is overly harsh and is probably not necessary. The cpuset.cpus.exclusive control file, if set, will override the cpuset.cpus of the same cpuset when creating a cpuset partition. So cpuset.cpus has less priority than cpuset.cpus.exclusive in setting up a partition. However, it cannot override a conflicting cpuset.cpus file in a sibling cpuset and the partition creation process will fail. This is inconsistent. That will also make using cpuset.cpus.exclusive less valuable as a tool to set up cpuset partitions as the users have to check if such a cpuset.cpus conflict exists or not. Fix these problems by making sure that once a cpuset.cpus.exclusive is set without failure, it will always be allowed to form a valid partition as long as at least one CPU can be granted from its parent irrespective of the state of the siblings' cpuset.cpus values. Of course, setting cpuset.cpus.exclusive will fail if it conflicts with the cpuset.cpus.exclusive or the cpuset.cpus.exclusive.effective value of a sibling. Partition can still be created by setting only cpuset.cpus without setting cpuset.cpus.exclusive. However, any conflicting CPUs in sibling's cpuset.cpus.exclusive.effective and cpuset.cpus.exclusive values will be removed from its cpuset.cpus.exclusive.effective as long as there is still one or more CPUs left and can be granted from its parent. This CPU stripping is currently done in rm_siblings_excl_cpus(). The new code will now try its best to enable the creation of new partitions with only cpuset.cpus set without invalidating existing ones. However it is not guaranteed that all the CPUs requested in cpuset.cpus will be used in the new partition even when all these CPUs can be granted from the parent. This is similar to the fact that cpuset.cpus.effective may not be able to include all the CPUs requested in cpuset.cpus. In this case, the parent may not able to grant all the exclusive CPUs requested in cpuset.cpus to cpuset.cpus.exclusive.effective if some of them have already been granted to other partitions earlier. With the creation of multiple sibling partitions by setting only cpuset.cpus, this does have the side effect that their exact cpuset.cpus.exclusive.effective settings will depend on the order of partition creation if there are conflicts. Due to the exclusive nature of the CPUs in a partition, it is not easy to make it fair other than the old behavior of invalidating all the conflicting partitions. For example, # echo "0-2" > A1/cpuset.cpus # echo "root" > A1/cpuset.cpus.partition # cat A1/cpuset.cpus.partition root # cat A1/cpuset.cpus.exclusive.effective 0-2 # echo "2-4" > B1/cpuset.cpus # echo "root" > B1/cpuset.cpus.partition # cat B1/cpuset.cpus.partition root # cat B1/cpuset.cpus.exclusive.effective 3-4 # cat B1/cpuset.cpus.effective 3-4 For users who want to be sure that they can get most of the CPUs they want, cpuset.cpus.exclusive should be used instead if they can set it successfully without failure. Setting cpuset.cpus.exclusive will guarantee that sibling conflicts from then onward is no longer possible. To make this change, we have to separate out the is_cpu_exclusive() check in cpus_excl_conflict() into a cgroup v1 only cpuset1_cpus_excl_conflict() helper. The cpus_allowed_validate_change() helper is now no longer needed and can be removed. Some existing tests in test_cpuset_prs.sh are updated and new ones are added to reflect the new behavior. The cgroup-v2.rst doc file is also updated the clarify what exclusive CPUs will be used when a partition is created. Reported-by: Sun Shaojie Closes: https://lore.kernel.org/lkml/20251117015708.977585-1-sunshaojie@kylinos.cn/ Signed-off-by: Waiman Long Reviewed-by: Chen Ridong Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v2.rst | 33 +++++++--- kernel/cgroup/cpuset-internal.h | 3 + kernel/cgroup/cpuset-v1.c | 19 ++++++ kernel/cgroup/cpuset.c | 80 ++++++++--------------- tools/testing/selftests/cgroup/test_cpuset_prs.sh | 26 ++++++-- 5 files changed, 90 insertions(+), 71 deletions(-) (limited to 'tools/testing') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 510df2461aff..28613c0e1c90 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -2584,9 +2584,9 @@ Cpuset Interface Files of this file will always be a subset of its parent's "cpuset.cpus.exclusive.effective" if its parent is not the root cgroup. It will also be a subset of "cpuset.cpus.exclusive" - if it is set. If "cpuset.cpus.exclusive" is not set, it is - treated to have an implicit value of "cpuset.cpus" in the - formation of local partition. + if it is set. This file should only be non-empty if either + "cpuset.cpus.exclusive" is set or when the current cpuset is + a valid partition root. cpuset.cpus.isolated A read-only and root cgroup only multiple values file. @@ -2618,13 +2618,22 @@ Cpuset Interface Files There are two types of partitions - local and remote. A local partition is one whose parent cgroup is also a valid partition root. A remote partition is one whose parent cgroup is not a - valid partition root itself. Writing to "cpuset.cpus.exclusive" - is optional for the creation of a local partition as its - "cpuset.cpus.exclusive" file will assume an implicit value that - is the same as "cpuset.cpus" if it is not set. Writing the - proper "cpuset.cpus.exclusive" values down the cgroup hierarchy - before the target partition root is mandatory for the creation - of a remote partition. + valid partition root itself. + + Writing to "cpuset.cpus.exclusive" is optional for the creation + of a local partition as its "cpuset.cpus.exclusive" file will + assume an implicit value that is the same as "cpuset.cpus" if it + is not set. Writing the proper "cpuset.cpus.exclusive" values + down the cgroup hierarchy before the target partition root is + mandatory for the creation of a remote partition. + + Not all the CPUs requested in "cpuset.cpus.exclusive" can be + used to form a new partition. Only those that were present + in its parent's "cpuset.cpus.exclusive.effective" control + file can be used. For partitions created without setting + "cpuset.cpus.exclusive", exclusive CPUs specified in sibling's + "cpuset.cpus.exclusive" or "cpuset.cpus.exclusive.effective" + also cannot be used. Currently, a remote partition cannot be created under a local partition. All the ancestors of a remote partition root except @@ -2632,6 +2641,10 @@ Cpuset Interface Files The root cgroup is always a partition root and its state cannot be changed. All other non-root cgroups start out as "member". + Even though the "cpuset.cpus.exclusive*" and "cpuset.cpus" + control files are not present in the root cgroup, they are + implicitly the same as the "/sys/devices/system/cpu/possible" + sysfs file. When set to "root", the current cgroup is the root of a new partition or scheduling domain. The set of exclusive CPUs is diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index e718a4f54360..e8e2683cb067 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -312,6 +312,7 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated); int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial); +bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2); void cpuset1_init(struct cpuset *cs); void cpuset1_online_css(struct cgroup_subsys_state *css); int cpuset1_generate_sched_domains(cpumask_var_t **domains, @@ -326,6 +327,8 @@ static inline void cpuset1_hotplug_update_tasks(struct cpuset *cs, bool cpus_updated, bool mems_updated) {} static inline int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial) { return 0; } +static inline bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, + struct cpuset *cs2) { return false; } static inline void cpuset1_init(struct cpuset *cs) {} static inline void cpuset1_online_css(struct cgroup_subsys_state *css) {} static inline int cpuset1_generate_sched_domains(cpumask_var_t **domains, diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index ecfea7800f0d..04124c38a774 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -373,6 +373,25 @@ out: return ret; } +/* + * cpuset1_cpus_excl_conflict() - Check if two cpusets have exclusive CPU conflicts + * to legacy (v1) + * @cs1: first cpuset to check + * @cs2: second cpuset to check + * + * Returns: true if CPU exclusivity conflict exists, false otherwise + * + * If either cpuset is CPU exclusive, their allowed CPUs cannot intersect. + */ +bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) +{ + if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2)) + return cpumask_intersects(cs1->cpus_allowed, + cs2->cpus_allowed); + + return false; +} + #ifdef CONFIG_PROC_PID_CPUSET /* * proc_cpuset_show() diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4819ab429771..83fb83a86b4b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -129,6 +129,17 @@ static bool force_sd_rebuild; * For simplicity, a local partition can be created under a local or remote * partition but a remote partition cannot have any partition root in its * ancestor chain except the cgroup root. + * + * A valid partition can be formed by setting exclusive_cpus or cpus_allowed + * if exclusive_cpus is not set. In the case of partition with empty + * exclusive_cpus, all the conflicting exclusive CPUs specified in the + * following cpumasks of sibling cpusets will be removed from its + * cpus_allowed in determining its effective_xcpus. + * - effective_xcpus + * - exclusive_cpus + * + * The "cpuset.cpus.exclusive" control file should be used for setting up + * partition if the users want to get as many CPUs as possible. */ #define PRS_MEMBER 0 #define PRS_ROOT 1 @@ -616,27 +627,25 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) * Returns: true if CPU exclusivity conflict exists, false otherwise * * Conflict detection rules: - * 1. If either cpuset is CPU exclusive, they must be mutually exclusive - * 2. exclusive_cpus masks cannot intersect between cpusets - * 3. The allowed CPUs of a sibling cpuset cannot be a subset of the new exclusive CPUs + * o cgroup v1 + * See cpuset1_cpus_excl_conflict() + * o cgroup v2 + * - The exclusive_cpus values cannot overlap. + * - New exclusive_cpus cannot be a superset of a sibling's cpus_allowed. */ static inline bool cpus_excl_conflict(struct cpuset *trial, struct cpuset *sibling, bool xcpus_changed) { - /* If either cpuset is exclusive, check if they are mutually exclusive */ - if (is_cpu_exclusive(trial) || is_cpu_exclusive(sibling)) - return !cpusets_are_exclusive(trial, sibling); - - /* Exclusive_cpus cannot intersect */ - if (cpumask_intersects(trial->exclusive_cpus, sibling->exclusive_cpus)) - return true; + if (!cpuset_v2()) + return cpuset1_cpus_excl_conflict(trial, sibling); /* The cpus_allowed of a sibling cpuset cannot be a subset of the new exclusive_cpus */ if (xcpus_changed && !cpumask_empty(sibling->cpus_allowed) && cpumask_subset(sibling->cpus_allowed, trial->exclusive_cpus)) return true; - return false; + /* Exclusive_cpus cannot intersect */ + return cpumask_intersects(trial->exclusive_cpus, sibling->exclusive_cpus); } static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) @@ -2312,43 +2321,6 @@ static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *tri return PERR_NONE; } -static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs, - struct tmpmasks *tmp) -{ - int retval; - struct cpuset *parent = parent_cs(cs); - - retval = validate_change(cs, trialcs); - - if ((retval == -EINVAL) && cpuset_v2()) { - struct cgroup_subsys_state *css; - struct cpuset *cp; - - /* - * The -EINVAL error code indicates that partition sibling - * CPU exclusivity rule has been violated. We still allow - * the cpumask change to proceed while invalidating the - * partition. However, any conflicting sibling partitions - * have to be marked as invalid too. - */ - trialcs->prs_err = PERR_NOTEXCL; - rcu_read_lock(); - cpuset_for_each_child(cp, css, parent) { - struct cpumask *xcpus = user_xcpus(trialcs); - - if (is_partition_valid(cp) && - cpumask_intersects(xcpus, cp->effective_xcpus)) { - rcu_read_unlock(); - update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp); - rcu_read_lock(); - } - } - rcu_read_unlock(); - retval = 0; - } - return retval; -} - /** * partition_cpus_change - Handle partition state changes due to CPU mask updates * @cs: The target cpuset being modified @@ -2408,15 +2380,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) return 0; - if (alloc_tmpmasks(&tmp)) - return -ENOMEM; - compute_trialcs_excpus(trialcs, cs); trialcs->prs_err = PERR_NONE; - retval = cpus_allowed_validate_change(cs, trialcs, &tmp); + retval = validate_change(cs, trialcs); if (retval < 0) - goto out_free; + return retval; + + if (alloc_tmpmasks(&tmp)) + return -ENOMEM; /* * Check all the descendants in update_cpumasks_hier() if @@ -2439,7 +2411,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ if (cs->partition_root_state) update_partition_sd_lb(cs, old_prs); -out_free: + free_tmpmasks(&tmp); return retval; } diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index a17256d9f88a..ff4540b0490e 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -269,7 +269,7 @@ TEST_MATRIX=( " C0-3:S+ C1-3:S+ C2-3 . X2-3 X3:P2 . . 0 A1:0-2|A2:3|A3:3 A1:P0|A2:P2 3" " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2 . 0 A1:0-1|A2:1|A3:2-3 A1:P0|A3:P2 2-3" " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2:C3 . 0 A1:0-1|A2:1|A3:2-3 A1:P0|A3:P2 2-3" - " C0-3:S+ C1-3:S+ C2-3 C2-3 . . . P2 0 A1:0-3|A2:1-3|A3:2-3|B1:2-3 A1:P0|A3:P0|B1:P-2" + " C0-3:S+ C1-3:S+ C2-3 C2-3 . . . P2 0 A1:0-1|A2:1|A3:1|B1:2-3 A1:P0|A3:P0|B1:P2" " C0-3:S+ C1-3:S+ C2-3 C4-5 . . . P2 0 B1:4-5 B1:P2 4-5" " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3 X2-3:P2 P2 0 A3:2-3|B1:4 A3:P2|B1:P2 2-4" " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3 X2-3:P2:C1-3 P2 0 A3:2-3|B1:4 A3:P2|B1:P2 2-4" @@ -318,7 +318,7 @@ TEST_MATRIX=( # Invalid to valid local partition direct transition tests " C1-3:S+:P2 X4:P2 . . . . . . 0 A1:1-3|XA1:1-3|A2:1-3:XA2: A1:P2|A2:P-2 1-3" " C1-3:S+:P2 X4:P2 . . . X3:P2 . . 0 A1:1-2|XA1:1-3|A2:3:XA2:3 A1:P2|A2:P2 1-3" - " C0-3:P2 . . C4-6 C0-4 . . . 0 A1:0-4|B1:4-6 A1:P-2|B1:P0" + " C0-3:P2 . . C4-6 C0-4 . . . 0 A1:0-4|B1:5-6 A1:P2|B1:P0" " C0-3:P2 . . C4-6 C0-4:C0-3 . . . 0 A1:0-3|B1:4-6 A1:P2|B1:P0 0-3" # Local partition invalidation tests @@ -388,10 +388,10 @@ TEST_MATRIX=( " C0-1:S+ C1 . C2-3 . P2 . . 0 A1:0-1|A2:1 A1:P0|A2:P-2" " C0-1:S+ C1:P2 . C2-3 P1 . . . 0 A1:0|A2:1 A1:P1|A2:P2 0-1|1" - # A non-exclusive cpuset.cpus change will invalidate partition and its siblings - " C0-1:P1 . . C2-3 C0-2 . . . 0 A1:0-2|B1:2-3 A1:P-1|B1:P0" - " C0-1:P1 . . P1:C2-3 C0-2 . . . 0 A1:0-2|B1:2-3 A1:P-1|B1:P-1" - " C0-1 . . P1:C2-3 C0-2 . . . 0 A1:0-2|B1:2-3 A1:P0|B1:P-1" + # A non-exclusive cpuset.cpus change will not invalidate its siblings partition. + " C0-1:P1 . . C2-3 C0-2 . . . 0 A1:0-2|B1:3 A1:P1|B1:P0" + " C0-1:P1 . . P1:C2-3 C0-2 . . . 0 A1:0-1|XA1:0-1|B1:2-3 A1:P1|B1:P1" + " C0-1 . . P1:C2-3 C0-2 . . . 0 A1:0-1|B1:2-3 A1:P0|B1:P1" # cpuset.cpus can overlap with sibling cpuset.cpus.exclusive but not subsumed by it " C0-3 . . C4-5 X5 . . . 0 A1:0-3|B1:4-5" @@ -417,6 +417,14 @@ TEST_MATRIX=( " CX1-4:S+ CX2-4:P2 . C5-6 . . . P1:C3-6 0 A1:1|A2:2-4|B1:5-6 \ A1:P0|A2:P2:B1:P-1 2-4" + # When multiple partitions with conflicting cpuset.cpus are created, the + # latter created ones will only get what are left of the available exclusive + # CPUs. + " C1-3:P1 . . . . . . C3-5:P1 0 A1:1-3|B1:4-5:XB1:4-5 A1:P1|B1:P1" + + # cpuset.cpus can be set to a subset of sibling's cpuset.cpus.exclusive + " C1-3:X1-3 . . C4-5 . . . C1-2 0 A1:1-3|B1:1-2" + # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS # ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ -------- # Failure cases: @@ -427,7 +435,7 @@ TEST_MATRIX=( # Changes to cpuset.cpus.exclusive that violate exclusivity rule is rejected " C0-3 . . C4-5 X0-3 . . X3-5 1 A1:0-3|B1:4-5" - # cpuset.cpus cannot be a subset of sibling cpuset.cpus.exclusive + # cpuset.cpus.exclusive cannot be set to a superset of sibling's cpuset.cpus " C0-3 . . C4-5 X3-5 . . . 1 A1:0-3|B1:4-5" ) @@ -477,6 +485,10 @@ REMOTE_TEST_MATRIX=( . . X1-2:P2 X4-5:P1 . X1-7:P2 p1:3|c11:1-2|c12:4:c22:5-6 \ p1:P0|p2:P1|c11:P2|c12:P1|c22:P2 \ 1-2,4-6|1-2,5-6" + # c12 whose cpuset.cpus CPUs are all granted to c11 will become invalid partition + " C1-5:P1:S+ . C1-4:P1 C2-3 . . \ + . . . P1 . . p1:5|c11:1-4|c12:5 \ + p1:P1|c11:P1|c12:P-1" ) # -- cgit v1.2.3 From 272bd8183376a9e20fe08bacbaa44003d7c8acaa Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 12 Jan 2026 11:00:21 -0500 Subject: cgroup/cpuset: Move the v1 empty cpus/mems check to cpuset1_validate_change() As stated in commit 1c09b195d37f ("cpuset: fix a regression in validating config change"), it is not allowed to clear masks of a cpuset if there're tasks in it. This is specific to v1 since empty "cpuset.cpus" or "cpuset.mems" will cause the v2 cpuset to inherit the effective CPUs or memory nodes from its parent. So it is OK to have empty cpus or mems even if there are tasks in the cpuset. Move this empty cpus/mems check in validate_change() to cpuset1_validate_change() to allow more flexibility in setting cpus or mems in v2. cpuset_is_populated() needs to be moved into cpuset-internal.h as it is needed by the empty cpus/mems checking code. Also add a test case to test_cpuset_prs.sh to verify that. Reported-by: Chen Ridong Closes: https://lore.kernel.org/lkml/7a3ec392-2e86-4693-aa9f-1e668a668b9c@huaweicloud.com/ Signed-off-by: Waiman Long Reviewed-by: Chen Ridong Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-internal.h | 9 +++++++++ kernel/cgroup/cpuset-v1.c | 14 ++++++++++++++ kernel/cgroup/cpuset.c | 23 ----------------------- tools/testing/selftests/cgroup/test_cpuset_prs.sh | 3 +++ 4 files changed, 26 insertions(+), 23 deletions(-) (limited to 'tools/testing') diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index e8e2683cb067..fd7d19842ded 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -260,6 +260,15 @@ static inline int nr_cpusets(void) return static_key_count(&cpusets_enabled_key.key) + 1; } +static inline bool cpuset_is_populated(struct cpuset *cs) +{ + lockdep_assert_cpuset_lock_held(); + + /* Cpusets in the process of attaching should be considered as populated */ + return cgroup_is_populated(cs->css.cgroup) || + cs->attach_in_progress; +} + /** * cpuset_for_each_child - traverse online children of a cpuset * @child_cs: loop cursor pointing to the current child diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index 04124c38a774..7a23b9e8778f 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -368,6 +368,20 @@ int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial) if (par && !is_cpuset_subset(trial, par)) goto out; + /* + * Cpusets with tasks - existing or newly being attached - can't + * be changed to have empty cpus_allowed or mems_allowed. + */ + ret = -ENOSPC; + if (cpuset_is_populated(cur)) { + if (!cpumask_empty(cur->cpus_allowed) && + cpumask_empty(trial->cpus_allowed)) + goto out; + if (!nodes_empty(cur->mems_allowed) && + nodes_empty(trial->mems_allowed)) + goto out; + } + ret = 0; out: return ret; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 83fb83a86b4b..a3dbca125588 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -370,15 +370,6 @@ static inline bool is_in_v2_mode(void) (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); } -static inline bool cpuset_is_populated(struct cpuset *cs) -{ - lockdep_assert_held(&cpuset_mutex); - - /* Cpusets in the process of attaching should be considered as populated */ - return cgroup_is_populated(cs->css.cgroup) || - cs->attach_in_progress; -} - /** * partition_is_populated - check if partition has tasks * @cs: partition root to be checked @@ -695,20 +686,6 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) par = parent_cs(cur); - /* - * Cpusets with tasks - existing or newly being attached - can't - * be changed to have empty cpus_allowed or mems_allowed. - */ - ret = -ENOSPC; - if (cpuset_is_populated(cur)) { - if (!cpumask_empty(cur->cpus_allowed) && - cpumask_empty(trial->cpus_allowed)) - goto out; - if (!nodes_empty(cur->mems_allowed) && - nodes_empty(trial->mems_allowed)) - goto out; - } - /* * We can't shrink if we won't have enough room for SCHED_DEADLINE * tasks. This check is not done when scheduling is disabled as the diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index ff4540b0490e..5dff3ad53867 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -425,6 +425,9 @@ TEST_MATRIX=( # cpuset.cpus can be set to a subset of sibling's cpuset.cpus.exclusive " C1-3:X1-3 . . C4-5 . . . C1-2 0 A1:1-3|B1:1-2" + # cpuset.cpus can become empty with task in it as it inherits parent's effective CPUs + " C1-3:S+ C2 . . . T:C . . 0 A1:1-3|A2:1-3" + # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS # ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ -------- # Failure cases: -- cgit v1.2.3 From 65955a0993a0a9536263fea2eaae8aed496dcc9c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 13 Jan 2026 00:05:02 +0200 Subject: selftests: ublk: add stop command with --safe option Add 'stop' subcommand to kublk utility that uses the new UBLK_CMD_TRY_STOP_DEV command when --safe option is specified. This allows stopping a device only if it has no active openers, returning -EBUSY otherwise. Also add test_generic_16.sh to test the new functionality. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 1 + tools/testing/selftests/ublk/kublk.c | 53 +++++++++++++++++++++++ tools/testing/selftests/ublk/kublk.h | 1 + tools/testing/selftests/ublk/test_generic_16.sh | 57 +++++++++++++++++++++++++ 4 files changed, 112 insertions(+) create mode 100755 tools/testing/selftests/ublk/test_generic_16.sh (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 036a9f01b464..3a2498089b15 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -23,6 +23,7 @@ TEST_PROGS += test_generic_12.sh TEST_PROGS += test_generic_13.sh TEST_PROGS += test_generic_14.sh TEST_PROGS += test_generic_15.sh +TEST_PROGS += test_generic_16.sh TEST_PROGS += test_null_01.sh TEST_PROGS += test_null_02.sh diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index d95937dd6167..3472ce7426ba 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -108,6 +108,15 @@ static int ublk_ctrl_stop_dev(struct ublk_dev *dev) return __ublk_ctrl_cmd(dev, &data); } +static int ublk_ctrl_try_stop_dev(struct ublk_dev *dev) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_TRY_STOP_DEV, + }; + + return __ublk_ctrl_cmd(dev, &data); +} + static int ublk_ctrl_start_dev(struct ublk_dev *dev, int daemon_pid) { @@ -1424,6 +1433,42 @@ static int cmd_dev_del(struct dev_ctx *ctx) return 0; } +static int cmd_dev_stop(struct dev_ctx *ctx) +{ + int number = ctx->dev_id; + struct ublk_dev *dev; + int ret; + + if (number < 0) { + ublk_err("%s: device id is required\n", __func__); + return -EINVAL; + } + + dev = ublk_ctrl_init(); + dev->dev_info.dev_id = number; + + ret = ublk_ctrl_get_info(dev); + if (ret < 0) + goto fail; + + if (ctx->safe_stop) { + ret = ublk_ctrl_try_stop_dev(dev); + if (ret < 0) + ublk_err("%s: try_stop dev %d failed ret %d\n", + __func__, number, ret); + } else { + ret = ublk_ctrl_stop_dev(dev); + if (ret < 0) + ublk_err("%s: stop dev %d failed ret %d\n", + __func__, number, ret); + } + +fail: + ublk_ctrl_deinit(dev); + + return ret; +} + static int __cmd_dev_list(struct dev_ctx *ctx) { struct ublk_dev *dev = ublk_ctrl_init(); @@ -1487,6 +1532,7 @@ static int cmd_dev_get_features(void) FEAT_NAME(UBLK_F_PER_IO_DAEMON), FEAT_NAME(UBLK_F_BUF_REG_OFF_DAEMON), FEAT_NAME(UBLK_F_INTEGRITY), + FEAT_NAME(UBLK_F_SAFE_STOP_DEV) }; struct ublk_dev *dev; __u64 features = 0; @@ -1616,6 +1662,8 @@ static int cmd_dev_help(char *exe) printf("%s del [-n dev_id] -a \n", exe); printf("\t -a delete all devices -n delete specified device\n\n"); + printf("%s stop -n dev_id [--safe]\n", exe); + printf("\t --safe only stop if device has no active openers\n\n"); printf("%s list [-n dev_id] -a \n", exe); printf("\t -a list all devices, -n list specified device, default -a \n\n"); printf("%s features\n", exe); @@ -1653,6 +1701,7 @@ int main(int argc, char *argv[]) { "pi_offset", 1, NULL, 0 }, { "csum_type", 1, NULL, 0 }, { "tag_size", 1, NULL, 0 }, + { "safe", 0, NULL, 0 }, { 0, 0, 0, 0 } }; const struct ublk_tgt_ops *ops = NULL; @@ -1760,6 +1809,8 @@ int main(int argc, char *argv[]) } if (!strcmp(longopts[option_idx].name, "tag_size")) ctx.tag_size = strtoul(optarg, NULL, 0); + if (!strcmp(longopts[option_idx].name, "safe")) + ctx.safe_stop = 1; break; case '?': /* @@ -1842,6 +1893,8 @@ int main(int argc, char *argv[]) } } else if (!strcmp(cmd, "del")) ret = cmd_dev_del(&ctx); + else if (!strcmp(cmd, "stop")) + ret = cmd_dev_stop(&ctx); else if (!strcmp(cmd, "list")) { ctx.all = 1; ret = cmd_dev_list(&ctx); diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 96c66b337bc0..cb757fd9bf9d 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -83,6 +83,7 @@ struct dev_ctx { __u8 pi_offset; __u8 csum_type; __u8 tag_size; + unsigned int safe_stop:1; int _evtfd; int _shmid; diff --git a/tools/testing/selftests/ublk/test_generic_16.sh b/tools/testing/selftests/ublk/test_generic_16.sh new file mode 100755 index 000000000000..e08af7b685c9 --- /dev/null +++ b/tools/testing/selftests/ublk/test_generic_16.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="generic_16" +ERR_CODE=0 + +_prep_test "null" "stop --safe command" + +# Check if SAFE_STOP_DEV feature is supported +if ! _have_feature "SAFE_STOP_DEV"; then + _cleanup_test "null" + exit "$UBLK_SKIP_CODE" +fi + +# Test 1: stop --safe on idle device should succeed +dev_id=$(_add_ublk_dev -t null -q 2 -d 32) +_check_add_dev $TID $? + +# Device is idle (no openers), stop --safe should succeed +if ! ${UBLK_PROG} stop -n "${dev_id}" --safe; then + echo "stop --safe on idle device failed unexpectedly!" + ERR_CODE=255 +fi + +# Clean up device +${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1 +udevadm settle + +# Test 2: stop --safe on device with active opener should fail +dev_id=$(_add_ublk_dev -t null -q 2 -d 32) +_check_add_dev $TID $? + +# Open device in background (dd reads indefinitely) +dd if=/dev/ublkb${dev_id} of=/dev/null bs=4k iflag=direct > /dev/null 2>&1 & +dd_pid=$! + +# Give dd time to start +sleep 0.2 + +# Device has active opener, stop --safe should fail with -EBUSY +if ${UBLK_PROG} stop -n "${dev_id}" --safe 2>/dev/null; then + echo "stop --safe on busy device succeeded unexpectedly!" + ERR_CODE=255 +fi + +# Kill dd and clean up +kill $dd_pid 2>/dev/null +wait $dd_pid 2>/dev/null + +# Now device should be idle, regular delete should work +${UBLK_PROG} del -n "${dev_id}" +udevadm settle + +_cleanup_test "null" +_show_result $TID $ERR_CODE -- cgit v1.2.3 From ba7f1024a1024f219a18c40b4ab2d7d900fd2d15 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Sat, 10 Jan 2026 08:25:52 +0000 Subject: selftests/bpf: Use the correct destructor kfunc type With CONFIG_CFI enabled, the kernel strictly enforces that indirect function calls use a function pointer type that matches the target function. As bpf_testmod_ctx_release() signature differs from the btf_dtor_kfunc_t pointer type used for the destructor calls in bpf_obj_free_fields(), add a stub function with the correct type to fix the type mismatch. Signed-off-by: Sami Tolvanen Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20260110082548.113748-9-samitolvanen@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_kmods/bpf_testmod.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 1c41d03bd5a1..bc07ce9d5477 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -285,6 +285,12 @@ __bpf_kfunc void bpf_testmod_ctx_release(struct bpf_testmod_ctx *ctx) call_rcu(&ctx->rcu, testmod_free_cb); } +__bpf_kfunc void bpf_testmod_ctx_release_dtor(void *ctx) +{ + bpf_testmod_ctx_release(ctx); +} +CFI_NOSEAL(bpf_testmod_ctx_release_dtor); + static struct bpf_testmod_ops3 *st_ops3; static int bpf_testmod_test_3(void) @@ -707,7 +713,7 @@ BTF_KFUNCS_END(bpf_testmod_common_kfunc_ids) BTF_ID_LIST(bpf_testmod_dtor_ids) BTF_ID(struct, bpf_testmod_ctx) -BTF_ID(func, bpf_testmod_ctx_release) +BTF_ID(func, bpf_testmod_ctx_release_dtor) static const struct btf_kfunc_id_set bpf_testmod_common_kfunc_set = { .owner = THIS_MODULE, -- cgit v1.2.3 From 088f35ab9fd4a03b8c6ccdda7b92461d92bf7b8b Mon Sep 17 00:00:00 2001 From: Ankit Khushwaha Date: Fri, 9 Jan 2026 20:52:01 +0530 Subject: selftests/net/ipsec: Fix variable size type not at the end of struct The "struct alg" object contains a union of 3 xfrm structures: union { struct xfrm_algo; struct xfrm_algo_aead; struct xfrm_algo_auth; } All of them end with a flexible array member used to store key material, but the flexible array appears at *different offsets* in each struct. bcz of this, union itself is of variable-sized & Placing it above char buf[...] triggers: ipsec.c:835:5: warning: field 'u' with variable sized type 'union (unnamed union at ipsec.c:831:3)' not at the end of a struct or class is a GNU extension [-Wgnu-variable-sized-type-not-at-end] 835 | } u; | ^ one fix is to use "TRAILING_OVERLAP()" which works with one flexible array member only. But In "struct alg" flexible array member exists in all union members, but not at the same offset, so TRAILING_OVERLAP cannot be applied. so the fix is to explicitly overlay the key buffer at the correct offset for the largest union member (xfrm_algo_auth). This ensures that the flexible-array region and the fixed buffer line up. No functional change. Reviewed-by: Simon Horman Signed-off-by: Ankit Khushwaha Link: https://patch.msgid.link/20260109152201.15668-1-ankitkhushwaha.linux@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/ipsec.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/ipsec.c b/tools/testing/selftests/net/ipsec.c index 0ccf484b1d9d..f4afef51b930 100644 --- a/tools/testing/selftests/net/ipsec.c +++ b/tools/testing/selftests/net/ipsec.c @@ -43,6 +43,10 @@ #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) +#ifndef offsetof +#define offsetof(TYPE, MEMBER) __builtin_offsetof(TYPE, MEMBER) +#endif + #define IPV4_STR_SZ 16 /* xxx.xxx.xxx.xxx is longest + \0 */ #define MAX_PAYLOAD 2048 #define XFRM_ALGO_KEY_BUF_SIZE 512 @@ -827,13 +831,16 @@ static int xfrm_fill_key(char *name, char *buf, static int xfrm_state_pack_algo(struct nlmsghdr *nh, size_t req_sz, struct xfrm_desc *desc) { - struct { + union { union { struct xfrm_algo alg; struct xfrm_algo_aead aead; struct xfrm_algo_auth auth; } u; - char buf[XFRM_ALGO_KEY_BUF_SIZE]; + struct { + unsigned char __offset_to_FAM[offsetof(struct xfrm_algo_auth, alg_key)]; + char buf[XFRM_ALGO_KEY_BUF_SIZE]; + }; } alg = {}; size_t alen, elen, clen, aelen; unsigned short type; -- cgit v1.2.3 From 8d61f1a9f2541c6ef51d4997e6a4c5a1c0d8b27c Mon Sep 17 00:00:00 2001 From: Jonas Köppeler Date: Fri, 9 Jan 2026 14:15:35 +0100 Subject: selftests/tc-testing: add selftests for cake_mq qdisc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test 684b: Create CAKE_MQ with default setting (4 queues) Test 7ee8: Create CAKE_MQ with bandwidth limit (4 queues) Test 1f87: Create CAKE_MQ with rtt time (4 queues) Test e9cf: Create CAKE_MQ with besteffort flag (4 queues) Test 7c05: Create CAKE_MQ with diffserv8 flag (4 queues) Test 5a77: Create CAKE_MQ with diffserv4 flag (4 queues) Test 8f7a: Create CAKE_MQ with flowblind flag (4 queues) Test 7ef7: Create CAKE_MQ with dsthost and nat flag (4 queues) Test 2e4d: Create CAKE_MQ with wash flag (4 queues) Test b3e6: Create CAKE_MQ with flowblind and no-split-gso flag (4 queues) Test 62cd: Create CAKE_MQ with dual-srchost and ack-filter flag (4 queues) Test 0df3: Create CAKE_MQ with dual-dsthost and ack-filter-aggressive flag (4 queues) Test 9a75: Create CAKE_MQ with memlimit and ptm flag (4 queues) Test cdef: Create CAKE_MQ with fwmark and atm flag (4 queues) Test 93dd: Create CAKE_MQ with overhead 0 and mpu (4 queues) Test 1475: Create CAKE_MQ with conservative and ingress flag (4 queues) Test 7bf1: Delete CAKE_MQ with conservative and ingress flag (4 queues) Test ee55: Replace CAKE_MQ with mpu (4 queues) Test 6df9: Change CAKE_MQ with mpu (4 queues) Test 67e2: Show CAKE_MQ class (4 queues) Test 2de4: Change bandwidth of CAKE_MQ (4 queues) Test 5f62: Fail to create CAKE_MQ with autorate-ingress flag (4 queues) Test 038e: Fail to change setting of sub-qdisc under CAKE_MQ Test 7bdc: Fail to replace sub-qdisc under CAKE_MQ Test 18e0: Fail to install CAKE_MQ on single queue device Reviewed-by: Victor Nogueira Signed-off-by: Jonas Köppeler Signed-off-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20260109-mq-cake-sub-qdisc-v8-6-8d613fece5d8@redhat.com Signed-off-by: Paolo Abeni --- .../tc-testing/tc-tests/qdiscs/cake_mq.json | 559 +++++++++++++++++++++ 1 file changed, 559 insertions(+) create mode 100644 tools/testing/selftests/tc-testing/tc-tests/qdiscs/cake_mq.json (limited to 'tools/testing') diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/cake_mq.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/cake_mq.json new file mode 100644 index 000000000000..0efe229fb86e --- /dev/null +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/cake_mq.json @@ -0,0 +1,559 @@ +[ + { + "id": "684b", + "name": "Create CAKE_MQ with default setting (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1\" > /sys/bus/netdevsim/del_device || true", + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "7ee8", + "name": "Create CAKE_MQ with bandwidth limit (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq bandwidth 1000", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth 1Kbit diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "1f87", + "name": "Create CAKE_MQ with rtt time (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq rtt 200", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 200us raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "e9cf", + "name": "Create CAKE_MQ with besteffort flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq besteffort", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited besteffort triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "7c05", + "name": "Create CAKE_MQ with diffserv8 flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq diffserv8", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv8 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "5a77", + "name": "Create CAKE_MQ with diffserv4 flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq diffserv4", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv4 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "8f7a", + "name": "Create CAKE_MQ with flowblind flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq flowblind", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 flowblind nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "7ef7", + "name": "Create CAKE_MQ with dsthost and nat flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq dsthost nat", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 dsthost nat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "2e4d", + "name": "Create CAKE_MQ with wash flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq hosts wash", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 hosts nonat wash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "b3e6", + "name": "Create CAKE_MQ with flowblind and no-split-gso flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq flowblind no-split-gso", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 flowblind nonat nowash no-ack-filter no-split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "62cd", + "name": "Create CAKE_MQ with dual-srchost and ack-filter flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq dual-srchost ack-filter", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 dual-srchost nonat nowash ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "0df3", + "name": "Create CAKE_MQ with dual-dsthost and ack-filter-aggressive flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq dual-dsthost ack-filter-aggressive", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 dual-dsthost nonat nowash ack-filter-aggressive split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "9a75", + "name": "Create CAKE_MQ with memlimit and ptm flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq memlimit 10000 ptm", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw ptm overhead 0 memlimit 10000b ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "cdef", + "name": "Create CAKE_MQ with fwmark and atm flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq fwmark 8 atm", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw atm overhead 0 fwmark 0x8 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "93dd", + "name": "Create CAKE_MQ with overhead 0 and mpu (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq overhead 128 mpu 256", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms noatm overhead 128 mpu 256 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "1475", + "name": "Create CAKE_MQ with conservative and ingress flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq conservative ingress", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash ingress no-ack-filter split-gso rtt 100ms atm overhead 48 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "7bf1", + "name": "Delete CAKE_MQ with conservative and ingress flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device", + "$TC qdisc add dev $ETH handle 1: root cake_mq conservative ingress" + ], + "cmdUnderTest": "$TC qdisc del dev $ETH handle 1: root", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash ingress no-ack-filter split-gso rtt 100ms atm overhead 48 ", + "matchCount": "0", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "ee55", + "name": "Replace CAKE_MQ with mpu (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device", + "$TC qdisc add dev $ETH handle 1: root cake_mq overhead 128 mpu 256" + ], + "cmdUnderTest": "$TC qdisc replace dev $ETH handle 1: root cake_mq mpu 128", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms noatm overhead 128 mpu 128 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "6df9", + "name": "Change CAKE_MQ with mpu (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device", + "$TC qdisc add dev $ETH handle 1: root cake_mq overhead 128 mpu 256" + ], + "cmdUnderTest": "$TC qdisc change dev $ETH handle 1: root cake_mq mpu 128", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms noatm overhead 128 mpu 128 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "67e2", + "name": "Show CAKE_MQ class (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq", + "expExitCode": "0", + "verifyCmd": "$TC class show dev $ETH", + "matchPattern": "class cake_mq", + "matchCount": "4", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "2de4", + "name": "Change bandwidth of CAKE_MQ (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device", + "$TC qdisc add dev $ETH handle 1: root cake_mq" + ], + "cmdUnderTest": "$TC qdisc replace dev $ETH handle 1: root cake_mq bandwidth 1000", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth 1Kbit diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "5f62", + "name": "Fail to create CAKE_MQ with autorate-ingress flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq autorate-ingress", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited autorate-ingress diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "0", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "038e", + "name": "Fail to change setting of sub-qdisc under CAKE_MQ", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device", + "$TC qdisc add dev $ETH handle 1: root cake_mq" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH parent 1:1 cake besteffort flows", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "7bdc", + "name": "Fail to replace sub-qdisc under CAKE_MQ", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device", + "$TC qdisc add dev $ETH handle 1: root cake_mq" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH parent 1:1 fq", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "18e0", + "name": "Fail to install CAKE_MQ on single queue device", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 1\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "0", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + } +] -- cgit v1.2.3 From 609e359ab904698f1e5aa0ab2fee2f4c29ee0886 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 23 Dec 2025 07:59:13 +0100 Subject: selftests: vDSO: vdso_config: Add configurations for clock_getres_time64() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some architectures will start to implement this function. Make sure that tests can be written for it. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20251223-vdso-compat-time32-v1-2-97ea7a06a543@linutronix.de --- tools/testing/selftests/vDSO/vdso_config.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/vDSO/vdso_config.h b/tools/testing/selftests/vDSO/vdso_config.h index 50c261005111..5da223731b81 100644 --- a/tools/testing/selftests/vDSO/vdso_config.h +++ b/tools/testing/selftests/vDSO/vdso_config.h @@ -66,7 +66,7 @@ static const char *versions[7] = { }; __attribute__((unused)) -static const char *names[2][7] = { +static const char *names[2][8] = { { "__kernel_gettimeofday", "__kernel_clock_gettime", @@ -75,6 +75,7 @@ static const char *names[2][7] = { "__kernel_getcpu", "__kernel_clock_gettime64", "__kernel_getrandom", + "__kernel_clock_getres_time64", }, { "__vdso_gettimeofday", @@ -84,6 +85,7 @@ static const char *names[2][7] = { "__vdso_getcpu", "__vdso_clock_gettime64", "__vdso_getrandom", + "__vdso_clock_getres_time64", }, }; -- cgit v1.2.3 From 1dcd1273add368c2b7c65135e22b416e1b374781 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 23 Dec 2025 07:59:14 +0100 Subject: selftests: vDSO: vdso_test_abi: Use UAPI system call numbers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SYS_clock_getres might have been redirected by libc to some other system call than the actual clock_getres. For testing it is required to use exactly this system call. Use the system call number exported by the UAPI headers which is always correct. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20251223-vdso-compat-time32-v1-3-97ea7a06a543@linutronix.de --- tools/testing/selftests/vDSO/vdso_test_abi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/vDSO/vdso_test_abi.c b/tools/testing/selftests/vDSO/vdso_test_abi.c index c620317eaeea..a75c12dcb0f1 100644 --- a/tools/testing/selftests/vDSO/vdso_test_abi.c +++ b/tools/testing/selftests/vDSO/vdso_test_abi.c @@ -179,7 +179,7 @@ static void vdso_test_clock_getres(clockid_t clk_id) clock_getres_fail++; } - ret = syscall(SYS_clock_getres, clk_id, &sys_ts); + ret = syscall(__NR_clock_getres, clk_id, &sys_ts); ksft_print_msg("The syscall resolution is %lld %lld\n", (long long)sys_ts.tv_sec, (long long)sys_ts.tv_nsec); -- cgit v1.2.3 From 4e6a2312986d437cc22805b9e08f86b15fee0318 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 23 Dec 2025 07:59:15 +0100 Subject: selftests: vDSO: vdso_test_abi: Add test for clock_getres_time64() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some architectures will start to implement this function. Make sure it works correctly. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20251223-vdso-compat-time32-v1-4-97ea7a06a543@linutronix.de --- tools/testing/selftests/vDSO/vdso_test_abi.c | 53 +++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/vDSO/vdso_test_abi.c b/tools/testing/selftests/vDSO/vdso_test_abi.c index a75c12dcb0f1..b162a4ba9c4f 100644 --- a/tools/testing/selftests/vDSO/vdso_test_abi.c +++ b/tools/testing/selftests/vDSO/vdso_test_abi.c @@ -36,6 +36,7 @@ typedef long (*vdso_gettimeofday_t)(struct timeval *tv, struct timezone *tz); typedef long (*vdso_clock_gettime_t)(clockid_t clk_id, struct timespec *ts); typedef long (*vdso_clock_gettime64_t)(clockid_t clk_id, struct vdso_timespec64 *ts); typedef long (*vdso_clock_getres_t)(clockid_t clk_id, struct timespec *ts); +typedef long (*vdso_clock_getres_time64_t)(clockid_t clk_id, struct vdso_timespec64 *ts); typedef time_t (*vdso_time_t)(time_t *t); static const char * const vdso_clock_name[] = { @@ -196,6 +197,55 @@ static void vdso_test_clock_getres(clockid_t clk_id) } } +#ifdef __NR_clock_getres_time64 +static void vdso_test_clock_getres_time64(clockid_t clk_id) +{ + int clock_getres_fail = 0; + + /* Find clock_getres. */ + vdso_clock_getres_time64_t vdso_clock_getres_time64 = + (vdso_clock_getres_time64_t)vdso_sym(version, name[7]); + + if (!vdso_clock_getres_time64) { + ksft_print_msg("Couldn't find %s\n", name[7]); + ksft_test_result_skip("%s %s\n", name[7], + vdso_clock_name[clk_id]); + return; + } + + struct vdso_timespec64 ts, sys_ts; + long ret = VDSO_CALL(vdso_clock_getres_time64, 2, clk_id, &ts); + + if (ret == 0) { + ksft_print_msg("The vdso resolution is %lld %lld\n", + (long long)ts.tv_sec, (long long)ts.tv_nsec); + } else { + clock_getres_fail++; + } + + ret = syscall(__NR_clock_getres_time64, clk_id, &sys_ts); + + ksft_print_msg("The syscall resolution is %lld %lld\n", + (long long)sys_ts.tv_sec, (long long)sys_ts.tv_nsec); + + if ((sys_ts.tv_sec != ts.tv_sec) || (sys_ts.tv_nsec != ts.tv_nsec)) + clock_getres_fail++; + + if (clock_getres_fail > 0) { + ksft_test_result_fail("%s %s\n", name[7], + vdso_clock_name[clk_id]); + } else { + ksft_test_result_pass("%s %s\n", name[7], + vdso_clock_name[clk_id]); + } +} +#else /* !__NR_clock_getres_time64 */ +static void vdso_test_clock_getres_time64(clockid_t clk_id) +{ + ksft_test_result_skip("%s %s\n", name[7], vdso_clock_name[clk_id]); +} +#endif /* __NR_clock_getres_time64 */ + /* * This function calls vdso_test_clock_gettime and vdso_test_clock_getres * with different values for clock_id. @@ -208,9 +258,10 @@ static inline void vdso_test_clock(clockid_t clock_id) vdso_test_clock_gettime64(clock_id); vdso_test_clock_getres(clock_id); + vdso_test_clock_getres_time64(clock_id); } -#define VDSO_TEST_PLAN 29 +#define VDSO_TEST_PLAN 38 int main(int argc, char **argv) { -- cgit v1.2.3 From 8441c7d3bd6c5a52ab2ecf77e43a5bf262004f5c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 7 Jan 2026 13:05:43 +0100 Subject: cxl: Check for invalid addresses returned from translation functions on errors Translation functions may return an invalid address in case of errors. If the address is not checked the further use of the invalid value will cause an address corruption. Consistently check for a valid address returned by translation functions. Use RESOURCE_SIZE_MAX to indicate an invalid address for type resource_size_t. Depending on the type either RESOURCE_SIZE_MAX or ULLONG_MAX is used to indicate an address error. Propagating an invalid address from a failed translation may cause userspace to think it has received a valid SPA, when in fact it is wrong. The CXL userspace API, using trace events, expects ULLONG_MAX to indicate a translation failure. If ULLONG_MAX is not returned immediately, subsequent calculations can transform that bad address into a different value (!ULLONG_MAX), and an invalid SPA may be returned to userspace. This can lead to incorrect diagnostics and erroneous corrective actions. [ dj: Added user impact statement from Alison. ] [ dj: Fixed checkpatch tab alignment issue. ] Reviewed-by: Dave Jiang Signed-off-by: Robert Richter Fixes: c3dd67681c70 ("cxl/region: Add inject and clear poison by region offset") Fixes: b78b9e7b7979 ("cxl/region: Refactor address translation funcs for testing") Reviewed-by: Alison Schofield Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260107120544.410993-1-rrichter@amd.com Signed-off-by: Dave Jiang --- drivers/cxl/core/hdm.c | 2 +- drivers/cxl/core/region.c | 34 ++++++++++++++++++++++++++-------- tools/testing/cxl/test/cxl_translate.c | 30 ++++++++++++++++++------------ 3 files changed, 45 insertions(+), 21 deletions(-) (limited to 'tools/testing') diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index a470099a69f1..eb5a3a7640c6 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -530,7 +530,7 @@ resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled) resource_size_t cxl_dpa_resource_start(struct cxl_endpoint_decoder *cxled) { - resource_size_t base = -1; + resource_size_t base = RESOURCE_SIZE_MAX; lockdep_assert_held(&cxl_rwsem.dpa); if (cxled->dpa_res) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index fc36a5413d3f..5bd1213737fa 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -3118,7 +3118,7 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); struct cxl_region_params *p = &cxlr->params; struct cxl_endpoint_decoder *cxled = NULL; - u64 dpa_offset, hpa_offset, hpa; + u64 base, dpa_offset, hpa_offset, hpa; u16 eig = 0; u8 eiw = 0; int pos; @@ -3136,8 +3136,14 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, ways_to_eiw(p->interleave_ways, &eiw); granularity_to_eig(p->interleave_granularity, &eig); - dpa_offset = dpa - cxl_dpa_resource_start(cxled); + base = cxl_dpa_resource_start(cxled); + if (base == RESOURCE_SIZE_MAX) + return ULLONG_MAX; + + dpa_offset = dpa - base; hpa_offset = cxl_calculate_hpa_offset(dpa_offset, pos, eiw, eig); + if (hpa_offset == ULLONG_MAX) + return ULLONG_MAX; /* Apply the hpa_offset to the region base address */ hpa = hpa_offset + p->res->start + p->cache_size; @@ -3146,6 +3152,9 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, if (cxlrd->ops.hpa_to_spa) hpa = cxlrd->ops.hpa_to_spa(cxlrd, hpa); + if (hpa == ULLONG_MAX) + return ULLONG_MAX; + if (!cxl_resource_contains_addr(p->res, hpa)) { dev_dbg(&cxlr->dev, "Addr trans fail: hpa 0x%llx not in region\n", hpa); @@ -3170,7 +3179,8 @@ static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset, struct cxl_region_params *p = &cxlr->params; struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); struct cxl_endpoint_decoder *cxled; - u64 hpa, hpa_offset, dpa_offset; + u64 hpa_offset = offset; + u64 dpa, dpa_offset; u16 eig = 0; u8 eiw = 0; int pos; @@ -3187,10 +3197,13 @@ static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset, * CXL HPA is assumed to equal SPA. */ if (cxlrd->ops.spa_to_hpa) { - hpa = cxlrd->ops.spa_to_hpa(cxlrd, p->res->start + offset); - hpa_offset = hpa - p->res->start; - } else { - hpa_offset = offset; + hpa_offset = cxlrd->ops.spa_to_hpa(cxlrd, p->res->start + offset); + if (hpa_offset == ULLONG_MAX) { + dev_dbg(&cxlr->dev, "HPA not found for %pr offset %#llx\n", + p->res, offset); + return -ENXIO; + } + hpa_offset -= p->res->start; } pos = cxl_calculate_position(hpa_offset, eiw, eig); @@ -3207,8 +3220,13 @@ static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset, cxled = p->targets[i]; if (cxled->pos != pos) continue; + + dpa = cxl_dpa_resource_start(cxled); + if (dpa != RESOURCE_SIZE_MAX) + dpa += dpa_offset; + result->cxlmd = cxled_to_memdev(cxled); - result->dpa = cxl_dpa_resource_start(cxled) + dpa_offset; + result->dpa = dpa; return 0; } diff --git a/tools/testing/cxl/test/cxl_translate.c b/tools/testing/cxl/test/cxl_translate.c index 2200ae21795c..16328b2112b2 100644 --- a/tools/testing/cxl/test/cxl_translate.c +++ b/tools/testing/cxl/test/cxl_translate.c @@ -68,6 +68,8 @@ static u64 to_hpa(u64 dpa_offset, int pos, u8 r_eiw, u16 r_eig, u8 hb_ways, /* Calculate base HPA offset from DPA and position */ hpa_offset = cxl_calculate_hpa_offset(dpa_offset, pos, r_eiw, r_eig); + if (hpa_offset == ULLONG_MAX) + return ULLONG_MAX; if (math == XOR_MATH) { cximsd->nr_maps = hbiw_to_nr_maps[hb_ways]; @@ -258,19 +260,23 @@ static int test_random_params(void) pos = get_random_u32() % ways; dpa = get_random_u64() >> 12; + reverse_dpa = ULLONG_MAX; + reverse_pos = -1; + hpa = cxl_calculate_hpa_offset(dpa, pos, eiw, eig); - reverse_dpa = cxl_calculate_dpa_offset(hpa, eiw, eig); - reverse_pos = cxl_calculate_position(hpa, eiw, eig); - - if (reverse_dpa != dpa || reverse_pos != pos) { - pr_err("test random iter %d FAIL hpa=%llu, dpa=%llu reverse_dpa=%llu, pos=%d reverse_pos=%d eiw=%u eig=%u\n", - i, hpa, dpa, reverse_dpa, pos, reverse_pos, eiw, - eig); - - if (failures++ > 10) { - pr_err("test random too many failures, stop\n"); - break; - } + if (hpa != ULLONG_MAX) { + reverse_dpa = cxl_calculate_dpa_offset(hpa, eiw, eig); + reverse_pos = cxl_calculate_position(hpa, eiw, eig); + if (reverse_dpa == dpa && reverse_pos == pos) + continue; + } + + pr_err("test random iter %d FAIL hpa=%llu, dpa=%llu reverse_dpa=%llu, pos=%d reverse_pos=%d eiw=%u eig=%u\n", + i, hpa, dpa, reverse_dpa, pos, reverse_pos, eiw, eig); + + if (failures++ > 10) { + pr_err("test random too many failures, stop\n"); + break; } } pr_info("..... test random: PASS %d FAIL %d\n", i - failures, failures); -- cgit v1.2.3 From 2465a08d433dd4ae0c4eecdb8e79c54b7c5e5a55 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 12 Jan 2026 22:10:23 -0800 Subject: selftests/bpf: Fix dmabuf_iter/lots_of_buffers failure with 64K page On arm64 with 64K page , I observed the following test failure: ... subtest_dmabuf_iter_check_lots_of_buffers:FAIL:total_bytes_read unexpected total_bytes_read: actual 4696 <= expected 65536 #97/3 dmabuf_iter/lots_of_buffers:FAIL With 4K page on x86, the total_bytes_read is 4593. With 64K page on arm64, the total_byte_read is 4696. In progs/dmabuf_iter.c, for each iteration, the output is BPF_SEQ_PRINTF(seq, "%lu\n%llu\n%s\n%s\n", inode, size, name, exporter); The only difference between 4K and 64K page is 'size' in the above BPF_SEQ_PRINTF. The 4K page will output '4096' and the 64K page will output '65536'. So the total_bytes_read with 64K page is slighter greater than 4K page. Adjusting the total_bytes_read from 65536 to 4096 fixed the issue. Cc: T.J. Mercier Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260113061023.3798085-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c b/tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c index e442be9dde7e..fb2cea710db3 100644 --- a/tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c @@ -233,7 +233,7 @@ static void subtest_dmabuf_iter_check_lots_of_buffers(struct dmabuf_iter *skel) while ((bytes_read = read(iter_fd, buf, sizeof(buf))) > 0) total_bytes_read += bytes_read; - ASSERT_GT(total_bytes_read, getpagesize(), "total_bytes_read"); + ASSERT_GT(total_bytes_read, 4096, "total_bytes_read"); close(iter_fd); } -- cgit v1.2.3 From d2f7cd20a7c7742b83d2c899044d4f2a851a4a7d Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 12 Jan 2026 22:10:28 -0800 Subject: selftests/bpf: Fix sk_bypass_prot_mem failure with 64K page The current selftest sk_bypass_prot_mem only supports 4K page. When running with 64K page on arm64, the following failure happens: ... check_bypass:FAIL:no bypass unexpected no bypass: actual 3 <= expected 32 ... #385/1 sk_bypass_prot_mem/TCP :FAIL ... check_bypass:FAIL:no bypass unexpected no bypass: actual 4 <= expected 32 ... #385/2 sk_bypass_prot_mem/UDP :FAIL ... Adding support to 64K page as well fixed the failure. Cc: Kuniyuki Iwashima Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260113061028.3798326-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/sk_bypass_prot_mem.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/sk_bypass_prot_mem.c b/tools/testing/selftests/bpf/prog_tests/sk_bypass_prot_mem.c index e4940583924b..e2c867fd5244 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_bypass_prot_mem.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_bypass_prot_mem.c @@ -5,9 +5,14 @@ #include "sk_bypass_prot_mem.skel.h" #include "network_helpers.h" +#ifndef PAGE_SIZE +#include +#define PAGE_SIZE getpagesize() +#endif + #define NR_PAGES 32 #define NR_SOCKETS 2 -#define BUF_TOTAL (NR_PAGES * 4096 / NR_SOCKETS) +#define BUF_TOTAL (NR_PAGES * PAGE_SIZE / NR_SOCKETS) #define BUF_SINGLE 1024 #define NR_SEND (BUF_TOTAL / BUF_SINGLE) -- cgit v1.2.3 From 951d79017e8a0af6db4a167a89746b4a0924626b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 12 Jan 2026 22:10:33 -0800 Subject: selftests/bpf: Fix verifier_arena_globals1 failure with 64K page With 64K page on arm64, verifier_arena_globals1 failed like below: ... libbpf: map 'arena': failed to create: -E2BIG ... #509/1 verifier_arena_globals1/check_reserve1:FAIL ... For 64K page, if the number of arena pages is (1UL << 20), the total memory will exceed 4G and this will cause map creation failure. Adjusting ARENA_PAGES based on the actual page size fixed the problem. Cc: Emil Tsalapatis Signed-off-by: Yonghong Song Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260113061033.3798549-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_arena_globals1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c b/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c index 14afef3d6442..83182ddbfb95 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c @@ -9,7 +9,7 @@ #include "bpf_arena_common.h" #include "bpf_misc.h" -#define ARENA_PAGES (1UL<< (32 - 12)) +#define ARENA_PAGES (1UL<< (32 - __builtin_ffs(__PAGE_SIZE) + 1)) #define GLOBAL_PAGES (16) struct { -- cgit v1.2.3 From 9160335317cb404f54ad2f509546c666ddd4d0eb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 12 Jan 2026 12:13:58 -0800 Subject: selftests/bpf: Add tests for s>>=31 and s>>=63 Add tests for special arithmetic shift right. Signed-off-by: Alexei Starovoitov Co-developed-by: Puranjay Mohan Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260112201424.816836-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/progs/verifier_subreg.c | 85 ++++++++++++++++++++++ 1 file changed, 85 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/progs/verifier_subreg.c b/tools/testing/selftests/bpf/progs/verifier_subreg.c index b3e1c3eef9ae..be328100ba53 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subreg.c +++ b/tools/testing/selftests/bpf/progs/verifier_subreg.c @@ -738,4 +738,89 @@ __naked void ldx_w_zero_extend_check(void) : __clobber_all); } +SEC("socket") +__success __success_unpriv __retval(0) +__naked void arsh_31_and(void) +{ + /* Below is what LLVM generates in cilium's bpf_wiregard.o */ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w2 = w0; \ + w2 s>>= 31; \ + w2 &= -134; /* w2 becomes 0 or -134 */ \ + if w2 s> -1 goto +2; \ + /* Branch always taken because w2 = -134 */ \ + if w2 != -136 goto +1; \ + w0 /= 0; \ + w0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__success __success_unpriv __retval(0) +__naked void arsh_63_and(void) +{ + /* Copy of arsh_31 with s/w/r/ */ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r2 = r0; \ + r2 <<= 32; \ + r2 s>>= 63; \ + r2 &= -134; \ + if r2 s> -1 goto +2; \ + /* Branch always taken because w2 = -134 */ \ + if r2 != -136 goto +1; \ + r0 /= 0; \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__success __success_unpriv __retval(0) +__naked void arsh_31_or(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w2 = w0; \ + w2 s>>= 31; \ + w2 |= 134; /* w2 becomes -1 or 134 */ \ + if w2 s> -1 goto +2; \ + /* Branch always taken because w2 = -1 */ \ + if w2 == -1 goto +1; \ + w0 /= 0; \ + w0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__success __success_unpriv __retval(0) +__naked void arsh_63_or(void) +{ + /* Copy of arsh_31 with s/w/r/ */ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r2 = r0; \ + r2 <<= 32; \ + r2 s>>= 63; \ + r2 |= 134; /* r2 becomes -1 or 134 */ \ + if r2 s> -1 goto +2; \ + /* Branch always taken because w2 = -1 */ \ + if r2 == -1 goto +1; \ + r0 /= 0; \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From c65182ef9df6bb96fd85b56a2bcdd18d64c4d3b5 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 12 Jan 2026 11:33:39 -0500 Subject: selftests: net: reduce txtimestamp deschedule flakes This test occasionally fails due to exceeding timing bounds, as run in continuous testing on netdev.bots: https://netdev.bots.linux.dev/contest.html?test=txtimestamp-sh A common pattern is a single elevated delay between USR and SND. # 8.36 [+0.00] test SND # 8.36 [+0.00] USR: 1767864384 s 240994 us (seq=0, len=0) # 8.44 [+0.08] ERROR: 18461 us expected between 10000 and 18000 # 8.44 [+0.00] SND: 1767864384 s 259455 us (seq=42, len=10) (USR +18460 us) # 8.52 [+0.07] SND: 1767864384 s 339523 us (seq=42, len=10) (USR +10005 us) # 8.52 [+0.00] USR: 1767864384 s 409580 us (seq=0, len=0) # 8.60 [+0.08] SND: 1767864384 s 419586 us (seq=42, len=10) (USR +10005 us) # 8.60 [+0.00] USR: 1767864384 s 489645 us (seq=0, len=0) # 8.68 [+0.08] SND: 1767864384 s 499651 us (seq=42, len=10) (USR +10005 us) # 8.68 [+0.00] USR-SND: count=4, avg=12119 us, min=10005 us, max=18460 us (Note that other delays are nowhere near the large 8ms tolerance.) One hypothesis is that the task is descheduled between taking the USR timestamp and sending the packet. Possibly in printing. Delay taking the timestamp closer to sendmsg, and delay printing until after sendmsg. With this change, failure rate is significantly lower in current runs. Link: https://lore.kernel.org/netdev/20260107110521.1aab55e9@kernel.org/ Suggested-by: Jakub Kicinski Signed-off-by: Willem de Bruijn Link: https://patch.msgid.link/20260112163355.3510150-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/txtimestamp.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/txtimestamp.c b/tools/testing/selftests/net/txtimestamp.c index bcc14688661d..170be192f5c7 100644 --- a/tools/testing/selftests/net/txtimestamp.c +++ b/tools/testing/selftests/net/txtimestamp.c @@ -206,12 +206,10 @@ static void __print_timestamp(const char *name, struct timespec *cur, fprintf(stderr, "\n"); } -static void print_timestamp_usr(void) +static void record_timestamp_usr(void) { if (clock_gettime(CLOCK_REALTIME, &ts_usr)) error(1, errno, "clock_gettime"); - - __print_timestamp(" USR", &ts_usr, 0, 0); } static void print_timestamp(struct scm_timestamping *tss, int tstype, @@ -599,8 +597,6 @@ static void do_test(int family, unsigned int report_opt) fill_header_udp(buf + off, family == PF_INET); } - print_timestamp_usr(); - iov.iov_base = buf; iov.iov_len = total_len; @@ -655,10 +651,14 @@ static void do_test(int family, unsigned int report_opt) } + record_timestamp_usr(); + val = sendmsg(fd, &msg, 0); if (val != total_len) error(1, errno, "send"); + __print_timestamp(" USR", &ts_usr, 0, 0); + /* wait for all errors to be queued, else ACKs arrive OOO */ if (cfg_sleep_usec) usleep(cfg_sleep_usec); -- cgit v1.2.3 From a3acd7d43462a7f7429301afad3c0059276f427e Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Fri, 9 Jan 2026 20:59:54 +0800 Subject: selftests/bpf: Add test cases for btf__permute functionality This patch introduces test cases for the btf__permute function to ensure it works correctly with both base BTF and split BTF scenarios. The test suite includes: - test_permute_base: Validates permutation on base BTF - test_permute_split: Tests permutation on split BTF Signed-off-by: Donglin Peng Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20260109130003.3313716-3-dolinux.peng@gmail.com --- .../testing/selftests/bpf/prog_tests/btf_permute.c | 244 +++++++++++++++++++++ 1 file changed, 244 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/btf_permute.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/btf_permute.c b/tools/testing/selftests/bpf/prog_tests/btf_permute.c new file mode 100644 index 000000000000..04ade5ad77ac --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/btf_permute.c @@ -0,0 +1,244 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Xiaomi */ + +#include +#include +#include "btf_helpers.h" + +static void permute_base_check(struct btf *btf) +{ + VALIDATE_RAW_BTF( + btf, + "[1] STRUCT 's2' size=4 vlen=1\n" + "\t'm' type_id=4 bits_offset=0", + "[2] FUNC 'f' type_id=6 linkage=static", + "[3] PTR '(anon)' type_id=4", + "[4] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[5] STRUCT 's1' size=4 vlen=1\n" + "\t'm' type_id=4 bits_offset=0", + "[6] FUNC_PROTO '(anon)' ret_type_id=4 vlen=1\n" + "\t'p' type_id=3"); +} + +/* Ensure btf__permute works as expected in the base-BTF scenario */ +static void test_permute_base(void) +{ + struct btf *btf; + __u32 permute_ids[7]; + int err; + + btf = btf__new_empty(); + if (!ASSERT_OK_PTR(btf, "empty_main_btf")) + return; + + btf__add_int(btf, "int", 4, BTF_INT_SIGNED); /* [1] int */ + btf__add_ptr(btf, 1); /* [2] ptr to int */ + btf__add_struct(btf, "s1", 4); /* [3] struct s1 { */ + btf__add_field(btf, "m", 1, 0, 0); /* int m; */ + /* } */ + btf__add_struct(btf, "s2", 4); /* [4] struct s2 { */ + btf__add_field(btf, "m", 1, 0, 0); /* int m; */ + /* } */ + btf__add_func_proto(btf, 1); /* [5] int (*)(int *p); */ + btf__add_func_param(btf, "p", 2); + btf__add_func(btf, "f", BTF_FUNC_STATIC, 5); /* [6] int f(int *p); */ + + VALIDATE_RAW_BTF( + btf, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1", + "[3] STRUCT 's1' size=4 vlen=1\n" + "\t'm' type_id=1 bits_offset=0", + "[4] STRUCT 's2' size=4 vlen=1\n" + "\t'm' type_id=1 bits_offset=0", + "[5] FUNC_PROTO '(anon)' ret_type_id=1 vlen=1\n" + "\t'p' type_id=2", + "[6] FUNC 'f' type_id=5 linkage=static"); + + permute_ids[0] = 0; /* [0] -> [0] */ + permute_ids[1] = 4; /* [1] -> [4] */ + permute_ids[2] = 3; /* [2] -> [3] */ + permute_ids[3] = 5; /* [3] -> [5] */ + permute_ids[4] = 1; /* [4] -> [1] */ + permute_ids[5] = 6; /* [5] -> [6] */ + permute_ids[6] = 2; /* [6] -> [2] */ + err = btf__permute(btf, permute_ids, ARRAY_SIZE(permute_ids), NULL); + if (!ASSERT_OK(err, "btf__permute_base")) + goto done; + permute_base_check(btf); + + /* ids[0] must be 0 for base BTF */ + permute_ids[0] = 4; /* [0] -> [0] */ + permute_ids[1] = 0; /* [1] -> [4] */ + permute_ids[2] = 3; /* [2] -> [3] */ + permute_ids[3] = 5; /* [3] -> [5] */ + permute_ids[4] = 1; /* [4] -> [1] */ + permute_ids[5] = 6; /* [5] -> [6] */ + permute_ids[6] = 2; /* [6] -> [2] */ + err = btf__permute(btf, permute_ids, ARRAY_SIZE(permute_ids), NULL); + if (!ASSERT_ERR(err, "btf__permute_base")) + goto done; + /* BTF is not modified */ + permute_base_check(btf); + + /* id_map_cnt is invalid */ + permute_ids[0] = 0; /* [0] -> [0] */ + permute_ids[1] = 4; /* [1] -> [4] */ + permute_ids[2] = 3; /* [2] -> [3] */ + permute_ids[3] = 5; /* [3] -> [5] */ + permute_ids[4] = 1; /* [4] -> [1] */ + permute_ids[5] = 6; /* [5] -> [6] */ + permute_ids[6] = 2; /* [6] -> [2] */ + err = btf__permute(btf, permute_ids, ARRAY_SIZE(permute_ids) - 1, NULL); + if (!ASSERT_ERR(err, "btf__permute_base")) + goto done; + /* BTF is not modified */ + permute_base_check(btf); + + /* Multiple types can not be mapped to the same ID */ + permute_ids[0] = 0; + permute_ids[1] = 4; + permute_ids[2] = 4; + permute_ids[3] = 5; + permute_ids[4] = 1; + permute_ids[5] = 6; + permute_ids[6] = 2; + err = btf__permute(btf, permute_ids, ARRAY_SIZE(permute_ids), NULL); + if (!ASSERT_ERR(err, "btf__permute_base")) + goto done; + /* BTF is not modified */ + permute_base_check(btf); + + /* Type ID must be valid */ + permute_ids[0] = 0; + permute_ids[1] = 4; + permute_ids[2] = 3; + permute_ids[3] = 5; + permute_ids[4] = 1; + permute_ids[5] = 7; + permute_ids[6] = 2; + err = btf__permute(btf, permute_ids, ARRAY_SIZE(permute_ids), NULL); + if (!ASSERT_ERR(err, "btf__permute_base")) + goto done; + /* BTF is not modified */ + permute_base_check(btf); + +done: + btf__free(btf); +} + +static void permute_split_check(struct btf *btf) +{ + VALIDATE_RAW_BTF( + btf, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1", + "[3] STRUCT 's2' size=4 vlen=1\n" + "\t'm' type_id=1 bits_offset=0", + "[4] FUNC 'f' type_id=5 linkage=static", + "[5] FUNC_PROTO '(anon)' ret_type_id=1 vlen=1\n" + "\t'p' type_id=2", + "[6] STRUCT 's1' size=4 vlen=1\n" + "\t'm' type_id=1 bits_offset=0"); +} + +/* Ensure btf__permute works as expected in the split-BTF scenario */ +static void test_permute_split(void) +{ + struct btf *split_btf = NULL, *base_btf = NULL; + __u32 permute_ids[4]; + int err, start_id; + + base_btf = btf__new_empty(); + if (!ASSERT_OK_PTR(base_btf, "empty_main_btf")) + return; + + btf__add_int(base_btf, "int", 4, BTF_INT_SIGNED); /* [1] int */ + btf__add_ptr(base_btf, 1); /* [2] ptr to int */ + VALIDATE_RAW_BTF( + base_btf, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1"); + split_btf = btf__new_empty_split(base_btf); + if (!ASSERT_OK_PTR(split_btf, "empty_split_btf")) + goto cleanup; + btf__add_struct(split_btf, "s1", 4); /* [3] struct s1 { */ + btf__add_field(split_btf, "m", 1, 0, 0); /* int m; */ + /* } */ + btf__add_struct(split_btf, "s2", 4); /* [4] struct s2 { */ + btf__add_field(split_btf, "m", 1, 0, 0); /* int m; */ + /* } */ + btf__add_func_proto(split_btf, 1); /* [5] int (*)(int p); */ + btf__add_func_param(split_btf, "p", 2); + btf__add_func(split_btf, "f", BTF_FUNC_STATIC, 5); /* [6] int f(int *p); */ + + VALIDATE_RAW_BTF( + split_btf, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1", + "[3] STRUCT 's1' size=4 vlen=1\n" + "\t'm' type_id=1 bits_offset=0", + "[4] STRUCT 's2' size=4 vlen=1\n" + "\t'm' type_id=1 bits_offset=0", + "[5] FUNC_PROTO '(anon)' ret_type_id=1 vlen=1\n" + "\t'p' type_id=2", + "[6] FUNC 'f' type_id=5 linkage=static"); + + start_id = btf__type_cnt(base_btf); + permute_ids[3 - start_id] = 6; /* [3] -> [6] */ + permute_ids[4 - start_id] = 3; /* [4] -> [3] */ + permute_ids[5 - start_id] = 5; /* [5] -> [5] */ + permute_ids[6 - start_id] = 4; /* [6] -> [4] */ + err = btf__permute(split_btf, permute_ids, ARRAY_SIZE(permute_ids), NULL); + if (!ASSERT_OK(err, "btf__permute_split")) + goto cleanup; + permute_split_check(split_btf); + + /* + * For split BTF, id_map_cnt must equal to the number of types + * added on top of base BTF + */ + permute_ids[3 - start_id] = 4; + permute_ids[4 - start_id] = 3; + permute_ids[5 - start_id] = 5; + permute_ids[6 - start_id] = 6; + err = btf__permute(split_btf, permute_ids, ARRAY_SIZE(permute_ids) - 1, NULL); + if (!ASSERT_ERR(err, "btf__permute_split")) + goto cleanup; + /* BTF is not modified */ + permute_split_check(split_btf); + + /* Multiple types can not be mapped to the same ID */ + permute_ids[3 - start_id] = 4; + permute_ids[4 - start_id] = 3; + permute_ids[5 - start_id] = 3; + permute_ids[6 - start_id] = 6; + err = btf__permute(split_btf, permute_ids, ARRAY_SIZE(permute_ids), NULL); + if (!ASSERT_ERR(err, "btf__permute_split")) + goto cleanup; + /* BTF is not modified */ + permute_split_check(split_btf); + + /* Can not map to base ID */ + permute_ids[3 - start_id] = 4; + permute_ids[4 - start_id] = 2; + permute_ids[5 - start_id] = 5; + permute_ids[6 - start_id] = 6; + err = btf__permute(split_btf, permute_ids, ARRAY_SIZE(permute_ids), NULL); + if (!ASSERT_ERR(err, "btf__permute_split")) + goto cleanup; + /* BTF is not modified */ + permute_split_check(split_btf); + +cleanup: + btf__free(split_btf); + btf__free(base_btf); +} + +void test_btf_permute(void) +{ + if (test__start_subtest("permute_base")) + test_permute_base(); + if (test__start_subtest("permute_split")) + test_permute_split(); +} -- cgit v1.2.3 From c3a9a27c79e4e5d8bdb20a26d16230111207e98e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 8 Jan 2026 19:45:25 -0800 Subject: KVM: selftests: Add a test to verify APICv updates (while L2 is active) Add a test to verify KVM correctly handles a variety of edge cases related to APICv updates, and in particular updates that are triggered while L2 is actively running. Reviewed-by: Chao Gao Link: https://patch.msgid.link/20260109034532.1012993-2-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 1 + tools/testing/selftests/kvm/include/x86/apic.h | 4 + .../selftests/kvm/x86/vmx_apicv_updates_test.c | 155 +++++++++++++++++++++ 3 files changed, 160 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86/vmx_apicv_updates_test.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index ba5c2b643efa..6f00bd8271c2 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -115,6 +115,7 @@ TEST_GEN_PROGS_x86 += x86/ucna_injection_test TEST_GEN_PROGS_x86 += x86/userspace_io_test TEST_GEN_PROGS_x86 += x86/userspace_msr_exit_test TEST_GEN_PROGS_x86 += x86/vmx_apic_access_test +TEST_GEN_PROGS_x86 += x86/vmx_apicv_updates_test TEST_GEN_PROGS_x86 += x86/vmx_dirty_log_test TEST_GEN_PROGS_x86 += x86/vmx_exception_with_invalid_guest_state TEST_GEN_PROGS_x86 += x86/vmx_msrs_test diff --git a/tools/testing/selftests/kvm/include/x86/apic.h b/tools/testing/selftests/kvm/include/x86/apic.h index 80fe9f69b38d..d42a0998d868 100644 --- a/tools/testing/selftests/kvm/include/x86/apic.h +++ b/tools/testing/selftests/kvm/include/x86/apic.h @@ -32,6 +32,7 @@ #define APIC_SPIV 0xF0 #define APIC_SPIV_FOCUS_DISABLED (1 << 9) #define APIC_SPIV_APIC_ENABLED (1 << 8) +#define APIC_ISR 0x100 #define APIC_IRR 0x200 #define APIC_ICR 0x300 #define APIC_LVTCMCI 0x2f0 @@ -68,6 +69,9 @@ #define APIC_TMCCT 0x390 #define APIC_TDCR 0x3E0 +#define APIC_VECTOR_TO_BIT_NUMBER(v) ((unsigned int)(v) % 32) +#define APIC_VECTOR_TO_REG_OFFSET(v) ((unsigned int)(v) / 32 * 0x10) + void apic_disable(void); void xapic_enable(void); void x2apic_enable(void); diff --git a/tools/testing/selftests/kvm/x86/vmx_apicv_updates_test.c b/tools/testing/selftests/kvm/x86/vmx_apicv_updates_test.c new file mode 100644 index 000000000000..337c53fddeff --- /dev/null +++ b/tools/testing/selftests/kvm/x86/vmx_apicv_updates_test.c @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#define GOOD_IPI_VECTOR 0xe0 +#define BAD_IPI_VECTOR 0xf0 + +static volatile int good_ipis_received; + +static void good_ipi_handler(struct ex_regs *regs) +{ + good_ipis_received++; +} + +static void bad_ipi_handler(struct ex_regs *regs) +{ + GUEST_FAIL("Received \"bad\" IPI; ICR MMIO write should have been ignored"); +} + +static void l2_guest_code(void) +{ + x2apic_enable(); + vmcall(); + + xapic_enable(); + xapic_write_reg(APIC_ID, 1 << 24); + vmcall(); +} + +static void l1_guest_code(struct vmx_pages *vmx_pages) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uint32_t control; + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + /* Prepare the VMCS for L2 execution. */ + prepare_vmcs(vmx_pages, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); + control |= CPU_BASED_USE_MSR_BITMAPS; + vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); + + /* Modify APIC ID to coerce KVM into inhibiting APICv. */ + xapic_enable(); + xapic_write_reg(APIC_ID, 1 << 24); + + /* + * Generate+receive an IRQ without doing EOI to get an IRQ set in vISR + * but not SVI. APICv should be inhibited due to running with a + * modified APIC ID. + */ + xapic_write_reg(APIC_ICR, APIC_DEST_SELF | APIC_DM_FIXED | GOOD_IPI_VECTOR); + GUEST_ASSERT_EQ(xapic_read_reg(APIC_ID), 1 << 24); + + /* Enable IRQs and verify the IRQ was received. */ + sti_nop(); + GUEST_ASSERT_EQ(good_ipis_received, 1); + + /* + * Run L2 to switch to x2APIC mode, which in turn will uninhibit APICv, + * as KVM should force the APIC ID back to its default. + */ + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + vmwrite(GUEST_RIP, vmreadz(GUEST_RIP) + vmreadz(VM_EXIT_INSTRUCTION_LEN)); + GUEST_ASSERT(rdmsr(MSR_IA32_APICBASE) & MSR_IA32_APICBASE_EXTD); + + /* + * Scribble the APIC access page to verify KVM disabled xAPIC + * virtualization in vmcs01, and to verify that KVM flushes L1's TLB + * when L2 switches back to accelerated xAPIC mode. + */ + xapic_write_reg(APIC_ICR2, 0xdeadbeefu); + xapic_write_reg(APIC_ICR, APIC_DEST_SELF | APIC_DM_FIXED | BAD_IPI_VECTOR); + + /* + * Verify the IRQ is still in-service and emit an EOI to verify KVM + * propagates the highest vISR vector to SVI when APICv is activated + * (and does so even if APICv was uninhibited while L2 was active). + */ + GUEST_ASSERT_EQ(x2apic_read_reg(APIC_ISR + APIC_VECTOR_TO_REG_OFFSET(GOOD_IPI_VECTOR)), + BIT(APIC_VECTOR_TO_BIT_NUMBER(GOOD_IPI_VECTOR))); + x2apic_write_reg(APIC_EOI, 0); + GUEST_ASSERT_EQ(x2apic_read_reg(APIC_ISR + APIC_VECTOR_TO_REG_OFFSET(GOOD_IPI_VECTOR)), 0); + + /* + * Run L2 one more time to switch back to xAPIC mode to verify that KVM + * handles the x2APIC => xAPIC transition and inhibits APICv while L2 + * is active. + */ + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + GUEST_ASSERT(!(rdmsr(MSR_IA32_APICBASE) & MSR_IA32_APICBASE_EXTD)); + + xapic_write_reg(APIC_ICR, APIC_DEST_SELF | APIC_DM_FIXED | GOOD_IPI_VECTOR); + /* Re-enable IRQs, as VM-Exit clears RFLAGS.IF. */ + sti_nop(); + GUEST_ASSERT_EQ(good_ipis_received, 2); + + GUEST_ASSERT_EQ(xapic_read_reg(APIC_ISR + APIC_VECTOR_TO_REG_OFFSET(GOOD_IPI_VECTOR)), + BIT(APIC_VECTOR_TO_BIT_NUMBER(GOOD_IPI_VECTOR))); + xapic_write_reg(APIC_EOI, 0); + GUEST_ASSERT_EQ(xapic_read_reg(APIC_ISR + APIC_VECTOR_TO_REG_OFFSET(GOOD_IPI_VECTOR)), 0); + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t vmx_pages_gva; + struct vmx_pages *vmx; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + + vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); + prepare_virtualize_apic_accesses(vmx, vm); + vcpu_args_set(vcpu, 1, vmx_pages_gva); + + virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); + vm_install_exception_handler(vm, BAD_IPI_VECTOR, bad_ipi_handler); + vm_install_exception_handler(vm, GOOD_IPI_VECTOR, good_ipi_handler); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_DONE: + break; + default: + TEST_FAIL("Unexpected ucall %lu", uc.cmd); + } + + /* + * Verify at least two IRQs were injected. Unfortunately, KVM counts + * re-injected IRQs (e.g. if delivering the IRQ hits an EPT violation), + * so being more precise isn't possible given the current stats. + */ + TEST_ASSERT(vcpu_get_stat(vcpu, irq_injections) >= 2, + "Wanted at least 2 IRQ injections, got %lu\n", + vcpu_get_stat(vcpu, irq_injections)); + + kvm_vm_free(vm); + return 0; +} -- cgit v1.2.3 From d7507a94a07202234236d7f94bed6015ca645ae6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 13:13:44 -0800 Subject: KVM: SVM: Treat exit_code as an unsigned 64-bit value through all of KVM Fix KVM's long-standing buggy handling of SVM's exit_code as a 32-bit value. Per the APM and Xen commit d1bd157fbc ("Big merge the HVM full-virtualisation abstractions.") (which is arguably more trustworthy than KVM), offset 0x70 is a single 64-bit value: 070h 63:0 EXITCODE Track exit_code as a single u64 to prevent reintroducing bugs where KVM neglects to correctly set bits 63:32. Fixes: 6aa8b732ca01 ("[PATCH] kvm: userspace interface") Cc: Jim Mattson Cc: Yosry Ahmed Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251230211347.4099600-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/svm.h | 3 +- arch/x86/include/uapi/asm/svm.h | 32 +++++++++---------- arch/x86/kvm/svm/hyperv.c | 1 - arch/x86/kvm/svm/nested.c | 13 ++------ arch/x86/kvm/svm/sev.c | 36 ++++++++-------------- arch/x86/kvm/svm/svm.c | 7 ++--- arch/x86/kvm/svm/svm.h | 4 +-- arch/x86/kvm/trace.h | 6 ++-- include/hyperv/hvgdk.h | 2 +- tools/testing/selftests/kvm/include/x86/svm.h | 3 +- .../kvm/x86/svm_nested_soft_inject_test.c | 4 +-- 11 files changed, 42 insertions(+), 69 deletions(-) (limited to 'tools/testing') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 50ece197c98a..edde36097ddc 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -137,8 +137,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u32 int_vector; u32 int_state; u8 reserved_3[4]; - u32 exit_code; - u32 exit_code_hi; + u64 exit_code; u64 exit_info_1; u64 exit_info_2; u32 exit_int_info; diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index 650e3256ea7d..010a45c9f614 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -103,38 +103,38 @@ #define SVM_EXIT_VMGEXIT 0x403 /* SEV-ES software-defined VMGEXIT events */ -#define SVM_VMGEXIT_MMIO_READ 0x80000001 -#define SVM_VMGEXIT_MMIO_WRITE 0x80000002 -#define SVM_VMGEXIT_NMI_COMPLETE 0x80000003 -#define SVM_VMGEXIT_AP_HLT_LOOP 0x80000004 -#define SVM_VMGEXIT_AP_JUMP_TABLE 0x80000005 +#define SVM_VMGEXIT_MMIO_READ 0x80000001ull +#define SVM_VMGEXIT_MMIO_WRITE 0x80000002ull +#define SVM_VMGEXIT_NMI_COMPLETE 0x80000003ull +#define SVM_VMGEXIT_AP_HLT_LOOP 0x80000004ull +#define SVM_VMGEXIT_AP_JUMP_TABLE 0x80000005ull #define SVM_VMGEXIT_SET_AP_JUMP_TABLE 0 #define SVM_VMGEXIT_GET_AP_JUMP_TABLE 1 -#define SVM_VMGEXIT_PSC 0x80000010 -#define SVM_VMGEXIT_GUEST_REQUEST 0x80000011 -#define SVM_VMGEXIT_EXT_GUEST_REQUEST 0x80000012 -#define SVM_VMGEXIT_AP_CREATION 0x80000013 +#define SVM_VMGEXIT_PSC 0x80000010ull +#define SVM_VMGEXIT_GUEST_REQUEST 0x80000011ull +#define SVM_VMGEXIT_EXT_GUEST_REQUEST 0x80000012ull +#define SVM_VMGEXIT_AP_CREATION 0x80000013ull #define SVM_VMGEXIT_AP_CREATE_ON_INIT 0 #define SVM_VMGEXIT_AP_CREATE 1 #define SVM_VMGEXIT_AP_DESTROY 2 -#define SVM_VMGEXIT_SNP_RUN_VMPL 0x80000018 -#define SVM_VMGEXIT_SAVIC 0x8000001a +#define SVM_VMGEXIT_SNP_RUN_VMPL 0x80000018ull +#define SVM_VMGEXIT_SAVIC 0x8000001aull #define SVM_VMGEXIT_SAVIC_REGISTER_GPA 0 #define SVM_VMGEXIT_SAVIC_UNREGISTER_GPA 1 #define SVM_VMGEXIT_SAVIC_SELF_GPA ~0ULL -#define SVM_VMGEXIT_HV_FEATURES 0x8000fffd -#define SVM_VMGEXIT_TERM_REQUEST 0x8000fffe +#define SVM_VMGEXIT_HV_FEATURES 0x8000fffdull +#define SVM_VMGEXIT_TERM_REQUEST 0x8000fffeull #define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code) \ /* SW_EXITINFO1[3:0] */ \ (((((u64)reason_set) & 0xf)) | \ /* SW_EXITINFO1[11:4] */ \ ((((u64)reason_code) & 0xff) << 4)) -#define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff +#define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffffull /* Exit code reserved for hypervisor/software use */ -#define SVM_EXIT_SW 0xf0000000 +#define SVM_EXIT_SW 0xf0000000ull -#define SVM_EXIT_ERR -1 +#define SVM_EXIT_ERR -1ull #define SVM_EXIT_REASONS \ { SVM_EXIT_READ_CR0, "read_cr0" }, \ diff --git a/arch/x86/kvm/svm/hyperv.c b/arch/x86/kvm/svm/hyperv.c index 088f6429b24c..3ec580d687f5 100644 --- a/arch/x86/kvm/svm/hyperv.c +++ b/arch/x86/kvm/svm/hyperv.c @@ -11,7 +11,6 @@ void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.exit_code = HV_SVM_EXITCODE_ENL; - svm->vmcb->control.exit_code_hi = 0; svm->vmcb->control.exit_info_1 = HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH; svm->vmcb->control.exit_info_2 = 0; nested_svm_vmexit(svm); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 666b5a36c15d..5aa0512e09c9 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -45,7 +45,6 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, * correctly fill in the high bits of exit_info_1. */ vmcb->control.exit_code = SVM_EXIT_NPF; - vmcb->control.exit_code_hi = 0; vmcb->control.exit_info_1 = (1ULL << 32); vmcb->control.exit_info_2 = fault->address; } @@ -441,7 +440,6 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu, to->int_vector = from->int_vector; to->int_state = from->int_state; to->exit_code = from->exit_code; - to->exit_code_hi = from->exit_code_hi; to->exit_info_1 = from->exit_info_1; to->exit_info_2 = from->exit_info_2; to->exit_int_info = from->exit_int_info; @@ -747,8 +745,8 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, enter_guest_mode(vcpu); /* - * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2, - * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes. + * Filled at exit: exit_code, exit_info_1, exit_info_2, exit_int_info, + * exit_int_info_err, next_rip, insn_len, insn_bytes. */ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VGIF) && @@ -1018,7 +1016,6 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) if (!nested_vmcb_check_save(vcpu) || !nested_vmcb_check_controls(vcpu)) { vmcb12->control.exit_code = SVM_EXIT_ERR; - vmcb12->control.exit_code_hi = -1u; vmcb12->control.exit_info_1 = 0; vmcb12->control.exit_info_2 = 0; goto out; @@ -1051,7 +1048,6 @@ out_exit_err: svm->soft_int_injected = false; svm->vmcb->control.exit_code = SVM_EXIT_ERR; - svm->vmcb->control.exit_code_hi = -1u; svm->vmcb->control.exit_info_1 = 0; svm->vmcb->control.exit_info_2 = 0; @@ -1163,7 +1159,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.int_state = vmcb02->control.int_state; vmcb12->control.exit_code = vmcb02->control.exit_code; - vmcb12->control.exit_code_hi = vmcb02->control.exit_code_hi; vmcb12->control.exit_info_1 = vmcb02->control.exit_info_1; vmcb12->control.exit_info_2 = vmcb02->control.exit_info_2; @@ -1460,7 +1455,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm) static int nested_svm_intercept(struct vcpu_svm *svm) { - u32 exit_code = svm->vmcb->control.exit_code; + u64 exit_code = svm->vmcb->control.exit_code; int vmexit = NESTED_EXIT_HOST; if (svm_is_vmrun_failure(exit_code)) @@ -1532,7 +1527,6 @@ static void nested_svm_inject_exception_vmexit(struct kvm_vcpu *vcpu) struct vmcb *vmcb = svm->vmcb; vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + ex->vector; - vmcb->control.exit_code_hi = 0; if (ex->has_error_code) vmcb->control.exit_info_1 = ex->error_code; @@ -1708,7 +1702,6 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst, dst->int_vector = from->int_vector; dst->int_state = from->int_state; dst->exit_code = from->exit_code; - dst->exit_code_hi = from->exit_code_hi; dst->exit_info_1 = from->exit_info_1; dst->exit_info_2 = from->exit_info_2; dst->exit_int_info = from->exit_int_info; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 28150506b18c..f67525007089 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3270,11 +3270,6 @@ skip_vmsa_free: kvfree(svm->sev_es.ghcb_sa); } -static u64 kvm_get_cached_sw_exit_code(struct vmcb_control_area *control) -{ - return (((u64)control->exit_code_hi) << 32) | control->exit_code; -} - static void dump_ghcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -3296,7 +3291,7 @@ static void dump_ghcb(struct vcpu_svm *svm) */ pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code", - kvm_get_cached_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm)); + control->exit_code, kvm_ghcb_sw_exit_code_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1", control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2", @@ -3330,7 +3325,6 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) struct vmcb_control_area *control = &svm->vmcb->control; struct kvm_vcpu *vcpu = &svm->vcpu; struct ghcb *ghcb = svm->sev_es.ghcb; - u64 exit_code; /* * The GHCB protocol so far allows for the following data @@ -3364,9 +3358,7 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) __kvm_emulate_msr_write(vcpu, MSR_IA32_XSS, kvm_ghcb_get_xss(svm)); /* Copy the GHCB exit information into the VMCB fields */ - exit_code = kvm_ghcb_get_sw_exit_code(svm); - control->exit_code = lower_32_bits(exit_code); - control->exit_code_hi = upper_32_bits(exit_code); + control->exit_code = kvm_ghcb_get_sw_exit_code(svm); control->exit_info_1 = kvm_ghcb_get_sw_exit_info_1(svm); control->exit_info_2 = kvm_ghcb_get_sw_exit_info_2(svm); svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm); @@ -3379,15 +3371,8 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; struct kvm_vcpu *vcpu = &svm->vcpu; - u64 exit_code; u64 reason; - /* - * Retrieve the exit code now even though it may not be marked valid - * as it could help with debugging. - */ - exit_code = kvm_get_cached_sw_exit_code(control); - /* Only GHCB Usage code 0 is supported */ if (svm->sev_es.ghcb->ghcb_usage) { reason = GHCB_ERR_INVALID_USAGE; @@ -3401,7 +3386,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) !kvm_ghcb_sw_exit_info_2_is_valid(svm)) goto vmgexit_err; - switch (exit_code) { + switch (control->exit_code) { case SVM_EXIT_READ_DR7: break; case SVM_EXIT_WRITE_DR7: @@ -3502,15 +3487,19 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) return 0; vmgexit_err: + /* + * Print the exit code even though it may not be marked valid as it + * could help with debugging. + */ if (reason == GHCB_ERR_INVALID_USAGE) { vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n", svm->sev_es.ghcb->ghcb_usage); } else if (reason == GHCB_ERR_INVALID_EVENT) { vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n", - exit_code); + control->exit_code); } else { vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n", - exit_code); + control->exit_code); dump_ghcb(svm); } @@ -4349,7 +4338,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb_control_area *control = &svm->vmcb->control; - u64 ghcb_gpa, exit_code; + u64 ghcb_gpa; int ret; /* Validate the GHCB */ @@ -4391,8 +4380,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) svm_vmgexit_success(svm, 0); - exit_code = kvm_get_cached_sw_exit_code(control); - switch (exit_code) { + switch (control->exit_code) { case SVM_VMGEXIT_MMIO_READ: ret = setup_vmgexit_scratch(svm, true, control->exit_info_2); if (ret) @@ -4484,7 +4472,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = -EINVAL; break; default: - ret = svm_invoke_exit_handler(vcpu, exit_code); + ret = svm_invoke_exit_handler(vcpu, control->exit_code); } return ret; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 3caf7a21679f..a28cd61d87ea 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2466,7 +2466,6 @@ static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu, if (cr0 ^ val) { svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; - svm->vmcb->control.exit_code_hi = 0; ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE); } @@ -3299,7 +3298,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl); pr_err("%-20s%08x\n", "int_vector:", control->int_vector); pr_err("%-20s%08x\n", "int_state:", control->int_state); - pr_err("%-20s%08x\n", "exit_code:", control->exit_code); + pr_err("%-20s%016llx\n", "exit_code:", control->exit_code); pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1); pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2); pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info); @@ -3549,7 +3548,6 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_run *kvm_run = vcpu->run; - u32 exit_code = svm->vmcb->control.exit_code; /* SEV-ES guests must use the CR write traps to track CR registers. */ if (!sev_es_guest(vcpu->kvm)) { @@ -3585,7 +3583,7 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (exit_fastpath != EXIT_FASTPATH_NONE) return 1; - return svm_invoke_exit_handler(vcpu, exit_code); + return svm_invoke_exit_handler(vcpu, svm->vmcb->control.exit_code); } static int pre_svm_run(struct kvm_vcpu *vcpu) @@ -4670,7 +4668,6 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, if (static_cpu_has(X86_FEATURE_NRIPS)) vmcb->control.next_rip = info->next_rip; vmcb->control.exit_code = icpt_info.exit_code; - vmcb->control.exit_code_hi = 0; vmexit = nested_svm_exit_handled(svm); ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 3360ac36e071..a22433680c73 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -160,8 +160,7 @@ struct vmcb_ctrl_area_cached { u32 int_ctl; u32 int_vector; u32 int_state; - u32 exit_code; - u32 exit_code_hi; + u64 exit_code; u64 exit_info_1; u64 exit_info_2; u32 exit_int_info; @@ -787,7 +786,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm); static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code) { svm->vmcb->control.exit_code = exit_code; - svm->vmcb->control.exit_code_hi = 0; svm->vmcb->control.exit_info_1 = 0; svm->vmcb->control.exit_info_2 = 0; return nested_svm_vmexit(svm); diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index e79bc9cb7162..e7fdbe9efc90 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -383,10 +383,10 @@ TRACE_EVENT(kvm_apic, #define kvm_print_exit_reason(exit_reason, isa) \ (isa == KVM_ISA_VMX) ? \ __print_symbolic(exit_reason & 0xffff, VMX_EXIT_REASONS) : \ - __print_symbolic(exit_reason, SVM_EXIT_REASONS), \ + __print_symbolic_u64(exit_reason, SVM_EXIT_REASONS), \ (isa == KVM_ISA_VMX && exit_reason & ~0xffff) ? " " : "", \ (isa == KVM_ISA_VMX) ? \ - __print_flags(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : "" + __print_flags_u64(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : "" #define TRACE_EVENT_KVM_EXIT(name) \ TRACE_EVENT(name, \ @@ -781,7 +781,7 @@ TRACE_EVENT_KVM_EXIT(kvm_nested_vmexit); * Tracepoint for #VMEXIT reinjected to the guest */ TRACE_EVENT(kvm_nested_vmexit_inject, - TP_PROTO(__u32 exit_code, + TP_PROTO(__u64 exit_code, __u64 exit_info1, __u64 exit_info2, __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa), TP_ARGS(exit_code, exit_info1, exit_info2, diff --git a/include/hyperv/hvgdk.h b/include/hyperv/hvgdk.h index dd6d4939ea29..384c3f3ff4a5 100644 --- a/include/hyperv/hvgdk.h +++ b/include/hyperv/hvgdk.h @@ -281,7 +281,7 @@ struct hv_vmcb_enlightenments { #define HV_VMCB_NESTED_ENLIGHTENMENTS 31 /* Synthetic VM-Exit */ -#define HV_SVM_EXITCODE_ENL 0xf0000000 +#define HV_SVM_EXITCODE_ENL 0xf0000000ull #define HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH (1) /* VM_PARTITION_ASSIST_PAGE */ diff --git a/tools/testing/selftests/kvm/include/x86/svm.h b/tools/testing/selftests/kvm/include/x86/svm.h index 29cffd0a9181..10b30b38bb3f 100644 --- a/tools/testing/selftests/kvm/include/x86/svm.h +++ b/tools/testing/selftests/kvm/include/x86/svm.h @@ -92,8 +92,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u32 int_vector; u32 int_state; u8 reserved_3[4]; - u32 exit_code; - u32 exit_code_hi; + u64 exit_code; u64 exit_info_1; u64 exit_info_2; u32 exit_int_info; diff --git a/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c index 7b6481d6c0d3..4bd1655f9e6d 100644 --- a/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c +++ b/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c @@ -103,7 +103,7 @@ static void l1_guest_code(struct svm_test_data *svm, uint64_t is_nmi, uint64_t i run_guest(vmcb, svm->vmcb_gpa); __GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL, - "Expected VMMCAL #VMEXIT, got '0x%x', info1 = '0x%lx, info2 = '0x%lx'", + "Expected VMMCAL #VMEXIT, got '0x%lx', info1 = '0x%lx, info2 = '0x%lx'", vmcb->control.exit_code, vmcb->control.exit_info_1, vmcb->control.exit_info_2); @@ -133,7 +133,7 @@ static void l1_guest_code(struct svm_test_data *svm, uint64_t is_nmi, uint64_t i run_guest(vmcb, svm->vmcb_gpa); __GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_HLT, - "Expected HLT #VMEXIT, got '0x%x', info1 = '0x%lx, info2 = '0x%lx'", + "Expected HLT #VMEXIT, got '0x%lx', info1 = '0x%lx, info2 = '0x%lx'", vmcb->control.exit_code, vmcb->control.exit_info_1, vmcb->control.exit_info_2); -- cgit v1.2.3 From b324192e36ecea472077f9c9e32bcac8bbafeed6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 12 Jan 2026 16:07:35 -0800 Subject: selftests: net: py: teach ksft_pr() multi-line safety Make printing multi-line logs easier by automatically prefixing each line in ksft_pr(). Make use of this when formatting exceptions. Reviewed-by: Petr Machata Link: https://patch.msgid.link/20260113000740.255360-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/py/ksft.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py index 0a96f88bb60a..6cdfb8afccb5 100644 --- a/tools/testing/selftests/net/lib/py/ksft.py +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -32,8 +32,23 @@ class KsftTerminate(KeyboardInterrupt): def ksft_pr(*objs, **kwargs): + """ + Print logs to stdout. + + Behaves like print() but log lines will be prefixed + with # to prevent breaking the TAP output formatting. + + Extra arguments (on top of what print() supports): + line_pfx - add extra string before each line + """ + sep = kwargs.pop("sep", " ") + pfx = kwargs.pop("line_pfx", "") + pfx = "#" + (" " + pfx if pfx else "") kwargs["flush"] = True - print("#", *objs, **kwargs) + + text = sep.join(str(obj) for obj in objs) + prefixed = f"\n{pfx} ".join(text.split('\n')) + print(pfx, prefixed, **kwargs) def _fail(*args): @@ -170,9 +185,7 @@ def ksft_flush_defer(): entry.exec_only() except Exception: ksft_pr(f"Exception while handling defer / cleanup (callback {i} of {qlen_start})!") - tb = traceback.format_exc() - for line in tb.strip().split('\n'): - ksft_pr("Defer Exception|", line) + ksft_pr(traceback.format_exc(), line_pfx="Defer Exception|") KSFT_RESULT = False @@ -331,9 +344,7 @@ def ksft_run(cases=None, globs=None, case_pfx=None, args=()): cnt_key = 'xfail' except BaseException as e: stop |= isinstance(e, KeyboardInterrupt) - tb = traceback.format_exc() - for line in tb.strip().split('\n'): - ksft_pr("Exception|", line) + ksft_pr(traceback.format_exc(), line_pfx="Exception|") if stop: ksft_pr(f"Stopping tests due to {type(e).__name__}.") KSFT_RESULT = False @@ -343,9 +354,7 @@ def ksft_run(cases=None, globs=None, case_pfx=None, args=()): try: ksft_flush_defer() except BaseException as e: - tb = traceback.format_exc() - for line in tb.strip().split('\n'): - ksft_pr("Exception|", line) + ksft_pr(traceback.format_exc(), line_pfx="Exception|") if isinstance(e, KeyboardInterrupt): ksft_pr() ksft_pr("WARN: defer() interrupted, cleanup may be incomplete.") -- cgit v1.2.3 From ce0f92dc737c65d705d83ea9529d8fb9a2194241 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 12 Jan 2026 16:07:36 -0800 Subject: selftests: net: py: teach cmd() how to print itself Teach cmd() how to print itself, to make debug prints easier. Example output (leading # due to ksft_pr()): # CMD: /root/ksft-net-drv/drivers/net/gro # EXIT: 1 # STDOUT: ipv6 with ext header does coalesce: # STDERR: Expected {200 }, Total 1 packets # Received {100 [!=200]100 [!=0]}, Total 2 packets. Reviewed-by: Petr Machata Link: https://patch.msgid.link/20260113000740.255360-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/py/utils.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py index 824f039d384c..37243103aee3 100644 --- a/tools/testing/selftests/net/lib/py/utils.py +++ b/tools/testing/selftests/net/lib/py/utils.py @@ -41,7 +41,9 @@ class cmd: self.ret = None self.ksft_term_fd = None + self.host = host self.comm = comm + if host: self.proc = host.cmd(comm) else: @@ -99,6 +101,27 @@ class cmd: raise CmdExitFailure("Command failed: %s\nSTDOUT: %s\nSTDERR: %s" % (self.proc.args, stdout, stderr), self) + def __repr__(self): + def str_fmt(name, s): + name += ': ' + return (name + s.strip().replace('\n', '\n' + ' ' * len(name))) + + ret = "CMD" + if self.host: + ret += "[remote]" + if self.ret is None: + ret += f" (unterminated): {self.comm}\n" + elif self.ret == 0: + ret += f" (success): {self.comm}\n" + else: + ret += f": {self.comm}\n" + ret += f" EXIT: {self.ret}\n" + if self.stdout: + ret += str_fmt(" STDOUT", self.stdout) + "\n" + if self.stderr: + ret += str_fmt(" STDERR", self.stderr) + "\n" + return ret.strip() + class bkg(cmd): """ -- cgit v1.2.3 From d131da6d7282829c775cf70f5f1db1576e5c1273 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 12 Jan 2026 16:07:37 -0800 Subject: selftests: drv-net: gro: use cmd print Now that cmd() can be printed directly remove the old formatting. Before: # fragmented ip6 doesn't coalesce: # Expected {200 100 100 }, Total 3 packets # Received {200 100 }, Total 2 packets. # /root/ksft-net-drv/drivers/net/gro: incorrect number of packets Now: # CMD: drivers/net/gro --ipv6 --dmac 9e:[...] # EXIT: 1 # STDOUT: fragmented ip6 doesn't coalesce: # STDERR: Expected {200 100 100 }, Total 3 packets # Received {200 100 }, Total 2 packets. # /root/ksft-net-drv/drivers/net/gro: incorrect number of packets Reviewed-by: Petr Machata Link: https://patch.msgid.link/20260113000740.255360-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/gro.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/drivers/net/gro.py b/tools/testing/selftests/drivers/net/gro.py index ba83713bf7b5..4e0fb19d1527 100755 --- a/tools/testing/selftests/drivers/net/gro.py +++ b/tools/testing/selftests/drivers/net/gro.py @@ -142,8 +142,7 @@ def test(cfg, protocol, test_name): if rx_proc.ret == 0: return - ksft_pr(rx_proc.stdout.strip().replace('\n', '\n# ')) - ksft_pr(rx_proc.stderr.strip().replace('\n', '\n# ')) + ksft_pr(rx_proc) if test_name == "large" and os.environ.get("KSFT_MACHINE_SLOW"): ksft_pr(f"Ignoring {protocol}/{test_name} failure due to slow environment") -- cgit v1.2.3 From 8171f6a76b2250d93a550566af6b915dc92edc75 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 12 Jan 2026 16:07:38 -0800 Subject: selftests: drv-net: gro: improve feature config We'll need to do a lot more feature handling to test HW-GRO and LRO. Clean up the feature handling for SW GRO a bit to let the next commit focus on the new test cases, only. Make sure HW GRO-like features are not enabled for the SW tests. Be more careful about changing features as "nothing changed" situations may result in non-zero error code from ethtool. Don't disable TSO on the local interface (receiver) when running over netdevsim, we just want GSO to break up the segments on the sender. Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260113000740.255360-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/gro.py | 39 +++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/drivers/net/gro.py b/tools/testing/selftests/drivers/net/gro.py index 4e0fb19d1527..3c30749ead39 100755 --- a/tools/testing/selftests/drivers/net/gro.py +++ b/tools/testing/selftests/drivers/net/gro.py @@ -20,7 +20,7 @@ Test cases: import os from lib.py import ksft_run, ksft_exit, ksft_pr from lib.py import NetDrvEpEnv, KsftXfailEx -from lib.py import cmd, defer, bkg, ip +from lib.py import bkg, cmd, defer, ethtool, ip from lib.py import ksft_variants @@ -70,6 +70,27 @@ def _set_mtu_restore(dev, mtu, host): defer(ip, f"link set dev {dev['ifname']} mtu {dev['mtu']}", host=host) +def _set_ethtool_feat(dev, current, feats, host=None): + s2n = {True: "on", False: "off"} + + new = ["-K", dev] + old = ["-K", dev] + no_change = True + for name, state in feats.items(): + new += [name, s2n[state]] + old += [name, s2n[current[name]["active"]]] + + if current[name]["active"] != state: + no_change = False + if current[name]["fixed"]: + raise KsftXfailEx(f"Device does not support {name}") + if no_change: + return + + ethtool(" ".join(new), host=host) + defer(ethtool, " ".join(old), host=host) + + def _setup(cfg, test_name): """ Setup hardware loopback mode for GRO testing. """ @@ -77,6 +98,11 @@ def _setup(cfg, test_name): cfg.bin_local = cfg.test_dir / "gro" cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) + if not hasattr(cfg, "feat"): + cfg.feat = ethtool(f"-k {cfg.ifname}", json=True)[0] + cfg.remote_feat = ethtool(f"-k {cfg.remote_ifname}", + host=cfg.remote, json=True)[0] + # "large" test needs at least 4k MTU if test_name == "large": _set_mtu_restore(cfg.dev, 4096, None) @@ -88,15 +114,22 @@ def _setup(cfg, test_name): _write_defer_restore(cfg, flush_path, "200000", defer_undo=True) _write_defer_restore(cfg, irq_path, "10", defer_undo=True) + _set_ethtool_feat(cfg.ifname, cfg.feat, + {"generic-receive-offload": True, + "rx-gro-hw": False, + "large-receive-offload": False}) + try: # Disable TSO for local tests cfg.require_nsim() # will raise KsftXfailEx if not running on nsim - cmd(f"ethtool -K {cfg.ifname} gro on tso off") - cmd(f"ethtool -K {cfg.remote_ifname} gro on tso off", host=cfg.remote) + _set_ethtool_feat(cfg.remote_ifname, cfg.remote_feat, + {"tcp-segmentation-offload": False}, + host=cfg.remote) except KsftXfailEx: pass + def _gro_variants(): """Generator that yields all combinations of protocol and test types.""" -- cgit v1.2.3 From d3b35898de024796c43415f9535fd0bc69cb8f1b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 12 Jan 2026 16:07:39 -0800 Subject: selftests: drv-net: gro: run the test against HW GRO and LRO Run the test against HW GRO and LRO. NICs I have pass the base cases. Interestingly all are happy to build GROs larger than 64k. Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260113000740.255360-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/gro.py | 72 +++++++++++++++++------ tools/testing/selftests/drivers/net/lib/py/env.py | 7 ++- 2 files changed, 60 insertions(+), 19 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/drivers/net/gro.py b/tools/testing/selftests/drivers/net/gro.py index 3c30749ead39..1ab85590c439 100755 --- a/tools/testing/selftests/drivers/net/gro.py +++ b/tools/testing/selftests/drivers/net/gro.py @@ -87,11 +87,15 @@ def _set_ethtool_feat(dev, current, feats, host=None): if no_change: return - ethtool(" ".join(new), host=host) + eth_cmd = ethtool(" ".join(new), host=host) defer(ethtool, " ".join(old), host=host) + # If ethtool printed something kernel must have modified some features + if eth_cmd.stdout: + ksft_pr(eth_cmd) -def _setup(cfg, test_name): + +def _setup(cfg, mode, test_name): """ Setup hardware loopback mode for GRO testing. """ if not hasattr(cfg, "bin_remote"): @@ -108,16 +112,49 @@ def _setup(cfg, test_name): _set_mtu_restore(cfg.dev, 4096, None) _set_mtu_restore(cfg.remote_dev, 4096, cfg.remote) - flush_path = f"/sys/class/net/{cfg.ifname}/gro_flush_timeout" - irq_path = f"/sys/class/net/{cfg.ifname}/napi_defer_hard_irqs" - - _write_defer_restore(cfg, flush_path, "200000", defer_undo=True) - _write_defer_restore(cfg, irq_path, "10", defer_undo=True) - - _set_ethtool_feat(cfg.ifname, cfg.feat, - {"generic-receive-offload": True, - "rx-gro-hw": False, - "large-receive-offload": False}) + if mode == "sw": + flush_path = f"/sys/class/net/{cfg.ifname}/gro_flush_timeout" + irq_path = f"/sys/class/net/{cfg.ifname}/napi_defer_hard_irqs" + + _write_defer_restore(cfg, flush_path, "200000", defer_undo=True) + _write_defer_restore(cfg, irq_path, "10", defer_undo=True) + + _set_ethtool_feat(cfg.ifname, cfg.feat, + {"generic-receive-offload": True, + "rx-gro-hw": False, + "large-receive-offload": False}) + elif mode == "hw": + _set_ethtool_feat(cfg.ifname, cfg.feat, + {"generic-receive-offload": False, + "rx-gro-hw": True, + "large-receive-offload": False}) + + # Some NICs treat HW GRO as a GRO sub-feature so disabling GRO + # will also clear HW GRO. Use a hack of installing XDP generic + # to skip SW GRO, even when enabled. + feat = ethtool(f"-k {cfg.ifname}", json=True)[0] + if not feat["rx-gro-hw"]["active"]: + ksft_pr("Driver clears HW GRO and SW GRO is cleared, using generic XDP workaround") + prog = cfg.net_lib_dir / "xdp_dummy.bpf.o" + ip(f"link set dev {cfg.ifname} xdpgeneric obj {prog} sec xdp") + defer(ip, f"link set dev {cfg.ifname} xdpgeneric off") + + # Attaching XDP may change features, fetch the latest state + feat = ethtool(f"-k {cfg.ifname}", json=True)[0] + + _set_ethtool_feat(cfg.ifname, feat, + {"generic-receive-offload": True, + "rx-gro-hw": True, + "large-receive-offload": False}) + elif mode == "lro": + # netdevsim advertises LRO for feature inheritance testing with + # bonding/team tests but it doesn't actually perform the offload + cfg.require_nsim(nsim_test=False) + + _set_ethtool_feat(cfg.ifname, cfg.feat, + {"generic-receive-offload": False, + "rx-gro-hw": False, + "large-receive-offload": True}) try: # Disable TSO for local tests @@ -133,19 +170,20 @@ def _setup(cfg, test_name): def _gro_variants(): """Generator that yields all combinations of protocol and test types.""" - for protocol in ["ipv4", "ipv6", "ipip"]: - for test_name in ["data", "ack", "flags", "tcp", "ip", "large"]: - yield protocol, test_name + for mode in ["sw", "hw", "lro"]: + for protocol in ["ipv4", "ipv6", "ipip"]: + for test_name in ["data", "ack", "flags", "tcp", "ip", "large"]: + yield mode, protocol, test_name @ksft_variants(_gro_variants()) -def test(cfg, protocol, test_name): +def test(cfg, mode, protocol, test_name): """Run a single GRO test with retries.""" ipver = "6" if protocol[-1] == "6" else "4" cfg.require_ipver(ipver) - _setup(cfg, test_name) + _setup(cfg, mode, test_name) base_cmd_args = [ f"--{protocol}", diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index 63495376e654..41cc248ac848 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -248,9 +248,12 @@ class NetDrvEpEnv(NetDrvEnvBase): if not self.addr_v[ipver] or not self.remote_addr_v[ipver]: raise KsftSkipEx(f"Test requires IPv{ipver} connectivity") - def require_nsim(self): - if self._ns is None: + def require_nsim(self, nsim_test=True): + """Require or exclude netdevsim for this test""" + if nsim_test and self._ns is None: raise KsftXfailEx("Test only works on netdevsim") + if nsim_test is False and self._ns is not None: + raise KsftXfailEx("Test does not work on netdevsim") def _require_cmd(self, comm, key, host=None): cached = self._required_cmd.get(comm, {}) -- cgit v1.2.3 From fe074aaa5329246706c652ccba6602eecbed80a8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 12 Jan 2026 16:07:40 -0800 Subject: selftests: drv-net: gro: break out all individual test cases GRO test groups the cases into categories, e.g. "tcp" case checks coalescing in presence of: - packets with bad csum, - sequence number mismatch, - timestamp option value mismatch, - different TCP options. Since we now have TAP support grouping the cases like that lowers our reporting granularity. This matters even more for NICs performing HW GRO and LRO since it appears that most implementation have _some_ bugs. Flagging the whole group of tests as failed prevents us from catching regressions in the things that work today. Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260113000740.255360-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/gro.c | 441 +++++++++++++++++------------ tools/testing/selftests/drivers/net/gro.py | 65 ++++- 2 files changed, 312 insertions(+), 194 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/drivers/net/gro.c b/tools/testing/selftests/drivers/net/gro.c index 751a8103f408..e76c618704cf 100644 --- a/tools/testing/selftests/drivers/net/gro.c +++ b/tools/testing/selftests/drivers/net/gro.c @@ -3,26 +3,45 @@ * This testsuite provides conformance testing for GRO coalescing. * * Test cases: - * 1.data + * + * data_*: * Data packets of the same size and same header setup with correct * sequence numbers coalesce. The one exception being the last data * packet coalesced: it can be smaller than the rest and coalesced * as long as it is in the same flow. - * 2.ack + * - data_same: same size packets coalesce + * - data_lrg_sml: large then small coalesces + * - data_sml_lrg: small then large doesn't coalesce + * + * ack: * Pure ACK does not coalesce. - * 3.flags - * Specific test cases: no packets with PSH, SYN, URG, RST set will - * be coalesced. - * 4.tcp + * + * flags_*: + * No packets with PSH, SYN, URG, RST set will be coalesced. + * - flags_psh, flags_syn, flags_rst, flags_urg + * + * tcp_*: * Packets with incorrect checksum, non-consecutive seqno and * different TCP header options shouldn't coalesce. Nit: given that * some extension headers have paddings, such as timestamp, headers - * that are padding differently would not be coalesced. - * 5.ip: - * Packets with different (ECN, TTL, TOS) header, ip options or - * ip fragments (ipv6) shouldn't coalesce. - * 6.large: + * that are padded differently would not be coalesced. + * - tcp_csum: incorrect checksum + * - tcp_seq: non-consecutive sequence numbers + * - tcp_ts: different timestamps + * - tcp_opt: different TCP options + * + * ip_*: + * Packets with different (ECN, TTL, TOS) header, IP options or + * IP fragments shouldn't coalesce. + * - ip_ecn, ip_tos: shared between IPv4/IPv6 + * - ip_ttl, ip_opt, ip_frag4: IPv4 only + * - ip_id_df*: IPv4 IP ID field coalescing tests + * - ip_frag6, ip_v6ext_*: IPv6 only + * + * large_*: * Packets larger than GRO_MAX_SIZE packets shouldn't coalesce. + * - large_max: exceeding max size + * - large_rem: remainder handling * * MSS is defined as 4096 - header because if it is too small * (i.e. 1500 MTU - header), it will result in many packets, @@ -79,6 +98,15 @@ #define ipv6_optlen(p) (((p)->hdrlen+1) << 3) /* calculate IPv6 extension header len */ #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) +enum flush_id_case { + FLUSH_ID_DF1_INC, + FLUSH_ID_DF1_FIXED, + FLUSH_ID_DF0_INC, + FLUSH_ID_DF0_FIXED, + FLUSH_ID_DF1_INC_FIXED, + FLUSH_ID_DF1_FIXED_INC, +}; + static const char *addr6_src = "fdaa::2"; static const char *addr6_dst = "fdaa::1"; static const char *addr4_src = "192.168.1.200"; @@ -95,7 +123,6 @@ static int tcp_offset = -1; static int total_hdr_len = -1; static int ethhdr_proto = -1; static bool ipip; -static const int num_flush_id_cases = 6; static void vlog(const char *fmt, ...) { @@ -127,19 +154,19 @@ static void setup_sock_filter(int fd) /* Overridden later if exthdrs are used: */ opt_ipproto_off = ipproto_off; - if (strcmp(testname, "ip") == 0) { - if (proto == PF_INET) - optlen = sizeof(struct ip_timestamp); - else { - BUILD_BUG_ON(sizeof(struct ip6_hbh) > MIN_EXTHDR_SIZE); - BUILD_BUG_ON(sizeof(struct ip6_dest) > MIN_EXTHDR_SIZE); - BUILD_BUG_ON(sizeof(struct ip6_frag) > MIN_EXTHDR_SIZE); - - /* same size for HBH and Fragment extension header types */ - optlen = MIN_EXTHDR_SIZE; - opt_ipproto_off = ETH_HLEN + sizeof(struct ipv6hdr) - + offsetof(struct ip6_ext, ip6e_nxt); - } + if (strcmp(testname, "ip_opt") == 0) { + optlen = sizeof(struct ip_timestamp); + } else if (strcmp(testname, "ip_frag6") == 0 || + strcmp(testname, "ip_v6ext_same") == 0 || + strcmp(testname, "ip_v6ext_diff") == 0) { + BUILD_BUG_ON(sizeof(struct ip6_hbh) > MIN_EXTHDR_SIZE); + BUILD_BUG_ON(sizeof(struct ip6_dest) > MIN_EXTHDR_SIZE); + BUILD_BUG_ON(sizeof(struct ip6_frag) > MIN_EXTHDR_SIZE); + + /* same size for HBH and Fragment extension header types */ + optlen = MIN_EXTHDR_SIZE; + opt_ipproto_off = ETH_HLEN + sizeof(struct ipv6hdr) + + offsetof(struct ip6_ext, ip6e_nxt); } /* this filter validates the following: @@ -648,7 +675,8 @@ static void fix_ip4_checksum(struct iphdr *iph) iph->check = checksum_fold(iph, sizeof(struct iphdr), 0); } -static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) +static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, + enum flush_id_case tcase) { static char buf1[MAX_HDR_LEN + PAYLOAD_LEN]; static char buf2[MAX_HDR_LEN + PAYLOAD_LEN]; @@ -667,7 +695,7 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) create_packet(buf3, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0); switch (tcase) { - case 0: /* DF=1, Incrementing - should coalesce */ + case FLUSH_ID_DF1_INC: /* DF=1, Incrementing - should coalesce */ iph1->frag_off |= htons(IP_DF); iph1->id = htons(8); @@ -675,7 +703,7 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) iph2->id = htons(9); break; - case 1: /* DF=1, Fixed - should coalesce */ + case FLUSH_ID_DF1_FIXED: /* DF=1, Fixed - should coalesce */ iph1->frag_off |= htons(IP_DF); iph1->id = htons(8); @@ -683,7 +711,7 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) iph2->id = htons(8); break; - case 2: /* DF=0, Incrementing - should coalesce */ + case FLUSH_ID_DF0_INC: /* DF=0, Incrementing - should coalesce */ iph1->frag_off &= ~htons(IP_DF); iph1->id = htons(8); @@ -691,7 +719,7 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) iph2->id = htons(9); break; - case 3: /* DF=0, Fixed - should coalesce */ + case FLUSH_ID_DF0_FIXED: /* DF=0, Fixed - should coalesce */ iph1->frag_off &= ~htons(IP_DF); iph1->id = htons(8); @@ -699,9 +727,10 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) iph2->id = htons(8); break; - case 4: /* DF=1, two packets incrementing, and one fixed - should - * coalesce only the first two packets - */ + case FLUSH_ID_DF1_INC_FIXED: /* DF=1, two packets incrementing, and + * one fixed - should coalesce only the + * first two packets + */ iph1->frag_off |= htons(IP_DF); iph1->id = htons(8); @@ -713,9 +742,10 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) send_three = true; break; - case 5: /* DF=1, two packets fixed, and one incrementing - should - * coalesce only the first two packets - */ + case FLUSH_ID_DF1_FIXED_INC: /* DF=1, two packets fixed, and one + * incrementing - should coalesce only + * the first two packets + */ iph1->frag_off |= htons(IP_DF); iph1->id = htons(8); @@ -739,16 +769,6 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) } } -static void test_flush_id(int fd, struct sockaddr_ll *daddr, char *fin_pkt) -{ - for (int i = 0; i < num_flush_id_cases; i++) { - sleep(1); - send_flush_id_case(fd, daddr, i); - sleep(1); - write_packet(fd, fin_pkt, total_hdr_len, daddr); - } -} - static void send_ipv6_exthdr(int fd, struct sockaddr_ll *daddr, char *ext_data1, char *ext_data2) { static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; @@ -1030,108 +1050,128 @@ static void gro_sender(void) daddr.sll_halen = ETH_ALEN; create_packet(fin_pkt, PAYLOAD_LEN * 2, 0, 0, 1); - if (strcmp(testname, "data") == 0) { + /* data sub-tests */ + if (strcmp(testname, "data_same") == 0) { send_data_pkts(txfd, &daddr, PAYLOAD_LEN, PAYLOAD_LEN); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - + } else if (strcmp(testname, "data_lrg_sml") == 0) { send_data_pkts(txfd, &daddr, PAYLOAD_LEN, PAYLOAD_LEN / 2); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - + } else if (strcmp(testname, "data_sml_lrg") == 0) { send_data_pkts(txfd, &daddr, PAYLOAD_LEN / 2, PAYLOAD_LEN); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + /* ack test */ } else if (strcmp(testname, "ack") == 0) { send_ack(txfd, &daddr); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - } else if (strcmp(testname, "flags") == 0) { + + /* flags sub-tests */ + } else if (strcmp(testname, "flags_psh") == 0) { send_flags(txfd, &daddr, 1, 0, 0, 0); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - + } else if (strcmp(testname, "flags_syn") == 0) { send_flags(txfd, &daddr, 0, 1, 0, 0); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - + } else if (strcmp(testname, "flags_rst") == 0) { send_flags(txfd, &daddr, 0, 0, 1, 0); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - + } else if (strcmp(testname, "flags_urg") == 0) { send_flags(txfd, &daddr, 0, 0, 0, 1); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - } else if (strcmp(testname, "tcp") == 0) { + + /* tcp sub-tests */ + } else if (strcmp(testname, "tcp_csum") == 0) { send_changed_checksum(txfd, &daddr); - /* Adding sleep before sending FIN so that it is not - * received prior to other packets. - */ usleep(fin_delay_us); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - + } else if (strcmp(testname, "tcp_seq") == 0) { send_changed_seq(txfd, &daddr); usleep(fin_delay_us); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - + } else if (strcmp(testname, "tcp_ts") == 0) { send_changed_ts(txfd, &daddr); usleep(fin_delay_us); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - + } else if (strcmp(testname, "tcp_opt") == 0) { send_diff_opt(txfd, &daddr); usleep(fin_delay_us); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - } else if (strcmp(testname, "ip") == 0) { + + /* ip sub-tests - shared between IPv4 and IPv6 */ + } else if (strcmp(testname, "ip_ecn") == 0) { send_changed_ECN(txfd, &daddr); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - + } else if (strcmp(testname, "ip_tos") == 0) { send_changed_tos(txfd, &daddr); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - if (proto == PF_INET) { - /* Modified packets may be received out of order. - * Sleep function added to enforce test boundaries - * so that fin pkts are not received prior to other pkts. - */ - sleep(1); - send_changed_ttl(txfd, &daddr); - write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - - sleep(1); - send_ip_options(txfd, &daddr); - sleep(1); - write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - - sleep(1); - send_fragment4(txfd, &daddr); - sleep(1); - write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - - test_flush_id(txfd, &daddr, fin_pkt); - } else if (proto == PF_INET6) { - sleep(1); - send_fragment6(txfd, &daddr); - sleep(1); - write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - - sleep(1); - /* send IPv6 packets with ext header with same payload */ - send_ipv6_exthdr(txfd, &daddr, EXT_PAYLOAD_1, EXT_PAYLOAD_1); - sleep(1); - write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - - sleep(1); - /* send IPv6 packets with ext header with different payload */ - send_ipv6_exthdr(txfd, &daddr, EXT_PAYLOAD_1, EXT_PAYLOAD_2); - sleep(1); - write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - } - } else if (strcmp(testname, "large") == 0) { - /* 20 is the difference between min iphdr size - * and min ipv6hdr size. Like MAX_HDR_SIZE, - * MAX_PAYLOAD is defined with the larger header of the two. - */ + + /* ip sub-tests - IPv4 only */ + } else if (strcmp(testname, "ip_ttl") == 0) { + send_changed_ttl(txfd, &daddr); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_opt") == 0) { + send_ip_options(txfd, &daddr); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_frag4") == 0) { + send_fragment4(txfd, &daddr); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_id_df1_inc") == 0) { + send_flush_id_case(txfd, &daddr, FLUSH_ID_DF1_INC); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_id_df1_fixed") == 0) { + send_flush_id_case(txfd, &daddr, FLUSH_ID_DF1_FIXED); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_id_df0_inc") == 0) { + send_flush_id_case(txfd, &daddr, FLUSH_ID_DF0_INC); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_id_df0_fixed") == 0) { + send_flush_id_case(txfd, &daddr, FLUSH_ID_DF0_FIXED); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_id_df1_inc_fixed") == 0) { + send_flush_id_case(txfd, &daddr, FLUSH_ID_DF1_INC_FIXED); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_id_df1_fixed_inc") == 0) { + send_flush_id_case(txfd, &daddr, FLUSH_ID_DF1_FIXED_INC); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + /* ip sub-tests - IPv6 only */ + } else if (strcmp(testname, "ip_frag6") == 0) { + send_fragment6(txfd, &daddr); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_v6ext_same") == 0) { + send_ipv6_exthdr(txfd, &daddr, EXT_PAYLOAD_1, EXT_PAYLOAD_1); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_v6ext_diff") == 0) { + send_ipv6_exthdr(txfd, &daddr, EXT_PAYLOAD_1, EXT_PAYLOAD_2); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + /* large sub-tests */ + } else if (strcmp(testname, "large_max") == 0) { int offset = (proto == PF_INET && !ipip) ? 20 : 0; int remainder = (MAX_PAYLOAD + offset) % MSS; send_large(txfd, &daddr, remainder); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "large_rem") == 0) { + int offset = (proto == PF_INET && !ipip) ? 20 : 0; + int remainder = (MAX_PAYLOAD + offset) % MSS; send_large(txfd, &daddr, remainder + 1); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); } else { - error(1, 0, "Unknown testcase"); + error(1, 0, "Unknown testcase: %s", testname); } if (close(txfd)) @@ -1155,126 +1195,153 @@ static void gro_receiver(void) memset(correct_payload, 0, sizeof(correct_payload)); - if (strcmp(testname, "data") == 0) { + /* data sub-tests */ + if (strcmp(testname, "data_same") == 0) { printf("pure data packet of same size: "); correct_payload[0] = PAYLOAD_LEN * 2; check_recv_pkts(rxfd, correct_payload, 1); - + } else if (strcmp(testname, "data_lrg_sml") == 0) { printf("large data packets followed by a smaller one: "); correct_payload[0] = PAYLOAD_LEN * 1.5; check_recv_pkts(rxfd, correct_payload, 1); - + } else if (strcmp(testname, "data_sml_lrg") == 0) { printf("small data packets followed by a larger one: "); correct_payload[0] = PAYLOAD_LEN / 2; correct_payload[1] = PAYLOAD_LEN; check_recv_pkts(rxfd, correct_payload, 2); + + /* ack test */ } else if (strcmp(testname, "ack") == 0) { printf("duplicate ack and pure ack: "); check_recv_pkts(rxfd, correct_payload, 3); - } else if (strcmp(testname, "flags") == 0) { + + /* flags sub-tests */ + } else if (strcmp(testname, "flags_psh") == 0) { correct_payload[0] = PAYLOAD_LEN * 3; correct_payload[1] = PAYLOAD_LEN * 2; - printf("psh flag ends coalescing: "); check_recv_pkts(rxfd, correct_payload, 2); - + } else if (strcmp(testname, "flags_syn") == 0) { correct_payload[0] = PAYLOAD_LEN * 2; correct_payload[1] = 0; correct_payload[2] = PAYLOAD_LEN * 2; printf("syn flag ends coalescing: "); check_recv_pkts(rxfd, correct_payload, 3); - + } else if (strcmp(testname, "flags_rst") == 0) { + correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = 0; + correct_payload[2] = PAYLOAD_LEN * 2; printf("rst flag ends coalescing: "); check_recv_pkts(rxfd, correct_payload, 3); - + } else if (strcmp(testname, "flags_urg") == 0) { + correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = 0; + correct_payload[2] = PAYLOAD_LEN * 2; printf("urg flag ends coalescing: "); check_recv_pkts(rxfd, correct_payload, 3); - } else if (strcmp(testname, "tcp") == 0) { + + /* tcp sub-tests */ + } else if (strcmp(testname, "tcp_csum") == 0) { correct_payload[0] = PAYLOAD_LEN; correct_payload[1] = PAYLOAD_LEN; - correct_payload[2] = PAYLOAD_LEN; - correct_payload[3] = PAYLOAD_LEN; - printf("changed checksum does not coalesce: "); check_recv_pkts(rxfd, correct_payload, 2); - + } else if (strcmp(testname, "tcp_seq") == 0) { + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; printf("Wrong Seq number doesn't coalesce: "); check_recv_pkts(rxfd, correct_payload, 2); - - printf("Different timestamp doesn't coalesce: "); + } else if (strcmp(testname, "tcp_ts") == 0) { correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = PAYLOAD_LEN; + correct_payload[2] = PAYLOAD_LEN; + correct_payload[3] = PAYLOAD_LEN; + printf("Different timestamp doesn't coalesce: "); check_recv_pkts(rxfd, correct_payload, 4); - - printf("Different options doesn't coalesce: "); + } else if (strcmp(testname, "tcp_opt") == 0) { correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = PAYLOAD_LEN; + printf("Different options doesn't coalesce: "); check_recv_pkts(rxfd, correct_payload, 2); - } else if (strcmp(testname, "ip") == 0) { + + /* ip sub-tests - shared between IPv4 and IPv6 */ + } else if (strcmp(testname, "ip_ecn") == 0) { correct_payload[0] = PAYLOAD_LEN; correct_payload[1] = PAYLOAD_LEN; - printf("different ECN doesn't coalesce: "); check_recv_pkts(rxfd, correct_payload, 2); - + } else if (strcmp(testname, "ip_tos") == 0) { + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; printf("different tos doesn't coalesce: "); check_recv_pkts(rxfd, correct_payload, 2); - if (proto == PF_INET) { - printf("different ttl doesn't coalesce: "); - check_recv_pkts(rxfd, correct_payload, 2); - - printf("ip options doesn't coalesce: "); - correct_payload[2] = PAYLOAD_LEN; - check_recv_pkts(rxfd, correct_payload, 3); - - printf("fragmented ip4 doesn't coalesce: "); - check_recv_pkts(rxfd, correct_payload, 2); - - /* is_atomic checks */ - printf("DF=1, Incrementing - should coalesce: "); - correct_payload[0] = PAYLOAD_LEN * 2; - check_recv_pkts(rxfd, correct_payload, 1); - - printf("DF=1, Fixed - should coalesce: "); - correct_payload[0] = PAYLOAD_LEN * 2; - check_recv_pkts(rxfd, correct_payload, 1); - - printf("DF=0, Incrementing - should coalesce: "); - correct_payload[0] = PAYLOAD_LEN * 2; - check_recv_pkts(rxfd, correct_payload, 1); - - printf("DF=0, Fixed - should coalesce: "); - correct_payload[0] = PAYLOAD_LEN * 2; - check_recv_pkts(rxfd, correct_payload, 1); - - printf("DF=1, 2 Incrementing and one fixed - should coalesce only first 2 packets: "); - correct_payload[0] = PAYLOAD_LEN * 2; - correct_payload[1] = PAYLOAD_LEN; - check_recv_pkts(rxfd, correct_payload, 2); - - printf("DF=1, 2 Fixed and one incrementing - should coalesce only first 2 packets: "); - correct_payload[0] = PAYLOAD_LEN * 2; - correct_payload[1] = PAYLOAD_LEN; - check_recv_pkts(rxfd, correct_payload, 2); - } else if (proto == PF_INET6) { - /* GRO doesn't check for ipv6 hop limit when flushing. - * Hence no corresponding test to the ipv4 case. - */ - printf("fragmented ip6 doesn't coalesce: "); - correct_payload[0] = PAYLOAD_LEN * 2; - correct_payload[1] = PAYLOAD_LEN; - correct_payload[2] = PAYLOAD_LEN; - check_recv_pkts(rxfd, correct_payload, 3); - - printf("ipv6 with ext header does coalesce: "); - correct_payload[0] = PAYLOAD_LEN * 2; - check_recv_pkts(rxfd, correct_payload, 1); - - printf("ipv6 with ext header with different payloads doesn't coalesce: "); - correct_payload[0] = PAYLOAD_LEN; - correct_payload[1] = PAYLOAD_LEN; - check_recv_pkts(rxfd, correct_payload, 2); - } - } else if (strcmp(testname, "large") == 0) { + /* ip sub-tests - IPv4 only */ + } else if (strcmp(testname, "ip_ttl") == 0) { + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; + printf("different ttl doesn't coalesce: "); + check_recv_pkts(rxfd, correct_payload, 2); + } else if (strcmp(testname, "ip_opt") == 0) { + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; + correct_payload[2] = PAYLOAD_LEN; + printf("ip options doesn't coalesce: "); + check_recv_pkts(rxfd, correct_payload, 3); + } else if (strcmp(testname, "ip_frag4") == 0) { + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; + printf("fragmented ip4 doesn't coalesce: "); + check_recv_pkts(rxfd, correct_payload, 2); + } else if (strcmp(testname, "ip_id_df1_inc") == 0) { + printf("DF=1, Incrementing - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + } else if (strcmp(testname, "ip_id_df1_fixed") == 0) { + printf("DF=1, Fixed - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + } else if (strcmp(testname, "ip_id_df0_inc") == 0) { + printf("DF=0, Incrementing - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + } else if (strcmp(testname, "ip_id_df0_fixed") == 0) { + printf("DF=0, Fixed - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + } else if (strcmp(testname, "ip_id_df1_inc_fixed") == 0) { + printf("DF=1, 2 Incrementing and one fixed - should coalesce only first 2 packets: "); + correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 2); + } else if (strcmp(testname, "ip_id_df1_fixed_inc") == 0) { + printf("DF=1, 2 Fixed and one incrementing - should coalesce only first 2 packets: "); + correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 2); + + /* ip sub-tests - IPv6 only */ + } else if (strcmp(testname, "ip_frag6") == 0) { + /* GRO doesn't check for ipv6 hop limit when flushing. + * Hence no corresponding test to the ipv4 case. + */ + printf("fragmented ip6 doesn't coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = PAYLOAD_LEN; + correct_payload[2] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 3); + } else if (strcmp(testname, "ip_v6ext_same") == 0) { + printf("ipv6 with ext header does coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + } else if (strcmp(testname, "ip_v6ext_diff") == 0) { + printf("ipv6 with ext header with different payloads doesn't coalesce: "); + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 2); + + /* large sub-tests */ + } else if (strcmp(testname, "large_max") == 0) { int offset = (proto == PF_INET && !ipip) ? 20 : 0; int remainder = (MAX_PAYLOAD + offset) % MSS; @@ -1282,14 +1349,18 @@ static void gro_receiver(void) correct_payload[1] = remainder; printf("Shouldn't coalesce if exceed IP max pkt size: "); check_recv_pkts(rxfd, correct_payload, 2); + } else if (strcmp(testname, "large_rem") == 0) { + int offset = (proto == PF_INET && !ipip) ? 20 : 0; + int remainder = (MAX_PAYLOAD + offset) % MSS; /* last segment sent individually, doesn't start new segment */ - correct_payload[0] = correct_payload[0] - remainder; + correct_payload[0] = (MAX_PAYLOAD + offset) - remainder; correct_payload[1] = remainder + 1; correct_payload[2] = remainder + 1; + printf("last segment sent individually: "); check_recv_pkts(rxfd, correct_payload, 3); } else { - error(1, 0, "Test case error, should never trigger"); + error(1, 0, "Test case error: unknown testname %s", testname); } if (close(rxfd)) diff --git a/tools/testing/selftests/drivers/net/gro.py b/tools/testing/selftests/drivers/net/gro.py index 1ab85590c439..1bb8af571456 100755 --- a/tools/testing/selftests/drivers/net/gro.py +++ b/tools/testing/selftests/drivers/net/gro.py @@ -9,12 +9,29 @@ binary in different configurations and checking for correct packet coalescing behavior. Test cases: - - data: Data packets with same size/headers and correct seq numbers coalesce + - data_same: Same size data packets coalesce + - data_lrg_sml: Large packet followed by smaller one coalesces + - data_sml_lrg: Small packet followed by larger one doesn't coalesce - ack: Pure ACK packets do not coalesce - - flags: Packets with PSH, SYN, URG, RST flags do not coalesce - - tcp: Packets with incorrect checksum, non-consecutive seqno don't coalesce - - ip: Packets with different ECN, TTL, TOS, or IP options don't coalesce - - large: Packets larger than GRO_MAX_SIZE don't coalesce + - flags_psh: Packets with PSH flag don't coalesce + - flags_syn: Packets with SYN flag don't coalesce + - flags_rst: Packets with RST flag don't coalesce + - flags_urg: Packets with URG flag don't coalesce + - tcp_csum: Packets with incorrect checksum don't coalesce + - tcp_seq: Packets with non-consecutive seqno don't coalesce + - tcp_ts: Packets with different timestamp options don't coalesce + - tcp_opt: Packets with different TCP options don't coalesce + - ip_ecn: Packets with different ECN don't coalesce + - ip_tos: Packets with different TOS don't coalesce + - ip_ttl: (IPv4) Packets with different TTL don't coalesce + - ip_opt: (IPv4) Packets with IP options don't coalesce + - ip_frag4: (IPv4) IPv4 fragments don't coalesce + - ip_id_df*: (IPv4) IP ID field coalescing tests + - ip_frag6: (IPv6) IPv6 fragments don't coalesce + - ip_v6ext_same: (IPv6) IPv6 ext header with same payload coalesces + - ip_v6ext_diff: (IPv6) IPv6 ext header with different payload doesn't coalesce + - large_max: Packets exceeding GRO_MAX_SIZE don't coalesce + - large_rem: Large packet remainder handling """ import os @@ -107,8 +124,8 @@ def _setup(cfg, mode, test_name): cfg.remote_feat = ethtool(f"-k {cfg.remote_ifname}", host=cfg.remote, json=True)[0] - # "large" test needs at least 4k MTU - if test_name == "large": + # "large_*" tests need at least 4k MTU + if test_name.startswith("large_"): _set_mtu_restore(cfg.dev, 4096, None) _set_mtu_restore(cfg.remote_dev, 4096, cfg.remote) @@ -170,11 +187,41 @@ def _setup(cfg, mode, test_name): def _gro_variants(): """Generator that yields all combinations of protocol and test types.""" + # Tests that work for all protocols + common_tests = [ + "data_same", "data_lrg_sml", "data_sml_lrg", + "ack", + "flags_psh", "flags_syn", "flags_rst", "flags_urg", + "tcp_csum", "tcp_seq", "tcp_ts", "tcp_opt", + "ip_ecn", "ip_tos", + "large_max", "large_rem", + ] + + # Tests specific to IPv4 + ipv4_tests = [ + "ip_ttl", "ip_opt", "ip_frag4", + "ip_id_df1_inc", "ip_id_df1_fixed", + "ip_id_df0_inc", "ip_id_df0_fixed", + "ip_id_df1_inc_fixed", "ip_id_df1_fixed_inc", + ] + + # Tests specific to IPv6 + ipv6_tests = [ + "ip_frag6", "ip_v6ext_same", "ip_v6ext_diff", + ] + for mode in ["sw", "hw", "lro"]: for protocol in ["ipv4", "ipv6", "ipip"]: - for test_name in ["data", "ack", "flags", "tcp", "ip", "large"]: + for test_name in common_tests: yield mode, protocol, test_name + if protocol in ["ipv4", "ipip"]: + for test_name in ipv4_tests: + yield mode, protocol, test_name + elif protocol == "ipv6": + for test_name in ipv6_tests: + yield mode, protocol, test_name + @ksft_variants(_gro_variants()) def test(cfg, mode, protocol, test_name): @@ -215,7 +262,7 @@ def test(cfg, mode, protocol, test_name): ksft_pr(rx_proc) - if test_name == "large" and os.environ.get("KSFT_MACHINE_SLOW"): + if test_name.startswith("large_") and os.environ.get("KSFT_MACHINE_SLOW"): ksft_pr(f"Ignoring {protocol}/{test_name} failure due to slow environment") return -- cgit v1.2.3 From a32bb32d019332394aee9e2befea4fec05a672e4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 19 Nov 2025 12:15:20 +0000 Subject: selftests: iou-zcrx: test large chunk sizes Add a test using large chunks for zcrx memory area. Signed-off-by: Pavel Begunkov --- tools/testing/selftests/drivers/net/hw/iou-zcrx.c | 72 ++++++++++++++++++---- tools/testing/selftests/drivers/net/hw/iou-zcrx.py | 39 ++++++++++++ 2 files changed, 99 insertions(+), 12 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c index 62456df947bc..240d13dbc54e 100644 --- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -37,6 +38,23 @@ #include +#define SKIP_CODE 42 + +struct t_io_uring_zcrx_ifq_reg { + __u32 if_idx; + __u32 if_rxq; + __u32 rq_entries; + __u32 flags; + + __u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg */ + __u64 region_ptr; /* struct io_uring_region_desc * */ + + struct io_uring_zcrx_offsets offsets; + __u32 zcrx_id; + __u32 rx_buf_len; + __u64 __resv[3]; +}; + static long page_size; #define AREA_SIZE (8192 * page_size) #define SEND_SIZE (512 * 4096) @@ -65,6 +83,8 @@ static bool cfg_oneshot; static int cfg_oneshot_recvs; static int cfg_send_size = SEND_SIZE; static struct sockaddr_in6 cfg_addr; +static unsigned int cfg_rx_buf_len; +static bool cfg_dry_run; static char *payload; static void *area_ptr; @@ -128,14 +148,28 @@ static void setup_zcrx(struct io_uring *ring) if (!ifindex) error(1, 0, "bad interface name: %s", cfg_ifname); - area_ptr = mmap(NULL, - AREA_SIZE, - PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, - 0, - 0); - if (area_ptr == MAP_FAILED) - error(1, 0, "mmap(): zero copy area"); + if (cfg_rx_buf_len && cfg_rx_buf_len != page_size) { + area_ptr = mmap(NULL, + AREA_SIZE, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE | + MAP_HUGETLB | MAP_HUGE_2MB, + -1, + 0); + if (area_ptr == MAP_FAILED) { + printf("Can't allocate huge pages\n"); + exit(SKIP_CODE); + } + } else { + area_ptr = mmap(NULL, + AREA_SIZE, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, + 0, + 0); + if (area_ptr == MAP_FAILED) + error(1, 0, "mmap(): zero copy area"); + } ring_size = get_refill_ring_size(rq_entries); ring_ptr = mmap(NULL, @@ -157,17 +191,23 @@ static void setup_zcrx(struct io_uring *ring) .flags = 0, }; - struct io_uring_zcrx_ifq_reg reg = { + struct t_io_uring_zcrx_ifq_reg reg = { .if_idx = ifindex, .if_rxq = cfg_queue_id, .rq_entries = rq_entries, .area_ptr = (__u64)(unsigned long)&area_reg, .region_ptr = (__u64)(unsigned long)®ion_reg, + .rx_buf_len = cfg_rx_buf_len, }; - ret = io_uring_register_ifq(ring, ®); - if (ret) + ret = io_uring_register_ifq(ring, (void *)®); + if (cfg_rx_buf_len && (ret == -EINVAL || ret == -EOPNOTSUPP || + ret == -ERANGE)) { + printf("Large chunks are not supported %i\n", ret); + exit(SKIP_CODE); + } else if (ret) { error(1, 0, "io_uring_register_ifq(): %d", ret); + } rq_ring.khead = (unsigned int *)((char *)ring_ptr + reg.offsets.head); rq_ring.ktail = (unsigned int *)((char *)ring_ptr + reg.offsets.tail); @@ -323,6 +363,8 @@ static void run_server(void) io_uring_queue_init(512, &ring, flags); setup_zcrx(&ring); + if (cfg_dry_run) + return; add_accept(&ring, fd); @@ -383,7 +425,7 @@ static void parse_opts(int argc, char **argv) usage(argv[0]); cfg_payload_len = max_payload_len; - while ((c = getopt(argc, argv, "sch:p:l:i:q:o:z:")) != -1) { + while ((c = getopt(argc, argv, "sch:p:l:i:q:o:z:x:d")) != -1) { switch (c) { case 's': if (cfg_client) @@ -418,6 +460,12 @@ static void parse_opts(int argc, char **argv) case 'z': cfg_send_size = strtoul(optarg, NULL, 0); break; + case 'x': + cfg_rx_buf_len = page_size * strtoul(optarg, NULL, 0); + break; + case 'd': + cfg_dry_run = true; + break; } } diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py index 712c806508b5..7f596a33eb2b 100755 --- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py @@ -7,6 +7,7 @@ from lib.py import ksft_run, ksft_exit, KsftSkipEx from lib.py import NetDrvEpEnv from lib.py import bkg, cmd, defer, ethtool, rand_port, wait_port_listen +SKIP_CODE = 42 def _get_current_settings(cfg): output = ethtool(f"-g {cfg.ifname}", json=True)[0] @@ -132,6 +133,44 @@ def test_zcrx_rss(cfg) -> None: cmd(tx_cmd, host=cfg.remote) +def test_zcrx_large_chunks(cfg) -> None: + """Test zcrx with large buffer chunks.""" + + cfg.require_ipver('6') + + combined_chans = _get_combined_channels(cfg) + if combined_chans < 2: + raise KsftSkipEx('at least 2 combined channels required') + (rx_ring, hds_thresh) = _get_current_settings(cfg) + port = rand_port() + + ethtool(f"-G {cfg.ifname} tcp-data-split on") + defer(ethtool, f"-G {cfg.ifname} tcp-data-split auto") + + ethtool(f"-G {cfg.ifname} hds-thresh 0") + defer(ethtool, f"-G {cfg.ifname} hds-thresh {hds_thresh}") + + ethtool(f"-G {cfg.ifname} rx 64") + defer(ethtool, f"-G {cfg.ifname} rx {rx_ring}") + + ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}") + defer(ethtool, f"-X {cfg.ifname} default") + + flow_rule_id = _set_flow_rule(cfg, port, combined_chans - 1) + defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") + + rx_cmd = f"{cfg.bin_local} -s -p {port} -i {cfg.ifname} -q {combined_chans - 1} -x 2" + tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {port} -l 12840" + + probe = cmd(rx_cmd + " -d", fail=False) + if probe.ret == SKIP_CODE: + raise KsftSkipEx(probe.stdout) + + with bkg(rx_cmd, exit_wait=True): + wait_port_listen(port, proto="tcp") + cmd(tx_cmd, host=cfg.remote) + + def main() -> None: with NetDrvEpEnv(__file__) as cfg: cfg.bin_local = path.abspath(path.dirname(__file__) + "/../../../drivers/net/hw/iou-zcrx") -- cgit v1.2.3 From e5566f6b1d13e9bc0a458babb880916e212c45fb Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 11 Jan 2026 14:08:09 +0200 Subject: selftests: fib-onlink: Remove "wrong nexthop device" IPv4 tests According to the test description, these tests fail because of a wrong nexthop device: # ./fib-onlink-tests.sh -v [...] COMMAND: ip ro add table 254 169.254.101.102/32 via 169.254.3.1 dev veth1 onlink Error: Nexthop has invalid gateway. TEST: Gateway resolves to wrong nexthop device [ OK ] COMMAND: ip ro add table 1101 169.254.102.103/32 via 169.254.7.1 dev veth5 onlink Error: Nexthop has invalid gateway. TEST: Gateway resolves to wrong nexthop device - VRF [ OK ] [...] But this is incorrect. They fail because the gateway addresses are local addresses: # ip -4 address show [...] 28: veth3@if27: mtu 1500 qdisc noqueue state UP group default qlen 1000 link-netns peer_ns-Urqh3o inet 169.254.3.1/24 scope global veth3 [...] 32: veth7@if31: mtu 1500 qdisc noqueue master lisa state UP group default qlen 1000 link-netns peer_ns-Urqh3o inet 169.254.7.1/24 scope global veth7 Therefore, using a local address that matches the nexthop device fails as well: # ip ro add table 254 169.254.101.102/32 via 169.254.3.1 dev veth3 onlink Error: Nexthop has invalid gateway. Using a gateway address with a "wrong" nexthop device is actually valid and allowed: # ip route get 169.254.1.2 169.254.1.2 dev veth1 src 169.254.1.1 uid 0 # ip ro add table 254 169.254.101.102/32 via 169.254.1.2 dev veth3 onlink # echo $? 0 Remove these tests given that their output is confusing and that the scenario that they are testing is already covered by other tests. A subsequent patch will add tests for the nexthop device mismatch scenario. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/20260111120813.159799-2-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib-onlink-tests.sh | 6 ------ 1 file changed, 6 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/fib-onlink-tests.sh b/tools/testing/selftests/net/fib-onlink-tests.sh index ec2d6ceb1f08..1bb1c2289650 100755 --- a/tools/testing/selftests/net/fib-onlink-tests.sh +++ b/tools/testing/selftests/net/fib-onlink-tests.sh @@ -315,12 +315,6 @@ invalid_onlink_ipv4() "Invalid gw - local unicast address, VRF" run_ip 254 ${TEST_NET4[1]}.101 ${V4ADDRS[p1]} "" 2 "No nexthop device given" - - run_ip 254 ${TEST_NET4[1]}.102 ${V4ADDRS[p3]} ${NETIFS[p1]} 2 \ - "Gateway resolves to wrong nexthop device" - - run_ip ${VRF_TABLE} ${TEST_NET4[2]}.103 ${V4ADDRS[p7]} ${NETIFS[p5]} 2 \ - "Gateway resolves to wrong nexthop device - VRF" } ################################################################################ -- cgit v1.2.3 From 0a3419f4ba407b9624c315bfd6f0056caf536898 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 11 Jan 2026 14:08:10 +0200 Subject: selftests: fib-onlink: Remove "wrong nexthop device" IPv6 tests The command in the test fails as expected because IPv6 forbids a nexthop device mismatch: # ./fib-onlink-tests.sh -v [...] COMMAND: ip -6 ro add table 1101 2001:db8:102::103/128 via 2001:db8:701::64 dev veth5 onlink Error: Nexthop has invalid gateway or device mismatch. TEST: Gateway resolves to wrong nexthop device - VRF [ OK ] [...] Where: # ip route get 2001:db8:701::64 vrf lisa 2001:db8:701::64 dev veth7 table 1101 proto kernel src 2001:db8:701::1 metric 256 pref medium This is in contrast to IPv4 where a nexthop device mismatch is allowed when "onlink" is specified: # ip route get 169.254.7.2 vrf lisa 169.254.7.2 dev veth7 table 1101 src 169.254.7.1 uid 0 # ip ro add table 1101 169.254.102.103/32 via 169.254.7.2 dev veth5 onlink # echo $? 0 Remove these tests in preparation for aligning IPv6 with IPv4 and allowing nexthop device mismatch when "onlink" is specified. A subsequent patch will add tests that verify that both address families allow a nexthop device mismatch with "onlink". Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/20260111120813.159799-3-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib-onlink-tests.sh | 7 ------- 1 file changed, 7 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/fib-onlink-tests.sh b/tools/testing/selftests/net/fib-onlink-tests.sh index 1bb1c2289650..63477be859e3 100755 --- a/tools/testing/selftests/net/fib-onlink-tests.sh +++ b/tools/testing/selftests/net/fib-onlink-tests.sh @@ -432,13 +432,6 @@ invalid_onlink_ipv6() run_ip6 254 ${TEST_NET6[1]}::101 ${V6ADDRS[p1]} "" 2 \ "No nexthop device given" - - # default VRF validation is done against LOCAL table - # run_ip6 254 ${TEST_NET6[1]}::102 ${V6ADDRS[p3]/::[0-9]/::64} ${NETIFS[p1]} 2 \ - # "Gateway resolves to wrong nexthop device" - - run_ip6 ${VRF_TABLE} ${TEST_NET6[2]}::103 ${V6ADDRS[p7]/::[0-9]/::64} ${NETIFS[p5]} 2 \ - "Gateway resolves to wrong nexthop device - VRF" } run_onlink_tests() -- cgit v1.2.3 From 9bf8345fb38ab9bd771bd430073cbd3e912fcf75 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 11 Jan 2026 14:08:11 +0200 Subject: selftests: fib-onlink: Add a test case for IPv4 multicast gateway A multicast gateway address should be rejected when "onlink" is specified, but it is only tested as part of the IPv6 tests. Add an equivalent IPv4 test. # ./fib-onlink-tests.sh -v [...] COMMAND: ip ro add table 254 169.254.101.12/32 via 233.252.0.1 dev veth1 onlink Error: Nexthop has invalid gateway. TEST: Invalid gw - multicast address [ OK ] [...] COMMAND: ip ro add table 1101 169.254.102.12/32 via 233.252.0.1 dev veth5 onlink Error: Nexthop has invalid gateway. TEST: Invalid gw - multicast address, VRF [ OK ] [...] Tests passed: 37 Tests failed: 0 Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/20260111120813.159799-4-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib-onlink-tests.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/fib-onlink-tests.sh b/tools/testing/selftests/net/fib-onlink-tests.sh index 63477be859e3..7a0fd7a91e4e 100755 --- a/tools/testing/selftests/net/fib-onlink-tests.sh +++ b/tools/testing/selftests/net/fib-onlink-tests.sh @@ -72,7 +72,8 @@ declare -A TEST_NET4IN6IN6 TEST_NET4IN6[1]=10.1.1.254 TEST_NET4IN6[2]=10.2.1.254 -# mcast address +# mcast addresses +MCAST4=233.252.0.1 MCAST6=ff02::1 VRF=lisa @@ -310,9 +311,13 @@ invalid_onlink_ipv4() { run_ip 254 ${TEST_NET4[1]}.11 ${V4ADDRS[p1]} ${NETIFS[p1]} 2 \ "Invalid gw - local unicast address" + run_ip 254 ${TEST_NET4[1]}.12 ${MCAST4} ${NETIFS[p1]} 2 \ + "Invalid gw - multicast address" run_ip ${VRF_TABLE} ${TEST_NET4[2]}.11 ${V4ADDRS[p5]} ${NETIFS[p5]} 2 \ "Invalid gw - local unicast address, VRF" + run_ip ${VRF_TABLE} ${TEST_NET4[2]}.12 ${MCAST4} ${NETIFS[p5]} 2 \ + "Invalid gw - multicast address, VRF" run_ip 254 ${TEST_NET4[1]}.101 ${V4ADDRS[p1]} "" 2 "No nexthop device given" } -- cgit v1.2.3 From f8f9ee9d8b2ed1f29309399020f2fc30f7f93035 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 11 Jan 2026 14:08:13 +0200 Subject: selftests: fib-onlink: Add test cases for nexthop device mismatch Add test cases that verify that when the "onlink" keyword is specified, both address families (with and without VRF) accept routes with a gateway address that is reachable via a different interface than the one specified. Output without "ipv6: Allow for nexthop device mismatch with "onlink"": # ./fib-onlink-tests.sh | grep mismatch TEST: nexthop device mismatch [ OK ] TEST: nexthop device mismatch [ OK ] TEST: nexthop device mismatch [FAIL] TEST: nexthop device mismatch [FAIL] Output with "ipv6: Allow for nexthop device mismatch with "onlink"": # ./fib-onlink-tests.sh | grep mismatch TEST: nexthop device mismatch [ OK ] TEST: nexthop device mismatch [ OK ] TEST: nexthop device mismatch [ OK ] TEST: nexthop device mismatch [ OK ] That is, the IPv4 tests were always passing, but the IPv6 ones only pass after the specified patch. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/20260111120813.159799-6-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib-onlink-tests.sh | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/fib-onlink-tests.sh b/tools/testing/selftests/net/fib-onlink-tests.sh index 7a0fd7a91e4e..b5773ac8847d 100755 --- a/tools/testing/selftests/net/fib-onlink-tests.sh +++ b/tools/testing/selftests/net/fib-onlink-tests.sh @@ -271,11 +271,15 @@ valid_onlink_ipv4() run_ip 254 ${TEST_NET4[1]}.1 ${CONGW[1]} ${NETIFS[p1]} 0 "unicast connected" run_ip 254 ${TEST_NET4[1]}.2 ${RECGW4[1]} ${NETIFS[p1]} 0 "unicast recursive" + run_ip 254 ${TEST_NET4[1]}.9 ${CONGW[1]} ${NETIFS[p3]} 0 \ + "nexthop device mismatch" log_subsection "VRF ${VRF}" run_ip ${VRF_TABLE} ${TEST_NET4[2]}.1 ${CONGW[3]} ${NETIFS[p5]} 0 "unicast connected" run_ip ${VRF_TABLE} ${TEST_NET4[2]}.2 ${RECGW4[2]} ${NETIFS[p5]} 0 "unicast recursive" + run_ip ${VRF_TABLE} ${TEST_NET4[2]}.10 ${CONGW[3]} ${NETIFS[p7]} 0 \ + "nexthop device mismatch" log_subsection "VRF device, PBR table" @@ -366,12 +370,16 @@ valid_onlink_ipv6() run_ip6 254 ${TEST_NET6[1]}::1 ${V6ADDRS[p1]/::*}::64 ${NETIFS[p1]} 0 "unicast connected" run_ip6 254 ${TEST_NET6[1]}::2 ${RECGW6[1]} ${NETIFS[p1]} 0 "unicast recursive" run_ip6 254 ${TEST_NET6[1]}::3 ::ffff:${TEST_NET4IN6[1]} ${NETIFS[p1]} 0 "v4-mapped" + run_ip6 254 ${TEST_NET6[1]}::a ${V6ADDRS[p1]/::*}::64 ${NETIFS[p3]} 0 \ + "nexthop device mismatch" log_subsection "VRF ${VRF}" run_ip6 ${VRF_TABLE} ${TEST_NET6[2]}::1 ${V6ADDRS[p5]/::*}::64 ${NETIFS[p5]} 0 "unicast connected" run_ip6 ${VRF_TABLE} ${TEST_NET6[2]}::2 ${RECGW6[2]} ${NETIFS[p5]} 0 "unicast recursive" run_ip6 ${VRF_TABLE} ${TEST_NET6[2]}::3 ::ffff:${TEST_NET4IN6[2]} ${NETIFS[p5]} 0 "v4-mapped" + run_ip6 ${VRF_TABLE} ${TEST_NET6[2]}::b ${V6ADDRS[p5]/::*}::64 \ + ${NETIFS[p7]} 0 "nexthop device mismatch" log_subsection "VRF device, PBR table" -- cgit v1.2.3 From 9d48c62f6b4ed70ebeea70f52ddb1c6d8613bed4 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 12 Jan 2026 19:37:14 +0200 Subject: selftests: drv-net: fix RPS mask handling in toeplitz test The toeplitz.py test passed the hex mask without "0x" prefix (e.g., "300" for CPUs 8,9). The toeplitz.c strtoul() call wrongly parsed this as decimal 300 (0x12c) instead of hex 0x300. Pass the prefixed mask to toeplitz.c, and the unprefixed one to sysfs. Fixes: 9cf9aa77a1f6 ("selftests: drv-net: hw: convert the Toeplitz test to Python") Reviewed-by: Nimrod Oren Signed-off-by: Gal Pressman Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260112173715.384843-2-gal@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/toeplitz.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/drivers/net/hw/toeplitz.py b/tools/testing/selftests/drivers/net/hw/toeplitz.py index d2db5ee9e358..d288c57894f6 100755 --- a/tools/testing/selftests/drivers/net/hw/toeplitz.py +++ b/tools/testing/selftests/drivers/net/hw/toeplitz.py @@ -94,12 +94,14 @@ def _configure_rps(cfg, rps_cpus): mask = 0 for cpu in rps_cpus: mask |= (1 << cpu) - mask = hex(mask)[2:] + + mask = hex(mask) # Set RPS bitmap for all rx queues for rps_file in glob.glob(f"/sys/class/net/{cfg.ifname}/queues/rx-*/rps_cpus"): with open(rps_file, "w", encoding="utf-8") as fp: - fp.write(mask) + # sysfs expects hex without '0x' prefix, toeplitz.c needs the prefix + fp.write(mask[2:]) return mask -- cgit v1.2.3 From cf055f8c000445aa688c53a706ef4f580818eedb Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 12 Jan 2026 19:37:15 +0200 Subject: selftests: drv-net: fix RPS mask handling for high CPU numbers The RPS bitmask bounds check uses ~(RPS_MAX_CPUS - 1) which equals ~15 = 0xfff0, only allowing CPUs 0-3. Change the mask to ~((1UL << RPS_MAX_CPUS) - 1) = ~0xffff to allow CPUs 0-15. Fixes: 5ebfb4cc3048 ("selftests/net: toeplitz test") Reviewed-by: Nimrod Oren Signed-off-by: Gal Pressman Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260112173715.384843-3-gal@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/toeplitz.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/drivers/net/hw/toeplitz.c b/tools/testing/selftests/drivers/net/hw/toeplitz.c index d23b3b0c20a3..285bb17df9c2 100644 --- a/tools/testing/selftests/drivers/net/hw/toeplitz.c +++ b/tools/testing/selftests/drivers/net/hw/toeplitz.c @@ -485,8 +485,8 @@ static void parse_rps_bitmap(const char *arg) bitmap = strtoul(arg, NULL, 0); - if (bitmap & ~(RPS_MAX_CPUS - 1)) - error(1, 0, "rps bitmap 0x%lx out of bounds 0..%lu", + if (bitmap & ~((1UL << RPS_MAX_CPUS) - 1)) + error(1, 0, "rps bitmap 0x%lx out of bounds, max cpu %lu", bitmap, RPS_MAX_CPUS - 1); for (i = 0; i < RPS_MAX_CPUS; i++) -- cgit v1.2.3 From f8ade2342e22e7dbc71af496f07c900f8c69dd54 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Tue, 13 Jan 2026 08:39:47 +0000 Subject: bpf: return PTR_TO_BTF_ID | PTR_TRUSTED from BPF kfuncs by default Teach the BPF verifier to treat pointers to struct types returned from BPF kfuncs as implicitly trusted (PTR_TO_BTF_ID | PTR_TRUSTED) by default. Returning untrusted pointers to struct types from BPF kfuncs should be considered an exception only, and certainly not the norm. Update existing selftests to reflect the change in register type printing (e.g. `ptr_` becoming `trusted_ptr_` in verifier error messages). Link: https://lore.kernel.org/bpf/aV4nbCaMfIoM0awM@google.com/ Signed-off-by: Matt Bobrowski Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260113083949.2502978-1-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 46 ++++++++++++++-------- tools/testing/selftests/bpf/progs/map_kptr_fail.c | 4 +- .../struct_ops_kptr_return_fail__wrong_type.c | 2 +- .../selftests/bpf/progs/verifier_global_ptr_args.c | 2 +- tools/testing/selftests/bpf/verifier/calls.c | 2 +- 5 files changed, 34 insertions(+), 22 deletions(-) (limited to 'tools/testing') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 45733bae271d..faa1ecc1fe9d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14212,26 +14212,38 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (is_kfunc_rcu_protected(&meta)) regs[BPF_REG_0].type |= MEM_RCU; } else { - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].btf = desc_btf; - regs[BPF_REG_0].type = PTR_TO_BTF_ID; - regs[BPF_REG_0].btf_id = ptr_type_id; + enum bpf_reg_type type = PTR_TO_BTF_ID; if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache]) - regs[BPF_REG_0].type |= PTR_UNTRUSTED; - else if (is_kfunc_rcu_protected(&meta)) - regs[BPF_REG_0].type |= MEM_RCU; - - if (is_iter_next_kfunc(&meta)) { - struct bpf_reg_state *cur_iter; - - cur_iter = get_iter_from_state(env->cur_state, &meta); - - if (cur_iter->type & MEM_RCU) /* KF_RCU_PROTECTED */ - regs[BPF_REG_0].type |= MEM_RCU; - else - regs[BPF_REG_0].type |= PTR_TRUSTED; + type |= PTR_UNTRUSTED; + else if (is_kfunc_rcu_protected(&meta) || + (is_iter_next_kfunc(&meta) && + (get_iter_from_state(env->cur_state, &meta) + ->type & MEM_RCU))) { + /* + * If the iterator's constructor (the _new + * function e.g., bpf_iter_task_new) has been + * annotated with BPF kfunc flag + * KF_RCU_PROTECTED and was called within a RCU + * read-side critical section, also propagate + * the MEM_RCU flag to the pointer returned from + * the iterator's next function (e.g., + * bpf_iter_task_next). + */ + type |= MEM_RCU; + } else { + /* + * Any PTR_TO_BTF_ID that is returned from a BPF + * kfunc should by default be treated as + * implicitly trusted. + */ + type |= PTR_TRUSTED; } + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].btf = desc_btf; + regs[BPF_REG_0].type = type; + regs[BPF_REG_0].btf_id = ptr_type_id; } if (is_kfunc_ret_null(&meta)) { diff --git a/tools/testing/selftests/bpf/progs/map_kptr_fail.c b/tools/testing/selftests/bpf/progs/map_kptr_fail.c index 4c0ff01f1a96..6443b320c732 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr_fail.c +++ b/tools/testing/selftests/bpf/progs/map_kptr_fail.c @@ -272,7 +272,7 @@ int reject_untrusted_xchg(struct __sk_buff *ctx) SEC("?tc") __failure -__msg("invalid kptr access, R2 type=ptr_prog_test_ref_kfunc expected=ptr_prog_test_member") +__msg("invalid kptr access, R2 type=trusted_ptr_prog_test_ref_kfunc expected=ptr_prog_test_member") int reject_bad_type_xchg(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *ref_ptr; @@ -291,7 +291,7 @@ int reject_bad_type_xchg(struct __sk_buff *ctx) } SEC("?tc") -__failure __msg("invalid kptr access, R2 type=ptr_prog_test_ref_kfunc") +__failure __msg("invalid kptr access, R2 type=trusted_ptr_prog_test_ref_kfunc") int reject_member_of_ref_xchg(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *ref_ptr; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_kptr_return_fail__wrong_type.c b/tools/testing/selftests/bpf/progs/struct_ops_kptr_return_fail__wrong_type.c index 6a2dd5367802..c8d217e89eea 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_kptr_return_fail__wrong_type.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_kptr_return_fail__wrong_type.c @@ -12,7 +12,7 @@ void bpf_task_release(struct task_struct *p) __ksym; * reject programs returning a referenced kptr of the wrong type. */ SEC("struct_ops/test_return_ref_kptr") -__failure __msg("At program exit the register R0 is not a known value (ptr_or_null_)") +__failure __msg("At program exit the register R0 is not a known value (trusted_ptr_or_null_)") struct task_struct *BPF_PROG(kptr_return_fail__wrong_type, int dummy, struct task_struct *task, struct cgroup *cgrp) { diff --git a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c index 1204fbc58178..e7dae0cf9c17 100644 --- a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c +++ b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c @@ -72,7 +72,7 @@ int trusted_task_arg_nonnull_fail1(void *ctx) SEC("?tp_btf/task_newtask") __failure __log_level(2) -__msg("R1 type=ptr_or_null_ expected=ptr_, trusted_ptr_, rcu_ptr_") +__msg("R1 type=trusted_ptr_or_null_ expected=ptr_, trusted_ptr_, rcu_ptr_") __msg("Caller passes invalid args into func#1 ('subprog_trusted_task_nonnull')") int trusted_task_arg_nonnull_fail2(void *ctx) { diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index c8d640802cce..9ca83dce100d 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -220,7 +220,7 @@ }, .result_unpriv = REJECT, .result = REJECT, - .errstr = "variable ptr_ access var_off=(0x0; 0x7) disallowed", + .errstr = "variable trusted_ptr_ access var_off=(0x0; 0x7) disallowed", }, { "calls: invalid kfunc call: referenced arg needs refcounted PTR_TO_BTF_ID", -- cgit v1.2.3 From bbdbed193bcf57f1e9c0d9d58c3ad3350bfd0bd1 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Tue, 13 Jan 2026 08:39:49 +0000 Subject: selftests/bpf: assert BPF kfunc default trusted pointer semantics The BPF verifier was recently updated to treat pointers to struct types returned from BPF kfuncs as implicitly trusted by default. Add a new test case to exercise this new implicit trust semantic. The KF_ACQUIRE flag was dropped from the bpf_get_root_mem_cgroup() kfunc because it returns a global pointer to root_mem_cgroup without performing any explicit reference counting. This makes it an ideal candidate to verify the new implicit trusted pointer semantics. Signed-off-by: Matt Bobrowski Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260113083949.2502978-3-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/verifier.c | 2 ++ .../selftests/bpf/progs/verifier_memcontrol.c | 32 ++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_memcontrol.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 5829ffd70f8f..38c5ba70100c 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -61,6 +61,7 @@ #include "verifier_masking.skel.h" #include "verifier_may_goto_1.skel.h" #include "verifier_may_goto_2.skel.h" +#include "verifier_memcontrol.skel.h" #include "verifier_meta_access.skel.h" #include "verifier_movsx.skel.h" #include "verifier_mtu.skel.h" @@ -202,6 +203,7 @@ void test_verifier_map_ret_val(void) { RUN(verifier_map_ret_val); } void test_verifier_masking(void) { RUN(verifier_masking); } void test_verifier_may_goto_1(void) { RUN(verifier_may_goto_1); } void test_verifier_may_goto_2(void) { RUN(verifier_may_goto_2); } +void test_verifier_memcontrol(void) { RUN(verifier_memcontrol); } void test_verifier_meta_access(void) { RUN(verifier_meta_access); } void test_verifier_movsx(void) { RUN(verifier_movsx); } void test_verifier_mul(void) { RUN(verifier_mul); } diff --git a/tools/testing/selftests/bpf/progs/verifier_memcontrol.c b/tools/testing/selftests/bpf/progs/verifier_memcontrol.c new file mode 100644 index 000000000000..13564956f621 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_memcontrol.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2026 Google LLC. + */ + +#include +#include +#include +#include "bpf_misc.h" + +SEC("syscall") +__success __retval(0) +int root_mem_cgroup_default_trusted(void *ctx) +{ + unsigned long usage; + struct mem_cgroup *root_mem_cgroup; + + root_mem_cgroup = bpf_get_root_mem_cgroup(); + if (!root_mem_cgroup) + return 1; + + /* + * BPF kfunc bpf_get_root_mem_cgroup() returns a PTR_TO_BTF_ID | + * PTR_TRUSTED | PTR_MAYBE_NULL, therefore it should be accepted when + * passed to a BPF kfunc only accepting KF_TRUSTED_ARGS. + */ + usage = bpf_mem_cgroup_usage(root_mem_cgroup); + __sink(usage); + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From c656807675e09604af09a4b9f3ea466af91b7b7a Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Sun, 11 Jan 2026 15:30:47 +0000 Subject: selftests/bpf: Add tests for loading insn array values with offsets The ldimm64 instruction for map value supports an offset. For insn array maps it wasn't tested before, as normally such instructions aren't generated. However, this is still possible to pass such instructions, so add a few tests to check that correct offsets work properly and incorrect offsets are rejected. Signed-off-by: Anton Protopopov Link: https://lore.kernel.org/r/20260111153047.8388-4-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/bpf_gotox.c | 208 +++++++++++++++++++++ 1 file changed, 208 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c b/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c index d138cc7b1bda..75b0cf2467ab 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c @@ -240,6 +240,208 @@ static void check_nonstatic_global_other_sec(struct bpf_gotox *skel) bpf_link__destroy(link); } +/* + * The following subtests do not use skeleton rather than to check + * if the test should be skipped. + */ + +static int create_jt_map(__u32 max_entries) +{ + const char *map_name = "jt"; + __u32 key_size = 4; + __u32 value_size = sizeof(struct bpf_insn_array_value); + + return bpf_map_create(BPF_MAP_TYPE_INSN_ARRAY, map_name, + key_size, value_size, max_entries, NULL); +} + +static int prog_load(struct bpf_insn *insns, __u32 insn_cnt) +{ + return bpf_prog_load(BPF_PROG_TYPE_RAW_TRACEPOINT, NULL, "GPL", insns, insn_cnt, NULL); +} + +static int __check_ldimm64_off_prog_load(__u32 max_entries, __u32 off) +{ + struct bpf_insn insns[] = { + BPF_LD_IMM64_RAW(BPF_REG_1, BPF_PSEUDO_MAP_VALUE, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int map_fd, ret; + + map_fd = create_jt_map(max_entries); + if (!ASSERT_GE(map_fd, 0, "create_jt_map")) + return -1; + if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze")) { + close(map_fd); + return -1; + } + + insns[0].imm = map_fd; + insns[1].imm = off; + + ret = prog_load(insns, ARRAY_SIZE(insns)); + close(map_fd); + return ret; +} + +/* + * Check that loads from an instruction array map are only allowed with offsets + * which are multiples of 8 and do not point to outside of the map. + */ +static void check_ldimm64_off_load(struct bpf_gotox *skel __always_unused) +{ + const __u32 max_entries = 10; + int prog_fd; + __u32 off; + + for (off = 0; off < max_entries; off++) { + prog_fd = __check_ldimm64_off_prog_load(max_entries, off * 8); + if (!ASSERT_GE(prog_fd, 0, "__check_ldimm64_off_prog_load")) + return; + close(prog_fd); + } + + prog_fd = __check_ldimm64_off_prog_load(max_entries, 7 /* not a multiple of 8 */); + if (!ASSERT_EQ(prog_fd, -EACCES, "__check_ldimm64_off_prog_load: should be -EACCES")) { + close(prog_fd); + return; + } + + prog_fd = __check_ldimm64_off_prog_load(max_entries, max_entries * 8 /* too large */); + if (!ASSERT_EQ(prog_fd, -EACCES, "__check_ldimm64_off_prog_load: should be -EACCES")) { + close(prog_fd); + return; + } +} + +static int __check_ldimm64_gotox_prog_load(struct bpf_insn *insns, + __u32 insn_cnt, + __u32 off1, __u32 off2) +{ + const __u32 values[] = {5, 7, 9, 11, 13, 15}; + const __u32 max_entries = ARRAY_SIZE(values); + struct bpf_insn_array_value val = {}; + int map_fd, ret, i; + + map_fd = create_jt_map(max_entries); + if (!ASSERT_GE(map_fd, 0, "create_jt_map")) + return -1; + + for (i = 0; i < max_entries; i++) { + val.orig_off = values[i]; + if (!ASSERT_EQ(bpf_map_update_elem(map_fd, &i, &val, 0), 0, + "bpf_map_update_elem")) { + close(map_fd); + return -1; + } + } + + if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze")) { + close(map_fd); + return -1; + } + + /* r1 = &map + offset1 */ + insns[0].imm = map_fd; + insns[1].imm = off1; + + /* r1 += off2 */ + insns[2].imm = off2; + + ret = prog_load(insns, insn_cnt); + close(map_fd); + return ret; +} + +static void reject_offsets(struct bpf_insn *insns, __u32 insn_cnt, __u32 off1, __u32 off2) +{ + int prog_fd; + + prog_fd = __check_ldimm64_gotox_prog_load(insns, insn_cnt, off1, off2); + if (!ASSERT_EQ(prog_fd, -EACCES, "__check_ldimm64_gotox_prog_load")) + close(prog_fd); +} + +/* + * Verify a bit more complex programs which include indirect jumps + * and with jump tables loaded with a non-zero offset + */ +static void check_ldimm64_off_gotox(struct bpf_gotox *skel __always_unused) +{ + struct bpf_insn insns[] = { + /* + * The following instructions perform an indirect jump to + * labels below. Thus valid offsets in the map are {0,...,5}. + * The program rewrites the offsets in the instructions below: + * r1 = &map + offset1 + * r1 += offset2 + * r1 = *r1 + * gotox r1 + */ + BPF_LD_IMM64_RAW(BPF_REG_1, BPF_PSEUDO_MAP_VALUE, 0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0), + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_1, 0, 0, 0), + + /* case 0: */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + /* case 1: */ + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + /* case 2: */ + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + /* case 3: */ + BPF_MOV64_IMM(BPF_REG_0, 3), + BPF_EXIT_INSN(), + /* case 4: */ + BPF_MOV64_IMM(BPF_REG_0, 4), + BPF_EXIT_INSN(), + /* default: */ + BPF_MOV64_IMM(BPF_REG_0, 5), + BPF_EXIT_INSN(), + }; + int prog_fd, err; + __u32 off1, off2; + + /* allow all combinations off1 + off2 < 6 */ + for (off1 = 0; off1 < 6; off1++) { + for (off2 = 0; off1 + off2 < 6; off2++) { + LIBBPF_OPTS(bpf_test_run_opts, topts); + + prog_fd = __check_ldimm64_gotox_prog_load(insns, ARRAY_SIZE(insns), + off1 * 8, off2 * 8); + if (!ASSERT_GE(prog_fd, 0, "__check_ldimm64_gotox_prog_load")) + return; + + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) { + close(prog_fd); + return; + } + + if (!ASSERT_EQ(topts.retval, off1 + off2, "test_run_opts retval")) { + close(prog_fd); + return; + } + + close(prog_fd); + } + } + + /* reject off1 + off2 >= 6 */ + reject_offsets(insns, ARRAY_SIZE(insns), 8 * 3, 8 * 3); + reject_offsets(insns, ARRAY_SIZE(insns), 8 * 7, 8 * 0); + reject_offsets(insns, ARRAY_SIZE(insns), 8 * 0, 8 * 7); + + /* reject (off1 + off2) % 8 != 0 */ + reject_offsets(insns, ARRAY_SIZE(insns), 3, 3); + reject_offsets(insns, ARRAY_SIZE(insns), 7, 0); + reject_offsets(insns, ARRAY_SIZE(insns), 0, 7); +} + void test_bpf_gotox(void) { struct bpf_gotox *skel; @@ -288,5 +490,11 @@ void test_bpf_gotox(void) if (test__start_subtest("one-map-two-jumps")) __subtest(skel, check_one_map_two_jumps); + if (test__start_subtest("check-ldimm64-off")) + __subtest(skel, check_ldimm64_off_load); + + if (test__start_subtest("check-ldimm64-off-gotox")) + __subtest(skel, check_ldimm64_off_gotox); + bpf_gotox__destroy(skel); } -- cgit v1.2.3 From 7158fc54b2c6f124eec0d7cd13bff69da0172e59 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 30 Dec 2025 08:08:44 +0100 Subject: vdso: Remove struct getcpu_cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cache parameter of getcpu() is useless nowadays for various reasons. * It is never passed by userspace for either the vDSO or syscalls. * It is never used by the kernel. * It could not be made to work on the current vDSO architecture. * The structure definition is not part of the UAPI headers. * vdso_getcpu() is superseded by restartable sequences in any case. Remove the struct and its header. As a side-effect this gets rid of an unwanted inclusion of the linux/ header namespace from vDSO code. [ tglx: Adapt to s390 upstream changes */ Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Acked-by: Arnd Bergmann Acked-by: Heiko Carstens # s390 Link: https://patch.msgid.link/20251230-getcpu_cache-v3-1-fb9c5f880ebe@linutronix.de --- arch/loongarch/vdso/vgetcpu.c | 5 ++--- arch/s390/kernel/vdso/getcpu.c | 3 +-- arch/s390/kernel/vdso/vdso.h | 4 +--- arch/x86/entry/vdso/vgetcpu.c | 5 ++--- arch/x86/include/asm/vdso/processor.h | 4 +--- include/linux/getcpu.h | 19 ------------------- include/linux/syscalls.h | 3 +-- kernel/sys.c | 4 +--- tools/testing/selftests/vDSO/vdso_test_getcpu.c | 4 +--- 9 files changed, 10 insertions(+), 41 deletions(-) delete mode 100644 include/linux/getcpu.h (limited to 'tools/testing') diff --git a/arch/loongarch/vdso/vgetcpu.c b/arch/loongarch/vdso/vgetcpu.c index 73af49242ecd..6f054ec898c7 100644 --- a/arch/loongarch/vdso/vgetcpu.c +++ b/arch/loongarch/vdso/vgetcpu.c @@ -4,7 +4,6 @@ */ #include -#include static __always_inline int read_cpu_id(void) { @@ -28,8 +27,8 @@ static __always_inline int read_cpu_id(void) } extern -int __vdso_getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *unused); -int __vdso_getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *unused) +int __vdso_getcpu(unsigned int *cpu, unsigned int *node, void *unused); +int __vdso_getcpu(unsigned int *cpu, unsigned int *node, void *unused) { int cpu_id; diff --git a/arch/s390/kernel/vdso/getcpu.c b/arch/s390/kernel/vdso/getcpu.c index 5c5d4a848b76..1e17665616c5 100644 --- a/arch/s390/kernel/vdso/getcpu.c +++ b/arch/s390/kernel/vdso/getcpu.c @@ -2,11 +2,10 @@ /* Copyright IBM Corp. 2020 */ #include -#include #include #include "vdso.h" -int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) +int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, void *unused) { union tod_clock clk; diff --git a/arch/s390/kernel/vdso/vdso.h b/arch/s390/kernel/vdso/vdso.h index 8cff033dd854..1fe52a6f5a56 100644 --- a/arch/s390/kernel/vdso/vdso.h +++ b/arch/s390/kernel/vdso/vdso.h @@ -4,9 +4,7 @@ #include -struct getcpu_cache; - -int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused); +int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, void *unused); int __s390_vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz); int __s390_vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts); int __s390_vdso_clock_getres(clockid_t clock, struct __kernel_timespec *ts); diff --git a/arch/x86/entry/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c index e4640306b2e3..6381b472b7c5 100644 --- a/arch/x86/entry/vdso/vgetcpu.c +++ b/arch/x86/entry/vdso/vgetcpu.c @@ -6,17 +6,16 @@ */ #include -#include #include #include notrace long -__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) +__vdso_getcpu(unsigned *cpu, unsigned *node, void *unused) { vdso_read_cpunode(cpu, node); return 0; } -long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) +long getcpu(unsigned *cpu, unsigned *node, void *tcache) __attribute__((weak, alias("__vdso_getcpu"))); diff --git a/arch/x86/include/asm/vdso/processor.h b/arch/x86/include/asm/vdso/processor.h index 7000aeb59aa2..93e0e24e5cb4 100644 --- a/arch/x86/include/asm/vdso/processor.h +++ b/arch/x86/include/asm/vdso/processor.h @@ -18,9 +18,7 @@ static __always_inline void cpu_relax(void) native_pause(); } -struct getcpu_cache; - -notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused); +notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, void *unused); #endif /* __ASSEMBLER__ */ diff --git a/include/linux/getcpu.h b/include/linux/getcpu.h deleted file mode 100644 index c304dcdb4eac..000000000000 --- a/include/linux/getcpu.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_GETCPU_H -#define _LINUX_GETCPU_H 1 - -/* Cache for getcpu() to speed it up. Results might be a short time - out of date, but will be faster. - - User programs should not refer to the contents of this structure. - I repeat they should not refer to it. If they do they will break - in future kernels. - - It is only a private cache for vgetcpu(). It will change in future kernels. - The user program must store this information per thread (__thread) - If you want 100% accurate information pass NULL instead. */ -struct getcpu_cache { - unsigned long blob[128 / sizeof(long)]; -}; - -#endif diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index cf84d98964b2..23704e006afd 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -59,7 +59,6 @@ struct compat_stat; struct old_timeval32; struct robust_list_head; struct futex_waitv; -struct getcpu_cache; struct old_linux_dirent; struct perf_event_attr; struct file_handle; @@ -718,7 +717,7 @@ asmlinkage long sys_getrusage(int who, struct rusage __user *ru); asmlinkage long sys_umask(int mask); asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5); -asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, struct getcpu_cache __user *cache); +asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, void __user *cache); asmlinkage long sys_gettimeofday(struct __kernel_old_timeval __user *tv, struct timezone __user *tz); asmlinkage long sys_settimeofday(struct __kernel_old_timeval __user *tv, diff --git a/kernel/sys.c b/kernel/sys.c index 8b58eece4e58..f1780ab132a3 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include @@ -2876,8 +2875,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return error; } -SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, - struct getcpu_cache __user *, unused) +SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, void __user *, unused) { int err = 0; int cpu = raw_smp_processor_id(); diff --git a/tools/testing/selftests/vDSO/vdso_test_getcpu.c b/tools/testing/selftests/vDSO/vdso_test_getcpu.c index bea8ad54da11..3fe49cbdae98 100644 --- a/tools/testing/selftests/vDSO/vdso_test_getcpu.c +++ b/tools/testing/selftests/vDSO/vdso_test_getcpu.c @@ -16,9 +16,7 @@ #include "vdso_config.h" #include "vdso_call.h" -struct getcpu_cache; -typedef long (*getcpu_t)(unsigned int *, unsigned int *, - struct getcpu_cache *); +typedef long (*getcpu_t)(unsigned int *, unsigned int *, void *); int main(int argc, char **argv) { -- cgit v1.2.3 From f756ed82c62aa2725757ac011710492d4cc8c7d8 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 13 Jan 2026 17:14:56 +0000 Subject: KVM: selftests: Slightly simplify memstress_setup_nested() Instead of calling memstress_setup_ept_mappings() only in the first iteration in the loop, move it before the loop. The call needed to happen within the loop before commit e40e72fec0de ("KVM: selftests: Stop passing VMX metadata to TDP mapping functions"), as memstress_setup_ept_mappings() used to take in a pointer to vmx_pages and pass it into tdp_identity_map_1g() (to get the EPT root GPA). This is no longer the case, as tdp_identity_map_1g() gets the EPT root through stage2 MMU. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20260113171456.2097312-1-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/x86/memstress.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c index 86f4c5e4c430..f53414ba7103 100644 --- a/tools/testing/selftests/kvm/lib/x86/memstress.c +++ b/tools/testing/selftests/kvm/lib/x86/memstress.c @@ -110,16 +110,13 @@ void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vc TEST_REQUIRE(kvm_cpu_has_tdp()); vm_enable_tdp(vm); + memstress_setup_ept_mappings(vm); for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { if (kvm_cpu_has(X86_FEATURE_VMX)) vcpu_alloc_vmx(vm, &nested_gva); else vcpu_alloc_svm(vm, &nested_gva); - /* The EPTs are shared across vCPUs, setup the mappings once */ - if (vcpu_id == 0) - memstress_setup_ept_mappings(vm); - /* * Override the vCPU to run memstress_l1_guest_code() which will * bounce it into L2 before calling memstress_guest_code(). -- cgit v1.2.3 From 55058e32151f95dc5badd62381d184e89f15de99 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Sat, 10 Jan 2026 00:48:20 +0000 Subject: KVM: selftests: Add a selftests for nested VMLOAD/VMSAVE Add a test for VMLOAD/VMSAVE in an L2 guest. The test verifies that L1 intercepts for VMSAVE/VMLOAD always work regardless of VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK. Then, more interestingly, it makes sure that when L1 does not intercept VMLOAD/VMSAVE, they work as intended in L2. When VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK is enabled by L1, VMSAVE/VMLOAD from L2 should interpret the GPA as an L2 GPA and translate it through the NPT. When VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK is disabled by L1, VMSAVE/VMLOAD from L2 should interpret the GPA as an L1 GPA. To test this, put two VMCBs (0 and 1) in L1's physical address space, and have a single L2 GPA where: - L2 VMCB GPA == L1 VMCB(0) GPA - L2 VMCB GPA maps to L1 VMCB(1) via the NPT in L1. This setup allows detecting how the GPA is interpreted based on which L1 VMCB is actually accessed. In both cases, L2 sets KERNEL_GS_BASE (one of the fields handled by VMSAVE/VMLOAD), and executes VMSAVE to write its value to the VMCB. The test userspace code then checks that the write was made to the correct VMCB (based on whether VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK is set by L1), and writes a new value to that VMCB. L2 then executes VMLOAD to load the new value and makes sure it's reflected correctly in KERNERL_GS_BASE. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20260110004821.3411245-4-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 1 + .../testing/selftests/kvm/include/x86/processor.h | 1 + .../selftests/kvm/x86/nested_vmsave_vmload_test.c | 197 +++++++++++++++++++++ 3 files changed, 199 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index ffbf891b31d3..fc78d4508ebd 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -95,6 +95,7 @@ TEST_GEN_PROGS_x86 += x86/nested_exceptions_test TEST_GEN_PROGS_x86 += x86/nested_invalid_cr3_test TEST_GEN_PROGS_x86 += x86/nested_tsc_adjust_test TEST_GEN_PROGS_x86 += x86/nested_tsc_scaling_test +TEST_GEN_PROGS_x86 += x86/nested_vmsave_vmload_test TEST_GEN_PROGS_x86 += x86/platform_info_test TEST_GEN_PROGS_x86 += x86/pmu_counters_test TEST_GEN_PROGS_x86 += x86/pmu_event_filter_test diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 8f130e7d7048..6bfffc3b0a33 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -201,6 +201,7 @@ struct kvm_x86_cpu_feature { #define X86_FEATURE_TSCRATEMSR KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 4) #define X86_FEATURE_PAUSEFILTER KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 10) #define X86_FEATURE_PFTHRESHOLD KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 12) +#define X86_FEATURE_V_VMSAVE_VMLOAD KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 15) #define X86_FEATURE_VGIF KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 16) #define X86_FEATURE_IDLE_HLT KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 30) #define X86_FEATURE_SEV KVM_X86_CPU_FEATURE(0x8000001F, 0, EAX, 1) diff --git a/tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c b/tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c new file mode 100644 index 000000000000..6764a48f9d4d --- /dev/null +++ b/tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2026, Google LLC. + */ +#include "kvm_util.h" +#include "vmx.h" +#include "svm_util.h" +#include "kselftest.h" + +/* + * Allocate two VMCB pages for testing. Both pages have different GVAs (shared + * by both L1 and L2) and L1 GPAs. A single L2 GPA is used such that: + * - L2 GPA == L1 GPA for VMCB0. + * - L2 GPA is mapped to L1 GPA for VMCB1 using NPT in L1. + * + * This allows testing whether the GPA used by VMSAVE/VMLOAD in L2 is + * interpreted as a direct L1 GPA or translated using NPT as an L2 GPA, depends + * on which VMCB is accessed. + */ +#define TEST_MEM_SLOT_INDEX 1 +#define TEST_MEM_PAGES 2 +#define TEST_MEM_BASE 0xc0000000 + +#define TEST_GUEST_ADDR(idx) (TEST_MEM_BASE + (idx) * PAGE_SIZE) + +#define TEST_VMCB_L1_GPA(idx) TEST_GUEST_ADDR(idx) +#define TEST_VMCB_GVA(idx) TEST_GUEST_ADDR(idx) + +#define TEST_VMCB_L2_GPA TEST_VMCB_L1_GPA(0) + +#define L2_GUEST_STACK_SIZE 64 + +static void l2_guest_code_vmsave(void) +{ + asm volatile("vmsave %0" : : "a"(TEST_VMCB_L2_GPA) : "memory"); +} + +static void l2_guest_code_vmload(void) +{ + asm volatile("vmload %0" : : "a"(TEST_VMCB_L2_GPA) : "memory"); +} + +static void l2_guest_code_vmcb(int vmcb_idx) +{ + wrmsr(MSR_KERNEL_GS_BASE, 0xaaaa); + l2_guest_code_vmsave(); + + /* Verify the VMCB used by VMSAVE and update KERNEL_GS_BASE to 0xbbbb */ + GUEST_SYNC(vmcb_idx); + + l2_guest_code_vmload(); + GUEST_ASSERT_EQ(rdmsr(MSR_KERNEL_GS_BASE), 0xbbbb); + + /* Reset MSR_KERNEL_GS_BASE */ + wrmsr(MSR_KERNEL_GS_BASE, 0); + l2_guest_code_vmsave(); + + vmmcall(); +} + +static void l2_guest_code_vmcb0(void) +{ + l2_guest_code_vmcb(0); +} + +static void l2_guest_code_vmcb1(void) +{ + l2_guest_code_vmcb(1); +} + +static void l1_guest_code(struct svm_test_data *svm) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + /* Each test case initializes the guest RIP below */ + generic_svm_setup(svm, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* Set VMSAVE/VMLOAD intercepts and make sure they work with.. */ + svm->vmcb->control.intercept |= (BIT_ULL(INTERCEPT_VMSAVE) | + BIT_ULL(INTERCEPT_VMLOAD)); + + /* ..VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK cleared.. */ + svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + + svm->vmcb->save.rip = (u64)l2_guest_code_vmsave; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMSAVE); + + svm->vmcb->save.rip = (u64)l2_guest_code_vmload; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMLOAD); + + /* ..and VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK set */ + svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + + svm->vmcb->save.rip = (u64)l2_guest_code_vmsave; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMSAVE); + + svm->vmcb->save.rip = (u64)l2_guest_code_vmload; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMLOAD); + + /* Now clear the intercepts to test VMSAVE/VMLOAD behavior */ + svm->vmcb->control.intercept &= ~(BIT_ULL(INTERCEPT_VMSAVE) | + BIT_ULL(INTERCEPT_VMLOAD)); + + /* + * Without VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK, the GPA will be + * interpreted as an L1 GPA, so VMCB0 should be used. + */ + svm->vmcb->save.rip = (u64)l2_guest_code_vmcb0; + svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL); + + /* + * With VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK, the GPA will be interpeted as + * an L2 GPA, and translated through the NPT to VMCB1. + */ + svm->vmcb->save.rip = (u64)l2_guest_code_vmcb1; + svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t nested_gva = 0; + struct vmcb *test_vmcb[2]; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int i; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_NPT)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD)); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + vm_enable_tdp(vm); + + vcpu_alloc_svm(vm, &nested_gva); + vcpu_args_set(vcpu, 1, nested_gva); + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + TEST_MEM_BASE, TEST_MEM_SLOT_INDEX, + TEST_MEM_PAGES, 0); + + for (i = 0; i <= 1; i++) { + virt_map(vm, TEST_VMCB_GVA(i), TEST_VMCB_L1_GPA(i), 1); + test_vmcb[i] = (struct vmcb *)addr_gva2hva(vm, TEST_VMCB_GVA(i)); + } + + tdp_identity_map_default_memslots(vm); + + /* + * L2 GPA == L1_GPA(0), but map it to L1_GPA(1), to allow testing + * whether the L2 GPA is interpreted as an L1 GPA or translated through + * the NPT. + */ + TEST_ASSERT_EQ(TEST_VMCB_L2_GPA, TEST_VMCB_L1_GPA(0)); + tdp_map(vm, TEST_VMCB_L2_GPA, TEST_VMCB_L1_GPA(1), PAGE_SIZE); + + for (;;) { + struct ucall uc; + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + case UCALL_SYNC: + i = uc.args[1]; + TEST_ASSERT(i == 0 || i == 1, "Unexpected VMCB idx: %d", i); + + /* + * Check that only the expected VMCB has KERNEL_GS_BASE + * set to 0xaaaa, and update it to 0xbbbb. + */ + TEST_ASSERT_EQ(test_vmcb[i]->save.kernel_gs_base, 0xaaaa); + TEST_ASSERT_EQ(test_vmcb[1-i]->save.kernel_gs_base, 0); + test_vmcb[i]->save.kernel_gs_base = 0xbbbb; + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + +done: + kvm_vm_free(vm); + return 0; +} -- cgit v1.2.3 From 7c8e817e443c118aa303f1bbcec33df8d9e3487a Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Wed, 14 Jan 2026 16:25:44 +0000 Subject: selftests/bpf: Extend live regs tests with a test for gotox Add a test which checks that the destination register of a gotox instruction is marked as used and that the union of jump targets is considered as live. Signed-off-by: Anton Protopopov Link: https://lore.kernel.org/r/20260114162544.83253-3-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/compute_live_registers.c | 41 ++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/progs/compute_live_registers.c b/tools/testing/selftests/bpf/progs/compute_live_registers.c index 6884ab99a421..f05e120f3450 100644 --- a/tools/testing/selftests/bpf/progs/compute_live_registers.c +++ b/tools/testing/selftests/bpf/progs/compute_live_registers.c @@ -431,6 +431,47 @@ __naked void subprog1(void) ::: __clobber_all); } +#if defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64) + +SEC("socket") +__log_level(2) +__msg("2: .1........ (07) r1 += 8") +__msg("3: .1........ (79) r2 = *(u64 *)(r1 +0)") +__msg("4: ..2....... (b7) r3 = 1") +__msg("5: ..23...... (b7) r4 = 2") +__msg("6: ..234..... (0d) gotox r2") +__msg("7: ...3...... (bf) r0 = r3") +__msg("8: 0......... (95) exit") +__msg("9: ....4..... (bf) r0 = r4") +__msg("10: 0......... (95) exit") +__naked +void gotox(void) +{ + asm volatile ( + ".pushsection .jumptables,\"\",@progbits;" +"jt0_%=: .quad l0_%= - socket;" + ".quad l1_%= - socket;" + ".size jt0_%=, 16;" + ".global jt0_%=;" + ".popsection;" + + "r1 = jt0_%= ll;" + "r1 += 8;" + "r2 = *(u64 *)(r1 + 0);" + "r3 = 1;" + "r4 = 2;" + ".8byte %[gotox_r2];" +"l0_%=: r0 = r3;" + "exit;" +"l1_%=: r0 = r4;" + "exit;" + : + : __imm_insn(gotox_r2, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_2, BPF_REG_0, 0, 0)) + : __clobber_all); +} + +#endif /* __TARGET_ARCH_x86 || __TARGET_ARCH_arm64 */ + /* to retain debug info for BTF generation */ void kfunc_root(void) { -- cgit v1.2.3 From 0ace8f2db6b3b4b0677e559d1a7ab7fd625d61ec Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 5 Jan 2026 20:11:48 +0000 Subject: tools/testing/selftests: add tests for !tgt, src mremap() merges Test that mremap()'ing a VMA into a position such that the target VMA on merge is unfaulted and the source faulted is correctly performed. We cover 4 cases: 1. Previous VMA unfaulted: copied -----| v |-----------|.............| | unfaulted |(faulted VMA)| |-----------|.............| prev target = prev, expand prev to cover. 2. Next VMA unfaulted: copied -----| v |.............|-----------| |(faulted VMA)| unfaulted | |.............|-----------| next target = next, expand next to cover. 3. Both adjacent VMAs unfaulted: copied -----| v |-----------|.............|-----------| | unfaulted |(faulted VMA)| unfaulted | |-----------|.............|-----------| prev next target = prev, expand prev to cover. 4. prev unfaulted, next faulted: copied -----| v |-----------|.............|-----------| | unfaulted |(faulted VMA)| faulted | |-----------|.............|-----------| prev next target = prev, expand prev to cover. Essentially equivalent to 3, but with additional requirement that next's anon_vma is the same as the copied VMA's. Each of these are performed with MREMAP_DONTUNMAP set, which will cause a KASAN assert for UAF or an assert on zero refcount anon_vma if a bug exists with correctly propagating anon_vma state in each scenario. Link: https://lkml.kernel.org/r/f903af2930c7c2c6e0948c886b58d0f42d8e8ba3.1767638272.git.lorenzo.stoakes@oracle.com Fixes: 879bca0a2c4f ("mm/vma: fix incorrectly disallowed anonymous VMA merges") Signed-off-by: Lorenzo Stoakes Cc: David Hildenbrand (Red Hat) Cc: Jann Horn Cc: Jeongjun Park Cc: Liam Howlett Cc: Pedro Falcato Cc: Rik van Riel Cc: Vlastimil Babka Cc: Yeoreum Yun Cc: Harry Yoo Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/merge.c | 232 +++++++++++++++++++++++++++++++++++++ 1 file changed, 232 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/merge.c b/tools/testing/selftests/mm/merge.c index 363c1033cc7d..22be149f7109 100644 --- a/tools/testing/selftests/mm/merge.c +++ b/tools/testing/selftests/mm/merge.c @@ -1171,4 +1171,236 @@ TEST_F(merge, mremap_correct_placed_faulted) ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 15 * page_size); } +TEST_F(merge, mremap_faulted_to_unfaulted_prev) +{ + struct procmap_fd *procmap = &self->procmap; + unsigned int page_size = self->page_size; + char *ptr_a, *ptr_b; + + /* + * mremap() such that A and B merge: + * + * |------------| + * | \ | + * |-----------| | / |---------| + * | unfaulted | v \ | faulted | + * |-----------| / |---------| + * B \ A + */ + + /* Map VMA A into place. */ + ptr_a = mmap(&self->carveout[page_size + 3 * page_size], + 3 * page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + ASSERT_NE(ptr_a, MAP_FAILED); + /* Fault it in. */ + ptr_a[0] = 'x'; + + /* + * Now move it out of the way so we can place VMA B in position, + * unfaulted. + */ + ptr_a = mremap(ptr_a, 3 * page_size, 3 * page_size, + MREMAP_FIXED | MREMAP_MAYMOVE, &self->carveout[20 * page_size]); + ASSERT_NE(ptr_a, MAP_FAILED); + + /* Map VMA B into place. */ + ptr_b = mmap(&self->carveout[page_size], 3 * page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + ASSERT_NE(ptr_b, MAP_FAILED); + + /* + * Now move VMA A into position with MREMAP_DONTUNMAP to catch incorrect + * anon_vma propagation. + */ + ptr_a = mremap(ptr_a, 3 * page_size, 3 * page_size, + MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP, + &self->carveout[page_size + 3 * page_size]); + ASSERT_NE(ptr_a, MAP_FAILED); + + /* The VMAs should have merged. */ + ASSERT_TRUE(find_vma_procmap(procmap, ptr_b)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_b); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_b + 6 * page_size); +} + +TEST_F(merge, mremap_faulted_to_unfaulted_next) +{ + struct procmap_fd *procmap = &self->procmap; + unsigned int page_size = self->page_size; + char *ptr_a, *ptr_b; + + /* + * mremap() such that A and B merge: + * + * |---------------------------| + * | \ | + * | |-----------| / |---------| + * v | unfaulted | \ | faulted | + * |-----------| / |---------| + * B \ A + * + * Then unmap VMA A to trigger the bug. + */ + + /* Map VMA A into place. */ + ptr_a = mmap(&self->carveout[page_size], 3 * page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + ASSERT_NE(ptr_a, MAP_FAILED); + /* Fault it in. */ + ptr_a[0] = 'x'; + + /* + * Now move it out of the way so we can place VMA B in position, + * unfaulted. + */ + ptr_a = mremap(ptr_a, 3 * page_size, 3 * page_size, + MREMAP_FIXED | MREMAP_MAYMOVE, &self->carveout[20 * page_size]); + ASSERT_NE(ptr_a, MAP_FAILED); + + /* Map VMA B into place. */ + ptr_b = mmap(&self->carveout[page_size + 3 * page_size], 3 * page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + ASSERT_NE(ptr_b, MAP_FAILED); + + /* + * Now move VMA A into position with MREMAP_DONTUNMAP to catch incorrect + * anon_vma propagation. + */ + ptr_a = mremap(ptr_a, 3 * page_size, 3 * page_size, + MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP, + &self->carveout[page_size]); + ASSERT_NE(ptr_a, MAP_FAILED); + + /* The VMAs should have merged. */ + ASSERT_TRUE(find_vma_procmap(procmap, ptr_a)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_a); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + 6 * page_size); +} + +TEST_F(merge, mremap_faulted_to_unfaulted_prev_unfaulted_next) +{ + struct procmap_fd *procmap = &self->procmap; + unsigned int page_size = self->page_size; + char *ptr_a, *ptr_b, *ptr_c; + + /* + * mremap() with MREMAP_DONTUNMAP such that A, B and C merge: + * + * |---------------------------| + * | \ | + * |-----------| | |-----------| / |---------| + * | unfaulted | v | unfaulted | \ | faulted | + * |-----------| |-----------| / |---------| + * A C \ B + */ + + /* Map VMA B into place. */ + ptr_b = mmap(&self->carveout[page_size + 3 * page_size], 3 * page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + ASSERT_NE(ptr_b, MAP_FAILED); + /* Fault it in. */ + ptr_b[0] = 'x'; + + /* + * Now move it out of the way so we can place VMAs A, C in position, + * unfaulted. + */ + ptr_b = mremap(ptr_b, 3 * page_size, 3 * page_size, + MREMAP_FIXED | MREMAP_MAYMOVE, &self->carveout[20 * page_size]); + ASSERT_NE(ptr_b, MAP_FAILED); + + /* Map VMA A into place. */ + + ptr_a = mmap(&self->carveout[page_size], 3 * page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + ASSERT_NE(ptr_a, MAP_FAILED); + + /* Map VMA C into place. */ + ptr_c = mmap(&self->carveout[page_size + 3 * page_size + 3 * page_size], + 3 * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + ASSERT_NE(ptr_c, MAP_FAILED); + + /* + * Now move VMA B into position with MREMAP_DONTUNMAP to catch incorrect + * anon_vma propagation. + */ + ptr_b = mremap(ptr_b, 3 * page_size, 3 * page_size, + MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP, + &self->carveout[page_size + 3 * page_size]); + ASSERT_NE(ptr_b, MAP_FAILED); + + /* The VMAs should have merged. */ + ASSERT_TRUE(find_vma_procmap(procmap, ptr_a)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_a); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + 9 * page_size); +} + +TEST_F(merge, mremap_faulted_to_unfaulted_prev_faulted_next) +{ + struct procmap_fd *procmap = &self->procmap; + unsigned int page_size = self->page_size; + char *ptr_a, *ptr_b, *ptr_bc; + + /* + * mremap() with MREMAP_DONTUNMAP such that A, B and C merge: + * + * |---------------------------| + * | \ | + * |-----------| | |-----------| / |---------| + * | unfaulted | v | faulted | \ | faulted | + * |-----------| |-----------| / |---------| + * A C \ B + */ + + /* + * Map VMA B and C into place. We have to map them together so their + * anon_vma is the same and the vma->vm_pgoff's are correctly aligned. + */ + ptr_bc = mmap(&self->carveout[page_size + 3 * page_size], + 3 * page_size + 3 * page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + ASSERT_NE(ptr_bc, MAP_FAILED); + + /* Fault it in. */ + ptr_bc[0] = 'x'; + + /* + * Now move VMA B out the way (splitting VMA BC) so we can place VMA A + * in position, unfaulted, and leave the remainder of the VMA we just + * moved in place, faulted, as VMA C. + */ + ptr_b = mremap(ptr_bc, 3 * page_size, 3 * page_size, + MREMAP_FIXED | MREMAP_MAYMOVE, &self->carveout[20 * page_size]); + ASSERT_NE(ptr_b, MAP_FAILED); + + /* Map VMA A into place. */ + ptr_a = mmap(&self->carveout[page_size], 3 * page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + ASSERT_NE(ptr_a, MAP_FAILED); + + /* + * Now move VMA B into position with MREMAP_DONTUNMAP to catch incorrect + * anon_vma propagation. + */ + ptr_b = mremap(ptr_b, 3 * page_size, 3 * page_size, + MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP, + &self->carveout[page_size + 3 * page_size]); + ASSERT_NE(ptr_b, MAP_FAILED); + + /* The VMAs should have merged. */ + ASSERT_TRUE(find_vma_procmap(procmap, ptr_a)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_a); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + 9 * page_size); +} + TEST_HARNESS_MAIN -- cgit v1.2.3 From fb39444732f02c32a8312c168d97e33d872c14d3 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 5 Jan 2026 20:11:50 +0000 Subject: tools/testing/selftests: add forked (un)/faulted VMA merge tests Now we correctly handle forked faulted/unfaulted merge on mremap(), exhaustively assert that we handle this correctly. Do this in the less duplicative way by adding a new merge_with_fork fixture and forked/unforked variants, and abstract the forking logic as necessary to avoid code duplication with this also. Link: https://lkml.kernel.org/r/1daf76d89fdb9d96f38a6a0152d8f3c2e9e30ac7.1767638272.git.lorenzo.stoakes@oracle.com Fixes: 879bca0a2c4f ("mm/vma: fix incorrectly disallowed anonymous VMA merges") Signed-off-by: Lorenzo Stoakes Cc: David Hildenbrand (Red Hat) Cc: Jann Horn Cc: Jeongjun Park Cc: Liam Howlett Cc: Pedro Falcato Cc: Rik van Riel Cc: Vlastimil Babka Cc: Yeoreum Yun Cc: Harry Yoo Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/merge.c | 180 ++++++++++++++++++++++++++++--------- 1 file changed, 139 insertions(+), 41 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/merge.c b/tools/testing/selftests/mm/merge.c index 22be149f7109..10b686102b79 100644 --- a/tools/testing/selftests/mm/merge.c +++ b/tools/testing/selftests/mm/merge.c @@ -22,12 +22,37 @@ FIXTURE(merge) struct procmap_fd procmap; }; +static char *map_carveout(unsigned int page_size) +{ + return mmap(NULL, 30 * page_size, PROT_NONE, + MAP_ANON | MAP_PRIVATE, -1, 0); +} + +static pid_t do_fork(struct procmap_fd *procmap) +{ + pid_t pid = fork(); + + if (pid == -1) + return -1; + if (pid != 0) { + wait(NULL); + return pid; + } + + /* Reopen for child. */ + if (close_procmap(procmap)) + return -1; + if (open_self_procmap(procmap)) + return -1; + + return 0; +} + FIXTURE_SETUP(merge) { self->page_size = psize(); /* Carve out PROT_NONE region to map over. */ - self->carveout = mmap(NULL, 30 * self->page_size, PROT_NONE, - MAP_ANON | MAP_PRIVATE, -1, 0); + self->carveout = map_carveout(self->page_size); ASSERT_NE(self->carveout, MAP_FAILED); /* Setup PROCMAP_QUERY interface. */ ASSERT_EQ(open_self_procmap(&self->procmap), 0); @@ -36,7 +61,8 @@ FIXTURE_SETUP(merge) FIXTURE_TEARDOWN(merge) { ASSERT_EQ(munmap(self->carveout, 30 * self->page_size), 0); - ASSERT_EQ(close_procmap(&self->procmap), 0); + /* May fail for parent of forked process. */ + close_procmap(&self->procmap); /* * Clear unconditionally, as some tests set this. It is no issue if this * fails (KSM may be disabled for instance). @@ -44,6 +70,44 @@ FIXTURE_TEARDOWN(merge) prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0); } +FIXTURE(merge_with_fork) +{ + unsigned int page_size; + char *carveout; + struct procmap_fd procmap; +}; + +FIXTURE_VARIANT(merge_with_fork) +{ + bool forked; +}; + +FIXTURE_VARIANT_ADD(merge_with_fork, forked) +{ + .forked = true, +}; + +FIXTURE_VARIANT_ADD(merge_with_fork, unforked) +{ + .forked = false, +}; + +FIXTURE_SETUP(merge_with_fork) +{ + self->page_size = psize(); + self->carveout = map_carveout(self->page_size); + ASSERT_NE(self->carveout, MAP_FAILED); + ASSERT_EQ(open_self_procmap(&self->procmap), 0); +} + +FIXTURE_TEARDOWN(merge_with_fork) +{ + ASSERT_EQ(munmap(self->carveout, 30 * self->page_size), 0); + ASSERT_EQ(close_procmap(&self->procmap), 0); + /* See above. */ + prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0); +} + TEST_F(merge, mprotect_unfaulted_left) { unsigned int page_size = self->page_size; @@ -322,8 +386,8 @@ TEST_F(merge, forked_target_vma) unsigned int page_size = self->page_size; char *carveout = self->carveout; struct procmap_fd *procmap = &self->procmap; - pid_t pid; char *ptr, *ptr2; + pid_t pid; int i; /* @@ -344,19 +408,10 @@ TEST_F(merge, forked_target_vma) */ ptr[0] = 'x'; - pid = fork(); + pid = do_fork(&self->procmap); ASSERT_NE(pid, -1); - - if (pid != 0) { - wait(NULL); + if (pid != 0) return; - } - - /* Child process below: */ - - /* Reopen for child. */ - ASSERT_EQ(close_procmap(&self->procmap), 0); - ASSERT_EQ(open_self_procmap(&self->procmap), 0); /* unCOWing everything does not cause the AVC to go away. */ for (i = 0; i < 5 * page_size; i += page_size) @@ -386,8 +441,8 @@ TEST_F(merge, forked_source_vma) unsigned int page_size = self->page_size; char *carveout = self->carveout; struct procmap_fd *procmap = &self->procmap; - pid_t pid; char *ptr, *ptr2; + pid_t pid; int i; /* @@ -408,19 +463,10 @@ TEST_F(merge, forked_source_vma) */ ptr[0] = 'x'; - pid = fork(); + pid = do_fork(&self->procmap); ASSERT_NE(pid, -1); - - if (pid != 0) { - wait(NULL); + if (pid != 0) return; - } - - /* Child process below: */ - - /* Reopen for child. */ - ASSERT_EQ(close_procmap(&self->procmap), 0); - ASSERT_EQ(open_self_procmap(&self->procmap), 0); /* unCOWing everything does not cause the AVC to go away. */ for (i = 0; i < 5 * page_size; i += page_size) @@ -1171,10 +1217,11 @@ TEST_F(merge, mremap_correct_placed_faulted) ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 15 * page_size); } -TEST_F(merge, mremap_faulted_to_unfaulted_prev) +TEST_F(merge_with_fork, mremap_faulted_to_unfaulted_prev) { struct procmap_fd *procmap = &self->procmap; unsigned int page_size = self->page_size; + unsigned long offset; char *ptr_a, *ptr_b; /* @@ -1197,6 +1244,14 @@ TEST_F(merge, mremap_faulted_to_unfaulted_prev) /* Fault it in. */ ptr_a[0] = 'x'; + if (variant->forked) { + pid_t pid = do_fork(&self->procmap); + + ASSERT_NE(pid, -1); + if (pid != 0) + return; + } + /* * Now move it out of the way so we can place VMA B in position, * unfaulted. @@ -1220,16 +1275,19 @@ TEST_F(merge, mremap_faulted_to_unfaulted_prev) &self->carveout[page_size + 3 * page_size]); ASSERT_NE(ptr_a, MAP_FAILED); - /* The VMAs should have merged. */ + /* The VMAs should have merged, if not forked. */ ASSERT_TRUE(find_vma_procmap(procmap, ptr_b)); ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_b); - ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_b + 6 * page_size); + + offset = variant->forked ? 3 * page_size : 6 * page_size; + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_b + offset); } -TEST_F(merge, mremap_faulted_to_unfaulted_next) +TEST_F(merge_with_fork, mremap_faulted_to_unfaulted_next) { struct procmap_fd *procmap = &self->procmap; unsigned int page_size = self->page_size; + unsigned long offset; char *ptr_a, *ptr_b; /* @@ -1253,6 +1311,14 @@ TEST_F(merge, mremap_faulted_to_unfaulted_next) /* Fault it in. */ ptr_a[0] = 'x'; + if (variant->forked) { + pid_t pid = do_fork(&self->procmap); + + ASSERT_NE(pid, -1); + if (pid != 0) + return; + } + /* * Now move it out of the way so we can place VMA B in position, * unfaulted. @@ -1276,16 +1342,18 @@ TEST_F(merge, mremap_faulted_to_unfaulted_next) &self->carveout[page_size]); ASSERT_NE(ptr_a, MAP_FAILED); - /* The VMAs should have merged. */ + /* The VMAs should have merged, if not forked. */ ASSERT_TRUE(find_vma_procmap(procmap, ptr_a)); ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_a); - ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + 6 * page_size); + offset = variant->forked ? 3 * page_size : 6 * page_size; + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + offset); } -TEST_F(merge, mremap_faulted_to_unfaulted_prev_unfaulted_next) +TEST_F(merge_with_fork, mremap_faulted_to_unfaulted_prev_unfaulted_next) { struct procmap_fd *procmap = &self->procmap; unsigned int page_size = self->page_size; + unsigned long offset; char *ptr_a, *ptr_b, *ptr_c; /* @@ -1307,6 +1375,14 @@ TEST_F(merge, mremap_faulted_to_unfaulted_prev_unfaulted_next) /* Fault it in. */ ptr_b[0] = 'x'; + if (variant->forked) { + pid_t pid = do_fork(&self->procmap); + + ASSERT_NE(pid, -1); + if (pid != 0) + return; + } + /* * Now move it out of the way so we can place VMAs A, C in position, * unfaulted. @@ -1337,13 +1413,21 @@ TEST_F(merge, mremap_faulted_to_unfaulted_prev_unfaulted_next) &self->carveout[page_size + 3 * page_size]); ASSERT_NE(ptr_b, MAP_FAILED); - /* The VMAs should have merged. */ + /* The VMAs should have merged, if not forked. */ ASSERT_TRUE(find_vma_procmap(procmap, ptr_a)); ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_a); - ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + 9 * page_size); + offset = variant->forked ? 3 * page_size : 9 * page_size; + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + offset); + + /* If forked, B and C should also not have merged. */ + if (variant->forked) { + ASSERT_TRUE(find_vma_procmap(procmap, ptr_b)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_b); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_b + 3 * page_size); + } } -TEST_F(merge, mremap_faulted_to_unfaulted_prev_faulted_next) +TEST_F(merge_with_fork, mremap_faulted_to_unfaulted_prev_faulted_next) { struct procmap_fd *procmap = &self->procmap; unsigned int page_size = self->page_size; @@ -1373,6 +1457,14 @@ TEST_F(merge, mremap_faulted_to_unfaulted_prev_faulted_next) /* Fault it in. */ ptr_bc[0] = 'x'; + if (variant->forked) { + pid_t pid = do_fork(&self->procmap); + + ASSERT_NE(pid, -1); + if (pid != 0) + return; + } + /* * Now move VMA B out the way (splitting VMA BC) so we can place VMA A * in position, unfaulted, and leave the remainder of the VMA we just @@ -1397,10 +1489,16 @@ TEST_F(merge, mremap_faulted_to_unfaulted_prev_faulted_next) &self->carveout[page_size + 3 * page_size]); ASSERT_NE(ptr_b, MAP_FAILED); - /* The VMAs should have merged. */ - ASSERT_TRUE(find_vma_procmap(procmap, ptr_a)); - ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_a); - ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + 9 * page_size); + /* The VMAs should have merged. A,B,C if unforked, B, C if forked. */ + if (variant->forked) { + ASSERT_TRUE(find_vma_procmap(procmap, ptr_b)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_b); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_b + 6 * page_size); + } else { + ASSERT_TRUE(find_vma_procmap(procmap, ptr_a)); + ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_a); + ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + 9 * page_size); + } } TEST_HARNESS_MAIN -- cgit v1.2.3 From 21c68ad1d9771d331198cc73cbf6e908d7915f35 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 6 Jan 2026 15:45:47 +0000 Subject: tools/testing/selftests: fix gup_longterm for unknown fs Commit 66bce7afbaca ("selftests/mm: fix test result reporting in gup_longterm") introduced a small bug causing unknown filesystems to always result in a test failure. This is because do_test() was updated to use a common reporting path, but this case appears to have been missed. This is problematic for e.g. virtme-ng which uses an overlayfs file system, causing gup_longterm to appear to fail each time due to a test count mismatch: # Planned tests != run tests (50 != 46) # Totals: pass:24 fail:0 xfail:0 xpass:0 skip:22 error:0 The fix is to simply change the return into a break. Link: https://lkml.kernel.org/r/20260106154547.214907-1-lorenzo.stoakes@oracle.com Fixes: 66bce7afbaca ("selftests/mm: fix test result reporting in gup_longterm") Signed-off-by: Lorenzo Stoakes Reviewed-by: David Hildenbrand (Red Hat) Cc: Jason Gunthorpe Cc: John Hubbard Cc: Liam Howlett Cc: "Liam R. Howlett" Cc: Mark Brown Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Xu Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/gup_longterm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/gup_longterm.c b/tools/testing/selftests/mm/gup_longterm.c index 6279893a0adc..f61150d28eb2 100644 --- a/tools/testing/selftests/mm/gup_longterm.c +++ b/tools/testing/selftests/mm/gup_longterm.c @@ -179,7 +179,7 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared) if (rw && shared && fs_is_unknown(fs_type)) { ksft_print_msg("Unknown filesystem\n"); result = KSFT_SKIP; - return; + break; } /* * R/O pinning or pinning in a private mapping is always -- cgit v1.2.3 From b638a9d0f8965b98403022cb91d8f3b31170eb35 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 8 Jan 2026 17:32:33 +0000 Subject: KVM: arm64: selftests: Add a test for FEAT_IDST Add a very basic test checking that FEAT_IDST actually works for the {GMID,SMIDR,CSSIDR2}_EL1 registers. Link: https://patch.msgid.link/20260108173233.2911955-10-maz@kernel.org Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/Makefile.kvm | 1 + tools/testing/selftests/kvm/arm64/idreg-idst.c | 117 +++++++++++++++++++++++++ 2 files changed, 118 insertions(+) create mode 100644 tools/testing/selftests/kvm/arm64/idreg-idst.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index ba5c2b643efa..bfa2fad0aba1 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -175,6 +175,7 @@ TEST_GEN_PROGS_arm64 += arm64/vgic_irq TEST_GEN_PROGS_arm64 += arm64/vgic_lpi_stress TEST_GEN_PROGS_arm64 += arm64/vpmu_counter_access TEST_GEN_PROGS_arm64 += arm64/no-vgic-v3 +TEST_GEN_PROGS_arm64 += arm64/idreg-idst TEST_GEN_PROGS_arm64 += arm64/kvm-uuid TEST_GEN_PROGS_arm64 += access_tracking_perf_test TEST_GEN_PROGS_arm64 += arch_timer diff --git a/tools/testing/selftests/kvm/arm64/idreg-idst.c b/tools/testing/selftests/kvm/arm64/idreg-idst.c new file mode 100644 index 000000000000..9ca9f125abdb --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/idreg-idst.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Access all FEAT_IDST-handled registers that depend on more than + * just FEAT_AA64, and fail if we don't get an a trap with an 0x18 EC. + */ + +#include +#include +#include + +static volatile bool sys64, undef; + +#define __check_sr_read(r) \ + ({ \ + uint64_t val; \ + \ + sys64 = false; \ + undef = false; \ + dsb(sy); \ + val = read_sysreg_s(SYS_ ## r); \ + val; \ + }) + +/* Fatal checks */ +#define check_sr_read(r) \ + do { \ + __check_sr_read(r); \ + __GUEST_ASSERT(!undef, #r " unexpected UNDEF"); \ + __GUEST_ASSERT(sys64, #r " didn't trap"); \ + } while(0) + + +static void guest_code(void) +{ + check_sr_read(CCSIDR2_EL1); + check_sr_read(SMIDR_EL1); + check_sr_read(GMID_EL1); + + GUEST_DONE(); +} + +static void guest_sys64_handler(struct ex_regs *regs) +{ + sys64 = true; + undef = false; + regs->pc += 4; +} + +static void guest_undef_handler(struct ex_regs *regs) +{ + sys64 = false; + undef = true; + regs->pc += 4; +} + +static void test_run_vcpu(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + do { + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_PRINTF: + printf("%s", uc.buffer); + break; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } while (uc.cmd != UCALL_DONE); +} + +static void test_guest_feat_idst(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + /* This VM has no MTE, no SME, no CCIDX */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vcpu); + + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_ELx_EC_SYS64, guest_sys64_handler); + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_ELx_EC_UNKNOWN, guest_undef_handler); + + test_run_vcpu(vcpu); + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + uint64_t mmfr2; + + test_disable_default_vgic(); + + vm = vm_create_with_one_vcpu(&vcpu, NULL); + mmfr2 = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64MMFR2_EL1)); + __TEST_REQUIRE(FIELD_GET(ID_AA64MMFR2_EL1_IDS, mmfr2) > 0, + "FEAT_IDST not supported"); + kvm_vm_free(vm); + + test_guest_feat_idst(); + + return 0; +} -- cgit v1.2.3 From 7e03d07d03a486c66d5c084c7185b1bef29049e9 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 9 Jan 2026 08:22:14 +0000 Subject: KVM: arm64: selftests: Disable unused TTBR1_EL1 translations KVM selftests map all guest code and data into the lower virtual address range (0x0000...) managed by TTBR0_EL1. The upper range (0xFFFF...) managed by TTBR1_EL1 is unused and uninitialized. If a guest accesses the upper range, the MMU attempts a translation table walk using uninitialized registers, leading to unpredictable behavior. Set `TCR_EL1.EPD1` to disable translation table walks for TTBR1_EL1, ensuring that any access to the upper range generates an immediate Translation Fault. Additionally, set `TCR_EL1.TBI1` (Top Byte Ignore) to ensure that tagged pointers in the upper range also deterministically trigger a Translation Fault via EPD1. Define `TCR_EPD1_MASK`, `TCR_EPD1_SHIFT`, and `TCR_TBI1` in `processor.h` to support this configuration. These are based on their definitions in `arch/arm64/include/asm/pgtable-hwdef.h`. Suggested-by: Will Deacon Reviewed-by: Itaru Kitayama Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260109082218.3236580-2-tabba@google.com Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/include/arm64/processor.h | 4 ++++ tools/testing/selftests/kvm/lib/arm64/processor.c | 2 ++ 2 files changed, 6 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/include/arm64/processor.h b/tools/testing/selftests/kvm/include/arm64/processor.h index ff928716574d..ac97a1c436fc 100644 --- a/tools/testing/selftests/kvm/include/arm64/processor.h +++ b/tools/testing/selftests/kvm/include/arm64/processor.h @@ -90,6 +90,9 @@ #define TCR_TG0_64K (UL(1) << TCR_TG0_SHIFT) #define TCR_TG0_16K (UL(2) << TCR_TG0_SHIFT) +#define TCR_EPD1_SHIFT 23 +#define TCR_EPD1_MASK (UL(1) << TCR_EPD1_SHIFT) + #define TCR_IPS_SHIFT 32 #define TCR_IPS_MASK (UL(7) << TCR_IPS_SHIFT) #define TCR_IPS_52_BITS (UL(6) << TCR_IPS_SHIFT) @@ -97,6 +100,7 @@ #define TCR_IPS_40_BITS (UL(2) << TCR_IPS_SHIFT) #define TCR_IPS_36_BITS (UL(1) << TCR_IPS_SHIFT) +#define TCR_TBI1 (UL(1) << 38) #define TCR_HA (UL(1) << 39) #define TCR_DS (UL(1) << 59) diff --git a/tools/testing/selftests/kvm/lib/arm64/processor.c b/tools/testing/selftests/kvm/lib/arm64/processor.c index d46e4b13b92c..5b379da8cb90 100644 --- a/tools/testing/selftests/kvm/lib/arm64/processor.c +++ b/tools/testing/selftests/kvm/lib/arm64/processor.c @@ -384,6 +384,8 @@ void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) tcr_el1 |= TCR_IRGN0_WBWA | TCR_ORGN0_WBWA | TCR_SH0_INNER; tcr_el1 |= TCR_T0SZ(vm->va_bits); + tcr_el1 |= TCR_TBI1; + tcr_el1 |= TCR_EPD1_MASK; if (use_lpa2_pte_format(vm)) tcr_el1 |= TCR_DS; -- cgit v1.2.3 From dd0c5d04d13cae8ff2694ef83d1ae5804d6d9798 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 9 Jan 2026 08:22:15 +0000 Subject: KVM: arm64: selftests: Fix incorrect rounding in page_align() The implementation of `page_align()` in `processor.c` calculates alignment incorrectly for values that are already aligned. Specifically, `(v + vm->page_size) & ~(vm->page_size - 1)` aligns to the *next* page boundary even if `v` is already page-aligned, potentially wasting a page of memory. Fix the calculation to use standard alignment logic: `(v + vm->page_size - 1) & ~(vm->page_size - 1)`. Fixes: 7a6629ef746d ("kvm: selftests: add virt mem support for aarch64") Reviewed-by: Andrew Jones Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260109082218.3236580-3-tabba@google.com Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/lib/arm64/processor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/lib/arm64/processor.c b/tools/testing/selftests/kvm/lib/arm64/processor.c index 5b379da8cb90..607a4e462984 100644 --- a/tools/testing/selftests/kvm/lib/arm64/processor.c +++ b/tools/testing/selftests/kvm/lib/arm64/processor.c @@ -23,7 +23,7 @@ static vm_vaddr_t exception_handlers; static uint64_t page_align(struct kvm_vm *vm, uint64_t v) { - return (v + vm->page_size) & ~(vm->page_size - 1); + return (v + vm->page_size - 1) & ~(vm->page_size - 1); } static uint64_t pgd_index(struct kvm_vm *vm, vm_vaddr_t gva) -- cgit v1.2.3 From 582b39463f1c0774e0b3cb5be2118e8564b7941e Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 9 Jan 2026 08:22:16 +0000 Subject: KVM: riscv: selftests: Fix incorrect rounding in page_align() The implementation of `page_align()` in `processor.c` calculates alignment incorrectly for values that are already aligned. Specifically, `(v + vm->page_size) & ~(vm->page_size - 1)` aligns to the *next* page boundary even if `v` is already page-aligned, potentially wasting a page of memory. Fix the calculation to use standard alignment logic: `(v + vm->page_size - 1) & ~(vm->page_size - 1)`. Fixes: 3e06cdf10520 ("KVM: selftests: Add initial support for RISC-V 64-bit") Reviewed-by: Andrew Jones Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260109082218.3236580-4-tabba@google.com Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/lib/riscv/processor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c index 2eac7d4b59e9..d5e8747b5e69 100644 --- a/tools/testing/selftests/kvm/lib/riscv/processor.c +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -28,7 +28,7 @@ bool __vcpu_has_ext(struct kvm_vcpu *vcpu, uint64_t ext) static uint64_t page_align(struct kvm_vm *vm, uint64_t v) { - return (v + vm->page_size) & ~(vm->page_size - 1); + return (v + vm->page_size - 1) & ~(vm->page_size - 1); } static uint64_t pte_addr(struct kvm_vm *vm, uint64_t entry) -- cgit v1.2.3 From de00d07321cf3f182762de2308c08062d5b824c0 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 9 Jan 2026 08:22:17 +0000 Subject: KVM: selftests: Move page_align() to shared header To avoid code duplication, move page_align() to the shared `kvm_util.h` header file. Rename it to vm_page_align(), to make it clear that the alignment is done with respect to the guest's base page size. No functional change intended. Reviewed-by: Andrew Jones Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260109082218.3236580-5-tabba@google.com Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/include/kvm_util.h | 5 +++++ tools/testing/selftests/kvm/lib/arm64/processor.c | 7 +------ tools/testing/selftests/kvm/lib/riscv/processor.c | 7 +------ 3 files changed, 7 insertions(+), 12 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 81f4355ff28a..747effa614f1 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -1258,6 +1258,11 @@ static inline int __vm_disable_nx_huge_pages(struct kvm_vm *vm) return __vm_enable_cap(vm, KVM_CAP_VM_DISABLE_NX_HUGE_PAGES, 0); } +static inline uint64_t vm_page_align(struct kvm_vm *vm, uint64_t v) +{ + return (v + vm->page_size - 1) & ~(vm->page_size - 1); +} + /* * Arch hook that is invoked via a constructor, i.e. before exeucting main(), * to allow for arch-specific setup that is common to all tests, e.g. computing diff --git a/tools/testing/selftests/kvm/lib/arm64/processor.c b/tools/testing/selftests/kvm/lib/arm64/processor.c index 607a4e462984..1605dc740d1e 100644 --- a/tools/testing/selftests/kvm/lib/arm64/processor.c +++ b/tools/testing/selftests/kvm/lib/arm64/processor.c @@ -21,11 +21,6 @@ static vm_vaddr_t exception_handlers; -static uint64_t page_align(struct kvm_vm *vm, uint64_t v) -{ - return (v + vm->page_size - 1) & ~(vm->page_size - 1); -} - static uint64_t pgd_index(struct kvm_vm *vm, vm_vaddr_t gva) { unsigned int shift = (vm->pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift; @@ -115,7 +110,7 @@ static uint64_t __maybe_unused ptrs_per_pte(struct kvm_vm *vm) void virt_arch_pgd_alloc(struct kvm_vm *vm) { - size_t nr_pages = page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size; + size_t nr_pages = vm_page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size; if (vm->pgd_created) return; diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c index d5e8747b5e69..401245fe31db 100644 --- a/tools/testing/selftests/kvm/lib/riscv/processor.c +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -26,11 +26,6 @@ bool __vcpu_has_ext(struct kvm_vcpu *vcpu, uint64_t ext) return !ret && !!value; } -static uint64_t page_align(struct kvm_vm *vm, uint64_t v) -{ - return (v + vm->page_size - 1) & ~(vm->page_size - 1); -} - static uint64_t pte_addr(struct kvm_vm *vm, uint64_t entry) { return ((entry & PGTBL_PTE_ADDR_MASK) >> PGTBL_PTE_ADDR_SHIFT) << @@ -68,7 +63,7 @@ static uint64_t pte_index(struct kvm_vm *vm, vm_vaddr_t gva, int level) void virt_arch_pgd_alloc(struct kvm_vm *vm) { - size_t nr_pages = page_align(vm, ptrs_per_pte(vm) * 8) / vm->page_size; + size_t nr_pages = vm_page_align(vm, ptrs_per_pte(vm) * 8) / vm->page_size; if (vm->pgd_created) return; -- cgit v1.2.3 From e0a99a2b72f3c6365d9f4d6943ed45f7fc286b70 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 9 Jan 2026 08:22:18 +0000 Subject: KVM: selftests: Fix typos and stale comments in kvm_util Fix minor documentation errors in `kvm_util.h` and `kvm_util.c`. - Correct the argument description for `vcpu_args_set` in `kvm_util.h`, which incorrectly listed `vm` instead of `vcpu`. - Fix a typo in the comment for `kvm_selftest_arch_init` ("exeucting" -> "executing"). - Correct the return value description for `vm_vaddr_unused_gap` in `kvm_util.c` to match the implementation, which returns an address "at or above" `vaddr_min`, not "at or below". No functional change intended. Reviewed-by: Andrew Jones Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260109082218.3236580-6-tabba@google.com Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/include/kvm_util.h | 4 ++-- tools/testing/selftests/kvm/lib/kvm_util.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 747effa614f1..97f9251eb073 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -939,7 +939,7 @@ void *vcpu_map_dirty_ring(struct kvm_vcpu *vcpu); * VM VCPU Args Set * * Input Args: - * vm - Virtual Machine + * vcpu - vCPU * num - number of arguments * ... - arguments, each of type uint64_t * @@ -1264,7 +1264,7 @@ static inline uint64_t vm_page_align(struct kvm_vm *vm, uint64_t v) } /* - * Arch hook that is invoked via a constructor, i.e. before exeucting main(), + * Arch hook that is invoked via a constructor, i.e. before executing main(), * to allow for arch-specific setup that is common to all tests, e.g. computing * the default guest "mode". */ diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 8279b6ced8d2..fab6b62d7810 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1351,7 +1351,7 @@ struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) * Output Args: None * * Return: - * Lowest virtual address at or below vaddr_min, with at least + * Lowest virtual address at or above vaddr_min, with at least * sz unused bytes. TEST_ASSERT failure if no area of at least * size sz is available. * -- cgit v1.2.3 From 934d9746ed0206e93506a68c838fe82ef748576a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 12 Jan 2026 13:11:57 +0100 Subject: selftests/bpf: Add test for bpf_override_return helper We do not actually test the bpf_override_return helper functionality itself at the moment, only the bpf program being able to attach it. Adding test that override prctl syscall return value on top of kprobe and kprobe.multi. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20260112121157.854473-2-jolsa@kernel.org --- .../selftests/bpf/prog_tests/kprobe_multi_test.c | 44 ++++++++++++++++++++++ .../selftests/bpf/progs/kprobe_multi_override.c | 15 ++++++++ tools/testing/selftests/bpf/trace_helpers.h | 12 ++++++ 3 files changed, 71 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index 6cfaa978bc9a..9caef222e528 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -1,4 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include +#include #include #include "kprobe_multi.skel.h" #include "trace_helpers.h" @@ -540,6 +542,46 @@ cleanup: kprobe_multi_override__destroy(skel); } +static void test_override(void) +{ + struct kprobe_multi_override *skel = NULL; + int err; + + skel = kprobe_multi_override__open_and_load(); + if (!ASSERT_OK_PTR(skel, "kprobe_multi_empty__open_and_load")) + goto cleanup; + + skel->bss->pid = getpid(); + + /* no override */ + err = prctl(0xffff, 0); + ASSERT_EQ(err, -1, "err"); + + /* kprobe.multi override */ + skel->links.test_override = bpf_program__attach_kprobe_multi_opts(skel->progs.test_override, + SYS_PREFIX "sys_prctl", NULL); + if (!ASSERT_OK_PTR(skel->links.test_override, "bpf_program__attach_kprobe_multi_opts")) + goto cleanup; + + err = prctl(0xffff, 0); + ASSERT_EQ(err, 123, "err"); + + bpf_link__destroy(skel->links.test_override); + skel->links.test_override = NULL; + + /* kprobe override */ + skel->links.test_kprobe_override = bpf_program__attach_kprobe(skel->progs.test_kprobe_override, + false, SYS_PREFIX "sys_prctl"); + if (!ASSERT_OK_PTR(skel->links.test_kprobe_override, "bpf_program__attach_kprobe")) + goto cleanup; + + err = prctl(0xffff, 0); + ASSERT_EQ(err, 123, "err"); + +cleanup: + kprobe_multi_override__destroy(skel); +} + #ifdef __x86_64__ static void test_attach_write_ctx(void) { @@ -597,6 +639,8 @@ void test_kprobe_multi_test(void) test_attach_api_fails(); if (test__start_subtest("attach_override")) test_attach_override(); + if (test__start_subtest("override")) + test_override(); if (test__start_subtest("session")) test_session_skel_api(); if (test__start_subtest("session_cookie")) diff --git a/tools/testing/selftests/bpf/progs/kprobe_multi_override.c b/tools/testing/selftests/bpf/progs/kprobe_multi_override.c index 28f8487c9059..14f39fa6d515 100644 --- a/tools/testing/selftests/bpf/progs/kprobe_multi_override.c +++ b/tools/testing/selftests/bpf/progs/kprobe_multi_override.c @@ -5,9 +5,24 @@ char _license[] SEC("license") = "GPL"; +int pid = 0; + SEC("kprobe.multi") int test_override(struct pt_regs *ctx) { + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + bpf_override_return(ctx, 123); + return 0; +} + +SEC("kprobe") +int test_kprobe_override(struct pt_regs *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + bpf_override_return(ctx, 123); return 0; } diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index 9437bdd4afa5..a5576b2dfc26 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -4,6 +4,18 @@ #include +#ifdef __x86_64__ +#define SYS_PREFIX "__x64_" +#elif defined(__s390x__) +#define SYS_PREFIX "__s390x_" +#elif defined(__aarch64__) +#define SYS_PREFIX "__arm64_" +#elif defined(__riscv) +#define SYS_PREFIX "__riscv_" +#else +#define SYS_PREFIX "" +#endif + #define __ALIGN_MASK(x, mask) (((x)+(mask))&~(mask)) #define ALIGN(x, a) __ALIGN_MASK(x, (typeof(x))(a)-1) -- cgit v1.2.3 From a63e5fe0959200afcfefa7640db44c491f102c4c Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Tue, 13 Jan 2026 16:08:19 +0100 Subject: vsock/test: Add test for a linear and non-linear skb getting coalesced Loopback transport can mangle data in rx queue when a linear skb is followed by a small MSG_ZEROCOPY packet. To exercise the logic, send out two packets: a weirdly sized one (to ensure some spare tail room in the skb) and a zerocopy one that's small enough to fit in the spare room of its predecessor. Then, wait for both to land in the rx queue, and check the data received. Faulty packets merger manifests itself by corrupting payload of the later packet. Signed-off-by: Michal Luczaj Reviewed-by: Stefano Garzarella Link: https://patch.msgid.link/20260113-vsock-recv-coalescence-v2-2-552b17837cf4@rbox.co Signed-off-by: Jakub Kicinski --- tools/testing/vsock/vsock_test.c | 5 +++ tools/testing/vsock/vsock_test_zerocopy.c | 74 +++++++++++++++++++++++++++++++ tools/testing/vsock/vsock_test_zerocopy.h | 3 ++ 3 files changed, 82 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index bbe3723babdc..27e39354499a 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -2403,6 +2403,11 @@ static struct test_case test_cases[] = { .run_client = test_stream_accepted_setsockopt_client, .run_server = test_stream_accepted_setsockopt_server, }, + { + .name = "SOCK_STREAM virtio MSG_ZEROCOPY coalescence corruption", + .run_client = test_stream_msgzcopy_mangle_client, + .run_server = test_stream_msgzcopy_mangle_server, + }, {}, }; diff --git a/tools/testing/vsock/vsock_test_zerocopy.c b/tools/testing/vsock/vsock_test_zerocopy.c index 9d9a6cb9614a..a31ddfc1cd0c 100644 --- a/tools/testing/vsock/vsock_test_zerocopy.c +++ b/tools/testing/vsock/vsock_test_zerocopy.c @@ -9,14 +9,18 @@ #include #include #include +#include #include #include #include #include #include +#include +#include #include #include "control.h" +#include "timeout.h" #include "vsock_test_zerocopy.h" #include "msg_zerocopy_common.h" @@ -356,3 +360,73 @@ void test_stream_msgzcopy_empty_errq_server(const struct test_opts *opts) control_expectln("DONE"); close(fd); } + +#define GOOD_COPY_LEN 128 /* net/vmw_vsock/virtio_transport_common.c */ + +void test_stream_msgzcopy_mangle_client(const struct test_opts *opts) +{ + char sbuf1[PAGE_SIZE + 1], sbuf2[GOOD_COPY_LEN]; + unsigned long hash; + struct pollfd fds; + int fd, i; + + fd = vsock_stream_connect(opts->peer_cid, opts->peer_port); + if (fd < 0) { + perror("connect"); + exit(EXIT_FAILURE); + } + + enable_so_zerocopy_check(fd); + + memset(sbuf1, 'x', sizeof(sbuf1)); + send_buf(fd, sbuf1, sizeof(sbuf1), 0, sizeof(sbuf1)); + + for (i = 0; i < sizeof(sbuf2); i++) + sbuf2[i] = rand() & 0xff; + + send_buf(fd, sbuf2, sizeof(sbuf2), MSG_ZEROCOPY, sizeof(sbuf2)); + + hash = hash_djb2(sbuf2, sizeof(sbuf2)); + control_writeulong(hash); + + fds.fd = fd; + fds.events = 0; + + if (poll(&fds, 1, TIMEOUT * MSEC_PER_SEC) != 1 || + !(fds.revents & POLLERR)) { + perror("poll"); + exit(EXIT_FAILURE); + } + + close(fd); +} + +void test_stream_msgzcopy_mangle_server(const struct test_opts *opts) +{ + unsigned long local_hash, remote_hash; + char rbuf[PAGE_SIZE + 1]; + int fd; + + fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL); + if (fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + + /* Wait, don't race the (buggy) skbs coalescence. */ + vsock_ioctl_int(fd, SIOCINQ, PAGE_SIZE + 1 + GOOD_COPY_LEN); + + /* Discard the first packet. */ + recv_buf(fd, rbuf, PAGE_SIZE + 1, 0, PAGE_SIZE + 1); + + recv_buf(fd, rbuf, GOOD_COPY_LEN, 0, GOOD_COPY_LEN); + remote_hash = control_readulong(); + local_hash = hash_djb2(rbuf, GOOD_COPY_LEN); + + if (local_hash != remote_hash) { + fprintf(stderr, "Data received corrupted\n"); + exit(EXIT_FAILURE); + } + + close(fd); +} diff --git a/tools/testing/vsock/vsock_test_zerocopy.h b/tools/testing/vsock/vsock_test_zerocopy.h index 3ef2579e024d..d46c91a69f16 100644 --- a/tools/testing/vsock/vsock_test_zerocopy.h +++ b/tools/testing/vsock/vsock_test_zerocopy.h @@ -12,4 +12,7 @@ void test_seqpacket_msgzcopy_server(const struct test_opts *opts); void test_stream_msgzcopy_empty_errq_client(const struct test_opts *opts); void test_stream_msgzcopy_empty_errq_server(const struct test_opts *opts); +void test_stream_msgzcopy_mangle_client(const struct test_opts *opts); +void test_stream_msgzcopy_mangle_server(const struct test_opts *opts); + #endif /* VSOCK_TEST_ZEROCOPY_H */ -- cgit v1.2.3 From 4f5f148dd7c0459229d2ab9a769b2e820f9ee6a2 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marlière" Date: Tue, 13 Jan 2026 12:37:44 -0300 Subject: selftests: net: fib-onlink-tests: Convert to use namespaces by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the test breaks if the SUT already has a default route configured for IPv6. Fix by avoiding the use of the default namespace. Fixes: 4ed591c8ab44 ("net/ipv6: Allow onlink routes to have a device mismatch if it is the default route") Suggested-by: Fernando Fernandez Mancera Signed-off-by: Ricardo B. Marlière Reviewed-by: Ido Schimmel Reviewed-by: Fernando Fernandez Mancera Link: https://patch.msgid.link/20260113-selftests-net-fib-onlink-v2-1-89de2b931389@suse.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib-onlink-tests.sh | 71 +++++++++++-------------- 1 file changed, 30 insertions(+), 41 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/fib-onlink-tests.sh b/tools/testing/selftests/net/fib-onlink-tests.sh index ec2d6ceb1f08..c01be076b210 100755 --- a/tools/testing/selftests/net/fib-onlink-tests.sh +++ b/tools/testing/selftests/net/fib-onlink-tests.sh @@ -120,7 +120,7 @@ log_subsection() run_cmd() { - local cmd="$*" + local cmd="$1" local out local rc @@ -145,7 +145,7 @@ get_linklocal() local pfx local addr - addr=$(${pfx} ip -6 -br addr show dev ${dev} | \ + addr=$(${pfx} ${IP} -6 -br addr show dev ${dev} | \ awk '{ for (i = 3; i <= NF; ++i) { if ($i ~ /^fe80/) @@ -173,58 +173,48 @@ setup() set -e - # create namespace - setup_ns PEER_NS + # create namespaces + setup_ns ns1 + IP="ip -netns $ns1" + setup_ns ns2 # add vrf table - ip li add ${VRF} type vrf table ${VRF_TABLE} - ip li set ${VRF} up - ip ro add table ${VRF_TABLE} unreachable default metric 8192 - ip -6 ro add table ${VRF_TABLE} unreachable default metric 8192 + ${IP} li add ${VRF} type vrf table ${VRF_TABLE} + ${IP} li set ${VRF} up + ${IP} ro add table ${VRF_TABLE} unreachable default metric 8192 + ${IP} -6 ro add table ${VRF_TABLE} unreachable default metric 8192 # create test interfaces - ip li add ${NETIFS[p1]} type veth peer name ${NETIFS[p2]} - ip li add ${NETIFS[p3]} type veth peer name ${NETIFS[p4]} - ip li add ${NETIFS[p5]} type veth peer name ${NETIFS[p6]} - ip li add ${NETIFS[p7]} type veth peer name ${NETIFS[p8]} + ${IP} li add ${NETIFS[p1]} type veth peer name ${NETIFS[p2]} + ${IP} li add ${NETIFS[p3]} type veth peer name ${NETIFS[p4]} + ${IP} li add ${NETIFS[p5]} type veth peer name ${NETIFS[p6]} + ${IP} li add ${NETIFS[p7]} type veth peer name ${NETIFS[p8]} # enslave vrf interfaces for n in 5 7; do - ip li set ${NETIFS[p${n}]} vrf ${VRF} + ${IP} li set ${NETIFS[p${n}]} vrf ${VRF} done # add addresses for n in 1 3 5 7; do - ip li set ${NETIFS[p${n}]} up - ip addr add ${V4ADDRS[p${n}]}/24 dev ${NETIFS[p${n}]} - ip addr add ${V6ADDRS[p${n}]}/64 dev ${NETIFS[p${n}]} nodad + ${IP} li set ${NETIFS[p${n}]} up + ${IP} addr add ${V4ADDRS[p${n}]}/24 dev ${NETIFS[p${n}]} + ${IP} addr add ${V6ADDRS[p${n}]}/64 dev ${NETIFS[p${n}]} nodad done # move peer interfaces to namespace and add addresses for n in 2 4 6 8; do - ip li set ${NETIFS[p${n}]} netns ${PEER_NS} up - ip -netns ${PEER_NS} addr add ${V4ADDRS[p${n}]}/24 dev ${NETIFS[p${n}]} - ip -netns ${PEER_NS} addr add ${V6ADDRS[p${n}]}/64 dev ${NETIFS[p${n}]} nodad + ${IP} li set ${NETIFS[p${n}]} netns ${ns2} up + ip -netns $ns2 addr add ${V4ADDRS[p${n}]}/24 dev ${NETIFS[p${n}]} + ip -netns $ns2 addr add ${V6ADDRS[p${n}]}/64 dev ${NETIFS[p${n}]} nodad done - ip -6 ro add default via ${V6ADDRS[p3]/::[0-9]/::64} - ip -6 ro add table ${VRF_TABLE} default via ${V6ADDRS[p7]/::[0-9]/::64} + ${IP} -6 ro add default via ${V6ADDRS[p3]/::[0-9]/::64} + ${IP} -6 ro add table ${VRF_TABLE} default via ${V6ADDRS[p7]/::[0-9]/::64} set +e } -cleanup() -{ - # make sure we start from a clean slate - cleanup_ns ${PEER_NS} 2>/dev/null - for n in 1 3 5 7; do - ip link del ${NETIFS[p${n}]} 2>/dev/null - done - ip link del ${VRF} 2>/dev/null - ip ro flush table ${VRF_TABLE} - ip -6 ro flush table ${VRF_TABLE} -} - ################################################################################ # IPv4 tests # @@ -241,7 +231,7 @@ run_ip() # dev arg may be empty [ -n "${dev}" ] && dev="dev ${dev}" - run_cmd ip ro add table "${table}" "${prefix}"/32 via "${gw}" "${dev}" onlink + run_cmd "${IP} ro add table ${table} ${prefix}/32 via ${gw} ${dev} onlink" log_test $? ${exp_rc} "${desc}" } @@ -257,8 +247,8 @@ run_ip_mpath() # dev arg may be empty [ -n "${dev}" ] && dev="dev ${dev}" - run_cmd ip ro add table "${table}" "${prefix}"/32 \ - nexthop via ${nh1} nexthop via ${nh2} + run_cmd "${IP} ro add table ${table} ${prefix}/32 \ + nexthop via ${nh1} nexthop via ${nh2}" log_test $? ${exp_rc} "${desc}" } @@ -339,7 +329,7 @@ run_ip6() # dev arg may be empty [ -n "${dev}" ] && dev="dev ${dev}" - run_cmd ip -6 ro add table "${table}" "${prefix}"/128 via "${gw}" "${dev}" onlink + run_cmd "${IP} -6 ro add table ${table} ${prefix}/128 via ${gw} ${dev} onlink" log_test $? ${exp_rc} "${desc}" } @@ -353,8 +343,8 @@ run_ip6_mpath() local exp_rc="$6" local desc="$7" - run_cmd ip -6 ro add table "${table}" "${prefix}"/128 "${opts}" \ - nexthop via ${nh1} nexthop via ${nh2} + run_cmd "${IP} -6 ro add table ${table} ${prefix}/128 ${opts} \ + nexthop via ${nh1} nexthop via ${nh2}" log_test $? ${exp_rc} "${desc}" } @@ -491,10 +481,9 @@ do esac done -cleanup setup run_onlink_tests -cleanup +cleanup_ns ${ns1} ${ns2} if [ "$TESTS" != "none" ]; then printf "\nTests passed: %3d\n" ${nsuccess} -- cgit v1.2.3 From a91cc48246605af9aeef1edd32232976d74d9502 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 15 Jan 2026 09:21:54 -0800 Subject: KVM: selftests: Test READ=>WRITE dirty logging behavior for shadow MMU Update the nested dirty log test to validate KVM's handling of READ faults when dirty logging is enabled. Specifically, set the Dirty bit in the guest PTEs used to map L2 GPAs, so that KVM will create writable SPTEs when handling L2 read faults. When handling read faults in the shadow MMU, KVM opportunistically creates a writable SPTE if the mapping can be writable *and* the gPTE is dirty (or doesn't support the Dirty bit), i.e. if KVM doesn't need to intercept writes in order to emulate Dirty-bit updates. To actually test the L2 READ=>WRITE sequence, e.g. without masking a false pass by other test activity, route the READ=>WRITE and WRITE=>WRITE sequences to separate L1 pages, and differentiate between "marked dirty due to a WRITE access/fault" and "marked dirty due to creating a writable SPTE for a READ access/fault". The updated sequence exposes the bug fixed by KVM commit 1f4e5fc83a42 ("KVM: x86: fix nested guest live migration with PML") when the guest performs a READ=>WRITE sequence with dirty guest PTEs. Opportunistically tweak and rename the address macros, and add comments, to make it more obvious what the test is doing. E.g. NESTED_TEST_MEM1 vs. GUEST_TEST_MEM doesn't make it all that obvious that the test is creating aliases in both the L2 GPA and GVA address spaces, but only when L1 is using TDP to run L2. Cc: Yosry Ahmed Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20260115172154.709024-1-seanjc@google.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/include/x86/processor.h | 1 + tools/testing/selftests/kvm/lib/x86/processor.c | 7 + .../selftests/kvm/x86/nested_dirty_log_test.c | 187 +++++++++++++++------ 3 files changed, 143 insertions(+), 52 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 6bfffc3b0a33..4ebae4269e68 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1486,6 +1486,7 @@ bool kvm_cpu_has_tdp(void); void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size); void tdp_identity_map_default_memslots(struct kvm_vm *vm); void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size); +uint64_t *tdp_get_pte(struct kvm_vm *vm, uint64_t l2_gpa); /* * Basic CPU control in CR0 diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index ab869a98bbdc..fab18e9be66c 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -390,6 +390,13 @@ static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, return virt_get_pte(vm, mmu, pte, vaddr, PG_LEVEL_4K); } +uint64_t *tdp_get_pte(struct kvm_vm *vm, uint64_t l2_gpa) +{ + int level = PG_LEVEL_4K; + + return __vm_get_page_table_entry(vm, &vm->stage2_mmu, l2_gpa, &level); +} + uint64_t *vm_get_pte(struct kvm_vm *vm, uint64_t vaddr) { int level = PG_LEVEL_4K; diff --git a/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c b/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c index 89d2e86a0db9..619229bbd693 100644 --- a/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c @@ -17,29 +17,54 @@ /* The memory slot index to track dirty pages */ #define TEST_MEM_SLOT_INDEX 1 -#define TEST_MEM_PAGES 3 -/* L1 guest test virtual memory offset */ -#define GUEST_TEST_MEM 0xc0000000 +/* + * Allocate four pages total. Two pages are used to verify that the KVM marks + * the accessed page/GFN as marked dirty, but not the "other" page. Times two + * so that each "normal" page can be accessed from L2 via an aliased L2 GVA+GPA + * (when TDP is enabled), to verify KVM marks _L1's_ page/GFN as dirty (to + * detect failures, L2 => L1 GPAs can't be identity mapped in the TDP page + * tables, as marking L2's GPA dirty would get a false pass if L1 == L2). + */ +#define TEST_MEM_PAGES 4 + +#define TEST_MEM_BASE 0xc0000000 +#define TEST_MEM_ALIAS_BASE 0xc0002000 + +#define TEST_GUEST_ADDR(base, idx) ((base) + (idx) * PAGE_SIZE) -/* L2 guest test virtual memory offset */ -#define NESTED_TEST_MEM1 0xc0001000 -#define NESTED_TEST_MEM2 0xc0002000 +#define TEST_GVA(idx) TEST_GUEST_ADDR(TEST_MEM_BASE, idx) +#define TEST_GPA(idx) TEST_GUEST_ADDR(TEST_MEM_BASE, idx) + +#define TEST_ALIAS_GPA(idx) TEST_GUEST_ADDR(TEST_MEM_ALIAS_BASE, idx) + +#define TEST_HVA(vm, idx) addr_gpa2hva(vm, TEST_GPA(idx)) #define L2_GUEST_STACK_SIZE 64 -static void l2_guest_code(u64 *a, u64 *b) -{ - READ_ONCE(*a); - WRITE_ONCE(*a, 1); - GUEST_SYNC(true); - GUEST_SYNC(false); +/* Use the page offset bits to communicate the access+fault type. */ +#define TEST_SYNC_READ_FAULT BIT(0) +#define TEST_SYNC_WRITE_FAULT BIT(1) +#define TEST_SYNC_NO_FAULT BIT(2) - WRITE_ONCE(*b, 1); - GUEST_SYNC(true); - WRITE_ONCE(*b, 1); - GUEST_SYNC(true); - GUEST_SYNC(false); +static void l2_guest_code(vm_vaddr_t base) +{ + vm_vaddr_t page0 = TEST_GUEST_ADDR(base, 0); + vm_vaddr_t page1 = TEST_GUEST_ADDR(base, 1); + + READ_ONCE(*(u64 *)page0); + GUEST_SYNC(page0 | TEST_SYNC_READ_FAULT); + WRITE_ONCE(*(u64 *)page0, 1); + GUEST_SYNC(page0 | TEST_SYNC_WRITE_FAULT); + READ_ONCE(*(u64 *)page0); + GUEST_SYNC(page0 | TEST_SYNC_NO_FAULT); + + WRITE_ONCE(*(u64 *)page1, 1); + GUEST_SYNC(page1 | TEST_SYNC_WRITE_FAULT); + WRITE_ONCE(*(u64 *)page1, 1); + GUEST_SYNC(page1 | TEST_SYNC_WRITE_FAULT); + READ_ONCE(*(u64 *)page1); + GUEST_SYNC(page1 | TEST_SYNC_NO_FAULT); /* Exit to L1 and never come back. */ vmcall(); @@ -47,13 +72,22 @@ static void l2_guest_code(u64 *a, u64 *b) static void l2_guest_code_tdp_enabled(void) { - l2_guest_code((u64 *)NESTED_TEST_MEM1, (u64 *)NESTED_TEST_MEM2); + /* + * Use the aliased virtual addresses when running with TDP to verify + * that KVM correctly handles the case where a page is dirtied via a + * different GPA than would be used by L1. + */ + l2_guest_code(TEST_MEM_ALIAS_BASE); } static void l2_guest_code_tdp_disabled(void) { - /* Access the same L1 GPAs as l2_guest_code_tdp_enabled() */ - l2_guest_code((u64 *)GUEST_TEST_MEM, (u64 *)GUEST_TEST_MEM); + /* + * Use the "normal" virtual addresses when running without TDP enabled, + * in which case L2 will use the same page tables as L1, and thus needs + * to use the same virtual addresses that are mapped into L1. + */ + l2_guest_code(TEST_MEM_BASE); } void l1_vmx_code(struct vmx_pages *vmx) @@ -72,9 +106,9 @@ void l1_vmx_code(struct vmx_pages *vmx) prepare_vmcs(vmx, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]); - GUEST_SYNC(false); + GUEST_SYNC(TEST_SYNC_NO_FAULT); GUEST_ASSERT(!vmlaunch()); - GUEST_SYNC(false); + GUEST_SYNC(TEST_SYNC_NO_FAULT); GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_VMCALL); GUEST_DONE(); } @@ -91,9 +125,9 @@ static void l1_svm_code(struct svm_test_data *svm) generic_svm_setup(svm, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]); - GUEST_SYNC(false); + GUEST_SYNC(TEST_SYNC_NO_FAULT); run_guest(svm->vmcb, svm->vmcb_gpa); - GUEST_SYNC(false); + GUEST_SYNC(TEST_SYNC_NO_FAULT); GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL); GUEST_DONE(); } @@ -106,12 +140,66 @@ static void l1_guest_code(void *data) l1_svm_code(data); } +static void test_handle_ucall_sync(struct kvm_vm *vm, u64 arg, + unsigned long *bmap) +{ + vm_vaddr_t gva = arg & ~(PAGE_SIZE - 1); + int page_nr, i; + + /* + * Extract the page number of underlying physical page, which is also + * the _L1_ page number. The dirty bitmap _must_ be updated based on + * the L1 GPA, not L2 GPA, i.e. whether or not L2 used an aliased GPA + * (i.e. if TDP enabled for L2) is irrelevant with respect to the dirty + * bitmap and which underlying physical page is accessed. + * + * Note, gva will be '0' if there was no access, i.e. if the purpose of + * the sync is to verify all pages are clean. + */ + if (!gva) + page_nr = 0; + else if (gva >= TEST_MEM_ALIAS_BASE) + page_nr = (gva - TEST_MEM_ALIAS_BASE) >> PAGE_SHIFT; + else + page_nr = (gva - TEST_MEM_BASE) >> PAGE_SHIFT; + TEST_ASSERT(page_nr == 0 || page_nr == 1, + "Test bug, unexpected frame number '%u' for arg = %lx", page_nr, arg); + TEST_ASSERT(gva || (arg & TEST_SYNC_NO_FAULT), + "Test bug, gva must be valid if a fault is expected"); + + kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); + + /* + * Check all pages to verify the correct physical page was modified (or + * not), and that all pages are clean/dirty as expected. + * + * If a fault of any kind is expected, the target page should be dirty + * as the Dirty bit is set in the gPTE. KVM should create a writable + * SPTE even on a read fault, *and* KVM must mark the GFN as dirty + * when doing so. + */ + for (i = 0; i < TEST_MEM_PAGES; i++) { + if (i == page_nr && (arg & TEST_SYNC_WRITE_FAULT)) + TEST_ASSERT(*(u64 *)TEST_HVA(vm, i) == 1, + "Page %u incorrectly not written by guest", i); + else + TEST_ASSERT(*(u64 *)TEST_HVA(vm, i) == 0xaaaaaaaaaaaaaaaaULL, + "Page %u incorrectly written by guest", i); + + if (i == page_nr && !(arg & TEST_SYNC_NO_FAULT)) + TEST_ASSERT(test_bit(i, bmap), + "Page %u incorrectly reported clean on %s fault", + i, arg & TEST_SYNC_READ_FAULT ? "read" : "write"); + else + TEST_ASSERT(!test_bit(i, bmap), + "Page %u incorrectly reported dirty", i); + } +} + static void test_dirty_log(bool nested_tdp) { vm_vaddr_t nested_gva = 0; unsigned long *bmap; - uint64_t *host_test_mem; - struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct ucall uc; @@ -133,35 +221,46 @@ static void test_dirty_log(bool nested_tdp) /* Add an extra memory slot for testing dirty logging */ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - GUEST_TEST_MEM, + TEST_MEM_BASE, TEST_MEM_SLOT_INDEX, TEST_MEM_PAGES, KVM_MEM_LOG_DIRTY_PAGES); /* - * Add an identity map for GVA range [0xc0000000, 0xc0002000). This + * Add an identity map for GVA range [0xc0000000, 0xc0004000). This * affects both L1 and L2. However... */ - virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, TEST_MEM_PAGES); + virt_map(vm, TEST_MEM_BASE, TEST_MEM_BASE, TEST_MEM_PAGES); /* - * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to - * 0xc0000000. + * ... pages in the L2 GPA address range [0xc0002000, 0xc0004000) will + * map to [0xc0000000, 0xc0002000) when TDP is enabled (for L2). * * When TDP is disabled, the L2 guest code will still access the same L1 * GPAs as the TDP enabled case. + * + * Set the Dirty bit in the PTEs used by L2 so that KVM will create + * writable SPTEs when handling read faults (if the Dirty bit isn't + * set, KVM must intercept the next write to emulate the Dirty bit + * update). */ if (nested_tdp) { tdp_identity_map_default_memslots(vm); - tdp_map(vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); - tdp_map(vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); + tdp_map(vm, TEST_ALIAS_GPA(0), TEST_GPA(0), PAGE_SIZE); + tdp_map(vm, TEST_ALIAS_GPA(1), TEST_GPA(1), PAGE_SIZE); + + *tdp_get_pte(vm, TEST_ALIAS_GPA(0)) |= PTE_DIRTY_MASK(&vm->stage2_mmu); + *tdp_get_pte(vm, TEST_ALIAS_GPA(1)) |= PTE_DIRTY_MASK(&vm->stage2_mmu); + } else { + *vm_get_pte(vm, TEST_GVA(0)) |= PTE_DIRTY_MASK(&vm->mmu); + *vm_get_pte(vm, TEST_GVA(1)) |= PTE_DIRTY_MASK(&vm->mmu); } bmap = bitmap_zalloc(TEST_MEM_PAGES); - host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM); while (!done) { - memset(host_test_mem, 0xaa, TEST_MEM_PAGES * PAGE_SIZE); + memset(TEST_HVA(vm, 0), 0xaa, TEST_MEM_PAGES * PAGE_SIZE); + vcpu_run(vcpu); TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); @@ -170,23 +269,7 @@ static void test_dirty_log(bool nested_tdp) REPORT_GUEST_ASSERT(uc); /* NOT REACHED */ case UCALL_SYNC: - /* - * The nested guest wrote at offset 0x1000 in the memslot, but the - * dirty bitmap must be filled in according to L1 GPA, not L2. - */ - kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); - if (uc.args[1]) { - TEST_ASSERT(test_bit(0, bmap), "Page 0 incorrectly reported clean"); - TEST_ASSERT(host_test_mem[0] == 1, "Page 0 not written by guest"); - } else { - TEST_ASSERT(!test_bit(0, bmap), "Page 0 incorrectly reported dirty"); - TEST_ASSERT(host_test_mem[0] == 0xaaaaaaaaaaaaaaaaULL, "Page 0 written by guest"); - } - - TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty"); - TEST_ASSERT(host_test_mem[PAGE_SIZE / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest"); - TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty"); - TEST_ASSERT(host_test_mem[PAGE_SIZE*2 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest"); + test_handle_ucall_sync(vm, uc.args[1], bmap); break; case UCALL_DONE: done = true; -- cgit v1.2.3 From 086c99fbe45070d02851427eab5ae26fe7d0f3c0 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 15 Jan 2026 07:11:41 -0800 Subject: selftests: bpf: Add test for multiple syncs from linked register Before the last commit, sync_linked_regs() corrupted the register whose bounds are being updated by copying known_reg's id to it. The ids are the same in value but known_reg has the BPF_ADD_CONST flag which is wrongly copied to reg. This later causes issues when creating new links to this reg. assign_scalar_id_before_mov() sees this BPF_ADD_CONST and gives a new id to this register and breaks the old links. This is exposed by the added selftest. Signed-off-by: Puranjay Mohan Tested-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260115151143.1344724-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_linked_scalars.c | 33 ++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/progs/verifier_linked_scalars.c b/tools/testing/selftests/bpf/progs/verifier_linked_scalars.c index 8f755d2464cf..5f41bbb730a7 100644 --- a/tools/testing/selftests/bpf/progs/verifier_linked_scalars.c +++ b/tools/testing/selftests/bpf/progs/verifier_linked_scalars.c @@ -31,4 +31,37 @@ l1: \ " ::: __clobber_all); } +/* + * Test that sync_linked_regs() preserves register IDs. + * + * The sync_linked_regs() function copies bounds from known_reg to linked + * registers. When doing so, it must preserve each register's original id + * to allow subsequent syncs from the same source to work correctly. + * + */ +SEC("socket") +__success +__naked void sync_linked_regs_preserves_id(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0xff; /* r0 in [0, 255] */ \ + r1 = r0; /* r0, r1 linked with id 1 */ \ + r1 += 4; /* r1 has id=1 and off=4 in [4, 259] */ \ + if r1 < 10 goto l0_%=; \ + /* r1 in [10, 259], r0 synced to [6, 255] */ \ + r2 = r0; /* r2 has id=1 and in [6, 255] */ \ + if r1 < 14 goto l0_%=; \ + /* r1 in [14, 259], r0 synced to [10, 255] */ \ + if r0 >= 10 goto l0_%=; \ + /* Never executed */ \ + r0 /= 0; \ +l0_%=: \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From b8f7622aa6e32d6fd750697b99d8ce19ad8e66d0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 29 Dec 2025 14:03:25 +0100 Subject: selftests/open_tree: add OPEN_TREE_NAMESPACE tests Add tests for OPEN_TREE_NAMESPACE. Link: https://patch.msgid.link/20251229-work-empty-namespace-v1-2-bfb24c7b061f@kernel.org Tested-by: Jeff Layton Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- .../selftests/filesystems/open_tree_ns/.gitignore | 1 + .../selftests/filesystems/open_tree_ns/Makefile | 10 + .../filesystems/open_tree_ns/open_tree_ns_test.c | 1030 ++++++++++++++++++++ tools/testing/selftests/filesystems/utils.c | 26 + tools/testing/selftests/filesystems/utils.h | 1 + 5 files changed, 1068 insertions(+) create mode 100644 tools/testing/selftests/filesystems/open_tree_ns/.gitignore create mode 100644 tools/testing/selftests/filesystems/open_tree_ns/Makefile create mode 100644 tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/filesystems/open_tree_ns/.gitignore b/tools/testing/selftests/filesystems/open_tree_ns/.gitignore new file mode 100644 index 000000000000..fb12b93fbcaa --- /dev/null +++ b/tools/testing/selftests/filesystems/open_tree_ns/.gitignore @@ -0,0 +1 @@ +open_tree_ns_test diff --git a/tools/testing/selftests/filesystems/open_tree_ns/Makefile b/tools/testing/selftests/filesystems/open_tree_ns/Makefile new file mode 100644 index 000000000000..73c03c4a7ef6 --- /dev/null +++ b/tools/testing/selftests/filesystems/open_tree_ns/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 +TEST_GEN_PROGS := open_tree_ns_test + +CFLAGS := -Wall -Werror -g $(KHDR_INCLUDES) +LDLIBS := -lcap + +include ../../lib.mk + +$(OUTPUT)/open_tree_ns_test: open_tree_ns_test.c ../utils.c + $(CC) $(CFLAGS) -o $@ $^ $(LDLIBS) diff --git a/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c b/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c new file mode 100644 index 000000000000..9711556280ae --- /dev/null +++ b/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c @@ -0,0 +1,1030 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test for OPEN_TREE_NAMESPACE flag. + * + * Test that open_tree() with OPEN_TREE_NAMESPACE creates a new mount + * namespace containing the specified mount tree. + */ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../wrappers.h" +#include "../statmount/statmount.h" +#include "../utils.h" +#include "../../kselftest_harness.h" + +#ifndef OPEN_TREE_NAMESPACE +#define OPEN_TREE_NAMESPACE (1 << 1) +#endif + +static int get_mnt_ns_id(int fd, uint64_t *mnt_ns_id) +{ + if (ioctl(fd, NS_GET_MNTNS_ID, mnt_ns_id) < 0) + return -errno; + return 0; +} + +static int get_mnt_ns_id_from_path(const char *path, uint64_t *mnt_ns_id) +{ + int fd, ret; + + fd = open(path, O_RDONLY); + if (fd < 0) + return -errno; + + ret = get_mnt_ns_id(fd, mnt_ns_id); + close(fd); + return ret; +} + +#define STATMOUNT_BUFSIZE (1 << 15) + +static struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mnt_ns_id, uint64_t mask) +{ + struct statmount *buf; + size_t bufsize = STATMOUNT_BUFSIZE; + int ret; + + for (;;) { + buf = malloc(bufsize); + if (!buf) + return NULL; + + ret = statmount(mnt_id, mnt_ns_id, mask, buf, bufsize, 0); + if (ret == 0) + return buf; + + free(buf); + if (errno != EOVERFLOW) + return NULL; + + bufsize <<= 1; + } +} + +static void log_mount(struct __test_metadata *_metadata, struct statmount *sm) +{ + const char *fs_type = ""; + const char *mnt_root = ""; + const char *mnt_point = ""; + + if (sm->mask & STATMOUNT_FS_TYPE) + fs_type = sm->str + sm->fs_type; + if (sm->mask & STATMOUNT_MNT_ROOT) + mnt_root = sm->str + sm->mnt_root; + if (sm->mask & STATMOUNT_MNT_POINT) + mnt_point = sm->str + sm->mnt_point; + + TH_LOG(" mnt_id: %llu, parent_id: %llu, fs_type: %s, root: %s, point: %s", + (unsigned long long)sm->mnt_id, + (unsigned long long)sm->mnt_parent_id, + fs_type, mnt_root, mnt_point); +} + +static void dump_mounts(struct __test_metadata *_metadata, uint64_t mnt_ns_id) +{ + uint64_t list[256]; + ssize_t nr_mounts; + + nr_mounts = listmount(LSMT_ROOT, mnt_ns_id, 0, list, 256, 0); + if (nr_mounts < 0) { + TH_LOG("listmount failed: %s", strerror(errno)); + return; + } + + TH_LOG("Mount namespace %llu contains %zd mount(s):", + (unsigned long long)mnt_ns_id, nr_mounts); + + for (ssize_t i = 0; i < nr_mounts; i++) { + struct statmount *sm; + + sm = statmount_alloc(list[i], mnt_ns_id, + STATMOUNT_MNT_BASIC | + STATMOUNT_FS_TYPE | + STATMOUNT_MNT_ROOT | + STATMOUNT_MNT_POINT); + if (!sm) { + TH_LOG(" [%zd] mnt_id %llu: statmount failed: %s", + i, (unsigned long long)list[i], strerror(errno)); + continue; + } + + log_mount(_metadata, sm); + free(sm); + } +} + +FIXTURE(open_tree_ns) +{ + int fd; + uint64_t current_ns_id; +}; + +FIXTURE_VARIANT(open_tree_ns) +{ + const char *path; + unsigned int flags; + bool expect_success; + bool expect_different_ns; + int min_mounts; +}; + +FIXTURE_VARIANT_ADD(open_tree_ns, basic_root) +{ + .path = "/", + .flags = OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC, + .expect_success = true, + .expect_different_ns = true, + /* + * The empty rootfs is hidden from listmount()/mountinfo, + * so we only see the bind mount on top of it. + */ + .min_mounts = 1, +}; + +FIXTURE_VARIANT_ADD(open_tree_ns, recursive_root) +{ + .path = "/", + .flags = OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC, + .expect_success = true, + .expect_different_ns = true, + .min_mounts = 1, +}; + +FIXTURE_VARIANT_ADD(open_tree_ns, subdir_tmp) +{ + .path = "/tmp", + .flags = OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC, + .expect_success = true, + .expect_different_ns = true, + .min_mounts = 1, +}; + +FIXTURE_VARIANT_ADD(open_tree_ns, subdir_proc) +{ + .path = "/proc", + .flags = OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC, + .expect_success = true, + .expect_different_ns = true, + .min_mounts = 1, +}; + +FIXTURE_VARIANT_ADD(open_tree_ns, recursive_tmp) +{ + .path = "/tmp", + .flags = OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC, + .expect_success = true, + .expect_different_ns = true, + .min_mounts = 1, +}; + +FIXTURE_VARIANT_ADD(open_tree_ns, recursive_run) +{ + .path = "/run", + .flags = OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC, + .expect_success = true, + .expect_different_ns = true, + .min_mounts = 1, +}; + +FIXTURE_VARIANT_ADD(open_tree_ns, invalid_recursive_alone) +{ + .path = "/", + .flags = AT_RECURSIVE | OPEN_TREE_CLOEXEC, + .expect_success = false, + .expect_different_ns = false, + .min_mounts = 0, +}; + +FIXTURE_SETUP(open_tree_ns) +{ + int ret; + + self->fd = -1; + + /* Check if open_tree syscall is supported */ + ret = sys_open_tree(-1, NULL, 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "open_tree() syscall not supported"); + + /* Check if statmount/listmount are supported */ + ret = statmount(0, 0, 0, NULL, 0, 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "statmount() syscall not supported"); + + /* Get current mount namespace ID for comparison */ + ret = get_mnt_ns_id_from_path("/proc/self/ns/mnt", &self->current_ns_id); + if (ret < 0) + SKIP(return, "Failed to get current mount namespace ID"); +} + +FIXTURE_TEARDOWN(open_tree_ns) +{ + if (self->fd >= 0) + close(self->fd); +} + +TEST_F(open_tree_ns, create_namespace) +{ + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int ret; + + self->fd = sys_open_tree(AT_FDCWD, variant->path, variant->flags); + + if (!variant->expect_success) { + ASSERT_LT(self->fd, 0); + ASSERT_EQ(errno, EINVAL); + return; + } + + if (self->fd < 0 && errno == EINVAL) + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + + ASSERT_GE(self->fd, 0); + + /* Verify we can get the namespace ID */ + ret = get_mnt_ns_id(self->fd, &new_ns_id); + ASSERT_EQ(ret, 0); + + /* Verify it's a different namespace */ + if (variant->expect_different_ns) + ASSERT_NE(new_ns_id, self->current_ns_id); + + /* List mounts in the new namespace */ + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); + ASSERT_GE(nr_mounts, 0) { + TH_LOG("%m - listmount failed"); + } + + /* Verify minimum expected mounts */ + ASSERT_GE(nr_mounts, variant->min_mounts); + TH_LOG("Namespace contains %zd mounts", nr_mounts); +} + +TEST_F(open_tree_ns, setns_into_namespace) +{ + uint64_t new_ns_id; + pid_t pid; + int status; + int ret; + + /* Only test with basic flags */ + if (!(variant->flags & OPEN_TREE_NAMESPACE)) + SKIP(return, "setns test only for basic / case"); + + self->fd = sys_open_tree(AT_FDCWD, variant->path, variant->flags); + if (self->fd < 0 && errno == EINVAL) + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + + ASSERT_GE(self->fd, 0); + + /* Get namespace ID and dump all mounts */ + ret = get_mnt_ns_id(self->fd, &new_ns_id); + ASSERT_EQ(ret, 0); + + dump_mounts(_metadata, new_ns_id); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: try to enter the namespace */ + if (setns(self->fd, CLONE_NEWNS) < 0) + _exit(1); + _exit(0); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); +} + +TEST_F(open_tree_ns, verify_mount_properties) +{ + struct statmount sm; + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int ret; + + /* Only test with basic flags on root */ + if (variant->flags != (OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC) || + strcmp(variant->path, "/") != 0) + SKIP(return, "mount properties test only for basic / case"); + + self->fd = sys_open_tree(AT_FDCWD, "/", OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC); + if (self->fd < 0 && errno == EINVAL) + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + + ASSERT_GE(self->fd, 0); + + ret = get_mnt_ns_id(self->fd, &new_ns_id); + ASSERT_EQ(ret, 0); + + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); + ASSERT_GE(nr_mounts, 1); + + /* Get info about the root mount (the bind mount, rootfs is hidden) */ + ret = statmount(list[0], new_ns_id, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0); + ASSERT_EQ(ret, 0); + + ASSERT_NE(sm.mnt_id, sm.mnt_parent_id); + + TH_LOG("Root mount id: %llu, parent: %llu", + (unsigned long long)sm.mnt_id, + (unsigned long long)sm.mnt_parent_id); +} + +FIXTURE(open_tree_ns_caps) +{ + bool has_caps; +}; + +FIXTURE_SETUP(open_tree_ns_caps) +{ + int ret; + + /* Check if open_tree syscall is supported */ + ret = sys_open_tree(-1, NULL, 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "open_tree() syscall not supported"); + + self->has_caps = (geteuid() == 0); +} + +FIXTURE_TEARDOWN(open_tree_ns_caps) +{ +} + +TEST_F(open_tree_ns_caps, requires_cap_sys_admin) +{ + pid_t pid; + int status; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + + /* Child: drop privileges using utils.h helper */ + if (enter_userns() != 0) + _exit(2); + + /* Drop all caps using utils.h helper */ + if (caps_down() == 0) + _exit(3); + + fd = sys_open_tree(AT_FDCWD, "/", + OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC); + if (fd >= 0) { + close(fd); + /* Should have failed without caps */ + _exit(1); + } + + if (errno == EPERM) + _exit(0); + + /* EINVAL means OPEN_TREE_NAMESPACE not supported */ + if (errno == EINVAL) + _exit(4); + + /* Unexpected error */ + _exit(5); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + + switch (WEXITSTATUS(status)) { + case 0: + /* Expected: EPERM without caps */ + break; + case 1: + ASSERT_FALSE(true) TH_LOG("OPEN_TREE_NAMESPACE succeeded without caps"); + break; + case 2: + SKIP(return, "setup_userns failed"); + break; + case 3: + SKIP(return, "caps_down failed"); + break; + case 4: + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + break; + default: + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", + WEXITSTATUS(status)); + break; + } +} + +FIXTURE(open_tree_ns_userns) +{ + int fd; +}; + +FIXTURE_SETUP(open_tree_ns_userns) +{ + int ret; + + self->fd = -1; + + /* Check if open_tree syscall is supported */ + ret = sys_open_tree(-1, NULL, 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "open_tree() syscall not supported"); + + /* Check if statmount/listmount are supported */ + ret = statmount(0, 0, 0, NULL, 0, 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "statmount() syscall not supported"); +} + +FIXTURE_TEARDOWN(open_tree_ns_userns) +{ + if (self->fd >= 0) + close(self->fd); +} + +TEST_F(open_tree_ns_userns, create_in_userns) +{ + pid_t pid; + int status; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int fd; + + /* Create new user namespace (also creates mount namespace) */ + if (enter_userns() != 0) + _exit(2); + + /* Now we have CAP_SYS_ADMIN in the user namespace */ + fd = sys_open_tree(AT_FDCWD, "/", + OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC); + if (fd < 0) { + if (errno == EINVAL) + _exit(4); /* OPEN_TREE_NAMESPACE not supported */ + _exit(1); + } + + /* Verify we can get the namespace ID */ + if (get_mnt_ns_id(fd, &new_ns_id) != 0) + _exit(5); + + /* Verify we can list mounts in the new namespace */ + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); + if (nr_mounts < 0) + _exit(6); + + /* Should have at least 1 mount */ + if (nr_mounts < 1) + _exit(7); + + close(fd); + _exit(0); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + + switch (WEXITSTATUS(status)) { + case 0: + /* Success */ + break; + case 1: + ASSERT_FALSE(true) TH_LOG("open_tree(OPEN_TREE_NAMESPACE) failed in userns"); + break; + case 2: + SKIP(return, "setup_userns failed"); + break; + case 4: + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + break; + case 5: + ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID"); + break; + case 6: + ASSERT_FALSE(true) TH_LOG("listmount failed in new namespace"); + break; + case 7: + ASSERT_FALSE(true) TH_LOG("New namespace has no mounts"); + break; + default: + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", + WEXITSTATUS(status)); + break; + } +} + +TEST_F(open_tree_ns_userns, setns_in_userns) +{ + pid_t pid; + int status; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + uint64_t new_ns_id; + int fd; + pid_t inner_pid; + int inner_status; + + /* Create new user namespace */ + if (enter_userns() != 0) + _exit(2); + + fd = sys_open_tree(AT_FDCWD, "/", + OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC); + if (fd < 0) { + if (errno == EINVAL) + _exit(4); + _exit(1); + } + + if (get_mnt_ns_id(fd, &new_ns_id) != 0) + _exit(5); + + /* Fork again to test setns into the new namespace */ + inner_pid = fork(); + if (inner_pid < 0) + _exit(8); + + if (inner_pid == 0) { + /* Inner child: enter the new namespace */ + if (setns(fd, CLONE_NEWNS) < 0) + _exit(1); + _exit(0); + } + + if (waitpid(inner_pid, &inner_status, 0) != inner_pid) + _exit(9); + + if (!WIFEXITED(inner_status) || WEXITSTATUS(inner_status) != 0) + _exit(10); + + close(fd); + _exit(0); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + + switch (WEXITSTATUS(status)) { + case 0: + /* Success */ + break; + case 1: + ASSERT_FALSE(true) TH_LOG("open_tree or setns failed in userns"); + break; + case 2: + SKIP(return, "setup_userns failed"); + break; + case 4: + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + break; + case 5: + ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID"); + break; + case 8: + ASSERT_FALSE(true) TH_LOG("Inner fork failed"); + break; + case 9: + ASSERT_FALSE(true) TH_LOG("Inner waitpid failed"); + break; + case 10: + ASSERT_FALSE(true) TH_LOG("setns into new namespace failed"); + break; + default: + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", + WEXITSTATUS(status)); + break; + } +} + +TEST_F(open_tree_ns_userns, recursive_in_userns) +{ + pid_t pid; + int status; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int fd; + + /* Create new user namespace */ + if (enter_userns() != 0) + _exit(2); + + /* Test recursive flag in userns */ + fd = sys_open_tree(AT_FDCWD, "/", + OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC); + if (fd < 0) { + if (errno == EINVAL) + _exit(4); + _exit(1); + } + + if (get_mnt_ns_id(fd, &new_ns_id) != 0) + _exit(5); + + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); + if (nr_mounts < 0) + _exit(6); + + /* Recursive should copy submounts too */ + if (nr_mounts < 1) + _exit(7); + + close(fd); + _exit(0); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + + switch (WEXITSTATUS(status)) { + case 0: + /* Success */ + break; + case 1: + ASSERT_FALSE(true) TH_LOG("open_tree(OPEN_TREE_NAMESPACE|AT_RECURSIVE) failed in userns"); + break; + case 2: + SKIP(return, "setup_userns failed"); + break; + case 4: + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + break; + case 5: + ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID"); + break; + case 6: + ASSERT_FALSE(true) TH_LOG("listmount failed in new namespace"); + break; + case 7: + ASSERT_FALSE(true) TH_LOG("New namespace has no mounts"); + break; + default: + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", + WEXITSTATUS(status)); + break; + } +} + +TEST_F(open_tree_ns_userns, umount_fails_einval) +{ + pid_t pid; + int status; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int fd; + ssize_t i; + + /* Create new user namespace */ + if (enter_userns() != 0) + _exit(2); + + fd = sys_open_tree(AT_FDCWD, "/", + OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC); + if (fd < 0) { + if (errno == EINVAL) + _exit(4); + _exit(1); + } + + if (get_mnt_ns_id(fd, &new_ns_id) != 0) + _exit(5); + + /* Get all mounts in the new namespace */ + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, LISTMOUNT_REVERSE); + if (nr_mounts < 0) + _exit(9); + + if (nr_mounts < 1) + _exit(10); + + /* Enter the new namespace */ + if (setns(fd, CLONE_NEWNS) < 0) + _exit(6); + + for (i = 0; i < nr_mounts; i++) { + struct statmount *sm; + const char *mnt_point; + + sm = statmount_alloc(list[i], new_ns_id, + STATMOUNT_MNT_POINT); + if (!sm) + _exit(11); + + mnt_point = sm->str + sm->mnt_point; + + TH_LOG("Trying to umount %s", mnt_point); + if (umount2(mnt_point, MNT_DETACH) == 0) { + free(sm); + _exit(7); + } + + if (errno != EINVAL) { + /* Wrong error */ + free(sm); + _exit(8); + } + + free(sm); + } + + close(fd); + _exit(0); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + + switch (WEXITSTATUS(status)) { + case 0: + break; + case 1: + ASSERT_FALSE(true) TH_LOG("open_tree(OPEN_TREE_NAMESPACE) failed"); + break; + case 2: + SKIP(return, "setup_userns failed"); + break; + case 4: + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + break; + case 5: + ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID"); + break; + case 6: + ASSERT_FALSE(true) TH_LOG("setns into new namespace failed"); + break; + case 7: + ASSERT_FALSE(true) TH_LOG("umount succeeded but should have failed with EINVAL"); + break; + case 8: + ASSERT_FALSE(true) TH_LOG("umount failed with wrong error (expected EINVAL)"); + break; + case 9: + ASSERT_FALSE(true) TH_LOG("listmount failed"); + break; + case 10: + ASSERT_FALSE(true) TH_LOG("No mounts in new namespace"); + break; + case 11: + ASSERT_FALSE(true) TH_LOG("statmount_alloc failed"); + break; + default: + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", + WEXITSTATUS(status)); + break; + } +} + +TEST_F(open_tree_ns_userns, umount_succeeds) +{ + pid_t pid; + int status; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int fd; + ssize_t i; + + if (unshare(CLONE_NEWNS)) + _exit(1); + + if (sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) != 0) + _exit(1); + + fd = sys_open_tree(AT_FDCWD, "/", + OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC); + if (fd < 0) { + if (errno == EINVAL) + _exit(4); + _exit(1); + } + + if (get_mnt_ns_id(fd, &new_ns_id) != 0) + _exit(5); + + /* Get all mounts in the new namespace */ + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, LISTMOUNT_REVERSE); + if (nr_mounts < 0) + _exit(9); + + if (nr_mounts < 1) + _exit(10); + + /* Enter the new namespace */ + if (setns(fd, CLONE_NEWNS) < 0) + _exit(6); + + for (i = 0; i < nr_mounts; i++) { + struct statmount *sm; + const char *mnt_point; + + sm = statmount_alloc(list[i], new_ns_id, + STATMOUNT_MNT_POINT); + if (!sm) + _exit(11); + + mnt_point = sm->str + sm->mnt_point; + + TH_LOG("Trying to umount %s", mnt_point); + if (umount2(mnt_point, MNT_DETACH) != 0) { + free(sm); + _exit(7); + } + + free(sm); + } + + close(fd); + _exit(0); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + + switch (WEXITSTATUS(status)) { + case 0: + break; + case 1: + ASSERT_FALSE(true) TH_LOG("open_tree(OPEN_TREE_NAMESPACE) failed"); + break; + case 2: + SKIP(return, "setup_userns failed"); + break; + case 4: + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + break; + case 5: + ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID"); + break; + case 6: + ASSERT_FALSE(true) TH_LOG("setns into new namespace failed"); + break; + case 7: + ASSERT_FALSE(true) TH_LOG("umount succeeded but should have failed with EINVAL"); + break; + case 9: + ASSERT_FALSE(true) TH_LOG("listmount failed"); + break; + case 10: + ASSERT_FALSE(true) TH_LOG("No mounts in new namespace"); + break; + case 11: + ASSERT_FALSE(true) TH_LOG("statmount_alloc failed"); + break; + default: + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", + WEXITSTATUS(status)); + break; + } +} + +FIXTURE(open_tree_ns_unbindable) +{ + char tmpdir[PATH_MAX]; + bool mounted; +}; + +FIXTURE_SETUP(open_tree_ns_unbindable) +{ + int ret; + + self->mounted = false; + + /* Check if open_tree syscall is supported */ + ret = sys_open_tree(-1, NULL, 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "open_tree() syscall not supported"); + + /* Create a temporary directory for the test mount */ + snprintf(self->tmpdir, sizeof(self->tmpdir), + "/tmp/open_tree_ns_test.XXXXXX"); + ASSERT_NE(mkdtemp(self->tmpdir), NULL); + + /* Mount tmpfs there */ + ret = mount("tmpfs", self->tmpdir, "tmpfs", 0, NULL); + if (ret < 0) { + rmdir(self->tmpdir); + SKIP(return, "Failed to mount tmpfs"); + } + self->mounted = true; + + ret = mount(NULL, self->tmpdir, NULL, MS_UNBINDABLE, NULL); + if (ret < 0) { + rmdir(self->tmpdir); + SKIP(return, "Failed to make tmpfs unbindable"); + } +} + +FIXTURE_TEARDOWN(open_tree_ns_unbindable) +{ + if (self->mounted) + umount2(self->tmpdir, MNT_DETACH); + rmdir(self->tmpdir); +} + +TEST_F(open_tree_ns_unbindable, fails_on_unbindable) +{ + int fd; + + fd = sys_open_tree(AT_FDCWD, self->tmpdir, + OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC); + ASSERT_LT(fd, 0); +} + +TEST_F(open_tree_ns_unbindable, recursive_skips_on_unbindable) +{ + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int fd; + ssize_t i; + bool found_unbindable = false; + + fd = sys_open_tree(AT_FDCWD, "/", + OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC); + ASSERT_GT(fd, 0); + + ASSERT_EQ(get_mnt_ns_id(fd, &new_ns_id), 0); + + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); + ASSERT_GE(nr_mounts, 0) { + TH_LOG("listmount failed: %m"); + } + + /* + * Iterate through all mounts in the new namespace and verify + * the unbindable tmpfs mount was silently dropped. + */ + for (i = 0; i < nr_mounts; i++) { + struct statmount *sm; + const char *mnt_point; + + sm = statmount_alloc(list[i], new_ns_id, STATMOUNT_MNT_POINT); + ASSERT_NE(sm, NULL) { + TH_LOG("statmount_alloc failed for mnt_id %llu", + (unsigned long long)list[i]); + } + + mnt_point = sm->str + sm->mnt_point; + + if (strcmp(mnt_point, self->tmpdir) == 0) { + TH_LOG("Found unbindable mount at %s (should have been dropped)", + mnt_point); + found_unbindable = true; + } + + free(sm); + } + + ASSERT_FALSE(found_unbindable) { + TH_LOG("Unbindable mount at %s was not dropped", self->tmpdir); + } + + close(fd); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/filesystems/utils.c b/tools/testing/selftests/filesystems/utils.c index c9dd5412b37b..d6f26f849053 100644 --- a/tools/testing/selftests/filesystems/utils.c +++ b/tools/testing/selftests/filesystems/utils.c @@ -515,6 +515,32 @@ int setup_userns(void) return 0; } +int enter_userns(void) +{ + int ret; + char buf[32]; + uid_t uid = getuid(); + gid_t gid = getgid(); + + ret = unshare(CLONE_NEWUSER); + if (ret) + return ret; + + sprintf(buf, "0 %d 1", uid); + ret = write_file("/proc/self/uid_map", buf); + if (ret) + return ret; + ret = write_file("/proc/self/setgroups", "deny"); + if (ret) + return ret; + sprintf(buf, "0 %d 1", gid); + ret = write_file("/proc/self/gid_map", buf); + if (ret) + return ret; + + return 0; +} + /* caps_down - lower all effective caps */ int caps_down(void) { diff --git a/tools/testing/selftests/filesystems/utils.h b/tools/testing/selftests/filesystems/utils.h index 70f7ccc607f4..0bccfed666a9 100644 --- a/tools/testing/selftests/filesystems/utils.h +++ b/tools/testing/selftests/filesystems/utils.h @@ -28,6 +28,7 @@ extern int cap_down(cap_value_t down); extern bool switch_ids(uid_t uid, gid_t gid); extern int setup_userns(void); +extern int enter_userns(void); static inline bool switch_userns(int fd, uid_t uid, gid_t gid, bool drop_caps) { -- cgit v1.2.3 From 3ec6cefc398b93e5f28500f80e7321a80fffee8a Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marlière" Date: Fri, 16 Jan 2026 11:20:54 -0300 Subject: selftests/run_kselftest.sh: Add `--skip` argument option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the only way of excluding certain tests from a collection is by passing all the other tests explicitly via `--test`. Therefore, if the user wants to skip a single test the resulting command line might be too big, depending on the collection. Add an option `--skip` that takes care of that. Link: https://lore.kernel.org/r/20260116-selftests-add_skip_opt-v1-1-ab54afaae81b@suse.com Signed-off-by: Ricardo B. Marlière Signed-off-by: Shuah Khan --- tools/testing/selftests/run_kselftest.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/run_kselftest.sh b/tools/testing/selftests/run_kselftest.sh index d4be97498b32..84d45254675c 100755 --- a/tools/testing/selftests/run_kselftest.sh +++ b/tools/testing/selftests/run_kselftest.sh @@ -30,6 +30,7 @@ Usage: $0 [OPTIONS] -s | --summary Print summary with detailed log in output.log (conflict with -p) -p | --per-test-log Print test log in /tmp with each test name (conflict with -s) -t | --test COLLECTION:TEST Run TEST from COLLECTION + -S | --skip COLLECTION:TEST Skip TEST from COLLECTION -c | --collection COLLECTION Run all tests from COLLECTION -l | --list List the available collection:test entries -d | --dry-run Don't actually run any tests @@ -43,6 +44,7 @@ EOF COLLECTIONS="" TESTS="" +SKIP="" dryrun="" kselftest_override_timeout="" ERROR_ON_FAIL=true @@ -58,6 +60,9 @@ while true; do -t | --test) TESTS="$TESTS $2" shift 2 ;; + -S | --skip) + SKIP="$SKIP $2" + shift 2 ;; -c | --collection) COLLECTIONS="$COLLECTIONS $2" shift 2 ;; @@ -109,6 +114,12 @@ if [ -n "$TESTS" ]; then done available="$(echo "$valid" | sed -e 's/ /\n/g')" fi +# Remove tests to be skipped from available list +if [ -n "$SKIP" ]; then + for skipped in $SKIP ; do + available="$(echo "$available" | grep -v "^${skipped}$")" + done +fi kselftest_failures_file="$(mktemp --tmpdir kselftest-failures-XXXXXX)" export kselftest_failures_file -- cgit v1.2.3 From db7855c96d4216b2ed45e2781fae9293b323c7ef Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 16 Jan 2026 12:40:56 -0800 Subject: x86/entry/vdso/selftest: Update location of vgetrandom-chacha.S As part of the vdso build restructuring, vgetrandom-chacha.S moved into the vdso/vdso64 subdirectory. Update the selftest #include to match. Closes: https://lore.kernel.org/oe-lkp/202601161608.5cd5af9a-lkp@intel.com Fixes: 693c819fedcd ("x86/entry/vdso: Refactor the vdso build") Reported-by: kernel test robot Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Dave Hansen Link: https://patch.msgid.link/20260116204057.386268-4-hpa@zytor.com --- tools/testing/selftests/vDSO/vgetrandom-chacha.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/vDSO/vgetrandom-chacha.S b/tools/testing/selftests/vDSO/vgetrandom-chacha.S index a4a82e1c28a9..10f982157a1f 100644 --- a/tools/testing/selftests/vDSO/vgetrandom-chacha.S +++ b/tools/testing/selftests/vDSO/vgetrandom-chacha.S @@ -16,5 +16,5 @@ #elif defined(__s390x__) #include "../../../../arch/s390/kernel/vdso64/vgetrandom-chacha.S" #elif defined(__x86_64__) -#include "../../../../arch/x86/entry/vdso/vgetrandom-chacha.S" +#include "../../../../arch/x86/entry/vdso/vdso64/vgetrandom-chacha.S" #endif -- cgit v1.2.3 From 47d440d0a5bb822f3f4e4b2479246da5efb765e6 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Thu, 15 Jan 2026 16:34:57 +0000 Subject: selftests/bpf: Support when CONFIG_VXLAN=m If CONFIG_VXLAN is 'm', struct vxlanhdr will not be in vmlinux.h. Add a ___local variant to support cases where vxlan is a module. Fixes: 8517b1abe5ea ("selftests/bpf: Integrate test_tc_tunnel.sh tests into test_progs") Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260115163457.146267-1-alan.maguire@oracle.com --- tools/testing/selftests/bpf/progs/test_tc_tunnel.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index 7330c61b5730..7376df405a6b 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -23,7 +23,12 @@ static const int cfg_udp_src = 20000; (((__u64)len & BPF_ADJ_ROOM_ENCAP_L2_MASK) \ << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) -#define L2_PAD_SZ (sizeof(struct vxlanhdr) + ETH_HLEN) +struct vxlanhdr___local { + __be32 vx_flags; + __be32 vx_vni; +}; + +#define L2_PAD_SZ (sizeof(struct vxlanhdr___local) + ETH_HLEN) #define UDP_PORT 5555 #define MPLS_OVER_UDP_PORT 6635 @@ -154,7 +159,7 @@ static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, l2_len = ETH_HLEN; if (ext_proto & EXTPROTO_VXLAN) { udp_dst = VXLAN_UDP_PORT; - l2_len += sizeof(struct vxlanhdr); + l2_len += sizeof(struct vxlanhdr___local); } else udp_dst = ETH_OVER_UDP_PORT; break; @@ -195,12 +200,12 @@ static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH; if (ext_proto & EXTPROTO_VXLAN) { - struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr; + struct vxlanhdr___local *vxlan_hdr = (struct vxlanhdr___local *)l2_hdr; vxlan_hdr->vx_flags = VXLAN_FLAGS; vxlan_hdr->vx_vni = VXLAN_VNI; - l2_hdr += sizeof(struct vxlanhdr); + l2_hdr += sizeof(struct vxlanhdr___local); } if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN)) @@ -285,7 +290,7 @@ static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, l2_len = ETH_HLEN; if (ext_proto & EXTPROTO_VXLAN) { udp_dst = VXLAN_UDP_PORT; - l2_len += sizeof(struct vxlanhdr); + l2_len += sizeof(struct vxlanhdr___local); } else udp_dst = ETH_OVER_UDP_PORT; break; @@ -325,12 +330,12 @@ static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH; if (ext_proto & EXTPROTO_VXLAN) { - struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr; + struct vxlanhdr___local *vxlan_hdr = (struct vxlanhdr___local *)l2_hdr; vxlan_hdr->vx_flags = VXLAN_FLAGS; vxlan_hdr->vx_vni = VXLAN_VNI; - l2_hdr += sizeof(struct vxlanhdr); + l2_hdr += sizeof(struct vxlanhdr___local); } if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN)) @@ -639,7 +644,7 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) olen += ETH_HLEN; break; case VXLAN_UDP_PORT: - olen += ETH_HLEN + sizeof(struct vxlanhdr); + olen += ETH_HLEN + sizeof(struct vxlanhdr___local); break; } break; -- cgit v1.2.3 From efad162f5a840ae178e7761c176c49f433c7bb68 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 15 Jan 2026 21:22:45 -0800 Subject: selftests/bpf: Fix map_kptr test failure On my arm64 machine, I get the following failure: ... tester_init:PASS:tester_log_buf 0 nsec process_subtest:PASS:obj_open_mem 0 nsec process_subtest:PASS:specs_alloc 0 nsec serial_test_map_kptr:PASS:rcu_tasks_trace_gp__open_and_load 0 nsec ... test_map_kptr_success:PASS:map_kptr__open_and_load 0 nsec test_map_kptr_success:PASS:test_map_kptr_ref1 refcount 0 nsec test_map_kptr_success:FAIL:test_map_kptr_ref1 retval unexpected error: 2 (errno 2) test_map_kptr_success:PASS:test_map_kptr_ref2 refcount 0 nsec test_map_kptr_success:FAIL:test_map_kptr_ref2 retval unexpected error: 1 (errno 2) ... #201/21 map_kptr/success-map:FAIL In serial_test_map_kptr(), before test_map_kptr_success(), one kern_sync_rcu() is used to have some delay for freeing the map. But in my environment, one kern_sync_rcu() seems not enough and caused the test failure. In bpf_map_free_in_work() in syscall.c, the queue time for queue_work(system_dfl_wq, &map->work) may be longer than expected. This may cause the test failure since test_map_kptr_success() expects all previous maps having been freed. Since it is not clear how long queue_work() time takes, a bpf prog is added to count the reference after bpf_kfunc_call_test_acquire(). If the number of references is 2 (for initial ref and the one just acquired), all previous maps should have been released. This will resolve the above 'retval unexpected error' issue. Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20260116052245.3692405-1-yonghong.song@linux.dev --- tools/testing/selftests/bpf/prog_tests/map_kptr.c | 23 +++++++++++++++++++++++ tools/testing/selftests/bpf/progs/map_kptr.c | 18 ++++++++++++++++++ 2 files changed, 41 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/map_kptr.c b/tools/testing/selftests/bpf/prog_tests/map_kptr.c index 8743df599567..f372162c0280 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_kptr.c +++ b/tools/testing/selftests/bpf/prog_tests/map_kptr.c @@ -131,6 +131,25 @@ static int kern_sync_rcu_tasks_trace(struct rcu_tasks_trace_gp *rcu) return 0; } +static void wait_for_map_release(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, lopts); + struct map_kptr *skel; + int ret; + + skel = map_kptr__open_and_load(); + if (!ASSERT_OK_PTR(skel, "map_kptr__open_and_load")) + return; + + do { + ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.count_ref), &lopts); + ASSERT_OK(ret, "count_ref ret"); + ASSERT_OK(lopts.retval, "count_ref retval"); + } while (skel->bss->num_of_refs != 2); + + map_kptr__destroy(skel); +} + void serial_test_map_kptr(void) { struct rcu_tasks_trace_gp *skel; @@ -148,11 +167,15 @@ void serial_test_map_kptr(void) ASSERT_OK(kern_sync_rcu_tasks_trace(skel), "sync rcu_tasks_trace"); ASSERT_OK(kern_sync_rcu(), "sync rcu"); + wait_for_map_release(); + /* Observe refcount dropping to 1 on bpf_map_free_deferred */ test_map_kptr_success(false); ASSERT_OK(kern_sync_rcu_tasks_trace(skel), "sync rcu_tasks_trace"); ASSERT_OK(kern_sync_rcu(), "sync rcu"); + wait_for_map_release(); + /* Observe refcount dropping to 1 on synchronous delete elem */ test_map_kptr_success(true); } diff --git a/tools/testing/selftests/bpf/progs/map_kptr.c b/tools/testing/selftests/bpf/progs/map_kptr.c index edaba481db9d..e708ffbe1f61 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr.c +++ b/tools/testing/selftests/bpf/progs/map_kptr.c @@ -487,6 +487,24 @@ int test_map_kptr_ref3(struct __sk_buff *ctx) return 0; } +int num_of_refs; + +SEC("syscall") +int count_ref(void *ctx) +{ + struct prog_test_ref_kfunc *p; + unsigned long arg = 0; + + p = bpf_kfunc_call_test_acquire(&arg); + if (!p) + return 1; + + num_of_refs = p->cnt.refs.counter; + + bpf_kfunc_call_test_release(p); + return 0; +} + SEC("syscall") int test_ls_map_kptr_ref1(void *ctx) { -- cgit v1.2.3 From d9b40d7262a227442bf402ea0708dc94f438bb52 Mon Sep 17 00:00:00 2001 From: Bala-Vignesh-Reddy Date: Wed, 22 Oct 2025 11:59:48 +0530 Subject: selftests/x86: Add selftests include path for kselftest.h after centralization The previous change centralizing kselftest.h include path in lib.mk caused x86 selftests to fail, as x86 Makefile overwrites CFLAGS using ":=", dropping the include path added in lib.mk. Therefore, helpers.h could not find kselftest.h during compilation. Fix this by adding the tools/testing/sefltest to CFLAGS in x86 Makefile. [ bp: Correct commit ID in Fixes: ] Fixes: e6fbd1759c9e ("selftests: complete kselftest include centralization") Closes: https://lore.kernel.org/lkml/CA+G9fYvKjQcCBMfXA-z2YuL2L+3Qd-pJjEUDX8PDdz2-EEQd=Q@mail.gmail.com/T/#m83fd330231287fc9d6c921155bee16c591db7360 Reported-by: Linux Kernel Functional Testing Signed-off-by: Bala-Vignesh-Reddy Signed-off-by: Borislav Petkov (AMD) Tested-by: Anders Roxell Tested-by: Brendan Jackman Link: https://patch.msgid.link/20251022062948.162852-1-reddybalavignesh9979@gmail.com --- tools/testing/selftests/x86/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 83148875a12c..434065215d12 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -36,6 +36,7 @@ BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) CFLAGS := -O2 -g -std=gnu99 -pthread -Wall $(KHDR_INCLUDES) +CFLAGS += -I $(top_srcdir)/tools/testing/selftests/ # call32_from_64 in thunks.S uses absolute addresses. ifeq ($(CAN_BUILD_WITH_NOPIE),1) -- cgit v1.2.3 From d045e166d3c51b7aec069669bb243e057d80d04f Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Thu, 15 Jan 2026 14:56:52 +0100 Subject: selftests: vDSO: getrandom: Fix path to s390 chacha implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The s390 vDSO source directory was recently moved, but this reference was not updated. Fixes: c0087d807ae8 ("s390/vdso: Rename vdso64 to vdso") Signed-off-by: Thomas Weißschuh Acked-by: Heiko Carstens Signed-off-by: Heiko Carstens --- tools/testing/selftests/vDSO/vgetrandom-chacha.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/vDSO/vgetrandom-chacha.S b/tools/testing/selftests/vDSO/vgetrandom-chacha.S index a4a82e1c28a9..8c3cbf4dfd6a 100644 --- a/tools/testing/selftests/vDSO/vgetrandom-chacha.S +++ b/tools/testing/selftests/vDSO/vgetrandom-chacha.S @@ -14,7 +14,7 @@ #elif defined(__riscv) && __riscv_xlen == 64 #include "../../../../arch/riscv/kernel/vdso/vgetrandom-chacha.S" #elif defined(__s390x__) -#include "../../../../arch/s390/kernel/vdso64/vgetrandom-chacha.S" +#include "../../../../arch/s390/kernel/vdso/vgetrandom-chacha.S" #elif defined(__x86_64__) #include "../../../../arch/x86/entry/vdso/vgetrandom-chacha.S" #endif -- cgit v1.2.3 From a120a832e3ebca48474d7183ddeadf4138472535 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 20 Oct 2025 15:01:16 -0700 Subject: lkdtm/bugs: Add __counted_by_ptr() test PTR_BOUNDS Provide run-time validation of the __counted_by_ptr() annotation via newly added PTR_BOUNDS LKDTM test. Link: https://patch.msgid.link/20251020220118.1226740-2-kees@kernel.org Signed-off-by: Kees Cook --- drivers/misc/lkdtm/bugs.c | 90 ++++++++++++++++++++++++++++++--- tools/testing/selftests/lkdtm/tests.txt | 2 + 2 files changed, 84 insertions(+), 8 deletions(-) (limited to 'tools/testing') diff --git a/drivers/misc/lkdtm/bugs.c b/drivers/misc/lkdtm/bugs.c index 502059078b45..b2aee36b956d 100644 --- a/drivers/misc/lkdtm/bugs.c +++ b/drivers/misc/lkdtm/bugs.c @@ -465,32 +465,32 @@ static void lkdtm_ARRAY_BOUNDS(void) pr_expected_config(CONFIG_UBSAN_BOUNDS); } -struct lkdtm_annotated { +struct lkdtm_cb_fam { unsigned long flags; int count; int array[] __counted_by(count); }; -static volatile int fam_count = 4; +static volatile int element_count = 4; static void lkdtm_FAM_BOUNDS(void) { - struct lkdtm_annotated *inst; + struct lkdtm_cb_fam *inst; - inst = kzalloc(struct_size(inst, array, fam_count + 1), GFP_KERNEL); + inst = kzalloc(struct_size(inst, array, element_count + 1), GFP_KERNEL); if (!inst) { pr_err("FAIL: could not allocate test struct!\n"); return; } - inst->count = fam_count; + inst->count = element_count; pr_info("Array access within bounds ...\n"); - inst->array[1] = fam_count; + inst->array[1] = element_count; ignored = inst->array[1]; pr_info("Array access beyond bounds ...\n"); - inst->array[fam_count] = fam_count; - ignored = inst->array[fam_count]; + inst->array[element_count] = element_count; + ignored = inst->array[element_count]; kfree(inst); @@ -505,6 +505,79 @@ static void lkdtm_FAM_BOUNDS(void) pr_expected_config(CONFIG_UBSAN_BOUNDS); } +struct lkdtm_extra { + short a, b; + u16 sixteen; + u32 bigger; + u64 biggest; +}; + +struct lkdtm_cb_ptr { + int a, b, c; + int nr_extra; + char *buf __counted_by_ptr(len); + size_t len; + struct lkdtm_extra *extra __counted_by_ptr(nr_extra); +}; + +static noinline void check_ptr_len(struct lkdtm_cb_ptr *p, size_t len) +{ + if (__member_size(p->buf) != len) + pr_err("FAIL: could not determine size of inst->buf: %zu\n", + __member_size(p->buf)); + else + pr_info("good: inst->buf length is %zu\n", len); +} + +static void lkdtm_PTR_BOUNDS(void) +{ + struct lkdtm_cb_ptr *inst; + + inst = kzalloc(sizeof(*inst), GFP_KERNEL); + if (!inst) { + pr_err("FAIL: could not allocate struct lkdtm_cb_ptr!\n"); + return; + } + + inst->buf = kzalloc(element_count, GFP_KERNEL); + if (!inst->buf) { + pr_err("FAIL: could not allocate inst->buf!\n"); + return; + } + inst->len = element_count; + + /* Double element_count */ + inst->extra = kcalloc(element_count * 2, sizeof(*inst->extra), GFP_KERNEL); + inst->nr_extra = element_count * 2; + + pr_info("Pointer access within bounds ...\n"); + check_ptr_len(inst, 4); + /* All 4 bytes */ + inst->buf[0] = 'A'; + inst->buf[1] = 'B'; + inst->buf[2] = 'C'; + inst->buf[3] = 'D'; + /* Halfway into the array */ + inst->extra[element_count].biggest = 0x1000; + + pr_info("Pointer access beyond bounds ...\n"); + ignored = inst->extra[inst->nr_extra].b; + + kfree(inst->extra); + kfree(inst->buf); + kfree(inst); + + pr_err("FAIL: survived access of invalid pointer member offset!\n"); + + if (!IS_ENABLED(CONFIG_CC_HAS_COUNTED_BY_PTR)) + pr_warn("This is expected since this %s was built with a compiler that does not support __counted_by_ptr\n", + lkdtm_kernel_info); + else if (IS_ENABLED(CONFIG_UBSAN_BOUNDS)) + pr_expected_config(CONFIG_UBSAN_TRAP); + else + pr_expected_config(CONFIG_UBSAN_BOUNDS); +} + static void lkdtm_CORRUPT_LIST_ADD(void) { /* @@ -769,6 +842,7 @@ static struct crashtype crashtypes[] = { CRASHTYPE(OVERFLOW_UNSIGNED), CRASHTYPE(ARRAY_BOUNDS), CRASHTYPE(FAM_BOUNDS), + CRASHTYPE(PTR_BOUNDS), CRASHTYPE(CORRUPT_LIST_ADD), CRASHTYPE(CORRUPT_LIST_DEL), CRASHTYPE(STACK_GUARD_PAGE_LEADING), diff --git a/tools/testing/selftests/lkdtm/tests.txt b/tools/testing/selftests/lkdtm/tests.txt index 67cd53715d93..e62b85b591be 100644 --- a/tools/testing/selftests/lkdtm/tests.txt +++ b/tools/testing/selftests/lkdtm/tests.txt @@ -11,6 +11,8 @@ EXCEPTION #CORRUPT_STACK Crashes entire system on success #CORRUPT_STACK_STRONG Crashes entire system on success ARRAY_BOUNDS call trace:|UBSAN: array-index-out-of-bounds +FAM_BOUNDS call trace:|UBSAN: array-index-out-of-bounds +PTR_BOUNDS call trace:|UBSAN: array-index-out-of-bounds CORRUPT_LIST_ADD list_add corruption CORRUPT_LIST_DEL list_del corruption STACK_GUARD_PAGE_LEADING -- cgit v1.2.3 From 52b4859730434902731e1cd4ea061d9611398008 Mon Sep 17 00:00:00 2001 From: Yohei Kojima Date: Tue, 13 Jan 2026 23:11:54 +0900 Subject: selftests: net: fix passive TFO test to fail if child processes failed Improve the passive TFO test to report failure if the server or the client timed out or exited with non-zero status. Before this commit, TFO test didn't fail even if exit(EXIT_FAILURE) is added to the first line of the run_server() and run_client() functions. Signed-off-by: Yohei Kojima Link: https://patch.msgid.link/214d399caec2e5de7738ced5736829915d507e4e.1768312014.git.yk@y-koj.net Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tfo_passive.sh | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/tfo_passive.sh b/tools/testing/selftests/net/tfo_passive.sh index a4550511830a..f116f888b794 100755 --- a/tools/testing/selftests/net/tfo_passive.sh +++ b/tools/testing/selftests/net/tfo_passive.sh @@ -85,12 +85,15 @@ timeout -k 1s 30s ip netns exec nssv ./tfo \ -s \ -p ${SERVER_PORT} \ -o ${out_file}& +server_pid="$!" wait_local_port_listen nssv ${SERVER_PORT} tcp ip netns exec nscl ./tfo -c -h ${SERVER_IP} -p ${SERVER_PORT} +client_exit_status="$?" -wait +wait "$server_pid" +server_exit_status="$?" res=$(cat $out_file) rm $out_file @@ -101,6 +104,14 @@ if [ "$res" = "0" ]; then exit 1 fi +if [ "$client_exit_status" -ne 0 ] || [ "$server_exit_status" -ne 0 ]; then + # Note: timeout(1) exits with 124 if it timed out + echo "client exited with ${client_exit_status}" + echo "server exited with ${server_exit_status}" + cleanup_ns + exit 1 +fi + echo "$NSIM_SV_FD:$NSIM_SV_IFIDX" > $NSIM_DEV_SYS_UNLINK echo $NSIM_CL_ID > $NSIM_DEV_SYS_DEL -- cgit v1.2.3 From 342e31254f02041e3b9d4ad573204d53b2f832c9 Mon Sep 17 00:00:00 2001 From: Yohei Kojima Date: Tue, 13 Jan 2026 23:11:55 +0900 Subject: selftests: net: improve error handling in passive TFO test Improve the error handling in passive TFO test to check the return value from sendto(), and to fail if read() or fprintf() failed. Signed-off-by: Yohei Kojima Link: https://patch.msgid.link/24707c8133f7095c0e5a94afa69e75c3a80bf6e7.1768312014.git.yk@y-koj.net Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tfo.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/tfo.c b/tools/testing/selftests/net/tfo.c index 8d82140f0f76..3b1ee2d3d417 100644 --- a/tools/testing/selftests/net/tfo.c +++ b/tools/testing/selftests/net/tfo.c @@ -82,8 +82,10 @@ static void run_server(void) error(1, errno, "getsockopt(SO_INCOMING_NAPI_ID)"); if (read(connfd, buf, 64) < 0) - perror("read()"); - fprintf(outfile, "%d\n", opt); + error(1, errno, "read()"); + + if (fprintf(outfile, "%d\n", opt) < 0) + error(1, errno, "fprintf()"); fclose(outfile); close(connfd); @@ -92,14 +94,17 @@ static void run_server(void) static void run_client(void) { - int fd; + int fd, ret; char *msg = "Hello, world!"; fd = socket(AF_INET6, SOCK_STREAM, 0); if (fd == -1) error(1, errno, "socket()"); - sendto(fd, msg, strlen(msg), MSG_FASTOPEN, (struct sockaddr *)&cfg_addr, sizeof(cfg_addr)); + ret = sendto(fd, msg, strlen(msg), MSG_FASTOPEN, + (struct sockaddr *)&cfg_addr, sizeof(cfg_addr)); + if (ret < 0) + error(1, errno, "sendto()"); close(fd); } -- cgit v1.2.3 From 1deecf7805f16cbcb3541cc57d8478b8b992a2ab Mon Sep 17 00:00:00 2001 From: LeeYongjun Date: Sun, 18 Jan 2026 15:55:10 +0900 Subject: selftests: ALSA: Remove unused variable in utimer-test The variable 'i' in wrong_timers_test() is declared but never used. This was detected by Cppcheck static analysis. tools/testing/selftests/alsa/utimer-test.c:144:9: style: Unused variable: i [unusedVariable] Remove it to clean up the code and silence the warning. Signed-off-by: LeeYongjun Link: https://patch.msgid.link/20260118065510.29644-1-jun85566@gmail.com Signed-off-by: Takashi Iwai --- tools/testing/selftests/alsa/utimer-test.c | 1 - 1 file changed, 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/alsa/utimer-test.c b/tools/testing/selftests/alsa/utimer-test.c index c45cb226bd8f..d221972cd8fb 100644 --- a/tools/testing/selftests/alsa/utimer-test.c +++ b/tools/testing/selftests/alsa/utimer-test.c @@ -141,7 +141,6 @@ TEST_F(timer_f, utimer) { TEST(wrong_timers_test) { int timer_dev_fd; int utimer_fd; - size_t i; struct snd_timer_uinfo wrong_timer = { .resolution = 0, .id = UTIMER_DEFAULT_ID, -- cgit v1.2.3 From 59cac9d52b885cbeba45fa455417b03dfb03eaa7 Mon Sep 17 00:00:00 2001 From: UYeol Jo Date: Mon, 12 Jan 2026 06:01:26 +0900 Subject: selftests/x86: Clean up sysret_rip coding style Tidy up sysret_rip style (cast spacing, main(void), const placement). No functional change intended. Signed-off-by: UYeol Jo Signed-off-by: Borislav Petkov (AMD) Link: https://patch.msgid.link/20260111210126.74752-1-jouyeol8739@gmail.com --- tools/testing/selftests/x86/sysret_rip.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c index 5fb531e3ad7c..2e423a335e1c 100644 --- a/tools/testing/selftests/x86/sysret_rip.c +++ b/tools/testing/selftests/x86/sysret_rip.c @@ -31,7 +31,7 @@ void test_syscall_ins(void); extern const char test_page[]; -static void const *current_test_page_addr = test_page; +static const void *current_test_page_addr = test_page; /* State used by our signal handlers. */ static gregset_t initial_regs; @@ -40,7 +40,7 @@ static volatile unsigned long rip; static void sigsegv_for_sigreturn_test(int sig, siginfo_t *info, void *ctx_void) { - ucontext_t *ctx = (ucontext_t*)ctx_void; + ucontext_t *ctx = (ucontext_t *)ctx_void; if (rip != ctx->uc_mcontext.gregs[REG_RIP]) { printf("[FAIL]\tRequested RIP=0x%lx but got RIP=0x%lx\n", @@ -56,7 +56,7 @@ static void sigsegv_for_sigreturn_test(int sig, siginfo_t *info, void *ctx_void) static void sigusr1(int sig, siginfo_t *info, void *ctx_void) { - ucontext_t *ctx = (ucontext_t*)ctx_void; + ucontext_t *ctx = (ucontext_t *)ctx_void; memcpy(&initial_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t)); @@ -69,8 +69,6 @@ static void sigusr1(int sig, siginfo_t *info, void *ctx_void) ctx->uc_mcontext.gregs[REG_R11]); sethandler(SIGSEGV, sigsegv_for_sigreturn_test, SA_RESETHAND); - - return; } static void test_sigreturn_to(unsigned long ip) @@ -84,7 +82,7 @@ static jmp_buf jmpbuf; static void sigsegv_for_fallthrough(int sig, siginfo_t *info, void *ctx_void) { - ucontext_t *ctx = (ucontext_t*)ctx_void; + ucontext_t *ctx = (ucontext_t *)ctx_void; if (rip != ctx->uc_mcontext.gregs[REG_RIP]) { printf("[FAIL]\tExpected SIGSEGV at 0x%lx but got RIP=0x%lx\n", @@ -130,7 +128,7 @@ static void test_syscall_fallthrough_to(unsigned long ip) printf("[OK]\tWe survived\n"); } -int main() +int main(void) { /* * When the kernel returns from a slow-path syscall, it will -- cgit v1.2.3 From f93bc869825fdba3632ff6ddece4906a6673e679 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 16 Jan 2026 15:30:35 +0100 Subject: selftests: add dm-verity keyring selftests Add selftests that verify the keyring behaves correctly. For simplicity this works with dm-verity as a module. Signed-off-by: Christian Brauner Signed-off-by: Mikulas Patocka --- tools/testing/selftests/dm-verity/Makefile | 5 + tools/testing/selftests/dm-verity/config | 10 + .../selftests/dm-verity/test-dm-verity-keyring.sh | 873 +++++++++++++++++++++ 3 files changed, 888 insertions(+) create mode 100644 tools/testing/selftests/dm-verity/Makefile create mode 100644 tools/testing/selftests/dm-verity/config create mode 100755 tools/testing/selftests/dm-verity/test-dm-verity-keyring.sh (limited to 'tools/testing') diff --git a/tools/testing/selftests/dm-verity/Makefile b/tools/testing/selftests/dm-verity/Makefile new file mode 100644 index 000000000000..b75ee08a54af --- /dev/null +++ b/tools/testing/selftests/dm-verity/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +TEST_PROGS := test-dm-verity-keyring.sh + +include ../lib.mk diff --git a/tools/testing/selftests/dm-verity/config b/tools/testing/selftests/dm-verity/config new file mode 100644 index 000000000000..1cd3712fa0a4 --- /dev/null +++ b/tools/testing/selftests/dm-verity/config @@ -0,0 +1,10 @@ +CONFIG_BLK_DEV_DM=y +CONFIG_DM_VERITY=m +CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y +CONFIG_MODULE_UNLOAD=y +CONFIG_KEYS=y +CONFIG_ASYMMETRIC_KEY_TYPE=y +CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y +CONFIG_X509_CERTIFICATE_PARSER=y +CONFIG_PKCS7_MESSAGE_PARSER=y +CONFIG_SYSTEM_DATA_VERIFICATION=y diff --git a/tools/testing/selftests/dm-verity/test-dm-verity-keyring.sh b/tools/testing/selftests/dm-verity/test-dm-verity-keyring.sh new file mode 100755 index 000000000000..1f9601ef22f8 --- /dev/null +++ b/tools/testing/selftests/dm-verity/test-dm-verity-keyring.sh @@ -0,0 +1,873 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test script for dm-verity keyring functionality +# +# This script has two modes depending on kernel configuration: +# +# 1. keyring_unsealed=1 AND require_signatures=1: +# - Upload a test key to the .dm-verity keyring +# - Seal the keyring +# - Create a dm-verity device with a signed root hash +# - Verify signature verification works +# +# 2. keyring_unsealed=0 (default) OR require_signatures=0: +# - Verify the keyring is already sealed (if unsealed=0) +# - Verify keys cannot be added to a sealed keyring +# - Verify the keyring is inactive (not used for verification) +# +# Requirements: +# - Root privileges +# - openssl +# - veritysetup (cryptsetup) +# - keyctl (keyutils) + +set -e + +WORK_DIR="" +DATA_DEV="" +HASH_DEV="" +DM_NAME="verity-test-$$" +CLEANUP_DONE=0 + +# Module parameters (detected at runtime) +KEYRING_UNSEALED="" +REQUIRE_SIGNATURES="" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +log_info() { + echo -e "${GREEN}[INFO]${NC} $*" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $*" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $*" >&2 +} + +log_pass() { + echo -e "${GREEN}[PASS]${NC} $*" +} + +log_fail() { + echo -e "${RED}[FAIL]${NC} $*" >&2 +} + +log_skip() { + echo -e "${YELLOW}[SKIP]${NC} $*" +} + +cleanup() { + if [ "$CLEANUP_DONE" -eq 1 ]; then + return + fi + CLEANUP_DONE=1 + + log_info "Cleaning up..." + + # Remove dm-verity device if it exists + if dmsetup info "$DM_NAME" &>/dev/null; then + dmsetup remove "$DM_NAME" 2>/dev/null || true + fi + + # Detach loop devices + if [ -n "$DATA_DEV" ] && [[ "$DATA_DEV" == /dev/loop* ]]; then + losetup -d "$DATA_DEV" 2>/dev/null || true + fi + if [ -n "$HASH_DEV" ] && [[ "$HASH_DEV" == /dev/loop* ]]; then + losetup -d "$HASH_DEV" 2>/dev/null || true + fi + + # Remove work directory + if [ -n "$WORK_DIR" ] && [ -d "$WORK_DIR" ]; then + rm -rf "$WORK_DIR" + fi +} + +trap cleanup EXIT + +die() { + log_error "$*" + exit 1 +} + +find_dm_verity_keyring() { + # The .dm-verity keyring is not linked to user-accessible keyrings, + # so we need to find it via /proc/keys + local serial_hex + serial_hex=$(awk '/\.dm-verity/ {print $1}' /proc/keys 2>/dev/null) + + if [ -z "$serial_hex" ]; then + return 1 + fi + + # Convert hex to decimal for keyctl + echo $((16#$serial_hex)) +} + +get_module_param() { + local param="$1" + local path="/sys/module/dm_verity/parameters/$param" + + if [ -f "$path" ]; then + cat "$path" + else + echo "" + fi +} + +check_requirements() { + log_info "Checking requirements..." + + # Check for root + if [ "$(id -u)" -ne 0 ]; then + die "This script must be run as root" + fi + + # Check for required tools + for cmd in openssl veritysetup keyctl losetup dmsetup dd awk; do + if ! command -v "$cmd" &>/dev/null; then + die "Required command not found: $cmd" + fi + done + + # Check for dm-verity module + if ! modprobe -n dm-verity &>/dev/null; then + die "dm-verity module not available" + fi + + # Verify OpenSSL can create signatures + # OpenSSL cms -sign with -binary -outform DER creates detached signatures by default + log_info "Using OpenSSL for PKCS#7 signatures" +} + +load_dm_verity_module() { + local keyring_unsealed="${1:-0}" + local require_signatures="${2:-0}" + + log_info "Loading dm-verity module with keyring_unsealed=$keyring_unsealed require_signatures=$require_signatures" + + # Unload if already loaded + if lsmod | grep -q '^dm_verity'; then + log_info "Unloading existing dm-verity module..." + modprobe -r dm-verity 2>/dev/null || \ + die "Failed to unload dm-verity module (may be in use)" + sleep 1 + fi + + # Load with specified parameters + modprobe dm-verity keyring_unsealed="$keyring_unsealed" require_signatures="$require_signatures" || \ + die "Failed to load dm-verity module" + + # Wait for keyring to be created (poll with timeout) + local keyring_id="" + local timeout=50 # 5 seconds (50 * 0.1s) + while [ $timeout -gt 0 ]; do + keyring_id=$(find_dm_verity_keyring) && break + sleep 0.1 + timeout=$((timeout - 1)) + done + + if [ -z "$keyring_id" ]; then + die "dm-verity keyring not found after module load (timeout)" + fi + + log_info "Found .dm-verity keyring: $keyring_id" + echo "$keyring_id" > "$WORK_DIR/keyring_id" + + # Read and display module parameters + KEYRING_UNSEALED=$(get_module_param "keyring_unsealed") + REQUIRE_SIGNATURES=$(get_module_param "require_signatures") + + log_info "Module parameters:" + log_info " keyring_unsealed=$KEYRING_UNSEALED" + log_info " require_signatures=$REQUIRE_SIGNATURES" +} + +unload_dm_verity_module() { + log_info "Unloading dm-verity module..." + + # Clean up any dm-verity devices first + local dm_dev + while read -r dm_dev _; do + [ -n "$dm_dev" ] || continue + log_info "Removing dm-verity device: $dm_dev" + dmsetup remove "$dm_dev" 2>/dev/null || true + done < <(dmsetup ls --target verity 2>/dev/null) + + if lsmod | grep -q '^dm_verity'; then + modprobe -r dm-verity 2>/dev/null || \ + log_warn "Failed to unload dm-verity module" + sleep 1 + fi +} + +generate_keys() { + log_info "Generating signing key pair..." + + # Generate private key (2048-bit for faster test execution) + openssl genrsa -out "$WORK_DIR/private.pem" 2048 2>/dev/null + + # Create OpenSSL config for certificate extensions + # The kernel requires digitalSignature key usage for signature verification + # Both subjectKeyIdentifier and authorityKeyIdentifier are needed for + # the kernel to match keys in the keyring (especially for self-signed certs) + cat > "$WORK_DIR/openssl.cnf" << 'EOF' +[req] +distinguished_name = req_distinguished_name +x509_extensions = v3_ca +prompt = no + +[req_distinguished_name] +CN = dm-verity-test-key + +[v3_ca] +basicConstraints = critical,CA:FALSE +keyUsage = digitalSignature +subjectKeyIdentifier = hash +authorityKeyIdentifier = keyid +EOF + + # Generate self-signed certificate with proper extensions + openssl req -new -x509 -key "$WORK_DIR/private.pem" \ + -out "$WORK_DIR/cert.pem" -days 365 \ + -config "$WORK_DIR/openssl.cnf" 2>/dev/null + + # Convert certificate to DER format for kernel + openssl x509 -in "$WORK_DIR/cert.pem" -outform DER \ + -out "$WORK_DIR/cert.der" + + # Show certificate info for debugging + log_info "Certificate details:" + openssl x509 -in "$WORK_DIR/cert.pem" -noout -text 2>/dev/null | \ + grep -E "Subject:|Issuer:|Key Usage|Extended" | head -10 + + log_info "Keys generated successfully" +} + +seal_keyring() { + log_info "Sealing the .dm-verity keyring..." + + local keyring_id + keyring_id=$(cat "$WORK_DIR/keyring_id") + + keyctl restrict_keyring "$keyring_id" || \ + die "Failed to seal keyring" + + log_info "Keyring sealed successfully" +} + +create_test_device() { + log_info "Creating test device images..." + + # Create data image with random content (8MB is sufficient for testing) + dd if=/dev/urandom of="$WORK_DIR/data.img" bs=1M count=8 status=none + + # Create hash image (will be populated by veritysetup) + dd if=/dev/zero of="$WORK_DIR/hash.img" bs=1M count=1 status=none + + # Setup loop devices + DATA_DEV=$(losetup --find --show "$WORK_DIR/data.img") + HASH_DEV=$(losetup --find --show "$WORK_DIR/hash.img") + + log_info "Data device: $DATA_DEV" + log_info "Hash device: $HASH_DEV" +} + +create_verity_hash() { + log_info "Creating dm-verity hash tree..." + + local root_hash output + output=$(veritysetup format "$DATA_DEV" "$HASH_DEV" 2>&1) + root_hash=$(echo "$output" | grep "Root hash:" | awk '{print $3}') + + if [ -z "$root_hash" ]; then + log_error "veritysetup format output:" + echo "$output" | sed 's/^/ /' + die "Failed to get root hash from veritysetup format" + fi + + echo "$root_hash" > "$WORK_DIR/root_hash" + log_info "Root hash: $root_hash" +} + +create_detached_signature() { + local infile="$1" + local outfile="$2" + local cert="$3" + local key="$4" + + # Use openssl smime (not cms) for PKCS#7 signatures compatible with kernel + # Flags from working veritysetup example: + # -nocerts: don't include certificate in signature + # -noattr: no signed attributes + # -binary: binary input mode + if openssl smime -sign -nocerts -noattr -binary \ + -in "$infile" \ + -inkey "$key" \ + -signer "$cert" \ + -outform der \ + -out "$outfile" 2>/dev/null; then + return 0 + fi + + log_error "Failed to create signature" + return 1 +} + +activate_verity_device() { + local with_sig="$1" + local root_hash + root_hash=$(cat "$WORK_DIR/root_hash") + + # Clear dmesg and capture any kernel messages during activation + dmesg -C 2>/dev/null || true + + if [ "$with_sig" = "yes" ]; then + log_info "Activating dm-verity device with signature..." + veritysetup open "$DATA_DEV" "$DM_NAME" "$HASH_DEV" "$root_hash" \ + --root-hash-signature="$WORK_DIR/root_hash.p7s" 2>&1 + local ret=$? + else + log_info "Activating dm-verity device without signature..." + veritysetup open "$DATA_DEV" "$DM_NAME" "$HASH_DEV" "$root_hash" 2>&1 + local ret=$? + fi + + # Show relevant kernel messages + local kmsg + kmsg=$(dmesg 2>/dev/null | grep -i -E 'verity|pkcs|signature|asymmetric|key' | tail -10) + if [ -n "$kmsg" ]; then + log_info "Kernel messages:" + echo "$kmsg" | while read -r line; do echo " $line"; done + fi + + return $ret +} + +deactivate_verity_device() { + if dmsetup info "$DM_NAME" &>/dev/null; then + dmsetup remove "$DM_NAME" 2>/dev/null || true + fi +} + +show_keyring_status() { + log_info "Keyring status:" + + local keyring_id + keyring_id=$(find_dm_verity_keyring) || true + + if [ -n "$keyring_id" ]; then + echo " Keyring ID: $keyring_id" + keyctl show "$keyring_id" 2>/dev/null || true + grep '\.dm-verity' /proc/keys 2>/dev/null || true + fi +} + +list_keyring_keys() { + log_info "Keys in .dm-verity keyring:" + + local keyring_id + keyring_id=$(cat "$WORK_DIR/keyring_id" 2>/dev/null) || \ + keyring_id=$(find_dm_verity_keyring) || true + + if [ -z "$keyring_id" ]; then + log_warn "Could not find keyring" + return + fi + + # List all keys in the keyring + local keys + keys=$(keyctl list "$keyring_id" 2>/dev/null) + if [ -z "$keys" ] || [ "$keys" = "keyring is empty" ]; then + echo " (empty)" + else + echo "$keys" | while read -r line; do + echo " $line" + done + + # Show detailed info for each key + log_info "Key details:" + keyctl list "$keyring_id" 2>/dev/null | awk '{print $1}' | grep -E '^[0-9]+$' | while read -r key_id; do + echo " Key $key_id:" + keyctl describe "$key_id" 2>/dev/null | sed 's/^/ /' + done + fi +} + +generate_named_key() { + local name="$1" + local key_dir="$WORK_DIR/keys/$name" + + mkdir -p "$key_dir" + + # Log to stderr so it doesn't interfere with return value + echo "[INFO] Generating key pair: $name" >&2 + + # Generate private key + openssl genrsa -out "$key_dir/private.pem" 2048 2>/dev/null + + # Create OpenSSL config for certificate extensions + # Both subjectKeyIdentifier and authorityKeyIdentifier are needed for + # the kernel to match keys in the keyring (especially for self-signed certs) + cat > "$key_dir/openssl.cnf" << EOF +[req] +distinguished_name = req_distinguished_name +x509_extensions = v3_ca +prompt = no + +[req_distinguished_name] +CN = dm-verity-test-$name + +[v3_ca] +basicConstraints = critical,CA:FALSE +keyUsage = digitalSignature +subjectKeyIdentifier = hash +authorityKeyIdentifier = keyid +EOF + + # Generate self-signed certificate with proper extensions + openssl req -new -x509 -key "$key_dir/private.pem" \ + -out "$key_dir/cert.pem" -days 365 \ + -config "$key_dir/openssl.cnf" 2>/dev/null + + # Convert certificate to DER format for kernel + openssl x509 -in "$key_dir/cert.pem" -outform DER \ + -out "$key_dir/cert.der" + + # Return the key directory path (only this goes to stdout) + echo "$key_dir" +} + +upload_named_key() { + local name="$1" + local key_dir="$2" + + local keyring_id + keyring_id=$(cat "$WORK_DIR/keyring_id") + + log_info "Uploading key '$name' to keyring..." + + local key_id + if key_id=$(keyctl padd asymmetric "$name" "$keyring_id" \ + < "$key_dir/cert.der" 2>&1); then + log_info "Key '$name' uploaded with ID: $key_id" + echo "$key_id" > "$key_dir/key_id" + return 0 + else + log_error "Failed to upload key '$name': $key_id" + return 1 + fi +} + +# +# Test: Verify sealed keyring rejects key additions +# +test_sealed_keyring_rejects_keys() { + log_info "TEST: Verify sealed keyring rejects key additions" + + local keyring_id + keyring_id=$(cat "$WORK_DIR/keyring_id") + + generate_keys + + # Try to add a key - should fail + if keyctl padd asymmetric "dm-verity-test" "$keyring_id" \ + < "$WORK_DIR/cert.der" 2>/dev/null; then + log_fail "Key addition should have been rejected on sealed keyring" + return 1 + else + log_pass "Sealed keyring correctly rejected key addition" + return 0 + fi +} + +# +# Test: Multiple keys in keyring +# +test_multiple_keys() { + log_info "TEST: Multiple keys in keyring" + + local key1_dir key2_dir key3_dir + + # Generate three different keys + key1_dir=$(generate_named_key "vendor-a") + key2_dir=$(generate_named_key "vendor-b") + key3_dir=$(generate_named_key "vendor-c") + + # Upload all three keys + upload_named_key "vendor-a" "$key1_dir" || return 1 + upload_named_key "vendor-b" "$key2_dir" || return 1 + upload_named_key "vendor-c" "$key3_dir" || return 1 + + log_info "" + log_info "Keys in keyring before sealing:" + list_keyring_keys + show_keyring_status + + # Seal the keyring + log_info "" + seal_keyring + + # List keys after sealing + log_info "" + log_info "Keys in keyring after sealing:" + list_keyring_keys + show_keyring_status + + log_pass "Key upload and keyring sealing succeeded" + + # Create test device + log_info "" + create_test_device + create_verity_hash + + # Test 1: Sign with key1, should verify successfully + log_info "" + log_info "Sub-test: Verify with vendor-a key" + if ! sign_root_hash_with_key "$key1_dir"; then + log_fail "Failed to sign with vendor-a key" + return 1 + fi + if activate_verity_device "yes"; then + log_pass "Verification with vendor-a key succeeded" + deactivate_verity_device + else + log_fail "Verification with vendor-a key should succeed" + return 1 + fi + + # Test 2: Sign with key2, should also verify successfully + log_info "" + log_info "Sub-test: Verify with vendor-b key" + if ! sign_root_hash_with_key "$key2_dir"; then + log_fail "Failed to sign with vendor-b key" + return 1 + fi + if activate_verity_device "yes"; then + log_pass "Verification with vendor-b key succeeded" + deactivate_verity_device + else + log_fail "Verification with vendor-b key should succeed" + return 1 + fi + + # Test 3: Sign with key3, should also verify successfully + log_info "" + log_info "Sub-test: Verify with vendor-c key" + if ! sign_root_hash_with_key "$key3_dir"; then + log_fail "Failed to sign with vendor-c key" + return 1 + fi + if activate_verity_device "yes"; then + log_pass "Verification with vendor-c key succeeded" + deactivate_verity_device + else + log_fail "Verification with vendor-c key should succeed" + return 1 + fi + + # Test 4: Generate a key NOT in the keyring, should fail + log_info "" + log_info "Sub-test: Verify with unknown key (should fail)" + local unknown_key_dir + unknown_key_dir=$(generate_named_key "unknown-vendor") + if ! sign_root_hash_with_key "$unknown_key_dir"; then + log_fail "Failed to sign with unknown-vendor key" + return 1 + fi + if activate_verity_device "yes"; then + log_fail "Verification with unknown key should fail" + deactivate_verity_device + return 1 + else + log_pass "Verification with unknown key correctly rejected" + fi + + log_info "" + log_pass "Multiple keys test completed successfully" + return 0 +} + +sign_root_hash_with_key() { + local key_dir="$1" + + local root_hash + root_hash=$(cat "$WORK_DIR/root_hash") + + # Create the data to sign (hex string, not binary) + echo -n "$root_hash" > "$WORK_DIR/root_hash.txt" + + # Debug: show exactly what we're signing + log_info "Root hash (hex): $root_hash" + log_info "Root hash hex string size: $(wc -c < "$WORK_DIR/root_hash.txt") bytes" + + # Create detached PKCS#7 signature + if ! create_detached_signature "$WORK_DIR/root_hash.txt" "$WORK_DIR/root_hash.p7s" \ + "$key_dir/cert.pem" "$key_dir/private.pem"; then + log_error "Failed to sign root hash with key from $key_dir" + return 1 + fi + + # Debug: show signing certificate info + log_info "Signed with certificate:" + openssl x509 -in "$key_dir/cert.pem" -noout -subject 2>/dev/null | sed 's/^/ /' + + # Debug: verify signature locally + # -nointern: cert not in signature, use -certfile + # -noverify: skip certificate chain validation (self-signed) + if openssl smime -verify -binary -inform der -nointern -noverify \ + -in "$WORK_DIR/root_hash.p7s" \ + -content "$WORK_DIR/root_hash.txt" \ + -certfile "$key_dir/cert.pem" \ + -out /dev/null 2>/dev/null; then + log_info "Local signature verification: PASSED" + else + log_warn "Local signature verification: FAILED" + fi + return 0 +} + +# +# Test: Verify corrupted signatures are rejected +# +test_corrupted_signature() { + log_info "TEST: Verify corrupted signatures are rejected" + + # This test requires a valid setup from test_multiple_keys or similar + # It modifies the signature file and verifies rejection + + if [ ! -f "$WORK_DIR/root_hash.p7s" ]; then + log_warn "No signature file found, skipping corrupted signature test" + return 0 + fi + + # Save original signature + cp "$WORK_DIR/root_hash.p7s" "$WORK_DIR/root_hash.p7s.orig" + + # Test 1: Truncated signature + log_info "Sub-test: Truncated signature (should fail)" + head -c 100 "$WORK_DIR/root_hash.p7s.orig" > "$WORK_DIR/root_hash.p7s" + if activate_verity_device "yes"; then + log_fail "Truncated signature should be rejected" + deactivate_verity_device + cp "$WORK_DIR/root_hash.p7s.orig" "$WORK_DIR/root_hash.p7s" + return 1 + else + log_pass "Truncated signature correctly rejected" + fi + + # Test 2: Corrupted signature (flip some bytes) + log_info "Sub-test: Corrupted signature bytes (should fail)" + cp "$WORK_DIR/root_hash.p7s.orig" "$WORK_DIR/root_hash.p7s" + # Corrupt bytes in the middle of the signature + local sig_size + sig_size=$(wc -c < "$WORK_DIR/root_hash.p7s") + local corrupt_offset=$((sig_size / 2)) + printf '\xff\xff\xff\xff' | dd of="$WORK_DIR/root_hash.p7s" bs=1 seek=$corrupt_offset conv=notrunc 2>/dev/null + if activate_verity_device "yes"; then + log_fail "Corrupted signature should be rejected" + deactivate_verity_device + cp "$WORK_DIR/root_hash.p7s.orig" "$WORK_DIR/root_hash.p7s" + return 1 + else + log_pass "Corrupted signature correctly rejected" + fi + + # Test 3: Signature over wrong data (sign different content) + log_info "Sub-test: Signature over wrong data (should fail)" + # Create a different root hash (all zeros as hex string) + printf '%064d' 0 > "$WORK_DIR/wrong_hash.txt" + # Get the first key directory that was used + local key_dir="$WORK_DIR/keys/vendor-a" + if [ -d "$key_dir" ]; then + create_detached_signature "$WORK_DIR/wrong_hash.txt" "$WORK_DIR/root_hash.p7s" \ + "$key_dir/cert.pem" "$key_dir/private.pem" + if activate_verity_device "yes"; then + log_fail "Signature over wrong data should be rejected" + deactivate_verity_device + cp "$WORK_DIR/root_hash.p7s.orig" "$WORK_DIR/root_hash.p7s" + return 1 + else + log_pass "Signature over wrong data correctly rejected" + fi + else + log_warn "Key directory not found, skipping wrong data test" + fi + + # Restore original signature + cp "$WORK_DIR/root_hash.p7s.orig" "$WORK_DIR/root_hash.p7s" + + log_pass "Corrupted signature test completed successfully" + return 0 +} + +# +# Test: Verify keyring is sealed when keyring_unsealed=0 +# +test_keyring_sealed_by_default() { + log_info "TEST: Verify keyring is sealed by default (keyring_unsealed=0)" + + local keyring_id + keyring_id=$(cat "$WORK_DIR/keyring_id") + + log_info "Current keyring state (should be empty and sealed):" + list_keyring_keys + show_keyring_status + + generate_keys + + # Try to add a key - should fail if keyring is sealed + log_info "Attempting to add key to sealed keyring..." + if keyctl padd asymmetric "dm-verity-test" "$keyring_id" \ + < "$WORK_DIR/cert.der" 2>/dev/null; then + log_fail "Keyring should be sealed when keyring_unsealed=0" + list_keyring_keys + return 1 + else + log_pass "Keyring is correctly sealed when keyring_unsealed=0" + log_info "Keyring state after failed add attempt:" + list_keyring_keys + return 0 + fi +} + +# +# Test: Verify dm-verity keyring is inactive when sealed empty +# +test_keyring_inactive_when_empty() { + log_info "TEST: Verify dm-verity keyring is inactive when sealed empty" + + # When keyring_unsealed=0, the keyring is sealed immediately while empty + # This means it should NOT be used for verification (nr_leaves_on_tree=0) + + log_info "Keyring state (should be empty and sealed):" + list_keyring_keys + show_keyring_status + + create_test_device + create_verity_hash + + # Without any keys in the dm-verity keyring, and with it sealed, + # verification should fall through to the secondary/platform keyrings + # and likely succeed (if require_signatures=0) or fail (if =1) + + log_info "Sub-test: Device activation with sealed empty keyring" + if [ "$REQUIRE_SIGNATURES" = "Y" ] || [ "$REQUIRE_SIGNATURES" = "1" ]; then + if activate_verity_device "no"; then + log_fail "Device should NOT activate without signature when require_signatures=1" + deactivate_verity_device + return 1 + else + log_pass "Device correctly rejected (require_signatures=1, no valid signature)" + fi + else + if activate_verity_device "no"; then + log_pass "Device activated (require_signatures=0, empty dm-verity keyring is inactive)" + deactivate_verity_device + else + log_fail "Device should activate when require_signatures=0" + return 1 + fi + fi + + return 0 +} + +main() { + local rc=0 + + log_info "=== dm-verity keyring test ===" + log_info "" + + # Create work directory + WORK_DIR=$(mktemp -d -t dm-verity-test.XXXXXX) + log_info "Work directory: $WORK_DIR" + + check_requirements + + # + # Test 1: UNSEALED keyring mode (keyring_unsealed=1) + # + log_info "" + log_info "========================================" + log_info "=== TEST MODE: UNSEALED KEYRING ===" + log_info "========================================" + log_info "" + + load_dm_verity_module 1 1 # keyring_unsealed=1, require_signatures=1 + show_keyring_status + + log_info "" + if ! test_multiple_keys; then + rc=1 + fi + + # After sealing, verify it rejects new keys + log_info "" + if ! test_sealed_keyring_rejects_keys; then + rc=1 + fi + + # Test corrupted signatures are rejected + log_info "" + if ! test_corrupted_signature; then + rc=1 + fi + + # Clean up devices before reloading module + deactivate_verity_device + if [ -n "$DATA_DEV" ] && [[ "$DATA_DEV" == /dev/loop* ]]; then + losetup -d "$DATA_DEV" 2>/dev/null || true + DATA_DEV="" + fi + if [ -n "$HASH_DEV" ] && [[ "$HASH_DEV" == /dev/loop* ]]; then + losetup -d "$HASH_DEV" 2>/dev/null || true + HASH_DEV="" + fi + + # + # Test 2: SEALED keyring mode (keyring_unsealed=0, default) + # + log_info "" + log_info "========================================" + log_info "=== TEST MODE: SEALED KEYRING (default) ===" + log_info "========================================" + log_info "" + + load_dm_verity_module 0 0 # keyring_unsealed=0, require_signatures=0 + show_keyring_status + + log_info "" + if ! test_keyring_sealed_by_default; then + rc=1 + fi + + log_info "" + if ! test_keyring_inactive_when_empty; then + rc=1 + fi + + # + # Summary + # + log_info "" + log_info "========================================" + if [ $rc -eq 0 ]; then + log_info "=== All tests PASSED ===" + else + log_error "=== Some tests FAILED ===" + fi + log_info "========================================" + + return $rc +} + +main "$@" -- cgit v1.2.3 From 03b7c2d763c907f508edf8c317c0e920ce072a33 Mon Sep 17 00:00:00 2001 From: Alex Mastro Date: Wed, 14 Jan 2026 10:57:16 -0800 Subject: vfio: selftests: Centralize IOMMU mode name definitions Replace scattered string literals with MODE_* macros in iommu.h. This provides a single source of truth for IOMMU mode name strings. Signed-off-by: Alex Mastro Reviewed-by: David Matlack Tested-by: David Matlack Link: https://lore.kernel.org/r/20260114-map-mmio-test-v3-1-44e036d95e64@fb.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/lib/include/libvfio/iommu.h | 6 ++++++ tools/testing/selftests/vfio/lib/iommu.c | 12 ++++++------ tools/testing/selftests/vfio/vfio_dma_mapping_test.c | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h b/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h index 5c9b9dc6d993..e9a3386a4719 100644 --- a/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h +++ b/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h @@ -61,6 +61,12 @@ iova_t iommu_hva2iova(struct iommu *iommu, void *vaddr); struct iommu_iova_range *iommu_iova_ranges(struct iommu *iommu, u32 *nranges); +#define MODE_VFIO_TYPE1_IOMMU "vfio_type1_iommu" +#define MODE_VFIO_TYPE1V2_IOMMU "vfio_type1v2_iommu" +#define MODE_IOMMUFD_COMPAT_TYPE1 "iommufd_compat_type1" +#define MODE_IOMMUFD_COMPAT_TYPE1V2 "iommufd_compat_type1v2" +#define MODE_IOMMUFD "iommufd" + /* * Generator for VFIO selftests fixture variants that replicate across all * possible IOMMU modes. Tests must define FIXTURE_VARIANT_ADD_IOMMU_MODE() diff --git a/tools/testing/selftests/vfio/lib/iommu.c b/tools/testing/selftests/vfio/lib/iommu.c index 58b7fb7430d4..035dac069d60 100644 --- a/tools/testing/selftests/vfio/lib/iommu.c +++ b/tools/testing/selftests/vfio/lib/iommu.c @@ -20,32 +20,32 @@ #include "../../../kselftest.h" #include -const char *default_iommu_mode = "iommufd"; +const char *default_iommu_mode = MODE_IOMMUFD; /* Reminder: Keep in sync with FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(). */ static const struct iommu_mode iommu_modes[] = { { - .name = "vfio_type1_iommu", + .name = MODE_VFIO_TYPE1_IOMMU, .container_path = "/dev/vfio/vfio", .iommu_type = VFIO_TYPE1_IOMMU, }, { - .name = "vfio_type1v2_iommu", + .name = MODE_VFIO_TYPE1V2_IOMMU, .container_path = "/dev/vfio/vfio", .iommu_type = VFIO_TYPE1v2_IOMMU, }, { - .name = "iommufd_compat_type1", + .name = MODE_IOMMUFD_COMPAT_TYPE1, .container_path = "/dev/iommu", .iommu_type = VFIO_TYPE1_IOMMU, }, { - .name = "iommufd_compat_type1v2", + .name = MODE_IOMMUFD_COMPAT_TYPE1V2, .container_path = "/dev/iommu", .iommu_type = VFIO_TYPE1v2_IOMMU, }, { - .name = "iommufd", + .name = MODE_IOMMUFD, }, }; diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c index 3bf984b337ac..3d2f44f9c62f 100644 --- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c @@ -165,7 +165,7 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) * IOMMUFD compatibility-mode does not support huge mappings when * using VFIO_TYPE1_IOMMU. */ - if (!strcmp(variant->iommu_mode, "iommufd_compat_type1")) + if (!strcmp(variant->iommu_mode, MODE_IOMMUFD_COMPAT_TYPE1)) mapping_size = SZ_4K; ASSERT_EQ(0, rc); -- cgit v1.2.3 From 557dbdf6c4e9c2dc3d4a4476c67ef14dca32378d Mon Sep 17 00:00:00 2001 From: Alex Mastro Date: Wed, 14 Jan 2026 10:57:17 -0800 Subject: vfio: selftests: Align BAR mmaps for efficient IOMMU mapping Update vfio_pci_bar_map() to align BAR mmaps for efficient huge page mappings. The manual mmap alignment can be removed once mmap(!MAP_FIXED) on vfio device fds improves to automatically return well-aligned addresses. Also add MADV_HUGEPAGE, which encourages the kernel to use huge pages (e.g. when /sys/kernel/mm/transparent_hugepage/enabled is set to "madvise"). Drop MAP_FILE from mmap(). It is an ignored compatibility flag. Signed-off-by: Alex Mastro Reviewed-by: David Matlack Tested-by: David Matlack Link: https://lore.kernel.org/r/20260114-map-mmio-test-v3-2-44e036d95e64@fb.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/lib/include/libvfio.h | 9 ++++++++ tools/testing/selftests/vfio/lib/libvfio.c | 25 ++++++++++++++++++++++ tools/testing/selftests/vfio/lib/vfio_pci_device.c | 24 ++++++++++++++++++++- 3 files changed, 57 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/vfio/lib/include/libvfio.h b/tools/testing/selftests/vfio/lib/include/libvfio.h index 279ddcd70194..1b6da54cc2cb 100644 --- a/tools/testing/selftests/vfio/lib/include/libvfio.h +++ b/tools/testing/selftests/vfio/lib/include/libvfio.h @@ -23,4 +23,13 @@ const char *vfio_selftests_get_bdf(int *argc, char *argv[]); char **vfio_selftests_get_bdfs(int *argc, char *argv[], int *nr_bdfs); +/* + * Reserve virtual address space of size at an address satisfying + * (vaddr % align) == offset. + * + * Returns the reserved vaddr. The caller is responsible for unmapping + * the returned region. + */ +void *mmap_reserve(size_t size, size_t align, size_t offset); + #endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_H */ diff --git a/tools/testing/selftests/vfio/lib/libvfio.c b/tools/testing/selftests/vfio/lib/libvfio.c index a23a3cc5be69..3a3d1ed635c1 100644 --- a/tools/testing/selftests/vfio/lib/libvfio.c +++ b/tools/testing/selftests/vfio/lib/libvfio.c @@ -2,6 +2,9 @@ #include #include +#include + +#include #include "../../../kselftest.h" #include @@ -76,3 +79,25 @@ const char *vfio_selftests_get_bdf(int *argc, char *argv[]) return vfio_selftests_get_bdfs(argc, argv, &nr_bdfs)[0]; } + +void *mmap_reserve(size_t size, size_t align, size_t offset) +{ + void *map_base, *map_align; + size_t delta; + + VFIO_ASSERT_GT(align, offset); + delta = align - offset; + + map_base = mmap(NULL, size + align, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + VFIO_ASSERT_NE(map_base, MAP_FAILED); + + map_align = (void *)(ALIGN((uintptr_t)map_base + delta, align) - delta); + + if (map_align > map_base) + VFIO_ASSERT_EQ(munmap(map_base, map_align - map_base), 0); + + VFIO_ASSERT_EQ(munmap(map_align + size, map_base + align - map_align), 0); + + return map_align; +} diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index fac4c0ecadef..4e5871f1ebc3 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -11,10 +11,14 @@ #include #include +#include #include +#include #include +#include #include #include +#include #include #include @@ -123,20 +127,38 @@ static void vfio_pci_region_get(struct vfio_pci_device *device, int index, static void vfio_pci_bar_map(struct vfio_pci_device *device, int index) { struct vfio_pci_bar *bar = &device->bars[index]; + size_t align, size; int prot = 0; + void *vaddr; VFIO_ASSERT_LT(index, PCI_STD_NUM_BARS); VFIO_ASSERT_NULL(bar->vaddr); VFIO_ASSERT_TRUE(bar->info.flags & VFIO_REGION_INFO_FLAG_MMAP); + VFIO_ASSERT_TRUE(is_power_of_2(bar->info.size)); if (bar->info.flags & VFIO_REGION_INFO_FLAG_READ) prot |= PROT_READ; if (bar->info.flags & VFIO_REGION_INFO_FLAG_WRITE) prot |= PROT_WRITE; - bar->vaddr = mmap(NULL, bar->info.size, prot, MAP_FILE | MAP_SHARED, + size = bar->info.size; + + /* + * Align BAR mmaps to improve page fault granularity during potential + * subsequent IOMMU mapping of these BAR vaddr. 1G for x86 is the + * largest hugepage size across any architecture, so no benefit from + * larger alignment. BARs smaller than 1G will be aligned by their + * power-of-two size, guaranteeing sufficient alignment for smaller + * hugepages, if present. + */ + align = min_t(size_t, size, SZ_1G); + + vaddr = mmap_reserve(size, align, 0); + bar->vaddr = mmap(vaddr, size, prot, MAP_SHARED | MAP_FIXED, device->fd, bar->info.offset); VFIO_ASSERT_NE(bar->vaddr, MAP_FAILED); + + madvise(bar->vaddr, size, MADV_HUGEPAGE); } static void vfio_pci_bar_unmap(struct vfio_pci_device *device, int index) -- cgit v1.2.3 From 080723f4d4c3c6fb0720aae614deb1f30ee9ef2e Mon Sep 17 00:00:00 2001 From: Alex Mastro Date: Wed, 14 Jan 2026 10:57:18 -0800 Subject: vfio: selftests: Add vfio_dma_mapping_mmio_test Test IOMMU mapping the BAR mmaps created during vfio_pci_device_setup(). All IOMMU modes are tested: vfio_type1 variants are expected to succeed, while non-type1 modes are expected to fail. iommufd compat mode can be updated to expect success once kernel support lands. Native iommufd will not support mapping vaddrs backed by MMIO (it will support dma-buf based MMIO mapping instead). Signed-off-by: Alex Mastro Reviewed-by: David Matlack Tested-by: David Matlack Link: https://lore.kernel.org/r/20260114-map-mmio-test-v3-3-44e036d95e64@fb.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/Makefile | 1 + .../selftests/vfio/vfio_dma_mapping_mmio_test.c | 143 +++++++++++++++++++++ 2 files changed, 144 insertions(+) create mode 100644 tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/vfio/Makefile b/tools/testing/selftests/vfio/Makefile index 3c796ca99a50..ead27892ab65 100644 --- a/tools/testing/selftests/vfio/Makefile +++ b/tools/testing/selftests/vfio/Makefile @@ -1,5 +1,6 @@ CFLAGS = $(KHDR_INCLUDES) TEST_GEN_PROGS += vfio_dma_mapping_test +TEST_GEN_PROGS += vfio_dma_mapping_mmio_test TEST_GEN_PROGS += vfio_iommufd_setup_test TEST_GEN_PROGS += vfio_pci_device_test TEST_GEN_PROGS += vfio_pci_device_init_perf_test diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c new file mode 100644 index 000000000000..957a89ce7b3a --- /dev/null +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include "../kselftest_harness.h" + +static const char *device_bdf; + +static struct vfio_pci_bar *largest_mapped_bar(struct vfio_pci_device *device) +{ + u32 flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE; + struct vfio_pci_bar *largest = NULL; + u64 bar_size = 0; + + for (int i = 0; i < PCI_STD_NUM_BARS; i++) { + struct vfio_pci_bar *bar = &device->bars[i]; + + if (!bar->vaddr) + continue; + + /* + * iommu_map() maps with READ|WRITE, so require the same + * abilities for the underlying VFIO region. + */ + if ((bar->info.flags & flags) != flags) + continue; + + if (bar->info.size > bar_size) { + bar_size = bar->info.size; + largest = bar; + } + } + + return largest; +} + +FIXTURE(vfio_dma_mapping_mmio_test) { + struct iommu *iommu; + struct vfio_pci_device *device; + struct iova_allocator *iova_allocator; + struct vfio_pci_bar *bar; +}; + +FIXTURE_VARIANT(vfio_dma_mapping_mmio_test) { + const char *iommu_mode; +}; + +#define FIXTURE_VARIANT_ADD_IOMMU_MODE(_iommu_mode) \ +FIXTURE_VARIANT_ADD(vfio_dma_mapping_mmio_test, _iommu_mode) { \ + .iommu_mode = #_iommu_mode, \ +} + +FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(); + +#undef FIXTURE_VARIANT_ADD_IOMMU_MODE + +FIXTURE_SETUP(vfio_dma_mapping_mmio_test) +{ + self->iommu = iommu_init(variant->iommu_mode); + self->device = vfio_pci_device_init(device_bdf, self->iommu); + self->iova_allocator = iova_allocator_init(self->iommu); + self->bar = largest_mapped_bar(self->device); + + if (!self->bar) + SKIP(return, "No mappable BAR found on device %s", device_bdf); +} + +FIXTURE_TEARDOWN(vfio_dma_mapping_mmio_test) +{ + iova_allocator_cleanup(self->iova_allocator); + vfio_pci_device_cleanup(self->device); + iommu_cleanup(self->iommu); +} + +static void do_mmio_map_test(struct iommu *iommu, + struct iova_allocator *iova_allocator, + void *vaddr, size_t size) +{ + struct dma_region region = { + .vaddr = vaddr, + .size = size, + .iova = iova_allocator_alloc(iova_allocator, size), + }; + + /* + * NOTE: Check for iommufd compat success once it lands. Native iommufd + * will never support this. + */ + if (!strcmp(iommu->mode->name, MODE_VFIO_TYPE1V2_IOMMU) || + !strcmp(iommu->mode->name, MODE_VFIO_TYPE1_IOMMU)) { + iommu_map(iommu, ®ion); + iommu_unmap(iommu, ®ion); + } else { + VFIO_ASSERT_NE(__iommu_map(iommu, ®ion), 0); + VFIO_ASSERT_NE(__iommu_unmap(iommu, ®ion, NULL), 0); + } +} + +TEST_F(vfio_dma_mapping_mmio_test, map_full_bar) +{ + do_mmio_map_test(self->iommu, self->iova_allocator, + self->bar->vaddr, self->bar->info.size); +} + +TEST_F(vfio_dma_mapping_mmio_test, map_partial_bar) +{ + if (self->bar->info.size < 2 * getpagesize()) + SKIP(return, "BAR too small (size=0x%llx)", self->bar->info.size); + + do_mmio_map_test(self->iommu, self->iova_allocator, + self->bar->vaddr, getpagesize()); +} + +/* Test IOMMU mapping of BAR mmap with intentionally poor vaddr alignment. */ +TEST_F(vfio_dma_mapping_mmio_test, map_bar_misaligned) +{ + /* Limit size to bound test time for large BARs */ + size_t size = min_t(size_t, self->bar->info.size, SZ_1G); + void *vaddr; + + vaddr = mmap_reserve(size, SZ_1G, getpagesize()); + vaddr = mmap(vaddr, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, + self->device->fd, self->bar->info.offset); + VFIO_ASSERT_NE(vaddr, MAP_FAILED); + + do_mmio_map_test(self->iommu, self->iova_allocator, vaddr, size); + + VFIO_ASSERT_EQ(munmap(vaddr, size), 0); +} + +int main(int argc, char *argv[]) +{ + device_bdf = vfio_selftests_get_bdf(&argc, argv); + return test_harness_run(argc, argv); +} -- cgit v1.2.3 From 1c588bca3bd5b39c93a28a5986bf82ebfb05eec2 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 14 Jan 2026 21:12:52 +0000 Subject: vfio: selftests: Drop IOMMU mapping size assertions for VFIO_TYPE1_IOMMU Drop the assertions about IOMMU mappings sizes for VFIO_TYPE1_IOMMU modes (both the VFIO mode and the iommufd compatibility mode). These assertions fail when CONFIG_IOMMUFD_VFIO_CONTAINER is enabled, since iommufd compatibility mode provides different huge page behavior than VFIO for VFIO_TYPE1_IOMMU. VFIO_TYPE1_IOMMU is an old enough interface that it's not worth changing the behavior of VFIO and iommufd to match nor care about the IOMMU mapping sizes. Cc: Jason Gunthorpe Link: https://lore.kernel.org/kvm/20260109143830.176dc279@shazbot.org/ Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20260114211252.2581145-1-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/vfio_dma_mapping_test.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c index 3d2f44f9c62f..abb170bdcef7 100644 --- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c @@ -161,12 +161,8 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) if (rc == -EOPNOTSUPP) goto unmap; - /* - * IOMMUFD compatibility-mode does not support huge mappings when - * using VFIO_TYPE1_IOMMU. - */ - if (!strcmp(variant->iommu_mode, MODE_IOMMUFD_COMPAT_TYPE1)) - mapping_size = SZ_4K; + if (self->iommu->mode->iommu_type == VFIO_TYPE1_IOMMU) + goto unmap; ASSERT_EQ(0, rc); printf("Found IOMMU mappings for IOVA 0x%lx:\n", region.iova); -- cgit v1.2.3 From 8becfe16e4a12218c703a98f5bfc15b6f0fbd99c Mon Sep 17 00:00:00 2001 From: Dmitry Skorodumov Date: Mon, 12 Jan 2026 17:24:07 +0300 Subject: selftests: net: simple selftest for ipvtap This is a simple ipvtap test to test handling IP-address add/remove on ipvlan interface. It creates a veth-interface and then creates several network-namespace with ipvlan0 interface in it linked to veth. Then it starts to add/remove addresses on ipvlan0 interfaces in several threads. At finish, it checks that there is no duplicated addresses. Signed-off-by: Dmitry Skorodumov Link: https://patch.msgid.link/20260112142417.4039566-3-skorodumov.dmitry@huawei.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/config | 2 + tools/testing/selftests/net/ipvtap_test.sh | 168 +++++++++++++++++++++++++++++ 3 files changed, 171 insertions(+) create mode 100755 tools/testing/selftests/net/ipvtap_test.sh (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index b66ba04f19d9..45c4ea381bc3 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -48,6 +48,7 @@ TEST_PROGS := \ ipv6_flowlabel.sh \ ipv6_force_forwarding.sh \ ipv6_route_update_soft_lockup.sh \ + ipvtap_test.sh \ l2_tos_ttl_inherit.sh \ l2tp.sh \ link_netns.py \ diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index 1e1f253118f5..b84362b9b508 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -48,6 +48,7 @@ CONFIG_IPV6_SEG6_LWTUNNEL=y CONFIG_IPV6_SIT=y CONFIG_IPV6_VTI=y CONFIG_IPVLAN=m +CONFIG_IPVTAP=m CONFIG_KALLSYMS=y CONFIG_L2TP=m CONFIG_L2TP_ETH=m @@ -116,6 +117,7 @@ CONFIG_PROC_SYSCTL=y CONFIG_PSAMPLE=m CONFIG_RPS=y CONFIG_SYSFS=y +CONFIG_TAP=m CONFIG_TCP_MD5SIG=y CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_TEST_BPF=m diff --git a/tools/testing/selftests/net/ipvtap_test.sh b/tools/testing/selftests/net/ipvtap_test.sh new file mode 100755 index 000000000000..354ca7ce8584 --- /dev/null +++ b/tools/testing/selftests/net/ipvtap_test.sh @@ -0,0 +1,168 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Simple tests for ipvtap + + +# +# The testing environment looks this way: +# +# |------HNS-------| |------PHY-------| +# | veth<----------------->veth | +# |------|--|------| |----------------| +# | | +# | | |-----TST0-------| +# | |------------|----ipvlan | +# | |----------------| +# | +# | |-----TST1-------| +# |---------------|----ipvlan | +# |----------------| +# + +ALL_TESTS=" + test_ip_set +" + +source lib.sh + +DEBUG=0 + +VETH_HOST=vethtst.h +VETH_PHY=vethtst.p + +NS_COUNT=32 +IP_ITERATIONS=1024 +IPSET_TIMEOUT="60s" + +ns_run() { + ns=$1 + shift + if [[ "$ns" == "global" ]]; then + "$@" >/dev/null + else + ip netns exec "$ns" "$@" >/dev/null + fi +} + +test_ip_setup_env() { + setup_ns NS_PHY + setup_ns HST_NS + + # setup simulated other-host (phy) and host itself + ns_run "$HST_NS" ip link add $VETH_HOST type veth peer name $VETH_PHY \ + netns "$NS_PHY" >/dev/null + ns_run "$HST_NS" ip link set $VETH_HOST up + ns_run "$NS_PHY" ip link set $VETH_PHY up + + for ((i=0; i/dev/null + ip a a "fc00::$v/64" dev ipvlan0 2>/dev/null + v=$(rnd) + ip a d "172.25.0.$v/24" dev ipvlan0 2>/dev/null + ip a d "fc00::$v/64" dev ipvlan0 2>/dev/null + done +} + +test_ip_set() { + RET=0 + + trap test_ip_cleanup_env EXIT + + test_ip_setup_env + + declare -A ns_pids + for ((i=0; i Date: Fri, 16 Jan 2026 10:48:55 +0100 Subject: selftests: net: csum: Fix printk format in recv_get_packet_csum_status() Following warning is encountered when building selftests on powerpc/32. CC csum csum.c: In function 'recv_get_packet_csum_status': csum.c:710:50: warning: format '%lu' expects argument of type 'long unsigned int', but argument 4 has type 'size_t' {aka 'unsigned int'} [-Wformat=] 710 | error(1, 0, "cmsg: len=%lu expected=%lu", | ~~^ | | | long unsigned int | %u 711 | cm->cmsg_len, CMSG_LEN(sizeof(struct tpacket_auxdata))); | ~~~~~~~~~~~~ | | | size_t {aka unsigned int} csum.c:710:63: warning: format '%lu' expects argument of type 'long unsigned int', but argument 5 has type 'unsigned int' [-Wformat=] 710 | error(1, 0, "cmsg: len=%lu expected=%lu", | ~~^ | | | long unsigned int | %u cm->cmsg_len has type __kernel_size_t and CMSG() macro has the type returned by sizeof() which is size_t. size_t is 'unsigned int' on some platforms and 'unsigned long' on other ones so use %zu instead of %lu. The code in question was introduced by commit 91a7de85600d ("selftests/net: add csum offload test"). Signed-off-by: Christophe Leroy (CS GROUP) Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/8b69b40826553c1dd500d9d25e45883744f3f348.1768556791.git.chleroy@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/csum.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/lib/csum.c b/tools/testing/selftests/net/lib/csum.c index 27437590eeb5..e28884ce3ab3 100644 --- a/tools/testing/selftests/net/lib/csum.c +++ b/tools/testing/selftests/net/lib/csum.c @@ -707,7 +707,7 @@ static uint32_t recv_get_packet_csum_status(struct msghdr *msg) cm->cmsg_level, cm->cmsg_type); if (cm->cmsg_len != CMSG_LEN(sizeof(struct tpacket_auxdata))) - error(1, 0, "cmsg: len=%lu expected=%lu", + error(1, 0, "cmsg: len=%zu expected=%zu", cm->cmsg_len, CMSG_LEN(sizeof(struct tpacket_auxdata))); aux = (void *)CMSG_DATA(cm); -- cgit v1.2.3 From 2460f31e6e444a52a4e718e4fe64cff29ffaab05 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Wed, 14 Jan 2026 11:02:43 -0500 Subject: selftests/tc-testing: Try to add teql as a child qdisc Add a selftest that attempts to add a teql qdisc as a qfq child. Since teql _must_ be added as a root qdisc, the kernel should reject this. Signed-off-by: Victor Nogueira Link: https://patch.msgid.link/20260114160243.913069-4-jhs@mojatatu.com Signed-off-by: Jakub Kicinski --- .../selftests/tc-testing/tc-tests/qdiscs/teql.json | 25 ++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/teql.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/teql.json index e5cc31f265f8..0179c57104ad 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/teql.json +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/teql.json @@ -81,5 +81,30 @@ "$TC qdisc del dev $DUMMY handle 1: root", "$IP link del dev $DUMMY" ] + }, + { + "id": "124e", + "name": "Try to add teql as a child qdisc", + "category": [ + "qdisc", + "ets", + "tbf" + ], + "plugins": { + "requires": [ + "nsPlugin" + ] + }, + "setup": [ + "$TC qdisc add dev $DUMMY root handle 1: qfq", + "$TC class add dev $DUMMY parent 1: classid 1:1 qfq weight 15 maxpkt 16384" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY parent 1:1 handle 2:1 teql0", + "expExitCode": "2", + "verifyCmd": "$TC -s -j qdisc ls dev $DUMMY parent 1:1", + "matchJSON": [], + "teardown": [ + "$TC qdisc del dev $DUMMY root handle 1:" + ] } ] -- cgit v1.2.3 From 61d99ce3dfc2f1ba5bf713093bb3a5faf5ebc2dc Mon Sep 17 00:00:00 2001 From: David Wei Date: Thu, 15 Jan 2026 09:26:00 +0100 Subject: selftests/net: Add bpf skb forwarding program Add nk_forward.bpf.c, a BPF program that forwards skbs matching some IPv6 prefix received on eth0 ifindex to a specified netkit ifindex. This will be needed by netkit container tests. Signed-off-by: David Wei Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20260115082603.219152-14-daniel@iogearbox.net Signed-off-by: Paolo Abeni --- .../selftests/drivers/net/hw/nk_forward.bpf.c | 49 ++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c b/tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c new file mode 100644 index 000000000000..86ebfc1445b6 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include + +#define TC_ACT_OK 0 +#define ETH_P_IPV6 0x86DD + +#define ctx_ptr(field) ((void *)(long)(field)) + +#define v6_p64_equal(a, b) (a.s6_addr32[0] == b.s6_addr32[0] && \ + a.s6_addr32[1] == b.s6_addr32[1]) + +volatile __u32 netkit_ifindex; +volatile __u8 ipv6_prefix[16]; + +SEC("tc/ingress") +int tc_redirect_peer(struct __sk_buff *skb) +{ + void *data_end = ctx_ptr(skb->data_end); + void *data = ctx_ptr(skb->data); + struct in6_addr *peer_addr; + struct ipv6hdr *ip6h; + struct ethhdr *eth; + + peer_addr = (struct in6_addr *)ipv6_prefix; + + if (skb->protocol != bpf_htons(ETH_P_IPV6)) + return TC_ACT_OK; + + eth = data; + if ((void *)(eth + 1) > data_end) + return TC_ACT_OK; + + ip6h = data + sizeof(struct ethhdr); + if ((void *)(ip6h + 1) > data_end) + return TC_ACT_OK; + + if (!v6_p64_equal(ip6h->daddr, (*peer_addr))) + return TC_ACT_OK; + + return bpf_redirect_peer(netkit_ifindex, 0); +} + +char __license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 6be87fbb27763c2999e1c69bbec1f3a63cf05422 Mon Sep 17 00:00:00 2001 From: David Wei Date: Thu, 15 Jan 2026 09:26:01 +0100 Subject: selftests/net: Add env for container based tests Add an env NetDrvContEnv for container based selftests. This automates the setup of a netns, netkit pair with one inside the netns, and a BPF program that forwards skbs from the NETIF host inside the container. Currently only netkit is used, but other virtual netdevs e.g. veth can be used too. Expect netkit container datapath selftests to have a publicly routable IP prefix to assign to netkit in a container, such that packets will land on eth0. The BPF skb forward program will then forward such packets from the host netns to the container netns. Signed-off-by: David Wei Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20260115082603.219152-15-daniel@iogearbox.net Signed-off-by: Paolo Abeni --- tools/testing/selftests/drivers/net/README.rst | 7 ++ .../selftests/drivers/net/hw/lib/py/__init__.py | 7 +- .../selftests/drivers/net/lib/py/__init__.py | 7 +- tools/testing/selftests/drivers/net/lib/py/env.py | 112 +++++++++++++++++++++ 4 files changed, 127 insertions(+), 6 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/drivers/net/README.rst b/tools/testing/selftests/drivers/net/README.rst index eb838ae94844..b94e81c2e030 100644 --- a/tools/testing/selftests/drivers/net/README.rst +++ b/tools/testing/selftests/drivers/net/README.rst @@ -62,6 +62,13 @@ LOCAL_V4, LOCAL_V6, REMOTE_V4, REMOTE_V6 Local and remote endpoint IP addresses. +LOCAL_PREFIX_V4, LOCAL_PREFIX_V6 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Local IP prefix/subnet which can be used to allocate extra IP addresses (for +network name spaces behind macvlan, veth, netkit devices). DUT must be +reachable using these addresses from the endpoint. + REMOTE_TYPE ~~~~~~~~~~~ diff --git a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py index d5d247eca6b7..022008249313 100644 --- a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py +++ b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py @@ -3,6 +3,7 @@ """ Driver test environment (hardware-only tests). NetDrvEnv and NetDrvEpEnv are the main environment classes. +NetDrvContEnv extends NetDrvEpEnv with netkit container support. Former is for local host only tests, latter creates / connects to a remote endpoint. See NIPA wiki for more information about running and writing driver tests. @@ -29,7 +30,7 @@ try: from net.lib.py import ksft_eq, ksft_ge, ksft_in, ksft_is, ksft_lt, \ ksft_ne, ksft_not_in, ksft_raises, ksft_true, ksft_gt, ksft_not_none from drivers.net.lib.py import GenerateTraffic, Remote, Iperf3Runner - from drivers.net.lib.py import NetDrvEnv, NetDrvEpEnv + from drivers.net.lib.py import NetDrvEnv, NetDrvEpEnv, NetDrvContEnv __all__ = ["NetNS", "NetNSEnter", "NetdevSimDev", "EthtoolFamily", "NetdevFamily", "NetshaperFamily", @@ -44,8 +45,8 @@ try: "ksft_eq", "ksft_ge", "ksft_in", "ksft_is", "ksft_lt", "ksft_ne", "ksft_not_in", "ksft_raises", "ksft_true", "ksft_gt", "ksft_not_none", "ksft_not_none", - "NetDrvEnv", "NetDrvEpEnv", "GenerateTraffic", "Remote", - "Iperf3Runner"] + "NetDrvEnv", "NetDrvEpEnv", "NetDrvContEnv", "GenerateTraffic", + "Remote", "Iperf3Runner"] except ModuleNotFoundError as e: print("Failed importing `net` library from kernel sources") print(str(e)) diff --git a/tools/testing/selftests/drivers/net/lib/py/__init__.py b/tools/testing/selftests/drivers/net/lib/py/__init__.py index 8b75faa9af6d..be3a8a936882 100644 --- a/tools/testing/selftests/drivers/net/lib/py/__init__.py +++ b/tools/testing/selftests/drivers/net/lib/py/__init__.py @@ -3,6 +3,7 @@ """ Driver test environment. NetDrvEnv and NetDrvEpEnv are the main environment classes. +NetDrvContEnv extends NetDrvEpEnv with netkit container support. Former is for local host only tests, latter creates / connects to a remote endpoint. See NIPA wiki for more information about running and writing driver tests. @@ -43,12 +44,12 @@ try: "ksft_ne", "ksft_not_in", "ksft_raises", "ksft_true", "ksft_gt", "ksft_not_none", "ksft_not_none"] - from .env import NetDrvEnv, NetDrvEpEnv + from .env import NetDrvEnv, NetDrvEpEnv, NetDrvContEnv from .load import GenerateTraffic, Iperf3Runner from .remote import Remote - __all__ += ["NetDrvEnv", "NetDrvEpEnv", "GenerateTraffic", "Remote", - "Iperf3Runner"] + __all__ += ["NetDrvEnv", "NetDrvEpEnv", "NetDrvContEnv", "GenerateTraffic", + "Remote", "Iperf3Runner"] except ModuleNotFoundError as e: print("Failed importing `net` library from kernel sources") print(str(e)) diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index 41cc248ac848..5b12c4c59e09 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 +import ipaddress import os +import re import time from pathlib import Path from lib.py import KsftSkipEx, KsftXfailEx @@ -8,6 +10,7 @@ from lib.py import ksft_setup, wait_file from lib.py import cmd, ethtool, ip, CmdExitFailure from lib.py import NetNS, NetdevSimDev from .remote import Remote +from . import bpftool class NetDrvEnvBase: @@ -289,3 +292,112 @@ class NetDrvEpEnv(NetDrvEnvBase): data.get('stats-block-usecs', 0) / 1000 / 1000 time.sleep(self._stats_settle_time) + + +class NetDrvContEnv(NetDrvEpEnv): + """ + Class for an environment with a netkit pair setup for forwarding traffic + between the physical interface and a network namespace. + """ + + def __init__(self, src_path, nk_rxqueues=1, **kwargs): + super().__init__(src_path, **kwargs) + + self.require_ipver("6") + local_prefix = self.env.get("LOCAL_PREFIX_V6") + if not local_prefix: + raise KsftSkipEx("LOCAL_PREFIX_V6 required") + + local_prefix = local_prefix.rstrip("/64").rstrip("::").rstrip(":") + self.ipv6_prefix = f"{local_prefix}::" + self.nk_host_ipv6 = f"{local_prefix}::2:1" + self.nk_guest_ipv6 = f"{local_prefix}::2:2" + + self.netns = None + self._nk_host_ifname = None + self._nk_guest_ifname = None + self._tc_attached = False + self._bpf_prog_pref = None + self._bpf_prog_id = None + + ip(f"link add type netkit mode l2 forward peer forward numrxqueues {nk_rxqueues}") + + all_links = ip("-d link show", json=True) + netkit_links = [link for link in all_links + if link.get('linkinfo', {}).get('info_kind') == 'netkit' + and 'UP' not in link.get('flags', [])] + + if len(netkit_links) != 2: + raise KsftSkipEx("Failed to create netkit pair") + + netkit_links.sort(key=lambda x: x['ifindex']) + self._nk_host_ifname = netkit_links[1]['ifname'] + self._nk_guest_ifname = netkit_links[0]['ifname'] + self.nk_host_ifindex = netkit_links[1]['ifindex'] + self.nk_guest_ifindex = netkit_links[0]['ifindex'] + + self._setup_ns() + self._attach_bpf() + + def __del__(self): + if self._tc_attached: + cmd(f"tc filter del dev {self.ifname} ingress pref {self._bpf_prog_pref}") + self._tc_attached = False + + if self._nk_host_ifname: + cmd(f"ip link del dev {self._nk_host_ifname}") + self._nk_host_ifname = None + self._nk_guest_ifname = None + + if self.netns: + del self.netns + self.netns = None + + super().__del__() + + def _setup_ns(self): + self.netns = NetNS() + ip(f"link set dev {self._nk_guest_ifname} netns {self.netns.name}") + ip(f"link set dev {self._nk_host_ifname} up") + ip(f"-6 addr add fe80::1/64 dev {self._nk_host_ifname} nodad") + ip(f"-6 route add {self.nk_guest_ipv6}/128 via fe80::2 dev {self._nk_host_ifname}") + + ip("link set lo up", ns=self.netns) + ip(f"link set dev {self._nk_guest_ifname} up", ns=self.netns) + ip(f"-6 addr add fe80::2/64 dev {self._nk_guest_ifname}", ns=self.netns) + ip(f"-6 addr add {self.nk_guest_ipv6}/64 dev {self._nk_guest_ifname} nodad", ns=self.netns) + ip(f"-6 route add default via fe80::1 dev {self._nk_guest_ifname}", ns=self.netns) + + def _attach_bpf(self): + bpf_obj = self.test_dir / "nk_forward.bpf.o" + if not bpf_obj.exists(): + raise KsftSkipEx("BPF prog not found") + + cmd(f"tc filter add dev {self.ifname} ingress bpf obj {bpf_obj} sec tc/ingress direct-action") + self._tc_attached = True + + tc_info = cmd(f"tc filter show dev {self.ifname} ingress").stdout + match = re.search(r'pref (\d+).*nk_forward\.bpf.*id (\d+)', tc_info) + if not match: + raise Exception("Failed to get BPF prog ID") + self._bpf_prog_pref = int(match.group(1)) + self._bpf_prog_id = int(match.group(2)) + + prog_info = bpftool(f"prog show id {self._bpf_prog_id}", json=True) + map_ids = prog_info.get("map_ids", []) + + bss_map_id = None + for map_id in map_ids: + map_info = bpftool(f"map show id {map_id}", json=True) + if map_info.get("name").endswith("bss"): + bss_map_id = map_id + + if bss_map_id is None: + raise Exception("Failed to find .bss map") + + ipv6_addr = ipaddress.IPv6Address(self.ipv6_prefix) + ipv6_bytes = ipv6_addr.packed + ifindex_bytes = self.nk_host_ifindex.to_bytes(4, byteorder='little') + value = ipv6_bytes + ifindex_bytes + value_hex = ' '.join(f'{b:02x}' for b in value) + bpftool(f"map update id {bss_map_id} key hex 00 00 00 00 value hex {value_hex}") -- cgit v1.2.3 From ab771c938d9a57d510bb70c565c9388b10494090 Mon Sep 17 00:00:00 2001 From: David Wei Date: Thu, 15 Jan 2026 09:26:02 +0100 Subject: selftests/net: Make NetDrvContEnv support queue leasing Add a new parameter `lease` to NetDrvContEnv that sets up queue leasing in the env. The NETIF also has some ethtool parameters changed to support memory provider tests. This is needed in NetDrvContEnv rather than individual test cases since the cleanup to restore NETIF can't be done, until the netns in the env is gone. Signed-off-by: David Wei Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20260115082603.219152-16-daniel@iogearbox.net Signed-off-by: Paolo Abeni --- tools/testing/selftests/drivers/net/lib/py/env.py | 47 ++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index 5b12c4c59e09..7066d78395c6 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -9,6 +9,7 @@ from lib.py import KsftSkipEx, KsftXfailEx from lib.py import ksft_setup, wait_file from lib.py import cmd, ethtool, ip, CmdExitFailure from lib.py import NetNS, NetdevSimDev +from lib.py import NetdevFamily, EthtoolFamily from .remote import Remote from . import bpftool @@ -300,7 +301,7 @@ class NetDrvContEnv(NetDrvEpEnv): between the physical interface and a network namespace. """ - def __init__(self, src_path, nk_rxqueues=1, **kwargs): + def __init__(self, src_path, lease=False, **kwargs): super().__init__(src_path, **kwargs) self.require_ipver("6") @@ -308,6 +309,9 @@ class NetDrvContEnv(NetDrvEpEnv): if not local_prefix: raise KsftSkipEx("LOCAL_PREFIX_V6 required") + self.netdevnl = NetdevFamily() + self.ethnl = EthtoolFamily() + local_prefix = local_prefix.rstrip("/64").rstrip("::").rstrip(":") self.ipv6_prefix = f"{local_prefix}::" self.nk_host_ipv6 = f"{local_prefix}::2:1" @@ -319,7 +323,11 @@ class NetDrvContEnv(NetDrvEpEnv): self._tc_attached = False self._bpf_prog_pref = None self._bpf_prog_id = None + self._leased = False + nk_rxqueues = 1 + if lease: + nk_rxqueues = 2 ip(f"link add type netkit mode l2 forward peer forward numrxqueues {nk_rxqueues}") all_links = ip("-d link show", json=True) @@ -336,6 +344,9 @@ class NetDrvContEnv(NetDrvEpEnv): self.nk_host_ifindex = netkit_links[1]['ifindex'] self.nk_guest_ifindex = netkit_links[0]['ifindex'] + if lease: + self._lease_queues() + self._setup_ns() self._attach_bpf() @@ -353,8 +364,42 @@ class NetDrvContEnv(NetDrvEpEnv): del self.netns self.netns = None + if self._leased: + self.ethnl.rings_set({'header': {'dev-index': self.ifindex}, + 'tcp-data-split': 'unknown', + 'hds-thresh': self._hds_thresh, + 'rx': self._rx_rings}) + self._leased = False + super().__del__() + def _lease_queues(self): + channels = self.ethnl.channels_get({'header': {'dev-index': self.ifindex}}) + channels = channels['combined-count'] + if channels < 2: + raise KsftSkipEx('Test requires NETIF with at least 2 combined channels') + + rings = self.ethnl.rings_get({'header': {'dev-index': self.ifindex}}) + self._rx_rings = rings['rx'] + self._hds_thresh = rings.get('hds-thresh', 0) + self.ethnl.rings_set({'header': {'dev-index': self.ifindex}, + 'tcp-data-split': 'enabled', + 'hds-thresh': 0, + 'rx': 64}) + self.src_queue = channels - 1 + bind_result = self.netdevnl.queue_create( + { + "ifindex": self.nk_guest_ifindex, + "type": "rx", + "lease": { + "ifindex": self.ifindex, + "queue": {"id": self.src_queue, "type": "rx"}, + }, + } + ) + self.nk_queue = bind_result['id'] + self._leased = True + def _setup_ns(self): self.netns = NetNS() ip(f"link set dev {self._nk_guest_ifname} netns {self.netns.name}") -- cgit v1.2.3 From 931420a2fc363817c92990fa14eb1bdec024ce04 Mon Sep 17 00:00:00 2001 From: David Wei Date: Thu, 15 Jan 2026 09:26:03 +0100 Subject: selftests/net: Add netkit container tests Add two tests using NetDrvContEnv. One basic test that sets up a netkit pair, with one end in a netns. Use LOCAL_PREFIX_V6 and nk_forward BPF program to ping from a remote host to the netkit in netns. Second is a selftest for netkit queue leasing, using io_uring zero copy test binary inside of a netns with netkit. This checks that memory providers can be bound against virtual queues in a netkit within a netns that are leasing from a physical netdev in the default netns. Signed-off-by: David Wei Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20260115082603.219152-17-daniel@iogearbox.net Signed-off-by: Paolo Abeni --- tools/testing/selftests/drivers/net/hw/Makefile | 2 + tools/testing/selftests/drivers/net/hw/nk_netns.py | 23 +++++++++ .../testing/selftests/drivers/net/hw/nk_qlease.py | 55 ++++++++++++++++++++++ 3 files changed, 80 insertions(+) create mode 100755 tools/testing/selftests/drivers/net/hw/nk_netns.py create mode 100755 tools/testing/selftests/drivers/net/hw/nk_qlease.py (limited to 'tools/testing') diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index 9c163ba6feee..39ad86d693b3 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -32,6 +32,8 @@ TEST_PROGS = \ irq.py \ loopback.sh \ nic_timestamp.py \ + nk_netns.py \ + nk_qlease.py \ pp_alloc_fail.py \ rss_api.py \ rss_ctx.py \ diff --git a/tools/testing/selftests/drivers/net/hw/nk_netns.py b/tools/testing/selftests/drivers/net/hw/nk_netns.py new file mode 100755 index 000000000000..afa8638195d8 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/nk_netns.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +from lib.py import ksft_run, ksft_exit +from lib.py import NetDrvContEnv +from lib.py import cmd + + +def test_ping(cfg) -> None: + cfg.require_ipver("6") + + cmd(f"ping -c 1 -W5 {cfg.nk_guest_ipv6}", host=cfg.remote) + cmd(f"ping -c 1 -W5 {cfg.remote_addr_v['6']}", ns=cfg.netns) + + +def main() -> None: + with NetDrvContEnv(__file__) as cfg: + ksft_run([test_ping], args=(cfg,)) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/hw/nk_qlease.py b/tools/testing/selftests/drivers/net/hw/nk_qlease.py new file mode 100755 index 000000000000..738a46d2d20c --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/nk_qlease.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import re +from os import path +from lib.py import ksft_run, ksft_exit +from lib.py import NetDrvContEnv +from lib.py import bkg, cmd, defer, ethtool, rand_port, wait_port_listen + + +def create_rss_ctx(cfg): + output = ethtool(f"-X {cfg.ifname} context new start {cfg.src_queue} equal 1").stdout + values = re.search(r'New RSS context is (\d+)', output).group(1) + return int(values) + + +def set_flow_rule(cfg): + output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} action {cfg.src_queue}").stdout + values = re.search(r'ID (\d+)', output).group(1) + return int(values) + + +def set_flow_rule_rss(cfg, rss_ctx_id): + output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} context {rss_ctx_id}").stdout + values = re.search(r'ID (\d+)', output).group(1) + return int(values) + + +def test_iou_zcrx(cfg) -> None: + cfg.require_ipver('6') + + ethtool(f"-X {cfg.ifname} equal {cfg.src_queue}") + defer(ethtool, f"-X {cfg.ifname} default") + + flow_rule_id = set_flow_rule(cfg) + defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") + + rx_cmd = f"ip netns exec {cfg.netns.name} {cfg.bin_local} -s -p {cfg.port} -i {cfg._nk_guest_ifname} -q {cfg.nk_queue}" + tx_cmd = f"{cfg.bin_remote} -c -h {cfg.nk_guest_ipv6} -p {cfg.port} -l 12840" + with bkg(rx_cmd, exit_wait=True): + wait_port_listen(cfg.port, proto="tcp", ns=cfg.netns) + cmd(tx_cmd, host=cfg.remote) + + +def main() -> None: + with NetDrvContEnv(__file__, lease=True) as cfg: + cfg.bin_local = path.abspath(path.dirname(__file__) + "/../../../drivers/net/hw/iou-zcrx") + cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) + cfg.port = rand_port() + ksft_run([test_iou_zcrx], args=(cfg,)) + ksft_exit() + + +if __name__ == "__main__": + main() -- cgit v1.2.3 From 5d54aa40c7b7e9dee5746cca99e9ddbcca13e895 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Fri, 16 Jan 2026 09:52:36 +0100 Subject: vsock/test: Do not filter kallsyms by symbol type Blamed commit implemented logic to discover available vsock transports by grepping /proc/kallsyms for known symbols. It incorrectly filtered entries by type 'd'. For some kernel configs having CONFIG_VIRTIO_VSOCKETS=m CONFIG_VSOCKETS_LOOPBACK=y kallsyms reports 0000000000000000 d virtio_transport [vmw_vsock_virtio_transport] 0000000000000000 t loopback_transport Overzealous filtering might have affected vsock test suit, resulting in insufficient/misleading testing. Do not filter symbols by type. It never helped much. Fixes: 3070c05b7afd ("vsock/test: Introduce get_transports()") Signed-off-by: Michal Luczaj Reviewed-by: Stefano Garzarella Link: https://patch.msgid.link/20260116-vsock_test-kallsyms-grep-v1-1-3320bc3346f2@rbox.co Signed-off-by: Paolo Abeni --- tools/testing/vsock/util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h index 142c02a6834a..bf633cde82b0 100644 --- a/tools/testing/vsock/util.h +++ b/tools/testing/vsock/util.h @@ -25,7 +25,7 @@ enum transport { }; static const char * const transport_ksyms[] = { - #define x(name, symbol) "d " symbol "_transport", + #define x(name, symbol) " " symbol "_transport", KNOWN_TRANSPORTS(x) #undef x }; -- cgit v1.2.3 From db0c35ca36526f3072affcb573631ccf8c85f827 Mon Sep 17 00:00:00 2001 From: Ryota Sakamoto Date: Sat, 17 Jan 2026 02:46:34 +0900 Subject: kunit: add bash completion Currently, kunit.py has many subcommands and options, making it difficult to remember them without checking the help message. Add --list-cmds and --list-opts to kunit.py to get available commands and options, use those outputs in kunit-completion.sh to show completion. This implementation is similar to perf and tools/perf/perf-completion.sh. Example output: $ source tools/testing/kunit/kunit-completion.sh $ ./tools/testing/kunit/kunit.py [TAB][TAB] build config exec parse run $ ./tools/testing/kunit/kunit.py run --k[TAB][TAB] --kconfig_add --kernel_args --kunitconfig Link: https://lore.kernel.org/r/20260117-kunit-completion-v2-1-cabd127d0801@gmail.com Reviewed-by: David Gow Signed-off-by: Ryota Sakamoto Signed-off-by: Shuah Khan --- Documentation/dev-tools/kunit/run_wrapper.rst | 9 +++++++ tools/testing/kunit/kunit-completion.sh | 34 +++++++++++++++++++++++++++ tools/testing/kunit/kunit.py | 30 +++++++++++++++++++++++ tools/testing/kunit/kunit_tool_test.py | 21 +++++++++++++++++ 4 files changed, 94 insertions(+) create mode 100644 tools/testing/kunit/kunit-completion.sh (limited to 'tools/testing') diff --git a/Documentation/dev-tools/kunit/run_wrapper.rst b/Documentation/dev-tools/kunit/run_wrapper.rst index 6697c71ee8ca..3c0b585dcfff 100644 --- a/Documentation/dev-tools/kunit/run_wrapper.rst +++ b/Documentation/dev-tools/kunit/run_wrapper.rst @@ -335,3 +335,12 @@ command line arguments: - ``--list_tests_attr``: If set, lists all tests that will be run and all of their attributes. + +Command-line completion +============================== + +The kunit_tool comes with a bash completion script: + +.. code-block:: bash + + source tools/testing/kunit/kunit-completion.sh diff --git a/tools/testing/kunit/kunit-completion.sh b/tools/testing/kunit/kunit-completion.sh new file mode 100644 index 000000000000..f053e7b5d265 --- /dev/null +++ b/tools/testing/kunit/kunit-completion.sh @@ -0,0 +1,34 @@ +# SPDX-License-Identifier: GPL-2.0 +# bash completion support for KUnit + +_kunit_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) + +_kunit() +{ + local cur prev words cword + _init_completion || return + + local script="${_kunit_dir}/kunit.py" + + if [[ $cword -eq 1 && "$cur" != -* ]]; then + local cmds=$(${script} --list-cmds 2>/dev/null) + COMPREPLY=($(compgen -W "${cmds}" -- "$cur")) + return 0 + fi + + if [[ "$cur" == -* ]]; then + if [[ -n "${words[1]}" && "${words[1]}" != -* ]]; then + local opts=$(${script} ${words[1]} --list-opts 2>/dev/null) + COMPREPLY=($(compgen -W "${opts}" -- "$cur")) + return 0 + else + local opts=$(${script} --list-opts 2>/dev/null) + COMPREPLY=($(compgen -W "${opts}" -- "$cur")) + return 0 + fi + fi +} + +complete -o default -F _kunit kunit.py +complete -o default -F _kunit kunit +complete -o default -F _kunit ./tools/testing/kunit/kunit.py diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py index e3d82a038f93..4ec5ecba6d49 100755 --- a/tools/testing/kunit/kunit.py +++ b/tools/testing/kunit/kunit.py @@ -328,6 +328,17 @@ def get_default_build_dir() -> str: return os.path.join(os.environ['KBUILD_OUTPUT'], '.kunit') return '.kunit' +def add_completion_opts(parser: argparse.ArgumentParser) -> None: + parser.add_argument('--list-opts', + help=argparse.SUPPRESS, + action='store_true') + +def add_root_opts(parser: argparse.ArgumentParser) -> None: + parser.add_argument('--list-cmds', + help=argparse.SUPPRESS, + action='store_true') + add_completion_opts(parser) + def add_common_opts(parser: argparse.ArgumentParser) -> None: parser.add_argument('--build_dir', help='As in the make command, it specifies the build ' @@ -379,6 +390,8 @@ def add_common_opts(parser: argparse.ArgumentParser) -> None: help='Additional QEMU arguments, e.g. "-smp 8"', action='append', metavar='') + add_completion_opts(parser) + def add_build_opts(parser: argparse.ArgumentParser) -> None: parser.add_argument('--jobs', help='As in the make command, "Specifies the number of ' @@ -574,6 +587,7 @@ subcommand_handlers_map = { def main(argv: Sequence[str]) -> None: parser = argparse.ArgumentParser( description='Helps writing and running KUnit tests.') + add_root_opts(parser) subparser = parser.add_subparsers(dest='subcommand') # The 'run' command will config, build, exec, and parse in one go. @@ -608,12 +622,28 @@ def main(argv: Sequence[str]) -> None: parse_parser.add_argument('file', help='Specifies the file to read results from.', type=str, nargs='?', metavar='input_file') + add_completion_opts(parse_parser) cli_args = parser.parse_args(massage_argv(argv)) if get_kernel_root_path(): os.chdir(get_kernel_root_path()) + if cli_args.list_cmds: + print(" ".join(subparser.choices.keys())) + return + + if cli_args.list_opts: + target_parser = subparser.choices.get(cli_args.subcommand) + if not target_parser: + target_parser = parser + + # Accessing private attribute _option_string_actions to get + # the list of options. This is not a public API, but argparse + # does not provide a way to inspect options programmatically. + print(' '.join(target_parser._option_string_actions.keys())) + return + subcomand_handler = subcommand_handlers_map.get(cli_args.subcommand, None) if subcomand_handler is None: diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index 238a31a5cc29..b67408147c1f 100755 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -11,11 +11,13 @@ from unittest import mock import tempfile, shutil # Handling test_tmpdir +import io import itertools import json import os import signal import subprocess +import sys from typing import Iterable import kunit_config @@ -886,5 +888,24 @@ class KUnitMainTest(unittest.TestCase): mock.call(args=None, build_dir='.kunit', filter_glob='suite2.test1', filter='', filter_action=None, timeout=300), ]) + @mock.patch.object(sys, 'stdout', new_callable=io.StringIO) + def test_list_cmds(self, mock_stdout): + kunit.main(['--list-cmds']) + output = mock_stdout.getvalue() + output_cmds = sorted(output.split()) + expected_cmds = sorted(['build', 'config', 'exec', 'parse', 'run']) + self.assertEqual(output_cmds, expected_cmds) + + @mock.patch.object(sys, 'stdout', new_callable=io.StringIO) + def test_run_list_opts(self, mock_stdout): + kunit.main(['run', '--list-opts']) + output = mock_stdout.getvalue() + output_cmds = set(output.split()) + self.assertIn('--help', output_cmds) + self.assertIn('--kunitconfig', output_cmds) + self.assertIn('--jobs', output_cmds) + self.assertIn('--kernel_args', output_cmds) + self.assertIn('--raw_output', output_cmds) + if __name__ == '__main__': unittest.main() -- cgit v1.2.3 From 2e6690d4f7fc41c4fae7d0a4c0bf11f1973e5650 Mon Sep 17 00:00:00 2001 From: Gyutae Bae Date: Tue, 20 Jan 2026 18:07:16 +0900 Subject: selftests/bpf: Add perfbuf multi-producer benchmark Add a multi-producer benchmark for perfbuf to complement the existing ringbuf multi-producer test. Unlike ringbuf which uses a shared buffer and experiences contention, perfbuf uses per-CPU buffers so the test measures scaling behavior rather than contention. This allows developers to compare perfbuf vs ringbuf performance under multi-producer workloads when choosing between the two for their systems. Signed-off-by: Gyutae Bae Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260120090716.82927-1-gyutae.opensource@navercorp.com --- tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh b/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh index 83e05e837871..123b7feb6935 100755 --- a/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh +++ b/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh @@ -49,6 +49,11 @@ for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do summarize "rb-libbpf nr_prod $b" "$($RUN_RB_BENCH -p$b --rb-batch-cnt 50 rb-libbpf)" done +header "Perfbuf, multi-producer" +for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do + summarize "pb-libbpf nr_prod $b" "$($RUN_RB_BENCH -p$b --rb-batch-cnt 50 --rb-sample-rate 50 pb-libbpf)" +done + header "Ringbuf, multi-producer contention in overwrite mode, no consumer" for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do summarize "rb-prod nr_prod $b" "$($RUN_BENCH -p$b --rb-batch-cnt 50 --rb-overwrite --rb-bench-producer rb-libbpf)" -- cgit v1.2.3 From e939f3d16d77a88e5f363394ef73db4c898c4107 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:31 -0800 Subject: selftests/bpf: Add tests for KF_IMPLICIT_ARGS Add trivial end-to-end tests to validate that KF_IMPLICIT_ARGS flag is properly handled by both resolve_btfids and the verifier. Declare kfuncs in bpf_testmod. Check that bpf_prog_aux pointer is set in the kfunc implementation. Verify that calls with implicit args and a legacy case all work. Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-7-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/kfunc_implicit_args.c | 10 ++++++ .../selftests/bpf/progs/kfunc_implicit_args.c | 41 ++++++++++++++++++++++ .../testing/selftests/bpf/test_kmods/bpf_testmod.c | 26 ++++++++++++++ 3 files changed, 77 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/kfunc_implicit_args.c create mode 100644 tools/testing/selftests/bpf/progs/kfunc_implicit_args.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_implicit_args.c b/tools/testing/selftests/bpf/prog_tests/kfunc_implicit_args.c new file mode 100644 index 000000000000..5e4793c9c29a --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_implicit_args.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include +#include "kfunc_implicit_args.skel.h" + +void test_kfunc_implicit_args(void) +{ + RUN_TESTS(kfunc_implicit_args); +} diff --git a/tools/testing/selftests/bpf/progs/kfunc_implicit_args.c b/tools/testing/selftests/bpf/progs/kfunc_implicit_args.c new file mode 100644 index 000000000000..89b6a47e22dd --- /dev/null +++ b/tools/testing/selftests/bpf/progs/kfunc_implicit_args.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include "bpf_misc.h" + +extern int bpf_kfunc_implicit_arg(int a) __weak __ksym; +extern int bpf_kfunc_implicit_arg_impl(int a, struct bpf_prog_aux *aux) __weak __ksym; /* illegal */ +extern int bpf_kfunc_implicit_arg_legacy(int a, int b) __weak __ksym; +extern int bpf_kfunc_implicit_arg_legacy_impl(int a, int b, struct bpf_prog_aux *aux) __weak __ksym; + +char _license[] SEC("license") = "GPL"; + +SEC("syscall") +__retval(5) +int test_kfunc_implicit_arg(void *ctx) +{ + return bpf_kfunc_implicit_arg(5); +} + +SEC("syscall") +__failure __msg("cannot find address for kernel function bpf_kfunc_implicit_arg_impl") +int test_kfunc_implicit_arg_impl_illegal(void *ctx) +{ + return bpf_kfunc_implicit_arg_impl(5, NULL); +} + +SEC("syscall") +__retval(7) +int test_kfunc_implicit_arg_legacy(void *ctx) +{ + return bpf_kfunc_implicit_arg_legacy(3, 4); +} + +SEC("syscall") +__retval(11) +int test_kfunc_implicit_arg_legacy_impl(void *ctx) +{ + return bpf_kfunc_implicit_arg_legacy_impl(5, 6, NULL); +} diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index bc07ce9d5477..a996b816ecc4 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -1142,6 +1142,10 @@ __bpf_kfunc int bpf_kfunc_st_ops_inc10(struct st_ops_args *args) __bpf_kfunc int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id); __bpf_kfunc int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux_prog); +__bpf_kfunc int bpf_kfunc_implicit_arg(int a, struct bpf_prog_aux *aux); +__bpf_kfunc int bpf_kfunc_implicit_arg_legacy(int a, int b, struct bpf_prog_aux *aux); +__bpf_kfunc int bpf_kfunc_implicit_arg_legacy_impl(int a, int b, struct bpf_prog_aux *aux); + BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc) BTF_ID_FLAGS(func, bpf_kfunc_call_test1) @@ -1184,6 +1188,9 @@ BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10) BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1) BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_impl) +BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg_legacy, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg_legacy_impl) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) @@ -1675,6 +1682,25 @@ int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux__prog return ret; } +int bpf_kfunc_implicit_arg(int a, struct bpf_prog_aux *aux) +{ + if (aux && a > 0) + return a; + return -EINVAL; +} + +int bpf_kfunc_implicit_arg_legacy(int a, int b, struct bpf_prog_aux *aux) +{ + if (aux) + return a + b; + return -EINVAL; +} + +int bpf_kfunc_implicit_arg_legacy_impl(int a, int b, struct bpf_prog_aux *aux) +{ + return bpf_kfunc_implicit_arg_legacy(a, b, aux); +} + static int multi_st_ops_reg(void *kdata, struct bpf_link *link) { struct bpf_testmod_multi_st_ops *st_ops = -- cgit v1.2.3 From b97931a25a4bc74076ffb5c3d1a534c71ade4d55 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:32 -0800 Subject: bpf: Migrate bpf_wq_set_callback_impl() to KF_IMPLICIT_ARGS Implement bpf_wq_set_callback() with an implicit bpf_prog_aux argument, and remove bpf_wq_set_callback_impl(). Update special kfunc checks in the verifier accordingly. Reviewed-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-8-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 11 +++++------ kernel/bpf/verifier.c | 16 ++++++++-------- tools/testing/selftests/bpf/bpf_experimental.h | 5 ----- .../selftests/bpf/progs/verifier_async_cb_context.c | 4 ++-- tools/testing/selftests/bpf/progs/wq_failures.c | 4 ++-- 5 files changed, 17 insertions(+), 23 deletions(-) (limited to 'tools/testing') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9eaa4185e0a7..c76a9003b221 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3120,12 +3120,11 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) return 0; } -__bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, - int (callback_fn)(void *map, int *key, void *value), - unsigned int flags, - void *aux__prog) +__bpf_kfunc int bpf_wq_set_callback(struct bpf_wq *wq, + int (callback_fn)(void *map, int *key, void *value), + unsigned int flags, + struct bpf_prog_aux *aux) { - struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__prog; struct bpf_async_kern *async = (struct bpf_async_kern *)wq; if (flags) @@ -4488,7 +4487,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_memset) BTF_ID_FLAGS(func, bpf_modify_return_test_tp) #endif BTF_ID_FLAGS(func, bpf_wq_init) -BTF_ID_FLAGS(func, bpf_wq_set_callback_impl) +BTF_ID_FLAGS(func, bpf_wq_set_callback, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_wq_start) BTF_ID_FLAGS(func, bpf_preempt_disable) BTF_ID_FLAGS(func, bpf_preempt_enable) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index adc24a2ce5b6..51e8c9f70868 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -520,7 +520,7 @@ static bool is_async_callback_calling_kfunc(u32 btf_id); static bool is_callback_calling_kfunc(u32 btf_id); static bool is_bpf_throw_kfunc(struct bpf_insn *insn); -static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id); +static bool is_bpf_wq_set_callback_kfunc(u32 btf_id); static bool is_task_work_add_kfunc(u32 func_id); static bool is_sync_callback_calling_function(enum bpf_func_id func_id) @@ -562,7 +562,7 @@ static bool is_async_cb_sleepable(struct bpf_verifier_env *env, struct bpf_insn /* bpf_wq and bpf_task_work callbacks are always sleepable. */ if (bpf_pseudo_kfunc_call(insn) && insn->off == 0 && - (is_bpf_wq_set_callback_impl_kfunc(insn->imm) || is_task_work_add_kfunc(insn->imm))) + (is_bpf_wq_set_callback_kfunc(insn->imm) || is_task_work_add_kfunc(insn->imm))) return true; verifier_bug(env, "unhandled async callback in is_async_cb_sleepable"); @@ -12437,7 +12437,7 @@ enum special_kfunc_type { KF_bpf_percpu_obj_new_impl, KF_bpf_percpu_obj_drop_impl, KF_bpf_throw, - KF_bpf_wq_set_callback_impl, + KF_bpf_wq_set_callback, KF_bpf_preempt_disable, KF_bpf_preempt_enable, KF_bpf_iter_css_task_new, @@ -12501,7 +12501,7 @@ BTF_ID(func, bpf_dynptr_clone) BTF_ID(func, bpf_percpu_obj_new_impl) BTF_ID(func, bpf_percpu_obj_drop_impl) BTF_ID(func, bpf_throw) -BTF_ID(func, bpf_wq_set_callback_impl) +BTF_ID(func, bpf_wq_set_callback) BTF_ID(func, bpf_preempt_disable) BTF_ID(func, bpf_preempt_enable) #ifdef CONFIG_CGROUPS @@ -12994,7 +12994,7 @@ static bool is_sync_callback_calling_kfunc(u32 btf_id) static bool is_async_callback_calling_kfunc(u32 btf_id) { - return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl] || + return is_bpf_wq_set_callback_kfunc(btf_id) || is_task_work_add_kfunc(btf_id); } @@ -13004,9 +13004,9 @@ static bool is_bpf_throw_kfunc(struct bpf_insn *insn) insn->imm == special_kfunc_list[KF_bpf_throw]; } -static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id) +static bool is_bpf_wq_set_callback_kfunc(u32 btf_id) { - return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl]; + return btf_id == special_kfunc_list[KF_bpf_wq_set_callback]; } static bool is_callback_calling_kfunc(u32 btf_id) @@ -14085,7 +14085,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, meta.r0_rdonly = false; } - if (is_bpf_wq_set_callback_impl_kfunc(meta.func_id)) { + if (is_bpf_wq_set_callback_kfunc(meta.func_id)) { err = push_callback_call(env, insn, insn_idx, meta.subprogno, set_timer_callback_state); if (err) { diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 2cd9165c7348..68a49b1f77ae 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -580,11 +580,6 @@ extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym; extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym; extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym; -extern int bpf_wq_set_callback_impl(struct bpf_wq *wq, - int (callback_fn)(void *map, int *key, void *value), - unsigned int flags__k, void *aux__ign) __ksym; -#define bpf_wq_set_callback(timer, cb, flags) \ - bpf_wq_set_callback_impl(timer, cb, flags, NULL) struct bpf_iter_kmem_cache; extern int bpf_iter_kmem_cache_new(struct bpf_iter_kmem_cache *it) __weak __ksym; diff --git a/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c b/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c index 7efa9521105e..5d5e1cd4d51d 100644 --- a/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c +++ b/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c @@ -96,7 +96,7 @@ int wq_non_sleepable_prog(void *ctx) if (bpf_wq_init(&val->w, &wq_map, 0) != 0) return 0; - if (bpf_wq_set_callback_impl(&val->w, wq_cb, 0, NULL) != 0) + if (bpf_wq_set_callback(&val->w, wq_cb, 0) != 0) return 0; return 0; } @@ -114,7 +114,7 @@ int wq_sleepable_prog(void *ctx) if (bpf_wq_init(&val->w, &wq_map, 0) != 0) return 0; - if (bpf_wq_set_callback_impl(&val->w, wq_cb, 0, NULL) != 0) + if (bpf_wq_set_callback(&val->w, wq_cb, 0) != 0) return 0; return 0; } diff --git a/tools/testing/selftests/bpf/progs/wq_failures.c b/tools/testing/selftests/bpf/progs/wq_failures.c index d06f6d40594a..3767f5595bbc 100644 --- a/tools/testing/selftests/bpf/progs/wq_failures.c +++ b/tools/testing/selftests/bpf/progs/wq_failures.c @@ -97,7 +97,7 @@ __failure /* check that the first argument of bpf_wq_set_callback() * is a correct bpf_wq pointer. */ -__msg(": (85) call bpf_wq_set_callback_impl#") /* anchor message */ +__msg(": (85) call bpf_wq_set_callback#") /* anchor message */ __msg("arg#0 doesn't point to a map value") long test_wrong_wq_pointer(void *ctx) { @@ -123,7 +123,7 @@ __failure /* check that the first argument of bpf_wq_set_callback() * is a correct bpf_wq pointer. */ -__msg(": (85) call bpf_wq_set_callback_impl#") /* anchor message */ +__msg(": (85) call bpf_wq_set_callback#") /* anchor message */ __msg("off 1 doesn't point to 'struct bpf_wq' that is at 0") long test_wrong_wq_pointer_offset(void *ctx) { -- cgit v1.2.3 From 8157cc739ad301b7fb6dfc4cfc5497cedd33df4e Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:33 -0800 Subject: HID: Use bpf_wq_set_callback kernel function Remove extern declaration of bpf_wq_set_callback_impl() from hid_bpf_helpers.h and replace bpf_wq_set_callback macro with a corresponding new declaration. Tested with: # append tools/testing/selftests/hid/config and build the kernel $ make -C tools/testing/selftests/hid # in built kernel $ ./tools/testing/selftests/hid/hid_bpf -t test_multiply_events_wq TAP version 13 1..1 # Starting 1 tests from 1 test cases. # RUN hid_bpf.test_multiply_events_wq ... [ 2.575520] hid-generic 0003:0001:0A36.0001: hidraw0: USB HID v0.00 Device [test-uhid-device-138] on 138 # OK hid_bpf.test_multiply_events_wq ok 1 hid_bpf.test_multiply_events_wq # PASSED: 1 / 1 tests passed. # Totals: pass:1 fail:0 xfail:0 xpass:0 skip:0 error:0 PASS Acked-by: Benjamin Tissoires Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-9-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- drivers/hid/bpf/progs/hid_bpf_helpers.h | 8 +++----- tools/testing/selftests/hid/progs/hid_bpf_helpers.h | 8 +++----- 2 files changed, 6 insertions(+), 10 deletions(-) (limited to 'tools/testing') diff --git a/drivers/hid/bpf/progs/hid_bpf_helpers.h b/drivers/hid/bpf/progs/hid_bpf_helpers.h index bf19785a6b06..228f8d787567 100644 --- a/drivers/hid/bpf/progs/hid_bpf_helpers.h +++ b/drivers/hid/bpf/progs/hid_bpf_helpers.h @@ -33,11 +33,9 @@ extern int hid_bpf_try_input_report(struct hid_bpf_ctx *ctx, /* bpf_wq implementation */ extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym; extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym; -extern int bpf_wq_set_callback_impl(struct bpf_wq *wq, - int (callback_fn)(void *map, int *key, void *value), - unsigned int flags__k, void *aux__ign) __ksym; -#define bpf_wq_set_callback(wq, cb, flags) \ - bpf_wq_set_callback_impl(wq, cb, flags, NULL) +extern int bpf_wq_set_callback(struct bpf_wq *wq, + int (*callback_fn)(void *, int *, void *), + unsigned int flags) __weak __ksym; #define HID_MAX_DESCRIPTOR_SIZE 4096 #define HID_IGNORE_EVENT -1 diff --git a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h index 531228b849da..80ab60905865 100644 --- a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h +++ b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h @@ -116,10 +116,8 @@ extern int hid_bpf_try_input_report(struct hid_bpf_ctx *ctx, /* bpf_wq implementation */ extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym; extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym; -extern int bpf_wq_set_callback_impl(struct bpf_wq *wq, - int (callback_fn)(void *map, int *key, void *wq), - unsigned int flags__k, void *aux__ign) __weak __ksym; -#define bpf_wq_set_callback(timer, cb, flags) \ - bpf_wq_set_callback_impl(timer, cb, flags, NULL) +extern int bpf_wq_set_callback(struct bpf_wq *wq, + int (*callback_fn)(void *, int *, void *), + unsigned int flags) __weak __ksym; #endif /* __HID_BPF_HELPERS_H */ -- cgit v1.2.3 From 6e663ffdf7600168338fdfa2fd1eed83395d58a3 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:34 -0800 Subject: bpf: Migrate bpf_task_work_schedule_* kfuncs to KF_IMPLICIT_ARGS Implement bpf_task_work_schedule_* with an implicit bpf_prog_aux argument, and remove corresponding _impl funcs from the kernel. Update special kfunc checks in the verifier accordingly. Update the selftests to use the new API with implicit argument. Reviewed-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-10-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 30 ++++++++++------------ kernel/bpf/verifier.c | 12 ++++----- tools/testing/selftests/bpf/progs/file_reader.c | 2 +- tools/testing/selftests/bpf/progs/task_work.c | 7 +++-- tools/testing/selftests/bpf/progs/task_work_fail.c | 8 +++--- .../testing/selftests/bpf/progs/task_work_stress.c | 4 +-- .../bpf/progs/verifier_async_cb_context.c | 4 +-- 7 files changed, 32 insertions(+), 35 deletions(-) (limited to 'tools/testing') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index c76a9003b221..f2f974b5fb3b 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4274,41 +4274,39 @@ release_prog: } /** - * bpf_task_work_schedule_signal_impl - Schedule BPF callback using task_work_add with TWA_SIGNAL + * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL * mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values * @callback: pointer to BPF subprogram to call - * @aux__prog: user should pass NULL + * @aux: pointer to bpf_prog_aux of the caller BPF program, implicitly set by the verifier * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ -__bpf_kfunc int bpf_task_work_schedule_signal_impl(struct task_struct *task, - struct bpf_task_work *tw, void *map__map, - bpf_task_work_callback_t callback, - void *aux__prog) +__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw, + void *map__map, bpf_task_work_callback_t callback, + struct bpf_prog_aux *aux) { - return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL); + return bpf_task_work_schedule(task, tw, map__map, callback, aux, TWA_SIGNAL); } /** - * bpf_task_work_schedule_resume_impl - Schedule BPF callback using task_work_add with TWA_RESUME + * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME * mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values * @callback: pointer to BPF subprogram to call - * @aux__prog: user should pass NULL + * @aux: pointer to bpf_prog_aux of the caller BPF program, implicitly set by the verifier * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ -__bpf_kfunc int bpf_task_work_schedule_resume_impl(struct task_struct *task, - struct bpf_task_work *tw, void *map__map, - bpf_task_work_callback_t callback, - void *aux__prog) +__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw, + void *map__map, bpf_task_work_callback_t callback, + struct bpf_prog_aux *aux) { - return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME); + return bpf_task_work_schedule(task, tw, map__map, callback, aux, TWA_RESUME); } static int make_file_dynptr(struct file *file, u32 flags, bool may_sleep, @@ -4536,8 +4534,8 @@ BTF_ID_FLAGS(func, bpf_strncasestr); BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif BTF_ID_FLAGS(func, bpf_stream_vprintk_impl) -BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl) -BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl) +BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_dynptr_from_file) BTF_ID_FLAGS(func, bpf_dynptr_file_discard) BTF_KFUNCS_END(common_btf_ids) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 51e8c9f70868..8e8570e9d167 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12457,8 +12457,8 @@ enum special_kfunc_type { KF_bpf_dynptr_from_file, KF_bpf_dynptr_file_discard, KF___bpf_trap, - KF_bpf_task_work_schedule_signal_impl, - KF_bpf_task_work_schedule_resume_impl, + KF_bpf_task_work_schedule_signal, + KF_bpf_task_work_schedule_resume, KF_bpf_arena_alloc_pages, KF_bpf_arena_free_pages, KF_bpf_arena_reserve_pages, @@ -12534,16 +12534,16 @@ BTF_ID(func, bpf_res_spin_unlock_irqrestore) BTF_ID(func, bpf_dynptr_from_file) BTF_ID(func, bpf_dynptr_file_discard) BTF_ID(func, __bpf_trap) -BTF_ID(func, bpf_task_work_schedule_signal_impl) -BTF_ID(func, bpf_task_work_schedule_resume_impl) +BTF_ID(func, bpf_task_work_schedule_signal) +BTF_ID(func, bpf_task_work_schedule_resume) BTF_ID(func, bpf_arena_alloc_pages) BTF_ID(func, bpf_arena_free_pages) BTF_ID(func, bpf_arena_reserve_pages) static bool is_task_work_add_kfunc(u32 func_id) { - return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal_impl] || - func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume_impl]; + return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] || + func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume]; } static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) diff --git a/tools/testing/selftests/bpf/progs/file_reader.c b/tools/testing/selftests/bpf/progs/file_reader.c index 4d756b623557..462712ff3b8a 100644 --- a/tools/testing/selftests/bpf/progs/file_reader.c +++ b/tools/testing/selftests/bpf/progs/file_reader.c @@ -77,7 +77,7 @@ int on_open_validate_file_read(void *c) err = 1; return 0; } - bpf_task_work_schedule_signal_impl(task, &work->tw, &arrmap, task_work_callback, NULL); + bpf_task_work_schedule_signal(task, &work->tw, &arrmap, task_work_callback); return 0; } diff --git a/tools/testing/selftests/bpf/progs/task_work.c b/tools/testing/selftests/bpf/progs/task_work.c index 663a80990f8f..a6009d105158 100644 --- a/tools/testing/selftests/bpf/progs/task_work.c +++ b/tools/testing/selftests/bpf/progs/task_work.c @@ -65,8 +65,7 @@ int oncpu_hash_map(struct pt_regs *args) work = bpf_map_lookup_elem(&hmap, &key); if (!work) return 0; - - bpf_task_work_schedule_resume_impl(task, &work->tw, &hmap, process_work, NULL); + bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work); return 0; } @@ -80,7 +79,7 @@ int oncpu_array_map(struct pt_regs *args) work = bpf_map_lookup_elem(&arrmap, &key); if (!work) return 0; - bpf_task_work_schedule_signal_impl(task, &work->tw, &arrmap, process_work, NULL); + bpf_task_work_schedule_signal(task, &work->tw, &arrmap, process_work); return 0; } @@ -102,6 +101,6 @@ int oncpu_lru_map(struct pt_regs *args) work = bpf_map_lookup_elem(&lrumap, &key); if (!work || work->data[0]) return 0; - bpf_task_work_schedule_resume_impl(task, &work->tw, &lrumap, process_work, NULL); + bpf_task_work_schedule_resume(task, &work->tw, &lrumap, process_work); return 0; } diff --git a/tools/testing/selftests/bpf/progs/task_work_fail.c b/tools/testing/selftests/bpf/progs/task_work_fail.c index 1270953fd092..82e4b8913333 100644 --- a/tools/testing/selftests/bpf/progs/task_work_fail.c +++ b/tools/testing/selftests/bpf/progs/task_work_fail.c @@ -53,7 +53,7 @@ int mismatch_map(struct pt_regs *args) work = bpf_map_lookup_elem(&arrmap, &key); if (!work) return 0; - bpf_task_work_schedule_resume_impl(task, &work->tw, &hmap, process_work, NULL); + bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work); return 0; } @@ -65,7 +65,7 @@ int no_map_task_work(struct pt_regs *args) struct bpf_task_work tw; task = bpf_get_current_task_btf(); - bpf_task_work_schedule_resume_impl(task, &tw, &hmap, process_work, NULL); + bpf_task_work_schedule_resume(task, &tw, &hmap, process_work); return 0; } @@ -76,7 +76,7 @@ int task_work_null(struct pt_regs *args) struct task_struct *task; task = bpf_get_current_task_btf(); - bpf_task_work_schedule_resume_impl(task, NULL, &hmap, process_work, NULL); + bpf_task_work_schedule_resume(task, NULL, &hmap, process_work); return 0; } @@ -91,6 +91,6 @@ int map_null(struct pt_regs *args) work = bpf_map_lookup_elem(&arrmap, &key); if (!work) return 0; - bpf_task_work_schedule_resume_impl(task, &work->tw, NULL, process_work, NULL); + bpf_task_work_schedule_resume(task, &work->tw, NULL, process_work); return 0; } diff --git a/tools/testing/selftests/bpf/progs/task_work_stress.c b/tools/testing/selftests/bpf/progs/task_work_stress.c index 55e555f7f41b..1d4378f351ef 100644 --- a/tools/testing/selftests/bpf/progs/task_work_stress.c +++ b/tools/testing/selftests/bpf/progs/task_work_stress.c @@ -51,8 +51,8 @@ int schedule_task_work(void *ctx) if (!work) return 0; } - err = bpf_task_work_schedule_signal_impl(bpf_get_current_task_btf(), &work->tw, &hmap, - process_work, NULL); + err = bpf_task_work_schedule_signal(bpf_get_current_task_btf(), &work->tw, &hmap, + process_work); if (err) __sync_fetch_and_add(&schedule_error, 1); else diff --git a/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c b/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c index 5d5e1cd4d51d..39aff82549c9 100644 --- a/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c +++ b/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c @@ -156,7 +156,7 @@ int task_work_non_sleepable_prog(void *ctx) if (!task) return 0; - bpf_task_work_schedule_resume_impl(task, &val->tw, &task_work_map, task_work_cb, NULL); + bpf_task_work_schedule_resume(task, &val->tw, &task_work_map, task_work_cb); return 0; } @@ -176,6 +176,6 @@ int task_work_sleepable_prog(void *ctx) if (!task) return 0; - bpf_task_work_schedule_resume_impl(task, &val->tw, &task_work_map, task_work_cb, NULL); + bpf_task_work_schedule_resume(task, &val->tw, &task_work_map, task_work_cb); return 0; } -- cgit v1.2.3 From d806f3101276a1ed18d963944580e1ee1c7a3d26 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:35 -0800 Subject: bpf: Migrate bpf_stream_vprintk() to KF_IMPLICIT_ARGS Implement bpf_stream_vprintk with an implicit bpf_prog_aux argument, and remote bpf_stream_vprintk_impl from the kernel. Update the selftests to use the new API with implicit argument. bpf_stream_vprintk macro is changed to use the new bpf_stream_vprintk kfunc, and the extern definition of bpf_stream_vprintk_impl is replaced accordingly. Reviewed-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-11-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 2 +- kernel/bpf/stream.c | 5 ++--- tools/lib/bpf/bpf_helpers.h | 6 +++--- tools/testing/selftests/bpf/progs/stream_fail.c | 6 +++--- 4 files changed, 9 insertions(+), 10 deletions(-) (limited to 'tools/testing') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f2f974b5fb3b..f8aa1320e2f7 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4533,7 +4533,7 @@ BTF_ID_FLAGS(func, bpf_strncasestr); #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif -BTF_ID_FLAGS(func, bpf_stream_vprintk_impl) +BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_dynptr_from_file) diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c index 0b6bc3f30335..24730df55e69 100644 --- a/kernel/bpf/stream.c +++ b/kernel/bpf/stream.c @@ -212,14 +212,13 @@ __bpf_kfunc_start_defs(); * Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the * enum in headers. */ -__bpf_kfunc int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args, - u32 len__sz, void *aux__prog) +__bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, + u32 len__sz, struct bpf_prog_aux *aux) { struct bpf_bprintf_data data = { .get_bin_args = true, .get_buf = true, }; - struct bpf_prog_aux *aux = aux__prog; u32 fmt_size = strlen(fmt__str) + 1; struct bpf_stream *stream; u32 data_len = len__sz; diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index d4e4e388e625..c145da05a67c 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -315,8 +315,8 @@ enum libbpf_tristate { ___param, sizeof(___param)); \ }) -extern int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args, - __u32 len__sz, void *aux__prog) __weak __ksym; +extern int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, + __u32 len__sz) __weak __ksym; #define bpf_stream_printk(stream_id, fmt, args...) \ ({ \ @@ -328,7 +328,7 @@ extern int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const vo ___bpf_fill(___param, args); \ _Pragma("GCC diagnostic pop") \ \ - bpf_stream_vprintk_impl(stream_id, ___fmt, ___param, sizeof(___param), NULL); \ + bpf_stream_vprintk(stream_id, ___fmt, ___param, sizeof(___param)); \ }) /* Use __bpf_printk when bpf_printk call has 3 or fewer fmt args diff --git a/tools/testing/selftests/bpf/progs/stream_fail.c b/tools/testing/selftests/bpf/progs/stream_fail.c index 3662515f0107..8e8249f3521c 100644 --- a/tools/testing/selftests/bpf/progs/stream_fail.c +++ b/tools/testing/selftests/bpf/progs/stream_fail.c @@ -10,7 +10,7 @@ SEC("syscall") __failure __msg("Possibly NULL pointer passed") int stream_vprintk_null_arg(void *ctx) { - bpf_stream_vprintk_impl(BPF_STDOUT, "", NULL, 0, NULL); + bpf_stream_vprintk(BPF_STDOUT, "", NULL, 0); return 0; } @@ -18,7 +18,7 @@ SEC("syscall") __failure __msg("R3 type=scalar expected=") int stream_vprintk_scalar_arg(void *ctx) { - bpf_stream_vprintk_impl(BPF_STDOUT, "", (void *)46, 0, NULL); + bpf_stream_vprintk(BPF_STDOUT, "", (void *)46, 0); return 0; } @@ -26,7 +26,7 @@ SEC("syscall") __failure __msg("arg#1 doesn't point to a const string") int stream_vprintk_string_arg(void *ctx) { - bpf_stream_vprintk_impl(BPF_STDOUT, ctx, NULL, 0, NULL); + bpf_stream_vprintk(BPF_STDOUT, ctx, NULL, 0); return 0; } -- cgit v1.2.3 From bd06b977e02d80fe0dec303cc219d007121a4526 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:36 -0800 Subject: selftests/bpf: Migrate struct_ops_assoc test to KF_IMPLICIT_ARGS A test kfunc named bpf_kfunc_multi_st_ops_test_1_impl() is a user of __prog suffix. Subsequent patch removes __prog support in favor of KF_IMPLICIT_ARGS, so migrate this kfunc to use implicit argument. Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-12-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/struct_ops_assoc.c | 8 ++++---- tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c | 4 ++-- tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c | 6 +++--- tools/testing/selftests/bpf/test_kmods/bpf_testmod.c | 9 ++++----- tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h | 6 ++++-- 5 files changed, 17 insertions(+), 16 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/progs/struct_ops_assoc.c b/tools/testing/selftests/bpf/progs/struct_ops_assoc.c index 8f1097903e22..68842e3f936b 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_assoc.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_assoc.c @@ -32,7 +32,7 @@ int BPF_PROG(sys_enter_prog_a, struct pt_regs *regs, long id) if (!test_pid || task->pid != test_pid) return 0; - ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); if (ret != MAP_A_MAGIC) test_err_a++; @@ -45,7 +45,7 @@ int syscall_prog_a(void *ctx) struct st_ops_args args = {}; int ret; - ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); if (ret != MAP_A_MAGIC) test_err_a++; @@ -79,7 +79,7 @@ int BPF_PROG(sys_enter_prog_b, struct pt_regs *regs, long id) if (!test_pid || task->pid != test_pid) return 0; - ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); if (ret != MAP_B_MAGIC) test_err_b++; @@ -92,7 +92,7 @@ int syscall_prog_b(void *ctx) struct st_ops_args args = {}; int ret; - ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); if (ret != MAP_B_MAGIC) test_err_b++; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c b/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c index d5a2ea934284..0bed49e9f217 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c @@ -31,7 +31,7 @@ __noinline static int timer_cb(void *map, int *key, struct bpf_timer *timer) struct st_ops_args args = {}; recur++; - timer_test_1_ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + timer_test_1_ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); recur--; timer_cb_run++; @@ -64,7 +64,7 @@ int syscall_prog(void *ctx) struct st_ops_args args = {}; int ret; - ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); if (ret != MAP_MAGIC) test_err++; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c b/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c index 5bb6ebf5eed4..396b3e58c729 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c @@ -23,7 +23,7 @@ int BPF_PROG(test_1_a, struct st_ops_args *args) if (!recur) { recur++; - ret = bpf_kfunc_multi_st_ops_test_1_impl(args, NULL); + ret = bpf_kfunc_multi_st_ops_test_1_assoc(args); if (ret != -1) test_err_a++; recur--; @@ -40,7 +40,7 @@ int syscall_prog_a(void *ctx) struct st_ops_args args = {}; int ret; - ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); if (ret != MAP_A_MAGIC) test_err_a++; @@ -62,7 +62,7 @@ int syscall_prog_b(void *ctx) struct st_ops_args args = {}; int ret; - ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); if (ret != MAP_A_MAGIC) test_err_b++; diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index a996b816ecc4..0d542ba64365 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -1140,7 +1140,7 @@ __bpf_kfunc int bpf_kfunc_st_ops_inc10(struct st_ops_args *args) } __bpf_kfunc int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id); -__bpf_kfunc int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux_prog); +__bpf_kfunc int bpf_kfunc_multi_st_ops_test_1_assoc(struct st_ops_args *args, struct bpf_prog_aux *aux); __bpf_kfunc int bpf_kfunc_implicit_arg(int a, struct bpf_prog_aux *aux); __bpf_kfunc int bpf_kfunc_implicit_arg_legacy(int a, int b, struct bpf_prog_aux *aux); @@ -1187,7 +1187,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10) BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1) -BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_impl) +BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_assoc, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg_legacy, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg_legacy_impl) @@ -1669,13 +1669,12 @@ int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) } /* Call test_1() of the associated struct_ops map */ -int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux__prog) +int bpf_kfunc_multi_st_ops_test_1_assoc(struct st_ops_args *args, struct bpf_prog_aux *aux) { - struct bpf_prog_aux *prog_aux = (struct bpf_prog_aux *)aux__prog; struct bpf_testmod_multi_st_ops *st_ops; int ret = -1; - st_ops = (struct bpf_testmod_multi_st_ops *)bpf_prog_get_assoc_struct_ops(prog_aux); + st_ops = (struct bpf_testmod_multi_st_ops *)bpf_prog_get_assoc_struct_ops(aux); if (st_ops) ret = st_ops->test_1(args); diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h index 2357a0340ffe..225ea30c4e3d 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h @@ -161,7 +161,9 @@ void bpf_kfunc_rcu_task_test(struct task_struct *ptr) __ksym; struct task_struct *bpf_kfunc_ret_rcu_test(void) __ksym; int *bpf_kfunc_ret_rcu_test_nostruct(int rdonly_buf_size) __ksym; -int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) __ksym; -int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux__prog) __ksym; +#ifndef __KERNEL__ +extern int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) __weak __ksym; +extern int bpf_kfunc_multi_st_ops_test_1_assoc(struct st_ops_args *args) __weak __ksym; +#endif #endif /* _BPF_TESTMOD_KFUNC_H */ -- cgit v1.2.3 From 44fdd581d27366092e162b42f025d75d5a16c851 Mon Sep 17 00:00:00 2001 From: Yazhou Tang Date: Mon, 19 Jan 2026 16:54:57 +0800 Subject: bpf: Add range tracking for BPF_DIV and BPF_MOD This patch implements range tracking (interval analysis) for BPF_DIV and BPF_MOD operations when the divisor is a constant, covering both signed and unsigned variants. While LLVM typically optimizes integer division and modulo by constants into multiplication and shift sequences, this optimization is less effective for the BPF target when dealing with 64-bit arithmetic. Currently, the verifier does not track bounds for scalar division or modulo, treating the result as "unbounded". This leads to false positive rejections for safe code patterns. For example, the following code (compiled with -O2): ```c int test(struct pt_regs *ctx) { char buffer[6] = {1}; __u64 x = bpf_ktime_get_ns(); __u64 res = x % sizeof(buffer); char value = buffer[res]; bpf_printk("res = %llu, val = %d", res, value); return 0; } ``` Generates a raw `BPF_MOD64` instruction: ```asm ; __u64 res = x % sizeof(buffer); 1: 97 00 00 00 06 00 00 00 r0 %= 0x6 ; char value = buffer[res]; 2: 18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r1 = 0x0 ll 4: 0f 01 00 00 00 00 00 00 r1 += r0 5: 91 14 00 00 00 00 00 00 r4 = *(s8 *)(r1 + 0x0) ``` Without this patch, the verifier fails with "math between map_value pointer and register with unbounded min value is not allowed" because it cannot deduce that `r0` is within [0, 5]. According to the BPF instruction set[1], the instruction's offset field (`insn->off`) is used to distinguish between signed (`off == 1`) and unsigned division (`off == 0`). Moreover, we also follow the BPF division and modulo runtime behavior (semantics) to handle special cases, such as division by zero and signed division overflow. - UDIV: dst = (src != 0) ? (dst / src) : 0 - SDIV: dst = (src == 0) ? 0 : ((src == -1 && dst == LLONG_MIN) ? LLONG_MIN : (dst / src)) - UMOD: dst = (src != 0) ? (dst % src) : dst - SMOD: dst = (src == 0) ? dst : ((src == -1 && dst == LLONG_MIN) ? 0: (dst s% src)) Here is the overview of the changes made in this patch (See the code comments for more details and examples): 1. For BPF_DIV: Firstly check whether the divisor is zero. If so, set the destination register to zero (matching runtime behavior). For non-zero constant divisors: goto `scalar(32)?_min_max_(u|s)div` functions. - General cases: compute the new range by dividing max_dividend and min_dividend by the constant divisor. - Overflow case (SIGNED_MIN / -1) in signed division: mark the result as unbounded if the dividend is not a single number. 2. For BPF_MOD: Firstly check whether the divisor is zero. If so, leave the destination register unchanged (matching runtime behavior). For non-zero constant divisors: goto `scalar(32)?_min_max_(u|s)mod` functions. - General case: For signed modulo, the result's sign matches the dividend's sign. And the result's absolute value is strictly bounded by `min(abs(dividend), abs(divisor) - 1)`. - Special care is taken when the divisor is SIGNED_MIN. By casting to unsigned before negation and subtracting 1, we avoid signed overflow and correctly calculate the maximum possible magnitude (`res_max_abs` in the code). - "Small dividend" case: If the dividend is already within the possible result range (e.g., [-2, 5] % 10), the operation is an identity function, and the destination register remains unchanged. 3. In `scalar(32)?_min_max_(u|s)(div|mod)` functions: After updating current range, reset other ranges and tnum to unbounded/unknown. e.g., in `scalar_min_max_sdiv`, signed 64-bit range is updated. Then reset unsigned 64-bit range and 32-bit range to unbounded, and tnum to unknown. Exception: in BPF_MOD's "small dividend" case, since the result remains unchanged, we do not reset other ranges/tnum. 4. Also updated existing selftests based on the expected BPF_DIV and BPF_MOD behavior. [1] https://www.kernel.org/doc/Documentation/bpf/standardization/instruction-set.rst Co-developed-by: Shenghao Yuan Signed-off-by: Shenghao Yuan Co-developed-by: Tianci Cao Signed-off-by: Tianci Cao Signed-off-by: Yazhou Tang Tested-by: syzbot@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/20260119085458.182221-2-tangyazhou@zju.edu.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 299 +++++++++++++++++++++ .../bpf/progs/verifier_value_illegal_alu.c | 7 +- tools/testing/selftests/bpf/verifier/precise.c | 4 +- 3 files changed, 305 insertions(+), 5 deletions(-) (limited to 'tools/testing') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 919556614505..f11dc5366e5b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2349,6 +2349,18 @@ static void __mark_reg32_unbounded(struct bpf_reg_state *reg) reg->u32_max_value = U32_MAX; } +static void reset_reg64_and_tnum(struct bpf_reg_state *reg) +{ + __mark_reg64_unbounded(reg); + reg->var_off = tnum_unknown; +} + +static void reset_reg32_and_tnum(struct bpf_reg_state *reg) +{ + __mark_reg32_unbounded(reg); + reg->var_off = tnum_unknown; +} + static void __update_reg32_bounds(struct bpf_reg_state *reg) { struct tnum var32_off = tnum_subreg(reg->var_off); @@ -15159,6 +15171,252 @@ static void scalar_min_max_mul(struct bpf_reg_state *dst_reg, } } +static void scalar32_min_max_udiv(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u32 *dst_umin = &dst_reg->u32_min_value; + u32 *dst_umax = &dst_reg->u32_max_value; + u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */ + + *dst_umin = *dst_umin / src_val; + *dst_umax = *dst_umax / src_val; + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + reset_reg64_and_tnum(dst_reg); +} + +static void scalar_min_max_udiv(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u64 *dst_umin = &dst_reg->umin_value; + u64 *dst_umax = &dst_reg->umax_value; + u64 src_val = src_reg->umin_value; /* non-zero, const divisor */ + + *dst_umin = div64_u64(*dst_umin, src_val); + *dst_umax = div64_u64(*dst_umax, src_val); + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + reset_reg32_and_tnum(dst_reg); +} + +static void scalar32_min_max_sdiv(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s32 *dst_smin = &dst_reg->s32_min_value; + s32 *dst_smax = &dst_reg->s32_max_value; + s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */ + s32 res1, res2; + + /* BPF div specification: S32_MIN / -1 = S32_MIN */ + if (*dst_smin == S32_MIN && src_val == -1) { + /* + * If the dividend range contains more than just S32_MIN, + * we cannot precisely track the result, so it becomes unbounded. + * e.g., [S32_MIN, S32_MIN+10]/(-1), + * = {S32_MIN} U [-(S32_MIN+10), -(S32_MIN+1)] + * = {S32_MIN} U [S32_MAX-9, S32_MAX] = [S32_MIN, S32_MAX] + * Otherwise (if dividend is exactly S32_MIN), result remains S32_MIN. + */ + if (*dst_smax != S32_MIN) { + *dst_smin = S32_MIN; + *dst_smax = S32_MAX; + } + goto reset; + } + + res1 = *dst_smin / src_val; + res2 = *dst_smax / src_val; + *dst_smin = min(res1, res2); + *dst_smax = max(res1, res2); + +reset: + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->u32_min_value = 0; + dst_reg->u32_max_value = U32_MAX; + reset_reg64_and_tnum(dst_reg); +} + +static void scalar_min_max_sdiv(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s64 *dst_smin = &dst_reg->smin_value; + s64 *dst_smax = &dst_reg->smax_value; + s64 src_val = src_reg->smin_value; /* non-zero, const divisor */ + s64 res1, res2; + + /* BPF div specification: S64_MIN / -1 = S64_MIN */ + if (*dst_smin == S64_MIN && src_val == -1) { + /* + * If the dividend range contains more than just S64_MIN, + * we cannot precisely track the result, so it becomes unbounded. + * e.g., [S64_MIN, S64_MIN+10]/(-1), + * = {S64_MIN} U [-(S64_MIN+10), -(S64_MIN+1)] + * = {S64_MIN} U [S64_MAX-9, S64_MAX] = [S64_MIN, S64_MAX] + * Otherwise (if dividend is exactly S64_MIN), result remains S64_MIN. + */ + if (*dst_smax != S64_MIN) { + *dst_smin = S64_MIN; + *dst_smax = S64_MAX; + } + goto reset; + } + + res1 = div64_s64(*dst_smin, src_val); + res2 = div64_s64(*dst_smax, src_val); + *dst_smin = min(res1, res2); + *dst_smax = max(res1, res2); + +reset: + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + reset_reg32_and_tnum(dst_reg); +} + +static void scalar32_min_max_umod(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u32 *dst_umin = &dst_reg->u32_min_value; + u32 *dst_umax = &dst_reg->u32_max_value; + u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */ + u32 res_max = src_val - 1; + + /* + * If dst_umax <= res_max, the result remains unchanged. + * e.g., [2, 5] % 10 = [2, 5]. + */ + if (*dst_umax <= res_max) + return; + + *dst_umin = 0; + *dst_umax = min(*dst_umax, res_max); + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + reset_reg64_and_tnum(dst_reg); +} + +static void scalar_min_max_umod(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u64 *dst_umin = &dst_reg->umin_value; + u64 *dst_umax = &dst_reg->umax_value; + u64 src_val = src_reg->umin_value; /* non-zero, const divisor */ + u64 res_max = src_val - 1; + + /* + * If dst_umax <= res_max, the result remains unchanged. + * e.g., [2, 5] % 10 = [2, 5]. + */ + if (*dst_umax <= res_max) + return; + + *dst_umin = 0; + *dst_umax = min(*dst_umax, res_max); + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + reset_reg32_and_tnum(dst_reg); +} + +static void scalar32_min_max_smod(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s32 *dst_smin = &dst_reg->s32_min_value; + s32 *dst_smax = &dst_reg->s32_max_value; + s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */ + + /* + * Safe absolute value calculation: + * If src_val == S32_MIN (-2147483648), src_abs becomes 2147483648. + * Here use unsigned integer to avoid overflow. + */ + u32 src_abs = (src_val > 0) ? (u32)src_val : -(u32)src_val; + + /* + * Calculate the maximum possible absolute value of the result. + * Even if src_abs is 2147483648 (S32_MIN), subtracting 1 gives + * 2147483647 (S32_MAX), which fits perfectly in s32. + */ + s32 res_max_abs = src_abs - 1; + + /* + * If the dividend is already within the result range, + * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5]. + */ + if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs) + return; + + /* General case: result has the same sign as the dividend. */ + if (*dst_smin >= 0) { + *dst_smin = 0; + *dst_smax = min(*dst_smax, res_max_abs); + } else if (*dst_smax <= 0) { + *dst_smax = 0; + *dst_smin = max(*dst_smin, -res_max_abs); + } else { + *dst_smin = -res_max_abs; + *dst_smax = res_max_abs; + } + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->u32_min_value = 0; + dst_reg->u32_max_value = U32_MAX; + reset_reg64_and_tnum(dst_reg); +} + +static void scalar_min_max_smod(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s64 *dst_smin = &dst_reg->smin_value; + s64 *dst_smax = &dst_reg->smax_value; + s64 src_val = src_reg->smin_value; /* non-zero, const divisor */ + + /* + * Safe absolute value calculation: + * If src_val == S64_MIN (-2^63), src_abs becomes 2^63. + * Here use unsigned integer to avoid overflow. + */ + u64 src_abs = (src_val > 0) ? (u64)src_val : -(u64)src_val; + + /* + * Calculate the maximum possible absolute value of the result. + * Even if src_abs is 2^63 (S64_MIN), subtracting 1 gives + * 2^63 - 1 (S64_MAX), which fits perfectly in s64. + */ + s64 res_max_abs = src_abs - 1; + + /* + * If the dividend is already within the result range, + * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5]. + */ + if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs) + return; + + /* General case: result has the same sign as the dividend. */ + if (*dst_smin >= 0) { + *dst_smin = 0; + *dst_smax = min(*dst_smax, res_max_abs); + } else if (*dst_smax <= 0) { + *dst_smax = 0; + *dst_smin = max(*dst_smin, -res_max_abs); + } else { + *dst_smin = -res_max_abs; + *dst_smax = res_max_abs; + } + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + reset_reg32_and_tnum(dst_reg); +} + static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { @@ -15564,6 +15822,14 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, case BPF_MUL: return true; + /* + * Division and modulo operators range is only safe to compute when the + * divisor is a constant. + */ + case BPF_DIV: + case BPF_MOD: + return src_is_const; + /* Shift operators range is only computable if shift dimension operand * is a constant. Shifts greater than 31 or 63 are undefined. This * includes shifts by a negative number. @@ -15616,6 +15882,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state src_reg) { u8 opcode = BPF_OP(insn->code); + s16 off = insn->off; bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64); int ret; @@ -15667,6 +15934,38 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, scalar32_min_max_mul(dst_reg, &src_reg); scalar_min_max_mul(dst_reg, &src_reg); break; + case BPF_DIV: + /* BPF div specification: x / 0 = 0 */ + if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0)) { + ___mark_reg_known(dst_reg, 0); + break; + } + if (alu32) + if (off == 1) + scalar32_min_max_sdiv(dst_reg, &src_reg); + else + scalar32_min_max_udiv(dst_reg, &src_reg); + else + if (off == 1) + scalar_min_max_sdiv(dst_reg, &src_reg); + else + scalar_min_max_udiv(dst_reg, &src_reg); + break; + case BPF_MOD: + /* BPF mod specification: x % 0 = x */ + if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0)) + break; + if (alu32) + if (off == 1) + scalar32_min_max_smod(dst_reg, &src_reg); + else + scalar32_min_max_umod(dst_reg, &src_reg); + else + if (off == 1) + scalar_min_max_smod(dst_reg, &src_reg); + else + scalar_min_max_umod(dst_reg, &src_reg); + break; case BPF_AND: if (tnum_is_const(src_reg.var_off)) { ret = maybe_fork_scalars(env, insn, dst_reg); diff --git a/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c b/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c index 2129e4353fd9..4d8273c258d5 100644 --- a/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c +++ b/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c @@ -173,14 +173,15 @@ __naked void flow_keys_illegal_variable_offset_alu(void) asm volatile(" \ r6 = r1; \ r7 = *(u64*)(r6 + %[flow_keys_off]); \ - r8 = 8; \ - r8 /= 1; \ + call %[bpf_get_prandom_u32]; \ + r8 = r0; \ r8 &= 8; \ r7 += r8; \ r0 = *(u64*)(r7 + 0); \ exit; \ " : - : __imm_const(flow_keys_off, offsetof(struct __sk_buff, flow_keys)) + : __imm_const(flow_keys_off, offsetof(struct __sk_buff, flow_keys)), + __imm(bpf_get_prandom_u32) : __clobber_all); } diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c index 59a020c35647..061d98f6e9bb 100644 --- a/tools/testing/selftests/bpf/verifier/precise.c +++ b/tools/testing/selftests/bpf/verifier/precise.c @@ -229,11 +229,11 @@ { "precise: program doesn't prematurely prune branches", .insns = { - BPF_ALU64_IMM(BPF_MOV, BPF_REG_6, 0x400), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32), + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_0), BPF_ALU64_IMM(BPF_MOV, BPF_REG_7, 0), BPF_ALU64_IMM(BPF_MOV, BPF_REG_8, 0), BPF_ALU64_IMM(BPF_MOV, BPF_REG_9, 0x80000000), - BPF_ALU64_IMM(BPF_MOD, BPF_REG_6, 0x401), BPF_JMP_IMM(BPF_JA, 0, 0, 0), BPF_JMP_REG(BPF_JLE, BPF_REG_6, BPF_REG_9, 2), BPF_ALU64_IMM(BPF_MOD, BPF_REG_6, 1), -- cgit v1.2.3 From c9e440bf25a712d906c40ba3ef831f3f0ccc6a1b Mon Sep 17 00:00:00 2001 From: Yazhou Tang Date: Mon, 19 Jan 2026 16:54:58 +0800 Subject: selftests/bpf: Add tests for BPF_DIV and BPF_MOD range tracking Now BPF_DIV has range tracking support via interval analysis. This patch adds selftests to cover various cases of BPF_DIV and BPF_MOD operations when the divisor is a constant, also covering both signed and unsigned variants. This patch includes several types of tests in 32-bit and 64-bit variants: 1. For UDIV - positive divisor - zero divisor 2. For SDIV - positive divisor, positive dividend - positive divisor, negative dividend - positive divisor, mixed sign dividend - negative divisor, positive dividend - negative divisor, negative dividend - negative divisor, mixed sign dividend - zero divisor - overflow (SIGNED_MIN/-1), normal dividend - overflow (SIGNED_MIN/-1), constant dividend 3. For UMOD - positive divisor - positive divisor, small dividend - zero divisor 4. For SMOD - positive divisor, positive dividend - positive divisor, negative dividend - positive divisor, mixed sign dividend - positive divisor, mixed sign dividend, small dividend - negative divisor, positive dividend - negative divisor, negative dividend - negative divisor, mixed sign dividend - negative divisor, mixed sign dividend, small dividend - zero divisor - overflow (SIGNED_MIN/-1), normal dividend - overflow (SIGNED_MIN/-1), constant dividend Specifically, these selftests are based on dead code elimination: If the BPF verifier can precisely analyze the result of BPF_DIV/BPF_MOD instruction, it can prune the path that leads to an error (here we use invalid memory access as the error case), allowing the program to pass verification. Co-developed-by: Shenghao Yuan Signed-off-by: Shenghao Yuan Co-developed-by: Tianci Cao Signed-off-by: Tianci Cao Signed-off-by: Yazhou Tang Link: https://lore.kernel.org/r/20260119085458.182221-3-tangyazhou@zju.edu.cn Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/verifier.c | 2 + .../selftests/bpf/progs/verifier_div_mod_bounds.c | 1149 ++++++++++++++++++++ 2 files changed, 1151 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_div_mod_bounds.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 38c5ba70100c..fa9e506cc36f 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -33,6 +33,7 @@ #include "verifier_direct_packet_access.skel.h" #include "verifier_direct_stack_access_wraparound.skel.h" #include "verifier_div0.skel.h" +#include "verifier_div_mod_bounds.skel.h" #include "verifier_div_overflow.skel.h" #include "verifier_global_subprogs.skel.h" #include "verifier_global_ptr_args.skel.h" @@ -175,6 +176,7 @@ void test_verifier_d_path(void) { RUN(verifier_d_path); } void test_verifier_direct_packet_access(void) { RUN(verifier_direct_packet_access); } void test_verifier_direct_stack_access_wraparound(void) { RUN(verifier_direct_stack_access_wraparound); } void test_verifier_div0(void) { RUN(verifier_div0); } +void test_verifier_div_mod_bounds(void) { RUN(verifier_div_mod_bounds); } void test_verifier_div_overflow(void) { RUN(verifier_div_overflow); } void test_verifier_global_subprogs(void) { RUN(verifier_global_subprogs); } void test_verifier_global_ptr_args(void) { RUN(verifier_global_ptr_args); } diff --git a/tools/testing/selftests/bpf/progs/verifier_div_mod_bounds.c b/tools/testing/selftests/bpf/progs/verifier_div_mod_bounds.c new file mode 100644 index 000000000000..4672af0b3268 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_div_mod_bounds.c @@ -0,0 +1,1149 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include "bpf_misc.h" + +/* This file contains unit tests for signed/unsigned division and modulo + * operations (with divisor as a constant), focusing on verifying whether + * BPF verifier's range tracking module soundly and precisely computes + * the results. + */ + +SEC("socket") +__description("UDIV32, positive divisor") +__success __retval(0) __log_level(2) +__msg("w1 /= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=3,var_off=(0x0; 0x3))") +__naked void udiv32_pos_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w1 &= 8; \ + w1 |= 1; \ + w1 /= 3; \ + if w1 > 3 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UDIV32, zero divisor") +__success __retval(0) __log_level(2) +__msg("w1 /= w2 {{.*}}; R1=0 R2=0") +__naked void udiv32_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w1 &= 8; \ + w1 |= 1; \ + w2 = 0; \ + w1 /= w2; \ + if w1 != 0 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UDIV64, positive divisor") +__success __retval(0) __log_level(2) +__msg("r1 /= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=3,var_off=(0x0; 0x3))") +__naked void udiv64_pos_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + r1 &= 8; \ + r1 |= 1; \ + r1 /= 3; \ + if r1 > 3 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UDIV64, zero divisor") +__success __retval(0) __log_level(2) +__msg("r1 /= r2 {{.*}}; R1=0 R2=0") +__naked void udiv64_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + r1 &= 8; \ + r1 |= 1; \ + r2 = 0; \ + r1 /= r2; \ + if r1 != 0 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, positive divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("w1 s/= 3 {{.*}}; R1=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=3,var_off=(0x2; 0x1))") +__naked void sdiv32_pos_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< 8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s/= 3; \ + if w1 s< 2 goto l1_%=; \ + if w1 s> 3 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, positive divisor, negative dividend") +__success __retval(0) __log_level(2) +__msg("w1 s/= 3 {{.*}}; R1=scalar(smin=umin=umin32=0xfffffffd,smax=umax=umax32=0xfffffffe,smin32=-3,smax32=-2,var_off=(0xfffffffc; 0x3))") +__naked void sdiv32_pos_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s> -8 goto l0_%=; \ + if w1 s< -10 goto l0_%=; \ + w1 s/= 3; \ + if w1 s< -3 goto l1_%=; \ + if w1 s> -2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, positive divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("w1 s/= 3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-2,smax32=3,var_off=(0x0; 0xffffffff))") +__naked void sdiv32_pos_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< -8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s/= 3; \ + if w1 s< -2 goto l1_%=; \ + if w1 s> 3 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, negative divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("w1 s/= -3 {{.*}}; R1=scalar(smin=umin=umin32=0xfffffffd,smax=umax=umax32=0xfffffffe,smin32=-3,smax32=-2,var_off=(0xfffffffc; 0x3))") +__naked void sdiv32_neg_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< 8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s/= -3; \ + if w1 s< -3 goto l1_%=; \ + if w1 s> -2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, negative divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("w1 s/= -3 {{.*}}; R1=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=3,var_off=(0x2; 0x1))") +__naked void sdiv32_neg_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s> -8 goto l0_%=; \ + if w1 s< -10 goto l0_%=; \ + w1 s/= -3; \ + if w1 s< 2 goto l1_%=; \ + if w1 s> 3 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, negative divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("w1 s/= -3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-3,smax32=2,var_off=(0x0; 0xffffffff))") +__naked void sdiv32_neg_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< -8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s/= -3; \ + if w1 s< -3 goto l1_%=; \ + if w1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, zero divisor") +__success __retval(0) __log_level(2) +__msg("w1 s/= w2 {{.*}}; R1=0 R2=0") +__naked void sdiv32_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w1 &= 8; \ + w1 |= 1; \ + w2 = 0; \ + w1 s/= w2; \ + if w1 != 0 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, overflow (S32_MIN/-1)") +__success __retval(0) __log_level(2) +__msg("w1 s/= -1 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))") +__naked void sdiv32_overflow_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w2 = %[int_min]; \ + w2 += 10; \ + if w1 s> w2 goto l0_%=; \ + w1 s/= -1; \ +l0_%=: r0 = 0; \ + exit; \ +" : + : __imm_const(int_min, INT_MIN), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, overflow (S32_MIN/-1), constant dividend") +__success __retval(0) __log_level(2) +__msg("w1 s/= -1 {{.*}}; R1=0x80000000") +__naked void sdiv32_overflow_2(void) +{ + asm volatile (" \ + w1 = %[int_min]; \ + w1 s/= -1; \ + if w1 != %[int_min] goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm_const(int_min, INT_MIN) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, positive divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("r1 s/= 3 {{.*}}; R1=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=3,var_off=(0x2; 0x1))") +__naked void sdiv64_pos_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< 8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s/= 3; \ + if r1 s< 2 goto l1_%=; \ + if r1 s> 3 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, positive divisor, negative dividend") +__success __retval(0) __log_level(2) +__msg("r1 s/= 3 {{.*}}; R1=scalar(smin=smin32=-3,smax=smax32=-2,umin=0xfffffffffffffffd,umax=0xfffffffffffffffe,umin32=0xfffffffd,umax32=0xfffffffe,var_off=(0xfffffffffffffffc; 0x3))") +__naked void sdiv64_pos_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s> -8 goto l0_%=; \ + if r1 s< -10 goto l0_%=; \ + r1 s/= 3; \ + if r1 s< -3 goto l1_%=; \ + if r1 s> -2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, positive divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("r1 s/= 3 {{.*}}; R1=scalar(smin=smin32=-2,smax=smax32=3)") +__naked void sdiv64_pos_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< -8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s/= 3; \ + if r1 s< -2 goto l1_%=; \ + if r1 s> 3 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, negative divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("r1 s/= -3 {{.*}}; R1=scalar(smin=smin32=-3,smax=smax32=-2,umin=0xfffffffffffffffd,umax=0xfffffffffffffffe,umin32=0xfffffffd,umax32=0xfffffffe,var_off=(0xfffffffffffffffc; 0x3))") +__naked void sdiv64_neg_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< 8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s/= -3; \ + if r1 s< -3 goto l1_%=; \ + if r1 s> -2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, negative divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("r1 s/= -3 {{.*}}; R1=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=3,var_off=(0x2; 0x1))") +__naked void sdiv64_neg_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s> -8 goto l0_%=; \ + if r1 s< -10 goto l0_%=; \ + r1 s/= -3; \ + if r1 s< 2 goto l1_%=; \ + if r1 s> 3 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, negative divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("r1 s/= -3 {{.*}}; R1=scalar(smin=smin32=-3,smax=smax32=2)") +__naked void sdiv64_neg_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< -8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s/= -3; \ + if r1 s< -3 goto l1_%=; \ + if r1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, zero divisor") +__success __retval(0) __log_level(2) +__msg("r1 s/= r2 {{.*}}; R1=0 R2=0") +__naked void sdiv64_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + r1 &= 8; \ + r1 |= 1; \ + r2 = 0; \ + r1 s/= r2; \ + if r1 != 0 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, overflow (S64_MIN/-1)") +__success __retval(0) __log_level(2) +__msg("r1 s/= -1 {{.*}}; R1=scalar()") +__naked void sdiv64_overflow_1(void) +{ + asm volatile (" \ + call %[bpf_ktime_get_ns]; \ + r1 = r0; \ + r2 = %[llong_min] ll; \ + r2 += 10; \ + if r1 s> r2 goto l0_%=; \ + r1 s/= -1; \ +l0_%=: r0 = 0; \ + exit; \ +" : + : __imm_const(llong_min, LLONG_MIN), + __imm(bpf_ktime_get_ns) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, overflow (S64_MIN/-1), constant dividend") +__success __retval(0) __log_level(2) +__msg("r1 s/= -1 {{.*}}; R1=0x8000000000000000") +__naked void sdiv64_overflow_2(void) +{ + asm volatile (" \ + r1 = %[llong_min] ll; \ + r1 s/= -1; \ + r2 = %[llong_min] ll; \ + if r1 != r2 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm_const(llong_min, LLONG_MIN) + : __clobber_all); +} + +SEC("socket") +__description("UMOD32, positive divisor") +__success __retval(0) __log_level(2) +__msg("w1 %= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))") +__naked void umod32_pos_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w1 &= 8; \ + w1 |= 1; \ + w1 %%= 3; \ + if w1 > 3 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UMOD32, positive divisor, small dividend") +__success __retval(0) __log_level(2) +__msg("w1 %= 10 {{.*}}; R1=scalar(smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=9,var_off=(0x1; 0x8))") +__naked void umod32_pos_divisor_unchanged(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w1 &= 8; \ + w1 |= 1; \ + w1 %%= 10; \ + if w1 < 1 goto l0_%=; \ + if w1 > 9 goto l0_%=; \ + if w1 & 1 != 1 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UMOD32, zero divisor") +__success __retval(0) __log_level(2) +__msg("w1 %= w2 {{.*}}; R1=scalar(smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=9,var_off=(0x1; 0x8)) R2=0") +__naked void umod32_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w1 &= 8; \ + w1 |= 1; \ + w2 = 0; \ + w1 %%= w2; \ + if w1 < 1 goto l0_%=; \ + if w1 > 9 goto l0_%=; \ + if w1 & 1 != 1 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UMOD64, positive divisor") +__success __retval(0) __log_level(2) +__msg("r1 %= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))") +__naked void umod64_pos_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + r1 &= 8; \ + r1 |= 1; \ + r1 %%= 3; \ + if r1 > 3 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UMOD64, positive divisor, small dividend") +__success __retval(0) __log_level(2) +__msg("r1 %= 10 {{.*}}; R1=scalar(smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=9,var_off=(0x1; 0x8))") +__naked void umod64_pos_divisor_unchanged(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + r1 &= 8; \ + r1 |= 1; \ + r1 %%= 10; \ + if r1 < 1 goto l0_%=; \ + if r1 > 9 goto l0_%=; \ + if r1 & 1 != 1 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UMOD64, zero divisor") +__success __retval(0) __log_level(2) +__msg("r1 %= r2 {{.*}}; R1=scalar(smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=9,var_off=(0x1; 0x8)) R2=0") +__naked void umod64_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + r1 &= 8; \ + r1 |= 1; \ + r2 = 0; \ + r1 %%= r2; \ + if r1 < 1 goto l0_%=; \ + if r1 > 9 goto l0_%=; \ + if r1 & 1 != 1 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, positive divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))") +__naked void smod32_pos_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< 8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s%%= 3; \ + if w1 s< 0 goto l1_%=; \ + if w1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, positive divisor, negative dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= 3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-2,smax32=0,var_off=(0x0; 0xffffffff))") +__naked void smod32_pos_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s> -8 goto l0_%=; \ + if w1 s< -10 goto l0_%=; \ + w1 s%%= 3; \ + if w1 s< -2 goto l1_%=; \ + if w1 s> 0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, positive divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= 3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-2,smax32=2,var_off=(0x0; 0xffffffff))") +__naked void smod32_pos_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< -8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s%%= 3; \ + if w1 s< -2 goto l1_%=; \ + if w1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, positive divisor, small dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= 11 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-8,smax32=10,var_off=(0x0; 0xffffffff))") +__naked void smod32_pos_divisor_unchanged(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< -8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s%%= 11; \ + if w1 s< -8 goto l1_%=; \ + if w1 s> 10 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, negative divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= -3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))") +__naked void smod32_neg_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< 8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s%%= -3; \ + if w1 s< 0 goto l1_%=; \ + if w1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, negative divisor, negative dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= -3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-2,smax32=0,var_off=(0x0; 0xffffffff))") +__naked void smod32_neg_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s> -8 goto l0_%=; \ + if w1 s< -10 goto l0_%=; \ + w1 s%%= -3; \ + if w1 s< -2 goto l1_%=; \ + if w1 s> 0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, negative divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= -3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-2,smax32=2,var_off=(0x0; 0xffffffff))") +__naked void smod32_neg_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< -8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s%%= -3; \ + if w1 s< -2 goto l1_%=; \ + if w1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, negative divisor, small dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= -11 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-8,smax32=10,var_off=(0x0; 0xffffffff))") +__naked void smod32_neg_divisor_unchanged(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< -8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s%%= -11; \ + if w1 s< -8 goto l1_%=; \ + if w1 s> 10 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, zero divisor") +__success __retval(0) __log_level(2) +__msg("w1 s%= w2 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-8,smax32=10,var_off=(0x0; 0xffffffff)) R2=0") +__naked void smod32_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< -8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w2 = 0; \ + w1 s%%= w2; \ + if w1 s< -8 goto l1_%=; \ + if w1 s> 10 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, overflow (S32_MIN%-1)") +__success __retval(0) __log_level(2) +__msg("w1 s%= -1 {{.*}}; R1=0") +__naked void smod32_overflow_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w2 = %[int_min]; \ + w2 += 10; \ + if w1 s> w2 goto l0_%=; \ + w1 s%%= -1; \ + if w1 != 0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm_const(int_min, INT_MIN), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, overflow (S32_MIN%-1), constant dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= -1 {{.*}}; R1=0") +__naked void smod32_overflow_2(void) +{ + asm volatile (" \ + w1 = %[int_min]; \ + w1 s%%= -1; \ + if w1 != 0 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm_const(int_min, INT_MIN) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, positive divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))") +__naked void smod64_pos_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< 8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s%%= 3; \ + if r1 s< 0 goto l1_%=; \ + if r1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, positive divisor, negative dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= 3 {{.*}}; R1=scalar(smin=smin32=-2,smax=smax32=0)") +__naked void smod64_pos_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s> -8 goto l0_%=; \ + if r1 s< -10 goto l0_%=; \ + r1 s%%= 3; \ + if r1 s< -2 goto l1_%=; \ + if r1 s> 0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, positive divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= 3 {{.*}}; R1=scalar(smin=smin32=-2,smax=smax32=2)") +__naked void smod64_pos_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< -8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s%%= 3; \ + if r1 s< -2 goto l1_%=; \ + if r1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, positive divisor, small dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= 11 {{.*}}; R1=scalar(smin=smin32=-8,smax=smax32=10)") +__naked void smod64_pos_divisor_unchanged(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< -8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s%%= 11; \ + if r1 s< -8 goto l1_%=; \ + if r1 s> 10 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, negative divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= -3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))") +__naked void smod64_neg_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< 8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s%%= -3; \ + if r1 s< 0 goto l1_%=; \ + if r1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, negative divisor, negative dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= -3 {{.*}}; R1=scalar(smin=smin32=-2,smax=smax32=0)") +__naked void smod64_neg_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s> -8 goto l0_%=; \ + if r1 s< -10 goto l0_%=; \ + r1 s%%= -3; \ + if r1 s< -2 goto l1_%=; \ + if r1 s> 0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, negative divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= -3 {{.*}}; R1=scalar(smin=smin32=-2,smax=smax32=2)") +__naked void smod64_neg_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< -8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s%%= -3; \ + if r1 s< -2 goto l1_%=; \ + if r1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, negative divisor, small dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= -11 {{.*}}; R1=scalar(smin=smin32=-8,smax=smax32=10)") +__naked void smod64_neg_divisor_unchanged(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< -8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s%%= -11; \ + if r1 s< -8 goto l1_%=; \ + if r1 s> 10 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, zero divisor") +__success __retval(0) __log_level(2) +__msg("r1 s%= r2 {{.*}}; R1=scalar(smin=smin32=-8,smax=smax32=10) R2=0") +__naked void smod64_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< -8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r2 = 0; \ + r1 s%%= r2; \ + if r1 s< -8 goto l1_%=; \ + if r1 s> 10 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, overflow (S64_MIN%-1)") +__success __retval(0) __log_level(2) +__msg("r1 s%= -1 {{.*}}; R1=0") +__naked void smod64_overflow_1(void) +{ + asm volatile (" \ + call %[bpf_ktime_get_ns]; \ + r1 = r0; \ + r2 = %[llong_min] ll; \ + r2 += 10; \ + if r1 s> r2 goto l0_%=; \ + r1 s%%= -1; \ + if r1 != 0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm_const(llong_min, LLONG_MIN), + __imm(bpf_ktime_get_ns) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, overflow (S64_MIN%-1), constant dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= -1 {{.*}}; R1=0") +__naked void smod64_overflow_2(void) +{ + asm volatile (" \ + r1 = %[llong_min] ll; \ + r1 s%%= -1; \ + if r1 != 0 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm_const(llong_min, LLONG_MIN) + : __clobber_all); +} -- cgit v1.2.3 From dd341eacdba360d035c9d4de66d3c80a89d77c84 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Tue, 20 Jan 2026 09:16:30 +0000 Subject: selftests/bpf: update verifier test for default trusted pointer semantics Replace the verifier test for default trusted pointer semantics, which previously relied on BPF kfunc bpf_get_root_mem_cgroup(), with a new test utilizing dedicated BPF kfuncs defined within the bpf_testmod. bpf_get_root_mem_cgroup() was modified such that it again relies on KF_ACQUIRE semantics, therefore no longer making it a suitable candidate to test BPF verifier default trusted pointer semantics against. Link: https://lore.kernel.org/bpf/20260113083949.2502978-2-mattbobrowski@google.com Signed-off-by: Matt Bobrowski Link: https://lore.kernel.org/r/20260120091630.3420452-1-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/verifier.c | 4 +-- .../bpf/progs/verifier_default_trusted_ptr.c | 29 ++++++++++++++++++++ .../selftests/bpf/progs/verifier_memcontrol.c | 32 ---------------------- .../testing/selftests/bpf/test_kmods/bpf_testmod.c | 18 ++++++++++++ .../selftests/bpf/test_kmods/bpf_testmod_kfunc.h | 3 ++ 5 files changed, 52 insertions(+), 34 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/verifier_default_trusted_ptr.c delete mode 100644 tools/testing/selftests/bpf/progs/verifier_memcontrol.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index fa9e506cc36f..b6a1e79709be 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -30,6 +30,7 @@ #include "verifier_ctx.skel.h" #include "verifier_ctx_sk_msg.skel.h" #include "verifier_d_path.skel.h" +#include "verifier_default_trusted_ptr.skel.h" #include "verifier_direct_packet_access.skel.h" #include "verifier_direct_stack_access_wraparound.skel.h" #include "verifier_div0.skel.h" @@ -62,7 +63,6 @@ #include "verifier_masking.skel.h" #include "verifier_may_goto_1.skel.h" #include "verifier_may_goto_2.skel.h" -#include "verifier_memcontrol.skel.h" #include "verifier_meta_access.skel.h" #include "verifier_movsx.skel.h" #include "verifier_mtu.skel.h" @@ -173,6 +173,7 @@ void test_verifier_const_or(void) { RUN(verifier_const_or); } void test_verifier_ctx(void) { RUN(verifier_ctx); } void test_verifier_ctx_sk_msg(void) { RUN(verifier_ctx_sk_msg); } void test_verifier_d_path(void) { RUN(verifier_d_path); } +void test_verifier_default_trusted_ptr(void) { RUN_TESTS(verifier_default_trusted_ptr); } void test_verifier_direct_packet_access(void) { RUN(verifier_direct_packet_access); } void test_verifier_direct_stack_access_wraparound(void) { RUN(verifier_direct_stack_access_wraparound); } void test_verifier_div0(void) { RUN(verifier_div0); } @@ -205,7 +206,6 @@ void test_verifier_map_ret_val(void) { RUN(verifier_map_ret_val); } void test_verifier_masking(void) { RUN(verifier_masking); } void test_verifier_may_goto_1(void) { RUN(verifier_may_goto_1); } void test_verifier_may_goto_2(void) { RUN(verifier_may_goto_2); } -void test_verifier_memcontrol(void) { RUN(verifier_memcontrol); } void test_verifier_meta_access(void) { RUN(verifier_meta_access); } void test_verifier_movsx(void) { RUN(verifier_movsx); } void test_verifier_mul(void) { RUN(verifier_mul); } diff --git a/tools/testing/selftests/bpf/progs/verifier_default_trusted_ptr.c b/tools/testing/selftests/bpf/progs/verifier_default_trusted_ptr.c new file mode 100644 index 000000000000..fa3b656ad4fb --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_default_trusted_ptr.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2026 Google LLC. + */ + +#include +#include +#include + +#include "bpf_misc.h" +#include "../test_kmods/bpf_testmod_kfunc.h" + +SEC("syscall") +__success __retval(0) +int test_default_trusted_ptr(void *ctx) +{ + struct prog_test_member *trusted_ptr; + + trusted_ptr = bpf_kfunc_get_default_trusted_ptr_test(); + /* + * Test BPF kfunc bpf_get_default_trusted_ptr_test() returns a + * PTR_TO_BTF_ID | PTR_TRUSTED, therefore it should be accepted when + * passed to a BPF kfunc only accepting KF_TRUSTED_ARGS. + */ + bpf_kfunc_put_default_trusted_ptr_test(trusted_ptr); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_memcontrol.c b/tools/testing/selftests/bpf/progs/verifier_memcontrol.c deleted file mode 100644 index 13564956f621..000000000000 --- a/tools/testing/selftests/bpf/progs/verifier_memcontrol.c +++ /dev/null @@ -1,32 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright 2026 Google LLC. - */ - -#include -#include -#include -#include "bpf_misc.h" - -SEC("syscall") -__success __retval(0) -int root_mem_cgroup_default_trusted(void *ctx) -{ - unsigned long usage; - struct mem_cgroup *root_mem_cgroup; - - root_mem_cgroup = bpf_get_root_mem_cgroup(); - if (!root_mem_cgroup) - return 1; - - /* - * BPF kfunc bpf_get_root_mem_cgroup() returns a PTR_TO_BTF_ID | - * PTR_TRUSTED | PTR_MAYBE_NULL, therefore it should be accepted when - * passed to a BPF kfunc only accepting KF_TRUSTED_ARGS. - */ - usage = bpf_mem_cgroup_usage(root_mem_cgroup); - __sink(usage); - return 0; -} - -char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 0d542ba64365..d425034b72d3 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -254,6 +254,22 @@ __bpf_kfunc int *bpf_kfunc_ret_rcu_test_nostruct(int rdonly_buf_size) return NULL; } +static struct prog_test_member trusted_ptr; + +__bpf_kfunc struct prog_test_member *bpf_kfunc_get_default_trusted_ptr_test(void) +{ + return &trusted_ptr; +} + +__bpf_kfunc void bpf_kfunc_put_default_trusted_ptr_test(struct prog_test_member *trusted_ptr) +{ + /* + * This BPF kfunc doesn't actually have any put/KF_ACQUIRE + * semantics. We're simply wanting to simulate a BPF kfunc that takes a + * struct prog_test_member pointer as an argument. + */ +} + __bpf_kfunc struct bpf_testmod_ctx * bpf_testmod_ctx_create(int *err) { @@ -709,6 +725,8 @@ BTF_ID_FLAGS(func, bpf_testmod_ctx_create, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_testmod_ctx_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_testmod_ops3_call_test_1) BTF_ID_FLAGS(func, bpf_testmod_ops3_call_test_2) +BTF_ID_FLAGS(func, bpf_kfunc_get_default_trusted_ptr_test); +BTF_ID_FLAGS(func, bpf_kfunc_put_default_trusted_ptr_test); BTF_KFUNCS_END(bpf_testmod_common_kfunc_ids) BTF_ID_LIST(bpf_testmod_dtor_ids) diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h index 225ea30c4e3d..10f89f06245f 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h @@ -166,4 +166,7 @@ extern int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) __wea extern int bpf_kfunc_multi_st_ops_test_1_assoc(struct st_ops_args *args) __weak __ksym; #endif +struct prog_test_member *bpf_kfunc_get_default_trusted_ptr_test(void) __ksym; +void bpf_kfunc_put_default_trusted_ptr_test(struct prog_test_member *trusted_ptr) __ksym; + #endif /* _BPF_TESTMOD_KFUNC_H */ -- cgit v1.2.3 From 8766d61a1d33cb5f15bfdd6ce9832bbe1fc649c2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 20 Jan 2026 18:04:55 -0800 Subject: Revert "Merge branch 'netkit-support-for-io_uring-zero-copy-and-af_xdp'" This reverts commit 77b9c4a438fc66e2ab004c411056b3fb71a54f2c, reversing changes made to 4515ec4ad58a37e70a9e1256c0b993958c9b7497: 931420a2fc36 ("selftests/net: Add netkit container tests") ab771c938d9a ("selftests/net: Make NetDrvContEnv support queue leasing") 6be87fbb2776 ("selftests/net: Add env for container based tests") 61d99ce3dfc2 ("selftests/net: Add bpf skb forwarding program") 920da3634194 ("netkit: Add xsk support for af_xdp applications") eef51113f8af ("netkit: Add netkit notifier to check for unregistering devices") b5ef109d22d4 ("netkit: Implement rtnl_link_ops->alloc and ndo_queue_create") b5c3fa4a0b16 ("netkit: Add single device mode for netkit") 0073d2fd679d ("xsk: Proxy pool management for leased queues") 1ecea95dd3b5 ("xsk: Extend xsk_rcv_check validation") 804bf334d08a ("net: Proxy netdev_queue_get_dma_dev for leased queues") 0caa9a8ddec3 ("net: Proxy net_mp_{open,close}_rxq for leased queues") ff8889ff9107 ("net, ethtool: Disallow leased real rxqs to be resized") 9e2103f36110 ("net: Add lease info to queue-get response") 31127deddef4 ("net: Implement netdev_nl_queue_create_doit") a5546e18f77c ("net: Add queue-create operation") The series will conflict with io_uring work, and the code needs more polish. Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 44 --- drivers/net/netkit.c | 360 ++++----------------- include/linux/netdevice.h | 6 - include/net/netdev_queues.h | 19 +- include/net/netdev_rx_queue.h | 21 +- include/net/page_pool/memory_provider.h | 4 +- include/net/xdp_sock_drv.h | 2 +- include/uapi/linux/if_link.h | 6 - include/uapi/linux/netdev.h | 11 - net/core/dev.c | 7 - net/core/dev.h | 2 - net/core/netdev-genl-gen.c | 20 -- net/core/netdev-genl-gen.h | 2 - net/core/netdev-genl.c | 185 ----------- net/core/netdev_queues.c | 74 +---- net/core/netdev_rx_queue.c | 169 ++-------- net/ethtool/channels.c | 12 +- net/ethtool/ioctl.c | 9 +- net/xdp/xsk.c | 79 +---- tools/include/uapi/linux/netdev.h | 11 - tools/testing/selftests/drivers/net/README.rst | 7 - tools/testing/selftests/drivers/net/hw/Makefile | 2 - .../selftests/drivers/net/hw/lib/py/__init__.py | 7 +- .../selftests/drivers/net/hw/nk_forward.bpf.c | 49 --- tools/testing/selftests/drivers/net/hw/nk_netns.py | 23 -- .../testing/selftests/drivers/net/hw/nk_qlease.py | 55 ---- .../selftests/drivers/net/lib/py/__init__.py | 7 +- tools/testing/selftests/drivers/net/lib/py/env.py | 157 --------- 28 files changed, 117 insertions(+), 1233 deletions(-) delete mode 100644 tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c delete mode 100755 tools/testing/selftests/drivers/net/hw/nk_netns.py delete mode 100755 tools/testing/selftests/drivers/net/hw/nk_qlease.py (limited to 'tools/testing') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index b86db8656eac..596c306ce52b 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -339,15 +339,6 @@ attribute-sets: doc: XSK information for this queue, if any. type: nest nested-attributes: xsk-info - - - name: lease - doc: | - A queue from a virtual device can have a lease which refers to - another queue from a physical device. This is useful for memory - providers and AF_XDP operations which take an ifindex and queue id - to allow applications to bind against virtual devices in containers. - type: nest - nested-attributes: lease - name: qstats doc: | @@ -546,24 +537,6 @@ attribute-sets: name: id - name: type - - - name: lease - attributes: - - - name: ifindex - doc: The netdev ifindex to lease the queue from. - type: u32 - checks: - min: 1 - - - name: queue - doc: The netdev queue to lease from. - type: nest - nested-attributes: queue-id - - - name: netns-id - doc: The network namespace id of the netdev. - type: s32 - name: dmabuf attributes: @@ -713,7 +686,6 @@ operations: - dmabuf - io-uring - xsk - - lease dump: request: attributes: @@ -825,22 +797,6 @@ operations: reply: attributes: - id - - - name: queue-create - doc: | - Create a new queue for the given netdevice. Whether this operation - is supported depends on the device and the driver. - attribute-set: queue - flags: [admin-perm] - do: - request: - attributes: - - ifindex - - type - - lease - reply: &queue-create-op - attributes: - - id kernel-family: headers: ["net/netdev_netlink.h"] diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index 0519f855d062..0a2fef7caccb 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -9,21 +9,11 @@ #include #include -#include -#include -#include -#include #include #include #include -#define NETKIT_DRV_NAME "netkit" - -#define NETKIT_NUM_RX_QUEUES_MAX 1024 -#define NETKIT_NUM_TX_QUEUES_MAX 1 - -#define NETKIT_NUM_RX_QUEUES_REAL 1 -#define NETKIT_NUM_TX_QUEUES_REAL 1 +#define DRV_NAME "netkit" struct netkit { __cacheline_group_begin(netkit_fastpath); @@ -36,7 +26,6 @@ struct netkit { __cacheline_group_begin(netkit_slowpath); enum netkit_mode mode; - enum netkit_pairing pair; bool primary; u32 headroom; __cacheline_group_end(netkit_slowpath); @@ -47,8 +36,6 @@ struct netkit_link { struct net_device *dev; }; -static struct rtnl_link_ops netkit_link_ops; - static __always_inline int netkit_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb, enum netkit_action ret) @@ -148,10 +135,6 @@ static int netkit_open(struct net_device *dev) struct netkit *nk = netkit_priv(dev); struct net_device *peer = rtnl_dereference(nk->peer); - if (nk->pair == NETKIT_DEVICE_SINGLE) { - netif_carrier_on(dev); - return 0; - } if (!peer) return -ENOTCONN; if (peer->flags & IFF_UP) { @@ -236,86 +219,9 @@ static void netkit_get_stats(struct net_device *dev, stats->tx_dropped = DEV_STATS_READ(dev, tx_dropped); } -static bool netkit_xsk_supported_at_phys(const struct net_device *dev) -{ - if (!dev->netdev_ops->ndo_bpf || - !dev->netdev_ops->ndo_xdp_xmit || - !dev->netdev_ops->ndo_xsk_wakeup) - return false; - if ((dev->xdp_features & NETDEV_XDP_ACT_XSK) != NETDEV_XDP_ACT_XSK) - return false; - return true; -} - -static int netkit_xsk(struct net_device *dev, struct netdev_bpf *xdp) -{ - struct netkit *nk = netkit_priv(dev); - struct netdev_bpf xdp_lower; - struct netdev_rx_queue *rxq; - struct net_device *phys; - int ret = -EBUSY; - - switch (xdp->command) { - case XDP_SETUP_XSK_POOL: - if (nk->pair == NETKIT_DEVICE_PAIR) - return -EOPNOTSUPP; - if (xdp->xsk.queue_id >= dev->real_num_rx_queues) - return -EINVAL; - - rxq = __netif_get_rx_queue(dev, xdp->xsk.queue_id); - if (!rxq->lease) - return -EOPNOTSUPP; - - phys = rxq->lease->dev; - if (!netkit_xsk_supported_at_phys(phys)) - return -EOPNOTSUPP; - - memcpy(&xdp_lower, xdp, sizeof(xdp_lower)); - xdp_lower.xsk.queue_id = get_netdev_rx_queue_index(rxq->lease); - break; - case XDP_SETUP_PROG: - return -EPERM; - default: - return -EINVAL; - } - - netdev_lock(phys); - if (!dev_get_min_mp_channel_count(phys)) - ret = phys->netdev_ops->ndo_bpf(phys, &xdp_lower); - netdev_unlock(phys); - return ret; -} - -static int netkit_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags) -{ - struct netdev_rx_queue *rxq; - struct net_device *phys; - - if (queue_id >= dev->real_num_rx_queues) - return -EINVAL; - - rxq = __netif_get_rx_queue(dev, queue_id); - if (!rxq->lease) - return -EOPNOTSUPP; - - phys = rxq->lease->dev; - if (!netkit_xsk_supported_at_phys(phys)) - return -EOPNOTSUPP; - - return phys->netdev_ops->ndo_xsk_wakeup(phys, - get_netdev_rx_queue_index(rxq->lease), flags); -} - -static int netkit_init(struct net_device *dev) -{ - netdev_lockdep_set_classes(dev); - return 0; -} - static void netkit_uninit(struct net_device *dev); static const struct net_device_ops netkit_netdev_ops = { - .ndo_init = netkit_init, .ndo_open = netkit_open, .ndo_stop = netkit_close, .ndo_start_xmit = netkit_xmit, @@ -326,95 +232,19 @@ static const struct net_device_ops netkit_netdev_ops = { .ndo_get_peer_dev = netkit_peer_dev, .ndo_get_stats64 = netkit_get_stats, .ndo_uninit = netkit_uninit, - .ndo_bpf = netkit_xsk, - .ndo_xsk_wakeup = netkit_xsk_wakeup, .ndo_features_check = passthru_features_check, }; static void netkit_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { - strscpy(info->driver, NETKIT_DRV_NAME, sizeof(info->driver)); + strscpy(info->driver, DRV_NAME, sizeof(info->driver)); } static const struct ethtool_ops netkit_ethtool_ops = { .get_drvinfo = netkit_get_drvinfo, }; -static int netkit_queue_create(struct net_device *dev) -{ - struct netkit *nk = netkit_priv(dev); - u32 rxq_count_old, rxq_count_new; - int err; - - rxq_count_old = dev->real_num_rx_queues; - rxq_count_new = rxq_count_old + 1; - - /* Only allow to lease a queue in single device mode or to - * lease against the peer device which then ends up in the - * target netns. - */ - if (nk->pair == NETKIT_DEVICE_PAIR && nk->primary) - return -EOPNOTSUPP; - - if (netif_running(dev)) - netif_carrier_off(dev); - err = netif_set_real_num_rx_queues(dev, rxq_count_new); - if (netif_running(dev)) - netif_carrier_on(dev); - - return err ? : rxq_count_old; -} - -static const struct netdev_queue_mgmt_ops netkit_queue_mgmt_ops = { - .ndo_queue_create = netkit_queue_create, -}; - -static struct net_device *netkit_alloc(struct nlattr *tb[], - const char *ifname, - unsigned char name_assign_type, - unsigned int num_tx_queues, - unsigned int num_rx_queues) -{ - const struct rtnl_link_ops *ops = &netkit_link_ops; - struct net_device *dev; - - if (num_tx_queues > NETKIT_NUM_TX_QUEUES_MAX || - num_rx_queues > NETKIT_NUM_RX_QUEUES_MAX) - return ERR_PTR(-EOPNOTSUPP); - - dev = alloc_netdev_mqs(ops->priv_size, ifname, - name_assign_type, ops->setup, - num_tx_queues, num_rx_queues); - if (dev) { - dev->real_num_tx_queues = NETKIT_NUM_TX_QUEUES_REAL; - dev->real_num_rx_queues = NETKIT_NUM_RX_QUEUES_REAL; - } - return dev; -} - -static void netkit_queue_unlease(struct net_device *dev) -{ - struct netdev_rx_queue *rxq, *rxq_lease; - struct net_device *dev_lease; - int i; - - if (dev->real_num_rx_queues == 1) - return; - - netdev_lock(dev); - for (i = 1; i < dev->real_num_rx_queues; i++) { - rxq = __netif_get_rx_queue(dev, i); - rxq_lease = rxq->lease; - dev_lease = rxq_lease->dev; - - netdev_lock(dev_lease); - netdev_rx_queue_unlease(rxq, rxq_lease); - netdev_unlock(dev_lease); - } - netdev_unlock(dev); -} - static void netkit_setup(struct net_device *dev) { static const netdev_features_t netkit_features_hw_vlan = @@ -445,20 +275,18 @@ static void netkit_setup(struct net_device *dev) dev->priv_flags |= IFF_DISABLE_NETPOLL; dev->lltx = true; - dev->netdev_ops = &netkit_netdev_ops; - dev->ethtool_ops = &netkit_ethtool_ops; - dev->queue_mgmt_ops = &netkit_queue_mgmt_ops; + dev->ethtool_ops = &netkit_ethtool_ops; + dev->netdev_ops = &netkit_netdev_ops; dev->features |= netkit_features; dev->hw_features = netkit_features; dev->hw_enc_features = netkit_features; dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; dev->vlan_features = dev->features & ~netkit_features_hw_vlan; + dev->needs_free_netdev = true; netif_set_tso_max_size(dev, GSO_MAX_SIZE); - - xdp_set_features_flag(dev, NETDEV_XDP_ACT_XSK); } static struct net *netkit_get_link_net(const struct net_device *dev) @@ -497,6 +325,8 @@ static int netkit_validate(struct nlattr *tb[], struct nlattr *data[], return 0; } +static struct rtnl_link_ops netkit_link_ops; + static int netkit_new_link(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) @@ -505,7 +335,6 @@ static int netkit_new_link(struct net_device *dev, enum netkit_scrub scrub_prim = NETKIT_SCRUB_DEFAULT; enum netkit_scrub scrub_peer = NETKIT_SCRUB_DEFAULT; struct nlattr *peer_tb[IFLA_MAX + 1], **tbp, *attr; - enum netkit_pairing pair = NETKIT_DEVICE_PAIR; enum netkit_action policy_prim = NETKIT_PASS; enum netkit_action policy_peer = NETKIT_PASS; struct nlattr **data = params->data; @@ -514,8 +343,7 @@ static int netkit_new_link(struct net_device *dev, struct nlattr **tb = params->tb; u16 headroom = 0, tailroom = 0; struct ifinfomsg *ifmp = NULL; - struct net_device *peer = NULL; - bool seen_peer = false; + struct net_device *peer; char ifname[IFNAMSIZ]; struct netkit *nk; int err; @@ -552,12 +380,6 @@ static int netkit_new_link(struct net_device *dev, headroom = nla_get_u16(data[IFLA_NETKIT_HEADROOM]); if (data[IFLA_NETKIT_TAILROOM]) tailroom = nla_get_u16(data[IFLA_NETKIT_TAILROOM]); - if (data[IFLA_NETKIT_PAIRING]) - pair = nla_get_u32(data[IFLA_NETKIT_PAIRING]); - - seen_peer = data[IFLA_NETKIT_PEER_INFO] || - data[IFLA_NETKIT_PEER_SCRUB] || - data[IFLA_NETKIT_PEER_POLICY]; } if (ifmp && tbp[IFLA_IFNAME]) { @@ -570,46 +392,45 @@ static int netkit_new_link(struct net_device *dev, if (mode != NETKIT_L2 && (tb[IFLA_ADDRESS] || tbp[IFLA_ADDRESS])) return -EOPNOTSUPP; - if (pair == NETKIT_DEVICE_SINGLE && - (tb != tbp || seen_peer || policy_prim != NETKIT_PASS)) - return -EOPNOTSUPP; - if (pair == NETKIT_DEVICE_PAIR) { - peer = rtnl_create_link(peer_net, ifname, ifname_assign_type, - &netkit_link_ops, tbp, extack); - if (IS_ERR(peer)) - return PTR_ERR(peer); - - netif_inherit_tso_max(peer, dev); - if (headroom) - peer->needed_headroom = headroom; - if (tailroom) - peer->needed_tailroom = tailroom; - if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS])) - eth_hw_addr_random(peer); - if (ifmp && dev->ifindex) - peer->ifindex = ifmp->ifi_index; + peer = rtnl_create_link(peer_net, ifname, ifname_assign_type, + &netkit_link_ops, tbp, extack); + if (IS_ERR(peer)) + return PTR_ERR(peer); - nk = netkit_priv(peer); - nk->primary = false; - nk->policy = policy_peer; - nk->scrub = scrub_peer; - nk->mode = mode; - nk->pair = pair; - nk->headroom = headroom; - bpf_mprog_bundle_init(&nk->bundle); - - err = register_netdevice(peer); - if (err < 0) - goto err_register_peer; - netif_carrier_off(peer); - if (mode == NETKIT_L2) - dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL); - - err = rtnl_configure_link(peer, NULL, 0, NULL); - if (err < 0) - goto err_configure_peer; + netif_inherit_tso_max(peer, dev); + if (headroom) { + peer->needed_headroom = headroom; + dev->needed_headroom = headroom; } + if (tailroom) { + peer->needed_tailroom = tailroom; + dev->needed_tailroom = tailroom; + } + + if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS])) + eth_hw_addr_random(peer); + if (ifmp && dev->ifindex) + peer->ifindex = ifmp->ifi_index; + + nk = netkit_priv(peer); + nk->primary = false; + nk->policy = policy_peer; + nk->scrub = scrub_peer; + nk->mode = mode; + nk->headroom = headroom; + bpf_mprog_bundle_init(&nk->bundle); + + err = register_netdevice(peer); + if (err < 0) + goto err_register_peer; + netif_carrier_off(peer); + if (mode == NETKIT_L2) + dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL); + + err = rtnl_configure_link(peer, NULL, 0, NULL); + if (err < 0) + goto err_configure_peer; if (mode == NETKIT_L2 && !tb[IFLA_ADDRESS]) eth_hw_addr_random(dev); @@ -617,17 +438,12 @@ static int netkit_new_link(struct net_device *dev, nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); else strscpy(dev->name, "nk%d", IFNAMSIZ); - if (headroom) - dev->needed_headroom = headroom; - if (tailroom) - dev->needed_tailroom = tailroom; nk = netkit_priv(dev); nk->primary = true; nk->policy = policy_prim; nk->scrub = scrub_prim; nk->mode = mode; - nk->pair = pair; nk->headroom = headroom; bpf_mprog_bundle_init(&nk->bundle); @@ -639,12 +455,10 @@ static int netkit_new_link(struct net_device *dev, dev_change_flags(dev, dev->flags & ~IFF_NOARP, NULL); rcu_assign_pointer(netkit_priv(dev)->peer, peer); - if (peer) - rcu_assign_pointer(netkit_priv(peer)->peer, dev); + rcu_assign_pointer(netkit_priv(peer)->peer, dev); return 0; err_configure_peer: - if (peer) - unregister_netdevice(peer); + unregister_netdevice(peer); return err; err_register_peer: free_netdev(peer); @@ -704,8 +518,6 @@ static struct net_device *netkit_dev_fetch(struct net *net, u32 ifindex, u32 whi nk = netkit_priv(dev); if (!nk->primary) return ERR_PTR(-EACCES); - if (nk->pair == NETKIT_DEVICE_SINGLE) - return ERR_PTR(-EOPNOTSUPP); if (which == BPF_NETKIT_PEER) { dev = rcu_dereference_rtnl(nk->peer); if (!dev) @@ -1032,7 +844,6 @@ static void netkit_release_all(struct net_device *dev) static void netkit_uninit(struct net_device *dev) { netkit_release_all(dev); - netkit_queue_unlease(dev); } static void netkit_del_link(struct net_device *dev, struct list_head *head) @@ -1068,7 +879,6 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[], { IFLA_NETKIT_PEER_INFO, "peer info" }, { IFLA_NETKIT_HEADROOM, "headroom" }, { IFLA_NETKIT_TAILROOM, "tailroom" }, - { IFLA_NETKIT_PAIRING, "pairing" }, }; if (!nk->primary) { @@ -1088,11 +898,9 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[], } if (data[IFLA_NETKIT_POLICY]) { - err = -EOPNOTSUPP; attr = data[IFLA_NETKIT_POLICY]; policy = nla_get_u32(attr); - if (nk->pair == NETKIT_DEVICE_PAIR) - err = netkit_check_policy(policy, attr, extack); + err = netkit_check_policy(policy, attr, extack); if (err) return err; WRITE_ONCE(nk->policy, policy); @@ -1113,48 +921,6 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[], return 0; } -static void netkit_check_lease_unregister(struct net_device *dev) -{ - LIST_HEAD(list_kill); - u32 q_idx; - - if (READ_ONCE(dev->reg_state) != NETREG_UNREGISTERING || - !dev->dev.parent) - return; - - netdev_lock_ops(dev); - for (q_idx = 0; q_idx < dev->real_num_rx_queues; q_idx++) { - struct net_device *tmp = dev; - u32 tmp_q_idx = q_idx; - - if (netif_rx_queue_lease_get_owner(&tmp, &tmp_q_idx)) { - if (tmp->netdev_ops != &netkit_netdev_ops) - continue; - /* A single phys device can have multiple queues leased - * to one netkit device. We can only queue that netkit - * device once to the list_kill. Queues of that phys - * device can be leased with different individual netkit - * devices, hence we batch via list_kill. - */ - if (unregister_netdevice_queued(tmp)) - continue; - netkit_del_link(tmp, &list_kill); - } - } - netdev_unlock_ops(dev); - unregister_netdevice_many(&list_kill); -} - -static int netkit_notifier(struct notifier_block *this, - unsigned long event, void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - - if (event == NETDEV_UNREGISTER) - netkit_check_lease_unregister(dev); - return NOTIFY_DONE; -} - static size_t netkit_get_size(const struct net_device *dev) { return nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_POLICY */ @@ -1165,7 +931,6 @@ static size_t netkit_get_size(const struct net_device *dev) nla_total_size(sizeof(u8)) + /* IFLA_NETKIT_PRIMARY */ nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_HEADROOM */ nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_TAILROOM */ - nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PAIRING */ 0; } @@ -1186,8 +951,6 @@ static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev) return -EMSGSIZE; if (nla_put_u16(skb, IFLA_NETKIT_TAILROOM, dev->needed_tailroom)) return -EMSGSIZE; - if (nla_put_u32(skb, IFLA_NETKIT_PAIRING, nk->pair)) - return -EMSGSIZE; if (peer) { nk = netkit_priv(peer); @@ -1209,15 +972,13 @@ static const struct nla_policy netkit_policy[IFLA_NETKIT_MAX + 1] = { [IFLA_NETKIT_TAILROOM] = { .type = NLA_U16 }, [IFLA_NETKIT_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT), [IFLA_NETKIT_PEER_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT), - [IFLA_NETKIT_PAIRING] = NLA_POLICY_MAX(NLA_U32, NETKIT_DEVICE_SINGLE), [IFLA_NETKIT_PRIMARY] = { .type = NLA_REJECT, .reject_message = "Primary attribute is read-only" }, }; static struct rtnl_link_ops netkit_link_ops = { - .kind = NETKIT_DRV_NAME, + .kind = DRV_NAME, .priv_size = sizeof(struct netkit), - .alloc = netkit_alloc, .setup = netkit_setup, .newlink = netkit_new_link, .dellink = netkit_del_link, @@ -1231,39 +992,26 @@ static struct rtnl_link_ops netkit_link_ops = { .maxtype = IFLA_NETKIT_MAX, }; -static struct notifier_block netkit_netdev_notifier = { - .notifier_call = netkit_notifier, -}; - -static __init int netkit_mod_init(void) +static __init int netkit_init(void) { - int ret; - BUILD_BUG_ON((int)NETKIT_NEXT != (int)TCX_NEXT || (int)NETKIT_PASS != (int)TCX_PASS || (int)NETKIT_DROP != (int)TCX_DROP || (int)NETKIT_REDIRECT != (int)TCX_REDIRECT); - ret = rtnl_link_register(&netkit_link_ops); - if (ret) - return ret; - ret = register_netdevice_notifier(&netkit_netdev_notifier); - if (ret) - rtnl_link_unregister(&netkit_link_ops); - return ret; + return rtnl_link_register(&netkit_link_ops); } -static __exit void netkit_mod_exit(void) +static __exit void netkit_exit(void) { - unregister_netdevice_notifier(&netkit_netdev_notifier); rtnl_link_unregister(&netkit_link_ops); } -module_init(netkit_mod_init); -module_exit(netkit_mod_exit); +module_init(netkit_init); +module_exit(netkit_exit); MODULE_DESCRIPTION("BPF-programmable network device"); MODULE_AUTHOR("Daniel Borkmann "); MODULE_AUTHOR("Nikolay Aleksandrov "); MODULE_LICENSE("GPL"); -MODULE_ALIAS_RTNL_LINK(NETKIT_DRV_NAME); +MODULE_ALIAS_RTNL_LINK(DRV_NAME); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4d146c000e21..d99b0fbc1942 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3400,17 +3400,11 @@ static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) int register_netdevice(struct net_device *dev); void unregister_netdevice_queue(struct net_device *dev, struct list_head *head); void unregister_netdevice_many(struct list_head *head); - static inline void unregister_netdevice(struct net_device *dev) { unregister_netdevice_queue(dev, NULL); } -static inline bool unregister_netdevice_queued(const struct net_device *dev) -{ - return !list_empty(&dev->unreg_list); -} - int netdev_refcnt_read(const struct net_device *dev); void free_netdev(struct net_device *dev); diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index 81dc7cb2360c..b55d3b9cb9c2 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -130,11 +130,6 @@ void netdev_stat_queue_sum(struct net_device *netdev, * @ndo_queue_get_dma_dev: Get dma device for zero-copy operations to be used * for this queue. Return NULL on error. * - * @ndo_queue_create: Create a new RX queue which can be leased to another queue. - * Ops on this queue are redirected to the leased queue e.g. - * when opening a memory provider. Return the new queue id on - * success. Return negative error code on failure. - * * Note that @ndo_queue_mem_alloc and @ndo_queue_mem_free may be called while * the interface is closed. @ndo_queue_start and @ndo_queue_stop will only * be called for an interface which is open. @@ -154,12 +149,9 @@ struct netdev_queue_mgmt_ops { int idx); struct device * (*ndo_queue_get_dma_dev)(struct net_device *dev, int idx); - int (*ndo_queue_create)(struct net_device *dev); }; -bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx); -bool netif_rxq_has_mp(struct net_device *dev, unsigned int rxq_idx); -bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx); +bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx); /** * DOC: Lockless queue stopping / waking helpers. @@ -348,10 +340,5 @@ static inline unsigned int netif_xmit_timeout_ms(struct netdev_queue *txq) }) struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx); -bool netdev_can_create_queue(const struct net_device *dev, - struct netlink_ext_ack *extack); -bool netdev_can_lease_queue(const struct net_device *dev, - struct netlink_ext_ack *extack); -bool netdev_queue_busy(struct net_device *dev, int idx, - struct netlink_ext_ack *extack); -#endif /* _LINUX_NET_QUEUES_H */ + +#endif diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index 508d11afaecb..8cdcd138b33f 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -28,8 +28,6 @@ struct netdev_rx_queue { #endif struct napi_struct *napi; struct pp_memory_provider_params mp_params; - struct netdev_rx_queue *lease; - netdevice_tracker lease_tracker; } ____cacheline_aligned_in_smp; /* @@ -59,22 +57,5 @@ get_netdev_rx_queue_index(struct netdev_rx_queue *queue) } int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq); -void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst, - struct netdev_rx_queue *rxq_src); -void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst, - struct netdev_rx_queue *rxq_src); -bool netif_rx_queue_lease_get_owner(struct net_device **dev, unsigned int *rxq); -enum netif_lease_dir { - NETIF_VIRT_TO_PHYS, - NETIF_PHYS_TO_VIRT, -}; - -struct netdev_rx_queue * -__netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq, - enum netif_lease_dir dir); -struct netdev_rx_queue * -netif_get_rx_queue_lease_locked(struct net_device **dev, unsigned int *rxq); -void netif_put_rx_queue_lease_locked(struct net_device *orig_dev, - struct net_device *dev); -#endif /* _LINUX_NETDEV_RX_QUEUE_H */ +#endif diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h index b6f811c3416b..ada4f968960a 100644 --- a/include/net/page_pool/memory_provider.h +++ b/include/net/page_pool/memory_provider.h @@ -23,12 +23,12 @@ bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr); void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov); void net_mp_niov_clear_page_pool(struct net_iov *niov); -int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, +int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, struct pp_memory_provider_params *p); int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, const struct pp_memory_provider_params *p, struct netlink_ext_ack *extack); -void net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, +void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, struct pp_memory_provider_params *old_p); void __net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, const struct pp_memory_provider_params *old_p); diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index c07cfb431eac..242e34f771cc 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -28,7 +28,7 @@ void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries); bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max); void xsk_tx_release(struct xsk_buff_pool *pool); -struct xsk_buff_pool *xsk_get_pool_from_qid(const struct net_device *dev, +struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, u16 queue_id); void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool); void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index bbd565757298..3b491d96e52e 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1296,11 +1296,6 @@ enum netkit_mode { NETKIT_L3, }; -enum netkit_pairing { - NETKIT_DEVICE_PAIR, - NETKIT_DEVICE_SINGLE, -}; - /* NETKIT_SCRUB_NONE leaves clearing skb->{mark,priority} up to * the BPF program if attached. This also means the latter can * consume the two fields if they were populated earlier. @@ -1325,7 +1320,6 @@ enum { IFLA_NETKIT_PEER_SCRUB, IFLA_NETKIT_HEADROOM, IFLA_NETKIT_TAILROOM, - IFLA_NETKIT_PAIRING, __IFLA_NETKIT_MAX, }; #define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1) diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 7df1056a35fd..e0b579a1df4f 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -160,7 +160,6 @@ enum { NETDEV_A_QUEUE_DMABUF, NETDEV_A_QUEUE_IO_URING, NETDEV_A_QUEUE_XSK, - NETDEV_A_QUEUE_LEASE, __NETDEV_A_QUEUE_MAX, NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) @@ -203,15 +202,6 @@ enum { NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1) }; -enum { - NETDEV_A_LEASE_IFINDEX = 1, - NETDEV_A_LEASE_QUEUE, - NETDEV_A_LEASE_NETNS_ID, - - __NETDEV_A_LEASE_MAX, - NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1) -}; - enum { NETDEV_A_DMABUF_IFINDEX = 1, NETDEV_A_DMABUF_QUEUES, @@ -238,7 +228,6 @@ enum { NETDEV_CMD_BIND_RX, NETDEV_CMD_NAPI_SET, NETDEV_CMD_BIND_TX, - NETDEV_CMD_QUEUE_CREATE, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) diff --git a/net/core/dev.c b/net/core/dev.c index 13a3de63a825..2661b68f5be3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1114,13 +1114,6 @@ netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex) return __netdev_put_lock_ops_compat(dev, net); } -struct net_device * -netdev_put_lock(struct net_device *dev, netdevice_tracker *tracker) -{ - netdev_tracker_free(dev, tracker); - return __netdev_put_lock(dev, dev_net(dev)); -} - struct net_device * netdev_xa_find_lock(struct net *net, struct net_device *dev, unsigned long *index) diff --git a/net/core/dev.h b/net/core/dev.h index 9bcb76b325d0..da18536cbd35 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -30,8 +30,6 @@ netdev_napi_by_id_lock(struct net *net, unsigned int napi_id); struct net_device *dev_get_by_napi_id(unsigned int napi_id); struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net); -struct net_device *netdev_put_lock(struct net_device *dev, - netdevice_tracker *tracker); struct net_device * netdev_xa_find_lock(struct net *net, struct net_device *dev, unsigned long *index); diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index 52ba99c019e7..ba673e81716f 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -28,12 +28,6 @@ static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range }; /* Common nested types */ -const struct nla_policy netdev_lease_nl_policy[NETDEV_A_LEASE_NETNS_ID + 1] = { - [NETDEV_A_LEASE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), - [NETDEV_A_LEASE_QUEUE] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy), - [NETDEV_A_LEASE_NETNS_ID] = { .type = NLA_S32, }, -}; - const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = { [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range), [NETDEV_A_PAGE_POOL_IFINDEX] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_page_pool_ifindex_range), @@ -113,13 +107,6 @@ static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1] [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, }, }; -/* NETDEV_CMD_QUEUE_CREATE - do */ -static const struct nla_policy netdev_queue_create_nl_policy[NETDEV_A_QUEUE_LEASE + 1] = { - [NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), - [NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1), - [NETDEV_A_QUEUE_LEASE] = NLA_POLICY_NESTED(netdev_lease_nl_policy), -}; - /* Ops table for netdev */ static const struct genl_split_ops netdev_nl_ops[] = { { @@ -218,13 +205,6 @@ static const struct genl_split_ops netdev_nl_ops[] = { .maxattr = NETDEV_A_DMABUF_FD, .flags = GENL_CMD_CAP_DO, }, - { - .cmd = NETDEV_CMD_QUEUE_CREATE, - .doit = netdev_nl_queue_create_doit, - .policy = netdev_queue_create_nl_policy, - .maxattr = NETDEV_A_QUEUE_LEASE, - .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, - }, }; static const struct genl_multicast_group netdev_nl_mcgrps[] = { diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index d71b435d72c1..cffc08517a41 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -14,7 +14,6 @@ #include /* Common nested types */ -extern const struct nla_policy netdev_lease_nl_policy[NETDEV_A_LEASE_NETNS_ID + 1]; extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1]; extern const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1]; @@ -37,7 +36,6 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info); -int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info); enum { NETDEV_NLGRP_MGMT, diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 51c830f88f10..470fabbeacd9 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -391,11 +391,8 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { struct pp_memory_provider_params *params; - struct net_device *orig_netdev = netdev; - struct nlattr *nest_lease, *nest_queue; struct netdev_rx_queue *rxq; struct netdev_queue *txq; - u32 lease_q_idx = q_idx; void *hdr; hdr = genlmsg_iput(rsp, info); @@ -413,37 +410,6 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, if (nla_put_napi_id(rsp, rxq->napi)) goto nla_put_failure; - if (netif_rx_queue_lease_get_owner(&netdev, &lease_q_idx)) { - struct net *net, *peer_net; - - nest_lease = nla_nest_start(rsp, NETDEV_A_QUEUE_LEASE); - if (!nest_lease) - goto nla_put_failure; - nest_queue = nla_nest_start(rsp, NETDEV_A_LEASE_QUEUE); - if (!nest_queue) - goto nla_put_failure; - if (nla_put_u32(rsp, NETDEV_A_QUEUE_ID, lease_q_idx)) - goto nla_put_failure; - if (nla_put_u32(rsp, NETDEV_A_QUEUE_TYPE, q_type)) - goto nla_put_failure; - nla_nest_end(rsp, nest_queue); - if (nla_put_u32(rsp, NETDEV_A_LEASE_IFINDEX, - READ_ONCE(netdev->ifindex))) - goto nla_put_failure; - rcu_read_lock(); - peer_net = dev_net_rcu(netdev); - net = dev_net_rcu(orig_netdev); - if (!net_eq(net, peer_net)) { - s32 id = peernet2id_alloc(net, peer_net, GFP_ATOMIC); - - if (nla_put_s32(rsp, NETDEV_A_LEASE_NETNS_ID, id)) - goto nla_put_failure_unlock; - } - rcu_read_unlock(); - nla_nest_end(rsp, nest_lease); - netdev = orig_netdev; - } - params = &rxq->mp_params; if (params->mp_ops && params->mp_ops->nl_fill(params->mp_priv, rsp, rxq)) @@ -471,8 +437,6 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, return 0; -nla_put_failure_unlock: - rcu_read_unlock(); nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; @@ -1156,155 +1120,6 @@ err_genlmsg_free: return err; } -int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info) -{ - const int qmaxtype = ARRAY_SIZE(netdev_queue_id_nl_policy) - 1; - const int lmaxtype = ARRAY_SIZE(netdev_lease_nl_policy) - 1; - int err, ifindex, ifindex_lease, queue_id, queue_id_lease; - struct nlattr *qtb[ARRAY_SIZE(netdev_queue_id_nl_policy)]; - struct nlattr *ltb[ARRAY_SIZE(netdev_lease_nl_policy)]; - struct netdev_rx_queue *rxq, *rxq_lease; - struct net_device *dev, *dev_lease; - netdevice_tracker dev_tracker; - struct nlattr *nest; - struct sk_buff *rsp; - void *hdr; - - if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_IFINDEX) || - GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_TYPE) || - GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_LEASE)) - return -EINVAL; - if (nla_get_u32(info->attrs[NETDEV_A_QUEUE_TYPE]) != - NETDEV_QUEUE_TYPE_RX) { - NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_QUEUE_TYPE]); - return -EINVAL; - } - - ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); - - nest = info->attrs[NETDEV_A_QUEUE_LEASE]; - err = nla_parse_nested(ltb, lmaxtype, nest, - netdev_lease_nl_policy, info->extack); - if (err < 0) - return err; - if (NL_REQ_ATTR_CHECK(info->extack, nest, ltb, NETDEV_A_LEASE_IFINDEX) || - NL_REQ_ATTR_CHECK(info->extack, nest, ltb, NETDEV_A_LEASE_QUEUE)) - return -EINVAL; - if (ltb[NETDEV_A_LEASE_NETNS_ID]) { - NL_SET_BAD_ATTR(info->extack, ltb[NETDEV_A_LEASE_NETNS_ID]); - return -EINVAL; - } - - ifindex_lease = nla_get_u32(ltb[NETDEV_A_LEASE_IFINDEX]); - - nest = ltb[NETDEV_A_LEASE_QUEUE]; - err = nla_parse_nested(qtb, qmaxtype, nest, - netdev_queue_id_nl_policy, info->extack); - if (err < 0) - return err; - if (NL_REQ_ATTR_CHECK(info->extack, nest, qtb, NETDEV_A_QUEUE_ID) || - NL_REQ_ATTR_CHECK(info->extack, nest, qtb, NETDEV_A_QUEUE_TYPE)) - return -EINVAL; - if (nla_get_u32(qtb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) { - NL_SET_BAD_ATTR(info->extack, qtb[NETDEV_A_QUEUE_TYPE]); - return -EINVAL; - } - if (ifindex == ifindex_lease) { - NL_SET_ERR_MSG(info->extack, - "Lease ifindex cannot be the same as queue creation ifindex"); - return -EINVAL; - } - - queue_id_lease = nla_get_u32(qtb[NETDEV_A_QUEUE_ID]); - - rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!rsp) - return -ENOMEM; - - hdr = genlmsg_iput(rsp, info); - if (!hdr) { - err = -EMSGSIZE; - goto err_genlmsg_free; - } - - /* Locking order is always from the virtual to the physical device - * since this is also the same order when applications open the - * memory provider later on. - */ - dev = netdev_get_by_index_lock(genl_info_net(info), ifindex); - if (!dev) { - err = -ENODEV; - goto err_genlmsg_free; - } - if (!netdev_can_create_queue(dev, info->extack)) { - err = -EINVAL; - goto err_unlock_dev; - } - - dev_lease = netdev_get_by_index(genl_info_net(info), ifindex_lease, - &dev_tracker, GFP_KERNEL); - if (!dev_lease) { - err = -ENODEV; - goto err_unlock_dev; - } - if (!netdev_can_lease_queue(dev_lease, info->extack)) { - netdev_put(dev_lease, &dev_tracker); - err = -EINVAL; - goto err_unlock_dev; - } - - dev_lease = netdev_put_lock(dev_lease, &dev_tracker); - if (!dev_lease) { - err = -ENODEV; - goto err_unlock_dev; - } - if (queue_id_lease >= dev_lease->real_num_rx_queues) { - err = -ERANGE; - NL_SET_BAD_ATTR(info->extack, qtb[NETDEV_A_QUEUE_ID]); - goto err_unlock_dev_lease; - } - if (netdev_queue_busy(dev_lease, queue_id_lease, info->extack)) { - err = -EBUSY; - goto err_unlock_dev_lease; - } - - rxq_lease = __netif_get_rx_queue(dev_lease, queue_id_lease); - rxq = __netif_get_rx_queue(dev, dev->real_num_rx_queues - 1); - - if (rxq->lease && rxq->lease->dev != dev_lease) { - err = -EOPNOTSUPP; - NL_SET_ERR_MSG(info->extack, - "Leasing multiple queues from different devices not supported"); - goto err_unlock_dev_lease; - } - - err = queue_id = dev->queue_mgmt_ops->ndo_queue_create(dev); - if (err < 0) { - NL_SET_ERR_MSG(info->extack, - "Device is unable to create a new queue"); - goto err_unlock_dev_lease; - } - - rxq = __netif_get_rx_queue(dev, queue_id); - netdev_rx_queue_lease(rxq, rxq_lease); - - nla_put_u32(rsp, NETDEV_A_QUEUE_ID, queue_id); - genlmsg_end(rsp, hdr); - - netdev_unlock(dev_lease); - netdev_unlock(dev); - - return genlmsg_reply(rsp, info); - -err_unlock_dev_lease: - netdev_unlock(dev_lease); -err_unlock_dev: - netdev_unlock(dev); -err_genlmsg_free: - nlmsg_free(rsp); - return err; -} - void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv) { INIT_LIST_HEAD(&priv->bindings); diff --git a/net/core/netdev_queues.c b/net/core/netdev_queues.c index 97acf6440829..251f27a8307f 100644 --- a/net/core/netdev_queues.c +++ b/net/core/netdev_queues.c @@ -1,37 +1,22 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include -#include -#include /** * netdev_queue_get_dma_dev() - get dma device for zero-copy operations * @dev: net_device * @idx: queue index * - * Get dma device for zero-copy operations to be used for this queue. If the - * queue is leased to a physical queue, we retrieve the latter's dma device. + * Get dma device for zero-copy operations to be used for this queue. * When such device is not available or valid, the function will return NULL. * * Return: Device or NULL on error */ struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx) { - const struct netdev_queue_mgmt_ops *queue_ops; + const struct netdev_queue_mgmt_ops *queue_ops = dev->queue_mgmt_ops; struct device *dma_dev; - if (idx < dev->real_num_rx_queues) { - struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx); - - if (rxq->lease) { - rxq = rxq->lease; - dev = rxq->dev; - idx = get_netdev_rx_queue_index(rxq); - } - } - - queue_ops = dev->queue_mgmt_ops; - if (queue_ops && queue_ops->ndo_queue_get_dma_dev) dma_dev = queue_ops->ndo_queue_get_dma_dev(dev, idx); else @@ -40,58 +25,3 @@ struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx) return dma_dev && dma_dev->dma_mask ? dma_dev : NULL; } -bool netdev_can_create_queue(const struct net_device *dev, - struct netlink_ext_ack *extack) -{ - if (dev->dev.parent) { - NL_SET_ERR_MSG(extack, "Device is not a virtual device"); - return false; - } - if (!dev->queue_mgmt_ops || - !dev->queue_mgmt_ops->ndo_queue_create) { - NL_SET_ERR_MSG(extack, "Device does not support queue creation"); - return false; - } - if (dev->real_num_rx_queues < 1 || - dev->real_num_tx_queues < 1) { - NL_SET_ERR_MSG(extack, "Device must have at least one real queue"); - return false; - } - return true; -} - -bool netdev_can_lease_queue(const struct net_device *dev, - struct netlink_ext_ack *extack) -{ - if (!dev->dev.parent) { - NL_SET_ERR_MSG(extack, "Lease device is a virtual device"); - return false; - } - if (!netif_device_present(dev)) { - NL_SET_ERR_MSG(extack, "Lease device has been removed from the system"); - return false; - } - if (!dev->queue_mgmt_ops) { - NL_SET_ERR_MSG(extack, "Lease device does not support queue management operations"); - return false; - } - return true; -} - -bool netdev_queue_busy(struct net_device *dev, int idx, - struct netlink_ext_ack *extack) -{ - if (netif_rxq_is_leased(dev, idx)) { - NL_SET_ERR_MSG(extack, "Lease device queue is already leased"); - return true; - } - if (xsk_get_pool_from_qid(dev, idx)) { - NL_SET_ERR_MSG(extack, "Lease device queue in use by AF_XDP"); - return true; - } - if (netif_rxq_has_mp(dev, idx)) { - NL_SET_ERR_MSG(extack, "Lease device queue in use by memory provider"); - return true; - } - return false; -} diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index 75c7a68cb90d..c7d9341b7630 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -9,120 +9,14 @@ #include "page_pool_priv.h" -void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst, - struct netdev_rx_queue *rxq_src) -{ - netdev_assert_locked(rxq_src->dev); - netdev_assert_locked(rxq_dst->dev); - - netdev_hold(rxq_src->dev, &rxq_src->lease_tracker, GFP_KERNEL); - - WRITE_ONCE(rxq_src->lease, rxq_dst); - WRITE_ONCE(rxq_dst->lease, rxq_src); -} - -void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst, - struct netdev_rx_queue *rxq_src) -{ - netdev_assert_locked(rxq_dst->dev); - netdev_assert_locked(rxq_src->dev); - - WRITE_ONCE(rxq_src->lease, NULL); - WRITE_ONCE(rxq_dst->lease, NULL); - - netdev_put(rxq_src->dev, &rxq_src->lease_tracker); -} - -bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx) -{ - if (rxq_idx < dev->real_num_rx_queues) - return READ_ONCE(__netif_get_rx_queue(dev, rxq_idx)->lease); - return false; -} - -static bool netif_lease_dir_ok(const struct net_device *dev, - enum netif_lease_dir dir) -{ - if (dir == NETIF_VIRT_TO_PHYS && !dev->dev.parent) - return true; - if (dir == NETIF_PHYS_TO_VIRT && dev->dev.parent) - return true; - return false; -} - -struct netdev_rx_queue * -__netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq_idx, - enum netif_lease_dir dir) -{ - struct net_device *orig_dev = *dev; - struct netdev_rx_queue *rxq = __netif_get_rx_queue(orig_dev, *rxq_idx); - - if (rxq->lease) { - if (!netif_lease_dir_ok(orig_dev, dir)) - return NULL; - rxq = rxq->lease; - *rxq_idx = get_netdev_rx_queue_index(rxq); - *dev = rxq->dev; - } - return rxq; -} - -struct netdev_rx_queue * -netif_get_rx_queue_lease_locked(struct net_device **dev, unsigned int *rxq_idx) -{ - struct net_device *orig_dev = *dev; - struct netdev_rx_queue *rxq; - - /* Locking order is always from the virtual to the physical device - * see netdev_nl_queue_create_doit(). - */ - netdev_ops_assert_locked(orig_dev); - rxq = __netif_get_rx_queue_lease(dev, rxq_idx, NETIF_VIRT_TO_PHYS); - if (rxq && orig_dev != *dev) - netdev_lock(*dev); - return rxq; -} - -void netif_put_rx_queue_lease_locked(struct net_device *orig_dev, - struct net_device *dev) -{ - if (orig_dev != dev) - netdev_unlock(dev); -} - -bool netif_rx_queue_lease_get_owner(struct net_device **dev, - unsigned int *rxq_idx) -{ - struct net_device *orig_dev = *dev; - struct netdev_rx_queue *rxq; - - /* The physical device needs to be locked. If there is indeed a lease, - * then the virtual device holds a reference on the physical device - * and the lease stays active until the virtual device is torn down. - * When queues get {un,}leased both devices are always locked. - */ - netdev_ops_assert_locked(orig_dev); - rxq = __netif_get_rx_queue_lease(dev, rxq_idx, NETIF_PHYS_TO_VIRT); - if (rxq && orig_dev != *dev) - return true; - return false; -} - /* See also page_pool_is_unreadable() */ -bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx) +bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx) { - if (rxq_idx < dev->real_num_rx_queues) - return __netif_get_rx_queue(dev, rxq_idx)->mp_params.mp_ops; - return false; -} -EXPORT_SYMBOL(netif_rxq_has_unreadable_mp); + struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx); -bool netif_rxq_has_mp(struct net_device *dev, unsigned int rxq_idx) -{ - if (rxq_idx < dev->real_num_rx_queues) - return __netif_get_rx_queue(dev, rxq_idx)->mp_params.mp_priv; - return false; + return !!rxq->mp_params.mp_ops; } +EXPORT_SYMBOL(netif_rxq_has_unreadable_mp); int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) { @@ -206,63 +100,49 @@ int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, const struct pp_memory_provider_params *p, struct netlink_ext_ack *extack) { - struct net_device *orig_dev = dev; struct netdev_rx_queue *rxq; int ret; if (!netdev_need_ops_lock(dev)) return -EOPNOTSUPP; + if (rxq_idx >= dev->real_num_rx_queues) { NL_SET_ERR_MSG(extack, "rx queue index out of range"); return -ERANGE; } - rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues); - rxq = netif_get_rx_queue_lease_locked(&dev, &rxq_idx); - if (!rxq) { - NL_SET_ERR_MSG(extack, "rx queue peered to a virtual netdev"); - return -EBUSY; - } - if (!dev->dev.parent) { - NL_SET_ERR_MSG(extack, "rx queue is mapped to a virtual netdev"); - ret = -EBUSY; - goto out; - } + if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) { NL_SET_ERR_MSG(extack, "tcp-data-split is disabled"); - ret = -EINVAL; - goto out; + return -EINVAL; } if (dev->cfg->hds_thresh) { NL_SET_ERR_MSG(extack, "hds-thresh is not zero"); - ret = -EINVAL; - goto out; + return -EINVAL; } if (dev_xdp_prog_count(dev)) { NL_SET_ERR_MSG(extack, "unable to custom memory provider to device with XDP program attached"); - ret = -EEXIST; - goto out; + return -EEXIST; } + + rxq = __netif_get_rx_queue(dev, rxq_idx); if (rxq->mp_params.mp_ops) { NL_SET_ERR_MSG(extack, "designated queue already memory provider bound"); - ret = -EEXIST; - goto out; + return -EEXIST; } #ifdef CONFIG_XDP_SOCKETS if (rxq->pool) { NL_SET_ERR_MSG(extack, "designated queue already in use by AF_XDP"); - ret = -EBUSY; - goto out; + return -EBUSY; } #endif + rxq->mp_params = *p; ret = netdev_rx_queue_restart(dev, rxq_idx); if (ret) { rxq->mp_params.mp_ops = NULL; rxq->mp_params.mp_priv = NULL; } -out: - netif_put_rx_queue_lease_locked(orig_dev, dev); return ret; } @@ -277,43 +157,38 @@ int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, return ret; } -void __net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, +void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, const struct pp_memory_provider_params *old_p) { - struct net_device *orig_dev = dev; struct netdev_rx_queue *rxq; int err; - if (WARN_ON_ONCE(rxq_idx >= dev->real_num_rx_queues)) + if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues)) return; - rxq = netif_get_rx_queue_lease_locked(&dev, &rxq_idx); - if (WARN_ON_ONCE(!rxq)) - return; + rxq = __netif_get_rx_queue(dev, ifq_idx); /* Callers holding a netdev ref may get here after we already * went thru shutdown via dev_memory_provider_uninstall(). */ if (dev->reg_state > NETREG_REGISTERED && !rxq->mp_params.mp_ops) - goto out; + return; if (WARN_ON_ONCE(rxq->mp_params.mp_ops != old_p->mp_ops || rxq->mp_params.mp_priv != old_p->mp_priv)) - goto out; + return; rxq->mp_params.mp_ops = NULL; rxq->mp_params.mp_priv = NULL; - err = netdev_rx_queue_restart(dev, rxq_idx); + err = netdev_rx_queue_restart(dev, ifq_idx); WARN_ON(err && err != -ENETDOWN); -out: - netif_put_rx_queue_lease_locked(orig_dev, dev); } -void net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, +void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, struct pp_memory_provider_params *old_p) { netdev_lock(dev); - __net_mp_close_rxq(dev, rxq_idx, old_p); + __net_mp_close_rxq(dev, ifq_idx, old_p); netdev_unlock(dev); } diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c index 797d2a08c515..ca4f80282448 100644 --- a/net/ethtool/channels.c +++ b/net/ethtool/channels.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only -#include +#include #include "netlink.h" #include "common.h" @@ -169,16 +169,14 @@ ethnl_set_channels(struct ethnl_req_info *req_info, struct genl_info *info) if (ret) return ret; - /* ensure channels are not busy at the moment */ + /* Disabling channels, query zero-copy AF_XDP sockets */ from_channel = channels.combined_count + min(channels.rx_count, channels.tx_count); - for (i = from_channel; i < old_total; i++) { - if (netdev_queue_busy(dev, i, NULL)) { - GENL_SET_ERR_MSG(info, - "requested channel counts are too low due to busy queues (AF_XDP or queue leasing)"); + for (i = from_channel; i < old_total; i++) + if (xsk_get_pool_from_qid(dev, i)) { + GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing zerocopy AF_XDP sockets"); return -EINVAL; } - } ret = dev->ethtool_ops->set_channels(dev, &channels); return ret < 0 ? ret : 1; diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 02a3454234d6..9431e305b233 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -27,13 +27,12 @@ #include #include #include -#include #include #include +#include #include #include -#include - +#include #include "common.h" /* State held across locks and calls for commands which have devlink fallback */ @@ -2283,12 +2282,12 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, if (ret) return ret; - /* Disabling channels, query busy queues (AF_XDP, queue leasing) */ + /* Disabling channels, query zero-copy AF_XDP sockets */ from_channel = channels.combined_count + min(channels.rx_count, channels.tx_count); to_channel = curr.combined_count + max(curr.rx_count, curr.tx_count); for (i = from_channel; i < to_channel; i++) - if (netdev_queue_busy(dev, i, NULL)) + if (xsk_get_pool_from_qid(dev, i)) return -EINVAL; ret = dev->ethtool_ops->set_channels(dev, &channels); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 92f791433725..3b46bc635c43 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -23,8 +23,6 @@ #include #include #include - -#include #include #include #include @@ -105,7 +103,7 @@ bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool) } EXPORT_SYMBOL(xsk_uses_need_wakeup); -struct xsk_buff_pool *xsk_get_pool_from_qid(const struct net_device *dev, +struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, u16 queue_id) { if (queue_id < dev->real_num_rx_queues) @@ -119,18 +117,10 @@ EXPORT_SYMBOL(xsk_get_pool_from_qid); void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) { - struct net_device *orig_dev = dev; - unsigned int id = queue_id; - - if (id < dev->real_num_rx_queues) - WARN_ON_ONCE(!netif_get_rx_queue_lease_locked(&dev, &id)); - - if (id < dev->real_num_rx_queues) - dev->_rx[id].pool = NULL; - if (id < dev->real_num_tx_queues) - dev->_tx[id].pool = NULL; - - netif_put_rx_queue_lease_locked(orig_dev, dev); + if (queue_id < dev->num_rx_queues) + dev->_rx[queue_id].pool = NULL; + if (queue_id < dev->num_tx_queues) + dev->_tx[queue_id].pool = NULL; } /* The buffer pool is stored both in the _rx struct and the _tx struct as we do @@ -140,29 +130,17 @@ void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, u16 queue_id) { - struct net_device *orig_dev = dev; - unsigned int id = queue_id; - int ret = 0; - - if (id >= max(dev->real_num_rx_queues, - dev->real_num_tx_queues)) + if (queue_id >= max_t(unsigned int, + dev->real_num_rx_queues, + dev->real_num_tx_queues)) return -EINVAL; - if (id < dev->real_num_rx_queues) { - if (!netif_get_rx_queue_lease_locked(&dev, &id)) - return -EBUSY; - if (xsk_get_pool_from_qid(dev, id)) { - ret = -EBUSY; - goto out; - } - } - if (id < dev->real_num_rx_queues) - dev->_rx[id].pool = pool; - if (id < dev->real_num_tx_queues) - dev->_tx[id].pool = pool; -out: - netif_put_rx_queue_lease_locked(orig_dev, dev); - return ret; + if (queue_id < dev->real_num_rx_queues) + dev->_rx[queue_id].pool = pool; + if (queue_id < dev->real_num_tx_queues) + dev->_tx[queue_id].pool = pool; + + return 0; } static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, @@ -346,37 +324,14 @@ static bool xsk_is_bound(struct xdp_sock *xs) return false; } -static bool xsk_dev_queue_valid(const struct xdp_sock *xs, - const struct xdp_rxq_info *info) -{ - struct net_device *dev = xs->dev; - u32 queue_index = xs->queue_id; - struct netdev_rx_queue *rxq; - - if (info->dev == dev && - info->queue_index == queue_index) - return true; - - if (queue_index < dev->real_num_rx_queues) { - rxq = READ_ONCE(__netif_get_rx_queue(dev, queue_index)->lease); - if (!rxq) - return false; - - dev = rxq->dev; - queue_index = get_netdev_rx_queue_index(rxq); - - return info->dev == dev && - info->queue_index == queue_index; - } - return false; -} - static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { if (!xsk_is_bound(xs)) return -ENXIO; - if (!xsk_dev_queue_valid(xs, xdp->rxq)) + + if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; + if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { xs->rx_dropped++; return -ENOSPC; diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 7df1056a35fd..e0b579a1df4f 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -160,7 +160,6 @@ enum { NETDEV_A_QUEUE_DMABUF, NETDEV_A_QUEUE_IO_URING, NETDEV_A_QUEUE_XSK, - NETDEV_A_QUEUE_LEASE, __NETDEV_A_QUEUE_MAX, NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) @@ -203,15 +202,6 @@ enum { NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1) }; -enum { - NETDEV_A_LEASE_IFINDEX = 1, - NETDEV_A_LEASE_QUEUE, - NETDEV_A_LEASE_NETNS_ID, - - __NETDEV_A_LEASE_MAX, - NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1) -}; - enum { NETDEV_A_DMABUF_IFINDEX = 1, NETDEV_A_DMABUF_QUEUES, @@ -238,7 +228,6 @@ enum { NETDEV_CMD_BIND_RX, NETDEV_CMD_NAPI_SET, NETDEV_CMD_BIND_TX, - NETDEV_CMD_QUEUE_CREATE, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) diff --git a/tools/testing/selftests/drivers/net/README.rst b/tools/testing/selftests/drivers/net/README.rst index b94e81c2e030..eb838ae94844 100644 --- a/tools/testing/selftests/drivers/net/README.rst +++ b/tools/testing/selftests/drivers/net/README.rst @@ -62,13 +62,6 @@ LOCAL_V4, LOCAL_V6, REMOTE_V4, REMOTE_V6 Local and remote endpoint IP addresses. -LOCAL_PREFIX_V4, LOCAL_PREFIX_V6 -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Local IP prefix/subnet which can be used to allocate extra IP addresses (for -network name spaces behind macvlan, veth, netkit devices). DUT must be -reachable using these addresses from the endpoint. - REMOTE_TYPE ~~~~~~~~~~~ diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index 39ad86d693b3..9c163ba6feee 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -32,8 +32,6 @@ TEST_PROGS = \ irq.py \ loopback.sh \ nic_timestamp.py \ - nk_netns.py \ - nk_qlease.py \ pp_alloc_fail.py \ rss_api.py \ rss_ctx.py \ diff --git a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py index 022008249313..d5d247eca6b7 100644 --- a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py +++ b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py @@ -3,7 +3,6 @@ """ Driver test environment (hardware-only tests). NetDrvEnv and NetDrvEpEnv are the main environment classes. -NetDrvContEnv extends NetDrvEpEnv with netkit container support. Former is for local host only tests, latter creates / connects to a remote endpoint. See NIPA wiki for more information about running and writing driver tests. @@ -30,7 +29,7 @@ try: from net.lib.py import ksft_eq, ksft_ge, ksft_in, ksft_is, ksft_lt, \ ksft_ne, ksft_not_in, ksft_raises, ksft_true, ksft_gt, ksft_not_none from drivers.net.lib.py import GenerateTraffic, Remote, Iperf3Runner - from drivers.net.lib.py import NetDrvEnv, NetDrvEpEnv, NetDrvContEnv + from drivers.net.lib.py import NetDrvEnv, NetDrvEpEnv __all__ = ["NetNS", "NetNSEnter", "NetdevSimDev", "EthtoolFamily", "NetdevFamily", "NetshaperFamily", @@ -45,8 +44,8 @@ try: "ksft_eq", "ksft_ge", "ksft_in", "ksft_is", "ksft_lt", "ksft_ne", "ksft_not_in", "ksft_raises", "ksft_true", "ksft_gt", "ksft_not_none", "ksft_not_none", - "NetDrvEnv", "NetDrvEpEnv", "NetDrvContEnv", "GenerateTraffic", - "Remote", "Iperf3Runner"] + "NetDrvEnv", "NetDrvEpEnv", "GenerateTraffic", "Remote", + "Iperf3Runner"] except ModuleNotFoundError as e: print("Failed importing `net` library from kernel sources") print(str(e)) diff --git a/tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c b/tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c deleted file mode 100644 index 86ebfc1445b6..000000000000 --- a/tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c +++ /dev/null @@ -1,49 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include - -#define TC_ACT_OK 0 -#define ETH_P_IPV6 0x86DD - -#define ctx_ptr(field) ((void *)(long)(field)) - -#define v6_p64_equal(a, b) (a.s6_addr32[0] == b.s6_addr32[0] && \ - a.s6_addr32[1] == b.s6_addr32[1]) - -volatile __u32 netkit_ifindex; -volatile __u8 ipv6_prefix[16]; - -SEC("tc/ingress") -int tc_redirect_peer(struct __sk_buff *skb) -{ - void *data_end = ctx_ptr(skb->data_end); - void *data = ctx_ptr(skb->data); - struct in6_addr *peer_addr; - struct ipv6hdr *ip6h; - struct ethhdr *eth; - - peer_addr = (struct in6_addr *)ipv6_prefix; - - if (skb->protocol != bpf_htons(ETH_P_IPV6)) - return TC_ACT_OK; - - eth = data; - if ((void *)(eth + 1) > data_end) - return TC_ACT_OK; - - ip6h = data + sizeof(struct ethhdr); - if ((void *)(ip6h + 1) > data_end) - return TC_ACT_OK; - - if (!v6_p64_equal(ip6h->daddr, (*peer_addr))) - return TC_ACT_OK; - - return bpf_redirect_peer(netkit_ifindex, 0); -} - -char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/drivers/net/hw/nk_netns.py b/tools/testing/selftests/drivers/net/hw/nk_netns.py deleted file mode 100755 index afa8638195d8..000000000000 --- a/tools/testing/selftests/drivers/net/hw/nk_netns.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-License-Identifier: GPL-2.0 - -from lib.py import ksft_run, ksft_exit -from lib.py import NetDrvContEnv -from lib.py import cmd - - -def test_ping(cfg) -> None: - cfg.require_ipver("6") - - cmd(f"ping -c 1 -W5 {cfg.nk_guest_ipv6}", host=cfg.remote) - cmd(f"ping -c 1 -W5 {cfg.remote_addr_v['6']}", ns=cfg.netns) - - -def main() -> None: - with NetDrvContEnv(__file__) as cfg: - ksft_run([test_ping], args=(cfg,)) - ksft_exit() - - -if __name__ == "__main__": - main() diff --git a/tools/testing/selftests/drivers/net/hw/nk_qlease.py b/tools/testing/selftests/drivers/net/hw/nk_qlease.py deleted file mode 100755 index 738a46d2d20c..000000000000 --- a/tools/testing/selftests/drivers/net/hw/nk_qlease.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-License-Identifier: GPL-2.0 - -import re -from os import path -from lib.py import ksft_run, ksft_exit -from lib.py import NetDrvContEnv -from lib.py import bkg, cmd, defer, ethtool, rand_port, wait_port_listen - - -def create_rss_ctx(cfg): - output = ethtool(f"-X {cfg.ifname} context new start {cfg.src_queue} equal 1").stdout - values = re.search(r'New RSS context is (\d+)', output).group(1) - return int(values) - - -def set_flow_rule(cfg): - output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} action {cfg.src_queue}").stdout - values = re.search(r'ID (\d+)', output).group(1) - return int(values) - - -def set_flow_rule_rss(cfg, rss_ctx_id): - output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} context {rss_ctx_id}").stdout - values = re.search(r'ID (\d+)', output).group(1) - return int(values) - - -def test_iou_zcrx(cfg) -> None: - cfg.require_ipver('6') - - ethtool(f"-X {cfg.ifname} equal {cfg.src_queue}") - defer(ethtool, f"-X {cfg.ifname} default") - - flow_rule_id = set_flow_rule(cfg) - defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") - - rx_cmd = f"ip netns exec {cfg.netns.name} {cfg.bin_local} -s -p {cfg.port} -i {cfg._nk_guest_ifname} -q {cfg.nk_queue}" - tx_cmd = f"{cfg.bin_remote} -c -h {cfg.nk_guest_ipv6} -p {cfg.port} -l 12840" - with bkg(rx_cmd, exit_wait=True): - wait_port_listen(cfg.port, proto="tcp", ns=cfg.netns) - cmd(tx_cmd, host=cfg.remote) - - -def main() -> None: - with NetDrvContEnv(__file__, lease=True) as cfg: - cfg.bin_local = path.abspath(path.dirname(__file__) + "/../../../drivers/net/hw/iou-zcrx") - cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) - cfg.port = rand_port() - ksft_run([test_iou_zcrx], args=(cfg,)) - ksft_exit() - - -if __name__ == "__main__": - main() diff --git a/tools/testing/selftests/drivers/net/lib/py/__init__.py b/tools/testing/selftests/drivers/net/lib/py/__init__.py index be3a8a936882..8b75faa9af6d 100644 --- a/tools/testing/selftests/drivers/net/lib/py/__init__.py +++ b/tools/testing/selftests/drivers/net/lib/py/__init__.py @@ -3,7 +3,6 @@ """ Driver test environment. NetDrvEnv and NetDrvEpEnv are the main environment classes. -NetDrvContEnv extends NetDrvEpEnv with netkit container support. Former is for local host only tests, latter creates / connects to a remote endpoint. See NIPA wiki for more information about running and writing driver tests. @@ -44,12 +43,12 @@ try: "ksft_ne", "ksft_not_in", "ksft_raises", "ksft_true", "ksft_gt", "ksft_not_none", "ksft_not_none"] - from .env import NetDrvEnv, NetDrvEpEnv, NetDrvContEnv + from .env import NetDrvEnv, NetDrvEpEnv from .load import GenerateTraffic, Iperf3Runner from .remote import Remote - __all__ += ["NetDrvEnv", "NetDrvEpEnv", "NetDrvContEnv", "GenerateTraffic", - "Remote", "Iperf3Runner"] + __all__ += ["NetDrvEnv", "NetDrvEpEnv", "GenerateTraffic", "Remote", + "Iperf3Runner"] except ModuleNotFoundError as e: print("Failed importing `net` library from kernel sources") print(str(e)) diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index 7066d78395c6..41cc248ac848 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -1,17 +1,13 @@ # SPDX-License-Identifier: GPL-2.0 -import ipaddress import os -import re import time from pathlib import Path from lib.py import KsftSkipEx, KsftXfailEx from lib.py import ksft_setup, wait_file from lib.py import cmd, ethtool, ip, CmdExitFailure from lib.py import NetNS, NetdevSimDev -from lib.py import NetdevFamily, EthtoolFamily from .remote import Remote -from . import bpftool class NetDrvEnvBase: @@ -293,156 +289,3 @@ class NetDrvEpEnv(NetDrvEnvBase): data.get('stats-block-usecs', 0) / 1000 / 1000 time.sleep(self._stats_settle_time) - - -class NetDrvContEnv(NetDrvEpEnv): - """ - Class for an environment with a netkit pair setup for forwarding traffic - between the physical interface and a network namespace. - """ - - def __init__(self, src_path, lease=False, **kwargs): - super().__init__(src_path, **kwargs) - - self.require_ipver("6") - local_prefix = self.env.get("LOCAL_PREFIX_V6") - if not local_prefix: - raise KsftSkipEx("LOCAL_PREFIX_V6 required") - - self.netdevnl = NetdevFamily() - self.ethnl = EthtoolFamily() - - local_prefix = local_prefix.rstrip("/64").rstrip("::").rstrip(":") - self.ipv6_prefix = f"{local_prefix}::" - self.nk_host_ipv6 = f"{local_prefix}::2:1" - self.nk_guest_ipv6 = f"{local_prefix}::2:2" - - self.netns = None - self._nk_host_ifname = None - self._nk_guest_ifname = None - self._tc_attached = False - self._bpf_prog_pref = None - self._bpf_prog_id = None - self._leased = False - - nk_rxqueues = 1 - if lease: - nk_rxqueues = 2 - ip(f"link add type netkit mode l2 forward peer forward numrxqueues {nk_rxqueues}") - - all_links = ip("-d link show", json=True) - netkit_links = [link for link in all_links - if link.get('linkinfo', {}).get('info_kind') == 'netkit' - and 'UP' not in link.get('flags', [])] - - if len(netkit_links) != 2: - raise KsftSkipEx("Failed to create netkit pair") - - netkit_links.sort(key=lambda x: x['ifindex']) - self._nk_host_ifname = netkit_links[1]['ifname'] - self._nk_guest_ifname = netkit_links[0]['ifname'] - self.nk_host_ifindex = netkit_links[1]['ifindex'] - self.nk_guest_ifindex = netkit_links[0]['ifindex'] - - if lease: - self._lease_queues() - - self._setup_ns() - self._attach_bpf() - - def __del__(self): - if self._tc_attached: - cmd(f"tc filter del dev {self.ifname} ingress pref {self._bpf_prog_pref}") - self._tc_attached = False - - if self._nk_host_ifname: - cmd(f"ip link del dev {self._nk_host_ifname}") - self._nk_host_ifname = None - self._nk_guest_ifname = None - - if self.netns: - del self.netns - self.netns = None - - if self._leased: - self.ethnl.rings_set({'header': {'dev-index': self.ifindex}, - 'tcp-data-split': 'unknown', - 'hds-thresh': self._hds_thresh, - 'rx': self._rx_rings}) - self._leased = False - - super().__del__() - - def _lease_queues(self): - channels = self.ethnl.channels_get({'header': {'dev-index': self.ifindex}}) - channels = channels['combined-count'] - if channels < 2: - raise KsftSkipEx('Test requires NETIF with at least 2 combined channels') - - rings = self.ethnl.rings_get({'header': {'dev-index': self.ifindex}}) - self._rx_rings = rings['rx'] - self._hds_thresh = rings.get('hds-thresh', 0) - self.ethnl.rings_set({'header': {'dev-index': self.ifindex}, - 'tcp-data-split': 'enabled', - 'hds-thresh': 0, - 'rx': 64}) - self.src_queue = channels - 1 - bind_result = self.netdevnl.queue_create( - { - "ifindex": self.nk_guest_ifindex, - "type": "rx", - "lease": { - "ifindex": self.ifindex, - "queue": {"id": self.src_queue, "type": "rx"}, - }, - } - ) - self.nk_queue = bind_result['id'] - self._leased = True - - def _setup_ns(self): - self.netns = NetNS() - ip(f"link set dev {self._nk_guest_ifname} netns {self.netns.name}") - ip(f"link set dev {self._nk_host_ifname} up") - ip(f"-6 addr add fe80::1/64 dev {self._nk_host_ifname} nodad") - ip(f"-6 route add {self.nk_guest_ipv6}/128 via fe80::2 dev {self._nk_host_ifname}") - - ip("link set lo up", ns=self.netns) - ip(f"link set dev {self._nk_guest_ifname} up", ns=self.netns) - ip(f"-6 addr add fe80::2/64 dev {self._nk_guest_ifname}", ns=self.netns) - ip(f"-6 addr add {self.nk_guest_ipv6}/64 dev {self._nk_guest_ifname} nodad", ns=self.netns) - ip(f"-6 route add default via fe80::1 dev {self._nk_guest_ifname}", ns=self.netns) - - def _attach_bpf(self): - bpf_obj = self.test_dir / "nk_forward.bpf.o" - if not bpf_obj.exists(): - raise KsftSkipEx("BPF prog not found") - - cmd(f"tc filter add dev {self.ifname} ingress bpf obj {bpf_obj} sec tc/ingress direct-action") - self._tc_attached = True - - tc_info = cmd(f"tc filter show dev {self.ifname} ingress").stdout - match = re.search(r'pref (\d+).*nk_forward\.bpf.*id (\d+)', tc_info) - if not match: - raise Exception("Failed to get BPF prog ID") - self._bpf_prog_pref = int(match.group(1)) - self._bpf_prog_id = int(match.group(2)) - - prog_info = bpftool(f"prog show id {self._bpf_prog_id}", json=True) - map_ids = prog_info.get("map_ids", []) - - bss_map_id = None - for map_id in map_ids: - map_info = bpftool(f"map show id {map_id}", json=True) - if map_info.get("name").endswith("bss"): - bss_map_id = map_id - - if bss_map_id is None: - raise Exception("Failed to find .bss map") - - ipv6_addr = ipaddress.IPv6Address(self.ipv6_prefix) - ipv6_bytes = ipv6_addr.packed - ifindex_bytes = self.nk_host_ifindex.to_bytes(4, byteorder='little') - value = ipv6_bytes + ifindex_bytes - value_hex = ' '.join(f'{b:02x}' for b in value) - bpftool(f"map update id {bss_map_id} key hex 00 00 00 00 value hex {value_hex}") -- cgit v1.2.3 From 49743f27268ffbe2029d9c8fdfbd04d0869bd51d Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Fri, 16 Jan 2026 06:21:21 +0000 Subject: selftests: drv-net: extend HW timestamp test with ioctl Extend HW timestamp tests to check that ioctl interface is not broken and configuration setups and requests are equal to netlink interface. Some linter warnings are disabled because of ctypes classes. Reviewed-by: Kory Maincent Signed-off-by: Vadim Fedorenko Link: https://patch.msgid.link/20260116062121.1230184-2-vadim.fedorenko@linux.dev Signed-off-by: Jakub Kicinski --- .../selftests/drivers/net/hw/nic_timestamp.py | 128 +++++++++++++++++++-- 1 file changed, 120 insertions(+), 8 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/drivers/net/hw/nic_timestamp.py b/tools/testing/selftests/drivers/net/hw/nic_timestamp.py index c1e943d53f19..c632b41e7a23 100755 --- a/tools/testing/selftests/drivers/net/hw/nic_timestamp.py +++ b/tools/testing/selftests/drivers/net/hw/nic_timestamp.py @@ -1,15 +1,38 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 +# pylint: disable=locally-disabled, invalid-name, attribute-defined-outside-init, too-few-public-methods """ Tests related to configuration of HW timestamping """ import errno +import ctypes +import fcntl +import socket from lib.py import ksft_run, ksft_exit, ksft_ge, ksft_eq, KsftSkipEx from lib.py import NetDrvEnv, EthtoolFamily, NlError +SIOCSHWTSTAMP = 0x89b0 +SIOCGHWTSTAMP = 0x89b1 +class hwtstamp_config(ctypes.Structure): + """ Python copy of struct hwtstamp_config """ + _fields_ = [ + ("flags", ctypes.c_int), + ("tx_type", ctypes.c_int), + ("rx_filter", ctypes.c_int), + ] + + +class ifreq(ctypes.Structure): + """ Python copy of struct ifreq """ + _fields_ = [ + ("ifr_name", ctypes.c_char * 16), + ("ifr_data", ctypes.POINTER(hwtstamp_config)), + ] + + def __get_hwtimestamp_support(cfg): """ Retrieve supported configuration information """ @@ -31,8 +54,29 @@ def __get_hwtimestamp_support(cfg): return ctx +def __get_hwtimestamp_config_ioctl(cfg): + """ Retrieve current TS configuration information (via ioctl) """ + + config = hwtstamp_config() + + req = ifreq() + req.ifr_name = cfg.ifname.encode() + req.ifr_data = ctypes.pointer(config) + + try: + sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + fcntl.ioctl(sock.fileno(), SIOCGHWTSTAMP, req) + sock.close() + + except OSError as e: + if e.errno == errno.EOPNOTSUPP: + raise KsftSkipEx("timestamping configuration is not supported via ioctl") from e + raise + return config + + def __get_hwtimestamp_config(cfg): - """ Retrieve current TS configuration information """ + """ Retrieve current TS configuration information (via netLink) """ try: tscfg = cfg.ethnl.tsconfig_get({'header': {'dev-name': cfg.ifname}}) @@ -43,8 +87,27 @@ def __get_hwtimestamp_config(cfg): return tscfg +def __set_hwtimestamp_config_ioctl(cfg, ts): + """ Setup new TS configuration information (via ioctl) """ + config = hwtstamp_config() + config.rx_filter = ts['rx-filters']['bits']['bit'][0]['index'] + config.tx_type = ts['tx-types']['bits']['bit'][0]['index'] + req = ifreq() + req.ifr_name = cfg.ifname.encode() + req.ifr_data = ctypes.pointer(config) + try: + sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + fcntl.ioctl(sock.fileno(), SIOCSHWTSTAMP, req) + sock.close() + + except OSError as e: + if e.errno == errno.EOPNOTSUPP: + raise KsftSkipEx("timestamping configuration is not supported via ioctl") from e + raise + + def __set_hwtimestamp_config(cfg, ts): - """ Setup new TS configuration information """ + """ Setup new TS configuration information (via netlink) """ ts['header'] = {'dev-name': cfg.ifname} try: @@ -56,9 +119,9 @@ def __set_hwtimestamp_config(cfg, ts): return res -def test_hwtstamp_tx(cfg): +def __perform_hwtstamp_tx(cfg, is_ioctl): """ - Test TX timestamp configuration. + Test TX timestamp configuration via either netlink or ioctl. The driver should apply provided config and report back proper state. """ @@ -66,16 +129,37 @@ def test_hwtstamp_tx(cfg): ts = __get_hwtimestamp_support(cfg) tx = ts['tx'] for t in tx: + res = None tscfg = orig_tscfg tscfg['tx-types']['bits']['bit'] = [t] - res = __set_hwtimestamp_config(cfg, tscfg) + if is_ioctl: + __set_hwtimestamp_config_ioctl(cfg, tscfg) + else: + res = __set_hwtimestamp_config(cfg, tscfg) if res is None: res = __get_hwtimestamp_config(cfg) + resioctl = __get_hwtimestamp_config_ioctl(cfg) ksft_eq(res['tx-types']['bits']['bit'], [t]) + ksft_eq(resioctl.tx_type, t['index']) __set_hwtimestamp_config(cfg, orig_tscfg) +def test_hwtstamp_tx_netlink(cfg): + """ + Test TX timestamp configuration setup via netlink. + The driver should apply provided config and report back proper state. + """ + __perform_hwtstamp_tx(cfg, False) + + +def test_hwtstamp_tx_ioctl(cfg): + """ + Test TX timestamp configuration setup via ioctl. + The driver should apply provided config and report back proper state. + """ + __perform_hwtstamp_tx(cfg, True) + -def test_hwtstamp_rx(cfg): +def __perform_hwtstamp_rx(cfg, is_ioctl): """ Test RX timestamp configuration. The filter configuration is taken from the list of supported filters. @@ -87,11 +171,17 @@ def test_hwtstamp_rx(cfg): ts = __get_hwtimestamp_support(cfg) rx = ts['rx'] for r in rx: + res = None tscfg = orig_tscfg tscfg['rx-filters']['bits']['bit'] = [r] - res = __set_hwtimestamp_config(cfg, tscfg) + if is_ioctl: + __set_hwtimestamp_config_ioctl(cfg, tscfg) + else: + res = __set_hwtimestamp_config(cfg, tscfg) if res is None: res = __get_hwtimestamp_config(cfg) + resioctl = __get_hwtimestamp_config_ioctl(cfg) + ksft_eq(resioctl.rx_filter, res['rx-filters']['bits']['bit'][0]['index']) if r['index'] == 0 or r['index'] == 1: ksft_eq(res['rx-filters']['bits']['bit'][0]['index'], r['index']) else: @@ -100,12 +190,34 @@ def test_hwtstamp_rx(cfg): __set_hwtimestamp_config(cfg, orig_tscfg) +def test_hwtstamp_rx_netlink(cfg): + """ + Test RX timestamp configuration via netlink. + The filter configuration is taken from the list of supported filters. + The driver should apply the config without error and report back proper state. + Some extension of the timestamping scope is allowed for PTP filters. + """ + __perform_hwtstamp_rx(cfg, False) + + +def test_hwtstamp_rx_ioctl(cfg): + """ + Test RX timestamp configuration via ioctl. + The filter configuration is taken from the list of supported filters. + The driver should apply the config without error and report back proper state. + Some extension of the timestamping scope is allowed for PTP filters. + """ + __perform_hwtstamp_rx(cfg, True) + + def main() -> None: """ Ksft boiler plate main """ with NetDrvEnv(__file__, nsim_test=False) as cfg: cfg.ethnl = EthtoolFamily() - ksft_run([test_hwtstamp_tx, test_hwtstamp_rx], args=(cfg,)) + ksft_run([test_hwtstamp_tx_ioctl, test_hwtstamp_tx_netlink, + test_hwtstamp_rx_ioctl, test_hwtstamp_rx_netlink], + args=(cfg,)) ksft_exit() -- cgit v1.2.3 From 1802e9079f65cbe47d90d048b06df650a91407f4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 20 Jan 2026 10:03:19 -0800 Subject: selftests: drv-net: fix missing include in ncdevmem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit ca9d74eb5f6a ("uapi: add INT_MAX and INT_MIN constants") recently removed some includes of limits.h in uAPI headers. ncdevmem.c was depending on them: ncdevmem.c: In function ‘ethtool_add_flow’: ncdevmem.c:369:60: error: ‘INT_MAX’ undeclared (first use in this function) 369 | if (endptr == id_start || flow_id < 0 || flow_id > INT_MAX) | ^~~~~~~ ncdevmem.c:77:1: note: ‘INT_MAX’ is defined in header ‘’; did you forget to ‘#include ’? Reviewed-by: Mina Almasry Link: https://patch.msgid.link/20260120180319.1673271-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/ncdevmem.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/drivers/net/hw/ncdevmem.c b/tools/testing/selftests/drivers/net/hw/ncdevmem.c index 3288ed04ce08..16864c844108 100644 --- a/tools/testing/selftests/drivers/net/hw/ncdevmem.c +++ b/tools/testing/selftests/drivers/net/hw/ncdevmem.c @@ -48,6 +48,7 @@ #include #define __iovec_defined #include +#include #include #include #include -- cgit v1.2.3 From a98ec863fdedf4940447f32ceda7d937bebd06a2 Mon Sep 17 00:00:00 2001 From: Audra Mitchell Date: Mon, 1 Dec 2025 13:18:48 -0500 Subject: lib/test_vmalloc.c: minor fixes to test_vmalloc.c If PAGE_SIZE is larger than 4k and if you have a system with a large number of CPUs, this test can require a very large amount of memory leading to oom-killer firing. Given the type of allocation, the kernel won't have anything to kill, causing the system to stall. Add a parameter to the test_vmalloc driver to represent the number of times a percpu object will be allocated. Calculate this in test_vmalloc.sh to be 90% of available memory or the current default of 35000, whichever is smaller. Link: https://lkml.kernel.org/r/20251201181848.1216197-1-audra@redhat.com Signed-off-by: Audra Mitchell Reviewed-by: Andrew Morton Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Rafael Aquini Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: "Uladzislau Rezki (Sony)" Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/test_vmalloc.c | 11 +++++++---- tools/testing/selftests/mm/test_vmalloc.sh | 31 +++++++++++++++++++++++++++--- 2 files changed, 35 insertions(+), 7 deletions(-) (limited to 'tools/testing') diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index 6521c05c7816..270b6f7ca807 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -58,6 +58,9 @@ __param(int, run_test_mask, 7, /* Add a new test case description here. */ ); +__param(int, nr_pcpu_objects, 35000, + "Number of pcpu objects to allocate for pcpu_alloc_test"); + /* * This is for synchronization of setup phase. */ @@ -317,24 +320,24 @@ pcpu_alloc_test(void) size_t size, align; int i; - pcpu = vmalloc(sizeof(void __percpu *) * 35000); + pcpu = vmalloc(sizeof(void __percpu *) * nr_pcpu_objects); if (!pcpu) return -1; - for (i = 0; i < 35000; i++) { + for (i = 0; i < nr_pcpu_objects; i++) { size = get_random_u32_inclusive(1, PAGE_SIZE / 4); /* * Maximum PAGE_SIZE */ - align = 1 << get_random_u32_inclusive(1, 11); + align = 1 << get_random_u32_inclusive(1, PAGE_SHIFT - 1); pcpu[i] = __alloc_percpu(size, align); if (!pcpu[i]) rv = -1; } - for (i = 0; i < 35000; i++) + for (i = 0; i < nr_pcpu_objects; i++) free_percpu(pcpu[i]); vfree(pcpu); diff --git a/tools/testing/selftests/mm/test_vmalloc.sh b/tools/testing/selftests/mm/test_vmalloc.sh index d39096723fca..b23d705bf570 100755 --- a/tools/testing/selftests/mm/test_vmalloc.sh +++ b/tools/testing/selftests/mm/test_vmalloc.sh @@ -13,6 +13,9 @@ TEST_NAME="vmalloc" DRIVER="test_${TEST_NAME}" NUM_CPUS=`grep -c ^processor /proc/cpuinfo` +# Default number of times we allocate percpu objects: +NR_PCPU_OBJECTS=35000 + # 1 if fails exitcode=1 @@ -27,6 +30,8 @@ PERF_PARAM="sequential_test_order=1 test_repeat_count=3" SMOKE_PARAM="test_loop_count=10000 test_repeat_count=10" STRESS_PARAM="nr_threads=$NUM_CPUS test_repeat_count=20" +PCPU_OBJ_PARAM="nr_pcpu_objects=$NR_PCPU_OBJECTS" + check_test_requirements() { uid=$(id -u) @@ -47,12 +52,30 @@ check_test_requirements() fi } +check_memory_requirement() +{ + # The pcpu_alloc_test allocates nr_pcpu_objects per cpu. If the + # PAGE_SIZE is on the larger side it is easier to set a value + # that can cause oom events during testing. Since we are + # testing the functionality of vmalloc and not the oom-killer, + # calculate what is 90% of available memory and divide it by + # the number of online CPUs. + pages=$(($(getconf _AVPHYS_PAGES) * 90 / 100 / $NUM_CPUS)) + + if (($pages < $NR_PCPU_OBJECTS)); then + echo "Updated nr_pcpu_objects to 90% of available memory." + echo "nr_pcpu_objects is now set to: $pages." + PCPU_OBJ_PARAM="nr_pcpu_objects=$pages" + fi +} + run_performance_check() { echo "Run performance tests to evaluate how fast vmalloc allocation is." echo "It runs all test cases on one single CPU with sequential order." - modprobe $DRIVER $PERF_PARAM > /dev/null 2>&1 + check_memory_requirement + modprobe $DRIVER $PERF_PARAM $PCPU_OBJ_PARAM > /dev/null 2>&1 echo "Done." echo "Check the kernel message buffer to see the summary." } @@ -63,7 +86,8 @@ run_stability_check() echo "available test cases are run by NUM_CPUS workers simultaneously." echo "It will take time, so be patient." - modprobe $DRIVER $STRESS_PARAM > /dev/null 2>&1 + check_memory_requirement + modprobe $DRIVER $STRESS_PARAM $PCPU_OBJ_PARAM > /dev/null 2>&1 echo "Done." echo "Check the kernel ring buffer to see the summary." } @@ -74,7 +98,8 @@ run_smoke_check() echo "Please check $0 output how it can be used" echo "for deep performance analysis as well as stress testing." - modprobe $DRIVER $SMOKE_PARAM > /dev/null 2>&1 + check_memory_requirement + modprobe $DRIVER $SMOKE_PARAM $PCPU_OBJ_PARAM > /dev/null 2>&1 echo "Done." echo "Check the kernel ring buffer to see the summary." } -- cgit v1.2.3 From 8e46adb62fae98a866baa7c23f6ed3bfe02e6f88 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Sun, 21 Dec 2025 20:26:37 +0800 Subject: selftests/mm/write_to_hugetlbfs: parse -s as size_t Patch series "selftests/mm: hugetlb cgroup charging: robustness fixes", v3. This series fixes a few issues in the hugetlb cgroup charging selftests (write_to_hugetlbfs.c + charge_reserved_hugetlb.sh) that show up on systems with large hugepages (e.g. 512MB) and when failures cause the test to wait indefinitely. On an aarch64 64k page kernel with 512MB hugepages, the test consistently fails in write_to_hugetlbfs with ENOMEM and then hangs waiting for the expected usage values. The root cause is that charge_reserved_hugetlb.sh mounts hugetlbfs with a fixed size=256M, which is smaller than a single hugepage, resulting in a mount with size=0 capacity. In addition, write_to_hugetlbfs previously parsed -s via atoi() into an int, which can overflow and print negative sizes. Reproducer / environment: - Kernel: 6.12.0-xxx.el10.aarch64+64k - Hugepagesize: 524288 kB (512MB) - ./charge_reserved_hugetlb.sh -cgroup-v2 - Observed mount: pagesize=512M,size=0 before this series After applying the series, the test completes successfully on the above setup. This patch (of 3): write_to_hugetlbfs currently parses the -s size argument with atoi() into an int. This silently accepts malformed input, cannot report overflow, and can truncate large sizes. === Error log === # uname -r 6.12.0-xxx.el10.aarch64+64k # ls /sys/kernel/mm/hugepages/hugepages-* hugepages-16777216kB/ hugepages-2048kB/ hugepages-524288kB/ #./charge_reserved_hugetlb.sh -cgroup-v2 # ----------------------------------------- ... # nr hugepages = 10 # writing cgroup limit: 5368709120 # writing reseravation limit: 5368709120 ... # Writing to this path: /mnt/huge/test # Writing this size: -1610612736 <-------- Switch the size variable to size_t and parse -s with sscanf("%zu", ...). Also print the size using %zu. This avoids incorrect behavior with large -s values and makes the utility more robust. Link: https://lkml.kernel.org/r/20251221122639.3168038-1-liwang@redhat.com Link: https://lkml.kernel.org/r/20251221122639.3168038-2-liwang@redhat.com Signed-off-by: Li Wang Acked-by: David Hildenbrand (Red Hat) Acked-by: Waiman Long Cc: David Hildenbrand Cc: Mark Brown Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/write_to_hugetlbfs.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/write_to_hugetlbfs.c b/tools/testing/selftests/mm/write_to_hugetlbfs.c index 34c91f7e6128..ecb5f7619960 100644 --- a/tools/testing/selftests/mm/write_to_hugetlbfs.c +++ b/tools/testing/selftests/mm/write_to_hugetlbfs.c @@ -68,7 +68,7 @@ int main(int argc, char **argv) int key = 0; int *ptr = NULL; int c = 0; - int size = 0; + size_t size = 0; char path[256] = ""; enum method method = MAX_METHOD; int want_sleep = 0, private = 0; @@ -86,7 +86,10 @@ int main(int argc, char **argv) while ((c = getopt(argc, argv, "s:p:m:owlrn")) != -1) { switch (c) { case 's': - size = atoi(optarg); + if (sscanf(optarg, "%zu", &size) != 1) { + perror("Invalid -s."); + exit_usage(); + } break; case 'p': strncpy(path, optarg, sizeof(path) - 1); @@ -131,7 +134,7 @@ int main(int argc, char **argv) } if (size != 0) { - printf("Writing this size: %d\n", size); + printf("Writing this size: %zu\n", size); } else { errno = EINVAL; perror("size not found"); -- cgit v1.2.3 From 1aa1dd9cc595917882fb6db67725442956f79607 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Sun, 21 Dec 2025 20:26:38 +0800 Subject: selftests/mm/charge_reserved_hugetlb: drop mount size for hugetlbfs charge_reserved_hugetlb.sh mounts a hugetlbfs instance at /mnt/huge with a fixed size of 256M. On systems with large base hugepages (e.g. 512MB), this is smaller than a single hugepage, so the hugetlbfs mount ends up with zero capacity (often visible as size=0 in mount output). As a result, write_to_hugetlbfs fails with ENOMEM and the test can hang waiting for progress. === Error log === # uname -r 6.12.0-xxx.el10.aarch64+64k #./charge_reserved_hugetlb.sh -cgroup-v2 # ----------------------------------------- ... # nr hugepages = 10 # writing cgroup limit: 5368709120 # writing reseravation limit: 5368709120 ... # write_to_hugetlbfs: Error mapping the file: Cannot allocate memory # Waiting for hugetlb memory reservation to reach size 2684354560. # 0 # Waiting for hugetlb memory reservation to reach size 2684354560. # 0 ... # mount |grep /mnt/huge none on /mnt/huge type hugetlbfs (rw,relatime,seclabel,pagesize=512M,size=0) # grep -i huge /proc/meminfo ... HugePages_Total: 10 HugePages_Free: 10 HugePages_Rsvd: 0 HugePages_Surp: 0 Hugepagesize: 524288 kB Hugetlb: 5242880 kB Drop the mount args with 'size=256M', so the filesystem capacity is sufficient regardless of HugeTLB page size. Link: https://lkml.kernel.org/r/20251221122639.3168038-3-liwang@redhat.com Fixes: 29750f71a9b4 ("hugetlb_cgroup: add hugetlb_cgroup reservation tests") Signed-off-by: Li Wang Acked-by: David Hildenbrand (Red Hat) Acked-by: Waiman Long Cc: Mark Brown Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/charge_reserved_hugetlb.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh index e1fe16bcbbe8..fa6713892d82 100755 --- a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh +++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh @@ -290,7 +290,7 @@ function run_test() { setup_cgroup "hugetlb_cgroup_test" "$cgroup_limit" "$reservation_limit" mkdir -p /mnt/huge - mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge + mount -t hugetlbfs -o pagesize=${MB}M none /mnt/huge write_hugetlbfs_and_get_usage "hugetlb_cgroup_test" "$size" "$populate" \ "$write" "/mnt/huge/test" "$method" "$private" "$expect_failure" \ @@ -344,7 +344,7 @@ function run_multiple_cgroup_test() { setup_cgroup "hugetlb_cgroup_test2" "$cgroup_limit2" "$reservation_limit2" mkdir -p /mnt/huge - mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge + mount -t hugetlbfs -o pagesize=${MB}M none /mnt/huge write_hugetlbfs_and_get_usage "hugetlb_cgroup_test1" "$size1" \ "$populate1" "$write1" "/mnt/huge/test1" "$method" "$private" \ -- cgit v1.2.3 From b618876f2e7055160cc5b98b4ff5cd8917e7b49e Mon Sep 17 00:00:00 2001 From: Li Wang Date: Sun, 21 Dec 2025 20:26:39 +0800 Subject: selftests/mm/charge_reserved_hugetlb.sh: add waits with timeout helper The hugetlb cgroup usage wait loops in charge_reserved_hugetlb.sh were unbounded and could hang forever if the expected cgroup file value never appears (e.g. due to write_to_hugetlbfs in Error mapping). === Error log === # uname -r 6.12.0-xxx.el10.aarch64+64k # ls /sys/kernel/mm/hugepages/hugepages-* hugepages-16777216kB/ hugepages-2048kB/ hugepages-524288kB/ #./charge_reserved_hugetlb.sh -cgroup-v2 # ----------------------------------------- ... # nr hugepages = 10 # writing cgroup limit: 5368709120 # writing reseravation limit: 5368709120 ... # write_to_hugetlbfs: Error mapping the file: Cannot allocate memory # Waiting for hugetlb memory reservation to reach size 2684354560. # 0 # Waiting for hugetlb memory reservation to reach size 2684354560. # 0 # Waiting for hugetlb memory reservation to reach size 2684354560. # 0 # Waiting for hugetlb memory reservation to reach size 2684354560. # 0 # Waiting for hugetlb memory reservation to reach size 2684354560. # 0 # Waiting for hugetlb memory reservation to reach size 2684354560. # 0 ... Introduce a small helper, wait_for_file_value(), and use it for: - waiting for reservation usage to drop to 0, - waiting for reservation usage to reach a given size, - waiting for fault usage to reach a given size. This makes the waits consistent and adds a hard timeout (60 tries with 1s sleep) so the test fails instead of stalling indefinitely. Link: https://lkml.kernel.org/r/20251221122639.3168038-4-liwang@redhat.com Signed-off-by: Li Wang Acked-by: David Hildenbrand (Red Hat) Cc: Mark Brown Cc: Shuah Khan Cc: Waiman Long Signed-off-by: Andrew Morton --- .../selftests/mm/charge_reserved_hugetlb.sh | 51 +++++++++++++--------- 1 file changed, 30 insertions(+), 21 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh index fa6713892d82..447769657634 100755 --- a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh +++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh @@ -100,7 +100,7 @@ function setup_cgroup() { echo writing cgroup limit: "$cgroup_limit" echo "$cgroup_limit" >$cgroup_path/$name/hugetlb.${MB}MB.$fault_limit_file - echo writing reseravation limit: "$reservation_limit" + echo writing reservation limit: "$reservation_limit" echo "$reservation_limit" > \ $cgroup_path/$name/hugetlb.${MB}MB.$reservation_limit_file @@ -112,41 +112,50 @@ function setup_cgroup() { fi } +function wait_for_file_value() { + local path="$1" + local expect="$2" + local max_tries=60 + + if [[ ! -r "$path" ]]; then + echo "ERROR: cannot read '$path', missing or permission denied" + return 1 + fi + + for ((i=1; i<=max_tries; i++)); do + local cur="$(cat "$path")" + if [[ "$cur" == "$expect" ]]; then + return 0 + fi + echo "Waiting for $path to become '$expect' (current: '$cur') (try $i/$max_tries)" + sleep 1 + done + + echo "ERROR: timeout waiting for $path to become '$expect'" + return 1 +} + function wait_for_hugetlb_memory_to_get_depleted() { local cgroup="$1" local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" - # Wait for hugetlbfs memory to get depleted. - while [ $(cat $path) != 0 ]; do - echo Waiting for hugetlb memory to get depleted. - cat $path - sleep 0.5 - done + + wait_for_file_value "$path" "0" } function wait_for_hugetlb_memory_to_get_reserved() { local cgroup="$1" local size="$2" - local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" - # Wait for hugetlbfs memory to get written. - while [ $(cat $path) != $size ]; do - echo Waiting for hugetlb memory reservation to reach size $size. - cat $path - sleep 0.5 - done + + wait_for_file_value "$path" "$size" } function wait_for_hugetlb_memory_to_get_written() { local cgroup="$1" local size="$2" - local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file" - # Wait for hugetlbfs memory to get written. - while [ $(cat $path) != $size ]; do - echo Waiting for hugetlb memory to reach size $size. - cat $path - sleep 0.5 - done + + wait_for_file_value "$path" "$size" } function write_hugetlbfs_and_get_usage() { -- cgit v1.2.3 From b47beff129c6193df3dd406f2db2628fcc09d1eb Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Sun, 21 Dec 2025 12:00:21 +0800 Subject: selftests/mm: fix va_high_addr_switch.sh return value Patch series "Fix va_high_addr_switch.sh test failure - again", v2. The series address several issues exist for the va_high_addr_switch test: 1) the test return value is ignored in va_high_addr_switch.sh. 2) the va_high_addr_switch test requires 6 hugepages not 5. 3) the reurn value of the first test in va_high_addr_switch.c can be overridden by the second test. 4) the nr_hugepages setup in run_vmtests.sh for arm64 can be done in va_high_addr_switch.sh too. 5) update a comment for check_test_requirements. This patch: (of 5) The return value should be return value of va_high_addr_switch, otherwise a test failure would be silently ignored. Link: https://lkml.kernel.org/r/20251221040025.3159990-1-chuhu@redhat.com Fixes: d9d957bd7b61 ("selftests/mm: alloc hugepages in va_high_addr_switch test") Signed-off-by: Chunyu Hu Reviewed-by: Luiz Capitulino Cc: Luiz Capitulino Cc: "David Hildenbrand (Red Hat)" Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/va_high_addr_switch.sh | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh index a7d4b02b21dd..f89fe078a8e6 100755 --- a/tools/testing/selftests/mm/va_high_addr_switch.sh +++ b/tools/testing/selftests/mm/va_high_addr_switch.sh @@ -114,4 +114,6 @@ save_nr_hugepages # 4 keep_mapped pages, and one for tmp usage setup_nr_hugepages 5 ./va_high_addr_switch --run-hugetlb +retcode=$? restore_nr_hugepages +exit $retcode -- cgit v1.2.3 From b1f031e33cb5ae4be039a17613ad8da84c777e70 Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Sun, 21 Dec 2025 12:00:22 +0800 Subject: selftests/mm: allocate 6 hugepages in va_high_addr_switch.sh The va_high_addr_switch test requires 6 hugepages, not 5. If running the test directly by: ./va_high_addr_switch.sh, the test will hit a mmap 'FAIL' caused by not enough hugepages: mmap(addr_switch_hint - hugepagesize, 2*hugepagesize, MAP_HUGETLB): 0x7f330f800000 - OK mmap(addr_switch_hint , 2*hugepagesize, MAP_FIXED | MAP_HUGETLB): 0xffffffffffffffff - FAILED The failure can't be hit if run the tests by running 'run_vmtests.sh -t hugevm' because the nr_hugepages is set to 128 at the beginning of run_vmtests.sh and va_high_addr_switch.sh skip the setup of nr_hugepages because already enough. Link: https://lkml.kernel.org/r/20251221040025.3159990-2-chuhu@redhat.com Fixes: d9d957bd7b61 ("selftests/mm: alloc hugepages in va_high_addr_switch test") Signed-off-by: Chunyu Hu Reviewed-by: Luiz Capitulino Cc: "David Hildenbrand (Red Hat)" Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/va_high_addr_switch.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh index f89fe078a8e6..a0c93d348b11 100755 --- a/tools/testing/selftests/mm/va_high_addr_switch.sh +++ b/tools/testing/selftests/mm/va_high_addr_switch.sh @@ -111,8 +111,8 @@ setup_nr_hugepages() check_test_requirements save_nr_hugepages -# 4 keep_mapped pages, and one for tmp usage -setup_nr_hugepages 5 +# The HugeTLB tests require 6 pages +setup_nr_hugepages 6 ./va_high_addr_switch --run-hugetlb retcode=$? restore_nr_hugepages -- cgit v1.2.3 From 7544d7969d84c1c4a078d1c5a7d4117fbf6f385c Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Sun, 21 Dec 2025 12:00:23 +0800 Subject: selftests/mm: remove arm64 nr_hugepages setup for va_high_addr_switch test arm64 and x86_64 has the same nr_hugepages requriement for running the va_high_addr_switch test. Since commit d9d957bd7b61 ("selftests/mm: alloc hugepages in va_high_addr_switch test"), the setup can be done in va_high_addr_switch.sh. So remove the duplicated setup. Link: https://lkml.kernel.org/r/20251221040025.3159990-3-chuhu@redhat.com Signed-off-by: Chunyu Hu Reviewed-by: Luiz Capitulino Cc: Luiz Capitulino Cc: "David Hildenbrand (Red Hat)" Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 8 -------- 1 file changed, 8 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index d9173f2312b7..2dadbfc6e535 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -412,15 +412,7 @@ if [ $VADDR64 -ne 0 ]; then fi # va high address boundary switch test - ARCH_ARM64="arm64" - prev_nr_hugepages=$(cat /proc/sys/vm/nr_hugepages) - if [ "$ARCH" == "$ARCH_ARM64" ]; then - echo 6 > /proc/sys/vm/nr_hugepages - fi CATEGORY="hugevm" run_test bash ./va_high_addr_switch.sh - if [ "$ARCH" == "$ARCH_ARM64" ]; then - echo $prev_nr_hugepages > /proc/sys/vm/nr_hugepages - fi fi # VADDR64 # vmalloc stability smoke test -- cgit v1.2.3 From dd0202a0bd81c33096f3d473c296cad997baba5b Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Sun, 21 Dec 2025 12:00:24 +0800 Subject: selftests/mm: va_high_addr_switch return fail when either test failed When the first test failed, and the hugetlb test passed, the result would be pass, but we expect a fail. Fix this issue by returning fail if either is not KSFT_PASS. Link: https://lkml.kernel.org/r/20251221040025.3159990-4-chuhu@redhat.com Signed-off-by: Chunyu Hu Reviewed-by: Luiz Capitulino Cc: "David Hildenbrand (Red Hat)" Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/va_high_addr_switch.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/va_high_addr_switch.c b/tools/testing/selftests/mm/va_high_addr_switch.c index 02f290a69132..51401e081b20 100644 --- a/tools/testing/selftests/mm/va_high_addr_switch.c +++ b/tools/testing/selftests/mm/va_high_addr_switch.c @@ -322,7 +322,7 @@ static int supported_arch(void) int main(int argc, char **argv) { - int ret; + int ret, hugetlb_ret = KSFT_PASS; if (!supported_arch()) return KSFT_SKIP; @@ -331,6 +331,10 @@ int main(int argc, char **argv) ret = run_test(testcases, sz_testcases); if (argc == 2 && !strcmp(argv[1], "--run-hugetlb")) - ret = run_test(hugetlb_testcases, sz_hugetlb_testcases); - return ret; + hugetlb_ret = run_test(hugetlb_testcases, sz_hugetlb_testcases); + + if (ret == KSFT_PASS && hugetlb_ret == KSFT_PASS) + return KSFT_PASS; + else + return KSFT_FAIL; } -- cgit v1.2.3 From 6319c4f44234c3849fdb2c3f72c45353aa428d3f Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Sun, 21 Dec 2025 12:00:25 +0800 Subject: selftests/mm: fix comment for check_test_requirements The test supports arm64 as well so the comment is incorrect. And there's a check for arm64 in va_high_addr_switch.c. Link: https://lkml.kernel.org/r/20251221040025.3159990-5-chuhu@redhat.com Fixes: 983e760bcdb6 ("selftest/mm: va_high_addr_switch: add ppc64 support check") Fixes: f556acc2facd ("selftests/mm: skip test for non-LPA2 and non-LVA systems") Signed-off-by: Chunyu Hu Reviewed-by: Luiz Capitulino Cc: "David Hildenbrand (Red Hat)" Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/va_high_addr_switch.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh index a0c93d348b11..9492c2d72634 100755 --- a/tools/testing/selftests/mm/va_high_addr_switch.sh +++ b/tools/testing/selftests/mm/va_high_addr_switch.sh @@ -61,9 +61,9 @@ check_supported_ppc64() check_test_requirements() { - # The test supports x86_64 and powerpc64. We currently have no useful - # eligibility check for powerpc64, and the test itself will reject other - # architectures. + # The test supports x86_64, powerpc64 and arm64. There's check for arm64 + # in va_high_addr_switch.c. The test itself will reject other architectures. + case `uname -m` in "x86_64") check_supported_x86_64 -- cgit v1.2.3 From e700f5d1560798aacf0e56fdcc70ee2c20bf56ec Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Tue, 16 Dec 2025 02:45:21 -0500 Subject: watchdog: softlockup: panic when lockup duration exceeds N thresholds The softlockup_panic sysctl is currently a binary option: panic immediately or never panic on soft lockups. Panicking on any soft lockup, regardless of duration, can be overly aggressive for brief stalls that may be caused by legitimate operations. Conversely, never panicking may allow severe system hangs to persist undetected. Extend softlockup_panic to accept an integer threshold, allowing the kernel to panic only when the normalized lockup duration exceeds N watchdog threshold periods. This provides finer-grained control to distinguish between transient delays and persistent system failures. The accepted values are: - 0: Don't panic (unchanged) - 1: Panic when duration >= 1 * threshold (20s default, original behavior) - N > 1: Panic when duration >= N * threshold (e.g., 2 = 40s, 3 = 60s.) The original behavior is preserved for values 0 and 1, maintaining full backward compatibility while allowing systems to tolerate brief lockups while still catching severe, persistent hangs. [lirongqing@baidu.com: v2] Link: https://lkml.kernel.org/r/20251218074300.4080-1-lirongqing@baidu.com Link: https://lkml.kernel.org/r/20251216074521.2796-1-lirongqing@baidu.com Signed-off-by: Li RongQing Cc: Eduard Zingerman Cc: Hao Luo Cc: Jiri Olsa Cc: John Fastabend Cc: KP Singh Cc: Lance Yang Cc: Martin KaFai Lau Cc: Nicholas Piggin Cc: Song Liu Cc: Stanislav Fomichev Cc: Yonghong Song Signed-off-by: Andrew Morton --- Documentation/admin-guide/kernel-parameters.txt | 10 +++++----- arch/arm/configs/aspeed_g5_defconfig | 2 +- arch/arm/configs/pxa3xx_defconfig | 2 +- arch/openrisc/configs/or1klitex_defconfig | 2 +- arch/powerpc/configs/skiroot_defconfig | 2 +- drivers/gpu/drm/ci/arm.config | 2 +- drivers/gpu/drm/ci/arm64.config | 2 +- drivers/gpu/drm/ci/x86_64.config | 2 +- kernel/configs/debug.config | 2 +- kernel/watchdog.c | 10 ++++++---- lib/Kconfig.debug | 13 +++++++------ tools/testing/selftests/bpf/config | 2 +- tools/testing/selftests/wireguard/qemu/kernel.config | 2 +- 13 files changed, 28 insertions(+), 25 deletions(-) (limited to 'tools/testing') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 1058f2a6d6a8..73d846211144 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6969,12 +6969,12 @@ Kernel parameters softlockup_panic= [KNL] Should the soft-lockup detector generate panics. - Format: 0 | 1 + Format: - A value of 1 instructs the soft-lockup detector - to panic the machine when a soft-lockup occurs. It is - also controlled by the kernel.softlockup_panic sysctl - and CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC, which is the + A value of non-zero instructs the soft-lockup detector + to panic the machine when a soft-lockup duration exceeds + N thresholds. It is also controlled by the kernel.softlockup_panic + sysctl and CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC, which is the respective build-time switch to that functionality. softlockup_all_cpu_backtrace= diff --git a/arch/arm/configs/aspeed_g5_defconfig b/arch/arm/configs/aspeed_g5_defconfig index 2e6ea13c1e9b..ec558e57d081 100644 --- a/arch/arm/configs/aspeed_g5_defconfig +++ b/arch/arm/configs/aspeed_g5_defconfig @@ -306,7 +306,7 @@ CONFIG_SCHED_STACK_END_CHECK=y CONFIG_PANIC_ON_OOPS=y CONFIG_PANIC_TIMEOUT=-1 CONFIG_SOFTLOCKUP_DETECTOR=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1 CONFIG_BOOTPARAM_HUNG_TASK_PANIC=1 CONFIG_WQ_WATCHDOG=y # CONFIG_SCHED_DEBUG is not set diff --git a/arch/arm/configs/pxa3xx_defconfig b/arch/arm/configs/pxa3xx_defconfig index 07d422f0ff34..fb272e3a2337 100644 --- a/arch/arm/configs/pxa3xx_defconfig +++ b/arch/arm/configs/pxa3xx_defconfig @@ -100,7 +100,7 @@ CONFIG_PRINTK_TIME=y CONFIG_DEBUG_KERNEL=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_SHIRQ=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1 # CONFIG_SCHED_DEBUG is not set CONFIG_DEBUG_SPINLOCK=y CONFIG_DEBUG_SPINLOCK_SLEEP=y diff --git a/arch/openrisc/configs/or1klitex_defconfig b/arch/openrisc/configs/or1klitex_defconfig index fb1eb9a68bd6..984b0e3b2768 100644 --- a/arch/openrisc/configs/or1klitex_defconfig +++ b/arch/openrisc/configs/or1klitex_defconfig @@ -52,5 +52,5 @@ CONFIG_LSM="lockdown,yama,loadpin,safesetid,integrity,bpf" CONFIG_PRINTK_TIME=y CONFIG_PANIC_ON_OOPS=y CONFIG_SOFTLOCKUP_DETECTOR=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1 CONFIG_BUG_ON_DATA_CORRUPTION=y diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig index 2b71a6dc399e..a4114fca5a39 100644 --- a/arch/powerpc/configs/skiroot_defconfig +++ b/arch/powerpc/configs/skiroot_defconfig @@ -289,7 +289,7 @@ CONFIG_SCHED_STACK_END_CHECK=y CONFIG_DEBUG_STACKOVERFLOW=y CONFIG_PANIC_ON_OOPS=y CONFIG_SOFTLOCKUP_DETECTOR=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1 CONFIG_HARDLOCKUP_DETECTOR=y CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y CONFIG_WQ_WATCHDOG=y diff --git a/drivers/gpu/drm/ci/arm.config b/drivers/gpu/drm/ci/arm.config index 411e814819a8..d7c51670da2f 100644 --- a/drivers/gpu/drm/ci/arm.config +++ b/drivers/gpu/drm/ci/arm.config @@ -52,7 +52,7 @@ CONFIG_TMPFS=y CONFIG_PROVE_LOCKING=n CONFIG_DEBUG_LOCKDEP=n CONFIG_SOFTLOCKUP_DETECTOR=n -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=n +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=0 CONFIG_FW_LOADER_COMPRESS=y diff --git a/drivers/gpu/drm/ci/arm64.config b/drivers/gpu/drm/ci/arm64.config index fddfbd4d2493..ea0e30737c4d 100644 --- a/drivers/gpu/drm/ci/arm64.config +++ b/drivers/gpu/drm/ci/arm64.config @@ -161,7 +161,7 @@ CONFIG_TMPFS=y CONFIG_PROVE_LOCKING=n CONFIG_DEBUG_LOCKDEP=n CONFIG_SOFTLOCKUP_DETECTOR=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1 CONFIG_DETECT_HUNG_TASK=y diff --git a/drivers/gpu/drm/ci/x86_64.config b/drivers/gpu/drm/ci/x86_64.config index 8eaba388b141..7ac98a78691e 100644 --- a/drivers/gpu/drm/ci/x86_64.config +++ b/drivers/gpu/drm/ci/x86_64.config @@ -47,7 +47,7 @@ CONFIG_TMPFS=y CONFIG_PROVE_LOCKING=n CONFIG_DEBUG_LOCKDEP=n CONFIG_SOFTLOCKUP_DETECTOR=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1 CONFIG_DETECT_HUNG_TASK=y diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config index 9f6ab7dabf67..774702591d26 100644 --- a/kernel/configs/debug.config +++ b/kernel/configs/debug.config @@ -84,7 +84,7 @@ CONFIG_SLUB_DEBUG_ON=y # Debug Oops, Lockups and Hangs # CONFIG_BOOTPARAM_HUNG_TASK_PANIC=0 -# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=0 CONFIG_DEBUG_ATOMIC_SLEEP=y CONFIG_DETECT_HUNG_TASK=y CONFIG_PANIC_ON_OOPS=y diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 366122f4a0f8..b4d5fbdb933a 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -363,7 +363,7 @@ static struct cpumask watchdog_allowed_mask __read_mostly; /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = - IS_ENABLED(CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC); + CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC; static bool softlockup_initialized __read_mostly; static u64 __read_mostly sample_period; @@ -774,8 +774,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { unsigned long touch_ts, period_ts, now; struct pt_regs *regs = get_irq_regs(); - int duration; int softlockup_all_cpu_backtrace; + int duration, thresh_count; unsigned long flags; if (!watchdog_enabled) @@ -879,7 +879,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); sys_info(softlockup_si_mask & ~SYS_INFO_ALL_BT); - if (softlockup_panic) + thresh_count = duration / get_softlockup_thresh(); + + if (softlockup_panic && thresh_count >= softlockup_panic) panic("softlockup: hung tasks"); } @@ -1228,7 +1230,7 @@ static const struct ctl_table watchdog_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "softlockup_sys_info", diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 4bfca37f313e..947e62e92da8 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1110,13 +1110,14 @@ config SOFTLOCKUP_DETECTOR_INTR_STORM the CPU stats and the interrupt counts during the "soft lockups". config BOOTPARAM_SOFTLOCKUP_PANIC - bool "Panic (Reboot) On Soft Lockups" + int "Panic (Reboot) On Soft Lockups" depends on SOFTLOCKUP_DETECTOR + default 0 help - Say Y here to enable the kernel to panic on "soft lockups", - which are bugs that cause the kernel to loop in kernel - mode for more than 20 seconds (configurable using the watchdog_thresh - sysctl), without giving other tasks a chance to run. + Set to a non-zero value N to enable the kernel to panic on "soft + lockups", which are bugs that cause the kernel to loop in kernel + mode for more than (N * 20 seconds) (configurable using the + watchdog_thresh sysctl), without giving other tasks a chance to run. The panic can be used in combination with panic_timeout, to cause the system to reboot automatically after a @@ -1124,7 +1125,7 @@ config BOOTPARAM_SOFTLOCKUP_PANIC high-availability systems that have uptime guarantees and where a lockup must be resolved ASAP. - Say N if unsure. + Say 0 if unsure. config HAVE_HARDLOCKUP_DETECTOR_BUDDY bool diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 558839e3c185..24855381290d 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -1,6 +1,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1 CONFIG_BPF=y CONFIG_BPF_EVENTS=y CONFIG_BPF_JIT=y diff --git a/tools/testing/selftests/wireguard/qemu/kernel.config b/tools/testing/selftests/wireguard/qemu/kernel.config index 0504c11c2de6..bb89d2dfaa2a 100644 --- a/tools/testing/selftests/wireguard/qemu/kernel.config +++ b/tools/testing/selftests/wireguard/qemu/kernel.config @@ -80,7 +80,7 @@ CONFIG_HARDLOCKUP_DETECTOR=y CONFIG_WQ_WATCHDOG=y CONFIG_DETECT_HUNG_TASK=y CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1 CONFIG_BOOTPARAM_HUNG_TASK_PANIC=1 CONFIG_PANIC_TIMEOUT=-1 CONFIG_STACKTRACE=y -- cgit v1.2.3 From 4fca95095cdcd81bd4a8c8c7008fb3c175a3a5d5 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Tue, 20 Jan 2026 15:05:55 +0800 Subject: selftests/bpf: test the jited inline of bpf_get_current_task Add the testcase for the jited inline of bpf_get_current_task(). Signed-off-by: Menglong Dong Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260120070555.233486-3-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/verifier.c | 2 ++ .../selftests/bpf/progs/verifier_jit_inline.c | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_jit_inline.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index b6a1e79709be..302286a80154 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -112,6 +112,7 @@ #include "verifier_xdp_direct_packet_access.skel.h" #include "verifier_bits_iter.skel.h" #include "verifier_lsm.skel.h" +#include "verifier_jit_inline.skel.h" #include "irq.skel.h" #define MAX_ENTRIES 11 @@ -255,6 +256,7 @@ void test_verifier_bits_iter(void) { RUN(verifier_bits_iter); } void test_verifier_lsm(void) { RUN(verifier_lsm); } void test_irq(void) { RUN(irq); } void test_verifier_mtu(void) { RUN(verifier_mtu); } +void test_verifier_jit_inline(void) { RUN(verifier_jit_inline); } static int init_test_val_map(struct bpf_object *obj, char *map_name) { diff --git a/tools/testing/selftests/bpf/progs/verifier_jit_inline.c b/tools/testing/selftests/bpf/progs/verifier_jit_inline.c new file mode 100644 index 000000000000..4ea254063646 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_jit_inline.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" + +SEC("fentry/bpf_fentry_test1") +__success __retval(0) +__arch_x86_64 +__jited(" addq %gs:{{.*}}, %rax") +__arch_arm64 +__jited(" mrs x7, SP_EL0") +int inline_bpf_get_current_task(void) +{ + bpf_get_current_task(); + + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 75aad5ffe099a1b1a342257236dc260493917ed2 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 13 Jan 2026 16:58:00 +0800 Subject: selftests/ublk: fix IO thread idle check Include cmd_inflight in ublk_thread_is_done() check. Without this, the thread may exit before all FETCH commands are completed, which may cause device deletion to hang. Fixes: 6aecda00b7d1 ("selftests: ublk: add kernel selftests for ublk") Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 185ba553686a..f52431fe9b6c 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -753,7 +753,7 @@ static int ublk_thread_is_idle(struct ublk_thread *t) static int ublk_thread_is_done(struct ublk_thread *t) { - return (t->state & UBLKS_T_STOPPING) && ublk_thread_is_idle(t); + return (t->state & UBLKS_T_STOPPING) && ublk_thread_is_idle(t) && !t->cmd_inflight; } static inline void ublksrv_handle_tgt_cqe(struct ublk_thread *t, -- cgit v1.2.3 From 23e62cf75518825aac12e9a22bdc40f062428898 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 13 Jan 2026 16:58:01 +0800 Subject: selftests/ublk: fix error handling for starting device Fix error handling in ublk_start_daemon() when start_dev fails: 1. Call ublk_ctrl_stop_dev() to cancel inflight uring_cmd before cleanup. Without this, the device deletion may hang waiting for I/O completion that will never happen. 2. Add fail_start label so that pthread_join() is called on the error path. This ensures proper thread cleanup when startup fails. Fixes: 6aecda00b7d1 ("selftests: ublk: add kernel selftests for ublk") Signed-off-by: Ming Lei Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index f52431fe9b6c..65f59e7b6972 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -1054,7 +1054,9 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) } if (ret < 0) { ublk_err("%s: ublk_ctrl_start_dev failed: %d\n", __func__, ret); - goto fail; + /* stop device so that inflight uring_cmd can be cancelled */ + ublk_ctrl_stop_dev(dev); + goto fail_start; } ublk_ctrl_get_info(dev); @@ -1062,7 +1064,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) ublk_ctrl_dump(dev); else ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id); - +fail_start: /* wait until we are terminated */ for (i = 0; i < dev->nthreads; i++) pthread_join(tinfo[i].thread, &thread_ret); -- cgit v1.2.3 From e7e1cc18f120a415646be12470169a978a1adcd9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 13 Jan 2026 16:58:02 +0800 Subject: selftests/ublk: fix garbage output in foreground mode Initialize _evtfd to -1 in struct dev_ctx to prevent garbage output when running kublk in foreground mode. Without this, _evtfd is zero-initialized to 0 (stdin), and ublk_send_dev_event() writes binary data to stdin which appears as garbage on the terminal. Also fix debug message format string. Fixes: 6aecda00b7d1 ("selftests: ublk: add kernel selftests for ublk") Signed-off-by: Ming Lei Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 65f59e7b6972..f197ad9cc262 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -1274,7 +1274,7 @@ static int __cmd_dev_add(const struct dev_ctx *ctx) } ret = ublk_start_daemon(ctx, dev); - ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\b", ret); + ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\n", __func__, ret); if (ret < 0) ublk_ctrl_del_dev(dev); @@ -1620,6 +1620,7 @@ int main(int argc, char *argv[]) int option_idx, opt; const char *cmd = argv[1]; struct dev_ctx ctx = { + ._evtfd = -1, .queue_depth = 128, .nr_hw_queues = 2, .dev_id = -1, -- cgit v1.2.3 From 1ed797764315496a115cd0568450a7f72da80df6 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 21 Jan 2026 12:43:48 +0800 Subject: selftests/bpf: test bpf_get_func_arg() for tp_btf Test bpf_get_func_arg() and bpf_get_func_arg_cnt() for tp_btf. The code is most copied from test1 and test2. Signed-off-by: Menglong Dong Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20260121044348.113201-3-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/get_func_args_test.c | 3 ++ .../selftests/bpf/progs/get_func_args_test.c | 44 ++++++++++++++++++++++ .../selftests/bpf/test_kmods/bpf_testmod-events.h | 10 +++++ .../testing/selftests/bpf/test_kmods/bpf_testmod.c | 4 ++ 4 files changed, 61 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c b/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c index 64a9c95d4acf..fadee95d3ae8 100644 --- a/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c +++ b/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c @@ -33,11 +33,14 @@ void test_get_func_args_test(void) ASSERT_EQ(topts.retval >> 16, 1, "test_run"); ASSERT_EQ(topts.retval & 0xffff, 1234 + 29, "test_run"); + ASSERT_OK(trigger_module_test_read(1), "trigger_read"); ASSERT_EQ(skel->bss->test1_result, 1, "test1_result"); ASSERT_EQ(skel->bss->test2_result, 1, "test2_result"); ASSERT_EQ(skel->bss->test3_result, 1, "test3_result"); ASSERT_EQ(skel->bss->test4_result, 1, "test4_result"); + ASSERT_EQ(skel->bss->test5_result, 1, "test5_result"); + ASSERT_EQ(skel->bss->test6_result, 1, "test6_result"); cleanup: get_func_args_test__destroy(skel); diff --git a/tools/testing/selftests/bpf/progs/get_func_args_test.c b/tools/testing/selftests/bpf/progs/get_func_args_test.c index e0f34a55e697..5b7233afef05 100644 --- a/tools/testing/selftests/bpf/progs/get_func_args_test.c +++ b/tools/testing/selftests/bpf/progs/get_func_args_test.c @@ -121,3 +121,47 @@ int BPF_PROG(fexit_test, int _a, int *_b, int _ret) test4_result &= err == 0 && ret == 1234; return 0; } + +__u64 test5_result = 0; +SEC("tp_btf/bpf_testmod_fentry_test1_tp") +int BPF_PROG(tp_test1) +{ + __u64 cnt = bpf_get_func_arg_cnt(ctx); + __u64 a = 0, z = 0; + __s64 err; + + test5_result = cnt == 1; + + err = bpf_get_func_arg(ctx, 0, &a); + test5_result &= err == 0 && ((int) a == 1); + + /* not valid argument */ + err = bpf_get_func_arg(ctx, 1, &z); + test5_result &= err == -EINVAL; + + return 0; +} + +__u64 test6_result = 0; +SEC("tp_btf/bpf_testmod_fentry_test2_tp") +int BPF_PROG(tp_test2) +{ + __u64 cnt = bpf_get_func_arg_cnt(ctx); + __u64 a = 0, b = 0, z = 0; + __s64 err; + + test6_result = cnt == 2; + + /* valid arguments */ + err = bpf_get_func_arg(ctx, 0, &a); + test6_result &= err == 0 && (int) a == 2; + + err = bpf_get_func_arg(ctx, 1, &b); + test6_result &= err == 0 && b == 3; + + /* not valid argument */ + err = bpf_get_func_arg(ctx, 2, &z); + test6_result &= err == -EINVAL; + + return 0; +} diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod-events.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod-events.h index aeef86b3da74..45a5e41f3a92 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod-events.h +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod-events.h @@ -63,6 +63,16 @@ BPF_TESTMOD_DECLARE_TRACE(bpf_testmod_test_writable_bare, sizeof(struct bpf_testmod_test_writable_ctx) ); +DECLARE_TRACE(bpf_testmod_fentry_test1, + TP_PROTO(int a), + TP_ARGS(a) +); + +DECLARE_TRACE(bpf_testmod_fentry_test2, + TP_PROTO(int a, u64 b), + TP_ARGS(a, b) +); + #endif /* _BPF_TESTMOD_EVENTS_H */ #undef TRACE_INCLUDE_PATH diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index d425034b72d3..77a81fa8ec6a 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -412,11 +412,15 @@ __weak noinline struct file *bpf_testmod_return_ptr(int arg) noinline int bpf_testmod_fentry_test1(int a) { + trace_bpf_testmod_fentry_test1_tp(a); + return a + 1; } noinline int bpf_testmod_fentry_test2(int a, u64 b) { + trace_bpf_testmod_fentry_test2_tp(a, b); + return a + b; } -- cgit v1.2.3 From f4924ad0b13fd4ca4f0c7117dc143bf372224aec Mon Sep 17 00:00:00 2001 From: Yuzuki Ishiyama Date: Wed, 21 Jan 2026 12:33:28 +0900 Subject: selftests/bpf: Test kfunc bpf_strncasecmp Add testsuites for kfunc bpf_strncasecmp. Signed-off-by: Yuzuki Ishiyama Acked-by: Viktor Malik Link: https://lore.kernel.org/r/20260121033328.1850010-3-ishiyama@hpc.is.uec.ac.jp Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/string_kfuncs.c | 1 + tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c | 6 ++++++ tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c | 1 + tools/testing/selftests/bpf/progs/string_kfuncs_success.c | 7 +++++++ 4 files changed, 15 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c b/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c index 0f3bf594e7a5..300032a19445 100644 --- a/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c +++ b/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c @@ -9,6 +9,7 @@ static const char * const test_cases[] = { "strcmp", "strcasecmp", + "strncasecmp", "strchr", "strchrnul", "strnchr", diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c b/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c index 826e6b6aff7e..bddc4e8579d2 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c @@ -33,6 +33,8 @@ SEC("syscall") __retval(USER_PTR_ERR) int test_strcmp_null1(void *ctx) { return SEC("syscall") __retval(USER_PTR_ERR)int test_strcmp_null2(void *ctx) { return bpf_strcmp("hello", NULL); } SEC("syscall") __retval(USER_PTR_ERR) int test_strcasecmp_null1(void *ctx) { return bpf_strcasecmp(NULL, "HELLO"); } SEC("syscall") __retval(USER_PTR_ERR)int test_strcasecmp_null2(void *ctx) { return bpf_strcasecmp("HELLO", NULL); } +SEC("syscall") __retval(USER_PTR_ERR)int test_strncasecmp_null1(void *ctx) { return bpf_strncasecmp(NULL, "HELLO", 5); } +SEC("syscall") __retval(USER_PTR_ERR)int test_strncasecmp_null2(void *ctx) { return bpf_strncasecmp("HELLO", NULL, 5); } SEC("syscall") __retval(USER_PTR_ERR)int test_strchr_null(void *ctx) { return bpf_strchr(NULL, 'a'); } SEC("syscall") __retval(USER_PTR_ERR)int test_strchrnul_null(void *ctx) { return bpf_strchrnul(NULL, 'a'); } SEC("syscall") __retval(USER_PTR_ERR)int test_strnchr_null(void *ctx) { return bpf_strnchr(NULL, 1, 'a'); } @@ -57,6 +59,8 @@ SEC("syscall") __retval(USER_PTR_ERR) int test_strcmp_user_ptr1(void *ctx) { ret SEC("syscall") __retval(USER_PTR_ERR) int test_strcmp_user_ptr2(void *ctx) { return bpf_strcmp("hello", user_ptr); } SEC("syscall") __retval(USER_PTR_ERR) int test_strcasecmp_user_ptr1(void *ctx) { return bpf_strcasecmp(user_ptr, "HELLO"); } SEC("syscall") __retval(USER_PTR_ERR) int test_strcasecmp_user_ptr2(void *ctx) { return bpf_strcasecmp("HELLO", user_ptr); } +SEC("syscall") __retval(USER_PTR_ERR) int test_strncasecmp_user_ptr1(void *ctx) { return bpf_strncasecmp(user_ptr, "HELLO", 5); } +SEC("syscall") __retval(USER_PTR_ERR) int test_strncasecmp_user_ptr2(void *ctx) { return bpf_strncasecmp("HELLO", user_ptr, 5); } SEC("syscall") __retval(USER_PTR_ERR) int test_strchr_user_ptr(void *ctx) { return bpf_strchr(user_ptr, 'a'); } SEC("syscall") __retval(USER_PTR_ERR) int test_strchrnul_user_ptr(void *ctx) { return bpf_strchrnul(user_ptr, 'a'); } SEC("syscall") __retval(USER_PTR_ERR) int test_strnchr_user_ptr(void *ctx) { return bpf_strnchr(user_ptr, 1, 'a'); } @@ -83,6 +87,8 @@ SEC("syscall") __retval(-EFAULT) int test_strcmp_pagefault1(void *ctx) { return SEC("syscall") __retval(-EFAULT) int test_strcmp_pagefault2(void *ctx) { return bpf_strcmp("hello", invalid_kern_ptr); } SEC("syscall") __retval(-EFAULT) int test_strcasecmp_pagefault1(void *ctx) { return bpf_strcasecmp(invalid_kern_ptr, "HELLO"); } SEC("syscall") __retval(-EFAULT) int test_strcasecmp_pagefault2(void *ctx) { return bpf_strcasecmp("HELLO", invalid_kern_ptr); } +SEC("syscall") __retval(-EFAULT) int test_strncasecmp_pagefault1(void *ctx) { return bpf_strncasecmp(invalid_kern_ptr, "HELLO", 5); } +SEC("syscall") __retval(-EFAULT) int test_strncasecmp_pagefault2(void *ctx) { return bpf_strncasecmp("HELLO", invalid_kern_ptr, 5); } SEC("syscall") __retval(-EFAULT) int test_strchr_pagefault(void *ctx) { return bpf_strchr(invalid_kern_ptr, 'a'); } SEC("syscall") __retval(-EFAULT) int test_strchrnul_pagefault(void *ctx) { return bpf_strchrnul(invalid_kern_ptr, 'a'); } SEC("syscall") __retval(-EFAULT) int test_strnchr_pagefault(void *ctx) { return bpf_strnchr(invalid_kern_ptr, 1, 'a'); } diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c b/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c index 05e1da1f250f..412c53b87b18 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c @@ -8,6 +8,7 @@ char long_str[XATTR_SIZE_MAX + 1]; SEC("syscall") int test_strcmp_too_long(void *ctx) { return bpf_strcmp(long_str, long_str); } SEC("syscall") int test_strcasecmp_too_long(void *ctx) { return bpf_strcasecmp(long_str, long_str); } +SEC("syscall") int test_strncasecmp_too_long(void *ctx) { return bpf_strncasecmp(long_str, long_str, sizeof(long_str)); } SEC("syscall") int test_strchr_too_long(void *ctx) { return bpf_strchr(long_str, 'b'); } SEC("syscall") int test_strchrnul_too_long(void *ctx) { return bpf_strchrnul(long_str, 'b'); } SEC("syscall") int test_strnchr_too_long(void *ctx) { return bpf_strnchr(long_str, sizeof(long_str), 'b'); } diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c index a8513964516b..f65b1226a81a 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c @@ -17,6 +17,13 @@ __test(0) int test_strcasecmp_eq2(void *ctx) { return bpf_strcasecmp(str, "HELLO __test(0) int test_strcasecmp_eq3(void *ctx) { return bpf_strcasecmp(str, "HELLO world"); } __test(1) int test_strcasecmp_neq1(void *ctx) { return bpf_strcasecmp(str, "hello"); } __test(1) int test_strcasecmp_neq2(void *ctx) { return bpf_strcasecmp(str, "HELLO"); } +__test(0) int test_strncasecmp_eq1(void *ctx) { return bpf_strncasecmp(str, "hello world", 11); } +__test(0) int test_strncasecmp_eq2(void *ctx) { return bpf_strncasecmp(str, "HELLO WORLD", 11); } +__test(0) int test_strncasecmp_eq3(void *ctx) { return bpf_strncasecmp(str, "HELLO world", 11); } +__test(0) int test_strncasecmp_eq4(void *ctx) { return bpf_strncasecmp(str, "hello", 5); } +__test(0) int test_strncasecmp_eq5(void *ctx) { return bpf_strncasecmp(str, "hello world!", 11); } +__test(-1) int test_strncasecmp_neq1(void *ctx) { return bpf_strncasecmp(str, "hello!", 6); } +__test(1) int test_strncasecmp_neq2(void *ctx) { return bpf_strncasecmp(str, "abc", 3); } __test(1) int test_strchr_found(void *ctx) { return bpf_strchr(str, 'e'); } __test(11) int test_strchr_null(void *ctx) { return bpf_strchr(str, '\0'); } __test(-ENOENT) int test_strchr_notfound(void *ctx) { return bpf_strchr(str, 'x'); } -- cgit v1.2.3 From 6d6ad32e22f028c525d5df471c5522616e645a6b Mon Sep 17 00:00:00 2001 From: Ziyu Chen Date: Wed, 21 Jan 2026 17:41:47 +0800 Subject: selftests/pidfd: fix typo in comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the typo "untill" → "until" in a comment in pidfd_info_test.c. This typo is already listed in scripts/spelling.txt by commit 66b47b4a9dad ("checkpatch: look for common misspellings"). Link: https://lore.kernel.org/r/20260121094147.4187337-1-chenziyu@uniontech.com Suggested-by: Cryolitia PukNgae Signed-off-by: Ziyu Chen Signed-off-by: Shuah Khan --- tools/testing/selftests/pidfd/pidfd_info_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/pidfd/pidfd_info_test.c b/tools/testing/selftests/pidfd/pidfd_info_test.c index 6571e04acd88..8bed951e06a0 100644 --- a/tools/testing/selftests/pidfd/pidfd_info_test.c +++ b/tools/testing/selftests/pidfd/pidfd_info_test.c @@ -229,7 +229,7 @@ static void *pidfd_info_pause_thread(void *arg) close(ipc_socket); - /* Sleep untill we're killed. */ + /* Sleep until we're killed. */ pause(); return NULL; } -- cgit v1.2.3 From a32ae2658471dd87a2f7a438388ed7d9a5767212 Mon Sep 17 00:00:00 2001 From: Kery Qi Date: Wed, 21 Jan 2026 17:41:16 +0800 Subject: selftests/bpf: Fix resource leak in serial_test_wq on attach failure When wq__attach() fails, serial_test_wq() returns early without calling wq__destroy(), leaking the skeleton resources allocated by wq__open_and_load(). This causes ASAN leak reports in selftests runs. Fix this by jumping to a common clean_up label that calls wq__destroy() on all exit paths after successful open_and_load. Note that the early return after wq__open_and_load() failure is correct and doesn't need fixing, since that function returns NULL on failure (after internally cleaning up any partial allocations). Fixes: 8290dba51910 ("selftests/bpf: wq: add bpf_wq_start() checks") Signed-off-by: Kery Qi Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20260121094114.1801-3-qikeyu2017@gmail.com --- tools/testing/selftests/bpf/prog_tests/wq.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/wq.c b/tools/testing/selftests/bpf/prog_tests/wq.c index 15c67d23128b..84831eecc935 100644 --- a/tools/testing/selftests/bpf/prog_tests/wq.c +++ b/tools/testing/selftests/bpf/prog_tests/wq.c @@ -16,12 +16,12 @@ void serial_test_wq(void) /* re-run the success test to check if the timer was actually executed */ wq_skel = wq__open_and_load(); - if (!ASSERT_OK_PTR(wq_skel, "wq_skel_load")) + if (!ASSERT_OK_PTR(wq_skel, "wq__open_and_load")) return; err = wq__attach(wq_skel); if (!ASSERT_OK(err, "wq_attach")) - return; + goto clean_up; prog_fd = bpf_program__fd(wq_skel->progs.test_syscall_array_sleepable); err = bpf_prog_test_run_opts(prog_fd, &topts); @@ -31,6 +31,7 @@ void serial_test_wq(void) usleep(50); /* 10 usecs should be enough, but give it extra */ ASSERT_EQ(wq_skel->bss->ok_sleepable, (1 << 1), "ok_sleepable"); +clean_up: wq__destroy(wq_skel); } -- cgit v1.2.3 From 6ecc08329bab2c87f579cf1a8ab7799d8d88d9bc Mon Sep 17 00:00:00 2001 From: Andre Carvalho Date: Sun, 18 Jan 2026 11:00:27 +0000 Subject: selftests: netconsole: validate target resume Introduce a new netconsole selftest to validate that netconsole is able to resume a deactivated target when the low level interface comes back. The test setups the network using netdevsim, creates a netconsole target and then remove/add netdevsim in order to bring the same interfaces back. Afterwards, the test validates that the target works as expected. Targets are created via cmdline parameters to the module to ensure that we are able to resume targets that were bound by mac and interface name. Reviewed-by: Breno Leitao Signed-off-by: Andre Carvalho Tested-by: Breno Leitao Link: https://patch.msgid.link/20260118-netcons-retrigger-v11-7-4de36aebcf48@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/Makefile | 1 + .../selftests/drivers/net/lib/sh/lib_netcons.sh | 35 +++++- .../selftests/drivers/net/netcons_resume.sh | 124 +++++++++++++++++++++ 3 files changed, 155 insertions(+), 5 deletions(-) create mode 100755 tools/testing/selftests/drivers/net/netcons_resume.sh (limited to 'tools/testing') diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index f5c71d993750..3eba569b3366 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -19,6 +19,7 @@ TEST_PROGS := \ netcons_cmdline.sh \ netcons_fragmented_msg.sh \ netcons_overflow.sh \ + netcons_resume.sh \ netcons_sysdata.sh \ netcons_torture.sh \ netpoll_basic.py \ diff --git a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh index ae8abff4be40..b6093bcf2b06 100644 --- a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh +++ b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh @@ -203,19 +203,21 @@ function do_cleanup() { function cleanup_netcons() { # delete netconsole dynamic reconfiguration # do not fail if the target is already disabled - if [[ ! -d "${NETCONS_PATH}" ]] + local TARGET_PATH=${1:-${NETCONS_PATH}} + + if [[ ! -d "${TARGET_PATH}" ]] then # in some cases this is called before netcons path is created return fi - if [[ $(cat "${NETCONS_PATH}"/enabled) != 0 ]] + if [[ $(cat "${TARGET_PATH}"/enabled) != 0 ]] then - echo 0 > "${NETCONS_PATH}"/enabled || true + echo 0 > "${TARGET_PATH}"/enabled || true fi # Remove all the keys that got created during the selftest - find "${NETCONS_PATH}/userdata/" -mindepth 1 -type d -delete + find "${TARGET_PATH}/userdata/" -mindepth 1 -type d -delete # Remove the configfs entry - rmdir "${NETCONS_PATH}" + rmdir "${TARGET_PATH}" } function cleanup() { @@ -377,6 +379,29 @@ function check_netconsole_module() { fi } +function wait_target_state() { + local TARGET=${1} + local STATE=${2} + local TARGET_PATH="${NETCONS_CONFIGFS}"/"${TARGET}" + local ENABLED=0 + + if [ "${STATE}" == "enabled" ] + then + ENABLED=1 + fi + + if [ ! -d "$TARGET_PATH" ]; then + echo "FAIL: Target does not exist." >&2 + exit "${ksft_fail}" + fi + + local CHECK_CMD="grep \"$ENABLED\" \"$TARGET_PATH/enabled\"" + slowwait 2 sh -c "test -n \"\$($CHECK_CMD)\"" || { + echo "FAIL: ${TARGET} is not ${STATE}." >&2 + exit "${ksft_fail}" + } +} + # A wrapper to translate protocol version to udp version function wait_for_port() { local NAMESPACE=${1} diff --git a/tools/testing/selftests/drivers/net/netcons_resume.sh b/tools/testing/selftests/drivers/net/netcons_resume.sh new file mode 100755 index 000000000000..fc5e5e3ad3d4 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netcons_resume.sh @@ -0,0 +1,124 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# This test validates that netconsole is able to resume a target that was +# deactivated when its interface was removed when the interface is brought +# back up. +# +# The test configures a netconsole target and then removes netdevsim module to +# cause the interface to disappear. Targets are configured via cmdline to ensure +# targets bound by interface name and mac address can be resumed. +# The test verifies that the target moved to disabled state before adding +# netdevsim and the interface back. +# +# Finally, the test verifies that the target is re-enabled automatically and +# the message is received on the destination interface. +# +# Author: Andre Carvalho + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh + +SAVED_SRCMAC="" # to be populated later +SAVED_DSTMAC="" # to be populated later + +modprobe netdevsim 2> /dev/null || true +rmmod netconsole 2> /dev/null || true + +check_netconsole_module + +function cleanup() { + cleanup_netcons "${NETCONS_CONFIGFS}/cmdline0" + do_cleanup + rmmod netconsole +} + +function trigger_reactivation() { + # Add back low level module + modprobe netdevsim + # Recreate namespace and two interfaces + set_network + # Restore MACs + ip netns exec "${NAMESPACE}" ip link set "${DSTIF}" \ + address "${SAVED_DSTMAC}" + if [ "${BINDMODE}" == "mac" ]; then + ip link set dev "${SRCIF}" down + ip link set dev "${SRCIF}" address "${SAVED_SRCMAC}" + # Rename device in order to trigger target resume, as initial + # when device was recreated it didn't have correct mac address. + ip link set dev "${SRCIF}" name "${TARGET}" + fi +} + +function trigger_deactivation() { + # Start by storing mac addresses so we can be restored in reactivate + SAVED_DSTMAC=$(ip netns exec "${NAMESPACE}" \ + cat /sys/class/net/"$DSTIF"/address) + SAVED_SRCMAC=$(mac_get "${SRCIF}") + # Remove low level module + rmmod netdevsim +} + +trap cleanup EXIT + +# Run the test twice, with different cmdline parameters +for BINDMODE in "ifname" "mac" +do + echo "Running with bind mode: ${BINDMODE}" >&2 + # Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) + echo "6 5" > /proc/sys/kernel/printk + + # Create one namespace and two interfaces + set_network + + # Create the command line for netconsole, with the configuration from + # the function above + CMDLINE=$(create_cmdline_str "${BINDMODE}") + + # The content of kmsg will be save to the following file + OUTPUT_FILE="/tmp/${TARGET}-${BINDMODE}" + + # Load the module, with the cmdline set + modprobe netconsole "${CMDLINE}" + # Expose cmdline target in configfs + mkdir "${NETCONS_CONFIGFS}/cmdline0" + + # Target should be enabled + wait_target_state "cmdline0" "enabled" + + # Trigger deactivation by unloading netdevsim module. Target should be + # disabled. + trigger_deactivation + wait_target_state "cmdline0" "disabled" + + # Trigger reactivation by loading netdevsim, recreating the network and + # restoring mac addresses. Target should be re-enabled. + trigger_reactivation + wait_target_state "cmdline0" "enabled" + + # Listen for netconsole port inside the namespace and destination + # interface + listen_port_and_save_to "${OUTPUT_FILE}" & + # Wait for socat to start and listen to the port. + wait_local_port_listen "${NAMESPACE}" "${PORT}" udp + # Send the message + echo "${MSG}: ${TARGET}" > /dev/kmsg + # Wait until socat saves the file to disk + busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" + # Make sure the message was received in the dst part + # and exit + validate_msg "${OUTPUT_FILE}" + + # kill socat in case it is still running + pkill_socat + # Cleanup & unload the module + cleanup + + echo "${BINDMODE} : Test passed" >&2 +done + +trap - EXIT +exit "${EXIT_STATUS}" -- cgit v1.2.3 From 04708606fd7bdc34b69089a4ff848ff36d7088f9 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Tue, 20 Jan 2026 13:39:30 +0000 Subject: selftests: net: amt: wait longer for connection before sending packets Both send_mcast4() and send_mcast6() use sleep 2 to wait for the tunnel connection between the gateway and the relay, and for the listener socket to be created in the LISTENER namespace. However, tests sometimes fail because packets are sent before the connection is fully established. Increase the waiting time to make the tests more reliable, and use wait_local_port_listen() to explicitly wait for the listener socket. Fixes: c08e8baea78e ("selftests: add amt interface selftest script") Signed-off-by: Taehee Yoo Link: https://patch.msgid.link/20260120133930.863845-1-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/amt.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/amt.sh b/tools/testing/selftests/net/amt.sh index 3ef209cacb8e..663744305e52 100755 --- a/tools/testing/selftests/net/amt.sh +++ b/tools/testing/selftests/net/amt.sh @@ -73,6 +73,8 @@ # +------------------------+ #============================================================================== +source lib.sh + readonly LISTENER=$(mktemp -u listener-XXXXXXXX) readonly GATEWAY=$(mktemp -u gateway-XXXXXXXX) readonly RELAY=$(mktemp -u relay-XXXXXXXX) @@ -246,14 +248,15 @@ test_ipv6_forward() send_mcast4() { - sleep 2 + sleep 5 + wait_local_port_listen ${LISTENER} 4000 udp ip netns exec "${SOURCE}" bash -c \ 'printf "%s %128s" 172.17.0.2 | nc -w 1 -u 239.0.0.1 4000' & } send_mcast6() { - sleep 2 + wait_local_port_listen ${LISTENER} 6000 udp ip netns exec "${SOURCE}" bash -c \ 'printf "%s %128s" 2001:db8:3::2 | nc -w 1 -u ff0e::5:6 6000' & } -- cgit v1.2.3 From 830969e7821af377bdc1bb016929ff28c78490e8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 15 Dec 2025 17:52:34 +0100 Subject: selftests/rseq: Implement time slice extension test Provide an initial test case to evaluate the functionality. This needs to be extended to cover the ABI violations and expose the race condition between observing granted and arriving in rseq_slice_yield(). Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251215155709.320325431@linutronix.de --- tools/testing/selftests/rseq/.gitignore | 1 + tools/testing/selftests/rseq/Makefile | 5 +- tools/testing/selftests/rseq/rseq-abi.h | 27 ++++ tools/testing/selftests/rseq/slice_test.c | 219 ++++++++++++++++++++++++++++++ 4 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/rseq/slice_test.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/rseq/.gitignore b/tools/testing/selftests/rseq/.gitignore index 0fda241fa62b..ec01d164c1f0 100644 --- a/tools/testing/selftests/rseq/.gitignore +++ b/tools/testing/selftests/rseq/.gitignore @@ -10,3 +10,4 @@ param_test_mm_cid param_test_mm_cid_benchmark param_test_mm_cid_compare_twice syscall_errors_test +slice_test diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile index 0d0a5fae5954..4ef90823b652 100644 --- a/tools/testing/selftests/rseq/Makefile +++ b/tools/testing/selftests/rseq/Makefile @@ -17,7 +17,7 @@ OVERRIDE_TARGETS = 1 TEST_GEN_PROGS = basic_test basic_percpu_ops_test basic_percpu_ops_mm_cid_test param_test \ param_test_benchmark param_test_compare_twice param_test_mm_cid \ param_test_mm_cid_benchmark param_test_mm_cid_compare_twice \ - syscall_errors_test + syscall_errors_test slice_test TEST_GEN_PROGS_EXTENDED = librseq.so @@ -59,3 +59,6 @@ $(OUTPUT)/param_test_mm_cid_compare_twice: param_test.c $(TEST_GEN_PROGS_EXTENDE $(OUTPUT)/syscall_errors_test: syscall_errors_test.c $(TEST_GEN_PROGS_EXTENDED) \ rseq.h rseq-*.h $(CC) $(CFLAGS) $< $(LDLIBS) -lrseq -o $@ + +$(OUTPUT)/slice_test: slice_test.c $(TEST_GEN_PROGS_EXTENDED) rseq.h rseq-*.h + $(CC) $(CFLAGS) $< $(LDLIBS) -lrseq -o $@ diff --git a/tools/testing/selftests/rseq/rseq-abi.h b/tools/testing/selftests/rseq/rseq-abi.h index fb4ec8a75dd4..ecef315204b2 100644 --- a/tools/testing/selftests/rseq/rseq-abi.h +++ b/tools/testing/selftests/rseq/rseq-abi.h @@ -53,6 +53,27 @@ struct rseq_abi_cs { __u64 abort_ip; } __attribute__((aligned(4 * sizeof(__u64)))); +/** + * rseq_abi_slice_ctrl - Time slice extension control structure + * @all: Compound value + * @request: Request for a time slice extension + * @granted: Granted time slice extension + * + * @request is set by user space and can be cleared by user space or kernel + * space. @granted is set and cleared by the kernel and must only be read + * by user space. + */ +struct rseq_abi_slice_ctrl { + union { + __u32 all; + struct { + __u8 request; + __u8 granted; + __u16 __reserved; + }; + }; +}; + /* * struct rseq_abi is aligned on 4 * 8 bytes to ensure it is always * contained within a single cache-line. @@ -164,6 +185,12 @@ struct rseq_abi { */ __u32 mm_cid; + /* + * Time slice extension control structure. CPU local updates from + * kernel and user space. + */ + struct rseq_abi_slice_ctrl slice_ctrl; + /* * Flexible array member at end of structure, after last feature field. */ diff --git a/tools/testing/selftests/rseq/slice_test.c b/tools/testing/selftests/rseq/slice_test.c new file mode 100644 index 000000000000..357122dcb487 --- /dev/null +++ b/tools/testing/selftests/rseq/slice_test.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: LGPL-2.1 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "rseq.h" + +#include "../kselftest_harness.h" + +#ifndef __NR_rseq_slice_yield +# define __NR_rseq_slice_yield 471 +#endif + +#define BITS_PER_INT 32 +#define BITS_PER_BYTE 8 + +#ifndef PR_RSEQ_SLICE_EXTENSION +# define PR_RSEQ_SLICE_EXTENSION 79 +# define PR_RSEQ_SLICE_EXTENSION_GET 1 +# define PR_RSEQ_SLICE_EXTENSION_SET 2 +# define PR_RSEQ_SLICE_EXT_ENABLE 0x01 +#endif + +#ifndef RSEQ_SLICE_EXT_REQUEST_BIT +# define RSEQ_SLICE_EXT_REQUEST_BIT 0 +# define RSEQ_SLICE_EXT_GRANTED_BIT 1 +#endif + +#ifndef asm_inline +# define asm_inline asm __inline +#endif + +#define NSEC_PER_SEC 1000000000L +#define NSEC_PER_USEC 1000L + +struct noise_params { + int64_t noise_nsecs; + int64_t sleep_nsecs; + int64_t run; +}; + +FIXTURE(slice_ext) +{ + pthread_t noise_thread; + struct noise_params noise_params; +}; + +FIXTURE_VARIANT(slice_ext) +{ + int64_t total_nsecs; + int64_t slice_nsecs; + int64_t noise_nsecs; + int64_t sleep_nsecs; + bool no_yield; +}; + +FIXTURE_VARIANT_ADD(slice_ext, n2_2_50) +{ + .total_nsecs = 5LL * NSEC_PER_SEC, + .slice_nsecs = 2LL * NSEC_PER_USEC, + .noise_nsecs = 2LL * NSEC_PER_USEC, + .sleep_nsecs = 50LL * NSEC_PER_USEC, +}; + +FIXTURE_VARIANT_ADD(slice_ext, n50_2_50) +{ + .total_nsecs = 5LL * NSEC_PER_SEC, + .slice_nsecs = 50LL * NSEC_PER_USEC, + .noise_nsecs = 2LL * NSEC_PER_USEC, + .sleep_nsecs = 50LL * NSEC_PER_USEC, +}; + +FIXTURE_VARIANT_ADD(slice_ext, n2_2_50_no_yield) +{ + .total_nsecs = 5LL * NSEC_PER_SEC, + .slice_nsecs = 2LL * NSEC_PER_USEC, + .noise_nsecs = 2LL * NSEC_PER_USEC, + .sleep_nsecs = 50LL * NSEC_PER_USEC, + .no_yield = true, +}; + + +static inline bool elapsed(struct timespec *start, struct timespec *now, + int64_t span) +{ + int64_t delta = now->tv_sec - start->tv_sec; + + delta *= NSEC_PER_SEC; + delta += now->tv_nsec - start->tv_nsec; + return delta >= span; +} + +static void *noise_thread(void *arg) +{ + struct noise_params *p = arg; + + while (RSEQ_READ_ONCE(p->run)) { + struct timespec ts_start, ts_now; + + clock_gettime(CLOCK_MONOTONIC, &ts_start); + do { + clock_gettime(CLOCK_MONOTONIC, &ts_now); + } while (!elapsed(&ts_start, &ts_now, p->noise_nsecs)); + + ts_start.tv_sec = 0; + ts_start.tv_nsec = p->sleep_nsecs; + clock_nanosleep(CLOCK_MONOTONIC, 0, &ts_start, NULL); + } + return NULL; +} + +FIXTURE_SETUP(slice_ext) +{ + cpu_set_t affinity; + + ASSERT_EQ(sched_getaffinity(0, sizeof(affinity), &affinity), 0); + + /* Pin it on a single CPU. Avoid CPU 0 */ + for (int i = 1; i < CPU_SETSIZE; i++) { + if (!CPU_ISSET(i, &affinity)) + continue; + + CPU_ZERO(&affinity); + CPU_SET(i, &affinity); + ASSERT_EQ(sched_setaffinity(0, sizeof(affinity), &affinity), 0); + break; + } + + ASSERT_EQ(rseq_register_current_thread(), 0); + + ASSERT_EQ(prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET, + PR_RSEQ_SLICE_EXT_ENABLE, 0, 0), 0); + + self->noise_params.noise_nsecs = variant->noise_nsecs; + self->noise_params.sleep_nsecs = variant->sleep_nsecs; + self->noise_params.run = 1; + + ASSERT_EQ(pthread_create(&self->noise_thread, NULL, noise_thread, &self->noise_params), 0); +} + +FIXTURE_TEARDOWN(slice_ext) +{ + self->noise_params.run = 0; + pthread_join(self->noise_thread, NULL); +} + +TEST_F(slice_ext, slice_test) +{ + unsigned long success = 0, yielded = 0, scheduled = 0, raced = 0; + unsigned long total = 0, aborted = 0; + struct rseq_abi *rs = rseq_get_abi(); + struct timespec ts_start, ts_now; + + ASSERT_NE(rs, NULL); + + clock_gettime(CLOCK_MONOTONIC, &ts_start); + do { + struct timespec ts_cs; + bool req = false; + + clock_gettime(CLOCK_MONOTONIC, &ts_cs); + + total++; + RSEQ_WRITE_ONCE(rs->slice_ctrl.request, 1); + do { + clock_gettime(CLOCK_MONOTONIC, &ts_now); + } while (!elapsed(&ts_cs, &ts_now, variant->slice_nsecs)); + + /* + * request can be cleared unconditionally, but for making + * the stats work this is actually checking it first + */ + if (RSEQ_READ_ONCE(rs->slice_ctrl.request)) { + RSEQ_WRITE_ONCE(rs->slice_ctrl.request, 0); + /* Race between check and clear! */ + req = true; + success++; + } + + if (RSEQ_READ_ONCE(rs->slice_ctrl.granted)) { + /* The above raced against a late grant */ + if (req) + success--; + if (variant->no_yield) { + syscall(__NR_getpid); + aborted++; + } else { + yielded++; + if (!syscall(__NR_rseq_slice_yield)) + raced++; + } + } else { + if (!req) + scheduled++; + } + + clock_gettime(CLOCK_MONOTONIC, &ts_now); + } while (!elapsed(&ts_start, &ts_now, variant->total_nsecs)); + + printf("# Total %12ld\n", total); + printf("# Success %12ld\n", success); + printf("# Yielded %12ld\n", yielded); + printf("# Aborted %12ld\n", aborted); + printf("# Scheduled %12ld\n", scheduled); + printf("# Raced %12ld\n", raced); +} + +TEST_HARNESS_MAIN -- cgit v1.2.3 From bb332a9e5a057d2cb9b90e307b26cce9b1f6f660 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 21 Jan 2026 15:10:29 +0100 Subject: selftests/rseq: Add rseq slice histogram script A script that processes trace-cmd data and generates a histogram of rseq slice_ext durations for the recorded workload. Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260121143208.340549136@infradead.org --- Documentation/userspace-api/rseq.rst | 3 + tools/testing/selftests/rseq/rseq-slice-hist.py | 132 ++++++++++++++++++++++++ 2 files changed, 135 insertions(+) create mode 100644 tools/testing/selftests/rseq/rseq-slice-hist.py (limited to 'tools/testing') diff --git a/Documentation/userspace-api/rseq.rst b/Documentation/userspace-api/rseq.rst index 468f6bbe0e25..3cd27a3c7c7e 100644 --- a/Documentation/userspace-api/rseq.rst +++ b/Documentation/userspace-api/rseq.rst @@ -83,6 +83,9 @@ determined by debugfs:rseq/slice_ext_nsec. The default value is 5 usec; which is the minimum value. It can be incremented to 50 usecs, however doing so can/will affect the minimum scheduling latency. +Any proposed changes to this default will have to come with a selftest and +rseq-slice-hist.py output that shows the new value has merrit. + The kernel indicates the grant by clearing rseq::slice_ctrl::request and setting rseq::slice_ctrl::granted to 1. If there is a reschedule of the thread after granting the extension, the kernel clears the granted bit to diff --git a/tools/testing/selftests/rseq/rseq-slice-hist.py b/tools/testing/selftests/rseq/rseq-slice-hist.py new file mode 100644 index 000000000000..b7933eeaefb9 --- /dev/null +++ b/tools/testing/selftests/rseq/rseq-slice-hist.py @@ -0,0 +1,132 @@ +#!/usr/bin/python3 + +# +# trace-cmd record -e hrtimer_start -e hrtimer_cancel -e hrtimer_expire_entry -- $cmd +# + +from tracecmd import * + +def load_kallsyms(file_path='/proc/kallsyms'): + """ + Parses /proc/kallsyms into a dictionary. + Returns: { address_int: symbol_name } + """ + kallsyms_map = {} + + try: + with open(file_path, 'r') as f: + for line in f: + # The format is: [address] [type] [name] [module] + parts = line.split() + if len(parts) < 3: + continue + + addr = int(parts[0], 16) + name = parts[2] + + kallsyms_map[addr] = name + + except PermissionError: + print(f"Error: Permission denied reading {file_path}. Try running with sudo.") + except FileNotFoundError: + print(f"Error: {file_path} not found.") + + return kallsyms_map + +ksyms = load_kallsyms() + +# pending[timer_ptr] = {'ts': timestamp, 'comm': comm} +pending = {} + +# histograms[comm][bucket] = count +histograms = {} + +class OnlineHarmonicMean: + def __init__(self): + self.n = 0 # Count of elements + self.S = 0.0 # Cumulative sum of reciprocals + + def update(self, x): + if x == 0: + raise ValueError("Harmonic mean is undefined for zero.") + + self.n += 1 + self.S += 1.0 / x + return self.n / self.S + + @property + def mean(self): + return self.n / self.S if self.n > 0 else 0 + +ohms = {} + +def handle_start(record): + func_name = ksyms[record.num_field("function")] + if "rseq_slice_expired" in func_name: + timer_ptr = record.num_field("hrtimer") + pending[timer_ptr] = { + 'ts': record.ts, + 'comm': record.comm + } + return None + +def handle_cancel(record): + timer_ptr = record.num_field("hrtimer") + + if timer_ptr in pending: + start_data = pending.pop(timer_ptr) + duration_ns = record.ts - start_data['ts'] + duration_us = duration_ns // 1000 + + comm = start_data['comm'] + + if comm not in ohms: + ohms[comm] = OnlineHarmonicMean() + + ohms[comm].update(duration_ns) + + if comm not in histograms: + histograms[comm] = {} + + histograms[comm][duration_us] = histograms[comm].get(duration_us, 0) + 1 + return None + +def handle_expire(record): + timer_ptr = record.num_field("hrtimer") + + if timer_ptr in pending: + start_data = pending.pop(timer_ptr) + comm = start_data['comm'] + + if comm not in histograms: + histograms[comm] = {} + + # Record -1 bucket for expired (failed to cancel) + histograms[comm][-1] = histograms[comm].get(-1, 0) + 1 + return None + +if __name__ == "__main__": + t = Trace("trace.dat") + for cpu in range(0, t.cpus): + ev = t.read_event(cpu) + while ev: + if "hrtimer_start" in ev.name: + handle_start(ev) + if "hrtimer_cancel" in ev.name: + handle_cancel(ev) + if "hrtimer_expire_entry" in ev.name: + handle_expire(ev) + + ev = t.read_event(cpu) + + print("\n" + "="*40) + print("RSEQ SLICE HISTOGRAM (us)") + print("="*40) + for comm, buckets in histograms.items(): + print(f"\nTask: {comm} Mean: {ohms[comm].mean:.3f} ns") + print(f" {'Latency (us)':<15} | {'Count'}") + print(f" {'-'*30}") + # Sort buckets numerically, putting -1 at the top + for bucket in sorted(buckets.keys()): + label = "EXPIRED" if bucket == -1 else f"{bucket} us" + print(f" {label:<15} | {buckets[bucket]}") -- cgit v1.2.3 From 2a369c4942489aeab799a7509b7cc721eecafa8a Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sat, 17 Jan 2026 13:10:51 +0100 Subject: kselftest/arm64: Use syscall() macro over nolibc my_syscall() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The my_syscall*() macros are internal implementation details of nolibc. Nolibc also provides the regular syscall(2), which is also a macro and directly expands to the correct my_syscall(). Use syscall() instead. As a side-effect this fixes some return value checks, as my_syscall() returns the raw value as set by the kernel and does not set errno. Signed-off-by: Thomas Weißschuh Acked-by: Mark Brown Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/abi/tpidr2.c | 3 +- tools/testing/selftests/arm64/gcs/basic-gcs.c | 40 +++++++++++---------------- 2 files changed, 17 insertions(+), 26 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/arm64/abi/tpidr2.c b/tools/testing/selftests/arm64/abi/tpidr2.c index 1703543fb7c7..ce4550fb7224 100644 --- a/tools/testing/selftests/arm64/abi/tpidr2.c +++ b/tools/testing/selftests/arm64/abi/tpidr2.c @@ -128,8 +128,7 @@ static int sys_clone(unsigned long clone_flags, unsigned long newsp, int *parent_tidptr, unsigned long tls, int *child_tidptr) { - return my_syscall5(__NR_clone, clone_flags, newsp, parent_tidptr, tls, - child_tidptr); + return syscall(__NR_clone, clone_flags, newsp, parent_tidptr, tls, child_tidptr); } #define __STACK_SIZE (8 * 1024 * 1024) diff --git a/tools/testing/selftests/arm64/gcs/basic-gcs.c b/tools/testing/selftests/arm64/gcs/basic-gcs.c index 250977abc398..ae4cce6afe2b 100644 --- a/tools/testing/selftests/arm64/gcs/basic-gcs.c +++ b/tools/testing/selftests/arm64/gcs/basic-gcs.c @@ -22,7 +22,7 @@ static size_t page_size = 65536; static __attribute__((noinline)) void valid_gcs_function(void) { /* Do something the compiler can't optimise out */ - my_syscall1(__NR_prctl, PR_SVE_GET_VL); + syscall(__NR_prctl, PR_SVE_GET_VL); } static inline int gcs_set_status(unsigned long mode) @@ -36,12 +36,10 @@ static inline int gcs_set_status(unsigned long mode) * other 3 values passed in registers to the syscall are zero * since the kernel validates them. */ - ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, mode, - 0, 0, 0); + ret = syscall(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, mode, 0, 0, 0); if (ret == 0) { - ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, - &new_mode, 0, 0, 0); + ret = syscall(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &new_mode, 0, 0, 0); if (ret == 0) { if (new_mode != mode) { ksft_print_msg("Mode set to %lx not %lx\n", @@ -49,7 +47,7 @@ static inline int gcs_set_status(unsigned long mode) ret = -EINVAL; } } else { - ksft_print_msg("Failed to validate mode: %d\n", ret); + ksft_print_msg("Failed to validate mode: %d\n", errno); } if (enabling != chkfeat_gcs()) { @@ -69,10 +67,9 @@ static bool read_status(void) unsigned long state; int ret; - ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, - &state, 0, 0, 0); + ret = syscall(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &state, 0, 0, 0); if (ret != 0) { - ksft_print_msg("Failed to read state: %d\n", ret); + ksft_print_msg("Failed to read state: %d\n", errno); return false; } @@ -188,9 +185,8 @@ static bool map_guarded_stack(void) int elem; bool pass = true; - buf = (void *)my_syscall3(__NR_map_shadow_stack, 0, page_size, - SHADOW_STACK_SET_MARKER | - SHADOW_STACK_SET_TOKEN); + buf = (void *)syscall(__NR_map_shadow_stack, 0, page_size, + SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN); if (buf == MAP_FAILED) { ksft_print_msg("Failed to map %lu byte GCS: %d\n", page_size, errno); @@ -257,8 +253,7 @@ static bool test_fork(void) valid_gcs_function(); get_gcspr(); - ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, - &child_mode, 0, 0, 0); + ret = syscall(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &child_mode, 0, 0, 0); if (ret == 0 && !(child_mode & PR_SHADOW_STACK_ENABLE)) { ksft_print_msg("GCS not enabled in child\n"); ret = -EINVAL; @@ -321,8 +316,7 @@ static bool test_vfork(void) valid_gcs_function(); get_gcspr(); - ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, - &child_mode, 0, 0, 0); + ret = syscall(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &child_mode, 0, 0, 0); if (ret == 0 && !(child_mode & PR_SHADOW_STACK_ENABLE)) { ksft_print_msg("GCS not enabled in child\n"); ret = EXIT_FAILURE; @@ -390,17 +384,15 @@ int main(void) if (!(getauxval(AT_HWCAP) & HWCAP_GCS)) ksft_exit_skip("SKIP GCS not supported\n"); - ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, - &gcs_mode, 0, 0, 0); + ret = syscall(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &gcs_mode, 0, 0, 0); if (ret != 0) - ksft_exit_fail_msg("Failed to read GCS state: %d\n", ret); + ksft_exit_fail_msg("Failed to read GCS state: %d\n", errno); if (!(gcs_mode & PR_SHADOW_STACK_ENABLE)) { gcs_mode = PR_SHADOW_STACK_ENABLE; - ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, - gcs_mode, 0, 0, 0); + ret = syscall(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, gcs_mode, 0, 0, 0); if (ret != 0) - ksft_exit_fail_msg("Failed to enable GCS: %d\n", ret); + ksft_exit_fail_msg("Failed to enable GCS: %d\n", errno); } ksft_set_plan(ARRAY_SIZE(tests)); @@ -410,9 +402,9 @@ int main(void) } /* One last test: disable GCS, we can do this one time */ - ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, 0, 0, 0, 0); + ret = syscall(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, 0, 0, 0, 0); if (ret != 0) - ksft_print_msg("Failed to disable GCS: %d\n", ret); + ksft_print_msg("Failed to disable GCS: %d\n", errno); ksft_finished(); -- cgit v1.2.3 From 57a96356bb6942e16283138d0a42baad29169ed8 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Mon, 19 Jan 2026 10:29:28 +0800 Subject: kselftest/arm64: Add HWCAP test for FEAT_LS64 Add tests for FEAT_LS64. Issue related instructions if feature presents, no SIGILL should be received. When such instructions operate on Device memory or non-cacheable memory, we may received a SIGBUS during the test (w/o FEAT_LS64WB). Just ignore it since we only tested whether the instruction itself can be issued as expected on platforms declaring the support of such features. Acked-by: Arnd Bergmann Acked-by: Oliver Upton Signed-off-by: Yicong Yang Signed-off-by: Zhou Wang Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/abi/hwcap.c | 49 +++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c index c41640f18e4e..9d2df1f3e6bb 100644 --- a/tools/testing/selftests/arm64/abi/hwcap.c +++ b/tools/testing/selftests/arm64/abi/hwcap.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include #include #include @@ -595,6 +597,45 @@ static void lrcpc3_sigill(void) : "=r" (data0), "=r" (data1) : "r" (src) :); } +static void ignore_signal(int sig, siginfo_t *info, void *context) +{ + ucontext_t *uc = context; + + uc->uc_mcontext.pc += 4; +} + +static void ls64_sigill(void) +{ + struct sigaction ign, old; + char src[64] __aligned(64) = { 1 }; + + /* + * LS64 requires target memory to be Device/Non-cacheable (if + * FEAT_LS64WB not supported) and the completer supports these + * instructions, otherwise we'll receive a SIGBUS. Since we are only + * testing the ABI here, so just ignore the SIGBUS and see if we can + * execute the instructions without receiving a SIGILL. Restore the + * handler of SIGBUS after this test. + */ + ign.sa_sigaction = ignore_signal; + ign.sa_flags = SA_SIGINFO | SA_RESTART; + sigemptyset(&ign.sa_mask); + sigaction(SIGBUS, &ign, &old); + + register void *xn asm ("x8") = src; + register u64 xt_1 asm ("x0"); + + /* LD64B x0, [x8] */ + asm volatile(".inst 0xf83fd100" : "=r" (xt_1) : "r" (xn) + : "x1", "x2", "x3", "x4", "x5", "x6", "x7"); + + /* ST64B x0, [x8] */ + asm volatile(".inst 0xf83f9100" : : "r" (xt_1), "r" (xn) + : "x1", "x2", "x3", "x4", "x5", "x6", "x7"); + + sigaction(SIGBUS, &old, NULL); +} + static const struct hwcap_data { const char *name; unsigned long at_hwcap; @@ -1134,6 +1175,14 @@ static const struct hwcap_data { .hwcap_bit = HWCAP3_MTE_STORE_ONLY, .cpuinfo = "mtestoreonly", }, + { + .name = "LS64", + .at_hwcap = AT_HWCAP3, + .hwcap_bit = HWCAP3_LS64, + .cpuinfo = "ls64", + .sigill_fn = ls64_sigill, + .sigill_reliable = true, + }, }; typedef void (*sighandler_fn)(int, siginfo_t *, void *); -- cgit v1.2.3 From 0a98de80136968bab7db37b16282b37f044694d3 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Wed, 21 Jan 2026 10:36:26 +0100 Subject: vsock/test: fix seqpacket message bounds test The test requires the sender (client) to send all messages before waking up the receiver (server). Since virtio-vsock had a bug and did not respect the size of the TX buffer, this test worked, but now that we are going to fix the bug, the test hangs because the sender would fill the TX buffer before waking up the receiver. Set the buffer size in the sender (client) as well, as we already do for the receiver (server). Fixes: 5c338112e48a ("test/vsock: rework message bounds test") Signed-off-by: Stefano Garzarella Link: https://patch.msgid.link/20260121093628.9941-3-sgarzare@redhat.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- tools/testing/vsock/vsock_test.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 27e39354499a..668fbe9eb3cc 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -351,6 +351,7 @@ static void test_stream_msg_peek_server(const struct test_opts *opts) static void test_seqpacket_msg_bounds_client(const struct test_opts *opts) { + unsigned long long sock_buf_size; unsigned long curr_hash; size_t max_msg_size; int page_size; @@ -363,6 +364,16 @@ static void test_seqpacket_msg_bounds_client(const struct test_opts *opts) exit(EXIT_FAILURE); } + sock_buf_size = SOCK_BUF_SIZE; + + setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_MAX_SIZE, + sock_buf_size, + "setsockopt(SO_VM_SOCKETS_BUFFER_MAX_SIZE)"); + + setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE, + sock_buf_size, + "setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)"); + /* Wait, until receiver sets buffer size. */ control_expectln("SRVREADY"); -- cgit v1.2.3 From 2a689f76edd04a53137bd01d4618343f4cdd7e23 Mon Sep 17 00:00:00 2001 From: Melbin K Mathew Date: Wed, 21 Jan 2026 10:36:28 +0100 Subject: vsock/test: add stream TX credit bounds test Add a regression test for the TX credit bounds fix. The test verifies that a sender with a small local buffer size cannot queue excessive data even when the peer advertises a large receive buffer. The client: - Sets a small buffer size (64 KiB) - Connects to server (which advertises 2 MiB buffer) - Sends in non-blocking mode until EAGAIN - Verifies total queued data is bounded This guards against the original vulnerability where a remote peer could cause unbounded kernel memory allocation by advertising a large buffer and reading slowly. Suggested-by: Stefano Garzarella Signed-off-by: Melbin K Mathew [Stefano: use sock_buf_size to check the bytes sent + small fixes] Signed-off-by: Stefano Garzarella Link: https://patch.msgid.link/20260121093628.9941-5-sgarzare@redhat.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- tools/testing/vsock/vsock_test.c | 101 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 668fbe9eb3cc..5bd20ccd9335 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -347,6 +347,7 @@ static void test_stream_msg_peek_server(const struct test_opts *opts) } #define SOCK_BUF_SIZE (2 * 1024 * 1024) +#define SOCK_BUF_SIZE_SMALL (64 * 1024) #define MAX_MSG_PAGES 4 static void test_seqpacket_msg_bounds_client(const struct test_opts *opts) @@ -2230,6 +2231,101 @@ static void test_stream_accepted_setsockopt_server(const struct test_opts *opts) close(fd); } +static void test_stream_tx_credit_bounds_client(const struct test_opts *opts) +{ + unsigned long long sock_buf_size; + size_t total = 0; + char buf[4096]; + int fd; + + memset(buf, 'A', sizeof(buf)); + + fd = vsock_stream_connect(opts->peer_cid, opts->peer_port); + if (fd < 0) { + perror("connect"); + exit(EXIT_FAILURE); + } + + sock_buf_size = SOCK_BUF_SIZE_SMALL; + + setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_MAX_SIZE, + sock_buf_size, + "setsockopt(SO_VM_SOCKETS_BUFFER_MAX_SIZE)"); + + setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE, + sock_buf_size, + "setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)"); + + if (fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK) < 0) { + perror("fcntl(F_SETFL)"); + exit(EXIT_FAILURE); + } + + control_expectln("SRVREADY"); + + for (;;) { + ssize_t sent = send(fd, buf, sizeof(buf), 0); + + if (sent == 0) { + fprintf(stderr, "unexpected EOF while sending bytes\n"); + exit(EXIT_FAILURE); + } + + if (sent < 0) { + if (errno == EINTR) + continue; + + if (errno == EAGAIN || errno == EWOULDBLOCK) + break; + + perror("send"); + exit(EXIT_FAILURE); + } + + total += sent; + } + + control_writeln("CLIDONE"); + close(fd); + + /* We should not be able to send more bytes than the value set as + * local buffer size. + */ + if (total > sock_buf_size) { + fprintf(stderr, + "TX credit too large: queued %zu bytes (expected <= %llu)\n", + total, sock_buf_size); + exit(EXIT_FAILURE); + } +} + +static void test_stream_tx_credit_bounds_server(const struct test_opts *opts) +{ + unsigned long long sock_buf_size; + int fd; + + fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL); + if (fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + + sock_buf_size = SOCK_BUF_SIZE; + + setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_MAX_SIZE, + sock_buf_size, + "setsockopt(SO_VM_SOCKETS_BUFFER_MAX_SIZE)"); + + setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE, + sock_buf_size, + "setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)"); + + control_writeln("SRVREADY"); + control_expectln("CLIDONE"); + + close(fd); +} + static struct test_case test_cases[] = { { .name = "SOCK_STREAM connection reset", @@ -2419,6 +2515,11 @@ static struct test_case test_cases[] = { .run_client = test_stream_msgzcopy_mangle_client, .run_server = test_stream_msgzcopy_mangle_server, }, + { + .name = "SOCK_STREAM TX credit bounds", + .run_client = test_stream_tx_credit_bounds_client, + .run_server = test_stream_tx_credit_bounds_server, + }, {}, }; -- cgit v1.2.3 From 7ff8b1d60881c5f97b5ae426e14d2822917d3b69 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 14 Jan 2026 12:20:28 -0600 Subject: cxl/pci: Remove CXL VH handling in CONFIG_PCIEAER_CXL conditional blocks from core/pci.c Create new config CONFIG_CXL_RAS and put all CXL RAS items behind the config. The config will depend on CPER and PCIE AER to build. Move the related VH RAS code from core/pci.c to core/ras.c. Restricted CXL host (RCH) RAS functions will be moved in a future patch. Cc: Robert Richter Reviewed-by: Joshua Hahn Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang Reviewed-by: Alison Schofield Co-developed-by: Terry Bowman Signed-off-by: Terry Bowman Reviewed-by: Dan Williams Link: https://patch.msgid.link/20260114182055.46029-8-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/cxl/Kconfig | 4 + drivers/cxl/core/Makefile | 2 +- drivers/cxl/core/core.h | 31 ++++++++ drivers/cxl/core/pci.c | 189 +--------------------------------------------- drivers/cxl/core/ras.c | 176 ++++++++++++++++++++++++++++++++++++++++++ drivers/cxl/cxl.h | 8 -- drivers/cxl/cxlpci.h | 16 ++++ tools/testing/cxl/Kbuild | 2 +- 8 files changed, 233 insertions(+), 195 deletions(-) (limited to 'tools/testing') diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig index 48b7314afdb8..217888992c88 100644 --- a/drivers/cxl/Kconfig +++ b/drivers/cxl/Kconfig @@ -233,4 +233,8 @@ config CXL_MCE def_bool y depends on X86_MCE && MEMORY_FAILURE +config CXL_RAS + def_bool y + depends on ACPI_APEI_GHES && PCIEAER && CXL_PCI + endif diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile index 5ad8fef210b5..b2930cc54f8b 100644 --- a/drivers/cxl/core/Makefile +++ b/drivers/cxl/core/Makefile @@ -14,9 +14,9 @@ cxl_core-y += pci.o cxl_core-y += hdm.o cxl_core-y += pmu.o cxl_core-y += cdat.o -cxl_core-y += ras.o cxl_core-$(CONFIG_TRACING) += trace.o cxl_core-$(CONFIG_CXL_REGION) += region.o cxl_core-$(CONFIG_CXL_MCE) += mce.o cxl_core-$(CONFIG_CXL_FEATURES) += features.o cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += edac.o +cxl_core-$(CONFIG_CXL_RAS) += ras.o diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 1fb66132b777..bc818de87ccc 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -144,8 +144,39 @@ int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c); int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port, struct access_coordinate *c); +#ifdef CONFIG_CXL_RAS int cxl_ras_init(void); void cxl_ras_exit(void); +bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base); +void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base); +#else +static inline int cxl_ras_init(void) +{ + return 0; +} + +static inline void cxl_ras_exit(void) +{ +} + +static inline bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) +{ + return false; +} +static inline void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) { } +#endif /* CONFIG_CXL_RAS */ + +/* Restricted CXL Host specific RAS functions */ +#ifdef CONFIG_CXL_RAS +void cxl_dport_map_rch_aer(struct cxl_dport *dport); +void cxl_disable_rch_root_ints(struct cxl_dport *dport); +void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds); +#else +static inline void cxl_dport_map_rch_aer(struct cxl_dport *dport) { } +static inline void cxl_disable_rch_root_ints(struct cxl_dport *dport) { } +static inline void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { } +#endif /* CONFIG_CXL_RAS */ + int cxl_gpf_port_setup(struct cxl_dport *dport); struct cxl_hdm; diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 51bb0f372e40..e132fff80979 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -632,81 +632,8 @@ err: } EXPORT_SYMBOL_NS_GPL(read_cdat_data, "CXL"); -static void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, - void __iomem *ras_base) -{ - void __iomem *addr; - u32 status; - - if (!ras_base) - return; - - addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET; - status = readl(addr); - if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) { - writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); - trace_cxl_aer_correctable_error(cxlds->cxlmd, status); - } -} - -/* CXL spec rev3.0 8.2.4.16.1 */ -static void header_log_copy(void __iomem *ras_base, u32 *log) -{ - void __iomem *addr; - u32 *log_addr; - int i, log_u32_size = CXL_HEADERLOG_SIZE / sizeof(u32); - - addr = ras_base + CXL_RAS_HEADER_LOG_OFFSET; - log_addr = log; - - for (i = 0; i < log_u32_size; i++) { - *log_addr = readl(addr); - log_addr++; - addr += sizeof(u32); - } -} - -/* - * Log the state of the RAS status registers and prepare them to log the - * next error status. Return 1 if reset needed. - */ -static bool cxl_handle_ras(struct cxl_dev_state *cxlds, - void __iomem *ras_base) -{ - u32 hl[CXL_HEADERLOG_SIZE_U32]; - void __iomem *addr; - u32 status; - u32 fe; - - if (!ras_base) - return false; - - addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET; - status = readl(addr); - if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK)) - return false; - - /* If multiple errors, log header points to first error from ctrl reg */ - if (hweight32(status) > 1) { - void __iomem *rcc_addr = - ras_base + CXL_RAS_CAP_CONTROL_OFFSET; - - fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, - readl(rcc_addr))); - } else { - fe = status; - } - - header_log_copy(ras_base, hl); - trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, hl); - writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr); - - return true; -} - -#ifdef CONFIG_PCIEAER_CXL - -static void cxl_dport_map_rch_aer(struct cxl_dport *dport) +#ifdef CONFIG_CXL_RAS +void cxl_dport_map_rch_aer(struct cxl_dport *dport) { resource_size_t aer_phys; struct device *host; @@ -721,19 +648,7 @@ static void cxl_dport_map_rch_aer(struct cxl_dport *dport) } } -static void cxl_dport_map_ras(struct cxl_dport *dport) -{ - struct cxl_register_map *map = &dport->reg_map; - struct device *dev = dport->dport_dev; - - if (!map->component_map.ras.valid) - dev_dbg(dev, "RAS registers not found\n"); - else if (cxl_map_component_regs(map, &dport->regs.component, - BIT(CXL_CM_CAP_CAP_ID_RAS))) - dev_dbg(dev, "Failed to map RAS capability.\n"); -} - -static void cxl_disable_rch_root_ints(struct cxl_dport *dport) +void cxl_disable_rch_root_ints(struct cxl_dport *dport) { void __iomem *aer_base = dport->regs.dport_aer; u32 aer_cmd_mask, aer_cmd; @@ -757,28 +672,6 @@ static void cxl_disable_rch_root_ints(struct cxl_dport *dport) writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND); } -/** - * cxl_dport_init_ras_reporting - Setup CXL RAS report on this dport - * @dport: the cxl_dport that needs to be initialized - * @host: host device for devm operations - */ -void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host) -{ - dport->reg_map.host = host; - cxl_dport_map_ras(dport); - - if (dport->rch) { - struct pci_host_bridge *host_bridge = to_pci_host_bridge(dport->dport_dev); - - if (!host_bridge->native_aer) - return; - - cxl_dport_map_rch_aer(dport); - cxl_disable_rch_root_ints(dport); - } -} -EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL"); - /* * Copy the AER capability registers using 32 bit read accesses. * This is necessary because RCRB AER capability is MMIO mapped. Clear the @@ -827,7 +720,7 @@ static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs, return false; } -static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) +void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { struct pci_dev *pdev = to_pci_dev(cxlds->dev); struct aer_capability_regs aer_regs; @@ -852,82 +745,8 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) else cxl_handle_ras(cxlds, dport->regs.ras); } - -#else -static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { } #endif -void cxl_cor_error_detected(struct pci_dev *pdev) -{ - struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); - struct device *dev = &cxlds->cxlmd->dev; - - scoped_guard(device, dev) { - if (!dev->driver) { - dev_warn(&pdev->dev, - "%s: memdev disabled, abort error handling\n", - dev_name(dev)); - return; - } - - if (cxlds->rcd) - cxl_handle_rdport_errors(cxlds); - - cxl_handle_cor_ras(cxlds, cxlds->regs.ras); - } -} -EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL"); - -pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, - pci_channel_state_t state) -{ - struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); - struct cxl_memdev *cxlmd = cxlds->cxlmd; - struct device *dev = &cxlmd->dev; - bool ue; - - scoped_guard(device, dev) { - if (!dev->driver) { - dev_warn(&pdev->dev, - "%s: memdev disabled, abort error handling\n", - dev_name(dev)); - return PCI_ERS_RESULT_DISCONNECT; - } - - if (cxlds->rcd) - cxl_handle_rdport_errors(cxlds); - /* - * A frozen channel indicates an impending reset which is fatal to - * CXL.mem operation, and will likely crash the system. On the off - * chance the situation is recoverable dump the status of the RAS - * capability registers and bounce the active state of the memdev. - */ - ue = cxl_handle_ras(cxlds, cxlds->regs.ras); - } - - - switch (state) { - case pci_channel_io_normal: - if (ue) { - device_release_driver(dev); - return PCI_ERS_RESULT_NEED_RESET; - } - return PCI_ERS_RESULT_CAN_RECOVER; - case pci_channel_io_frozen: - dev_warn(&pdev->dev, - "%s: frozen state error detected, disable CXL.mem\n", - dev_name(dev)); - device_release_driver(dev); - return PCI_ERS_RESULT_NEED_RESET; - case pci_channel_io_perm_failure: - dev_warn(&pdev->dev, - "failure state error detected, request disconnect\n"); - return PCI_ERS_RESULT_DISCONNECT; - } - return PCI_ERS_RESULT_NEED_RESET; -} -EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL"); - static int cxl_flit_size(struct pci_dev *pdev) { if (cxl_pci_flit_256(pdev)) diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c index 2731ba3a0799..b933030b8e1e 100644 --- a/drivers/cxl/core/ras.c +++ b/drivers/cxl/core/ras.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "trace.h" static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev, @@ -124,3 +125,178 @@ void cxl_ras_exit(void) cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work); cancel_work_sync(&cxl_cper_prot_err_work); } + +static void cxl_dport_map_ras(struct cxl_dport *dport) +{ + struct cxl_register_map *map = &dport->reg_map; + struct device *dev = dport->dport_dev; + + if (!map->component_map.ras.valid) + dev_dbg(dev, "RAS registers not found\n"); + else if (cxl_map_component_regs(map, &dport->regs.component, + BIT(CXL_CM_CAP_CAP_ID_RAS))) + dev_dbg(dev, "Failed to map RAS capability.\n"); +} + +/** + * cxl_dport_init_ras_reporting - Setup CXL RAS report on this dport + * @dport: the cxl_dport that needs to be initialized + * @host: host device for devm operations + */ +void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host) +{ + dport->reg_map.host = host; + cxl_dport_map_ras(dport); + + if (dport->rch) { + struct pci_host_bridge *host_bridge = to_pci_host_bridge(dport->dport_dev); + + if (!host_bridge->native_aer) + return; + + cxl_dport_map_rch_aer(dport); + cxl_disable_rch_root_ints(dport); + } +} +EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL"); + +void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) +{ + void __iomem *addr; + u32 status; + + if (!ras_base) + return; + + addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET; + status = readl(addr); + if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) { + writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); + trace_cxl_aer_correctable_error(cxlds->cxlmd, status); + } +} + +/* CXL spec rev3.0 8.2.4.16.1 */ +static void header_log_copy(void __iomem *ras_base, u32 *log) +{ + void __iomem *addr; + u32 *log_addr; + int i, log_u32_size = CXL_HEADERLOG_SIZE / sizeof(u32); + + addr = ras_base + CXL_RAS_HEADER_LOG_OFFSET; + log_addr = log; + + for (i = 0; i < log_u32_size; i++) { + *log_addr = readl(addr); + log_addr++; + addr += sizeof(u32); + } +} + +/* + * Log the state of the RAS status registers and prepare them to log the + * next error status. Return 1 if reset needed. + */ +bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) +{ + u32 hl[CXL_HEADERLOG_SIZE_U32]; + void __iomem *addr; + u32 status; + u32 fe; + + if (!ras_base) + return false; + + addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET; + status = readl(addr); + if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK)) + return false; + + /* If multiple errors, log header points to first error from ctrl reg */ + if (hweight32(status) > 1) { + void __iomem *rcc_addr = + ras_base + CXL_RAS_CAP_CONTROL_OFFSET; + + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, + readl(rcc_addr))); + } else { + fe = status; + } + + header_log_copy(ras_base, hl); + trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, hl); + writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr); + + return true; +} + +void cxl_cor_error_detected(struct pci_dev *pdev) +{ + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); + struct device *dev = &cxlds->cxlmd->dev; + + scoped_guard(device, dev) { + if (!dev->driver) { + dev_warn(&pdev->dev, + "%s: memdev disabled, abort error handling\n", + dev_name(dev)); + return; + } + + if (cxlds->rcd) + cxl_handle_rdport_errors(cxlds); + + cxl_handle_cor_ras(cxlds, cxlds->regs.ras); + } +} +EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL"); + +pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); + struct cxl_memdev *cxlmd = cxlds->cxlmd; + struct device *dev = &cxlmd->dev; + bool ue; + + scoped_guard(device, dev) { + if (!dev->driver) { + dev_warn(&pdev->dev, + "%s: memdev disabled, abort error handling\n", + dev_name(dev)); + return PCI_ERS_RESULT_DISCONNECT; + } + + if (cxlds->rcd) + cxl_handle_rdport_errors(cxlds); + /* + * A frozen channel indicates an impending reset which is fatal to + * CXL.mem operation, and will likely crash the system. On the off + * chance the situation is recoverable dump the status of the RAS + * capability registers and bounce the active state of the memdev. + */ + ue = cxl_handle_ras(cxlds, cxlds->regs.ras); + } + + + switch (state) { + case pci_channel_io_normal: + if (ue) { + device_release_driver(dev); + return PCI_ERS_RESULT_NEED_RESET; + } + return PCI_ERS_RESULT_CAN_RECOVER; + case pci_channel_io_frozen: + dev_warn(&pdev->dev, + "%s: frozen state error detected, disable CXL.mem\n", + dev_name(dev)); + device_release_driver(dev); + return PCI_ERS_RESULT_NEED_RESET; + case pci_channel_io_perm_failure: + dev_warn(&pdev->dev, + "failure state error detected, request disconnect\n"); + return PCI_ERS_RESULT_DISCONNECT; + } + return PCI_ERS_RESULT_NEED_RESET; +} +EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL"); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index ba17fa86d249..42a76a7a088f 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -803,14 +803,6 @@ struct cxl_dport *devm_cxl_add_rch_dport(struct cxl_port *port, struct device *dport_dev, int port_id, resource_size_t rcrb); -#ifdef CONFIG_PCIEAER_CXL -void cxl_setup_parent_dport(struct device *host, struct cxl_dport *dport); -void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host); -#else -static inline void cxl_dport_init_ras_reporting(struct cxl_dport *dport, - struct device *host) { } -#endif - struct cxl_decoder *to_cxl_decoder(struct device *dev); struct cxl_root_decoder *to_cxl_root_decoder(struct device *dev); struct cxl_switch_decoder *to_cxl_switch_decoder(struct device *dev); diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index cdb7cf3dbcb4..6f9c78886fd9 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -76,7 +76,23 @@ static inline bool cxl_pci_flit_256(struct pci_dev *pdev) struct cxl_dev_state; void read_cdat_data(struct cxl_port *port); + +#ifdef CONFIG_CXL_RAS void cxl_cor_error_detected(struct pci_dev *pdev); pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, pci_channel_state_t state); +void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host); +#else +static inline void cxl_cor_error_detected(struct pci_dev *pdev) { } + +static inline pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + return PCI_ERS_RESULT_NONE; +} + +static inline void cxl_dport_init_ras_reporting(struct cxl_dport *dport, + struct device *host) { } +#endif + #endif /* __CXL_PCI_H__ */ diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 0e151d0572d1..b7ea66382f3b 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -57,12 +57,12 @@ cxl_core-y += $(CXL_CORE_SRC)/pci.o cxl_core-y += $(CXL_CORE_SRC)/hdm.o cxl_core-y += $(CXL_CORE_SRC)/pmu.o cxl_core-y += $(CXL_CORE_SRC)/cdat.o -cxl_core-y += $(CXL_CORE_SRC)/ras.o cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o cxl_core-$(CONFIG_CXL_MCE) += $(CXL_CORE_SRC)/mce.o cxl_core-$(CONFIG_CXL_FEATURES) += $(CXL_CORE_SRC)/features.o cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += $(CXL_CORE_SRC)/edac.o +cxl_core-$(CONFIG_CXL_RAS) += $(CXL_CORE_SRC)/ras.o cxl_core-y += config_check.o cxl_core-y += cxl_core_test.o cxl_core-y += cxl_core_exports.o -- cgit v1.2.3 From 0ff60f2ec3e4043a442e805f80f8a2445113ec8f Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:29 -0600 Subject: cxl/pci: Move CXL driver's RCH error handling into core/ras_rch.c Restricted CXL Host (RCH) protocol error handling uses a procedure distinct from the CXL Virtual Hierarchy (VH) handling. This is because of the differences in the RCH and VH topologies. Improve the maintainability and add ability to enable/disable RCH handling. Move and combine the RCH handling code into a single block conditionally compiled with the CONFIG_CXL_RCH_RAS kernel config. Signed-off-by: Terry Bowman Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Link: https://patch.msgid.link/20260114182055.46029-9-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/cxl/core/Makefile | 1 + drivers/cxl/core/core.h | 11 ++--- drivers/cxl/core/pci.c | 115 ------------------------------------------ drivers/cxl/core/ras_rch.c | 121 +++++++++++++++++++++++++++++++++++++++++++++ tools/testing/cxl/Kbuild | 1 + 5 files changed, 126 insertions(+), 123 deletions(-) create mode 100644 drivers/cxl/core/ras_rch.c (limited to 'tools/testing') diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile index b2930cc54f8b..b37f38d502d8 100644 --- a/drivers/cxl/core/Makefile +++ b/drivers/cxl/core/Makefile @@ -20,3 +20,4 @@ cxl_core-$(CONFIG_CXL_MCE) += mce.o cxl_core-$(CONFIG_CXL_FEATURES) += features.o cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += edac.o cxl_core-$(CONFIG_CXL_RAS) += ras.o +cxl_core-$(CONFIG_CXL_RAS) += ras_rch.o diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index bc818de87ccc..724361195057 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -149,6 +149,9 @@ int cxl_ras_init(void); void cxl_ras_exit(void); bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base); void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base); +void cxl_dport_map_rch_aer(struct cxl_dport *dport); +void cxl_disable_rch_root_ints(struct cxl_dport *dport); +void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds); #else static inline int cxl_ras_init(void) { @@ -164,14 +167,6 @@ static inline bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras return false; } static inline void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) { } -#endif /* CONFIG_CXL_RAS */ - -/* Restricted CXL Host specific RAS functions */ -#ifdef CONFIG_CXL_RAS -void cxl_dport_map_rch_aer(struct cxl_dport *dport); -void cxl_disable_rch_root_ints(struct cxl_dport *dport); -void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds); -#else static inline void cxl_dport_map_rch_aer(struct cxl_dport *dport) { } static inline void cxl_disable_rch_root_ints(struct cxl_dport *dport) { } static inline void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { } diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index e132fff80979..b838c59d7a3c 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -632,121 +632,6 @@ err: } EXPORT_SYMBOL_NS_GPL(read_cdat_data, "CXL"); -#ifdef CONFIG_CXL_RAS -void cxl_dport_map_rch_aer(struct cxl_dport *dport) -{ - resource_size_t aer_phys; - struct device *host; - u16 aer_cap; - - aer_cap = cxl_rcrb_to_aer(dport->dport_dev, dport->rcrb.base); - if (aer_cap) { - host = dport->reg_map.host; - aer_phys = aer_cap + dport->rcrb.base; - dport->regs.dport_aer = devm_cxl_iomap_block(host, aer_phys, - sizeof(struct aer_capability_regs)); - } -} - -void cxl_disable_rch_root_ints(struct cxl_dport *dport) -{ - void __iomem *aer_base = dport->regs.dport_aer; - u32 aer_cmd_mask, aer_cmd; - - if (!aer_base) - return; - - /* - * Disable RCH root port command interrupts. - * CXL 3.0 12.2.1.1 - RCH Downstream Port-detected Errors - * - * This sequence may not be necessary. CXL spec states disabling - * the root cmd register's interrupts is required. But, PCI spec - * shows these are disabled by default on reset. - */ - aer_cmd_mask = (PCI_ERR_ROOT_CMD_COR_EN | - PCI_ERR_ROOT_CMD_NONFATAL_EN | - PCI_ERR_ROOT_CMD_FATAL_EN); - aer_cmd = readl(aer_base + PCI_ERR_ROOT_COMMAND); - aer_cmd &= ~aer_cmd_mask; - writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND); -} - -/* - * Copy the AER capability registers using 32 bit read accesses. - * This is necessary because RCRB AER capability is MMIO mapped. Clear the - * status after copying. - * - * @aer_base: base address of AER capability block in RCRB - * @aer_regs: destination for copying AER capability - */ -static bool cxl_rch_get_aer_info(void __iomem *aer_base, - struct aer_capability_regs *aer_regs) -{ - int read_cnt = sizeof(struct aer_capability_regs) / sizeof(u32); - u32 *aer_regs_buf = (u32 *)aer_regs; - int n; - - if (!aer_base) - return false; - - /* Use readl() to guarantee 32-bit accesses */ - for (n = 0; n < read_cnt; n++) - aer_regs_buf[n] = readl(aer_base + n * sizeof(u32)); - - writel(aer_regs->uncor_status, aer_base + PCI_ERR_UNCOR_STATUS); - writel(aer_regs->cor_status, aer_base + PCI_ERR_COR_STATUS); - - return true; -} - -/* Get AER severity. Return false if there is no error. */ -static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs, - int *severity) -{ - if (aer_regs->uncor_status & ~aer_regs->uncor_mask) { - if (aer_regs->uncor_status & PCI_ERR_ROOT_FATAL_RCV) - *severity = AER_FATAL; - else - *severity = AER_NONFATAL; - return true; - } - - if (aer_regs->cor_status & ~aer_regs->cor_mask) { - *severity = AER_CORRECTABLE; - return true; - } - - return false; -} - -void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) -{ - struct pci_dev *pdev = to_pci_dev(cxlds->dev); - struct aer_capability_regs aer_regs; - struct cxl_dport *dport; - int severity; - - struct cxl_port *port __free(put_cxl_port) = - cxl_pci_find_port(pdev, &dport); - if (!port) - return; - - if (!cxl_rch_get_aer_info(dport->regs.dport_aer, &aer_regs)) - return; - - if (!cxl_rch_get_aer_severity(&aer_regs, &severity)) - return; - - pci_print_aer(pdev, severity, &aer_regs); - - if (severity == AER_CORRECTABLE) - cxl_handle_cor_ras(cxlds, dport->regs.ras); - else - cxl_handle_ras(cxlds, dport->regs.ras); -} -#endif - static int cxl_flit_size(struct pci_dev *pdev) { if (cxl_pci_flit_256(pdev)) diff --git a/drivers/cxl/core/ras_rch.c b/drivers/cxl/core/ras_rch.c new file mode 100644 index 000000000000..ed58afd18ecc --- /dev/null +++ b/drivers/cxl/core/ras_rch.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2025 AMD Corporation. All rights reserved. */ + +#include +#include +#include "cxl.h" +#include "core.h" +#include "cxlmem.h" + +void cxl_dport_map_rch_aer(struct cxl_dport *dport) +{ + resource_size_t aer_phys; + struct device *host; + u16 aer_cap; + + aer_cap = cxl_rcrb_to_aer(dport->dport_dev, dport->rcrb.base); + if (aer_cap) { + host = dport->reg_map.host; + aer_phys = aer_cap + dport->rcrb.base; + dport->regs.dport_aer = + devm_cxl_iomap_block(host, aer_phys, + sizeof(struct aer_capability_regs)); + } +} + +void cxl_disable_rch_root_ints(struct cxl_dport *dport) +{ + void __iomem *aer_base = dport->regs.dport_aer; + u32 aer_cmd_mask, aer_cmd; + + if (!aer_base) + return; + + /* + * Disable RCH root port command interrupts. + * CXL 3.0 12.2.1.1 - RCH Downstream Port-detected Errors + * + * This sequence may not be necessary. CXL spec states disabling + * the root cmd register's interrupts is required. But, PCI spec + * shows these are disabled by default on reset. + */ + aer_cmd_mask = (PCI_ERR_ROOT_CMD_COR_EN | + PCI_ERR_ROOT_CMD_NONFATAL_EN | + PCI_ERR_ROOT_CMD_FATAL_EN); + aer_cmd = readl(aer_base + PCI_ERR_ROOT_COMMAND); + aer_cmd &= ~aer_cmd_mask; + writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND); +} + +/* + * Copy the AER capability registers using 32 bit read accesses. + * This is necessary because RCRB AER capability is MMIO mapped. Clear the + * status after copying. + * + * @aer_base: base address of AER capability block in RCRB + * @aer_regs: destination for copying AER capability + */ +static bool cxl_rch_get_aer_info(void __iomem *aer_base, + struct aer_capability_regs *aer_regs) +{ + int read_cnt = sizeof(struct aer_capability_regs) / sizeof(u32); + u32 *aer_regs_buf = (u32 *)aer_regs; + int n; + + if (!aer_base) + return false; + + /* Use readl() to guarantee 32-bit accesses */ + for (n = 0; n < read_cnt; n++) + aer_regs_buf[n] = readl(aer_base + n * sizeof(u32)); + + writel(aer_regs->uncor_status, aer_base + PCI_ERR_UNCOR_STATUS); + writel(aer_regs->cor_status, aer_base + PCI_ERR_COR_STATUS); + + return true; +} + +/* Get AER severity. Return false if there is no error. */ +static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs, + int *severity) +{ + if (aer_regs->uncor_status & ~aer_regs->uncor_mask) { + if (aer_regs->uncor_status & PCI_ERR_ROOT_FATAL_RCV) + *severity = AER_FATAL; + else + *severity = AER_NONFATAL; + return true; + } + + if (aer_regs->cor_status & ~aer_regs->cor_mask) { + *severity = AER_CORRECTABLE; + return true; + } + + return false; +} + +void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) +{ + struct pci_dev *pdev = to_pci_dev(cxlds->dev); + struct aer_capability_regs aer_regs; + struct cxl_dport *dport; + int severity; + + struct cxl_port *port __free(put_cxl_port) = + cxl_pci_find_port(pdev, &dport); + if (!port) + return; + + if (!cxl_rch_get_aer_info(dport->regs.dport_aer, &aer_regs)) + return; + + if (!cxl_rch_get_aer_severity(&aer_regs, &severity)) + return; + + pci_print_aer(pdev, severity, &aer_regs); + if (severity == AER_CORRECTABLE) + cxl_handle_cor_ras(cxlds, dport->regs.ras); + else + cxl_handle_ras(cxlds, dport->regs.ras); +} diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index b7ea66382f3b..6eceefefb0e0 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -63,6 +63,7 @@ cxl_core-$(CONFIG_CXL_MCE) += $(CXL_CORE_SRC)/mce.o cxl_core-$(CONFIG_CXL_FEATURES) += $(CXL_CORE_SRC)/features.o cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += $(CXL_CORE_SRC)/edac.o cxl_core-$(CONFIG_CXL_RAS) += $(CXL_CORE_SRC)/ras.o +cxl_core-$(CONFIG_CXL_RAS) += $(CXL_CORE_SRC)/ras_rch.o cxl_core-y += config_check.o cxl_core-y += cxl_core_test.o cxl_core-y += cxl_core_exports.o -- cgit v1.2.3 From d07d7c3dd9446b47507d15fdfbb835638a2f6f50 Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Wed, 21 Jan 2026 13:46:44 +0200 Subject: selftests: net: Add kernel selftest for RFC 4884 RFC 4884 extended certain ICMP messages with a length attribute that encodes the length of the "original datagram" field. This is needed so that new information could be appended to these messages without applications thinking that it is part of the "original datagram" field. In version 5.9, the kernel was extended with two new socket options (SOL_IP/IP_RECVERR_4884 and SOL_IPV6/IPV6_RECVERR_RFC4884) that allow user space to retrieve this length which is basically the offset to the ICMP Extension Structure at the end of the ICMP message. This is required by user space applications that need to parse the information contained in the ICMP Extension Structure. For example, the RFC 5837 extension for tracepath. Add a selftest that verifies correct handling of the RFC 4884 length field for both IPv4 and IPv6, with and without extension structures, and validates that malformed extensions are correctly reported as invalid. For each address family, the test creates: - a raw socket used to send locally crafted ICMP error packets to the loopback address, and - a datagram socket used to receive the encapsulated original datagram and associated error metadata from the kernel error queue. ICMP packets are constructed entirely in user space rather than relying on kernel-generated errors. This allows the test to exercise invalid scenarios (such as corrupted checksums and incorrect length fields) and verify that the SO_EE_RFC4884_FLAG_INVALID flag is set as expected. Output Example: $ ./icmp_rfc4884 Starting 18 tests from 18 test cases. RUN rfc4884.ipv4_ext_small_payload.rfc4884 ... OK rfc4884.ipv4_ext_small_payload.rfc4884 ok 1 rfc4884.ipv4_ext_small_payload.rfc4884 RUN rfc4884.ipv4_ext.rfc4884 ... OK rfc4884.ipv4_ext.rfc4884 ok 2 rfc4884.ipv4_ext.rfc4884 RUN rfc4884.ipv4_ext_large_payload.rfc4884 ... OK rfc4884.ipv4_ext_large_payload.rfc4884 ok 3 rfc4884.ipv4_ext_large_payload.rfc4884 RUN rfc4884.ipv4_no_ext_small_payload.rfc4884 ... OK rfc4884.ipv4_no_ext_small_payload.rfc4884 ok 4 rfc4884.ipv4_no_ext_small_payload.rfc4884 RUN rfc4884.ipv4_no_ext_min_payload.rfc4884 ... OK rfc4884.ipv4_no_ext_min_payload.rfc4884 ok 5 rfc4884.ipv4_no_ext_min_payload.rfc4884 RUN rfc4884.ipv4_no_ext_large_payload.rfc4884 ... OK rfc4884.ipv4_no_ext_large_payload.rfc4884 ok 6 rfc4884.ipv4_no_ext_large_payload.rfc4884 RUN rfc4884.ipv4_invalid_ext_checksum.rfc4884 ... OK rfc4884.ipv4_invalid_ext_checksum.rfc4884 ok 7 rfc4884.ipv4_invalid_ext_checksum.rfc4884 RUN rfc4884.ipv4_invalid_ext_length_small.rfc4884 ... OK rfc4884.ipv4_invalid_ext_length_small.rfc4884 ok 8 rfc4884.ipv4_invalid_ext_length_small.rfc4884 RUN rfc4884.ipv4_invalid_ext_length_large.rfc4884 ... OK rfc4884.ipv4_invalid_ext_length_large.rfc4884 ok 9 rfc4884.ipv4_invalid_ext_length_large.rfc4884 RUN rfc4884.ipv6_ext_small_payload.rfc4884 ... OK rfc4884.ipv6_ext_small_payload.rfc4884 ok 10 rfc4884.ipv6_ext_small_payload.rfc4884 RUN rfc4884.ipv6_ext.rfc4884 ... OK rfc4884.ipv6_ext.rfc4884 ok 11 rfc4884.ipv6_ext.rfc4884 RUN rfc4884.ipv6_ext_large_payload.rfc4884 ... OK rfc4884.ipv6_ext_large_payload.rfc4884 ok 12 rfc4884.ipv6_ext_large_payload.rfc4884 RUN rfc4884.ipv6_no_ext_small_payload.rfc4884 ... OK rfc4884.ipv6_no_ext_small_payload.rfc4884 ok 13 rfc4884.ipv6_no_ext_small_payload.rfc4884 RUN rfc4884.ipv6_no_ext_min_payload.rfc4884 ... OK rfc4884.ipv6_no_ext_min_payload.rfc4884 ok 14 rfc4884.ipv6_no_ext_min_payload.rfc4884 RUN rfc4884.ipv6_no_ext_large_payload.rfc4884 ... OK rfc4884.ipv6_no_ext_large_payload.rfc4884 ok 15 rfc4884.ipv6_no_ext_large_payload.rfc4884 RUN rfc4884.ipv6_invalid_ext_checksum.rfc4884 ... OK rfc4884.ipv6_invalid_ext_checksum.rfc4884 ok 16 rfc4884.ipv6_invalid_ext_checksum.rfc4884 RUN rfc4884.ipv6_invalid_ext_length_small.rfc4884 ... OK rfc4884.ipv6_invalid_ext_length_small.rfc4884 ok 17 rfc4884.ipv6_invalid_ext_length_small.rfc4884 RUN rfc4884.ipv6_invalid_ext_length_large.rfc4884 ... OK rfc4884.ipv6_invalid_ext_length_large.rfc4884 ok 18 rfc4884.ipv6_invalid_ext_length_large.rfc4884 PASSED: 18 / 18 tests passed. Totals: pass:18 fail:0 xfail:0 xpass:0 skip:0 error:0 Signed-off-by: Danielle Ratson Reviewed-by: Ido Schimmel Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260121114644.2863640-1-danieller@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/.gitignore | 1 + tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/icmp_rfc4884.c | 679 +++++++++++++++++++++++++++++ 3 files changed, 681 insertions(+) create mode 100644 tools/testing/selftests/net/icmp_rfc4884.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 6930fe926c58..97ad4d551d44 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -7,6 +7,7 @@ cmsg_sender epoll_busy_poll fin_ack_lat hwtstamp_config +icmp_rfc4884 io_uring_zerocopy_tx ioam6_parser ip_defrag diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index b66ba04f19d9..fe7937dc5f45 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -166,6 +166,7 @@ TEST_GEN_PROGS := \ bind_timewait \ bind_wildcard \ epoll_busy_poll \ + icmp_rfc4884 \ ipv6_fragmentation \ proc_net_pktgen \ reuseaddr_conflict \ diff --git a/tools/testing/selftests/net/icmp_rfc4884.c b/tools/testing/selftests/net/icmp_rfc4884.c new file mode 100644 index 000000000000..cd826b913557 --- /dev/null +++ b/tools/testing/selftests/net/icmp_rfc4884.c @@ -0,0 +1,679 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest_harness.h" + +static const unsigned short src_port = 44444; +static const unsigned short dst_port = 55555; +static const int min_orig_dgram_len = 128; +static const int min_payload_len_v4 = + min_orig_dgram_len - sizeof(struct iphdr) - sizeof(struct udphdr); +static const int min_payload_len_v6 = + min_orig_dgram_len - sizeof(struct ipv6hdr) - sizeof(struct udphdr); +static const uint8_t orig_payload_byte = 0xAA; + +struct sockaddr_inet { + union { + struct sockaddr_in6 v6; + struct sockaddr_in v4; + struct sockaddr sa; + }; + socklen_t len; +}; + +struct ip_case_info { + int domain; + int level; + int opt1; + int opt2; + int proto; + int (*build_func)(uint8_t *buf, ssize_t buflen, bool with_ext, + int payload_len, bool bad_csum, bool bad_len, + bool smaller_len); + int min_payload; +}; + +static int bringup_loopback(void) +{ + struct ifreq ifr = { + .ifr_name = "lo" + }; + int fd; + + fd = socket(AF_INET, SOCK_DGRAM, 0); + if (fd < 0) + return -1; + + if (ioctl(fd, SIOCGIFFLAGS, &ifr) < 0) + goto err; + + ifr.ifr_flags = ifr.ifr_flags | IFF_UP; + + if (ioctl(fd, SIOCSIFFLAGS, &ifr) < 0) + goto err; + + close(fd); + return 0; + +err: + close(fd); + return -1; +} + +static uint16_t csum(const void *buf, size_t len) +{ + const uint8_t *data = buf; + uint32_t sum = 0; + + while (len > 1) { + sum += (data[0] << 8) | data[1]; + data += 2; + len -= 2; + } + + if (len == 1) + sum += data[0] << 8; + + while (sum >> 16) + sum = (sum & 0xFFFF) + (sum >> 16); + + return ~sum & 0xFFFF; +} + +static int poll_err(int fd) +{ + struct pollfd pfd; + + memset(&pfd, 0, sizeof(pfd)); + pfd.fd = fd; + + if (poll(&pfd, 1, 5000) != 1 || pfd.revents != POLLERR) + return -1; + + return 0; +} + +static void set_addr(struct sockaddr_inet *addr, int domain, + unsigned short port) +{ + memset(addr, 0, sizeof(*addr)); + + switch (domain) { + case AF_INET: + addr->v4.sin_family = AF_INET; + addr->v4.sin_port = htons(port); + addr->v4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + addr->len = sizeof(addr->v4); + break; + case AF_INET6: + addr->v6.sin6_family = AF_INET6; + addr->v6.sin6_port = htons(port); + addr->v6.sin6_addr = in6addr_loopback; + addr->len = sizeof(addr->v6); + break; + } +} + +static int bind_and_setsockopt(int fd, const struct ip_case_info *info) +{ + struct sockaddr_inet addr; + int opt = 1; + + set_addr(&addr, info->domain, src_port); + + if (setsockopt(fd, info->level, info->opt1, &opt, sizeof(opt)) < 0) + return -1; + + if (setsockopt(fd, info->level, info->opt2, &opt, sizeof(opt)) < 0) + return -1; + + return bind(fd, &addr.sa, addr.len); +} + +static int build_rfc4884_ext(uint8_t *buf, size_t buflen, bool bad_csum, + bool bad_len, bool smaller_len) +{ + struct icmp_extobj_hdr *objh; + struct icmp_ext_hdr *exthdr; + size_t obj_len, ext_len; + uint16_t sum; + + /* Use an object payload of 4 bytes */ + obj_len = sizeof(*objh) + sizeof(uint32_t); + ext_len = sizeof(*exthdr) + obj_len; + + if (ext_len > buflen) + return -EINVAL; + + exthdr = (struct icmp_ext_hdr *)buf; + objh = (struct icmp_extobj_hdr *)(buf + sizeof(*exthdr)); + + exthdr->version = 2; + /* When encoding a bad object length, either encode a length too small + * to fit the object header or too big to fit in the packet. + */ + if (bad_len) + obj_len = smaller_len ? sizeof(*objh) - 1 : obj_len * 2; + objh->length = htons(obj_len); + + sum = csum(buf, ext_len); + exthdr->checksum = htons(bad_csum ? sum - 1 : sum); + + return ext_len; +} + +static int build_orig_dgram_v4(uint8_t *buf, ssize_t buflen, int payload_len) +{ + struct udphdr *udph; + struct iphdr *iph; + size_t len = 0; + + len = sizeof(*iph) + sizeof(*udph) + payload_len; + if (len > buflen) + return -EINVAL; + + iph = (struct iphdr *)buf; + udph = (struct udphdr *)(buf + sizeof(*iph)); + + iph->version = 4; + iph->ihl = 5; + iph->protocol = IPPROTO_UDP; + iph->saddr = htonl(INADDR_LOOPBACK); + iph->daddr = htonl(INADDR_LOOPBACK); + iph->tot_len = htons(len); + iph->check = htons(csum(iph, sizeof(*iph))); + + udph->source = htons(src_port); + udph->dest = htons(dst_port); + udph->len = htons(sizeof(*udph) + payload_len); + + memset(buf + sizeof(*iph) + sizeof(*udph), orig_payload_byte, + payload_len); + + return len; +} + +static int build_orig_dgram_v6(uint8_t *buf, ssize_t buflen, int payload_len) +{ + struct udphdr *udph; + struct ipv6hdr *iph; + size_t len = 0; + + len = sizeof(*iph) + sizeof(*udph) + payload_len; + if (len > buflen) + return -EINVAL; + + iph = (struct ipv6hdr *)buf; + udph = (struct udphdr *)(buf + sizeof(*iph)); + + iph->version = 6; + iph->payload_len = htons(sizeof(*udph) + payload_len); + iph->nexthdr = IPPROTO_UDP; + iph->saddr = in6addr_loopback; + iph->daddr = in6addr_loopback; + + udph->source = htons(src_port); + udph->dest = htons(dst_port); + udph->len = htons(sizeof(*udph) + payload_len); + + memset(buf + sizeof(*iph) + sizeof(*udph), orig_payload_byte, + payload_len); + + return len; +} + +static int build_icmpv4_pkt(uint8_t *buf, ssize_t buflen, bool with_ext, + int payload_len, bool bad_csum, bool bad_len, + bool smaller_len) +{ + struct icmphdr *icmph; + int len, ret; + + len = sizeof(*icmph); + memset(buf, 0, buflen); + + icmph = (struct icmphdr *)buf; + icmph->type = ICMP_DEST_UNREACH; + icmph->code = ICMP_PORT_UNREACH; + icmph->checksum = 0; + + ret = build_orig_dgram_v4(buf + len, buflen - len, payload_len); + if (ret < 0) + return ret; + + len += ret; + + icmph->un.reserved[1] = (len - sizeof(*icmph)) / sizeof(uint32_t); + + if (with_ext) { + ret = build_rfc4884_ext(buf + len, buflen - len, + bad_csum, bad_len, smaller_len); + if (ret < 0) + return ret; + + len += ret; + } + + icmph->checksum = htons(csum(icmph, len)); + return len; +} + +static int build_icmpv6_pkt(uint8_t *buf, ssize_t buflen, bool with_ext, + int payload_len, bool bad_csum, bool bad_len, + bool smaller_len) +{ + struct icmp6hdr *icmph; + int len, ret; + + len = sizeof(*icmph); + memset(buf, 0, buflen); + + icmph = (struct icmp6hdr *)buf; + icmph->icmp6_type = ICMPV6_DEST_UNREACH; + icmph->icmp6_code = ICMPV6_PORT_UNREACH; + icmph->icmp6_cksum = 0; + + ret = build_orig_dgram_v6(buf + len, buflen - len, payload_len); + if (ret < 0) + return ret; + + len += ret; + + icmph->icmp6_datagram_len = (len - sizeof(*icmph)) / sizeof(uint64_t); + + if (with_ext) { + ret = build_rfc4884_ext(buf + len, buflen - len, + bad_csum, bad_len, smaller_len); + if (ret < 0) + return ret; + + len += ret; + } + + icmph->icmp6_cksum = htons(csum(icmph, len)); + return len; +} + +FIXTURE(rfc4884) {}; + +FIXTURE_SETUP(rfc4884) +{ + int ret; + + ret = unshare(CLONE_NEWNET); + ASSERT_EQ(ret, 0) { + TH_LOG("unshare(CLONE_NEWNET) failed: %s", strerror(errno)); + } + + ret = bringup_loopback(); + ASSERT_EQ(ret, 0) TH_LOG("Failed to bring up loopback interface"); +} + +FIXTURE_TEARDOWN(rfc4884) +{ +} + +const struct ip_case_info ipv4_info = { + .domain = AF_INET, + .level = SOL_IP, + .opt1 = IP_RECVERR, + .opt2 = IP_RECVERR_RFC4884, + .proto = IPPROTO_ICMP, + .build_func = build_icmpv4_pkt, + .min_payload = min_payload_len_v4, +}; + +const struct ip_case_info ipv6_info = { + .domain = AF_INET6, + .level = SOL_IPV6, + .opt1 = IPV6_RECVERR, + .opt2 = IPV6_RECVERR_RFC4884, + .proto = IPPROTO_ICMPV6, + .build_func = build_icmpv6_pkt, + .min_payload = min_payload_len_v6, +}; + +FIXTURE_VARIANT(rfc4884) { + /* IPv4/v6 related information */ + struct ip_case_info info; + /* Whether to append an ICMP extension or not */ + bool with_ext; + /* UDP payload length */ + int payload_len; + /* Whether to generate a bad checksum in the ICMP extension structure */ + bool bad_csum; + /* Whether to generate a bad length in the ICMP object header */ + bool bad_len; + /* Whether it is too small to fit the object header or too big to fit + * in the packet + */ + bool smaller_len; +}; + +/* Tests that a valid ICMPv4 error message with extension and the original + * datagram is smaller than 128 bytes, generates an error with zero offset, + * and does not raise the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv4_ext_small_payload) { + .info = ipv4_info, + .with_ext = true, + .payload_len = 64, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that a valid ICMPv4 error message with extension and 128 bytes original + * datagram, generates an error with the expected offset, and does not raise the + * SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv4_ext) { + .info = ipv4_info, + .with_ext = true, + .payload_len = min_payload_len_v4, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that a valid ICMPv4 error message with extension and the original + * datagram is larger than 128 bytes, generates an error with the expected + * offset, and does not raise the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv4_ext_large_payload) { + .info = ipv4_info, + .with_ext = true, + .payload_len = 256, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that a valid ICMPv4 error message without extension and the original + * datagram is smaller than 128 bytes, generates an error with zero offset, + * and does not raise the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv4_no_ext_small_payload) { + .info = ipv4_info, + .with_ext = false, + .payload_len = 64, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that a valid ICMPv4 error message without extension and 128 bytes + * original datagram, generates an error with zero offset, and does not raise + * the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv4_no_ext_min_payload) { + .info = ipv4_info, + .with_ext = false, + .payload_len = min_payload_len_v4, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that a valid ICMPv4 error message without extension and the original + * datagram is larger than 128 bytes, generates an error with zero offset, + * and does not raise the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv4_no_ext_large_payload) { + .info = ipv4_info, + .with_ext = false, + .payload_len = 256, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that an ICMPv4 error message with extension and an invalid checksum, + * generates an error with the expected offset, and raises the + * SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv4_invalid_ext_checksum) { + .info = ipv4_info, + .with_ext = true, + .payload_len = min_payload_len_v4, + .bad_csum = true, + .bad_len = false, +}; + +/* Tests that an ICMPv4 error message with extension and an object length + * smaller than the object header, generates an error with the expected offset, + * and raises the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv4_invalid_ext_length_small) { + .info = ipv4_info, + .with_ext = true, + .payload_len = min_payload_len_v4, + .bad_csum = false, + .bad_len = true, + .smaller_len = true, +}; + +/* Tests that an ICMPv4 error message with extension and an object length that + * is too big to fit in the packet, generates an error with the expected offset, + * and raises the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv4_invalid_ext_length_large) { + .info = ipv4_info, + .with_ext = true, + .payload_len = min_payload_len_v4, + .bad_csum = false, + .bad_len = true, + .smaller_len = false, +}; + +/* Tests that a valid ICMPv6 error message with extension and the original + * datagram is smaller than 128 bytes, generates an error with zero offset, + * and does not raise the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv6_ext_small_payload) { + .info = ipv6_info, + .with_ext = true, + .payload_len = 64, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that a valid ICMPv6 error message with extension and 128 bytes original + * datagram, generates an error with the expected offset, and does not raise the + * SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv6_ext) { + .info = ipv6_info, + .with_ext = true, + .payload_len = min_payload_len_v6, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that a valid ICMPv6 error message with extension and the original + * datagram is larger than 128 bytes, generates an error with the expected + * offset, and does not raise the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv6_ext_large_payload) { + .info = ipv6_info, + .with_ext = true, + .payload_len = 256, + .bad_csum = false, + .bad_len = false, +}; +/* Tests that a valid ICMPv6 error message without extension and the original + * datagram is smaller than 128 bytes, generates an error with zero offset, + * and does not raise the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv6_no_ext_small_payload) { + .info = ipv6_info, + .with_ext = false, + .payload_len = 64, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that a valid ICMPv6 error message without extension and 128 bytes + * original datagram, generates an error with zero offset, and does not + * raise the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv6_no_ext_min_payload) { + .info = ipv6_info, + .with_ext = false, + .payload_len = min_payload_len_v6, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that a valid ICMPv6 error message without extension and the original + * datagram is larger than 128 bytes, generates an error with zero offset, + * and does not raise the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv6_no_ext_large_payload) { + .info = ipv6_info, + .with_ext = false, + .payload_len = 256, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that an ICMPv6 error message with extension and an invalid checksum, + * generates an error with the expected offset, and raises the + * SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv6_invalid_ext_checksum) { + .info = ipv6_info, + .with_ext = true, + .payload_len = min_payload_len_v6, + .bad_csum = true, + .bad_len = false, +}; + +/* Tests that an ICMPv6 error message with extension and an object length + * smaller than the object header, generates an error with the expected offset, + * and raises the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv6_invalid_ext_length_small) { + .info = ipv6_info, + .with_ext = true, + .payload_len = min_payload_len_v6, + .bad_csum = false, + .bad_len = true, + .smaller_len = true, +}; + +/* Tests that an ICMPv6 error message with extension and an object length that + * is too big to fit in the packet, generates an error with the expected offset, + * and raises the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv6_invalid_ext_length_large) { + .info = ipv6_info, + .with_ext = true, + .payload_len = min_payload_len_v6, + .bad_csum = false, + .bad_len = true, + .smaller_len = false, +}; + +static void +check_rfc4884_offset(struct __test_metadata *_metadata, int sock, + const FIXTURE_VARIANT(rfc4884) *v) +{ + char rxbuf[1024]; + char ctrl[1024]; + struct iovec iov = { + .iov_base = rxbuf, + .iov_len = sizeof(rxbuf) + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = ctrl, + .msg_controllen = sizeof(ctrl), + }; + struct cmsghdr *cmsg; + int recv; + + ASSERT_EQ(poll_err(sock), 0); + + recv = recvmsg(sock, &msg, MSG_ERRQUEUE); + ASSERT_GE(recv, 0) TH_LOG("recvmsg(MSG_ERRQUEUE) failed"); + + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) { + bool is_invalid, expected_invalid; + struct sock_extended_err *ee; + int expected_off; + uint16_t off; + + if (cmsg->cmsg_level != v->info.level || + cmsg->cmsg_type != v->info.opt1) { + TH_LOG("Unrelated cmsgs were encountered in recvmsg()"); + continue; + } + + ee = (struct sock_extended_err *)CMSG_DATA(cmsg); + off = ee->ee_rfc4884.len; + is_invalid = ee->ee_rfc4884.flags & SO_EE_RFC4884_FLAG_INVALID; + + expected_invalid = v->bad_csum || v->bad_len; + ASSERT_EQ(is_invalid, expected_invalid) { + TH_LOG("Expected invalidity flag to be %d, but got %d", + expected_invalid, is_invalid); + } + + expected_off = + (v->with_ext && v->payload_len >= v->info.min_payload) ? + v->payload_len : 0; + ASSERT_EQ(off, expected_off) { + TH_LOG("Expected RFC4884 offset %u, got %u", + expected_off, off); + } + break; + } +} + +TEST_F(rfc4884, rfc4884) +{ + const typeof(variant) v = variant; + struct sockaddr_inet addr; + uint8_t pkt[1024]; + int dgram, raw; + int len, sent; + int err; + + dgram = socket(v->info.domain, SOCK_DGRAM, 0); + ASSERT_GE(dgram, 0) TH_LOG("Opening datagram socket failed"); + + err = bind_and_setsockopt(dgram, &v->info); + ASSERT_EQ(err, 0) TH_LOG("Bind failed"); + + raw = socket(v->info.domain, SOCK_RAW, v->info.proto); + ASSERT_GE(raw, 0) TH_LOG("Opening raw socket failed"); + + len = v->info.build_func(pkt, sizeof(pkt), v->with_ext, v->payload_len, + v->bad_csum, v->bad_len, v->smaller_len); + ASSERT_GT(len, 0) TH_LOG("Building packet failed"); + + set_addr(&addr, v->info.domain, 0); + sent = sendto(raw, pkt, len, 0, &addr.sa, addr.len); + ASSERT_EQ(len, sent) TH_LOG("Sending packet failed"); + + check_rfc4884_offset(_metadata, dgram, v); + + close(dgram); + close(raw); +} + +TEST_HARNESS_MAIN -- cgit v1.2.3 From caf84294ff98bb7455722285f30f46c193ffccdd Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:48 +0800 Subject: selftests: ublk: fix user_data truncation for tgt_data >= 256 The build_user_data() function packs multiple fields into a __u64 value using bit shifts. Without explicit __u64 casts before shifting, the shift operations are performed on 32-bit unsigned integers before being promoted to 64-bit, causing data loss. Specifically, when tgt_data >= 256, the expression (tgt_data << 24) shifts on a 32-bit value, truncating the upper 8 bits before promotion to __u64. Since tgt_data can be up to 16 bits (assertion allows up to 65535), values >= 256 would have their high byte lost. Add explicit __u64 casts to both op and tgt_data before shifting to ensure the shift operations happen in 64-bit space, preserving all bits of the input values. user_data_to_tgt_data() is only used by stripe.c, in which the max supported member disks are 4, so won't trigger this issue. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index cb757fd9bf9d..69fd5794f300 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -262,7 +262,7 @@ static inline __u64 build_user_data(unsigned tag, unsigned op, _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7); assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7)); - return tag | (op << 16) | (tgt_data << 24) | + return tag | ((__u64)op << 16) | ((__u64)tgt_data << 24) | (__u64)q_id << 56 | (__u64)is_target_io << 63; } -- cgit v1.2.3 From 584709ad5ce359f8b5773eb6af40070412652c51 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:49 +0800 Subject: selftests: ublk: replace assert() with ublk_assert() Replace assert() with ublk_assert() since it is often triggered in daemon, and we may get nothing shown in terminal. Add ublk_assert(), so we can log something to syslog when assert() is triggered. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/common.c | 2 +- tools/testing/selftests/ublk/file_backed.c | 2 +- tools/testing/selftests/ublk/kublk.c | 2 +- tools/testing/selftests/ublk/kublk.h | 2 +- tools/testing/selftests/ublk/stripe.c | 10 +++++----- tools/testing/selftests/ublk/utils.h | 10 ++++++++++ 6 files changed, 19 insertions(+), 9 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/common.c b/tools/testing/selftests/ublk/common.c index d9873d4d50d0..530f9877c9dd 100644 --- a/tools/testing/selftests/ublk/common.c +++ b/tools/testing/selftests/ublk/common.c @@ -16,7 +16,7 @@ int backing_file_tgt_init(struct ublk_dev *dev, unsigned int nr_direct) { int fd, i; - assert(dev->nr_fds == 1); + ublk_assert(dev->nr_fds == 1); for (i = 0; i < dev->tgt.nr_backing_files; i++) { char *file = dev->tgt.backing_file[i]; diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index c3ce5ff72422..889047bd8fa3 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -10,7 +10,7 @@ static enum io_uring_op ublk_to_uring_op(const struct ublksrv_io_desc *iod, int return zc ? IORING_OP_READ_FIXED : IORING_OP_READ; else if (ublk_op == UBLK_IO_OP_WRITE) return zc ? IORING_OP_WRITE_FIXED : IORING_OP_WRITE; - assert(0); + ublk_assert(0); } static int loop_queue_flush_io(struct ublk_thread *t, struct ublk_queue *q, diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 3472ce7426ba..e98999bea9b1 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -825,7 +825,7 @@ static void ublk_handle_uring_cmd(struct ublk_thread *t, } if (cqe->res == UBLK_IO_RES_OK) { - assert(tag < q->q_depth); + ublk_assert(tag < q->q_depth); if (ublk_queue_use_user_copy(q)) ublk_user_copy(io, UBLK_IO_OP_WRITE); diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 69fd5794f300..48634d29c084 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -260,7 +260,7 @@ static inline __u64 build_user_data(unsigned tag, unsigned op, { /* we only have 7 bits to encode q_id */ _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7); - assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7)); + ublk_assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7)); return tag | ((__u64)op << 16) | ((__u64)tgt_data << 24) | (__u64)q_id << 56 | (__u64)is_target_io << 63; diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c index 2be1c36438e7..b967447fe591 100644 --- a/tools/testing/selftests/ublk/stripe.c +++ b/tools/testing/selftests/ublk/stripe.c @@ -96,12 +96,12 @@ static void calculate_stripe_array(const struct stripe_conf *conf, this->seq = seq; s->nr += 1; } else { - assert(seq == this->seq); - assert(this->start + this->nr_sects == stripe_off); + ublk_assert(seq == this->seq); + ublk_assert(this->start + this->nr_sects == stripe_off); this->nr_sects += nr_sects; } - assert(this->nr_vec < this->cap); + ublk_assert(this->nr_vec < this->cap); this->vec[this->nr_vec].iov_base = (void *)(base + done); this->vec[this->nr_vec++].iov_len = nr_sects << 9; @@ -120,7 +120,7 @@ static inline enum io_uring_op stripe_to_uring_op( return zc ? IORING_OP_READV_FIXED : IORING_OP_READV; else if (ublk_op == UBLK_IO_OP_WRITE) return zc ? IORING_OP_WRITEV_FIXED : IORING_OP_WRITEV; - assert(0); + ublk_assert(0); } static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, @@ -322,7 +322,7 @@ static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) if (!dev->tgt.nr_backing_files || dev->tgt.nr_backing_files > NR_STRIPE) return -EINVAL; - assert(dev->nr_fds == dev->tgt.nr_backing_files + 1); + ublk_assert(dev->nr_fds == dev->tgt.nr_backing_files + 1); for (i = 0; i < dev->tgt.nr_backing_files; i++) dev->tgt.backing_file_size[i] &= ~((1 << chunk_shift) - 1); diff --git a/tools/testing/selftests/ublk/utils.h b/tools/testing/selftests/ublk/utils.h index a852e0b7153e..17eefed73690 100644 --- a/tools/testing/selftests/ublk/utils.h +++ b/tools/testing/selftests/ublk/utils.h @@ -43,6 +43,7 @@ static inline void ublk_err(const char *fmt, ...) va_start(ap, fmt); vfprintf(stderr, fmt, ap); + va_end(ap); } static inline void ublk_log(const char *fmt, ...) @@ -52,6 +53,7 @@ static inline void ublk_log(const char *fmt, ...) va_start(ap, fmt); vfprintf(stdout, fmt, ap); + va_end(ap); } } @@ -62,7 +64,15 @@ static inline void ublk_dbg(int level, const char *fmt, ...) va_start(ap, fmt); vfprintf(stdout, fmt, ap); + va_end(ap); } } +#define ublk_assert(x) do { \ + if (!(x)) { \ + ublk_err("%s %d: assert!\n", __func__, __LINE__); \ + assert(x); \ + } \ +} while (0) + #endif -- cgit v1.2.3 From f1d621b5a04ea41ee90f177db084d00db57e6839 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:50 +0800 Subject: selftests: ublk: add ublk_io_buf_idx() for returning io buffer index Since UBLK_F_PER_IO_DAEMON is added, io buffer index may depend on current thread because the common way is to use per-pthread io_ring_ctx for issuing ublk uring_cmd. Add one helper for returning io buffer index, so we can hide the buffer index implementation details for target code. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/file_backed.c | 9 +++++---- tools/testing/selftests/ublk/kublk.c | 9 +++++---- tools/testing/selftests/ublk/kublk.h | 10 +++++++++- tools/testing/selftests/ublk/null.c | 18 ++++++++++-------- tools/testing/selftests/ublk/stripe.c | 7 ++++--- 5 files changed, 33 insertions(+), 20 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index 889047bd8fa3..228af2580ac6 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -39,6 +39,7 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, __u32 len = iod->nr_sectors << 9; struct io_uring_sqe *sqe[3]; void *addr = io->buf_addr; + unsigned short buf_index = ublk_io_buf_idx(t, q, tag); if (iod->op_flags & UBLK_IO_F_INTEGRITY) { ublk_io_alloc_sqes(t, sqe, 1); @@ -62,7 +63,7 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, len, offset); if (auto_zc) - sqe[0]->buf_index = tag; + sqe[0]->buf_index = buf_index; io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE); /* bit63 marks us as tgt io */ sqe[0]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1); @@ -71,7 +72,7 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, ublk_io_alloc_sqes(t, sqe, 3); - io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, io->buf_index); + io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, buf_index); sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; sqe[0]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); @@ -79,11 +80,11 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, io_uring_prep_rw(op, sqe[1], ublk_get_registered_fd(q, 1) /*fds[1]*/, 0, len, offset); - sqe[1]->buf_index = tag; + sqe[1]->buf_index = buf_index; sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK; sqe[1]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1); - io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, io->buf_index); + io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, buf_index); sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1); return !!(iod->op_flags & UBLK_IO_F_INTEGRITY) + 2; diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index e98999bea9b1..9b6f1cd04dc4 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -605,16 +605,17 @@ static void ublk_dev_unprep(struct ublk_dev *dev) close(dev->fds[0]); } -static void ublk_set_auto_buf_reg(const struct ublk_queue *q, +static void ublk_set_auto_buf_reg(const struct ublk_thread *t, + const struct ublk_queue *q, struct io_uring_sqe *sqe, unsigned short tag) { struct ublk_auto_buf_reg buf = {}; if (q->tgt_ops->buf_index) - buf.index = q->tgt_ops->buf_index(q, tag); + buf.index = q->tgt_ops->buf_index(t, q, tag); else - buf.index = q->ios[tag].buf_index; + buf.index = ublk_io_buf_idx(t, q, tag); if (ublk_queue_auto_zc_fallback(q)) buf.flags = UBLK_AUTO_BUF_REG_FALLBACK; @@ -730,7 +731,7 @@ int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io) cmd->addr = 0; if (ublk_queue_use_auto_zc(q)) - ublk_set_auto_buf_reg(q, sqe[0], io->tag); + ublk_set_auto_buf_reg(t, q, sqe[0], io->tag); user_data = build_user_data(io->tag, _IOC_NR(cmd_op), 0, q->q_id, 0); io_uring_sqe_set_data64(sqe[0], user_data); diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 48634d29c084..311a75da9b21 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -150,7 +150,8 @@ struct ublk_tgt_ops { void (*usage)(const struct ublk_tgt_ops *ops); /* return buffer index for UBLK_F_AUTO_BUF_REG */ - unsigned short (*buf_index)(const struct ublk_queue *, int tag); + unsigned short (*buf_index)(const struct ublk_thread *t, + const struct ublk_queue *, int tag); }; struct ublk_tgt { @@ -393,6 +394,13 @@ static inline void ublk_set_sqe_cmd_op(struct io_uring_sqe *sqe, __u32 cmd_op) addr[1] = 0; } +static inline unsigned short ublk_io_buf_idx(const struct ublk_thread *t, + const struct ublk_queue *q, + unsigned tag) +{ + return q->ios[tag].buf_index; +} + static inline struct ublk_io *ublk_get_io(struct ublk_queue *q, unsigned tag) { return &q->ios[tag]; diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c index 3aa162f08476..7656888f4149 100644 --- a/tools/testing/selftests/ublk/null.c +++ b/tools/testing/selftests/ublk/null.c @@ -44,12 +44,12 @@ static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) } static void __setup_nop_io(int tag, const struct ublksrv_io_desc *iod, - struct io_uring_sqe *sqe, int q_id) + struct io_uring_sqe *sqe, int q_id, unsigned buf_idx) { unsigned ublk_op = ublksrv_get_op(iod); io_uring_prep_nop(sqe); - sqe->buf_index = tag; + sqe->buf_index = buf_idx; sqe->flags |= IOSQE_FIXED_FILE; sqe->rw_flags = IORING_NOP_FIXED_BUFFER | IORING_NOP_INJECT_RESULT; sqe->len = iod->nr_sectors << 9; /* injected result */ @@ -61,18 +61,19 @@ static int null_queue_zc_io(struct ublk_thread *t, struct ublk_queue *q, { const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); struct io_uring_sqe *sqe[3]; + unsigned short buf_idx = ublk_io_buf_idx(t, q, tag); ublk_io_alloc_sqes(t, sqe, 3); - io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index); + io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, buf_idx); sqe[0]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; - __setup_nop_io(tag, iod, sqe[1], q->q_id); + __setup_nop_io(tag, iod, sqe[1], q->q_id, buf_idx); sqe[1]->flags |= IOSQE_IO_HARDLINK; - io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index); + io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, buf_idx); sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1); // buf register is marked as IOSQE_CQE_SKIP_SUCCESS @@ -86,7 +87,7 @@ static int null_queue_auto_zc_io(struct ublk_thread *t, struct ublk_queue *q, struct io_uring_sqe *sqe[1]; ublk_io_alloc_sqes(t, sqe, 1); - __setup_nop_io(tag, iod, sqe[0], q->q_id); + __setup_nop_io(tag, iod, sqe[0], q->q_id, ublk_io_buf_idx(t, q, tag)); return 1; } @@ -137,11 +138,12 @@ static int ublk_null_queue_io(struct ublk_thread *t, struct ublk_queue *q, * return invalid buffer index for triggering auto buffer register failure, * then UBLK_IO_RES_NEED_REG_BUF handling is covered */ -static unsigned short ublk_null_buf_index(const struct ublk_queue *q, int tag) +static unsigned short ublk_null_buf_index(const struct ublk_thread *t, + const struct ublk_queue *q, int tag) { if (ublk_queue_auto_zc_fallback(q)) return (unsigned short)-1; - return q->ios[tag].buf_index; + return ublk_io_buf_idx(t, q, tag); } const struct ublk_tgt_ops null_tgt_ops = { diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c index b967447fe591..dca819f5366e 100644 --- a/tools/testing/selftests/ublk/stripe.c +++ b/tools/testing/selftests/ublk/stripe.c @@ -135,6 +135,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, struct ublk_io *io = ublk_get_io(q, tag); int i, extra = zc ? 2 : 0; void *base = io->buf_addr; + unsigned short buf_idx = ublk_io_buf_idx(t, q, tag); io->private_data = s; calculate_stripe_array(conf, iod, s, base); @@ -142,7 +143,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, ublk_io_alloc_sqes(t, sqe, s->nr + extra); if (zc) { - io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, io->buf_index); + io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, buf_idx); sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; sqe[0]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); @@ -158,7 +159,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, t->start << 9); io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE); if (auto_zc || zc) { - sqe[i]->buf_index = tag; + sqe[i]->buf_index = buf_idx; if (zc) sqe[i]->flags |= IOSQE_IO_HARDLINK; } @@ -168,7 +169,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, if (zc) { struct io_uring_sqe *unreg = sqe[s->nr + 1]; - io_uring_prep_buf_unregister(unreg, q, tag, q->q_id, io->buf_index); + io_uring_prep_buf_unregister(unreg, q, tag, q->q_id, buf_idx); unreg->user_data = build_user_data( tag, ublk_cmd_op_nr(unreg->cmd_op), 0, q->q_id, 1); } -- cgit v1.2.3 From dccbfa9d416424fbcbc83a46e84c604bad1db9d0 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:51 +0800 Subject: selftests: ublk: add batch buffer management infrastructure Add the foundational infrastructure for UBLK_F_BATCH_IO buffer management including: - Allocator utility functions for small sized per-thread allocation - Batch buffer allocation and deallocation functions - Buffer index management for commit buffers - Thread state management for batch I/O mode - Buffer size calculation based on device features This prepares the groundwork for handling batch I/O commands by establishing the buffer management layer needed for UBLK_U_IO_PREP_IO_CMDS and UBLK_U_IO_COMMIT_IO_CMDS operations. The allocator uses CPU sets for efficient per-thread buffer tracking, and commit buffers are pre-allocated with 2 buffers per thread to handle overlapping command operations. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/batch.c | 152 +++++++++++++++++++++++++++++++++++ tools/testing/selftests/ublk/kublk.c | 26 +++++- tools/testing/selftests/ublk/kublk.h | 53 ++++++++++++ tools/testing/selftests/ublk/utils.h | 54 +++++++++++++ 4 files changed, 282 insertions(+), 3 deletions(-) create mode 100644 tools/testing/selftests/ublk/batch.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/batch.c b/tools/testing/selftests/ublk/batch.c new file mode 100644 index 000000000000..609e6073c9c0 --- /dev/null +++ b/tools/testing/selftests/ublk/batch.c @@ -0,0 +1,152 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Description: UBLK_F_BATCH_IO buffer management + */ + +#include "kublk.h" + +static inline void *ublk_get_commit_buf(struct ublk_thread *t, + unsigned short buf_idx) +{ + unsigned idx; + + if (buf_idx < t->commit_buf_start || + buf_idx >= t->commit_buf_start + t->nr_commit_buf) + return NULL; + idx = buf_idx - t->commit_buf_start; + return t->commit_buf + idx * t->commit_buf_size; +} + +/* + * Allocate one buffer for UBLK_U_IO_PREP_IO_CMDS or UBLK_U_IO_COMMIT_IO_CMDS + * + * Buffer index is returned. + */ +static inline unsigned short ublk_alloc_commit_buf(struct ublk_thread *t) +{ + int idx = allocator_get(&t->commit_buf_alloc); + + if (idx >= 0) + return idx + t->commit_buf_start; + return UBLKS_T_COMMIT_BUF_INV_IDX; +} + +/* + * Free one commit buffer which is used by UBLK_U_IO_PREP_IO_CMDS or + * UBLK_U_IO_COMMIT_IO_CMDS + */ +static inline void ublk_free_commit_buf(struct ublk_thread *t, + unsigned short i) +{ + unsigned short idx = i - t->commit_buf_start; + + ublk_assert(idx < t->nr_commit_buf); + ublk_assert(allocator_get_val(&t->commit_buf_alloc, idx) != 0); + + allocator_put(&t->commit_buf_alloc, idx); +} + +static unsigned char ublk_commit_elem_buf_size(struct ublk_dev *dev) +{ + if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_USER_COPY | + UBLK_F_AUTO_BUF_REG)) + return 8; + + /* one extra 8bytes for carrying buffer address */ + return 16; +} + +static unsigned ublk_commit_buf_size(struct ublk_thread *t) +{ + struct ublk_dev *dev = t->dev; + unsigned elem_size = ublk_commit_elem_buf_size(dev); + unsigned int total = elem_size * dev->dev_info.queue_depth; + unsigned int page_sz = getpagesize(); + + return round_up(total, page_sz); +} + +static void free_batch_commit_buf(struct ublk_thread *t) +{ + if (t->commit_buf) { + unsigned buf_size = ublk_commit_buf_size(t); + unsigned int total = buf_size * t->nr_commit_buf; + + munlock(t->commit_buf, total); + free(t->commit_buf); + } + allocator_deinit(&t->commit_buf_alloc); +} + +static int alloc_batch_commit_buf(struct ublk_thread *t) +{ + unsigned buf_size = ublk_commit_buf_size(t); + unsigned int total = buf_size * t->nr_commit_buf; + unsigned int page_sz = getpagesize(); + void *buf = NULL; + int ret; + + allocator_init(&t->commit_buf_alloc, t->nr_commit_buf); + + t->commit_buf = NULL; + ret = posix_memalign(&buf, page_sz, total); + if (ret || !buf) + goto fail; + + t->commit_buf = buf; + + /* lock commit buffer pages for fast access */ + if (mlock(t->commit_buf, total)) + ublk_err("%s: can't lock commit buffer %s\n", __func__, + strerror(errno)); + + return 0; + +fail: + free_batch_commit_buf(t); + return ret; +} + +void ublk_batch_prepare(struct ublk_thread *t) +{ + /* + * We only handle single device in this thread context. + * + * All queues have same feature flags, so use queue 0's for + * calculate uring_cmd flags. + * + * This way looks not elegant, but it works so far. + */ + struct ublk_queue *q = &t->dev->q[0]; + + t->commit_buf_elem_size = ublk_commit_elem_buf_size(t->dev); + t->commit_buf_size = ublk_commit_buf_size(t); + t->commit_buf_start = t->nr_bufs; + t->nr_commit_buf = 2; + t->nr_bufs += t->nr_commit_buf; + + t->cmd_flags = 0; + if (ublk_queue_use_auto_zc(q)) { + if (ublk_queue_auto_zc_fallback(q)) + t->cmd_flags |= UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK; + } else if (!ublk_queue_no_buf(q)) + t->cmd_flags |= UBLK_BATCH_F_HAS_BUF_ADDR; + + t->state |= UBLKS_T_BATCH_IO; + + ublk_log("%s: thread %d commit(nr_bufs %u, buf_size %u, start %u)\n", + __func__, t->idx, + t->nr_commit_buf, t->commit_buf_size, + t->nr_bufs); +} + +int ublk_batch_alloc_buf(struct ublk_thread *t) +{ + ublk_assert(t->nr_commit_buf < 16); + return alloc_batch_commit_buf(t); +} + +void ublk_batch_free_buf(struct ublk_thread *t) +{ + free_batch_commit_buf(t); +} diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 9b6f1cd04dc4..3864f42e6c29 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -435,6 +435,8 @@ static void ublk_thread_deinit(struct ublk_thread *t) { io_uring_unregister_buffers(&t->ring); + ublk_batch_free_buf(t); + io_uring_unregister_ring_fd(&t->ring); if (t->ring.ring_fd > 0) { @@ -531,15 +533,33 @@ static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flag unsigned nr_ios = dev->dev_info.queue_depth * dev->dev_info.nr_hw_queues; unsigned max_nr_ios_per_thread = nr_ios / dev->nthreads; max_nr_ios_per_thread += !!(nr_ios % dev->nthreads); - ret = io_uring_register_buffers_sparse( - &t->ring, max_nr_ios_per_thread); + + t->nr_bufs = max_nr_ios_per_thread; + } else { + t->nr_bufs = 0; + } + + if (ublk_dev_batch_io(dev)) + ublk_batch_prepare(t); + + if (t->nr_bufs) { + ret = io_uring_register_buffers_sparse(&t->ring, t->nr_bufs); if (ret) { - ublk_err("ublk dev %d thread %d register spare buffers failed %d", + ublk_err("ublk dev %d thread %d register spare buffers failed %d\n", dev->dev_info.dev_id, t->idx, ret); goto fail; } } + if (ublk_dev_batch_io(dev)) { + ret = ublk_batch_alloc_buf(t); + if (ret) { + ublk_err("ublk dev %d thread %d alloc batch buf failed %d\n", + dev->dev_info.dev_id, t->idx, ret); + goto fail; + } + } + io_uring_register_ring_fd(&t->ring); if (flags & UBLKS_Q_NO_UBLK_FIXED_FD) { diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 311a75da9b21..424c333596ac 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -182,15 +182,40 @@ struct ublk_queue { struct ublk_io ios[UBLK_QUEUE_DEPTH]; }; +/* align with `ublk_elem_header` */ +struct ublk_batch_elem { + __u16 tag; + __u16 buf_index; + __s32 result; + __u64 buf_addr; +}; + struct ublk_thread { struct ublk_dev *dev; unsigned idx; #define UBLKS_T_STOPPING (1U << 0) #define UBLKS_T_IDLE (1U << 1) +#define UBLKS_T_BATCH_IO (1U << 31) /* readonly */ unsigned state; unsigned int cmd_inflight; unsigned int io_inflight; + + unsigned short nr_bufs; + + /* followings are for BATCH_IO */ + unsigned short commit_buf_start; + unsigned char commit_buf_elem_size; + /* + * We just support single device, so pre-calculate commit/prep flags + */ + unsigned short cmd_flags; + unsigned int nr_commit_buf; + unsigned int commit_buf_size; + void *commit_buf; +#define UBLKS_T_COMMIT_BUF_INV_IDX ((unsigned short)-1) + struct allocator commit_buf_alloc; + struct io_uring ring; }; @@ -211,6 +236,27 @@ struct ublk_dev { extern int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io); +static inline int __ublk_use_batch_io(__u64 flags) +{ + return flags & UBLK_F_BATCH_IO; +} + +static inline int ublk_queue_batch_io(const struct ublk_queue *q) +{ + return __ublk_use_batch_io(q->flags); +} + +static inline int ublk_dev_batch_io(const struct ublk_dev *dev) +{ + return __ublk_use_batch_io(dev->dev_info.flags); +} + +/* only work for handle single device in this pthread context */ +static inline int ublk_thread_batch_io(const struct ublk_thread *t) +{ + return t->state & UBLKS_T_BATCH_IO; +} + static inline void ublk_set_integrity_params(const struct dev_ctx *ctx, struct ublk_params *params) { @@ -465,6 +511,13 @@ static inline int ublk_queue_no_buf(const struct ublk_queue *q) return ublk_queue_use_zc(q) || ublk_queue_use_auto_zc(q); } +/* Initialize batch I/O state and calculate buffer parameters */ +void ublk_batch_prepare(struct ublk_thread *t); +/* Allocate and register commit buffers for batch operations */ +int ublk_batch_alloc_buf(struct ublk_thread *t); +/* Free commit buffers and cleanup batch allocator */ +void ublk_batch_free_buf(struct ublk_thread *t); + extern const struct ublk_tgt_ops null_tgt_ops; extern const struct ublk_tgt_ops loop_tgt_ops; extern const struct ublk_tgt_ops stripe_tgt_ops; diff --git a/tools/testing/selftests/ublk/utils.h b/tools/testing/selftests/ublk/utils.h index 17eefed73690..aab522f26167 100644 --- a/tools/testing/selftests/ublk/utils.h +++ b/tools/testing/selftests/ublk/utils.h @@ -21,6 +21,60 @@ #define round_up(val, rnd) \ (((val) + ((rnd) - 1)) & ~((rnd) - 1)) +/* small sized & per-thread allocator */ +struct allocator { + unsigned int size; + cpu_set_t *set; +}; + +static inline int allocator_init(struct allocator *a, unsigned size) +{ + a->set = CPU_ALLOC(size); + a->size = size; + + if (a->set) + return 0; + return -ENOMEM; +} + +static inline void allocator_deinit(struct allocator *a) +{ + CPU_FREE(a->set); + a->set = NULL; + a->size = 0; +} + +static inline int allocator_get(struct allocator *a) +{ + int i; + + for (i = 0; i < a->size; i += 1) { + size_t set_size = CPU_ALLOC_SIZE(a->size); + + if (!CPU_ISSET_S(i, set_size, a->set)) { + CPU_SET_S(i, set_size, a->set); + return i; + } + } + + return -1; +} + +static inline void allocator_put(struct allocator *a, int i) +{ + size_t set_size = CPU_ALLOC_SIZE(a->size); + + if (i >= 0 && i < a->size) + CPU_CLR_S(i, set_size, a->set); +} + +static inline int allocator_get_val(struct allocator *a, int i) +{ + size_t set_size = CPU_ALLOC_SIZE(a->size); + + return CPU_ISSET_S(i, set_size, a->set); +} + static inline unsigned int ilog2(unsigned int x) { if (x == 0) -- cgit v1.2.3 From d468930a019df71951a80fde20f6348136a2175d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:52 +0800 Subject: selftests: ublk: handle UBLK_U_IO_PREP_IO_CMDS Implement support for UBLK_U_IO_PREP_IO_CMDS in the batch I/O framework: - Add batch command initialization and setup functions - Implement prep command queueing with proper buffer management - Add command completion handling for prep and commit commands - Integrate batch I/O setup into thread initialization - Update CQE handling to support batch commands The implementation uses the previously established buffer management infrastructure to queue UBLK_U_IO_PREP_IO_CMDS commands. Commands are prepared in the first thread context and use commit buffers for efficient command batching. Key changes: - ublk_batch_queue_prep_io_cmds() prepares I/O command batches - ublk_batch_compl_cmd() handles batch command completions - Modified thread setup to use batch operations when enabled - Enhanced buffer index calculation for batch mode Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/batch.c | 114 +++++++++++++++++++++++++++++++++++ tools/testing/selftests/ublk/kublk.c | 50 +++++++++++---- tools/testing/selftests/ublk/kublk.h | 22 +++++++ 3 files changed, 174 insertions(+), 12 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/batch.c b/tools/testing/selftests/ublk/batch.c index 609e6073c9c0..079cae77add1 100644 --- a/tools/testing/selftests/ublk/batch.c +++ b/tools/testing/selftests/ublk/batch.c @@ -150,3 +150,117 @@ void ublk_batch_free_buf(struct ublk_thread *t) { free_batch_commit_buf(t); } + +static void ublk_init_batch_cmd(struct ublk_thread *t, __u16 q_id, + struct io_uring_sqe *sqe, unsigned op, + unsigned short elem_bytes, + unsigned short nr_elem, + unsigned short buf_idx) +{ + struct ublk_batch_io *cmd; + __u64 user_data; + + cmd = (struct ublk_batch_io *)ublk_get_sqe_cmd(sqe); + + ublk_set_sqe_cmd_op(sqe, op); + + sqe->fd = 0; /* dev->fds[0] */ + sqe->opcode = IORING_OP_URING_CMD; + sqe->flags = IOSQE_FIXED_FILE; + + cmd->q_id = q_id; + cmd->flags = 0; + cmd->reserved = 0; + cmd->elem_bytes = elem_bytes; + cmd->nr_elem = nr_elem; + + user_data = build_user_data(buf_idx, _IOC_NR(op), 0, q_id, 0); + io_uring_sqe_set_data64(sqe, user_data); + + t->cmd_inflight += 1; + + ublk_dbg(UBLK_DBG_IO_CMD, "%s: thread %u qid %d cmd_op %x data %lx " + "nr_elem %u elem_bytes %u buf_size %u buf_idx %d " + "cmd_inflight %u\n", + __func__, t->idx, q_id, op, user_data, + cmd->nr_elem, cmd->elem_bytes, + nr_elem * elem_bytes, buf_idx, t->cmd_inflight); +} + +static void ublk_setup_commit_sqe(struct ublk_thread *t, + struct io_uring_sqe *sqe, + unsigned short buf_idx) +{ + struct ublk_batch_io *cmd; + + cmd = (struct ublk_batch_io *)ublk_get_sqe_cmd(sqe); + + /* Use plain user buffer instead of fixed buffer */ + cmd->flags |= t->cmd_flags; +} + +int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) +{ + unsigned short nr_elem = q->q_depth; + unsigned short buf_idx = ublk_alloc_commit_buf(t); + struct io_uring_sqe *sqe; + void *buf; + int i; + + ublk_assert(buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX); + + ublk_io_alloc_sqes(t, &sqe, 1); + + ublk_assert(nr_elem == q->q_depth); + buf = ublk_get_commit_buf(t, buf_idx); + for (i = 0; i < nr_elem; i++) { + struct ublk_batch_elem *elem = (struct ublk_batch_elem *)( + buf + i * t->commit_buf_elem_size); + struct ublk_io *io = &q->ios[i]; + + elem->tag = i; + elem->result = 0; + + if (ublk_queue_use_auto_zc(q)) + elem->buf_index = ublk_batch_io_buf_idx(t, q, i); + else if (!ublk_queue_no_buf(q)) + elem->buf_addr = (__u64)io->buf_addr; + } + + sqe->addr = (__u64)buf; + sqe->len = t->commit_buf_elem_size * nr_elem; + + ublk_init_batch_cmd(t, q->q_id, sqe, UBLK_U_IO_PREP_IO_CMDS, + t->commit_buf_elem_size, nr_elem, buf_idx); + ublk_setup_commit_sqe(t, sqe, buf_idx); + return 0; +} + +static void ublk_batch_compl_commit_cmd(struct ublk_thread *t, + const struct io_uring_cqe *cqe, + unsigned op) +{ + unsigned short buf_idx = user_data_to_tag(cqe->user_data); + + if (op == _IOC_NR(UBLK_U_IO_PREP_IO_CMDS)) + ublk_assert(cqe->res == 0); + else if (op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) + ;//assert(cqe->res == t->commit_buf_size); + else + ublk_assert(0); + + ublk_free_commit_buf(t, buf_idx); +} + +void ublk_batch_compl_cmd(struct ublk_thread *t, + const struct io_uring_cqe *cqe) +{ + unsigned op = user_data_to_op(cqe->user_data); + + if (op == _IOC_NR(UBLK_U_IO_PREP_IO_CMDS) || + op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) { + t->cmd_inflight--; + ublk_batch_compl_commit_cmd(t, cqe, op); + return; + } +} diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 3864f42e6c29..dba912a44eb3 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -840,6 +840,8 @@ static void ublk_handle_uring_cmd(struct ublk_thread *t, unsigned tag = user_data_to_tag(cqe->user_data); struct ublk_io *io = &q->ios[tag]; + t->cmd_inflight--; + if (!fetch) { t->state |= UBLKS_T_STOPPING; io->flags &= ~UBLKS_IO_NEED_FETCH_RQ; @@ -874,28 +876,30 @@ static void ublk_handle_cqe(struct ublk_thread *t, { struct ublk_dev *dev = t->dev; unsigned q_id = user_data_to_q_id(cqe->user_data); - struct ublk_queue *q = &dev->q[q_id]; unsigned cmd_op = user_data_to_op(cqe->user_data); if (cqe->res < 0 && cqe->res != -ENODEV) - ublk_err("%s: res %d userdata %llx queue state %x\n", __func__, - cqe->res, cqe->user_data, q->flags); + ublk_err("%s: res %d userdata %llx thread state %x\n", __func__, + cqe->res, cqe->user_data, t->state); - ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (qid %d tag %u cmd_op %u target %d/%d) stopping %d\n", - __func__, cqe->res, q->q_id, user_data_to_tag(cqe->user_data), - cmd_op, is_target_io(cqe->user_data), + ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (thread %d qid %d tag %u cmd_op %x " + "data %lx target %d/%d) stopping %d\n", + __func__, cqe->res, t->idx, q_id, + user_data_to_tag(cqe->user_data), + cmd_op, cqe->user_data, is_target_io(cqe->user_data), user_data_to_tgt_data(cqe->user_data), (t->state & UBLKS_T_STOPPING)); /* Don't retrieve io in case of target io */ if (is_target_io(cqe->user_data)) { - ublksrv_handle_tgt_cqe(t, q, cqe); + ublksrv_handle_tgt_cqe(t, &dev->q[q_id], cqe); return; } - t->cmd_inflight--; - - ublk_handle_uring_cmd(t, q, cqe); + if (ublk_thread_batch_io(t)) + ublk_batch_compl_cmd(t, cqe); + else + ublk_handle_uring_cmd(t, &dev->q[q_id], cqe); } static int ublk_reap_events_uring(struct ublk_thread *t) @@ -952,6 +956,22 @@ static void ublk_thread_set_sched_affinity(const struct ublk_thread_info *info) info->dev->dev_info.dev_id, info->idx); } +static void ublk_batch_setup_queues(struct ublk_thread *t) +{ + int i; + + /* setup all queues in the 1st thread */ + for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) { + struct ublk_queue *q = &t->dev->q[i]; + int ret; + + ret = ublk_batch_queue_prep_io_cmds(t, q); + ublk_assert(ret == 0); + ret = ublk_process_io(t); + ublk_assert(ret >= 0); + } +} + static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_info *info) { struct ublk_thread t = { @@ -972,8 +992,14 @@ static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_inf ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n", gettid(), dev_id, t.idx); - /* submit all io commands to ublk driver */ - ublk_submit_fetch_commands(&t); + if (!ublk_thread_batch_io(&t)) { + /* submit all io commands to ublk driver */ + ublk_submit_fetch_commands(&t); + } else if (!t.idx) { + /* prepare all io commands in the 1st thread context */ + ublk_batch_setup_queues(&t); + } + do { if (ublk_process_io(&t) < 0) break; diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 424c333596ac..08320d44c7c2 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -440,10 +440,16 @@ static inline void ublk_set_sqe_cmd_op(struct io_uring_sqe *sqe, __u32 cmd_op) addr[1] = 0; } +static inline unsigned short ublk_batch_io_buf_idx( + const struct ublk_thread *t, const struct ublk_queue *q, + unsigned tag); + static inline unsigned short ublk_io_buf_idx(const struct ublk_thread *t, const struct ublk_queue *q, unsigned tag) { + if (ublk_queue_batch_io(q)) + return ublk_batch_io_buf_idx(t, q, tag); return q->ios[tag].buf_index; } @@ -511,6 +517,22 @@ static inline int ublk_queue_no_buf(const struct ublk_queue *q) return ublk_queue_use_zc(q) || ublk_queue_use_auto_zc(q); } +/* + * Each IO's buffer index has to be calculated by this helper for + * UBLKS_T_BATCH_IO + */ +static inline unsigned short ublk_batch_io_buf_idx( + const struct ublk_thread *t, const struct ublk_queue *q, + unsigned tag) +{ + return tag; +} + +/* Queue UBLK_U_IO_PREP_IO_CMDS for a specific queue with batch elements */ +int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q); +/* Handle completion of batch I/O commands (prep/commit) */ +void ublk_batch_compl_cmd(struct ublk_thread *t, + const struct io_uring_cqe *cqe); /* Initialize batch I/O state and calculate buffer parameters */ void ublk_batch_prepare(struct ublk_thread *t); /* Allocate and register commit buffers for batch operations */ -- cgit v1.2.3 From dee7024ffecba291891503e425373d9f2a1d01b6 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:53 +0800 Subject: selftests: ublk: handle UBLK_U_IO_COMMIT_IO_CMDS Implement UBLK_U_IO_COMMIT_IO_CMDS to enable efficient batched completion of I/O operations in the batch I/O framework. This completes the batch I/O infrastructure by adding the commit phase that notifies the kernel about completed I/O operations: Key features: - Batch multiple I/O completions into single UBLK_U_IO_COMMIT_IO_CMDS - Dynamic commit buffer allocation and management per thread - Automatic commit buffer preparation before processing events - Commit buffer submission after processing completed I/Os - Integration with existing completion workflows Implementation details: - ublk_batch_prep_commit() allocates and initializes commit buffers - ublk_batch_complete_io() adds completed I/Os to current batch - ublk_batch_commit_io_cmds() submits batched completions to kernel - Modified ublk_process_io() to handle batch commit lifecycle - Enhanced ublk_complete_io() to route to batch or legacy completion The commit buffer stores completion information (tag, result, buffer details) for multiple I/Os, then submits them all at once, significantly reducing syscall overhead compared to individual I/O completions. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/batch.c | 74 ++++++++++++++++++++++++++++++++++-- tools/testing/selftests/ublk/kublk.c | 8 +++- tools/testing/selftests/ublk/kublk.h | 69 +++++++++++++++++++++------------ 3 files changed, 122 insertions(+), 29 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/batch.c b/tools/testing/selftests/ublk/batch.c index 079cae77add1..9c4db7335d44 100644 --- a/tools/testing/selftests/ublk/batch.c +++ b/tools/testing/selftests/ublk/batch.c @@ -174,7 +174,7 @@ static void ublk_init_batch_cmd(struct ublk_thread *t, __u16 q_id, cmd->elem_bytes = elem_bytes; cmd->nr_elem = nr_elem; - user_data = build_user_data(buf_idx, _IOC_NR(op), 0, q_id, 0); + user_data = build_user_data(buf_idx, _IOC_NR(op), nr_elem, q_id, 0); io_uring_sqe_set_data64(sqe, user_data); t->cmd_inflight += 1; @@ -244,9 +244,11 @@ static void ublk_batch_compl_commit_cmd(struct ublk_thread *t, if (op == _IOC_NR(UBLK_U_IO_PREP_IO_CMDS)) ublk_assert(cqe->res == 0); - else if (op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) - ;//assert(cqe->res == t->commit_buf_size); - else + else if (op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) { + int nr_elem = user_data_to_tgt_data(cqe->user_data); + + ublk_assert(cqe->res == t->commit_buf_elem_size * nr_elem); + } else ublk_assert(0); ublk_free_commit_buf(t, buf_idx); @@ -264,3 +266,67 @@ void ublk_batch_compl_cmd(struct ublk_thread *t, return; } } + +void ublk_batch_commit_io_cmds(struct ublk_thread *t) +{ + struct io_uring_sqe *sqe; + unsigned short buf_idx; + unsigned short nr_elem = t->commit.done; + + /* nothing to commit */ + if (!nr_elem) { + ublk_free_commit_buf(t, t->commit.buf_idx); + return; + } + + ublk_io_alloc_sqes(t, &sqe, 1); + buf_idx = t->commit.buf_idx; + sqe->addr = (__u64)t->commit.elem; + sqe->len = nr_elem * t->commit_buf_elem_size; + + /* commit isn't per-queue command */ + ublk_init_batch_cmd(t, t->commit.q_id, sqe, UBLK_U_IO_COMMIT_IO_CMDS, + t->commit_buf_elem_size, nr_elem, buf_idx); + ublk_setup_commit_sqe(t, sqe, buf_idx); +} + +static void ublk_batch_init_commit(struct ublk_thread *t, + unsigned short buf_idx) +{ + /* so far only support 1:1 queue/thread mapping */ + t->commit.q_id = t->idx; + t->commit.buf_idx = buf_idx; + t->commit.elem = ublk_get_commit_buf(t, buf_idx); + t->commit.done = 0; + t->commit.count = t->commit_buf_size / + t->commit_buf_elem_size; +} + +void ublk_batch_prep_commit(struct ublk_thread *t) +{ + unsigned short buf_idx = ublk_alloc_commit_buf(t); + + ublk_assert(buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX); + ublk_batch_init_commit(t, buf_idx); +} + +void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int res) +{ + struct batch_commit_buf *cb = &t->commit; + struct ublk_batch_elem *elem = (struct ublk_batch_elem *)(cb->elem + + cb->done * t->commit_buf_elem_size); + struct ublk_io *io = &q->ios[tag]; + + ublk_assert(q->q_id == t->commit.q_id); + + elem->tag = tag; + elem->buf_index = ublk_batch_io_buf_idx(t, q, tag); + elem->result = res; + + if (!ublk_queue_no_buf(q)) + elem->buf_addr = (__u64) (uintptr_t) io->buf_addr; + + cb->done += 1; + ublk_assert(cb->done <= cb->count); +} diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index dba912a44eb3..bf217d30c15f 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -931,7 +931,13 @@ static int ublk_process_io(struct ublk_thread *t) return -ENODEV; ret = io_uring_submit_and_wait(&t->ring, 1); - reapped = ublk_reap_events_uring(t); + if (ublk_thread_batch_io(t)) { + ublk_batch_prep_commit(t); + reapped = ublk_reap_events_uring(t); + ublk_batch_commit_io_cmds(t); + } else { + reapped = ublk_reap_events_uring(t); + } ublk_dbg(UBLK_DBG_THREAD, "submit result %d, reapped %d stop %d idle %d\n", ret, reapped, (t->state & UBLKS_T_STOPPING), diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 08320d44c7c2..5b05f6d7d808 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -190,6 +190,14 @@ struct ublk_batch_elem { __u64 buf_addr; }; +struct batch_commit_buf { + unsigned short q_id; + unsigned short buf_idx; + void *elem; + unsigned short done; + unsigned short count; +}; + struct ublk_thread { struct ublk_dev *dev; unsigned idx; @@ -215,6 +223,7 @@ struct ublk_thread { void *commit_buf; #define UBLKS_T_COMMIT_BUF_INV_IDX ((unsigned short)-1) struct allocator commit_buf_alloc; + struct batch_commit_buf commit; struct io_uring ring; }; @@ -458,30 +467,6 @@ static inline struct ublk_io *ublk_get_io(struct ublk_queue *q, unsigned tag) return &q->ios[tag]; } -static inline int ublk_complete_io(struct ublk_thread *t, struct ublk_queue *q, - unsigned tag, int res) -{ - struct ublk_io *io = &q->ios[tag]; - - ublk_mark_io_done(io, res); - - return ublk_queue_io_cmd(t, io); -} - -static inline void ublk_queued_tgt_io(struct ublk_thread *t, struct ublk_queue *q, - unsigned tag, int queued) -{ - if (queued < 0) - ublk_complete_io(t, q, tag, queued); - else { - struct ublk_io *io = ublk_get_io(q, tag); - - t->io_inflight += queued; - io->tgt_ios = queued; - io->result = 0; - } -} - static inline int ublk_completed_tgt_io(struct ublk_thread *t, struct ublk_queue *q, unsigned tag) { @@ -540,6 +525,42 @@ int ublk_batch_alloc_buf(struct ublk_thread *t); /* Free commit buffers and cleanup batch allocator */ void ublk_batch_free_buf(struct ublk_thread *t); +/* Prepare a new commit buffer for batching completed I/O operations */ +void ublk_batch_prep_commit(struct ublk_thread *t); +/* Submit UBLK_U_IO_COMMIT_IO_CMDS with batched completed I/O operations */ +void ublk_batch_commit_io_cmds(struct ublk_thread *t); +/* Add a completed I/O operation to the current batch commit buffer */ +void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int res); + +static inline int ublk_complete_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int res) +{ + if (ublk_queue_batch_io(q)) { + ublk_batch_complete_io(t, q, tag, res); + return 0; + } else { + struct ublk_io *io = &q->ios[tag]; + + ublk_mark_io_done(io, res); + return ublk_queue_io_cmd(t, io); + } +} + +static inline void ublk_queued_tgt_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int queued) +{ + if (queued < 0) + ublk_complete_io(t, q, tag, queued); + else { + struct ublk_io *io = ublk_get_io(q, tag); + + t->io_inflight += queued; + io->tgt_ios = queued; + io->result = 0; + } +} + extern const struct ublk_tgt_ops null_tgt_ops; extern const struct ublk_tgt_ops loop_tgt_ops; extern const struct ublk_tgt_ops stripe_tgt_ops; -- cgit v1.2.3 From cb5a6b308700c65c29baccbb6b9b07f306633ad5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:54 +0800 Subject: selftests: ublk: handle UBLK_U_IO_FETCH_IO_CMDS Add support for UBLK_U_IO_FETCH_IO_CMDS to enable efficient batch fetching of I/O commands using multishot io_uring operations. Key improvements: - Implement multishot UBLK_U_IO_FETCH_IO_CMDS for continuous command fetching - Add fetch buffer management with page-aligned, mlocked buffers - Process fetched I/O command tags from kernel-provided buffers - Integrate fetch operations with existing batch I/O infrastructure - Significantly reduce uring_cmd issuing overhead through batching The implementation uses two fetch buffers per thread with automatic requeuing to maintain continuous I/O command flow. Each fetch operation retrieves multiple command tags in a single syscall, dramatically improving performance compared to individual command fetching. Technical details: - Fetch buffers are page-aligned and mlocked for optimal performance - Uses IORING_URING_CMD_MULTISHOT for continuous operation - Automatic buffer management and requeuing on completion - Enhanced CQE handling for fetch command completions Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/batch.c | 136 ++++++++++++++++++++++++++++++++++- tools/testing/selftests/ublk/kublk.c | 14 +++- tools/testing/selftests/ublk/kublk.h | 13 ++++ 3 files changed, 159 insertions(+), 4 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/batch.c b/tools/testing/selftests/ublk/batch.c index 9c4db7335d44..5f9587210b12 100644 --- a/tools/testing/selftests/ublk/batch.c +++ b/tools/testing/selftests/ublk/batch.c @@ -140,15 +140,63 @@ void ublk_batch_prepare(struct ublk_thread *t) t->nr_bufs); } +static void free_batch_fetch_buf(struct ublk_thread *t) +{ + int i; + + for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++) { + io_uring_free_buf_ring(&t->ring, t->fetch[i].br, 1, i); + munlock(t->fetch[i].fetch_buf, t->fetch[i].fetch_buf_size); + free(t->fetch[i].fetch_buf); + } +} + +static int alloc_batch_fetch_buf(struct ublk_thread *t) +{ + /* page aligned fetch buffer, and it is mlocked for speedup delivery */ + unsigned pg_sz = getpagesize(); + unsigned buf_size = round_up(t->dev->dev_info.queue_depth * 2, pg_sz); + int ret; + int i = 0; + + for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++) { + t->fetch[i].fetch_buf_size = buf_size; + + if (posix_memalign((void **)&t->fetch[i].fetch_buf, pg_sz, + t->fetch[i].fetch_buf_size)) + return -ENOMEM; + + /* lock fetch buffer page for fast fetching */ + if (mlock(t->fetch[i].fetch_buf, t->fetch[i].fetch_buf_size)) + ublk_err("%s: can't lock fetch buffer %s\n", __func__, + strerror(errno)); + t->fetch[i].br = io_uring_setup_buf_ring(&t->ring, 1, + i, IOU_PBUF_RING_INC, &ret); + if (!t->fetch[i].br) { + ublk_err("Buffer ring register failed %d\n", ret); + return ret; + } + } + + return 0; +} + int ublk_batch_alloc_buf(struct ublk_thread *t) { + int ret; + ublk_assert(t->nr_commit_buf < 16); - return alloc_batch_commit_buf(t); + + ret = alloc_batch_commit_buf(t); + if (ret) + return ret; + return alloc_batch_fetch_buf(t); } void ublk_batch_free_buf(struct ublk_thread *t) { free_batch_commit_buf(t); + free_batch_fetch_buf(t); } static void ublk_init_batch_cmd(struct ublk_thread *t, __u16 q_id, @@ -199,6 +247,76 @@ static void ublk_setup_commit_sqe(struct ublk_thread *t, cmd->flags |= t->cmd_flags; } +static void ublk_batch_queue_fetch(struct ublk_thread *t, + struct ublk_queue *q, + unsigned short buf_idx) +{ + unsigned short nr_elem = t->fetch[buf_idx].fetch_buf_size / 2; + struct io_uring_sqe *sqe; + + io_uring_buf_ring_add(t->fetch[buf_idx].br, t->fetch[buf_idx].fetch_buf, + t->fetch[buf_idx].fetch_buf_size, + 0, 0, 0); + io_uring_buf_ring_advance(t->fetch[buf_idx].br, 1); + + ublk_io_alloc_sqes(t, &sqe, 1); + + ublk_init_batch_cmd(t, q->q_id, sqe, UBLK_U_IO_FETCH_IO_CMDS, 2, nr_elem, + buf_idx); + + sqe->rw_flags= IORING_URING_CMD_MULTISHOT; + sqe->buf_group = buf_idx; + sqe->flags |= IOSQE_BUFFER_SELECT; + + t->fetch[buf_idx].fetch_buf_off = 0; +} + +void ublk_batch_start_fetch(struct ublk_thread *t, + struct ublk_queue *q) +{ + int i; + + for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++) + ublk_batch_queue_fetch(t, q, i); +} + +static unsigned short ublk_compl_batch_fetch(struct ublk_thread *t, + struct ublk_queue *q, + const struct io_uring_cqe *cqe) +{ + unsigned short buf_idx = user_data_to_tag(cqe->user_data); + unsigned start = t->fetch[buf_idx].fetch_buf_off; + unsigned end = start + cqe->res; + void *buf = t->fetch[buf_idx].fetch_buf; + int i; + + if (cqe->res < 0) + return buf_idx; + + if ((end - start) / 2 > q->q_depth) { + ublk_err("%s: fetch duplicated ios offset %u count %u\n", __func__, start, cqe->res); + + for (i = start; i < end; i += 2) { + unsigned short tag = *(unsigned short *)(buf + i); + + ublk_err("%u ", tag); + } + ublk_err("\n"); + } + + for (i = start; i < end; i += 2) { + unsigned short tag = *(unsigned short *)(buf + i); + + if (tag >= q->q_depth) + ublk_err("%s: bad tag %u\n", __func__, tag); + + if (q->tgt_ops->queue_io) + q->tgt_ops->queue_io(t, q, tag); + } + t->fetch[buf_idx].fetch_buf_off = end; + return buf_idx; +} + int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) { unsigned short nr_elem = q->q_depth; @@ -258,6 +376,9 @@ void ublk_batch_compl_cmd(struct ublk_thread *t, const struct io_uring_cqe *cqe) { unsigned op = user_data_to_op(cqe->user_data); + struct ublk_queue *q; + unsigned buf_idx; + unsigned q_id; if (op == _IOC_NR(UBLK_U_IO_PREP_IO_CMDS) || op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) { @@ -265,6 +386,19 @@ void ublk_batch_compl_cmd(struct ublk_thread *t, ublk_batch_compl_commit_cmd(t, cqe, op); return; } + + /* FETCH command is per queue */ + q_id = user_data_to_q_id(cqe->user_data); + q = &t->dev->q[q_id]; + buf_idx = ublk_compl_batch_fetch(t, q, cqe); + + if (cqe->res < 0 && cqe->res != -ENOBUFS) { + t->cmd_inflight--; + t->state |= UBLKS_T_STOPPING; + } else if (!(cqe->flags & IORING_CQE_F_MORE) || cqe->res == -ENOBUFS) { + t->cmd_inflight--; + ublk_batch_queue_fetch(t, q, buf_idx); + } } void ublk_batch_commit_io_cmds(struct ublk_thread *t) diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index bf217d30c15f..c77205bac7a9 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -519,6 +519,10 @@ static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flag int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth; int ret; + /* FETCH_IO_CMDS is multishot, so increase cq depth for BATCH_IO */ + if (ublk_dev_batch_io(dev)) + cq_depth += dev->dev_info.queue_depth; + ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth, IORING_SETUP_COOP_TASKRUN | IORING_SETUP_SINGLE_ISSUER | @@ -878,7 +882,7 @@ static void ublk_handle_cqe(struct ublk_thread *t, unsigned q_id = user_data_to_q_id(cqe->user_data); unsigned cmd_op = user_data_to_op(cqe->user_data); - if (cqe->res < 0 && cqe->res != -ENODEV) + if (cqe->res < 0 && cqe->res != -ENODEV && cqe->res != -ENOBUFS) ublk_err("%s: res %d userdata %llx thread state %x\n", __func__, cqe->res, cqe->user_data, t->state); @@ -1001,9 +1005,13 @@ static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_inf if (!ublk_thread_batch_io(&t)) { /* submit all io commands to ublk driver */ ublk_submit_fetch_commands(&t); - } else if (!t.idx) { + } else { + struct ublk_queue *q = &t.dev->q[t.idx]; + /* prepare all io commands in the 1st thread context */ - ublk_batch_setup_queues(&t); + if (!t.idx) + ublk_batch_setup_queues(&t); + ublk_batch_start_fetch(&t, q); } do { diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 5b05f6d7d808..950e99c02e8b 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -198,6 +198,13 @@ struct batch_commit_buf { unsigned short count; }; +struct batch_fetch_buf { + struct io_uring_buf_ring *br; + void *fetch_buf; + unsigned int fetch_buf_size; + unsigned int fetch_buf_off; +}; + struct ublk_thread { struct ublk_dev *dev; unsigned idx; @@ -224,6 +231,9 @@ struct ublk_thread { #define UBLKS_T_COMMIT_BUF_INV_IDX ((unsigned short)-1) struct allocator commit_buf_alloc; struct batch_commit_buf commit; + /* FETCH_IO_CMDS buffer */ +#define UBLKS_T_NR_FETCH_BUF 2 + struct batch_fetch_buf fetch[UBLKS_T_NR_FETCH_BUF]; struct io_uring ring; }; @@ -515,6 +525,9 @@ static inline unsigned short ublk_batch_io_buf_idx( /* Queue UBLK_U_IO_PREP_IO_CMDS for a specific queue with batch elements */ int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q); +/* Start fetching I/O commands using multishot UBLK_U_IO_FETCH_IO_CMDS */ +void ublk_batch_start_fetch(struct ublk_thread *t, + struct ublk_queue *q); /* Handle completion of batch I/O commands (prep/commit) */ void ublk_batch_compl_cmd(struct ublk_thread *t, const struct io_uring_cqe *cqe); -- cgit v1.2.3 From 4968fb7cc60676040258c8867f22931c8735126f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:55 +0800 Subject: selftests: ublk: increase timeout to 150 seconds More tests need to be covered in existing generic tests, and default 45sec isn't enough, and timeout is often triggered, increase timeout by adding setting file. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 2 ++ tools/testing/selftests/ublk/settings | 1 + 2 files changed, 3 insertions(+) create mode 100644 tools/testing/selftests/ublk/settings (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 3a2498089b15..f2da8b403537 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -52,6 +52,8 @@ TEST_PROGS += test_stress_05.sh TEST_PROGS += test_stress_06.sh TEST_PROGS += test_stress_07.sh +TEST_FILES := settings + TEST_GEN_PROGS_EXTENDED = kublk metadata_size STANDALONE_UTILS := metadata_size.c diff --git a/tools/testing/selftests/ublk/settings b/tools/testing/selftests/ublk/settings new file mode 100644 index 000000000000..682a40f1c8e6 --- /dev/null +++ b/tools/testing/selftests/ublk/settings @@ -0,0 +1 @@ +timeout=150 -- cgit v1.2.3 From 20aeab0b08a175d9ceb4ad327f55ba5c29a79888 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:56 +0800 Subject: selftests: ublk: add --batch/-b for enabling F_BATCH_IO Add --batch/-b for enabling F_BATCH_IO. Add batch_01 for covering its basic function. Add stress_08 and stress_09 for covering stress test. Add recovery test for F_BATCH_IO in generic_04 and generic_05. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 4 +++ tools/testing/selftests/ublk/kublk.c | 15 +++++++-- tools/testing/selftests/ublk/test_batch_01.sh | 32 ++++++++++++++++++ tools/testing/selftests/ublk/test_generic_04.sh | 5 +++ tools/testing/selftests/ublk/test_generic_05.sh | 5 +++ tools/testing/selftests/ublk/test_stress_08.sh | 45 +++++++++++++++++++++++++ tools/testing/selftests/ublk/test_stress_09.sh | 44 ++++++++++++++++++++++++ 7 files changed, 148 insertions(+), 2 deletions(-) create mode 100755 tools/testing/selftests/ublk/test_batch_01.sh create mode 100755 tools/testing/selftests/ublk/test_stress_08.sh create mode 100755 tools/testing/selftests/ublk/test_stress_09.sh (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index f2da8b403537..520e18e224f2 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -25,6 +25,8 @@ TEST_PROGS += test_generic_14.sh TEST_PROGS += test_generic_15.sh TEST_PROGS += test_generic_16.sh +TEST_PROGS += test_batch_01.sh + TEST_PROGS += test_null_01.sh TEST_PROGS += test_null_02.sh TEST_PROGS += test_null_03.sh @@ -51,6 +53,8 @@ TEST_PROGS += test_stress_04.sh TEST_PROGS += test_stress_05.sh TEST_PROGS += test_stress_06.sh TEST_PROGS += test_stress_07.sh +TEST_PROGS += test_stress_08.sh +TEST_PROGS += test_stress_09.sh TEST_FILES := settings diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index c77205bac7a9..5d84000872a0 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -1593,7 +1593,8 @@ static int cmd_dev_get_features(void) FEAT_NAME(UBLK_F_PER_IO_DAEMON), FEAT_NAME(UBLK_F_BUF_REG_OFF_DAEMON), FEAT_NAME(UBLK_F_INTEGRITY), - FEAT_NAME(UBLK_F_SAFE_STOP_DEV) + FEAT_NAME(UBLK_F_SAFE_STOP_DEV), + FEAT_NAME(UBLK_F_BATCH_IO), }; struct ublk_dev *dev; __u64 features = 0; @@ -1691,6 +1692,7 @@ static void __cmd_create_help(char *exe, bool recovery) printf("\t[--nthreads threads] [--per_io_tasks]\n"); printf("\t[--integrity_capable] [--integrity_reftag] [--metadata_size SIZE] " "[--pi_offset OFFSET] [--csum_type ip|t10dif|nvme] [--tag_size SIZE]\n"); + printf("\t[--batch|-b]\n"); printf("\t[target options] [backfile1] [backfile2] ...\n"); printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n"); printf("\tdefault: nthreads=nr_queues"); @@ -1763,6 +1765,7 @@ int main(int argc, char *argv[]) { "csum_type", 1, NULL, 0 }, { "tag_size", 1, NULL, 0 }, { "safe", 0, NULL, 0 }, + { "batch", 0, NULL, 'b'}, { 0, 0, 0, 0 } }; const struct ublk_tgt_ops *ops = NULL; @@ -1785,12 +1788,15 @@ int main(int argc, char *argv[]) opterr = 0; optind = 2; - while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gazu", + while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gazub", longopts, &option_idx)) != -1) { switch (opt) { case 'a': ctx.all = 1; break; + case 'b': + ctx.flags |= UBLK_F_BATCH_IO; + break; case 'n': ctx.dev_id = strtol(optarg, NULL, 10); break; @@ -1895,6 +1901,11 @@ int main(int argc, char *argv[]) } } + if (ctx.per_io_tasks && (ctx.flags & UBLK_F_BATCH_IO)) { + ublk_err("per_io_task and F_BATCH_IO conflict\n"); + return -EINVAL; + } + /* auto_zc_fallback depends on F_AUTO_BUF_REG & F_SUPPORT_ZERO_COPY */ if (ctx.auto_zc_fallback && !((ctx.flags & UBLK_F_AUTO_BUF_REG) && diff --git a/tools/testing/selftests/ublk/test_batch_01.sh b/tools/testing/selftests/ublk/test_batch_01.sh new file mode 100755 index 000000000000..9fa9fff5c62f --- /dev/null +++ b/tools/testing/selftests/ublk/test_batch_01.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="batch_01" +ERR_CODE=0 + +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "generic" "test basic function of UBLK_F_BATCH_IO" + +_create_backfile 0 256M +_create_backfile 1 256M + +dev_id=$(_add_ublk_dev -t loop -q 2 -b "${UBLK_BACKFILES[0]}") +_check_add_dev $TID $? + +if ! _mkfs_mount_test /dev/ublkb"${dev_id}"; then + _cleanup_test "generic" + _show_result $TID 255 +fi + +dev_id=$(_add_ublk_dev -t stripe -b --auto_zc "${UBLK_BACKFILES[0]}" "${UBLK_BACKFILES[1]}") +_check_add_dev $TID $? +_mkfs_mount_test /dev/ublkb"${dev_id}" +ERR_CODE=$? + +_cleanup_test "generic" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_generic_04.sh b/tools/testing/selftests/ublk/test_generic_04.sh index baf5b156193d..be2292822bbe 100755 --- a/tools/testing/selftests/ublk/test_generic_04.sh +++ b/tools/testing/selftests/ublk/test_generic_04.sh @@ -26,6 +26,11 @@ _create_backfile 0 256M _create_backfile 1 128M _create_backfile 2 128M +ublk_run_recover_test -t null -q 2 -r 1 -b & +ublk_run_recover_test -t loop -q 2 -r 1 -b "${UBLK_BACKFILES[0]}" & +ublk_run_recover_test -t stripe -q 2 -r 1 -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + ublk_run_recover_test -t null -q 2 -r 1 & ublk_run_recover_test -t loop -q 2 -r 1 "${UBLK_BACKFILES[0]}" & ublk_run_recover_test -t stripe -q 2 -r 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & diff --git a/tools/testing/selftests/ublk/test_generic_05.sh b/tools/testing/selftests/ublk/test_generic_05.sh index 7b5083afc02a..9b7f71c16d82 100755 --- a/tools/testing/selftests/ublk/test_generic_05.sh +++ b/tools/testing/selftests/ublk/test_generic_05.sh @@ -30,6 +30,11 @@ _create_backfile 0 256M _create_backfile 1 128M _create_backfile 2 128M +ublk_run_recover_test -t null -q 2 -r 1 -z -b & +ublk_run_recover_test -t loop -q 2 -r 1 -z -b "${UBLK_BACKFILES[0]}" & +ublk_run_recover_test -t stripe -q 2 -r 1 -z -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + ublk_run_recover_test -t null -q 2 -r 1 -z & ublk_run_recover_test -t loop -q 2 -r 1 -z "${UBLK_BACKFILES[0]}" & ublk_run_recover_test -t stripe -q 2 -r 1 -z "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & diff --git a/tools/testing/selftests/ublk/test_stress_08.sh b/tools/testing/selftests/ublk/test_stress_08.sh new file mode 100755 index 000000000000..190db0b4f2ad --- /dev/null +++ b/tools/testing/selftests/ublk/test_stress_08.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh +TID="stress_06" +ERR_CODE=0 + +ublk_io_and_remove() +{ + run_io_and_remove "$@" + ERR_CODE=$? + if [ ${ERR_CODE} -ne 0 ]; then + echo "$TID failure: $*" + _show_result $TID $ERR_CODE + fi +} + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +if ! _have_feature "ZERO_COPY"; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "AUTO_BUF_REG"; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "stress" "run IO and remove device(zero copy)" + +_create_backfile 0 256M +_create_backfile 1 128M +_create_backfile 2 128M + +ublk_io_and_remove 8G -t null -q 4 -b & +ublk_io_and_remove 256M -t loop -q 4 --auto_zc -b "${UBLK_BACKFILES[0]}" & +ublk_io_and_remove 256M -t stripe -q 4 --auto_zc -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +ublk_io_and_remove 8G -t null -q 4 -z --auto_zc --auto_zc_fallback -b & +wait + +_cleanup_test "stress" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_stress_09.sh b/tools/testing/selftests/ublk/test_stress_09.sh new file mode 100755 index 000000000000..1b6bdb31da03 --- /dev/null +++ b/tools/testing/selftests/ublk/test_stress_09.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh +TID="stress_07" +ERR_CODE=0 + +ublk_io_and_kill_daemon() +{ + run_io_and_kill_daemon "$@" + ERR_CODE=$? + if [ ${ERR_CODE} -ne 0 ]; then + echo "$TID failure: $*" + _show_result $TID $ERR_CODE + fi +} + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "ZERO_COPY"; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "AUTO_BUF_REG"; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "stress" "run IO and kill ublk server(zero copy)" + +_create_backfile 0 256M +_create_backfile 1 128M +_create_backfile 2 128M + +ublk_io_and_kill_daemon 8G -t null -q 4 -z -b & +ublk_io_and_kill_daemon 256M -t loop -q 4 --auto_zc -b "${UBLK_BACKFILES[0]}" & +ublk_io_and_kill_daemon 256M -t stripe -q 4 -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +ublk_io_and_kill_daemon 8G -t null -q 4 -z --auto_zc --auto_zc_fallback -b & +wait + +_cleanup_test "stress" +_show_result $TID $ERR_CODE -- cgit v1.2.3 From e8cd481cc665d5db8e918e84740db22bc213059e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:57 +0800 Subject: selftests: ublk: support arbitrary threads/queues combination Enable flexible thread-to-queue mapping in batch I/O mode to support arbitrary combinations of threads and queues, improving resource utilization and scalability. Key improvements: - Support N:M thread-to-queue mapping (previously limited to 1:1) - Dynamic buffer allocation based on actual queue assignment per thread - Thread-safe queue preparation with spinlock protection - Intelligent buffer index calculation for multi-queue scenarios - Enhanced validation for thread/queue combination constraints Implementation details: - Add q_thread_map matrix to track queue-to-thread assignments - Dynamic allocation of commit and fetch buffers per thread - Round-robin queue assignment algorithm for load balancing - Per-queue spinlock to prevent race conditions during prep - Updated buffer index calculation using queue position within thread This enables efficient configurations like: - Any other N:M combinations for optimal resource matching Testing: - Added test_batch_02.sh: 4 threads vs 1 queue - Added test_batch_03.sh: 1 thread vs 4 queues - Validates correctness across different mapping scenarios Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 2 + tools/testing/selftests/ublk/batch.c | 199 ++++++++++++++++++++++---- tools/testing/selftests/ublk/kublk.c | 49 +++++-- tools/testing/selftests/ublk/kublk.h | 40 ++++-- tools/testing/selftests/ublk/test_batch_02.sh | 30 ++++ tools/testing/selftests/ublk/test_batch_03.sh | 30 ++++ 6 files changed, 302 insertions(+), 48 deletions(-) create mode 100755 tools/testing/selftests/ublk/test_batch_02.sh create mode 100755 tools/testing/selftests/ublk/test_batch_03.sh (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 520e18e224f2..e39a6f871fcc 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -26,6 +26,8 @@ TEST_PROGS += test_generic_15.sh TEST_PROGS += test_generic_16.sh TEST_PROGS += test_batch_01.sh +TEST_PROGS += test_batch_02.sh +TEST_PROGS += test_batch_03.sh TEST_PROGS += test_null_01.sh TEST_PROGS += test_null_02.sh diff --git a/tools/testing/selftests/ublk/batch.c b/tools/testing/selftests/ublk/batch.c index 5f9587210b12..a54025b00917 100644 --- a/tools/testing/selftests/ublk/batch.c +++ b/tools/testing/selftests/ublk/batch.c @@ -76,6 +76,7 @@ static void free_batch_commit_buf(struct ublk_thread *t) free(t->commit_buf); } allocator_deinit(&t->commit_buf_alloc); + free(t->commit); } static int alloc_batch_commit_buf(struct ublk_thread *t) @@ -84,7 +85,13 @@ static int alloc_batch_commit_buf(struct ublk_thread *t) unsigned int total = buf_size * t->nr_commit_buf; unsigned int page_sz = getpagesize(); void *buf = NULL; - int ret; + int i, ret, j = 0; + + t->commit = calloc(t->nr_queues, sizeof(*t->commit)); + for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) { + if (t->q_map[i]) + t->commit[j++].q_id = i; + } allocator_init(&t->commit_buf_alloc, t->nr_commit_buf); @@ -107,6 +114,17 @@ fail: return ret; } +static unsigned int ublk_thread_nr_queues(const struct ublk_thread *t) +{ + int i; + int ret = 0; + + for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) + ret += !!t->q_map[i]; + + return ret; +} + void ublk_batch_prepare(struct ublk_thread *t) { /* @@ -119,10 +137,13 @@ void ublk_batch_prepare(struct ublk_thread *t) */ struct ublk_queue *q = &t->dev->q[0]; + /* cache nr_queues because we don't support dynamic load-balance yet */ + t->nr_queues = ublk_thread_nr_queues(t); + t->commit_buf_elem_size = ublk_commit_elem_buf_size(t->dev); t->commit_buf_size = ublk_commit_buf_size(t); t->commit_buf_start = t->nr_bufs; - t->nr_commit_buf = 2; + t->nr_commit_buf = 2 * t->nr_queues; t->nr_bufs += t->nr_commit_buf; t->cmd_flags = 0; @@ -144,11 +165,12 @@ static void free_batch_fetch_buf(struct ublk_thread *t) { int i; - for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++) { + for (i = 0; i < t->nr_fetch_bufs; i++) { io_uring_free_buf_ring(&t->ring, t->fetch[i].br, 1, i); munlock(t->fetch[i].fetch_buf, t->fetch[i].fetch_buf_size); free(t->fetch[i].fetch_buf); } + free(t->fetch); } static int alloc_batch_fetch_buf(struct ublk_thread *t) @@ -159,7 +181,12 @@ static int alloc_batch_fetch_buf(struct ublk_thread *t) int ret; int i = 0; - for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++) { + /* double fetch buffer for each queue */ + t->nr_fetch_bufs = t->nr_queues * 2; + t->fetch = calloc(t->nr_fetch_bufs, sizeof(*t->fetch)); + + /* allocate one buffer for each queue */ + for (i = 0; i < t->nr_fetch_bufs; i++) { t->fetch[i].fetch_buf_size = buf_size; if (posix_memalign((void **)&t->fetch[i].fetch_buf, pg_sz, @@ -185,7 +212,7 @@ int ublk_batch_alloc_buf(struct ublk_thread *t) { int ret; - ublk_assert(t->nr_commit_buf < 16); + ublk_assert(t->nr_commit_buf < 2 * UBLK_MAX_QUEUES); ret = alloc_batch_commit_buf(t); if (ret) @@ -271,13 +298,20 @@ static void ublk_batch_queue_fetch(struct ublk_thread *t, t->fetch[buf_idx].fetch_buf_off = 0; } -void ublk_batch_start_fetch(struct ublk_thread *t, - struct ublk_queue *q) +void ublk_batch_start_fetch(struct ublk_thread *t) { int i; + int j = 0; + + for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) { + if (t->q_map[i]) { + struct ublk_queue *q = &t->dev->q[i]; - for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++) - ublk_batch_queue_fetch(t, q, i); + /* submit two fetch commands for each queue */ + ublk_batch_queue_fetch(t, q, j++); + ublk_batch_queue_fetch(t, q, j++); + } + } } static unsigned short ublk_compl_batch_fetch(struct ublk_thread *t, @@ -317,7 +351,7 @@ static unsigned short ublk_compl_batch_fetch(struct ublk_thread *t, return buf_idx; } -int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) +static int __ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) { unsigned short nr_elem = q->q_depth; unsigned short buf_idx = ublk_alloc_commit_buf(t); @@ -354,6 +388,22 @@ int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) return 0; } +int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) +{ + int ret = 0; + + pthread_spin_lock(&q->lock); + if (q->flags & UBLKS_Q_PREPARED) + goto unlock; + ret = __ublk_batch_queue_prep_io_cmds(t, q); + if (!ret) + q->flags |= UBLKS_Q_PREPARED; +unlock: + pthread_spin_unlock(&q->lock); + + return ret; +} + static void ublk_batch_compl_commit_cmd(struct ublk_thread *t, const struct io_uring_cqe *cqe, unsigned op) @@ -401,59 +451,89 @@ void ublk_batch_compl_cmd(struct ublk_thread *t, } } -void ublk_batch_commit_io_cmds(struct ublk_thread *t) +static void __ublk_batch_commit_io_cmds(struct ublk_thread *t, + struct batch_commit_buf *cb) { struct io_uring_sqe *sqe; unsigned short buf_idx; - unsigned short nr_elem = t->commit.done; + unsigned short nr_elem = cb->done; /* nothing to commit */ if (!nr_elem) { - ublk_free_commit_buf(t, t->commit.buf_idx); + ublk_free_commit_buf(t, cb->buf_idx); return; } ublk_io_alloc_sqes(t, &sqe, 1); - buf_idx = t->commit.buf_idx; - sqe->addr = (__u64)t->commit.elem; + buf_idx = cb->buf_idx; + sqe->addr = (__u64)cb->elem; sqe->len = nr_elem * t->commit_buf_elem_size; /* commit isn't per-queue command */ - ublk_init_batch_cmd(t, t->commit.q_id, sqe, UBLK_U_IO_COMMIT_IO_CMDS, + ublk_init_batch_cmd(t, cb->q_id, sqe, UBLK_U_IO_COMMIT_IO_CMDS, t->commit_buf_elem_size, nr_elem, buf_idx); ublk_setup_commit_sqe(t, sqe, buf_idx); } -static void ublk_batch_init_commit(struct ublk_thread *t, - unsigned short buf_idx) +void ublk_batch_commit_io_cmds(struct ublk_thread *t) +{ + int i; + + for (i = 0; i < t->nr_queues; i++) { + struct batch_commit_buf *cb = &t->commit[i]; + + if (cb->buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX) + __ublk_batch_commit_io_cmds(t, cb); + } + +} + +static void __ublk_batch_init_commit(struct ublk_thread *t, + struct batch_commit_buf *cb, + unsigned short buf_idx) { /* so far only support 1:1 queue/thread mapping */ - t->commit.q_id = t->idx; - t->commit.buf_idx = buf_idx; - t->commit.elem = ublk_get_commit_buf(t, buf_idx); - t->commit.done = 0; - t->commit.count = t->commit_buf_size / + cb->buf_idx = buf_idx; + cb->elem = ublk_get_commit_buf(t, buf_idx); + cb->done = 0; + cb->count = t->commit_buf_size / t->commit_buf_elem_size; } -void ublk_batch_prep_commit(struct ublk_thread *t) +/* COMMIT_IO_CMDS is per-queue command, so use its own commit buffer */ +static void ublk_batch_init_commit(struct ublk_thread *t, + struct batch_commit_buf *cb) { unsigned short buf_idx = ublk_alloc_commit_buf(t); ublk_assert(buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX); - ublk_batch_init_commit(t, buf_idx); + ublk_assert(!ublk_batch_commit_prepared(cb)); + + __ublk_batch_init_commit(t, cb, buf_idx); +} + +void ublk_batch_prep_commit(struct ublk_thread *t) +{ + int i; + + for (i = 0; i < t->nr_queues; i++) + t->commit[i].buf_idx = UBLKS_T_COMMIT_BUF_INV_IDX; } void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q, unsigned tag, int res) { - struct batch_commit_buf *cb = &t->commit; - struct ublk_batch_elem *elem = (struct ublk_batch_elem *)(cb->elem + - cb->done * t->commit_buf_elem_size); + unsigned q_t_idx = ublk_queue_idx_in_thread(t, q); + struct batch_commit_buf *cb = &t->commit[q_t_idx]; + struct ublk_batch_elem *elem; struct ublk_io *io = &q->ios[tag]; - ublk_assert(q->q_id == t->commit.q_id); + if (!ublk_batch_commit_prepared(cb)) + ublk_batch_init_commit(t, cb); + + ublk_assert(q->q_id == cb->q_id); + elem = (struct ublk_batch_elem *)(cb->elem + cb->done * t->commit_buf_elem_size); elem->tag = tag; elem->buf_index = ublk_batch_io_buf_idx(t, q, tag); elem->result = res; @@ -464,3 +544,64 @@ void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q, cb->done += 1; ublk_assert(cb->done <= cb->count); } + +void ublk_batch_setup_map(unsigned char (*q_thread_map)[UBLK_MAX_QUEUES], + int nthreads, int queues) +{ + int i, j; + + /* + * Setup round-robin queue-to-thread mapping for arbitrary N:M combinations. + * + * This algorithm distributes queues across threads (and threads across queues) + * in a balanced round-robin fashion to ensure even load distribution. + * + * Examples: + * - 2 threads, 4 queues: T0=[Q0,Q2], T1=[Q1,Q3] + * - 4 threads, 2 queues: T0=[Q0], T1=[Q1], T2=[Q0], T3=[Q1] + * - 3 threads, 3 queues: T0=[Q0], T1=[Q1], T2=[Q2] (1:1 mapping) + * + * Phase 1: Mark which queues each thread handles (boolean mapping) + */ + for (i = 0, j = 0; i < queues || j < nthreads; i++, j++) { + q_thread_map[j % nthreads][i % queues] = 1; + } + + /* + * Phase 2: Convert boolean mapping to sequential indices within each thread. + * + * Transform from: q_thread_map[thread][queue] = 1 (handles queue) + * To: q_thread_map[thread][queue] = N (queue index within thread) + * + * This allows each thread to know the local index of each queue it handles, + * which is essential for buffer allocation and management. For example: + * - Thread 0 handling queues [0,2] becomes: q_thread_map[0][0]=1, q_thread_map[0][2]=2 + * - Thread 1 handling queues [1,3] becomes: q_thread_map[1][1]=1, q_thread_map[1][3]=2 + */ + for (j = 0; j < nthreads; j++) { + unsigned char seq = 1; + + for (i = 0; i < queues; i++) { + if (q_thread_map[j][i]) + q_thread_map[j][i] = seq++; + } + } + +#if 0 + for (j = 0; j < nthreads; j++) { + printf("thread %0d: ", j); + for (i = 0; i < queues; i++) { + if (q_thread_map[j][i]) + printf("%03u ", i); + } + printf("\n"); + } + printf("\n"); + for (j = 0; j < nthreads; j++) { + for (i = 0; i < queues; i++) { + printf("%03u ", q_thread_map[j][i]); + } + printf("\n"); + } +#endif +} diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 5d84000872a0..2da37557e1a9 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -455,6 +455,7 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags, int cmd_buf_size, io_buf_size, integrity_size; unsigned long off; + pthread_spin_init(&q->lock, PTHREAD_PROCESS_PRIVATE); q->tgt_ops = dev->tgt.ops; q->flags = 0; q->q_depth = depth; @@ -521,7 +522,7 @@ static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flag /* FETCH_IO_CMDS is multishot, so increase cq depth for BATCH_IO */ if (ublk_dev_batch_io(dev)) - cq_depth += dev->dev_info.queue_depth; + cq_depth += dev->dev_info.queue_depth * 2; ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth, IORING_SETUP_COOP_TASKRUN | @@ -957,6 +958,7 @@ struct ublk_thread_info { sem_t *ready; cpu_set_t *affinity; unsigned long long extra_flags; + unsigned char (*q_thread_map)[UBLK_MAX_QUEUES]; }; static void ublk_thread_set_sched_affinity(const struct ublk_thread_info *info) @@ -970,14 +972,18 @@ static void ublk_batch_setup_queues(struct ublk_thread *t) { int i; - /* setup all queues in the 1st thread */ for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) { struct ublk_queue *q = &t->dev->q[i]; int ret; + /* + * Only prepare io commands in the mapped thread context, + * otherwise io command buffer index may not work as expected + */ + if (t->q_map[i] == 0) + continue; + ret = ublk_batch_queue_prep_io_cmds(t, q); - ublk_assert(ret == 0); - ret = ublk_process_io(t); ublk_assert(ret >= 0); } } @@ -991,6 +997,10 @@ static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_inf int dev_id = info->dev->dev_info.dev_id; int ret; + /* Copy per-thread queue mapping into thread-local variable */ + if (info->q_thread_map) + memcpy(t.q_map, info->q_thread_map[info->idx], sizeof(t.q_map)); + ret = ublk_thread_init(&t, info->extra_flags); if (ret) { ublk_err("ublk dev %d thread %u init failed\n", @@ -1006,12 +1016,8 @@ static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_inf /* submit all io commands to ublk driver */ ublk_submit_fetch_commands(&t); } else { - struct ublk_queue *q = &t.dev->q[t.idx]; - - /* prepare all io commands in the 1st thread context */ - if (!t.idx) - ublk_batch_setup_queues(&t); - ublk_batch_start_fetch(&t, q); + ublk_batch_setup_queues(&t); + ublk_batch_start_fetch(&t); } do { @@ -1085,6 +1091,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) struct ublk_thread_info *tinfo; unsigned long long extra_flags = 0; cpu_set_t *affinity_buf; + unsigned char (*q_thread_map)[UBLK_MAX_QUEUES] = NULL; void *thread_ret; sem_t ready; int ret, i; @@ -1104,6 +1111,16 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) if (ret) return ret; + if (ublk_dev_batch_io(dev)) { + q_thread_map = calloc(dev->nthreads, sizeof(*q_thread_map)); + if (!q_thread_map) { + ret = -ENOMEM; + goto fail; + } + ublk_batch_setup_map(q_thread_map, dev->nthreads, + dinfo->nr_hw_queues); + } + if (ctx->auto_zc_fallback) extra_flags = UBLKS_Q_AUTO_BUF_REG_FALLBACK; if (ctx->no_ublk_fixed_fd) @@ -1127,6 +1144,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) tinfo[i].idx = i; tinfo[i].ready = &ready; tinfo[i].extra_flags = extra_flags; + tinfo[i].q_thread_map = q_thread_map; /* * If threads are not tied 1:1 to queues, setting thread @@ -1146,6 +1164,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) for (i = 0; i < dev->nthreads; i++) sem_wait(&ready); free(affinity_buf); + free(q_thread_map); /* everything is fine now, start us */ if (ctx->recovery) @@ -1314,7 +1333,8 @@ static int __cmd_dev_add(const struct dev_ctx *ctx) goto fail; } - if (nthreads != nr_queues && !ctx->per_io_tasks) { + if (nthreads != nr_queues && (!ctx->per_io_tasks && + !(ctx->flags & UBLK_F_BATCH_IO))) { ublk_err("%s: threads %u must be same as queues %u if " "not using per_io_tasks\n", __func__, nthreads, nr_queues); @@ -1940,6 +1960,13 @@ int main(int argc, char *argv[]) return -EINVAL; } + if ((ctx.flags & UBLK_F_AUTO_BUF_REG) && + (ctx.flags & UBLK_F_BATCH_IO) && + (ctx.nthreads > ctx.nr_hw_queues)) { + ublk_err("too many threads for F_AUTO_BUF_REG & F_BATCH_IO\n"); + return -EINVAL; + } + i = optind; while (i < argc && ctx.nr_files < MAX_BACK_FILES) { ctx.files[ctx.nr_files++] = argv[i++]; diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 950e99c02e8b..ca97deb5e208 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -173,13 +173,17 @@ struct ublk_queue { const struct ublk_tgt_ops *tgt_ops; struct ublksrv_io_desc *io_cmd_buf; -/* borrow one bit of ublk uapi flags, which may never be used */ +/* borrow three bit of ublk uapi flags, which may never be used */ #define UBLKS_Q_AUTO_BUF_REG_FALLBACK (1ULL << 63) #define UBLKS_Q_NO_UBLK_FIXED_FD (1ULL << 62) +#define UBLKS_Q_PREPARED (1ULL << 61) __u64 flags; int ublk_fd; /* cached ublk char device fd */ __u8 metadata_size; struct ublk_io ios[UBLK_QUEUE_DEPTH]; + + /* used for prep io commands */ + pthread_spinlock_t lock; }; /* align with `ublk_elem_header` */ @@ -206,8 +210,12 @@ struct batch_fetch_buf { }; struct ublk_thread { + /* Thread-local copy of queue-to-thread mapping for this thread */ + unsigned char q_map[UBLK_MAX_QUEUES]; + struct ublk_dev *dev; - unsigned idx; + unsigned short idx; + unsigned short nr_queues; #define UBLKS_T_STOPPING (1U << 0) #define UBLKS_T_IDLE (1U << 1) @@ -230,10 +238,10 @@ struct ublk_thread { void *commit_buf; #define UBLKS_T_COMMIT_BUF_INV_IDX ((unsigned short)-1) struct allocator commit_buf_alloc; - struct batch_commit_buf commit; + struct batch_commit_buf *commit; /* FETCH_IO_CMDS buffer */ -#define UBLKS_T_NR_FETCH_BUF 2 - struct batch_fetch_buf fetch[UBLKS_T_NR_FETCH_BUF]; + unsigned short nr_fetch_bufs; + struct batch_fetch_buf *fetch; struct io_uring ring; }; @@ -512,6 +520,21 @@ static inline int ublk_queue_no_buf(const struct ublk_queue *q) return ublk_queue_use_zc(q) || ublk_queue_use_auto_zc(q); } +static inline int ublk_batch_commit_prepared(struct batch_commit_buf *cb) +{ + return cb->buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX; +} + +static inline unsigned ublk_queue_idx_in_thread(const struct ublk_thread *t, + const struct ublk_queue *q) +{ + unsigned char idx; + + idx = t->q_map[q->q_id]; + ublk_assert(idx != 0); + return idx - 1; +} + /* * Each IO's buffer index has to be calculated by this helper for * UBLKS_T_BATCH_IO @@ -520,14 +543,13 @@ static inline unsigned short ublk_batch_io_buf_idx( const struct ublk_thread *t, const struct ublk_queue *q, unsigned tag) { - return tag; + return ublk_queue_idx_in_thread(t, q) * q->q_depth + tag; } /* Queue UBLK_U_IO_PREP_IO_CMDS for a specific queue with batch elements */ int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q); /* Start fetching I/O commands using multishot UBLK_U_IO_FETCH_IO_CMDS */ -void ublk_batch_start_fetch(struct ublk_thread *t, - struct ublk_queue *q); +void ublk_batch_start_fetch(struct ublk_thread *t); /* Handle completion of batch I/O commands (prep/commit) */ void ublk_batch_compl_cmd(struct ublk_thread *t, const struct io_uring_cqe *cqe); @@ -545,6 +567,8 @@ void ublk_batch_commit_io_cmds(struct ublk_thread *t); /* Add a completed I/O operation to the current batch commit buffer */ void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q, unsigned tag, int res); +void ublk_batch_setup_map(unsigned char (*q_thread_map)[UBLK_MAX_QUEUES], + int nthreads, int queues); static inline int ublk_complete_io(struct ublk_thread *t, struct ublk_queue *q, unsigned tag, int res) diff --git a/tools/testing/selftests/ublk/test_batch_02.sh b/tools/testing/selftests/ublk/test_batch_02.sh new file mode 100755 index 000000000000..b477f91359e1 --- /dev/null +++ b/tools/testing/selftests/ublk/test_batch_02.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="batch_02" +ERR_CODE=0 + +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "generic" "test UBLK_F_BATCH_IO with 4_threads vs. 1_queues" + +_create_backfile 0 512M + +dev_id=$(_add_ublk_dev -t loop -q 1 --nthreads 4 -b "${UBLK_BACKFILES[0]}") +_check_add_dev $TID $? + +# run fio over the ublk disk +fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio --rw=readwrite \ + --iodepth=32 --size=100M --numjobs=4 > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test "generic" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_batch_03.sh b/tools/testing/selftests/ublk/test_batch_03.sh new file mode 100755 index 000000000000..13a2b3d3a1b9 --- /dev/null +++ b/tools/testing/selftests/ublk/test_batch_03.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="batch_03" +ERR_CODE=0 + +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "generic" "test UBLK_F_BATCH_IO with 1_threads vs. 4_queues" + +_create_backfile 0 512M + +dev_id=$(_add_ublk_dev -t loop -q 4 --nthreads 1 -b "${UBLK_BACKFILES[0]}") +_check_add_dev $TID $? + +# run fio over the ublk disk +fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio --rw=readwrite \ + --iodepth=32 --size=100M --numjobs=4 > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test "generic" +_show_result $TID $ERR_CODE -- cgit v1.2.3 From e4d3fc6a22f53e5bbe51e28b43cb32bc130d9f87 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 23 Jan 2026 17:15:44 +0800 Subject: selftests: ublk: fix test name Fix the two added test name. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_stress_08.sh | 2 +- tools/testing/selftests/ublk/test_stress_09.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/test_stress_08.sh b/tools/testing/selftests/ublk/test_stress_08.sh index 190db0b4f2ad..9abb50ee3d00 100755 --- a/tools/testing/selftests/ublk/test_stress_08.sh +++ b/tools/testing/selftests/ublk/test_stress_08.sh @@ -2,7 +2,7 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_06" +TID="stress_08" ERR_CODE=0 ublk_io_and_remove() diff --git a/tools/testing/selftests/ublk/test_stress_09.sh b/tools/testing/selftests/ublk/test_stress_09.sh index 1b6bdb31da03..87b92b0a2410 100755 --- a/tools/testing/selftests/ublk/test_stress_09.sh +++ b/tools/testing/selftests/ublk/test_stress_09.sh @@ -2,7 +2,7 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_07" +TID="stress_09" ERR_CODE=0 ublk_io_and_kill_daemon() -- cgit v1.2.3 From e396a74222654486d6ab45dca5d0c54c408b8b91 Mon Sep 17 00:00:00 2001 From: Zhiquan Li Date: Thu, 22 Jan 2026 13:35:50 +0800 Subject: KVM: selftests: Add -U_FORTIFY_SOURCE to avoid some unpredictable test failures Some distributions (such as Ubuntu) configure GCC so that _FORTIFY_SOURCE is automatically enabled at -O1 or above. This results in some fortified version of definitions of standard library functions are included. While linker resolves the symbols, the fortified versions might override the definitions in lib/string_override.c and reference to those PLT entries in GLIBC. This is not a problem for the code in host, but it is a disaster for the guest code. E.g., if build and run x86/nested_emulation_test on Ubuntu 24.04 will encounter a L1 #PF due to memset() reference to __memset_chk@plt. The option -fno-builtin-memset is not helpful here, because those fortified versions are not built-in but some definitions which are included by header, they are for different intentions. In order to eliminate the unpredictable behaviors may vary depending on the linker and platform, add the "-U_FORTIFY_SOURCE" into CFLAGS to prevent from introducing the fortified definitions. Signed-off-by: Zhiquan Li Link: https://patch.msgid.link/20260122053551.548229-1-zhiquan_li@163.com Fixes: 6b6f71484bf4 ("KVM: selftests: Implement memcmp(), memcpy(), and memset() for guest use") Cc: stable@vger.kernel.org [sean: tag for stable] Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 1 + 1 file changed, 1 insertion(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index ba5c2b643efa..d45bf4ccb3bf 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -251,6 +251,7 @@ LINUX_TOOL_INCLUDE = $(top_srcdir)/tools/include LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \ -Wno-gnu-variable-sized-type-not-at-end -MD -MP -DCONFIG_64BIT \ + -U_FORTIFY_SOURCE \ -fno-builtin-memcmp -fno-builtin-memcpy \ -fno-builtin-memset -fno-builtin-strnlen \ -fno-stack-protector -fno-PIE -fno-strict-aliasing \ -- cgit v1.2.3 From 40146bf7555e9c3480b1225dfe7a5306e1b58b13 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 21 Jan 2026 17:11:36 +0100 Subject: selftests: net: tests for add double tunneling GRO/GSO Create a simple, netns-based topology with double, nested UDP tunnels and perform TSO transfers on top. Explicitly enable GSO and/or GRO and check the skb layout consistency with different configuration allowing (or not) GSO frames to be delivered on the other end. The trickest part is account in a robust way the aggregated/unaggregated packets with double encapsulation: use a classic bpf filter for it. Signed-off-by: Paolo Abeni Reviewed-by: Petr Machata Link: https://patch.msgid.link/61f2c98ba0f73057c2d6f6cb62eb807abd90bf6b.1769011015.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/config | 1 + tools/testing/selftests/net/double_udp_encap.sh | 393 ++++++++++++++++++++++++ 3 files changed, 395 insertions(+) create mode 100755 tools/testing/selftests/net/double_udp_encap.sh (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index ce9699092f50..33f56fcbde09 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -22,6 +22,7 @@ TEST_PROGS := \ cmsg_so_mark.sh \ cmsg_so_priority.sh \ cmsg_time.sh \ + double_udp_encap.sh \ drop_monitor_tests.sh \ fcnal-ipv4.sh \ fcnal-ipv6.sh \ diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index b84362b9b508..cd49b7dfe216 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -77,6 +77,7 @@ CONFIG_NET_DROP_MONITOR=m CONFIG_NETFILTER=y CONFIG_NETFILTER_ADVANCED=y CONFIG_NETFILTER_XTABLES_LEGACY=y +CONFIG_NETFILTER_XT_MATCH_BPF=m CONFIG_NETFILTER_XT_MATCH_LENGTH=m CONFIG_NETFILTER_XT_MATCH_POLICY=m CONFIG_NETFILTER_XT_NAT=m diff --git a/tools/testing/selftests/net/double_udp_encap.sh b/tools/testing/selftests/net/double_udp_encap.sh new file mode 100755 index 000000000000..9aaf97cdf141 --- /dev/null +++ b/tools/testing/selftests/net/double_udp_encap.sh @@ -0,0 +1,393 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source lib.sh + +# shellcheck disable=SC2155 # prefer RO variable over return value from cmd +readonly CLI="$(dirname "$(readlink -f "$0")")/../../../net/ynl/pyynl/cli.py" + +readonly SRC=1 +readonly DST=2 + +readonly NET_V4=192.168.1. +readonly NET_V6=2001:db8:: +readonly OL1_NET_V4=172.16.1. +readonly OL1_NET_V6=2001:db8:1:: +readonly OL2_NET_V4=172.16.2. +readonly OL2_NET_V6=2001:db8:2:: + +trap cleanup_all_ns EXIT + +# shellcheck disable=SC2329 # can't figure out usage trough a variable +is_ipv6() { + if [[ $1 =~ .*:.* ]]; then + return 0 + fi + return 1 +} + +# shellcheck disable=SC2329 # can't figure out usage trough a variable +create_gnv_endpoint() { + local -r netns=$1 + local -r bm_rem_addr=$2 + local -r gnv_dev=$3 + local -r gnv_id=$4 + local opts=$5 + local gnv_json + local rem + + if is_ipv6 "$bm_rem_addr"; then + rem=remote6 + else + rem=remote + fi + + # add ynl opt separator, if needed + [ -n "$opts" ] && opts=", $opts" + + gnv_json="{ \"id\": $gnv_id, \"$rem\": \"$bm_rem_addr\"$opts }" + ip netns exec "$netns" "$CLI" --family rt-link --create --excl \ + --do newlink --json "{\"ifname\": \"$gnv_dev\", + \"linkinfo\": {\"kind\":\"geneve\", + \"data\": $gnv_json } }" > /dev/null + ip -n "$netns" link set dev "$gnv_dev" up +} + +# shellcheck disable=SC2329 # can't figure out usage trough a variable +create_vxlan_endpoint() { + local -r netns=$1 + local -r bm_rem_addr=$2 + local -r vxlan_dev=$3 + local -r vxlan_id=$4 + local -r opts_str=$5 + local oldifs + local -a opts + local opt + + # convert the arguments from yaml format + oldifs=$IFS + IFS=',' + for opt in $opts_str; do + local pattern='"port":' + + [ -n "$opt" ] || continue + + opts+=("${opt/$pattern*/dstport}" "${opt/$pattern/}") + done + IFS=$oldifs + [ ${#opts[@]} -gt 0 ] || opts+=("dstport" "4789") + + ip -n "$netns" link add "$vxlan_dev" type vxlan id "$vxlan_id" \ + remote "$bm_rem_addr" "${opts[@]}" + ip -n "$netns" link set dev "$vxlan_dev" up +} + +create_ns() { + local nested_opt='"port":6082' + local create_endpoint + local options="$1" + local feature + local dev + local id + local ns + + RET=0 + + # +-------------+ +-------------+ + # | NS_SRC | | NS_NST_DST | + # | | | | + # | gnv_nst1 | | gnv_nst2 | + # | + | | + | + # | | | | | | + # | + | | + | + # | gnv1 | | gnv2 | + # | + | | + | + # | | | | | | + # | + veth1 +--------+ veth2 + | + # | | | | + # +-------------+ +-------------+ + + setup_ns NS_SRC NS_DST + + # concatenate caller provided options and default one + [ -n "$2" ] && nested_opt="$nested_opt,$2" + + ip link add name "veth$SRC" netns "$NS_SRC" type veth \ + peer name "veth$DST" netns "$NS_DST" + case "$ENCAP" in + vxlan) + create_endpoint=create_vxlan_endpoint + dev=vx + ;; + geneve) + create_endpoint=create_gnv_endpoint + dev=gnv + ;; + esac + + id=1 + for ns in "${NS_LIST[@]}"; do + ip -n "$ns" link set dev "veth$id" up + + # ensure the sender can do large write just after 3whs + ip netns exec "$ns" \ + sysctl -qw net.ipv4.tcp_wmem="4096 4194304 4194304" + + # note that 3 - $SRC == $DST and 3 - $DST == $SRC + if [ $FAMILY = "4" ]; then + ip -n "$ns" addr add dev "veth$id" "$NET_V4$id/24" + $create_endpoint "$ns" "$NET_V4$((3 - id))" \ + "$dev$id" 4 "$options" + ip -n "$ns" addr add dev "$dev$id" "$OL1_NET_V4$id/24" + + # nested tunnel devices + # pmtu can't be propagated to upper layer devices; + # need manual adjust + $create_endpoint "$ns" "$OL1_NET_V4$((3 - id))" \ + "$dev"_nst"$id" 40 "$nested_opt" + ip -n "$ns" addr add dev "$dev"_nst"$id" \ + "$OL2_NET_V4$id/24" + ip -n "$ns" link set dev "$dev"_nst"$id" mtu 1392 + else + ip -n "$ns" addr add dev "veth$id" "$NET_V6$id/64" \ + nodad + $create_endpoint "$ns" "$NET_V6$((3 - id))" \ + "$dev"6"$id" 6 "$options" + ip -n "$ns" addr add dev "$dev"6"$id" \ + "$OL1_NET_V6$id/64" nodad + + $create_endpoint "$ns" "$OL1_NET_V6$((3 - id))" \ + "$dev"6_nst"$id" 60 "$nested_opt" + ip -n "$ns" addr add dev "$dev"6_nst"$id" \ + "$OL2_NET_V6$id/64" nodad + ip -n "$ns" link set dev "$dev"6_nst"$id" mtu 1352 + fi + id=$((id+1)) + done + + # enable GRO heuristic on the veth peer and ensure UDP L4 over tunnel is + # actually segmented + for feature in tso tx-udp_tnl-segmentation; do + ip netns exec "$NS_SRC" ethtool -K "veth$SRC" \ + "$feature" off 2>/dev/null + done +} + +create_ns_gso() { + local dev + + create_ns "$@" + if [ "$ENCAP" = "geneve" ]; then + dev=gnv + else + dev=vx + fi + [ "$FAMILY" = "6" ] && dev="$dev"6 + ip netns exec "$NS_SRC" ethtool -K "$dev$SRC" \ + tx-gso-partial on \ + tx-udp_tnl-segmentation on \ + tx-udp_tnl-csum-segmentation on +} + +create_ns_gso_gro() { + create_ns_gso "$@" + ip netns exec "$NS_DST" ethtool -K "veth$DST" gro on + ip netns exec "$NS_SRC" ethtool -K "veth$SRC" tx off >/dev/null 2>&1 +} + +run_test() { + local -r dst=$NET$DST + local -r msg=$1 + local -r total_size=$2 + local -r encappkts=$3 + local inner_proto_offset=0 + local inner_maclen=14 + local rx_family="-4" + local ipt=iptables + local bpf_filter + local -a rx_args + local wire_pkts + local rcvpkts + local encl=8 + local dport + local pkts + local snd + + if [ $FAMILY = "6" ]; then + ipt=ip6tables + else + # rx program does not support '-6' and implies ipv6 usage by + # default + rx_args=("$rx_family") + fi + + # The received can only check fixed size packet + pkts=$((total_size / GSO_SIZE)) + if [ -n "$4" ]; then + wire_pkts=$4 + elif [ $((total_size % GSO_SIZE)) -eq 0 ]; then + wire_pkts=1 + rx_args+=("-l" "$GSO_SIZE") + else + wire_pkts=2 + pkts=$((pkts + 1)) + fi + + if [ "$ENCAP" = "geneve" ]; then + dport=6081 + else + dport=4789 + fi + + # Either: + # - IPv4, nested tunnel carries UDP over IPv4, with dport 6082, + # innermost is TCP over IPv4 on port 8000 + # - IPv6, nested tunnel carries UDP over IPv6, with dport 6082, + # innermost is TCP over IPv6 on port 8000 + # The nested tunnel port is 6082 and the nested encap len is 8 + # regardless of the encap type (no geneve opts). + # In inherit protocol mode there is no nested mac hdr and the nested + # l3 protocol type field belongs to the geneve hdr. + [ "$USE_HINT" = true ] && encl=16 + [ "$INHERIT" = true ] && inner_maclen=0 + [ "$INHERIT" = true ] && inner_proto_offset=-4 + local inner=$((inner_maclen+encl)) + local proto=$((inner_maclen+encl+inner_proto_offset)) + bpf_filter=$(nfbpf_compile "(ip && + ip[$((40+encl))] == 0x08 && ip[$((41+encl))] == 0x00 && + ip[$((51+encl))] == 0x11 && + ip[$((64+encl))] == 0x17 && ip[$((65+encl))] == 0xc2 && + ip[$((76+proto))] == 0x08 && ip[$((77+proto))] == 0x00 && + ip[$((87+inner))] == 0x6 && + ip[$((100+inner))] == 0x1f && ip[$((101+inner))] == 0x40) || + (ip6 && + ip6[$((60+encl))] == 0x86 && ip6[$((61+encl))] == 0xdd && + ip6[$((68+encl))] == 0x11 && + ip6[$((104+encl))] == 0x17 && ip6[$((105+encl))] == 0xc2 && + ip6[$((116+proto))] == 0x86 && ip6[$((117+proto))] == 0xdd && + ip6[$((124+inner))] == 0x6 && + ip6[$((160+inner))] == 0x1f && ip6[$((161+inner))] == 0x40)") + + # ignore shorts packet, to avoid arp/mld induced noise + ip netns exec "$NS_SRC" "$ipt" -A OUTPUT -p udp --dport "$dport" \ + -m length --length 600:65535 -m bpf --bytecode "$bpf_filter" + ip netns exec "$NS_DST" "$ipt" -A INPUT -p udp --dport "$dport" \ + -m length --length 600:65535 -m bpf --bytecode "$bpf_filter" + ip netns exec "$NS_DST" ./udpgso_bench_rx -C 2000 -t -R 100 \ + -n "$pkts" "${rx_args[@]}" & + local pid=$! + wait_local_port_listen "$NS_DST" 8000 tcp + ip netns exec "$NS_SRC" ./udpgso_bench_tx -"$FAMILY" -t -M 1 \ + -s "$total_size" -D "$dst" + local ret=$? + check_err "$ret" "client failure exit code $ret" + wait "$pid" + ret=$? + check_err "$ret" "sever failure exit code $ret" + + snd=$(ip netns exec "$NS_SRC" "$ipt"-save -c | + grep "dport $dport" | sed -e 's/\[//' -e 's/:.*//') + + [ "$snd" = "$wire_pkts" ] + # shellcheck disable=SC2319 # known false positive + check_err $? "send $snd packets on the lowest link, expected $wire_pkts" + + rcvpkts=$(ip netns exec "$NS_DST" "$ipt"-save -c | \ + grep "dport $dport" | sed -e 's/\[//' -e 's/:.*//') + + [ "$rcvpkts" = "$encappkts" ] + check_err $? "received $rcvpkts $ENCAP packets, expected $encappkts" + log_test "$msg" +} + +run_tests() { + for FAMILY in 4 6; do + NET=$OL2_NET_V4 + GSO_SIZE=1340 # 1392 - 20 - 32 + + if [ $FAMILY = 6 ]; then + NET=$OL2_NET_V6 + GSO_SIZE=1280 # 1352 - 40 - 32 + fi + + echo "IPv$FAMILY" + + unset USE_HINT + unset INHERIT + + # "geneve" must be last encap in list, so that later + # test cases will run on it + for ENCAP in "vxlan" "geneve"; do + create_ns + run_test "No GSO - $ENCAP" $((GSO_SIZE * 4)) 4 4 + cleanup_all_ns + + create_ns_gso + run_test "GSO without GRO - $ENCAP" $((GSO_SIZE * 4)) \ + 4 1 + cleanup_all_ns + + # IPv4 only test + [ $FAMILY = "4" ] || continue + create_ns_gso + ip netns exec "$NS_SRC" \ + sysctl -qw net.ipv4.ip_no_pmtu_disc=1 + run_test "GSO disable due to no fixedid - $ENCAP" \ + $((GSO_SIZE * 4)) 4 4 + cleanup_all_ns + done + + # GRO tests imply/require geneve encap, the only one providing + # GRO hints + create_ns_gso_gro + run_test "double tunnel GRO, no hints" $((GSO_SIZE * 4)) 4 + cleanup_all_ns + + # hint option is expected for all the following tests in the RX + # path + USE_HINT=true + create_ns_gso_gro \ + '"gro-hint":1,"udp-zero-csum6-tx":1,"udp-zero-csum6-rx":1' \ + '"udp-zero-csum6-tx":1,"udp-zero-csum6-rx":1' + run_test "double tunnel GRO" $((GSO_SIZE * 4)) 1 + cleanup_all_ns + + create_ns_gso_gro '"gro-hint":1,"udp-csum":1' '"udp-csum":1' + run_test "double tunnel GRO - csum complete" $((GSO_SIZE * 4))\ + 1 + cleanup_all_ns + + create_ns_gso_gro '"gro-hint":1' \ + '"udp-csum":0,"udp-zero-csum6-tx":1,"udp-zero-csum6-rx":1' + run_test "double tunnel GRO - no nested csum" \ + $((GSO_SIZE * 4)) 1 + cleanup_all_ns + + create_ns_gso_gro \ + '"gro-hint":1,"udp-zero-csum6-tx":1,"udp-zero-csum6-rx":1' \ + '"udp-csum":1' + run_test "double tunnel GRO - nested csum, outer 0-csum, skip"\ + $((GSO_SIZE * 4)) 4 + cleanup_all_ns + + INHERIT=true + create_ns_gso_gro '"gro-hint":1,"udp-csum":1' \ + '"udp-csum":1,"inner-proto-inherit":1' + run_test "double tunnel GRO - nested inherit proto" \ + $((GSO_SIZE * 4)) 1 + cleanup_all_ns + unset INHERIT + + create_ns_gso_gro '"gro-hint":1' + run_test "double tunnel GRO - short last pkt" \ + $((GSO_SIZE * 4 + GSO_SIZE / 2)) 2 + cleanup_all_ns + done +} + +require_command nfbpf_compile +require_command jq + +# tcp retransmisions will break the accounting +xfail_on_slow run_tests +exit "$EXIT_STATUS" -- cgit v1.2.3 From e073c118db0217e1dab14eb0088e7c6a8bf9ef63 Mon Sep 17 00:00:00 2001 From: Xu Du Date: Wed, 21 Jan 2026 18:04:55 +0800 Subject: selftest: tun: Format tun.c existing code In preparation for adding new tests for GSO over UDP tunnels, apply consistently the kernel style to the existing code. Signed-off-by: Xu Du Link: https://patch.msgid.link/d797de1e5a3d215dd78cb46775772ef682bab60e.1768979440.git.xudu@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tun.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/tun.c b/tools/testing/selftests/net/tun.c index 0efc67b0357a..128b0a5327d4 100644 --- a/tools/testing/selftests/net/tun.c +++ b/tools/testing/selftests/net/tun.c @@ -25,7 +25,7 @@ static int tun_attach(int fd, char *dev) strcpy(ifr.ifr_name, dev); ifr.ifr_flags = IFF_ATTACH_QUEUE; - return ioctl(fd, TUNSETQUEUE, (void *) &ifr); + return ioctl(fd, TUNSETQUEUE, (void *)&ifr); } static int tun_detach(int fd, char *dev) @@ -36,7 +36,7 @@ static int tun_detach(int fd, char *dev) strcpy(ifr.ifr_name, dev); ifr.ifr_flags = IFF_DETACH_QUEUE; - return ioctl(fd, TUNSETQUEUE, (void *) &ifr); + return ioctl(fd, TUNSETQUEUE, (void *)&ifr); } static int tun_alloc(char *dev) @@ -54,7 +54,7 @@ static int tun_alloc(char *dev) strcpy(ifr.ifr_name, dev); ifr.ifr_flags = IFF_TAP | IFF_NAPI | IFF_MULTI_QUEUE; - err = ioctl(fd, TUNSETIFF, (void *) &ifr); + err = ioctl(fd, TUNSETIFF, (void *)&ifr); if (err < 0) { fprintf(stderr, "can't TUNSETIFF: %s\n", strerror(errno)); close(fd); @@ -67,9 +67,9 @@ static int tun_alloc(char *dev) static int tun_delete(char *dev) { struct { - struct nlmsghdr nh; + struct nlmsghdr nh; struct ifinfomsg ifm; - unsigned char data[64]; + unsigned char data[64]; } req; struct rtattr *rta; int ret, rtnl; @@ -127,31 +127,36 @@ FIXTURE_TEARDOWN(tun) close(self->fd2); } -TEST_F(tun, delete_detach_close) { +TEST_F(tun, delete_detach_close) +{ EXPECT_EQ(tun_delete(self->ifname), 0); EXPECT_EQ(tun_detach(self->fd, self->ifname), -1); EXPECT_EQ(errno, 22); } -TEST_F(tun, detach_delete_close) { +TEST_F(tun, detach_delete_close) +{ EXPECT_EQ(tun_detach(self->fd, self->ifname), 0); EXPECT_EQ(tun_delete(self->ifname), 0); } -TEST_F(tun, detach_close_delete) { +TEST_F(tun, detach_close_delete) +{ EXPECT_EQ(tun_detach(self->fd, self->ifname), 0); close(self->fd); self->fd = -1; EXPECT_EQ(tun_delete(self->ifname), 0); } -TEST_F(tun, reattach_delete_close) { +TEST_F(tun, reattach_delete_close) +{ EXPECT_EQ(tun_detach(self->fd, self->ifname), 0); EXPECT_EQ(tun_attach(self->fd, self->ifname), 0); EXPECT_EQ(tun_delete(self->ifname), 0); } -TEST_F(tun, reattach_close_delete) { +TEST_F(tun, reattach_close_delete) +{ EXPECT_EQ(tun_detach(self->fd, self->ifname), 0); EXPECT_EQ(tun_attach(self->fd, self->ifname), 0); close(self->fd); -- cgit v1.2.3 From a942fcd72e9736a0c49b60160f29dc5bb01d6f89 Mon Sep 17 00:00:00 2001 From: Xu Du Date: Wed, 21 Jan 2026 18:04:56 +0800 Subject: selftest: tun: Introduce tuntap_helpers.h header for TUN/TAP testing Introduce rtnetlink manipulation and packet construction helpers that will simplify the later creation of more related test cases. This avoids duplicating logic across different test cases. This new header will contain: - YNL-based netlink management utilities. - Helpers for ip link, ip address, ip neighbor and ip route operations. - Packet construction and manipulation helpers. Signed-off-by: Xu Du Link: https://patch.msgid.link/91f905715c69c75f7bf72d43388921fde6c34989.1768979440.git.xudu@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tuntap_helpers.h | 390 +++++++++++++++++++++++++++ 1 file changed, 390 insertions(+) create mode 100644 tools/testing/selftests/net/tuntap_helpers.h (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/tuntap_helpers.h b/tools/testing/selftests/net/tuntap_helpers.h new file mode 100644 index 000000000000..d6c0437136ec --- /dev/null +++ b/tools/testing/selftests/net/tuntap_helpers.h @@ -0,0 +1,390 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef _TUNTAP_HELPERS_H +#define _TUNTAP_HELPERS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rt-route-user.h" +#include "rt-addr-user.h" +#include "rt-neigh-user.h" +#include "rt-link-user.h" + +#define GENEVE_HLEN 8 +#define PKT_DATA 0xCB +#define TUNTAP_DEFAULT_TTL 8 +#define TUNTAP_DEFAULT_IPID 1337 + +unsigned int if_nametoindex(const char *ifname); + +static inline int ip_addr_len(int family) +{ + return (family == AF_INET) ? sizeof(struct in_addr) : + sizeof(struct in6_addr); +} + +static inline void fill_ifaddr_msg(struct ifaddrmsg *ifam, int family, + int prefix, int flags, const char *dev) +{ + ifam->ifa_family = family; + ifam->ifa_prefixlen = prefix; + ifam->ifa_index = if_nametoindex(dev); + ifam->ifa_flags = flags; + ifam->ifa_scope = RT_SCOPE_UNIVERSE; +} + +static inline int ip_addr_add(const char *dev, int family, void *addr, + uint8_t prefix) +{ + int nl_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; + int ifa_flags = IFA_F_PERMANENT | IFA_F_NODAD; + int ret = -1, ipalen = ip_addr_len(family); + struct rt_addr_newaddr_req *req; + struct ynl_sock *ys; + + ys = ynl_sock_create(&ynl_rt_addr_family, NULL); + if (!ys) + return -1; + + req = rt_addr_newaddr_req_alloc(); + if (!req) + goto err_req_alloc; + + fill_ifaddr_msg(&req->_hdr, family, prefix, ifa_flags, dev); + rt_addr_newaddr_req_set_nlflags(req, nl_flags); + rt_addr_newaddr_req_set_local(req, addr, ipalen); + + ret = rt_addr_newaddr(ys, req); + rt_addr_newaddr_req_free(req); +err_req_alloc: + ynl_sock_destroy(ys); + return ret; +} + +static inline void fill_neigh_req_header(struct ndmsg *ndm, int family, + int state, const char *dev) +{ + ndm->ndm_family = family; + ndm->ndm_ifindex = if_nametoindex(dev); + ndm->ndm_state = state; + ndm->ndm_flags = 0; + ndm->ndm_type = RTN_UNICAST; +} + +static inline int ip_neigh_add(const char *dev, int family, void *addr, + unsigned char *lladdr) +{ + int nl_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; + int ret = -1, ipalen = ip_addr_len(family); + struct rt_neigh_newneigh_req *req; + struct ynl_sock *ys; + + ys = ynl_sock_create(&ynl_rt_neigh_family, NULL); + if (!ys) + return -1; + + req = rt_neigh_newneigh_req_alloc(); + if (!req) + goto err_req_alloc; + + fill_neigh_req_header(&req->_hdr, family, NUD_PERMANENT, dev); + rt_neigh_newneigh_req_set_nlflags(req, nl_flags); + rt_neigh_newneigh_req_set_dst(req, addr, ipalen); + rt_neigh_newneigh_req_set_lladdr(req, lladdr, ETH_ALEN); + rt_neigh_newneigh_req_set_ifindex(req, if_nametoindex(dev)); + + ret = rt_neigh_newneigh(ys, req); + rt_neigh_newneigh_req_free(req); +err_req_alloc: + ynl_sock_destroy(ys); + return ret; +} + +static inline void fill_route_req_header(struct rtmsg *rtm, int family, + int table) +{ + rtm->rtm_family = family; + rtm->rtm_table = table; +} + +static inline int +ip_route_get(const char *dev, int family, int table, void *dst, + void (*parse_rsp)(struct rt_route_getroute_rsp *rsp, void *out), + void *out) +{ + int ret = -1, ipalen = ip_addr_len(family); + struct rt_route_getroute_req *req; + struct rt_route_getroute_rsp *rsp; + struct ynl_sock *ys; + + ys = ynl_sock_create(&ynl_rt_route_family, NULL); + if (!ys) + return -1; + + req = rt_route_getroute_req_alloc(); + if (!req) + goto err_req_alloc; + + fill_route_req_header(&req->_hdr, family, table); + rt_route_getroute_req_set_nlflags(req, NLM_F_REQUEST); + rt_route_getroute_req_set_dst(req, dst, ipalen); + rt_route_getroute_req_set_oif(req, if_nametoindex(dev)); + + rsp = rt_route_getroute(ys, req); + if (!rsp) + goto err_rsp_get; + + ret = 0; + if (parse_rsp) + parse_rsp(rsp, out); + + rt_route_getroute_rsp_free(rsp); +err_rsp_get: + rt_route_getroute_req_free(req); +err_req_alloc: + ynl_sock_destroy(ys); + return ret; +} + +static inline int +ip_link_add(const char *dev, char *link_type, + int (*fill_link_attr)(struct rt_link_newlink_req *req, void *data), + void *data) +{ + int nl_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; + struct rt_link_newlink_req *req; + struct ynl_sock *ys; + int ret = -1; + + ys = ynl_sock_create(&ynl_rt_link_family, NULL); + if (!ys) + return -1; + + req = rt_link_newlink_req_alloc(); + if (!req) + goto err_req_alloc; + + req->_hdr.ifi_flags = IFF_UP; + rt_link_newlink_req_set_nlflags(req, nl_flags); + rt_link_newlink_req_set_ifname(req, dev); + rt_link_newlink_req_set_linkinfo_kind(req, link_type); + + if (fill_link_attr && fill_link_attr(req, data) < 0) + goto err_attr_fill; + + ret = rt_link_newlink(ys, req); +err_attr_fill: + rt_link_newlink_req_free(req); +err_req_alloc: + ynl_sock_destroy(ys); + return ret; +} + +static inline int ip_link_del(const char *dev) +{ + struct rt_link_dellink_req *req; + struct ynl_sock *ys; + int ret = -1; + + ys = ynl_sock_create(&ynl_rt_link_family, NULL); + if (!ys) + return -1; + + req = rt_link_dellink_req_alloc(); + if (!req) + goto err_req_alloc; + + rt_link_dellink_req_set_nlflags(req, NLM_F_REQUEST); + rt_link_dellink_req_set_ifname(req, dev); + + ret = rt_link_dellink(ys, req); + rt_link_dellink_req_free(req); +err_req_alloc: + ynl_sock_destroy(ys); + return ret; +} + +static inline size_t build_eth(uint8_t *buf, uint16_t proto, unsigned char *src, + unsigned char *dest) +{ + struct ethhdr *eth = (struct ethhdr *)buf; + + eth->h_proto = htons(proto); + memcpy(eth->h_source, src, ETH_ALEN); + memcpy(eth->h_dest, dest, ETH_ALEN); + + return ETH_HLEN; +} + +static inline uint32_t add_csum(const uint8_t *buf, int len) +{ + uint16_t *sbuf = (uint16_t *)buf; + uint32_t sum = 0; + + while (len > 1) { + sum += *sbuf++; + len -= 2; + } + + if (len) + sum += *(uint8_t *)sbuf; + + return sum; +} + +static inline uint16_t finish_ip_csum(uint32_t sum) +{ + while (sum >> 16) + sum = (sum & 0xffff) + (sum >> 16); + return ~((uint16_t)sum); +} + +static inline uint16_t build_ip_csum(const uint8_t *buf, int len, uint32_t sum) +{ + sum += add_csum(buf, len); + return finish_ip_csum(sum); +} + +static inline int build_ipv4_header(uint8_t *buf, uint8_t proto, + int payload_len, struct in_addr *src, + struct in_addr *dst) +{ + struct iphdr *iph = (struct iphdr *)buf; + + iph->ihl = 5; + iph->version = 4; + iph->ttl = TUNTAP_DEFAULT_TTL; + iph->tot_len = htons(sizeof(*iph) + payload_len); + iph->id = htons(TUNTAP_DEFAULT_IPID); + iph->protocol = proto; + iph->saddr = src->s_addr; + iph->daddr = dst->s_addr; + iph->check = build_ip_csum(buf, iph->ihl << 2, 0); + + return iph->ihl << 2; +} + +static inline void ipv6_set_dsfield(struct ipv6hdr *ip6h, uint8_t dsfield) +{ + uint16_t val, *ptr = (uint16_t *)ip6h; + + val = ntohs(*ptr); + val &= 0xF00F; + val |= ((uint16_t)dsfield) << 4; + *ptr = htons(val); +} + +static inline int build_ipv6_header(uint8_t *buf, uint8_t proto, + uint8_t dsfield, int payload_len, + struct in6_addr *src, struct in6_addr *dst) +{ + struct ipv6hdr *ip6h = (struct ipv6hdr *)buf; + + ip6h->version = 6; + ip6h->payload_len = htons(payload_len); + ip6h->nexthdr = proto; + ip6h->hop_limit = TUNTAP_DEFAULT_TTL; + ipv6_set_dsfield(ip6h, dsfield); + memcpy(&ip6h->saddr, src, sizeof(ip6h->saddr)); + memcpy(&ip6h->daddr, dst, sizeof(ip6h->daddr)); + + return sizeof(struct ipv6hdr); +} + +static inline int build_geneve_header(uint8_t *buf, uint32_t vni) +{ + uint16_t protocol = htons(ETH_P_TEB); + uint32_t geneve_vni = htonl((vni << 8) & 0xffffff00); + + memcpy(buf + 2, &protocol, 2); + memcpy(buf + 4, &geneve_vni, 4); + return GENEVE_HLEN; +} + +static inline int build_udp_header(uint8_t *buf, uint16_t sport, uint16_t dport, + int payload_len) +{ + struct udphdr *udph = (struct udphdr *)buf; + + udph->source = htons(sport); + udph->dest = htons(dport); + udph->len = htons(sizeof(*udph) + payload_len); + return sizeof(*udph); +} + +static inline void build_udp_packet_csum(uint8_t *buf, int family, + bool csum_off) +{ + struct udphdr *udph = (struct udphdr *)buf; + size_t ipalen = ip_addr_len(family); + uint32_t sum; + + /* No extension IPv4 and IPv6 headers addresses are the last fields */ + sum = add_csum(buf - 2 * ipalen, 2 * ipalen); + sum += htons(IPPROTO_UDP) + udph->len; + + if (!csum_off) + sum += add_csum(buf, udph->len); + + udph->check = finish_ip_csum(sum); +} + +static inline int build_udp_packet(uint8_t *buf, uint16_t sport, uint16_t dport, + int payload_len, int family, bool csum_off) +{ + struct udphdr *udph = (struct udphdr *)buf; + + build_udp_header(buf, sport, dport, payload_len); + memset(buf + sizeof(*udph), PKT_DATA, payload_len); + build_udp_packet_csum(buf, family, csum_off); + + return sizeof(*udph) + payload_len; +} + +static inline int build_virtio_net_hdr_v1_hash_tunnel(uint8_t *buf, bool is_tap, + int hdr_len, int gso_size, + int outer_family, + int inner_family) +{ + struct virtio_net_hdr_v1_hash_tunnel *vh_tunnel = (void *)buf; + struct virtio_net_hdr_v1 *vh = &vh_tunnel->hash_hdr.hdr; + int outer_iphlen, inner_iphlen, eth_hlen, gso_type; + + eth_hlen = is_tap ? ETH_HLEN : 0; + outer_iphlen = (outer_family == AF_INET) ? sizeof(struct iphdr) : + sizeof(struct ipv6hdr); + inner_iphlen = (inner_family == AF_INET) ? sizeof(struct iphdr) : + sizeof(struct ipv6hdr); + + vh_tunnel->outer_th_offset = eth_hlen + outer_iphlen; + vh_tunnel->inner_nh_offset = vh_tunnel->outer_th_offset + ETH_HLEN + + GENEVE_HLEN + sizeof(struct udphdr); + + vh->csum_start = vh_tunnel->inner_nh_offset + inner_iphlen; + vh->csum_offset = __builtin_offsetof(struct udphdr, check); + vh->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; + vh->hdr_len = hdr_len; + vh->gso_size = gso_size; + + if (gso_size) { + gso_type = outer_family == AF_INET ? + VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV4 : + VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6; + vh->gso_type = VIRTIO_NET_HDR_GSO_UDP_L4 | gso_type; + } + + return sizeof(struct virtio_net_hdr_v1_hash_tunnel); +} + +#endif /* _TUNTAP_HELPERS_H */ -- cgit v1.2.3 From 82cfdcfa201057861ec8a60ca9354d2c4c67bd68 Mon Sep 17 00:00:00 2001 From: Xu Du Date: Wed, 21 Jan 2026 18:04:57 +0800 Subject: selftest: tun: Refactor tun_delete to use tuntap_helpers The previous patch introduced common tuntap helpers to simplify tun test code. This patch refactors the tun_delete function to use these new helpers. Signed-off-by: Xu Du Link: https://patch.msgid.link/ecc7c0c2d75d87cb814e97579e731650339703ab.1768979440.git.xudu@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 16 ++++++++++++--- tools/testing/selftests/net/tun.c | 39 ++---------------------------------- 2 files changed, 15 insertions(+), 40 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 33f56fcbde09..afdea6d95bde 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -183,7 +183,6 @@ TEST_GEN_PROGS := \ tap \ tcp_port_share \ tls \ - tun \ # end of TEST_GEN_PROGS TEST_FILES := \ @@ -195,7 +194,11 @@ TEST_FILES := \ # YNL files, must be before "include ..lib.mk" YNL_GEN_FILES := busy_poller -YNL_GEN_PROGS := netlink-dumps +YNL_GEN_PROGS := \ + netlink-dumps \ + tun \ +# end of YNL_GEN_PROGS + TEST_GEN_FILES += $(YNL_GEN_FILES) TEST_GEN_PROGS += $(YNL_GEN_PROGS) @@ -206,7 +209,14 @@ TEST_INCLUDES := forwarding/lib.sh include ../lib.mk # YNL build -YNL_GENS := netdev +YNL_GENS := \ + netdev \ + rt-addr \ + rt-link \ + rt-neigh \ + rt-route \ +# end of YNL_GENS + include ynl.mk $(OUTPUT)/epoll_busy_poll: LDLIBS += -lcap diff --git a/tools/testing/selftests/net/tun.c b/tools/testing/selftests/net/tun.c index 128b0a5327d4..d9030bdd2e06 100644 --- a/tools/testing/selftests/net/tun.c +++ b/tools/testing/selftests/net/tun.c @@ -8,14 +8,12 @@ #include #include #include -#include #include -#include -#include #include #include #include "kselftest_harness.h" +#include "tuntap_helpers.h" static int tun_attach(int fd, char *dev) { @@ -66,40 +64,7 @@ static int tun_alloc(char *dev) static int tun_delete(char *dev) { - struct { - struct nlmsghdr nh; - struct ifinfomsg ifm; - unsigned char data[64]; - } req; - struct rtattr *rta; - int ret, rtnl; - - rtnl = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE); - if (rtnl < 0) { - fprintf(stderr, "can't open rtnl: %s\n", strerror(errno)); - return 1; - } - - memset(&req, 0, sizeof(req)); - req.nh.nlmsg_len = NLMSG_ALIGN(NLMSG_LENGTH(sizeof(req.ifm))); - req.nh.nlmsg_flags = NLM_F_REQUEST; - req.nh.nlmsg_type = RTM_DELLINK; - - req.ifm.ifi_family = AF_UNSPEC; - - rta = (struct rtattr *)(((char *)&req) + NLMSG_ALIGN(req.nh.nlmsg_len)); - rta->rta_type = IFLA_IFNAME; - rta->rta_len = RTA_LENGTH(IFNAMSIZ); - req.nh.nlmsg_len += rta->rta_len; - memcpy(RTA_DATA(rta), dev, IFNAMSIZ); - - ret = send(rtnl, &req, req.nh.nlmsg_len, 0); - if (ret < 0) - fprintf(stderr, "can't send: %s\n", strerror(errno)); - ret = (unsigned int)ret != req.nh.nlmsg_len; - - close(rtnl); - return ret; + return ip_link_del(dev); } FIXTURE(tun) -- cgit v1.2.3 From 24e59f26eef2c806a8dbb94859aaf7af28197148 Mon Sep 17 00:00:00 2001 From: Xu Du Date: Wed, 21 Jan 2026 18:04:58 +0800 Subject: selftest: tun: Add helpers for GSO over UDP tunnel In preparation for testing GSO over UDP tunnels, enhance the test infrastructure to support a more complex data path involving a TUN device and a GENEVE udp tunnel. This patch introduces a dedicated setup/teardown topology that creates both a GENEVE tunnel interface and a TUN interface. The TUN device acts as the VTEP (Virtual Tunnel Endpoint), allowing it to send and receive virtio-net packets. This setup effectively tests the kernel's data path for encapsulated traffic. Note that after adding a new address to the UDP tunnel, we need to wait a bit until the associated route is available. Additionally, a new data structure is defined to manage test parameters. This structure is designed to be extensible, allowing different test data and configurations to be easily added in subsequent patches. Signed-off-by: Xu Du Link: https://patch.msgid.link/b5787b8c269f43ce11e1756f1691cc7fd9a1e901.1768979440.git.xudu@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tun.c | 425 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 425 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/tun.c b/tools/testing/selftests/net/tun.c index d9030bdd2e06..ec089355312b 100644 --- a/tools/testing/selftests/net/tun.c +++ b/tools/testing/selftests/net/tun.c @@ -15,6 +15,80 @@ #include "kselftest_harness.h" #include "tuntap_helpers.h" +static const char param_dev_geneve_name[] = "geneve1"; +static unsigned char param_hwaddr_outer_dst[] = { 0x00, 0xfe, 0x98, + 0x14, 0x22, 0x42 }; +static unsigned char param_hwaddr_outer_src[] = { 0x00, 0xfe, 0x98, + 0x94, 0xd2, 0x43 }; +static unsigned char param_hwaddr_inner_dst[] = { 0x00, 0xfe, 0x98, + 0x94, 0x22, 0xcc }; +static unsigned char param_hwaddr_inner_src[] = { 0x00, 0xfe, 0x98, + 0x94, 0xd2, 0xdd }; + +static struct in_addr param_ipaddr4_outer_dst = { + __constant_htonl(0xac100001), +}; + +static struct in_addr param_ipaddr4_outer_src = { + __constant_htonl(0xac100002), +}; + +static struct in_addr param_ipaddr4_inner_dst = { + __constant_htonl(0xac100101), +}; + +static struct in_addr param_ipaddr4_inner_src = { + __constant_htonl(0xac100102), +}; + +static struct in6_addr param_ipaddr6_outer_dst = { + { { 0x20, 0x02, 0x0d, 0xb8, 0x01, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 } }, +}; + +static struct in6_addr param_ipaddr6_outer_src = { + { { 0x20, 0x02, 0x0d, 0xb8, 0x01, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 } }, +}; + +static struct in6_addr param_ipaddr6_inner_dst = { + { { 0x20, 0x02, 0x0d, 0xb8, 0x02, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 } }, +}; + +static struct in6_addr param_ipaddr6_inner_src = { + { { 0x20, 0x02, 0x0d, 0xb8, 0x02, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 } }, +}; + +#define VN_ID 1 +#define VN_PORT 4789 +#define UDP_SRC_PORT 22 +#define UDP_DST_PORT 48878 +#define IPPREFIX_LEN 24 +#define IP6PREFIX_LEN 64 +#define TIMEOUT_SEC 10 +#define TIMEOUT_USEC 100000 +#define MAX_RETRIES 20 + +#define UDP_TUNNEL_GENEVE_4IN4 0x01 +#define UDP_TUNNEL_GENEVE_6IN4 0x02 +#define UDP_TUNNEL_GENEVE_4IN6 0x04 +#define UDP_TUNNEL_GENEVE_6IN6 0x08 + +#define UDP_TUNNEL_OUTER_IPV4 (UDP_TUNNEL_GENEVE_4IN4 | UDP_TUNNEL_GENEVE_6IN4) +#define UDP_TUNNEL_INNER_IPV4 (UDP_TUNNEL_GENEVE_4IN4 | UDP_TUNNEL_GENEVE_4IN6) + +#define TUN_VNET_TNL_SIZE sizeof(struct virtio_net_hdr_v1_hash_tunnel) + +struct geneve_setup_config { + int family; + union { + struct in_addr r4; + struct in6_addr r6; + } remote; + __be32 vnid; + __be16 vnport; + unsigned char hwaddr[6]; + uint8_t csum; +}; + static int tun_attach(int fd, char *dev) { struct ifreq ifr; @@ -67,6 +141,202 @@ static int tun_delete(char *dev) return ip_link_del(dev); } +static int tun_open(char *dev, const int flags, const int hdrlen, + const int features, const unsigned char *mac_addr) +{ + struct ifreq ifr = { 0 }; + int fd, sk = -1; + + fd = open("/dev/net/tun", O_RDWR); + if (fd < 0) { + perror("open"); + return -1; + } + + ifr.ifr_flags = flags; + if (ioctl(fd, TUNSETIFF, (void *)&ifr) < 0) { + perror("ioctl(TUNSETIFF)"); + goto err; + } + strcpy(dev, ifr.ifr_name); + + if (hdrlen > 0) { + if (ioctl(fd, TUNSETVNETHDRSZ, &hdrlen) < 0) { + perror("ioctl(TUNSETVNETHDRSZ)"); + goto err; + } + } + + if (features) { + if (ioctl(fd, TUNSETOFFLOAD, features) < 0) { + perror("ioctl(TUNSETOFFLOAD)"); + goto err; + } + } + + sk = socket(PF_INET, SOCK_DGRAM, 0); + if (sk < 0) { + perror("socket"); + goto err; + } + + if (ioctl(sk, SIOCGIFFLAGS, &ifr) < 0) { + perror("ioctl(SIOCGIFFLAGS)"); + goto err; + } + + ifr.ifr_flags |= (IFF_UP | IFF_RUNNING); + if (ioctl(sk, SIOCSIFFLAGS, &ifr) < 0) { + perror("ioctl(SIOCSIFFLAGS)"); + goto err; + } + + if (mac_addr && flags & IFF_TAP) { + ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER; + memcpy(ifr.ifr_hwaddr.sa_data, mac_addr, ETH_ALEN); + + if (ioctl(sk, SIOCSIFHWADDR, &ifr) < 0) { + perror("ioctl(SIOCSIFHWADDR)"); + goto err; + } + } + +out: + if (sk >= 0) + close(sk); + return fd; + +err: + close(fd); + fd = -1; + goto out; +} + +static size_t sockaddr_len(int family) +{ + return (family == AF_INET) ? sizeof(struct sockaddr_in) : + sizeof(struct sockaddr_in6); +} + +static int geneve_fill_newlink(struct rt_link_newlink_req *req, void *data) +{ + struct geneve_setup_config *cfg = data; + +#define SET_GENEVE_REMOTE rt_link_newlink_req_set_linkinfo_data_geneve_remote +#define SET_GENEVE_REMOTE6 rt_link_newlink_req_set_linkinfo_data_geneve_remote6 + + rt_link_newlink_req_set_address(req, cfg->hwaddr, ETH_ALEN); + rt_link_newlink_req_set_linkinfo_data_geneve_id(req, cfg->vnid); + rt_link_newlink_req_set_linkinfo_data_geneve_port(req, cfg->vnport); + rt_link_newlink_req_set_linkinfo_data_geneve_udp_csum(req, cfg->csum); + + if (cfg->family == AF_INET) + SET_GENEVE_REMOTE(req, cfg->remote.r4.s_addr); + else + SET_GENEVE_REMOTE6(req, &cfg->remote.r6, + sizeof(cfg->remote.r6)); + + return 0; +} + +static int geneve_create(const char *dev, int family, void *remote, + void *hwaddr) +{ + struct geneve_setup_config geneve; + + memset(&geneve, 0, sizeof(geneve)); + geneve.vnid = VN_ID; + geneve.vnport = htons(VN_PORT); + geneve.csum = 1; + geneve.family = family; + if (family == AF_INET) + memcpy(&geneve.remote.r4, remote, sizeof(struct in_addr)); + else + memcpy(&geneve.remote.r6, remote, sizeof(struct in6_addr)); + memcpy(geneve.hwaddr, hwaddr, ETH_ALEN); + + return ip_link_add(dev, "geneve", geneve_fill_newlink, (void *)&geneve); +} + +static int set_pmtu_discover(int fd, bool is_ipv4) +{ + int level, name, val; + + if (is_ipv4) { + level = SOL_IP; + name = IP_MTU_DISCOVER; + val = IP_PMTUDISC_DO; + } else { + level = SOL_IPV6; + name = IPV6_MTU_DISCOVER; + val = IPV6_PMTUDISC_DO; + } + + return setsockopt(fd, level, name, &val, sizeof(val)); +} + +static int udp_socket_open(struct sockaddr_storage *ssa, bool do_frag, + bool do_connect, struct sockaddr_storage *dsa) +{ + struct timeval to = { .tv_sec = TIMEOUT_SEC }; + int fd, family = ssa->ss_family; + int salen = sockaddr_len(family); + + fd = socket(family, SOCK_DGRAM, 0); + if (fd < 0) + return -1; + + if (bind(fd, (struct sockaddr *)ssa, salen) < 0) { + perror("bind"); + goto err; + } + + if (do_connect && connect(fd, (struct sockaddr *)dsa, salen) < 0) { + perror("connect"); + goto err; + } + + if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &to, sizeof(to)) < 0) { + perror("setsockopt(SO_RCVTIMEO)"); + goto err; + } + + if (!do_frag && set_pmtu_discover(fd, family == AF_INET) < 0) { + perror("set_pmtu_discover"); + goto err; + } + return fd; + +err: + close(fd); + return -1; +} + +static void parse_route_rsp(struct rt_route_getroute_rsp *rsp, void *rtm_type) +{ + *(uint8_t *)rtm_type = rsp->_hdr.rtm_type; +} + +static int ip_route_check(const char *intf, int family, void *addr) +{ + uint8_t rtm_type, table = RT_TABLE_LOCAL; + int retries = MAX_RETRIES; + + while (retries-- > 0) { + if (ip_route_get(intf, family, table, addr, parse_route_rsp, + &rtm_type) == 0 && + rtm_type == RTN_LOCAL) + break; + + usleep(TIMEOUT_USEC); + } + + if (retries < 0) + return -1; + + return 0; +} + FIXTURE(tun) { char ifname[IFNAMSIZ]; @@ -129,4 +399,159 @@ TEST_F(tun, reattach_close_delete) EXPECT_EQ(tun_delete(self->ifname), 0); } +FIXTURE(tun_vnet_udptnl) +{ + char ifname[IFNAMSIZ]; + int fd, sock; +}; + +FIXTURE_VARIANT(tun_vnet_udptnl) +{ + int tunnel_type; + bool is_tap; +}; + +/* clang-format off */ +#define TUN_VNET_UDPTNL_VARIANT_ADD(type, desc) \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##udptnl) { \ + .tunnel_type = type, \ + .is_tap = true, \ + } +/* clang-format on */ + +TUN_VNET_UDPTNL_VARIANT_ADD(UDP_TUNNEL_GENEVE_4IN4, 4in4); +TUN_VNET_UDPTNL_VARIANT_ADD(UDP_TUNNEL_GENEVE_6IN4, 6in4); +TUN_VNET_UDPTNL_VARIANT_ADD(UDP_TUNNEL_GENEVE_4IN6, 4in6); +TUN_VNET_UDPTNL_VARIANT_ADD(UDP_TUNNEL_GENEVE_6IN6, 6in6); + +static void assign_ifaddr_vars(int family, int is_outer, void **srcip, + void **dstip, void **srcmac, void **dstmac) +{ + if (is_outer) { + if (family == AF_INET) { + *srcip = (void *)¶m_ipaddr4_outer_src; + *dstip = (void *)¶m_ipaddr4_outer_dst; + } else { + *srcip = (void *)¶m_ipaddr6_outer_src; + *dstip = (void *)¶m_ipaddr6_outer_dst; + } + *srcmac = param_hwaddr_outer_src; + *dstmac = param_hwaddr_outer_dst; + } else { + if (family == AF_INET) { + *srcip = (void *)¶m_ipaddr4_inner_src; + *dstip = (void *)¶m_ipaddr4_inner_dst; + } else { + *srcip = (void *)¶m_ipaddr6_inner_src; + *dstip = (void *)¶m_ipaddr6_inner_dst; + } + *srcmac = param_hwaddr_inner_src; + *dstmac = param_hwaddr_inner_dst; + } +} + +static void assign_sockaddr_vars(int family, int is_outer, + struct sockaddr_storage *src, + struct sockaddr_storage *dst) +{ + src->ss_family = family; + dst->ss_family = family; + + if (family == AF_INET) { + struct sockaddr_in *s4 = (struct sockaddr_in *)src; + struct sockaddr_in *d4 = (struct sockaddr_in *)dst; + + s4->sin_addr = is_outer ? param_ipaddr4_outer_src : + param_ipaddr4_inner_src; + d4->sin_addr = is_outer ? param_ipaddr4_outer_dst : + param_ipaddr4_inner_dst; + if (!is_outer) { + s4->sin_port = htons(UDP_SRC_PORT); + d4->sin_port = htons(UDP_DST_PORT); + } + } else { + struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)src; + struct sockaddr_in6 *d6 = (struct sockaddr_in6 *)dst; + + s6->sin6_addr = is_outer ? param_ipaddr6_outer_src : + param_ipaddr6_inner_src; + d6->sin6_addr = is_outer ? param_ipaddr6_outer_dst : + param_ipaddr6_inner_dst; + if (!is_outer) { + s6->sin6_port = htons(UDP_SRC_PORT); + d6->sin6_port = htons(UDP_DST_PORT); + } + } +} + +FIXTURE_SETUP(tun_vnet_udptnl) +{ + int ret, family, prefix, flags, features; + int tunnel_type = variant->tunnel_type; + struct sockaddr_storage ssa, dsa; + void *sip, *dip, *smac, *dmac; + + flags = (variant->is_tap ? IFF_TAP : IFF_TUN) | IFF_VNET_HDR | + IFF_MULTI_QUEUE | IFF_NO_PI; + features = TUN_F_CSUM | TUN_F_UDP_TUNNEL_GSO | + TUN_F_UDP_TUNNEL_GSO_CSUM | TUN_F_USO4 | TUN_F_USO6; + self->fd = tun_open(self->ifname, flags, TUN_VNET_TNL_SIZE, features, + param_hwaddr_outer_src); + ASSERT_GE(self->fd, 0); + + family = (tunnel_type & UDP_TUNNEL_OUTER_IPV4) ? AF_INET : AF_INET6; + prefix = (family == AF_INET) ? IPPREFIX_LEN : IP6PREFIX_LEN; + assign_ifaddr_vars(family, 1, &sip, &dip, &smac, &dmac); + + ret = ip_addr_add(self->ifname, family, sip, prefix); + ASSERT_EQ(ret, 0); + ret = ip_neigh_add(self->ifname, family, dip, dmac); + ASSERT_EQ(ret, 0); + ret = ip_route_check(self->ifname, family, sip); + ASSERT_EQ(ret, 0); + + ret = geneve_create(param_dev_geneve_name, family, dip, + param_hwaddr_inner_src); + ASSERT_EQ(ret, 0); + + family = (tunnel_type & UDP_TUNNEL_INNER_IPV4) ? AF_INET : AF_INET6; + prefix = (family == AF_INET) ? IPPREFIX_LEN : IP6PREFIX_LEN; + assign_ifaddr_vars(family, 0, &sip, &dip, &smac, &dmac); + + ret = ip_addr_add(param_dev_geneve_name, family, sip, prefix); + ASSERT_EQ(ret, 0); + ret = ip_neigh_add(param_dev_geneve_name, family, dip, dmac); + ASSERT_EQ(ret, 0); + ret = ip_route_check(param_dev_geneve_name, family, sip); + ASSERT_EQ(ret, 0); + + assign_sockaddr_vars(family, 0, &ssa, &dsa); + self->sock = udp_socket_open(&ssa, false, true, &dsa); + ASSERT_GE(self->sock, 0); +} + +FIXTURE_TEARDOWN(tun_vnet_udptnl) +{ + int ret; + + if (self->sock != -1) + close(self->sock); + + ret = ip_link_del(param_dev_geneve_name); + EXPECT_EQ(ret, 0); + + ret = tun_delete(self->ifname); + EXPECT_EQ(ret, 0); +} + +TEST_F(tun_vnet_udptnl, basic) +{ + int ret; + char cmd[256] = { 0 }; + + sprintf(cmd, "ip addr show %s > /dev/null 2>&1", param_dev_geneve_name); + ret = system(cmd); + ASSERT_EQ(ret, 0); +} + TEST_HARNESS_MAIN -- cgit v1.2.3 From 400e658aa096cda99b37ce806ed63cfe894c9566 Mon Sep 17 00:00:00 2001 From: Xu Du Date: Wed, 21 Jan 2026 18:04:59 +0800 Subject: selftest: tun: Add test for sending gso packet into tun The test constructs a raw packet, prepends a virtio_net_hdr, and writes the result to the TUN device. This mimics the behavior of a vm forwarding a guest's packet to the host networking stack. Signed-off-by: Xu Du Link: https://patch.msgid.link/a988dbc9ca109e4f1f0b33858c5035bce8ebede3.1768979440.git.xudu@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tun.c | 144 +++++++++++++++++++++++++++++++++++--- 1 file changed, 135 insertions(+), 9 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/tun.c b/tools/testing/selftests/net/tun.c index ec089355312b..f0ddb5d37683 100644 --- a/tools/testing/selftests/net/tun.c +++ b/tools/testing/selftests/net/tun.c @@ -75,7 +75,34 @@ static struct in6_addr param_ipaddr6_inner_src = { #define UDP_TUNNEL_OUTER_IPV4 (UDP_TUNNEL_GENEVE_4IN4 | UDP_TUNNEL_GENEVE_6IN4) #define UDP_TUNNEL_INNER_IPV4 (UDP_TUNNEL_GENEVE_4IN4 | UDP_TUNNEL_GENEVE_4IN6) +#define UDP_TUNNEL_GENEVE_4IN4_HDRLEN \ + (ETH_HLEN + 2 * sizeof(struct iphdr) + GENEVE_HLEN + \ + 2 * sizeof(struct udphdr)) +#define UDP_TUNNEL_GENEVE_6IN6_HDRLEN \ + (ETH_HLEN + 2 * sizeof(struct ipv6hdr) + GENEVE_HLEN + \ + 2 * sizeof(struct udphdr)) +#define UDP_TUNNEL_GENEVE_4IN6_HDRLEN \ + (ETH_HLEN + sizeof(struct iphdr) + sizeof(struct ipv6hdr) + \ + GENEVE_HLEN + 2 * sizeof(struct udphdr)) +#define UDP_TUNNEL_GENEVE_6IN4_HDRLEN \ + (ETH_HLEN + sizeof(struct ipv6hdr) + sizeof(struct iphdr) + \ + GENEVE_HLEN + 2 * sizeof(struct udphdr)) + +#define UDP_TUNNEL_HDRLEN(type) \ + ((type) == UDP_TUNNEL_GENEVE_4IN4 ? UDP_TUNNEL_GENEVE_4IN4_HDRLEN : \ + (type) == UDP_TUNNEL_GENEVE_6IN6 ? UDP_TUNNEL_GENEVE_6IN6_HDRLEN : \ + (type) == UDP_TUNNEL_GENEVE_4IN6 ? UDP_TUNNEL_GENEVE_4IN6_HDRLEN : \ + (type) == UDP_TUNNEL_GENEVE_6IN4 ? UDP_TUNNEL_GENEVE_6IN4_HDRLEN : \ + 0) + +#define UDP_TUNNEL_MSS(type) (ETH_DATA_LEN - UDP_TUNNEL_HDRLEN(type)) +#define UDP_TUNNEL_MAX(type, is_tap) \ + (ETH_MAX_MTU - UDP_TUNNEL_HDRLEN(type) - ((is_tap) ? ETH_HLEN : 0)) + #define TUN_VNET_TNL_SIZE sizeof(struct virtio_net_hdr_v1_hash_tunnel) +#define MAX_VNET_TUNNEL_PACKET_SZ \ + (TUN_VNET_TNL_SIZE + ETH_HLEN + UDP_TUNNEL_GENEVE_6IN6_HDRLEN + \ + ETH_MAX_MTU) struct geneve_setup_config { int family; @@ -408,15 +435,23 @@ FIXTURE(tun_vnet_udptnl) FIXTURE_VARIANT(tun_vnet_udptnl) { int tunnel_type; - bool is_tap; + int gso_size; + int data_size; + int r_num_mss; + bool is_tap, no_gso; }; /* clang-format off */ #define TUN_VNET_UDPTNL_VARIANT_ADD(type, desc) \ - FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##udptnl) { \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_1mss) { \ + /* send a single MSS: fall back to no GSO */ \ .tunnel_type = type, \ + .gso_size = UDP_TUNNEL_MSS(type), \ + .data_size = UDP_TUNNEL_MSS(type), \ + .r_num_mss = 1, \ .is_tap = true, \ - } + .no_gso = true, \ + }; /* clang-format on */ TUN_VNET_UDPTNL_VARIANT_ADD(UDP_TUNNEL_GENEVE_4IN4, 4in4); @@ -544,14 +579,105 @@ FIXTURE_TEARDOWN(tun_vnet_udptnl) EXPECT_EQ(ret, 0); } -TEST_F(tun_vnet_udptnl, basic) +static int build_gso_packet_into_tun(const FIXTURE_VARIANT(tun_vnet_udptnl) * + variant, + uint8_t *buf) { - int ret; - char cmd[256] = { 0 }; + int pktlen, hlen, proto, inner_family, outer_family; + int tunnel_type = variant->tunnel_type; + int payload_len = variant->data_size; + int gso_size = variant->gso_size; + uint8_t *outer_udph, *cur = buf; + void *sip, *dip, *smac, *dmac; + bool is_tap = variant->is_tap; - sprintf(cmd, "ip addr show %s > /dev/null 2>&1", param_dev_geneve_name); - ret = system(cmd); - ASSERT_EQ(ret, 0); + hlen = (is_tap ? ETH_HLEN : 0) + UDP_TUNNEL_HDRLEN(tunnel_type); + inner_family = (tunnel_type & UDP_TUNNEL_INNER_IPV4) ? AF_INET : + AF_INET6; + outer_family = (tunnel_type & UDP_TUNNEL_OUTER_IPV4) ? AF_INET : + AF_INET6; + + cur += build_virtio_net_hdr_v1_hash_tunnel(cur, is_tap, hlen, gso_size, + outer_family, inner_family); + + pktlen = hlen + payload_len; + assign_ifaddr_vars(outer_family, 1, &sip, &dip, &smac, &dmac); + + if (is_tap) { + proto = outer_family == AF_INET ? ETH_P_IP : ETH_P_IPV6; + pktlen -= ETH_HLEN; + cur += build_eth(cur, proto, dmac, smac); + } + + if (outer_family == AF_INET) { + pktlen = pktlen - sizeof(struct iphdr); + cur += build_ipv4_header(cur, IPPROTO_UDP, pktlen, dip, sip); + } else { + pktlen = pktlen - sizeof(struct ipv6hdr); + cur += build_ipv6_header(cur, IPPROTO_UDP, 0, pktlen, dip, sip); + } + + outer_udph = cur; + assign_ifaddr_vars(inner_family, 0, &sip, &dip, &smac, &dmac); + + pktlen -= sizeof(struct udphdr); + proto = inner_family == AF_INET ? ETH_P_IP : ETH_P_IPV6; + cur += build_udp_header(cur, UDP_SRC_PORT, VN_PORT, pktlen); + cur += build_geneve_header(cur, VN_ID); + cur += build_eth(cur, proto, dmac, smac); + + pktlen = sizeof(struct udphdr) + payload_len; + if (inner_family == AF_INET) + cur += build_ipv4_header(cur, IPPROTO_UDP, pktlen, dip, sip); + else + cur += build_ipv6_header(cur, IPPROTO_UDP, 0, pktlen, dip, sip); + + cur += build_udp_packet(cur, UDP_DST_PORT, UDP_SRC_PORT, payload_len, + inner_family, false); + + build_udp_packet_csum(outer_udph, outer_family, false); + + return cur - buf; +} + +static int +receive_gso_packet_from_tunnel(FIXTURE_DATA(tun_vnet_udptnl) * self, + const FIXTURE_VARIANT(tun_vnet_udptnl) * variant, + int *r_num_mss) +{ + uint8_t packet_buf[MAX_VNET_TUNNEL_PACKET_SZ]; + int len, total_len = 0, socket = self->sock; + int payload_len = variant->data_size; + + while (total_len < payload_len) { + len = recv(socket, packet_buf, sizeof(packet_buf), 0); + if (len <= 0) { + if (len < 0 && errno != EAGAIN && errno != EWOULDBLOCK) + perror("recv"); + break; + } + + (*r_num_mss)++; + total_len += len; + } + + return total_len; +} + +TEST_F(tun_vnet_udptnl, send_gso_packet) +{ + uint8_t pkt[MAX_VNET_TUNNEL_PACKET_SZ]; + int r_num_mss = 0; + int ret, off; + + memset(pkt, 0, sizeof(pkt)); + off = build_gso_packet_into_tun(variant, pkt); + ret = write(self->fd, pkt, off); + ASSERT_EQ(ret, off); + + ret = receive_gso_packet_from_tunnel(self, variant, &r_num_mss); + ASSERT_EQ(ret, variant->data_size); + ASSERT_EQ(r_num_mss, variant->r_num_mss); } TEST_HARNESS_MAIN -- cgit v1.2.3 From 6bdd7ae6059ec3c19f4717387706f92cd8b715a6 Mon Sep 17 00:00:00 2001 From: Xu Du Date: Wed, 21 Jan 2026 18:05:00 +0800 Subject: selftest: tun: Add test for receiving gso packet from tun The test validate that GSO information are correctly exposed when reading packets from a TUN device. Signed-off-by: Xu Du Link: https://patch.msgid.link/fe75ac66466380490eba858eef50596a1bfbd071.1768979440.git.xudu@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tun.c | 194 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 194 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/tun.c b/tools/testing/selftests/net/tun.c index f0ddb5d37683..628ae2caf6f4 100644 --- a/tools/testing/selftests/net/tun.c +++ b/tools/testing/selftests/net/tun.c @@ -364,6 +364,116 @@ static int ip_route_check(const char *intf, int family, void *addr) return 0; } +static int send_gso_udp_msg(int socket, struct sockaddr_storage *addr, + uint8_t *send_buf, int send_len, int gso_size) +{ + char control[CMSG_SPACE(sizeof(uint16_t))] = { 0 }; + int alen = sockaddr_len(addr->ss_family); + struct msghdr msg = { 0 }; + struct iovec iov = { 0 }; + int ret; + + iov.iov_base = send_buf; + iov.iov_len = send_len; + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_name = addr; + msg.msg_namelen = alen; + + if (gso_size > 0) { + struct cmsghdr *cmsg; + + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_UDP; + cmsg->cmsg_type = UDP_SEGMENT; + cmsg->cmsg_len = CMSG_LEN(sizeof(uint16_t)); + *(uint16_t *)CMSG_DATA(cmsg) = gso_size; + } + + ret = sendmsg(socket, &msg, 0); + if (ret < 0) + perror("sendmsg"); + + return ret; +} + +static int validate_hdrlen(uint8_t **cur, int *len, int x) +{ + if (*len < x) + return -1; + *cur += x; + *len -= x; + return 0; +} + +static int parse_udp_tunnel_vnet_packet(uint8_t *buf, int len, int tunnel_type, + bool is_tap) +{ + struct ipv6hdr *iph6; + struct udphdr *udph; + struct iphdr *iph4; + uint8_t *cur = buf; + + if (validate_hdrlen(&cur, &len, TUN_VNET_TNL_SIZE)) + return -1; + + if (is_tap) { + if (validate_hdrlen(&cur, &len, ETH_HLEN)) + return -1; + } + + if (tunnel_type & UDP_TUNNEL_OUTER_IPV4) { + iph4 = (struct iphdr *)cur; + if (validate_hdrlen(&cur, &len, sizeof(struct iphdr))) + return -1; + if (iph4->version != 4 || iph4->protocol != IPPROTO_UDP) + return -1; + } else { + iph6 = (struct ipv6hdr *)cur; + if (validate_hdrlen(&cur, &len, sizeof(struct ipv6hdr))) + return -1; + if (iph6->version != 6 || iph6->nexthdr != IPPROTO_UDP) + return -1; + } + + udph = (struct udphdr *)cur; + if (validate_hdrlen(&cur, &len, sizeof(struct udphdr))) + return -1; + if (ntohs(udph->dest) != VN_PORT) + return -1; + + if (validate_hdrlen(&cur, &len, GENEVE_HLEN)) + return -1; + if (validate_hdrlen(&cur, &len, ETH_HLEN)) + return -1; + + if (tunnel_type & UDP_TUNNEL_INNER_IPV4) { + iph4 = (struct iphdr *)cur; + if (validate_hdrlen(&cur, &len, sizeof(struct iphdr))) + return -1; + if (iph4->version != 4 || iph4->protocol != IPPROTO_UDP) + return -1; + } else { + iph6 = (struct ipv6hdr *)cur; + if (validate_hdrlen(&cur, &len, sizeof(struct ipv6hdr))) + return -1; + if (iph6->version != 6 || iph6->nexthdr != IPPROTO_UDP) + return -1; + } + + udph = (struct udphdr *)cur; + if (validate_hdrlen(&cur, &len, sizeof(struct udphdr))) + return -1; + if (ntohs(udph->dest) != UDP_DST_PORT) + return -1; + + return len; +} + FIXTURE(tun) { char ifname[IFNAMSIZ]; @@ -664,6 +774,68 @@ receive_gso_packet_from_tunnel(FIXTURE_DATA(tun_vnet_udptnl) * self, return total_len; } +static int send_gso_packet_into_tunnel(FIXTURE_DATA(tun_vnet_udptnl) * self, + const FIXTURE_VARIANT(tun_vnet_udptnl) * + variant) +{ + int family = (variant->tunnel_type & UDP_TUNNEL_INNER_IPV4) ? AF_INET : + AF_INET6; + uint8_t buf[MAX_VNET_TUNNEL_PACKET_SZ] = { 0 }; + int payload_len = variant->data_size; + int gso_size = variant->gso_size; + struct sockaddr_storage ssa, dsa; + + assign_sockaddr_vars(family, 0, &ssa, &dsa); + return send_gso_udp_msg(self->sock, &dsa, buf, payload_len, gso_size); +} + +static int +receive_gso_packet_from_tun(FIXTURE_DATA(tun_vnet_udptnl) * self, + const FIXTURE_VARIANT(tun_vnet_udptnl) * variant, + struct virtio_net_hdr_v1_hash_tunnel *vnet_hdr) +{ + struct timeval timeout = { .tv_sec = TIMEOUT_SEC }; + uint8_t buf[MAX_VNET_TUNNEL_PACKET_SZ]; + int tunnel_type = variant->tunnel_type; + int payload_len = variant->data_size; + bool is_tap = variant->is_tap; + int ret, len, total_len = 0; + int tun_fd = self->fd; + fd_set fdset; + + while (total_len < payload_len) { + FD_ZERO(&fdset); + FD_SET(tun_fd, &fdset); + + ret = select(tun_fd + 1, &fdset, NULL, NULL, &timeout); + if (ret <= 0) { + perror("select"); + break; + } + if (!FD_ISSET(tun_fd, &fdset)) + continue; + + len = read(tun_fd, buf, sizeof(buf)); + if (len <= 0) { + if (len < 0 && errno != EAGAIN && errno != EWOULDBLOCK) + perror("read"); + break; + } + + len = parse_udp_tunnel_vnet_packet(buf, len, tunnel_type, + is_tap); + if (len < 0) + continue; + + if (total_len == 0) + memcpy(vnet_hdr, buf, TUN_VNET_TNL_SIZE); + + total_len += len; + } + + return total_len; +} + TEST_F(tun_vnet_udptnl, send_gso_packet) { uint8_t pkt[MAX_VNET_TUNNEL_PACKET_SZ]; @@ -680,4 +852,26 @@ TEST_F(tun_vnet_udptnl, send_gso_packet) ASSERT_EQ(r_num_mss, variant->r_num_mss); } +TEST_F(tun_vnet_udptnl, recv_gso_packet) +{ + struct virtio_net_hdr_v1_hash_tunnel vnet_hdr = { 0 }; + struct virtio_net_hdr_v1 *vh = &vnet_hdr.hash_hdr.hdr; + int ret, gso_type = VIRTIO_NET_HDR_GSO_UDP_L4; + + ret = send_gso_packet_into_tunnel(self, variant); + ASSERT_EQ(ret, variant->data_size); + + memset(&vnet_hdr, 0, sizeof(vnet_hdr)); + ret = receive_gso_packet_from_tun(self, variant, &vnet_hdr); + ASSERT_EQ(ret, variant->data_size); + + if (!variant->no_gso) { + ASSERT_EQ(vh->gso_size, variant->gso_size); + gso_type |= (variant->tunnel_type & UDP_TUNNEL_OUTER_IPV4) ? + (VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV4) : + (VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6); + ASSERT_EQ(vh->gso_type, gso_type); + } +} + TEST_HARNESS_MAIN -- cgit v1.2.3 From 87db2fdef5a7c2ea8a404afd402800d48940d6b2 Mon Sep 17 00:00:00 2001 From: Xu Du Date: Wed, 21 Jan 2026 18:05:01 +0800 Subject: selftest: tun: Add test data for success and failure paths To improve the robustness and coverage of the TUN selftests, this patch expands the set of test data. Signed-off-by: Xu Du Link: https://patch.msgid.link/5054f3ad9f3dbfe33b827183fccc5efeb8fd0da7.1768979440.git.xudu@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tun.c | 115 +++++++++++++++++++++++++++++++++++++- 1 file changed, 113 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/tun.c b/tools/testing/selftests/net/tun.c index 628ae2caf6f4..8a5cd5cb5472 100644 --- a/tools/testing/selftests/net/tun.c +++ b/tools/testing/selftests/net/tun.c @@ -57,6 +57,10 @@ static struct in6_addr param_ipaddr6_inner_src = { { { 0x20, 0x02, 0x0d, 0xb8, 0x02, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 } }, }; +#ifndef BIT +#define BIT(nr) (1UL << (nr)) +#endif + #define VN_ID 1 #define VN_PORT 4789 #define UDP_SRC_PORT 22 @@ -72,6 +76,8 @@ static struct in6_addr param_ipaddr6_inner_src = { #define UDP_TUNNEL_GENEVE_4IN6 0x04 #define UDP_TUNNEL_GENEVE_6IN6 0x08 +#define UDP_TUNNEL_MAX_SEGMENTS BIT(7) + #define UDP_TUNNEL_OUTER_IPV4 (UDP_TUNNEL_GENEVE_4IN4 | UDP_TUNNEL_GENEVE_6IN4) #define UDP_TUNNEL_INNER_IPV4 (UDP_TUNNEL_GENEVE_4IN4 | UDP_TUNNEL_GENEVE_4IN6) @@ -553,6 +559,39 @@ FIXTURE_VARIANT(tun_vnet_udptnl) /* clang-format off */ #define TUN_VNET_UDPTNL_VARIANT_ADD(type, desc) \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_nogsosz_1byte) { \ + /* no GSO: send a single byte */ \ + .tunnel_type = type, \ + .data_size = 1, \ + .r_num_mss = 1, \ + .is_tap = true, \ + .no_gso = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_nogsosz_1mss) { \ + /* no GSO: send a single MSS, fall back to no GSO */ \ + .tunnel_type = type, \ + .data_size = UDP_TUNNEL_MSS(type), \ + .r_num_mss = 1, \ + .is_tap = true, \ + .no_gso = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_nogsosz_gtmss) { \ + /* no GSO: send a single MSS + 1B: fail */ \ + .tunnel_type = type, \ + .data_size = UDP_TUNNEL_MSS(type) + 1, \ + .r_num_mss = 1, \ + .is_tap = true, \ + .no_gso = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_1byte) { \ + /* GSO: send 1 byte, gso 1 byte, fall back to no GSO */ \ + .tunnel_type = type, \ + .gso_size = 1, \ + .data_size = 1, \ + .r_num_mss = 1, \ + .is_tap = true, \ + .no_gso = true, \ + }; \ FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_1mss) { \ /* send a single MSS: fall back to no GSO */ \ .tunnel_type = type, \ @@ -561,8 +600,65 @@ FIXTURE_VARIANT(tun_vnet_udptnl) .r_num_mss = 1, \ .is_tap = true, \ .no_gso = true, \ - }; -/* clang-format on */ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_ltgso) { \ + /* data <= MSS < gso: will fall back to no GSO */ \ + .tunnel_type = type, \ + .gso_size = UDP_TUNNEL_MSS(type) + 1, \ + .data_size = UDP_TUNNEL_MSS(type), \ + .r_num_mss = 1, \ + .is_tap = true, \ + .no_gso = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_gtgso) { \ + /* GSO: a single MSS + 1B */ \ + .tunnel_type = type, \ + .gso_size = UDP_TUNNEL_MSS(type), \ + .data_size = UDP_TUNNEL_MSS(type) + 1, \ + .r_num_mss = 2, \ + .is_tap = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_2mss) { \ + /* no GSO: send exactly 2 MSS */ \ + .tunnel_type = type, \ + .gso_size = UDP_TUNNEL_MSS(type), \ + .data_size = UDP_TUNNEL_MSS(type) * 2, \ + .r_num_mss = 2, \ + .is_tap = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_maxbytes) { \ + /* GSO: send max bytes */ \ + .tunnel_type = type, \ + .gso_size = UDP_TUNNEL_MSS(type), \ + .data_size = UDP_TUNNEL_MAX(type, true), \ + .r_num_mss = UDP_TUNNEL_MAX(type, true) / \ + UDP_TUNNEL_MSS(type) + 1, \ + .is_tap = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_over_maxbytes) { \ + /* GSO: send oversize max bytes: fail */ \ + .tunnel_type = type, \ + .gso_size = UDP_TUNNEL_MSS(type), \ + .data_size = ETH_MAX_MTU, \ + .r_num_mss = ETH_MAX_MTU / UDP_TUNNEL_MSS(type) + 1, \ + .is_tap = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_maxsegs) { \ + /* GSO: send max number of min sized segments */ \ + .tunnel_type = type, \ + .gso_size = 1, \ + .data_size = UDP_TUNNEL_MAX_SEGMENTS, \ + .r_num_mss = UDP_TUNNEL_MAX_SEGMENTS, \ + .is_tap = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_5byte) { \ + /* GSO: send 5 bytes, gso 2 bytes */ \ + .tunnel_type = type, \ + .gso_size = 2, \ + .data_size = 5, \ + .r_num_mss = 3, \ + .is_tap = true, \ + } /* clang-format on */ TUN_VNET_UDPTNL_VARIANT_ADD(UDP_TUNNEL_GENEVE_4IN4, 4in4); TUN_VNET_UDPTNL_VARIANT_ADD(UDP_TUNNEL_GENEVE_6IN4, 6in4); @@ -874,4 +970,19 @@ TEST_F(tun_vnet_udptnl, recv_gso_packet) } } +XFAIL_ADD(tun_vnet_udptnl, 4in4_nogsosz_gtmss, recv_gso_packet); +XFAIL_ADD(tun_vnet_udptnl, 6in4_nogsosz_gtmss, recv_gso_packet); +XFAIL_ADD(tun_vnet_udptnl, 4in6_nogsosz_gtmss, recv_gso_packet); +XFAIL_ADD(tun_vnet_udptnl, 6in6_nogsosz_gtmss, recv_gso_packet); + +XFAIL_ADD(tun_vnet_udptnl, 4in4_over_maxbytes, send_gso_packet); +XFAIL_ADD(tun_vnet_udptnl, 6in4_over_maxbytes, send_gso_packet); +XFAIL_ADD(tun_vnet_udptnl, 4in6_over_maxbytes, send_gso_packet); +XFAIL_ADD(tun_vnet_udptnl, 6in6_over_maxbytes, send_gso_packet); + +XFAIL_ADD(tun_vnet_udptnl, 4in4_over_maxbytes, recv_gso_packet); +XFAIL_ADD(tun_vnet_udptnl, 6in4_over_maxbytes, recv_gso_packet); +XFAIL_ADD(tun_vnet_udptnl, 4in6_over_maxbytes, recv_gso_packet); +XFAIL_ADD(tun_vnet_udptnl, 6in6_over_maxbytes, recv_gso_packet); + TEST_HARNESS_MAIN -- cgit v1.2.3 From d8df878140506e7938928195f540c10b1089fdaf Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 22 Jan 2026 21:51:22 -0800 Subject: selftests/bpf: Fix task_local_data failure with 64K page On arm64 systems with 64K pages, the selftest task_local_data has the following failures: ... test_task_local_data_basic:PASS:tld_create_key 0 nsec test_task_local_data_basic:FAIL:tld_create_key unexpected tld_create_key: actual 0 != expected -28 ... test_task_local_data_basic_thread:PASS:run task_main 0 nsec test_task_local_data_basic_thread:FAIL:task_main retval unexpected error: 2 (errno 0) test_task_local_data_basic_thread:FAIL:tld_get_data value0 unexpected tld_get_data value0: actual 0 != expected 6268 ... #447/1 task_local_data/task_local_data_basic:FAIL ... #447/2 task_local_data/task_local_data_race:FAIL #447 task_local_data:FAIL When TLD_DYN_DATA_SIZE is 64K page size, for struct tld_meta_u { _Atomic __u8 cnt; __u16 size; struct tld_metadata metadata[]; }; field 'cnt' would overflow. For example, for 4K page, 'cnt' will be 4096/64 = 64. But for 64K page, 'cnt' will be 65536/64 = 1024 and 'cnt' is not enough for 1024. To accommodate 64K page, '_Atomic __u8 cnt' becomes '_Atomic __u16 cnt'. A few other places are adjusted accordingly. In test_task_local_data.c, the value for TLD_DYN_DATA_SIZE is changed from 4096 to (getpagesize() - 8) since the maximum buffer size for TLD_DYN_DATA_SIZE is (getpagesize() - 8). Reviewed-by: Alan Maguire Tested-by: Alan Maguire Cc: Amery Hung Signed-off-by: Yonghong Song Acked-by: Amery Hung Link: https://lore.kernel.org/r/20260123055122.494352-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/task_local_data.h | 4 ++-- tools/testing/selftests/bpf/prog_tests/test_task_local_data.c | 2 +- tools/testing/selftests/bpf/progs/task_local_data.bpf.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_data.h b/tools/testing/selftests/bpf/prog_tests/task_local_data.h index 2de38776a2d4..0f86b9275cf9 100644 --- a/tools/testing/selftests/bpf/prog_tests/task_local_data.h +++ b/tools/testing/selftests/bpf/prog_tests/task_local_data.h @@ -94,7 +94,7 @@ struct tld_metadata { }; struct tld_meta_u { - _Atomic __u8 cnt; + _Atomic __u16 cnt; __u16 size; struct tld_metadata metadata[]; }; @@ -217,7 +217,7 @@ out: static tld_key_t __tld_create_key(const char *name, size_t size, bool dyn_data) { int err, i, sz, off = 0; - __u8 cnt; + __u16 cnt; if (!TLD_READ_ONCE(tld_meta_p)) { err = __tld_init_meta_p(); diff --git a/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c b/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c index 9fd6306b455c..9556ad3d986f 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c +++ b/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c @@ -4,7 +4,7 @@ #include #define TLD_FREE_DATA_ON_THREAD_EXIT -#define TLD_DYN_DATA_SIZE 4096 +#define TLD_DYN_DATA_SIZE (getpagesize() - 8) #include "task_local_data.h" struct test_tld_struct { diff --git a/tools/testing/selftests/bpf/progs/task_local_data.bpf.h b/tools/testing/selftests/bpf/progs/task_local_data.bpf.h index 432fff2af844..fed53d63a7e5 100644 --- a/tools/testing/selftests/bpf/progs/task_local_data.bpf.h +++ b/tools/testing/selftests/bpf/progs/task_local_data.bpf.h @@ -80,7 +80,7 @@ struct tld_metadata { }; struct tld_meta_u { - __u8 cnt; + __u16 cnt; __u16 size; struct tld_metadata metadata[TLD_MAX_DATA_CNT]; }; -- cgit v1.2.3 From c7900f225a102219f5fe2c1c93a7dec5467315ee Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 22 Jan 2026 21:51:28 -0800 Subject: selftests/bpf: Fix xdp_pull_data failure with 64K page If the argument 'pull_len' of run_test() is 'PULL_MAX' or 'PULL_MAX | PULL_PLUS_ONE', the eventual pull_len size will close to the page size. On arm64 systems with 64K pages, the pull_len size will be close to 64K. But the existing buffer will be close to 9000 which is not enough to pull. For those failed run_tests(), make buff size to pg_sz + (pg_sz / 2) This way, there will be enough buffer space to pull regardless of page size. Tested-by: Alan Maguire Cc: Amery Hung Signed-off-by: Yonghong Song Acked-by: Amery Hung Link: https://lore.kernel.org/r/20260123055128.495265-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c b/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c index efa350d04ec5..910dabe95afd 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c @@ -114,12 +114,14 @@ static void test_xdp_pull_data_basic(void) { u32 pg_sz, max_meta_len, max_data_len; struct test_xdp_pull_data *skel; + int buff_len; skel = test_xdp_pull_data__open_and_load(); if (!ASSERT_OK_PTR(skel, "test_xdp_pull_data__open_and_load")) return; pg_sz = sysconf(_SC_PAGE_SIZE); + buff_len = pg_sz + pg_sz / 2; if (find_xdp_sizes(skel, pg_sz)) goto out; @@ -140,13 +142,13 @@ static void test_xdp_pull_data_basic(void) run_test(skel, XDP_PASS, pg_sz, 9000, 0, 1025, 1025); /* multi-buf pkt, empty linear data area, pull requires memmove */ - run_test(skel, XDP_PASS, pg_sz, 9000, 0, 0, PULL_MAX); + run_test(skel, XDP_PASS, pg_sz, buff_len, 0, 0, PULL_MAX); /* multi-buf pkt, no headroom */ - run_test(skel, XDP_PASS, pg_sz, 9000, max_meta_len, 1024, PULL_MAX); + run_test(skel, XDP_PASS, pg_sz, buff_len, max_meta_len, 1024, PULL_MAX); /* multi-buf pkt, no tailroom, pull requires memmove */ - run_test(skel, XDP_PASS, pg_sz, 9000, 0, max_data_len, PULL_MAX); + run_test(skel, XDP_PASS, pg_sz, buff_len, 0, max_data_len, PULL_MAX); /* Test cases with invalid pull length */ @@ -154,18 +156,18 @@ static void test_xdp_pull_data_basic(void) run_test(skel, XDP_DROP, pg_sz, 2048, 0, 2048, 2049); /* multi-buf pkt with no space left in linear data area */ - run_test(skel, XDP_DROP, pg_sz, 9000, max_meta_len, max_data_len, + run_test(skel, XDP_DROP, pg_sz, buff_len, max_meta_len, max_data_len, PULL_MAX | PULL_PLUS_ONE); /* multi-buf pkt, empty linear data area */ - run_test(skel, XDP_DROP, pg_sz, 9000, 0, 0, PULL_MAX | PULL_PLUS_ONE); + run_test(skel, XDP_DROP, pg_sz, buff_len, 0, 0, PULL_MAX | PULL_PLUS_ONE); /* multi-buf pkt, no headroom */ - run_test(skel, XDP_DROP, pg_sz, 9000, max_meta_len, 1024, + run_test(skel, XDP_DROP, pg_sz, buff_len, max_meta_len, 1024, PULL_MAX | PULL_PLUS_ONE); /* multi-buf pkt, no tailroom */ - run_test(skel, XDP_DROP, pg_sz, 9000, 0, max_data_len, + run_test(skel, XDP_DROP, pg_sz, buff_len, 0, max_data_len, PULL_MAX | PULL_PLUS_ONE); out: -- cgit v1.2.3 From 2d419c44658f75e7655794341a95c0687830f3df Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:19:56 +0800 Subject: bpf: add fsession support The fsession is something that similar to kprobe session. It allow to attach a single BPF program to both the entry and the exit of the target functions. Introduce the struct bpf_fsession_link, which allows to add the link to both the fentry and fexit progs_hlist of the trampoline. Signed-off-by: Menglong Dong Co-developed-by: Leon Hwang Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260124062008.8657-2-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 19 ++++++++ include/uapi/linux/bpf.h | 1 + kernel/bpf/btf.c | 2 + kernel/bpf/syscall.c | 18 +++++++- kernel/bpf/trampoline.c | 53 ++++++++++++++++++---- kernel/bpf/verifier.c | 12 +++-- net/bpf/test_run.c | 1 + net/core/bpf_sk_storage.c | 1 + tools/include/uapi/linux/bpf.h | 1 + .../selftests/bpf/prog_tests/tracing_failure.c | 2 +- 10 files changed, 97 insertions(+), 13 deletions(-) (limited to 'tools/testing') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5936f8e2996f..41228b0add52 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1309,6 +1309,7 @@ enum bpf_tramp_prog_type { BPF_TRAMP_MODIFY_RETURN, BPF_TRAMP_MAX, BPF_TRAMP_REPLACE, /* more than MAX */ + BPF_TRAMP_FSESSION, }; struct bpf_tramp_image { @@ -1875,6 +1876,11 @@ struct bpf_tracing_link { struct bpf_prog *tgt_prog; }; +struct bpf_fsession_link { + struct bpf_tracing_link link; + struct bpf_tramp_link fexit; +}; + struct bpf_raw_tp_link { struct bpf_link link; struct bpf_raw_event_map *btp; @@ -2169,6 +2175,19 @@ static inline void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_op #endif +static inline int bpf_fsession_cnt(struct bpf_tramp_links *links) +{ + struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY]; + int cnt = 0; + + for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) { + if (fentries.links[i]->link.prog->expected_attach_type == BPF_TRACE_FSESSION) + cnt++; + } + + return cnt; +} + int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog, const struct bpf_ctx_arg_aux *info, u32 cnt); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2a2ade4be60f..44e7dbc278e3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1145,6 +1145,7 @@ enum bpf_attach_type { BPF_NETKIT_PEER, BPF_TRACE_KPROBE_SESSION, BPF_TRACE_UPROBE_SESSION, + BPF_TRACE_FSESSION, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index d10b3404260f..8959f3bc1e92 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6219,6 +6219,7 @@ static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: + case BPF_TRACE_FSESSION: /* allow u64* as ctx */ if (btf_is_int(t) && t->size == 8) return 0; @@ -6820,6 +6821,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, fallthrough; case BPF_LSM_CGROUP: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: /* When LSM programs are attached to void LSM hooks * they use FEXIT trampolines and when attached to * int LSM hooks, they use MODIFY_RETURN trampolines. diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3c5c03d43f5f..b9184545c3fd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3577,6 +3577,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, case BPF_PROG_TYPE_TRACING: if (prog->expected_attach_type != BPF_TRACE_FENTRY && prog->expected_attach_type != BPF_TRACE_FEXIT && + prog->expected_attach_type != BPF_TRACE_FSESSION && prog->expected_attach_type != BPF_MODIFY_RETURN) { err = -EINVAL; goto out_put_prog; @@ -3626,7 +3627,21 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id); } - link = kzalloc(sizeof(*link), GFP_USER); + if (prog->expected_attach_type == BPF_TRACE_FSESSION) { + struct bpf_fsession_link *fslink; + + fslink = kzalloc(sizeof(*fslink), GFP_USER); + if (fslink) { + bpf_link_init(&fslink->fexit.link, BPF_LINK_TYPE_TRACING, + &bpf_tracing_link_lops, prog, attach_type); + fslink->fexit.cookie = bpf_cookie; + link = &fslink->link; + } else { + link = NULL; + } + } else { + link = kzalloc(sizeof(*link), GFP_USER); + } if (!link) { err = -ENOMEM; goto out_put_prog; @@ -4350,6 +4365,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_TRACE_RAW_TP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: case BPF_MODIFY_RETURN: return BPF_PROG_TYPE_TRACING; case BPF_LSM_MAC: diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 2a125d063e62..edf9da43762d 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -109,10 +109,17 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog) enum bpf_attach_type eatype = prog->expected_attach_type; enum bpf_prog_type ptype = prog->type; - return (ptype == BPF_PROG_TYPE_TRACING && - (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || - eatype == BPF_MODIFY_RETURN)) || - (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC); + switch (ptype) { + case BPF_PROG_TYPE_TRACING: + if (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || + eatype == BPF_MODIFY_RETURN || eatype == BPF_TRACE_FSESSION) + return true; + return false; + case BPF_PROG_TYPE_LSM: + return eatype == BPF_LSM_MAC; + default: + return false; + } } void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym) @@ -559,6 +566,8 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) return BPF_TRAMP_MODIFY_RETURN; case BPF_TRACE_FEXIT: return BPF_TRAMP_FEXIT; + case BPF_TRACE_FSESSION: + return BPF_TRAMP_FSESSION; case BPF_LSM_MAC: if (!prog->aux->attach_func_proto->type) /* The function returns void, we cannot modify its @@ -594,8 +603,10 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { + struct bpf_fsession_link *fslink = NULL; enum bpf_tramp_prog_type kind; struct bpf_tramp_link *link_exiting; + struct hlist_head *prog_list; int err = 0; int cnt = 0, i; @@ -621,24 +632,43 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, BPF_MOD_JUMP, NULL, link->link.prog->bpf_func); } + if (kind == BPF_TRAMP_FSESSION) { + prog_list = &tr->progs_hlist[BPF_TRAMP_FENTRY]; + cnt++; + } else { + prog_list = &tr->progs_hlist[kind]; + } if (cnt >= BPF_MAX_TRAMP_LINKS) return -E2BIG; if (!hlist_unhashed(&link->tramp_hlist)) /* prog already linked */ return -EBUSY; - hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) { + hlist_for_each_entry(link_exiting, prog_list, tramp_hlist) { if (link_exiting->link.prog != link->link.prog) continue; /* prog already linked */ return -EBUSY; } - hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]); - tr->progs_cnt[kind]++; + hlist_add_head(&link->tramp_hlist, prog_list); + if (kind == BPF_TRAMP_FSESSION) { + tr->progs_cnt[BPF_TRAMP_FENTRY]++; + fslink = container_of(link, struct bpf_fsession_link, link.link); + hlist_add_head(&fslink->fexit.tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]); + tr->progs_cnt[BPF_TRAMP_FEXIT]++; + } else { + tr->progs_cnt[kind]++; + } err = bpf_trampoline_update(tr, true /* lock_direct_mutex */); if (err) { hlist_del_init(&link->tramp_hlist); - tr->progs_cnt[kind]--; + if (kind == BPF_TRAMP_FSESSION) { + tr->progs_cnt[BPF_TRAMP_FENTRY]--; + hlist_del_init(&fslink->fexit.tramp_hlist); + tr->progs_cnt[BPF_TRAMP_FEXIT]--; + } else { + tr->progs_cnt[kind]--; + } } return err; } @@ -672,6 +702,13 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, guard(mutex)(&tgt_prog->aux->ext_mutex); tgt_prog->aux->is_extended = false; return err; + } else if (kind == BPF_TRAMP_FSESSION) { + struct bpf_fsession_link *fslink = + container_of(link, struct bpf_fsession_link, link.link); + + hlist_del_init(&fslink->fexit.tramp_hlist); + tr->progs_cnt[BPF_TRAMP_FEXIT]--; + kind = BPF_TRAMP_FENTRY; } hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c7f5234d5fd2..41bbed6418b5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17848,6 +17848,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char switch (env->prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: range = retval_range(0, 0); break; case BPF_TRACE_RAW_TP: @@ -23774,6 +23775,7 @@ patch_map_ops_generic: if (prog_type == BPF_PROG_TYPE_TRACING && insn->imm == BPF_FUNC_get_func_ret) { if (eatype == BPF_TRACE_FEXIT || + eatype == BPF_TRACE_FSESSION || eatype == BPF_MODIFY_RETURN) { /* Load nr_args from ctx - 8 */ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); @@ -24725,7 +24727,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, if (tgt_prog->type == BPF_PROG_TYPE_TRACING && prog_extension && (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY || - tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) { + tgt_prog->expected_attach_type == BPF_TRACE_FEXIT || + tgt_prog->expected_attach_type == BPF_TRACE_FSESSION)) { /* Program extensions can extend all program types * except fentry/fexit. The reason is the following. * The fentry/fexit programs are used for performance @@ -24740,7 +24743,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, * beyond reasonable stack size. Hence extending fentry * is not allowed. */ - bpf_log(log, "Cannot extend fentry/fexit\n"); + bpf_log(log, "Cannot extend fentry/fexit/fsession\n"); return -EINVAL; } } else { @@ -24824,6 +24827,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, case BPF_LSM_CGROUP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: if (!btf_type_is_func(t)) { bpf_log(log, "attach_btf_id %u is not a function\n", btf_id); @@ -24990,6 +24994,7 @@ static bool can_be_sleepable(struct bpf_prog *prog) case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: case BPF_TRACE_ITER: + case BPF_TRACE_FSESSION: return true; default: return false; @@ -25071,9 +25076,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) tgt_info.tgt_name); return -EINVAL; } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT || + prog->expected_attach_type == BPF_TRACE_FSESSION || prog->expected_attach_type == BPF_MODIFY_RETURN) && btf_id_set_contains(&noreturn_deny, btf_id)) { - verbose(env, "Attaching fexit/fmod_ret to __noreturn function '%s' is rejected.\n", + verbose(env, "Attaching fexit/fsession/fmod_ret to __noreturn function '%s' is rejected.\n", tgt_info.tgt_name); return -EINVAL; } diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 26cfcfdc45eb..178c4738e63b 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -685,6 +685,7 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog, switch (prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: if (bpf_fentry_test1(1) != 2 || bpf_fentry_test2(2, 3) != 5 || bpf_fentry_test3(4, 5, 6) != 15 || diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 850dd736ccd1..de111818f3a0 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -365,6 +365,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) return true; case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: return !!strncmp(prog->aux->attach_func_name, "bpf_sk_storage", strlen("bpf_sk_storage")); default: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b816bc53d2e1..3ca7d76e05f0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1145,6 +1145,7 @@ enum bpf_attach_type { BPF_NETKIT_PEER, BPF_TRACE_KPROBE_SESSION, BPF_TRACE_UPROBE_SESSION, + BPF_TRACE_FSESSION, __MAX_BPF_ATTACH_TYPE }; diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_failure.c b/tools/testing/selftests/bpf/prog_tests/tracing_failure.c index 10e231965589..f9f9e1cb87bf 100644 --- a/tools/testing/selftests/bpf/prog_tests/tracing_failure.c +++ b/tools/testing/selftests/bpf/prog_tests/tracing_failure.c @@ -73,7 +73,7 @@ static void test_tracing_deny(void) static void test_fexit_noreturns(void) { test_tracing_fail_prog("fexit_noreturns", - "Attaching fexit/fmod_ret to __noreturn function 'do_exit' is rejected."); + "Attaching fexit/fsession/fmod_ret to __noreturn function 'do_exit' is rejected."); } void test_tracing_failure(void) -- cgit v1.2.3 From 8fe4dc4f6456b3d2c9e6f8aeb1f978b7bff0f6c8 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:19:58 +0800 Subject: bpf: change prototype of bpf_session_{cookie,is_return} Add the function argument of "void *ctx" to bpf_session_cookie() and bpf_session_is_return(), which is a preparation of the next patch. The two kfunc is seldom used now, so it will not introduce much effect to change their function prototype. Signed-off-by: Menglong Dong Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20260124062008.8657-4-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 +++++- kernel/trace/bpf_trace.c | 4 ++-- tools/testing/selftests/bpf/bpf_kfuncs.h | 3 --- .../selftests/bpf/progs/kprobe_multi_session_cookie.c | 15 +++++++-------- tools/testing/selftests/bpf/progs/uprobe_multi_session.c | 7 +++---- .../selftests/bpf/progs/uprobe_multi_session_cookie.c | 15 +++++++-------- .../selftests/bpf/progs/uprobe_multi_session_recursive.c | 11 +++++------ 7 files changed, 29 insertions(+), 32 deletions(-) (limited to 'tools/testing') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2081343a848d..0fa73d56cb8b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12484,6 +12484,7 @@ enum special_kfunc_type { KF_bpf_arena_alloc_pages, KF_bpf_arena_free_pages, KF_bpf_arena_reserve_pages, + KF_bpf_session_is_return, }; BTF_ID_LIST(special_kfunc_list) @@ -12561,6 +12562,7 @@ BTF_ID(func, bpf_task_work_schedule_resume) BTF_ID(func, bpf_arena_alloc_pages) BTF_ID(func, bpf_arena_free_pages) BTF_ID(func, bpf_arena_reserve_pages) +BTF_ID(func, bpf_session_is_return) static bool is_task_work_add_kfunc(u32 func_id) { @@ -12615,7 +12617,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg = ®s[regno]; bool arg_mem_size = false; - if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) + if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] || + meta->func_id == special_kfunc_list[KF_bpf_session_is_return] || + meta->func_id == special_kfunc_list[KF_bpf_session_cookie]) return KF_ARG_PTR_TO_CTX; if (argno + 1 < nargs && diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d466a1503da3..13f0a2de33b7 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3323,7 +3323,7 @@ static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx) __bpf_kfunc_start_defs(); -__bpf_kfunc bool bpf_session_is_return(void) +__bpf_kfunc bool bpf_session_is_return(void *ctx) { struct bpf_session_run_ctx *session_ctx; @@ -3331,7 +3331,7 @@ __bpf_kfunc bool bpf_session_is_return(void) return session_ctx->is_return; } -__bpf_kfunc __u64 *bpf_session_cookie(void) +__bpf_kfunc __u64 *bpf_session_cookie(void *ctx) { struct bpf_session_run_ctx *session_ctx; diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h index e0189254bb6e..7dad01439391 100644 --- a/tools/testing/selftests/bpf/bpf_kfuncs.h +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h @@ -79,9 +79,6 @@ extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_ptr, struct bpf_dynptr *sig_ptr, struct bpf_key *trusted_keyring) __ksym; -extern bool bpf_session_is_return(void) __ksym __weak; -extern __u64 *bpf_session_cookie(void) __ksym __weak; - struct dentry; /* Description * Returns xattr of a dentry diff --git a/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c b/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c index 0835b5edf685..ad627016e3e5 100644 --- a/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c +++ b/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c @@ -1,9 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include #include #include -#include "bpf_kfuncs.h" char _license[] SEC("license") = "GPL"; @@ -23,16 +22,16 @@ int BPF_PROG(trigger) return 0; } -static int check_cookie(__u64 val, __u64 *result) +static int check_cookie(struct pt_regs *ctx, __u64 val, __u64 *result) { __u64 *cookie; if (bpf_get_current_pid_tgid() >> 32 != pid) return 1; - cookie = bpf_session_cookie(); + cookie = bpf_session_cookie(ctx); - if (bpf_session_is_return()) + if (bpf_session_is_return(ctx)) *result = *cookie == val ? val : 0; else *cookie = val; @@ -42,17 +41,17 @@ static int check_cookie(__u64 val, __u64 *result) SEC("kprobe.session/bpf_fentry_test1") int test_kprobe_1(struct pt_regs *ctx) { - return check_cookie(1, &test_kprobe_1_result); + return check_cookie(ctx, 1, &test_kprobe_1_result); } SEC("kprobe.session/bpf_fentry_test1") int test_kprobe_2(struct pt_regs *ctx) { - return check_cookie(2, &test_kprobe_2_result); + return check_cookie(ctx, 2, &test_kprobe_2_result); } SEC("kprobe.session/bpf_fentry_test1") int test_kprobe_3(struct pt_regs *ctx) { - return check_cookie(3, &test_kprobe_3_result); + return check_cookie(ctx, 3, &test_kprobe_3_result); } diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_session.c b/tools/testing/selftests/bpf/progs/uprobe_multi_session.c index 30bff90b68dc..6e46bb00ff58 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_multi_session.c +++ b/tools/testing/selftests/bpf/progs/uprobe_multi_session.c @@ -1,9 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include #include #include -#include "bpf_kfuncs.h" #include "bpf_misc.h" char _license[] SEC("license") = "GPL"; @@ -51,7 +50,7 @@ static int uprobe_multi_check(void *ctx, bool is_return) SEC("uprobe.session//proc/self/exe:uprobe_multi_func_*") int uprobe(struct pt_regs *ctx) { - return uprobe_multi_check(ctx, bpf_session_is_return()); + return uprobe_multi_check(ctx, bpf_session_is_return(ctx)); } static __always_inline bool verify_sleepable_user_copy(void) @@ -67,5 +66,5 @@ int uprobe_sleepable(struct pt_regs *ctx) { if (verify_sleepable_user_copy()) uprobe_multi_sleep_result++; - return uprobe_multi_check(ctx, bpf_session_is_return()); + return uprobe_multi_check(ctx, bpf_session_is_return(ctx)); } diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c b/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c index 5befdf944dc6..b5db196614a9 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c +++ b/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c @@ -1,9 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include #include #include -#include "bpf_kfuncs.h" char _license[] SEC("license") = "GPL"; @@ -13,16 +12,16 @@ __u64 test_uprobe_1_result = 0; __u64 test_uprobe_2_result = 0; __u64 test_uprobe_3_result = 0; -static int check_cookie(__u64 val, __u64 *result) +static int check_cookie(struct pt_regs *ctx, __u64 val, __u64 *result) { __u64 *cookie; if (bpf_get_current_pid_tgid() >> 32 != pid) return 1; - cookie = bpf_session_cookie(); + cookie = bpf_session_cookie(ctx); - if (bpf_session_is_return()) + if (bpf_session_is_return(ctx)) *result = *cookie == val ? val : 0; else *cookie = val; @@ -32,17 +31,17 @@ static int check_cookie(__u64 val, __u64 *result) SEC("uprobe.session//proc/self/exe:uprobe_multi_func_1") int uprobe_1(struct pt_regs *ctx) { - return check_cookie(1, &test_uprobe_1_result); + return check_cookie(ctx, 1, &test_uprobe_1_result); } SEC("uprobe.session//proc/self/exe:uprobe_multi_func_2") int uprobe_2(struct pt_regs *ctx) { - return check_cookie(2, &test_uprobe_2_result); + return check_cookie(ctx, 2, &test_uprobe_2_result); } SEC("uprobe.session//proc/self/exe:uprobe_multi_func_3") int uprobe_3(struct pt_regs *ctx) { - return check_cookie(3, &test_uprobe_3_result); + return check_cookie(ctx, 3, &test_uprobe_3_result); } diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c b/tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c index 8fbcd69fae22..3ce309248a04 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c +++ b/tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c @@ -1,9 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include #include #include -#include "bpf_kfuncs.h" #include "bpf_misc.h" char _license[] SEC("license") = "GPL"; @@ -16,11 +15,11 @@ int idx_return = 0; __u64 test_uprobe_cookie_entry[6]; __u64 test_uprobe_cookie_return[3]; -static int check_cookie(void) +static int check_cookie(struct pt_regs *ctx) { - __u64 *cookie = bpf_session_cookie(); + __u64 *cookie = bpf_session_cookie(ctx); - if (bpf_session_is_return()) { + if (bpf_session_is_return(ctx)) { if (idx_return >= ARRAY_SIZE(test_uprobe_cookie_return)) return 1; test_uprobe_cookie_return[idx_return++] = *cookie; @@ -40,5 +39,5 @@ int uprobe_recursive(struct pt_regs *ctx) if (bpf_get_current_pid_tgid() >> 32 != pid) return 1; - return check_cookie(); + return check_cookie(ctx); } -- cgit v1.2.3 From f7afef5617b685c3491db3593ca09abc33815774 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:20:05 +0800 Subject: selftests/bpf: add testcases for fsession Add testcases for BPF_TRACE_FSESSION. The function arguments and return value are tested both in the entry and exit. And the kfunc bpf_session_is_ret() is also tested. Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20260124062008.8657-11-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/fsession_test.c | 90 ++++++++++++++++++++ tools/testing/selftests/bpf/progs/fsession_test.c | 97 ++++++++++++++++++++++ 2 files changed, 187 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/fsession_test.c create mode 100644 tools/testing/selftests/bpf/progs/fsession_test.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/fsession_test.c b/tools/testing/selftests/bpf/prog_tests/fsession_test.c new file mode 100644 index 000000000000..75bb42942b67 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/fsession_test.c @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 ChinaTelecom */ +#include +#include "fsession_test.skel.h" + +static int check_result(struct fsession_test *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* Trigger test function calls */ + prog_fd = bpf_program__fd(skel->progs.test1); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return err; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return topts.retval; + + for (int i = 0; i < sizeof(*skel->bss) / sizeof(__u64); i++) { + if (!ASSERT_EQ(((__u64 *)skel->bss)[i], 1, "test_result")) + return -EINVAL; + } + + return 0; +} + +static void test_fsession_basic(void) +{ + struct fsession_test *skel = NULL; + int err; + + skel = fsession_test__open_and_load(); + if (!ASSERT_OK_PTR(skel, "fsession_test__open_and_load")) + goto cleanup; + + err = fsession_test__attach(skel); + if (!ASSERT_OK(err, "fsession_attach")) + goto cleanup; + + check_result(skel); +cleanup: + fsession_test__destroy(skel); +} + +static void test_fsession_reattach(void) +{ + struct fsession_test *skel = NULL; + int err; + + skel = fsession_test__open_and_load(); + if (!ASSERT_OK_PTR(skel, "fsession_test__open_and_load")) + goto cleanup; + + /* first attach */ + err = fsession_test__attach(skel); + if (!ASSERT_OK(err, "fsession_first_attach")) + goto cleanup; + + if (check_result(skel)) + goto cleanup; + + /* detach */ + fsession_test__detach(skel); + + /* reset counters */ + memset(skel->bss, 0, sizeof(*skel->bss)); + + /* second attach */ + err = fsession_test__attach(skel); + if (!ASSERT_OK(err, "fsession_second_attach")) + goto cleanup; + + if (check_result(skel)) + goto cleanup; + +cleanup: + fsession_test__destroy(skel); +} + +void test_fsession_test(void) +{ +#if !defined(__x86_64__) + test__skip(); + return; +#endif + if (test__start_subtest("fsession_test")) + test_fsession_basic(); + if (test__start_subtest("fsession_reattach")) + test_fsession_reattach(); +} diff --git a/tools/testing/selftests/bpf/progs/fsession_test.c b/tools/testing/selftests/bpf/progs/fsession_test.c new file mode 100644 index 000000000000..0e1b66b2dddc --- /dev/null +++ b/tools/testing/selftests/bpf/progs/fsession_test.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 ChinaTelecom */ +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +__u64 test1_entry_result = 0; +__u64 test1_exit_result = 0; + +SEC("fsession/bpf_fentry_test1") +int BPF_PROG(test1, int a, int ret) +{ + bool is_exit = bpf_session_is_return(ctx); + + if (!is_exit) { + test1_entry_result = a == 1 && ret == 0; + return 0; + } + + test1_exit_result = a == 1 && ret == 2; + return 0; +} + +__u64 test2_entry_result = 0; +__u64 test2_exit_result = 0; + +SEC("fsession/bpf_fentry_test3") +int BPF_PROG(test2, char a, int b, __u64 c, int ret) +{ + bool is_exit = bpf_session_is_return(ctx); + + if (!is_exit) { + test2_entry_result = a == 4 && b == 5 && c == 6 && ret == 0; + return 0; + } + + test2_exit_result = a == 4 && b == 5 && c == 6 && ret == 15; + return 0; +} + +__u64 test3_entry_result = 0; +__u64 test3_exit_result = 0; + +SEC("fsession/bpf_fentry_test4") +int BPF_PROG(test3, void *a, char b, int c, __u64 d, int ret) +{ + bool is_exit = bpf_session_is_return(ctx); + + if (!is_exit) { + test3_entry_result = a == (void *)7 && b == 8 && c == 9 && d == 10 && ret == 0; + return 0; + } + + test3_exit_result = a == (void *)7 && b == 8 && c == 9 && d == 10 && ret == 34; + return 0; +} + +__u64 test4_entry_result = 0; +__u64 test4_exit_result = 0; + +SEC("fsession/bpf_fentry_test5") +int BPF_PROG(test4, __u64 a, void *b, short c, int d, __u64 e, int ret) +{ + bool is_exit = bpf_session_is_return(ctx); + + if (!is_exit) { + test4_entry_result = a == 11 && b == (void *)12 && c == 13 && d == 14 && + e == 15 && ret == 0; + return 0; + } + + test4_exit_result = a == 11 && b == (void *)12 && c == 13 && d == 14 && + e == 15 && ret == 65; + return 0; +} + +__u64 test5_entry_result = 0; +__u64 test5_exit_result = 0; + +SEC("fsession/bpf_fentry_test7") +int BPF_PROG(test5, struct bpf_fentry_test_t *arg, int ret) +{ + bool is_exit = bpf_session_is_return(ctx); + + if (!is_exit) { + if (!arg) + test5_entry_result = ret == 0; + return 0; + } + + if (!arg) + test5_exit_result = 1; + return 0; +} + -- cgit v1.2.3 From a5533a6eaa5b602fe54e53d85f787e09eab4e771 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:20:06 +0800 Subject: selftests/bpf: test bpf_get_func_* for fsession Test following bpf helper for fsession: bpf_get_func_arg() bpf_get_func_arg_cnt() bpf_get_func_ret() bpf_get_func_ip() Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20260124062008.8657-12-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/get_func_args_test.c | 1 + .../selftests/bpf/prog_tests/get_func_ip_test.c | 2 ++ .../selftests/bpf/progs/get_func_args_test.c | 40 +++++++++++++++++++++- .../testing/selftests/bpf/progs/get_func_ip_test.c | 23 +++++++++++++ 4 files changed, 65 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c b/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c index fadee95d3ae8..96b27de05524 100644 --- a/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c +++ b/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c @@ -41,6 +41,7 @@ void test_get_func_args_test(void) ASSERT_EQ(skel->bss->test4_result, 1, "test4_result"); ASSERT_EQ(skel->bss->test5_result, 1, "test5_result"); ASSERT_EQ(skel->bss->test6_result, 1, "test6_result"); + ASSERT_EQ(skel->bss->test7_result, 1, "test7_result"); cleanup: get_func_args_test__destroy(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c b/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c index c40242dfa8fb..7772a0f288d3 100644 --- a/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c +++ b/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c @@ -46,6 +46,8 @@ static void test_function_entry(void) ASSERT_EQ(skel->bss->test5_result, 1, "test5_result"); ASSERT_EQ(skel->bss->test7_result, 1, "test7_result"); ASSERT_EQ(skel->bss->test8_result, 1, "test8_result"); + ASSERT_EQ(skel->bss->test9_entry_result, 1, "test9_entry_result"); + ASSERT_EQ(skel->bss->test9_exit_result, 1, "test9_exit_result"); cleanup: get_func_ip_test__destroy(skel); diff --git a/tools/testing/selftests/bpf/progs/get_func_args_test.c b/tools/testing/selftests/bpf/progs/get_func_args_test.c index 5b7233afef05..0a3236a7a109 100644 --- a/tools/testing/selftests/bpf/progs/get_func_args_test.c +++ b/tools/testing/selftests/bpf/progs/get_func_args_test.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include #include #include @@ -165,3 +165,41 @@ int BPF_PROG(tp_test2) return 0; } + +__u64 test7_result = 0; +#ifdef __TARGET_ARCH_x86 +SEC("fsession/bpf_fentry_test1") +int BPF_PROG(test7) +{ + __u64 cnt = bpf_get_func_arg_cnt(ctx); + __u64 a = 0, z = 0, ret = 0; + __s64 err; + + test7_result = cnt == 1; + + /* valid arguments */ + err = bpf_get_func_arg(ctx, 0, &a); + test7_result &= err == 0 && ((int) a == 1); + + /* not valid argument */ + err = bpf_get_func_arg(ctx, 1, &z); + test7_result &= err == -EINVAL; + + if (bpf_session_is_return(ctx)) { + err = bpf_get_func_ret(ctx, &ret); + test7_result &= err == 0 && ret == 2; + } else { + err = bpf_get_func_ret(ctx, &ret); + test7_result &= err == 0 && ret == 0; + } + + return 0; +} +#else +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(test7) +{ + test7_result = 1; + return 0; +} +#endif diff --git a/tools/testing/selftests/bpf/progs/get_func_ip_test.c b/tools/testing/selftests/bpf/progs/get_func_ip_test.c index 2011cacdeb18..65f7e1f182bf 100644 --- a/tools/testing/selftests/bpf/progs/get_func_ip_test.c +++ b/tools/testing/selftests/bpf/progs/get_func_ip_test.c @@ -103,3 +103,26 @@ int BPF_URETPROBE(test8, int ret) test8_result = (const void *) addr == (const void *) uprobe_trigger; return 0; } + +__u64 test9_entry_result = 0; +__u64 test9_exit_result = 0; +#ifdef __TARGET_ARCH_x86 +SEC("fsession/bpf_fentry_test1") +int BPF_PROG(test9, int a) +{ + __u64 addr = bpf_get_func_ip(ctx); + + if (bpf_session_is_return(ctx)) + test9_exit_result = (const void *) addr == &bpf_fentry_test1; + else + test9_entry_result = (const void *) addr == &bpf_fentry_test1; + return 0; +} +#else +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(test9, int a) +{ + test9_entry_result = test9_exit_result = 1; + return 0; +} +#endif -- cgit v1.2.3 From 8909b3fb23e245f8ade903dfcfcc43522cf28a56 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:20:07 +0800 Subject: selftests/bpf: add testcases for fsession cookie Test session cookie for fsession. Multiple fsession BPF progs is attached to bpf_fentry_test1() and session cookie is read and write in the testcase. bpf_get_func_ip() will influence the layout of the session cookies, so we test the cookie in two case: with and without bpf_get_func_ip(). Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20260124062008.8657-13-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/fsession_test.c | 34 +++++++++++ tools/testing/selftests/bpf/progs/fsession_test.c | 66 ++++++++++++++++++++++ 2 files changed, 100 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/fsession_test.c b/tools/testing/selftests/bpf/prog_tests/fsession_test.c index 75bb42942b67..0c4b428e1cee 100644 --- a/tools/testing/selftests/bpf/prog_tests/fsession_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fsession_test.c @@ -77,6 +77,38 @@ cleanup: fsession_test__destroy(skel); } +static void test_fsession_cookie(void) +{ + struct fsession_test *skel = NULL; + int err; + + skel = fsession_test__open(); + if (!ASSERT_OK_PTR(skel, "fsession_test__open")) + goto cleanup; + + /* + * The test_fsession_basic() will test the session cookie with + * bpf_get_func_ip() case, so we need only check + * the cookie without bpf_get_func_ip() case here + */ + bpf_program__set_autoload(skel->progs.test6, false); + + err = fsession_test__load(skel); + if (!ASSERT_OK(err, "fsession_test__load")) + goto cleanup; + + err = fsession_test__attach(skel); + if (!ASSERT_OK(err, "fsession_attach")) + goto cleanup; + + skel->bss->test6_entry_result = 1; + skel->bss->test6_exit_result = 1; + + check_result(skel); +cleanup: + fsession_test__destroy(skel); +} + void test_fsession_test(void) { #if !defined(__x86_64__) @@ -87,4 +119,6 @@ void test_fsession_test(void) test_fsession_basic(); if (test__start_subtest("fsession_reattach")) test_fsession_reattach(); + if (test__start_subtest("fsession_cookie")) + test_fsession_cookie(); } diff --git a/tools/testing/selftests/bpf/progs/fsession_test.c b/tools/testing/selftests/bpf/progs/fsession_test.c index 0e1b66b2dddc..211332bdcccb 100644 --- a/tools/testing/selftests/bpf/progs/fsession_test.c +++ b/tools/testing/selftests/bpf/progs/fsession_test.c @@ -95,3 +95,69 @@ int BPF_PROG(test5, struct bpf_fentry_test_t *arg, int ret) return 0; } +__u64 test6_entry_result = 0; +__u64 test6_exit_result = 0; +SEC("fsession/bpf_fentry_test1") +int BPF_PROG(test6, int a) +{ + __u64 addr = bpf_get_func_ip(ctx); + + if (bpf_session_is_return(ctx)) + test6_exit_result = (const void *) addr == &bpf_fentry_test1; + else + test6_entry_result = (const void *) addr == &bpf_fentry_test1; + return 0; +} + +__u64 test7_entry_ok = 0; +__u64 test7_exit_ok = 0; +SEC("fsession/bpf_fentry_test1") +int BPF_PROG(test7, int a) +{ + volatile __u64 *cookie = bpf_session_cookie(ctx); + + if (!bpf_session_is_return(ctx)) { + *cookie = 0xAAAABBBBCCCCDDDDull; + test7_entry_ok = *cookie == 0xAAAABBBBCCCCDDDDull; + return 0; + } + + test7_exit_ok = *cookie == 0xAAAABBBBCCCCDDDDull; + return 0; +} + +__u64 test8_entry_ok = 0; +__u64 test8_exit_ok = 0; + +SEC("fsession/bpf_fentry_test1") +int BPF_PROG(test8, int a) +{ + volatile __u64 *cookie = bpf_session_cookie(ctx); + + if (!bpf_session_is_return(ctx)) { + *cookie = 0x1111222233334444ull; + test8_entry_ok = *cookie == 0x1111222233334444ull; + return 0; + } + + test8_exit_ok = *cookie == 0x1111222233334444ull; + return 0; +} + +__u64 test9_entry_result = 0; +__u64 test9_exit_result = 0; + +SEC("fsession/bpf_fentry_test1") +int BPF_PROG(test9, int a, int ret) +{ + __u64 *cookie = bpf_session_cookie(ctx); + + if (!bpf_session_is_return(ctx)) { + test9_entry_result = a == 1 && ret == 0; + *cookie = 0x123456ULL; + return 0; + } + + test9_exit_result = a == 1 && ret == 2 && *cookie == 0x123456ULL; + return 0; +} -- cgit v1.2.3 From cb4bfacfb0110aa1b10ab60c64a3df0e176998c5 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:20:08 +0800 Subject: selftests/bpf: test fsession mixed with fentry and fexit Test the fsession when it is used together with fentry, fexit. Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20260124062008.8657-14-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/fsession_test.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/progs/fsession_test.c b/tools/testing/selftests/bpf/progs/fsession_test.c index 211332bdcccb..86e8a2fe467e 100644 --- a/tools/testing/selftests/bpf/progs/fsession_test.c +++ b/tools/testing/selftests/bpf/progs/fsession_test.c @@ -161,3 +161,19 @@ int BPF_PROG(test9, int a, int ret) test9_exit_result = a == 1 && ret == 2 && *cookie == 0x123456ULL; return 0; } + +__u64 test10_result = 0; +SEC("fexit/bpf_fentry_test1") +int BPF_PROG(test10, int a, int ret) +{ + test10_result = a == 1 && ret == 2; + return 0; +} + +__u64 test11_result = 0; +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(test11, int a) +{ + test11_result = a == 1; + return 0; +} -- cgit v1.2.3 From c31df36bd26a5ed8898bb3fcc8c37ea9157ba784 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Sun, 25 Jan 2026 20:54:12 +0900 Subject: selftests/bpf: Introduce execution context detection helpers Introduce bpf_in_nmi(), bpf_in_hardirq(), bpf_in_serving_softirq(), and bpf_in_task() inline helpers in bpf_experimental.h. These allow BPF programs to query the current execution context with higher granularity than the existing bpf_in_interrupt() helper. While BPF programs can often infer their context from attachment points, subsystems like sched_ext may call the same BPF logic from multiple contexts (e.g., task-to-task wake-ups vs. interrupt-to-task wake-ups). These helpers provide a reliable way for logic to branch based on the current CPU execution state. Implementing these as BPF-native inline helpers wrapping get_preempt_count() allows the compiler and JIT to inline the logic. The implementation accounts for differences in preempt_count layout between standard and PREEMPT_RT kernels. Reviewed-by: Alexei Starovoitov Signed-off-by: Changwoo Min Link: https://lore.kernel.org/r/20260125115413.117502-2-changwoo@igalia.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpf_experimental.h | 58 ++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 68a49b1f77ae..a39576c8ba04 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -610,6 +610,8 @@ extern int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__str, #define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) #define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT) +#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) + extern bool CONFIG_PREEMPT_RT __kconfig __weak; #ifdef bpf_target_x86 extern const int __preempt_count __ksym; @@ -648,4 +650,60 @@ static inline int bpf_in_interrupt(void) (tsk->softirq_disable_cnt & SOFTIRQ_MASK); } +/* Description + * Report whether it is in NMI context. Only works on the following archs: + * * x86 + * * arm64 + */ +static inline int bpf_in_nmi(void) +{ + return get_preempt_count() & NMI_MASK; +} + +/* Description + * Report whether it is in hard IRQ context. Only works on the following archs: + * * x86 + * * arm64 + */ +static inline int bpf_in_hardirq(void) +{ + return get_preempt_count() & HARDIRQ_MASK; +} + +/* Description + * Report whether it is in softirq context. Only works on the following archs: + * * x86 + * * arm64 + */ +static inline int bpf_in_serving_softirq(void) +{ + struct task_struct___preempt_rt *tsk; + int pcnt; + + pcnt = get_preempt_count(); + if (!CONFIG_PREEMPT_RT) + return (pcnt & SOFTIRQ_MASK) & SOFTIRQ_OFFSET; + + tsk = (void *) bpf_get_current_task_btf(); + return (tsk->softirq_disable_cnt & SOFTIRQ_MASK) & SOFTIRQ_OFFSET; +} + +/* Description + * Report whether it is in task context. Only works on the following archs: + * * x86 + * * arm64 + */ +static inline int bpf_in_task(void) +{ + struct task_struct___preempt_rt *tsk; + int pcnt; + + pcnt = get_preempt_count(); + if (!CONFIG_PREEMPT_RT) + return !(pcnt & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); + + tsk = (void *) bpf_get_current_task_btf(); + return !((pcnt & (NMI_MASK | HARDIRQ_MASK)) | + ((tsk->softirq_disable_cnt & SOFTIRQ_MASK) & SOFTIRQ_OFFSET)); +} #endif -- cgit v1.2.3 From 221b5e76c1c6e8ad4fa7c95a689e44ff45daab1c Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Sun, 25 Jan 2026 20:54:13 +0900 Subject: selftests/bpf: Add tests for execution context helpers Add a new selftest suite `exe_ctx` to verify the accuracy of the bpf_in_task(), bpf_in_hardirq(), and bpf_in_serving_softirq() helpers introduced in bpf_experimental.h. Testing these execution contexts deterministically requires crossing context boundaries within a single CPU. To achieve this, the test implements a "Trigger-Observer" pattern using bpf_testmod: 1. Trigger: A BPF syscall program calls a new bpf_testmod kfunc bpf_kfunc_trigger_ctx_check(). 2. Task to HardIRQ: The kfunc uses irq_work_queue() to trigger a self-IPI on the local CPU. 3. HardIRQ to SoftIRQ: The irq_work handler calls a dummy function (observed by BPF fentry) and then schedules a tasklet to transition into SoftIRQ context. The user-space runner ensures determinism by pinning itself to CPU 0 before execution, forcing the entire interrupt chain to remain on a single core. Dummy noinline functions with compiler barriers are added to bpf_testmod.c to serve as stable attachment points for fentry programs. A retry loop is used in user-space to wait for the asynchronous SoftIRQ to complete. Note that testing on s390x is avoided because supporting those helpers purely in BPF on s390x is not possible at this point. Reviewed-by: Alexei Starovoitov Signed-off-by: Changwoo Min Link: https://lore.kernel.org/r/20260125115413.117502-3-changwoo@igalia.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/DENYLIST.s390x | 1 + tools/testing/selftests/bpf/prog_tests/exe_ctx.c | 59 ++++++++++++++++++++++ tools/testing/selftests/bpf/progs/test_ctx.c | 48 ++++++++++++++++++ .../testing/selftests/bpf/test_kmods/bpf_testmod.c | 32 ++++++++++++ .../selftests/bpf/test_kmods/bpf_testmod_kfunc.h | 4 ++ 5 files changed, 144 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/exe_ctx.c create mode 100644 tools/testing/selftests/bpf/progs/test_ctx.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index a17baf8c6fd7..f7e1e5f5511c 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -1,4 +1,5 @@ # TEMPORARY # Alphabetical order +exe_ctx # execution context check (e.g., hardirq, softirq, etc) get_stack_raw_tp # user_stack corrupted user stack (no backchain userspace) stacktrace_build_id # compare_map_keys stackid_hmap vs. stackmap err -2 errno 2 (?) diff --git a/tools/testing/selftests/bpf/prog_tests/exe_ctx.c b/tools/testing/selftests/bpf/prog_tests/exe_ctx.c new file mode 100644 index 000000000000..aed6a6ef0876 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/exe_ctx.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2026 Valve Corporation. + * Author: Changwoo Min + */ + +#include +#include +#include "test_ctx.skel.h" + +void test_exe_ctx(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + cpu_set_t old_cpuset, target_cpuset; + struct test_ctx *skel; + int err, prog_fd; + + /* 1. Pin the current process to CPU 0. */ + if (sched_getaffinity(0, sizeof(old_cpuset), &old_cpuset) == 0) { + CPU_ZERO(&target_cpuset); + CPU_SET(0, &target_cpuset); + ASSERT_OK(sched_setaffinity(0, sizeof(target_cpuset), + &target_cpuset), "setaffinity"); + } + + skel = test_ctx__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_load")) + goto restore_affinity; + + err = test_ctx__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + /* 2. When we run this, the kernel will execute the BPF prog on CPU 0. */ + prog_fd = bpf_program__fd(skel->progs.trigger_all_contexts); + err = bpf_prog_test_run_opts(prog_fd, &opts); + ASSERT_OK(err, "test_run_trigger"); + + /* 3. Wait for the local CPU's softirq/tasklet to finish. */ + for (int i = 0; i < 1000; i++) { + if (skel->bss->count_task > 0 && + skel->bss->count_hardirq > 0 && + skel->bss->count_softirq > 0) + break; + usleep(1000); /* Wait 1ms per iteration, up to 1 sec total */ + } + + /* On CPU 0, these should now all be non-zero. */ + ASSERT_GT(skel->bss->count_task, 0, "task_ok"); + ASSERT_GT(skel->bss->count_hardirq, 0, "hardirq_ok"); + ASSERT_GT(skel->bss->count_softirq, 0, "softirq_ok"); + +cleanup: + test_ctx__destroy(skel); + +restore_affinity: + ASSERT_OK(sched_setaffinity(0, sizeof(old_cpuset), &old_cpuset), + "restore_affinity"); +} diff --git a/tools/testing/selftests/bpf/progs/test_ctx.c b/tools/testing/selftests/bpf/progs/test_ctx.c new file mode 100644 index 000000000000..7d4995506717 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ctx.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2026 Valve Corporation. + * Author: Changwoo Min + */ + +#include "vmlinux.h" +#include +#include +#include "bpf_experimental.h" + +char _license[] SEC("license") = "GPL"; + +extern void bpf_kfunc_trigger_ctx_check(void) __ksym; + +int count_hardirq; +int count_softirq; +int count_task; + +/* Triggered via bpf_prog_test_run from user-space */ +SEC("syscall") +int trigger_all_contexts(void *ctx) +{ + if (bpf_in_task()) + __sync_fetch_and_add(&count_task, 1); + + /* Trigger the firing of a hardirq and softirq for test. */ + bpf_kfunc_trigger_ctx_check(); + return 0; +} + +/* Observer for HardIRQ */ +SEC("fentry/bpf_testmod_test_hardirq_fn") +int BPF_PROG(on_hardirq) +{ + if (bpf_in_hardirq()) + __sync_fetch_and_add(&count_hardirq, 1); + return 0; +} + +/* Observer for SoftIRQ */ +SEC("fentry/bpf_testmod_test_softirq_fn") +int BPF_PROG(on_softirq) +{ + if (bpf_in_serving_softirq()) + __sync_fetch_and_add(&count_softirq, 1); + return 0; +} diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 77a81fa8ec6a..186a25ab429a 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -1168,6 +1168,33 @@ __bpf_kfunc int bpf_kfunc_implicit_arg(int a, struct bpf_prog_aux *aux); __bpf_kfunc int bpf_kfunc_implicit_arg_legacy(int a, int b, struct bpf_prog_aux *aux); __bpf_kfunc int bpf_kfunc_implicit_arg_legacy_impl(int a, int b, struct bpf_prog_aux *aux); +/* hook targets */ +noinline void bpf_testmod_test_hardirq_fn(void) { barrier(); } +noinline void bpf_testmod_test_softirq_fn(void) { barrier(); } + +/* Tasklet for SoftIRQ context */ +static void ctx_check_tasklet_fn(struct tasklet_struct *t) +{ + bpf_testmod_test_softirq_fn(); +} + +DECLARE_TASKLET(ctx_check_tasklet, ctx_check_tasklet_fn); + +/* IRQ Work for HardIRQ context */ +static void ctx_check_irq_fn(struct irq_work *work) +{ + bpf_testmod_test_hardirq_fn(); + tasklet_schedule(&ctx_check_tasklet); +} + +static struct irq_work ctx_check_irq = IRQ_WORK_INIT_HARD(ctx_check_irq_fn); + +/* The kfunc trigger */ +__bpf_kfunc void bpf_kfunc_trigger_ctx_check(void) +{ + irq_work_queue(&ctx_check_irq); +} + BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc) BTF_ID_FLAGS(func, bpf_kfunc_call_test1) @@ -1213,6 +1240,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_assoc, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg_legacy, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg_legacy_impl) +BTF_ID_FLAGS(func, bpf_kfunc_trigger_ctx_check) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) @@ -1844,6 +1872,10 @@ static void bpf_testmod_exit(void) while (refcount_read(&prog_test_struct.cnt) > 1) msleep(20); + /* Clean up irqwork and tasklet */ + irq_work_sync(&ctx_check_irq); + tasklet_kill(&ctx_check_tasklet); + bpf_kfunc_close_sock(); sysfs_remove_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); unregister_bpf_testmod_uprobe(); diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h index 10f89f06245f..d5c5454e257e 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h @@ -169,4 +169,8 @@ extern int bpf_kfunc_multi_st_ops_test_1_assoc(struct st_ops_args *args) __weak struct prog_test_member *bpf_kfunc_get_default_trusted_ptr_test(void) __ksym; void bpf_kfunc_put_default_trusted_ptr_test(struct prog_test_member *trusted_ptr) __ksym; +void bpf_testmod_test_hardirq_fn(void); +void bpf_testmod_test_softirq_fn(void); +void bpf_kfunc_trigger_ctx_check(void) __ksym; + #endif /* _BPF_TESTMOD_KFUNC_H */ -- cgit v1.2.3 From 8e5bcc3a955a2cc4460b391f55d3b49905eb248e Mon Sep 17 00:00:00 2001 From: Alexander Atanasov Date: Sun, 25 Jan 2026 08:57:46 +0000 Subject: selftests: ublk: add missing gitignore for metadata_size binary A new utility metadata_size was added in commit 261b67f4e347 ("selftests: ublk: add utility to get block device metadata size") but it was not added to .gitignore. Fix that by adding it there. While at it sort all entries alphabetically and add a SPDX license header. Reviewed-by: Caleb Sander Mateos Fixes: 261b67f4e347 ("selftests: ublk: add utility to get block device metadata size") Signed-off-by: Alexander Atanasov Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/.gitignore | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/.gitignore b/tools/testing/selftests/ublk/.gitignore index 8b2871ea7751..e17bd28f27e0 100644 --- a/tools/testing/selftests/ublk/.gitignore +++ b/tools/testing/selftests/ublk/.gitignore @@ -1,3 +1,5 @@ -kublk -/tools +# SPDX-License-Identifier: GPL-2.0 *-verify.state +/tools +kublk +metadata_size -- cgit v1.2.3 From 1742272bd3fae6362301d0f11eb9db9030348afc Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Wed, 21 Jan 2026 20:44:09 +0100 Subject: selftests: net: add ipv6 ping to local address from localhost Test ipv6 pinging to local configured address and linklocal address from localhost with -I ::1. Signed-off-by: Fernando Fernandez Mancera Reviewed-by: David Ahern Link: https://patch.msgid.link/20260121194409.6749-2-fmancera@suse.de Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fcnal-test.sh | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh index 844a580ae74e..890c3f8e51bb 100755 --- a/tools/testing/selftests/net/fcnal-test.sh +++ b/tools/testing/selftests/net/fcnal-test.sh @@ -2327,6 +2327,13 @@ ipv6_ping_novrf() log_test_addr ${a} $? 2 "ping local, device bind" done + for a in ${NSA_LO_IP6} ${NSA_LINKIP6}%${NSA_DEV} ${NSA_IP6} + do + log_start + run_cmd ${ping6} -c1 -w1 -I ::1 ${a} + log_test_addr ${a} $? 0 "ping local, from localhost" + done + # # ip rule blocks address # -- cgit v1.2.3 From 3c58f03e805f8f9025f09fe393103947dca49c57 Mon Sep 17 00:00:00 2001 From: I-Hsin Cheng Date: Sat, 24 Jan 2026 20:32:41 +0800 Subject: kselftest/arm64: Add missing file in .gitignore The binary generated by check_hugetlb_options is missing in .gitignore under the directory. Add it into the file so it won't be logged into version control. Signed-off-by: I-Hsin Cheng Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/mte/.gitignore | 1 + 1 file changed, 1 insertion(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/arm64/mte/.gitignore b/tools/testing/selftests/arm64/mte/.gitignore index 052d0f9f92b3..f6937f890039 100644 --- a/tools/testing/selftests/arm64/mte/.gitignore +++ b/tools/testing/selftests/arm64/mte/.gitignore @@ -6,3 +6,4 @@ check_mmap_options check_prctl check_ksm_options check_user_mem +check_hugetlb_options -- cgit v1.2.3 From 78980b4c7fcb5ef74b7af65fbef5ce8d718cf791 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Mon, 19 Jan 2026 21:34:17 +0800 Subject: selftests/bpf: Harden cpu flags test for lru_percpu_hash map CI occasionally reports failures in the percpu_alloc/cpu_flag_lru_percpu_hash selftest, for example: First test_progs failure (test_progs_no_alu32-x86_64-llvm-21): #264/15 percpu_alloc/cpu_flag_lru_percpu_hash ... test_percpu_map_op_cpu_flag:FAIL:bpf_map_lookup_batch value on specified cpu unexpected bpf_map_lookup_batch value on specified cpu: actual 0 != expected 3735929054 The unexpected value indicates that an element was removed from the map. However, the test never calls delete_elem(), so the only possible cause is LRU eviction. This can happen when the current task migrates to another CPU: an update_elem() triggers eviction because there is no available LRU node on local freelist and global freelist. Harden the test against this behavior by provisioning sufficient spare elements. Set max_entries to 'nr_cpus * 2' and restrict the test to using the first nr_cpus entries, ensuring that updates do not spuriously trigger LRU eviction. Signed-off-by: Leon Hwang Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260119133417.19739-1-leon.hwang@linux.dev --- tools/testing/selftests/bpf/prog_tests/percpu_alloc.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c b/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c index c1d0949f093f..a72ae0b29f6e 100644 --- a/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c +++ b/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c @@ -236,6 +236,8 @@ static void test_percpu_map_op_cpu_flag(struct bpf_map *map, void *keys, size_t err = bpf_map_update_batch(map_fd, keys, values, &count, &batch_opts); if (!ASSERT_OK(err, "bpf_map_update_batch all_cpus")) goto out; + if (!ASSERT_EQ(count, entries, "bpf_map_update_batch count")) + goto out; /* update values on specified CPU */ for (i = 0; i < entries; i++) @@ -246,6 +248,8 @@ static void test_percpu_map_op_cpu_flag(struct bpf_map *map, void *keys, size_t err = bpf_map_update_batch(map_fd, keys, values, &count, &batch_opts); if (!ASSERT_OK(err, "bpf_map_update_batch specified cpu")) goto out; + if (!ASSERT_EQ(count, entries, "bpf_map_update_batch count")) + goto out; /* lookup values on specified CPU */ batch = 0; @@ -254,6 +258,8 @@ static void test_percpu_map_op_cpu_flag(struct bpf_map *map, void *keys, size_t err = bpf_map_lookup_batch(map_fd, NULL, &batch, keys, values, &count, &batch_opts); if (!ASSERT_TRUE(!err || err == -ENOENT, "bpf_map_lookup_batch specified cpu")) goto out; + if (!ASSERT_EQ(count, entries, "bpf_map_lookup_batch count")) + goto out; for (i = 0; i < entries; i++) if (!ASSERT_EQ(values[i], value, @@ -269,6 +275,8 @@ static void test_percpu_map_op_cpu_flag(struct bpf_map *map, void *keys, size_t &batch_opts); if (!ASSERT_TRUE(!err || err == -ENOENT, "bpf_map_lookup_batch all_cpus")) goto out; + if (!ASSERT_EQ(count, entries, "bpf_map_lookup_batch count")) + goto out; for (i = 0; i < entries; i++) { values_row = (void *) values_percpu + @@ -287,7 +295,6 @@ out: free(values); } - static void test_percpu_map_cpu_flag(enum bpf_map_type map_type) { struct percpu_alloc_array *skel; @@ -300,7 +307,7 @@ static void test_percpu_map_cpu_flag(enum bpf_map_type map_type) if (!ASSERT_GT(nr_cpus, 0, "libbpf_num_possible_cpus")) return; - max_entries = nr_cpus + 1; + max_entries = nr_cpus * 2; keys = calloc(max_entries, key_sz); if (!ASSERT_OK_PTR(keys, "calloc keys")) return; @@ -322,7 +329,7 @@ static void test_percpu_map_cpu_flag(enum bpf_map_type map_type) if (!ASSERT_OK(err, "test_percpu_alloc__load")) goto out; - test_percpu_map_op_cpu_flag(map, keys, key_sz, max_entries - 1, nr_cpus, true); + test_percpu_map_op_cpu_flag(map, keys, key_sz, nr_cpus, nr_cpus, true); out: percpu_alloc_array__destroy(skel); free(keys); -- cgit v1.2.3 From f21fae57744607330ae5dabdd996538faac7f5ab Mon Sep 17 00:00:00 2001 From: "Alexis Lothoré (eBPF Foundation)" Date: Fri, 23 Jan 2026 09:30:08 +0100 Subject: selftests/bpf: Add a few helpers for bpftool testing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to integrate some bpftool tests into test_progs, define a few specific helpers that allow to execute bpftool commands, while possibly retrieving the command output. Those helpers most notably set the path to the bpftool binary under test. This version checks different possible paths relative to the directories where the different test_progs runners are executed, as we want to make sure not to accidentally use a bootstrap version of the binary. Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20260123-bpftool-tests-v4-1-a6653a7f28e7@bootlin.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 3 +- tools/testing/selftests/bpf/bpftool_helpers.c | 74 +++++++++++++++++++++++++++ tools/testing/selftests/bpf/bpftool_helpers.h | 11 ++++ 3 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/bpftool_helpers.c create mode 100644 tools/testing/selftests/bpf/bpftool_helpers.h (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 9488d076c740..2cc559ecc723 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -747,7 +747,8 @@ TRUNNER_EXTRA_SOURCES := test_progs.c \ json_writer.c \ $(VERIFY_SIG_HDR) \ flow_dissector_load.h \ - ip_check_defrag_frags.h + ip_check_defrag_frags.h \ + bpftool_helpers.c TRUNNER_LIB_SOURCES := find_bit.c TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ $(OUTPUT)/liburandom_read.so \ diff --git a/tools/testing/selftests/bpf/bpftool_helpers.c b/tools/testing/selftests/bpf/bpftool_helpers.c new file mode 100644 index 000000000000..a5824945a4a5 --- /dev/null +++ b/tools/testing/selftests/bpf/bpftool_helpers.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "bpftool_helpers.h" +#include +#include +#include + +#define BPFTOOL_PATH_MAX_LEN 64 +#define BPFTOOL_FULL_CMD_MAX_LEN 512 + +#define BPFTOOL_DEFAULT_PATH "tools/sbin/bpftool" + +static int detect_bpftool_path(char *buffer) +{ + char tmp[BPFTOOL_PATH_MAX_LEN]; + + /* Check default bpftool location (will work if we are running the + * default flavor of test_progs) + */ + snprintf(tmp, BPFTOOL_PATH_MAX_LEN, "./%s", BPFTOOL_DEFAULT_PATH); + if (access(tmp, X_OK) == 0) { + strncpy(buffer, tmp, BPFTOOL_PATH_MAX_LEN); + return 0; + } + + /* Check alternate bpftool location (will work if we are running a + * specific flavor of test_progs, e.g. cpuv4 or no_alu32) + */ + snprintf(tmp, BPFTOOL_PATH_MAX_LEN, "../%s", BPFTOOL_DEFAULT_PATH); + if (access(tmp, X_OK) == 0) { + strncpy(buffer, tmp, BPFTOOL_PATH_MAX_LEN); + return 0; + } + + /* Failed to find bpftool binary */ + return 1; +} + +static int run_command(char *args, char *output_buf, size_t output_max_len) +{ + static char bpftool_path[BPFTOOL_PATH_MAX_LEN] = {0}; + bool suppress_output = !(output_buf && output_max_len); + char command[BPFTOOL_FULL_CMD_MAX_LEN]; + FILE *f; + int ret; + + /* Detect and cache bpftool binary location */ + if (bpftool_path[0] == 0 && detect_bpftool_path(bpftool_path)) + return 1; + + ret = snprintf(command, BPFTOOL_FULL_CMD_MAX_LEN, "%s %s%s", + bpftool_path, args, + suppress_output ? " > /dev/null 2>&1" : ""); + + f = popen(command, "r"); + if (!f) + return 1; + + if (!suppress_output) + fread(output_buf, 1, output_max_len, f); + ret = pclose(f); + + return ret; +} + +int run_bpftool_command(char *args) +{ + return run_command(args, NULL, 0); +} + +int get_bpftool_command_output(char *args, char *output_buf, size_t output_max_len) +{ + return run_command(args, output_buf, output_max_len); +} + diff --git a/tools/testing/selftests/bpf/bpftool_helpers.h b/tools/testing/selftests/bpf/bpftool_helpers.h new file mode 100644 index 000000000000..dec1ba201410 --- /dev/null +++ b/tools/testing/selftests/bpf/bpftool_helpers.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#pragma once + +#include +#include +#include + +#define MAX_BPFTOOL_CMD_LEN (256) + +int run_bpftool_command(char *args); +int get_bpftool_command_output(char *args, char *output_buf, size_t output_max_len); -- cgit v1.2.3 From 1c0b505908a201054dadc87930f550bea2631c1e Mon Sep 17 00:00:00 2001 From: "Alexis Lothoré (eBPF Foundation)" Date: Fri, 23 Jan 2026 09:30:09 +0100 Subject: selftests/bpf: convert test_bpftool_metadata.sh into test_progs framework MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The test_bpftool_metadata.sh script validates that bpftool properly returns in its ouptput any metadata generated by bpf programs through some .rodata sections. Port this test to the test_progs framework so that it can be executed automatically in CI. The new test, similarly to the former script, checks that valid data appears both for textual output and json output, as well as for both data not used at all and used data. For the json check part, the expected json string is hardcoded to avoid bringing a new external dependency (eg: a json deserializer) for test_progs. As the test is now converted into test_progs, remove the former script. The newly converted test brings two new subtests: #37/1 bpftool_metadata/metadata_unused:OK #37/2 bpftool_metadata/metadata_used:OK #37 bpftool_metadata:OK Summary: 1/2 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20260123-bpftool-tests-v4-2-a6653a7f28e7@bootlin.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 1 - .../selftests/bpf/prog_tests/bpftool_metadata.c | 144 +++++++++++++++++++++ .../testing/selftests/bpf/test_bpftool_metadata.sh | 85 ------------ 3 files changed, 144 insertions(+), 86 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/bpftool_metadata.c delete mode 100755 tools/testing/selftests/bpf/test_bpftool_metadata.sh (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 2cc559ecc723..1bb7db1ed6ea 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -109,7 +109,6 @@ TEST_PROGS := test_kmod.sh \ test_bpftool_build.sh \ test_bpftool.sh \ test_bpftool_map.sh \ - test_bpftool_metadata.sh \ test_doc_build.sh \ test_xsk.sh \ test_xdp_features.sh diff --git a/tools/testing/selftests/bpf/prog_tests/bpftool_metadata.c b/tools/testing/selftests/bpf/prog_tests/bpftool_metadata.c new file mode 100644 index 000000000000..408ace90dc7e --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bpftool_metadata.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include +#include + +#define BPFFS_DIR "/sys/fs/bpf/test_metadata" +#define BPFFS_USED BPFFS_DIR "/used" +#define BPFFS_UNUSED BPFFS_DIR "/unused" + +#define BPF_FILE_USED "metadata_used.bpf.o" +#define BPF_FILE_UNUSED "metadata_unused.bpf.o" +#define METADATA_MAP_NAME "metadata.rodata" + +#define MAX_BPFTOOL_OUTPUT_LEN (64*1024) + +#define MAX_TOKENS_TO_CHECK 3 +static char output[MAX_BPFTOOL_OUTPUT_LEN]; + +struct test_desc { + char *name; + char *bpf_prog; + char *bpffs_path; + char *expected_output[MAX_TOKENS_TO_CHECK]; + char *expected_output_json[MAX_TOKENS_TO_CHECK]; + char *metadata_map_name; +}; + +static int setup(struct test_desc *test) +{ + return mkdir(BPFFS_DIR, 0700); +} + +static void cleanup(struct test_desc *test) +{ + unlink(test->bpffs_path); + rmdir(BPFFS_DIR); +} + +static int check_metadata(char *buf, char * const *tokens, int count) +{ + int i; + + for (i = 0; i < count && tokens[i]; i++) + if (!strstr(buf, tokens[i])) + return 1; + + return 0; +} + +static void run_test(struct test_desc *test) +{ + int ret; + char cmd[MAX_BPFTOOL_CMD_LEN]; + + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "prog load %s %s", + test->bpf_prog, test->bpffs_path); + if (!ASSERT_GT(ret, 0, "format prog insert command")) + return; + ret = run_bpftool_command(cmd); + if (!ASSERT_OK(ret, "load program")) + return; + + /* Check output with default format */ + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "prog show pinned %s", + test->bpffs_path); + if (!ASSERT_GT(ret, 0, "format pinned prog check command")) + return; + ret = get_bpftool_command_output(cmd, output, + MAX_BPFTOOL_OUTPUT_LEN); + if (ASSERT_OK(ret, "get program info")) { + ret = check_metadata(output, test->expected_output, + ARRAY_SIZE(test->expected_output)); + ASSERT_OK(ret, "find metadata"); + } + + /* Check output with json format */ + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "prog -j show pinned %s", + test->bpffs_path); + if (!ASSERT_GT(ret, 0, "format pinned prog check command in json")) + return; + ret = get_bpftool_command_output(cmd, output, + MAX_BPFTOOL_OUTPUT_LEN); + if (ASSERT_OK(ret, "get program info in json")) { + ret = check_metadata(output, test->expected_output_json, + ARRAY_SIZE(test->expected_output_json)); + ASSERT_OK(ret, "find metadata in json"); + } + + /* Check that the corresponding map can be found and accessed */ + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "map show name %s", + test->metadata_map_name); + if (!ASSERT_GT(ret, 0, "format map check command")) + return; + ASSERT_OK(run_bpftool_command(cmd), "access metadata map"); +} + +static struct test_desc tests[] = { + { + .name = "metadata_unused", + .bpf_prog = BPF_FILE_UNUSED, + .bpffs_path = BPFFS_UNUSED, + .expected_output = { + "a = \"foo\"", + "b = 1" + }, + .expected_output_json = { + "\"metadata\":{\"a\":\"foo\",\"b\":1}" + }, + .metadata_map_name = METADATA_MAP_NAME + }, + { + .name = "metadata_used", + .bpf_prog = BPF_FILE_USED, + .bpffs_path = BPFFS_USED, + .expected_output = { + "a = \"bar\"", + "b = 2" + }, + .expected_output_json = { + "\"metadata\":{\"a\":\"bar\",\"b\":2}" + }, + .metadata_map_name = METADATA_MAP_NAME + } +}; +static const int tests_count = ARRAY_SIZE(tests); + +void test_bpftool_metadata(void) +{ + int i; + + for (i = 0; i < tests_count; i++) { + if (!test__start_subtest(tests[i].name)) + continue; + if (ASSERT_OK(setup(&tests[i]), "setup bpffs pin dir")) { + run_test(&tests[i]); + cleanup(&tests[i]); + } + } +} diff --git a/tools/testing/selftests/bpf/test_bpftool_metadata.sh b/tools/testing/selftests/bpf/test_bpftool_metadata.sh deleted file mode 100755 index b5520692f41b..000000000000 --- a/tools/testing/selftests/bpf/test_bpftool_metadata.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -BPF_FILE_USED="metadata_used.bpf.o" -BPF_FILE_UNUSED="metadata_unused.bpf.o" - -TESTNAME=bpftool_metadata -BPF_FS=$(awk '$3 == "bpf" {print $2; exit}' /proc/mounts) -BPF_DIR=$BPF_FS/test_$TESTNAME - -_cleanup() -{ - set +e - rm -rf $BPF_DIR 2> /dev/null -} - -cleanup_skip() -{ - echo "selftests: $TESTNAME [SKIP]" - _cleanup - - exit $ksft_skip -} - -cleanup() -{ - if [ "$?" = 0 ]; then - echo "selftests: $TESTNAME [PASS]" - else - echo "selftests: $TESTNAME [FAILED]" - fi - _cleanup -} - -if [ $(id -u) -ne 0 ]; then - echo "selftests: $TESTNAME [SKIP] Need root privileges" - exit $ksft_skip -fi - -if [ -z "$BPF_FS" ]; then - echo "selftests: $TESTNAME [SKIP] Could not run test without bpffs mounted" - exit $ksft_skip -fi - -if ! bpftool version > /dev/null 2>&1; then - echo "selftests: $TESTNAME [SKIP] Could not run test without bpftool" - exit $ksft_skip -fi - -set -e - -trap cleanup_skip EXIT - -mkdir $BPF_DIR - -trap cleanup EXIT - -bpftool prog load $BPF_FILE_UNUSED $BPF_DIR/unused - -METADATA_PLAIN="$(bpftool prog)" -echo "$METADATA_PLAIN" | grep 'a = "foo"' > /dev/null -echo "$METADATA_PLAIN" | grep 'b = 1' > /dev/null - -bpftool prog --json | grep '"metadata":{"a":"foo","b":1}' > /dev/null - -bpftool map | grep 'metadata.rodata' > /dev/null - -rm $BPF_DIR/unused - -bpftool prog load $BPF_FILE_USED $BPF_DIR/used - -METADATA_PLAIN="$(bpftool prog)" -echo "$METADATA_PLAIN" | grep 'a = "bar"' > /dev/null -echo "$METADATA_PLAIN" | grep 'b = 2' > /dev/null - -bpftool prog --json | grep '"metadata":{"a":"bar","b":2}' > /dev/null - -bpftool map | grep 'metadata.rodata' > /dev/null - -rm $BPF_DIR/used - -exit 0 -- cgit v1.2.3 From 2d96bbdfd3b5d28306001036c0161fcb1713f964 Mon Sep 17 00:00:00 2001 From: "Alexis Lothoré (eBPF Foundation)" Date: Fri, 23 Jan 2026 09:30:10 +0100 Subject: selftests/bpf: convert test_bpftool_map_access.sh into test_progs framework MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The test_bpftool_map.sh script tests that maps read/write accesses are being properly allowed/refused by the kernel depending on a specific fmod_ret program being attached on security_bpf_map function. Rewrite this test to integrate it in the test_progs. The new test spawns a few subtests: #36/1 bpftool_maps_access/unprotected_unpinned:OK #36/2 bpftool_maps_access/unprotected_pinned:OK #36/3 bpftool_maps_access/protected_unpinned:OK #36/4 bpftool_maps_access/protected_pinned:OK #36/5 bpftool_maps_access/nested_maps:OK #36/6 bpftool_maps_access/btf_list:OK #36 bpftool_maps_access:OK Summary: 1/6 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Alexis Lothoré (eBPF Foundation) Acked-by: Quentin Monnet Link: https://lore.kernel.org/r/20260123-bpftool-tests-v4-3-a6653a7f28e7@bootlin.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 1 - .../selftests/bpf/prog_tests/bpftool_maps_access.c | 371 +++++++++++++++++++ tools/testing/selftests/bpf/test_bpftool_map.sh | 398 --------------------- 3 files changed, 371 insertions(+), 399 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/bpftool_maps_access.c delete mode 100755 tools/testing/selftests/bpf/test_bpftool_map.sh (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 1bb7db1ed6ea..2c2f68a171ed 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -108,7 +108,6 @@ TEST_PROGS := test_kmod.sh \ test_xdping.sh \ test_bpftool_build.sh \ test_bpftool.sh \ - test_bpftool_map.sh \ test_doc_build.sh \ test_xsk.sh \ test_xdp_features.sh diff --git a/tools/testing/selftests/bpf/prog_tests/bpftool_maps_access.c b/tools/testing/selftests/bpf/prog_tests/bpftool_maps_access.c new file mode 100644 index 000000000000..e0eb869cb1b4 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bpftool_maps_access.c @@ -0,0 +1,371 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "security_bpf_map.skel.h" + +#define PROTECTED_MAP_NAME "prot_map" +#define UNPROTECTED_MAP_NAME "not_prot_map" +#define BPF_ITER_FILE "bpf_iter_map_elem.bpf.o" +#define BPFFS_PIN_DIR "/sys/fs/bpf/test_bpftool_map" +#define INNER_MAP_NAME "inner_map_tt" +#define OUTER_MAP_NAME "outer_map_tt" + +#define MAP_NAME_MAX_LEN 64 +#define PATH_MAX_LEN 128 + +enum map_protection { + PROTECTED, + UNPROTECTED +}; + +struct test_desc { + char *name; + enum map_protection protection; + struct bpf_map *map; + char *map_name; + bool pinned; + char pin_path[PATH_MAX_LEN]; + bool write_must_fail; +}; + +static struct security_bpf_map *general_setup(void) +{ + struct security_bpf_map *skel; + uint32_t key, value; + int ret, i; + + skel = security_bpf_map__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open and load skeleton")) + goto end; + + struct bpf_map *maps[] = {skel->maps.prot_map, skel->maps.not_prot_map}; + + ret = security_bpf_map__attach(skel); + if (!ASSERT_OK(ret, "attach maps security programs")) + goto end_destroy; + + for (i = 0; i < sizeof(maps)/sizeof(struct bpf_map *); i++) { + for (key = 0; key < 2; key++) { + int ret = bpf_map__update_elem(maps[i], &key, + sizeof(key), &key, sizeof(key), + 0); + if (!ASSERT_OK(ret, "set initial map value")) + goto end_destroy; + } + } + + key = 0; + value = 1; + ret = bpf_map__update_elem(skel->maps.prot_status_map, &key, + sizeof(key), &value, sizeof(value), 0); + if (!ASSERT_OK(ret, "configure map protection")) + goto end_destroy; + + if (!ASSERT_OK(mkdir(BPFFS_PIN_DIR, S_IFDIR), "create bpffs pin dir")) + goto end_destroy; + + return skel; +end_destroy: + security_bpf_map__destroy(skel); +end: + return NULL; +} + +static void general_cleanup(struct security_bpf_map *skel) +{ + rmdir(BPFFS_PIN_DIR); + security_bpf_map__destroy(skel); +} + +static void update_test_desc(struct security_bpf_map *skel, + struct test_desc *test) +{ + /* Now that the skeleton is loaded, update all missing fields to + * have the subtest properly configured + */ + if (test->protection == PROTECTED) { + test->map = skel->maps.prot_map; + test->map_name = PROTECTED_MAP_NAME; + } else { + test->map = skel->maps.not_prot_map; + test->map_name = UNPROTECTED_MAP_NAME; + } +} + +static int test_setup(struct security_bpf_map *skel, struct test_desc *desc) +{ + int ret; + + update_test_desc(skel, desc); + + if (desc->pinned) { + ret = snprintf(desc->pin_path, PATH_MAX_LEN, "%s/%s", BPFFS_PIN_DIR, + desc->name); + if (!ASSERT_GT(ret, 0, "format pin path")) + return 1; + ret = bpf_map__pin(desc->map, desc->pin_path); + if (!ASSERT_OK(ret, "pin map")) + return 1; + } + + return 0; +} + +static void test_cleanup(struct test_desc *desc) +{ + if (desc->pinned) + bpf_map__unpin(desc->map, NULL); +} + +static int lookup_map_value(char *map_handle) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "map lookup %s key 0 0 0 0", + map_handle); + if (!ASSERT_GT(ret, 0, "format map lookup cmd")) + return 1; + return run_bpftool_command(cmd); +} + +static int read_map_btf_data(char *map_handle) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "btf dump map %s", + map_handle); + if (!ASSERT_GT(ret, 0, "format map btf dump cmd")) + return 1; + return run_bpftool_command(cmd); +} + +static int write_map_value(char *map_handle) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, + "map update %s key 0 0 0 0 value 1 1 1 1", map_handle); + if (!ASSERT_GT(ret, 0, "format value write cmd")) + return 1; + return run_bpftool_command(cmd); +} + +static int delete_map_value(char *map_handle) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, + "map delete %s key 0 0 0 0", map_handle); + if (!ASSERT_GT(ret, 0, "format value deletion cmd")) + return 1; + return run_bpftool_command(cmd); +} + +static int iterate_on_map_values(char *map_handle, char *iter_pin_path) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "iter pin %s %s map %s", + BPF_ITER_FILE, iter_pin_path, map_handle); + if (!ASSERT_GT(ret, 0, "format iterator creation cmd")) + return 1; + ret = run_bpftool_command(cmd); + if (ret) + return ret; + ret = snprintf(cmd, MAP_NAME_MAX_LEN, "cat %s", iter_pin_path); + if (ret < 0) + goto cleanup; + ret = system(cmd); + +cleanup: + unlink(iter_pin_path); + return ret; +} + +static int create_inner_map(void) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + ret = snprintf( + cmd, MAX_BPFTOOL_CMD_LEN, + "map create %s/%s type array key 4 value 4 entries 4 name %s", + BPFFS_PIN_DIR, INNER_MAP_NAME, INNER_MAP_NAME); + if (!ASSERT_GT(ret, 0, "format inner map create cmd")) + return 1; + return run_bpftool_command(cmd); +} + +static int create_outer_map(void) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + ret = snprintf( + cmd, MAX_BPFTOOL_CMD_LEN, + "map create %s/%s type hash_of_maps key 4 value 4 entries 2 name %s inner_map name %s", + BPFFS_PIN_DIR, OUTER_MAP_NAME, OUTER_MAP_NAME, INNER_MAP_NAME); + if (!ASSERT_GT(ret, 0, "format outer map create cmd")) + return 1; + return run_bpftool_command(cmd); +} + +static void delete_pinned_map(char *map_name) +{ + char pin_path[PATH_MAX_LEN]; + int ret; + + ret = snprintf(pin_path, PATH_MAX_LEN, "%s/%s", BPFFS_PIN_DIR, + map_name); + if (ret >= 0) + unlink(pin_path); +} + +static int add_outer_map_entry(int key) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + ret = snprintf( + cmd, MAX_BPFTOOL_CMD_LEN, + "map update pinned %s/%s key %d 0 0 0 value name %s", + BPFFS_PIN_DIR, OUTER_MAP_NAME, key, INNER_MAP_NAME); + if (!ASSERT_GT(ret, 0, "format outer map value addition cmd")) + return 1; + return run_bpftool_command(cmd); +} + +static void test_basic_access(struct test_desc *desc) +{ + char map_handle[MAP_NAME_MAX_LEN]; + char iter_pin_path[PATH_MAX_LEN]; + int ret; + + if (desc->pinned) + ret = snprintf(map_handle, MAP_NAME_MAX_LEN, "pinned %s", + desc->pin_path); + else + ret = snprintf(map_handle, MAP_NAME_MAX_LEN, "name %s", + desc->map_name); + if (!ASSERT_GT(ret, 0, "format map handle")) + return; + + ret = lookup_map_value(map_handle); + ASSERT_OK(ret, "read map value"); + + ret = read_map_btf_data(map_handle); + ASSERT_OK(ret, "read map btf data"); + + ret = write_map_value(map_handle); + ASSERT_OK(desc->write_must_fail ? !ret : ret, "write map value"); + + ret = delete_map_value(map_handle); + ASSERT_OK(desc->write_must_fail ? !ret : ret, "delete map value"); + /* Restore deleted value */ + if (!ret) + write_map_value(map_handle); + + ret = snprintf(iter_pin_path, PATH_MAX_LEN, "%s/iter", BPFFS_PIN_DIR); + if (ASSERT_GT(ret, 0, "format iter pin path")) { + ret = iterate_on_map_values(map_handle, iter_pin_path); + ASSERT_OK(ret, "iterate on map values"); + } +} + +static void test_create_nested_maps(void) +{ + if (!ASSERT_OK(create_inner_map(), "create inner map")) + return; + if (!ASSERT_OK(create_outer_map(), "create outer map")) + goto end_cleanup_inner; + ASSERT_OK(add_outer_map_entry(0), "add a first entry in outer map"); + ASSERT_OK(add_outer_map_entry(1), "add a second entry in outer map"); + ASSERT_NEQ(add_outer_map_entry(2), 0, "add a third entry in outer map"); + + delete_pinned_map(OUTER_MAP_NAME); +end_cleanup_inner: + delete_pinned_map(INNER_MAP_NAME); +} + +static void test_btf_list(void) +{ + ASSERT_OK(run_bpftool_command("btf list"), "list btf data"); +} + +static struct test_desc tests[] = { + { + .name = "unprotected_unpinned", + .protection = UNPROTECTED, + .map_name = UNPROTECTED_MAP_NAME, + .pinned = false, + .write_must_fail = false, + }, + { + .name = "unprotected_pinned", + .protection = UNPROTECTED, + .map_name = UNPROTECTED_MAP_NAME, + .pinned = true, + .write_must_fail = false, + }, + { + .name = "protected_unpinned", + .protection = PROTECTED, + .map_name = UNPROTECTED_MAP_NAME, + .pinned = false, + .write_must_fail = true, + }, + { + .name = "protected_pinned", + .protection = PROTECTED, + .map_name = UNPROTECTED_MAP_NAME, + .pinned = true, + .write_must_fail = true, + } +}; + +static const size_t tests_count = ARRAY_SIZE(tests); + +void test_bpftool_maps_access(void) +{ + struct security_bpf_map *skel; + struct test_desc *current; + int i; + + skel = general_setup(); + if (!ASSERT_OK_PTR(skel, "prepare programs")) + goto cleanup; + + for (i = 0; i < tests_count; i++) { + current = &tests[i]; + if (!test__start_subtest(current->name)) + continue; + if (ASSERT_OK(test_setup(skel, current), "subtest setup")) { + test_basic_access(current); + test_cleanup(current); + } + } + if (test__start_subtest("nested_maps")) + test_create_nested_maps(); + if (test__start_subtest("btf_list")) + test_btf_list(); + +cleanup: + general_cleanup(skel); +} + diff --git a/tools/testing/selftests/bpf/test_bpftool_map.sh b/tools/testing/selftests/bpf/test_bpftool_map.sh deleted file mode 100755 index 515b1df0501e..000000000000 --- a/tools/testing/selftests/bpf/test_bpftool_map.sh +++ /dev/null @@ -1,398 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -TESTNAME="bpftool_map" -BPF_FILE="security_bpf_map.bpf.o" -BPF_ITER_FILE="bpf_iter_map_elem.bpf.o" -PROTECTED_MAP_NAME="prot_map" -NOT_PROTECTED_MAP_NAME="not_prot_map" -BPF_FS_TMP_PARENT="/tmp" -BPF_FS_PARENT=$(awk '$3 == "bpf" {print $2; exit}' /proc/mounts) -BPF_FS_PARENT=${BPF_FS_PARENT:-$BPF_FS_TMP_PARENT} -# bpftool will mount bpf file system under BPF_DIR if it is not mounted -# under BPF_FS_PARENT. -BPF_DIR="$BPF_FS_PARENT/test_$TESTNAME" -SCRIPT_DIR=$(dirname $(realpath "$0")) -BPF_FILE_PATH="$SCRIPT_DIR/$BPF_FILE" -BPF_ITER_FILE_PATH="$SCRIPT_DIR/$BPF_ITER_FILE" -BPFTOOL_PATH="bpftool" -# Assume the script is located under tools/testing/selftests/bpf/ -KDIR_ROOT_DIR=$(realpath "$SCRIPT_DIR"/../../../../) - -_cleanup() -{ - set +eu - - # If BPF_DIR is a mount point this will not remove the mount point itself. - [ -d "$BPF_DIR" ] && rm -rf "$BPF_DIR" 2> /dev/null - - # Unmount if BPF filesystem was temporarily created. - if [ "$BPF_FS_PARENT" = "$BPF_FS_TMP_PARENT" ]; then - # A loop and recursive unmount are required as bpftool might - # create multiple mounts. For example, a bind mount of the directory - # to itself. The bind mount is created to change mount propagation - # flags on an actual mount point. - max_attempts=3 - attempt=0 - while mountpoint -q "$BPF_DIR" && [ $attempt -lt $max_attempts ]; do - umount -R "$BPF_DIR" 2>/dev/null - attempt=$((attempt+1)) - done - - # The directory still exists. Remove it now. - [ -d "$BPF_DIR" ] && rm -rf "$BPF_DIR" 2>/dev/null - fi -} - -cleanup_skip() -{ - echo "selftests: $TESTNAME [SKIP]" - _cleanup - - exit $ksft_skip -} - -cleanup() -{ - if [ "$?" = 0 ]; then - echo "selftests: $TESTNAME [PASS]" - else - echo "selftests: $TESTNAME [FAILED]" - fi - _cleanup -} - -check_root_privileges() { - if [ $(id -u) -ne 0 ]; then - echo "Need root privileges" - exit $ksft_skip - fi -} - -# Function to verify bpftool path. -# Parameters: -# $1: bpftool path -verify_bpftool_path() { - local bpftool_path="$1" - if ! "$bpftool_path" version > /dev/null 2>&1; then - echo "Could not run test without bpftool" - exit $ksft_skip - fi -} - -# Function to verify BTF support. -# The test requires BTF support for fmod_ret programs. -verify_btf_support() { - if [ ! -f /sys/kernel/btf/vmlinux ]; then - echo "Could not run test without BTF support" - exit $ksft_skip - fi -} - -# Function to initialize map entries with keys [0..2] and values set to 0. -# Parameters: -# $1: Map name -# $2: bpftool path -initialize_map_entries() { - local map_name="$1" - local bpftool_path="$2" - - for key in 0 1 2; do - "$bpftool_path" map update name "$map_name" key $key 0 0 0 value 0 0 0 $key - done -} - -# Test read access to the map. -# Parameters: -# $1: Name command (name/pinned) -# $2: Map name -# $3: bpftool path -# $4: key -access_for_read() { - local name_cmd="$1" - local map_name="$2" - local bpftool_path="$3" - local key="$4" - - # Test read access to the map. - if ! "$bpftool_path" map lookup "$name_cmd" "$map_name" key $key 1>/dev/null; then - echo " Read access to $key in $map_name failed" - exit 1 - fi - - # Test read access to map's BTF data. - if ! "$bpftool_path" btf dump map "$name_cmd" "$map_name" 1>/dev/null; then - echo " Read access to $map_name for BTF data failed" - exit 1 - fi -} - -# Test write access to the map. -# Parameters: -# $1: Name command (name/pinned) -# $2: Map name -# $3: bpftool path -# $4: key -# $5: Whether write should succeed (true/false) -access_for_write() { - local name_cmd="$1" - local map_name="$2" - local bpftool_path="$3" - local key="$4" - local write_should_succeed="$5" - local value="1 1 1 1" - - if "$bpftool_path" map update "$name_cmd" "$map_name" key $key value \ - $value 2>/dev/null; then - if [ "$write_should_succeed" = "false" ]; then - echo " Write access to $key in $map_name succeeded but should have failed" - exit 1 - fi - else - if [ "$write_should_succeed" = "true" ]; then - echo " Write access to $key in $map_name failed but should have succeeded" - exit 1 - fi - fi -} - -# Test entry deletion for the map. -# Parameters: -# $1: Name command (name/pinned) -# $2: Map name -# $3: bpftool path -# $4: key -# $5: Whether write should succeed (true/false) -access_for_deletion() { - local name_cmd="$1" - local map_name="$2" - local bpftool_path="$3" - local key="$4" - local write_should_succeed="$5" - local value="1 1 1 1" - - # Test deletion by key for the map. - # Before deleting, check the key exists. - if ! "$bpftool_path" map lookup "$name_cmd" "$map_name" key $key 1>/dev/null; then - echo " Key $key does not exist in $map_name" - exit 1 - fi - - # Delete by key. - if "$bpftool_path" map delete "$name_cmd" "$map_name" key $key 2>/dev/null; then - if [ "$write_should_succeed" = "false" ]; then - echo " Deletion for $key in $map_name succeeded but should have failed" - exit 1 - fi - else - if [ "$write_should_succeed" = "true" ]; then - echo " Deletion for $key in $map_name failed but should have succeeded" - exit 1 - fi - fi - - # After deleting, check the entry existence according to the expected status. - if "$bpftool_path" map lookup "$name_cmd" "$map_name" key $key 1>/dev/null; then - if [ "$write_should_succeed" = "true" ]; then - echo " Key $key for $map_name was not deleted but should have been deleted" - exit 1 - fi - else - if [ "$write_should_succeed" = "false" ]; then - echo "Key $key for $map_name was deleted but should have not been deleted" - exit 1 - fi - fi - - # Test creation of map's deleted entry, if deletion was successful. - # Otherwise, the entry exists. - if "$bpftool_path" map update "$name_cmd" "$map_name" key $key value \ - $value 2>/dev/null; then - if [ "$write_should_succeed" = "false" ]; then - echo " Write access to $key in $map_name succeeded after deletion attempt but should have failed" - exit 1 - fi - else - if [ "$write_should_succeed" = "true" ]; then - echo " Write access to $key in $map_name failed after deletion attempt but should have succeeded" - exit 1 - fi - fi -} - -# Test map elements iterator. -# Parameters: -# $1: Name command (name/pinned) -# $2: Map name -# $3: bpftool path -# $4: BPF_DIR -# $5: bpf iterator object file path -iterate_map_elem() { - local name_cmd="$1" - local map_name="$2" - local bpftool_path="$3" - local bpf_dir="$4" - local bpf_file="$5" - local pin_path="$bpf_dir/map_iterator" - - "$bpftool_path" iter pin "$bpf_file" "$pin_path" map "$name_cmd" "$map_name" - if [ ! -f "$pin_path" ]; then - echo " Failed to pin iterator to $pin_path" - exit 1 - fi - - cat "$pin_path" 1>/dev/null - rm "$pin_path" 2>/dev/null -} - -# Function to test map access with configurable write expectations -# Parameters: -# $1: Name command (name/pinned) -# $2: Map name -# $3: bpftool path -# $4: key for rw -# $5: key to delete -# $6: Whether write should succeed (true/false) -# $7: BPF_DIR -# $8: bpf iterator object file path -access_map() { - local name_cmd="$1" - local map_name="$2" - local bpftool_path="$3" - local key_for_rw="$4" - local key_to_del="$5" - local write_should_succeed="$6" - local bpf_dir="$7" - local bpf_iter_file_path="$8" - - access_for_read "$name_cmd" "$map_name" "$bpftool_path" "$key_for_rw" - access_for_write "$name_cmd" "$map_name" "$bpftool_path" "$key_for_rw" \ - "$write_should_succeed" - access_for_deletion "$name_cmd" "$map_name" "$bpftool_path" "$key_to_del" \ - "$write_should_succeed" - iterate_map_elem "$name_cmd" "$map_name" "$bpftool_path" "$bpf_dir" \ - "$bpf_iter_file_path" -} - -# Function to test map access with configurable write expectations -# Parameters: -# $1: Map name -# $2: bpftool path -# $3: BPF_DIR -# $4: Whether write should succeed (true/false) -# $5: bpf iterator object file path -test_map_access() { - local map_name="$1" - local bpftool_path="$2" - local bpf_dir="$3" - local pin_path="$bpf_dir/${map_name}_pinned" - local write_should_succeed="$4" - local bpf_iter_file_path="$5" - - # Test access to the map by name. - access_map "name" "$map_name" "$bpftool_path" "0 0 0 0" "1 0 0 0" \ - "$write_should_succeed" "$bpf_dir" "$bpf_iter_file_path" - - # Pin the map to the BPF filesystem - "$bpftool_path" map pin name "$map_name" "$pin_path" - if [ ! -e "$pin_path" ]; then - echo " Failed to pin $map_name" - exit 1 - fi - - # Test access to the pinned map. - access_map "pinned" "$pin_path" "$bpftool_path" "0 0 0 0" "2 0 0 0" \ - "$write_should_succeed" "$bpf_dir" "$bpf_iter_file_path" -} - -# Function to test map creation and map-of-maps -# Parameters: -# $1: bpftool path -# $2: BPF_DIR -test_map_creation_and_map_of_maps() { - local bpftool_path="$1" - local bpf_dir="$2" - local outer_map_name="outer_map_tt" - local inner_map_name="inner_map_tt" - - "$bpftool_path" map create "$bpf_dir/$inner_map_name" type array key 4 \ - value 4 entries 4 name "$inner_map_name" - if [ ! -f "$bpf_dir/$inner_map_name" ]; then - echo " Failed to create inner map file at $bpf_dir/$outer_map_name" - return 1 - fi - - "$bpftool_path" map create "$bpf_dir/$outer_map_name" type hash_of_maps \ - key 4 value 4 entries 2 name "$outer_map_name" inner_map name "$inner_map_name" - if [ ! -f "$bpf_dir/$outer_map_name" ]; then - echo " Failed to create outer map file at $bpf_dir/$outer_map_name" - return 1 - fi - - # Add entries to the outer map by name and by pinned path. - "$bpftool_path" map update pinned "$bpf_dir/$outer_map_name" key 0 0 0 0 \ - value pinned "$bpf_dir/$inner_map_name" - "$bpftool_path" map update name "$outer_map_name" key 1 0 0 0 value \ - name "$inner_map_name" - - # The outer map should be full by now. - # The following map update command is expected to fail. - if "$bpftool_path" map update name "$outer_map_name" key 2 0 0 0 value name \ - "$inner_map_name" 2>/dev/null; then - echo " Update for $outer_map_name succeeded but should have failed" - exit 1 - fi -} - -# Function to test map access with the btf list command -# Parameters: -# $1: bpftool path -test_map_access_with_btf_list() { - local bpftool_path="$1" - - # The btf list command iterates over maps for - # loaded BPF programs. - if ! "$bpftool_path" btf list 1>/dev/null; then - echo " Failed to access btf data" - exit 1 - fi -} - -set -eu - -trap cleanup_skip EXIT - -check_root_privileges - -verify_bpftool_path "$BPFTOOL_PATH" - -verify_btf_support - -trap cleanup EXIT - -# Load and attach the BPF programs to control maps access. -"$BPFTOOL_PATH" prog loadall "$BPF_FILE_PATH" "$BPF_DIR" autoattach - -initialize_map_entries "$PROTECTED_MAP_NAME" "$BPFTOOL_PATH" -initialize_map_entries "$NOT_PROTECTED_MAP_NAME" "$BPFTOOL_PATH" - -# Activate the map protection mechanism. Protection status is controlled -# by a value stored in the prot_status_map at index 0. -"$BPFTOOL_PATH" map update name prot_status_map key 0 0 0 0 value 1 0 0 0 - -# Test protected map (write should fail). -test_map_access "$PROTECTED_MAP_NAME" "$BPFTOOL_PATH" "$BPF_DIR" "false" \ - "$BPF_ITER_FILE_PATH" - -# Test not protected map (write should succeed). -test_map_access "$NOT_PROTECTED_MAP_NAME" "$BPFTOOL_PATH" "$BPF_DIR" "true" \ - "$BPF_ITER_FILE_PATH" - -test_map_creation_and_map_of_maps "$BPFTOOL_PATH" "$BPF_DIR" - -test_map_access_with_btf_list "$BPFTOOL_PATH" - -exit 0 -- cgit v1.2.3 From a84a1fe0fb2e9bfccb1d5a2929a249960a93264d Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sun, 25 Jan 2026 12:55:24 +0200 Subject: selftests: net: fix wrong boolean evaluation in __exit__ The __exit__ method receives ex_type as the exception class when an exception occurs. The previous code used implicit boolean evaluation: terminate = self.terminate or (self._exit_wait and ex_type) ^^^^^^^^^^^ In Python, the and operator can be used with non-boolean values, but it does not always return a boolean result. This is probably not what we want, because 'self._exit_wait and ex_type' could return the actual ex_type value (the exception class) rather than a boolean True when an exception occurs. Use explicit `ex_type is not None` check to properly evaluate whether an exception occurred, returning a boolean result. Reviewed-by: Nimrod Oren Signed-off-by: Gal Pressman Link: https://patch.msgid.link/20260125105524.773993-1-gal@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/py/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py index 37243103aee3..85884f3e827b 100644 --- a/tools/testing/selftests/net/lib/py/utils.py +++ b/tools/testing/selftests/net/lib/py/utils.py @@ -160,7 +160,7 @@ class bkg(cmd): def __exit__(self, ex_type, ex_value, ex_tb): # Force termination on exception - terminate = self.terminate or (self._exit_wait and ex_type) + terminate = self.terminate or (self._exit_wait and ex_type is not None) return self.process(terminate=terminate, fail=self.check_fail) -- cgit v1.2.3 From 53eb797ffc3abe30418b19777922b55fb339fc1f Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sun, 18 Jan 2026 14:50:41 +0000 Subject: mm/rmap: remove anon_vma_merge() function This function is confusing, we already have the concept of anon_vma merge to adjacent VMA's anon_vma's to increase probability of anon_vma compatibility and therefore VMA merge (see is_mergeable_anon_vma() etc.), as well as anon_vma reuse, along side the usual VMA merge logic. We can remove the anon_vma check as it is redundant - a merge would not have been permitted with removal if the anon_vma's were not the same (and in the case of an unfaulted/faulted merge, we would have already set the unfaulted VMA's anon_vma to vp->remove->anon_vma in dup_anon_vma()). Avoid overloading this term when we're very simply unlinking anon_vma state from a removed VMA upon merge. Link: https://lkml.kernel.org/r/56bbe45e309f7af197b1c4f94a9a0c8931ff2d29.1768746221.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Suren Baghdasaryan Reviewed-by: Liam R. Howlett Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: Harry Yoo Cc: Jann Horn Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Rik van Riel Cc: Shakeel Butt Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/rmap.h | 7 ------- mm/vma.c | 2 +- tools/testing/vma/vma_internal.h | 5 ----- 3 files changed, 1 insertion(+), 13 deletions(-) (limited to 'tools/testing') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index daa92a58585d..832bfc0ccfc6 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -165,13 +165,6 @@ static inline int anon_vma_prepare(struct vm_area_struct *vma) return __anon_vma_prepare(vma); } -static inline void anon_vma_merge(struct vm_area_struct *vma, - struct vm_area_struct *next) -{ - VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma); - unlink_anon_vmas(next); -} - struct anon_vma *folio_get_anon_vma(const struct folio *folio); #ifdef CONFIG_MM_ID diff --git a/mm/vma.c b/mm/vma.c index f81a5cfcd7cc..6c458c8656b8 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -381,7 +381,7 @@ again: fput(vp->file); } if (vp->remove->anon_vma) - anon_vma_merge(vp->vma, vp->remove); + unlink_anon_vmas(vp->remove); mm->map_count--; mpol_put(vma_policy(vp->remove)); if (!vp->remove2) diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 9f0a9f5ed0fe..93e5792306d9 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -1265,11 +1265,6 @@ static inline void i_mmap_unlock_write(struct address_space *mapping) { } -static inline void anon_vma_merge(struct vm_area_struct *vma, - struct vm_area_struct *next) -{ -} - static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, unsigned long end, -- cgit v1.2.3 From d17f02417a337de0a0c6e763e938ee5e41a97c3d Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sun, 18 Jan 2026 14:50:45 +0000 Subject: mm/rmap: separate out fork-only logic on anon_vma_clone() Specify which operation is being performed to anon_vma_clone(), which allows us to do checks specific to each operation type, as well as to separate out and make clear that the anon_vma reuse logic is absolutely specific to fork only. This opens the door to further refactorings and refinements later as we have more information to work with. Link: https://lkml.kernel.org/r/cf7da7a2d973cdc72a1b80dd9a73260519e8fa9f.1768746221.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Suren Baghdasaryan Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: Harry Yoo Cc: Jann Horn Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Rik van Riel Cc: Shakeel Butt Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/internal.h | 11 +++++- mm/rmap.c | 74 +++++++++++++++++++++++++++------------- mm/vma.c | 6 ++-- tools/testing/vma/vma_internal.h | 11 +++++- 4 files changed, 74 insertions(+), 28 deletions(-) (limited to 'tools/testing') diff --git a/mm/internal.h b/mm/internal.h index aac4ec53fe15..5585059f0209 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -244,7 +244,16 @@ static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) struct anon_vma *folio_get_anon_vma(const struct folio *folio); -int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src); +/* Operations which modify VMAs. */ +enum vma_operation { + VMA_OP_SPLIT, + VMA_OP_MERGE_UNFAULTED, + VMA_OP_REMAP, + VMA_OP_FORK, +}; + +int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src, + enum vma_operation operation); int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma); int __anon_vma_prepare(struct vm_area_struct *vma); void unlink_anon_vmas(struct vm_area_struct *vma); diff --git a/mm/rmap.c b/mm/rmap.c index a5ce9163454a..6ddbf58111ff 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -232,12 +232,13 @@ int __anon_vma_prepare(struct vm_area_struct *vma) } static void check_anon_vma_clone(struct vm_area_struct *dst, - struct vm_area_struct *src) + struct vm_area_struct *src, + enum vma_operation operation) { /* The write lock must be held. */ mmap_assert_write_locked(src->vm_mm); - /* If not a fork (implied by dst->anon_vma) then must be on same mm. */ - VM_WARN_ON_ONCE(dst->anon_vma && dst->vm_mm != src->vm_mm); + /* If not a fork then must be on same mm. */ + VM_WARN_ON_ONCE(operation != VMA_OP_FORK && dst->vm_mm != src->vm_mm); /* If we have anything to do src->anon_vma must be provided. */ VM_WARN_ON_ONCE(!src->anon_vma && !list_empty(&src->anon_vma_chain)); @@ -249,6 +250,40 @@ static void check_anon_vma_clone(struct vm_area_struct *dst, * must be the same across dst and src. */ VM_WARN_ON_ONCE(dst->anon_vma && dst->anon_vma != src->anon_vma); + /* + * Essentially equivalent to above - if not a no-op, we should expect + * dst->anon_vma to be set for everything except a fork. + */ + VM_WARN_ON_ONCE(operation != VMA_OP_FORK && src->anon_vma && + !dst->anon_vma); + /* For the anon_vma to be compatible, it can only be singular. */ + VM_WARN_ON_ONCE(operation == VMA_OP_MERGE_UNFAULTED && + !list_is_singular(&src->anon_vma_chain)); +#ifdef CONFIG_PER_VMA_LOCK + /* Only merging an unfaulted VMA leaves the destination attached. */ + VM_WARN_ON_ONCE(operation != VMA_OP_MERGE_UNFAULTED && + vma_is_attached(dst)); +#endif +} + +static void maybe_reuse_anon_vma(struct vm_area_struct *dst, + struct anon_vma *anon_vma) +{ + /* If already populated, nothing to do.*/ + if (dst->anon_vma) + return; + + /* + * We reuse an anon_vma if any linking VMAs were unmapped and it has + * only a single child at most. + */ + if (anon_vma->num_active_vmas > 0) + return; + if (anon_vma->num_children > 1) + return; + + dst->anon_vma = anon_vma; + anon_vma->num_active_vmas++; } static void cleanup_partial_anon_vmas(struct vm_area_struct *vma); @@ -258,6 +293,7 @@ static void cleanup_partial_anon_vmas(struct vm_area_struct *vma); * all of the anon_vma objects contained within @src anon_vma_chain's. * @dst: The destination VMA with an empty anon_vma_chain. * @src: The source VMA we wish to duplicate. + * @operation: The type of operation which resulted in the clone. * * This is the heart of the VMA side of the anon_vma implementation - we invoke * this function whenever we need to set up a new VMA's anon_vma state. @@ -280,17 +316,17 @@ static void cleanup_partial_anon_vmas(struct vm_area_struct *vma); * * Returns: 0 on success, -ENOMEM on failure. */ -int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) +int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src, + enum vma_operation operation) { struct anon_vma_chain *avc, *pavc; + struct anon_vma *active_anon_vma = src->anon_vma; - check_anon_vma_clone(dst, src); + check_anon_vma_clone(dst, src, operation); - if (!src->anon_vma) + if (!active_anon_vma) return 0; - check_anon_vma_clone(dst, src); - /* * Allocate AVCs. We don't need an anon_vma lock for this as we * are not updating the anon_vma rbtree nor are we changing @@ -318,22 +354,14 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) struct anon_vma *anon_vma = avc->anon_vma; anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); - - /* - * Reuse existing anon_vma if it has no vma and only one - * anon_vma child. - * - * Root anon_vma is never reused: - * it has self-parent reference and at least one child. - */ - if (!dst->anon_vma && src->anon_vma && - anon_vma->num_children < 2 && - anon_vma->num_active_vmas == 0) - dst->anon_vma = anon_vma; + if (operation == VMA_OP_FORK) + maybe_reuse_anon_vma(dst, anon_vma); } - if (dst->anon_vma) + + if (operation != VMA_OP_FORK) dst->anon_vma->num_active_vmas++; - anon_vma_unlock_write(src->anon_vma); + + anon_vma_unlock_write(active_anon_vma); return 0; enomem_failure: @@ -372,7 +400,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) * First, attach the new VMA to the parent VMA's anon_vmas, * so rmap can find non-COWed pages in child processes. */ - rc = anon_vma_clone(vma, pvma); + rc = anon_vma_clone(vma, pvma, VMA_OP_FORK); /* An error arose or an existing anon_vma was reused, all done then. */ if (rc || vma->anon_vma) { put_anon_vma(anon_vma); diff --git a/mm/vma.c b/mm/vma.c index 6c458c8656b8..3dbe414eff89 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -530,7 +530,7 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, if (err) goto out_free_vmi; - err = anon_vma_clone(new, vma); + err = anon_vma_clone(new, vma, VMA_OP_SPLIT); if (err) goto out_free_mpol; @@ -628,7 +628,7 @@ static int dup_anon_vma(struct vm_area_struct *dst, vma_assert_write_locked(dst); dst->anon_vma = src->anon_vma; - ret = anon_vma_clone(dst, src); + ret = anon_vma_clone(dst, src, VMA_OP_MERGE_UNFAULTED); if (ret) return ret; @@ -1901,7 +1901,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, vma_set_range(new_vma, addr, addr + len, pgoff); if (vma_dup_policy(vma, new_vma)) goto out_free_vma; - if (anon_vma_clone(new_vma, vma)) + if (anon_vma_clone(new_vma, vma, VMA_OP_REMAP)) goto out_free_mempol; if (new_vma->vm_file) get_file(new_vma->vm_file); diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 93e5792306d9..7fa56dcc53a6 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -600,6 +600,14 @@ struct mmap_action { bool hide_from_rmap_until_complete :1; }; +/* Operations which modify VMAs. */ +enum vma_operation { + VMA_OP_SPLIT, + VMA_OP_MERGE_UNFAULTED, + VMA_OP_REMAP, + VMA_OP_FORK, +}; + /* * Describes a VMA that is about to be mmap()'ed. Drivers may choose to * manipulate mutable fields which will cause those fields to be updated in the @@ -1157,7 +1165,8 @@ static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_stru return 0; } -static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) +static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src, + enum vma_operation operation) { /* For testing purposes. We indicate that an anon_vma has been cloned. */ if (src->anon_vma != NULL) { -- cgit v1.2.3 From 873e7de9f9a3b67b08b380057b2c7828b0d78cae Mon Sep 17 00:00:00 2001 From: Bobby Eshleman Date: Wed, 21 Jan 2026 14:11:44 -0800 Subject: selftests/vsock: increase timeout to 1200 Increase the timeout from 300s to 1200s. On a modern bare metal server my last run showed the new set of tests taking ~400s. Multiply by an (arbitrary) factor of three to account for slower/nested runners. Reviewed-by: Stefano Garzarella Signed-off-by: Bobby Eshleman Link: https://patch.msgid.link/20260121-vsock-vmtest-v16-4-2859a7512097@meta.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- tools/testing/selftests/vsock/settings | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/vsock/settings b/tools/testing/selftests/vsock/settings index 694d70710ff0..79b65bdf05db 100644 --- a/tools/testing/selftests/vsock/settings +++ b/tools/testing/selftests/vsock/settings @@ -1 +1 @@ -timeout=300 +timeout=1200 -- cgit v1.2.3 From 423ec6383edba92e78abbb99a776147b3fe7b2ca Mon Sep 17 00:00:00 2001 From: Bobby Eshleman Date: Wed, 21 Jan 2026 14:11:45 -0800 Subject: selftests/vsock: add namespace helpers to vmtest.sh Add functions for initializing namespaces with the different vsock NS modes. Callers can use add_namespaces() and del_namespaces() to create namespaces global0, global1, local0, and local1. The add_namespaces() function initializes global0, local0, etc... with their respective vsock NS mode by toggling child_ns_mode before creating the namespace. Remove namespaces upon exiting the program in cleanup(). This is unlikely to be needed for a healthy run, but it is useful for tests that are manually killed mid-test. This patch is in preparation for later namespace tests. Reviewed-by: Stefano Garzarella Signed-off-by: Bobby Eshleman Link: https://patch.msgid.link/20260121-vsock-vmtest-v16-5-2859a7512097@meta.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- tools/testing/selftests/vsock/vmtest.sh | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh index c7b270dd77a9..c2bdc293b94c 100755 --- a/tools/testing/selftests/vsock/vmtest.sh +++ b/tools/testing/selftests/vsock/vmtest.sh @@ -49,6 +49,7 @@ readonly TEST_DESCS=( ) readonly USE_SHARED_VM=(vm_server_host_client vm_client_host_server vm_loopback) +readonly NS_MODES=("local" "global") VERBOSE=0 @@ -103,6 +104,36 @@ check_result() { fi } +add_namespaces() { + local orig_mode + orig_mode=$(cat /proc/sys/net/vsock/child_ns_mode) + + for mode in "${NS_MODES[@]}"; do + echo "${mode}" > /proc/sys/net/vsock/child_ns_mode + ip netns add "${mode}0" 2>/dev/null + ip netns add "${mode}1" 2>/dev/null + done + + echo "${orig_mode}" > /proc/sys/net/vsock/child_ns_mode +} + +init_namespaces() { + for mode in "${NS_MODES[@]}"; do + # we need lo for qemu port forwarding + ip netns exec "${mode}0" ip link set dev lo up + ip netns exec "${mode}1" ip link set dev lo up + done +} + +del_namespaces() { + for mode in "${NS_MODES[@]}"; do + ip netns del "${mode}0" &>/dev/null + ip netns del "${mode}1" &>/dev/null + log_host "removed ns ${mode}0" + log_host "removed ns ${mode}1" + done +} + vm_ssh() { ssh -q -o UserKnownHostsFile=/dev/null -p ${SSH_HOST_PORT} localhost "$@" return $? @@ -110,6 +141,7 @@ vm_ssh() { cleanup() { terminate_pidfiles "${!PIDFILES[@]}" + del_namespaces } check_args() { -- cgit v1.2.3 From fd1b41725d585f29029b8d8610a155f26727c18e Mon Sep 17 00:00:00 2001 From: Bobby Eshleman Date: Wed, 21 Jan 2026 14:11:46 -0800 Subject: selftests/vsock: prepare vm management helpers for namespaces Add namespace support to vm management, ssh helpers, and vsock_test wrapper functions. This enables running VMs and test helpers in specific namespaces, which is required for upcoming namespace isolation tests. The functions still work correctly within the init ns, though the caller must now pass "init_ns" explicitly. No functional changes for existing tests. All have been updated to pass "init_ns" explicitly. Affected functions (such as vm_start() and vm_ssh()) now wrap their commands with 'ip netns exec' when executing commands in non-init namespaces. Reviewed-by: Stefano Garzarella Signed-off-by: Bobby Eshleman Link: https://patch.msgid.link/20260121-vsock-vmtest-v16-6-2859a7512097@meta.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- tools/testing/selftests/vsock/vmtest.sh | 101 ++++++++++++++++++++++---------- 1 file changed, 69 insertions(+), 32 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh index c2bdc293b94c..c4d73dd0a4cf 100755 --- a/tools/testing/selftests/vsock/vmtest.sh +++ b/tools/testing/selftests/vsock/vmtest.sh @@ -135,7 +135,18 @@ del_namespaces() { } vm_ssh() { - ssh -q -o UserKnownHostsFile=/dev/null -p ${SSH_HOST_PORT} localhost "$@" + local ns_exec + + if [[ "${1}" == init_ns ]]; then + ns_exec="" + else + ns_exec="ip netns exec ${1}" + fi + + shift + + ${ns_exec} ssh -q -o UserKnownHostsFile=/dev/null -p "${SSH_HOST_PORT}" localhost "$@" + return $? } @@ -258,10 +269,12 @@ terminate_pidfiles() { vm_start() { local pidfile=$1 + local ns=$2 local logfile=/dev/null local verbose_opt="" local kernel_opt="" local qemu_opts="" + local ns_exec="" local qemu qemu=$(command -v "${QEMU}") @@ -282,7 +295,11 @@ vm_start() { kernel_opt="${KERNEL_CHECKOUT}" fi - vng \ + if [[ "${ns}" != "init_ns" ]]; then + ns_exec="ip netns exec ${ns}" + fi + + ${ns_exec} vng \ --run \ ${kernel_opt} \ ${verbose_opt} \ @@ -297,6 +314,7 @@ vm_start() { } vm_wait_for_ssh() { + local ns=$1 local i i=0 @@ -304,7 +322,8 @@ vm_wait_for_ssh() { if [[ ${i} -gt ${WAIT_PERIOD_MAX} ]]; then die "Timed out waiting for guest ssh" fi - if vm_ssh -- true; then + + if vm_ssh "${ns}" -- true; then break fi i=$(( i + 1 )) @@ -338,30 +357,41 @@ wait_for_listener() } vm_wait_for_listener() { - local port=$1 + local ns=$1 + local port=$2 - vm_ssh <&1 | log_guest rc=$? else - vm_ssh -- "${VSOCK_TEST}" \ + vm_ssh "${ns}" -- "${VSOCK_TEST}" \ --mode=server \ --peer-cid="${cid}" \ --control-port="${port}" \ @@ -381,7 +411,7 @@ vm_vsock_test() { return $rc fi - vm_wait_for_listener "${port}" + vm_wait_for_listener "${ns}" "${port}" rc=$? fi set +o pipefail @@ -390,22 +420,28 @@ vm_vsock_test() { } host_vsock_test() { - local host=$1 - local cid=$2 - local port=$3 + local ns=$1 + local host=$2 + local cid=$3 + local port=$4 local rc + local cmd="${VSOCK_TEST}" + if [[ "${ns}" != "init_ns" ]]; then + cmd="ip netns exec ${ns} ${cmd}" + fi + # log output and use pipefail to respect vsock_test errors set -o pipefail if [[ "${host}" != server ]]; then - ${VSOCK_TEST} \ + ${cmd} \ --mode=client \ --peer-cid="${cid}" \ --control-host="${host}" \ --control-port="${port}" 2>&1 | log_host rc=$? else - ${VSOCK_TEST} \ + ${cmd} \ --mode=server \ --peer-cid="${cid}" \ --control-port="${port}" 2>&1 | log_host & @@ -416,7 +452,7 @@ host_vsock_test() { return $rc fi - host_wait_for_listener "${port}" + host_wait_for_listener "${ns}" "${port}" rc=$? fi set +o pipefail @@ -460,11 +496,11 @@ log_guest() { } test_vm_server_host_client() { - if ! vm_vsock_test "server" 2 "${TEST_GUEST_PORT}"; then + if ! vm_vsock_test "init_ns" "server" 2 "${TEST_GUEST_PORT}"; then return "${KSFT_FAIL}" fi - if ! host_vsock_test "127.0.0.1" "${VSOCK_CID}" "${TEST_HOST_PORT}"; then + if ! host_vsock_test "init_ns" "127.0.0.1" "${VSOCK_CID}" "${TEST_HOST_PORT}"; then return "${KSFT_FAIL}" fi @@ -472,11 +508,11 @@ test_vm_server_host_client() { } test_vm_client_host_server() { - if ! host_vsock_test "server" "${VSOCK_CID}" "${TEST_HOST_PORT_LISTENER}"; then + if ! host_vsock_test "init_ns" "server" "${VSOCK_CID}" "${TEST_HOST_PORT_LISTENER}"; then return "${KSFT_FAIL}" fi - if ! vm_vsock_test "10.0.2.2" 2 "${TEST_HOST_PORT_LISTENER}"; then + if ! vm_vsock_test "init_ns" "10.0.2.2" 2 "${TEST_HOST_PORT_LISTENER}"; then return "${KSFT_FAIL}" fi @@ -486,13 +522,14 @@ test_vm_client_host_server() { test_vm_loopback() { local port=60000 # non-forwarded local port - vm_ssh -- modprobe vsock_loopback &> /dev/null || : + vm_ssh "init_ns" -- modprobe vsock_loopback &> /dev/null || : - if ! vm_vsock_test "server" 1 "${port}"; then + if ! vm_vsock_test "init_ns" "server" 1 "${port}"; then return "${KSFT_FAIL}" fi - if ! vm_vsock_test "127.0.0.1" 1 "${port}"; then + + if ! vm_vsock_test "init_ns" "127.0.0.1" 1 "${port}"; then return "${KSFT_FAIL}" fi @@ -550,8 +587,8 @@ run_shared_vm_test() { host_oops_cnt_before=$(dmesg | grep -c -i 'Oops') host_warn_cnt_before=$(dmesg --level=warn | grep -c -i 'vsock') - vm_oops_cnt_before=$(vm_ssh -- dmesg | grep -c -i 'Oops') - vm_warn_cnt_before=$(vm_ssh -- dmesg --level=warn | grep -c -i 'vsock') + vm_oops_cnt_before=$(vm_ssh "init_ns" -- dmesg | grep -c -i 'Oops') + vm_warn_cnt_before=$(vm_ssh "init_ns" -- dmesg --level=warn | grep -c -i 'vsock') name=$(echo "${1}" | awk '{ print $1 }') eval test_"${name}" @@ -569,13 +606,13 @@ run_shared_vm_test() { rc=$KSFT_FAIL fi - vm_oops_cnt_after=$(vm_ssh -- dmesg | grep -i 'Oops' | wc -l) + vm_oops_cnt_after=$(vm_ssh "init_ns" -- dmesg | grep -i 'Oops' | wc -l) if [[ ${vm_oops_cnt_after} -gt ${vm_oops_cnt_before} ]]; then echo "FAIL: kernel oops detected on vm" | log_host rc=$KSFT_FAIL fi - vm_warn_cnt_after=$(vm_ssh -- dmesg --level=warn | grep -c -i 'vsock') + vm_warn_cnt_after=$(vm_ssh "init_ns" -- dmesg --level=warn | grep -c -i 'vsock') if [[ ${vm_warn_cnt_after} -gt ${vm_warn_cnt_before} ]]; then echo "FAIL: kernel warning detected on vm" | log_host rc=$KSFT_FAIL @@ -621,8 +658,8 @@ cnt_total=0 if shared_vm_tests_requested "${ARGS[@]}"; then log_host "Booting up VM" pidfile="$(create_pidfile)" - vm_start "${pidfile}" - vm_wait_for_ssh + vm_start "${pidfile}" "init_ns" + vm_wait_for_ssh "init_ns" log_host "VM booted up" run_shared_vm_tests "${ARGS[@]}" -- cgit v1.2.3 From 4e870ac81df7e9628bdc08ff6f957a42e80582ee Mon Sep 17 00:00:00 2001 From: Bobby Eshleman Date: Wed, 21 Jan 2026 14:11:47 -0800 Subject: selftests/vsock: add vm_dmesg_{warn,oops}_count() helpers These functions are reused by the VM tests to collect and compare dmesg warnings and oops counts. The future VM-specific tests use them heavily. This patches relies on vm_ssh() already supporting namespaces. Reviewed-by: Stefano Garzarella Signed-off-by: Bobby Eshleman Link: https://patch.msgid.link/20260121-vsock-vmtest-v16-7-2859a7512097@meta.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- tools/testing/selftests/vsock/vmtest.sh | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh index c4d73dd0a4cf..4b5929ffc9eb 100755 --- a/tools/testing/selftests/vsock/vmtest.sh +++ b/tools/testing/selftests/vsock/vmtest.sh @@ -380,6 +380,17 @@ host_wait_for_listener() { fi } +vm_dmesg_oops_count() { + local ns=$1 + + vm_ssh "${ns}" -- dmesg 2>/dev/null | grep -c -i 'Oops' +} + +vm_dmesg_warn_count() { + local ns=$1 + + vm_ssh "${ns}" -- dmesg --level=warn 2>/dev/null | grep -c -i 'vsock' +} vm_vsock_test() { local ns=$1 @@ -587,8 +598,8 @@ run_shared_vm_test() { host_oops_cnt_before=$(dmesg | grep -c -i 'Oops') host_warn_cnt_before=$(dmesg --level=warn | grep -c -i 'vsock') - vm_oops_cnt_before=$(vm_ssh "init_ns" -- dmesg | grep -c -i 'Oops') - vm_warn_cnt_before=$(vm_ssh "init_ns" -- dmesg --level=warn | grep -c -i 'vsock') + vm_oops_cnt_before=$(vm_dmesg_oops_count "init_ns") + vm_warn_cnt_before=$(vm_dmesg_warn_count "init_ns") name=$(echo "${1}" | awk '{ print $1 }') eval test_"${name}" @@ -606,13 +617,13 @@ run_shared_vm_test() { rc=$KSFT_FAIL fi - vm_oops_cnt_after=$(vm_ssh "init_ns" -- dmesg | grep -i 'Oops' | wc -l) + vm_oops_cnt_after=$(vm_dmesg_oops_count "init_ns") if [[ ${vm_oops_cnt_after} -gt ${vm_oops_cnt_before} ]]; then echo "FAIL: kernel oops detected on vm" | log_host rc=$KSFT_FAIL fi - vm_warn_cnt_after=$(vm_ssh "init_ns" -- dmesg --level=warn | grep -c -i 'vsock') + vm_warn_cnt_after=$(vm_dmesg_warn_count "init_ns") if [[ ${vm_warn_cnt_after} -gt ${vm_warn_cnt_before} ]]; then echo "FAIL: kernel warning detected on vm" | log_host rc=$KSFT_FAIL -- cgit v1.2.3 From 7418f3bb3aa289fbf52f93b551e79ba647371f51 Mon Sep 17 00:00:00 2001 From: Bobby Eshleman Date: Wed, 21 Jan 2026 14:11:48 -0800 Subject: selftests/vsock: use ss to wait for listeners instead of /proc/net Replace /proc/net parsing with ss(8) for detecting listening sockets in wait_for_listener() functions and add support for TCP, VSOCK, and Unix socket protocols. The previous implementation parsed /proc/net/tcp using awk to detect listening sockets, but this approach could not support vsock because vsock does not export socket information to /proc/net/. Instead, use ss so that we can detect listeners on tcp, vsock, and unix. The protocol parameter is now required for all wait_for_listener family functions (wait_for_listener, vm_wait_for_listener, host_wait_for_listener) to explicitly specify which socket type to wait for. ss is added to the dependency check in check_deps(). Reviewed-by: Stefano Garzarella Signed-off-by: Bobby Eshleman Link: https://patch.msgid.link/20260121-vsock-vmtest-v16-8-2859a7512097@meta.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- tools/testing/selftests/vsock/vmtest.sh | 47 +++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 17 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh index 4b5929ffc9eb..0e681d4c3a15 100755 --- a/tools/testing/selftests/vsock/vmtest.sh +++ b/tools/testing/selftests/vsock/vmtest.sh @@ -182,7 +182,7 @@ check_args() { } check_deps() { - for dep in vng ${QEMU} busybox pkill ssh; do + for dep in vng ${QEMU} busybox pkill ssh ss; do if [[ ! -x $(command -v "${dep}") ]]; then echo -e "skip: dependency ${dep} not found!\n" exit "${KSFT_SKIP}" @@ -337,21 +337,32 @@ wait_for_listener() local port=$1 local interval=$2 local max_intervals=$3 - local protocol=tcp - local pattern + local protocol=$4 local i - pattern=":$(printf "%04X" "${port}") " - - # for tcp protocol additionally check the socket state - [ "${protocol}" = "tcp" ] && pattern="${pattern}0A" - for i in $(seq "${max_intervals}"); do - if awk -v pattern="${pattern}" \ - 'BEGIN {rc=1} $2" "$4 ~ pattern {rc=0} END {exit rc}' \ - /proc/net/"${protocol}"*; then + case "${protocol}" in + tcp) + if ss --listening --tcp --numeric | grep -q ":${port} "; then + break + fi + ;; + vsock) + if ss --listening --vsock --numeric | grep -q ":${port} "; then + break + fi + ;; + unix) + # For unix sockets, port is actually the socket path + if ss --listening --unix | grep -q "${port}"; then + break + fi + ;; + *) + echo "Unknown protocol: ${protocol}" >&2 break - fi + ;; + esac sleep "${interval}" done } @@ -359,23 +370,25 @@ wait_for_listener() vm_wait_for_listener() { local ns=$1 local port=$2 + local protocol=$3 vm_ssh "${ns}" < Date: Wed, 21 Jan 2026 14:11:49 -0800 Subject: selftests/vsock: add tests for proc sys vsock ns_mode Add tests for the /proc/sys/net/vsock/{ns_mode,child_ns_mode} interfaces. Namely, that they accept/report "global" and "local" strings and enforce their access policies. Start a convention of commenting the test name over the test description. Add test name comments over test descriptions that existed before this convention. Add a check_netns() function that checks if the test requires namespaces and if the current kernel supports namespaces. Skip tests that require namespaces if the system does not have namespace support. This patch is the first to add tests that do *not* re-use the same shared VM. For that reason, it adds a run_ns_tests() function to run these tests and filter out the shared VM tests. Reviewed-by: Stefano Garzarella Signed-off-by: Bobby Eshleman Link: https://patch.msgid.link/20260121-vsock-vmtest-v16-9-2859a7512097@meta.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- tools/testing/selftests/vsock/vmtest.sh | 140 +++++++++++++++++++++++++++++++- 1 file changed, 138 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh index 0e681d4c3a15..38785a102236 100755 --- a/tools/testing/selftests/vsock/vmtest.sh +++ b/tools/testing/selftests/vsock/vmtest.sh @@ -41,14 +41,38 @@ readonly KERNEL_CMDLINE="\ virtme.ssh virtme_ssh_channel=tcp virtme_ssh_user=$USER \ " readonly LOG=$(mktemp /tmp/vsock_vmtest_XXXX.log) -readonly TEST_NAMES=(vm_server_host_client vm_client_host_server vm_loopback) + +# Namespace tests must use the ns_ prefix. This is checked in check_netns() and +# is used to determine if a test needs namespace setup before test execution. +readonly TEST_NAMES=( + vm_server_host_client + vm_client_host_server + vm_loopback + ns_host_vsock_ns_mode_ok + ns_host_vsock_child_ns_mode_ok +) readonly TEST_DESCS=( + # vm_server_host_client "Run vsock_test in server mode on the VM and in client mode on the host." + + # vm_client_host_server "Run vsock_test in client mode on the VM and in server mode on the host." + + # vm_loopback "Run vsock_test using the loopback transport in the VM." + + # ns_host_vsock_ns_mode_ok + "Check /proc/sys/net/vsock/ns_mode strings on the host." + + # ns_host_vsock_child_ns_mode_ok + "Check /proc/sys/net/vsock/ns_mode is read-only and child_ns_mode is writable." ) -readonly USE_SHARED_VM=(vm_server_host_client vm_client_host_server vm_loopback) +readonly USE_SHARED_VM=( + vm_server_host_client + vm_client_host_server + vm_loopback +) readonly NS_MODES=("local" "global") VERBOSE=0 @@ -196,6 +220,20 @@ check_deps() { fi } +check_netns() { + local tname=$1 + + # If the test requires NS support, check if NS support exists + # using /proc/self/ns + if [[ "${tname}" =~ ^ns_ ]] && + [[ ! -e /proc/self/ns ]]; then + log_host "No NS support detected for test ${tname}" + return 1 + fi + + return 0 +} + check_vng() { local tested_versions local version @@ -519,6 +557,54 @@ log_guest() { LOG_PREFIX=guest log "$@" } +ns_get_mode() { + local ns=$1 + + ip netns exec "${ns}" cat /proc/sys/net/vsock/ns_mode 2>/dev/null +} + +test_ns_host_vsock_ns_mode_ok() { + for mode in "${NS_MODES[@]}"; do + local actual + + actual=$(ns_get_mode "${mode}0") + if [[ "${actual}" != "${mode}" ]]; then + log_host "expected mode ${mode}, got ${actual}" + return "${KSFT_FAIL}" + fi + done + + return "${KSFT_PASS}" +} + +test_ns_host_vsock_child_ns_mode_ok() { + local orig_mode + local rc + + orig_mode=$(cat /proc/sys/net/vsock/child_ns_mode) + + rc="${KSFT_PASS}" + for mode in "${NS_MODES[@]}"; do + local ns="${mode}0" + + if echo "${mode}" 2>/dev/null > /proc/sys/net/vsock/ns_mode; then + log_host "ns_mode should be read-only but write succeeded" + rc="${KSFT_FAIL}" + continue + fi + + if ! echo "${mode}" > /proc/sys/net/vsock/child_ns_mode; then + log_host "child_ns_mode should be writable to ${mode}" + rc="${KSFT_FAIL}" + continue + fi + done + + echo "${orig_mode}" > /proc/sys/net/vsock/child_ns_mode + + return "${rc}" +} + test_vm_server_host_client() { if ! vm_vsock_test "init_ns" "server" 2 "${TEST_GUEST_PORT}"; then return "${KSFT_FAIL}" @@ -592,6 +678,11 @@ run_shared_vm_tests() { continue fi + if ! check_netns "${arg}"; then + check_result "${KSFT_SKIP}" "${arg}" + continue + fi + run_shared_vm_test "${arg}" check_result "$?" "${arg}" done @@ -645,6 +736,49 @@ run_shared_vm_test() { return "${rc}" } +run_ns_tests() { + for arg in "${ARGS[@]}"; do + if shared_vm_test "${arg}"; then + continue + fi + + if ! check_netns "${arg}"; then + check_result "${KSFT_SKIP}" "${arg}" + continue + fi + + add_namespaces + + name=$(echo "${arg}" | awk '{ print $1 }') + log_host "Executing test_${name}" + + host_oops_before=$(dmesg 2>/dev/null | grep -c -i 'Oops') + host_warn_before=$(dmesg --level=warn 2>/dev/null | grep -c -i 'vsock') + eval test_"${name}" + rc=$? + + host_oops_after=$(dmesg 2>/dev/null | grep -c -i 'Oops') + if [[ "${host_oops_after}" -gt "${host_oops_before}" ]]; then + echo "FAIL: kernel oops detected on host" | log_host + check_result "${KSFT_FAIL}" "${name}" + del_namespaces + continue + fi + + host_warn_after=$(dmesg --level=warn 2>/dev/null | grep -c -i 'vsock') + if [[ "${host_warn_after}" -gt "${host_warn_before}" ]]; then + echo "FAIL: kernel warning detected on host" | log_host + check_result "${KSFT_FAIL}" "${name}" + del_namespaces + continue + fi + + check_result "${rc}" "${name}" + + del_namespaces + done +} + BUILD=0 QEMU="qemu-system-$(uname -m)" @@ -690,6 +824,8 @@ if shared_vm_tests_requested "${ARGS[@]}"; then terminate_pidfiles "${pidfile}" fi +run_ns_tests "${ARGS[@]}" + echo "SUMMARY: PASS=${cnt_pass} SKIP=${cnt_skip} FAIL=${cnt_fail}" echo "Log: ${LOG}" -- cgit v1.2.3 From 605caec5adc2956263a86b48eecfc52ee5c95dae Mon Sep 17 00:00:00 2001 From: Bobby Eshleman Date: Wed, 21 Jan 2026 14:11:50 -0800 Subject: selftests/vsock: add namespace tests for CID collisions Add tests to verify CID collision rules across different vsock namespace modes. 1. Two VMs with the same CID cannot start in different global namespaces (ns_global_same_cid_fails) 2. Two VMs with the same CID can start in different local namespaces (ns_local_same_cid_ok) 3. VMs with the same CID can coexist when one is in a global namespace and another is in a local namespace (ns_global_local_same_cid_ok and ns_local_global_same_cid_ok) The tests ns_global_local_same_cid_ok and ns_local_global_same_cid_ok make sure that ordering does not matter. The tests use a shared helper function namespaces_can_boot_same_cid() that attempts to start two VMs with identical CIDs in the specified namespaces and verifies whether VM initialization failed or succeeded. Reviewed-by: Stefano Garzarella Signed-off-by: Bobby Eshleman Link: https://patch.msgid.link/20260121-vsock-vmtest-v16-10-2859a7512097@meta.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- tools/testing/selftests/vsock/vmtest.sh | 78 +++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh index 38785a102236..1bf537410ea6 100755 --- a/tools/testing/selftests/vsock/vmtest.sh +++ b/tools/testing/selftests/vsock/vmtest.sh @@ -50,6 +50,10 @@ readonly TEST_NAMES=( vm_loopback ns_host_vsock_ns_mode_ok ns_host_vsock_child_ns_mode_ok + ns_global_same_cid_fails + ns_local_same_cid_ok + ns_global_local_same_cid_ok + ns_local_global_same_cid_ok ) readonly TEST_DESCS=( # vm_server_host_client @@ -66,6 +70,18 @@ readonly TEST_DESCS=( # ns_host_vsock_child_ns_mode_ok "Check /proc/sys/net/vsock/ns_mode is read-only and child_ns_mode is writable." + + # ns_global_same_cid_fails + "Check QEMU fails to start two VMs with same CID in two different global namespaces." + + # ns_local_same_cid_ok + "Check QEMU successfully starts two VMs with same CID in two different local namespaces." + + # ns_global_local_same_cid_ok + "Check QEMU successfully starts one VM in a global ns and then another VM in a local ns with the same CID." + + # ns_local_global_same_cid_ok + "Check QEMU successfully starts one VM in a local ns and then another VM in a global ns with the same CID." ) readonly USE_SHARED_VM=( @@ -577,6 +593,68 @@ test_ns_host_vsock_ns_mode_ok() { return "${KSFT_PASS}" } +namespaces_can_boot_same_cid() { + local ns0=$1 + local ns1=$2 + local pidfile1 pidfile2 + local rc + + pidfile1="$(create_pidfile)" + + # The first VM should be able to start. If it can't then we have + # problems and need to return non-zero. + if ! vm_start "${pidfile1}" "${ns0}"; then + return 1 + fi + + pidfile2="$(create_pidfile)" + vm_start "${pidfile2}" "${ns1}" + rc=$? + terminate_pidfiles "${pidfile1}" "${pidfile2}" + + return "${rc}" +} + +test_ns_global_same_cid_fails() { + init_namespaces + + if namespaces_can_boot_same_cid "global0" "global1"; then + return "${KSFT_FAIL}" + fi + + return "${KSFT_PASS}" +} + +test_ns_local_global_same_cid_ok() { + init_namespaces + + if namespaces_can_boot_same_cid "local0" "global0"; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +test_ns_global_local_same_cid_ok() { + init_namespaces + + if namespaces_can_boot_same_cid "global0" "local0"; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +test_ns_local_same_cid_ok() { + init_namespaces + + if namespaces_can_boot_same_cid "local0" "local1"; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + test_ns_host_vsock_child_ns_mode_ok() { local orig_mode local rc -- cgit v1.2.3 From 0424ee7c3a1721c099544f36580cf6dd1661856d Mon Sep 17 00:00:00 2001 From: Bobby Eshleman Date: Wed, 21 Jan 2026 14:11:51 -0800 Subject: selftests/vsock: add tests for host <-> vm connectivity with namespaces Add tests to validate namespace correctness using vsock_test and socat. The vsock_test tool is used to validate expected success tests, but socat is used for expected failure tests. socat is used to ensure that connections are rejected outright instead of failing due to some other socket behavior (as tested in vsock_test). Additionally, socat is already required for tunneling TCP traffic from vsock_test. Using only one of the vsock_test tests like 'test_stream_client_close_client' would have yielded a similar result, but doing so wouldn't remove the socat dependency. Additionally, check for the dependency socat. socat needs special handling beyond just checking if it is on the path because it must be compiled with support for both vsock and unix. The function check_socat() checks that this support exists. Add more padding to test name printf strings because the tests added in this patch would otherwise overflow. Add vm_dmesg_* helpers to encapsulate checking dmesg for oops and warnings. Add ability to pass extra args to host-side vsock_test so that tests that cause false positives may be skipped with arg --skip. Reviewed-by: Stefano Garzarella Signed-off-by: Bobby Eshleman Link: https://patch.msgid.link/20260121-vsock-vmtest-v16-11-2859a7512097@meta.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- tools/testing/selftests/vsock/vmtest.sh | 572 +++++++++++++++++++++++++++++++- 1 file changed, 568 insertions(+), 4 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh index 1bf537410ea6..a9eaf37bc31b 100755 --- a/tools/testing/selftests/vsock/vmtest.sh +++ b/tools/testing/selftests/vsock/vmtest.sh @@ -7,6 +7,7 @@ # * virtme-ng # * busybox-static (used by virtme-ng) # * qemu (used by virtme-ng) +# * socat # # shellcheck disable=SC2317,SC2119 @@ -54,6 +55,19 @@ readonly TEST_NAMES=( ns_local_same_cid_ok ns_global_local_same_cid_ok ns_local_global_same_cid_ok + ns_diff_global_host_connect_to_global_vm_ok + ns_diff_global_host_connect_to_local_vm_fails + ns_diff_global_vm_connect_to_global_host_ok + ns_diff_global_vm_connect_to_local_host_fails + ns_diff_local_host_connect_to_local_vm_fails + ns_diff_local_vm_connect_to_local_host_fails + ns_diff_global_to_local_loopback_local_fails + ns_diff_local_to_global_loopback_fails + ns_diff_local_to_local_loopback_fails + ns_diff_global_to_global_loopback_ok + ns_same_local_loopback_ok + ns_same_local_host_connect_to_local_vm_ok + ns_same_local_vm_connect_to_local_host_ok ) readonly TEST_DESCS=( # vm_server_host_client @@ -82,6 +96,45 @@ readonly TEST_DESCS=( # ns_local_global_same_cid_ok "Check QEMU successfully starts one VM in a local ns and then another VM in a global ns with the same CID." + + # ns_diff_global_host_connect_to_global_vm_ok + "Run vsock_test client in global ns with server in VM in another global ns." + + # ns_diff_global_host_connect_to_local_vm_fails + "Run socat to test a process in a global ns fails to connect to a VM in a local ns." + + # ns_diff_global_vm_connect_to_global_host_ok + "Run vsock_test client in VM in a global ns with server in another global ns." + + # ns_diff_global_vm_connect_to_local_host_fails + "Run socat to test a VM in a global ns fails to connect to a host process in a local ns." + + # ns_diff_local_host_connect_to_local_vm_fails + "Run socat to test a host process in a local ns fails to connect to a VM in another local ns." + + # ns_diff_local_vm_connect_to_local_host_fails + "Run socat to test a VM in a local ns fails to connect to a host process in another local ns." + + # ns_diff_global_to_local_loopback_local_fails + "Run socat to test a loopback vsock in a global ns fails to connect to a vsock in a local ns." + + # ns_diff_local_to_global_loopback_fails + "Run socat to test a loopback vsock in a local ns fails to connect to a vsock in a global ns." + + # ns_diff_local_to_local_loopback_fails + "Run socat to test a loopback vsock in a local ns fails to connect to a vsock in another local ns." + + # ns_diff_global_to_global_loopback_ok + "Run socat to test a loopback vsock in a global ns successfully connects to a vsock in another global ns." + + # ns_same_local_loopback_ok + "Run socat to test a loopback vsock in a local ns successfully connects to a vsock in the same ns." + + # ns_same_local_host_connect_to_local_vm_ok + "Run vsock_test client in a local ns with server in VM in same ns." + + # ns_same_local_vm_connect_to_local_host_ok + "Run vsock_test client in VM in a local ns with server in same ns." ) readonly USE_SHARED_VM=( @@ -112,7 +165,7 @@ usage() { for ((i = 0; i < ${#TEST_NAMES[@]}; i++)); do name=${TEST_NAMES[${i}]} desc=${TEST_DESCS[${i}]} - printf "\t%-35s%-35s\n" "${name}" "${desc}" + printf "\t%-55s%-35s\n" "${name}" "${desc}" done echo @@ -222,7 +275,7 @@ check_args() { } check_deps() { - for dep in vng ${QEMU} busybox pkill ssh ss; do + for dep in vng ${QEMU} busybox pkill ssh ss socat; do if [[ ! -x $(command -v "${dep}") ]]; then echo -e "skip: dependency ${dep} not found!\n" exit "${KSFT_SKIP}" @@ -273,6 +326,20 @@ check_vng() { fi } +check_socat() { + local support_string + + support_string="$(socat -V)" + + if [[ "${support_string}" != *"WITH_VSOCK 1"* ]]; then + die "err: socat is missing vsock support" + fi + + if [[ "${support_string}" != *"WITH_UNIX 1"* ]]; then + die "err: socat is missing unix support" + fi +} + handle_build() { if [[ ! "${BUILD}" -eq 1 ]]; then return @@ -321,6 +388,14 @@ terminate_pidfiles() { done } +terminate_pids() { + local pid + + for pid in "$@"; do + kill -SIGTERM "${pid}" &>/dev/null || : + done +} + vm_start() { local pidfile=$1 local ns=$2 @@ -459,6 +534,28 @@ vm_dmesg_warn_count() { vm_ssh "${ns}" -- dmesg --level=warn 2>/dev/null | grep -c -i 'vsock' } +vm_dmesg_check() { + local pidfile=$1 + local ns=$2 + local oops_before=$3 + local warn_before=$4 + local oops_after warn_after + + oops_after=$(vm_dmesg_oops_count "${ns}") + if [[ "${oops_after}" -gt "${oops_before}" ]]; then + echo "FAIL: kernel oops detected on vm in ns ${ns}" | log_host + return 1 + fi + + warn_after=$(vm_dmesg_warn_count "${ns}") + if [[ "${warn_after}" -gt "${warn_before}" ]]; then + echo "FAIL: kernel warning detected on vm in ns ${ns}" | log_host + return 1 + fi + + return 0 +} + vm_vsock_test() { local ns=$1 local host=$2 @@ -502,6 +599,8 @@ host_vsock_test() { local host=$2 local cid=$3 local port=$4 + shift 4 + local extra_args=("$@") local rc local cmd="${VSOCK_TEST}" @@ -516,13 +615,15 @@ host_vsock_test() { --mode=client \ --peer-cid="${cid}" \ --control-host="${host}" \ - --control-port="${port}" 2>&1 | log_host + --control-port="${port}" \ + "${extra_args[@]}" 2>&1 | log_host rc=$? else ${cmd} \ --mode=server \ --peer-cid="${cid}" \ - --control-port="${port}" 2>&1 | log_host & + --control-port="${port}" \ + "${extra_args[@]}" 2>&1 | log_host & rc=$? if [[ $rc -ne 0 ]]; then @@ -593,6 +694,468 @@ test_ns_host_vsock_ns_mode_ok() { return "${KSFT_PASS}" } +test_ns_diff_global_host_connect_to_global_vm_ok() { + local oops_before warn_before + local pids pid pidfile + local ns0 ns1 port + declare -a pids + local unixfile + ns0="global0" + ns1="global1" + port=1234 + local rc + + init_namespaces + + pidfile="$(create_pidfile)" + + if ! vm_start "${pidfile}" "${ns0}"; then + return "${KSFT_FAIL}" + fi + + vm_wait_for_ssh "${ns0}" + oops_before=$(vm_dmesg_oops_count "${ns0}") + warn_before=$(vm_dmesg_warn_count "${ns0}") + + unixfile=$(mktemp -u /tmp/XXXX.sock) + ip netns exec "${ns1}" \ + socat TCP-LISTEN:"${TEST_HOST_PORT}",fork \ + UNIX-CONNECT:"${unixfile}" & + pids+=($!) + host_wait_for_listener "${ns1}" "${TEST_HOST_PORT}" "tcp" + + ip netns exec "${ns0}" socat UNIX-LISTEN:"${unixfile}",fork \ + TCP-CONNECT:localhost:"${TEST_HOST_PORT}" & + pids+=($!) + host_wait_for_listener "${ns0}" "${unixfile}" "unix" + + vm_vsock_test "${ns0}" "server" 2 "${TEST_GUEST_PORT}" + vm_wait_for_listener "${ns0}" "${TEST_GUEST_PORT}" "tcp" + host_vsock_test "${ns1}" "127.0.0.1" "${VSOCK_CID}" "${TEST_HOST_PORT}" + rc=$? + + vm_dmesg_check "${pidfile}" "${ns0}" "${oops_before}" "${warn_before}" + dmesg_rc=$? + + terminate_pids "${pids[@]}" + terminate_pidfiles "${pidfile}" + + if [[ "${rc}" -ne 0 ]] || [[ "${dmesg_rc}" -ne 0 ]]; then + return "${KSFT_FAIL}" + fi + + return "${KSFT_PASS}" +} + +test_ns_diff_global_host_connect_to_local_vm_fails() { + local oops_before warn_before + local ns0="global0" + local ns1="local0" + local port=12345 + local dmesg_rc + local pidfile + local result + local pid + + init_namespaces + + outfile=$(mktemp) + + pidfile="$(create_pidfile)" + if ! vm_start "${pidfile}" "${ns1}"; then + log_host "failed to start vm (cid=${VSOCK_CID}, ns=${ns0})" + return "${KSFT_FAIL}" + fi + + vm_wait_for_ssh "${ns1}" + oops_before=$(vm_dmesg_oops_count "${ns1}") + warn_before=$(vm_dmesg_warn_count "${ns1}") + + vm_ssh "${ns1}" -- socat VSOCK-LISTEN:"${port}" STDOUT > "${outfile}" & + vm_wait_for_listener "${ns1}" "${port}" "vsock" + echo TEST | ip netns exec "${ns0}" \ + socat STDIN VSOCK-CONNECT:"${VSOCK_CID}":"${port}" 2>/dev/null + + vm_dmesg_check "${pidfile}" "${ns1}" "${oops_before}" "${warn_before}" + dmesg_rc=$? + + terminate_pidfiles "${pidfile}" + result=$(cat "${outfile}") + rm -f "${outfile}" + + if [[ "${result}" == "TEST" ]] || [[ "${dmesg_rc}" -ne 0 ]]; then + return "${KSFT_FAIL}" + fi + + return "${KSFT_PASS}" +} + +test_ns_diff_global_vm_connect_to_global_host_ok() { + local oops_before warn_before + local ns0="global0" + local ns1="global1" + local port=12345 + local unixfile + local dmesg_rc + local pidfile + local pids + local rc + + init_namespaces + + declare -a pids + + log_host "Setup socat bridge from ns ${ns0} to ns ${ns1} over port ${port}" + + unixfile=$(mktemp -u /tmp/XXXX.sock) + + ip netns exec "${ns0}" \ + socat TCP-LISTEN:"${port}" UNIX-CONNECT:"${unixfile}" & + pids+=($!) + host_wait_for_listener "${ns0}" "${port}" "tcp" + + ip netns exec "${ns1}" \ + socat UNIX-LISTEN:"${unixfile}" TCP-CONNECT:127.0.0.1:"${port}" & + pids+=($!) + host_wait_for_listener "${ns1}" "${unixfile}" "unix" + + log_host "Launching ${VSOCK_TEST} in ns ${ns1}" + host_vsock_test "${ns1}" "server" "${VSOCK_CID}" "${port}" + + pidfile="$(create_pidfile)" + if ! vm_start "${pidfile}" "${ns0}"; then + log_host "failed to start vm (cid=${cid}, ns=${ns0})" + terminate_pids "${pids[@]}" + rm -f "${unixfile}" + return "${KSFT_FAIL}" + fi + + vm_wait_for_ssh "${ns0}" + + oops_before=$(vm_dmesg_oops_count "${ns0}") + warn_before=$(vm_dmesg_warn_count "${ns0}") + + vm_vsock_test "${ns0}" "10.0.2.2" 2 "${port}" + rc=$? + + vm_dmesg_check "${pidfile}" "${ns0}" "${oops_before}" "${warn_before}" + dmesg_rc=$? + + terminate_pidfiles "${pidfile}" + terminate_pids "${pids[@]}" + rm -f "${unixfile}" + + if [[ "${rc}" -ne 0 ]] || [[ "${dmesg_rc}" -ne 0 ]]; then + return "${KSFT_FAIL}" + fi + + return "${KSFT_PASS}" + +} + +test_ns_diff_global_vm_connect_to_local_host_fails() { + local ns0="global0" + local ns1="local0" + local port=12345 + local oops_before warn_before + local dmesg_rc + local pidfile + local result + local pid + + init_namespaces + + log_host "Launching socat in ns ${ns1}" + outfile=$(mktemp) + + ip netns exec "${ns1}" socat VSOCK-LISTEN:"${port}" STDOUT &> "${outfile}" & + pid=$! + host_wait_for_listener "${ns1}" "${port}" "vsock" + + pidfile="$(create_pidfile)" + if ! vm_start "${pidfile}" "${ns0}"; then + log_host "failed to start vm (cid=${cid}, ns=${ns0})" + terminate_pids "${pid}" + rm -f "${outfile}" + return "${KSFT_FAIL}" + fi + + vm_wait_for_ssh "${ns0}" + + oops_before=$(vm_dmesg_oops_count "${ns0}") + warn_before=$(vm_dmesg_warn_count "${ns0}") + + vm_ssh "${ns0}" -- \ + bash -c "echo TEST | socat STDIN VSOCK-CONNECT:2:${port}" 2>&1 | log_guest + + vm_dmesg_check "${pidfile}" "${ns0}" "${oops_before}" "${warn_before}" + dmesg_rc=$? + + terminate_pidfiles "${pidfile}" + terminate_pids "${pid}" + + result=$(cat "${outfile}") + rm -f "${outfile}" + + if [[ "${result}" != TEST ]] && [[ "${dmesg_rc}" -eq 0 ]]; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +test_ns_diff_local_host_connect_to_local_vm_fails() { + local ns0="local0" + local ns1="local1" + local port=12345 + local oops_before warn_before + local dmesg_rc + local pidfile + local result + local pid + + init_namespaces + + outfile=$(mktemp) + + pidfile="$(create_pidfile)" + if ! vm_start "${pidfile}" "${ns1}"; then + log_host "failed to start vm (cid=${cid}, ns=${ns0})" + return "${KSFT_FAIL}" + fi + + vm_wait_for_ssh "${ns1}" + oops_before=$(vm_dmesg_oops_count "${ns1}") + warn_before=$(vm_dmesg_warn_count "${ns1}") + + vm_ssh "${ns1}" -- socat VSOCK-LISTEN:"${port}" STDOUT > "${outfile}" & + vm_wait_for_listener "${ns1}" "${port}" "vsock" + + echo TEST | ip netns exec "${ns0}" \ + socat STDIN VSOCK-CONNECT:"${VSOCK_CID}":"${port}" 2>/dev/null + + vm_dmesg_check "${pidfile}" "${ns1}" "${oops_before}" "${warn_before}" + dmesg_rc=$? + + terminate_pidfiles "${pidfile}" + + result=$(cat "${outfile}") + rm -f "${outfile}" + + if [[ "${result}" != TEST ]] && [[ "${dmesg_rc}" -eq 0 ]]; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +test_ns_diff_local_vm_connect_to_local_host_fails() { + local oops_before warn_before + local ns0="local0" + local ns1="local1" + local port=12345 + local dmesg_rc + local pidfile + local result + local pid + + init_namespaces + + log_host "Launching socat in ns ${ns1}" + outfile=$(mktemp) + ip netns exec "${ns1}" socat VSOCK-LISTEN:"${port}" STDOUT &> "${outfile}" & + pid=$! + host_wait_for_listener "${ns1}" "${port}" "vsock" + + pidfile="$(create_pidfile)" + if ! vm_start "${pidfile}" "${ns0}"; then + log_host "failed to start vm (cid=${cid}, ns=${ns0})" + rm -f "${outfile}" + return "${KSFT_FAIL}" + fi + + vm_wait_for_ssh "${ns0}" + oops_before=$(vm_dmesg_oops_count "${ns0}") + warn_before=$(vm_dmesg_warn_count "${ns0}") + + vm_ssh "${ns0}" -- \ + bash -c "echo TEST | socat STDIN VSOCK-CONNECT:2:${port}" 2>&1 | log_guest + + vm_dmesg_check "${pidfile}" "${ns0}" "${oops_before}" "${warn_before}" + dmesg_rc=$? + + terminate_pidfiles "${pidfile}" + terminate_pids "${pid}" + + result=$(cat "${outfile}") + rm -f "${outfile}" + + if [[ "${result}" != TEST ]] && [[ "${dmesg_rc}" -eq 0 ]]; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +__test_loopback_two_netns() { + local ns0=$1 + local ns1=$2 + local port=12345 + local result + local pid + + modprobe vsock_loopback &> /dev/null || : + + log_host "Launching socat in ns ${ns1}" + outfile=$(mktemp) + + ip netns exec "${ns1}" socat VSOCK-LISTEN:"${port}" STDOUT > "${outfile}" 2>/dev/null & + pid=$! + host_wait_for_listener "${ns1}" "${port}" "vsock" + + log_host "Launching socat in ns ${ns0}" + echo TEST | ip netns exec "${ns0}" socat STDIN VSOCK-CONNECT:1:"${port}" 2>/dev/null + terminate_pids "${pid}" + + result=$(cat "${outfile}") + rm -f "${outfile}" + + if [[ "${result}" == TEST ]]; then + return 0 + fi + + return 1 +} + +test_ns_diff_global_to_local_loopback_local_fails() { + init_namespaces + + if ! __test_loopback_two_netns "global0" "local0"; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +test_ns_diff_local_to_global_loopback_fails() { + init_namespaces + + if ! __test_loopback_two_netns "local0" "global0"; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +test_ns_diff_local_to_local_loopback_fails() { + init_namespaces + + if ! __test_loopback_two_netns "local0" "local1"; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +test_ns_diff_global_to_global_loopback_ok() { + init_namespaces + + if __test_loopback_two_netns "global0" "global1"; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +test_ns_same_local_loopback_ok() { + init_namespaces + + if __test_loopback_two_netns "local0" "local0"; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +test_ns_same_local_host_connect_to_local_vm_ok() { + local oops_before warn_before + local ns="local0" + local port=1234 + local dmesg_rc + local pidfile + local rc + + init_namespaces + + pidfile="$(create_pidfile)" + + if ! vm_start "${pidfile}" "${ns}"; then + return "${KSFT_FAIL}" + fi + + vm_wait_for_ssh "${ns}" + oops_before=$(vm_dmesg_oops_count "${ns}") + warn_before=$(vm_dmesg_warn_count "${ns}") + + vm_vsock_test "${ns}" "server" 2 "${TEST_GUEST_PORT}" + + # Skip test 29 (transport release use-after-free): This test attempts + # binding both G2H and H2G CIDs. Because virtio-vsock (G2H) doesn't + # support local namespaces the test will fail when + # transport_g2h->stream_allow() returns false. This edge case only + # happens for vsock_test in client mode on the host in a local + # namespace. This is a false positive. + host_vsock_test "${ns}" "127.0.0.1" "${VSOCK_CID}" "${TEST_HOST_PORT}" --skip=29 + rc=$? + + vm_dmesg_check "${pidfile}" "${ns}" "${oops_before}" "${warn_before}" + dmesg_rc=$? + + terminate_pidfiles "${pidfile}" + + if [[ "${rc}" -ne 0 ]] || [[ "${dmesg_rc}" -ne 0 ]]; then + return "${KSFT_FAIL}" + fi + + return "${KSFT_PASS}" +} + +test_ns_same_local_vm_connect_to_local_host_ok() { + local oops_before warn_before + local ns="local0" + local port=1234 + local dmesg_rc + local pidfile + local rc + + init_namespaces + + pidfile="$(create_pidfile)" + + if ! vm_start "${pidfile}" "${ns}"; then + return "${KSFT_FAIL}" + fi + + vm_wait_for_ssh "${ns}" + oops_before=$(vm_dmesg_oops_count "${ns}") + warn_before=$(vm_dmesg_warn_count "${ns}") + + host_vsock_test "${ns}" "server" "${VSOCK_CID}" "${port}" + vm_vsock_test "${ns}" "10.0.2.2" 2 "${port}" + rc=$? + + vm_dmesg_check "${pidfile}" "${ns}" "${oops_before}" "${warn_before}" + dmesg_rc=$? + + terminate_pidfiles "${pidfile}" + + if [[ "${rc}" -ne 0 ]] || [[ "${dmesg_rc}" -ne 0 ]]; then + return "${KSFT_FAIL}" + fi + + return "${KSFT_PASS}" +} + namespaces_can_boot_same_cid() { local ns0=$1 local ns1=$2 @@ -882,6 +1445,7 @@ fi check_args "${ARGS[@]}" check_deps check_vng +check_socat handle_build echo "1..${#ARGS[@]}" -- cgit v1.2.3 From b3b7b33264c69a3a97723c31a14c7a19b2fce503 Mon Sep 17 00:00:00 2001 From: Bobby Eshleman Date: Wed, 21 Jan 2026 14:11:52 -0800 Subject: selftests/vsock: add tests for namespace deletion Add tests that validate vsock sockets are resilient to deleting namespaces. The vsock sockets should still function normally. The function check_ns_delete_doesnt_break_connection() is added to re-use the step-by-step logic of 1) setup connections, 2) delete ns, 3) check that the connections are still ok. Reviewed-by: Stefano Garzarella Signed-off-by: Bobby Eshleman Link: https://patch.msgid.link/20260121-vsock-vmtest-v16-12-2859a7512097@meta.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- tools/testing/selftests/vsock/vmtest.sh | 84 +++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh index a9eaf37bc31b..dc8dbe74a6d0 100755 --- a/tools/testing/selftests/vsock/vmtest.sh +++ b/tools/testing/selftests/vsock/vmtest.sh @@ -68,6 +68,9 @@ readonly TEST_NAMES=( ns_same_local_loopback_ok ns_same_local_host_connect_to_local_vm_ok ns_same_local_vm_connect_to_local_host_ok + ns_delete_vm_ok + ns_delete_host_ok + ns_delete_both_ok ) readonly TEST_DESCS=( # vm_server_host_client @@ -135,6 +138,15 @@ readonly TEST_DESCS=( # ns_same_local_vm_connect_to_local_host_ok "Run vsock_test client in VM in a local ns with server in same ns." + + # ns_delete_vm_ok + "Check that deleting the VM's namespace does not break the socket connection" + + # ns_delete_host_ok + "Check that deleting the host's namespace does not break the socket connection" + + # ns_delete_both_ok + "Check that deleting the VM and host's namespaces does not break the socket connection" ) readonly USE_SHARED_VM=( @@ -1287,6 +1299,78 @@ test_vm_loopback() { return "${KSFT_PASS}" } +check_ns_delete_doesnt_break_connection() { + local pipefile pidfile outfile + local ns0="global0" + local ns1="global1" + local port=12345 + local pids=() + local rc=0 + + init_namespaces + + pidfile="$(create_pidfile)" + if ! vm_start "${pidfile}" "${ns0}"; then + return "${KSFT_FAIL}" + fi + vm_wait_for_ssh "${ns0}" + + outfile=$(mktemp) + vm_ssh "${ns0}" -- \ + socat VSOCK-LISTEN:"${port}",fork STDOUT > "${outfile}" 2>/dev/null & + pids+=($!) + vm_wait_for_listener "${ns0}" "${port}" "vsock" + + # We use a pipe here so that we can echo into the pipe instead of using + # socat and a unix socket file. We just need a name for the pipe (not a + # regular file) so use -u. + pipefile=$(mktemp -u /tmp/vmtest_pipe_XXXX) + ip netns exec "${ns1}" \ + socat PIPE:"${pipefile}" VSOCK-CONNECT:"${VSOCK_CID}":"${port}" & + pids+=($!) + + timeout "${WAIT_PERIOD}" \ + bash -c 'while [[ ! -e '"${pipefile}"' ]]; do sleep 1; done; exit 0' + + if [[ "$1" == "vm" ]]; then + ip netns del "${ns0}" + elif [[ "$1" == "host" ]]; then + ip netns del "${ns1}" + elif [[ "$1" == "both" ]]; then + ip netns del "${ns0}" + ip netns del "${ns1}" + fi + + echo "TEST" > "${pipefile}" + + timeout "${WAIT_PERIOD}" \ + bash -c 'while [[ ! -s '"${outfile}"' ]]; do sleep 1; done; exit 0' + + if grep -q "TEST" "${outfile}"; then + rc="${KSFT_PASS}" + else + rc="${KSFT_FAIL}" + fi + + terminate_pidfiles "${pidfile}" + terminate_pids "${pids[@]}" + rm -f "${outfile}" "${pipefile}" + + return "${rc}" +} + +test_ns_delete_vm_ok() { + check_ns_delete_doesnt_break_connection "vm" +} + +test_ns_delete_host_ok() { + check_ns_delete_doesnt_break_connection "host" +} + +test_ns_delete_both_ok() { + check_ns_delete_doesnt_break_connection "both" +} + shared_vm_test() { local tname -- cgit v1.2.3 From 1456ebb291ddee67c9144c8f7f38a6dddcd32ed7 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Tue, 27 Jan 2026 08:51:11 +0000 Subject: selftests/bpf: cover BPF_CGROUP_ITER_CHILDREN control option Extend some of the existing CSS iterator selftests such that they cover the newly introduced BPF_CGROUP_ITER_CHILDREN iterator control option. Signed-off-by: Matt Bobrowski Link: https://lore.kernel.org/r/20260127085112.3608687-2-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/cgroup_iter.c | 12 ++++++++++++ tools/testing/selftests/bpf/prog_tests/iters.c | 8 +++++++- tools/testing/selftests/bpf/progs/iters_css.c | 9 ++++++--- 3 files changed, 25 insertions(+), 4 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c index 574d9a0cdc8e..0f88a9d00a22 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c @@ -190,6 +190,16 @@ static void test_walk_self_only(struct cgroup_iter *skel) BPF_CGROUP_ITER_SELF_ONLY, "self_only"); } +static void test_walk_children(struct cgroup_iter *skel) +{ + snprintf(expected_output, sizeof(expected_output), + PROLOGUE "%8llu\n%8llu\n" EPILOGUE, cg_id[CHILD1], + cg_id[CHILD2]); + + read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], + BPF_CGROUP_ITER_CHILDREN, "children"); +} + static void test_walk_dead_self_only(struct cgroup_iter *skel) { DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); @@ -325,6 +335,8 @@ void test_cgroup_iter(void) test_walk_dead_self_only(skel); if (test__start_subtest("cgroup_iter__self_only_css_task")) test_walk_self_only_css_task(); + if (test__start_subtest("cgroup_iter__children")) + test_walk_children(skel); out: cgroup_iter__destroy(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/iters.c b/tools/testing/selftests/bpf/prog_tests/iters.c index 3cea71f9c500..a539980a2fbe 100644 --- a/tools/testing/selftests/bpf/prog_tests/iters.c +++ b/tools/testing/selftests/bpf/prog_tests/iters.c @@ -253,6 +253,11 @@ static void subtest_css_iters(void) { "/cg1/cg2" }, { "/cg1/cg2/cg3" }, { "/cg1/cg2/cg3/cg4" }, + { "/cg1/cg5" }, + { "/cg1/cg5/cg6" }, + { "/cg1/cg7" }, + { "/cg1/cg7/cg8" }, + { "/cg1/cg7/cg8/cg9" }, }; int err, cg_nr = ARRAY_SIZE(cgs); int i; @@ -284,7 +289,8 @@ static void subtest_css_iters(void) ASSERT_EQ(skel->bss->post_order_cnt, cg_nr, "post_order_cnt"); ASSERT_EQ(skel->bss->last_cg_id, get_cgroup_id(cgs[0].path), "last_cg_id"); - ASSERT_EQ(skel->bss->tree_high, cg_nr - 1, "tree_high"); + ASSERT_EQ(skel->bss->children_cnt, 3, "children_cnt"); + ASSERT_EQ(skel->bss->tree_high, 3, "tree_high"); iters_css__detach(skel); cleanup: cleanup_cgroup_environment(); diff --git a/tools/testing/selftests/bpf/progs/iters_css.c b/tools/testing/selftests/bpf/progs/iters_css.c index ec1f6c2f590b..5a1d87d186a9 100644 --- a/tools/testing/selftests/bpf/progs/iters_css.c +++ b/tools/testing/selftests/bpf/progs/iters_css.c @@ -12,8 +12,7 @@ char _license[] SEC("license") = "GPL"; pid_t target_pid; u64 root_cg_id, leaf_cg_id; u64 first_cg_id, last_cg_id; - -int pre_order_cnt, post_order_cnt, tree_high; +int pre_order_cnt, post_order_cnt, children_cnt, tree_high; struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym; void bpf_cgroup_release(struct cgroup *p) __ksym; @@ -43,7 +42,7 @@ int iter_css_for_each(const void *ctx) } root_css = &root_cgrp->self; leaf_css = &leaf_cgrp->self; - pre_order_cnt = post_order_cnt = tree_high = 0; + pre_order_cnt = post_order_cnt = children_cnt = tree_high = 0; first_cg_id = last_cg_id = 0; bpf_rcu_read_lock(); @@ -60,6 +59,10 @@ int iter_css_for_each(const void *ctx) first_cg_id = cur_cgrp->kn->id; } + bpf_for_each(css, pos, root_css, BPF_CGROUP_ITER_CHILDREN) { + children_cnt++; + } + bpf_for_each(css, pos, leaf_css, BPF_CGROUP_ITER_ANCESTORS_UP) tree_high++; -- cgit v1.2.3 From 17e2ce02bf5669dfa659976e93d409228cba98f9 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Sat, 24 Jan 2026 19:32:45 +0800 Subject: selftests/bpf: Add tests for FIONREAD and copied_seq This commit adds two new test functions: one to reproduce the bug reported by syzkaller [1], and another to cover the calculation of copied_seq. The tests primarily involve installing and uninstalling sockmap on sockets, then reading data to verify proper functionality. Additionally, extend the do_test_sockmap_skb_verdict_fionread() function to support UDP FIONREAD testing. [1] https://syzkaller.appspot.com/bug?extid=06dbd397158ec0ea4983 Signed-off-by: Jiayuan Chen Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20260124113314.113584-4-jiayuan.chen@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/sockmap_basic.c | 294 ++++++++++++++++++++- .../selftests/bpf/progs/test_sockmap_pass_prog.c | 14 + 2 files changed, 302 insertions(+), 6 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 1e3e4392dcca..256707e7d20d 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -1,7 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2020 Cloudflare #include -#include +#include +#include #include #include "test_progs.h" @@ -22,6 +23,15 @@ #define TCP_REPAIR_ON 1 #define TCP_REPAIR_OFF_NO_WP -1 /* Turn off without window probes */ +/** + * SOL_TCP is defined in (glibc), but the copybuf_address + * field of tcp_zerocopy_receive is not yet included in older versions. + * This workaround remains necessary until the glibc update propagates. + */ +#ifndef SOL_TCP +#define SOL_TCP 6 +#endif + static int connected_socket_v4(void) { struct sockaddr_in addr = { @@ -536,13 +546,14 @@ out: } -static void test_sockmap_skb_verdict_fionread(bool pass_prog) +static void do_test_sockmap_skb_verdict_fionread(int sotype, bool pass_prog) { int err, map, verdict, c0 = -1, c1 = -1, p0 = -1, p1 = -1; int expected, zero = 0, sent, recvd, avail; struct test_sockmap_pass_prog *pass = NULL; struct test_sockmap_drop_prog *drop = NULL; char buf[256] = "0123456789"; + int split_len = sizeof(buf) / 2; if (pass_prog) { pass = test_sockmap_pass_prog__open_and_load(); @@ -550,7 +561,10 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog) return; verdict = bpf_program__fd(pass->progs.prog_skb_verdict); map = bpf_map__fd(pass->maps.sock_map_rx); - expected = sizeof(buf); + if (sotype == SOCK_DGRAM) + expected = split_len; /* FIONREAD for UDP is different from TCP */ + else + expected = sizeof(buf); } else { drop = test_sockmap_drop_prog__open_and_load(); if (!ASSERT_OK_PTR(drop, "open_and_load")) @@ -566,7 +580,7 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog) if (!ASSERT_OK(err, "bpf_prog_attach")) goto out; - err = create_socket_pairs(AF_INET, SOCK_STREAM, &c0, &c1, &p0, &p1); + err = create_socket_pairs(AF_INET, sotype, &c0, &c1, &p0, &p1); if (!ASSERT_OK(err, "create_socket_pairs()")) goto out; @@ -574,8 +588,9 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog) if (!ASSERT_OK(err, "bpf_map_update_elem(c1)")) goto out_close; - sent = xsend(p1, &buf, sizeof(buf), 0); - ASSERT_EQ(sent, sizeof(buf), "xsend(p0)"); + sent = xsend(p1, &buf, split_len, 0); + sent += xsend(p1, &buf, sizeof(buf) - split_len, 0); + ASSERT_EQ(sent, sizeof(buf), "xsend(p1)"); err = ioctl(c1, FIONREAD, &avail); ASSERT_OK(err, "ioctl(FIONREAD) error"); ASSERT_EQ(avail, expected, "ioctl(FIONREAD)"); @@ -597,6 +612,12 @@ out: test_sockmap_drop_prog__destroy(drop); } +static void test_sockmap_skb_verdict_fionread(bool pass_prog) +{ + do_test_sockmap_skb_verdict_fionread(SOCK_STREAM, pass_prog); + do_test_sockmap_skb_verdict_fionread(SOCK_DGRAM, pass_prog); +} + static void test_sockmap_skb_verdict_change_tail(void) { struct test_sockmap_change_tail *skel; @@ -1042,6 +1063,257 @@ close_map: xclose(map); } +/* it is used to reproduce WARNING */ +static void test_sockmap_zc(void) +{ + int map, err, sent, recvd, zero = 0, one = 1, on = 1; + char buf[10] = "0123456789", rcv[11], addr[100]; + struct test_sockmap_pass_prog *skel = NULL; + int c0 = -1, p0 = -1, c1 = -1, p1 = -1; + struct tcp_zerocopy_receive zc; + socklen_t zc_len = sizeof(zc); + struct bpf_program *prog; + + skel = test_sockmap_pass_prog__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + if (create_socket_pairs(AF_INET, SOCK_STREAM, &c0, &c1, &p0, &p1)) + goto end; + + prog = skel->progs.prog_skb_verdict_ingress; + map = bpf_map__fd(skel->maps.sock_map_rx); + + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach")) + goto end; + + err = bpf_map_update_elem(map, &zero, &p0, BPF_ANY); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto end; + + err = bpf_map_update_elem(map, &one, &p1, BPF_ANY); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto end; + + sent = xsend(c0, buf, sizeof(buf), 0); + if (!ASSERT_EQ(sent, sizeof(buf), "xsend")) + goto end; + + /* trigger tcp_bpf_recvmsg_parser and inc copied_seq of p1 */ + recvd = recv_timeout(p1, rcv, sizeof(rcv), MSG_DONTWAIT, 1); + if (!ASSERT_EQ(recvd, sent, "recv_timeout(p1)")) + goto end; + + /* uninstall sockmap of p1 */ + bpf_map_delete_elem(map, &one); + + /* trigger tcp stack and the rcv_nxt of p1 is less than copied_seq */ + sent = xsend(c1, buf, sizeof(buf) - 1, 0); + if (!ASSERT_EQ(sent, sizeof(buf) - 1, "xsend")) + goto end; + + err = setsockopt(p1, SOL_SOCKET, SO_ZEROCOPY, &on, sizeof(on)); + if (!ASSERT_OK(err, "setsockopt")) + goto end; + + memset(&zc, 0, sizeof(zc)); + zc.copybuf_address = (__u64)((unsigned long)addr); + zc.copybuf_len = sizeof(addr); + + err = getsockopt(p1, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &zc_len); + if (!ASSERT_OK(err, "getsockopt")) + goto end; + +end: + if (c0 >= 0) + close(c0); + if (p0 >= 0) + close(p0); + if (c1 >= 0) + close(c1); + if (p1 >= 0) + close(p1); + test_sockmap_pass_prog__destroy(skel); +} + +/* it is used to check whether copied_seq of sk is correct */ +static void test_sockmap_copied_seq(bool strp) +{ + int i, map, err, sent, recvd, zero = 0, one = 1; + struct test_sockmap_pass_prog *skel = NULL; + int c0 = -1, p0 = -1, c1 = -1, p1 = -1; + char buf[10] = "0123456789", rcv[11]; + struct bpf_program *prog; + + skel = test_sockmap_pass_prog__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + if (create_socket_pairs(AF_INET, SOCK_STREAM, &c0, &c1, &p0, &p1)) + goto end; + + prog = skel->progs.prog_skb_verdict_ingress; + map = bpf_map__fd(skel->maps.sock_map_rx); + + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach verdict")) + goto end; + + if (strp) { + prog = skel->progs.prog_skb_verdict_ingress_strp; + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_PARSER, 0); + if (!ASSERT_OK(err, "bpf_prog_attach parser")) + goto end; + } + + err = bpf_map_update_elem(map, &zero, &p0, BPF_ANY); + if (!ASSERT_OK(err, "bpf_map_update_elem(p0)")) + goto end; + + err = bpf_map_update_elem(map, &one, &p1, BPF_ANY); + if (!ASSERT_OK(err, "bpf_map_update_elem(p1)")) + goto end; + + /* just trigger sockamp: data sent by c0 will be received by p1 */ + sent = xsend(c0, buf, sizeof(buf), 0); + if (!ASSERT_EQ(sent, sizeof(buf), "xsend(c0), bpf")) + goto end; + + /* do partial read */ + recvd = recv_timeout(p1, rcv, 1, MSG_DONTWAIT, 1); + recvd += recv_timeout(p1, rcv + 1, sizeof(rcv) - 1, MSG_DONTWAIT, 1); + if (!ASSERT_EQ(recvd, sent, "recv_timeout(p1), bpf") || + !ASSERT_OK(memcmp(buf, rcv, recvd), "data mismatch")) + goto end; + + /* uninstall sockmap of p1 and p0 */ + err = bpf_map_delete_elem(map, &one); + if (!ASSERT_OK(err, "bpf_map_delete_elem(1)")) + goto end; + + err = bpf_map_delete_elem(map, &zero); + if (!ASSERT_OK(err, "bpf_map_delete_elem(0)")) + goto end; + + /* now all sockets become plain socket, they should still work */ + for (i = 0; i < 5; i++) { + /* test copied_seq of p1 by running tcp native stack */ + sent = xsend(c1, buf, sizeof(buf), 0); + if (!ASSERT_EQ(sent, sizeof(buf), "xsend(c1), native")) + goto end; + + recvd = recv(p1, rcv, sizeof(rcv), MSG_DONTWAIT); + if (!ASSERT_EQ(recvd, sent, "recv_timeout(p1), native")) + goto end; + + /* p0 previously redirected skb to p1, we also check copied_seq of p0 */ + sent = xsend(c0, buf, sizeof(buf), 0); + if (!ASSERT_EQ(sent, sizeof(buf), "xsend(c0), native")) + goto end; + + recvd = recv(p0, rcv, sizeof(rcv), MSG_DONTWAIT); + if (!ASSERT_EQ(recvd, sent, "recv_timeout(p0), native")) + goto end; + } + +end: + if (c0 >= 0) + close(c0); + if (p0 >= 0) + close(p0); + if (c1 >= 0) + close(c1); + if (p1 >= 0) + close(p1); + test_sockmap_pass_prog__destroy(skel); +} + +/* Wait until FIONREAD returns the expected value or timeout */ +static int wait_for_fionread(int fd, int expected, unsigned int timeout_ms) +{ + unsigned int elapsed = 0; + int avail = 0; + + while (elapsed < timeout_ms) { + if (ioctl(fd, FIONREAD, &avail) < 0) + return -errno; + if (avail >= expected) + return avail; + usleep(1000); + elapsed++; + } + return avail; +} + +/* it is used to send data to via native stack and BPF redirecting */ +static void test_sockmap_multi_channels(int sotype) +{ + int map, err, sent, recvd, zero = 0, one = 1, avail = 0, expected; + struct test_sockmap_pass_prog *skel = NULL; + int c0 = -1, p0 = -1, c1 = -1, p1 = -1; + char buf[10] = "0123456789", rcv[11]; + struct bpf_program *prog; + + skel = test_sockmap_pass_prog__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + err = create_socket_pairs(AF_INET, sotype, &c0, &c1, &p0, &p1); + if (err) + goto end; + + prog = skel->progs.prog_skb_verdict_ingress; + map = bpf_map__fd(skel->maps.sock_map_rx); + + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach verdict")) + goto end; + + err = bpf_map_update_elem(map, &zero, &p0, BPF_ANY); + if (!ASSERT_OK(err, "bpf_map_update_elem(p0)")) + goto end; + + err = bpf_map_update_elem(map, &one, &p1, BPF_ANY); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto end; + + /* send data to p1 via native stack */ + sent = xsend(c1, buf, 2, 0); + if (!ASSERT_EQ(sent, 2, "xsend(2)")) + goto end; + + avail = wait_for_fionread(p1, 2, IO_TIMEOUT_SEC); + ASSERT_EQ(avail, 2, "ioctl(FIONREAD) partial return"); + + /* send data to p1 via bpf redirecting */ + sent = xsend(c0, buf + 2, sizeof(buf) - 2, 0); + if (!ASSERT_EQ(sent, sizeof(buf) - 2, "xsend(remain-data)")) + goto end; + + /* Poll FIONREAD until expected bytes arrive, poll_read() is unreliable + * here since it may return immediately if prior data is already queued. + */ + expected = sotype == SOCK_DGRAM ? 2 : sizeof(buf); + avail = wait_for_fionread(p1, expected, IO_TIMEOUT_SEC); + ASSERT_EQ(avail, expected, "ioctl(FIONREAD) full return"); + + recvd = recv_timeout(p1, rcv, sizeof(rcv), MSG_DONTWAIT, 1); + if (!ASSERT_EQ(recvd, sizeof(buf), "recv_timeout(p1)") || + !ASSERT_OK(memcmp(buf, rcv, recvd), "data mismatch")) + goto end; +end: + if (c0 >= 0) + close(c0); + if (p0 >= 0) + close(p0); + if (c1 >= 0) + close(c1); + if (p1 >= 0) + close(p1); + test_sockmap_pass_prog__destroy(skel); +} + void test_sockmap_basic(void) { if (test__start_subtest("sockmap create_update_free")) @@ -1108,4 +1380,14 @@ void test_sockmap_basic(void) test_sockmap_skb_verdict_vsock_poll(); if (test__start_subtest("sockmap vsock unconnected")) test_sockmap_vsock_unconnected(); + if (test__start_subtest("sockmap with zc")) + test_sockmap_zc(); + if (test__start_subtest("sockmap recover")) + test_sockmap_copied_seq(false); + if (test__start_subtest("sockmap recover with strp")) + test_sockmap_copied_seq(true); + if (test__start_subtest("sockmap tcp multi channels")) + test_sockmap_multi_channels(SOCK_STREAM); + if (test__start_subtest("sockmap udp multi channels")) + test_sockmap_multi_channels(SOCK_DGRAM); } diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c b/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c index 69aacc96db36..ef9edca184ea 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c +++ b/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c @@ -44,4 +44,18 @@ int prog_skb_parser(struct __sk_buff *skb) return SK_PASS; } +SEC("sk_skb/stream_verdict") +int prog_skb_verdict_ingress(struct __sk_buff *skb) +{ + int one = 1; + + return bpf_sk_redirect_map(skb, &sock_map_rx, one, BPF_F_INGRESS); +} + +SEC("sk_skb/stream_parser") +int prog_skb_verdict_ingress_strp(struct __sk_buff *skb) +{ + return skb->len; +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 166e664e702ed96b83df2a87c1ea2138a995b604 Mon Sep 17 00:00:00 2001 From: Junjie Cao Date: Mon, 26 Jan 2026 14:15:31 +0800 Subject: selftests: ptp: use KSFT_SKIP exit code for skip scenarios The kselftest framework defines KSFT_SKIP=4 as the standard exit code for skipped tests. However, phc.sh currently uses a mix of 'exit 0' and 'exit 1' to indicate skip conditions, which can confuse test harnesses and CI systems. This patch introduces ksft_skip=4 variable and unifies all skip exit paths to use 'exit $ksft_skip', consistent with other selftests like net/lib.sh and net/fib_nexthops.sh. Signed-off-by: Junjie Cao Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260126061532.12532-1-junjie.cao@intel.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/ptp/phc.sh | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ptp/phc.sh b/tools/testing/selftests/ptp/phc.sh index ac6e5a6e1d3a..51aad466d989 100755 --- a/tools/testing/selftests/ptp/phc.sh +++ b/tools/testing/selftests/ptp/phc.sh @@ -8,17 +8,20 @@ ALL_TESTS=" " DEV=$1 +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + ############################################################################## # Sanity checks if [[ "$(id -u)" -ne 0 ]]; then echo "SKIP: need root privileges" - exit 0 + exit $ksft_skip fi if [[ "$DEV" == "" ]]; then echo "SKIP: PTP device not provided" - exit 0 + exit $ksft_skip fi require_command() @@ -27,7 +30,7 @@ require_command() if [[ ! -x "$(command -v "$cmd")" ]]; then echo "SKIP: $cmd not installed" - exit 1 + exit $ksft_skip fi } @@ -37,7 +40,7 @@ phc_sanity() if [ $? != 0 ]; then echo "SKIP: unknown clock $DEV: No such device" - exit 1 + exit $ksft_skip fi } -- cgit v1.2.3 From 239f09e258b906deced5c2a7c1ac8aed301b558b Mon Sep 17 00:00:00 2001 From: Junjie Cao Date: Mon, 26 Jan 2026 14:15:32 +0800 Subject: selftests: ptp: treat unsupported PHC operations as skip Some PTP hardware clock (PHC) devices may return -EOPNOTSUPP for operations like settime, adjtime, or adjfreq. This commonly occurs with timestamp-only PHC implementations that don't support full clock control. For background, syzbot previously exposed a crash risk when PTP clock drivers lacked required callbacks[1]. Subsequent work[2] made callback presence a registration requirement. As a result, some drivers (like iwlwifi MVM/MLD[3]) now provide stub callbacks that return -EOPNOTSUPP for unsupported operations. When phc_ctl encounters such devices, the "Operation not supported" error should be treated as a skip (device limitation) rather than a test failure. This patch: - Adds [SKIP] output handling in log_test() - Detects "Operation not supported" from phc_ctl and returns ksft_skip - Returns ksft_skip if all tests are skipped, preventing false-positive results when testing timestamp-only PHC implementations Link: https://lore.kernel.org/netdev/20251028043216.1971292-1-junjie.cao@intel.com/ [1] Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=dfb073d32cac [2] Link: https://lore.kernel.org/netdev/20251204123204.9316-1-ziyao@disroot.org/ [3] Signed-off-by: Junjie Cao Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260126061532.12532-2-junjie.cao@intel.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/ptp/phc.sh | 49 ++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 12 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ptp/phc.sh b/tools/testing/selftests/ptp/phc.sh index 51aad466d989..9f61c1579edf 100755 --- a/tools/testing/selftests/ptp/phc.sh +++ b/tools/testing/selftests/ptp/phc.sh @@ -52,6 +52,7 @@ phc_sanity # Exit status to return at the end. Set in case one of the tests fails. EXIT_STATUS=0 +PASS_COUNT=0 # Per-test return value. Clear at the beginning of each test. RET=0 @@ -68,12 +69,18 @@ log_test() { local test_name=$1 + if [[ $RET -eq $ksft_skip ]]; then + printf "TEST: %-60s [SKIP]\n" "$test_name" + return 0 + fi + if [[ $RET -ne 0 ]]; then EXIT_STATUS=1 printf "TEST: %-60s [FAIL]\n" "$test_name" return 1 fi + ((PASS_COUNT++)) printf "TEST: %-60s [ OK ]\n" "$test_name" return 0 } @@ -92,34 +99,49 @@ tests_run() settime_do() { - local res + local res out - res=$(phc_ctl $DEV set 0 wait 120.5 get 2> /dev/null \ - | awk '/clock time is/{print $5}' \ - | awk -F. '{print $1}') + out=$(LC_ALL=C phc_ctl $DEV set 0 wait 120.5 get 2>&1) + if [[ $? -ne 0 ]]; then + if echo "$out" | grep -qi "Operation not supported"; then + return $ksft_skip + fi + return 1 + fi + res=$(echo "$out" | awk '/clock time is/{print $5}' | awk -F. '{print $1}') (( res == 120 )) } adjtime_do() { - local res + local res out - res=$(phc_ctl $DEV set 0 adj 10 get 2> /dev/null \ - | awk '/clock time is/{print $5}' \ - | awk -F. '{print $1}') + out=$(LC_ALL=C phc_ctl $DEV set 0 adj 10 get 2>&1) + if [[ $? -ne 0 ]]; then + if echo "$out" | grep -qi "Operation not supported"; then + return $ksft_skip + fi + return 1 + fi + res=$(echo "$out" | awk '/clock time is/{print $5}' | awk -F. '{print $1}') (( res == 10 )) } adjfreq_do() { - local res + local res out # Set the clock to be 1% faster - res=$(phc_ctl $DEV freq 10000000 set 0 wait 100.5 get 2> /dev/null \ - | awk '/clock time is/{print $5}' \ - | awk -F. '{print $1}') + out=$(LC_ALL=C phc_ctl $DEV freq 10000000 set 0 wait 100.5 get 2>&1) + if [[ $? -ne 0 ]]; then + if echo "$out" | grep -qi "Operation not supported"; then + return $ksft_skip + fi + return 1 + fi + res=$(echo "$out" | awk '/clock time is/{print $5}' | awk -F. '{print $1}') (( res == 101 )) } @@ -166,4 +188,7 @@ trap cleanup EXIT tests_run +if [[ $EXIT_STATUS -eq 0 && $PASS_COUNT -eq 0 ]]; then + exit $ksft_skip +fi exit $EXIT_STATUS -- cgit v1.2.3 From 944e3f7562c55fa37ebcdd58e5f60f296c81a854 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 27 Jan 2026 12:12:06 +0100 Subject: tools: Update context analysis macros in compiler_types.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In sync with the main kernel headers, include a stub version of compiler-context-analysis.h in tools/include/linux/compiler_types.h and remove the sparse context tracking definitions. Since tools/ headers are generally self-contained, provide a standalone tools/include/linux/compiler-context-analysis.h with no-op stubs for now. Also clean up redundant stubs in tools/testing/shared/linux/kernel.h that are now redundant. This fixes build errors in tools/testing/radix-tree/ where headers from include/linux/ (like cleanup.h) are used directly and expect these macros to be defined: | cc -I../shared -I. -I../../include -I../../arch/x86/include -I../../../lib -g -Og -Wall -D_LGPL_SOURCE -fsanitize=address -fsanitize=undefined -c -o radix-tree.o radix-tree.c | In file included from ../shared/linux/cleanup.h:2, | from ../shared/linux/../../../../include/linux/idr.h:18, | from ../shared/linux/idr.h:5, | from radix-tree.c:18: | ../shared/linux/../../../../include/linux/idr.h: In function ‘class_idr_alloc_destructor’: | ../shared/linux/../../../../include/linux/cleanup.h:283:9: error: expected declaration specifiers before ‘__no_context_analysis’ | 283 | __no_context_analysis \ | | ^~~~~~~~~~~~~~~~~~~~~ Closes: https://lore.kernel.org/oe-lkp/202601261546.d7ae2447-lkp@intel.com Reported-by: kernel test robot Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Tested-by: Lorenzo Stoakes Link: https://patch.msgid.link/20260127111428.3747328-1-elver@google.com --- tools/include/linux/compiler-context-analysis.h | 42 +++++++++++++++++++++++++ tools/include/linux/compiler_types.h | 16 +--------- tools/testing/shared/linux/kernel.h | 4 --- 3 files changed, 43 insertions(+), 19 deletions(-) create mode 100644 tools/include/linux/compiler-context-analysis.h (limited to 'tools/testing') diff --git a/tools/include/linux/compiler-context-analysis.h b/tools/include/linux/compiler-context-analysis.h new file mode 100644 index 000000000000..13a9115e9e58 --- /dev/null +++ b/tools/include/linux/compiler-context-analysis.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TOOLS_LINUX_COMPILER_CONTEXT_ANALYSIS_H +#define _TOOLS_LINUX_COMPILER_CONTEXT_ANALYSIS_H + +/* + * Macros and attributes for compiler-based static context analysis. + * No-op stubs for tools. + */ + +#define __guarded_by(...) +#define __pt_guarded_by(...) + +#define context_lock_struct(name, ...) struct __VA_ARGS__ name + +#define __no_context_analysis +#define __context_unsafe(comment) +#define context_unsafe(...) ({ __VA_ARGS__; }) +#define context_unsafe_alias(p) +#define disable_context_analysis() +#define enable_context_analysis() + +#define __must_hold(...) +#define __must_not_hold(...) +#define __acquires(...) +#define __cond_acquires(ret, x) +#define __releases(...) +#define __acquire(x) (void)0 +#define __release(x) (void)0 + +#define __must_hold_shared(...) +#define __acquires_shared(...) +#define __cond_acquires_shared(ret, x) +#define __releases_shared(...) +#define __acquire_shared(x) (void)0 +#define __release_shared(x) (void)0 + +#define __acquire_ret(call, expr) (call) +#define __acquire_shared_ret(call, expr) (call) +#define __acquires_ret +#define __acquires_shared_ret + +#endif /* _TOOLS_LINUX_COMPILER_CONTEXT_ANALYSIS_H */ diff --git a/tools/include/linux/compiler_types.h b/tools/include/linux/compiler_types.h index 067a5b4e0f7b..14e420467eee 100644 --- a/tools/include/linux/compiler_types.h +++ b/tools/include/linux/compiler_types.h @@ -13,21 +13,7 @@ #define __has_builtin(x) (0) #endif -#ifdef __CHECKER__ -/* context/locking */ -# define __must_hold(x) __attribute__((context(x,1,1))) -# define __acquires(x) __attribute__((context(x,0,1))) -# define __releases(x) __attribute__((context(x,1,0))) -# define __acquire(x) __context__(x,1) -# define __release(x) __context__(x,-1) -#else /* __CHECKER__ */ -/* context/locking */ -# define __must_hold(x) -# define __acquires(x) -# define __releases(x) -# define __acquire(x) (void)0 -# define __release(x) (void)0 -#endif /* __CHECKER__ */ +#include /* Compiler specific macros. */ #ifdef __GNUC__ diff --git a/tools/testing/shared/linux/kernel.h b/tools/testing/shared/linux/kernel.h index c0a2bb785b92..dc2b4ccfb185 100644 --- a/tools/testing/shared/linux/kernel.h +++ b/tools/testing/shared/linux/kernel.h @@ -21,9 +21,5 @@ #define schedule() #define PAGE_SHIFT 12 -#define __acquires(x) -#define __releases(x) -#define __must_hold(x) - #define EXPORT_PER_CPU_SYMBOL_GPL(x) #endif /* _KERNEL_H */ -- cgit v1.2.3 From 96e004b4bdf9029d137a1b06de1606ff6e263ab8 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 27 Jan 2026 16:16:14 +0000 Subject: kselftest/arm64: Add a no-SVE loop after SVE in fp-pidbench Some applications use SVE intermittently, one common case being where SVE is used during statup (eg, by ld.so) but then rarely if ever during the main application runtime. Add a repeat of the no SVE loop after we've done the SVE loops to fp-pidbench to capture results for that. Signed-off-by: Mark Brown Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/fp/fp-pidbench.S | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/arm64/fp/fp-pidbench.S b/tools/testing/selftests/arm64/fp/fp-pidbench.S index 73830f6bc99b..aeeadc7873dc 100644 --- a/tools/testing/selftests/arm64/fp/fp-pidbench.S +++ b/tools/testing/selftests/arm64/fp/fp-pidbench.S @@ -63,6 +63,10 @@ function _start puts "SVE used per syscall: " test_loop "rdvl x0, #8" + // Test non-SVE execution after SVE + puts "No SVE after SVE: " + test_loop + // And we're done out: mov x0, #0 -- cgit v1.2.3 From b661d753ce2ee951558db1f2c7b97f32d9431966 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 27 Jan 2026 16:16:15 +0000 Subject: kselftest/arm64: Raise default number of loops in fp-pidbench When fp-pidbench was originally written SVE hardware was not widely available so it was useful to run it in emulation and the default number of loops was set very low, running for less than a second on actual hardware. Now that SVE hardware is reasonably available it is very much less interesting to use emulation, bump the default number of loops up to even out a bit of the noise on real systems. On the machine I have to hand this now takes about 15s which is still a toy microbenchmark but perhaps a bit more useful. Signed-off-by: Mark Brown Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/fp/fp-pidbench.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/arm64/fp/fp-pidbench.S b/tools/testing/selftests/arm64/fp/fp-pidbench.S index aeeadc7873dc..881dfa3b342e 100644 --- a/tools/testing/selftests/arm64/fp/fp-pidbench.S +++ b/tools/testing/selftests/arm64/fp/fp-pidbench.S @@ -33,7 +33,7 @@ function _start puts "Iterations per test: " mov x20, #10000 - lsl x20, x20, #8 + lsl x20, x20, #12 mov x0, x20 bl putdec puts "\n" -- cgit v1.2.3 From b640d556a2b354863a9962747a01f67f31cbf4d8 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Wed, 28 Jan 2026 19:05:51 +0000 Subject: selftests/bpf: Remove xxd util dependency The verification signature header generation requires converting a binary certificate to a C array. Previously this only worked with xxd (part of vim-common package). As xxd may not be available on some systems building selftests, it makes sense to substitute it with more common utils: hexdump, wc, sed to generate equivalent C array output. Tested by generating header with both xxd and hexdump and comparing them. Signed-off-by: Mykyta Yatsenko Signed-off-by: Andrii Nakryiko Tested-by: Alan Maguire Link: https://lore.kernel.org/bpf/20260128190552.242335-1-mykyta.yatsenko5@gmail.com --- tools/testing/selftests/bpf/.gitignore | 1 - tools/testing/selftests/bpf/Makefile | 10 ++++++---- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index b8bf51b7a0b0..a3ea98211ea6 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -23,7 +23,6 @@ test_tcpnotify_user test_libbpf xdping test_cpp -test_progs_verification_cert *.d *.subskel.h *.skel.h diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 2c2f68a171ed..c6bf4dfb1495 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -720,9 +720,12 @@ $(VERIFICATION_CERT) $(PRIVATE_KEY): $(VERIFY_SIG_SETUP) $(Q)mkdir -p $(BUILD_DIR) $(Q)$(VERIFY_SIG_SETUP) genkey $(BUILD_DIR) +# Generates a header with C array declaration, containing test_progs_verification_cert bytes $(VERIFY_SIG_HDR): $(VERIFICATION_CERT) - $(Q)ln -fs $< test_progs_verification_cert && \ - xxd -i test_progs_verification_cert > $@ + $(Q)(echo "unsigned char test_progs_verification_cert[] = {"; \ + hexdump -v -e '12/1 " 0x%02x," "\n"' $< | sed 's/0x ,//g; $$s/,$$//'; \ + echo "};"; \ + echo "unsigned int test_progs_verification_cert_len = $$(wc -c < $<);") > $@ # Define test_progs test runner. TRUNNER_TESTS_DIR := prog_tests @@ -898,8 +901,7 @@ EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ *.BTF *.BTF_ids *.BTF.base \ no_alu32 cpuv4 bpf_gcc \ liburandom_read.so) \ - $(OUTPUT)/FEATURE-DUMP.selftests \ - test_progs_verification_cert + $(OUTPUT)/FEATURE-DUMP.selftests .PHONY: docs docs-clean -- cgit v1.2.3 From 60d2c438c1bb705cdbf74ce8f12e6e141a4719b0 Mon Sep 17 00:00:00 2001 From: Luis Gerhorst Date: Tue, 27 Jan 2026 12:59:12 +0100 Subject: bpf: Test nospec after dead stack write in helper Without the fix from the previous commit, the selftest fails: $ ./tools/testing/selftests/bpf/vmtest.sh -- \ ./test_progs -t verifier_unpriv [...] run_subtest:PASS:obj_open_mem 0 nsec libbpf: BTF loading error: -EPERM libbpf: Error loading .BTF into kernel: -EPERM. BTF is optional, ignoring. libbpf: prog 'unpriv_nospec_after_helper_stack_write': BPF program load failed: -EFAULT libbpf: prog 'unpriv_nospec_after_helper_stack_write': failed to load: -EFAULT libbpf: failed to load object 'verifier_unpriv' run_subtest:FAIL:unexpected_load_failure unexpected error: -14 (errno 14) VERIFIER LOG: ============= 0: R1=ctx() R10=fp0 0: (b7) r0 = 0 ; R0=P0 1: (55) if r0 != 0x1 goto pc+6 2: R0=Pscalar() R1=ctx() R10=fp0 2: (b7) r2 = 0 ; R2=P0 3: (bf) r3 = r10 ; R3=fp0 R10=fp0 4: (07) r3 += -16 ; R3=fp-16 5: (b7) r4 = 4 ; R4=P4 6: (b7) r5 = 0 ; R5=P0 7: (85) call bpf_skb_load_bytes_relative#68 verifier bug: speculation barrier after jump instruction may not have the desired effect (BPF_CLASS(insn->code) == BPF_JMP || BPF_CLASS(insn->code) == BPF_JMP32) processed 9 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0 ============= [...] The test is based on the PoC from the report. Signed-off-by: Luis Gerhorst Reported-by: Yinhao Hu Reported-by: Kaiyan Mei Reported-by: Dongliang Mu Link: https://lore.kernel.org/bpf/7678017d-b760-4053-a2d8-a6879b0dbeeb@hust.edu.cn/ Link: https://lore.kernel.org/r/20260127115912.3026761-3-luis.gerhorst@fau.de Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/progs/verifier_unpriv.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/progs/verifier_unpriv.c b/tools/testing/selftests/bpf/progs/verifier_unpriv.c index 28b4f7035ceb..8ee1243e62a8 100644 --- a/tools/testing/selftests/bpf/progs/verifier_unpriv.c +++ b/tools/testing/selftests/bpf/progs/verifier_unpriv.c @@ -950,4 +950,26 @@ l3_%=: r0 = 0; \ " ::: __clobber_all); } +SEC("socket") +__description("unpriv: nospec after dead stack write in helper") +__success __success_unpriv +__retval(0) +/* Dead code sanitizer rewrites the call to `goto -1`. */ +__naked void unpriv_dead_helper_stack_write_nospec_result(void) +{ + asm volatile (" \ + r0 = 0; \ + if r0 != 1 goto l0_%=; \ + r2 = 0; \ + r3 = r10; \ + r3 += -16; \ + r4 = 4; \ + r5 = 0; \ + call %[bpf_skb_load_bytes_relative]; \ +l0_%=: exit; \ +" : + : __imm(bpf_skb_load_bytes_relative) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 6080d525aba8a6cab9fe4b841ca1fab48a2969c6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Jan 2026 12:38:28 +0000 Subject: selftest: packetdrill: add tcp_timestamping_tcp_tx_timestamp_bug.pkt Test tcp_tx_timestamp() behavior after ("tcp: tcp_tx_timestamp() must look at the rtx queue"). Without the fix, this new test fails like this: tcp_timestamping_tcp_tx_timestamp_bug.pkt:55: runtime error in recvmsg call: Expected result 0 but got -1 with errno 11 (Resource temporarily unavailable) Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Link: https://patch.msgid.link/20260127123828.4098577-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- .../tcp_timestamping_tcp_tx_timestamp_bug.pkt | 70 ++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 tools/testing/selftests/net/packetdrill/tcp_timestamping_tcp_tx_timestamp_bug.pkt (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/packetdrill/tcp_timestamping_tcp_tx_timestamp_bug.pkt b/tools/testing/selftests/net/packetdrill/tcp_timestamping_tcp_tx_timestamp_bug.pkt new file mode 100644 index 000000000000..95a1957a2cf9 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_timestamping_tcp_tx_timestamp_bug.pkt @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test after "tcp: tcp_tx_timestamp() must look at the rtx queue" + +// This test is about receiving the SCM_TSTAMP_ACK, +// we do not care about its SCM_TIMESTAMPING precision. +--tolerance_usecs=1000000 + +`./defaults.sh +sysctl -q net.ipv4.tcp_min_tso_segs=70 +` + +// Create a socket and set it to non-blocking. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 fcntl(3, F_GETFL) = 0x2 (flags O_RDWR) + +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +// Establish connection and verify that there was no error. + +0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) + +0 > S 0:0(0) ++.010 < S. 0:0(0) ack 1 win 65535 + +0 > . 1:1(0) ack 1 + +0 getsockopt(3, SOL_SOCKET, SO_ERROR, [0], [4]) = 0 + +0 setsockopt(3, SOL_SOCKET, SO_SNDBUF, [30000], 4) = 0 + + +0 write(3, ..., 9880) = 9880 + +0 > P. 1:9881(9880) ack 1 ++.010 < . 1:1(0) ack 9881 win 10000 + + +0 write(3, ..., 19760) = 19760 + +0 > P. 9881:29641(19760) ack 1 ++.010 < . 1:1(0) ack 29641 win 10000 + + +0 write(3, ..., 39520) = 39520 + +0 > P. 29641:69161(39520) ack 1 ++.010 < . 1:1(0) ack 69161 win 10000 + +// One more write to increase cwnd + +0 write(3, ..., 79040) = 79040 + +0 > P. 69161:108681(39520) ack 1 + +0 > P. 108681:148201(39520) ack 1 ++.010 < . 1:1(0) ack 148201 win 1000 + + +0 setsockopt(3, SOL_SOCKET, SO_TIMESTAMPING, + [SOF_TIMESTAMPING_TX_ACK | SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_OPT_ID], 4) = 0 + +// We have one write filling one skb +// last byte can not be stored because of our small SO_SNDBUF + +0 write(3, ..., 65209) = 65208 + +0 > P. 148201:213409(65208) ack 1 ++.010 < . 1:1(0) ack 213409 win 1000 + +// SCM_TSTAMP_ACK should be received after the last ack at +// t=60ms. + +0 recvmsg(3, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=60000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_ACK, + ee_data=65207}} + ]}, MSG_ERRQUEUE) = 0 -- cgit v1.2.3 From 70de46740b62b83198802bfe6682c5f865c25dc5 Mon Sep 17 00:00:00 2001 From: Daniel Zahka Date: Tue, 27 Jan 2026 08:30:55 -0800 Subject: selftests: drv-net: psp: fix test flakes from racy connection close There is a bug in assoc_sk_only_mismatch() and assoc_sk_only_mismatch_tx() that creates a race condition which triggers test flakes in later test cases e.g. data_send_bad_key(). The problem is that the client uses the "conn clr" rpc to setup a data connection with psp_responder, but never uses a matching "data close" rpc. This creates a race condition where if the client can queue another data sock request, like in data_send_bad_key(), before the server can accept the old connection from the backlog we end up in a situation where we have two connections in the backlog: one for the closed connection we have received a FIN for, and one for the new PSP connection which is expecting to do key exchange. From there the server pops the closed connection from the backlog, but the data_send_bad_key() test case in psp.py hangs waiting to perform key exchange. The fix is to properly use _conn_close, which fill force the server to remove the closed connection from the backlog before sending the RPC ack to the client. Signed-off-by: Daniel Zahka Link: https://patch.msgid.link/20260127-psp-flaky-test-v1-1-13403e390af3@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/psp.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/drivers/net/psp.py b/tools/testing/selftests/drivers/net/psp.py index 528a421ecf76..864d9fce1094 100755 --- a/tools/testing/selftests/drivers/net/psp.py +++ b/tools/testing/selftests/drivers/net/psp.py @@ -266,6 +266,7 @@ def assoc_sk_only_mismatch(cfg): the_exception = cm.exception ksft_eq(the_exception.nl_msg.extack['bad-attr'], ".dev-id") ksft_eq(the_exception.nl_msg.error, -errno.EINVAL) + _close_conn(cfg, s) def assoc_sk_only_mismatch_tx(cfg): @@ -283,6 +284,7 @@ def assoc_sk_only_mismatch_tx(cfg): the_exception = cm.exception ksft_eq(the_exception.nl_msg.extack['bad-attr'], ".dev-id") ksft_eq(the_exception.nl_msg.error, -errno.EINVAL) + _close_conn(cfg, s) def assoc_sk_only_unconn(cfg): -- cgit v1.2.3 From 5fc90003de59b0f772a1654f5609fa5f87b4300f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Jan 2026 17:48:06 +0000 Subject: selftests: drv-net: toeplitz: accept bigger rss keys /proc/sys/net/core/netdev_rss_key got bigger (256 bytes instead of 52) Fixes: 37b0ea8fef56 ("net: expand NETDEV_RSS_KEY_LEN to 256 bytes") Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20260127174806.886561-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/toeplitz.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/drivers/net/hw/toeplitz.c b/tools/testing/selftests/drivers/net/hw/toeplitz.c index 285bb17df9c2..cd4bf58a44ee 100644 --- a/tools/testing/selftests/drivers/net/hw/toeplitz.c +++ b/tools/testing/selftests/drivers/net/hw/toeplitz.c @@ -59,7 +59,7 @@ #include "../../../net/lib/ksft.h" #define TOEPLITZ_KEY_MIN_LEN 40 -#define TOEPLITZ_KEY_MAX_LEN 60 +#define TOEPLITZ_KEY_MAX_LEN 256 #define TOEPLITZ_STR_LEN(K) (((K) * 3) - 1) /* hex encoded: AA:BB:CC:...:ZZ */ #define TOEPLITZ_STR_MIN_LEN TOEPLITZ_STR_LEN(TOEPLITZ_KEY_MIN_LEN) -- cgit v1.2.3 From 8467458dfa61b37e259e3485a5d3e415d08193c1 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 27 Jan 2026 20:27:24 +0100 Subject: selftests: mptcp: check no dup close events after error This validates the previous commit: subflow closed events are re-sent with less info when the initial subflow is disconnected after an error and each time a subflow is closed after that. In this new test, the userspace PM is involved because that's how it was discovered, but it is not specific to it. The initial subflow is terminated with a RESET, and that will cause the subflow disconnect. Then, a new subflow is initiated, but also got rejected, which cause a second subflow closed event, but not a third one. While at it, in case of failure to get the expected amount of events, the events are printed. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. Fixes: d82809b6c5f2 ("mptcp: avoid duplicated SUB_CLOSED events") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260127-net-mptcp-dup-nl-events-v1-2-7f71e1bc4feb@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 51 +++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index b2e6e548f796..1765714a1e2f 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3872,11 +3872,32 @@ chk_evt_nr() count=$(grep -cw "type:${evt}" "${evts}") if [ "${count}" != "${exp}" ]; then fail_test "got ${count} events, expected ${exp}" + cat "${evts}" else print_ok fi } +# $1: ns ; $2: event type ; $3: expected count +wait_event() +{ + local ns="${1}" + local evt_name="${2}" + local exp="${3}" + + local evt="${!evt_name}" + local evts="${evts_ns1}" + local count + + [ "${ns}" == "ns2" ] && evts="${evts_ns2}" + + for _ in $(seq 100); do + count=$(grep -cw "type:${evt}" "${evts}") + [ "${count}" -ge "${exp}" ] && break + sleep 0.1 + done +} + userspace_tests() { # userspace pm type prevents add_addr @@ -4085,6 +4106,36 @@ userspace_tests() kill_events_pids mptcp_lib_kill_group_wait $tests_pid fi + + # userspace pm no duplicated spurious close events after an error + if reset_with_events "userspace pm no dup close events after error" && + continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then + set_userspace_pm $ns2 + pm_nl_set_limits $ns1 0 2 + { timeout_test=120 test_linkfail=128 speed=slow \ + run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null + local tests_pid=$! + wait_event ns2 MPTCP_LIB_EVENT_ESTABLISHED 1 + userspace_pm_add_sf $ns2 10.0.3.2 20 + chk_mptcp_info subflows 1 subflows 1 + chk_subflows_total 2 2 + + # force quick loss + ip netns exec $ns2 sysctl -q net.ipv4.tcp_syn_retries=1 + if ip netns exec "${ns1}" ${iptables} -A INPUT -s "10.0.1.2" \ + -p tcp --tcp-option 30 -j REJECT --reject-with tcp-reset && + ip netns exec "${ns2}" ${iptables} -A INPUT -d "10.0.1.2" \ + -p tcp --tcp-option 30 -j REJECT --reject-with tcp-reset; then + wait_event ns2 MPTCP_LIB_EVENT_SUB_CLOSED 1 + wait_event ns1 MPTCP_LIB_EVENT_SUB_CLOSED 1 + chk_subflows_total 1 1 + userspace_pm_add_sf $ns2 10.0.1.2 0 + wait_event ns2 MPTCP_LIB_EVENT_SUB_CLOSED 2 + chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_CLOSED 2 + fi + kill_events_pids + mptcp_lib_kill_group_wait $tests_pid + fi } endpoint_tests() -- cgit v1.2.3 From 2ef9e3a3845d0a20b62b01f5b731debd0364688d Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 27 Jan 2026 20:27:26 +0100 Subject: selftests: mptcp: check subflow errors in close events This validates the previous commit: subflow closed events should contain an error field when a subflow got closed with an error, e.g. reset or timeout. For this test, the chk_evt_nr helper has been extended to check attributes in the matched events. In this test, the 2 subflow closed events should have an error. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. Fixes: 15cc10453398 ("mptcp: deliver ssk errors to msk") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260127-net-mptcp-dup-nl-events-v1-4-7f71e1bc4feb@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 1765714a1e2f..3fc29201362a 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3847,21 +3847,28 @@ userspace_pm_chk_get_addr() fi } -# $1: ns ; $2: event type ; $3: count +# $1: ns ; $2: event type ; $3: count ; [ $4: attr ; $5: attr count ] chk_evt_nr() { local ns=${1} local evt_name="${2}" local exp="${3}" + local attr="${4}" + local attr_exp="${5}" local evts="${evts_ns1}" local evt="${!evt_name}" + local attr_name local count + if [ -n "${attr}" ]; then + attr_name=", ${attr}: ${attr_exp}" + fi + evt_name="${evt_name:16}" # without MPTCP_LIB_EVENT_ [ "${ns}" == "ns2" ] && evts="${evts_ns2}" - print_check "event ${ns} ${evt_name} (${exp})" + print_check "event ${ns} ${evt_name} (${exp}${attr_name})" if [[ "${evt_name}" = "LISTENER_"* ]] && ! mptcp_lib_kallsyms_has "mptcp_event_pm_listener$"; then @@ -3873,6 +3880,16 @@ chk_evt_nr() if [ "${count}" != "${exp}" ]; then fail_test "got ${count} events, expected ${exp}" cat "${evts}" + return + elif [ -z "${attr}" ]; then + print_ok + return + fi + + count=$(grep -w "type:${evt}" "${evts}" | grep -c ",${attr}:") + if [ "${count}" != "${attr_exp}" ]; then + fail_test "got ${count} event attributes, expected ${attr_exp}" + grep -w "type:${evt}" "${evts}" else print_ok fi @@ -4131,7 +4148,7 @@ userspace_tests() chk_subflows_total 1 1 userspace_pm_add_sf $ns2 10.0.1.2 0 wait_event ns2 MPTCP_LIB_EVENT_SUB_CLOSED 2 - chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_CLOSED 2 + chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_CLOSED 2 error 2 fi kill_events_pids mptcp_lib_kill_group_wait $tests_pid -- cgit v1.2.3 From c5d5ecf21fdd9ce91e6116feb3aa83cee73352cc Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 27 Jan 2026 20:27:27 +0100 Subject: selftests: mptcp: join: fix local endp not being tracked When running this mptcp_join.sh selftest on older kernel versions not supporting local endpoints tracking, this test fails because 3 MP_JOIN ACKs have been received, while only 2 were expected. It is not clear why only 2 MP_JOIN ACKs were expected on old kernel versions, while 3 MP_JOIN SYN and SYN+ACK were expected. When testing on the v5.15.197 kernel, 3 MP_JOIN ACKs are seen, which is also what is expected in the selftests included in this kernel version, see commit f4480eaad489 ("selftests: mptcp: add missing join check"). Switch the expected MP_JOIN ACKs to 3. While at it, move this chk_join_nr helper out of the special condition for older kernel versions as it is now the same as with more recent ones. Also, invert the condition to be more logical: what's expected on newer kernel versions having such helper first. Fixes: d4c81bbb8600 ("selftests: mptcp: join: support local endpoint being tracked or not") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260127-net-mptcp-dup-nl-events-v1-5-7f71e1bc4feb@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 3fc29201362a..e70d3420954f 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -2329,17 +2329,16 @@ signal_address_tests() ip netns exec $ns1 sysctl -q net.mptcp.add_addr_timeout=1 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr 3 3 3 # It is not directly linked to the commit introducing this # symbol but for the parent one which is linked anyway. - if ! mptcp_lib_kallsyms_has "mptcp_pm_subflow_check_next$"; then - chk_join_nr 3 3 2 - chk_add_nr 4 4 - else - chk_join_nr 3 3 3 + if mptcp_lib_kallsyms_has "mptcp_pm_subflow_check_next$"; then # the server will not signal the address terminating # the MPC subflow chk_add_nr 3 3 + else + chk_add_nr 4 4 fi fi } -- cgit v1.2.3 From 5e51803521938566bdf099501379a9bdbacb6066 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 22 Jan 2026 18:46:17 +0100 Subject: selftests: netfilter: nft_flowtable.sh: Add IP6IP6 flowtable selftest Similar to IPIP, introduce specific selftest for IP6IP6 flowtable SW acceleration in nft_flowtable.sh Signed-off-by: Lorenzo Bianconi Signed-off-by: Florian Westphal --- .../selftests/net/netfilter/nft_flowtable.sh | 62 ++++++++++++++++++---- 1 file changed, 53 insertions(+), 9 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/net/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh index a68bc882fa4e..14d7f67715ed 100755 --- a/tools/testing/selftests/net/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh @@ -592,16 +592,28 @@ ip -net "$nsr1" link set tun0 up ip -net "$nsr1" addr add 192.168.100.1/24 dev tun0 ip netns exec "$nsr1" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null +ip -net "$nsr1" link add name tun6 type ip6tnl local fee1:2::1 remote fee1:2::2 +ip -net "$nsr1" link set tun6 up +ip -net "$nsr1" addr add fee1:3::1/64 dev tun6 nodad + ip -net "$nsr2" link add name tun0 type ipip local 192.168.10.2 remote 192.168.10.1 ip -net "$nsr2" link set tun0 up ip -net "$nsr2" addr add 192.168.100.2/24 dev tun0 ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null +ip -net "$nsr2" link add name tun6 type ip6tnl local fee1:2::2 remote fee1:2::1 +ip -net "$nsr2" link set tun6 up +ip -net "$nsr2" addr add fee1:3::2/64 dev tun6 nodad + ip -net "$nsr1" route change default via 192.168.100.2 ip -net "$nsr2" route change default via 192.168.100.1 +ip -6 -net "$nsr1" route change default via fee1:3::2 +ip -6 -net "$nsr2" route change default via fee1:3::1 ip -net "$ns2" route add default via 10.0.2.1 +ip -6 -net "$ns2" route add default via dead:2::1 ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun0 accept' +ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun6 accept' ip netns exec "$nsr1" nft -a insert rule inet filter forward \ 'meta oif "veth0" tcp sport 12345 ct mark set 1 flow add @f1 counter name routed_repl accept' @@ -611,28 +623,51 @@ if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel"; then ret=1 fi +if test_tcp_forwarding "$ns1" "$ns2" 1 6 "[dead:2::99]" 12345; then + echo "PASS: flow offload for ns1/ns2 IP6IP6 tunnel" +else + echo "FAIL: flow offload for ns1/ns2 with IP6IP6 tunnel" 1>&2 + ip netns exec "$nsr1" nft list ruleset + ret=1 +fi + # Create vlan tagged devices for IPIP traffic. ip -net "$nsr1" link add link veth1 name veth1.10 type vlan id 10 ip -net "$nsr1" link set veth1.10 up ip -net "$nsr1" addr add 192.168.20.1/24 dev veth1.10 +ip -net "$nsr1" addr add fee1:4::1/64 dev veth1.10 nodad ip netns exec "$nsr1" sysctl net.ipv4.conf.veth1/10.forwarding=1 > /dev/null ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif veth1.10 accept' -ip -net "$nsr1" link add name tun1 type ipip local 192.168.20.1 remote 192.168.20.2 -ip -net "$nsr1" link set tun1 up -ip -net "$nsr1" addr add 192.168.200.1/24 dev tun1 + +ip -net "$nsr1" link add name tun0.10 type ipip local 192.168.20.1 remote 192.168.20.2 +ip -net "$nsr1" link set tun0.10 up +ip -net "$nsr1" addr add 192.168.200.1/24 dev tun0.10 ip -net "$nsr1" route change default via 192.168.200.2 -ip netns exec "$nsr1" sysctl net.ipv4.conf.tun1.forwarding=1 > /dev/null -ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun1 accept' +ip netns exec "$nsr1" sysctl net.ipv4.conf.tun0/10.forwarding=1 > /dev/null +ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun0.10 accept' + +ip -net "$nsr1" link add name tun6.10 type ip6tnl local fee1:4::1 remote fee1:4::2 +ip -net "$nsr1" link set tun6.10 up +ip -net "$nsr1" addr add fee1:5::1/64 dev tun6.10 nodad +ip -6 -net "$nsr1" route change default via fee1:5::2 +ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun6.10 accept' ip -net "$nsr2" link add link veth0 name veth0.10 type vlan id 10 ip -net "$nsr2" link set veth0.10 up ip -net "$nsr2" addr add 192.168.20.2/24 dev veth0.10 +ip -net "$nsr2" addr add fee1:4::2/64 dev veth0.10 nodad ip netns exec "$nsr2" sysctl net.ipv4.conf.veth0/10.forwarding=1 > /dev/null -ip -net "$nsr2" link add name tun1 type ipip local 192.168.20.2 remote 192.168.20.1 -ip -net "$nsr2" link set tun1 up -ip -net "$nsr2" addr add 192.168.200.2/24 dev tun1 + +ip -net "$nsr2" link add name tun0.10 type ipip local 192.168.20.2 remote 192.168.20.1 +ip -net "$nsr2" link set tun0.10 up +ip -net "$nsr2" addr add 192.168.200.2/24 dev tun0.10 ip -net "$nsr2" route change default via 192.168.200.1 -ip netns exec "$nsr2" sysctl net.ipv4.conf.tun1.forwarding=1 > /dev/null +ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0/10.forwarding=1 > /dev/null + +ip -net "$nsr2" link add name tun6.10 type ip6tnl local fee1:4::2 remote fee1:4::1 +ip -net "$nsr2" link set tun6.10 up +ip -net "$nsr2" addr add fee1:5::2/64 dev tun6.10 nodad +ip -6 -net "$nsr2" route change default via fee1:5::1 if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel over vlan"; then echo "FAIL: flow offload for ns1/ns2 with IPIP tunnel over vlan" 1>&2 @@ -640,10 +675,19 @@ if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel over vlan"; then ret=1 fi +if test_tcp_forwarding "$ns1" "$ns2" 1 6 "[dead:2::99]" 12345; then + echo "PASS: flow offload for ns1/ns2 IP6IP6 tunnel over vlan" +else + echo "FAIL: flow offload for ns1/ns2 with IP6IP6 tunnel over vlan" 1>&2 + ip netns exec "$nsr1" nft list ruleset + ret=1 +fi + # Restore the previous configuration ip -net "$nsr1" route change default via 192.168.10.2 ip -net "$nsr2" route change default via 192.168.10.1 ip -net "$ns2" route del default via 10.0.2.1 +ip -6 -net "$ns2" route del default via dead:2::1 } # Another test: -- cgit v1.2.3 From 462a94fb8ae8ba0d4d3901c7283b4af052ab8804 Mon Sep 17 00:00:00 2001 From: Paul Walmsley Date: Sun, 25 Jan 2026 21:09:55 -0700 Subject: riscv: hwprobe: add support for RISCV_HWPROBE_KEY_IMA_EXT_1 We've run out of bits to describe RISC-V ISA extensions in our initial hwprobe key, RISCV_HWPROBE_KEY_IMA_EXT_0. So, let's add RISCV_HWPROBE_KEY_IMA_EXT_1, along with the framework to set the appropriate hwprobe tuple, and add testing for it. Based on a suggestion from Andrew Jones , also fix the documentation for RISCV_HWPROBE_KEY_IMA_EXT_0. Reviewed-by: Andrew Jones Signed-off-by: Paul Walmsley --- Documentation/arch/riscv/hwprobe.rst | 6 +- arch/riscv/include/asm/hwprobe.h | 3 +- arch/riscv/include/uapi/asm/hwprobe.h | 1 + arch/riscv/kernel/sys_hwprobe.c | 169 ++++++++++++--------- tools/testing/selftests/riscv/hwprobe/which-cpus.c | 18 ++- 5 files changed, 121 insertions(+), 76 deletions(-) (limited to 'tools/testing') diff --git a/Documentation/arch/riscv/hwprobe.rst b/Documentation/arch/riscv/hwprobe.rst index 641ec4abb906..c420a8349bc6 100644 --- a/Documentation/arch/riscv/hwprobe.rst +++ b/Documentation/arch/riscv/hwprobe.rst @@ -67,7 +67,7 @@ The following keys are defined: programs (it may still be executed in userspace via a kernel-controlled mechanism such as the vDSO). -* :c:macro:`RISCV_HWPROBE_KEY_IMA_EXT_0`: A bitmask containing the extensions +* :c:macro:`RISCV_HWPROBE_KEY_IMA_EXT_0`: A bitmask containing extensions that are compatible with the :c:macro:`RISCV_HWPROBE_BASE_BEHAVIOR_IMA`: base system behavior. @@ -387,3 +387,7 @@ The following keys are defined: * :c:macro:`RISCV_HWPROBE_KEY_ZICBOP_BLOCK_SIZE`: An unsigned int which represents the size of the Zicbop block in bytes. + +* :c:macro:`RISCV_HWPROBE_KEY_IMA_EXT_1`: A bitmask containing additional + extensions that are compatible with the + :c:macro:`RISCV_HWPROBE_BASE_BEHAVIOR_IMA`: base system behavior. diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h index 8c572a464719..8b9f5e1cf4cb 100644 --- a/arch/riscv/include/asm/hwprobe.h +++ b/arch/riscv/include/asm/hwprobe.h @@ -8,7 +8,7 @@ #include -#define RISCV_HWPROBE_MAX_KEY 15 +#define RISCV_HWPROBE_MAX_KEY 16 static inline bool riscv_hwprobe_key_is_valid(__s64 key) { @@ -20,6 +20,7 @@ static inline bool hwprobe_key_is_bitmask(__s64 key) switch (key) { case RISCV_HWPROBE_KEY_BASE_BEHAVIOR: case RISCV_HWPROBE_KEY_IMA_EXT_0: + case RISCV_HWPROBE_KEY_IMA_EXT_1: case RISCV_HWPROBE_KEY_CPUPERF_0: case RISCV_HWPROBE_KEY_VENDOR_EXT_THEAD_0: case RISCV_HWPROBE_KEY_VENDOR_EXT_MIPS_0: diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h index cd3c126730c3..ed2621a5a47d 100644 --- a/arch/riscv/include/uapi/asm/hwprobe.h +++ b/arch/riscv/include/uapi/asm/hwprobe.h @@ -113,6 +113,7 @@ struct riscv_hwprobe { #define RISCV_HWPROBE_KEY_VENDOR_EXT_SIFIVE_0 13 #define RISCV_HWPROBE_KEY_VENDOR_EXT_MIPS_0 14 #define RISCV_HWPROBE_KEY_ZICBOP_BLOCK_SIZE 15 +#define RISCV_HWPROBE_KEY_IMA_EXT_1 16 /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */ /* Flags */ diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c index e6787ba7f2fc..53731ace7984 100644 --- a/arch/riscv/kernel/sys_hwprobe.c +++ b/arch/riscv/kernel/sys_hwprobe.c @@ -24,6 +24,14 @@ #include +#define EXT_KEY(isa_arg, ext, pv, missing) \ + do { \ + if (__riscv_isa_extension_available(isa_arg, RISCV_ISA_EXT_##ext)) \ + pv |= RISCV_HWPROBE_EXT_##ext; \ + else \ + missing |= RISCV_HWPROBE_EXT_##ext; \ + } while (false) + static void hwprobe_arch_id(struct riscv_hwprobe *pair, const struct cpumask *cpus) { @@ -93,90 +101,109 @@ static void hwprobe_isa_ext0(struct riscv_hwprobe *pair, for_each_cpu(cpu, cpus) { struct riscv_isainfo *isainfo = &hart_isa[cpu]; -#define EXT_KEY(ext) \ - do { \ - if (__riscv_isa_extension_available(isainfo->isa, RISCV_ISA_EXT_##ext)) \ - pair->value |= RISCV_HWPROBE_EXT_##ext; \ - else \ - missing |= RISCV_HWPROBE_EXT_##ext; \ - } while (false) - /* * Only use EXT_KEY() for extensions which can be exposed to userspace, * regardless of the kernel's configuration, as no other checks, besides * presence in the hart_isa bitmap, are made. */ - EXT_KEY(ZAAMO); - EXT_KEY(ZABHA); - EXT_KEY(ZACAS); - EXT_KEY(ZALASR); - EXT_KEY(ZALRSC); - EXT_KEY(ZAWRS); - EXT_KEY(ZBA); - EXT_KEY(ZBB); - EXT_KEY(ZBC); - EXT_KEY(ZBKB); - EXT_KEY(ZBKC); - EXT_KEY(ZBKX); - EXT_KEY(ZBS); - EXT_KEY(ZCA); - EXT_KEY(ZCB); - EXT_KEY(ZCLSD); - EXT_KEY(ZCMOP); - EXT_KEY(ZICBOM); - EXT_KEY(ZICBOP); - EXT_KEY(ZICBOZ); - EXT_KEY(ZICNTR); - EXT_KEY(ZICOND); - EXT_KEY(ZIHINTNTL); - EXT_KEY(ZIHINTPAUSE); - EXT_KEY(ZIHPM); - EXT_KEY(ZILSD); - EXT_KEY(ZIMOP); - EXT_KEY(ZKND); - EXT_KEY(ZKNE); - EXT_KEY(ZKNH); - EXT_KEY(ZKSED); - EXT_KEY(ZKSH); - EXT_KEY(ZKT); - EXT_KEY(ZTSO); + EXT_KEY(isainfo->isa, ZAAMO, pair->value, missing); + EXT_KEY(isainfo->isa, ZABHA, pair->value, missing); + EXT_KEY(isainfo->isa, ZACAS, pair->value, missing); + EXT_KEY(isainfo->isa, ZALASR, pair->value, missing); + EXT_KEY(isainfo->isa, ZALRSC, pair->value, missing); + EXT_KEY(isainfo->isa, ZAWRS, pair->value, missing); + EXT_KEY(isainfo->isa, ZBA, pair->value, missing); + EXT_KEY(isainfo->isa, ZBB, pair->value, missing); + EXT_KEY(isainfo->isa, ZBC, pair->value, missing); + EXT_KEY(isainfo->isa, ZBKB, pair->value, missing); + EXT_KEY(isainfo->isa, ZBKC, pair->value, missing); + EXT_KEY(isainfo->isa, ZBKX, pair->value, missing); + EXT_KEY(isainfo->isa, ZBS, pair->value, missing); + EXT_KEY(isainfo->isa, ZCA, pair->value, missing); + EXT_KEY(isainfo->isa, ZCB, pair->value, missing); + EXT_KEY(isainfo->isa, ZCLSD, pair->value, missing); + EXT_KEY(isainfo->isa, ZCMOP, pair->value, missing); + EXT_KEY(isainfo->isa, ZICBOM, pair->value, missing); + EXT_KEY(isainfo->isa, ZICBOP, pair->value, missing); + EXT_KEY(isainfo->isa, ZICBOZ, pair->value, missing); + EXT_KEY(isainfo->isa, ZICNTR, pair->value, missing); + EXT_KEY(isainfo->isa, ZICOND, pair->value, missing); + EXT_KEY(isainfo->isa, ZIHINTNTL, pair->value, missing); + EXT_KEY(isainfo->isa, ZIHINTPAUSE, pair->value, missing); + EXT_KEY(isainfo->isa, ZIHPM, pair->value, missing); + EXT_KEY(isainfo->isa, ZILSD, pair->value, missing); + EXT_KEY(isainfo->isa, ZIMOP, pair->value, missing); + EXT_KEY(isainfo->isa, ZKND, pair->value, missing); + EXT_KEY(isainfo->isa, ZKNE, pair->value, missing); + EXT_KEY(isainfo->isa, ZKNH, pair->value, missing); + EXT_KEY(isainfo->isa, ZKSED, pair->value, missing); + EXT_KEY(isainfo->isa, ZKSH, pair->value, missing); + EXT_KEY(isainfo->isa, ZKT, pair->value, missing); + EXT_KEY(isainfo->isa, ZTSO, pair->value, missing); /* * All the following extensions must depend on the kernel * support of V. */ if (has_vector()) { - EXT_KEY(ZVBB); - EXT_KEY(ZVBC); - EXT_KEY(ZVE32F); - EXT_KEY(ZVE32X); - EXT_KEY(ZVE64D); - EXT_KEY(ZVE64F); - EXT_KEY(ZVE64X); - EXT_KEY(ZVFBFMIN); - EXT_KEY(ZVFBFWMA); - EXT_KEY(ZVFH); - EXT_KEY(ZVFHMIN); - EXT_KEY(ZVKB); - EXT_KEY(ZVKG); - EXT_KEY(ZVKNED); - EXT_KEY(ZVKNHA); - EXT_KEY(ZVKNHB); - EXT_KEY(ZVKSED); - EXT_KEY(ZVKSH); - EXT_KEY(ZVKT); + EXT_KEY(isainfo->isa, ZVBB, pair->value, missing); + EXT_KEY(isainfo->isa, ZVBC, pair->value, missing); + EXT_KEY(isainfo->isa, ZVE32F, pair->value, missing); + EXT_KEY(isainfo->isa, ZVE32X, pair->value, missing); + EXT_KEY(isainfo->isa, ZVE64D, pair->value, missing); + EXT_KEY(isainfo->isa, ZVE64F, pair->value, missing); + EXT_KEY(isainfo->isa, ZVE64X, pair->value, missing); + EXT_KEY(isainfo->isa, ZVFBFMIN, pair->value, missing); + EXT_KEY(isainfo->isa, ZVFBFWMA, pair->value, missing); + EXT_KEY(isainfo->isa, ZVFH, pair->value, missing); + EXT_KEY(isainfo->isa, ZVFHMIN, pair->value, missing); + EXT_KEY(isainfo->isa, ZVKB, pair->value, missing); + EXT_KEY(isainfo->isa, ZVKG, pair->value, missing); + EXT_KEY(isainfo->isa, ZVKNED, pair->value, missing); + EXT_KEY(isainfo->isa, ZVKNHA, pair->value, missing); + EXT_KEY(isainfo->isa, ZVKNHB, pair->value, missing); + EXT_KEY(isainfo->isa, ZVKSED, pair->value, missing); + EXT_KEY(isainfo->isa, ZVKSH, pair->value, missing); + EXT_KEY(isainfo->isa, ZVKT, pair->value, missing); } - EXT_KEY(ZCD); - EXT_KEY(ZCF); - EXT_KEY(ZFA); - EXT_KEY(ZFBFMIN); - EXT_KEY(ZFH); - EXT_KEY(ZFHMIN); + EXT_KEY(isainfo->isa, ZCD, pair->value, missing); + EXT_KEY(isainfo->isa, ZCF, pair->value, missing); + EXT_KEY(isainfo->isa, ZFA, pair->value, missing); + EXT_KEY(isainfo->isa, ZFBFMIN, pair->value, missing); + EXT_KEY(isainfo->isa, ZFH, pair->value, missing); + EXT_KEY(isainfo->isa, ZFHMIN, pair->value, missing); if (IS_ENABLED(CONFIG_RISCV_ISA_SUPM)) - EXT_KEY(SUPM); -#undef EXT_KEY + EXT_KEY(isainfo->isa, SUPM, pair->value, missing); + } + + /* Now turn off reporting features if any CPU is missing it. */ + pair->value &= ~missing; +} + +static void hwprobe_isa_ext1(struct riscv_hwprobe *pair, + const struct cpumask *cpus) +{ + int cpu; + u64 missing = 0; + + pair->value = 0; + + /* + * Loop through and record extensions that 1) anyone has, and 2) anyone + * doesn't have. + */ + for_each_cpu(cpu, cpus) { + /* struct riscv_isainfo *isainfo = &hart_isa[cpu]; */ + + /* + * Only use EXT_KEY() for extensions which can be + * exposed to userspace, regardless of the kernel's + * configuration, as no other checks, besides presence + * in the hart_isa bitmap, are made. + */ + /* Nothing here yet */ } /* Now turn off reporting features if any CPU is missing it. */ @@ -287,6 +314,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair, hwprobe_isa_ext0(pair, cpus); break; + case RISCV_HWPROBE_KEY_IMA_EXT_1: + hwprobe_isa_ext1(pair, cpus); + break; + case RISCV_HWPROBE_KEY_CPUPERF_0: case RISCV_HWPROBE_KEY_MISALIGNED_SCALAR_PERF: pair->value = hwprobe_misaligned(cpus); diff --git a/tools/testing/selftests/riscv/hwprobe/which-cpus.c b/tools/testing/selftests/riscv/hwprobe/which-cpus.c index 3ab53067e8dd..587feb198c04 100644 --- a/tools/testing/selftests/riscv/hwprobe/which-cpus.c +++ b/tools/testing/selftests/riscv/hwprobe/which-cpus.c @@ -83,9 +83,9 @@ static void do_which_cpus(int argc, char **argv, cpu_set_t *cpus) int main(int argc, char **argv) { - struct riscv_hwprobe pairs[2]; + struct riscv_hwprobe pairs[3]; cpu_set_t cpus_aff, cpus; - __u64 ext0_all; + __u64 ext0_all, ext1_all; long rc; rc = sched_getaffinity(0, sizeof(cpu_set_t), &cpus_aff); @@ -112,6 +112,11 @@ int main(int argc, char **argv) assert(rc == 0 && pairs[0].key == RISCV_HWPROBE_KEY_IMA_EXT_0); ext0_all = pairs[0].value; + pairs[0] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_IMA_EXT_1, }; + rc = riscv_hwprobe(pairs, 1, 0, NULL, 0); + assert(rc == 0 && pairs[0].key == RISCV_HWPROBE_KEY_IMA_EXT_1); + ext1_all = pairs[0].value; + pairs[0] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_BASE_BEHAVIOR, .value = RISCV_HWPROBE_BASE_BEHAVIOR_IMA, }; CPU_ZERO(&cpus); rc = riscv_hwprobe(pairs, 1, 0, (unsigned long *)&cpus, RISCV_HWPROBE_WHICH_CPUS); @@ -134,20 +139,23 @@ int main(int argc, char **argv) pairs[0] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_BASE_BEHAVIOR, .value = RISCV_HWPROBE_BASE_BEHAVIOR_IMA, }; pairs[1] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_IMA_EXT_0, .value = ext0_all, }; + pairs[2] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_IMA_EXT_1, .value = ext1_all, }; CPU_ZERO(&cpus); - rc = riscv_hwprobe(pairs, 2, sizeof(cpu_set_t), (unsigned long *)&cpus, RISCV_HWPROBE_WHICH_CPUS); + rc = riscv_hwprobe(pairs, 3, sizeof(cpu_set_t), (unsigned long *)&cpus, RISCV_HWPROBE_WHICH_CPUS); ksft_test_result(rc == 0 && CPU_COUNT(&cpus) == sysconf(_SC_NPROCESSORS_ONLN), "set all cpus\n"); pairs[0] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_BASE_BEHAVIOR, .value = RISCV_HWPROBE_BASE_BEHAVIOR_IMA, }; pairs[1] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_IMA_EXT_0, .value = ext0_all, }; + pairs[2] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_IMA_EXT_1, .value = ext1_all, }; memcpy(&cpus, &cpus_aff, sizeof(cpu_set_t)); - rc = riscv_hwprobe(pairs, 2, sizeof(cpu_set_t), (unsigned long *)&cpus, RISCV_HWPROBE_WHICH_CPUS); + rc = riscv_hwprobe(pairs, 3, sizeof(cpu_set_t), (unsigned long *)&cpus, RISCV_HWPROBE_WHICH_CPUS); ksft_test_result(rc == 0 && CPU_EQUAL(&cpus, &cpus_aff), "set all affinity cpus\n"); pairs[0] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_BASE_BEHAVIOR, .value = RISCV_HWPROBE_BASE_BEHAVIOR_IMA, }; pairs[1] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_IMA_EXT_0, .value = ~ext0_all, }; + pairs[2] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_IMA_EXT_1, .value = ~ext1_all, }; memcpy(&cpus, &cpus_aff, sizeof(cpu_set_t)); - rc = riscv_hwprobe(pairs, 2, sizeof(cpu_set_t), (unsigned long *)&cpus, RISCV_HWPROBE_WHICH_CPUS); + rc = riscv_hwprobe(pairs, 3, sizeof(cpu_set_t), (unsigned long *)&cpus, RISCV_HWPROBE_WHICH_CPUS); ksft_test_result(rc == 0 && CPU_COUNT(&cpus) == 0, "clear all cpus\n"); ksft_finished(); -- cgit v1.2.3 From d30c1683aaecb93d2ab95685dc4300a33d3cea7a Mon Sep 17 00:00:00 2001 From: Deepak Gupta Date: Sun, 25 Jan 2026 21:09:56 -0700 Subject: kselftest/riscv: add kselftest for user mode CFI Add a kselftest for RISC-V control flow integrity implementation for user mode. There is not a lot going on in the kernel to enable landing pad for user mode. CFI selftests are intended to be compiled with a zicfilp and zicfiss enabled compiler. This kselftest simply checks if landing pads and shadow stacks for the process are enabled or not and executes ptrace selftests on CFI. The selftest then registers a SIGSEGV signal handler. Any control flow violations are reported as SIGSEGV with si_code = SEGV_CPERR. The test will fail on receiving any SEGV_CPERR. The shadow stack part has more changes in the kernel, and thus there are separate tests for that. - Exercise 'map_shadow_stack' syscall - 'fork' test to make sure COW works for shadow stack pages - gup tests Kernel uses FOLL_FORCE when access happens to memory via /proc//mem. Not breaking that for shadow stack. - signal test. Make sure signal delivery results in token creation on shadow stack and consumes (and verifies) token on sigreturn - shadow stack protection test. attempts to write using regular store instruction on shadow stack memory must result in access faults - ptrace test: adds landing pad violation, clears ELP and continues In case the toolchain doesn't support the CFI extension, the CFI kselftest won't be built. Test output =========== """ TAP version 13 1..5 This is to ensure shadow stack is indeed enabled and working This is to ensure shadow stack is indeed enabled and working ok 1 shstk fork test ok 2 map shadow stack syscall ok 3 shadow stack gup tests ok 4 shadow stack signal tests ok 5 memory protections of shadow stack memory """ Suggested-by: Charlie Jenkins Signed-off-by: Charlie Jenkins Signed-off-by: Deepak Gupta Tested-by: Andreas Korb # QEMU, custom CVA6 Tested-by: Valentin Haudiquet Link: https://patch.msgid.link/20251112-v5_user_cfi_series-v23-28-b55691eacf4f@rivosinc.com [pjw@kernel.org: updated to apply; cleaned up patch description, code comments] Signed-off-by: Paul Walmsley --- tools/testing/selftests/riscv/Makefile | 2 +- tools/testing/selftests/riscv/cfi/.gitignore | 2 + tools/testing/selftests/riscv/cfi/Makefile | 23 ++ tools/testing/selftests/riscv/cfi/cfi_rv_test.h | 82 +++++ tools/testing/selftests/riscv/cfi/cfitests.c | 173 +++++++++++ tools/testing/selftests/riscv/cfi/shadowstack.c | 385 ++++++++++++++++++++++++ tools/testing/selftests/riscv/cfi/shadowstack.h | 27 ++ 7 files changed, 693 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/riscv/cfi/.gitignore create mode 100644 tools/testing/selftests/riscv/cfi/Makefile create mode 100644 tools/testing/selftests/riscv/cfi/cfi_rv_test.h create mode 100644 tools/testing/selftests/riscv/cfi/cfitests.c create mode 100644 tools/testing/selftests/riscv/cfi/shadowstack.c create mode 100644 tools/testing/selftests/riscv/cfi/shadowstack.h (limited to 'tools/testing') diff --git a/tools/testing/selftests/riscv/Makefile b/tools/testing/selftests/riscv/Makefile index 099b8c1f46f8..5671b4405a12 100644 --- a/tools/testing/selftests/riscv/Makefile +++ b/tools/testing/selftests/riscv/Makefile @@ -5,7 +5,7 @@ ARCH ?= $(shell uname -m 2>/dev/null || echo not) ifneq (,$(filter $(ARCH),riscv)) -RISCV_SUBTARGETS ?= abi hwprobe mm sigreturn vector +RISCV_SUBTARGETS ?= abi hwprobe mm sigreturn vector cfi else RISCV_SUBTARGETS := endif diff --git a/tools/testing/selftests/riscv/cfi/.gitignore b/tools/testing/selftests/riscv/cfi/.gitignore new file mode 100644 index 000000000000..c1faf7ca4346 --- /dev/null +++ b/tools/testing/selftests/riscv/cfi/.gitignore @@ -0,0 +1,2 @@ +cfitests +shadowstack diff --git a/tools/testing/selftests/riscv/cfi/Makefile b/tools/testing/selftests/riscv/cfi/Makefile new file mode 100644 index 000000000000..96a4dc4b69c3 --- /dev/null +++ b/tools/testing/selftests/riscv/cfi/Makefile @@ -0,0 +1,23 @@ +CFLAGS += $(KHDR_INCLUDES) +CFLAGS += -I$(top_srcdir)/tools/include + +CFLAGS += -march=rv64gc_zicfilp_zicfiss -fcf-protection=full + +# Check for zicfi* extensions needs cross compiler +# which is not set until lib.mk is included +ifeq ($(LLVM)$(CC),cc) +CC := $(CROSS_COMPILE)gcc +endif + + +ifeq ($(shell $(CC) $(CFLAGS) -nostdlib -xc /dev/null -o /dev/null > /dev/null 2>&1; echo $$?),0) +TEST_GEN_PROGS := cfitests + +$(OUTPUT)/cfitests: cfitests.c shadowstack.c + $(CC) -o$@ $(CFLAGS) $(LDFLAGS) $^ +else + +$(shell echo "Toolchain doesn't support CFI, skipping CFI kselftest." >&2) +endif + +include ../../lib.mk diff --git a/tools/testing/selftests/riscv/cfi/cfi_rv_test.h b/tools/testing/selftests/riscv/cfi/cfi_rv_test.h new file mode 100644 index 000000000000..1c8043f2b778 --- /dev/null +++ b/tools/testing/selftests/riscv/cfi/cfi_rv_test.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef SELFTEST_RISCV_CFI_H +#define SELFTEST_RISCV_CFI_H +#include +#include +#include "shadowstack.h" + +#define CHILD_EXIT_CODE_SSWRITE 10 +#define CHILD_EXIT_CODE_SIG_TEST 11 + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + register long _num __asm__ ("a7") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ + register long _arg3 __asm__ ("a2") = (long)(arg3); \ + register long _arg4 __asm__ ("a3") = (long)(arg4); \ + register long _arg5 __asm__ ("a4") = (long)(arg5); \ + \ + __asm__ volatile( \ + "ecall\n" \ + : "+r" \ + (_arg1) \ + : "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + register long _num __asm__ ("a7") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ + register long _arg3 __asm__ ("a2") = (long)(arg3); \ + \ + __asm__ volatile( \ + "ecall\n" \ + : "+r" (_arg1) \ + : "r"(_arg2), "r"(_arg3), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#ifndef __NR_prctl +#define __NR_prctl 167 +#endif + +#ifndef __NR_map_shadow_stack +#define __NR_map_shadow_stack 453 +#endif + +#define CSR_SSP 0x011 + +#ifdef __ASSEMBLY__ +#define __ASM_STR(x) x +#else +#define __ASM_STR(x) #x +#endif + +#define csr_read(csr) \ +({ \ + register unsigned long __v; \ + __asm__ __volatile__ ("csrr %0, " __ASM_STR(csr) \ + : "=r" (__v) : \ + : "memory"); \ + __v; \ +}) + +#define csr_write(csr, val) \ +({ \ + unsigned long __v = (unsigned long)(val); \ + __asm__ __volatile__ ("csrw " __ASM_STR(csr) ", %0" \ + : : "rK" (__v) \ + : "memory"); \ +}) + +#endif diff --git a/tools/testing/selftests/riscv/cfi/cfitests.c b/tools/testing/selftests/riscv/cfi/cfitests.c new file mode 100644 index 000000000000..298544854415 --- /dev/null +++ b/tools/testing/selftests/riscv/cfi/cfitests.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "../../kselftest.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cfi_rv_test.h" + +/* do not optimize cfi related test functions */ +#pragma GCC push_options +#pragma GCC optimize("O0") + +void sigsegv_handler(int signum, siginfo_t *si, void *uc) +{ + struct ucontext *ctx = (struct ucontext *)uc; + + if (si->si_code == SEGV_CPERR) { + ksft_print_msg("Control flow violation happened somewhere\n"); + ksft_print_msg("PC where violation happened %lx\n", ctx->uc_mcontext.gregs[0]); + exit(-1); + } + + /* all other cases are expected to be of shadow stack write case */ + exit(CHILD_EXIT_CODE_SSWRITE); +} + +bool register_signal_handler(void) +{ + struct sigaction sa = {}; + + sa.sa_sigaction = sigsegv_handler; + sa.sa_flags = SA_SIGINFO; + if (sigaction(SIGSEGV, &sa, NULL)) { + ksft_print_msg("Registering signal handler for landing pad violation failed\n"); + return false; + } + + return true; +} + +long ptrace(int request, pid_t pid, void *addr, void *data); + +bool cfi_ptrace_test(void) +{ + pid_t pid; + int status, ret = 0; + unsigned long ptrace_test_num = 0, total_ptrace_tests = 2; + + struct user_cfi_state cfi_reg; + struct iovec iov; + + pid = fork(); + + if (pid == -1) { + ksft_exit_fail_msg("%s: fork failed\n", __func__); + exit(1); + } + + if (pid == 0) { + /* allow to be traced */ + ptrace(PTRACE_TRACEME, 0, NULL, NULL); + raise(SIGSTOP); + asm volatile ("la a5, 1f\n" + "jalr a5\n" + "nop\n" + "nop\n" + "1: nop\n" + : : : "a5"); + exit(11); + /* child shouldn't go beyond here */ + } + + /* parent's code goes here */ + iov.iov_base = &cfi_reg; + iov.iov_len = sizeof(cfi_reg); + + while (ptrace_test_num < total_ptrace_tests) { + memset(&cfi_reg, 0, sizeof(cfi_reg)); + waitpid(pid, &status, 0); + if (WIFSTOPPED(status)) { + errno = 0; + ret = ptrace(PTRACE_GETREGSET, pid, (void *)NT_RISCV_USER_CFI, &iov); + if (ret == -1 && errno) + ksft_exit_fail_msg("%s: PTRACE_GETREGSET failed\n", __func__); + } else { + ksft_exit_fail_msg("%s: child didn't stop, failed\n", __func__); + } + + switch (ptrace_test_num) { +#define CFI_ENABLE_MASK (PTRACE_CFI_LP_EN_STATE | \ + PTRACE_CFI_SS_EN_STATE | \ + PTRACE_CFI_SS_PTR_STATE) + case 0: + if ((cfi_reg.cfi_status.cfi_state & CFI_ENABLE_MASK) != CFI_ENABLE_MASK) + ksft_exit_fail_msg("%s: ptrace_getregset failed, %llu\n", __func__, + cfi_reg.cfi_status.cfi_state); + if (!cfi_reg.shstk_ptr) + ksft_exit_fail_msg("%s: NULL shadow stack pointer, test failed\n", + __func__); + break; + case 1: + if (!(cfi_reg.cfi_status.cfi_state & PTRACE_CFI_ELP_STATE)) + ksft_exit_fail_msg("%s: elp must have been set\n", __func__); + /* clear elp state. not interested in anything else */ + cfi_reg.cfi_status.cfi_state = 0; + + ret = ptrace(PTRACE_SETREGSET, pid, (void *)NT_RISCV_USER_CFI, &iov); + if (ret == -1 && errno) + ksft_exit_fail_msg("%s: PTRACE_GETREGSET failed\n", __func__); + break; + default: + ksft_exit_fail_msg("%s: unreachable switch case\n", __func__); + break; + } + ptrace(PTRACE_CONT, pid, NULL, NULL); + ptrace_test_num++; + } + + waitpid(pid, &status, 0); + if (WEXITSTATUS(status) != 11) + ksft_print_msg("%s, bad return code from child\n", __func__); + + ksft_print_msg("%s, ptrace test succeeded\n", __func__); + return true; +} + +int main(int argc, char *argv[]) +{ + int ret = 0; + unsigned long lpad_status = 0, ss_status = 0; + + ksft_print_header(); + + ksft_print_msg("Starting risc-v tests\n"); + + /* + * Landing pad test. Not a lot of kernel changes to support landing + * pads for user mode except lighting up a bit in senvcfg via a prctl. + * Enable landing pad support throughout the execution of the test binary. + */ + ret = my_syscall5(__NR_prctl, PR_GET_INDIR_BR_LP_STATUS, &lpad_status, 0, 0, 0); + if (ret) + ksft_exit_fail_msg("Get landing pad status failed with %d\n", ret); + + if (!(lpad_status & PR_INDIR_BR_LP_ENABLE)) + ksft_exit_fail_msg("Landing pad is not enabled, should be enabled via glibc\n"); + + ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &ss_status, 0, 0, 0); + if (ret) + ksft_exit_fail_msg("Get shadow stack failed with %d\n", ret); + + if (!(ss_status & PR_SHADOW_STACK_ENABLE)) + ksft_exit_fail_msg("Shadow stack is not enabled, should be enabled via glibc\n"); + + if (!register_signal_handler()) + ksft_exit_fail_msg("Registering signal handler for SIGSEGV failed\n"); + + ksft_print_msg("Landing pad and shadow stack are enabled for binary\n"); + cfi_ptrace_test(); + + execute_shadow_stack_tests(); + + return 0; +} + +#pragma GCC pop_options diff --git a/tools/testing/selftests/riscv/cfi/shadowstack.c b/tools/testing/selftests/riscv/cfi/shadowstack.c new file mode 100644 index 000000000000..f8eed8260a12 --- /dev/null +++ b/tools/testing/selftests/riscv/cfi/shadowstack.c @@ -0,0 +1,385 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "../../kselftest.h" +#include +#include +#include +#include +#include +#include "shadowstack.h" +#include "cfi_rv_test.h" + +static struct shadow_stack_tests shstk_tests[] = { + { "shstk fork test\n", shadow_stack_fork_test }, + { "map shadow stack syscall\n", shadow_stack_map_test }, + { "shadow stack gup tests\n", shadow_stack_gup_tests }, + { "shadow stack signal tests\n", shadow_stack_signal_test}, + { "memory protections of shadow stack memory\n", shadow_stack_protection_test } +}; + +#define RISCV_SHADOW_STACK_TESTS ARRAY_SIZE(shstk_tests) + +/* do not optimize shadow stack related test functions */ +#pragma GCC push_options +#pragma GCC optimize("O0") + +void zar(void) +{ + unsigned long ssp = 0; + + ssp = csr_read(CSR_SSP); + ksft_print_msg("Spewing out shadow stack ptr: %lx\n" + " This is to ensure shadow stack is indeed enabled and working\n", + ssp); +} + +void bar(void) +{ + zar(); +} + +void foo(void) +{ + bar(); +} + +void zar_child(void) +{ + unsigned long ssp = 0; + + ssp = csr_read(CSR_SSP); + ksft_print_msg("Spewing out shadow stack ptr: %lx\n" + " This is to ensure shadow stack is indeed enabled and working\n", + ssp); +} + +void bar_child(void) +{ + zar_child(); +} + +void foo_child(void) +{ + bar_child(); +} + +typedef void (call_func_ptr)(void); +/* + * call couple of functions to test push/pop. + */ +int shadow_stack_call_tests(call_func_ptr fn_ptr, bool parent) +{ + ksft_print_msg("dummy calls for sspush and sspopchk in context of %s\n", + parent ? "parent" : "child"); + + (fn_ptr)(); + + return 0; +} + +/* forks a thread, and ensure shadow stacks fork out */ +bool shadow_stack_fork_test(unsigned long test_num, void *ctx) +{ + int pid = 0, child_status = 0, parent_pid = 0, ret = 0; + unsigned long ss_status = 0; + + ksft_print_msg("Exercising shadow stack fork test\n"); + + ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &ss_status, 0, 0, 0); + if (ret) { + ksft_exit_skip("Shadow stack get status prctl failed with errorcode %d\n", ret); + return false; + } + + if (!(ss_status & PR_SHADOW_STACK_ENABLE)) + ksft_exit_skip("Shadow stack is not enabled, should be enabled via glibc\n"); + + parent_pid = getpid(); + pid = fork(); + + if (pid) { + ksft_print_msg("Parent pid %d and child pid %d\n", parent_pid, pid); + shadow_stack_call_tests(&foo, true); + } else { + shadow_stack_call_tests(&foo_child, false); + } + + if (pid) { + ksft_print_msg("Waiting on child to finish\n"); + wait(&child_status); + } else { + /* exit child gracefully */ + exit(0); + } + + if (pid && WIFSIGNALED(child_status)) { + ksft_print_msg("Child faulted, fork test failed\n"); + return false; + } + + return true; +} + +/* exercise 'map_shadow_stack', pivot to it and call some functions to ensure it works */ +#define SHADOW_STACK_ALLOC_SIZE 4096 +bool shadow_stack_map_test(unsigned long test_num, void *ctx) +{ + unsigned long shdw_addr; + int ret = 0; + + ksft_print_msg("Exercising shadow stack map test\n"); + + shdw_addr = my_syscall3(__NR_map_shadow_stack, NULL, SHADOW_STACK_ALLOC_SIZE, 0); + + if (((long)shdw_addr) <= 0) { + ksft_print_msg("map_shadow_stack failed with error code %d\n", + (int)shdw_addr); + return false; + } + + ret = munmap((void *)shdw_addr, SHADOW_STACK_ALLOC_SIZE); + + if (ret) { + ksft_print_msg("munmap failed with error code %d\n", ret); + return false; + } + + return true; +} + +/* + * shadow stack protection tests. map a shadow stack and + * validate all memory protections work on it + */ +bool shadow_stack_protection_test(unsigned long test_num, void *ctx) +{ + unsigned long shdw_addr; + unsigned long *write_addr = NULL; + int ret = 0, pid = 0, child_status = 0; + + ksft_print_msg("Exercising shadow stack protection test (WPT)\n"); + + shdw_addr = my_syscall3(__NR_map_shadow_stack, NULL, SHADOW_STACK_ALLOC_SIZE, 0); + + if (((long)shdw_addr) <= 0) { + ksft_print_msg("map_shadow_stack failed with error code %d\n", + (int)shdw_addr); + return false; + } + + write_addr = (unsigned long *)shdw_addr; + pid = fork(); + + /* no child was created, return false */ + if (pid == -1) + return false; + + /* + * try to perform a store from child on shadow stack memory + * it should result in SIGSEGV + */ + if (!pid) { + /* below write must lead to SIGSEGV */ + *write_addr = 0xdeadbeef; + } else { + wait(&child_status); + } + + /* test fail, if 0xdeadbeef present on shadow stack address */ + if (*write_addr == 0xdeadbeef) { + ksft_print_msg("Shadow stack WPT failed\n"); + return false; + } + + /* if child reached here, then fail */ + if (!pid) { + ksft_print_msg("Shadow stack WPT failed: child reached unreachable state\n"); + return false; + } + + /* if child exited via signal handler but not for write on ss */ + if (WIFEXITED(child_status) && + WEXITSTATUS(child_status) != CHILD_EXIT_CODE_SSWRITE) { + ksft_print_msg("Shadow stack WPT failed: child wasn't signaled for write\n"); + return false; + } + + ret = munmap(write_addr, SHADOW_STACK_ALLOC_SIZE); + if (ret) { + ksft_print_msg("Shadow stack WPT failed: munmap failed, error code %d\n", + ret); + return false; + } + + return true; +} + +#define SS_MAGIC_WRITE_VAL 0xbeefdead + +int gup_tests(int mem_fd, unsigned long *shdw_addr) +{ + unsigned long val = 0; + + lseek(mem_fd, (unsigned long)shdw_addr, SEEK_SET); + if (read(mem_fd, &val, sizeof(val)) < 0) { + ksft_print_msg("Reading shadow stack mem via gup failed\n"); + return 1; + } + + val = SS_MAGIC_WRITE_VAL; + lseek(mem_fd, (unsigned long)shdw_addr, SEEK_SET); + if (write(mem_fd, &val, sizeof(val)) < 0) { + ksft_print_msg("Writing shadow stack mem via gup failed\n"); + return 1; + } + + if (*shdw_addr != SS_MAGIC_WRITE_VAL) { + ksft_print_msg("GUP write to shadow stack memory failed\n"); + return 1; + } + + return 0; +} + +bool shadow_stack_gup_tests(unsigned long test_num, void *ctx) +{ + unsigned long shdw_addr = 0; + unsigned long *write_addr = NULL; + int fd = 0; + bool ret = false; + + ksft_print_msg("Exercising shadow stack gup tests\n"); + shdw_addr = my_syscall3(__NR_map_shadow_stack, NULL, SHADOW_STACK_ALLOC_SIZE, 0); + + if (((long)shdw_addr) <= 0) { + ksft_print_msg("map_shadow_stack failed with error code %d\n", (int)shdw_addr); + return false; + } + + write_addr = (unsigned long *)shdw_addr; + + fd = open("/proc/self/mem", O_RDWR); + if (fd == -1) + return false; + + if (gup_tests(fd, write_addr)) { + ksft_print_msg("gup tests failed\n"); + goto out; + } + + ret = true; +out: + if (shdw_addr && munmap(write_addr, SHADOW_STACK_ALLOC_SIZE)) { + ksft_print_msg("munmap failed with error code %d\n", ret); + ret = false; + } + + return ret; +} + +volatile bool break_loop; + +void sigusr1_handler(int signo) +{ + break_loop = true; +} + +bool sigusr1_signal_test(void) +{ + struct sigaction sa = {}; + + sa.sa_handler = sigusr1_handler; + sa.sa_flags = 0; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGUSR1, &sa, NULL)) { + ksft_print_msg("Registering signal handler for SIGUSR1 failed\n"); + return false; + } + + return true; +} + +/* + * shadow stack signal test. shadow stack must be enabled. + * register a signal, fork another thread which is waiting + * on signal. Send a signal from parent to child, verify + * that signal was received by child. If not test fails + */ +bool shadow_stack_signal_test(unsigned long test_num, void *ctx) +{ + int pid = 0, child_status = 0, ret = 0; + unsigned long ss_status = 0; + + ksft_print_msg("Exercising shadow stack signal test\n"); + + ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &ss_status, 0, 0, 0); + if (ret) { + ksft_print_msg("Shadow stack get status prctl failed with errorcode %d\n", ret); + return false; + } + + if (!(ss_status & PR_SHADOW_STACK_ENABLE)) + ksft_print_msg("Shadow stack is not enabled, should be enabled via glibc\n"); + + /* this should be caught by signal handler and do an exit */ + if (!sigusr1_signal_test()) { + ksft_print_msg("Registering sigusr1 handler failed\n"); + exit(-1); + } + + pid = fork(); + + if (pid == -1) { + ksft_print_msg("Signal test: fork failed\n"); + goto out; + } + + if (pid == 0) { + while (!break_loop) + sleep(1); + + exit(11); + /* child shouldn't go beyond here */ + } + + /* send SIGUSR1 to child */ + kill(pid, SIGUSR1); + wait(&child_status); + +out: + + return (WIFEXITED(child_status) && + WEXITSTATUS(child_status) == 11); +} + +int execute_shadow_stack_tests(void) +{ + int ret = 0; + unsigned long test_count = 0; + unsigned long shstk_status = 0; + bool test_pass = false; + + ksft_print_msg("Executing RISC-V shadow stack self tests\n"); + ksft_set_plan(RISCV_SHADOW_STACK_TESTS); + + ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &shstk_status, 0, 0, 0); + + if (ret != 0) + ksft_exit_fail_msg("Get shadow stack status failed with %d\n", ret); + + /* + * If we are here that means get shadow stack status succeeded and + * thus shadow stack support is baked in the kernel. + */ + while (test_count < RISCV_SHADOW_STACK_TESTS) { + test_pass = (*shstk_tests[test_count].t_func)(test_count, NULL); + ksft_test_result(test_pass, shstk_tests[test_count].name); + test_count++; + } + + ksft_finished(); + + return 0; +} + +#pragma GCC pop_options diff --git a/tools/testing/selftests/riscv/cfi/shadowstack.h b/tools/testing/selftests/riscv/cfi/shadowstack.h new file mode 100644 index 000000000000..943a3685905f --- /dev/null +++ b/tools/testing/selftests/riscv/cfi/shadowstack.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef SELFTEST_SHADOWSTACK_TEST_H +#define SELFTEST_SHADOWSTACK_TEST_H +#include +#include + +/* + * A CFI test returns true for success or false for fail. + * Takes a test number to index into array, and a void pointer. + */ +typedef bool (*shstk_test_func)(unsigned long test_num, void *); + +struct shadow_stack_tests { + char *name; + shstk_test_func t_func; +}; + +bool shadow_stack_fork_test(unsigned long test_num, void *ctx); +bool shadow_stack_map_test(unsigned long test_num, void *ctx); +bool shadow_stack_protection_test(unsigned long test_num, void *ctx); +bool shadow_stack_gup_tests(unsigned long test_num, void *ctx); +bool shadow_stack_signal_test(unsigned long test_num, void *ctx); + +int execute_shadow_stack_tests(void); + +#endif -- cgit v1.2.3 From c17b9046faf7d1f3b8bb992e4d53da873dc478fc Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Sat, 24 Jan 2026 23:50:12 +0900 Subject: selftests: pci_endpoint: Add BAR subrange mapping test case Add BAR_SUBRANGE_TEST to the pci_endpoint kselftest suite. The test uses the PCITEST_BAR_SUBRANGE ioctl and will skip when the chosen BAR is disabled (-ENODATA), when the endpoint/controller does not support subrange mapping (-EOPNOTSUPP), or when the BAR is reserved for the test register space (-EBUSY). Signed-off-by: Koichiro Den Signed-off-by: Manivannan Sadhasivam Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/20260124145012.2794108-9-den@valinux.co.jp --- .../testing/selftests/pci_endpoint/pci_endpoint_test.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/pci_endpoint/pci_endpoint_test.c b/tools/testing/selftests/pci_endpoint/pci_endpoint_test.c index 23aac6f97061..eecb776c33af 100644 --- a/tools/testing/selftests/pci_endpoint/pci_endpoint_test.c +++ b/tools/testing/selftests/pci_endpoint/pci_endpoint_test.c @@ -70,6 +70,23 @@ TEST_F(pci_ep_bar, BAR_TEST) EXPECT_FALSE(ret) TH_LOG("Test failed for BAR%d", variant->barno); } +TEST_F(pci_ep_bar, BAR_SUBRANGE_TEST) +{ + int ret; + + pci_ep_ioctl(PCITEST_SET_IRQTYPE, PCITEST_IRQ_TYPE_AUTO); + ASSERT_EQ(0, ret) TH_LOG("Can't set AUTO IRQ type"); + + pci_ep_ioctl(PCITEST_BAR_SUBRANGE, variant->barno); + if (ret == -ENODATA) + SKIP(return, "BAR is disabled"); + if (ret == -EBUSY) + SKIP(return, "BAR is test register space"); + if (ret == -EOPNOTSUPP) + SKIP(return, "Subrange map is not supported"); + EXPECT_FALSE(ret) TH_LOG("Test failed for BAR%d", variant->barno); +} + FIXTURE(pci_ep_basic) { int fd; -- cgit v1.2.3 From b3827c91cc9979fe04d99e016fb9c5f6260f29a0 Mon Sep 17 00:00:00 2001 From: Andre Carvalho Date: Tue, 27 Jan 2026 19:39:20 +0000 Subject: netconsole: selftests: Move netconsole selftests to separate target This patch moves netconsole selftests from drivers/net to its own target in drivers/net/netconsole. This change helps saving some resources from CI since tests in drivers/net automatically run against real hardware which are not used by netconsole tests as they rely solely on netdevsim. lib_netcons.sh is kept under drivers/net/lib since it is also used by bonding selftests. Finally, drivers/net config remains unchanged as netpoll_basic.py requires netconsole (and does leverage real HW testing). Reviewed-by: Breno Leitao Signed-off-by: Andre Carvalho Link: https://patch.msgid.link/20260127-netcons-selftest-target-v2-1-f509ab65b3bc@gmail.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 +- tools/testing/selftests/Makefile | 1 + tools/testing/selftests/drivers/net/Makefile | 7 - .../testing/selftests/drivers/net/netcons_basic.sh | 74 ------ .../selftests/drivers/net/netcons_cmdline.sh | 65 ----- .../drivers/net/netcons_fragmented_msg.sh | 122 --------- .../selftests/drivers/net/netcons_overflow.sh | 67 ----- .../selftests/drivers/net/netcons_resume.sh | 124 ---------- .../selftests/drivers/net/netcons_sysdata.sh | 272 --------------------- .../selftests/drivers/net/netcons_torture.sh | 130 ---------- .../selftests/drivers/net/netconsole/Makefile | 19 ++ .../selftests/drivers/net/netconsole/config | 6 + .../drivers/net/netconsole/netcons_basic.sh | 74 ++++++ .../drivers/net/netconsole/netcons_cmdline.sh | 65 +++++ .../net/netconsole/netcons_fragmented_msg.sh | 122 +++++++++ .../drivers/net/netconsole/netcons_overflow.sh | 67 +++++ .../drivers/net/netconsole/netcons_resume.sh | 124 ++++++++++ .../drivers/net/netconsole/netcons_sysdata.sh | 272 +++++++++++++++++++++ .../drivers/net/netconsole/netcons_torture.sh | 130 ++++++++++ 19 files changed, 881 insertions(+), 862 deletions(-) delete mode 100755 tools/testing/selftests/drivers/net/netcons_basic.sh delete mode 100755 tools/testing/selftests/drivers/net/netcons_cmdline.sh delete mode 100755 tools/testing/selftests/drivers/net/netcons_fragmented_msg.sh delete mode 100755 tools/testing/selftests/drivers/net/netcons_overflow.sh delete mode 100755 tools/testing/selftests/drivers/net/netcons_resume.sh delete mode 100755 tools/testing/selftests/drivers/net/netcons_sysdata.sh delete mode 100755 tools/testing/selftests/drivers/net/netcons_torture.sh create mode 100644 tools/testing/selftests/drivers/net/netconsole/Makefile create mode 100644 tools/testing/selftests/drivers/net/netconsole/config create mode 100755 tools/testing/selftests/drivers/net/netconsole/netcons_basic.sh create mode 100755 tools/testing/selftests/drivers/net/netconsole/netcons_cmdline.sh create mode 100755 tools/testing/selftests/drivers/net/netconsole/netcons_fragmented_msg.sh create mode 100755 tools/testing/selftests/drivers/net/netconsole/netcons_overflow.sh create mode 100755 tools/testing/selftests/drivers/net/netconsole/netcons_resume.sh create mode 100755 tools/testing/selftests/drivers/net/netconsole/netcons_sysdata.sh create mode 100755 tools/testing/selftests/drivers/net/netconsole/netcons_torture.sh (limited to 'tools/testing') diff --git a/MAINTAINERS b/MAINTAINERS index ff9e204c5f33..0caa8aee5840 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18021,7 +18021,7 @@ S: Maintained F: Documentation/networking/netconsole.rst F: drivers/net/netconsole.c F: tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh -F: tools/testing/selftests/drivers/net/netcons\* +F: tools/testing/selftests/drivers/net/netconsole/ NETDEVSIM M: Jakub Kicinski diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 56e44a98d6a5..450f13ba4cca 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -22,6 +22,7 @@ TARGETS += drivers/ntsync TARGETS += drivers/s390x/uvdevice TARGETS += drivers/net TARGETS += drivers/net/bonding +TARGETS += drivers/net/netconsole TARGETS += drivers/net/team TARGETS += drivers/net/virtio_net TARGETS += drivers/platform/x86/intel/ifs diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index 3eba569b3366..8154d6d429d3 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -15,13 +15,6 @@ TEST_PROGS := \ hds.py \ napi_id.py \ napi_threaded.py \ - netcons_basic.sh \ - netcons_cmdline.sh \ - netcons_fragmented_msg.sh \ - netcons_overflow.sh \ - netcons_resume.sh \ - netcons_sysdata.sh \ - netcons_torture.sh \ netpoll_basic.py \ ping.py \ psp.py \ diff --git a/tools/testing/selftests/drivers/net/netcons_basic.sh b/tools/testing/selftests/drivers/net/netcons_basic.sh deleted file mode 100755 index 2022f3061738..000000000000 --- a/tools/testing/selftests/drivers/net/netcons_basic.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env bash -# SPDX-License-Identifier: GPL-2.0 - -# This test creates two netdevsim virtual interfaces, assigns one of them (the -# "destination interface") to a new namespace, and assigns IP addresses to both -# interfaces. -# -# It listens on the destination interface using socat and configures a dynamic -# target on netconsole, pointing to the destination IP address. -# -# Finally, it checks whether the message was received properly on the -# destination interface. Note that this test may pollute the kernel log buffer -# (dmesg) and relies on dynamic configuration and namespaces being configured. -# -# Author: Breno Leitao - -set -euo pipefail - -SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") - -source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh - -modprobe netdevsim 2> /dev/null || true -modprobe netconsole 2> /dev/null || true - -# The content of kmsg will be save to the following file -OUTPUT_FILE="/tmp/${TARGET}" - -# Check for basic system dependency and exit if not found -check_for_dependencies -# Remove the namespace, interfaces and netconsole target on exit -trap cleanup EXIT - -# Run the test twice, with different format modes -for FORMAT in "basic" "extended" -do - for IP_VERSION in "ipv6" "ipv4" - do - echo "Running with target mode: ${FORMAT} (${IP_VERSION})" - # Set current loglevel to KERN_INFO(6), and default to - # KERN_NOTICE(5) - echo "6 5" > /proc/sys/kernel/printk - # Create one namespace and two interfaces - set_network "${IP_VERSION}" - # Create a dynamic target for netconsole - create_dynamic_target "${FORMAT}" - # Only set userdata for extended format - if [ "$FORMAT" == "extended" ] - then - # Set userdata "key" with the "value" value - set_user_data - fi - # Listed for netconsole port inside the namespace and - # destination interface - listen_port_and_save_to "${OUTPUT_FILE}" "${IP_VERSION}" & - # Wait for socat to start and listen to the port. - wait_for_port "${NAMESPACE}" "${PORT}" "${IP_VERSION}" - # Send the message - echo "${MSG}: ${TARGET}" > /dev/kmsg - # Wait until socat saves the file to disk - busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" - - # Make sure the message was received in the dst part - # and exit - validate_result "${OUTPUT_FILE}" "${FORMAT}" - # kill socat in case it is still running - pkill_socat - cleanup - echo "${FORMAT} : ${IP_VERSION} : Test passed" >&2 - done -done - -trap - EXIT -exit "${ksft_pass}" diff --git a/tools/testing/selftests/drivers/net/netcons_cmdline.sh b/tools/testing/selftests/drivers/net/netcons_cmdline.sh deleted file mode 100755 index d1d23dc67f99..000000000000 --- a/tools/testing/selftests/drivers/net/netcons_cmdline.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env bash -# SPDX-License-Identifier: GPL-2.0 - -# This is a selftest to test cmdline arguments on netconsole. -# It exercises loading of netconsole from cmdline instead of the dynamic -# reconfiguration. This includes parsing the long netconsole= line and all the -# flow through init_netconsole(). -# -# Author: Breno Leitao - -set -euo pipefail - -SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") - -source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh - -check_netconsole_module - -modprobe netdevsim 2> /dev/null || true -rmmod netconsole 2> /dev/null || true - -# Check for basic system dependency and exit if not found -# check_for_dependencies -# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) -echo "6 5" > /proc/sys/kernel/printk -# Remove the namespace and network interfaces -trap do_cleanup EXIT -# Create one namespace and two interfaces -set_network - -# Run the test twice, with different cmdline parameters -for BINDMODE in "ifname" "mac" -do - echo "Running with bind mode: ${BINDMODE}" >&2 - # Create the command line for netconsole, with the configuration from - # the function above - CMDLINE=$(create_cmdline_str "${BINDMODE}") - - # The content of kmsg will be save to the following file - OUTPUT_FILE="/tmp/${TARGET}-${BINDMODE}" - - # Load the module, with the cmdline set - modprobe netconsole "${CMDLINE}" - - # Listed for netconsole port inside the namespace and destination - # interface - listen_port_and_save_to "${OUTPUT_FILE}" & - # Wait for socat to start and listen to the port. - wait_local_port_listen "${NAMESPACE}" "${PORT}" udp - # Send the message - echo "${MSG}: ${TARGET}" > /dev/kmsg - # Wait until socat saves the file to disk - busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" - # Make sure the message was received in the dst part - # and exit - validate_msg "${OUTPUT_FILE}" - - # kill socat in case it is still running - pkill_socat - # Unload the module - rmmod netconsole - echo "${BINDMODE} : Test passed" >&2 -done - -exit "${ksft_pass}" diff --git a/tools/testing/selftests/drivers/net/netcons_fragmented_msg.sh b/tools/testing/selftests/drivers/net/netcons_fragmented_msg.sh deleted file mode 100755 index 4a71e01a230c..000000000000 --- a/tools/testing/selftests/drivers/net/netcons_fragmented_msg.sh +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/env bash -# SPDX-License-Identifier: GPL-2.0 - -# Test netconsole's message fragmentation functionality. -# -# When a message exceeds the maximum packet size, netconsole splits it into -# multiple fragments for transmission. This test verifies: -# - Correct fragmentation of large messages -# - Proper reassembly of fragments at the receiver -# - Preservation of userdata across fragments -# - Behavior with and without kernel release version appending -# -# Author: Breno Leitao - -set -euo pipefail - -SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") - -source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh - -modprobe netdevsim 2> /dev/null || true -modprobe netconsole 2> /dev/null || true - -# The content of kmsg will be save to the following file -OUTPUT_FILE="/tmp/${TARGET}" - -# set userdata to a long value. In this case, it is "1-2-3-4...50-" -USERDATA_VALUE=$(printf -- '%.2s-' {1..60}) - -# Convert the header string in a regexp, so, we can remove -# the second header as well. -# A header looks like "13,468,514729715,-,ncfrag=0/1135;". If -# release is appended, you might find something like:L -# "6.13.0-04048-g4f561a87745a,13,468,514729715,-,ncfrag=0/1135;" -function header_to_regex() { - # header is everything before ; - local HEADER="${1}" - REGEX=$(echo "${HEADER}" | cut -d'=' -f1) - echo "${REGEX}=[0-9]*\/[0-9]*;" -} - -# We have two headers in the message. Remove both to get the full message, -# and extract the full message. -function extract_msg() { - local MSGFILE="${1}" - # Extract the header, which is the very first thing that arrives in the - # first list. - HEADER=$(sed -n '1p' "${MSGFILE}" | cut -d';' -f1) - HEADER_REGEX=$(header_to_regex "${HEADER}") - - # Remove the two headers from the received message - # This will return the message without any header, similarly to what - # was sent. - sed "s/""${HEADER_REGEX}""//g" "${MSGFILE}" -} - -# Validate the message, which has two messages glued together. -# unwrap them to make sure all the characters were transmitted. -# File will look like the following: -# 13,468,514729715,-,ncfrag=0/1135; -# key=-13,468,514729715,-,ncfrag=967/1135; -function validate_fragmented_result() { - # Discard the netconsole headers, and assemble the full message - RCVMSG=$(extract_msg "${1}") - - # check for the main message - if ! echo "${RCVMSG}" | grep -q "${MSG}"; then - echo "Message body doesn't match." >&2 - echo "msg received=" "${RCVMSG}" >&2 - exit "${ksft_fail}" - fi - - # check userdata - if ! echo "${RCVMSG}" | grep -q "${USERDATA_VALUE}"; then - echo "message userdata doesn't match" >&2 - echo "msg received=" "${RCVMSG}" >&2 - exit "${ksft_fail}" - fi - # test passed. hooray -} - -# Check for basic system dependency and exit if not found -check_for_dependencies -# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) -echo "6 5" > /proc/sys/kernel/printk -# Remove the namespace, interfaces and netconsole target on exit -trap cleanup EXIT -# Create one namespace and two interfaces -set_network -# Create a dynamic target for netconsole -create_dynamic_target -# Set userdata "key" with the "value" value -set_user_data - - -# TEST 1: Send message and userdata. They will fragment -# ======= -MSG=$(printf -- 'MSG%.3s=' {1..150}) - -# Listen for netconsole port inside the namespace and destination interface -listen_port_and_save_to "${OUTPUT_FILE}" & -# Wait for socat to start and listen to the port. -wait_local_port_listen "${NAMESPACE}" "${PORT}" udp -# Send the message -echo "${MSG}: ${TARGET}" > /dev/kmsg -# Wait until socat saves the file to disk -busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" -# Check if the message was not corrupted -validate_fragmented_result "${OUTPUT_FILE}" - -# TEST 2: Test with smaller message, and without release appended -# ======= -MSG=$(printf -- 'FOOBAR%.3s=' {1..100}) -# Let's disable release and test again. -disable_release_append - -listen_port_and_save_to "${OUTPUT_FILE}" & -wait_local_port_listen "${NAMESPACE}" "${PORT}" udp -echo "${MSG}: ${TARGET}" > /dev/kmsg -busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" -validate_fragmented_result "${OUTPUT_FILE}" -exit "${ksft_pass}" diff --git a/tools/testing/selftests/drivers/net/netcons_overflow.sh b/tools/testing/selftests/drivers/net/netcons_overflow.sh deleted file mode 100755 index 06089643b771..000000000000 --- a/tools/testing/selftests/drivers/net/netcons_overflow.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env bash -# SPDX-License-Identifier: GPL-2.0 - -# This test verifies that users can successfully create up to -# MAX_USERDATA_ITEMS userdata entries without encountering any failures. -# -# Additionally, it tests for expected failure when attempting to exceed this -# maximum limit. -# -# Author: Breno Leitao - -set -euo pipefail - -SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") - -source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh -# This is coming from netconsole code. Check for it in drivers/net/netconsole.c -MAX_USERDATA_ITEMS=256 - -# Function to create userdata entries -function create_userdata_max_entries() { - # All these keys should be created without any error - for i in $(seq $MAX_USERDATA_ITEMS) - do - # USERDATA_KEY is used by set_user_data - USERDATA_KEY="key"${i} - set_user_data - done -} - -# Function to verify the entry limit -function verify_entry_limit() { - # Allowing the test to fail without exiting, since the next command - # will fail - set +e - mkdir "${NETCONS_PATH}/userdata/key_that_will_fail" 2> /dev/null - ret="$?" - set -e - if [ "$ret" -eq 0 ]; - then - echo "Adding more than ${MAX_USERDATA_ITEMS} entries in userdata should fail, but it didn't" >&2 - ls "${NETCONS_PATH}/userdata/" >&2 - exit "${ksft_fail}" - fi -} - -# ========== # -# Start here # -# ========== # - -modprobe netdevsim 2> /dev/null || true -modprobe netconsole 2> /dev/null || true - -# Check for basic system dependency and exit if not found -check_for_dependencies - -# Remove the namespace, interfaces and netconsole target on exit -trap cleanup EXIT -# Create one namespace and two interfaces -set_network -# Create a dynamic target for netconsole -create_dynamic_target -# populate the maximum number of supported keys in userdata -create_userdata_max_entries -# Verify an additional entry is not allowed -verify_entry_limit -exit "${ksft_pass}" diff --git a/tools/testing/selftests/drivers/net/netcons_resume.sh b/tools/testing/selftests/drivers/net/netcons_resume.sh deleted file mode 100755 index fc5e5e3ad3d4..000000000000 --- a/tools/testing/selftests/drivers/net/netcons_resume.sh +++ /dev/null @@ -1,124 +0,0 @@ -#!/usr/bin/env bash -# SPDX-License-Identifier: GPL-2.0 - -# This test validates that netconsole is able to resume a target that was -# deactivated when its interface was removed when the interface is brought -# back up. -# -# The test configures a netconsole target and then removes netdevsim module to -# cause the interface to disappear. Targets are configured via cmdline to ensure -# targets bound by interface name and mac address can be resumed. -# The test verifies that the target moved to disabled state before adding -# netdevsim and the interface back. -# -# Finally, the test verifies that the target is re-enabled automatically and -# the message is received on the destination interface. -# -# Author: Andre Carvalho - -set -euo pipefail - -SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") - -source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh - -SAVED_SRCMAC="" # to be populated later -SAVED_DSTMAC="" # to be populated later - -modprobe netdevsim 2> /dev/null || true -rmmod netconsole 2> /dev/null || true - -check_netconsole_module - -function cleanup() { - cleanup_netcons "${NETCONS_CONFIGFS}/cmdline0" - do_cleanup - rmmod netconsole -} - -function trigger_reactivation() { - # Add back low level module - modprobe netdevsim - # Recreate namespace and two interfaces - set_network - # Restore MACs - ip netns exec "${NAMESPACE}" ip link set "${DSTIF}" \ - address "${SAVED_DSTMAC}" - if [ "${BINDMODE}" == "mac" ]; then - ip link set dev "${SRCIF}" down - ip link set dev "${SRCIF}" address "${SAVED_SRCMAC}" - # Rename device in order to trigger target resume, as initial - # when device was recreated it didn't have correct mac address. - ip link set dev "${SRCIF}" name "${TARGET}" - fi -} - -function trigger_deactivation() { - # Start by storing mac addresses so we can be restored in reactivate - SAVED_DSTMAC=$(ip netns exec "${NAMESPACE}" \ - cat /sys/class/net/"$DSTIF"/address) - SAVED_SRCMAC=$(mac_get "${SRCIF}") - # Remove low level module - rmmod netdevsim -} - -trap cleanup EXIT - -# Run the test twice, with different cmdline parameters -for BINDMODE in "ifname" "mac" -do - echo "Running with bind mode: ${BINDMODE}" >&2 - # Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) - echo "6 5" > /proc/sys/kernel/printk - - # Create one namespace and two interfaces - set_network - - # Create the command line for netconsole, with the configuration from - # the function above - CMDLINE=$(create_cmdline_str "${BINDMODE}") - - # The content of kmsg will be save to the following file - OUTPUT_FILE="/tmp/${TARGET}-${BINDMODE}" - - # Load the module, with the cmdline set - modprobe netconsole "${CMDLINE}" - # Expose cmdline target in configfs - mkdir "${NETCONS_CONFIGFS}/cmdline0" - - # Target should be enabled - wait_target_state "cmdline0" "enabled" - - # Trigger deactivation by unloading netdevsim module. Target should be - # disabled. - trigger_deactivation - wait_target_state "cmdline0" "disabled" - - # Trigger reactivation by loading netdevsim, recreating the network and - # restoring mac addresses. Target should be re-enabled. - trigger_reactivation - wait_target_state "cmdline0" "enabled" - - # Listen for netconsole port inside the namespace and destination - # interface - listen_port_and_save_to "${OUTPUT_FILE}" & - # Wait for socat to start and listen to the port. - wait_local_port_listen "${NAMESPACE}" "${PORT}" udp - # Send the message - echo "${MSG}: ${TARGET}" > /dev/kmsg - # Wait until socat saves the file to disk - busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" - # Make sure the message was received in the dst part - # and exit - validate_msg "${OUTPUT_FILE}" - - # kill socat in case it is still running - pkill_socat - # Cleanup & unload the module - cleanup - - echo "${BINDMODE} : Test passed" >&2 -done - -trap - EXIT -exit "${EXIT_STATUS}" diff --git a/tools/testing/selftests/drivers/net/netcons_sysdata.sh b/tools/testing/selftests/drivers/net/netcons_sysdata.sh deleted file mode 100755 index baf69031089e..000000000000 --- a/tools/testing/selftests/drivers/net/netcons_sysdata.sh +++ /dev/null @@ -1,272 +0,0 @@ -#!/usr/bin/env bash -# SPDX-License-Identifier: GPL-2.0 - -# A test that makes sure that sysdata runtime CPU data is properly set -# when a message is sent. -# -# There are 3 different tests, every time sent using a random CPU. -# - Test #1 -# * Only enable cpu_nr sysdata feature. -# - Test #2 -# * Keep cpu_nr sysdata feature enable and enable userdata. -# - Test #3 -# * keep userdata enabled, and disable sysdata cpu_nr feature. -# -# Author: Breno Leitao - -set -euo pipefail - -SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") - -source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh - -# Enable the sysdata cpu_nr feature -function set_cpu_nr() { - if [[ ! -f "${NETCONS_PATH}/userdata/cpu_nr_enabled" ]] - then - echo "Populate CPU configfs path not available in ${NETCONS_PATH}/userdata/cpu_nr_enabled" >&2 - exit "${ksft_skip}" - fi - - echo 1 > "${NETCONS_PATH}/userdata/cpu_nr_enabled" -} - -# Enable the taskname to be appended to sysdata -function set_taskname() { - if [[ ! -f "${NETCONS_PATH}/userdata/taskname_enabled" ]] - then - echo "Not able to enable taskname sysdata append. Configfs not available in ${NETCONS_PATH}/userdata/taskname_enabled" >&2 - exit "${ksft_skip}" - fi - - echo 1 > "${NETCONS_PATH}/userdata/taskname_enabled" -} - -# Enable the release to be appended to sysdata -function set_release() { - if [[ ! -f "${NETCONS_PATH}/userdata/release_enabled" ]] - then - echo "Not able to enable release sysdata append. Configfs not available in ${NETCONS_PATH}/userdata/release_enabled" >&2 - exit "${ksft_skip}" - fi - - echo 1 > "${NETCONS_PATH}/userdata/release_enabled" -} - -# Enable the msgid to be appended to sysdata -function set_msgid() { - if [[ ! -f "${NETCONS_PATH}/userdata/msgid_enabled" ]] - then - echo "Not able to enable msgid sysdata append. Configfs not available in ${NETCONS_PATH}/userdata/msgid_enabled" >&2 - exit "${ksft_skip}" - fi - - echo 1 > "${NETCONS_PATH}/userdata/msgid_enabled" -} - -# Disable the sysdata cpu_nr feature -function unset_cpu_nr() { - echo 0 > "${NETCONS_PATH}/userdata/cpu_nr_enabled" -} - -# Once called, taskname=<..> will not be appended anymore -function unset_taskname() { - echo 0 > "${NETCONS_PATH}/userdata/taskname_enabled" -} - -function unset_release() { - echo 0 > "${NETCONS_PATH}/userdata/release_enabled" -} - -function unset_msgid() { - echo 0 > "${NETCONS_PATH}/userdata/msgid_enabled" -} - -# Test if MSG contains sysdata -function validate_sysdata() { - # OUTPUT_FILE will contain something like: - # 6.11.1-0_fbk0_rc13_509_g30d75cea12f7,13,1822,115075213798,-;netconsole selftest: netcons_gtJHM - # userdatakey=userdatavalue - # cpu=X - # taskname= - # msgid= - - # Echo is what this test uses to create the message. See runtest() - # function - SENDER="echo" - - if [ ! -f "$OUTPUT_FILE" ]; then - echo "FAIL: File was not generated." >&2 - exit "${ksft_fail}" - fi - - if ! grep -q "${MSG}" "${OUTPUT_FILE}"; then - echo "FAIL: ${MSG} not found in ${OUTPUT_FILE}" >&2 - cat "${OUTPUT_FILE}" >&2 - exit "${ksft_fail}" - fi - - # Check if cpu=XX exists in the file and matches the one used - # in taskset(1) - if ! grep -q "cpu=${CPU}\+" "${OUTPUT_FILE}"; then - echo "FAIL: 'cpu=${CPU}' not found in ${OUTPUT_FILE}" >&2 - cat "${OUTPUT_FILE}" >&2 - exit "${ksft_fail}" - fi - - if ! grep -q "taskname=${SENDER}" "${OUTPUT_FILE}"; then - echo "FAIL: 'taskname=echo' not found in ${OUTPUT_FILE}" >&2 - cat "${OUTPUT_FILE}" >&2 - exit "${ksft_fail}" - fi - - if ! grep -q "msgid=[0-9]\+$" "${OUTPUT_FILE}"; then - echo "FAIL: 'msgid=' not found in ${OUTPUT_FILE}" >&2 - cat "${OUTPUT_FILE}" >&2 - exit "${ksft_fail}" - fi - - rm "${OUTPUT_FILE}" - pkill_socat -} - -function validate_release() { - RELEASE=$(uname -r) - - if [ ! -f "$OUTPUT_FILE" ]; then - echo "FAIL: File was not generated." >&2 - exit "${ksft_fail}" - fi - - if ! grep -q "release=${RELEASE}" "${OUTPUT_FILE}"; then - echo "FAIL: 'release=${RELEASE}' not found in ${OUTPUT_FILE}" >&2 - cat "${OUTPUT_FILE}" >&2 - exit "${ksft_fail}" - fi -} - -# Test if MSG content exists in OUTPUT_FILE but no `cpu=` and `taskname=` -# strings -function validate_no_sysdata() { - if [ ! -f "$OUTPUT_FILE" ]; then - echo "FAIL: File was not generated." >&2 - exit "${ksft_fail}" - fi - - if ! grep -q "${MSG}" "${OUTPUT_FILE}"; then - echo "FAIL: ${MSG} not found in ${OUTPUT_FILE}" >&2 - cat "${OUTPUT_FILE}" >&2 - exit "${ksft_fail}" - fi - - if grep -q "cpu=" "${OUTPUT_FILE}"; then - echo "FAIL: 'cpu= found in ${OUTPUT_FILE}" >&2 - cat "${OUTPUT_FILE}" >&2 - exit "${ksft_fail}" - fi - - if grep -q "taskname=" "${OUTPUT_FILE}"; then - echo "FAIL: 'taskname= found in ${OUTPUT_FILE}" >&2 - cat "${OUTPUT_FILE}" >&2 - exit "${ksft_fail}" - fi - - if grep -q "release=" "${OUTPUT_FILE}"; then - echo "FAIL: 'release= found in ${OUTPUT_FILE}" >&2 - cat "${OUTPUT_FILE}" >&2 - exit "${ksft_fail}" - fi - - if grep -q "msgid=" "${OUTPUT_FILE}"; then - echo "FAIL: 'msgid= found in ${OUTPUT_FILE}" >&2 - cat "${OUTPUT_FILE}" >&2 - exit "${ksft_fail}" - fi - - rm "${OUTPUT_FILE}" -} - -# Start socat, send the message and wait for the file to show up in the file -# system -function runtest { - # Listen for netconsole port inside the namespace and destination - # interface - listen_port_and_save_to "${OUTPUT_FILE}" & - # Wait for socat to start and listen to the port. - wait_local_port_listen "${NAMESPACE}" "${PORT}" udp - # Send the message - taskset -c "${CPU}" echo "${MSG}: ${TARGET}" > /dev/kmsg - # Wait until socat saves the file to disk - busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" -} - -# ========== # -# Start here # -# ========== # - -modprobe netdevsim 2> /dev/null || true -modprobe netconsole 2> /dev/null || true - -# Check for basic system dependency and exit if not found -check_for_dependencies -# This test also depends on taskset(1). Check for it before starting the test -check_for_taskset - -# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) -echo "6 5" > /proc/sys/kernel/printk -# Remove the namespace, interfaces and netconsole target on exit -trap cleanup EXIT -# Create one namespace and two interfaces -set_network -# Create a dynamic target for netconsole -create_dynamic_target - -#==================================================== -# TEST #1 -# Send message from a random CPU -#==================================================== -# Random CPU in the system -CPU=$((RANDOM % $(nproc))) -OUTPUT_FILE="/tmp/${TARGET}_1" -MSG="Test #1 from CPU${CPU}" -# Enable the auto population of cpu_nr -set_cpu_nr -# Enable taskname to be appended to sysdata -set_taskname -set_release -set_msgid -runtest -# Make sure the message was received in the dst part -# and exit -validate_release -validate_sysdata - -#==================================================== -# TEST #2 -# This test now adds userdata together with sysdata -# =================================================== -# Get a new random CPU -CPU=$((RANDOM % $(nproc))) -OUTPUT_FILE="/tmp/${TARGET}_2" -MSG="Test #2 from CPU${CPU}" -set_user_data -runtest -validate_release -validate_sysdata - -# =================================================== -# TEST #3 -# Unset all sysdata, fail if any userdata is set -# =================================================== -CPU=$((RANDOM % $(nproc))) -OUTPUT_FILE="/tmp/${TARGET}_3" -MSG="Test #3 from CPU${CPU}" -unset_cpu_nr -unset_taskname -unset_release -unset_msgid -runtest -# At this time, cpu= shouldn't be present in the msg -validate_no_sysdata - -exit "${ksft_pass}" diff --git a/tools/testing/selftests/drivers/net/netcons_torture.sh b/tools/testing/selftests/drivers/net/netcons_torture.sh deleted file mode 100755 index 2ce9ee3719d1..000000000000 --- a/tools/testing/selftests/drivers/net/netcons_torture.sh +++ /dev/null @@ -1,130 +0,0 @@ -#!/usr/bin/env bash -# SPDX-License-Identifier: GPL-2.0 - -# Repeatedly send kernel messages, toggles netconsole targets on and off, -# creates and deletes targets in parallel, and toggles the source interface to -# simulate stress conditions. -# -# This test aims to verify the robustness of netconsole under dynamic -# configurations and concurrent operations. -# -# The major goal is to run this test with LOCKDEP, Kmemleak and KASAN to make -# sure no issues is reported. -# -# Author: Breno Leitao - -set -euo pipefail - -SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") - -source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh - -# Number of times the main loop run -ITERATIONS=${1:-150} - -# Only test extended format -FORMAT="extended" -# And ipv6 only -IP_VERSION="ipv6" - -# Create, enable and delete some targets. -create_and_delete_random_target() { - COUNT=2 - RND_PREFIX=$(mktemp -u netcons_rnd_XXXX_) - - if [ -d "${NETCONS_CONFIGFS}/${RND_PREFIX}${COUNT}" ] || \ - [ -d "${NETCONS_CONFIGFS}/${RND_PREFIX}0" ]; then - echo "Function didn't finish yet, skipping it." >&2 - return - fi - - # enable COUNT targets - for i in $(seq ${COUNT}) - do - RND_TARGET="${RND_PREFIX}"${i} - RND_TARGET_PATH="${NETCONS_CONFIGFS}"/"${RND_TARGET}" - - # Basic population so the target can come up - _create_dynamic_target "${FORMAT}" "${RND_TARGET_PATH}" - done - - echo "netconsole selftest: ${COUNT} additional targets were created" > /dev/kmsg - # disable them all - for i in $(seq ${COUNT}) - do - RND_TARGET="${RND_PREFIX}"${i} - RND_TARGET_PATH="${NETCONS_CONFIGFS}"/"${RND_TARGET}" - if [[ $(cat "${RND_TARGET_PATH}/enabled") -eq 1 ]] - then - echo 0 > "${RND_TARGET_PATH}"/enabled - fi - rmdir "${RND_TARGET_PATH}" - done -} - -# Disable and enable the target mid-air, while messages -# are being transmitted. -toggle_netcons_target() { - for i in $(seq 2) - do - if [ ! -d "${NETCONS_PATH}" ] - then - break - fi - echo 0 > "${NETCONS_PATH}"/enabled 2> /dev/null || true - # Try to enable a bit harder, given it might fail to enable - # Write to `enabled` might fail depending on the lock, which is - # highly contentious here - for _ in $(seq 5) - do - echo 1 > "${NETCONS_PATH}"/enabled 2> /dev/null || true - done - done -} - -toggle_iface(){ - ip link set "${SRCIF}" down - ip link set "${SRCIF}" up -} - -# Start here - -modprobe netdevsim 2> /dev/null || true -modprobe netconsole 2> /dev/null || true - -# Check for basic system dependency and exit if not found -check_for_dependencies -# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) -echo "6 5" > /proc/sys/kernel/printk -# Remove the namespace, interfaces and netconsole target on exit -trap cleanup EXIT -# Create one namespace and two interfaces -set_network "${IP_VERSION}" -# Create a dynamic target for netconsole -create_dynamic_target "${FORMAT}" - -for i in $(seq "$ITERATIONS") -do - for _ in $(seq 10) - do - echo "${MSG}: ${TARGET} ${i}" > /dev/kmsg - done - wait - - if (( i % 30 == 0 )); then - toggle_netcons_target & - fi - - if (( i % 50 == 0 )); then - # create some targets, enable them, send msg and disable - # all in a parallel thread - create_and_delete_random_target & - fi - - if (( i % 70 == 0 )); then - toggle_iface & - fi -done -wait - -exit "${EXIT_STATUS}" diff --git a/tools/testing/selftests/drivers/net/netconsole/Makefile b/tools/testing/selftests/drivers/net/netconsole/Makefile new file mode 100644 index 000000000000..b56c70b7e274 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netconsole/Makefile @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0 + +TEST_INCLUDES := \ + ../../../net/lib.sh \ + ../lib/sh/lib_netcons.sh \ +# end of TEST_INCLUDES + +TEST_PROGS := \ + netcons_basic.sh \ + netcons_cmdline.sh \ + netcons_fragmented_msg.sh \ + netcons_overflow.sh \ + netcons_resume.sh \ + netcons_sysdata.sh \ + netcons_torture.sh \ +# end of TEST_PROGS + +include ../../../lib.mk + diff --git a/tools/testing/selftests/drivers/net/netconsole/config b/tools/testing/selftests/drivers/net/netconsole/config new file mode 100644 index 000000000000..a3f6b0fd44ef --- /dev/null +++ b/tools/testing/selftests/drivers/net/netconsole/config @@ -0,0 +1,6 @@ +CONFIG_CONFIGFS_FS=y +CONFIG_IPV6=y +CONFIG_NETCONSOLE=m +CONFIG_NETCONSOLE_DYNAMIC=y +CONFIG_NETCONSOLE_EXTENDED_LOG=y +CONFIG_NETDEVSIM=m diff --git a/tools/testing/selftests/drivers/net/netconsole/netcons_basic.sh b/tools/testing/selftests/drivers/net/netconsole/netcons_basic.sh new file mode 100755 index 000000000000..59cf10013ecd --- /dev/null +++ b/tools/testing/selftests/drivers/net/netconsole/netcons_basic.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# This test creates two netdevsim virtual interfaces, assigns one of them (the +# "destination interface") to a new namespace, and assigns IP addresses to both +# interfaces. +# +# It listens on the destination interface using socat and configures a dynamic +# target on netconsole, pointing to the destination IP address. +# +# Finally, it checks whether the message was received properly on the +# destination interface. Note that this test may pollute the kernel log buffer +# (dmesg) and relies on dynamic configuration and namespaces being configured. +# +# Author: Breno Leitao + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh + +modprobe netdevsim 2> /dev/null || true +modprobe netconsole 2> /dev/null || true + +# The content of kmsg will be save to the following file +OUTPUT_FILE="/tmp/${TARGET}" + +# Check for basic system dependency and exit if not found +check_for_dependencies +# Remove the namespace, interfaces and netconsole target on exit +trap cleanup EXIT + +# Run the test twice, with different format modes +for FORMAT in "basic" "extended" +do + for IP_VERSION in "ipv6" "ipv4" + do + echo "Running with target mode: ${FORMAT} (${IP_VERSION})" + # Set current loglevel to KERN_INFO(6), and default to + # KERN_NOTICE(5) + echo "6 5" > /proc/sys/kernel/printk + # Create one namespace and two interfaces + set_network "${IP_VERSION}" + # Create a dynamic target for netconsole + create_dynamic_target "${FORMAT}" + # Only set userdata for extended format + if [ "$FORMAT" == "extended" ] + then + # Set userdata "key" with the "value" value + set_user_data + fi + # Listed for netconsole port inside the namespace and + # destination interface + listen_port_and_save_to "${OUTPUT_FILE}" "${IP_VERSION}" & + # Wait for socat to start and listen to the port. + wait_for_port "${NAMESPACE}" "${PORT}" "${IP_VERSION}" + # Send the message + echo "${MSG}: ${TARGET}" > /dev/kmsg + # Wait until socat saves the file to disk + busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" + + # Make sure the message was received in the dst part + # and exit + validate_result "${OUTPUT_FILE}" "${FORMAT}" + # kill socat in case it is still running + pkill_socat + cleanup + echo "${FORMAT} : ${IP_VERSION} : Test passed" >&2 + done +done + +trap - EXIT +exit "${ksft_pass}" diff --git a/tools/testing/selftests/drivers/net/netconsole/netcons_cmdline.sh b/tools/testing/selftests/drivers/net/netconsole/netcons_cmdline.sh new file mode 100755 index 000000000000..96d704b8d9d9 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netconsole/netcons_cmdline.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# This is a selftest to test cmdline arguments on netconsole. +# It exercises loading of netconsole from cmdline instead of the dynamic +# reconfiguration. This includes parsing the long netconsole= line and all the +# flow through init_netconsole(). +# +# Author: Breno Leitao + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh + +check_netconsole_module + +modprobe netdevsim 2> /dev/null || true +rmmod netconsole 2> /dev/null || true + +# Check for basic system dependency and exit if not found +# check_for_dependencies +# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) +echo "6 5" > /proc/sys/kernel/printk +# Remove the namespace and network interfaces +trap do_cleanup EXIT +# Create one namespace and two interfaces +set_network + +# Run the test twice, with different cmdline parameters +for BINDMODE in "ifname" "mac" +do + echo "Running with bind mode: ${BINDMODE}" >&2 + # Create the command line for netconsole, with the configuration from + # the function above + CMDLINE=$(create_cmdline_str "${BINDMODE}") + + # The content of kmsg will be save to the following file + OUTPUT_FILE="/tmp/${TARGET}-${BINDMODE}" + + # Load the module, with the cmdline set + modprobe netconsole "${CMDLINE}" + + # Listed for netconsole port inside the namespace and destination + # interface + listen_port_and_save_to "${OUTPUT_FILE}" & + # Wait for socat to start and listen to the port. + wait_local_port_listen "${NAMESPACE}" "${PORT}" udp + # Send the message + echo "${MSG}: ${TARGET}" > /dev/kmsg + # Wait until socat saves the file to disk + busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" + # Make sure the message was received in the dst part + # and exit + validate_msg "${OUTPUT_FILE}" + + # kill socat in case it is still running + pkill_socat + # Unload the module + rmmod netconsole + echo "${BINDMODE} : Test passed" >&2 +done + +exit "${ksft_pass}" diff --git a/tools/testing/selftests/drivers/net/netconsole/netcons_fragmented_msg.sh b/tools/testing/selftests/drivers/net/netconsole/netcons_fragmented_msg.sh new file mode 100755 index 000000000000..0dc7280c3080 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netconsole/netcons_fragmented_msg.sh @@ -0,0 +1,122 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# Test netconsole's message fragmentation functionality. +# +# When a message exceeds the maximum packet size, netconsole splits it into +# multiple fragments for transmission. This test verifies: +# - Correct fragmentation of large messages +# - Proper reassembly of fragments at the receiver +# - Preservation of userdata across fragments +# - Behavior with and without kernel release version appending +# +# Author: Breno Leitao + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh + +modprobe netdevsim 2> /dev/null || true +modprobe netconsole 2> /dev/null || true + +# The content of kmsg will be save to the following file +OUTPUT_FILE="/tmp/${TARGET}" + +# set userdata to a long value. In this case, it is "1-2-3-4...50-" +USERDATA_VALUE=$(printf -- '%.2s-' {1..60}) + +# Convert the header string in a regexp, so, we can remove +# the second header as well. +# A header looks like "13,468,514729715,-,ncfrag=0/1135;". If +# release is appended, you might find something like:L +# "6.13.0-04048-g4f561a87745a,13,468,514729715,-,ncfrag=0/1135;" +function header_to_regex() { + # header is everything before ; + local HEADER="${1}" + REGEX=$(echo "${HEADER}" | cut -d'=' -f1) + echo "${REGEX}=[0-9]*\/[0-9]*;" +} + +# We have two headers in the message. Remove both to get the full message, +# and extract the full message. +function extract_msg() { + local MSGFILE="${1}" + # Extract the header, which is the very first thing that arrives in the + # first list. + HEADER=$(sed -n '1p' "${MSGFILE}" | cut -d';' -f1) + HEADER_REGEX=$(header_to_regex "${HEADER}") + + # Remove the two headers from the received message + # This will return the message without any header, similarly to what + # was sent. + sed "s/""${HEADER_REGEX}""//g" "${MSGFILE}" +} + +# Validate the message, which has two messages glued together. +# unwrap them to make sure all the characters were transmitted. +# File will look like the following: +# 13,468,514729715,-,ncfrag=0/1135; +# key=-13,468,514729715,-,ncfrag=967/1135; +function validate_fragmented_result() { + # Discard the netconsole headers, and assemble the full message + RCVMSG=$(extract_msg "${1}") + + # check for the main message + if ! echo "${RCVMSG}" | grep -q "${MSG}"; then + echo "Message body doesn't match." >&2 + echo "msg received=" "${RCVMSG}" >&2 + exit "${ksft_fail}" + fi + + # check userdata + if ! echo "${RCVMSG}" | grep -q "${USERDATA_VALUE}"; then + echo "message userdata doesn't match" >&2 + echo "msg received=" "${RCVMSG}" >&2 + exit "${ksft_fail}" + fi + # test passed. hooray +} + +# Check for basic system dependency and exit if not found +check_for_dependencies +# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) +echo "6 5" > /proc/sys/kernel/printk +# Remove the namespace, interfaces and netconsole target on exit +trap cleanup EXIT +# Create one namespace and two interfaces +set_network +# Create a dynamic target for netconsole +create_dynamic_target +# Set userdata "key" with the "value" value +set_user_data + + +# TEST 1: Send message and userdata. They will fragment +# ======= +MSG=$(printf -- 'MSG%.3s=' {1..150}) + +# Listen for netconsole port inside the namespace and destination interface +listen_port_and_save_to "${OUTPUT_FILE}" & +# Wait for socat to start and listen to the port. +wait_local_port_listen "${NAMESPACE}" "${PORT}" udp +# Send the message +echo "${MSG}: ${TARGET}" > /dev/kmsg +# Wait until socat saves the file to disk +busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" +# Check if the message was not corrupted +validate_fragmented_result "${OUTPUT_FILE}" + +# TEST 2: Test with smaller message, and without release appended +# ======= +MSG=$(printf -- 'FOOBAR%.3s=' {1..100}) +# Let's disable release and test again. +disable_release_append + +listen_port_and_save_to "${OUTPUT_FILE}" & +wait_local_port_listen "${NAMESPACE}" "${PORT}" udp +echo "${MSG}: ${TARGET}" > /dev/kmsg +busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" +validate_fragmented_result "${OUTPUT_FILE}" +exit "${ksft_pass}" diff --git a/tools/testing/selftests/drivers/net/netconsole/netcons_overflow.sh b/tools/testing/selftests/drivers/net/netconsole/netcons_overflow.sh new file mode 100755 index 000000000000..a8e43d08c166 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netconsole/netcons_overflow.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# This test verifies that users can successfully create up to +# MAX_USERDATA_ITEMS userdata entries without encountering any failures. +# +# Additionally, it tests for expected failure when attempting to exceed this +# maximum limit. +# +# Author: Breno Leitao + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh +# This is coming from netconsole code. Check for it in drivers/net/netconsole.c +MAX_USERDATA_ITEMS=256 + +# Function to create userdata entries +function create_userdata_max_entries() { + # All these keys should be created without any error + for i in $(seq $MAX_USERDATA_ITEMS) + do + # USERDATA_KEY is used by set_user_data + USERDATA_KEY="key"${i} + set_user_data + done +} + +# Function to verify the entry limit +function verify_entry_limit() { + # Allowing the test to fail without exiting, since the next command + # will fail + set +e + mkdir "${NETCONS_PATH}/userdata/key_that_will_fail" 2> /dev/null + ret="$?" + set -e + if [ "$ret" -eq 0 ]; + then + echo "Adding more than ${MAX_USERDATA_ITEMS} entries in userdata should fail, but it didn't" >&2 + ls "${NETCONS_PATH}/userdata/" >&2 + exit "${ksft_fail}" + fi +} + +# ========== # +# Start here # +# ========== # + +modprobe netdevsim 2> /dev/null || true +modprobe netconsole 2> /dev/null || true + +# Check for basic system dependency and exit if not found +check_for_dependencies + +# Remove the namespace, interfaces and netconsole target on exit +trap cleanup EXIT +# Create one namespace and two interfaces +set_network +# Create a dynamic target for netconsole +create_dynamic_target +# populate the maximum number of supported keys in userdata +create_userdata_max_entries +# Verify an additional entry is not allowed +verify_entry_limit +exit "${ksft_pass}" diff --git a/tools/testing/selftests/drivers/net/netconsole/netcons_resume.sh b/tools/testing/selftests/drivers/net/netconsole/netcons_resume.sh new file mode 100755 index 000000000000..cb59cf436dd0 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netconsole/netcons_resume.sh @@ -0,0 +1,124 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# This test validates that netconsole is able to resume a target that was +# deactivated when its interface was removed when the interface is brought +# back up. +# +# The test configures a netconsole target and then removes netdevsim module to +# cause the interface to disappear. Targets are configured via cmdline to ensure +# targets bound by interface name and mac address can be resumed. +# The test verifies that the target moved to disabled state before adding +# netdevsim and the interface back. +# +# Finally, the test verifies that the target is re-enabled automatically and +# the message is received on the destination interface. +# +# Author: Andre Carvalho + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh + +SAVED_SRCMAC="" # to be populated later +SAVED_DSTMAC="" # to be populated later + +modprobe netdevsim 2> /dev/null || true +rmmod netconsole 2> /dev/null || true + +check_netconsole_module + +function cleanup() { + cleanup_netcons "${NETCONS_CONFIGFS}/cmdline0" + do_cleanup + rmmod netconsole +} + +function trigger_reactivation() { + # Add back low level module + modprobe netdevsim + # Recreate namespace and two interfaces + set_network + # Restore MACs + ip netns exec "${NAMESPACE}" ip link set "${DSTIF}" \ + address "${SAVED_DSTMAC}" + if [ "${BINDMODE}" == "mac" ]; then + ip link set dev "${SRCIF}" down + ip link set dev "${SRCIF}" address "${SAVED_SRCMAC}" + # Rename device in order to trigger target resume, as initial + # when device was recreated it didn't have correct mac address. + ip link set dev "${SRCIF}" name "${TARGET}" + fi +} + +function trigger_deactivation() { + # Start by storing mac addresses so we can be restored in reactivate + SAVED_DSTMAC=$(ip netns exec "${NAMESPACE}" \ + cat /sys/class/net/"$DSTIF"/address) + SAVED_SRCMAC=$(mac_get "${SRCIF}") + # Remove low level module + rmmod netdevsim +} + +trap cleanup EXIT + +# Run the test twice, with different cmdline parameters +for BINDMODE in "ifname" "mac" +do + echo "Running with bind mode: ${BINDMODE}" >&2 + # Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) + echo "6 5" > /proc/sys/kernel/printk + + # Create one namespace and two interfaces + set_network + + # Create the command line for netconsole, with the configuration from + # the function above + CMDLINE=$(create_cmdline_str "${BINDMODE}") + + # The content of kmsg will be save to the following file + OUTPUT_FILE="/tmp/${TARGET}-${BINDMODE}" + + # Load the module, with the cmdline set + modprobe netconsole "${CMDLINE}" + # Expose cmdline target in configfs + mkdir "${NETCONS_CONFIGFS}/cmdline0" + + # Target should be enabled + wait_target_state "cmdline0" "enabled" + + # Trigger deactivation by unloading netdevsim module. Target should be + # disabled. + trigger_deactivation + wait_target_state "cmdline0" "disabled" + + # Trigger reactivation by loading netdevsim, recreating the network and + # restoring mac addresses. Target should be re-enabled. + trigger_reactivation + wait_target_state "cmdline0" "enabled" + + # Listen for netconsole port inside the namespace and destination + # interface + listen_port_and_save_to "${OUTPUT_FILE}" & + # Wait for socat to start and listen to the port. + wait_local_port_listen "${NAMESPACE}" "${PORT}" udp + # Send the message + echo "${MSG}: ${TARGET}" > /dev/kmsg + # Wait until socat saves the file to disk + busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" + # Make sure the message was received in the dst part + # and exit + validate_msg "${OUTPUT_FILE}" + + # kill socat in case it is still running + pkill_socat + # Cleanup & unload the module + cleanup + + echo "${BINDMODE} : Test passed" >&2 +done + +trap - EXIT +exit "${EXIT_STATUS}" diff --git a/tools/testing/selftests/drivers/net/netconsole/netcons_sysdata.sh b/tools/testing/selftests/drivers/net/netconsole/netcons_sysdata.sh new file mode 100755 index 000000000000..3fb8c4afe3d2 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netconsole/netcons_sysdata.sh @@ -0,0 +1,272 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# A test that makes sure that sysdata runtime CPU data is properly set +# when a message is sent. +# +# There are 3 different tests, every time sent using a random CPU. +# - Test #1 +# * Only enable cpu_nr sysdata feature. +# - Test #2 +# * Keep cpu_nr sysdata feature enable and enable userdata. +# - Test #3 +# * keep userdata enabled, and disable sysdata cpu_nr feature. +# +# Author: Breno Leitao + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh + +# Enable the sysdata cpu_nr feature +function set_cpu_nr() { + if [[ ! -f "${NETCONS_PATH}/userdata/cpu_nr_enabled" ]] + then + echo "Populate CPU configfs path not available in ${NETCONS_PATH}/userdata/cpu_nr_enabled" >&2 + exit "${ksft_skip}" + fi + + echo 1 > "${NETCONS_PATH}/userdata/cpu_nr_enabled" +} + +# Enable the taskname to be appended to sysdata +function set_taskname() { + if [[ ! -f "${NETCONS_PATH}/userdata/taskname_enabled" ]] + then + echo "Not able to enable taskname sysdata append. Configfs not available in ${NETCONS_PATH}/userdata/taskname_enabled" >&2 + exit "${ksft_skip}" + fi + + echo 1 > "${NETCONS_PATH}/userdata/taskname_enabled" +} + +# Enable the release to be appended to sysdata +function set_release() { + if [[ ! -f "${NETCONS_PATH}/userdata/release_enabled" ]] + then + echo "Not able to enable release sysdata append. Configfs not available in ${NETCONS_PATH}/userdata/release_enabled" >&2 + exit "${ksft_skip}" + fi + + echo 1 > "${NETCONS_PATH}/userdata/release_enabled" +} + +# Enable the msgid to be appended to sysdata +function set_msgid() { + if [[ ! -f "${NETCONS_PATH}/userdata/msgid_enabled" ]] + then + echo "Not able to enable msgid sysdata append. Configfs not available in ${NETCONS_PATH}/userdata/msgid_enabled" >&2 + exit "${ksft_skip}" + fi + + echo 1 > "${NETCONS_PATH}/userdata/msgid_enabled" +} + +# Disable the sysdata cpu_nr feature +function unset_cpu_nr() { + echo 0 > "${NETCONS_PATH}/userdata/cpu_nr_enabled" +} + +# Once called, taskname=<..> will not be appended anymore +function unset_taskname() { + echo 0 > "${NETCONS_PATH}/userdata/taskname_enabled" +} + +function unset_release() { + echo 0 > "${NETCONS_PATH}/userdata/release_enabled" +} + +function unset_msgid() { + echo 0 > "${NETCONS_PATH}/userdata/msgid_enabled" +} + +# Test if MSG contains sysdata +function validate_sysdata() { + # OUTPUT_FILE will contain something like: + # 6.11.1-0_fbk0_rc13_509_g30d75cea12f7,13,1822,115075213798,-;netconsole selftest: netcons_gtJHM + # userdatakey=userdatavalue + # cpu=X + # taskname= + # msgid= + + # Echo is what this test uses to create the message. See runtest() + # function + SENDER="echo" + + if [ ! -f "$OUTPUT_FILE" ]; then + echo "FAIL: File was not generated." >&2 + exit "${ksft_fail}" + fi + + if ! grep -q "${MSG}" "${OUTPUT_FILE}"; then + echo "FAIL: ${MSG} not found in ${OUTPUT_FILE}" >&2 + cat "${OUTPUT_FILE}" >&2 + exit "${ksft_fail}" + fi + + # Check if cpu=XX exists in the file and matches the one used + # in taskset(1) + if ! grep -q "cpu=${CPU}\+" "${OUTPUT_FILE}"; then + echo "FAIL: 'cpu=${CPU}' not found in ${OUTPUT_FILE}" >&2 + cat "${OUTPUT_FILE}" >&2 + exit "${ksft_fail}" + fi + + if ! grep -q "taskname=${SENDER}" "${OUTPUT_FILE}"; then + echo "FAIL: 'taskname=echo' not found in ${OUTPUT_FILE}" >&2 + cat "${OUTPUT_FILE}" >&2 + exit "${ksft_fail}" + fi + + if ! grep -q "msgid=[0-9]\+$" "${OUTPUT_FILE}"; then + echo "FAIL: 'msgid=' not found in ${OUTPUT_FILE}" >&2 + cat "${OUTPUT_FILE}" >&2 + exit "${ksft_fail}" + fi + + rm "${OUTPUT_FILE}" + pkill_socat +} + +function validate_release() { + RELEASE=$(uname -r) + + if [ ! -f "$OUTPUT_FILE" ]; then + echo "FAIL: File was not generated." >&2 + exit "${ksft_fail}" + fi + + if ! grep -q "release=${RELEASE}" "${OUTPUT_FILE}"; then + echo "FAIL: 'release=${RELEASE}' not found in ${OUTPUT_FILE}" >&2 + cat "${OUTPUT_FILE}" >&2 + exit "${ksft_fail}" + fi +} + +# Test if MSG content exists in OUTPUT_FILE but no `cpu=` and `taskname=` +# strings +function validate_no_sysdata() { + if [ ! -f "$OUTPUT_FILE" ]; then + echo "FAIL: File was not generated." >&2 + exit "${ksft_fail}" + fi + + if ! grep -q "${MSG}" "${OUTPUT_FILE}"; then + echo "FAIL: ${MSG} not found in ${OUTPUT_FILE}" >&2 + cat "${OUTPUT_FILE}" >&2 + exit "${ksft_fail}" + fi + + if grep -q "cpu=" "${OUTPUT_FILE}"; then + echo "FAIL: 'cpu= found in ${OUTPUT_FILE}" >&2 + cat "${OUTPUT_FILE}" >&2 + exit "${ksft_fail}" + fi + + if grep -q "taskname=" "${OUTPUT_FILE}"; then + echo "FAIL: 'taskname= found in ${OUTPUT_FILE}" >&2 + cat "${OUTPUT_FILE}" >&2 + exit "${ksft_fail}" + fi + + if grep -q "release=" "${OUTPUT_FILE}"; then + echo "FAIL: 'release= found in ${OUTPUT_FILE}" >&2 + cat "${OUTPUT_FILE}" >&2 + exit "${ksft_fail}" + fi + + if grep -q "msgid=" "${OUTPUT_FILE}"; then + echo "FAIL: 'msgid= found in ${OUTPUT_FILE}" >&2 + cat "${OUTPUT_FILE}" >&2 + exit "${ksft_fail}" + fi + + rm "${OUTPUT_FILE}" +} + +# Start socat, send the message and wait for the file to show up in the file +# system +function runtest { + # Listen for netconsole port inside the namespace and destination + # interface + listen_port_and_save_to "${OUTPUT_FILE}" & + # Wait for socat to start and listen to the port. + wait_local_port_listen "${NAMESPACE}" "${PORT}" udp + # Send the message + taskset -c "${CPU}" echo "${MSG}: ${TARGET}" > /dev/kmsg + # Wait until socat saves the file to disk + busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" +} + +# ========== # +# Start here # +# ========== # + +modprobe netdevsim 2> /dev/null || true +modprobe netconsole 2> /dev/null || true + +# Check for basic system dependency and exit if not found +check_for_dependencies +# This test also depends on taskset(1). Check for it before starting the test +check_for_taskset + +# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) +echo "6 5" > /proc/sys/kernel/printk +# Remove the namespace, interfaces and netconsole target on exit +trap cleanup EXIT +# Create one namespace and two interfaces +set_network +# Create a dynamic target for netconsole +create_dynamic_target + +#==================================================== +# TEST #1 +# Send message from a random CPU +#==================================================== +# Random CPU in the system +CPU=$((RANDOM % $(nproc))) +OUTPUT_FILE="/tmp/${TARGET}_1" +MSG="Test #1 from CPU${CPU}" +# Enable the auto population of cpu_nr +set_cpu_nr +# Enable taskname to be appended to sysdata +set_taskname +set_release +set_msgid +runtest +# Make sure the message was received in the dst part +# and exit +validate_release +validate_sysdata + +#==================================================== +# TEST #2 +# This test now adds userdata together with sysdata +# =================================================== +# Get a new random CPU +CPU=$((RANDOM % $(nproc))) +OUTPUT_FILE="/tmp/${TARGET}_2" +MSG="Test #2 from CPU${CPU}" +set_user_data +runtest +validate_release +validate_sysdata + +# =================================================== +# TEST #3 +# Unset all sysdata, fail if any userdata is set +# =================================================== +CPU=$((RANDOM % $(nproc))) +OUTPUT_FILE="/tmp/${TARGET}_3" +MSG="Test #3 from CPU${CPU}" +unset_cpu_nr +unset_taskname +unset_release +unset_msgid +runtest +# At this time, cpu= shouldn't be present in the msg +validate_no_sysdata + +exit "${ksft_pass}" diff --git a/tools/testing/selftests/drivers/net/netconsole/netcons_torture.sh b/tools/testing/selftests/drivers/net/netconsole/netcons_torture.sh new file mode 100755 index 000000000000..33a44adb6f8f --- /dev/null +++ b/tools/testing/selftests/drivers/net/netconsole/netcons_torture.sh @@ -0,0 +1,130 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# Repeatedly send kernel messages, toggles netconsole targets on and off, +# creates and deletes targets in parallel, and toggles the source interface to +# simulate stress conditions. +# +# This test aims to verify the robustness of netconsole under dynamic +# configurations and concurrent operations. +# +# The major goal is to run this test with LOCKDEP, Kmemleak and KASAN to make +# sure no issues is reported. +# +# Author: Breno Leitao + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh + +# Number of times the main loop run +ITERATIONS=${1:-150} + +# Only test extended format +FORMAT="extended" +# And ipv6 only +IP_VERSION="ipv6" + +# Create, enable and delete some targets. +create_and_delete_random_target() { + COUNT=2 + RND_PREFIX=$(mktemp -u netcons_rnd_XXXX_) + + if [ -d "${NETCONS_CONFIGFS}/${RND_PREFIX}${COUNT}" ] || \ + [ -d "${NETCONS_CONFIGFS}/${RND_PREFIX}0" ]; then + echo "Function didn't finish yet, skipping it." >&2 + return + fi + + # enable COUNT targets + for i in $(seq ${COUNT}) + do + RND_TARGET="${RND_PREFIX}"${i} + RND_TARGET_PATH="${NETCONS_CONFIGFS}"/"${RND_TARGET}" + + # Basic population so the target can come up + _create_dynamic_target "${FORMAT}" "${RND_TARGET_PATH}" + done + + echo "netconsole selftest: ${COUNT} additional targets were created" > /dev/kmsg + # disable them all + for i in $(seq ${COUNT}) + do + RND_TARGET="${RND_PREFIX}"${i} + RND_TARGET_PATH="${NETCONS_CONFIGFS}"/"${RND_TARGET}" + if [[ $(cat "${RND_TARGET_PATH}/enabled") -eq 1 ]] + then + echo 0 > "${RND_TARGET_PATH}"/enabled + fi + rmdir "${RND_TARGET_PATH}" + done +} + +# Disable and enable the target mid-air, while messages +# are being transmitted. +toggle_netcons_target() { + for i in $(seq 2) + do + if [ ! -d "${NETCONS_PATH}" ] + then + break + fi + echo 0 > "${NETCONS_PATH}"/enabled 2> /dev/null || true + # Try to enable a bit harder, given it might fail to enable + # Write to `enabled` might fail depending on the lock, which is + # highly contentious here + for _ in $(seq 5) + do + echo 1 > "${NETCONS_PATH}"/enabled 2> /dev/null || true + done + done +} + +toggle_iface(){ + ip link set "${SRCIF}" down + ip link set "${SRCIF}" up +} + +# Start here + +modprobe netdevsim 2> /dev/null || true +modprobe netconsole 2> /dev/null || true + +# Check for basic system dependency and exit if not found +check_for_dependencies +# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) +echo "6 5" > /proc/sys/kernel/printk +# Remove the namespace, interfaces and netconsole target on exit +trap cleanup EXIT +# Create one namespace and two interfaces +set_network "${IP_VERSION}" +# Create a dynamic target for netconsole +create_dynamic_target "${FORMAT}" + +for i in $(seq "$ITERATIONS") +do + for _ in $(seq 10) + do + echo "${MSG}: ${TARGET} ${i}" > /dev/kmsg + done + wait + + if (( i % 30 == 0 )); then + toggle_netcons_target & + fi + + if (( i % 50 == 0 )); then + # create some targets, enable them, send msg and disable + # all in a parallel thread + create_and_delete_random_target & + fi + + if (( i % 70 == 0 )); then + toggle_iface & + fi +done +wait + +exit "${EXIT_STATUS}" -- cgit v1.2.3 From 7b85c77585409f76609c817b760db60f3bf8fd33 Mon Sep 17 00:00:00 2001 From: Nimrod Oren Date: Wed, 28 Jan 2026 11:02:17 +0200 Subject: selftests: drv-net: rss_flow_label: skip unsupported devices The test_rss_flow_label_6only test case fails on devices that do not support IPv6 flow label hashing. Make it skip neatly, consistent with the behavior of the test_rss_flow_label case. Reviewed-by: Gal Pressman Signed-off-by: Nimrod Oren Link: https://patch.msgid.link/20260128090217.663366-1-noren@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/rss_flow_label.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/drivers/net/hw/rss_flow_label.py b/tools/testing/selftests/drivers/net/hw/rss_flow_label.py index 6fa95fe27c47..7dc80070884a 100755 --- a/tools/testing/selftests/drivers/net/hw/rss_flow_label.py +++ b/tools/testing/selftests/drivers/net/hw/rss_flow_label.py @@ -145,9 +145,14 @@ def test_rss_flow_label_6only(cfg): # Try to enable Flow Labels and check again, in case it leaks thru initial = _ethtool_get_cfg(cfg, "udp6") - changed = initial.replace("l", "") if "l" in initial else initial + "l" - - cmd(f"ethtool -N {cfg.ifname} rx-flow-hash udp6 {changed}") + no_lbl = initial.replace("l", "") + if "l" not in initial: + try: + cmd(f"ethtool -N {cfg.ifname} rx-flow-hash udp6 l{no_lbl}") + except CmdExitFailure as exc: + raise KsftSkipEx("Device doesn't support Flow Label for UDP6") from exc + else: + cmd(f"ethtool -N {cfg.ifname} rx-flow-hash udp6 {no_lbl}") restore = defer(cmd, f"ethtool -N {cfg.ifname} rx-flow-hash udp6 {initial}") _check_v4_flow_types(cfg) -- cgit v1.2.3 From 15ac1adf0f84a90605121fbe4a6238b24c865f92 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 30 Jan 2026 09:12:08 +0100 Subject: selftests/bpf: Add test for sleepable program tailcalls Adding test that makes sure we can't mix sleepable and non-sleepable bpf programs in the BPF_MAP_TYPE_PROG_ARRAY map and that we can do tail call in the sleepable program. Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260130081208.1130204-3-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/tailcalls.c | 74 ++++++++++++++++++++++ .../selftests/bpf/progs/tailcall_sleepable.c | 43 +++++++++++++ 2 files changed, 117 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/tailcall_sleepable.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c index 0ab36503c3b2..7d534fde0af9 100644 --- a/tools/testing/selftests/bpf/prog_tests/tailcalls.c +++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c @@ -8,6 +8,7 @@ #include "tailcall_freplace.skel.h" #include "tc_bpf2bpf.skel.h" #include "tailcall_fail.skel.h" +#include "tailcall_sleepable.skel.h" /* test_tailcall_1 checks basic functionality by patching multiple locations * in a single program for a single tail call slot with nop->jmp, jmp->nop @@ -1653,6 +1654,77 @@ static void test_tailcall_failure() RUN_TESTS(tailcall_fail); } +noinline void uprobe_sleepable_trigger(void) +{ + asm volatile (""); +} + +static void test_tailcall_sleepable(void) +{ + LIBBPF_OPTS(bpf_uprobe_opts, opts); + struct tailcall_sleepable *skel; + int prog_fd, map_fd; + int err, key; + + skel = tailcall_sleepable__open(); + if (!ASSERT_OK_PTR(skel, "tailcall_sleepable__open")) + return; + + /* + * Test that we can't load uprobe_normal and uprobe_sleepable_1, + * because they share tailcall map. + */ + bpf_program__set_autoload(skel->progs.uprobe_normal, true); + bpf_program__set_autoload(skel->progs.uprobe_sleepable_1, true); + + err = tailcall_sleepable__load(skel); + if (!ASSERT_ERR(err, "tailcall_sleepable__load")) + goto out; + + tailcall_sleepable__destroy(skel); + + /* + * Test that we can tail call from sleepable to sleepable program. + */ + skel = tailcall_sleepable__open(); + if (!ASSERT_OK_PTR(skel, "tailcall_sleepable__open")) + return; + + bpf_program__set_autoload(skel->progs.uprobe_sleepable_1, true); + bpf_program__set_autoload(skel->progs.uprobe_sleepable_2, true); + + err = tailcall_sleepable__load(skel); + if (!ASSERT_OK(err, "tailcall_sleepable__load")) + goto out; + + /* Add sleepable uprobe_sleepable_2 to jmp_table[0]. */ + key = 0; + prog_fd = bpf_program__fd(skel->progs.uprobe_sleepable_2); + map_fd = bpf_map__fd(skel->maps.jmp_table); + err = bpf_map_update_elem(map_fd, &key, &prog_fd, BPF_ANY); + if (!ASSERT_OK(err, "update jmp_table")) + goto out; + + skel->bss->my_pid = getpid(); + + /* Attach uprobe_sleepable_1 to uprobe_sleepable_trigger and hit it. */ + opts.func_name = "uprobe_sleepable_trigger"; + skel->links.uprobe_sleepable_1 = bpf_program__attach_uprobe_opts( + skel->progs.uprobe_sleepable_1, + -1, + "/proc/self/exe", + 0 /* offset */, + &opts); + if (!ASSERT_OK_PTR(skel->links.uprobe_sleepable_1, "bpf_program__attach_uprobe_opts")) + goto out; + + uprobe_sleepable_trigger(); + ASSERT_EQ(skel->bss->executed, 1, "executed"); + +out: + tailcall_sleepable__destroy(skel); +} + void test_tailcalls(void) { if (test__start_subtest("tailcall_1")) @@ -1707,4 +1779,6 @@ void test_tailcalls(void) test_tailcall_bpf2bpf_freplace(); if (test__start_subtest("tailcall_failure")) test_tailcall_failure(); + if (test__start_subtest("tailcall_sleepable")) + test_tailcall_sleepable(); } diff --git a/tools/testing/selftests/bpf/progs/tailcall_sleepable.c b/tools/testing/selftests/bpf/progs/tailcall_sleepable.c new file mode 100644 index 000000000000..d959a9eaaa9c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_sleepable.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "bpf_misc.h" +#include "bpf_test_utils.h" + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __array(values, void (void)); +} jmp_table SEC(".maps"); + +SEC("?uprobe") +int uprobe_normal(void *ctx) +{ + bpf_tail_call_static(ctx, &jmp_table, 0); + return 0; +} + +SEC("?uprobe.s") +int uprobe_sleepable_1(void *ctx) +{ + bpf_tail_call_static(ctx, &jmp_table, 0); + return 0; +} + +int executed = 0; +int my_pid = 0; + +SEC("?uprobe.s") +int uprobe_sleepable_2(void *ctx) +{ + int pid = bpf_get_current_pid_tgid() >> 32; + + if (pid != my_pid) + return 0; + + executed++; + return 0; +} + +char __license[] SEC("license") = "GPL"; -- cgit v1.2.3 From cd77618c418254b827f2a807b4c27b97088fdb52 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Fri, 30 Jan 2026 11:18:43 +0900 Subject: selftests/bpf: Make bpf get_preempt_count() work for v6.14+ kernels Recent x86 kernels export __preempt_count as a ksym, while some old kernels between v6.1 and v6.14 expose the preemption counter via pcpu_hot.preempt_count. The existing selftest helper unconditionally dereferenced __preempt_count, which breaks BPF program loading on such old kernels. Make the x86 preemption count lookup version-agnostic by: - Marking __preempt_count and pcpu_hot as weak ksyms. - Introducing a BTF-described pcpu_hot___local layout with preserve_access_index. - Selecting the appropriate access path at runtime using ksym availability and bpf_ksym_exists() and bpf_core_field_exists(). This allows a single BPF binary to run correctly across kernel versions (e.g., v6.18 vs. v6.13) without relying on compile-time version checks. Signed-off-by: Changwoo Min Link: https://lore.kernel.org/r/20260130021843.154885-1-changwoo@igalia.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpf_experimental.h | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index a39576c8ba04..4b7210c318dd 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -614,7 +614,13 @@ extern int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__str, extern bool CONFIG_PREEMPT_RT __kconfig __weak; #ifdef bpf_target_x86 -extern const int __preempt_count __ksym; +extern const int __preempt_count __ksym __weak; + +struct pcpu_hot___local { + int preempt_count; +} __attribute__((preserve_access_index)); + +extern struct pcpu_hot___local pcpu_hot __ksym __weak; #endif struct task_struct___preempt_rt { @@ -624,7 +630,19 @@ struct task_struct___preempt_rt { static inline int get_preempt_count(void) { #if defined(bpf_target_x86) - return *(int *) bpf_this_cpu_ptr(&__preempt_count); + /* By default, read the per-CPU __preempt_count. */ + if (bpf_ksym_exists(&__preempt_count)) + return *(int *) bpf_this_cpu_ptr(&__preempt_count); + + /* + * If __preempt_count does not exist, try to read preempt_count under + * struct pcpu_hot. Between v6.1 and v6.14 -- more specifically, + * [64701838bf057, 46e8fff6d45fe), preempt_count had been managed + * under struct pcpu_hot. + */ + if (bpf_core_field_exists(pcpu_hot.preempt_count)) + return ((struct pcpu_hot___local *) + bpf_this_cpu_ptr(&pcpu_hot))->preempt_count; #elif defined(bpf_target_arm64) return bpf_get_current_task_btf()->thread_info.preempt.count; #endif -- cgit v1.2.3 From 0207f94971e72a13380e28022c86da147e8e090f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 26 Jan 2026 22:18:34 +0100 Subject: selftests/bpf: Fix kprobe multi stacktrace_ips test We now include the attached function in the stack trace, fixing the test accordingly. Fixes: c9e208fa93cd ("selftests/bpf: Add stacktrace ips test for kprobe_multi/kretprobe_multi") Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260126211837.472802-4-jolsa@kernel.org --- .../testing/selftests/bpf/prog_tests/stacktrace_ips.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c index c9efdd2a5b18..c93718dafd9b 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c @@ -74,11 +74,20 @@ static void test_stacktrace_ips_kprobe_multi(bool retprobe) load_kallsyms(); - check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 4, - ksym_get_addr("bpf_testmod_stacktrace_test_3"), - ksym_get_addr("bpf_testmod_stacktrace_test_2"), - ksym_get_addr("bpf_testmod_stacktrace_test_1"), - ksym_get_addr("bpf_testmod_test_read")); + if (retprobe) { + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 4, + ksym_get_addr("bpf_testmod_stacktrace_test_3"), + ksym_get_addr("bpf_testmod_stacktrace_test_2"), + ksym_get_addr("bpf_testmod_stacktrace_test_1"), + ksym_get_addr("bpf_testmod_test_read")); + } else { + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 5, + ksym_get_addr("bpf_testmod_stacktrace_test"), + ksym_get_addr("bpf_testmod_stacktrace_test_3"), + ksym_get_addr("bpf_testmod_stacktrace_test_2"), + ksym_get_addr("bpf_testmod_stacktrace_test_1"), + ksym_get_addr("bpf_testmod_test_read")); + } cleanup: stacktrace_ips__destroy(skel); -- cgit v1.2.3 From 7373f97e868ad01fc73de8e7b71834eeba25d4f1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 26 Jan 2026 22:18:35 +0100 Subject: selftests/bpf: Add stacktrace ips test for kprobe/kretprobe Adding test that attaches kprobe/kretprobe and verifies the ORC stacktrace matches expected functions. The test is only for ORC unwinder to keep it simple. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260126211837.472802-5-jolsa@kernel.org --- .../selftests/bpf/prog_tests/stacktrace_ips.c | 50 ++++++++++++++++++++++ tools/testing/selftests/bpf/progs/stacktrace_ips.c | 7 +++ 2 files changed, 57 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c index c93718dafd9b..852830536109 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c @@ -137,6 +137,52 @@ cleanup: stacktrace_ips__destroy(skel); } +static void test_stacktrace_ips_kprobe(bool retprobe) +{ + LIBBPF_OPTS(bpf_kprobe_opts, opts, + .retprobe = retprobe + ); + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct stacktrace_ips *skel; + + skel = stacktrace_ips__open_and_load(); + if (!ASSERT_OK_PTR(skel, "stacktrace_ips__open_and_load")) + return; + + if (!skel->kconfig->CONFIG_UNWINDER_ORC) { + test__skip(); + goto cleanup; + } + + skel->links.kprobe_test = bpf_program__attach_kprobe_opts( + skel->progs.kprobe_test, + "bpf_testmod_stacktrace_test", &opts); + if (!ASSERT_OK_PTR(skel->links.kprobe_test, "bpf_program__attach_kprobe_opts")) + goto cleanup; + + trigger_module_test_read(1); + + load_kallsyms(); + + if (retprobe) { + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 4, + ksym_get_addr("bpf_testmod_stacktrace_test_3"), + ksym_get_addr("bpf_testmod_stacktrace_test_2"), + ksym_get_addr("bpf_testmod_stacktrace_test_1"), + ksym_get_addr("bpf_testmod_test_read")); + } else { + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 5, + ksym_get_addr("bpf_testmod_stacktrace_test"), + ksym_get_addr("bpf_testmod_stacktrace_test_3"), + ksym_get_addr("bpf_testmod_stacktrace_test_2"), + ksym_get_addr("bpf_testmod_stacktrace_test_1"), + ksym_get_addr("bpf_testmod_test_read")); + } + +cleanup: + stacktrace_ips__destroy(skel); +} + static void __test_stacktrace_ips(void) { if (test__start_subtest("kprobe_multi")) @@ -145,6 +191,10 @@ static void __test_stacktrace_ips(void) test_stacktrace_ips_kprobe_multi(true); if (test__start_subtest("raw_tp")) test_stacktrace_ips_raw_tp(); + if (test__start_subtest("kprobe")) + test_stacktrace_ips_kprobe(false); + if (test__start_subtest("kretprobe")) + test_stacktrace_ips_kprobe(true); } #else static void __test_stacktrace_ips(void) diff --git a/tools/testing/selftests/bpf/progs/stacktrace_ips.c b/tools/testing/selftests/bpf/progs/stacktrace_ips.c index a96c8150d7f5..cae077a4061b 100644 --- a/tools/testing/selftests/bpf/progs/stacktrace_ips.c +++ b/tools/testing/selftests/bpf/progs/stacktrace_ips.c @@ -31,6 +31,13 @@ int unused(void) __u32 stack_key; +SEC("kprobe") +int kprobe_test(struct pt_regs *ctx) +{ + stack_key = bpf_get_stackid(ctx, &stackmap, 0); + return 0; +} + SEC("kprobe.multi") int kprobe_multi_test(struct pt_regs *ctx) { -- cgit v1.2.3 From e5d532be4a3b0d1f0a9210c0da2c04a6b4605904 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 26 Jan 2026 22:18:36 +0100 Subject: selftests/bpf: Add stacktrace ips test for fentry/fexit Adding test that attaches fentry/fexitand verifies the ORC stacktrace matches expected functions. The test is only for ORC unwinder to keep it simple. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260126211837.472802-6-jolsa@kernel.org --- .../selftests/bpf/prog_tests/stacktrace_ips.c | 51 ++++++++++++++++++++++ tools/testing/selftests/bpf/progs/stacktrace_ips.c | 20 +++++++++ 2 files changed, 71 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c index 852830536109..da42b00e3d1f 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c @@ -183,6 +183,53 @@ cleanup: stacktrace_ips__destroy(skel); } +static void test_stacktrace_ips_trampoline(bool retprobe) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct stacktrace_ips *skel; + + skel = stacktrace_ips__open_and_load(); + if (!ASSERT_OK_PTR(skel, "stacktrace_ips__open_and_load")) + return; + + if (!skel->kconfig->CONFIG_UNWINDER_ORC) { + test__skip(); + goto cleanup; + } + + if (retprobe) { + skel->links.fexit_test = bpf_program__attach_trace(skel->progs.fexit_test); + if (!ASSERT_OK_PTR(skel->links.fexit_test, "bpf_program__attach_trace")) + goto cleanup; + } else { + skel->links.fentry_test = bpf_program__attach_trace(skel->progs.fentry_test); + if (!ASSERT_OK_PTR(skel->links.fentry_test, "bpf_program__attach_trace")) + goto cleanup; + } + + trigger_module_test_read(1); + + load_kallsyms(); + + if (retprobe) { + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 4, + ksym_get_addr("bpf_testmod_stacktrace_test_3"), + ksym_get_addr("bpf_testmod_stacktrace_test_2"), + ksym_get_addr("bpf_testmod_stacktrace_test_1"), + ksym_get_addr("bpf_testmod_test_read")); + } else { + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 5, + ksym_get_addr("bpf_testmod_stacktrace_test"), + ksym_get_addr("bpf_testmod_stacktrace_test_3"), + ksym_get_addr("bpf_testmod_stacktrace_test_2"), + ksym_get_addr("bpf_testmod_stacktrace_test_1"), + ksym_get_addr("bpf_testmod_test_read")); + } + +cleanup: + stacktrace_ips__destroy(skel); +} + static void __test_stacktrace_ips(void) { if (test__start_subtest("kprobe_multi")) @@ -195,6 +242,10 @@ static void __test_stacktrace_ips(void) test_stacktrace_ips_kprobe(false); if (test__start_subtest("kretprobe")) test_stacktrace_ips_kprobe(true); + if (test__start_subtest("fentry")) + test_stacktrace_ips_trampoline(false); + if (test__start_subtest("fexit")) + test_stacktrace_ips_trampoline(true); } #else static void __test_stacktrace_ips(void) diff --git a/tools/testing/selftests/bpf/progs/stacktrace_ips.c b/tools/testing/selftests/bpf/progs/stacktrace_ips.c index cae077a4061b..6830f2978613 100644 --- a/tools/testing/selftests/bpf/progs/stacktrace_ips.c +++ b/tools/testing/selftests/bpf/progs/stacktrace_ips.c @@ -53,4 +53,24 @@ int rawtp_test(void *ctx) return 0; } +SEC("fentry/bpf_testmod_stacktrace_test") +int fentry_test(struct pt_regs *ctx) +{ + /* + * Skip 2 bpf_program/trampoline stack entries: + * - bpf_prog_bd1f7a949f55fb03_fentry_test + * - bpf_trampoline_182536277701 + */ + stack_key = bpf_get_stackid(ctx, &stackmap, 2); + return 0; +} + +SEC("fexit/bpf_testmod_stacktrace_test") +int fexit_test(struct pt_regs *ctx) +{ + /* Skip 2 bpf_program/trampoline stack entries, check fentry_test. */ + stack_key = bpf_get_stackid(ctx, &stackmap, 2); + return 0; +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 4173b494d93a8057d3ed23e65853cd76b647f870 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 26 Jan 2026 22:18:37 +0100 Subject: selftests/bpf: Allow to benchmark trigger with stacktrace Adding support to call bpf_get_stackid helper from trigger programs, so far added for kprobe multi. Adding the --stacktrace/-g option to enable it. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260126211837.472802-7-jolsa@kernel.org --- tools/testing/selftests/bpf/bench.c | 4 ++ tools/testing/selftests/bpf/bench.h | 1 + tools/testing/selftests/bpf/benchs/bench_trigger.c | 1 + tools/testing/selftests/bpf/progs/trigger_bench.c | 46 +++++++++++++++++----- 4 files changed, 43 insertions(+), 9 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index bd29bb2e6cb5..8368bd3a0665 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -265,6 +265,7 @@ static const struct argp_option opts[] = { { "verbose", 'v', NULL, 0, "Verbose debug output"}, { "affinity", 'a', NULL, 0, "Set consumer/producer thread affinity"}, { "quiet", 'q', NULL, 0, "Be more quiet"}, + { "stacktrace", 's', NULL, 0, "Get stack trace"}, { "prod-affinity", ARG_PROD_AFFINITY_SET, "CPUSET", 0, "Set of CPUs for producer threads; implies --affinity"}, { "cons-affinity", ARG_CONS_AFFINITY_SET, "CPUSET", 0, @@ -350,6 +351,9 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) case 'q': env.quiet = true; break; + case 's': + env.stacktrace = true; + break; case ARG_PROD_AFFINITY_SET: env.affinity = true; if (parse_num_list(arg, &env.prod_cpus.cpus, diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h index bea323820ffb..7cf21936e7ed 100644 --- a/tools/testing/selftests/bpf/bench.h +++ b/tools/testing/selftests/bpf/bench.h @@ -26,6 +26,7 @@ struct env { bool list; bool affinity; bool quiet; + bool stacktrace; int consumer_cnt; int producer_cnt; int nr_cpus; diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index 34018fc3927f..aeec9edd3851 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -146,6 +146,7 @@ static void setup_ctx(void) bpf_program__set_autoload(ctx.skel->progs.trigger_driver, true); ctx.skel->rodata->batch_iters = args.batch_iters; + ctx.skel->rodata->stacktrace = env.stacktrace; } static void load_ctx(void) diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c index 2898b3749d07..4ea0422d1042 100644 --- a/tools/testing/selftests/bpf/progs/trigger_bench.c +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -25,6 +25,34 @@ static __always_inline void inc_counter(void) __sync_add_and_fetch(&hits[cpu & CPU_MASK].value, 1); } +volatile const int stacktrace; + +typedef __u64 stack_trace_t[128]; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, stack_trace_t); +} stack_heap SEC(".maps"); + +static __always_inline void do_stacktrace(void *ctx) +{ + if (!stacktrace) + return; + + __u64 *ptr = bpf_map_lookup_elem(&stack_heap, &(__u32){0}); + + if (ptr) + bpf_get_stack(ctx, ptr, sizeof(stack_trace_t), 0); +} + +static __always_inline void handle(void *ctx) +{ + inc_counter(); + do_stacktrace(ctx); +} + SEC("?uprobe") int bench_trigger_uprobe(void *ctx) { @@ -81,21 +109,21 @@ int trigger_driver_kfunc(void *ctx) SEC("?kprobe/bpf_get_numa_node_id") int bench_trigger_kprobe(void *ctx) { - inc_counter(); + handle(ctx); return 0; } SEC("?kretprobe/bpf_get_numa_node_id") int bench_trigger_kretprobe(void *ctx) { - inc_counter(); + handle(ctx); return 0; } SEC("?kprobe.multi/bpf_get_numa_node_id") int bench_trigger_kprobe_multi(void *ctx) { - inc_counter(); + handle(ctx); return 0; } @@ -108,7 +136,7 @@ int bench_kprobe_multi_empty(void *ctx) SEC("?kretprobe.multi/bpf_get_numa_node_id") int bench_trigger_kretprobe_multi(void *ctx) { - inc_counter(); + handle(ctx); return 0; } @@ -121,34 +149,34 @@ int bench_kretprobe_multi_empty(void *ctx) SEC("?fentry/bpf_get_numa_node_id") int bench_trigger_fentry(void *ctx) { - inc_counter(); + handle(ctx); return 0; } SEC("?fexit/bpf_get_numa_node_id") int bench_trigger_fexit(void *ctx) { - inc_counter(); + handle(ctx); return 0; } SEC("?fmod_ret/bpf_modify_return_test_tp") int bench_trigger_fmodret(void *ctx) { - inc_counter(); + handle(ctx); return -22; } SEC("?tp/bpf_test_run/bpf_trigger_tp") int bench_trigger_tp(void *ctx) { - inc_counter(); + handle(ctx); return 0; } SEC("?raw_tp/bpf_trigger_tp") int bench_trigger_rawtp(void *ctx) { - inc_counter(); + handle(ctx); return 0; } -- cgit v1.2.3 From 3a4d8bed0b47543b2dfce0b1d714b40d68ff2f7e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 30 Jan 2026 00:19:52 +0800 Subject: selftests: ublk: derive TID automatically from script name Add automatic TID derivation in test_common.sh based on the script filename. The TID is extracted by stripping the "test_" prefix and ".sh" suffix from the script name (e.g., test_loop_01.sh -> loop_01). This removes the need for each test script to manually define TID, reducing boilerplate and preventing potential mismatches between the script name and TID. Scripts can still override TID after sourcing test_common.sh if needed. Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_batch_01.sh | 1 - tools/testing/selftests/ublk/test_batch_02.sh | 1 - tools/testing/selftests/ublk/test_batch_03.sh | 1 - tools/testing/selftests/ublk/test_common.sh | 5 +++++ tools/testing/selftests/ublk/test_generic_01.sh | 1 - tools/testing/selftests/ublk/test_generic_02.sh | 1 - tools/testing/selftests/ublk/test_generic_03.sh | 1 - tools/testing/selftests/ublk/test_generic_04.sh | 1 - tools/testing/selftests/ublk/test_generic_05.sh | 1 - tools/testing/selftests/ublk/test_generic_06.sh | 1 - tools/testing/selftests/ublk/test_generic_07.sh | 1 - tools/testing/selftests/ublk/test_generic_08.sh | 1 - tools/testing/selftests/ublk/test_generic_09.sh | 1 - tools/testing/selftests/ublk/test_generic_10.sh | 1 - tools/testing/selftests/ublk/test_generic_11.sh | 1 - tools/testing/selftests/ublk/test_generic_12.sh | 1 - tools/testing/selftests/ublk/test_generic_13.sh | 1 - tools/testing/selftests/ublk/test_generic_14.sh | 1 - tools/testing/selftests/ublk/test_generic_15.sh | 1 - tools/testing/selftests/ublk/test_generic_16.sh | 1 - tools/testing/selftests/ublk/test_loop_01.sh | 1 - tools/testing/selftests/ublk/test_loop_02.sh | 1 - tools/testing/selftests/ublk/test_loop_03.sh | 1 - tools/testing/selftests/ublk/test_loop_04.sh | 1 - tools/testing/selftests/ublk/test_loop_05.sh | 1 - tools/testing/selftests/ublk/test_loop_06.sh | 1 - tools/testing/selftests/ublk/test_loop_07.sh | 1 - tools/testing/selftests/ublk/test_loop_08.sh | 1 - tools/testing/selftests/ublk/test_null_01.sh | 1 - tools/testing/selftests/ublk/test_null_02.sh | 1 - tools/testing/selftests/ublk/test_null_03.sh | 1 - tools/testing/selftests/ublk/test_null_04.sh | 1 - tools/testing/selftests/ublk/test_stress_01.sh | 1 - tools/testing/selftests/ublk/test_stress_02.sh | 1 - tools/testing/selftests/ublk/test_stress_03.sh | 1 - tools/testing/selftests/ublk/test_stress_04.sh | 1 - tools/testing/selftests/ublk/test_stress_05.sh | 1 - tools/testing/selftests/ublk/test_stress_06.sh | 1 - tools/testing/selftests/ublk/test_stress_07.sh | 1 - tools/testing/selftests/ublk/test_stress_08.sh | 1 - tools/testing/selftests/ublk/test_stress_09.sh | 1 - tools/testing/selftests/ublk/test_stripe_01.sh | 1 - tools/testing/selftests/ublk/test_stripe_02.sh | 1 - tools/testing/selftests/ublk/test_stripe_03.sh | 1 - tools/testing/selftests/ublk/test_stripe_04.sh | 1 - tools/testing/selftests/ublk/test_stripe_05.sh | 1 - tools/testing/selftests/ublk/test_stripe_06.sh | 1 - 47 files changed, 5 insertions(+), 46 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/test_batch_01.sh b/tools/testing/selftests/ublk/test_batch_01.sh index 9fa9fff5c62f..a18fb39af8be 100755 --- a/tools/testing/selftests/ublk/test_batch_01.sh +++ b/tools/testing/selftests/ublk/test_batch_01.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="batch_01" ERR_CODE=0 if ! _have_feature "BATCH_IO"; then diff --git a/tools/testing/selftests/ublk/test_batch_02.sh b/tools/testing/selftests/ublk/test_batch_02.sh index b477f91359e1..7ca384d11987 100755 --- a/tools/testing/selftests/ublk/test_batch_02.sh +++ b/tools/testing/selftests/ublk/test_batch_02.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="batch_02" ERR_CODE=0 if ! _have_feature "BATCH_IO"; then diff --git a/tools/testing/selftests/ublk/test_batch_03.sh b/tools/testing/selftests/ublk/test_batch_03.sh index 13a2b3d3a1b9..aca9cf144b55 100755 --- a/tools/testing/selftests/ublk/test_batch_03.sh +++ b/tools/testing/selftests/ublk/test_batch_03.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="batch_03" ERR_CODE=0 if ! _have_feature "BATCH_IO"; then diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index 7ff6ce79d62c..bbe031c94a29 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -1,6 +1,11 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +# Derive TID from script name: test__.sh -> _ +# Can be overridden in test script after sourcing this file +TID=$(basename "$0" .sh) +TID=${TID#test_} + UBLK_SKIP_CODE=4 _have_program() { diff --git a/tools/testing/selftests/ublk/test_generic_01.sh b/tools/testing/selftests/ublk/test_generic_01.sh index 21a31cd5491a..26cf3c7ceeb5 100755 --- a/tools/testing/selftests/ublk/test_generic_01.sh +++ b/tools/testing/selftests/ublk/test_generic_01.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_01" ERR_CODE=0 if ! _have_program bpftrace; then diff --git a/tools/testing/selftests/ublk/test_generic_02.sh b/tools/testing/selftests/ublk/test_generic_02.sh index 12920768b1a0..1d4b1d6e059c 100755 --- a/tools/testing/selftests/ublk/test_generic_02.sh +++ b/tools/testing/selftests/ublk/test_generic_02.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_02" ERR_CODE=0 if ! _have_program bpftrace; then diff --git a/tools/testing/selftests/ublk/test_generic_03.sh b/tools/testing/selftests/ublk/test_generic_03.sh index b551aa76cb0d..8934ea926762 100755 --- a/tools/testing/selftests/ublk/test_generic_03.sh +++ b/tools/testing/selftests/ublk/test_generic_03.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_03" ERR_CODE=0 _prep_test "null" "check dma & segment limits for zero copy" diff --git a/tools/testing/selftests/ublk/test_generic_04.sh b/tools/testing/selftests/ublk/test_generic_04.sh index be2292822bbe..2672f9c40fa8 100755 --- a/tools/testing/selftests/ublk/test_generic_04.sh +++ b/tools/testing/selftests/ublk/test_generic_04.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_04" ERR_CODE=0 ublk_run_recover_test() diff --git a/tools/testing/selftests/ublk/test_generic_05.sh b/tools/testing/selftests/ublk/test_generic_05.sh index 9b7f71c16d82..bda5064bc31f 100755 --- a/tools/testing/selftests/ublk/test_generic_05.sh +++ b/tools/testing/selftests/ublk/test_generic_05.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_05" ERR_CODE=0 ublk_run_recover_test() diff --git a/tools/testing/selftests/ublk/test_generic_06.sh b/tools/testing/selftests/ublk/test_generic_06.sh index fd42062b7b76..14a05054fcd8 100755 --- a/tools/testing/selftests/ublk/test_generic_06.sh +++ b/tools/testing/selftests/ublk/test_generic_06.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_06" ERR_CODE=0 _prep_test "fault_inject" "fast cleanup when all I/Os of one hctx are in server" diff --git a/tools/testing/selftests/ublk/test_generic_07.sh b/tools/testing/selftests/ublk/test_generic_07.sh index cba86451fa5e..8dcfd8978f50 100755 --- a/tools/testing/selftests/ublk/test_generic_07.sh +++ b/tools/testing/selftests/ublk/test_generic_07.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_07" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_generic_08.sh b/tools/testing/selftests/ublk/test_generic_08.sh index b222f3a77e12..ce88c31d6b9c 100755 --- a/tools/testing/selftests/ublk/test_generic_08.sh +++ b/tools/testing/selftests/ublk/test_generic_08.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_08" ERR_CODE=0 if ! _have_feature "AUTO_BUF_REG"; then diff --git a/tools/testing/selftests/ublk/test_generic_09.sh b/tools/testing/selftests/ublk/test_generic_09.sh index bb6f77ca5522..744d0cdaa242 100755 --- a/tools/testing/selftests/ublk/test_generic_09.sh +++ b/tools/testing/selftests/ublk/test_generic_09.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_09" ERR_CODE=0 if ! _have_feature "AUTO_BUF_REG"; then diff --git a/tools/testing/selftests/ublk/test_generic_10.sh b/tools/testing/selftests/ublk/test_generic_10.sh index abc11c3d416b..4b4293b9081f 100755 --- a/tools/testing/selftests/ublk/test_generic_10.sh +++ b/tools/testing/selftests/ublk/test_generic_10.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_10" ERR_CODE=0 if ! _have_feature "UPDATE_SIZE"; then diff --git a/tools/testing/selftests/ublk/test_generic_11.sh b/tools/testing/selftests/ublk/test_generic_11.sh index d1f973c8c645..e0dc0b8fe5d6 100755 --- a/tools/testing/selftests/ublk/test_generic_11.sh +++ b/tools/testing/selftests/ublk/test_generic_11.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_11" ERR_CODE=0 ublk_run_quiesce_recover() diff --git a/tools/testing/selftests/ublk/test_generic_12.sh b/tools/testing/selftests/ublk/test_generic_12.sh index b4046201b4d9..54b81ddfe9f9 100755 --- a/tools/testing/selftests/ublk/test_generic_12.sh +++ b/tools/testing/selftests/ublk/test_generic_12.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_12" ERR_CODE=0 if ! _have_program bpftrace; then diff --git a/tools/testing/selftests/ublk/test_generic_13.sh b/tools/testing/selftests/ublk/test_generic_13.sh index b7aa90b1cb74..922115aa14f4 100755 --- a/tools/testing/selftests/ublk/test_generic_13.sh +++ b/tools/testing/selftests/ublk/test_generic_13.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_13" ERR_CODE=0 _prep_test "null" "check that feature list is complete" diff --git a/tools/testing/selftests/ublk/test_generic_14.sh b/tools/testing/selftests/ublk/test_generic_14.sh index cd9b44b97c24..178443394ca5 100755 --- a/tools/testing/selftests/ublk/test_generic_14.sh +++ b/tools/testing/selftests/ublk/test_generic_14.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_14" ERR_CODE=0 ublk_run_recover_test() diff --git a/tools/testing/selftests/ublk/test_generic_15.sh b/tools/testing/selftests/ublk/test_generic_15.sh index 76379362e0a2..727d0f4610d6 100755 --- a/tools/testing/selftests/ublk/test_generic_15.sh +++ b/tools/testing/selftests/ublk/test_generic_15.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_15" ERR_CODE=0 _test_partition_scan_no_hang() diff --git a/tools/testing/selftests/ublk/test_generic_16.sh b/tools/testing/selftests/ublk/test_generic_16.sh index e08af7b685c9..42e8d2e16ec9 100755 --- a/tools/testing/selftests/ublk/test_generic_16.sh +++ b/tools/testing/selftests/ublk/test_generic_16.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_16" ERR_CODE=0 _prep_test "null" "stop --safe command" diff --git a/tools/testing/selftests/ublk/test_loop_01.sh b/tools/testing/selftests/ublk/test_loop_01.sh index 833fa0dbc700..338a235fd82a 100755 --- a/tools/testing/selftests/ublk/test_loop_01.sh +++ b/tools/testing/selftests/ublk/test_loop_01.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_01" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_loop_02.sh b/tools/testing/selftests/ublk/test_loop_02.sh index 874568b3646b..04c52454e2ec 100755 --- a/tools/testing/selftests/ublk/test_loop_02.sh +++ b/tools/testing/selftests/ublk/test_loop_02.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_02" ERR_CODE=0 _prep_test "loop" "mkfs & mount & umount" diff --git a/tools/testing/selftests/ublk/test_loop_03.sh b/tools/testing/selftests/ublk/test_loop_03.sh index c30f797c6429..6e8f649fe93d 100755 --- a/tools/testing/selftests/ublk/test_loop_03.sh +++ b/tools/testing/selftests/ublk/test_loop_03.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_03" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_loop_04.sh b/tools/testing/selftests/ublk/test_loop_04.sh index b01d75b3214d..9f6774ec0de6 100755 --- a/tools/testing/selftests/ublk/test_loop_04.sh +++ b/tools/testing/selftests/ublk/test_loop_04.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_04" ERR_CODE=0 _prep_test "loop" "mkfs & mount & umount with zero copy" diff --git a/tools/testing/selftests/ublk/test_loop_05.sh b/tools/testing/selftests/ublk/test_loop_05.sh index de2141533074..2b8d99e007be 100755 --- a/tools/testing/selftests/ublk/test_loop_05.sh +++ b/tools/testing/selftests/ublk/test_loop_05.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_05" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_loop_06.sh b/tools/testing/selftests/ublk/test_loop_06.sh index 1d1a8a725502..e73f6f4844db 100755 --- a/tools/testing/selftests/ublk/test_loop_06.sh +++ b/tools/testing/selftests/ublk/test_loop_06.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_06" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_loop_07.sh b/tools/testing/selftests/ublk/test_loop_07.sh index 493f3fb611a5..264d20e7c530 100755 --- a/tools/testing/selftests/ublk/test_loop_07.sh +++ b/tools/testing/selftests/ublk/test_loop_07.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_07" ERR_CODE=0 _prep_test "loop" "mkfs & mount & umount with user copy" diff --git a/tools/testing/selftests/ublk/test_loop_08.sh b/tools/testing/selftests/ublk/test_loop_08.sh index ca289cfb2ad4..2caa7ba748fb 100755 --- a/tools/testing/selftests/ublk/test_loop_08.sh +++ b/tools/testing/selftests/ublk/test_loop_08.sh @@ -13,7 +13,6 @@ if [[ "$fio_version" =~ fio-[0-9]+\.[0-9]+$ ]]; then exit $UBLK_SKIP_CODE fi -TID=loop_08 _prep_test "loop" "end-to-end integrity" diff --git a/tools/testing/selftests/ublk/test_null_01.sh b/tools/testing/selftests/ublk/test_null_01.sh index c2cb8f7a09fe..eebce8076530 100755 --- a/tools/testing/selftests/ublk/test_null_01.sh +++ b/tools/testing/selftests/ublk/test_null_01.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="null_01" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_null_02.sh b/tools/testing/selftests/ublk/test_null_02.sh index 8accd35beb55..654bdff39664 100755 --- a/tools/testing/selftests/ublk/test_null_02.sh +++ b/tools/testing/selftests/ublk/test_null_02.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="null_02" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_null_03.sh b/tools/testing/selftests/ublk/test_null_03.sh index 0051067b4686..29cd09f06672 100755 --- a/tools/testing/selftests/ublk/test_null_03.sh +++ b/tools/testing/selftests/ublk/test_null_03.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="null_03" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_null_04.sh b/tools/testing/selftests/ublk/test_null_04.sh index 0b0719ea33a3..7491b8c17f00 100755 --- a/tools/testing/selftests/ublk/test_null_04.sh +++ b/tools/testing/selftests/ublk/test_null_04.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID=null_04 _prep_test "null" "integrity params" diff --git a/tools/testing/selftests/ublk/test_stress_01.sh b/tools/testing/selftests/ublk/test_stress_01.sh index 7d3150f057d4..a9322ce496e9 100755 --- a/tools/testing/selftests/ublk/test_stress_01.sh +++ b/tools/testing/selftests/ublk/test_stress_01.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_01" ERR_CODE=0 ublk_io_and_remove() diff --git a/tools/testing/selftests/ublk/test_stress_02.sh b/tools/testing/selftests/ublk/test_stress_02.sh index 4bdd921081e5..6c114194f9c9 100755 --- a/tools/testing/selftests/ublk/test_stress_02.sh +++ b/tools/testing/selftests/ublk/test_stress_02.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_02" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_stress_03.sh b/tools/testing/selftests/ublk/test_stress_03.sh index 3ed4c9b2d8c0..4e81ca0db758 100755 --- a/tools/testing/selftests/ublk/test_stress_03.sh +++ b/tools/testing/selftests/ublk/test_stress_03.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_03" ERR_CODE=0 ublk_io_and_remove() diff --git a/tools/testing/selftests/ublk/test_stress_04.sh b/tools/testing/selftests/ublk/test_stress_04.sh index efa8dc33234b..6c6f44b172bc 100755 --- a/tools/testing/selftests/ublk/test_stress_04.sh +++ b/tools/testing/selftests/ublk/test_stress_04.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_04" ERR_CODE=0 ublk_io_and_kill_daemon() diff --git a/tools/testing/selftests/ublk/test_stress_05.sh b/tools/testing/selftests/ublk/test_stress_05.sh index 68a194144302..7e9324de2030 100755 --- a/tools/testing/selftests/ublk/test_stress_05.sh +++ b/tools/testing/selftests/ublk/test_stress_05.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_05" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_stress_06.sh b/tools/testing/selftests/ublk/test_stress_06.sh index 37188ec2e1f7..c72e5d0b14be 100755 --- a/tools/testing/selftests/ublk/test_stress_06.sh +++ b/tools/testing/selftests/ublk/test_stress_06.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_06" ERR_CODE=0 ublk_io_and_remove() diff --git a/tools/testing/selftests/ublk/test_stress_07.sh b/tools/testing/selftests/ublk/test_stress_07.sh index fb061fc26d36..04c2764d5238 100755 --- a/tools/testing/selftests/ublk/test_stress_07.sh +++ b/tools/testing/selftests/ublk/test_stress_07.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_07" ERR_CODE=0 ublk_io_and_kill_daemon() diff --git a/tools/testing/selftests/ublk/test_stress_08.sh b/tools/testing/selftests/ublk/test_stress_08.sh index 9abb50ee3d00..37f7d204879a 100755 --- a/tools/testing/selftests/ublk/test_stress_08.sh +++ b/tools/testing/selftests/ublk/test_stress_08.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_08" ERR_CODE=0 ublk_io_and_remove() diff --git a/tools/testing/selftests/ublk/test_stress_09.sh b/tools/testing/selftests/ublk/test_stress_09.sh index 87b92b0a2410..53c1e3b2ab30 100755 --- a/tools/testing/selftests/ublk/test_stress_09.sh +++ b/tools/testing/selftests/ublk/test_stress_09.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_09" ERR_CODE=0 ublk_io_and_kill_daemon() diff --git a/tools/testing/selftests/ublk/test_stripe_01.sh b/tools/testing/selftests/ublk/test_stripe_01.sh index 4e4f0fdf3c9b..3bc821aadad8 100755 --- a/tools/testing/selftests/ublk/test_stripe_01.sh +++ b/tools/testing/selftests/ublk/test_stripe_01.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_01" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_stripe_02.sh b/tools/testing/selftests/ublk/test_stripe_02.sh index 5820ab2efba4..4a7d2b21a6bf 100755 --- a/tools/testing/selftests/ublk/test_stripe_02.sh +++ b/tools/testing/selftests/ublk/test_stripe_02.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_02" ERR_CODE=0 _prep_test "stripe" "mkfs & mount & umount" diff --git a/tools/testing/selftests/ublk/test_stripe_03.sh b/tools/testing/selftests/ublk/test_stripe_03.sh index 20b977e27814..a1c159d54e53 100755 --- a/tools/testing/selftests/ublk/test_stripe_03.sh +++ b/tools/testing/selftests/ublk/test_stripe_03.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_03" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_stripe_04.sh b/tools/testing/selftests/ublk/test_stripe_04.sh index 1b51ed2f1d84..0c30bd6c2b3b 100755 --- a/tools/testing/selftests/ublk/test_stripe_04.sh +++ b/tools/testing/selftests/ublk/test_stripe_04.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_04" ERR_CODE=0 _prep_test "stripe" "mkfs & mount & umount on zero copy" diff --git a/tools/testing/selftests/ublk/test_stripe_05.sh b/tools/testing/selftests/ublk/test_stripe_05.sh index 05d71951d710..6ddfa88ad226 100755 --- a/tools/testing/selftests/ublk/test_stripe_05.sh +++ b/tools/testing/selftests/ublk/test_stripe_05.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_05" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_stripe_06.sh b/tools/testing/selftests/ublk/test_stripe_06.sh index d06cac7626e2..a2c7bf4cc613 100755 --- a/tools/testing/selftests/ublk/test_stripe_06.sh +++ b/tools/testing/selftests/ublk/test_stripe_06.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_06" ERR_CODE=0 _prep_test "stripe" "mkfs & mount & umount on user copy" -- cgit v1.2.3 From e07a2039b6d4ae3acf8ae39b86be449b7fa18d4a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 30 Jan 2026 00:19:53 +0800 Subject: selftests: ublk: add selftest for UBLK_F_NO_AUTO_PART_SCAN Add test_part_01.sh to test the UBLK_F_NO_AUTO_PART_SCAN feature flag which allows suppressing automatic partition scanning during device startup while still allowing manual partition probing. The test verifies: - Normal behavior: partitions are auto-detected without the flag - With flag: partitions are not auto-detected during START_DEV - Manual scan: blockdev --rereadpt works with the flag Also update kublk tool to support --no_auto_part_scan option and recognize the feature flag. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 2 + tools/testing/selftests/ublk/kublk.c | 6 +- tools/testing/selftests/ublk/kublk.h | 3 +- tools/testing/selftests/ublk/test_part_01.sh | 104 +++++++++++++++++++++++++++ 4 files changed, 113 insertions(+), 2 deletions(-) create mode 100755 tools/testing/selftests/ublk/test_part_01.sh (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index e39a6f871fcc..bc5bd7d1381d 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -48,6 +48,8 @@ TEST_PROGS += test_stripe_04.sh TEST_PROGS += test_stripe_05.sh TEST_PROGS += test_stripe_06.sh +TEST_PROGS += test_part_01.sh + TEST_PROGS += test_stress_01.sh TEST_PROGS += test_stress_02.sh TEST_PROGS += test_stress_03.sh diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 2da37557e1a9..e8279c4acc40 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -1615,6 +1615,7 @@ static int cmd_dev_get_features(void) FEAT_NAME(UBLK_F_INTEGRITY), FEAT_NAME(UBLK_F_SAFE_STOP_DEV), FEAT_NAME(UBLK_F_BATCH_IO), + FEAT_NAME(UBLK_F_NO_AUTO_PART_SCAN), }; struct ublk_dev *dev; __u64 features = 0; @@ -1712,7 +1713,7 @@ static void __cmd_create_help(char *exe, bool recovery) printf("\t[--nthreads threads] [--per_io_tasks]\n"); printf("\t[--integrity_capable] [--integrity_reftag] [--metadata_size SIZE] " "[--pi_offset OFFSET] [--csum_type ip|t10dif|nvme] [--tag_size SIZE]\n"); - printf("\t[--batch|-b]\n"); + printf("\t[--batch|-b] [--no_auto_part_scan]\n"); printf("\t[target options] [backfile1] [backfile2] ...\n"); printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n"); printf("\tdefault: nthreads=nr_queues"); @@ -1786,6 +1787,7 @@ int main(int argc, char *argv[]) { "tag_size", 1, NULL, 0 }, { "safe", 0, NULL, 0 }, { "batch", 0, NULL, 'b'}, + { "no_auto_part_scan", 0, NULL, 0 }, { 0, 0, 0, 0 } }; const struct ublk_tgt_ops *ops = NULL; @@ -1898,6 +1900,8 @@ int main(int argc, char *argv[]) ctx.tag_size = strtoul(optarg, NULL, 0); if (!strcmp(longopts[option_idx].name, "safe")) ctx.safe_stop = 1; + if (!strcmp(longopts[option_idx].name, "no_auto_part_scan")) + ctx.flags |= UBLK_F_NO_AUTO_PART_SCAN; break; case '?': /* diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index ca97deb5e208..1faeccaaecae 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -78,12 +78,13 @@ struct dev_ctx { unsigned int auto_zc_fallback:1; unsigned int per_io_tasks:1; unsigned int no_ublk_fixed_fd:1; + unsigned int safe_stop:1; + unsigned int no_auto_part_scan:1; __u32 integrity_flags; __u8 metadata_size; __u8 pi_offset; __u8 csum_type; __u8 tag_size; - unsigned int safe_stop:1; int _evtfd; int _shmid; diff --git a/tools/testing/selftests/ublk/test_part_01.sh b/tools/testing/selftests/ublk/test_part_01.sh new file mode 100755 index 000000000000..8028f6e4b3a5 --- /dev/null +++ b/tools/testing/selftests/ublk/test_part_01.sh @@ -0,0 +1,104 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +format_backing_file() +{ + local backing_file=$1 + + # Create ublk device to write partition table + local tmp_dev=$(_add_ublk_dev -t loop "${backing_file}") + [ $? -ne 0 ] && return 1 + + # Write partition table with sfdisk + sfdisk /dev/ublkb"${tmp_dev}" > /dev/null 2>&1 < /dev/null 2>&1 + udevadm settle + + if [ ! -e /dev/ublkb"${dev_id}"p1 ] || [ ! -e /dev/ublkb"${dev_id}"p2 ]; then + "${UBLK_PROG}" del -n "${dev_id}" + return 1 + fi + + "${UBLK_PROG}" del -n "${dev_id}" + return 0 +} + +if ! _have_program sfdisk || ! _have_program blockdev; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "generic" "test UBLK_F_NO_AUTO_PART_SCAN" + +if ! _have_feature "UBLK_F_NO_AUTO_PART_SCAN"; then + _cleanup_test "generic" + exit "$UBLK_SKIP_CODE" +fi + + +# Create and format backing file with partition table +_create_backfile 0 256M +format_backing_file "${UBLK_BACKFILES[0]}" +[ $? -ne 0 ] && ERR_CODE=255 + +# Test normal auto partition scan +[ "$ERR_CODE" -eq 0 ] && test_auto_part_scan "${UBLK_BACKFILES[0]}" +[ $? -ne 0 ] && ERR_CODE=255 + +# Test no auto partition scan with manual scan +[ "$ERR_CODE" -eq 0 ] && test_no_auto_part_scan "${UBLK_BACKFILES[0]}" +[ $? -ne 0 ] && ERR_CODE=255 + +_cleanup_test "generic" +_show_result $TID $ERR_CODE -- cgit v1.2.3 From 7a30d3dfea4a455d1109d5258fe332f2157071ba Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 30 Jan 2026 00:19:54 +0800 Subject: selftests: ublk: rename test_generic_15 to test_part_02 This test exercises partition scanning behavior, so move it to the test_part_* group for consistency. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 2 +- tools/testing/selftests/ublk/test_generic_15.sh | 67 ------------------------- tools/testing/selftests/ublk/test_part_02.sh | 67 +++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 68 deletions(-) delete mode 100755 tools/testing/selftests/ublk/test_generic_15.sh create mode 100755 tools/testing/selftests/ublk/test_part_02.sh (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index bc5bd7d1381d..ca8588ed962c 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -22,7 +22,6 @@ TEST_PROGS += test_generic_11.sh TEST_PROGS += test_generic_12.sh TEST_PROGS += test_generic_13.sh TEST_PROGS += test_generic_14.sh -TEST_PROGS += test_generic_15.sh TEST_PROGS += test_generic_16.sh TEST_PROGS += test_batch_01.sh @@ -49,6 +48,7 @@ TEST_PROGS += test_stripe_05.sh TEST_PROGS += test_stripe_06.sh TEST_PROGS += test_part_01.sh +TEST_PROGS += test_part_02.sh TEST_PROGS += test_stress_01.sh TEST_PROGS += test_stress_02.sh diff --git a/tools/testing/selftests/ublk/test_generic_15.sh b/tools/testing/selftests/ublk/test_generic_15.sh deleted file mode 100755 index 727d0f4610d6..000000000000 --- a/tools/testing/selftests/ublk/test_generic_15.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh - -ERR_CODE=0 - -_test_partition_scan_no_hang() -{ - local recovery_flag=$1 - local expected_state=$2 - local dev_id - local state - local daemon_pid - local start_time - local elapsed - - # Create ublk device with fault_inject target and very large delay - # to simulate hang during partition table read - # --delay_us 60000000 = 60 seconds delay - # Use _add_ublk_dev_no_settle to avoid udevadm settle hang waiting - # for partition scan events to complete - if [ "$recovery_flag" = "yes" ]; then - echo "Testing partition scan with recovery support..." - dev_id=$(_add_ublk_dev_no_settle -t fault_inject -q 1 -d 1 --delay_us 60000000 -r 1) - else - echo "Testing partition scan without recovery..." - dev_id=$(_add_ublk_dev_no_settle -t fault_inject -q 1 -d 1 --delay_us 60000000) - fi - - _check_add_dev "$TID" $? - - # The add command should return quickly because partition scan is async. - # Now sleep briefly to let the async partition scan work start and hit - # the delay in the fault_inject handler. - sleep 1 - - # Kill the ublk daemon while partition scan is potentially blocked - # And check state transitions properly - start_time=${SECONDS} - daemon_pid=$(_get_ublk_daemon_pid "${dev_id}") - state=$(__ublk_kill_daemon "${dev_id}" "${expected_state}") - elapsed=$((SECONDS - start_time)) - - # Verify the device transitioned to expected state - if [ "$state" != "${expected_state}" ]; then - echo "FAIL: Device state is $state, expected ${expected_state}" - ERR_CODE=255 - ${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1 - return - fi - echo "PASS: Device transitioned to ${expected_state} in ${elapsed}s without hanging" - - # Clean up the device - ${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1 -} - -_prep_test "partition_scan" "verify async partition scan prevents IO hang" - -# Test 1: Without recovery support - should transition to DEAD -_test_partition_scan_no_hang "no" "DEAD" - -# Test 2: With recovery support - should transition to QUIESCED -_test_partition_scan_no_hang "yes" "QUIESCED" - -_cleanup_test "partition_scan" -_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_part_02.sh b/tools/testing/selftests/ublk/test_part_02.sh new file mode 100755 index 000000000000..727d0f4610d6 --- /dev/null +++ b/tools/testing/selftests/ublk/test_part_02.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +_test_partition_scan_no_hang() +{ + local recovery_flag=$1 + local expected_state=$2 + local dev_id + local state + local daemon_pid + local start_time + local elapsed + + # Create ublk device with fault_inject target and very large delay + # to simulate hang during partition table read + # --delay_us 60000000 = 60 seconds delay + # Use _add_ublk_dev_no_settle to avoid udevadm settle hang waiting + # for partition scan events to complete + if [ "$recovery_flag" = "yes" ]; then + echo "Testing partition scan with recovery support..." + dev_id=$(_add_ublk_dev_no_settle -t fault_inject -q 1 -d 1 --delay_us 60000000 -r 1) + else + echo "Testing partition scan without recovery..." + dev_id=$(_add_ublk_dev_no_settle -t fault_inject -q 1 -d 1 --delay_us 60000000) + fi + + _check_add_dev "$TID" $? + + # The add command should return quickly because partition scan is async. + # Now sleep briefly to let the async partition scan work start and hit + # the delay in the fault_inject handler. + sleep 1 + + # Kill the ublk daemon while partition scan is potentially blocked + # And check state transitions properly + start_time=${SECONDS} + daemon_pid=$(_get_ublk_daemon_pid "${dev_id}") + state=$(__ublk_kill_daemon "${dev_id}" "${expected_state}") + elapsed=$((SECONDS - start_time)) + + # Verify the device transitioned to expected state + if [ "$state" != "${expected_state}" ]; then + echo "FAIL: Device state is $state, expected ${expected_state}" + ERR_CODE=255 + ${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1 + return + fi + echo "PASS: Device transitioned to ${expected_state} in ${elapsed}s without hanging" + + # Clean up the device + ${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1 +} + +_prep_test "partition_scan" "verify async partition scan prevents IO hang" + +# Test 1: Without recovery support - should transition to DEAD +_test_partition_scan_no_hang "no" "DEAD" + +# Test 2: With recovery support - should transition to QUIESCED +_test_partition_scan_no_hang "yes" "QUIESCED" + +_cleanup_test "partition_scan" +_show_result $TID $ERR_CODE -- cgit v1.2.3 From 130975353b1548d76aa9790a4ac7e74bd2a37221 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 30 Jan 2026 00:19:55 +0800 Subject: selftests: ublk: refactor test_null_04 into separate functions Encapsulate each test case in its own function that creates the device, runs checks, and deletes only that device. This avoids calling _cleanup_test multiple times. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_null_04.sh | 248 ++++++++++----------------- 1 file changed, 94 insertions(+), 154 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/test_null_04.sh b/tools/testing/selftests/ublk/test_null_04.sh index 7491b8c17f00..22328e0f3925 100755 --- a/tools/testing/selftests/ublk/test_null_04.sh +++ b/tools/testing/selftests/ublk/test_null_04.sh @@ -3,163 +3,103 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh +ERR_CODE=0 -_prep_test "null" "integrity params" +_check_value() { + local name=$1 + local actual=$2 + local expected=$3 -dev_id=$(_add_ublk_dev -t null -u --metadata_size 8) -_check_add_dev $TID $? -metadata_size=$(_get_metadata_size "$dev_id" metadata_size) -if [ "$metadata_size" != 8 ]; then - echo "metadata_size $metadata_size != 8" - _show_result $TID 255 -fi -pi_offset=$(_get_metadata_size "$dev_id" pi_offset) -if [ "$pi_offset" != 0 ]; then - echo "pi_offset $pi_offset != 0" - _show_result $TID 255 -fi -pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size) -if [ "$pi_tuple_size" != 0 ]; then - echo "pi_tuple_size $pi_tuple_size != 0" - _show_result $TID 255 -fi -capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable") -if [ "$capable" != 0 ]; then - echo "device_is_integrity_capable $capable != 0" - _show_result $TID 255 -fi -format=$(cat "/sys/block/ublkb$dev_id/integrity/format") -if [ "$format" != nop ]; then - echo "format $format != nop" - _show_result $TID 255 -fi -protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes") -if [ "$protection_interval_bytes" != 512 ]; then - echo "protection_interval_bytes $protection_interval_bytes != 512" - _show_result $TID 255 -fi -tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size") -if [ "$tag_size" != 0 ]; then - echo "tag_size $tag_size != 0" - _show_result $TID 255 -fi -_cleanup_test + if [ "$actual" != "$expected" ]; then + echo "$name $actual != $expected" + ERR_CODE=255 + return 1 + fi + return 0 +} -dev_id=$(_add_ublk_dev -t null -u --integrity_capable --metadata_size 64 --pi_offset 56 --csum_type ip) -_check_add_dev $TID $? -metadata_size=$(_get_metadata_size "$dev_id" metadata_size) -if [ "$metadata_size" != 64 ]; then - echo "metadata_size $metadata_size != 64" - _show_result $TID 255 -fi -pi_offset=$(_get_metadata_size "$dev_id" pi_offset) -if [ "$pi_offset" != 56 ]; then - echo "pi_offset $pi_offset != 56" - _show_result $TID 255 -fi -pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size) -if [ "$pi_tuple_size" != 8 ]; then - echo "pi_tuple_size $pi_tuple_size != 8" - _show_result $TID 255 -fi -capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable") -if [ "$capable" != 1 ]; then - echo "device_is_integrity_capable $capable != 1" - _show_result $TID 255 -fi -format=$(cat "/sys/block/ublkb$dev_id/integrity/format") -if [ "$format" != T10-DIF-TYPE3-IP ]; then - echo "format $format != T10-DIF-TYPE3-IP" - _show_result $TID 255 -fi -protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes") -if [ "$protection_interval_bytes" != 512 ]; then - echo "protection_interval_bytes $protection_interval_bytes != 512" - _show_result $TID 255 -fi -tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size") -if [ "$tag_size" != 0 ]; then - echo "tag_size $tag_size != 0" - _show_result $TID 255 -fi -_cleanup_test +_test_metadata_only() { + local dev_id -dev_id=$(_add_ublk_dev -t null -u --integrity_reftag --metadata_size 8 --csum_type t10dif) -_check_add_dev $TID $? -metadata_size=$(_get_metadata_size "$dev_id" metadata_size) -if [ "$metadata_size" != 8 ]; then - echo "metadata_size $metadata_size != 8" - _show_result $TID 255 -fi -pi_offset=$(_get_metadata_size "$dev_id" pi_offset) -if [ "$pi_offset" != 0 ]; then - echo "pi_offset $pi_offset != 0" - _show_result $TID 255 -fi -pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size) -if [ "$pi_tuple_size" != 8 ]; then - echo "pi_tuple_size $pi_tuple_size != 8" - _show_result $TID 255 -fi -capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable") -if [ "$capable" != 0 ]; then - echo "device_is_integrity_capable $capable != 0" - _show_result $TID 255 -fi -format=$(cat "/sys/block/ublkb$dev_id/integrity/format") -if [ "$format" != T10-DIF-TYPE1-CRC ]; then - echo "format $format != T10-DIF-TYPE1-CRC" - _show_result $TID 255 -fi -protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes") -if [ "$protection_interval_bytes" != 512 ]; then - echo "protection_interval_bytes $protection_interval_bytes != 512" - _show_result $TID 255 -fi -tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size") -if [ "$tag_size" != 0 ]; then - echo "tag_size $tag_size != 0" - _show_result $TID 255 -fi -_cleanup_test + dev_id=$(_add_ublk_dev -t null -u --metadata_size 8) + _check_add_dev "$TID" $? -dev_id=$(_add_ublk_dev -t null -u --metadata_size 16 --csum_type nvme --tag_size 8) -_check_add_dev $TID $? -metadata_size=$(_get_metadata_size "$dev_id" metadata_size) -if [ "$metadata_size" != 16 ]; then - echo "metadata_size $metadata_size != 16" - _show_result $TID 255 -fi -pi_offset=$(_get_metadata_size "$dev_id" pi_offset) -if [ "$pi_offset" != 0 ]; then - echo "pi_offset $pi_offset != 0" - _show_result $TID 255 -fi -pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size) -if [ "$pi_tuple_size" != 16 ]; then - echo "pi_tuple_size $pi_tuple_size != 16" - _show_result $TID 255 -fi -capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable") -if [ "$capable" != 0 ]; then - echo "device_is_integrity_capable $capable != 0" - _show_result $TID 255 -fi -format=$(cat "/sys/block/ublkb$dev_id/integrity/format") -if [ "$format" != EXT-DIF-TYPE3-CRC64 ]; then - echo "format $format != EXT-DIF-TYPE3-CRC64" - _show_result $TID 255 -fi -protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes") -if [ "$protection_interval_bytes" != 512 ]; then - echo "protection_interval_bytes $protection_interval_bytes != 512" - _show_result $TID 255 -fi -tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size") -if [ "$tag_size" != 8 ]; then - echo "tag_size $tag_size != 8" - _show_result $TID 255 -fi -_cleanup_test + _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 && + _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 && + _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 0 && + _check_value "device_is_integrity_capable" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 && + _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" nop && + _check_value "protection_interval_bytes" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && + _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 + + ${UBLK_PROG} del -n "${dev_id}" +} + +_test_integrity_capable_ip() { + local dev_id + + dev_id=$(_add_ublk_dev -t null -u --integrity_capable --metadata_size 64 --pi_offset 56 --csum_type ip) + _check_add_dev "$TID" $? + + _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 64 && + _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 56 && + _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 8 && + _check_value "device_is_integrity_capable" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 1 && + _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" T10-DIF-TYPE3-IP && + _check_value "protection_interval_bytes" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && + _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 + + ${UBLK_PROG} del -n "${dev_id}" +} + +_test_integrity_reftag_t10dif() { + local dev_id -_show_result $TID 0 + dev_id=$(_add_ublk_dev -t null -u --integrity_reftag --metadata_size 8 --csum_type t10dif) + _check_add_dev "$TID" $? + + _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 && + _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 && + _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 8 && + _check_value "device_is_integrity_capable" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 && + _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" T10-DIF-TYPE1-CRC && + _check_value "protection_interval_bytes" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && + _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 + + ${UBLK_PROG} del -n "${dev_id}" +} + +_test_nvme_csum() { + local dev_id + + dev_id=$(_add_ublk_dev -t null -u --metadata_size 16 --csum_type nvme --tag_size 8) + _check_add_dev "$TID" $? + + _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 16 && + _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 && + _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 16 && + _check_value "device_is_integrity_capable" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 && + _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" EXT-DIF-TYPE3-CRC64 && + _check_value "protection_interval_bytes" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && + _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 8 + + ${UBLK_PROG} del -n "${dev_id}" +} + +_prep_test "null" "integrity params" + +_test_metadata_only +_test_integrity_capable_ip +_test_integrity_reftag_t10dif +_test_nvme_csum + +_cleanup_test +_show_result "$TID" $ERR_CODE -- cgit v1.2.3 From 76334de7da404c385e18efb3640ed60ca77a899f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 30 Jan 2026 00:19:56 +0800 Subject: selftests: ublk: disable partition scan for integrity tests The null target doesn't handle IO, so disable partition scan to avoid IO failures caused by integrity verification during the kernel's partition table read. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_null_04.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/test_null_04.sh b/tools/testing/selftests/ublk/test_null_04.sh index 22328e0f3925..a5599d38583a 100755 --- a/tools/testing/selftests/ublk/test_null_04.sh +++ b/tools/testing/selftests/ublk/test_null_04.sh @@ -21,7 +21,7 @@ _check_value() { _test_metadata_only() { local dev_id - dev_id=$(_add_ublk_dev -t null -u --metadata_size 8) + dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --metadata_size 8) _check_add_dev "$TID" $? _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 && @@ -40,7 +40,7 @@ _test_metadata_only() { _test_integrity_capable_ip() { local dev_id - dev_id=$(_add_ublk_dev -t null -u --integrity_capable --metadata_size 64 --pi_offset 56 --csum_type ip) + dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --integrity_capable --metadata_size 64 --pi_offset 56 --csum_type ip) _check_add_dev "$TID" $? _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 64 && @@ -59,7 +59,7 @@ _test_integrity_capable_ip() { _test_integrity_reftag_t10dif() { local dev_id - dev_id=$(_add_ublk_dev -t null -u --integrity_reftag --metadata_size 8 --csum_type t10dif) + dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --integrity_reftag --metadata_size 8 --csum_type t10dif) _check_add_dev "$TID" $? _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 && @@ -78,7 +78,7 @@ _test_integrity_reftag_t10dif() { _test_nvme_csum() { local dev_id - dev_id=$(_add_ublk_dev -t null -u --metadata_size 16 --csum_type nvme --tag_size 8) + dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --metadata_size 16 --csum_type nvme --tag_size 8) _check_add_dev "$TID" $? _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 16 && -- cgit v1.2.3 From 4e0d293af9e37c735aec574c1e69ed71f81f94b2 Mon Sep 17 00:00:00 2001 From: Alexander Atanasov Date: Fri, 30 Jan 2026 00:19:57 +0800 Subject: selftests: ublk: mark each test start and end time in dmesg Log test start and end time in dmesg, so generated log messages during the test run can be linked to specific test from the test suite. (switch to `date +%F %T`) Signed-off-by: Alexander Atanasov Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_common.sh | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index bbe031c94a29..dd4eff97610a 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -126,6 +126,7 @@ _prep_test() { modprobe ublk_drv > /dev/null 2>&1 UBLK_TMP=$(mktemp ublk_test_XXXXX) [ "$UBLK_TEST_QUIET" -eq 0 ] && echo "ublk $type: $*" + echo "ublk selftest: $TID starting at $(date '+%F %T')" | tee /dev/kmsg } _remove_test_files() @@ -170,6 +171,7 @@ _cleanup_test() { "${UBLK_PROG}" del -a _remove_files + echo "ublk selftest: $TID done at $(date '+%F %T')" | tee /dev/kmsg } _have_feature() -- cgit v1.2.3 From 2feca79ef8df5505b87c00812b9ba263b92c64ed Mon Sep 17 00:00:00 2001 From: Alexander Atanasov Date: Fri, 30 Jan 2026 00:19:58 +0800 Subject: selftests: ublk: move test temp files into a sub directory Create and use a temporary directory for the files created during test runs. If TMPDIR environment variable is set use it as a base for the temporary directory path. TMPDIR=/mnt/scratch make run_tests and TMPDIR=/mnt/scratch ./test_generic_01.sh will place test directory under /mnt/scratch Signed-off-by: Alexander Atanasov Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_common.sh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index dd4eff97610a..21ba51fcc7d7 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -48,7 +48,7 @@ _create_backfile() { old_file="${UBLK_BACKFILES[$index]}" [ -f "$old_file" ] && rm -f "$old_file" - new_file=$(mktemp ublk_file_"${new_size}"_XXXXX) + new_file=$(mktemp ${UBLK_TEST_DIR}/ublk_file_"${new_size}"_XXXXX) truncate -s "${new_size}" "${new_file}" UBLK_BACKFILES["$index"]="$new_file" } @@ -65,7 +65,7 @@ _remove_files() { _create_tmp_dir() { local my_file; - my_file=$(mktemp -d ublk_dir_XXXXX) + my_file=$(mktemp -d ${UBLK_TEST_DIR}/ublk_dir_XXXXX) echo "$my_file" } @@ -124,7 +124,9 @@ _prep_test() { local type=$1 shift 1 modprobe ublk_drv > /dev/null 2>&1 - UBLK_TMP=$(mktemp ublk_test_XXXXX) + TDIR=$(mktemp -d ${TMPDIR:-.}/ublktest-dir.XXXXXX) + export UBLK_TEST_DIR=${TDIR} + UBLK_TMP=$(mktemp ${UBLK_TEST_DIR}/ublk_test_XXXXX) [ "$UBLK_TEST_QUIET" -eq 0 ] && echo "ublk $type: $*" echo "ublk selftest: $TID starting at $(date '+%F %T')" | tee /dev/kmsg } @@ -171,6 +173,7 @@ _cleanup_test() { "${UBLK_PROG}" del -a _remove_files + rmdir ${UBLK_TEST_DIR} echo "ublk selftest: $TID done at $(date '+%F %T')" | tee /dev/kmsg } @@ -405,6 +408,8 @@ UBLK_PROG=$(_ublk_test_top_dir)/kublk UBLK_TEST_QUIET=1 UBLK_TEST_SHOW_RESULT=1 UBLK_BACKFILES=() +UBLK_TEST_DIR=${TMPDIR:-.} export UBLK_PROG export UBLK_TEST_QUIET export UBLK_TEST_SHOW_RESULT +export UBLK_TEST_DIR -- cgit v1.2.3 From f0b5b3d6b56f8717e255406366d81bbcd3631660 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Sat, 31 Jan 2026 17:09:02 +0100 Subject: selftests/bpf: Test access from RO map from xdp_store_bytes This new test simply checks that helper bpf_xdp_store_bytes can successfully read from a read-only map. Signed-off-by: Paul Chaignon Link: https://lore.kernel.org/r/4fdb934a713b2d7cf133288c77f6cfefe9856440.1769875479.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_xdp.c | 35 ++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/progs/verifier_xdp.c b/tools/testing/selftests/bpf/progs/verifier_xdp.c index 50768ed179b3..7dc9226aeb34 100644 --- a/tools/testing/selftests/bpf/progs/verifier_xdp.c +++ b/tools/testing/selftests/bpf/progs/verifier_xdp.c @@ -5,6 +5,14 @@ #include #include "bpf_misc.h" +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, __u64); + __uint(map_flags, BPF_F_RDONLY_PROG); +} map_array_ro SEC(".maps"); + SEC("xdp") __description("XDP, using ifindex from netdev") __success __retval(1) @@ -21,4 +29,31 @@ l0_%=: exit; \ : __clobber_all); } +SEC("xdp") +__description("XDP, using xdp_store_bytes from RO map") +__success __retval(0) +__naked void xdp_store_bytes_from_ro_map(void) +{ + asm volatile (" \ + r6 = r1; \ + r1 = 0; \ + *(u64*)(r10 - 8) = r1; \ + r2 = r10; \ + r2 += -8; \ + r1 = %[map_array_ro] ll; \ + call %[bpf_map_lookup_elem]; \ + if r0 == 0 goto l0_%=; \ + r1 = r6; \ + r2 = 0; \ + r3 = r0; \ + r4 = 8; \ + call %[bpf_xdp_store_bytes]; \ +l0_%=: exit; \ +" : + : __imm(bpf_map_lookup_elem), + __imm(bpf_xdp_store_bytes), + __imm_addr(map_array_ro) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 8798902f2b8bcae6f90229a1a1496b48ddda2972 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Sat, 31 Jan 2026 22:49:48 +0800 Subject: bpf: Add bpf_jit_supports_fsession() The added fsession does not prevent running on those architectures, that haven't added fsession support. For example, try to run fsession tests on arm64: test_fsession_basic:PASS:fsession_test__open_and_load 0 nsec test_fsession_basic:PASS:fsession_attach 0 nsec check_result:FAIL:test_run_opts err unexpected error: -14 (errno 14) In order to prevent such errors, add bpf_jit_supports_fsession() to guard those architectures. Fixes: 2d419c44658f ("bpf: add fsession support") Acked-by: Puranjay Mohan Tested-by: Puranjay Mohan Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260131144950.16294-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 5 ++++ include/linux/filter.h | 1 + kernel/bpf/core.c | 5 ++++ kernel/bpf/verifier.c | 5 ++++ .../selftests/bpf/prog_tests/fsession_test.c | 32 ++++++++++++++++------ 5 files changed, 40 insertions(+), 8 deletions(-) (limited to 'tools/testing') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 5a075e06cf45..070ba80e39d7 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -4112,3 +4112,8 @@ bool bpf_jit_supports_timed_may_goto(void) { return true; } + +bool bpf_jit_supports_fsession(void) +{ + return true; +} diff --git a/include/linux/filter.h b/include/linux/filter.h index fd54fed8f95f..4e1cb4f91f49 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1167,6 +1167,7 @@ bool bpf_jit_supports_arena(void); bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena); bool bpf_jit_supports_private_stack(void); bool bpf_jit_supports_timed_may_goto(void); +bool bpf_jit_supports_fsession(void); u64 bpf_arch_uaddress_limit(void); void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); u64 arch_bpf_timed_may_goto(void); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5ebece600aeb..dc906dfdff94 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3144,6 +3144,11 @@ bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) return false; } +bool __weak bpf_jit_supports_fsession(void) +{ + return false; +} + u64 __weak bpf_arch_uaddress_limit(void) { #if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 256cc5c1a7df..6b62b6d57175 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -24828,6 +24828,11 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_TRACE_FSESSION: + if (prog->expected_attach_type == BPF_TRACE_FSESSION && + !bpf_jit_supports_fsession()) { + bpf_log(log, "JIT does not support fsession\n"); + return -EOPNOTSUPP; + } if (!btf_type_is_func(t)) { bpf_log(log, "attach_btf_id %u is not a function\n", btf_id); diff --git a/tools/testing/selftests/bpf/prog_tests/fsession_test.c b/tools/testing/selftests/bpf/prog_tests/fsession_test.c index 0c4b428e1cee..a299aeb8cc2e 100644 --- a/tools/testing/selftests/bpf/prog_tests/fsession_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fsession_test.c @@ -29,8 +29,16 @@ static void test_fsession_basic(void) struct fsession_test *skel = NULL; int err; - skel = fsession_test__open_and_load(); - if (!ASSERT_OK_PTR(skel, "fsession_test__open_and_load")) + skel = fsession_test__open(); + if (!ASSERT_OK_PTR(skel, "fsession_test__open")) + return; + + err = fsession_test__load(skel); + if (err == -EOPNOTSUPP) { + test__skip(); + goto cleanup; + } + if (!ASSERT_OK(err, "fsession_test__load")) goto cleanup; err = fsession_test__attach(skel); @@ -47,8 +55,16 @@ static void test_fsession_reattach(void) struct fsession_test *skel = NULL; int err; - skel = fsession_test__open_and_load(); - if (!ASSERT_OK_PTR(skel, "fsession_test__open_and_load")) + skel = fsession_test__open(); + if (!ASSERT_OK_PTR(skel, "fsession_test__open")) + return; + + err = fsession_test__load(skel); + if (err == -EOPNOTSUPP) { + test__skip(); + goto cleanup; + } + if (!ASSERT_OK(err, "fsession_test__load")) goto cleanup; /* first attach */ @@ -94,6 +110,10 @@ static void test_fsession_cookie(void) bpf_program__set_autoload(skel->progs.test6, false); err = fsession_test__load(skel); + if (err == -EOPNOTSUPP) { + test__skip(); + goto cleanup; + } if (!ASSERT_OK(err, "fsession_test__load")) goto cleanup; @@ -111,10 +131,6 @@ cleanup: void test_fsession_test(void) { -#if !defined(__x86_64__) - test__skip(); - return; -#endif if (test__start_subtest("fsession_test")) test_fsession_basic(); if (test__start_subtest("fsession_reattach")) -- cgit v1.2.3 From 7f10da2133b18b0f1bc02d58671883537e212279 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Sat, 31 Jan 2026 22:49:50 +0800 Subject: selftests/bpf: Enable get_func_args and get_func_ip tests on arm64 Allow get_func_args, and get_func_ip fsession selftests to run on arm64. Acked-by: Puranjay Mohan Tested-by: Puranjay Mohan Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260131144950.16294-4-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/get_func_args_test.c | 2 +- tools/testing/selftests/bpf/progs/get_func_ip_test.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/bpf/progs/get_func_args_test.c b/tools/testing/selftests/bpf/progs/get_func_args_test.c index 0a3236a7a109..180ba5098ca1 100644 --- a/tools/testing/selftests/bpf/progs/get_func_args_test.c +++ b/tools/testing/selftests/bpf/progs/get_func_args_test.c @@ -167,7 +167,7 @@ int BPF_PROG(tp_test2) } __u64 test7_result = 0; -#ifdef __TARGET_ARCH_x86 +#if defined(bpf_target_x86) || defined(bpf_target_arm64) SEC("fsession/bpf_fentry_test1") int BPF_PROG(test7) { diff --git a/tools/testing/selftests/bpf/progs/get_func_ip_test.c b/tools/testing/selftests/bpf/progs/get_func_ip_test.c index 65f7e1f182bf..43ff836a8ed8 100644 --- a/tools/testing/selftests/bpf/progs/get_func_ip_test.c +++ b/tools/testing/selftests/bpf/progs/get_func_ip_test.c @@ -106,7 +106,7 @@ int BPF_URETPROBE(test8, int ret) __u64 test9_entry_result = 0; __u64 test9_exit_result = 0; -#ifdef __TARGET_ARCH_x86 +#if defined(bpf_target_x86) || defined(bpf_target_arm64) SEC("fsession/bpf_fentry_test1") int BPF_PROG(test9, int a) { -- cgit v1.2.3 From 5af302a15a1d628a025a78892001fe8afea90c60 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Feb 2026 00:23:32 +0800 Subject: selftests: ublk: simplify UBLK_TEST_DIR handling Remove intermediate TDIR variable and set UBLK_TEST_DIR directly in _prep_test(). Remove default initialization since the directory is created dynamically when tests run. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_common.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index 21ba51fcc7d7..8d298a7ee7b1 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -124,8 +124,7 @@ _prep_test() { local type=$1 shift 1 modprobe ublk_drv > /dev/null 2>&1 - TDIR=$(mktemp -d ${TMPDIR:-.}/ublktest-dir.XXXXXX) - export UBLK_TEST_DIR=${TDIR} + UBLK_TEST_DIR=$(mktemp -d ${TMPDIR:-.}/ublktest-dir.XXXXXX) UBLK_TMP=$(mktemp ${UBLK_TEST_DIR}/ublk_test_XXXXX) [ "$UBLK_TEST_QUIET" -eq 0 ] && echo "ublk $type: $*" echo "ublk selftest: $TID starting at $(date '+%F %T')" | tee /dev/kmsg @@ -408,8 +407,6 @@ UBLK_PROG=$(_ublk_test_top_dir)/kublk UBLK_TEST_QUIET=1 UBLK_TEST_SHOW_RESULT=1 UBLK_BACKFILES=() -UBLK_TEST_DIR=${TMPDIR:-.} export UBLK_PROG export UBLK_TEST_QUIET export UBLK_TEST_SHOW_RESULT -export UBLK_TEST_DIR -- cgit v1.2.3 From 842b6520e579b8bd7d6ea09937e1fb7729cce1c5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Feb 2026 00:23:33 +0800 Subject: selftests: ublk: refactor test_loop_08 into separate functions Encapsulate each test case in its own function for better organization and maintainability: - _setup_device(): device and backfile initialization - _test_fill_and_verify(): initial data population - _test_corrupted_reftag(): reftag corruption detection test - _test_corrupted_data(): data corruption detection test - _test_bad_apptag(): apptag mismatch detection test Also fix temp file creation to use ${UBLK_TEST_DIR}/fio_err_XXXXX instead of creating in current directory. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_loop_08.sh | 199 ++++++++++++++++----------- 1 file changed, 115 insertions(+), 84 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/test_loop_08.sh b/tools/testing/selftests/ublk/test_loop_08.sh index 2caa7ba748fb..aaf1f52da559 100755 --- a/tools/testing/selftests/ublk/test_loop_08.sh +++ b/tools/testing/selftests/ublk/test_loop_08.sh @@ -13,98 +13,129 @@ if [[ "$fio_version" =~ fio-[0-9]+\.[0-9]+$ ]]; then exit $UBLK_SKIP_CODE fi +ERR_CODE=0 -_prep_test "loop" "end-to-end integrity" +# Global variables set during device setup +dev_id="" +fio_args="" +fio_err="" -_create_backfile 0 256M -_create_backfile 1 32M # 256M * (64 integrity bytes / 512 data bytes) -integrity_params="--integrity_capable --integrity_reftag - --metadata_size 64 --pi_offset 56 --csum_type t10dif" -dev_id=$(_add_ublk_dev -t loop -u $integrity_params "${UBLK_BACKFILES[@]}") -_check_add_dev $TID $? - -# 1M * (64 integrity bytes / 512 data bytes) = 128K -fio_args="--ioengine io_uring --direct 1 --bsrange 512-1M --iodepth 32 - --md_per_io_size 128K --pi_act 0 --pi_chk GUARD,REFTAG,APPTAG - --filename /dev/ublkb$dev_id" -fio --name fill --rw randwrite $fio_args > /dev/null -err=$? -if [ $err != 0 ]; then - echo "fio fill failed" - _show_result $TID $err -fi +_setup_device() { + _create_backfile 0 256M + _create_backfile 1 32M # 256M * (64 integrity bytes / 512 data bytes) -fio --name verify --rw randread $fio_args > /dev/null -err=$? -if [ $err != 0 ]; then - echo "fio verify failed" - _show_result $TID $err -fi + local integrity_params="--integrity_capable --integrity_reftag + --metadata_size 64 --pi_offset 56 --csum_type t10dif" + dev_id=$(_add_ublk_dev -t loop -u $integrity_params "${UBLK_BACKFILES[@]}") + _check_add_dev "$TID" $? -fio_err=$(mktemp fio_err_XXXXX) + # 1M * (64 integrity bytes / 512 data bytes) = 128K + fio_args="--ioengine io_uring --direct 1 --bsrange 512-1M --iodepth 32 + --md_per_io_size 128K --pi_act 0 --pi_chk GUARD,REFTAG,APPTAG + --filename /dev/ublkb$dev_id" -# Overwrite 4-byte reftag at offset 56 + 4 = 60 -dd_reftag_args="bs=1 seek=60 count=4 oflag=dsync conv=notrunc status=none" -dd if=/dev/urandom "of=${UBLK_BACKFILES[1]}" $dd_reftag_args -err=$? -if [ $err != 0 ]; then - echo "dd corrupted_reftag failed" - rm -f "$fio_err" - _show_result $TID $err -fi -if fio --name corrupted_reftag --rw randread $fio_args > /dev/null 2> "$fio_err"; then - echo "fio corrupted_reftag unexpectedly succeeded" - rm -f "$fio_err" - _show_result $TID 255 -fi -expected_err="REFTAG compare error: LBA: 0 Expected=0, Actual=" -if ! grep -q "$expected_err" "$fio_err"; then - echo "fio corrupted_reftag message not found: $expected_err" - rm -f "$fio_err" - _show_result $TID 255 -fi -# Reset to 0 -dd if=/dev/zero "of=${UBLK_BACKFILES[1]}" $dd_reftag_args -err=$? -if [ $err != 0 ]; then - echo "dd restore corrupted_reftag failed" - rm -f "$fio_err" - _show_result $TID $err -fi + fio_err=$(mktemp "${UBLK_TEST_DIR}"/fio_err_XXXXX) +} -dd_data_args="bs=512 count=1 oflag=direct,dsync conv=notrunc status=none" -dd if=/dev/zero "of=${UBLK_BACKFILES[0]}" $dd_data_args -err=$? -if [ $err != 0 ]; then - echo "dd corrupted_data failed" - rm -f "$fio_err" - _show_result $TID $err -fi -if fio --name corrupted_data --rw randread $fio_args > /dev/null 2> "$fio_err"; then - echo "fio corrupted_data unexpectedly succeeded" - rm -f "$fio_err" - _show_result $TID 255 -fi -expected_err="Guard compare error: LBA: 0 Expected=0, Actual=" -if ! grep -q "$expected_err" "$fio_err"; then - echo "fio corrupted_data message not found: $expected_err" - rm -f "$fio_err" - _show_result $TID 255 -fi +_test_fill_and_verify() { + fio --name fill --rw randwrite $fio_args > /dev/null + if [ $? != 0 ]; then + echo "fio fill failed" + ERR_CODE=255 + return 1 + fi -if fio --name bad_apptag --rw randread $fio_args --apptag 0x4321 > /dev/null 2> "$fio_err"; then - echo "fio bad_apptag unexpectedly succeeded" - rm -f "$fio_err" - _show_result $TID 255 -fi -expected_err="APPTAG compare error: LBA: [0-9]* Expected=4321, Actual=1234" -if ! grep -q "$expected_err" "$fio_err"; then - echo "fio bad_apptag message not found: $expected_err" - rm -f "$fio_err" - _show_result $TID 255 -fi + fio --name verify --rw randread $fio_args > /dev/null + if [ $? != 0 ]; then + echo "fio verify failed" + ERR_CODE=255 + return 1 + fi +} + +_test_corrupted_reftag() { + local dd_reftag_args="bs=1 seek=60 count=4 oflag=dsync conv=notrunc status=none" + local expected_err="REFTAG compare error: LBA: 0 Expected=0, Actual=" + + # Overwrite 4-byte reftag at offset 56 + 4 = 60 + dd if=/dev/urandom "of=${UBLK_BACKFILES[1]}" $dd_reftag_args + if [ $? != 0 ]; then + echo "dd corrupted_reftag failed" + ERR_CODE=255 + return 1 + fi + + if fio --name corrupted_reftag --rw randread $fio_args > /dev/null 2> "$fio_err"; then + echo "fio corrupted_reftag unexpectedly succeeded" + ERR_CODE=255 + return 1 + fi + + if ! grep -q "$expected_err" "$fio_err"; then + echo "fio corrupted_reftag message not found: $expected_err" + ERR_CODE=255 + return 1 + fi + + # Reset to 0 + dd if=/dev/zero "of=${UBLK_BACKFILES[1]}" $dd_reftag_args + if [ $? != 0 ]; then + echo "dd restore corrupted_reftag failed" + ERR_CODE=255 + return 1 + fi +} + +_test_corrupted_data() { + local dd_data_args="bs=512 count=1 oflag=direct,dsync conv=notrunc status=none" + local expected_err="Guard compare error: LBA: 0 Expected=0, Actual=" + + dd if=/dev/zero "of=${UBLK_BACKFILES[0]}" $dd_data_args + if [ $? != 0 ]; then + echo "dd corrupted_data failed" + ERR_CODE=255 + return 1 + fi + + if fio --name corrupted_data --rw randread $fio_args > /dev/null 2> "$fio_err"; then + echo "fio corrupted_data unexpectedly succeeded" + ERR_CODE=255 + return 1 + fi + + if ! grep -q "$expected_err" "$fio_err"; then + echo "fio corrupted_data message not found: $expected_err" + ERR_CODE=255 + return 1 + fi +} + +_test_bad_apptag() { + local expected_err="APPTAG compare error: LBA: [0-9]* Expected=4321, Actual=1234" + + if fio --name bad_apptag --rw randread $fio_args --apptag 0x4321 > /dev/null 2> "$fio_err"; then + echo "fio bad_apptag unexpectedly succeeded" + ERR_CODE=255 + return 1 + fi + + if ! grep -q "$expected_err" "$fio_err"; then + echo "fio bad_apptag message not found: $expected_err" + ERR_CODE=255 + return 1 + fi +} + +_prep_test "loop" "end-to-end integrity" + +_setup_device + +_test_fill_and_verify && \ +_test_corrupted_reftag && \ +_test_corrupted_data && \ +_test_bad_apptag rm -f "$fio_err" _cleanup_test -_show_result $TID 0 +_show_result "$TID" $ERR_CODE -- cgit v1.2.3 From 92734a4f3a7a5449b0c7d0160ba658a2b665c31b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Feb 2026 00:23:34 +0800 Subject: selftests: ublk: add _ublk_del_dev helper function Add _ublk_del_dev() to delete a specific ublk device by ID and use it in all test scripts instead of calling UBLK_PROG directly. Also remove unused _remove_ublk_devices() function. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_common.sh | 13 +++++++------ tools/testing/selftests/ublk/test_generic_16.sh | 4 ++-- tools/testing/selftests/ublk/test_null_04.sh | 8 ++++---- tools/testing/selftests/ublk/test_part_02.sh | 4 ++-- 4 files changed, 15 insertions(+), 14 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index 8d298a7ee7b1..0f1fdb0892b4 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -106,11 +106,6 @@ _check_root() { fi } -_remove_ublk_devices() { - ${UBLK_PROG} del -a - modprobe -r ublk_drv > /dev/null 2>&1 -} - _get_ublk_dev_state() { ${UBLK_PROG} list -n "$1" | grep "state" | awk '{print $11}' } @@ -277,10 +272,16 @@ __ublk_kill_daemon() echo "$state" } -__remove_ublk_dev_return() { +_ublk_del_dev() { local dev_id=$1 ${UBLK_PROG} del -n "${dev_id}" +} + +__remove_ublk_dev_return() { + local dev_id=$1 + + _ublk_del_dev "${dev_id}" local res=$? udevadm settle return ${res} diff --git a/tools/testing/selftests/ublk/test_generic_16.sh b/tools/testing/selftests/ublk/test_generic_16.sh index 42e8d2e16ec9..3ef367836ac5 100755 --- a/tools/testing/selftests/ublk/test_generic_16.sh +++ b/tools/testing/selftests/ublk/test_generic_16.sh @@ -24,7 +24,7 @@ if ! ${UBLK_PROG} stop -n "${dev_id}" --safe; then fi # Clean up device -${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1 +_ublk_del_dev "${dev_id}" > /dev/null 2>&1 udevadm settle # Test 2: stop --safe on device with active opener should fail @@ -49,7 +49,7 @@ kill $dd_pid 2>/dev/null wait $dd_pid 2>/dev/null # Now device should be idle, regular delete should work -${UBLK_PROG} del -n "${dev_id}" +_ublk_del_dev "${dev_id}" udevadm settle _cleanup_test "null" diff --git a/tools/testing/selftests/ublk/test_null_04.sh b/tools/testing/selftests/ublk/test_null_04.sh index a5599d38583a..6713b280a6ff 100755 --- a/tools/testing/selftests/ublk/test_null_04.sh +++ b/tools/testing/selftests/ublk/test_null_04.sh @@ -34,7 +34,7 @@ _test_metadata_only() { "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 - ${UBLK_PROG} del -n "${dev_id}" + _ublk_del_dev "${dev_id}" } _test_integrity_capable_ip() { @@ -53,7 +53,7 @@ _test_integrity_capable_ip() { "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 - ${UBLK_PROG} del -n "${dev_id}" + _ublk_del_dev "${dev_id}" } _test_integrity_reftag_t10dif() { @@ -72,7 +72,7 @@ _test_integrity_reftag_t10dif() { "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 - ${UBLK_PROG} del -n "${dev_id}" + _ublk_del_dev "${dev_id}" } _test_nvme_csum() { @@ -91,7 +91,7 @@ _test_nvme_csum() { "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 8 - ${UBLK_PROG} del -n "${dev_id}" + _ublk_del_dev "${dev_id}" } _prep_test "null" "integrity params" diff --git a/tools/testing/selftests/ublk/test_part_02.sh b/tools/testing/selftests/ublk/test_part_02.sh index 727d0f4610d6..acd098deda3a 100755 --- a/tools/testing/selftests/ublk/test_part_02.sh +++ b/tools/testing/selftests/ublk/test_part_02.sh @@ -46,13 +46,13 @@ _test_partition_scan_no_hang() if [ "$state" != "${expected_state}" ]; then echo "FAIL: Device state is $state, expected ${expected_state}" ERR_CODE=255 - ${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1 + _ublk_del_dev "${dev_id}" > /dev/null 2>&1 return fi echo "PASS: Device transitioned to ${expected_state} in ${elapsed}s without hanging" # Clean up the device - ${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1 + _ublk_del_dev "${dev_id}" > /dev/null 2>&1 } _prep_test "partition_scan" "verify async partition scan prevents IO hang" -- cgit v1.2.3 From 2021e6109de3e97adfce262c40a657ff206ef495 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Feb 2026 00:23:35 +0800 Subject: selftests: ublk: track created devices for per-test cleanup Track device IDs in UBLK_DEVS array when created. Update _cleanup_test() to only delete devices created by this test instead of using 'del -a' which removes all devices. This prepares for running tests concurrently where each test should only clean up its own devices. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_common.sh | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index 0f1fdb0892b4..422882c32490 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -164,7 +164,12 @@ _check_add_dev() } _cleanup_test() { - "${UBLK_PROG}" del -a + if [ -f "${UBLK_TEST_DIR}/.ublk_devs" ]; then + while read -r dev_id; do + ${UBLK_PROG} del -n "${dev_id}" + done < "${UBLK_TEST_DIR}/.ublk_devs" + rm -f "${UBLK_TEST_DIR}/.ublk_devs" + fi _remove_files rmdir ${UBLK_TEST_DIR} @@ -205,6 +210,7 @@ _create_ublk_dev() { fi if [[ "$dev_id" =~ ^[0-9]+$ ]]; then + echo "$dev_id" >> "${UBLK_TEST_DIR}/.ublk_devs" echo "${dev_id}" else return 255 @@ -276,6 +282,11 @@ _ublk_del_dev() { local dev_id=$1 ${UBLK_PROG} del -n "${dev_id}" + + # Remove from tracking file + if [ -f "${UBLK_TEST_DIR}/.ublk_devs" ]; then + sed -i "/^${dev_id}$/d" "${UBLK_TEST_DIR}/.ublk_devs" + fi } __remove_ublk_dev_return() { -- cgit v1.2.3 From b6bbc3bec19efd557f888d78865b627b80b37a32 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Feb 2026 00:23:36 +0800 Subject: selftests: ublk: add group-based test targets Add convenient Makefile targets for running specific test groups: - run_generic, run_batch, run_null, run_loop, run_stripe, run_stress, etc. - run_all for running all tests Test groups are auto-detected from TEST_PROGS using pattern matching (test__.sh -> group), and targets are generated dynamically using define/eval templates. Supports parallel execution via JOBS variable: - JOBS=1 (default): sequential with kselftest TAP output - JOBS>1: parallel execution with xargs -P Usage examples: make run_null # Sequential execution make run_stress JOBS=4 # Parallel with 4 jobs make run_all JOBS=8 # Run all tests with 8 parallel jobs With JOBS=8, running time of `make run_all` is reduced to 2m2s from 6m5s in my test VM. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 36 +++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index ca8588ed962c..37e012d3a8a7 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -72,3 +72,39 @@ $(OUTPUT)/kublk: $(filter-out $(STANDALONE_UTILS),$(wildcard *.c)) check: shellcheck -x -f gcc *.sh + +# Test groups for running subsets of tests +# JOBS=1 (default): sequential with kselftest TAP output +# JOBS>1: parallel execution with xargs -P +# Usage: make run_null JOBS=4 +JOBS ?= 1 + +# Auto-detect test groups from TEST_PROGS (test__.sh -> group) +TEST_GROUPS := $(shell echo "$(TEST_PROGS)" | tr ' ' '\n' | \ + sed 's/test_\([^_]*\)_.*/\1/' | sort -u) + +# Template for group test targets +# $(1) = group name (e.g., null, generic, stress) +define RUN_GROUP +run_$(1): all + @if [ $$(JOBS) -gt 1 ]; then \ + echo $$(filter test_$(1)_%.sh,$$(TEST_PROGS)) | tr ' ' '\n' | \ + xargs -P $$(JOBS) -n1 sh -c './"$$$$0"' || true; \ + else \ + $$(call RUN_TESTS, $$(filter test_$(1)_%.sh,$$(TEST_PROGS))); \ + fi +.PHONY: run_$(1) +endef + +# Generate targets for each discovered test group +$(foreach group,$(TEST_GROUPS),$(eval $(call RUN_GROUP,$(group)))) + +# Run all tests (parallel when JOBS>1) +run_all: all + @if [ $(JOBS) -gt 1 ]; then \ + echo $(TEST_PROGS) | tr ' ' '\n' | \ + xargs -P $(JOBS) -n1 sh -c './"$$0"' || true; \ + else \ + $(call RUN_TESTS, $(TEST_PROGS)); \ + fi +.PHONY: run_all -- cgit v1.2.3 From 64406dd2f69fe27921c7bf06088871c002cf6186 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Feb 2026 00:23:37 +0800 Subject: selftests: ublk: add _ublk_sleep helper for parallel execution Add _ublk_sleep() helper function that uses different sleep times depending on whether tests run in parallel or sequential mode. Usage: _ublk_sleep Export JOBS variable from Makefile so test scripts can detect parallel execution, and use _ublk_sleep in test_part_02.sh to handle the partition scan delay (1s normal, 5s parallel). Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 1 + tools/testing/selftests/ublk/test_common.sh | 10 ++++++++++ tools/testing/selftests/ublk/test_part_02.sh | 2 +- 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 37e012d3a8a7..1ceae611acb7 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -78,6 +78,7 @@ check: # JOBS>1: parallel execution with xargs -P # Usage: make run_null JOBS=4 JOBS ?= 1 +export JOBS # Auto-detect test groups from TEST_PROGS (test__.sh -> group) TEST_GROUPS := $(shell echo "$(TEST_PROGS)" | tr ' ' '\n' | \ diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index 422882c32490..bd27a6875c1a 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -15,6 +15,16 @@ _have_program() { return 1 } +# Sleep with awareness of parallel execution. +# Usage: _ublk_sleep +_ublk_sleep() { + if [ "${JOBS:-1}" -gt 1 ]; then + sleep "$2" + else + sleep "$1" + fi +} + _get_disk_dev_t() { local dev_id=$1 local dev diff --git a/tools/testing/selftests/ublk/test_part_02.sh b/tools/testing/selftests/ublk/test_part_02.sh index acd098deda3a..7d42ab4d6e83 100755 --- a/tools/testing/selftests/ublk/test_part_02.sh +++ b/tools/testing/selftests/ublk/test_part_02.sh @@ -33,7 +33,7 @@ _test_partition_scan_no_hang() # The add command should return quickly because partition scan is async. # Now sleep briefly to let the async partition scan work start and hit # the delay in the fault_inject handler. - sleep 1 + _ublk_sleep 1 5 # Kill the ublk daemon while partition scan is potentially blocked # And check state transitions properly -- cgit v1.2.3 From 56a08b87f9f2a763cb5546f83b78ebe1e96260af Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Feb 2026 00:23:38 +0800 Subject: selftests: ublk: increase timeouts for parallel test execution When running tests in parallel with high JOBS count (e.g., JOBS=64), the existing timeouts can be insufficient due to system load: - Increase state wait loops from 20/50 to 100 iterations in _recover_ublk_dev(), __ublk_quiesce_dev(), and __ublk_kill_daemon() to handle slower state transitions under heavy load - Add --timeout=20 to udevadm settle calls to prevent indefinite hangs when udev event queue is overwhelmed by rapid device creation/deletion Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_common.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index bd27a6875c1a..c3afd00783a2 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -216,7 +216,7 @@ _create_ublk_dev() { fi if [ "$settle" = "yes" ]; then - udevadm settle + udevadm settle --timeout=20 fi if [[ "$dev_id" =~ ^[0-9]+$ ]]; then @@ -240,7 +240,7 @@ _recover_ublk_dev() { local state dev_id=$(_create_ublk_dev "recover" "yes" "$@") - for ((j=0;j<20;j++)); do + for ((j=0;j<100;j++)); do state=$(_get_ublk_dev_state "${dev_id}") [ "$state" == "LIVE" ] && break sleep 1 @@ -260,7 +260,7 @@ __ublk_quiesce_dev() return "$state" fi - for ((j=0;j<50;j++)); do + for ((j=0;j<100;j++)); do state=$(_get_ublk_dev_state "${dev_id}") [ "$state" == "$exp_state" ] && break sleep 1 @@ -279,7 +279,7 @@ __ublk_kill_daemon() daemon_pid=$(_get_ublk_daemon_pid "${dev_id}") state=$(_get_ublk_dev_state "${dev_id}") - for ((j=0;j<50;j++)); do + for ((j=0;j<100;j++)); do [ "$state" == "$exp_state" ] && break kill -9 "$daemon_pid" > /dev/null 2>&1 sleep 1 @@ -304,7 +304,7 @@ __remove_ublk_dev_return() { _ublk_del_dev "${dev_id}" local res=$? - udevadm settle + udevadm settle --timeout=20 return ${res} } -- cgit v1.2.3 From d9a36ab302b1c90d8f03a3b13538b8676eb6ed3b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Feb 2026 00:23:39 +0800 Subject: selftests: ublk: reorganize tests into integrity and recover groups Move integrity-focused tests into new 'integrity' group: - test_null_04.sh -> test_integrity_01.sh - test_loop_08.sh -> test_integrity_02.sh Move recovery-focused tests into new 'recover' group: - test_generic_04.sh -> test_recover_01.sh - test_generic_05.sh -> test_recover_02.sh - test_generic_11.sh -> test_recover_03.sh - test_generic_14.sh -> test_recover_04.sh Update Makefile to reflect the reorganization. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 14 ++- tools/testing/selftests/ublk/test_generic_04.sh | 44 ------- tools/testing/selftests/ublk/test_generic_05.sh | 48 -------- tools/testing/selftests/ublk/test_generic_11.sh | 43 ------- tools/testing/selftests/ublk/test_generic_14.sh | 39 ------ tools/testing/selftests/ublk/test_integrity_01.sh | 105 ++++++++++++++++ tools/testing/selftests/ublk/test_integrity_02.sh | 141 ++++++++++++++++++++++ tools/testing/selftests/ublk/test_loop_08.sh | 141 ---------------------- tools/testing/selftests/ublk/test_null_04.sh | 105 ---------------- tools/testing/selftests/ublk/test_recover_01.sh | 44 +++++++ tools/testing/selftests/ublk/test_recover_02.sh | 48 ++++++++ tools/testing/selftests/ublk/test_recover_03.sh | 43 +++++++ tools/testing/selftests/ublk/test_recover_04.sh | 39 ++++++ 13 files changed, 428 insertions(+), 426 deletions(-) delete mode 100755 tools/testing/selftests/ublk/test_generic_04.sh delete mode 100755 tools/testing/selftests/ublk/test_generic_05.sh delete mode 100755 tools/testing/selftests/ublk/test_generic_11.sh delete mode 100755 tools/testing/selftests/ublk/test_generic_14.sh create mode 100755 tools/testing/selftests/ublk/test_integrity_01.sh create mode 100755 tools/testing/selftests/ublk/test_integrity_02.sh delete mode 100755 tools/testing/selftests/ublk/test_loop_08.sh delete mode 100755 tools/testing/selftests/ublk/test_null_04.sh create mode 100755 tools/testing/selftests/ublk/test_recover_01.sh create mode 100755 tools/testing/selftests/ublk/test_recover_02.sh create mode 100755 tools/testing/selftests/ublk/test_recover_03.sh create mode 100755 tools/testing/selftests/ublk/test_recover_04.sh (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 1ceae611acb7..a62a06e13006 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -10,18 +10,14 @@ LDLIBS += -lpthread -lm -luring TEST_PROGS := test_generic_01.sh TEST_PROGS += test_generic_02.sh TEST_PROGS += test_generic_03.sh -TEST_PROGS += test_generic_04.sh -TEST_PROGS += test_generic_05.sh TEST_PROGS += test_generic_06.sh TEST_PROGS += test_generic_07.sh TEST_PROGS += test_generic_08.sh TEST_PROGS += test_generic_09.sh TEST_PROGS += test_generic_10.sh -TEST_PROGS += test_generic_11.sh TEST_PROGS += test_generic_12.sh TEST_PROGS += test_generic_13.sh -TEST_PROGS += test_generic_14.sh TEST_PROGS += test_generic_16.sh TEST_PROGS += test_batch_01.sh @@ -31,7 +27,6 @@ TEST_PROGS += test_batch_03.sh TEST_PROGS += test_null_01.sh TEST_PROGS += test_null_02.sh TEST_PROGS += test_null_03.sh -TEST_PROGS += test_null_04.sh TEST_PROGS += test_loop_01.sh TEST_PROGS += test_loop_02.sh TEST_PROGS += test_loop_03.sh @@ -39,7 +34,14 @@ TEST_PROGS += test_loop_04.sh TEST_PROGS += test_loop_05.sh TEST_PROGS += test_loop_06.sh TEST_PROGS += test_loop_07.sh -TEST_PROGS += test_loop_08.sh + +TEST_PROGS += test_integrity_01.sh +TEST_PROGS += test_integrity_02.sh + +TEST_PROGS += test_recover_01.sh +TEST_PROGS += test_recover_02.sh +TEST_PROGS += test_recover_03.sh +TEST_PROGS += test_recover_04.sh TEST_PROGS += test_stripe_01.sh TEST_PROGS += test_stripe_02.sh TEST_PROGS += test_stripe_03.sh diff --git a/tools/testing/selftests/ublk/test_generic_04.sh b/tools/testing/selftests/ublk/test_generic_04.sh deleted file mode 100755 index 2672f9c40fa8..000000000000 --- a/tools/testing/selftests/ublk/test_generic_04.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh - -ERR_CODE=0 - -ublk_run_recover_test() -{ - run_io_and_recover 256M "kill_daemon" "$@" - ERR_CODE=$? - if [ ${ERR_CODE} -ne 0 ]; then - echo "$TID failure: $*" - _show_result $TID $ERR_CODE - fi -} - -if ! _have_program fio; then - exit "$UBLK_SKIP_CODE" -fi - -_prep_test "recover" "basic recover function verification" - -_create_backfile 0 256M -_create_backfile 1 128M -_create_backfile 2 128M - -ublk_run_recover_test -t null -q 2 -r 1 -b & -ublk_run_recover_test -t loop -q 2 -r 1 -b "${UBLK_BACKFILES[0]}" & -ublk_run_recover_test -t stripe -q 2 -r 1 -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & -wait - -ublk_run_recover_test -t null -q 2 -r 1 & -ublk_run_recover_test -t loop -q 2 -r 1 "${UBLK_BACKFILES[0]}" & -ublk_run_recover_test -t stripe -q 2 -r 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & -wait - -ublk_run_recover_test -t null -q 2 -r 1 -i 1 & -ublk_run_recover_test -t loop -q 2 -r 1 -i 1 "${UBLK_BACKFILES[0]}" & -ublk_run_recover_test -t stripe -q 2 -r 1 -i 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & -wait - -_cleanup_test "recover" -_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_generic_05.sh b/tools/testing/selftests/ublk/test_generic_05.sh deleted file mode 100755 index bda5064bc31f..000000000000 --- a/tools/testing/selftests/ublk/test_generic_05.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh - -ERR_CODE=0 - -ublk_run_recover_test() -{ - run_io_and_recover 256M "kill_daemon" "$@" - ERR_CODE=$? - if [ ${ERR_CODE} -ne 0 ]; then - echo "$TID failure: $*" - _show_result $TID $ERR_CODE - fi -} - -if ! _have_program fio; then - exit "$UBLK_SKIP_CODE" -fi - -if ! _have_feature "ZERO_COPY"; then - exit "$UBLK_SKIP_CODE" -fi - -_prep_test "recover" "basic recover function verification (zero copy)" - -_create_backfile 0 256M -_create_backfile 1 128M -_create_backfile 2 128M - -ublk_run_recover_test -t null -q 2 -r 1 -z -b & -ublk_run_recover_test -t loop -q 2 -r 1 -z -b "${UBLK_BACKFILES[0]}" & -ublk_run_recover_test -t stripe -q 2 -r 1 -z -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & -wait - -ublk_run_recover_test -t null -q 2 -r 1 -z & -ublk_run_recover_test -t loop -q 2 -r 1 -z "${UBLK_BACKFILES[0]}" & -ublk_run_recover_test -t stripe -q 2 -r 1 -z "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & -wait - -ublk_run_recover_test -t null -q 2 -r 1 -z -i 1 & -ublk_run_recover_test -t loop -q 2 -r 1 -z -i 1 "${UBLK_BACKFILES[0]}" & -ublk_run_recover_test -t stripe -q 2 -r 1 -z -i 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & -wait - -_cleanup_test "recover" -_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_generic_11.sh b/tools/testing/selftests/ublk/test_generic_11.sh deleted file mode 100755 index e0dc0b8fe5d6..000000000000 --- a/tools/testing/selftests/ublk/test_generic_11.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh - -ERR_CODE=0 - -ublk_run_quiesce_recover() -{ - run_io_and_recover 256M "quiesce_dev" "$@" - ERR_CODE=$? - if [ ${ERR_CODE} -ne 0 ]; then - echo "$TID failure: $*" - _show_result $TID $ERR_CODE - fi -} - -if ! _have_feature "QUIESCE"; then - exit "$UBLK_SKIP_CODE" -fi - -if ! _have_program fio; then - exit "$UBLK_SKIP_CODE" -fi - -_prep_test "quiesce" "basic quiesce & recover function verification" - -_create_backfile 0 256M -_create_backfile 1 128M -_create_backfile 2 128M - -ublk_run_quiesce_recover -t null -q 2 -r 1 & -ublk_run_quiesce_recover -t loop -q 2 -r 1 "${UBLK_BACKFILES[0]}" & -ublk_run_quiesce_recover -t stripe -q 2 -r 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & -wait - -ublk_run_quiesce_recover -t null -q 2 -r 1 -i 1 & -ublk_run_quiesce_recover -t loop -q 2 -r 1 -i 1 "${UBLK_BACKFILES[0]}" & -ublk_run_quiesce_recover -t stripe -q 2 -r 1 -i 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & -wait - -_cleanup_test "quiesce" -_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_generic_14.sh b/tools/testing/selftests/ublk/test_generic_14.sh deleted file mode 100755 index 178443394ca5..000000000000 --- a/tools/testing/selftests/ublk/test_generic_14.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh - -ERR_CODE=0 - -ublk_run_recover_test() -{ - run_io_and_recover 256M "kill_daemon" "$@" - ERR_CODE=$? - if [ ${ERR_CODE} -ne 0 ]; then - echo "$TID failure: $*" - _show_result $TID $ERR_CODE - fi -} - -if ! _have_program fio; then - exit "$UBLK_SKIP_CODE" -fi - -_prep_test "recover" "basic recover function verification (user copy)" - -_create_backfile 0 256M -_create_backfile 1 128M -_create_backfile 2 128M - -ublk_run_recover_test -t null -q 2 -r 1 -u & -ublk_run_recover_test -t loop -q 2 -r 1 -u "${UBLK_BACKFILES[0]}" & -ublk_run_recover_test -t stripe -q 2 -r 1 -u "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & -wait - -ublk_run_recover_test -t null -q 2 -r 1 -u -i 1 & -ublk_run_recover_test -t loop -q 2 -r 1 -u -i 1 "${UBLK_BACKFILES[0]}" & -ublk_run_recover_test -t stripe -q 2 -r 1 -u -i 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & -wait - -_cleanup_test "recover" -_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_integrity_01.sh b/tools/testing/selftests/ublk/test_integrity_01.sh new file mode 100755 index 000000000000..6713b280a6ff --- /dev/null +++ b/tools/testing/selftests/ublk/test_integrity_01.sh @@ -0,0 +1,105 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +_check_value() { + local name=$1 + local actual=$2 + local expected=$3 + + if [ "$actual" != "$expected" ]; then + echo "$name $actual != $expected" + ERR_CODE=255 + return 1 + fi + return 0 +} + +_test_metadata_only() { + local dev_id + + dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --metadata_size 8) + _check_add_dev "$TID" $? + + _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 && + _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 && + _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 0 && + _check_value "device_is_integrity_capable" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 && + _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" nop && + _check_value "protection_interval_bytes" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && + _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 + + _ublk_del_dev "${dev_id}" +} + +_test_integrity_capable_ip() { + local dev_id + + dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --integrity_capable --metadata_size 64 --pi_offset 56 --csum_type ip) + _check_add_dev "$TID" $? + + _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 64 && + _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 56 && + _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 8 && + _check_value "device_is_integrity_capable" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 1 && + _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" T10-DIF-TYPE3-IP && + _check_value "protection_interval_bytes" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && + _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 + + _ublk_del_dev "${dev_id}" +} + +_test_integrity_reftag_t10dif() { + local dev_id + + dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --integrity_reftag --metadata_size 8 --csum_type t10dif) + _check_add_dev "$TID" $? + + _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 && + _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 && + _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 8 && + _check_value "device_is_integrity_capable" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 && + _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" T10-DIF-TYPE1-CRC && + _check_value "protection_interval_bytes" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && + _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 + + _ublk_del_dev "${dev_id}" +} + +_test_nvme_csum() { + local dev_id + + dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --metadata_size 16 --csum_type nvme --tag_size 8) + _check_add_dev "$TID" $? + + _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 16 && + _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 && + _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 16 && + _check_value "device_is_integrity_capable" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 && + _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" EXT-DIF-TYPE3-CRC64 && + _check_value "protection_interval_bytes" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && + _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 8 + + _ublk_del_dev "${dev_id}" +} + +_prep_test "null" "integrity params" + +_test_metadata_only +_test_integrity_capable_ip +_test_integrity_reftag_t10dif +_test_nvme_csum + +_cleanup_test +_show_result "$TID" $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_integrity_02.sh b/tools/testing/selftests/ublk/test_integrity_02.sh new file mode 100755 index 000000000000..aaf1f52da559 --- /dev/null +++ b/tools/testing/selftests/ublk/test_integrity_02.sh @@ -0,0 +1,141 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +if ! _have_program fio; then + exit $UBLK_SKIP_CODE +fi + +fio_version=$(fio --version) +if [[ "$fio_version" =~ fio-[0-9]+\.[0-9]+$ ]]; then + echo "Requires development fio version with https://github.com/axboe/fio/pull/1992" + exit $UBLK_SKIP_CODE +fi + +ERR_CODE=0 + +# Global variables set during device setup +dev_id="" +fio_args="" +fio_err="" + +_setup_device() { + _create_backfile 0 256M + _create_backfile 1 32M # 256M * (64 integrity bytes / 512 data bytes) + + local integrity_params="--integrity_capable --integrity_reftag + --metadata_size 64 --pi_offset 56 --csum_type t10dif" + dev_id=$(_add_ublk_dev -t loop -u $integrity_params "${UBLK_BACKFILES[@]}") + _check_add_dev "$TID" $? + + # 1M * (64 integrity bytes / 512 data bytes) = 128K + fio_args="--ioengine io_uring --direct 1 --bsrange 512-1M --iodepth 32 + --md_per_io_size 128K --pi_act 0 --pi_chk GUARD,REFTAG,APPTAG + --filename /dev/ublkb$dev_id" + + fio_err=$(mktemp "${UBLK_TEST_DIR}"/fio_err_XXXXX) +} + +_test_fill_and_verify() { + fio --name fill --rw randwrite $fio_args > /dev/null + if [ $? != 0 ]; then + echo "fio fill failed" + ERR_CODE=255 + return 1 + fi + + fio --name verify --rw randread $fio_args > /dev/null + if [ $? != 0 ]; then + echo "fio verify failed" + ERR_CODE=255 + return 1 + fi +} + +_test_corrupted_reftag() { + local dd_reftag_args="bs=1 seek=60 count=4 oflag=dsync conv=notrunc status=none" + local expected_err="REFTAG compare error: LBA: 0 Expected=0, Actual=" + + # Overwrite 4-byte reftag at offset 56 + 4 = 60 + dd if=/dev/urandom "of=${UBLK_BACKFILES[1]}" $dd_reftag_args + if [ $? != 0 ]; then + echo "dd corrupted_reftag failed" + ERR_CODE=255 + return 1 + fi + + if fio --name corrupted_reftag --rw randread $fio_args > /dev/null 2> "$fio_err"; then + echo "fio corrupted_reftag unexpectedly succeeded" + ERR_CODE=255 + return 1 + fi + + if ! grep -q "$expected_err" "$fio_err"; then + echo "fio corrupted_reftag message not found: $expected_err" + ERR_CODE=255 + return 1 + fi + + # Reset to 0 + dd if=/dev/zero "of=${UBLK_BACKFILES[1]}" $dd_reftag_args + if [ $? != 0 ]; then + echo "dd restore corrupted_reftag failed" + ERR_CODE=255 + return 1 + fi +} + +_test_corrupted_data() { + local dd_data_args="bs=512 count=1 oflag=direct,dsync conv=notrunc status=none" + local expected_err="Guard compare error: LBA: 0 Expected=0, Actual=" + + dd if=/dev/zero "of=${UBLK_BACKFILES[0]}" $dd_data_args + if [ $? != 0 ]; then + echo "dd corrupted_data failed" + ERR_CODE=255 + return 1 + fi + + if fio --name corrupted_data --rw randread $fio_args > /dev/null 2> "$fio_err"; then + echo "fio corrupted_data unexpectedly succeeded" + ERR_CODE=255 + return 1 + fi + + if ! grep -q "$expected_err" "$fio_err"; then + echo "fio corrupted_data message not found: $expected_err" + ERR_CODE=255 + return 1 + fi +} + +_test_bad_apptag() { + local expected_err="APPTAG compare error: LBA: [0-9]* Expected=4321, Actual=1234" + + if fio --name bad_apptag --rw randread $fio_args --apptag 0x4321 > /dev/null 2> "$fio_err"; then + echo "fio bad_apptag unexpectedly succeeded" + ERR_CODE=255 + return 1 + fi + + if ! grep -q "$expected_err" "$fio_err"; then + echo "fio bad_apptag message not found: $expected_err" + ERR_CODE=255 + return 1 + fi +} + +_prep_test "loop" "end-to-end integrity" + +_setup_device + +_test_fill_and_verify && \ +_test_corrupted_reftag && \ +_test_corrupted_data && \ +_test_bad_apptag + +rm -f "$fio_err" + +_cleanup_test +_show_result "$TID" $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_loop_08.sh b/tools/testing/selftests/ublk/test_loop_08.sh deleted file mode 100755 index aaf1f52da559..000000000000 --- a/tools/testing/selftests/ublk/test_loop_08.sh +++ /dev/null @@ -1,141 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh - -if ! _have_program fio; then - exit $UBLK_SKIP_CODE -fi - -fio_version=$(fio --version) -if [[ "$fio_version" =~ fio-[0-9]+\.[0-9]+$ ]]; then - echo "Requires development fio version with https://github.com/axboe/fio/pull/1992" - exit $UBLK_SKIP_CODE -fi - -ERR_CODE=0 - -# Global variables set during device setup -dev_id="" -fio_args="" -fio_err="" - -_setup_device() { - _create_backfile 0 256M - _create_backfile 1 32M # 256M * (64 integrity bytes / 512 data bytes) - - local integrity_params="--integrity_capable --integrity_reftag - --metadata_size 64 --pi_offset 56 --csum_type t10dif" - dev_id=$(_add_ublk_dev -t loop -u $integrity_params "${UBLK_BACKFILES[@]}") - _check_add_dev "$TID" $? - - # 1M * (64 integrity bytes / 512 data bytes) = 128K - fio_args="--ioengine io_uring --direct 1 --bsrange 512-1M --iodepth 32 - --md_per_io_size 128K --pi_act 0 --pi_chk GUARD,REFTAG,APPTAG - --filename /dev/ublkb$dev_id" - - fio_err=$(mktemp "${UBLK_TEST_DIR}"/fio_err_XXXXX) -} - -_test_fill_and_verify() { - fio --name fill --rw randwrite $fio_args > /dev/null - if [ $? != 0 ]; then - echo "fio fill failed" - ERR_CODE=255 - return 1 - fi - - fio --name verify --rw randread $fio_args > /dev/null - if [ $? != 0 ]; then - echo "fio verify failed" - ERR_CODE=255 - return 1 - fi -} - -_test_corrupted_reftag() { - local dd_reftag_args="bs=1 seek=60 count=4 oflag=dsync conv=notrunc status=none" - local expected_err="REFTAG compare error: LBA: 0 Expected=0, Actual=" - - # Overwrite 4-byte reftag at offset 56 + 4 = 60 - dd if=/dev/urandom "of=${UBLK_BACKFILES[1]}" $dd_reftag_args - if [ $? != 0 ]; then - echo "dd corrupted_reftag failed" - ERR_CODE=255 - return 1 - fi - - if fio --name corrupted_reftag --rw randread $fio_args > /dev/null 2> "$fio_err"; then - echo "fio corrupted_reftag unexpectedly succeeded" - ERR_CODE=255 - return 1 - fi - - if ! grep -q "$expected_err" "$fio_err"; then - echo "fio corrupted_reftag message not found: $expected_err" - ERR_CODE=255 - return 1 - fi - - # Reset to 0 - dd if=/dev/zero "of=${UBLK_BACKFILES[1]}" $dd_reftag_args - if [ $? != 0 ]; then - echo "dd restore corrupted_reftag failed" - ERR_CODE=255 - return 1 - fi -} - -_test_corrupted_data() { - local dd_data_args="bs=512 count=1 oflag=direct,dsync conv=notrunc status=none" - local expected_err="Guard compare error: LBA: 0 Expected=0, Actual=" - - dd if=/dev/zero "of=${UBLK_BACKFILES[0]}" $dd_data_args - if [ $? != 0 ]; then - echo "dd corrupted_data failed" - ERR_CODE=255 - return 1 - fi - - if fio --name corrupted_data --rw randread $fio_args > /dev/null 2> "$fio_err"; then - echo "fio corrupted_data unexpectedly succeeded" - ERR_CODE=255 - return 1 - fi - - if ! grep -q "$expected_err" "$fio_err"; then - echo "fio corrupted_data message not found: $expected_err" - ERR_CODE=255 - return 1 - fi -} - -_test_bad_apptag() { - local expected_err="APPTAG compare error: LBA: [0-9]* Expected=4321, Actual=1234" - - if fio --name bad_apptag --rw randread $fio_args --apptag 0x4321 > /dev/null 2> "$fio_err"; then - echo "fio bad_apptag unexpectedly succeeded" - ERR_CODE=255 - return 1 - fi - - if ! grep -q "$expected_err" "$fio_err"; then - echo "fio bad_apptag message not found: $expected_err" - ERR_CODE=255 - return 1 - fi -} - -_prep_test "loop" "end-to-end integrity" - -_setup_device - -_test_fill_and_verify && \ -_test_corrupted_reftag && \ -_test_corrupted_data && \ -_test_bad_apptag - -rm -f "$fio_err" - -_cleanup_test -_show_result "$TID" $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_null_04.sh b/tools/testing/selftests/ublk/test_null_04.sh deleted file mode 100755 index 6713b280a6ff..000000000000 --- a/tools/testing/selftests/ublk/test_null_04.sh +++ /dev/null @@ -1,105 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh - -ERR_CODE=0 - -_check_value() { - local name=$1 - local actual=$2 - local expected=$3 - - if [ "$actual" != "$expected" ]; then - echo "$name $actual != $expected" - ERR_CODE=255 - return 1 - fi - return 0 -} - -_test_metadata_only() { - local dev_id - - dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --metadata_size 8) - _check_add_dev "$TID" $? - - _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 && - _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 && - _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 0 && - _check_value "device_is_integrity_capable" \ - "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 && - _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" nop && - _check_value "protection_interval_bytes" \ - "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && - _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 - - _ublk_del_dev "${dev_id}" -} - -_test_integrity_capable_ip() { - local dev_id - - dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --integrity_capable --metadata_size 64 --pi_offset 56 --csum_type ip) - _check_add_dev "$TID" $? - - _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 64 && - _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 56 && - _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 8 && - _check_value "device_is_integrity_capable" \ - "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 1 && - _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" T10-DIF-TYPE3-IP && - _check_value "protection_interval_bytes" \ - "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && - _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 - - _ublk_del_dev "${dev_id}" -} - -_test_integrity_reftag_t10dif() { - local dev_id - - dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --integrity_reftag --metadata_size 8 --csum_type t10dif) - _check_add_dev "$TID" $? - - _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 && - _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 && - _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 8 && - _check_value "device_is_integrity_capable" \ - "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 && - _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" T10-DIF-TYPE1-CRC && - _check_value "protection_interval_bytes" \ - "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && - _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 - - _ublk_del_dev "${dev_id}" -} - -_test_nvme_csum() { - local dev_id - - dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --metadata_size 16 --csum_type nvme --tag_size 8) - _check_add_dev "$TID" $? - - _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 16 && - _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 && - _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 16 && - _check_value "device_is_integrity_capable" \ - "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 && - _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" EXT-DIF-TYPE3-CRC64 && - _check_value "protection_interval_bytes" \ - "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && - _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 8 - - _ublk_del_dev "${dev_id}" -} - -_prep_test "null" "integrity params" - -_test_metadata_only -_test_integrity_capable_ip -_test_integrity_reftag_t10dif -_test_nvme_csum - -_cleanup_test -_show_result "$TID" $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_recover_01.sh b/tools/testing/selftests/ublk/test_recover_01.sh new file mode 100755 index 000000000000..2672f9c40fa8 --- /dev/null +++ b/tools/testing/selftests/ublk/test_recover_01.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +ublk_run_recover_test() +{ + run_io_and_recover 256M "kill_daemon" "$@" + ERR_CODE=$? + if [ ${ERR_CODE} -ne 0 ]; then + echo "$TID failure: $*" + _show_result $TID $ERR_CODE + fi +} + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "recover" "basic recover function verification" + +_create_backfile 0 256M +_create_backfile 1 128M +_create_backfile 2 128M + +ublk_run_recover_test -t null -q 2 -r 1 -b & +ublk_run_recover_test -t loop -q 2 -r 1 -b "${UBLK_BACKFILES[0]}" & +ublk_run_recover_test -t stripe -q 2 -r 1 -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + +ublk_run_recover_test -t null -q 2 -r 1 & +ublk_run_recover_test -t loop -q 2 -r 1 "${UBLK_BACKFILES[0]}" & +ublk_run_recover_test -t stripe -q 2 -r 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + +ublk_run_recover_test -t null -q 2 -r 1 -i 1 & +ublk_run_recover_test -t loop -q 2 -r 1 -i 1 "${UBLK_BACKFILES[0]}" & +ublk_run_recover_test -t stripe -q 2 -r 1 -i 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + +_cleanup_test "recover" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_recover_02.sh b/tools/testing/selftests/ublk/test_recover_02.sh new file mode 100755 index 000000000000..bda5064bc31f --- /dev/null +++ b/tools/testing/selftests/ublk/test_recover_02.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +ublk_run_recover_test() +{ + run_io_and_recover 256M "kill_daemon" "$@" + ERR_CODE=$? + if [ ${ERR_CODE} -ne 0 ]; then + echo "$TID failure: $*" + _show_result $TID $ERR_CODE + fi +} + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +if ! _have_feature "ZERO_COPY"; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "recover" "basic recover function verification (zero copy)" + +_create_backfile 0 256M +_create_backfile 1 128M +_create_backfile 2 128M + +ublk_run_recover_test -t null -q 2 -r 1 -z -b & +ublk_run_recover_test -t loop -q 2 -r 1 -z -b "${UBLK_BACKFILES[0]}" & +ublk_run_recover_test -t stripe -q 2 -r 1 -z -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + +ublk_run_recover_test -t null -q 2 -r 1 -z & +ublk_run_recover_test -t loop -q 2 -r 1 -z "${UBLK_BACKFILES[0]}" & +ublk_run_recover_test -t stripe -q 2 -r 1 -z "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + +ublk_run_recover_test -t null -q 2 -r 1 -z -i 1 & +ublk_run_recover_test -t loop -q 2 -r 1 -z -i 1 "${UBLK_BACKFILES[0]}" & +ublk_run_recover_test -t stripe -q 2 -r 1 -z -i 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + +_cleanup_test "recover" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_recover_03.sh b/tools/testing/selftests/ublk/test_recover_03.sh new file mode 100755 index 000000000000..e0dc0b8fe5d6 --- /dev/null +++ b/tools/testing/selftests/ublk/test_recover_03.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +ublk_run_quiesce_recover() +{ + run_io_and_recover 256M "quiesce_dev" "$@" + ERR_CODE=$? + if [ ${ERR_CODE} -ne 0 ]; then + echo "$TID failure: $*" + _show_result $TID $ERR_CODE + fi +} + +if ! _have_feature "QUIESCE"; then + exit "$UBLK_SKIP_CODE" +fi + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "quiesce" "basic quiesce & recover function verification" + +_create_backfile 0 256M +_create_backfile 1 128M +_create_backfile 2 128M + +ublk_run_quiesce_recover -t null -q 2 -r 1 & +ublk_run_quiesce_recover -t loop -q 2 -r 1 "${UBLK_BACKFILES[0]}" & +ublk_run_quiesce_recover -t stripe -q 2 -r 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + +ublk_run_quiesce_recover -t null -q 2 -r 1 -i 1 & +ublk_run_quiesce_recover -t loop -q 2 -r 1 -i 1 "${UBLK_BACKFILES[0]}" & +ublk_run_quiesce_recover -t stripe -q 2 -r 1 -i 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + +_cleanup_test "quiesce" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_recover_04.sh b/tools/testing/selftests/ublk/test_recover_04.sh new file mode 100755 index 000000000000..178443394ca5 --- /dev/null +++ b/tools/testing/selftests/ublk/test_recover_04.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +ublk_run_recover_test() +{ + run_io_and_recover 256M "kill_daemon" "$@" + ERR_CODE=$? + if [ ${ERR_CODE} -ne 0 ]; then + echo "$TID failure: $*" + _show_result $TID $ERR_CODE + fi +} + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "recover" "basic recover function verification (user copy)" + +_create_backfile 0 256M +_create_backfile 1 128M +_create_backfile 2 128M + +ublk_run_recover_test -t null -q 2 -r 1 -u & +ublk_run_recover_test -t loop -q 2 -r 1 -u "${UBLK_BACKFILES[0]}" & +ublk_run_recover_test -t stripe -q 2 -r 1 -u "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + +ublk_run_recover_test -t null -q 2 -r 1 -u -i 1 & +ublk_run_recover_test -t loop -q 2 -r 1 -u -i 1 "${UBLK_BACKFILES[0]}" & +ublk_run_recover_test -t stripe -q 2 -r 1 -u -i 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + +_cleanup_test "recover" +_show_result $TID $ERR_CODE -- cgit v1.2.3 From 5314d25afbc44d0449fa2519d2c9d7f3c319f74c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Feb 2026 00:23:40 +0800 Subject: selftests: ublk: improve I/O ordering test with bpftrace Remove test_generic_01.sh since block layer may reorder I/O, making the test prone to false positives. Apply the improvements to test_generic_02.sh instead, which supposes for covering ublk dispatch io order. Rework test_generic_02 to verify that ublk dispatch doesn't reorder I/O by comparing request start order with completion order using bpftrace. The bpftrace script now: - Tracks each request's start sequence number in a map keyed by sector - On completion, verifies the request's start order matches expected completion order - Reports any out-of-order completions detected The test script: - Wait bpftrace BEGIN code block is run - Pins fio to CPU 0 for deterministic behavior - Uses block_io_start and block_rq_complete tracepoints - Checks bpftrace output for reordering errors Reported-and-tested-by: Alexander Atanasov Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 3 +- tools/testing/selftests/ublk/test_generic_01.sh | 47 ------------------------- tools/testing/selftests/ublk/test_generic_02.sh | 22 ++++++++---- tools/testing/selftests/ublk/trace/seq_io.bt | 47 ++++++++++++++++++++----- 4 files changed, 54 insertions(+), 65 deletions(-) delete mode 100755 tools/testing/selftests/ublk/test_generic_01.sh (limited to 'tools/testing') diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index a62a06e13006..8ac2d4a682a1 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -7,8 +7,7 @@ endif LDLIBS += -lpthread -lm -luring -TEST_PROGS := test_generic_01.sh -TEST_PROGS += test_generic_02.sh +TEST_PROGS := test_generic_02.sh TEST_PROGS += test_generic_03.sh TEST_PROGS += test_generic_06.sh TEST_PROGS += test_generic_07.sh diff --git a/tools/testing/selftests/ublk/test_generic_01.sh b/tools/testing/selftests/ublk/test_generic_01.sh deleted file mode 100755 index 26cf3c7ceeb5..000000000000 --- a/tools/testing/selftests/ublk/test_generic_01.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh - -ERR_CODE=0 - -if ! _have_program bpftrace; then - exit "$UBLK_SKIP_CODE" -fi - -if ! _have_program fio; then - exit "$UBLK_SKIP_CODE" -fi - -_prep_test "null" "sequential io order" - -dev_id=$(_add_ublk_dev -t null) -_check_add_dev $TID $? - -dev_t=$(_get_disk_dev_t "$dev_id") -bpftrace trace/seq_io.bt "$dev_t" "W" 1 > "$UBLK_TMP" 2>&1 & -btrace_pid=$! -sleep 2 - -if ! kill -0 "$btrace_pid" > /dev/null 2>&1; then - _cleanup_test "null" - exit "$UBLK_SKIP_CODE" -fi - -# run fio over this ublk disk -fio --name=write_seq \ - --filename=/dev/ublkb"${dev_id}" \ - --ioengine=libaio --iodepth=16 \ - --rw=write \ - --size=512M \ - --direct=1 \ - --bs=4k > /dev/null 2>&1 -ERR_CODE=$? -kill "$btrace_pid" -wait -if grep -q "io_out_of_order" "$UBLK_TMP"; then - cat "$UBLK_TMP" - ERR_CODE=255 -fi -_cleanup_test "null" -_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_generic_02.sh b/tools/testing/selftests/ublk/test_generic_02.sh index 1d4b1d6e059c..46b657143fd6 100755 --- a/tools/testing/selftests/ublk/test_generic_02.sh +++ b/tools/testing/selftests/ublk/test_generic_02.sh @@ -13,7 +13,7 @@ if ! _have_program fio; then exit "$UBLK_SKIP_CODE" fi -_prep_test "null" "sequential io order for MQ" +_prep_test "null" "ublk dispatch won't reorder IO for MQ" dev_id=$(_add_ublk_dev -t null -q 2) _check_add_dev $TID $? @@ -21,15 +21,20 @@ _check_add_dev $TID $? dev_t=$(_get_disk_dev_t "$dev_id") bpftrace trace/seq_io.bt "$dev_t" "W" 1 > "$UBLK_TMP" 2>&1 & btrace_pid=$! -sleep 2 -if ! kill -0 "$btrace_pid" > /dev/null 2>&1; then +# Wait for bpftrace probes to be attached (BEGIN block prints BPFTRACE_READY) +for _ in $(seq 100); do + grep -q "BPFTRACE_READY" "$UBLK_TMP" 2>/dev/null && break + sleep 0.1 +done + +if ! kill -0 "$btrace_pid" 2>/dev/null; then _cleanup_test "null" exit "$UBLK_SKIP_CODE" fi -# run fio over this ublk disk -fio --name=write_seq \ +# run fio over this ublk disk (pinned to CPU 0) +taskset -c 0 fio --name=write_seq \ --filename=/dev/ublkb"${dev_id}" \ --ioengine=libaio --iodepth=16 \ --rw=write \ @@ -39,8 +44,11 @@ fio --name=write_seq \ ERR_CODE=$? kill "$btrace_pid" wait -if grep -q "io_out_of_order" "$UBLK_TMP"; then - cat "$UBLK_TMP" + +# Check for out-of-order completions detected by bpftrace +if grep -q "^out_of_order:" "$UBLK_TMP"; then + echo "I/O reordering detected:" + grep "^out_of_order:" "$UBLK_TMP" ERR_CODE=255 fi _cleanup_test "null" diff --git a/tools/testing/selftests/ublk/trace/seq_io.bt b/tools/testing/selftests/ublk/trace/seq_io.bt index b2f60a92b118..9d36ba35468f 100644 --- a/tools/testing/selftests/ublk/trace/seq_io.bt +++ b/tools/testing/selftests/ublk/trace/seq_io.bt @@ -2,23 +2,52 @@ $1: dev_t $2: RWBS $3: strlen($2) + + Track request order between block_io_start and block_rq_complete. + Sequence starts at 1 so 0 means "never seen". On first valid + completion, sync complete_seq to handle probe attachment races. + block_rq_complete listed first to reduce missed completion window. */ + BEGIN { - @last_rw[$1, str($2)] = (uint64)0; + @start_seq = (uint64)1; + @complete_seq = (uint64)0; + @out_of_order = (uint64)0; + @start_order[0] = (uint64)0; + delete(@start_order[0]); + printf("BPFTRACE_READY\n"); } + tracepoint:block:block_rq_complete +/(int64)args.dev == $1 && !strncmp(args.rwbs, str($2), $3)/ { - $dev = $1; - if ((int64)args.dev == $1 && !strncmp(args.rwbs, str($2), $3)) { - $last = @last_rw[$dev, str($2)]; - if ((uint64)args.sector != $last) { - printf("io_out_of_order: exp %llu actual %llu\n", - args.sector, $last); + $expected = @start_order[args.sector]; + if ($expected > 0) { + if (@complete_seq == 0) { + @complete_seq = $expected; + } + if ($expected != @complete_seq) { + printf("out_of_order: sector %llu started at seq %llu but completed at seq %llu\n", + args.sector, $expected, @complete_seq); + @out_of_order = @out_of_order + 1; } - @last_rw[$dev, str($2)] = (args.sector + args.nr_sector); + delete(@start_order[args.sector]); + @complete_seq = @complete_seq + 1; } } +tracepoint:block:block_io_start +/(int64)args.dev == $1 && !strncmp(args.rwbs, str($2), $3)/ +{ + @start_order[args.sector] = @start_seq; + @start_seq = @start_seq + 1; +} + END { - clear(@last_rw); + printf("total_start: %llu total_complete: %llu out_of_order: %llu\n", + @start_seq - 1, @complete_seq, @out_of_order); + clear(@start_order); + clear(@start_seq); + clear(@complete_seq); + clear(@out_of_order); } -- cgit v1.2.3 From 4ac76c51709dff01b285a2d8afea80ca7ae66d28 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 22 Jan 2026 17:02:16 +0000 Subject: selftests/mm: default KDIR to build directory Patch series "Various mm kselftests improvements/fixes", v3. Various improvements/fixes for the mm kselftests: - Patch 1-3 extend support for more build configurations: out-of-tree $KDIR, cross-compilation, etc. - Patch 4-7 fix issues related to faulting in pages, introducing a new helper for that purpose. - Patch 8 fixes the value returned by pagemap_ioctl (PASS was always returned, which explains why the issue fixed in patch 6 went unnoticed). - Patch 9 improves the exit code of pfnmap. Net results: - 1 test no longer fails (patch 7) - 3 tests are no longer skipped (patch 4) - More accurate return values for whole suites (patch 8, 9) - Extra tests are more likely to be built (patch 1-3) This patch (of 9): KDIR currently defaults to the running kernel's modules directory when building the page_frag module. The underlying assumption is that most users build the kselftests in order to run them against the system they're built on. This assumption seems questionable, and there is no guarantee that the module can actually be built against the running kernel. Switch the default value of KDIR to the kernel's build directory, i.e. $(O) if O= or KBUILD_OUTPUT= is used, and the source directory otherwise. This seems like the least surprising option: the test module is built against the kernel that has been previously built. Note: we can't use $(top_srcdir) in mm/Makefile because it is only defined once lib.mk is included. Link: https://lkml.kernel.org/r/20260122170224.4056513-1-kevin.brodsky@arm.com Link: https://lkml.kernel.org/r/20260122170224.4056513-2-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Cc: David Hildenbrand Cc: Dev Jain Cc: Lorenzo Stoakes Cc: Mark Brown Cc: Ryan Roberts Cc: Shuah Khan Cc: Jason Gunthorpe Cc: John Hubbard Cc: Paolo Abeni Cc: SeongJae Park Cc: Usama Anjum Cc: wang lian Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 2 +- tools/testing/selftests/mm/page_frag/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index eaf9312097f7..bb93101e339e 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -44,7 +44,7 @@ LDLIBS = -lrt -lpthread -lm # warnings. CFLAGS += -U_FORTIFY_SOURCE -KDIR ?= /lib/modules/$(shell uname -r)/build +KDIR ?= $(if $(O),$(O),$(realpath ../../../..)) ifneq (,$(wildcard $(KDIR)/Module.symvers)) ifneq (,$(wildcard $(KDIR)/include/linux/page_frag_cache.h)) TEST_GEN_MODS_DIR := page_frag diff --git a/tools/testing/selftests/mm/page_frag/Makefile b/tools/testing/selftests/mm/page_frag/Makefile index 8c8bb39ffa28..96e5f646e69b 100644 --- a/tools/testing/selftests/mm/page_frag/Makefile +++ b/tools/testing/selftests/mm/page_frag/Makefile @@ -1,5 +1,5 @@ PAGE_FRAG_TEST_DIR := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST))))) -KDIR ?= /lib/modules/$(shell uname -r)/build +KDIR ?= $(if $(O),$(O),$(realpath ../../../../..)) ifeq ($(V),1) Q = -- cgit v1.2.3 From 1821be740d2e9329805cafa368e476064fde0789 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 22 Jan 2026 17:02:17 +0000 Subject: selftests/mm: remove flaky header check Commit 96ed62ea0298 ("mm: page_frag: fix a compile error when kernel is not compiled") introduced a check to avoid attempting to build the page_frag module if is missing. Unfortunately this check only works if KDIR points to /lib/modules/... or an in-tree kernel build. It always fails if KDIR points to an out-of-tree build (i.e. when the kernel was built with O=... make) because only generated headers are present under $KDIR/include/ in that case. A recent commit switched KDIR to default to the kernel's build directory, so that check is no longer justified. Link: https://lkml.kernel.org/r/20260122170224.4056513-3-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Reviewed-by: Mark Brown Cc: Paolo Abeni Cc: Yunsheng Lin Cc: David Hildenbrand Cc: Dev Jain Cc: Jason Gunthorpe Cc: John Hubbard Cc: Lorenzo Stoakes Cc: Ryan Roberts Cc: SeongJae Park Cc: Shuah Khan Cc: Usama Anjum Cc: wang lian Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 4 ---- 1 file changed, 4 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index bb93101e339e..4e5c8a330a0c 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -46,12 +46,8 @@ CFLAGS += -U_FORTIFY_SOURCE KDIR ?= $(if $(O),$(O),$(realpath ../../../..)) ifneq (,$(wildcard $(KDIR)/Module.symvers)) -ifneq (,$(wildcard $(KDIR)/include/linux/page_frag_cache.h)) TEST_GEN_MODS_DIR := page_frag else -PAGE_FRAG_WARNING = "missing page_frag_cache.h, please use a newer kernel" -endif -else PAGE_FRAG_WARNING = "missing Module.symvers, please have the kernel built first" endif -- cgit v1.2.3 From 7f532d19c8be76ad2fcd7ab6b0c9eb618f70966b Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 22 Jan 2026 17:02:18 +0000 Subject: selftests/mm: pass down full CC and CFLAGS to check_config.sh check_config.sh checks that liburing is available by running the compiler provided as its first argument. This makes two assumptions: 1. CC consists of only one word 2. No extra flag is required Unfortunately, there are many situations where these assumptions don't hold. For instance: - When using Clang, CC consists of multiple words - When cross-compiling, extra flags may be required to allow the compiler to find headers Remove these assumptions by passing down CC and CFLAGS as-is from the Makefile, so that the same command line is used as when actually building the tests. Link: https://lkml.kernel.org/r/20260122170224.4056513-4-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Reviewed-by: Mark Brown Acked-by: David Hildenbrand (Red Hat) Cc: Jason Gunthorpe Cc: John Hubbard Cc: Dev Jain Cc: Lorenzo Stoakes Cc: Paolo Abeni Cc: Ryan Roberts Cc: SeongJae Park Cc: Shuah Khan Cc: Usama Anjum Cc: wang lian Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 2 +- tools/testing/selftests/mm/check_config.sh | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 4e5c8a330a0c..de4afc34e3b1 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -230,7 +230,7 @@ $(OUTPUT)/migration: LDLIBS += -lnuma $(OUTPUT)/rmap: LDLIBS += -lnuma local_config.mk local_config.h: check_config.sh - /bin/sh ./check_config.sh $(CC) + CC="$(CC)" CFLAGS="$(CFLAGS)" ./check_config.sh EXTRA_CLEAN += local_config.mk local_config.h diff --git a/tools/testing/selftests/mm/check_config.sh b/tools/testing/selftests/mm/check_config.sh index 3954f4746161..b84c82bbf875 100755 --- a/tools/testing/selftests/mm/check_config.sh +++ b/tools/testing/selftests/mm/check_config.sh @@ -16,8 +16,7 @@ echo "#include " > $tmpfile_c echo "#include " >> $tmpfile_c echo "int func(void) { return 0; }" >> $tmpfile_c -CC=${1:?"Usage: $0 # example compiler: gcc"} -$CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1 +$CC $CFLAGS -c $tmpfile_c -o $tmpfile_o if [ -f $tmpfile_o ]; then echo "#define LOCAL_CONFIG_HAVE_LIBURING 1" > $OUTPUT_H_FILE -- cgit v1.2.3 From bce1dabd310e87fefe0645fec9ba98b84d37e418 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 22 Jan 2026 17:02:19 +0000 Subject: selftests/mm: fix usage of FORCE_READ() in cow tests Commit 5bbc2b785e63 ("selftests/mm: fix FORCE_READ to read input value correctly") modified FORCE_READ() to take a value instead of a pointer. It also changed most of the call sites accordingly, but missed many of them in cow.c. In those cases, we ended up with the pointer itself being read, not the memory it points to. No failure occurred as a result, so it looks like the tests work just fine without faulting in. However, the huge_zeropage tests explicitly check that pages are populated, so those became skipped. Convert all the remaining FORCE_READ() to fault in the mapped page, as was originally intended. This allows the huge_zeropage tests to run again (3 tests in total). Link: https://lkml.kernel.org/r/20260122170224.4056513-5-kevin.brodsky@arm.com Fixes: 5bbc2b785e63 ("selftests/mm: fix FORCE_READ to read input value correctly") Signed-off-by: Kevin Brodsky Acked-by: SeongJae Park Reviewed-by: wang lian Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Dev Jain Cc: Jason Gunthorpe Cc: John Hubbard Cc: Lorenzo Stoakes Cc: Mark Brown Cc: Paolo Abeni Cc: Ryan Roberts Cc: Shuah Khan Cc: Usama Anjum Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/cow.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index accfd198dbda..83b3563be26b 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -1612,8 +1612,8 @@ static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc) * the first sub-page and test if we get another sub-page populated * automatically. */ - FORCE_READ(mem); - FORCE_READ(smem); + FORCE_READ(*mem); + FORCE_READ(*smem); if (!pagemap_is_populated(pagemap_fd, mem + pagesize) || !pagemap_is_populated(pagemap_fd, smem + pagesize)) { ksft_test_result_skip("Did not get THPs populated\n"); @@ -1663,8 +1663,8 @@ static void run_with_memfd(non_anon_test_fn fn, const char *desc) } /* Fault the page in. */ - FORCE_READ(mem); - FORCE_READ(smem); + FORCE_READ(*mem); + FORCE_READ(*smem); fn(mem, smem, pagesize); munmap: @@ -1719,8 +1719,8 @@ static void run_with_tmpfile(non_anon_test_fn fn, const char *desc) } /* Fault the page in. */ - FORCE_READ(mem); - FORCE_READ(smem); + FORCE_READ(*mem); + FORCE_READ(*smem); fn(mem, smem, pagesize); munmap: @@ -1773,8 +1773,8 @@ static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc, } /* Fault the page in. */ - FORCE_READ(mem); - FORCE_READ(smem); + FORCE_READ(*mem); + FORCE_READ(*smem); fn(mem, smem, hugetlbsize); munmap: -- cgit v1.2.3 From 20d3fac43608a1d7ef71991935abc4456baa1da7 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 22 Jan 2026 17:02:20 +0000 Subject: selftests/mm: check that FORCE_READ() succeeded Many cow tests rely on FORCE_READ() to populate pages. Introduce a helper to make sure that the pages are actually populated, and fail otherwise. Link: https://lkml.kernel.org/r/20260122170224.4056513-6-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Suggested-by: David Hildenbrand (Red Hat) Cc: Dev Jain Cc: Jason Gunthorpe Cc: John Hubbard Cc: Lorenzo Stoakes Cc: Mark Brown Cc: Paolo Abeni Cc: Ryan Roberts Cc: SeongJae Park Cc: Shuah Khan Cc: Usama Anjum Cc: wang lian Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/cow.c | 43 ++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index 83b3563be26b..d9c69c04b67d 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -75,6 +75,18 @@ static bool range_is_swapped(void *addr, size_t size) return true; } +static bool populate_page_checked(char *addr) +{ + bool ret; + + FORCE_READ(*addr); + ret = pagemap_is_populated(pagemap_fd, addr); + if (!ret) + ksft_print_msg("Failed to populate page\n"); + + return ret; +} + struct comm_pipes { int child_ready[2]; int parent_ready[2]; @@ -1549,8 +1561,10 @@ static void run_with_zeropage(non_anon_test_fn fn, const char *desc) } /* Read from the page to populate the shared zeropage. */ - FORCE_READ(*mem); - FORCE_READ(*smem); + if (!populate_page_checked(mem) || !populate_page_checked(smem)) { + log_test_result(KSFT_FAIL); + goto munmap; + } fn(mem, smem, pagesize); munmap: @@ -1612,8 +1626,11 @@ static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc) * the first sub-page and test if we get another sub-page populated * automatically. */ - FORCE_READ(*mem); - FORCE_READ(*smem); + if (!populate_page_checked(mem) || !populate_page_checked(smem)) { + log_test_result(KSFT_FAIL); + goto munmap; + } + if (!pagemap_is_populated(pagemap_fd, mem + pagesize) || !pagemap_is_populated(pagemap_fd, smem + pagesize)) { ksft_test_result_skip("Did not get THPs populated\n"); @@ -1663,8 +1680,10 @@ static void run_with_memfd(non_anon_test_fn fn, const char *desc) } /* Fault the page in. */ - FORCE_READ(*mem); - FORCE_READ(*smem); + if (!populate_page_checked(mem) || !populate_page_checked(smem)) { + log_test_result(KSFT_FAIL); + goto munmap; + } fn(mem, smem, pagesize); munmap: @@ -1719,8 +1738,10 @@ static void run_with_tmpfile(non_anon_test_fn fn, const char *desc) } /* Fault the page in. */ - FORCE_READ(*mem); - FORCE_READ(*smem); + if (!populate_page_checked(mem) || !populate_page_checked(smem)) { + log_test_result(KSFT_FAIL); + goto munmap; + } fn(mem, smem, pagesize); munmap: @@ -1773,8 +1794,10 @@ static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc, } /* Fault the page in. */ - FORCE_READ(*mem); - FORCE_READ(*smem); + if (!populate_page_checked(mem) || !populate_page_checked(smem)) { + log_test_result(KSFT_FAIL); + goto munmap; + } fn(mem, smem, hugetlbsize); munmap: -- cgit v1.2.3 From dd2b4e04c09808ff921e3460a608537d1a94595d Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 22 Jan 2026 17:02:21 +0000 Subject: selftests/mm: introduce helper to read every page FORCE_READ(*addr) ensures that the compiler will emit a load from addr. Several tests need to trigger such a load for a range of pages, ensuring that every page is faulted in, if it wasn't already. Introduce a new helper force_read_pages() that does exactly that and replace existing loops with a call to it. The step size (regular/huge page size) is preserved for all loops, except in split_huge_page_test. Reading every byte is unnecessary; we now read every huge page, matching the following call to check_huge_file(). Link: https://lkml.kernel.org/r/20260122170224.4056513-7-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Reviewed-by: Dev Jain Reviewed-by: Muhammad Usama Anjum Acked-by: David Hildenbrand (Red Hat) Cc: Jason Gunthorpe Cc: John Hubbard Cc: Lorenzo Stoakes Cc: Mark Brown Cc: Paolo Abeni Cc: Ryan Roberts Cc: SeongJae Park Cc: Shuah Khan Cc: wang lian Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/hugetlb-madvise.c | 9 +-------- tools/testing/selftests/mm/pfnmap.c | 9 +++------ tools/testing/selftests/mm/split_huge_page_test.c | 6 +----- tools/testing/selftests/mm/vm_util.h | 7 +++++++ 4 files changed, 12 insertions(+), 19 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/hugetlb-madvise.c b/tools/testing/selftests/mm/hugetlb-madvise.c index 05d9d2805ae4..5b12041fa310 100644 --- a/tools/testing/selftests/mm/hugetlb-madvise.c +++ b/tools/testing/selftests/mm/hugetlb-madvise.c @@ -47,14 +47,7 @@ void write_fault_pages(void *addr, unsigned long nr_pages) void read_fault_pages(void *addr, unsigned long nr_pages) { - unsigned long i; - - for (i = 0; i < nr_pages; i++) { - unsigned long *addr2 = - ((unsigned long *)(addr + (i * huge_page_size))); - /* Prevent the compiler from optimizing out the entire loop: */ - FORCE_READ(*addr2); - } + force_read_pages(addr, nr_pages, huge_page_size); } int main(int argc, char **argv) diff --git a/tools/testing/selftests/mm/pfnmap.c b/tools/testing/selftests/mm/pfnmap.c index f546dfb10cae..45b5f1cf6019 100644 --- a/tools/testing/selftests/mm/pfnmap.c +++ b/tools/testing/selftests/mm/pfnmap.c @@ -35,18 +35,15 @@ static void signal_handler(int sig) static int test_read_access(char *addr, size_t size, size_t pagesize) { - size_t offs; int ret; if (signal(SIGSEGV, signal_handler) == SIG_ERR) return -EINVAL; ret = sigsetjmp(sigjmp_buf_env, 1); - if (!ret) { - for (offs = 0; offs < size; offs += pagesize) - /* Force a read that the compiler cannot optimize out. */ - *((volatile char *)(addr + offs)); - } + if (!ret) + force_read_pages(addr, size/pagesize, pagesize); + if (signal(SIGSEGV, SIG_DFL) == SIG_ERR) return -EINVAL; diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index 40799f3f0213..e0167111bdd1 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -652,11 +652,7 @@ static int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size, } madvise(*addr, fd_size, MADV_HUGEPAGE); - for (size_t i = 0; i < fd_size; i++) { - char *addr2 = *addr + i; - - FORCE_READ(*addr2); - } + force_read_pages(*addr, fd_size / pmd_pagesize, pmd_pagesize); if (!check_huge_file(*addr, fd_size / pmd_pagesize, pmd_pagesize)) { ksft_print_msg("No large pagecache folio generated, please provide a filesystem supporting large folio\n"); diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index 6ad32b1830f1..522f7f9050f5 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -54,6 +54,13 @@ static inline unsigned int pshift(void) return __page_shift; } +static inline void force_read_pages(char *addr, unsigned int nr_pages, + size_t pagesize) +{ + for (unsigned int i = 0; i < nr_pages; i++) + FORCE_READ(addr[i * pagesize]); +} + bool detect_huge_zeropage(void); /* -- cgit v1.2.3 From 7e938f00b00319510ae097e20b7487dfa578d53f Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 22 Jan 2026 17:02:22 +0000 Subject: selftests/mm: fix faulting-in code in pagemap_ioctl test One of the pagemap_ioctl tests attempts to fault in pages by memcpy()'ing them to an unused buffer. This probably worked originally, but since commit 46036188ea1f ("selftests/mm: build with -O2") the compiler is free to optimise away that unused buffer and the memcpy() with it. As a result there might not be any resident page in the mapping and the test may fail. We don't need to copy all that memory anyway. Just fault in every page. While at it also make sure to compute the number of pages once using simple integer arithmetic instead of ceilf() and implicit conversions. Link: https://lkml.kernel.org/r/20260122170224.4056513-8-kevin.brodsky@arm.com Fixes: 46036188ea1f ("selftests/mm: build with -O2") Signed-off-by: Kevin Brodsky Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Dev Jain Reviewed-by: Muhammad Usama Anjum Cc: Jason Gunthorpe Cc: John Hubbard Cc: Lorenzo Stoakes Cc: Mark Brown Cc: Paolo Abeni Cc: Ryan Roberts Cc: SeongJae Park Cc: Shuah Khan Cc: wang lian Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/pagemap_ioctl.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index 2cb5441f29c7..1896c7d4f72e 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -1052,11 +1052,10 @@ static void test_simple(void) int sanity_tests(void) { unsigned long long mem_size, vec_size; - long ret, fd, i, buf_size; + long ret, fd, i, buf_size, nr_pages; struct page_region *vec; char *mem, *fmem; struct stat sbuf; - char *tmp_buf; /* 1. wrong operation */ mem_size = 10 * page_size; @@ -1167,14 +1166,14 @@ int sanity_tests(void) if (fmem == MAP_FAILED) ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno)); - tmp_buf = malloc(sbuf.st_size); - memcpy(tmp_buf, fmem, sbuf.st_size); + nr_pages = (sbuf.st_size + page_size - 1) / page_size; + force_read_pages(fmem, nr_pages, page_size); ret = pagemap_ioctl(fmem, sbuf.st_size, vec, vec_size, 0, 0, 0, PAGEMAP_NON_WRITTEN_BITS, 0, PAGEMAP_NON_WRITTEN_BITS); ksft_test_result(ret >= 0 && vec[0].start == (uintptr_t)fmem && - LEN(vec[0]) == ceilf((float)sbuf.st_size/page_size) && + LEN(vec[0]) == nr_pages && (vec[0].categories & PAGE_IS_FILE), "%s Memory mapped file\n", __func__); -- cgit v1.2.3 From 148e5879532f835118e00c3040acef077b57721a Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 22 Jan 2026 17:02:23 +0000 Subject: selftests/mm: fix exit code in pagemap_ioctl Make sure pagemap_ioctl exits with an appropriate value: * If the tests are run, call ksft_finished() to report the right status instead of reporting PASS unconditionally. * Report SKIP if userfaultfd isn't available (in line with other tests) * Report FAIL if we failed to open /proc/self/pagemap, as this file has been added a long time ago and doesn't depend on any CONFIG option (returning -EINVAL from main() is meaningless) Link: https://lkml.kernel.org/r/20260122170224.4056513-9-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Reviewed-by: Ryan Roberts Reviewed-by: Mark Brown Acked-by: David Hildenbrand (Red Hat) Acked-by: SeongJae Park Reviewed-by: wang lian Reviewed-by: Dev Jain Cc: Usama Anjum Cc: Jason Gunthorpe Cc: John Hubbard Cc: Lorenzo Stoakes Cc: Paolo Abeni Cc: Shuah Khan Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/pagemap_ioctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index 1896c7d4f72e..2ca8a7e3c27e 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -1552,7 +1552,7 @@ int main(int __attribute__((unused)) argc, char *argv[]) ksft_print_header(); if (init_uffd()) - ksft_exit_pass(); + ksft_exit_skip("Failed to initialize userfaultfd\n"); ksft_set_plan(117); @@ -1561,7 +1561,7 @@ int main(int __attribute__((unused)) argc, char *argv[]) pagemap_fd = open(PAGEMAP, O_RDONLY); if (pagemap_fd < 0) - return -EINVAL; + ksft_exit_fail_msg("Failed to open " PAGEMAP "\n"); /* 1. Sanity testing */ sanity_tests_sd(); @@ -1733,5 +1733,5 @@ int main(int __attribute__((unused)) argc, char *argv[]) zeropfn_tests(); close(pagemap_fd); - ksft_exit_pass(); + ksft_finished(); } -- cgit v1.2.3 From fde8353121aa304ee88542f011dd5dc83ced47e4 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 22 Jan 2026 17:02:24 +0000 Subject: selftests/mm: report SKIP in pfnmap if a check fails pfnmap currently checks the target file in FIXTURE_SETUP(pfnmap), meaning once for every test, and skips the test if any check fails. The target file is the same for every test so this is a little overkill. More importantly, this approach means that the whole suite will report PASS even if all the tests are skipped because kernel configuration (e.g. CONFIG_STRICT_DEVMEM=y) prevented /dev/mem from being mapped, for instance. Let's ensure that KSFT_SKIP is returned as exit code if any check fails by performing the checks in pfnmap_init(), run once. That function also takes care of finding the offset of the pages to be mapped and saves it in a global. The file is now opened only once and the fd saved in a global, but it is still mapped/unmapped for every test, as some of them modify the mapping. Link: https://lkml.kernel.org/r/20260122170224.4056513-10-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: David Hildenbrand (Red Hat) Cc: Dev Jain Cc: Jason Gunthorpe Cc: John Hubbard Cc: Lorenzo Stoakes Cc: Mark Brown Cc: Paolo Abeni Cc: Ryan Roberts Cc: SeongJae Park Cc: Shuah Khan Cc: Usama Anjum Cc: wang lian Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/pfnmap.c | 84 +++++++++++++++++++++++-------------- 1 file changed, 53 insertions(+), 31 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/pfnmap.c b/tools/testing/selftests/mm/pfnmap.c index 45b5f1cf6019..4f550822385a 100644 --- a/tools/testing/selftests/mm/pfnmap.c +++ b/tools/testing/selftests/mm/pfnmap.c @@ -25,8 +25,12 @@ #include "kselftest_harness.h" #include "vm_util.h" +#define DEV_MEM_NPAGES 2 + static sigjmp_buf sigjmp_buf_env; static char *file = "/dev/mem"; +static off_t file_offset; +static int fd; static void signal_handler(int sig) { @@ -88,7 +92,7 @@ static int find_ram_target(off_t *offset, break; /* We need two pages. */ - if (end > start + 2 * pagesize) { + if (end > start + DEV_MEM_NPAGES * pagesize) { fclose(file); *offset = start; return 0; @@ -97,11 +101,48 @@ static int find_ram_target(off_t *offset, return -ENOENT; } +static void pfnmap_init(void) +{ + size_t pagesize = getpagesize(); + size_t size = DEV_MEM_NPAGES * pagesize; + void *addr; + + if (strncmp(file, "/dev/mem", strlen("/dev/mem")) == 0) { + int err = find_ram_target(&file_offset, pagesize); + + if (err) + ksft_exit_skip("Cannot find ram target in '/proc/iomem': %s\n", + strerror(-err)); + } else { + file_offset = 0; + } + + fd = open(file, O_RDONLY); + if (fd < 0) + ksft_exit_skip("Cannot open '%s': %s\n", file, strerror(errno)); + + /* + * Make sure we can map the file, and perform some basic checks; skip + * the whole suite if anything goes wrong. + * A fresh mapping is then created for every test case by + * FIXTURE_SETUP(pfnmap). + */ + addr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, file_offset); + if (addr == MAP_FAILED) + ksft_exit_skip("Cannot mmap '%s': %s\n", file, strerror(errno)); + + if (!check_vmflag_pfnmap(addr)) + ksft_exit_skip("Invalid file: '%s'. Not pfnmap'ed\n", file); + + if (test_read_access(addr, size, pagesize)) + ksft_exit_skip("Cannot read-access mmap'ed '%s'\n", file); + + munmap(addr, size); +} + FIXTURE(pfnmap) { - off_t offset; size_t pagesize; - int dev_mem_fd; char *addr1; size_t size1; char *addr2; @@ -112,31 +153,10 @@ FIXTURE_SETUP(pfnmap) { self->pagesize = getpagesize(); - if (strncmp(file, "/dev/mem", strlen("/dev/mem")) == 0) { - /* We'll require two physical pages throughout our tests ... */ - if (find_ram_target(&self->offset, self->pagesize)) - SKIP(return, - "Cannot find ram target in '/proc/iomem'\n"); - } else { - self->offset = 0; - } - - self->dev_mem_fd = open(file, O_RDONLY); - if (self->dev_mem_fd < 0) - SKIP(return, "Cannot open '%s'\n", file); - - self->size1 = self->pagesize * 2; + self->size1 = DEV_MEM_NPAGES * self->pagesize; self->addr1 = mmap(NULL, self->size1, PROT_READ, MAP_SHARED, - self->dev_mem_fd, self->offset); - if (self->addr1 == MAP_FAILED) - SKIP(return, "Cannot mmap '%s'\n", file); - - if (!check_vmflag_pfnmap(self->addr1)) - SKIP(return, "Invalid file: '%s'. Not pfnmap'ed\n", file); - - /* ... and want to be able to read from them. */ - if (test_read_access(self->addr1, self->size1, self->pagesize)) - SKIP(return, "Cannot read-access mmap'ed '%s'\n", file); + fd, file_offset); + ASSERT_NE(self->addr1, MAP_FAILED); self->size2 = 0; self->addr2 = MAP_FAILED; @@ -148,8 +168,6 @@ FIXTURE_TEARDOWN(pfnmap) munmap(self->addr2, self->size2); if (self->addr1 != MAP_FAILED) munmap(self->addr1, self->size1); - if (self->dev_mem_fd >= 0) - close(self->dev_mem_fd); } TEST_F(pfnmap, madvise_disallowed) @@ -189,7 +207,7 @@ TEST_F(pfnmap, munmap_split) */ self->size2 = self->pagesize; self->addr2 = mmap(NULL, self->pagesize, PROT_READ, MAP_SHARED, - self->dev_mem_fd, self->offset); + fd, file_offset); ASSERT_NE(self->addr2, MAP_FAILED); } @@ -259,8 +277,12 @@ int main(int argc, char **argv) if (strcmp(argv[i], "--") == 0) { if (i + 1 < argc && strlen(argv[i + 1]) > 0) file = argv[i + 1]; - return test_harness_run(i, argv); + argc = i; + break; } } + + pfnmap_init(); + return test_harness_run(argc, argv); } -- cgit v1.2.3 From dd2c6ec24fca9235ccd1b9bfd382d0ddb419e41a Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 16 Jan 2026 13:20:53 +0000 Subject: selftests/mm: remove virtual_address_range test This self test is asserting internal implementation details and is highly vulnerable to internal kernel changes as a result. It is currently failing locally from at least v6.17, and it seems that it may have been failing for longer in many configurations/hardware as it skips if e.g. CONFIG_ANON_VMA_NAME is not specified. With these skips and the fact that run_vmtests.sh won't run the tests in certain configurations it is likely we have simply missed this test being broken in CI for a long while. I have tried multiple versions of these tests and am unable to find a working bisect as previous versions of the test fail also. The tests are essentially mmap()'ing a series of mappings with no hint and asserting what the get_unmapped_area*() functions will come up with, with seemingly few checks for what other mappings may already be in place. It then appears to be mmap()'ing with a hint, and making a series of similar assertions about the internal implementation details of the hinting logic. Commit 0ef3783d7558 ("selftests/mm: add support to test 4PB VA on PPC64"), commit 3bd6137220bb ("selftests/mm: virtual_address_range: avoid reading from VM_IO mappings"), and especially commit a005145b9c96 ("selftests/mm: virtual_address_range: mmap() without PROT_WRITE") are good examples of the whack-a-mole nature of maintaining this test. The last commit there being particularly pertinent as it was accounting for an internal implementation detail change that really should have no bearing on self-tests, that is commit e93d2521b27f ("x86/vdso: Split virtual clock pages into dedicated mapping"). The purpose of the mm self-tests are to assert attributes about the API exposed to users, and to ensure that expectations are met. This test is emphatically not doing this, rather making a series of assumptions about internal implementation details and asserting them. It therefore, sadly, seems that the best course is to remove this test altogether. Link: https://lkml.kernel.org/r/20260116132053.857887-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: SeongJae Park Cc: David Hildenbrand Cc: Liam Howlett Cc: Mark Brown Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/.gitignore | 1 - tools/testing/selftests/mm/Makefile | 3 - tools/testing/selftests/mm/run_vmtests.sh | 12 - tools/testing/selftests/mm/virtual_address_range.c | 260 --------------------- 4 files changed, 276 deletions(-) delete mode 100644 tools/testing/selftests/mm/virtual_address_range.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index c2a8586e51a1..702e5723c35d 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -32,7 +32,6 @@ uffd-unit-tests uffd-wp-mremap mlock-intersect-test mlock-random-test -virtual_address_range gup_test va_128TBswitch map_fixed_noreplace diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index de4afc34e3b1..2fdb05e5a56a 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -136,9 +136,6 @@ endif ifneq (,$(filter $(ARCH),arm64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64 s390)) TEST_GEN_FILES += va_high_addr_switch -ifneq ($(ARCH),riscv64) -TEST_GEN_FILES += virtual_address_range -endif TEST_GEN_FILES += write_to_hugetlbfs endif diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 2dadbfc6e535..452875db532c 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -399,18 +399,6 @@ CATEGORY="hugetlb" run_test ./hugetlb-read-hwpoison fi if [ $VADDR64 -ne 0 ]; then - - # set overcommit_policy as OVERCOMMIT_ALWAYS so that kernel - # allows high virtual address allocation requests independent - # of platform's physical memory. - - if [ -x ./virtual_address_range ]; then - prev_policy=$(cat /proc/sys/vm/overcommit_memory) - echo 1 > /proc/sys/vm/overcommit_memory - CATEGORY="hugevm" run_test ./virtual_address_range - echo $prev_policy > /proc/sys/vm/overcommit_memory - fi - # va high address boundary switch test CATEGORY="hugevm" run_test bash ./va_high_addr_switch.sh fi # VADDR64 diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c deleted file mode 100644 index 4f0923825ed7..000000000000 --- a/tools/testing/selftests/mm/virtual_address_range.c +++ /dev/null @@ -1,260 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright 2017, Anshuman Khandual, IBM Corp. - * - * Works on architectures which support 128TB virtual - * address range and beyond. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "vm_util.h" -#include "kselftest.h" - -/* - * Maximum address range mapped with a single mmap() - * call is little bit more than 1GB. Hence 1GB is - * chosen as the single chunk size for address space - * mapping. - */ - -#define SZ_1GB (1024 * 1024 * 1024UL) -#define SZ_1TB (1024 * 1024 * 1024 * 1024UL) - -#define MAP_CHUNK_SIZE SZ_1GB - -/* - * Address space till 128TB is mapped without any hint - * and is enabled by default. Address space beyond 128TB - * till 512TB is obtained by passing hint address as the - * first argument into mmap() system call. - * - * The process heap address space is divided into two - * different areas one below 128TB and one above 128TB - * till it reaches 512TB. One with size 128TB and the - * other being 384TB. - * - * On Arm64 the address space is 256TB and support for - * high mappings up to 4PB virtual address space has - * been added. - * - * On PowerPC64, the address space up to 128TB can be - * mapped without a hint. Addresses beyond 128TB, up to - * 4PB, can be mapped with a hint. - * - */ - -#define NR_CHUNKS_128TB ((128 * SZ_1TB) / MAP_CHUNK_SIZE) /* Number of chunks for 128TB */ -#define NR_CHUNKS_256TB (NR_CHUNKS_128TB * 2UL) -#define NR_CHUNKS_384TB (NR_CHUNKS_128TB * 3UL) -#define NR_CHUNKS_3840TB (NR_CHUNKS_128TB * 30UL) -#define NR_CHUNKS_3968TB (NR_CHUNKS_128TB * 31UL) - -#define ADDR_MARK_128TB (1UL << 47) /* First address beyond 128TB */ -#define ADDR_MARK_256TB (1UL << 48) /* First address beyond 256TB */ - -#ifdef __aarch64__ -#define HIGH_ADDR_MARK ADDR_MARK_256TB -#define HIGH_ADDR_SHIFT 49 -#define NR_CHUNKS_LOW NR_CHUNKS_256TB -#define NR_CHUNKS_HIGH NR_CHUNKS_3840TB -#elif defined(__PPC64__) -#define HIGH_ADDR_MARK ADDR_MARK_128TB -#define HIGH_ADDR_SHIFT 48 -#define NR_CHUNKS_LOW NR_CHUNKS_128TB -#define NR_CHUNKS_HIGH NR_CHUNKS_3968TB -#else -#define HIGH_ADDR_MARK ADDR_MARK_128TB -#define HIGH_ADDR_SHIFT 48 -#define NR_CHUNKS_LOW NR_CHUNKS_128TB -#define NR_CHUNKS_HIGH NR_CHUNKS_384TB -#endif - -static char *hint_addr(void) -{ - int bits = HIGH_ADDR_SHIFT + rand() % (63 - HIGH_ADDR_SHIFT); - - return (char *) (1UL << bits); -} - -static void validate_addr(char *ptr, int high_addr) -{ - unsigned long addr = (unsigned long) ptr; - - if (high_addr) { - if (addr < HIGH_ADDR_MARK) - ksft_exit_fail_msg("Bad address %lx\n", addr); - return; - } - - if (addr > HIGH_ADDR_MARK) - ksft_exit_fail_msg("Bad address %lx\n", addr); -} - -static void mark_range(char *ptr, size_t size) -{ - if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ptr, size, "virtual_address_range") == -1) { - if (errno == EINVAL) { - /* Depends on CONFIG_ANON_VMA_NAME */ - ksft_test_result_skip("prctl(PR_SET_VMA_ANON_NAME) not supported\n"); - ksft_finished(); - } else { - ksft_exit_fail_perror("prctl(PR_SET_VMA_ANON_NAME) failed\n"); - } - } -} - -static int is_marked_vma(const char *vma_name) -{ - return vma_name && !strcmp(vma_name, "[anon:virtual_address_range]\n"); -} - -static int validate_lower_address_hint(void) -{ - char *ptr; - - ptr = mmap((void *) (1UL << 45), MAP_CHUNK_SIZE, PROT_READ | - PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - - if (ptr == MAP_FAILED) - return 0; - - return 1; -} - -static int validate_complete_va_space(void) -{ - unsigned long start_addr, end_addr, prev_end_addr; - char line[400]; - char prot[6]; - FILE *file; - int fd; - - fd = open("va_dump", O_CREAT | O_WRONLY, 0600); - unlink("va_dump"); - if (fd < 0) { - ksft_test_result_skip("cannot create or open dump file\n"); - ksft_finished(); - } - - file = fopen("/proc/self/maps", "r"); - if (file == NULL) - ksft_exit_fail_msg("cannot open /proc/self/maps\n"); - - prev_end_addr = 0; - while (fgets(line, sizeof(line), file)) { - const char *vma_name = NULL; - int vma_name_start = 0; - unsigned long hop; - - if (sscanf(line, "%lx-%lx %4s %*s %*s %*s %n", - &start_addr, &end_addr, prot, &vma_name_start) != 3) - ksft_exit_fail_msg("cannot parse /proc/self/maps\n"); - - if (vma_name_start) - vma_name = line + vma_name_start; - - /* end of userspace mappings; ignore vsyscall mapping */ - if (start_addr & (1UL << 63)) - return 0; - - /* /proc/self/maps must have gaps less than MAP_CHUNK_SIZE */ - if (start_addr - prev_end_addr >= MAP_CHUNK_SIZE) - return 1; - - prev_end_addr = end_addr; - - if (prot[0] != 'r') - continue; - - if (check_vmflag_io((void *)start_addr)) - continue; - - /* - * Confirm whether MAP_CHUNK_SIZE chunk can be found or not. - * If write succeeds, no need to check MAP_CHUNK_SIZE - 1 - * addresses after that. If the address was not held by this - * process, write would fail with errno set to EFAULT. - * Anyways, if write returns anything apart from 1, exit the - * program since that would mean a bug in /proc/self/maps. - */ - hop = 0; - while (start_addr + hop < end_addr) { - if (write(fd, (void *)(start_addr + hop), 1) != 1) - return 1; - lseek(fd, 0, SEEK_SET); - - if (is_marked_vma(vma_name)) - munmap((char *)(start_addr + hop), MAP_CHUNK_SIZE); - - hop += MAP_CHUNK_SIZE; - } - } - return 0; -} - -int main(int argc, char *argv[]) -{ - char *ptr[NR_CHUNKS_LOW]; - char **hptr; - char *hint; - unsigned long i, lchunks, hchunks; - - ksft_print_header(); - ksft_set_plan(1); - - for (i = 0; i < NR_CHUNKS_LOW; i++) { - ptr[i] = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - - if (ptr[i] == MAP_FAILED) { - if (validate_lower_address_hint()) - ksft_exit_fail_msg("mmap unexpectedly succeeded with hint\n"); - break; - } - - mark_range(ptr[i], MAP_CHUNK_SIZE); - validate_addr(ptr[i], 0); - } - lchunks = i; - hptr = (char **) calloc(NR_CHUNKS_HIGH, sizeof(char *)); - if (hptr == NULL) { - ksft_test_result_skip("Memory constraint not fulfilled\n"); - ksft_finished(); - } - - for (i = 0; i < NR_CHUNKS_HIGH; i++) { - hint = hint_addr(); - hptr[i] = mmap(hint, MAP_CHUNK_SIZE, PROT_READ, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - - if (hptr[i] == MAP_FAILED) - break; - - mark_range(hptr[i], MAP_CHUNK_SIZE); - validate_addr(hptr[i], 1); - } - hchunks = i; - if (validate_complete_va_space()) { - ksft_test_result_fail("BUG in mmap() or /proc/self/maps\n"); - ksft_finished(); - } - - for (i = 0; i < lchunks; i++) - munmap(ptr[i], MAP_CHUNK_SIZE); - - for (i = 0; i < hchunks; i++) - munmap(hptr[i], MAP_CHUNK_SIZE); - - free(hptr); - - ksft_test_result_pass("Test\n"); - ksft_finished(); -} -- cgit v1.2.3 From 94a62284ede0250e48c886416041ad65907ee917 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 16 Jan 2026 18:07:24 -0800 Subject: selftests/damon/sysfs_memcg_path_leak.sh: use kmemleak Patch series "selftests/damon: improve leak detection and wss estimation reliability". Two DAMON selftets, namely 'sysfs_memcg_leak' and 'sysfs_update_schemes_tried_regions_wss_estimation' frequently show intermittent failures due to their unreliable leak detection and working set size estimation. Make those more reliable. This patch (of 5): sysfs_memcg_path_leak.sh determines if the memory leak has happened by seeing if Slab size on /proc/meminfo increases more than expected after an action. Depending on the system and background workloads, the reasonable expectation varies. For the reason, the test frequently shows intermittent failures. Use kmemleak, which is much more reliable and correct, instead. Link: https://lkml.kernel.org/r/20260117020731.226785-1-sj@kernel.org Link: https://lkml.kernel.org/r/20260117020731.226785-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../selftests/damon/sysfs_memcg_path_leak.sh | 26 ++++++++++++---------- 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/damon/sysfs_memcg_path_leak.sh b/tools/testing/selftests/damon/sysfs_memcg_path_leak.sh index 64c5d8c518a4..33a7ff43ed6c 100755 --- a/tools/testing/selftests/damon/sysfs_memcg_path_leak.sh +++ b/tools/testing/selftests/damon/sysfs_memcg_path_leak.sh @@ -14,6 +14,13 @@ then exit $ksft_skip fi +kmemleak="/sys/kernel/debug/kmemleak" +if [ ! -f "$kmemleak" ] +then + echo "$kmemleak not found" + exit $ksft_skip +fi + # ensure filter directory echo 1 > "$damon_sysfs/kdamonds/nr_kdamonds" echo 1 > "$damon_sysfs/kdamonds/0/contexts/nr_contexts" @@ -22,22 +29,17 @@ echo 1 > "$damon_sysfs/kdamonds/0/contexts/0/schemes/0/filters/nr_filters" filter_dir="$damon_sysfs/kdamonds/0/contexts/0/schemes/0/filters/0" -before_kb=$(grep Slab /proc/meminfo | awk '{print $2}') - -# try to leak 3000 KiB -for i in {1..102400}; +# try to leak 128 times +for i in {1..128}; do echo "012345678901234567890123456789" > "$filter_dir/memcg_path" done -after_kb=$(grep Slab /proc/meminfo | awk '{print $2}') -# expect up to 1500 KiB free from other tasks memory -expected_after_kb_max=$((before_kb + 1500)) - -if [ "$after_kb" -gt "$expected_after_kb_max" ] +echo scan > "$kmemleak" +kmemleak_report=$(cat "$kmemleak") +if [ "$kmemleak_report" = "" ] then - echo "maybe memcg_path are leaking: $before_kb -> $after_kb" - exit 1 -else exit 0 fi +echo "$kmemleak_report" +exit 1 -- cgit v1.2.3 From 891d206e27dc1a684e460b079d2b53e17135d693 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 16 Jan 2026 18:07:25 -0800 Subject: selftests/damon/wss_estimation: test for up to 160 MiB working set size DAMON reads and writes Accessed bits of page tables without manual TLB flush for two reasons. First, it minimizes the overhead. Second, real systems that need DAMON are expected to be memory intensive enough to cause periodic TLB flushes. For test setups that use small test workloads, however, the system's TLB could be big enough to cover whole or most accesses of the test workload. In this case, no page table walk happens and DAMON cannot show any access from the test workload. The test workload for DAMON's working set size estimation selftest is such a case. It accesses only 10 MiB working set, and it turned out there are test setups that have TLBs large enough to cover the 10 MiB data accesses. As a result, the test fails depending on the test machine. Make it more reliable by trying larger working sets up to 160 MiB when it fails. Link: https://lkml.kernel.org/r/20260117020731.226785-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- ..._update_schemes_tried_regions_wss_estimation.py | 29 +++++++++++++++++----- 1 file changed, 23 insertions(+), 6 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py b/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py index 90ad7409a7a6..bf48ef8e5241 100755 --- a/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py +++ b/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py @@ -6,9 +6,8 @@ import time import _damon_sysfs -def main(): - # access two 10 MiB memory regions, 2 second per each - sz_region = 10 * 1024 * 1024 +def pass_wss_estimation(sz_region): + # access two regions of given size, 2 seocnds per each region proc = subprocess.Popen(['./access_memory', '2', '%d' % sz_region, '2000']) kdamonds = _damon_sysfs.Kdamonds([_damon_sysfs.Kdamond( contexts=[_damon_sysfs.DamonCtx( @@ -36,20 +35,38 @@ def main(): wss_collected.append( kdamonds.kdamonds[0].contexts[0].schemes[0].tried_bytes) + err = kdamonds.stop() + if err is not None: + print('kdamond stop failed: %s' % err) + exit(1) wss_collected.sort() acceptable_error_rate = 0.2 for percentile in [50, 75]: sample = wss_collected[int(len(wss_collected) * percentile / 100)] error_rate = abs(sample - sz_region) / sz_region - print('%d-th percentile (%d) error %f' % - (percentile, sample, error_rate)) + print('%d-th percentile error %f (expect %d, result %d)' % + (percentile, error_rate, sz_region, sample)) if error_rate > acceptable_error_rate: print('the error rate is not acceptable (> %f)' % acceptable_error_rate) print('samples are as below') print('\n'.join(['%d' % wss for wss in wss_collected])) - exit(1) + return False + return True + +def main(): + # DAMON doesn't flush TLB. If the system has large TLB that can cover + # whole test working set, DAMON cannot see the access. Test up to 160 MiB + # test working set. + sz_region_mb = 10 + max_sz_region_mb = 160 + while sz_region_mb <= max_sz_region_mb: + test_pass = pass_wss_estimation(sz_region_mb * 1024 * 1024) + if test_pass is True: + exit(0) + sz_region_mb *= 2 + exit(1) if __name__ == '__main__': main() -- cgit v1.2.3 From 514d1bcb58e0ef93fafa4f9c3035d604a4219867 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 16 Jan 2026 18:07:26 -0800 Subject: selftests/damon/access_memory: add repeat mode 'access_memory' is an artificial memory access generator program that is used for a few DAMON selftests. It accesses a given number of regions one by one only once, and exits. Depending on systems, the test workload may exit faster than expected, making the tests unreliable. For reliable control of the artificial memory access pattern, add a mode to make it repeat running. Link: https://lkml.kernel.org/r/20260117020731.226785-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/access_memory.c | 29 +++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'tools/testing') diff --git a/tools/testing/selftests/damon/access_memory.c b/tools/testing/selftests/damon/access_memory.c index 56b17e8fe1be..567793b11107 100644 --- a/tools/testing/selftests/damon/access_memory.c +++ b/tools/testing/selftests/damon/access_memory.c @@ -8,6 +8,11 @@ #include #include +enum access_mode { + ACCESS_MODE_ONCE, + ACCESS_MODE_REPEAT, +}; + int main(int argc, char *argv[]) { char **regions; @@ -15,10 +20,12 @@ int main(int argc, char *argv[]) int nr_regions; int sz_region; int access_time_ms; + enum access_mode mode = ACCESS_MODE_ONCE; + int i; - if (argc != 4) { - printf("Usage: %s