summaryrefslogtreecommitdiff
path: root/tools
diff options
context:
space:
mode:
authorPuranjay Mohan <puranjay@kernel.org>2026-02-25 16:43:38 +0300
committerAlexei Starovoitov <ast@kernel.org>2026-03-03 19:43:10 +0300
commitb1d6bd5462f1e16adb805ce293bd11e9d7c47e6c (patch)
treec560bb3d36d940dec0f8ef1c07f27022d37ae3e3 /tools
parentb0cc2e069fae3fba74381609ebc523ceca85cd9a (diff)
downloadlinux-b1d6bd5462f1e16adb805ce293bd11e9d7c47e6c.tar.xz
bpf, arm64: Use ORR-based MOV for general-purpose registers
The A64_MOV macro unconditionally uses ADD Rd, Rn, #0 to implement register moves. While functionally correct, this is not the canonical encoding when both operands are general-purpose registers. On AArch64, MOV has two aliases depending on the operand registers: - MOV <Xd|SP>, <Xn|SP> → ADD <Xd|SP>, <Xn|SP>, #0 - MOV <Xd>, <Xn> → ORR <Xd>, XZR, <Xn> The ADD form is required when the stack pointer is involved (as ORR does not accept SP), while the ORR form is the preferred encoding for general-purpose registers. The ORR encoding is also measurably faster on modern microarchitectures. A microbenchmark [1] comparing dependent chains of MOV (ORR) vs ADD #0 on an ARM Neoverse-V2 (72-core, 3.4 GHz) shows: === mov (ORR Xd, XZR, Xn) === run1 cycles/op=0.749859456 run2 cycles/op=0.749991250 run3 cycles/op=0.749601847 avg cycles/op=0.749817518 === add0 (ADD Xd, Xn, #0) === run1 cycles/op=1.004777689 run2 cycles/op=1.004558266 run3 cycles/op=1.004806559 avg cycles/op=1.004714171 The ORR form completes in ~0.75 cycles/op vs ~1.00 cycles/op for ADD #0, a ~25% improvement. This is likely because the CPU's register renaming hardware can eliminate ORR-based moves, while ADD #0 must go through the ALU pipeline. Update A64_MOV to select the appropriate encoding at JIT time: use ADD when either register is A64_SP, and ORR (via aarch64_insn_gen_move_reg()) otherwise. Update verifier_private_stack selftests to expect "mov x7, x0" instead of "add x7, x0, #0x0" in the JITed instruction checks, matching the new ORR-based encoding. [1] https://github.com/puranjaymohan/scripts/blob/main/arm64/bench/run_mov_vs_add0.sh Signed-off-by: Puranjay Mohan <puranjay@kernel.org> Acked-by: Xu Kuohai <xukuohai@huawei.com> Link: https://lore.kernel.org/r/20260225134339.2723288-1-puranjay@kernel.org Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Diffstat (limited to 'tools')
-rw-r--r--tools/testing/selftests/bpf/progs/verifier_private_stack.c8
1 files changed, 4 insertions, 4 deletions
diff --git a/tools/testing/selftests/bpf/progs/verifier_private_stack.c b/tools/testing/selftests/bpf/progs/verifier_private_stack.c
index 1ecd34ebde19..646e8ef82051 100644
--- a/tools/testing/selftests/bpf/progs/verifier_private_stack.c
+++ b/tools/testing/selftests/bpf/progs/verifier_private_stack.c
@@ -170,11 +170,11 @@ __jited(" mrs x10, TPIDR_EL{{[0-1]}}")
__jited(" add x27, x27, x10")
__jited(" add x25, x27, {{.*}}")
__jited(" bl 0x{{.*}}")
-__jited(" add x7, x0, #0x0")
+__jited(" mov x7, x0")
__jited(" mov x0, #0x2a")
__jited(" str x0, [x27]")
__jited(" bl 0x{{.*}}")
-__jited(" add x7, x0, #0x0")
+__jited(" mov x7, x0")
__jited(" mov x7, #0x0")
__jited(" ldp x25, x27, [sp], {{.*}}")
__naked void private_stack_callback(void)
@@ -220,7 +220,7 @@ __jited(" mov x0, #0x2a")
__jited(" str x0, [x27]")
__jited(" mov x0, #0x0")
__jited(" bl 0x{{.*}}")
-__jited(" add x7, x0, #0x0")
+__jited(" mov x7, x0")
__jited(" ldp x27, x28, [sp], #0x10")
int private_stack_exception_main_prog(void)
{
@@ -258,7 +258,7 @@ __jited(" add x25, x27, {{.*}}")
__jited(" mov x0, #0x2a")
__jited(" str x0, [x27]")
__jited(" bl 0x{{.*}}")
-__jited(" add x7, x0, #0x0")
+__jited(" mov x7, x0")
__jited(" ldp x27, x28, [sp], #0x10")
int private_stack_exception_sub_prog(void)
{