From 27ae400e6e888153ded1ad807a94a94e506dd2df Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 11 May 2026 11:46:11 +0100 Subject: KVM: arm64: nv: Track L2 to L1 exception emulation While we currently track that we are emulating a nested ERET from L1 to L2, we currently don't track the reverse direction (an exception going from L2 to L1). Add a new vcpu state flag for this purpose, which will see some use shortly. Signed-off-by: Marc Zyngier Link: https://patch.msgid.link/20260520085036.541666-2-maz@kernel.org --- arch/arm64/include/asm/kvm_host.h | 3 ++- arch/arm64/kvm/emulate-nested.c | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 65eead8362e0..c79747d5f4dd 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1112,7 +1112,8 @@ struct kvm_vcpu_arch { #define IN_NESTED_ERET __vcpu_single_flag(sflags, BIT(7)) /* SError pending for nested guest */ #define NESTED_SERROR_PENDING __vcpu_single_flag(sflags, BIT(8)) - +/* KVM is currently emulating an L2 to L1 exception */ +#define IN_NESTED_EXCEPTION __vcpu_single_flag(sflags, BIT(9)) /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ #define vcpu_sve_pffr(vcpu) (kern_hyp_va((vcpu)->arch.sve_state) + \ diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index dba7ced74ca5..15c691a6266d 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -2862,6 +2862,8 @@ static int kvm_inject_nested(struct kvm_vcpu *vcpu, u64 esr_el2, preempt_disable(); + vcpu_set_flag(vcpu, IN_NESTED_EXCEPTION); + /* * We may have an exception or PC update in the EL0/EL1 context. * Commit it before entering EL2. 
@@ -2884,6 +2886,8 @@ static int kvm_inject_nested(struct kvm_vcpu *vcpu, u64 esr_el2, __kvm_adjust_pc(vcpu); kvm_arch_vcpu_load(vcpu, smp_processor_id()); + vcpu_clear_flag(vcpu, IN_NESTED_EXCEPTION); + preempt_enable(); if (kvm_vcpu_has_pmu(vcpu)) -- cgit v1.2.3 From 435c466196148ae116f616e6cda97c33281defc2 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 11 May 2026 11:46:41 +0100 Subject: KVM: arm64: nv: Don't save/restore FP register during a nested ERET or exception When switching between L1 and L2, we save the old state using kvm_arch_vcpu_put(), mutate the state in memory, then load the new state using kvm_arch_vcpu_load(). Any live FPSIMD/SVE state is saved and unbound, such that it can be lazily restored on a subsequent trap. The FPSIMD/SVE state is shared by exception levels, and only a handful of related control registers need to be changed when transitioning between L1 and L2. The save/restore of the common state is needless overhead, especially as trapping becomes exponentially more expensive with nesting. Avoid this overhead by leaving the common FPSIMD/SVE state live on the CPU, and only switching the state that is distinct for L1 and L2: - the trap controls: the effective values are recomputed on each entry into the guest to take the EL into account and merge the L0 and L1 configuration if in a nested context, or directly use the L0 configuration in non-nested context (see __activate_traps()). - the VL settings: the effective values are also recomputed on each entry into the guest (see fpsimd_lazy_switch_to_guest()). Since we appear to cover all bases, use the vcpu flags indicating the handling of a nested ERET or exception delivery to avoid the whole FP save/restore shenanigans. SME will have to be similarly dealt with when it eventually gets supported.
For an EL1 L3 guest where L1 and L2 have this optimisation, this results in at least a 10% wall clock reduction when running an I/O heavy workload, generating a high rate of nested exceptions. Reviewed-by: Joey Gouly Acked-by: Mark Rutland Signed-off-by: Marc Zyngier Link: https://patch.msgid.link/20260520085036.541666-3-maz@kernel.org --- arch/arm64/kvm/fpsimd.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 15e17aca1dec..3f6b1e29cd6b 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -28,6 +28,20 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) if (!system_supports_fpsimd()) return; + /* + * Avoid needless save/restore of the guest's common + * FPSIMD/SVE/SME regs during transitions between L1/L2. + * + * These transitions only happens in a non-preemptible context + * where the host regs have already been saved and unbound. The + * live registers are either free or owned by the guest. + */ + if (vcpu_get_flag(vcpu, IN_NESTED_ERET) || + vcpu_get_flag(vcpu, IN_NESTED_EXCEPTION)) { + WARN_ON_ONCE(host_owns_fp_regs()); + return; + } + /* * Ensure that any host FPSIMD/SVE/SME state is saved and unbound such * that the host kernel is responsible for restoring this state upon @@ -102,6 +116,18 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) { unsigned long flags; + /* + * See comment in kvm_arch_vcpu_load_fp(). Note that we also rely on + * the guest's max VL to have been set by fpsimd_lazy_switch_to_host() + * so that any intervening kernel-mode SIMD (NEON or otherwise) + * operation sees the full guest state that needs saving. 
+ */ + if (vcpu_get_flag(vcpu, IN_NESTED_ERET) || + vcpu_get_flag(vcpu, IN_NESTED_EXCEPTION)) { + WARN_ON_ONCE(host_owns_fp_regs()); + return; + } + local_irq_save(flags); if (guest_owns_fp_regs()) { -- cgit v1.2.3 From 68a612d4dbc7f2b9dac731c79676a21fce573d29 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 20 May 2026 11:01:55 +0100 Subject: KVM: arm64: timer: Repaint kvm_timer_{should,irq_can}_fire() to kvm_timer_{pending,enabled}() kvm_timer_should_fire() seems to date back to a time where the author of the timer code didn't seem to have made the word "pending" part of their vocabulary. Having since slightly improved on that front, let's rename this predicate to kvm_timer_pending(), which clearly indicates whether the timer interrupt is pending or not. Similarly, kvm_timer_irq_can_fire() is renamed to kvm_timer_enabled(). Reviewed-by: Joey Gouly Reviewed-by: Oliver Upton Link: https://patch.msgid.link/20260520100200.543845-2-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arch_timer.c | 55 ++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index cbea4d9ee955..d8add34717f0 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -39,10 +39,9 @@ static const u8 default_ppi[] = { [TIMER_HVTIMER] = 28, }; -static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx); static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, struct arch_timer_context *timer_ctx); -static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx); +static bool kvm_timer_pending(struct arch_timer_context *timer_ctx); static void kvm_arm_timer_write(struct kvm_vcpu *vcpu, struct arch_timer_context *timer, enum kvm_arch_timer_regs treg, @@ -224,7 +223,7 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) else ctx = map.direct_ptimer; - if (kvm_timer_should_fire(ctx)) + if 
(kvm_timer_pending(ctx)) kvm_timer_update_irq(vcpu, true, ctx); if (userspace_irqchip(vcpu->kvm) && @@ -257,7 +256,7 @@ static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx) return kvm_counter_compute_delta(timer_ctx, timer_get_cval(timer_ctx)); } -static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx) +static bool kvm_timer_enabled(struct arch_timer_context *timer_ctx) { WARN_ON(timer_ctx && timer_ctx->loaded); return timer_ctx && @@ -294,7 +293,7 @@ static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu) struct arch_timer_context *ctx = &vcpu->arch.timer_cpu.timers[i]; WARN(ctx->loaded, "timer %d loaded\n", i); - if (kvm_timer_irq_can_fire(ctx)) + if (kvm_timer_enabled(ctx)) min_delta = min(min_delta, kvm_timer_compute_delta(ctx)); } @@ -358,7 +357,7 @@ static enum hrtimer_restart kvm_hrtimer_expire(struct hrtimer *hrt) return HRTIMER_NORESTART; } -static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) +static bool kvm_timer_pending(struct arch_timer_context *timer_ctx) { enum kvm_arch_timers index; u64 cval, now; @@ -391,7 +390,7 @@ static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) !(cnt_ctl & ARCH_TIMER_CTRL_IT_MASK); } - if (!kvm_timer_irq_can_fire(timer_ctx)) + if (!kvm_timer_enabled(timer_ctx)) return false; cval = timer_get_cval(timer_ctx); @@ -417,9 +416,9 @@ void kvm_timer_update_run(struct kvm_vcpu *vcpu) /* Populate the device bitmap with the timer states */ regs->device_irq_level &= ~(KVM_ARM_DEV_EL1_VTIMER | KVM_ARM_DEV_EL1_PTIMER); - if (kvm_timer_should_fire(vtimer)) + if (kvm_timer_pending(vtimer)) regs->device_irq_level |= KVM_ARM_DEV_EL1_VTIMER; - if (kvm_timer_should_fire(ptimer)) + if (kvm_timer_pending(ptimer)) regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER; } @@ -473,21 +472,21 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, /* Only called for a fully emulated timer */ static void timer_emulate(struct arch_timer_context *ctx) { - bool 
should_fire = kvm_timer_should_fire(ctx); + bool pending = kvm_timer_pending(ctx); - trace_kvm_timer_emulate(ctx, should_fire); + trace_kvm_timer_emulate(ctx, pending); - if (should_fire != ctx->irq.level) - kvm_timer_update_irq(timer_context_to_vcpu(ctx), should_fire, ctx); + if (pending != ctx->irq.level) + kvm_timer_update_irq(timer_context_to_vcpu(ctx), pending, ctx); - kvm_timer_update_status(ctx, should_fire); + kvm_timer_update_status(ctx, pending); /* - * If the timer can fire now, we don't need to have a soft timer - * scheduled for the future. If the timer cannot fire at all, - * then we also don't need a soft timer. + * If the timer is pending, we don't need to have a soft timer + * scheduled for the future. If the timer is disabled, then + * we don't need a soft timer either. */ - if (should_fire || !kvm_timer_irq_can_fire(ctx)) + if (pending || !kvm_timer_enabled(ctx)) return; soft_timer_start(&ctx->hrtimer, kvm_timer_compute_delta(ctx)); @@ -594,10 +593,10 @@ static void kvm_timer_blocking(struct kvm_vcpu *vcpu) * If no timers are capable of raising interrupts (disabled or * masked), then there's no more work for us to do. */ - if (!kvm_timer_irq_can_fire(map.direct_vtimer) && - !kvm_timer_irq_can_fire(map.direct_ptimer) && - !kvm_timer_irq_can_fire(map.emul_vtimer) && - !kvm_timer_irq_can_fire(map.emul_ptimer) && + if (!kvm_timer_enabled(map.direct_vtimer) && + !kvm_timer_enabled(map.direct_ptimer) && + !kvm_timer_enabled(map.emul_vtimer) && + !kvm_timer_enabled(map.emul_ptimer) && !vcpu_has_wfit_active(vcpu)) return; @@ -685,7 +684,7 @@ static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx) * this point and the register restoration, we'll take the * interrupt anyway. 
*/ - kvm_timer_update_irq(vcpu, kvm_timer_should_fire(ctx), ctx); + kvm_timer_update_irq(vcpu, kvm_timer_pending(ctx), ctx); if (irqchip_in_kernel(vcpu->kvm)) phys_active = kvm_vgic_map_is_active(vcpu, timer_irq(ctx)); @@ -706,7 +705,7 @@ static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu) * this point and the register restoration, we'll take the * interrupt anyway. */ - kvm_timer_update_irq(vcpu, kvm_timer_should_fire(vtimer), vtimer); + kvm_timer_update_irq(vcpu, kvm_timer_pending(vtimer), vtimer); /* * When using a userspace irqchip with the architected timers and a @@ -917,8 +916,8 @@ bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) vlevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_VTIMER; plevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_PTIMER; - return kvm_timer_should_fire(vtimer) != vlevel || - kvm_timer_should_fire(ptimer) != plevel; + return kvm_timer_pending(vtimer) != vlevel || + kvm_timer_pending(ptimer) != plevel; } void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) @@ -1006,7 +1005,7 @@ static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu) { struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - if (!kvm_timer_should_fire(vtimer)) { + if (!kvm_timer_pending(vtimer)) { kvm_timer_update_irq(vcpu, false, vtimer); if (static_branch_likely(&has_gic_active_state)) set_timer_irq_phys_active(vtimer, false); @@ -1579,7 +1578,7 @@ static bool kvm_arch_timer_get_input_level(int vintid) ctx = vcpu_get_timer(vcpu, i); if (timer_irq(ctx) == vintid) - return kvm_timer_should_fire(ctx); + return kvm_timer_pending(ctx); } /* A timer IRQ has fired, but no matching timer was found? 
*/ -- cgit v1.2.3 From 0d27b4b351493cb2fe1f87cd152856704d4e141d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 20 May 2026 11:01:56 +0100 Subject: KVM: arm64: Simplify userspace notification of interrupt state The userspace notification of interrupts has a few problems: - it is utterly pointless - it is annoyingly split between detecting the need for notification and the population of the interrupts in the run structure We can't do anything about the former (yet), but the latter can be addressed. If we detect that we must notify userspace, we know that we are going to exit, as we populate the exit status. Which means we can also populate the interrupt state at this stage and be done with it. This simplifies the structure of the code. Reviewed-by: Oliver Upton Link: https://patch.msgid.link/20260520100200.543845-3-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arch_timer.c | 49 ++++++++++++++++++-------------------------- arch/arm64/kvm/arm.c | 24 +++++++++++++--------- arch/arm64/kvm/pmu-emul.c | 18 ++++++---------- include/kvm/arm_arch_timer.h | 2 +- include/kvm/arm_pmu.h | 4 ++-- 5 files changed, 43 insertions(+), 54 deletions(-) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index d8add34717f0..7236dd6a99e6 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -404,22 +404,30 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) return vcpu_has_wfit_active(vcpu) && wfit_delay_ns(vcpu) == 0; } +static u64 kvm_timer_needs_notify(struct kvm_vcpu *vcpu) +{ + u64 v = vcpu->run->s.regs.device_irq_level; + + v ^= kvm_timer_pending(vcpu_vtimer(vcpu)) ? KVM_ARM_DEV_EL1_VTIMER : 0; + v ^= kvm_timer_pending(vcpu_ptimer(vcpu)) ?
KVM_ARM_DEV_EL1_PTIMER : 0; + + return v & (KVM_ARM_DEV_EL1_VTIMER | KVM_ARM_DEV_EL1_PTIMER); +} + +bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) +{ + return !!kvm_timer_needs_notify(vcpu); +} + /* * Reflect the timer output level into the kvm_run structure */ -void kvm_timer_update_run(struct kvm_vcpu *vcpu) +bool kvm_timer_update_run(struct kvm_vcpu *vcpu) { - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - struct kvm_sync_regs *regs = &vcpu->run->s.regs; - - /* Populate the device bitmap with the timer states */ - regs->device_irq_level &= ~(KVM_ARM_DEV_EL1_VTIMER | - KVM_ARM_DEV_EL1_PTIMER); - if (kvm_timer_pending(vtimer)) - regs->device_irq_level |= KVM_ARM_DEV_EL1_VTIMER; - if (kvm_timer_pending(ptimer)) - regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER; + u64 mask = kvm_timer_needs_notify(vcpu); + if (mask) + vcpu->run->s.regs.device_irq_level ^= mask; + return !!mask; } static void kvm_timer_update_status(struct arch_timer_context *ctx, bool level) @@ -903,23 +911,6 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) timer_set_traps(vcpu, &map); } -bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) -{ - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - struct kvm_sync_regs *sregs = &vcpu->run->s.regs; - bool vlevel, plevel; - - if (likely(irqchip_in_kernel(vcpu->kvm))) - return false; - - vlevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_VTIMER; - plevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_PTIMER; - - return kvm_timer_pending(vtimer) != vlevel || - kvm_timer_pending(ptimer) != plevel; -} - void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 8bb2c7422cc8..6e6dc17f8b60 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1163,6 +1163,15 @@ static bool vcpu_mode_is_bad_32bit(struct 
kvm_vcpu *vcpu) return !kvm_supports_32bit_el0(); } +static bool kvm_irq_update_run(struct kvm_vcpu *vcpu) +{ + bool r; + + r = kvm_timer_update_run(vcpu); + r |= kvm_pmu_update_run(vcpu); + return r; +} + /** * kvm_vcpu_exit_request - returns true if the VCPU should *not* enter the guest * @vcpu: The VCPU pointer @@ -1184,13 +1193,11 @@ static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret) /* * If we're using a userspace irqchip, then check if we need * to tell a userspace irqchip about timer or PMU level - * changes and if so, exit to userspace (the actual level - * state gets updated in kvm_timer_update_run and - * kvm_pmu_update_run below). + * changes and if so, exit to userspace while updating the run + * state. */ if (unlikely(!irqchip_in_kernel(vcpu->kvm))) { - if (kvm_timer_should_notify_user(vcpu) || - kvm_pmu_should_notify_user(vcpu)) { + if (unlikely(kvm_irq_update_run(vcpu))) { *ret = -EINTR; run->exit_reason = KVM_EXIT_INTR; return true; @@ -1405,11 +1412,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) ret = handle_exit(vcpu, ret); } - /* Tell userspace about in-kernel device output levels */ - if (unlikely(!irqchip_in_kernel(vcpu->kvm))) { - kvm_timer_update_run(vcpu); - kvm_pmu_update_run(vcpu); - } + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) + kvm_irq_update_run(vcpu); kvm_sigset_deactivate(vcpu); diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index e1860acae641..31a472a2c488 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -413,27 +413,21 @@ static void kvm_pmu_update_state(struct kvm_vcpu *vcpu) bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu) { - struct kvm_pmu *pmu = &vcpu->arch.pmu; struct kvm_sync_regs *sregs = &vcpu->run->s.regs; bool run_level = sregs->device_irq_level & KVM_ARM_DEV_PMU; - if (likely(irqchip_in_kernel(vcpu->kvm))) - return false; - - return pmu->irq_level != run_level; + return kvm_pmu_overflow_status(vcpu) != run_level; } /* * Reflect the PMU 
overflow interrupt output level into the kvm_run structure */ -void kvm_pmu_update_run(struct kvm_vcpu *vcpu) +bool kvm_pmu_update_run(struct kvm_vcpu *vcpu) { - struct kvm_sync_regs *regs = &vcpu->run->s.regs; - - /* Populate the timer bitmap for user space */ - regs->device_irq_level &= ~KVM_ARM_DEV_PMU; - if (vcpu->arch.pmu.irq_level) - regs->device_irq_level |= KVM_ARM_DEV_PMU; + bool update = kvm_pmu_should_notify_user(vcpu); + if (update) + vcpu->run->s.regs.device_irq_level ^= KVM_ARM_DEV_PMU; + return update; } /** diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index bf8cc9589bd0..9e4076eebd29 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -104,7 +104,7 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu); void kvm_timer_sync_nested(struct kvm_vcpu *vcpu); void kvm_timer_sync_user(struct kvm_vcpu *vcpu); bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu); -void kvm_timer_update_run(struct kvm_vcpu *vcpu); +bool kvm_timer_update_run(struct kvm_vcpu *vcpu); void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu); void kvm_timer_init_vm(struct kvm *kvm); diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 0a36a3d5c894..3e844c5ee917 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -54,7 +54,7 @@ void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu); void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu); bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu); -void kvm_pmu_update_run(struct kvm_vcpu *vcpu); +bool kvm_pmu_update_run(struct kvm_vcpu *vcpu); void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, @@ -131,7 +131,7 @@ static inline bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu) { return false; } -static inline void kvm_pmu_update_run(struct kvm_vcpu *vcpu) {} 
+static inline bool kvm_pmu_update_run(struct kvm_vcpu *vcpu) { return false; } static inline void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, -- cgit v1.2.3 From ac7002031852ab8f75b3debb1a4c4b2d1ff5a26c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 20 May 2026 11:01:57 +0100 Subject: KVM: arm64: timer: Kill the per-timer irq level cache The timer code makes use of a per-timer irq level cache, which looks like a very minor optimisation to avoid taking a lock upon updating the GIC view of the interrupt when it is unchanged from the previous state. This is coming in the way of more important correctness issues, so get rid of the cache, which simplifies a couple of minor things. Reviewed-by: Oliver Upton Link: https://patch.msgid.link/20260520100200.543845-4-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arch_timer.c | 20 +++++++++----------- include/kvm/arm_arch_timer.h | 5 ----- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 7236dd6a99e6..c3b8257888e8 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -453,9 +453,8 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, { kvm_timer_update_status(timer_ctx, new_level); - timer_ctx->irq.level = new_level; trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_irq(timer_ctx), - timer_ctx->irq.level); + new_level); if (userspace_irqchip(vcpu->kvm)) return; @@ -473,7 +472,7 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, kvm_vgic_inject_irq(vcpu->kvm, vcpu, timer_irq(timer_ctx), - timer_ctx->irq.level, + new_level, timer_ctx); } @@ -484,10 +483,7 @@ static void timer_emulate(struct arch_timer_context *ctx) trace_kvm_timer_emulate(ctx, pending); - if (pending != ctx->irq.level) - 
kvm_timer_update_irq(timer_context_to_vcpu(ctx), pending, ctx); - - kvm_timer_update_status(ctx, pending); + kvm_timer_update_irq(timer_context_to_vcpu(ctx), pending, ctx); /* * If the timer is pending, we don't need to have a soft timer @@ -684,6 +680,7 @@ static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, boo static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx) { struct kvm_vcpu *vcpu = timer_context_to_vcpu(ctx); + bool pending = kvm_timer_pending(ctx); bool phys_active = false; /* @@ -692,12 +689,12 @@ static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx) * this point and the register restoration, we'll take the * interrupt anyway. */ - kvm_timer_update_irq(vcpu, kvm_timer_pending(ctx), ctx); + kvm_timer_update_irq(vcpu, pending, ctx); if (irqchip_in_kernel(vcpu->kvm)) phys_active = kvm_vgic_map_is_active(vcpu, timer_irq(ctx)); - phys_active |= ctx->irq.level; + phys_active |= pending; phys_active |= vgic_is_v5(vcpu->kvm); set_timer_irq_phys_active(ctx, phys_active); @@ -706,6 +703,7 @@ static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx) static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu) { struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + bool pending = kvm_timer_pending(vtimer); /* * Update the timer output so that it is likely to match the @@ -713,7 +711,7 @@ static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu) * this point and the register restoration, we'll take the * interrupt anyway. */ - kvm_timer_update_irq(vcpu, kvm_timer_pending(vtimer), vtimer); + kvm_timer_update_irq(vcpu, pending, vtimer); /* * When using a userspace irqchip with the architected timers and a @@ -725,7 +723,7 @@ static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu) * being de-asserted, we unmask the interrupt again so that we exit * from the guest when the timer fires. 
*/ - if (vtimer->irq.level) + if (pending) disable_percpu_irq(host_vtimer_irq); else enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index 9e4076eebd29..15a4f97f8105 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -66,11 +66,6 @@ struct arch_timer_context { */ bool loaded; - /* Output level of the timer IRQ */ - struct { - bool level; - } irq; - /* Who am I? */ enum kvm_arch_timers timer_id; -- cgit v1.2.3 From 2772383afc5c65d6242f62947b5c184ffb049359 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 20 May 2026 11:01:58 +0100 Subject: KVM: arm64: pmu: Kill the PMU interrupt level cache Just like the timer, the PMU has an interrupt cache that serves little purpose. Drop it. Reviewed-by: Oliver Upton Link: https://patch.msgid.link/20260520100200.543845-5-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/pmu-emul.c | 13 +++---------- include/kvm/arm_pmu.h | 1 - 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 31a472a2c488..edb21239478a 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -396,19 +396,12 @@ static bool kvm_pmu_overflow_status(struct kvm_vcpu *vcpu) static void kvm_pmu_update_state(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = &vcpu->arch.pmu; - bool overflow; - overflow = kvm_pmu_overflow_status(vcpu); - if (pmu->irq_level == overflow) + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) return; - pmu->irq_level = overflow; - - if (likely(irqchip_in_kernel(vcpu->kvm))) { - int ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu, - pmu->irq_num, overflow, pmu); - WARN_ON(ret); - } + WARN_ON(kvm_vgic_inject_irq(vcpu->kvm, vcpu, pmu->irq_num, + kvm_pmu_overflow_status(vcpu), pmu)); } bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu) diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 3e844c5ee917..b5e5942204fc 100644 --- 
a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -32,7 +32,6 @@ struct kvm_pmu { struct kvm_pmc pmc[KVM_ARMV8_PMU_MAX_COUNTERS]; int irq_num; bool created; - bool irq_level; }; struct arm_pmu_entry { -- cgit v1.2.3 From 1a8685ed8cd1ded20d0c81070a49b1cddf70481d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 20 May 2026 11:01:59 +0100 Subject: KVM: arm64: vgic-v2: Force vgic init on injection outside the run loop Make sure that any attempt to inject an interrupt from userspace or an irqfd results in the GICv2 lazy init taking place. This is not currently necessary as the init is also performed on *any* interrupt injection. But as we're about to remove that, let's introduce it here. Reviewed-by: Oliver Upton Link: https://patch.msgid.link/20260520100200.543845-6-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arm.c | 15 +++++++++++++-- arch/arm64/kvm/vgic/vgic-irqfd.c | 6 ++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 6e6dc17f8b60..cfb7921fc7d7 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -51,6 +51,7 @@ #include +#include "vgic/vgic.h" #include "sys_regs.h" static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT; @@ -1497,8 +1498,13 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, return vcpu_interrupt_line(vcpu, irq_num, level); case KVM_ARM_IRQ_TYPE_PPI: - if (!irqchip_in_kernel(kvm)) + if (irqchip_in_kernel(kvm)) { + int ret = vgic_lazy_init(kvm); + if (ret) + return ret; + } else { return -ENXIO; + } vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); if (!vcpu) @@ -1525,8 +1531,13 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, return kvm_vgic_inject_irq(kvm, vcpu, irq_num, level, NULL); case KVM_ARM_IRQ_TYPE_SPI: - if (!irqchip_in_kernel(kvm)) + if (irqchip_in_kernel(kvm)) { + int ret = vgic_lazy_init(kvm); + if (ret) + return ret; + } else { return -ENXIO; + } if (vgic_is_v5(kvm)) { /* Build a
GICv5-style IntID here */ diff --git a/arch/arm64/kvm/vgic/vgic-irqfd.c b/arch/arm64/kvm/vgic/vgic-irqfd.c index b9b86e3a6c86..19a1094536e6 100644 --- a/arch/arm64/kvm/vgic/vgic-irqfd.c +++ b/arch/arm64/kvm/vgic/vgic-irqfd.c @@ -20,9 +20,15 @@ static int vgic_irqfd_set_irq(struct kvm_kernel_irq_routing_entry *e, int level, bool line_status) { unsigned int spi_id = e->irqchip.pin + VGIC_NR_PRIVATE_IRQS; + int ret; if (!vgic_valid_spi(kvm, spi_id)) return -EINVAL; + + ret = vgic_lazy_init(kvm); + if (ret) + return ret; + return kvm_vgic_inject_irq(kvm, NULL, spi_id, level, NULL); } -- cgit v1.2.3 From 958023d269e0312d10da85a6a49438d2e107dead Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 20 May 2026 11:02:00 +0100 Subject: KVM: arm64: vgic-v2: Don't init the vgic on in-kernel interrupt injection We now have the lazy init on three paths: - on first run of a vcpu - on first injection of an interrupt from userspace and irqfd - on first injection of an interrupt from kernel space as part of the device emulation (timers, PMU, vgic MI) Given that we recompute the state of each in-kernel interrupt every time we are about to enter the guest, we can drop the lazy init from the kernel injection path. This solves a bunch of issues related to vgic_lazy_init() being called in non-preemptible context, such as vcpu reset. 
Reviewed-by: Oliver Upton Link: https://patch.msgid.link/20260520100200.543845-7-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 1e9fe8764584..9e29f03d3463 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -534,11 +534,9 @@ int kvm_vgic_inject_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, { struct vgic_irq *irq; unsigned long flags; - int ret; - ret = vgic_lazy_init(kvm); - if (ret) - return ret; + if (unlikely(!vgic_initialized(kvm))) + return 0; if (!vcpu && irq_is_private(kvm, intid)) return -EINVAL; -- cgit v1.2.3 From 09a4b56100f8667f192f8a6aa4eae190331066c9 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 20 May 2026 10:19:32 +0100 Subject: KVM: arm64: vgic-v5: Add for_each_visible_v5_ppi() iterator We have multiple instances of iterators walking the vgic_ppi_mask mask, and the way it is written has a tendency to make one's eyes bleed. Factor it as a helper and use that across the code base. Link: https://lore.kernel.org/r/20260520091949.542365-2-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 2 +- arch/arm64/kvm/vgic/vgic-v5.c | 10 ++++------ arch/arm64/kvm/vgic/vgic.h | 3 +++ 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 148fc3400ea8..513f5f1429b5 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -751,7 +751,7 @@ static bool access_gicv5_ppi_enabler(struct kvm_vcpu *vcpu, * Sync the change in enable states to the vgic_irqs. We consider all * PPIs as we don't expose many to the guest. 
*/ - for_each_set_bit(i, mask, VGIC_V5_NR_PRIVATE_IRQS) { + for_each_visible_v5_ppi(i, vcpu->kvm) { u32 intid = vgic_v5_make_ppi(i); struct vgic_irq *irq; diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c index fdd39ea7f83e..c0d36658ffe7 100644 --- a/arch/arm64/kvm/vgic/vgic-v5.c +++ b/arch/arm64/kvm/vgic/vgic-v5.c @@ -316,7 +316,7 @@ static void vgic_v5_sync_ppi_priorities(struct kvm_vcpu *vcpu) * those actually exposed to the guest by first iterating over the mask * of exposed PPIs. */ - for_each_set_bit(i, vcpu->kvm->arch.vgic.gicv5_vm.vgic_ppi_mask, VGIC_V5_NR_PRIVATE_IRQS) { + for_each_visible_v5_ppi(i, vcpu->kvm) { u32 intid = vgic_v5_make_ppi(i); struct vgic_irq *irq; int pri_idx, pri_reg, pri_bit; @@ -358,7 +358,7 @@ bool vgic_v5_has_pending_ppi(struct kvm_vcpu *vcpu) if (!priority_mask) return false; - for_each_set_bit(i, vcpu->kvm->arch.vgic.gicv5_vm.vgic_ppi_mask, VGIC_V5_NR_PRIVATE_IRQS) { + for_each_visible_v5_ppi(i, vcpu->kvm) { u32 intid = vgic_v5_make_ppi(i); bool has_pending = false; struct vgic_irq *irq; @@ -391,8 +391,7 @@ void vgic_v5_fold_ppi_state(struct kvm_vcpu *vcpu) activer = host_data_ptr(vgic_v5_ppi_state)->activer_exit; pendr = host_data_ptr(vgic_v5_ppi_state)->pendr; - for_each_set_bit(i, vcpu->kvm->arch.vgic.gicv5_vm.vgic_ppi_mask, - VGIC_V5_NR_PRIVATE_IRQS) { + for_each_visible_v5_ppi(i, vcpu->kvm) { u32 intid = vgic_v5_make_ppi(i); struct vgic_irq *irq; @@ -429,8 +428,7 @@ void vgic_v5_flush_ppi_state(struct kvm_vcpu *vcpu) * ICC_PPI_PENDRx_EL1, however. 
*/ bitmap_zero(pendr, VGIC_V5_NR_PRIVATE_IRQS); - for_each_set_bit(i, vcpu->kvm->arch.vgic.gicv5_vm.vgic_ppi_mask, - VGIC_V5_NR_PRIVATE_IRQS) { + for_each_visible_v5_ppi(i, vcpu->kvm) { u32 intid = vgic_v5_make_ppi(i); struct vgic_irq *irq; diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 9d941241c8a2..f45f7e3ec4d6 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -378,6 +378,9 @@ void vgic_v5_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v5_restore_state(struct kvm_vcpu *vcpu); void vgic_v5_save_state(struct kvm_vcpu *vcpu); +#define for_each_visible_v5_ppi(__i, __k) \ + for_each_set_bit(__i, (__k)->arch.vgic.gicv5_vm.vgic_ppi_mask, VGIC_V5_NR_PRIVATE_IRQS) + static inline int vgic_v3_max_apr_idx(struct kvm_vcpu *vcpu) { struct vgic_cpu *cpu_if = &vcpu->arch.vgic_cpu; -- cgit v1.2.3 From 2e83ac3b3b1a1b3b248a4af07efb19a1acb29845 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 20 May 2026 10:19:33 +0100 Subject: KVM: arm64: vgic-v5: Move PPI caps into kvm_vgic_global_state Constant vgic properties are usually kept in kvm_vgic_global_state, but the vgic-v5 code does its own thing. Move the ppi_caps data into the global structure, which has the modest additional advantage of making it ro_after_init. Link: https://lore.kernel.org/r/20260520091949.542365-3-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-v5.c | 2 +- include/kvm/arm_vgic.h | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c index c0d36658ffe7..7c146fccc968 100644 --- a/arch/arm64/kvm/vgic/vgic-v5.c +++ b/arch/arm64/kvm/vgic/vgic-v5.c @@ -10,7 +10,7 @@ #include "vgic.h" -static struct vgic_v5_ppi_caps ppi_caps; +#define ppi_caps kvm_vgic_global_state.vgic_v5_ppi_caps /* * Not all PPIs are guaranteed to be implemented for GICv5. 
Deterermine which diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 1388dc6028a9..ea793479ab25 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -177,6 +177,11 @@ struct vgic_global { bool has_gcie_v3_compat; u32 ich_vtr_el2; + + /* GICv5 PPI capabilities */ + struct { + DECLARE_BITMAP(impl_ppi_mask, VGIC_V5_NR_PRIVATE_IRQS); + } vgic_v5_ppi_caps; }; extern struct vgic_global kvm_vgic_global_state; @@ -492,11 +497,6 @@ struct vgic_v5_cpu_if { struct gicv5_vpe gicv5_vpe; }; -/* What PPI capabilities does a GICv5 host have */ -struct vgic_v5_ppi_caps { - DECLARE_BITMAP(impl_ppi_mask, VGIC_V5_NR_PRIVATE_IRQS); -}; - struct vgic_cpu { /* CPU vif control registers for world switch */ union { -- cgit v1.2.3 From 2295f5eca95d86b98225795a9d4c529796615d53 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 20 May 2026 10:19:34 +0100 Subject: KVM: arm64: vgic-v5: Remove use of __assign_bit() with a constant Using __assign_bit() is very useful when the value of the bit is not known at compile time. In all other cases, __set_bit() and __clear_bit() are the correct tool for the job. This also fixes an odd case of using VGIC_V5_NR_PRIVATE_IRQS as the bit value... Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20260520091949.542365-4-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-v5.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c index 7c146fccc968..4d62b1c31fe8 100644 --- a/arch/arm64/kvm/vgic/vgic-v5.c +++ b/arch/arm64/kvm/vgic/vgic-v5.c @@ -25,13 +25,13 @@ static void vgic_v5_get_implemented_ppis(void) * If we have KVM, we have EL2, which means that we have support for the * EL1 and EL2 Physical & Virtual timers. 
*/ - __assign_bit(GICV5_ARCH_PPI_CNTHP, ppi_caps.impl_ppi_mask, 1); - __assign_bit(GICV5_ARCH_PPI_CNTV, ppi_caps.impl_ppi_mask, 1); - __assign_bit(GICV5_ARCH_PPI_CNTHV, ppi_caps.impl_ppi_mask, 1); - __assign_bit(GICV5_ARCH_PPI_CNTP, ppi_caps.impl_ppi_mask, 1); + __set_bit(GICV5_ARCH_PPI_CNTHP, ppi_caps.impl_ppi_mask); + __set_bit(GICV5_ARCH_PPI_CNTV, ppi_caps.impl_ppi_mask); + __set_bit(GICV5_ARCH_PPI_CNTHV, ppi_caps.impl_ppi_mask); + __set_bit(GICV5_ARCH_PPI_CNTP, ppi_caps.impl_ppi_mask); /* The SW_PPI should be available */ - __assign_bit(GICV5_ARCH_PPI_SW_PPI, ppi_caps.impl_ppi_mask, 1); + __set_bit(GICV5_ARCH_PPI_SW_PPI, ppi_caps.impl_ppi_mask); /* The PMUIRQ is available if we have the PMU */ __assign_bit(GICV5_ARCH_PPI_PMUIRQ, ppi_caps.impl_ppi_mask, system_supports_pmuv3()); @@ -146,9 +146,7 @@ int vgic_v5_init(struct kvm *kvm) /* We only allow userspace to drive the SW_PPI, if it is implemented. */ bitmap_zero(kvm->arch.vgic.gicv5_vm.userspace_ppis, VGIC_V5_NR_PRIVATE_IRQS); - __assign_bit(GICV5_ARCH_PPI_SW_PPI, - kvm->arch.vgic.gicv5_vm.userspace_ppis, - VGIC_V5_NR_PRIVATE_IRQS); + __set_bit(GICV5_ARCH_PPI_SW_PPI, kvm->arch.vgic.gicv5_vm.userspace_ppis); bitmap_and(kvm->arch.vgic.gicv5_vm.userspace_ppis, kvm->arch.vgic.gicv5_vm.userspace_ppis, ppi_caps.impl_ppi_mask, VGIC_V5_NR_PRIVATE_IRQS); @@ -197,7 +195,7 @@ int vgic_v5_finalize_ppi_state(struct kvm *kvm) /* Expose PPIs with an owner or the SW_PPI, only */ scoped_guard(raw_spinlock_irqsave, &irq->irq_lock) { if (irq->owner || i == GICV5_ARCH_PPI_SW_PPI) { - __assign_bit(i, kvm->arch.vgic.gicv5_vm.vgic_ppi_mask, 1); + __set_bit(i, kvm->arch.vgic.gicv5_vm.vgic_ppi_mask); __assign_bit(i, kvm->arch.vgic.gicv5_vm.vgic_ppi_hmr, irq->config == VGIC_CONFIG_LEVEL); } -- cgit v1.2.3 From e6fdea20cffb0108e3d4b5af1c850cccc8e8866c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 20 May 2026 10:19:35 +0100 Subject: KVM: arm64: vgic-v5: Drop pointless ARM64_HAS_GICV5_CPUIF check vgic_v5_get_implemented_ppis() 
can only be called when we have a GICv5, by construction. Remove the pointless check against ARM64_HAS_GICV5_CPUIF. Link: https://lore.kernel.org/r/20260520091949.542365-5-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-v5.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c index 4d62b1c31fe8..0101ec3f5528 100644 --- a/arch/arm64/kvm/vgic/vgic-v5.c +++ b/arch/arm64/kvm/vgic/vgic-v5.c @@ -18,9 +18,6 @@ */ static void vgic_v5_get_implemented_ppis(void) { - if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF)) - return; - /* * If we have KVM, we have EL2, which means that we have support for the * EL1 and EL2 Physical & Virtual timers. -- cgit v1.2.3 From c4a1191f802792fe22fc261fa0e918d048915911 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 20 May 2026 10:19:36 +0100 Subject: KVM: arm64: vgic: Constify struct irq_ops usage vgic-v5 has introduced much more prevalent usage of the struct irq_ops mechanism. In the process, it becomes evident that it suffers from two related problems: - it contains flags, rather than only callbacks - it is mutable, because we need to update the above flags Swap the flags for a helper retrieving the flags, and make all irq_ops const, something that is slightly satisfying. 
Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20260520091949.542365-6-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arch_timer.c | 14 +++++++++----- arch/arm64/kvm/vgic/vgic-v5.c | 2 +- arch/arm64/kvm/vgic/vgic.c | 2 +- include/kvm/arm_vgic.h | 9 +++++---- 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index cbea4d9ee955..f003df76fdda 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -52,11 +52,17 @@ static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu, enum kvm_arch_timer_regs treg); static bool kvm_arch_timer_get_input_level(int vintid); -static struct irq_ops arch_timer_irq_ops = { +static unsigned long kvm_arch_timer_get_irq_flags(void) +{ + return kvm_vgic_global_state.no_hw_deactivation ? VGIC_IRQ_SW_RESAMPLE : 0; +} + +static const struct irq_ops arch_timer_irq_ops = { + .get_flags = kvm_arch_timer_get_irq_flags, .get_input_level = kvm_arch_timer_get_input_level, }; -static struct irq_ops arch_timer_irq_ops_vgic_v5 = { +static const struct irq_ops arch_timer_irq_ops_vgic_v5 = { .get_input_level = kvm_arch_timer_get_input_level, .queue_irq_unlock = vgic_v5_ppi_queue_irq_unlock, .set_direct_injection = vgic_v5_set_ppi_dvi, @@ -1392,8 +1398,6 @@ static int kvm_irq_init(struct arch_timer_kvm_info *info) return -ENOMEM; } - if (kvm_vgic_global_state.no_hw_deactivation) - arch_timer_irq_ops.flags |= VGIC_IRQ_SW_RESAMPLE; WARN_ON(irq_domain_push_irq(domain, host_vtimer_irq, (void *)TIMER_VTIMER)); } @@ -1591,8 +1595,8 @@ static bool kvm_arch_timer_get_input_level(int vintid) int kvm_timer_enable(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); + const struct irq_ops *ops; struct timer_map map; - struct irq_ops *ops; int ret; if (timer->enabled) diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c index 0101ec3f5528..757484d2493b 100644 --- a/arch/arm64/kvm/vgic/vgic-v5.c +++ 
b/arch/arm64/kvm/vgic/vgic-v5.c @@ -285,7 +285,7 @@ void vgic_v5_set_ppi_dvi(struct kvm_vcpu *vcpu, struct vgic_irq *irq, bool dvi) __assign_bit(ppi, cpu_if->vgic_ppi_dvir, dvi); } -static struct irq_ops vgic_v5_ppi_irq_ops = { +static const struct irq_ops vgic_v5_ppi_irq_ops = { .queue_irq_unlock = vgic_v5_ppi_queue_irq_unlock, .set_direct_injection = vgic_v5_set_ppi_dvi, }; diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 1e9fe8764584..3ac6d49bc487 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -573,7 +573,7 @@ int kvm_vgic_inject_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, } void kvm_vgic_set_irq_ops(struct kvm_vcpu *vcpu, u32 vintid, - struct irq_ops *ops) + const struct irq_ops *ops) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, vintid); diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index ea793479ab25..fe49fb56dc3c 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -205,7 +205,7 @@ struct vgic_irq; */ struct irq_ops { /* Per interrupt flags for special-cased interrupts */ - unsigned long flags; + unsigned long (*get_flags)(void); #define VGIC_IRQ_SW_RESAMPLE BIT(0) /* Clear the active state for resampling */ @@ -271,7 +271,7 @@ struct vgic_irq { u8 priority; u8 group; /* 0 == group 0, 1 == group 1 */ - struct irq_ops *ops; + const struct irq_ops *ops; void *owner; /* Opaque pointer to reserve an interrupt for in-kernel devices. 
*/ @@ -279,7 +279,8 @@ struct vgic_irq { static inline bool vgic_irq_needs_resampling(struct vgic_irq *irq) { - return irq->ops && (irq->ops->flags & VGIC_IRQ_SW_RESAMPLE); + return irq->ops && irq->ops->get_flags && + (irq->ops->get_flags() & VGIC_IRQ_SW_RESAMPLE); } struct vgic_register_region; @@ -557,7 +558,7 @@ void kvm_vgic_init_cpu_hardware(void); int kvm_vgic_inject_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, unsigned int intid, bool level, void *owner); void kvm_vgic_set_irq_ops(struct kvm_vcpu *vcpu, u32 vintid, - struct irq_ops *ops); + const struct irq_ops *ops); void kvm_vgic_clear_irq_ops(struct kvm_vcpu *vcpu, u32 vintid); int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq, u32 vintid); -- cgit v1.2.3 From 849fbc130627663b4f7c8c4468025e4babc7a65a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 20 May 2026 10:19:37 +0100 Subject: KVM: arm64: vgic: Consolidate vgic_allocate_private_irqs_locked() vgic_allocate_private_irqs_locked() calls two helpers, oddly named vgic_{,v5_}allocate_private_irq(). Not only these helpers don't allocate anything, but they also contain duplicate init code that would be better placed in the caller. Consolidate the common init code in the caller, rename the helpers to vgic_{,v5_}setup_private_irq(), and pass the irq pointer around instead of the index of the interrupt. 
Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20260520091949.542365-7-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-init.c | 45 +++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 27 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 933983bb2005..907057881b26 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -271,18 +271,12 @@ int kvm_vgic_vcpu_nv_init(struct kvm_vcpu *vcpu) return ret; } -static void vgic_allocate_private_irq(struct kvm_vcpu *vcpu, int i, u32 type) +static void vgic_setup_private_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq, + u32 type) { - struct vgic_irq *irq = &vcpu->arch.vgic_cpu.private_irqs[i]; + irq->intid = irq - &vcpu->arch.vgic_cpu.private_irqs[0]; - INIT_LIST_HEAD(&irq->ap_list); - raw_spin_lock_init(&irq->irq_lock); - irq->vcpu = NULL; - irq->target_vcpu = vcpu; - refcount_set(&irq->refcount, 0); - - irq->intid = i; - if (vgic_irq_is_sgi(i)) { + if (vgic_irq_is_sgi(irq->intid)) { /* SGIs */ irq->enabled = 1; irq->config = VGIC_CONFIG_EDGE; @@ -303,18 +297,11 @@ static void vgic_allocate_private_irq(struct kvm_vcpu *vcpu, int i, u32 type) } } -static void vgic_v5_allocate_private_irq(struct kvm_vcpu *vcpu, int i, u32 type) +static void vgic_v5_setup_private_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq) { - struct vgic_irq *irq = &vcpu->arch.vgic_cpu.private_irqs[i]; - u32 intid = vgic_v5_make_ppi(i); - - INIT_LIST_HEAD(&irq->ap_list); - raw_spin_lock_init(&irq->irq_lock); - irq->vcpu = NULL; - irq->target_vcpu = vcpu; - refcount_set(&irq->refcount, 0); + int i = irq - &vcpu->arch.vgic_cpu.private_irqs[0]; - irq->intid = intid; + irq->intid = vgic_v5_make_ppi(i); /* The only Edge architected PPI is the SW_PPI */ if (i == GICV5_ARCH_PPI_SW_PPI) @@ -323,7 +310,7 @@ static void vgic_v5_allocate_private_irq(struct kvm_vcpu *vcpu, int i, u32 type) irq->config = VGIC_CONFIG_LEVEL; /* Register the 
GICv5-specific PPI ops */ - vgic_v5_set_ppi_ops(vcpu, intid); + vgic_v5_set_ppi_ops(vcpu, irq->intid); } static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type) @@ -349,15 +336,19 @@ static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type) if (!vgic_cpu->private_irqs) return -ENOMEM; - /* - * Enable and configure all SGIs to be edge-triggered and - * configure all PPIs as level-triggered. - */ for (i = 0; i < num_private_irqs; i++) { + struct vgic_irq *irq = &vcpu->arch.vgic_cpu.private_irqs[i]; + + INIT_LIST_HEAD(&irq->ap_list); + raw_spin_lock_init(&irq->irq_lock); + irq->vcpu = NULL; + irq->target_vcpu = vcpu; + refcount_set(&irq->refcount, 0); + if (vgic_is_v5(vcpu->kvm)) - vgic_v5_allocate_private_irq(vcpu, i, type); + vgic_v5_setup_private_irq(vcpu, irq); else - vgic_allocate_private_irq(vcpu, i, type); + vgic_setup_private_irq(vcpu, irq, type); } return 0; -- cgit v1.2.3 From 319c1ceef7d236e80f8a8e048cda1f986457d834 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 20 May 2026 10:19:38 +0100 Subject: KVM: arm64: vgic-v5: Drop defensive checks from vgic_v5_ppi_queue_irq_unlock() vgic_v5_ppi_queue_irq_unlock() performs a bunch of sanity checks that are pretty pointless as there is no code path that can result in these invariants being violated. And if they are, a nice crash is just as instructive as a warning. Drop what is evidently debug code and simplify the whole thing. 
Link: https://lore.kernel.org/r/20260520091949.542365-8-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-v5.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c index 757484d2493b..7916bd8d564e 100644 --- a/arch/arm64/kvm/vgic/vgic-v5.c +++ b/arch/arm64/kvm/vgic/vgic-v5.c @@ -238,9 +238,9 @@ static u32 vgic_v5_get_effective_priority_mask(struct kvm_vcpu *vcpu) /* * For GICv5, the PPIs are mostly directly managed by the hardware. We (the - * hypervisor) handle the pending, active, enable state save/restore, but don't - * need the PPIs to be queued on a per-VCPU AP list. Therefore, sanity check the - * state, unlock, and return. + * hypervisor) handle the pending, active, enable state save/restore, but + * don't need the PPIs to be queued on a per-VCPU AP list. Therefore, + * unlock, kick the vcpu and return. */ bool vgic_v5_ppi_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, unsigned long flags) @@ -250,12 +250,7 @@ bool vgic_v5_ppi_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, lockdep_assert_held(&irq->irq_lock); - if (WARN_ON_ONCE(!__irq_is_ppi(KVM_DEV_TYPE_ARM_VGIC_V5, irq->intid))) - goto out_unlock_fail; - vcpu = irq->target_vcpu; - if (WARN_ON_ONCE(!vcpu)) - goto out_unlock_fail; raw_spin_unlock_irqrestore(&irq->irq_lock, flags); @@ -264,11 +259,6 @@ bool vgic_v5_ppi_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, kvm_vcpu_kick(vcpu); return true; - -out_unlock_fail: - raw_spin_unlock_irqrestore(&irq->irq_lock, flags); - - return false; } /* -- cgit v1.2.3 From 8f5dd53590b8a810a4004494bd2b07ad587464ed Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 20 May 2026 10:19:39 +0100 Subject: KVM: arm64: vgic: Rationalise per-CPU irq accessor Despite adding the necessary infrastructure to identify irq types, vgic_get_vcpu_irq() treats GICv5 PPIs in a special way, which impairs the readability of the code. 
Use the existing irq classifiers to handle per-CPU irqs for all vgic types, and let the normal control flow reach global interrupt handling without any v5-specific path. Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20260520091949.542365-9-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 3ac6d49bc487..b697678d68b0 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -106,24 +106,23 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, u32 intid) struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid) { + enum kvm_device_type type; + if (WARN_ON(!vcpu)) return NULL; - if (vgic_is_v5(vcpu->kvm)) { - u32 int_num, hwirq_id; - - if (!__irq_is_ppi(KVM_DEV_TYPE_ARM_VGIC_V5, intid)) - return NULL; - - hwirq_id = FIELD_GET(GICV5_HWIRQ_ID, intid); - int_num = array_index_nospec(hwirq_id, VGIC_V5_NR_PRIVATE_IRQS); + type = vcpu->kvm->arch.vgic.vgic_model; - return &vcpu->arch.vgic_cpu.private_irqs[int_num]; - } + if (__irq_is_sgi(type, intid) || __irq_is_ppi(type, intid)) { + switch (type) { + case KVM_DEV_TYPE_ARM_VGIC_V5: + intid = vgic_v5_get_hwirq_id(intid); + intid = array_index_nospec(intid, VGIC_V5_NR_PRIVATE_IRQS); + break; + default: + intid = array_index_nospec(intid, VGIC_NR_PRIVATE_IRQS); + } - /* SGIs and PPIs */ - if (intid < VGIC_NR_PRIVATE_IRQS) { - intid = array_index_nospec(intid, VGIC_NR_PRIVATE_IRQS); return &vcpu->arch.vgic_cpu.private_irqs[intid]; } -- cgit v1.2.3 From 35a4f8d151d6aabb5e74fea4e67993dcad7b526b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 20 May 2026 10:19:40 +0100 Subject: KVM: arm64: vgic-v5: Limit support to 64 PPIs Although we have some code supporting 128 PPIs, the only supported configuration is 64 PPIs. There is no way to test the 128 PPI code, so it is bound to bitrot very quickly. 
Given that KVM/arm64's goal has always been to stick to non-IMPDEF behaviours, drop the 128 PPI support. Someone motivated enough and with very strong arguments can always bring it back -- it's all in the git history. Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20260520091949.542365-10-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/vgic-v5-sr.c | 82 +++++++---------------------------- arch/arm64/kvm/sys_regs.c | 17 +++----- arch/arm64/kvm/vgic/vgic-kvm-device.c | 9 ++-- 3 files changed, 26 insertions(+), 82 deletions(-) diff --git a/arch/arm64/kvm/hyp/vgic-v5-sr.c b/arch/arm64/kvm/hyp/vgic-v5-sr.c index 47e6bcd43702..6d69dfe89a96 100644 --- a/arch/arm64/kvm/hyp/vgic-v5-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v5-sr.c @@ -30,10 +30,9 @@ void __vgic_v5_save_ppi_state(struct vgic_v5_cpu_if *cpu_if) { /* * The following code assumes that the bitmap storage that we have for - * PPIs is either 64 (architected PPIs, only) or 128 bits (architected & - * impdef PPIs). + * PPIs is exactly 64 bits (architected PPIs, only). 
*/ - BUILD_BUG_ON(VGIC_V5_NR_PRIVATE_IRQS % 64); + BUILD_BUG_ON(VGIC_V5_NR_PRIVATE_IRQS != 64); bitmap_write(host_data_ptr(vgic_v5_ppi_state)->activer_exit, read_sysreg_s(SYS_ICH_PPI_ACTIVER0_EL2), 0, 64); @@ -49,22 +48,6 @@ void __vgic_v5_save_ppi_state(struct vgic_v5_cpu_if *cpu_if) cpu_if->vgic_ppi_priorityr[6] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR6_EL2); cpu_if->vgic_ppi_priorityr[7] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR7_EL2); - if (VGIC_V5_NR_PRIVATE_IRQS == 128) { - bitmap_write(host_data_ptr(vgic_v5_ppi_state)->activer_exit, - read_sysreg_s(SYS_ICH_PPI_ACTIVER1_EL2), 64, 64); - bitmap_write(host_data_ptr(vgic_v5_ppi_state)->pendr, - read_sysreg_s(SYS_ICH_PPI_PENDR1_EL2), 64, 64); - - cpu_if->vgic_ppi_priorityr[8] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR8_EL2); - cpu_if->vgic_ppi_priorityr[9] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR9_EL2); - cpu_if->vgic_ppi_priorityr[10] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR10_EL2); - cpu_if->vgic_ppi_priorityr[11] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR11_EL2); - cpu_if->vgic_ppi_priorityr[12] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR12_EL2); - cpu_if->vgic_ppi_priorityr[13] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR13_EL2); - cpu_if->vgic_ppi_priorityr[14] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR14_EL2); - cpu_if->vgic_ppi_priorityr[15] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR15_EL2); - } - /* Now that we are done, disable DVI */ write_sysreg_s(0, SYS_ICH_PPI_DVIR0_EL2); write_sysreg_s(0, SYS_ICH_PPI_DVIR1_EL2); @@ -74,9 +57,6 @@ void __vgic_v5_restore_ppi_state(struct vgic_v5_cpu_if *cpu_if) { DECLARE_BITMAP(pendr, VGIC_V5_NR_PRIVATE_IRQS); - /* We assume 64 or 128 PPIs - see above comment */ - BUILD_BUG_ON(VGIC_V5_NR_PRIVATE_IRQS % 64); - /* Enable DVI so that the guest's interrupt config takes over */ write_sysreg_s(bitmap_read(cpu_if->vgic_ppi_dvir, 0, 64), SYS_ICH_PPI_DVIR0_EL2); @@ -108,50 +88,20 @@ void __vgic_v5_restore_ppi_state(struct vgic_v5_cpu_if *cpu_if) write_sysreg_s(cpu_if->vgic_ppi_priorityr[7], SYS_ICH_PPI_PRIORITYR7_EL2); - 
if (VGIC_V5_NR_PRIVATE_IRQS == 128) { - /* Enable DVI so that the guest's interrupt config takes over */ - write_sysreg_s(bitmap_read(cpu_if->vgic_ppi_dvir, 64, 64), - SYS_ICH_PPI_DVIR1_EL2); - - write_sysreg_s(bitmap_read(cpu_if->vgic_ppi_activer, 64, 64), - SYS_ICH_PPI_ACTIVER1_EL2); - write_sysreg_s(bitmap_read(cpu_if->vgic_ppi_enabler, 64, 64), - SYS_ICH_PPI_ENABLER1_EL2); - write_sysreg_s(bitmap_read(pendr, 64, 64), - SYS_ICH_PPI_PENDR1_EL2); - - write_sysreg_s(cpu_if->vgic_ppi_priorityr[8], - SYS_ICH_PPI_PRIORITYR8_EL2); - write_sysreg_s(cpu_if->vgic_ppi_priorityr[9], - SYS_ICH_PPI_PRIORITYR9_EL2); - write_sysreg_s(cpu_if->vgic_ppi_priorityr[10], - SYS_ICH_PPI_PRIORITYR10_EL2); - write_sysreg_s(cpu_if->vgic_ppi_priorityr[11], - SYS_ICH_PPI_PRIORITYR11_EL2); - write_sysreg_s(cpu_if->vgic_ppi_priorityr[12], - SYS_ICH_PPI_PRIORITYR12_EL2); - write_sysreg_s(cpu_if->vgic_ppi_priorityr[13], - SYS_ICH_PPI_PRIORITYR13_EL2); - write_sysreg_s(cpu_if->vgic_ppi_priorityr[14], - SYS_ICH_PPI_PRIORITYR14_EL2); - write_sysreg_s(cpu_if->vgic_ppi_priorityr[15], - SYS_ICH_PPI_PRIORITYR15_EL2); - } else { - write_sysreg_s(0, SYS_ICH_PPI_DVIR1_EL2); - - write_sysreg_s(0, SYS_ICH_PPI_ACTIVER1_EL2); - write_sysreg_s(0, SYS_ICH_PPI_ENABLER1_EL2); - write_sysreg_s(0, SYS_ICH_PPI_PENDR1_EL2); - - write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR8_EL2); - write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR9_EL2); - write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR10_EL2); - write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR11_EL2); - write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR12_EL2); - write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR13_EL2); - write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR14_EL2); - write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR15_EL2); - } + write_sysreg_s(0, SYS_ICH_PPI_DVIR1_EL2); + + write_sysreg_s(0, SYS_ICH_PPI_ACTIVER1_EL2); + write_sysreg_s(0, SYS_ICH_PPI_ENABLER1_EL2); + write_sysreg_s(0, SYS_ICH_PPI_PENDR1_EL2); + + write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR8_EL2); + write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR9_EL2); + 
write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR10_EL2); + write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR11_EL2); + write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR12_EL2); + write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR13_EL2); + write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR14_EL2); + write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR15_EL2); } void __vgic_v5_save_state(struct vgic_v5_cpu_if *cpu_if) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 513f5f1429b5..6083a1b23dbf 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -724,6 +724,7 @@ static bool access_gicv5_ppi_enabler(struct kvm_vcpu *vcpu, { unsigned long *mask = vcpu->kvm->arch.vgic.gicv5_vm.vgic_ppi_mask; struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5; + unsigned long reg = p->regval; int i; /* We never expect to get here with a read! */ @@ -731,21 +732,17 @@ static bool access_gicv5_ppi_enabler(struct kvm_vcpu *vcpu, return undef_access(vcpu, p, r); /* - * If we're only handling architected PPIs and the guest writes to the - * enable for the non-architected PPIs, we just return as there's - * nothing to do at all. We don't even allocate the storage for them in - * this case. + * As we're only handling architected PPIs, the guest writes to the + * enable for the non-architected PPIs just return as there's + * nothing to do at all. We don't even allocate the storage for them. */ - if (VGIC_V5_NR_PRIVATE_IRQS == 64 && p->Op2 % 2) + if (p->Op2 % 2) return true; /* - * Merge the raw guest write into out bitmap at an offset of either 0 or - * 64, then and it with our PPI mask. + * Merge the raw guest write into our bitmap, anded with our PPI mask. */ - bitmap_write(cpu_if->vgic_ppi_enabler, p->regval, 64 * (p->Op2 % 2), 64); - bitmap_and(cpu_if->vgic_ppi_enabler, cpu_if->vgic_ppi_enabler, mask, - VGIC_V5_NR_PRIVATE_IRQS); + bitmap_and(cpu_if->vgic_ppi_enabler, &reg, mask, VGIC_V5_NR_PRIVATE_IRQS); /* * Sync the change in enable states to the vgic_irqs. 
We consider all diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c index a96c77dccf35..90be99443df3 100644 --- a/arch/arm64/kvm/vgic/vgic-kvm-device.c +++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c @@ -730,18 +730,15 @@ static int vgic_v5_get_userspace_ppis(struct kvm_device *dev, guard(mutex)(&dev->kvm->arch.config_lock); /* - * We either support 64 or 128 PPIs. In the former case, we need to - * return 0s for the second 64 bits as we have no storage backing those. + * We only support 64 PPIs, so, we need to return 0s for the + * second 64 bits as we have no storage backing those. */ ret = put_user(bitmap_read(gicv5_vm->userspace_ppis, 0, 64), uaddr); if (ret) return ret; uaddr++; - if (VGIC_V5_NR_PRIVATE_IRQS == 128) - ret = put_user(bitmap_read(gicv5_vm->userspace_ppis, 64, 128), uaddr); - else - ret = put_user(0, uaddr); + ret = put_user(0, uaddr); return ret; } -- cgit v1.2.3 From eec44c56e67ca78c377f4c3d85ef94fd105bea81 Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Wed, 20 May 2026 10:19:41 +0100 Subject: KVM: arm64: vgic-v5: Add missing trap handling for NV triage As things stand, there is no support for Nested Virt with GICv5 guests yet. However, this is coming and therefore we need to be able to correctly triage the traps when running with NV. Add the missing fgtreg lookups required for that to triage_sysreg_trap(). 
These are specific to the FGT regs added as part of GICv5: * ICH_HFGRTR_EL2 * ICH_HFGWTR_EL2 * ICH_HFGITR_EL2 Fixes: 9d6d9514c08f ("KVM: arm64: gic-v5: Support GICv5 FGTs & FGUs") Link: https://sashiko.dev/#/patchset/20260319154937.3619520-1-sascha.bischoff%40arm.com Signed-off-by: Sascha Bischoff Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20260520091949.542365-11-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/emulate-nested.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index dba7ced74ca5..a4eb36b4c442 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -2631,6 +2631,14 @@ bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index) fgtreg = HFGITR2_EL2; break; + case ICH_HFGRTR_GROUP: + fgtreg = is_read ? ICH_HFGRTR_EL2 : ICH_HFGWTR_EL2; + break; + + case ICH_HFGITR_GROUP: + fgtreg = ICH_HFGITR_EL2; + break; + default: /* Something is really wrong, bail out */ WARN_ONCE(1, "Bad FGT group (encoding %08x, config %016llx)\n", -- cgit v1.2.3 From 2427e8c1cd4f467770cb52e5e723adf88cc61ae0 Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Wed, 20 May 2026 10:19:42 +0100 Subject: KVM: arm64: vgic-v5: Atomically assign bits to PPI DVI bitmap For GICv5 guests we make use of the DVI mechanism for PPIs where possible. When mapping a virtual irq to a physical one for a GICv5 guest, the corresponding bit in the DVI bitmap is set. When unmapping, said bit is cleared again. The key user of this mechanism is the arch timer. The existing code used the non-atomic __assign_bit() rather than doing the update atomically. This could technically result in losing state if a second PPI's DVI bit were being manipulated concurrently. Each individual bit within the DVI bitmap is guarded using vgic_irq->irq_lock, but there's no locking for the overall bitmap. Therefore, switch to using the atomic assign_bit() function instead. 
Fixes: 5a98d0e17e59 ("KVM: arm64: gic-v5: Implement direct injection of PPIs") Link: https://sashiko.dev/#/patchset/20260319154937.3619520-1-sascha.bischoff%40arm.com Signed-off-by: Sascha Bischoff Link: https://lore.kernel.org/r/20260520091949.542365-12-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-v5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c index 7916bd8d564e..d4789ff3e740 100644 --- a/arch/arm64/kvm/vgic/vgic-v5.c +++ b/arch/arm64/kvm/vgic/vgic-v5.c @@ -272,7 +272,7 @@ void vgic_v5_set_ppi_dvi(struct kvm_vcpu *vcpu, struct vgic_irq *irq, bool dvi) lockdep_assert_held(&irq->irq_lock); ppi = vgic_v5_get_hwirq_id(irq->intid); - __assign_bit(ppi, cpu_if->vgic_ppi_dvir, dvi); + assign_bit(ppi, cpu_if->vgic_ppi_dvir, dvi); } static const struct irq_ops vgic_v5_ppi_irq_ops = { -- cgit v1.2.3 From f034ae93bcb68b06bb52344788f4cb82ae691719 Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Wed, 20 May 2026 10:19:43 +0100 Subject: KVM: arm64: selftests: Add missing GIC CDEN to no-vgic-v5 selftest The selftest mistakenly omitted the GIC CDEN instruction from the testing. Add it in. 
Fixes: ce29261ec648 ("KVM: arm64: selftests: Add no-vgic-v5 selftest") Reviewed-by: Joey Gouly Signed-off-by: Sascha Bischoff Link: https://lore.kernel.org/r/20260520091949.542365-13-maz@kernel.org Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/arm64/no-vgic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/kvm/arm64/no-vgic.c b/tools/testing/selftests/kvm/arm64/no-vgic.c index 25b2e3222f68..ab57902ce429 100644 --- a/tools/testing/selftests/kvm/arm64/no-vgic.c +++ b/tools/testing/selftests/kvm/arm64/no-vgic.c @@ -159,6 +159,7 @@ static void guest_code_gicv5(void) check_gicv5_gic_op(CDAFF); check_gicv5_gic_op(CDDI); check_gicv5_gic_op(CDDIS); + check_gicv5_gic_op(CDEN); check_gicv5_gic_op(CDEOI); check_gicv5_gic_op(CDHM); check_gicv5_gic_op(CDPEND); -- cgit v1.2.3 From 8192f783b5e15c2966f67918b1e8073171c928c9 Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Wed, 20 May 2026 10:19:44 +0100 Subject: KVM: arm64: selftests: Cleanup unused vars in GICv5 PPI selftest Clean up a set of unused variables around the size of the guest's PA space as they are completely irrelevant for GICv5 when only considering PPIs. 
Fixes: 0a9f38bf612b ("KVM: arm64: selftests: Introduce a minimal GICv5 PPI selftest") Link: https://sashiko.dev/#/patchset/20260319154937.3619520-1-sascha.bischoff%40arm.com Signed-off-by: Sascha Bischoff Link: https://lore.kernel.org/r/20260520091949.542365-14-maz@kernel.org Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/arm64/vgic_v5.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tools/testing/selftests/kvm/arm64/vgic_v5.c b/tools/testing/selftests/kvm/arm64/vgic_v5.c index d785b660d847..a8707120de0d 100644 --- a/tools/testing/selftests/kvm/arm64/vgic_v5.c +++ b/tools/testing/selftests/kvm/arm64/vgic_v5.c @@ -20,8 +20,6 @@ struct vm_gic { u32 gic_dev_type; }; -static u64 max_phys_size; - #define GUEST_CMD_IRQ_CDIA 10 #define GUEST_CMD_IRQ_DIEOI 11 #define GUEST_CMD_IS_AWAKE 12 @@ -208,13 +206,9 @@ void run_tests(u32 gic_dev_type) int main(int ac, char **av) { int ret; - int pa_bits; test_disable_default_vgic(); - pa_bits = vm_guest_mode_params[VM_MODE_DEFAULT].pa_bits; - max_phys_size = 1ULL << pa_bits; - ret = test_kvm_device(KVM_DEV_TYPE_ARM_VGIC_V5); if (ret) { pr_info("No GICv5 support; Not running GIC_v5 tests.\n"); -- cgit v1.2.3 From 441623bcb8ccb957fbb4b536b194352a186c0155 Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Wed, 20 May 2026 10:19:45 +0100 Subject: KVM: arm64: selftests: Improve error handling for GICv5 PPI selftest Cases where the KVM_RUN ioctl returned an error were wrongly reported as incorrect ucalls. Furthermore, potential failures when calling KVM_IRQ_LINE were being hidden. Improve the error handling to correctly propagate the error in both cases. 
Fixes: 0a9f38bf612b ("KVM: arm64: selftests: Introduce a minimal GICv5 PPI selftest") Link: https://sashiko.dev/#/patchset/20260319154937.3619520-1-sascha.bischoff%40arm.com Signed-off-by: Sascha Bischoff Link: https://lore.kernel.org/r/20260520091949.542365-15-maz@kernel.org Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/arm64/vgic_v5.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/arm64/vgic_v5.c b/tools/testing/selftests/kvm/arm64/vgic_v5.c index a8707120de0d..96cfd6bb32f6 100644 --- a/tools/testing/selftests/kvm/arm64/vgic_v5.c +++ b/tools/testing/selftests/kvm/arm64/vgic_v5.c @@ -129,6 +129,8 @@ static void test_vgic_v5_ppis(u32 gic_dev_type) while (1) { ret = run_vcpu(vcpus[0]); + if (ret) + break; switch (get_ucall(vcpus[0], &uc)) { case UCALL_SYNC: @@ -144,7 +146,7 @@ static void test_vgic_v5_ppis(u32 gic_dev_type) irq = FIELD_PREP(KVM_ARM_IRQ_NUM_MASK, 3); irq |= KVM_ARM_IRQ_TYPE_PPI << KVM_ARM_IRQ_TYPE_SHIFT; - _kvm_irq_line(v.vm, irq, level); + kvm_irq_line(v.vm, irq, level); } else if (uc.args[1] == GUEST_CMD_IS_AWAKE) { pr_info("Guest skipping WFI due to pending IRQ\n"); } else if (uc.args[1] == GUEST_CMD_IRQ_CDIA) { -- cgit v1.2.3 From 6930980bf2f1625c5eb25a27b8673faa89c5792a Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Wed, 20 May 2026 10:19:46 +0100 Subject: Documentation: KVM: Fix typos in VGICv5 documentation Fix two typos in the VGICv5 documentation. 
Fixes: d51c978b7d3e ("KVM: arm64: gic-v5: Communicate userspace-driveable PPIs via a UAPI") Fixes: eb3c4d2c9a4d ("Documentation: KVM: Introduce documentation for VGICv5") Link: https://sashiko.dev/#/patchset/20260319154937.3619520-1-sascha.bischoff%40arm.com Signed-off-by: Sascha Bischoff Link: https://lore.kernel.org/r/20260520091949.542365-16-maz@kernel.org Signed-off-by: Marc Zyngier --- Documentation/virt/kvm/devices/arm-vgic-v5.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/virt/kvm/devices/arm-vgic-v5.rst b/Documentation/virt/kvm/devices/arm-vgic-v5.rst index 29335ea823fc..70b9162755c7 100644 --- a/Documentation/virt/kvm/devices/arm-vgic-v5.rst +++ b/Documentation/virt/kvm/devices/arm-vgic-v5.rst @@ -12,8 +12,8 @@ Only one VGIC instance may be instantiated through this API. The created VGIC will act as the VM interrupt controller, requiring emulated user-space devices to inject interrupts to the VGIC instead of directly to CPUs. -Creating a guest GICv5 device requires a host GICv5 host. The current VGICv5 -device only supports PPI interrupts. These can either be injected from emulated +Creating a guest GICv5 device requires a GICv5 host. The current VGICv5 device +only supports PPI interrupts. These can either be injected from emulated in-kernel devices (such as the Arch Timer, or PMU), or via the KVM_IRQ_LINE ioctl. @@ -25,7 +25,7 @@ Groups: request the initialization of the VGIC, no additional parameter in kvm_device_attr.addr. Must be called after all VCPUs have been created. - KVM_DEV_ARM_VGIC_USERPSPACE_PPIs + KVM_DEV_ARM_VGIC_USERSPACE_PPIS request the mask of userspace-drivable PPIs. Only a subset of the PPIs can be directly driven from userspace with GICv5, and the returned mask informs userspace of which it is allowed to drive via KVM_IRQ_LINE. 
-- cgit v1.2.3 From 3f5aeaf8d9d3a6693979a92e6ac94b1e2884b911 Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Wed, 20 May 2026 10:19:47 +0100 Subject: Documentation: KVM: Clarify that PMU_V3_IRQ IntID requirements for GICv5 When running a GICv5-based guest, the PMU must use PPI 23. This, however, must be communicated via the KVM_ARM_VCPU_PMU_V3_CTRL->KVM_ARM_VCPU_PMU_V3_IRQ ioctl as a full GICv5-style Interrupt ID. That is, 0x20000017. Optionally, the whole ioctl can be skipped for GICv5. This was previously not clearly documented, so bump the documentation accordingly. Fixes: 7c31c06e2d2d ("KVM: arm64: gic-v5: Mandate architected PPI for PMU emulation on GICv5") Link: https://sashiko.dev/#/patchset/20260319154937.3619520-1-sascha.bischoff%40arm.com Signed-off-by: Sascha Bischoff Link: https://lore.kernel.org/r/20260520091949.542365-17-maz@kernel.org Signed-off-by: Marc Zyngier --- Documentation/virt/kvm/devices/vcpu.rst | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Documentation/virt/kvm/devices/vcpu.rst b/Documentation/virt/kvm/devices/vcpu.rst index 5e3805820010..66e714f2fcfa 100644 --- a/Documentation/virt/kvm/devices/vcpu.rst +++ b/Documentation/virt/kvm/devices/vcpu.rst @@ -37,8 +37,11 @@ Returns: A value describing the PMUv3 (Performance Monitor Unit v3) overflow interrupt number for this vcpu. This interrupt could be a PPI or SPI, but the interrupt type must be same for each vcpu. As a PPI, the interrupt number is the same for -all vcpus, while as an SPI it must be a separate number per vcpu. For -GICv5-based guests, the architected PPI (23) must be used. +all vcpus, while as an SPI it must be a separate number per vcpu. + +For GICv5-based guests, the architected PPI (23) must be used, and must be +communicated as the full GICv5-style Interrupt ID, i.e., 0x20000017. This ioctl +can be omitted altogether for a GICv5-based guest. 
1.2 ATTRIBUTE: KVM_ARM_VCPU_PMU_V3_INIT --------------------------------------- -- cgit v1.2.3 From abf60331ebe9a9a7937a72aac7699c2907ab9307 Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Wed, 20 May 2026 10:19:48 +0100 Subject: irqchip/gic-v5: Immediately exec priority drop following activate With GICv5 an interrupt of equal or lower priority cannot be signalled until there has been a priority drop. This is done via the GIC CDEOI system instruction. Once this has been executed, the hardware is able to signal the next interrupt if there is one. As all interrupts are programmed to have the same priority, no new interrupts can be signalled until the priority drop has happened. This can cause issues when, for example, an interrupt remains active while a long running process takes place, such as when injecting a physical interrupt into a guest VM in software. The GICv5 driver has so far done the priority drop as part of irq_eoi(), i.e., at the same time as deactivating the interrupt. This means that any long running process (or VM) could block incoming interrupts, effectively causing a denial of service for all other interrupts. Rather than doing the EOI as part of irq_eoi() (which the name would suggest would be a good place for it), move it to happen immediately after acknowledging an interrupt in the main GICv5 interrupt handler. The deactivation of interrupts (GIC CDDI) remains implemented as part of irq_eoi(), which means that the same interrupt cannot be signalled a second time until deactivated by software. 
Suggested-by: Marc Zyngier Signed-off-by: Sascha Bischoff Link: https://lore.kernel.org/r/20260520091949.542365-18-maz@kernel.org Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v5.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/irqchip/irq-gic-v5.c b/drivers/irqchip/irq-gic-v5.c index 6b0903be8ebf..58e457d4c147 100644 --- a/drivers/irqchip/irq-gic-v5.c +++ b/drivers/irqchip/irq-gic-v5.c @@ -218,17 +218,13 @@ static void gicv5_hwirq_eoi(u32 hwirq_id, u8 hwirq_type) FIELD_PREP(GICV5_GIC_CDDI_TYPE_MASK, hwirq_type); gic_insn(cddi, CDDI); - - gic_insn(0, CDEOI); } static void gicv5_ppi_irq_eoi(struct irq_data *d) { /* Skip deactivate for forwarded PPI interrupts */ - if (irqd_is_forwarded_to_vcpu(d)) { - gic_insn(0, CDEOI); + if (irqd_is_forwarded_to_vcpu(d)) return; - } gicv5_hwirq_eoi(d->hwirq, GICV5_HWIRQ_TYPE_PPI); } @@ -963,6 +959,13 @@ static void __exception_irq_entry gicv5_handle_irq(struct pt_regs *regs) */ isb(); + /* + * Ensure that we can receive the next interrupts in the event that we + * have a long running handler or directly enter a guest by doing the + * priority drop immediately. + */ + gic_insn(0, CDEOI); + hwirq = FIELD_GET(GICV5_HWIRQ_INTID, ia); handle_irq_per_domain(hwirq); -- cgit v1.2.3 From bee399ea20c8fea361c0ada06afdec9fcbf6dfde Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Wed, 20 May 2026 10:19:49 +0100 Subject: KVM: arm64: Fix arch timer interrupts for GICv3-on-GICv5 guests When running on a GICv5 host, we push an arch-timer-specific interrupt domain for the timer interrupts. This interrupt domain is used to mask the host interrupt when a GICv5 guest is running. However, this interrupt domain is still in place when running with a GICv3 guest on GICv5 hardware. The result is that some interrupt state changes are not correctly propagated to the host irqchip driver for legacy guests.
Explicitly pass irqchip state changes through to the host irqchip driver when running a GICv3-based guest on a GICv5 host. This bypasses all masking, and thereby operates just as a native GICv3 guest would, with the exception of having an additional irq domain in the hierarchy. Fixes: 9491c63b6cd7 ("KVM: arm64: gic-v5: Enlighten arch timer for GICv5") Suggested-by: Marc Zyngier Signed-off-by: Sascha Bischoff Link: https://lore.kernel.org/r/20260520091949.542365-19-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arch_timer.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index f003df76fdda..53b67b4d0bf2 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -1294,7 +1294,12 @@ static int timer_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu) static int timer_irq_set_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool val) { - if (which != IRQCHIP_STATE_ACTIVE || !irqd_is_forwarded_to_vcpu(d)) + bool passthrough = which != IRQCHIP_STATE_ACTIVE || + !irqd_is_forwarded_to_vcpu(d) || + (kvm_vgic_global_state.type == VGIC_V5 && + vgic_is_v3(kvm_get_running_vcpu()->kvm)); + + if (passthrough) return irq_chip_set_parent_state(d, which, val); if (val) @@ -1307,15 +1312,7 @@ static int timer_irq_set_irqchip_state(struct irq_data *d, static void timer_irq_eoi(struct irq_data *d) { - /* - * On a GICv5 host, we still need to call EOI on the parent for - * PPIs. The host driver already handles irqs which are forwarded to - * vcpus, and skips the GIC CDDI while still doing the GIC CDEOI. This - * is required to emulate the EOIMode=1 on GICv5 hardware. Failure to - * call EOI unsurprisingly results in *BAD* lock-ups.
- */ - if (!irqd_is_forwarded_to_vcpu(d) || - kvm_vgic_global_state.type == VGIC_V5) + if (!irqd_is_forwarded_to_vcpu(d)) irq_chip_eoi_parent(d); } -- cgit v1.2.3 From 6835fbed39bb329744a3f44e8e6a39e24079af10 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Thu, 21 May 2026 15:36:24 +0100 Subject: KVM: arm64: Reset page order in pKVM hyp_pool When a VM fails to initialise after its stage-2 hyp_pool has been initialised, that stage-2 must be torn down entirely. This requires resetting both the refcount and the order of its pages back to 0. Currently, reclaim_pgtable_pages() implicitly resets the page order by allocating the entire pool with order-0 granularity. However, in the VM initialisation error path, the addresses of the donated memory (the PGD) are already known, making it unnecessary to iterate over all pages in the pool. Since the vmemmap page order is a hyp_pool-specific field, leaving a non-zero order on hyp_pool destruction is harmless until another pool attempts to admit the page. Instead of resetting this field during destruction, reset it during pool initialization in hyp_pool_init(). For 'external' pages, we can't trust the order either as they bypass hyp_pool_init(). Since we never coalesce them, enforce order-0 to ensure safe insertion into the pool. This leaves no vmemmap order users outside of hyp_pool. 
Fixes: 256b4668cd89 ("KVM: arm64: Introduce separate hypercalls for pKVM VM reservation and initialization") Reported-by: Sashiko Signed-off-by: Vincent Donnefort Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20260521143626.1005660-2-vdonnefort@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 2 -- arch/arm64/kvm/hyp/nvhe/page_alloc.c | 21 ++++++++++++++++----- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 28a471d1927c..5c1e1742db4f 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -202,7 +202,6 @@ static void *guest_s2_zalloc_page(void *mc) memset(addr, 0, PAGE_SIZE); p = hyp_virt_to_page(addr); p->refcount = 1; - p->order = 0; return addr; } @@ -307,7 +306,6 @@ void reclaim_pgtable_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc) while (addr) { page = hyp_virt_to_page(addr); page->refcount = 0; - page->order = 0; push_hyp_memcache(mc, addr, hyp_virt_to_phys); WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(addr), 1)); addr = hyp_alloc_pages(&vm->pool, 0); diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c index a1eb27a1a747..57f86aa0f82f 100644 --- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c +++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c @@ -94,13 +94,22 @@ static void __hyp_attach_page(struct hyp_pool *pool, struct hyp_page *p) { phys_addr_t phys = hyp_page_to_phys(p); - u8 order = p->order; struct hyp_page *buddy; + bool coalesce = true; + u8 order = p->order; - memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order); + /* + * 'external' pages are never coalesced and their ->order field + * untrusted as they bypass hyp_pool_init(). Enforce order-0. 
+ */ + if (phys < pool->range_start || phys >= pool->range_end) { + order = 0; + coalesce = false; + } + + memset(hyp_page_to_virt(p), 0, PAGE_SIZE << order); - /* Skip coalescing for 'external' pages being freed into the pool. */ - if (phys < pool->range_start || phys >= pool->range_end) + if (!coalesce) goto insert; /* @@ -237,8 +246,10 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages, /* Init the vmemmap portion */ p = hyp_phys_to_page(phys); - for (i = 0; i < nr_pages; i++) + for (i = 0; i < nr_pages; i++) { hyp_set_page_refcounted(&p[i]); + p[i].order = 0; + } /* Attach the unused pages to the buddy tree */ for (i = reserved_pages; i < nr_pages; i++) -- cgit v1.2.3 From 20d2753295b1cd3c766199b5990119ca514e1302 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Thu, 21 May 2026 15:36:25 +0100 Subject: KVM: arm64: Fix __pkvm_init_vm error path In the unlikely case where insert_vm_table_entry fails, __pkvm_init_vm releases the memory donated by the host for the PGD, but as the stage-2 is still set up the hypervisor keeps a refcount on those pages, effectively leaking the references. Fix the rollback with the newly added kvm_guest_destroy_stage2().
Fixes: 256b4668cd89 ("KVM: arm64: Introduce separate hypercalls for pKVM VM reservation and initialization") Reported-by: Sashiko Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Vincent Donnefort Link: https://patch.msgid.link/20260521143626.1005660-3-vdonnefort@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 1 + arch/arm64/kvm/hyp/nvhe/mem_protect.c | 13 +++++++++---- arch/arm64/kvm/hyp/nvhe/pkvm.c | 4 +++- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 3cbfae0e3dda..4f2b871199cb 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -56,6 +56,7 @@ int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot p int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id); int kvm_host_prepare_stage2(void *pgt_pool_base); int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd); +void kvm_guest_destroy_stage2(struct pkvm_hyp_vm *vm); void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt); int hyp_pin_shared_mem(void *from, void *to); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 5c1e1742db4f..88cd72332208 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -290,16 +290,21 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd) return 0; } +void kvm_guest_destroy_stage2(struct pkvm_hyp_vm *vm) +{ + guest_lock_component(vm); + kvm_pgtable_stage2_destroy(&vm->pgt); + vm->kvm.arch.mmu.pgd_phys = 0ULL; + guest_unlock_component(vm); +} + void reclaim_pgtable_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc) { struct hyp_page *page; void *addr; /* Dump all pgtable pages in the hyp_pool */ - guest_lock_component(vm); - kvm_pgtable_stage2_destroy(&vm->pgt); - vm->kvm.arch.mmu.pgd_phys = 
0ULL; - guest_unlock_component(vm); + kvm_guest_destroy_stage2(vm); /* Drain the hyp_pool into the memcache */ addr = hyp_alloc_pages(&vm->pool, 0); diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index e7496eb85628..ff89b30f5c4a 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -839,10 +839,12 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, /* Must be called last since this publishes the VM. */ ret = insert_vm_table_entry(handle, hyp_vm); if (ret) - goto err_remove_mappings; + goto err_destroy_stage2; return 0; +err_destroy_stage2: + kvm_guest_destroy_stage2(hyp_vm); err_remove_mappings: unmap_donated_memory(hyp_vm, vm_size); unmap_donated_memory(pgd, pgd_size); -- cgit v1.2.3 From 5c30c9fc117cc7d0c1b45741d1127edd4c2ae990 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Thu, 21 May 2026 15:36:26 +0100 Subject: KVM: arm64: Add fail-safe for refcounted pages in __pkvm_hyp_donate_host A previous bug in __pkvm_init_vm error path showed that the hypervisor could leak refcounted pages, (i.e. losing access to a page while its refcount is still elevated). This poses a threat to the pKVM state machine. Address this by introducing a fail-safe in __pkvm_hyp_donate_host. Transitions are not a hot path so added security is worth the extra check. 
Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Vincent Donnefort Link: https://patch.msgid.link/20260521143626.1005660-4-vdonnefort@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 88cd72332208..3263f6d0a0a4 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -833,6 +833,16 @@ static int __hyp_check_page_state_range(phys_addr_t phys, u64 size, enum pkvm_pa return 0; } +static int __hyp_check_page_count_range(phys_addr_t phys, u64 size) +{ + for_each_hyp_page(page, phys, size) { + if (page->refcount) + return -EBUSY; + } + + return 0; +} + static bool guest_pte_is_poisoned(kvm_pte_t pte) { if (kvm_pte_valid(pte)) @@ -1031,7 +1041,6 @@ unlock: int __pkvm_host_unshare_hyp(u64 pfn) { u64 phys = hyp_pfn_to_phys(pfn); - u64 virt = (u64)__hyp_va(phys); u64 size = PAGE_SIZE; int ret; @@ -1044,10 +1053,9 @@ int __pkvm_host_unshare_hyp(u64 pfn) ret = __hyp_check_page_state_range(phys, size, PKVM_PAGE_SHARED_BORROWED); if (ret) goto unlock; - if (hyp_page_count((void *)virt)) { - ret = -EBUSY; + ret = __hyp_check_page_count_range(phys, size); + if (ret) goto unlock; - } __hyp_set_page_state_range(phys, size, PKVM_NOPAGE); WARN_ON(__host_set_page_state_range(phys, size, PKVM_PAGE_OWNED)); @@ -1110,6 +1118,10 @@ int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages) if (ret) goto unlock; + ret = __hyp_check_page_count_range(phys, size); + if (ret) + goto unlock; + __hyp_set_page_state_range(phys, size, PKVM_NOPAGE); WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, virt, size) != size); WARN_ON(host_stage2_set_owner_locked(phys, size, PKVM_ID_HOST)); -- cgit v1.2.3 From 214a821c1462924668644e167a9706564cba65ea Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 1 May 2026 12:21:45 +0100 Subject: KVM: arm64: Guard against NULL 
vcpu on VHE hyp panic path On VHE, __hyp_call_panic() unconditionally calls __deactivate_traps(vcpu) on the vcpu pointer read from host_ctxt->__hyp_running_vcpu. That pointer is cleared after every guest exit (and is never set when no guest is running), so an unexpected EL2 exception landing in _guest_exit_panic, e.g. via the el2t*_invalid / el2h_irq_invalid vectors - reaches this function with vcpu == NULL. __deactivate_traps() then dereferences vcpu via ___deactivate_traps() -> vserror_state_is_nested() -> vcpu_has_nv() -> vcpu->arch.features, faulting inside the panic handler and obscuring the original failure. The nVHE counterpart (hyp_panic() in arch/arm64/kvm/hyp/nvhe/switch.c) already guards its vcpu-using cleanup with "if (vcpu)"; mirror that here. sysreg_restore_host_state_vhe() does not depend on vcpu and continues to run unconditionally, preserving panic forensics. The trailing panic("...VCPU:%p", vcpu) prints "(null)" safely via printk's %p handling. Fixes: 6a0259ed29bb ("KVM: arm64: Remove hyp_panic arguments") Assisted-by: Gemini:gemini-3.1-pro review-prompts Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260501112149.2824881-3-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/vhe/switch.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 9db3f11a4754..1e8995add14f 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -663,7 +663,8 @@ static void __noreturn __hyp_call_panic(u64 spsr, u64 elr, u64 par) host_ctxt = host_data_ptr(host_ctxt); vcpu = host_ctxt->__hyp_running_vcpu; - __deactivate_traps(vcpu); + if (vcpu) + __deactivate_traps(vcpu); sysreg_restore_host_state_vhe(host_ctxt); panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n", -- cgit v1.2.3 From 1bdcdc84f9f91e702bb4410cb46016cde1d57d9b Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 1 May 
2026 12:21:46 +0100 Subject: KVM: arm64: Fix __deactivate_fgt macro parameter typo __deactivate_fgt() declares its first parameter as "htcxt" but the body references "hctxt". The parameter is unused; the macro silently captures "hctxt" from the enclosing scope. Both existing callers (__deactivate_traps_hfgxtr() and __deactivate_traps_ich_hfgxtr()) happen to define a local "struct kvm_cpu_context *hctxt", so the macro works by coincidence. A future caller without an "hctxt" local in scope, or naming it differently, would compile but bind to the wrong context. Align the parameter name with the sibling __activate_fgt() macro. The "vcpu" parameter remains unused in the body, kept for API symmetry with __activate_fgt() (which uses it). Fixes: f5a5a406b4b8 ("KVM: arm64: Propagate and handle Fine-Grained UNDEF bits") Assisted-by: Gemini:gemini-3.1-pro review-prompts Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260501112149.2824881-4-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/hyp/switch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 98b2976837b1..bf0eb5e43427 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -245,7 +245,7 @@ static inline void __activate_traps_ich_hfgxtr(struct kvm_vcpu *vcpu) __activate_fgt(hctxt, vcpu, ICH_HFGITR_EL2); } -#define __deactivate_fgt(htcxt, vcpu, reg) \ +#define __deactivate_fgt(hctxt, vcpu, reg) \ do { \ write_sysreg_s(ctxt_sys_reg(hctxt, reg), \ SYS_ ## reg); \ -- cgit v1.2.3 From 3a4f5b96730cb40d5d9b31293fd34e11a10f2d6d Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 1 May 2026 12:21:47 +0100 Subject: KVM: arm64: Seed pkvm_ownership_selftest vcpu memcache The hypercall handlers call pkvm_refill_memcache() to top up the hyp_vcpu memcache before invoking __pkvm_host_{share,donate}_guest(). 
pkvm_ownership_selftest invokes those functions directly with a static selftest_vcpu that has an empty memcache. Seed selftest_vcpu's memcache from the prepopulated selftest pages, leaving the remainder for selftest_vm.pool. Required by the memcache-sufficiency pre-check added in the following patches. Assisted-by: Gemini:gemini-3.1-pro review-prompts Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260501112149.2824881-5-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index ff89b30f5c4a..3b2c4fbc34d8 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -752,16 +752,30 @@ static struct pkvm_hyp_vcpu selftest_vcpu = { struct pkvm_hyp_vcpu *init_selftest_vm(void *virt) { struct hyp_page *p = hyp_virt_to_page(virt); + unsigned long min_pages, seeded = 0; int i; selftest_vm.kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr; WARN_ON(kvm_guest_prepare_stage2(&selftest_vm, virt)); + /* + * Mirror pkvm_refill_memcache() for the share/donate pre-checks; + * the selftest invokes those functions directly and would + * otherwise see an empty memcache. 
+ */ + min_pages = kvm_mmu_cache_min_pages(&selftest_vm.kvm.arch.mmu); + for (i = 0; i < pkvm_selftest_pages(); i++) { if (p[i].refcount) continue; p[i].refcount = 1; - hyp_put_page(&selftest_vm.pool, hyp_page_to_virt(&p[i])); + if (seeded < min_pages) { + push_hyp_memcache(&selftest_vcpu.vcpu.arch.pkvm_memcache, + hyp_page_to_virt(&p[i]), hyp_virt_to_phys); + seeded++; + } else { + hyp_put_page(&selftest_vm.pool, hyp_page_to_virt(&p[i])); + } } selftest_vm.kvm.arch.pkvm.handle = __pkvm_reserve_vm(); -- cgit v1.2.3 From 8ed0fbe5404616041f6daf1d2fa1824d75602f63 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 1 May 2026 12:21:48 +0100 Subject: KVM: arm64: Pre-check vcpu memcache for host->guest share __pkvm_host_share_guest() ends with kvm_pgtable_stage2_map() to install the guest stage-2 mapping, after a forward pass that mutates the host vmemmap (sets PKVM_PAGE_SHARED_OWNED and increments host_share_guest_count) for every page in the range. The map's return value is wrapped in WARN_ON() and otherwise discarded, asserting that the call cannot fail. WARN_ON() at nVHE EL2 panics, so this assertion is only correct if the call genuinely cannot fail. kvm_pgtable_stage2_map() can fail with -ENOMEM when the stage-2 walker exhausts the caller's memcache, and the host controls the vcpu memcache via the topup interface, so an under-provisioned share request would otherwise turn a recoverable -ENOMEM into a fatal hyp panic. Bound the worst-case walker allocation in the existing pre-check pass so that kvm_pgtable_stage2_map() cannot fail at the call site, using kvm_mmu_cache_min_pages() -- the same bound host EL1 uses for its own stage-2 maps. If the vcpu memcache holds fewer pages, return -ENOMEM before any state mutation. 
Fixes: d0bd3e6570ae ("KVM: arm64: Introduce __pkvm_host_share_guest()") Assisted-by: Gemini:gemini-3.1-pro review-prompts Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260501112149.2824881-6-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 3263f6d0a0a4..6af4a1c95aef 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -1384,6 +1384,22 @@ unlock: return ret && ret != -EHWPOISON ? ret : 0; } +/* + * share/donate install at most one stage-2 leaf (PAGE_SIZE, or one + * KVM_PGTABLE_LAST_LEVEL - 1 block for share). kvm_mmu_cache_min_pages() + * bounds the worst-case allocation: exact for the PAGE_SIZE leaf, + * conservative by one for the block. + */ +static int __guest_check_pgtable_memcache(struct pkvm_hyp_vcpu *vcpu) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + + if (vcpu->vcpu.arch.pkvm_memcache.nr_pages < kvm_mmu_cache_min_pages(vm->pgt.mmu)) + return -ENOMEM; + + return 0; +} + int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu) { struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); @@ -1468,6 +1484,10 @@ int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu } } + ret = __guest_check_pgtable_memcache(vcpu); + if (ret) + goto unlock; + for_each_hyp_page(page, phys, size) { set_host_state(page, PKVM_PAGE_SHARED_OWNED); page->host_share_guest_count++; -- cgit v1.2.3 From cada2549ca4c934e6fb3801f857c6b4b0c36490b Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 1 May 2026 12:21:49 +0100 Subject: KVM: arm64: Pre-check vcpu memcache for host->guest donate __pkvm_host_donate_guest() flips the host stage-2 PTE for the donated page to a non-valid annotation via host_stage2_set_owner_metadata_locked() and then calls kvm_pgtable_stage2_map() to install the 
matching guest stage-2 mapping. The map's return value is wrapped in WARN_ON() and otherwise discarded, asserting that the call cannot fail. WARN_ON() at nVHE EL2 panics, so this assertion is only correct if the call genuinely cannot fail. kvm_pgtable_stage2_map() can fail with -ENOMEM even at PAGE_SIZE granularity: the donate path verifies PKVM_NOPAGE for the guest IPA before the map, so the walker must allocate fresh page-table pages from the vcpu memcache, and the host controls the vcpu memcache via the topup interface. An under-provisioned donation request would otherwise turn a recoverable -ENOMEM into a fatal hyp panic. Bound the worst-case walker allocation alongside the existing __host_check_page_state_range() / __guest_check_page_state_range() pre-checks, using the helper introduced for host->guest share. If the vcpu memcache holds fewer pages than kvm_mmu_cache_min_pages(), return -ENOMEM before any state mutation. Fixes: 1e579adca177 ("KVM: arm64: Introduce __pkvm_host_donate_guest()") Assisted-by: Gemini:gemini-3.1-pro review-prompts Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260501112149.2824881-7-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 6af4a1c95aef..848c5b9dcfe6 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -1419,6 +1419,10 @@ int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu) if (ret) goto unlock; + ret = __guest_check_pgtable_memcache(vcpu); + if (ret) + goto unlock; + meta = host_stage2_encode_gfn_meta(vm, gfn); WARN_ON(host_stage2_set_owner_metadata_locked(phys, PAGE_SIZE, PKVM_ID_GUEST, meta)); -- cgit v1.2.3 From 4cceeb8da363ac5127b147ee7345104743f53e9d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 18 May 2026 16:31:26 +0100 Subject: KVM: arm64: Don't populate 
TPIDR_EL2 in finalise_el2() Currently, it is not necessary for __finalise_el2() to configure TPIDR_EL2: * The hyp stub code does not consume the value of TPIDR_EL2. * On the boot cpu, TPIDR_EL1 is used for the percpu offset until the ARM64_HAS_VIRT_HOST_EXTN cpucap is detected and boot alternatives are patched. Before boot alternatives are patched, cpu_copy_el2regs() will copy TPIDR_EL1 into TPIDR_EL2. It is not necessary for __finalise_el2() to initialise TPIDR_EL2 before this. * Secondary CPUs are brought up after boot alternatives have been patched, and __secondary_switched() will initialize TPIDR_EL2 in 'init_cpu_task', after finalise_el2() calls __finalise_el2(). * KVM hyp code which may consume TPIDR_EL2 is brought up after all secondaries have been booted, once TPIDR_EL2 has been configured on all CPUs. Remove the redundant initialisation from __finalise_el2(). Cc: Oliver Upton Cc: Marc Zyngier Cc: Catalin Marinas Reviewed-by: Mark Rutland Signed-off-by: Will Deacon Reviewed-by: Marc Zyngier Link: https://patch.msgid.link/20260518153127.6078-1-will@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kernel/hyp-stub.S | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 634ddc904244..37c6976e44a4 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -104,11 +104,9 @@ SYM_CODE_START_LOCAL(__finalise_el2) mov_q x0, HCR_HOST_VHE_FLAGS msr_hcr_el2 x0 - // Use the EL1 allocated stack, per-cpu offset + // Use the EL1 allocated stack mrs x0, sp_el1 mov sp, x0 - mrs x0, tpidr_el1 - msr tpidr_el2, x0 // FP configuration, vectors mrs_s x0, SYS_CPACR_EL12 -- cgit v1.2.3 From a878096e0e86b44e758aafc6b26af97e8f548673 Mon Sep 17 00:00:00 2001 From: Wei-Lin Chang Date: Tue, 14 Apr 2026 01:03:31 +0100 Subject: KVM: arm64: nv: Rename vtcr_to_walk_info() to setup_s2_walk() This rename aligns the stage-2 walker better with the stage-1 walker.
Also set up other non-VTCR walk info in the function. Signed-off-by: Wei-Lin Chang Link: https://patch.msgid.link/20260414000334.3947257-2-weilin.chang@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/nested.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 883b6c1008fb..00e8bc939baf 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -378,9 +378,12 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa, return 0; } -static void vtcr_to_walk_info(u64 vtcr, struct s2_walk_info *wi) +static void setup_s2_walk(struct kvm_vcpu *vcpu, struct s2_walk_info *wi) { - wi->t0sz = vtcr & TCR_EL2_T0SZ_MASK; + u64 vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2); + + wi->baddr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); + wi->t0sz = vtcr & VTCR_EL2_T0SZ_MASK; switch (FIELD_GET(VTCR_EL2_TG0_MASK, vtcr)) { case VTCR_EL2_TG0_4K: @@ -398,12 +401,12 @@ static void vtcr_to_walk_info(u64 vtcr, struct s2_walk_info *wi) ps_to_output_size(FIELD_GET(VTCR_EL2_PS_MASK, vtcr), false)); wi->ha = vtcr & VTCR_EL2_HA; + wi->be = vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_EE; } int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa, struct kvm_s2_trans *result) { - u64 vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2); struct s2_walk_info wi; int ret; @@ -412,11 +415,7 @@ int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa, if (!vcpu_has_nv(vcpu)) return 0; - wi.baddr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); - - vtcr_to_walk_info(vtcr, &wi); - - wi.be = vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_EE; + setup_s2_walk(vcpu, &wi); ret = walk_nested_s2_pgd(vcpu, gipa, &wi, result); if (ret) -- cgit v1.2.3 From d7768b4f718503e79e7c626d29e9131b747148ee Mon Sep 17 00:00:00 2001 From: Wei-Lin Chang Date: Tue, 14 Apr 2026 01:03:32 +0100 Subject: KVM: arm64: Factor out TG0/1 decoding of VTCR and TCR The current code decodes TCR.TG0/TG1 and VTCR.TG0 inline at several places. 
Extract this logic into helpers so the granule size can be derived in one place. This enables us to alter the effective granule size in the same place, which we will do in a later patch. Signed-off-by: Wei-Lin Chang Link: https://patch.msgid.link/20260414000334.3947257-3-weilin.chang@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/at.c | 77 +++++++++++++++++++++++++++++++------------------ arch/arm64/kvm/nested.c | 27 ++++++++++------- 2 files changed, 65 insertions(+), 39 deletions(-) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 9f8f0ae8e86e..6ebcf65b4ffa 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -136,14 +136,58 @@ static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi) wi->e0poe = (wi->regime != TR_EL2) && (val & TCR2_EL1_E0POE); } +static unsigned int tcr_to_tg0_pgshift(u64 tcr) +{ + u64 tg0 = tcr & TCR_TG0_MASK; + + switch (tg0) { + case TCR_TG0_4K: + return 12; + case TCR_TG0_16K: + return 14; + case TCR_TG0_64K: + default: /* IMPDEF: treat any other value as 64k */ + return 16; + } +} + +static unsigned int tcr_to_tg1_pgshift(u64 tcr) +{ + u64 tg1 = tcr & TCR_TG1_MASK; + + switch (tg1) { + case TCR_TG1_4K: + return 12; + case TCR_TG1_16K: + return 14; + case TCR_TG1_64K: + default: /* IMPDEF: treat any other value as 64k */ + return 16; + } +} + +static unsigned int tcr_tg_pgshift(u64 tcr, bool upper_range) +{ + unsigned int shift; + + /* Someone was silly enough to encode TG0/TG1 differently */ + if (upper_range) + shift = tcr_to_tg1_pgshift(tcr); + else + shift = tcr_to_tg0_pgshift(tcr); + + return shift; +} + static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, struct s1_walk_result *wr, u64 va) { - u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr; + u64 hcr, sctlr, tcr, ps, ia_bits, ttbr; unsigned int stride, x; - bool va55, tbi, lva; + bool va55, tbi, lva, upper_range; va55 = va & BIT(55); + upper_range = va55 && wi->regime != TR_EL2; if (vcpu_has_nv(vcpu)) { hcr = 
__vcpu_sys_reg(vcpu, HCR_EL2); @@ -174,35 +218,12 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, BUG(); } - /* Someone was silly enough to encode TG0/TG1 differently */ - if (va55 && wi->regime != TR_EL2) { + if (upper_range) wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr); - tg = FIELD_GET(TCR_TG1_MASK, tcr); - - switch (tg << TCR_TG1_SHIFT) { - case TCR_TG1_4K: - wi->pgshift = 12; break; - case TCR_TG1_16K: - wi->pgshift = 14; break; - case TCR_TG1_64K: - default: /* IMPDEF: treat any other value as 64k */ - wi->pgshift = 16; break; - } - } else { + else wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr); - tg = FIELD_GET(TCR_TG0_MASK, tcr); - - switch (tg << TCR_TG0_SHIFT) { - case TCR_TG0_4K: - wi->pgshift = 12; break; - case TCR_TG0_16K: - wi->pgshift = 14; break; - case TCR_TG0_64K: - default: /* IMPDEF: treat any other value as 64k */ - wi->pgshift = 16; break; - } - } + wi->pgshift = tcr_tg_pgshift(tcr, upper_range); wi->pa52bit = has_52bit_pa(vcpu, wi, tcr); ia_bits = get_ia_size(wi); diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 00e8bc939baf..a88e5dfddd2b 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -378,28 +378,33 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa, return 0; } -static void setup_s2_walk(struct kvm_vcpu *vcpu, struct s2_walk_info *wi) -{ - u64 vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2); - wi->baddr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); - wi->t0sz = vtcr & VTCR_EL2_T0SZ_MASK; +static unsigned int vtcr_to_tg0_pgshift(u64 vtcr) +{ + u64 tg0 = FIELD_GET(VTCR_EL2_TG0_MASK, vtcr); - switch (FIELD_GET(VTCR_EL2_TG0_MASK, vtcr)) { + switch (tg0) { case VTCR_EL2_TG0_4K: - wi->pgshift = 12; break; + return 12; case VTCR_EL2_TG0_16K: - wi->pgshift = 14; break; + return 14; case VTCR_EL2_TG0_64K: - default: /* IMPDEF: treat any other value as 64k */ - wi->pgshift = 16; break; + default: /* IMPDEF: treat any other value as 64k */ + return 16; } +} + +static void 
setup_s2_walk(struct kvm_vcpu *vcpu, struct s2_walk_info *wi) +{ + u64 vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2); + wi->baddr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); + wi->t0sz = vtcr & VTCR_EL2_T0SZ_MASK; + wi->pgshift = vtcr_to_tg0_pgshift(vtcr); wi->sl = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); /* Global limit for now, should eventually be per-VM */ wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(FIELD_GET(VTCR_EL2_PS_MASK, vtcr), false)); - wi->ha = vtcr & VTCR_EL2_HA; wi->be = vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_EE; } -- cgit v1.2.3 From b154da8288add1f6fb958797d0b3462800f9fc77 Mon Sep 17 00:00:00 2001 From: Wei-Lin Chang Date: Tue, 14 Apr 2026 01:03:33 +0100 Subject: KVM: arm64: nv: Use literal granule size in TLBI range calculation TLBI handling derives the invalidation range from guest VTCR_EL2.TG0 in get_guest_mapping_ttl() and compute_tlb_inval_range(). Switch these to use a helper that returns the decoded VTCR_EL2.TG0 granule size instead of decoding it inline. This keeps the granule size derivation in one place and prepares for following changes that adjust the effective granule size. 
Signed-off-by: Wei-Lin Chang Link: https://patch.msgid.link/20260414000334.3947257-4-weilin.chang@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/nested.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index a88e5dfddd2b..bc95e43c54dd 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -394,6 +394,11 @@ static unsigned int vtcr_to_tg0_pgshift(u64 vtcr) } } +static size_t vtcr_to_tg0_pgsize(u64 vtcr) +{ + return BIT(vtcr_to_tg0_pgshift(vtcr)); +} + static void setup_s2_walk(struct kvm_vcpu *vcpu, struct s2_walk_info *wi) { u64 vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2); @@ -516,20 +521,21 @@ static u8 pgshift_level_to_ttl(u16 shift, u8 level) */ static u8 get_guest_mapping_ttl(struct kvm_s2_mmu *mmu, u64 addr) { - u64 tmp, sz = 0, vtcr = mmu->tlb_vtcr; + u64 tmp, sz = 0; kvm_pte_t pte; u8 ttl, level; + size_t tg0_size = vtcr_to_tg0_pgsize(mmu->tlb_vtcr); lockdep_assert_held_write(&kvm_s2_mmu_to_kvm(mmu)->mmu_lock); - switch (FIELD_GET(VTCR_EL2_TG0_MASK, vtcr)) { - case VTCR_EL2_TG0_4K: + switch (tg0_size) { + case SZ_4K: ttl = (TLBI_TTL_TG_4K << 2); break; - case VTCR_EL2_TG0_16K: + case SZ_16K: ttl = (TLBI_TTL_TG_16K << 2); break; - case VTCR_EL2_TG0_64K: + case SZ_64K: default: /* IMPDEF: treat any other value as 64k */ ttl = (TLBI_TTL_TG_64K << 2); break; @@ -539,19 +545,19 @@ static u8 get_guest_mapping_ttl(struct kvm_s2_mmu *mmu, u64 addr) again: /* Iteratively compute the block sizes for a particular granule size */ - switch (FIELD_GET(VTCR_EL2_TG0_MASK, vtcr)) { - case VTCR_EL2_TG0_4K: + switch (tg0_size) { + case SZ_4K: if (sz < SZ_4K) sz = SZ_4K; else if (sz < SZ_2M) sz = SZ_2M; else if (sz < SZ_1G) sz = SZ_1G; else sz = 0; break; - case VTCR_EL2_TG0_16K: + case SZ_16K: if (sz < SZ_16K) sz = SZ_16K; else if (sz < SZ_32M) sz = SZ_32M; else sz = 0; break; - case VTCR_EL2_TG0_64K: + case SZ_64K: default: /* IMPDEF: treat any other 
value as 64k */ if (sz < SZ_64K) sz = SZ_64K; else if (sz < SZ_512M) sz = SZ_512M; @@ -602,14 +608,14 @@ unsigned long compute_tlb_inval_range(struct kvm_s2_mmu *mmu, u64 val) if (!max_size) { /* Compute the maximum extent of the invalidation */ - switch (FIELD_GET(VTCR_EL2_TG0_MASK, mmu->tlb_vtcr)) { - case VTCR_EL2_TG0_4K: + switch (vtcr_to_tg0_pgsize(mmu->tlb_vtcr)) { + case SZ_4K: max_size = SZ_1G; break; - case VTCR_EL2_TG0_16K: + case SZ_16K: max_size = SZ_32M; break; - case VTCR_EL2_TG0_64K: + case SZ_64K: default: /* IMPDEF: treat any other value as 64k */ /* * No, we do not support 52bit IPA in nested yet. Once -- cgit v1.2.3 From 8853566dfbab1a255ae72676ab5ec43e1631ddb7 Mon Sep 17 00:00:00 2001 From: Wei-Lin Chang Date: Tue, 14 Apr 2026 01:03:34 +0100 Subject: KVM: arm64: Fallback to a supported value for unsupported guest TGx When KVM derives the translation granule for emulated stage-1 and stage-2 walks, it decodes TCR/VTCR.TGx and treats the granule as-is. This is wrong when the guest programs a granule size that is not advertised in the guest's ID_AA64MMFR0_EL1.TGRAN* fields. Architecturally, such a value must be treated as an implemented granule size. Choose an available one while prioritizing PAGE_SIZE. 
Signed-off-by: Wei-Lin Chang Link: https://patch.msgid.link/20260414000334.3947257-5-weilin.chang@arm.com [maz: minor tidying up] Signed-off-by: Marc Zyngier --- arch/arm64/kvm/at.c | 52 ++++++++++++++++++++++++- arch/arm64/kvm/nested.c | 100 ++++++++++++++++++++++++++++++++++-------------- 2 files changed, 122 insertions(+), 30 deletions(-) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 6ebcf65b4ffa..60d51e98ccb0 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -136,6 +136,30 @@ static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi) wi->e0poe = (wi->regime != TR_EL2) && (val & TCR2_EL1_E0POE); } +#define _has_tgran(__r, __sz) \ + ({ \ + u64 _s1, _mmfr0 = __r; \ + \ + _s1 = SYS_FIELD_GET(ID_AA64MMFR0_EL1, \ + TGRAN##__sz, _mmfr0); \ + \ + _s1 != ID_AA64MMFR0_EL1_TGRAN##__sz##_NI; \ + }) + +static bool has_tgran(u64 mmfr0, unsigned int shift) +{ + switch (shift) { + case 12: + return _has_tgran(mmfr0, 4); + case 14: + return _has_tgran(mmfr0, 16); + case 16: + return _has_tgran(mmfr0, 64); + default: + BUG(); + } +} + static unsigned int tcr_to_tg0_pgshift(u64 tcr) { u64 tg0 = tcr & TCR_TG0_MASK; @@ -166,8 +190,23 @@ static unsigned int tcr_to_tg1_pgshift(u64 tcr) } } -static unsigned int tcr_tg_pgshift(u64 tcr, bool upper_range) +static unsigned int fallback_tgran_shift(u64 mmfr0) { + if (has_tgran(mmfr0, PAGE_SHIFT)) + return PAGE_SHIFT; + else if (has_tgran(mmfr0, 12)) + return 12; + else if (has_tgran(mmfr0, 14)) + return 14; + else if (has_tgran(mmfr0, 16)) + return 16; + else /* Should be unreacheable */ + return PAGE_SHIFT; +} + +static unsigned int tcr_tg_pgshift(struct kvm *kvm, u64 tcr, bool upper_range) +{ + u64 mmfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64MMFR0_EL1); unsigned int shift; /* Someone was silly enough to encode TG0/TG1 differently */ @@ -176,6 +215,15 @@ static unsigned int tcr_tg_pgshift(u64 tcr, bool upper_range) else shift = tcr_to_tg0_pgshift(tcr); + /* + * If TGx is programmed to an 
unimplemented value (not advertised in + * ID_AA64MMFR0_EL1), we should treat it as if an implemented value is + * written, as per the architecture. Choose an available one while + * prioritizing PAGE_SIZE. + */ + if (!has_tgran(mmfr0, shift)) + return fallback_tgran_shift(mmfr0); + return shift; } @@ -223,7 +271,7 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, else wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr); - wi->pgshift = tcr_tg_pgshift(tcr, upper_range); + wi->pgshift = tcr_tg_pgshift(vcpu->kvm, tcr, upper_range); wi->pa52bit = has_52bit_pa(vcpu, wi, tcr); ia_bits = get_ia_size(wi); diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index bc95e43c54dd..3204b3ef60dd 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -378,25 +378,84 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa, return 0; } +#define _has_tgran_2(__r, __sz) \ + ({ \ + u64 _s1, _s2, _mmfr0 = __r; \ + \ + _s2 = SYS_FIELD_GET(ID_AA64MMFR0_EL1, \ + TGRAN##__sz##_2, _mmfr0); \ + \ + _s1 = SYS_FIELD_GET(ID_AA64MMFR0_EL1, \ + TGRAN##__sz, _mmfr0); \ + \ + ((_s2 != ID_AA64MMFR0_EL1_TGRAN##__sz##_2_NI && \ + _s2 != ID_AA64MMFR0_EL1_TGRAN##__sz##_2_TGRAN##__sz) || \ + (_s2 == ID_AA64MMFR0_EL1_TGRAN##__sz##_2_TGRAN##__sz && \ + _s1 != ID_AA64MMFR0_EL1_TGRAN##__sz##_NI)); \ + }) + +static bool has_tgran_2(u64 mmfr0, unsigned int shift) +{ + switch (shift) { + case 12: + return _has_tgran_2(mmfr0, 4); + case 14: + return _has_tgran_2(mmfr0, 16); + case 16: + return _has_tgran_2(mmfr0, 64); + default: + BUG(); + } +} + +static unsigned int fallback_tgran2_shift(u64 mmfr0) +{ + if (has_tgran_2(mmfr0, PAGE_SHIFT)) + return PAGE_SHIFT; + else if (has_tgran_2(mmfr0, 12)) + return 12; + else if (has_tgran_2(mmfr0, 14)) + return 14; + else if (has_tgran_2(mmfr0, 16)) + return 16; + else + return PAGE_SHIFT; +} -static unsigned int vtcr_to_tg0_pgshift(u64 vtcr) +static unsigned int vtcr_to_tg0_pgshift(struct kvm *kvm, u64 vtcr) { u64 
tg0 = FIELD_GET(VTCR_EL2_TG0_MASK, vtcr); + u64 mmfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64MMFR0_EL1); + unsigned int shift; switch (tg0) { case VTCR_EL2_TG0_4K: - return 12; + shift = 12; + break; case VTCR_EL2_TG0_16K: - return 14; + shift = 14; + break; case VTCR_EL2_TG0_64K: - default: /* IMPDEF: treat any other value as 64k */ - return 16; + /* IMPDEF: treat any other value as 64k, subject to fallback */ + default: + shift = 16; } + + /* + * If TGx is programmed to an unimplemented value (not advertised in + * ID_AA64MMFR0_EL1), we should treat it as if an implemented value is + * written, as per the architecture. Choose an available one while + * prioritizing PAGE_SIZE. + */ + if (!has_tgran_2(mmfr0, shift)) + return fallback_tgran2_shift(mmfr0); + + return shift; } -static size_t vtcr_to_tg0_pgsize(u64 vtcr) +static size_t vtcr_to_tg0_pgsize(struct kvm *kvm, u64 vtcr) { - return BIT(vtcr_to_tg0_pgshift(vtcr)); + return BIT(vtcr_to_tg0_pgshift(kvm, vtcr)); } static void setup_s2_walk(struct kvm_vcpu *vcpu, struct s2_walk_info *wi) @@ -405,7 +464,7 @@ static void setup_s2_walk(struct kvm_vcpu *vcpu, struct s2_walk_info *wi) wi->baddr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); wi->t0sz = vtcr & VTCR_EL2_T0SZ_MASK; - wi->pgshift = vtcr_to_tg0_pgshift(vtcr); + wi->pgshift = vtcr_to_tg0_pgshift(vcpu->kvm, vtcr); wi->sl = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); /* Global limit for now, should eventually be per-VM */ wi->max_oa_bits = min(get_kvm_ipa_limit(), @@ -521,10 +580,10 @@ static u8 pgshift_level_to_ttl(u16 shift, u8 level) */ static u8 get_guest_mapping_ttl(struct kvm_s2_mmu *mmu, u64 addr) { + size_t tg0_size = vtcr_to_tg0_pgsize(kvm_s2_mmu_to_kvm(mmu), mmu->tlb_vtcr); u64 tmp, sz = 0; kvm_pte_t pte; u8 ttl, level; - size_t tg0_size = vtcr_to_tg0_pgsize(mmu->tlb_vtcr); lockdep_assert_held_write(&kvm_s2_mmu_to_kvm(mmu)->mmu_lock); @@ -608,7 +667,7 @@ unsigned long compute_tlb_inval_range(struct kvm_s2_mmu *mmu, u64 val) if (!max_size) { /* Compute the maximum extent 
of the invalidation */ - switch (vtcr_to_tg0_pgsize(mmu->tlb_vtcr)) { + switch (vtcr_to_tg0_pgsize(kvm, mmu->tlb_vtcr)) { case SZ_4K: max_size = SZ_1G; break; @@ -1508,21 +1567,6 @@ static void kvm_map_l1_vncr(struct kvm_vcpu *vcpu) } } -#define has_tgran_2(__r, __sz) \ - ({ \ - u64 _s1, _s2, _mmfr0 = __r; \ - \ - _s2 = SYS_FIELD_GET(ID_AA64MMFR0_EL1, \ - TGRAN##__sz##_2, _mmfr0); \ - \ - _s1 = SYS_FIELD_GET(ID_AA64MMFR0_EL1, \ - TGRAN##__sz, _mmfr0); \ - \ - ((_s2 != ID_AA64MMFR0_EL1_TGRAN##__sz##_2_NI && \ - _s2 != ID_AA64MMFR0_EL1_TGRAN##__sz##_2_TGRAN##__sz) || \ - (_s2 == ID_AA64MMFR0_EL1_TGRAN##__sz##_2_TGRAN##__sz && \ - _s1 != ID_AA64MMFR0_EL1_TGRAN##__sz##_NI)); \ - }) /* * Our emulated CPU doesn't support all the possible features. For the * sake of simplicity (and probably mental sanity), wipe out a number @@ -1609,15 +1653,15 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val) */ switch (PAGE_SIZE) { case SZ_4K: - if (has_tgran_2(orig_val, 4)) + if (_has_tgran_2(orig_val, 4)) val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, IMP); fallthrough; case SZ_16K: - if (has_tgran_2(orig_val, 16)) + if (_has_tgran_2(orig_val, 16)) val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, IMP); fallthrough; case SZ_64K: - if (has_tgran_2(orig_val, 64)) + if (_has_tgran_2(orig_val, 64)) val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN64_2, IMP); break; } -- cgit v1.2.3 From e72b425f617edebf6f374fa39f2c763c9b3696ec Mon Sep 17 00:00:00 2001 From: "Zenghui Yu (Huawei)" Date: Wed, 18 Mar 2026 22:43:05 +0800 Subject: KVM: arm64: Remove @arch from __load_stage2() Since commit fe49fd940e22 ("KVM: arm64: Move VTCR_EL2 into struct s2_mmu"), @arch is no longer required to obtain the per-kvm_s2_mmu vtcr and can be removed from __load_stage2(). 
Signed-off-by: Zenghui Yu (Huawei) Reviewed-by: Anshuman Khandual Link: https://patch.msgid.link/20260318144305.56831-1-zenghui.yu@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_mmu.h | 3 +-- arch/arm64/kvm/at.c | 2 +- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 2 +- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 2 +- arch/arm64/kvm/hyp/nvhe/switch.c | 2 +- arch/arm64/kvm/hyp/nvhe/tlb.c | 4 ++-- arch/arm64/kvm/hyp/vhe/switch.c | 2 +- arch/arm64/kvm/hyp/vhe/tlb.c | 4 ++-- 8 files changed, 10 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 01e9c72d6aa7..6eae7e7e2a68 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -318,8 +318,7 @@ static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu) * Must be called from hyp code running at EL2 with an updated VTTBR * and interrupts disabled. */ -static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, - struct kvm_arch *arch) +static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu) { write_sysreg(mmu->vtcr, vtcr_el2); write_sysreg(kvm_get_vttbr(mmu), vttbr_el2); diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 9f8f0ae8e86e..b91ef006919e 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -1380,7 +1380,7 @@ static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) } } write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR); - __load_stage2(mmu, mmu->arch); + __load_stage2(mmu); skip_mmu_switch: /* Temporarily switch back to guest context */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 3cbfae0e3dda..aaeec6862215 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -67,7 +67,7 @@ int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages, static __always_inline void __load_host_stage2(void) { if 
(static_branch_likely(&kvm_protected_mode_initialized)) - __load_stage2(&host_mmu.arch.mmu, &host_mmu.arch); + __load_stage2(&host_mmu.arch.mmu); else write_sysreg(0, vttbr_el2); } diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 28a471d1927c..888bd7e71d0c 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -337,7 +337,7 @@ int __pkvm_prot_finalize(void) kvm_flush_dcache_to_poc(params, sizeof(*params)); write_sysreg_hcr(params->hcr_el2); - __load_stage2(&host_mmu.arch.mmu, &host_mmu.arch); + __load_stage2(&host_mmu.arch.mmu); /* * Make sure to have an ISB before the TLB maintenance below but only diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 8d1df3d33595..7318e3e6a5f3 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -315,7 +315,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) __sysreg_restore_state_nvhe(guest_ctxt); mmu = kern_hyp_va(vcpu->arch.hw_mmu); - __load_stage2(mmu, kern_hyp_va(mmu->arch)); + __load_stage2(mmu); __activate_traps(vcpu); __hyp_vgic_restore_state(vcpu); diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index b29140995d48..fdb90483340c 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -110,7 +110,7 @@ static void enter_vmid_context(struct kvm_s2_mmu *mmu, if (vcpu) __load_host_stage2(); else - __load_stage2(mmu, kern_hyp_va(mmu->arch)); + __load_stage2(mmu); asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT)); } @@ -128,7 +128,7 @@ static void exit_vmid_context(struct tlb_inv_context *cxt) return; if (vcpu) - __load_stage2(mmu, kern_hyp_va(mmu->arch)); + __load_stage2(mmu); else __load_host_stage2(); diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 1e8995add14f..bbe9cebd3d9d 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -219,7 +219,7 
@@ void kvm_vcpu_load_vhe(struct kvm_vcpu *vcpu) __vcpu_load_switch_sysregs(vcpu); __vcpu_load_activate_traps(vcpu); - __load_stage2(vcpu->arch.hw_mmu, vcpu->arch.hw_mmu->arch); + __load_stage2(vcpu->arch.hw_mmu); } void kvm_vcpu_put_vhe(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c index f7b9dfe3f3a5..c386d9f1c101 100644 --- a/arch/arm64/kvm/hyp/vhe/tlb.c +++ b/arch/arm64/kvm/hyp/vhe/tlb.c @@ -60,7 +60,7 @@ static void enter_vmid_context(struct kvm_s2_mmu *mmu, * place before clearing TGE. __load_stage2() already * has an ISB in order to deal with this. */ - __load_stage2(mmu, mmu->arch); + __load_stage2(mmu); val = read_sysreg(hcr_el2); val &= ~HCR_TGE; write_sysreg_hcr(val); @@ -78,7 +78,7 @@ static void exit_vmid_context(struct tlb_inv_context *cxt) /* ... and the stage-2 MMU context that we switched away from */ if (cxt->mmu) - __load_stage2(cxt->mmu, cxt->mmu->arch); + __load_stage2(cxt->mmu); if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { /* Restore the registers to what they were */ -- cgit v1.2.3 From 978cd6b2ad036168712aad8fca213385a5b15e2d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 2 Jun 2026 16:54:27 +0100 Subject: KVM: arm64: Key CPTR_EL2.E0POE propagation on FEAT_S1POE We propagate CPTR_EL2.E0POE from a L1 into the L0 configuration, but we key this on the L1 guest supporting FEAT_S2POE. This is obviously wrong, as this bit is solely concerned with Stage-1 translation. Fix this by making the update depend on FEAT_S1POE. 
Fixes: cd931bd6093cb ("KVM: arm64: nv: Add additional trap setup for CPTR_EL2") Reviewed-by: Joey Gouly Reviewed-by: Oliver Upton Link: https://patch.msgid.link/20260602155430.2088142-2-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/hyp/switch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 98b2976837b1..4d814ae90613 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -141,7 +141,7 @@ static inline void __activate_cptr_traps_vhe(struct kvm_vcpu *vcpu) if (!(SYS_FIELD_GET(CPACR_EL1, ZEN, cptr) & BIT(0))) val &= ~CPACR_EL1_ZEN; - if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP)) + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S1POE, IMP)) val |= cptr & CPACR_EL1_E0POE; val |= cptr & CPTR_EL2_TCPAC; -- cgit v1.2.3 From f41b481548cc263112b6da4a3b4869fcd35b4e45 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 2 Jun 2026 16:54:28 +0100 Subject: KVM: arm64: Wire AT S1E1A in the system instruction handling table Despite having handling code for AT S1E1A, the instruction was never plugged into the system instruction table, leading to an exception being injected in the guest. If the guest is Linux and using the __kvm_at() helper, the exception is actually handled in the helper, and KVM continues more or less silently by reentering the guest. Not exactly what you'd expect. Fix this by plugging the emulation code where required. 
Fixes: ff987ffc0c18c ("KVM: arm64: nv: Add support for FEAT_ATS1A") Reviewed-by: Joey Gouly Reviewed-by: Oliver Upton Link: https://patch.msgid.link/20260602155430.2088142-3-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 148fc3400ea8..753fe30d322c 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -4217,6 +4217,7 @@ static struct sys_reg_desc sys_insn_descs[] = { SYS_INSN(AT_S1E0W, handle_at_s1e01), SYS_INSN(AT_S1E1RP, handle_at_s1e01), SYS_INSN(AT_S1E1WP, handle_at_s1e01), + SYS_INSN(AT_S1E1A, handle_at_s1e01), { SYS_DESC(SYS_DC_CSW), access_dcsw }, { SYS_DESC(SYS_DC_CGSW), access_dcgsw }, -- cgit v1.2.3 From a62b4226ae47202eb00306b576859131c4c7196e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 2 Jun 2026 16:54:29 +0100 Subject: arm64: cpufeature: Expose ID_AA64ISAR2_EL1.ATS1A to KVM KVM needs to know if the HW implements FEAT_ATS1A in order to correctly sanitise HFGITR_EL2.ATS1E1A, which otherwise defaults to RES0 and AT S1E1A traps are handled as UNDEF. Solve this by exposing ID_AA64ISAR2_EL1.ATS1A to the rest of the kernel.
Fixes: ff987ffc0c18c ("KVM: arm64: nv: Add support for FEAT_ATS1A") Reviewed-by: Joey Gouly Reviewed-by: Oliver Upton Link: https://patch.msgid.link/20260602155430.2088142-4-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kernel/cpufeature.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 6d53bb15cf7b..62b0d77217ee 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -266,6 +266,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { }; static const struct arm64_ftr_bits ftr_id_aa64isar2[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_ATS1A_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_LUT_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_CSSC_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_RPRFM_SHIFT, 4, 0), -- cgit v1.2.3 From 9f76b039a72d7e06374aa96862f0232ed53f7787 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 2 Jun 2026 16:54:46 -0700 Subject: KVM: arm64: Don't leak PFN when kvm_translate_vncr() races MMU notifier In the case that kvm_translate_vncr() races with an MMU notifier the early return does not release a reference on the faulted in PFN. Add the necessary call to kvm_release_faultin_page() for the unused PFN. 
Cc: stable@vger.kernel.org Fixes: 069a05e535496 ("KVM: arm64: nv: Handle VNCR_EL2-triggered faults") Reported-by: Sashiko (local):gemini-3.1-pro Signed-off-by: Oliver Upton Link: https://patch.msgid.link/20260602235450.103057-2-oupton@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/nested.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 883b6c1008fb..4fa82e96454d 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1326,8 +1326,10 @@ static int kvm_translate_vncr(struct kvm_vcpu *vcpu, bool *is_gmem) } scoped_guard(write_lock, &vcpu->kvm->mmu_lock) { - if (mmu_invalidate_retry(vcpu->kvm, mmu_seq)) + if (mmu_invalidate_retry(vcpu->kvm, mmu_seq)) { + kvm_release_faultin_page(vcpu->kvm, page, true, false); return -EAGAIN; + } vt->gva = va; vt->hpa = pfn << PAGE_SHIFT; -- cgit v1.2.3 From 5949004d7032767e8fde1e8c986a33f241b2a192 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 2 Jun 2026 16:54:47 -0700 Subject: KVM: arm64: nv: Fully update VNCR fixmap state in kvm_translate_vncr() kvm_translate_vncr() first invalidates the pseudo-TLB entry and corresponding fixmap in anticipation of installing a new translation. While the fixmap invalidation does clear the mapping from host stage-1, it does not clear the L1_VNCR_MAPPED flag. Depending on the state of the VNCR TLB at vcpu_put(), this could potentially precipitate a BUG_ON() if vt->cpu is reset. Share a helper with kvm_vcpu_put_hw_mmu(), ensuring that KVM's view of the VNCR fixmap is in sync with the state of the VNCR TLB. Give it a slightly verbose name to make it obvious that it is meant to be used local to a CPU, unlike other VNCR TLB maintenance. 
Fixes: 069a05e535496 ("KVM: arm64: nv: Handle VNCR_EL2-triggered faults") Signed-off-by: Oliver Upton Link: https://patch.msgid.link/20260602235450.103057-3-oupton@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/nested.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 4fa82e96454d..d0545144eaac 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -797,18 +797,24 @@ void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu) } } +static void this_cpu_reset_vncr_fixmap(struct kvm_vcpu *vcpu) +{ + if (!host_data_test_flag(L1_VNCR_MAPPED)) + return; + + BUG_ON(vcpu->arch.vncr_tlb->cpu != smp_processor_id()); + BUG_ON(is_hyp_ctxt(vcpu)); + + clear_fixmap(vncr_fixmap(vcpu->arch.vncr_tlb->cpu)); + vcpu->arch.vncr_tlb->cpu = -1; + host_data_clear_flag(L1_VNCR_MAPPED); + atomic_dec(&vcpu->kvm->arch.vncr_map_count); +} + void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu) { /* Unconditionally drop the VNCR mapping if we have one */ - if (host_data_test_flag(L1_VNCR_MAPPED)) { - BUG_ON(vcpu->arch.vncr_tlb->cpu != smp_processor_id()); - BUG_ON(is_hyp_ctxt(vcpu)); - - clear_fixmap(vncr_fixmap(vcpu->arch.vncr_tlb->cpu)); - vcpu->arch.vncr_tlb->cpu = -1; - host_data_clear_flag(L1_VNCR_MAPPED); - atomic_dec(&vcpu->kvm->arch.vncr_map_count); - } + this_cpu_reset_vncr_fixmap(vcpu); /* * Keep a reference on the associated stage-2 MMU if the vCPU is @@ -1282,7 +1288,8 @@ static int kvm_translate_vncr(struct kvm_vcpu *vcpu, bool *is_gmem) * We also prepare the next walk wilst we're at it. 
*/ scoped_guard(write_lock, &vcpu->kvm->mmu_lock) { - invalidate_vncr(vt); + this_cpu_reset_vncr_fixmap(vcpu); + vt->valid = false; vt->wi = (struct s1_walk_info) { .regime = TR_EL20, -- cgit v1.2.3 From efa871f4a2517385295de2e3f786e4ae4ffa6e77 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 2 Jun 2026 16:54:48 -0700 Subject: KVM: arm64: nv: Inject SEA TTW when desc update can't write to GPA Similar to the handling of descriptor reads, inject an SEA during TTW when the descriptor access fails for reasons other than a race, such as a read-only memslot or a bad HVA. Fixes: bff8aa213dee ("KVM: arm64: Implement HW access flag management in stage-1 SW PTW") Fixes: e4c7dfac2f1a ("KVM: arm64: nv: Implement HW access flag management in stage-2 SW PTW") Signed-off-by: Oliver Upton Link: https://patch.msgid.link/20260602235450.103057-4-oupton@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/at.c | 6 +++++- arch/arm64/kvm/nested.c | 7 ++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 9f8f0ae8e86e..119a603e636e 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -521,8 +521,12 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, } ret = kvm_swap_s1_desc(vcpu, ipa, desc, new_desc, wi); - if (ret) + if (ret == -EAGAIN) return ret; + if (ret) { + fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false); + return ret; + } desc = new_desc; } diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index d0545144eaac..f8d3f3a72328 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -352,8 +352,13 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa, if (new_desc != desc) { ret = swap_guest_s2_desc(vcpu, paddr, desc, new_desc, wi); - if (ret) + if (ret == -EAGAIN) return ret; + if (ret) { + out->esr = ESR_ELx_FSC_SEA_TTW(level); + out->desc = desc; + return 1; + } desc = new_desc; } -- cgit v1.2.3 From 
699a2cc7f608145d55621e57828ccf6bfcb8d906 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 2 Jun 2026 16:54:49 -0700 Subject: KVM: arm64: Restart instruction upon race in __kvm_at_s12() __kvm_at_s*() are expected to return -EAGAIN if the page table walk raced with a concurrent update to a page table descriptor, which is interpreted as a signal to restart the trapping instruction. While this mostly works, __kvm_at_s12() silently eats the return from __kvm_at_s1e01() and consumes an uninitialized PAR value. Propagate the nonzero return instead. Fixes: 92c6443222ca ("KVM: arm64: Propagate PTW errors up to AT emulation") Signed-off-by: Oliver Upton Link: https://patch.msgid.link/20260602235450.103057-5-oupton@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/at.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 119a603e636e..6cc5892023dd 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -1557,7 +1557,10 @@ int __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) return 0; } - __kvm_at_s1e01(vcpu, op, vaddr); + ret = __kvm_at_s1e01(vcpu, op, vaddr); + if (ret) + return ret; + par = vcpu_read_sys_reg(vcpu, PAR_EL1); if (par & SYS_PAR_EL1_F) return 0; -- cgit v1.2.3 From d8839941df7de41fd4b02b7b7cdd0c46e5ba501e Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 2 Jun 2026 16:54:50 -0700 Subject: KVM: arm64: nv: Restart stage-1 walk if stage-2 desc update fails kvm_walk_nested_s2() returns -EAGAIN as an indication that an underlying descriptor update fails due to a race. The expectation is that the caller restart translation, yet walk_s1() actually synthesizes an abort. Propagate the -EAGAIN return out of walk_s1(), relying on callers to restart the translation fetch. 
Fixes: e4c7dfac2f1a ("KVM: arm64: nv: Implement HW access flag management in stage-2 SW PTW") Signed-off-by: Oliver Upton Link: https://patch.msgid.link/20260602235450.103057-6-oupton@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/at.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 6cc5892023dd..4d4285e60fce 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -423,6 +423,9 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, if (wi->s2) { ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans); + if (ret == -EAGAIN) + return ret; + if (ret) { fail_s1_walk(wr, (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level, -- cgit v1.2.3 From e8042f6e1d7befb2fb6b10a75918642bcd0acf9a Mon Sep 17 00:00:00 2001 From: Hyunwoo Kim Date: Sun, 7 Jun 2026 02:56:10 +0900 Subject: KVM: arm64: Clear __hyp_running_vcpu when flushing the pKVM hyp vCPU flush_hyp_vcpu() copies the host vCPU context into the hyp's private vCPU on every run. ctxt_to_vcpu() expects a guest context to have a NULL __hyp_running_vcpu, which is only ever set on the host context, so that it resolves the vCPU via container_of(). While this is generally the case, flush_hyp_vcpu() copies the context verbatim and does not enforce this, so a value provided by the host is dereferenced at EL2 (host -> EL2). Fix by clearing __hyp_running_vcpu after the copy. 
Cc: stable@vger.kernel.org Fixes: be66e67f1750 ("KVM: arm64: Use the pKVM hyp vCPU structure in handle___kvm_vcpu_run()") Signed-off-by: Hyunwoo Kim Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20260606175614.83273-2-imv4bel@gmail.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 06db299c37a8..02c5d6e5abcb 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -128,6 +128,9 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt; + /* __hyp_running_vcpu must be NULL in a guest context. */ + hyp_vcpu->vcpu.arch.ctxt.__hyp_running_vcpu = NULL; + hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2; hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWI | HCR_TWE); hyp_vcpu->vcpu.arch.hcr_el2 |= READ_ONCE(host_vcpu->arch.hcr_el2) & -- cgit v1.2.3 From 8cc8bbbfab14c22c5551d0dd19b208a44b141c76 Mon Sep 17 00:00:00 2001 From: Hyunwoo Kim Date: Sun, 7 Jun 2026 02:56:11 +0900 Subject: KVM: arm64: Bound used_lrs when flushing the pKVM hyp vCPU flush_hyp_vcpu() copies the host vGIC state into the hyp's private vCPU on every run. The vGIC list register save and restore use used_lrs as their loop bound and expect it to stay within the number of implemented list registers. While this is generally the case, flush_hyp_vcpu() copies vgic_v3 verbatim and does not enforce this, so a value provided by the host is used at EL2 to index vgic_lr[] and access ICH_LR_EL2 (host -> EL2). Fix by clamping used_lrs to the number of implemented list registers after the copy, as the trusted path already does in vgic_flush_lr_state(). The number of implemented list registers is constant after init, so it is replicated once from kvm_vgic_global_state.nr_lr into hyp_gicv3_nr_lr rather than read on every entry. 
Cc: stable@vger.kernel.org Fixes: be66e67f1750 ("KVM: arm64: Use the pKVM hyp vCPU structure in handle___kvm_vcpu_run()") Signed-off-by: Hyunwoo Kim Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20260606175614.83273-3-imv4bel@gmail.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_hyp.h | 1 + arch/arm64/kvm/arm.c | 2 ++ arch/arm64/kvm/hyp/nvhe/hyp-main.c | 9 +++++++++ 3 files changed, 12 insertions(+) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 8d06b62e7188..e9b2b0c40ec6 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -157,5 +157,6 @@ extern unsigned long kvm_nvhe_sym(__icache_flags); extern unsigned int kvm_nvhe_sym(kvm_arm_vmid_bits); extern unsigned int kvm_nvhe_sym(kvm_host_sve_max_vl); extern unsigned long kvm_nvhe_sym(hyp_nr_cpus); +extern unsigned int kvm_nvhe_sym(hyp_gicv3_nr_lr); #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 8bb2c7422cc8..74965b358cdf 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2423,6 +2423,8 @@ static int __init init_subsystems(void) switch (err) { case 0: vgic_present = true; + if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + kvm_nvhe_sym(hyp_gicv3_nr_lr) = kvm_vgic_global_state.nr_lr; break; case -ENODEV: case -ENXIO: diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 02c5d6e5abcb..a0da08caa6c2 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -24,6 +24,9 @@ DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); +/* Number of implemented GICv3 LRs. Used by flush_hyp_vcpu(). 
*/ +unsigned int hyp_gicv3_nr_lr; + void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt); static void __hyp_sve_save_guest(struct kvm_vcpu *vcpu) @@ -142,6 +145,12 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3 = host_vcpu->arch.vgic_cpu.vgic_v3; + /* Bound used_lrs by the number of implemented list registers. */ + hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3.used_lrs = + min_t(unsigned int, + hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3.used_lrs, + hyp_gicv3_nr_lr); + hyp_vcpu->vcpu.arch.pid = host_vcpu->arch.pid; } -- cgit v1.2.3 From 832dfa237f836549b202d3eebc0bc29b8a719608 Mon Sep 17 00:00:00 2001 From: "tabba@google.com" Date: Sun, 31 May 2026 16:45:48 +0100 Subject: KVM: arm64: Flush HCR_EL2.VSE to deliver SErrors to pKVM guests With pKVM enabled, the host injects a virtual SError by setting HCR_EL2.VSE on its vCPU copy, but flush_hyp_vcpu() only flows TWI/TWE into the hyp vCPU that runs, so VSE never reaches it and a deferred (masked) SError is never delivered. VSE is a host-owned injection control, not a trap-configuration bit, so restricting the host's trap-register values should not have dropped it. Flow it on entry; sync_hyp_vcpu() already copies hcr_el2 back, so delivery is reflected to the host. This makes it consistent with the existing forwarding of VSESR_EL2, which qualifies the SError. 
Fixes: b56680de9c648 ("KVM: arm64: Initialize trap register values in hyp in pKVM") Reported-by: Sashiko (local):gemini-3.1-pro Signed-off-by: Fuad Tabba Reviewed-by: Oliver Upton Link: https://patch.msgid.link/20260531154548.1505799-1-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index a0da08caa6c2..1d01c6e547f5 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -135,9 +135,14 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) hyp_vcpu->vcpu.arch.ctxt.__hyp_running_vcpu = NULL; hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2; - hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWI | HCR_TWE); + /* + * HCR_EL2.VSE is host-owned (a pending virtual SError to inject), not a + * trap-control bit, so it must flow to the hyp vCPU alongside TWI/TWE + * for the vSError to be delivered. sync_hyp_vcpu() reflects it back. + */ + hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWI | HCR_TWE | HCR_VSE); hyp_vcpu->vcpu.arch.hcr_el2 |= READ_ONCE(host_vcpu->arch.hcr_el2) & - (HCR_TWI | HCR_TWE); + (HCR_TWI | HCR_TWE | HCR_VSE); hyp_vcpu->vcpu.arch.iflags = host_vcpu->arch.iflags; -- cgit v1.2.3 From 63336d57a26904f58e4ff2cf584ef9958564a7c6 Mon Sep 17 00:00:00 2001 From: "tabba@google.com" Date: Fri, 29 May 2026 13:17:53 +0100 Subject: KVM: arm64: Free hyp-share tracking node when share hypercall fails share_pfn_hyp() inserts a tracking node into hyp_shared_pfns and then invokes __pkvm_host_share_hyp. If the hypercall rejects the share (page-state mismatch at EL2), the node stays in the tree with refcount 1: a phantom share that leaks the allocation and that a later unshare will trust. Erase the node and free it on hypercall failure. 
Fixes: a83e2191b7f1 ("KVM: arm64: pkvm: Refcount the pages shared with EL2") Reported-by: Sashiko (local):gemini-3.1-pro Suggested-by: Vincent Donnefort Signed-off-by: Fuad Tabba Reviewed-by: Vincent Donnefort Link: https://patch.msgid.link/20260529121755.2923500-2-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/mmu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index d089c107d9b7..0abf3a2d587b 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -501,6 +501,10 @@ static int share_pfn_hyp(u64 pfn) rb_link_node(&this->node, parent, node); rb_insert_color(&this->node, &hyp_shared_pfns); ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn); + if (ret) { + rb_erase(&this->node, &hyp_shared_pfns); + kfree(this); + } unlock: mutex_unlock(&hyp_shared_pfns_lock); -- cgit v1.2.3 From bd2618780ab4584a33ab1049338294a50690d149 Mon Sep 17 00:00:00 2001 From: "tabba@google.com" Date: Fri, 29 May 2026 13:17:54 +0100 Subject: KVM: arm64: Avoid host/hyp share desync on unshare hypercall failure unshare_pfn_hyp() erases the tracking node from hyp_shared_pfns and frees it before invoking __pkvm_host_unshare_hyp. If the hypercall fails (e.g. EL2 refcount still held, or page-state mismatch), the host loses its record while EL2 still holds the share, breaking later share/unshare attempts on the same pfn. Invoke the hypercall first; erase and free only on success. Document at the kvm_unshare_hyp() call site that the WARN_ON() is left non-fatal: a failed unshare leaks the page (it stays shared with the hypervisor) but breaks no isolation guarantee. 
Fixes: 52b28657ebd7 ("KVM: arm64: pkvm: Unshare guest structs during teardown") Reported-by: Sashiko (local):gemini-3.1-pro Suggested-by: Vincent Donnefort Signed-off-by: Fuad Tabba Reviewed-by: Vincent Donnefort Link: https://patch.msgid.link/20260529121755.2923500-3-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/mmu.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 0abf3a2d587b..c82d4ececab8 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -524,13 +524,17 @@ static int unshare_pfn_hyp(u64 pfn) goto unlock; } - this->count--; - if (this->count) + if (this->count > 1) { + this->count--; + goto unlock; + } + + ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn); + if (ret) goto unlock; rb_erase(&this->node, &hyp_shared_pfns); kfree(this); - ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn); unlock: mutex_unlock(&hyp_shared_pfns_lock); @@ -581,6 +585,11 @@ void kvm_unshare_hyp(void *from, void *to) end = PAGE_ALIGN(__pa(to)); for (cur = start; cur < end; cur += PAGE_SIZE) { pfn = __phys_to_pfn(cur); + /* + * A failed unshare leaks the page: it stays shared with the + * hypervisor and is no longer reusable for pKVM. No isolation + * guarantee is broken, and this is not expected in practice. + */ WARN_ON(unshare_pfn_hyp(pfn)); } } -- cgit v1.2.3 From f4411f9308c0187c211577b7c489545b0bdae455 Mon Sep 17 00:00:00 2001 From: "tabba@google.com" Date: Fri, 29 May 2026 13:17:55 +0100 Subject: KVM: arm64: Roll back partial shares on kvm_share_hyp() failure kvm_share_hyp() shares a range one page at a time. If share_pfn_hyp() fails partway through, the pages already shared by this call are left shared, while the caller treats the whole range as failed and never unshares them. Unshare those pages before returning the error. 
If an unshare itself fails the page is leaked: it stays shared with the hypervisor and is no longer reusable for pKVM, but no isolation guarantee is broken, so WARN and continue. Not expected in practice. Fixes: a83e2191b7f1 ("KVM: arm64: pkvm: Refcount the pages shared with EL2") Suggested-by: Vincent Donnefort Signed-off-by: Fuad Tabba Reviewed-by: Vincent Donnefort Link: https://patch.msgid.link/20260529121755.2923500-4-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/mmu.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index c82d4ececab8..f18b287c98b6 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -544,8 +544,8 @@ unlock: int kvm_share_hyp(void *from, void *to) { phys_addr_t start, end, cur; + int ret = 0; u64 pfn; - int ret; if (is_kernel_in_hyp_mode()) return 0; @@ -567,10 +567,24 @@ int kvm_share_hyp(void *from, void *to) pfn = __phys_to_pfn(cur); ret = share_pfn_hyp(pfn); if (ret) - return ret; + break; } - return 0; + if (!ret) + return 0; + + /* + * Roll back the pages shared by this call. A failed unshare leaks + * the page (it stays shared with the hypervisor and is no longer + * reusable for pKVM) but breaks no isolation guarantee, so warn and + * continue. Not expected in practice. + */ + for (end = cur, cur = start; cur < end; cur += PAGE_SIZE) { + pfn = __phys_to_pfn(cur); + WARN_ON(unshare_pfn_hyp(pfn)); + } + + return ret; } void kvm_unshare_hyp(void *from, void *to) -- cgit v1.2.3 From 3190bd7d36d71ab595409fd116e80928919b5bd4 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 3 Jun 2026 12:03:12 +0100 Subject: KVM: arm64: Set a Linux errno on SMCCC error in kvm_call_hyp_nvhe() If kvm_call_hyp_nvhe() fails with an SMCCC error code, we WARN(). However, the returned value isn't initialized and the caller might get garbage or 0 which is likely to be interpreted as success. 
Set a default -EOPNOTSUPP error value, ensuring all callers get the message when hypercalls fail. Signed-off-by: Vincent Donnefort Acked-by: Will Deacon Reviewed-by: Fuad Tabba Link: https://patch.msgid.link/20260603110312.2909844-1-vdonnefort@google.com [maz: changed error value to -EOPNOTSUPP as suggested by Will, tidied up change log] Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 65eead8362e0..9221d7dc331e 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1273,13 +1273,14 @@ void kvm_arm_resume_guest(struct kvm *kvm); #define vcpu_has_run_once(vcpu) (!!READ_ONCE((vcpu)->pid)) #ifndef __KVM_NVHE_HYPERVISOR__ -#define kvm_call_hyp_nvhe(f, ...) \ +#define kvm_call_hyp_nvhe(f, ...) \ ({ \ struct arm_smccc_res res; \ \ arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(f), \ ##__VA_ARGS__, &res); \ - WARN_ON(res.a0 != SMCCC_RET_SUCCESS); \ + if (WARN_ON(res.a0 != SMCCC_RET_SUCCESS)) \ + res.a1 = -EOPNOTSUPP; \ \ res.a1; \ }) -- cgit v1.2.3 From 6bef47288ce1cb8302c84753164b8f8f6d63e0b3 Mon Sep 17 00:00:00 2001 From: Wei-Lin Chang Date: Fri, 5 Jun 2026 19:52:55 +0100 Subject: KVM: arm64: Fix block mapping validity check in stage-1 walker For the 64K granule size, FEAT_LPA determines whether a level 1 mapping is allowed. Using the result of has_52bit_pa() is too restrictive, as it also checks the selected output address size in TCR.(I)PS. Fix it by only checking FEAT_LPA. 
Fixes: 5da3a3b27a01 ("KVM: arm64: Expand valid block mappings to FEAT_LPA/LPA2 support") Signed-off-by: Wei-Lin Chang Link: https://patch.msgid.link/20260605185255.2431996-1-weilin.chang@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/at.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 4d4285e60fce..7663df5e03b7 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -495,15 +495,18 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, /* Block mapping, check the validity of the level */ if (!(desc & BIT(1))) { bool valid_block = false; + bool lpa = kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, PARANGE, 52); switch (BIT(wi->pgshift)) { case SZ_4K: valid_block = level == 1 || level == 2 || (wi->pa52bit && level == 0); break; case SZ_16K: - case SZ_64K: valid_block = level == 2 || (wi->pa52bit && level == 1); break; + case SZ_64K: + valid_block = level == 2 || (lpa && level == 1); + break; } if (!valid_block) -- cgit v1.2.3 From 4be6cbeb93d26994bd1827ddbce391e3c4395c8f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 7 Jun 2026 18:57:45 +0100 Subject: KVM: arm64: nv: Avoid dereferencing NULL VNCR pseudo-TLB VNCR TLB invalidation occurs from MMU notifiers or TLBI instructions, and either can race against a vcpu not being onlined yet (no pseudo-TLB allocated). Similarly, the TLB might be invalid, and the invalidation should be skipped in this case. Both kvm_invalidate_vncr_ipa() and kvm_invalidate_vncr_va() are expected to perform the same checks, except that the latter doesn't check for the allocation and blindly dereferences the pointer. Solve this by introducing a new iterator built on top of the usual kvm_for_each_vcpu() that checks for both of the above conditions, and convert the two users to it. 
Reported-by: Hyunwoo Kim Link: https://lore.kernel.org/r/aiUvSbrWndQeUPc8@v4bel Fixes: 4ffa72ad8f37 ("KVM: arm64: nv: Add S1 TLB invalidation primitive for VNCR_EL2") Cc: stable@vger.kernel.org Reviewed-by: Oliver Upton Link: https://patch.msgid.link/20260607175745.297793-1-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/nested.c | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index f8d3f3a72328..690b8e856416 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -908,9 +908,21 @@ static void invalidate_vncr(struct vncr_tlb *vt) clear_fixmap(vncr_fixmap(vt->cpu)); } +/* + * VNCR TLB invalidation occurs from MMU notifiers or TLBI instructions, and + * either can race against a vcpu not being onlined yet (no pseudo-TLB + * allocated). Similarly, the TLB might be invalid. Skip those, as they + * obviously don't participate in the invalidation at this stage. + */ +#define kvm_for_each_vncr_tlb(idx, vcpup, tlbp, kvm) \ + kvm_for_each_vcpu(idx, vcpup, kvm) \ + if (((tlbp) = vcpup->arch.vncr_tlb) && \ + (tlbp)->valid) + static void kvm_invalidate_vncr_ipa(struct kvm *kvm, u64 start, u64 end) { struct kvm_vcpu *vcpu; + struct vncr_tlb *vt; unsigned long i; lockdep_assert_held_write(&kvm->mmu_lock); @@ -918,24 +930,9 @@ static void kvm_invalidate_vncr_ipa(struct kvm *kvm, u64 start, u64 end) if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY)) return; - kvm_for_each_vcpu(i, vcpu, kvm) { - struct vncr_tlb *vt = vcpu->arch.vncr_tlb; + kvm_for_each_vncr_tlb(i, vcpu, vt, kvm) { u64 ipa_start, ipa_end, ipa_size; - /* - * Careful here: We end-up here from an MMU notifier, - * and this can race against a vcpu not being onlined - * yet, without the pseudo-TLB being allocated. - * - * Skip those, as they obviously don't participate in - * the invalidation at this stage. 
- */ - if (!vt) - continue; - - if (!vt->valid) - continue; - ipa_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift, vt->wr.level)); ipa_start = vt->wr.pa & ~(ipa_size - 1); @@ -965,17 +962,14 @@ static void invalidate_vncr_va(struct kvm *kvm, struct s1e2_tlbi_scope *scope) { struct kvm_vcpu *vcpu; + struct vncr_tlb *vt; unsigned long i; lockdep_assert_held_write(&kvm->mmu_lock); - kvm_for_each_vcpu(i, vcpu, kvm) { - struct vncr_tlb *vt = vcpu->arch.vncr_tlb; + kvm_for_each_vncr_tlb(i, vcpu, vt, kvm) { u64 va_start, va_end, va_size; - if (!vt->valid) - continue; - va_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift, vt->wr.level)); va_start = vt->gva & ~(va_size - 1); -- cgit v1.2.3 From 4b54e2374d1bd82031cef9784e125a7100a32499 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 8 Jun 2026 09:11:08 +0100 Subject: KVM: arm64: nv: Hold kvm->mmu_lock while initialising vcpu->arch.vncr_tlb Sashiko reports that there is a race between initialising vncr_tlb and making use of it, as we don't hold the mmu_lock at this point. Additionally, it identifies a memory leak, should userspace repeatedly invoke the KVM_RUN ioctl after a failure of kvm_arch_vcpu_run_pid_change(), as we assign vncr_tlb blindly on first run, irrespective of prior allocations. Slap the two bugs in one go by taking the kvm->mmu_lock on assigning vncr_tlb, preventing the race for good, and by checking that vncr_tlb is indeed NULL prior to allocation. 
Reported-by: Sashiko Link: https://lore.kernel.org/r/20260607180815.85FBC1F00893@smtp.kernel.org Reviewed-by: Oliver Upton Link: https://patch.msgid.link/20260608081108.2244133-1-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/nested.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 690b8e856416..326adf404d98 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1253,8 +1253,20 @@ int kvm_vcpu_allocate_vncr_tlb(struct kvm_vcpu *vcpu) if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY)) return 0; - vcpu->arch.vncr_tlb = kzalloc_obj(*vcpu->arch.vncr_tlb, - GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.vncr_tlb) { + struct vncr_tlb *vt = kzalloc_obj(*vcpu->arch.vncr_tlb, + GFP_KERNEL_ACCOUNT); + + /* + * Taking the lock on assignment ensures that the TLB is + * seen as initialised when following the pointer (release + * semantics of the unlock), and avoids having acquires on + * each user which already take the lock. + */ + scoped_guard(write_lock, &vcpu->kvm->mmu_lock) + vcpu->arch.vncr_tlb = vt; + } + if (!vcpu->arch.vncr_tlb) return -ENOMEM; -- cgit v1.2.3 From 650c4704b9e9ca7c97b29fdaac0f140aa0e8157f Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Thu, 4 Jun 2026 15:51:47 +0800 Subject: KVM: arm64: vgic-its: Make ABI commit helpers return void The return values of vgic_its_set_abi() and vgic_its_commit_v0() are always 0 and do not carry useful error information. Simplify by changing them to void. 
Suggested-by: Oliver Upton Signed-off-by: Jackie Liu Reviewed-by: Oliver Upton Reviewed-by: Eric Auger Link: https://patch.msgid.link/20260604075147.53299-1-liu.yun@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-its.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 2ea9f1c7ebcd..67d107e9a77d 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -27,7 +27,7 @@ static struct kvm_device_ops kvm_arm_vgic_its_ops; static int vgic_its_save_tables_v0(struct vgic_its *its); static int vgic_its_restore_tables_v0(struct vgic_its *its); -static int vgic_its_commit_v0(struct vgic_its *its); +static void vgic_its_commit_v0(struct vgic_its *its); static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, struct kvm_vcpu *filter_vcpu, bool needs_inv); @@ -168,7 +168,7 @@ struct vgic_its_abi { int ite_esz; int (*save_tables)(struct vgic_its *its); int (*restore_tables)(struct vgic_its *its); - int (*commit)(struct vgic_its *its); + void (*commit)(struct vgic_its *its); }; #define ABI_0_ESZ 8 @@ -192,13 +192,13 @@ inline const struct vgic_its_abi *vgic_its_get_abi(struct vgic_its *its) return &its_table_abi_versions[its->abi_rev]; } -static int vgic_its_set_abi(struct vgic_its *its, u32 rev) +static void vgic_its_set_abi(struct vgic_its *its, u32 rev) { const struct vgic_its_abi *abi; its->abi_rev = rev; abi = vgic_its_get_abi(its); - return abi->commit(its); + abi->commit(its); } /* @@ -472,7 +472,8 @@ static int vgic_mmio_uaccess_write_its_iidr(struct kvm *kvm, if (rev >= NR_ITS_ABIS) return -EINVAL; - return vgic_its_set_abi(its, rev); + vgic_its_set_abi(its, rev); + return 0; } static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm, @@ -1888,14 +1889,11 @@ static int vgic_its_create(struct kvm_device *dev, u32 type) its->baser_coll_table = INITIAL_BASER_VALUE | ((u64)GITS_BASER_TYPE_COLLECTION << 
GITS_BASER_TYPE_SHIFT); dev->kvm->arch.vgic.propbaser = INITIAL_PROPBASER_VALUE; - dev->private = its; - ret = vgic_its_set_abi(its, NR_ITS_ABIS - 1); - + vgic_its_set_abi(its, NR_ITS_ABIS - 1); mutex_unlock(&dev->kvm->arch.config_lock); - - return ret; + return 0; } static void vgic_its_destroy(struct kvm_device *kvm_dev) @@ -2606,7 +2604,7 @@ static int vgic_its_restore_tables_v0(struct vgic_its *its) return ret; } -static int vgic_its_commit_v0(struct vgic_its *its) +static void vgic_its_commit_v0(struct vgic_its *its) { const struct vgic_its_abi *abi; @@ -2619,7 +2617,6 @@ static int vgic_its_commit_v0(struct vgic_its *its) its->baser_device_table |= (GIC_ENCODE_SZ(abi->dte_esz, 5) << GITS_BASER_ENTRY_SIZE_SHIFT); - return 0; } static void vgic_its_reset(struct kvm *kvm, struct vgic_its *its) -- cgit v1.2.3