From 4b24910c056995c0c0fa7c1b142696443b05fd8e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:16:50 -0800 Subject: KVM: Add a simplified wrapper for registering perf callbacks Add a parameter-less API for registering perf callbacks in anticipation of introducing another x86-only parameter for handling mediated PMU PMIs. No functional change intended. Acked-by: Anup Patel Tested-by: Xudong Hao Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-15-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d93f75b05ae2..8e410d1a63df 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1749,10 +1749,17 @@ static inline bool kvm_arch_intc_initialized(struct kvm *kvm) #ifdef CONFIG_GUEST_PERF_EVENTS unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu); -void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)); +void __kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void), + void (*mediated_pmi_handler)(void)); + +static inline void kvm_register_perf_callbacks(void) +{ + __kvm_register_perf_callbacks(NULL, NULL); +} + void kvm_unregister_perf_callbacks(void); #else -static inline void kvm_register_perf_callbacks(void *ign) {} +static inline void kvm_register_perf_callbacks(void) {} static inline void kvm_unregister_perf_callbacks(void) {} #endif /* CONFIG_GUEST_PERF_EVENTS */ -- cgit v1.2.3 From 70b02809ded96ec790721cd5061e20b63b622310 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Nov 2025 14:34:41 -0800 Subject: KVM: x86: Mark vmcs12 pages as dirty if and only if they're mapped Mark vmcs12 pages as dirty (in KVM's dirty log bitmap) if and only if the page is mapped, i.e. if the page is actually "active" in vmcs02. For some pages, KVM simply disables the associated VMCS control if the vmcs12 page is unreachable, i.e. it's possible for nested VM-Enter to succeed with a "bad" vmcs12 page. Link: https://patch.msgid.link/20251121223444.355422-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 15 +++------------ include/linux/kvm_host.h | 9 ++++++++- 2 files changed, 11 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 6137e5307d0f..72fcb1228af4 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3984,23 +3984,14 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - gfn_t gfn; + struct vcpu_vmx *vmx = to_vmx(vcpu); /* * Don't need to mark the APIC access page dirty; it is never * written to by the CPU during APIC virtualization. 
*/ - - if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { - gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; - kvm_vcpu_mark_page_dirty(vcpu, gfn); - } - - if (nested_cpu_has_posted_intr(vmcs12)) { - gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; - kvm_vcpu_mark_page_dirty(vcpu, gfn); - } + kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.virtual_apic_map); + kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.pi_desc_map); } static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d93f75b05ae2..536d05e2726f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1381,6 +1381,7 @@ bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn); void mark_page_dirty_in_slot(struct kvm *kvm, const struct kvm_memory_slot *memslot, gfn_t gfn); void mark_page_dirty(struct kvm *kvm, gfn_t gfn); +void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map, bool writable); @@ -1398,6 +1399,13 @@ static inline int kvm_vcpu_map_readonly(struct kvm_vcpu *vcpu, gpa_t gpa, return __kvm_vcpu_map(vcpu, gpa, map, false); } +static inline void kvm_vcpu_map_mark_dirty(struct kvm_vcpu *vcpu, + struct kvm_host_map *map) +{ + if (kvm_vcpu_mapped(map)) + kvm_vcpu_mark_page_dirty(vcpu, map->gfn); +} + unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable); int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, @@ -1410,7 +1418,6 @@ int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, const void *data int offset, int len); int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, unsigned long len); -void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); /** * kvm_gpc_init - initialize gfn_to_pfn_cache. -- cgit v1.2.3 From d7507a94a07202234236d7f94bed6015ca645ae6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 13:13:44 -0800 Subject: KVM: SVM: Treat exit_code as an unsigned 64-bit value through all of KVM Fix KVM's long-standing buggy handling of SVM's exit_code as a 32-bit value. Per the APM and Xen commit d1bd157fbc ("Big merge the HVM full-virtualisation abstractions.") (which is arguably more trustworthy than KVM), offset 0x70 is a single 64-bit value: 070h 63:0 EXITCODE Track exit_code as a single u64 to prevent reintroducing bugs where KVM neglects to correctly set bits 63:32. 
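As a stand-alone illustration of the bug class being fixed (hypothetical
userspace code, not KVM's), a split 32-bit field silently truncates 64-bit
exit codes such as SVM_EXIT_ERR (-1ull) unless every writer remembers the
high half:

	/* Sketch only: why storing a 64-bit exit code in a u32 pair is fragile. */
	#include <inttypes.h>
	#include <stdint.h>
	#include <stdio.h>

	struct split_ctrl {
		uint32_t exit_code;	/* bits 31:0 */
		uint32_t exit_code_hi;	/* bits 63:32, easy to forget */
	};

	int main(void)
	{
		struct split_ctrl c = { 0 };
		uint64_t code = ~0ull;	/* e.g. SVM_EXIT_ERR */

		c.exit_code = code;	/* bits 63:32 are dropped here */
		/* A reader stitching the halves sees 0xffffffff, not ~0ull. */
		printf("%#" PRIx64 "\n",
		       ((uint64_t)c.exit_code_hi << 32) | c.exit_code);
		return 0;
	}

With a single u64 field, the assignment above cannot lose bits, which is
exactly the invariant the patch enforces.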
Fixes: 6aa8b732ca01 ("[PATCH] kvm: userspace interface") Cc: Jim Mattson Cc: Yosry Ahmed Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251230211347.4099600-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/svm.h | 3 +- arch/x86/include/uapi/asm/svm.h | 32 +++++++++---------- arch/x86/kvm/svm/hyperv.c | 1 - arch/x86/kvm/svm/nested.c | 13 ++------ arch/x86/kvm/svm/sev.c | 36 ++++++++-------------- arch/x86/kvm/svm/svm.c | 7 ++--- arch/x86/kvm/svm/svm.h | 4 +-- arch/x86/kvm/trace.h | 6 ++-- include/hyperv/hvgdk.h | 2 +- tools/testing/selftests/kvm/include/x86/svm.h | 3 +- .../kvm/x86/svm_nested_soft_inject_test.c | 4 +-- 11 files changed, 42 insertions(+), 69 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 50ece197c98a..edde36097ddc 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -137,8 +137,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u32 int_vector; u32 int_state; u8 reserved_3[4]; - u32 exit_code; - u32 exit_code_hi; + u64 exit_code; u64 exit_info_1; u64 exit_info_2; u32 exit_int_info; diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index 650e3256ea7d..010a45c9f614 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -103,38 +103,38 @@ #define SVM_EXIT_VMGEXIT 0x403 /* SEV-ES software-defined VMGEXIT events */ -#define SVM_VMGEXIT_MMIO_READ 0x80000001 -#define SVM_VMGEXIT_MMIO_WRITE 0x80000002 -#define SVM_VMGEXIT_NMI_COMPLETE 0x80000003 -#define SVM_VMGEXIT_AP_HLT_LOOP 0x80000004 -#define SVM_VMGEXIT_AP_JUMP_TABLE 0x80000005 +#define SVM_VMGEXIT_MMIO_READ 0x80000001ull +#define SVM_VMGEXIT_MMIO_WRITE 0x80000002ull +#define SVM_VMGEXIT_NMI_COMPLETE 0x80000003ull +#define SVM_VMGEXIT_AP_HLT_LOOP 0x80000004ull +#define SVM_VMGEXIT_AP_JUMP_TABLE 0x80000005ull #define SVM_VMGEXIT_SET_AP_JUMP_TABLE 0 #define SVM_VMGEXIT_GET_AP_JUMP_TABLE 1 -#define SVM_VMGEXIT_PSC 0x80000010 -#define SVM_VMGEXIT_GUEST_REQUEST 0x80000011 -#define SVM_VMGEXIT_EXT_GUEST_REQUEST 0x80000012 -#define SVM_VMGEXIT_AP_CREATION 0x80000013 +#define SVM_VMGEXIT_PSC 0x80000010ull +#define SVM_VMGEXIT_GUEST_REQUEST 0x80000011ull +#define SVM_VMGEXIT_EXT_GUEST_REQUEST 0x80000012ull +#define SVM_VMGEXIT_AP_CREATION 0x80000013ull #define SVM_VMGEXIT_AP_CREATE_ON_INIT 0 #define SVM_VMGEXIT_AP_CREATE 1 #define SVM_VMGEXIT_AP_DESTROY 2 -#define SVM_VMGEXIT_SNP_RUN_VMPL 0x80000018 -#define SVM_VMGEXIT_SAVIC 0x8000001a +#define SVM_VMGEXIT_SNP_RUN_VMPL 0x80000018ull +#define SVM_VMGEXIT_SAVIC 0x8000001aull #define SVM_VMGEXIT_SAVIC_REGISTER_GPA 0 #define SVM_VMGEXIT_SAVIC_UNREGISTER_GPA 1 #define SVM_VMGEXIT_SAVIC_SELF_GPA ~0ULL -#define SVM_VMGEXIT_HV_FEATURES 0x8000fffd -#define SVM_VMGEXIT_TERM_REQUEST 0x8000fffe +#define SVM_VMGEXIT_HV_FEATURES 0x8000fffdull +#define SVM_VMGEXIT_TERM_REQUEST 0x8000fffeull #define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code) \ /* SW_EXITINFO1[3:0] */ \ (((((u64)reason_set) & 0xf)) | \ /* SW_EXITINFO1[11:4] */ \ ((((u64)reason_code) & 0xff) << 4)) -#define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff +#define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffffull /* Exit code reserved for hypervisor/software use */ -#define SVM_EXIT_SW 0xf0000000 +#define SVM_EXIT_SW 0xf0000000ull -#define SVM_EXIT_ERR -1 +#define SVM_EXIT_ERR -1ull #define SVM_EXIT_REASONS \ { SVM_EXIT_READ_CR0, "read_cr0" }, \ diff --git a/arch/x86/kvm/svm/hyperv.c b/arch/x86/kvm/svm/hyperv.c index 088f6429b24c..3ec580d687f5 
100644 --- a/arch/x86/kvm/svm/hyperv.c +++ b/arch/x86/kvm/svm/hyperv.c @@ -11,7 +11,6 @@ void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.exit_code = HV_SVM_EXITCODE_ENL; - svm->vmcb->control.exit_code_hi = 0; svm->vmcb->control.exit_info_1 = HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH; svm->vmcb->control.exit_info_2 = 0; nested_svm_vmexit(svm); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 666b5a36c15d..5aa0512e09c9 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -45,7 +45,6 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, * correctly fill in the high bits of exit_info_1. */ vmcb->control.exit_code = SVM_EXIT_NPF; - vmcb->control.exit_code_hi = 0; vmcb->control.exit_info_1 = (1ULL << 32); vmcb->control.exit_info_2 = fault->address; } @@ -441,7 +440,6 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu, to->int_vector = from->int_vector; to->int_state = from->int_state; to->exit_code = from->exit_code; - to->exit_code_hi = from->exit_code_hi; to->exit_info_1 = from->exit_info_1; to->exit_info_2 = from->exit_info_2; to->exit_int_info = from->exit_int_info; @@ -747,8 +745,8 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, enter_guest_mode(vcpu); /* - * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2, - * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes. + * Filled at exit: exit_code, exit_info_1, exit_info_2, exit_int_info, + * exit_int_info_err, next_rip, insn_len, insn_bytes. */ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VGIF) && @@ -1018,7 +1016,6 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) if (!nested_vmcb_check_save(vcpu) || !nested_vmcb_check_controls(vcpu)) { vmcb12->control.exit_code = SVM_EXIT_ERR; - vmcb12->control.exit_code_hi = -1u; vmcb12->control.exit_info_1 = 0; vmcb12->control.exit_info_2 = 0; goto out; @@ -1051,7 +1048,6 @@ out_exit_err: svm->soft_int_injected = false; svm->vmcb->control.exit_code = SVM_EXIT_ERR; - svm->vmcb->control.exit_code_hi = -1u; svm->vmcb->control.exit_info_1 = 0; svm->vmcb->control.exit_info_2 = 0; @@ -1163,7 +1159,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.int_state = vmcb02->control.int_state; vmcb12->control.exit_code = vmcb02->control.exit_code; - vmcb12->control.exit_code_hi = vmcb02->control.exit_code_hi; vmcb12->control.exit_info_1 = vmcb02->control.exit_info_1; vmcb12->control.exit_info_2 = vmcb02->control.exit_info_2; @@ -1460,7 +1455,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm) static int nested_svm_intercept(struct vcpu_svm *svm) { - u32 exit_code = svm->vmcb->control.exit_code; + u64 exit_code = svm->vmcb->control.exit_code; int vmexit = NESTED_EXIT_HOST; if (svm_is_vmrun_failure(exit_code)) @@ -1532,7 +1527,6 @@ static void nested_svm_inject_exception_vmexit(struct kvm_vcpu *vcpu) struct vmcb *vmcb = svm->vmcb; vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + ex->vector; - vmcb->control.exit_code_hi = 0; if (ex->has_error_code) vmcb->control.exit_info_1 = ex->error_code; @@ -1708,7 +1702,6 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst, dst->int_vector = from->int_vector; dst->int_state = from->int_state; dst->exit_code = from->exit_code; - dst->exit_code_hi = from->exit_code_hi; dst->exit_info_1 = from->exit_info_1; dst->exit_info_2 = from->exit_info_2; dst->exit_int_info = from->exit_int_info; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 
28150506b18c..f67525007089 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3270,11 +3270,6 @@ skip_vmsa_free: kvfree(svm->sev_es.ghcb_sa); } -static u64 kvm_get_cached_sw_exit_code(struct vmcb_control_area *control) -{ - return (((u64)control->exit_code_hi) << 32) | control->exit_code; -} - static void dump_ghcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -3296,7 +3291,7 @@ static void dump_ghcb(struct vcpu_svm *svm) */ pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code", - kvm_get_cached_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm)); + control->exit_code, kvm_ghcb_sw_exit_code_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1", control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2", @@ -3330,7 +3325,6 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) struct vmcb_control_area *control = &svm->vmcb->control; struct kvm_vcpu *vcpu = &svm->vcpu; struct ghcb *ghcb = svm->sev_es.ghcb; - u64 exit_code; /* * The GHCB protocol so far allows for the following data @@ -3364,9 +3358,7 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) __kvm_emulate_msr_write(vcpu, MSR_IA32_XSS, kvm_ghcb_get_xss(svm)); /* Copy the GHCB exit information into the VMCB fields */ - exit_code = kvm_ghcb_get_sw_exit_code(svm); - control->exit_code = lower_32_bits(exit_code); - control->exit_code_hi = upper_32_bits(exit_code); + control->exit_code = kvm_ghcb_get_sw_exit_code(svm); control->exit_info_1 = kvm_ghcb_get_sw_exit_info_1(svm); control->exit_info_2 = kvm_ghcb_get_sw_exit_info_2(svm); svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm); @@ -3379,15 +3371,8 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; struct kvm_vcpu *vcpu = &svm->vcpu; - u64 exit_code; u64 reason; - /* - * Retrieve the exit code now even though it may not be marked valid - * as it could help with debugging. - */ - exit_code = kvm_get_cached_sw_exit_code(control); - /* Only GHCB Usage code 0 is supported */ if (svm->sev_es.ghcb->ghcb_usage) { reason = GHCB_ERR_INVALID_USAGE; @@ -3401,7 +3386,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) !kvm_ghcb_sw_exit_info_2_is_valid(svm)) goto vmgexit_err; - switch (exit_code) { + switch (control->exit_code) { case SVM_EXIT_READ_DR7: break; case SVM_EXIT_WRITE_DR7: @@ -3502,15 +3487,19 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) return 0; vmgexit_err: + /* + * Print the exit code even though it may not be marked valid as it + * could help with debugging. 
+ */ if (reason == GHCB_ERR_INVALID_USAGE) { vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n", svm->sev_es.ghcb->ghcb_usage); } else if (reason == GHCB_ERR_INVALID_EVENT) { vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n", - exit_code); + control->exit_code); } else { vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n", - exit_code); + control->exit_code); dump_ghcb(svm); } @@ -4349,7 +4338,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb_control_area *control = &svm->vmcb->control; - u64 ghcb_gpa, exit_code; + u64 ghcb_gpa; int ret; /* Validate the GHCB */ @@ -4391,8 +4380,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) svm_vmgexit_success(svm, 0); - exit_code = kvm_get_cached_sw_exit_code(control); - switch (exit_code) { + switch (control->exit_code) { case SVM_VMGEXIT_MMIO_READ: ret = setup_vmgexit_scratch(svm, true, control->exit_info_2); if (ret) @@ -4484,7 +4472,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = -EINVAL; break; default: - ret = svm_invoke_exit_handler(vcpu, exit_code); + ret = svm_invoke_exit_handler(vcpu, control->exit_code); } return ret; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 3caf7a21679f..a28cd61d87ea 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2466,7 +2466,6 @@ static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu, if (cr0 ^ val) { svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; - svm->vmcb->control.exit_code_hi = 0; ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE); } @@ -3299,7 +3298,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl); pr_err("%-20s%08x\n", "int_vector:", control->int_vector); pr_err("%-20s%08x\n", "int_state:", control->int_state); - pr_err("%-20s%08x\n", "exit_code:", control->exit_code); + pr_err("%-20s%016llx\n", "exit_code:", control->exit_code); pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1); pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2); pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info); @@ -3549,7 +3548,6 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_run *kvm_run = vcpu->run; - u32 exit_code = svm->vmcb->control.exit_code; /* SEV-ES guests must use the CR write traps to track CR registers. */ if (!sev_es_guest(vcpu->kvm)) { @@ -3585,7 +3583,7 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (exit_fastpath != EXIT_FASTPATH_NONE) return 1; - return svm_invoke_exit_handler(vcpu, exit_code); + return svm_invoke_exit_handler(vcpu, svm->vmcb->control.exit_code); } static int pre_svm_run(struct kvm_vcpu *vcpu) @@ -4670,7 +4668,6 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, if (static_cpu_has(X86_FEATURE_NRIPS)) vmcb->control.next_rip = info->next_rip; vmcb->control.exit_code = icpt_info.exit_code; - vmcb->control.exit_code_hi = 0; vmexit = nested_svm_exit_handled(svm); ret = (vmexit == NESTED_EXIT_DONE) ? 
X86EMUL_INTERCEPTED diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 3360ac36e071..a22433680c73 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -160,8 +160,7 @@ struct vmcb_ctrl_area_cached { u32 int_ctl; u32 int_vector; u32 int_state; - u32 exit_code; - u32 exit_code_hi; + u64 exit_code; u64 exit_info_1; u64 exit_info_2; u32 exit_int_info; @@ -787,7 +786,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm); static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code) { svm->vmcb->control.exit_code = exit_code; - svm->vmcb->control.exit_code_hi = 0; svm->vmcb->control.exit_info_1 = 0; svm->vmcb->control.exit_info_2 = 0; return nested_svm_vmexit(svm); diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index e79bc9cb7162..e7fdbe9efc90 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -383,10 +383,10 @@ TRACE_EVENT(kvm_apic, #define kvm_print_exit_reason(exit_reason, isa) \ (isa == KVM_ISA_VMX) ? \ __print_symbolic(exit_reason & 0xffff, VMX_EXIT_REASONS) : \ - __print_symbolic(exit_reason, SVM_EXIT_REASONS), \ + __print_symbolic_u64(exit_reason, SVM_EXIT_REASONS), \ (isa == KVM_ISA_VMX && exit_reason & ~0xffff) ? " " : "", \ (isa == KVM_ISA_VMX) ? \ - __print_flags(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : "" + __print_flags_u64(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : "" #define TRACE_EVENT_KVM_EXIT(name) \ TRACE_EVENT(name, \ @@ -781,7 +781,7 @@ TRACE_EVENT_KVM_EXIT(kvm_nested_vmexit); * Tracepoint for #VMEXIT reinjected to the guest */ TRACE_EVENT(kvm_nested_vmexit_inject, - TP_PROTO(__u32 exit_code, + TP_PROTO(__u64 exit_code, __u64 exit_info1, __u64 exit_info2, __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa), TP_ARGS(exit_code, exit_info1, exit_info2, diff --git a/include/hyperv/hvgdk.h b/include/hyperv/hvgdk.h index dd6d4939ea29..384c3f3ff4a5 100644 --- a/include/hyperv/hvgdk.h +++ b/include/hyperv/hvgdk.h @@ -281,7 +281,7 @@ struct hv_vmcb_enlightenments { #define HV_VMCB_NESTED_ENLIGHTENMENTS 31 /* Synthetic VM-Exit */ -#define HV_SVM_EXITCODE_ENL 0xf0000000 +#define HV_SVM_EXITCODE_ENL 0xf0000000ull #define HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH (1) /* VM_PARTITION_ASSIST_PAGE */ diff --git a/tools/testing/selftests/kvm/include/x86/svm.h b/tools/testing/selftests/kvm/include/x86/svm.h index 29cffd0a9181..10b30b38bb3f 100644 --- a/tools/testing/selftests/kvm/include/x86/svm.h +++ b/tools/testing/selftests/kvm/include/x86/svm.h @@ -92,8 +92,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u32 int_vector; u32 int_state; u8 reserved_3[4]; - u32 exit_code; - u32 exit_code_hi; + u64 exit_code; u64 exit_info_1; u64 exit_info_2; u32 exit_int_info; diff --git a/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c index 7b6481d6c0d3..4bd1655f9e6d 100644 --- a/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c +++ b/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c @@ -103,7 +103,7 @@ static void l1_guest_code(struct svm_test_data *svm, uint64_t is_nmi, uint64_t i run_guest(vmcb, svm->vmcb_gpa); __GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL, - "Expected VMMCAL #VMEXIT, got '0x%x', info1 = '0x%lx, info2 = '0x%lx'", + "Expected VMMCAL #VMEXIT, got '0x%lx', info1 = '0x%lx, info2 = '0x%lx'", vmcb->control.exit_code, vmcb->control.exit_info_1, vmcb->control.exit_info_2); @@ -133,7 +133,7 @@ static void l1_guest_code(struct svm_test_data *svm, uint64_t is_nmi, uint64_t i 
run_guest(vmcb, svm->vmcb_gpa); __GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_HLT, - "Expected HLT #VMEXIT, got '0x%x', info1 = '0x%lx, info2 = '0x%lx'", + "Expected HLT #VMEXIT, got '0x%lx', info1 = '0x%lx, info2 = '0x%lx'", vmcb->control.exit_code, vmcb->control.exit_info_1, vmcb->control.exit_info_2); -- cgit v1.2.3 From 6538b6221cc2feda415ca1946e66a5ef02dc6a0a Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 8 Jan 2026 15:46:18 -0600 Subject: KVM: guest_memfd: Remove partial hugepage handling from kvm_gmem_populate() kvm_gmem_populate(), and the associated post-populate callbacks, have some limited support for dealing with guests backed by hugepages by passing the order information along to each post-populate callback and iterating through the pages passed to kvm_gmem_populate() in hugepage-chunks. However, guest_memfd doesn't yet support hugepages, and in most cases additional changes in the kvm_gmem_populate() path would also be needed to actually allow for this functionality. This makes the existing code unnecessarily complex, and makes changes difficult to work through upstream due to theoretical impacts on hugepage support that can't be considered properly without an actual hugepage implementation to reference. So for now, remove what's there so changes for things like in-place conversion can be implemented/reviewed more efficiently. Suggested-by: Vishal Annapurve Co-developed-by: Vishal Annapurve Signed-off-by: Vishal Annapurve Tested-by: Vishal Annapurve Tested-by: Kai Huang Signed-off-by: Michael Roth Tested-by: Yan Zhao Reviewed-by: Yan Zhao Link: https://patch.msgid.link/20260108214622.1084057-3-michael.roth@amd.com [sean: check for !IS_ERR() before checking folio_order()] Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 94 +++++++++++++++++++----------------------------- arch/x86/kvm/vmx/tdx.c | 2 +- include/linux/kvm_host.h | 2 +- virt/kvm/guest_memfd.c | 30 +++++++++------- 4 files changed, 56 insertions(+), 72 deletions(-) (limited to 'include') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 261d9ef8631b..a70bd3f19e29 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2267,67 +2267,53 @@ struct sev_gmem_populate_args { int fw_error; }; -static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pfn, - void __user *src, int order, void *opaque) +static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, + void __user *src, void *opaque) { struct sev_gmem_populate_args *sev_populate_args = opaque; + struct sev_data_snp_launch_update fw_args = {0}; struct kvm_sev_info *sev = to_kvm_sev_info(kvm); - int n_private = 0, ret, i; - int npages = (1 << order); - gfn_t gfn; + bool assigned = false; + int level; + int ret; if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src)) return -EINVAL; - for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) { - struct sev_data_snp_launch_update fw_args = {0}; - bool assigned = false; - int level; - - ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level); - if (ret || assigned) { - pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n", - __func__, gfn, ret, assigned); - ret = ret ? -EINVAL : -EEXIST; - goto err; - } + ret = snp_lookup_rmpentry((u64)pfn, &assigned, &level); + if (ret || assigned) { + pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n", + __func__, gfn, ret, assigned); + ret = ret ? 
-EINVAL : -EEXIST; + goto out; + } - if (src) { - void *vaddr = kmap_local_pfn(pfn + i); + if (src) { + void *vaddr = kmap_local_pfn(pfn); - if (copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE)) { - kunmap_local(vaddr); - ret = -EFAULT; - goto err; - } + if (copy_from_user(vaddr, src, PAGE_SIZE)) { kunmap_local(vaddr); + ret = -EFAULT; + goto out; } - - ret = rmp_make_private(pfn + i, gfn << PAGE_SHIFT, PG_LEVEL_4K, - sev_get_asid(kvm), true); - if (ret) - goto err; - - n_private++; - - fw_args.gctx_paddr = __psp_pa(sev->snp_context); - fw_args.address = __sme_set(pfn_to_hpa(pfn + i)); - fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K); - fw_args.page_type = sev_populate_args->type; - - ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, - &fw_args, &sev_populate_args->fw_error); - if (ret) - goto fw_err; + kunmap_local(vaddr); } - return 0; + ret = rmp_make_private(pfn, gfn << PAGE_SHIFT, PG_LEVEL_4K, + sev_get_asid(kvm), true); + if (ret) + goto out; + + fw_args.gctx_paddr = __psp_pa(sev->snp_context); + fw_args.address = __sme_set(pfn_to_hpa(pfn)); + fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K); + fw_args.page_type = sev_populate_args->type; -fw_err: + ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, + &fw_args, &sev_populate_args->fw_error); /* * If the firmware command failed handle the reclaim and cleanup of that - * PFN specially vs. prior pages which can be cleaned up below without - * needing to reclaim in advance. + * PFN before reporting an error. * * Additionally, when invalid CPUID function entries are detected, * firmware writes the expected values into the page and leaves it @@ -2337,26 +2323,20 @@ fw_err: * information to provide information on which CPUID leaves/fields * failed CPUID validation. */ - if (!snp_page_reclaim(kvm, pfn + i) && + if (ret && !snp_page_reclaim(kvm, pfn) && sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID && sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) { - void *vaddr = kmap_local_pfn(pfn + i); + void *vaddr = kmap_local_pfn(pfn); - if (copy_to_user(src + i * PAGE_SIZE, vaddr, PAGE_SIZE)) + if (copy_to_user(src, vaddr, PAGE_SIZE)) pr_debug("Failed to write CPUID page back to userspace\n"); kunmap_local(vaddr); } - /* pfn + i is hypervisor-owned now, so skip below cleanup for it. */ - n_private--; - -err: - pr_debug("%s: exiting with error ret %d (fw_error %d), restoring %d gmem PFNs to shared.\n", - __func__, ret, sev_populate_args->fw_error, n_private); - for (i = 0; i < n_private; i++) - kvm_rmp_make_shared(kvm, pfn + i, PG_LEVEL_4K); - +out: + pr_debug("%s: exiting with return code %d (fw_error %d)\n", + __func__, ret, sev_populate_args->fw_error); return ret; } diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 2d7a4d52ccfb..4fb042ce8ed1 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -3118,7 +3118,7 @@ struct tdx_gmem_post_populate_arg { }; static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, - void __user *src, int order, void *_arg) + void __user *src, void *_arg) { struct tdx_gmem_post_populate_arg *arg = _arg; struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d93f75b05ae2..1d0cee72e560 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2581,7 +2581,7 @@ int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord * Returns the number of pages that were populated. 
*/ typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, - void __user *src, int order, void *opaque); + void __user *src, void *opaque); long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages, kvm_gmem_populate_cb post_populate, void *opaque); diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index fdaea3422c30..24eb33c7948d 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -151,6 +151,15 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) mapping_gfp_mask(inode->i_mapping), policy); mpol_cond_put(policy); + /* + * External interfaces like kvm_gmem_get_pfn() support dealing + * with hugepages to a degree, but internally, guest_memfd currently + * assumes that all folios are order-0 and handling would need + * to be updated for anything otherwise (e.g. page-clearing + * operations). + */ + WARN_ON_ONCE(!IS_ERR(folio) && folio_order(folio)); + return folio; } @@ -829,7 +838,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long struct kvm_memory_slot *slot; void __user *p; - int ret = 0, max_order; + int ret = 0; long i; lockdep_assert_held(&kvm->slots_lock); @@ -848,7 +857,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long filemap_invalidate_lock(file->f_mapping); npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages); - for (i = 0; i < npages; i += (1 << max_order)) { + for (i = 0; i < npages; i++) { struct folio *folio; gfn_t gfn = start_gfn + i; pgoff_t index = kvm_gmem_get_index(slot, gfn); @@ -860,7 +869,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long break; } - folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order); + folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, NULL); if (IS_ERR(folio)) { ret = PTR_ERR(folio); break; @@ -874,20 +883,15 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long } folio_unlock(folio); - WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) || - (npages - i) < (1 << max_order)); ret = -EINVAL; - while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order), - KVM_MEMORY_ATTRIBUTE_PRIVATE, - KVM_MEMORY_ATTRIBUTE_PRIVATE)) { - if (!max_order) - goto put_folio_and_exit; - max_order--; - } + if (!kvm_range_has_memory_attributes(kvm, gfn, gfn + 1, + KVM_MEMORY_ATTRIBUTE_PRIVATE, + KVM_MEMORY_ATTRIBUTE_PRIVATE)) + goto put_folio_and_exit; p = src ? src + i * PAGE_SIZE : NULL; - ret = post_populate(kvm, gfn, pfn, p, max_order, opaque); + ret = post_populate(kvm, gfn, pfn, p, opaque); if (!ret) kvm_gmem_mark_prepared(folio); -- cgit v1.2.3 From 2a62345b30529e488beb6a1220577b3495933724 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 8 Jan 2026 15:46:22 -0600 Subject: KVM: guest_memfd: GUP source pages prior to populating guest memory Currently the post-populate callbacks handle copying source pages into private GPA ranges backed by guest_memfd, where kvm_gmem_populate() acquires the filemap invalidate lock, then calls a post-populate callback which may issue a get_user_pages() on the source pages prior to copying them into the private GPA (e.g. TDX). This will not be compatible with in-place conversion, where the userspace page fault path will attempt to acquire the filemap invalidate lock while holding the mm->mmap_lock, leading to a potential ABBA deadlock. 
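Roughly sketched (simplified lock chains for illustration, not literal
call paths), the inverted ordering would look like:

	populate ioctl (before this patch)   gmem fault (in-place conversion)
	----------------------------------   --------------------------------
	filemap_invalidate_lock()            mmap_read_lock(mm)
	post_populate()
	  get_user_pages()
	    mmap_read_lock(mm)      <===>      filemap_invalidate_lock()

i.e. the classic A->B vs. B->A ordering that lockdep would flag.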
Address this by hoisting the GUP above the filemap invalidate lock so
that these page fault paths can be taken early, prior to acquiring the
filemap invalidate lock.

It's not currently clear whether this issue is reachable with the
current implementation of guest_memfd, which doesn't support in-place
conversion. However, hoisting the GUP does provide a consistent
mechanism for handing stable source/target PFNs to callbacks rather
than punting to vendor-specific code, which allows for more commonality
across architectures and may be worthwhile even without in-place
conversion.

As part of this change, also begin enforcing that the 'src' argument to
kvm_gmem_populate() must be page-aligned, as this greatly reduces the
complexity of the post-populate callbacks, and no current in-tree user
relies on a non-page-aligned 'src' argument.

Suggested-by: Sean Christopherson
Co-developed-by: Sean Christopherson
Co-developed-by: Vishal Annapurve
Signed-off-by: Vishal Annapurve
Tested-by: Vishal Annapurve
Tested-by: Kai Huang
Signed-off-by: Michael Roth
Tested-by: Yan Zhao
Reviewed-by: Yan Zhao
Link: https://patch.msgid.link/20260108214622.1084057-7-michael.roth@amd.com
[sean: avoid local "p" variable]
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/svm/sev.c   | 33 +++++++++----------
 arch/x86/kvm/vmx/tdx.c   | 16 ++--------
 include/linux/kvm_host.h |  4 +--
 virt/kvm/guest_memfd.c   | 83 ++++++++++++++++++++++++++++++++----------------
 4 files changed, 78 insertions(+), 58 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index b4409bc652d1..0ab7c89262fb 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -2268,7 +2268,7 @@ struct sev_gmem_populate_args {
 };
 
 static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
-				  void __user *src, void *opaque)
+				  struct page *src_page, void *opaque)
 {
 	struct sev_gmem_populate_args *sev_populate_args = opaque;
 	struct sev_data_snp_launch_update fw_args = {0};
@@ -2277,7 +2277,7 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
 	int level;
 	int ret;
 
-	if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src))
+	if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src_page))
 		return -EINVAL;
 
 	ret = snp_lookup_rmpentry((u64)pfn, &assigned, &level);
@@ -2288,15 +2288,14 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
 		goto out;
 	}
 
-	if (src) {
-		void *vaddr = kmap_local_pfn(pfn);
+	if (src_page) {
+		void *src_vaddr = kmap_local_page(src_page);
+		void *dst_vaddr = kmap_local_pfn(pfn);
 
-		if (copy_from_user(vaddr, src, PAGE_SIZE)) {
-			kunmap_local(vaddr);
-			ret = -EFAULT;
-			goto out;
-		}
-		kunmap_local(vaddr);
+		memcpy(dst_vaddr, src_vaddr, PAGE_SIZE);
+
+		kunmap_local(src_vaddr);
+		kunmap_local(dst_vaddr);
 	}
 
 	ret = rmp_make_private(pfn, gfn << PAGE_SHIFT, PG_LEVEL_4K,
@@ -2326,17 +2325,19 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
 	if (ret && !snp_page_reclaim(kvm, pfn) &&
 	    sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID &&
 	    sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) {
-		void *vaddr = kmap_local_pfn(pfn);
+		void *src_vaddr = kmap_local_page(src_page);
+		void *dst_vaddr = kmap_local_pfn(pfn);
 
-		if (copy_to_user(src, vaddr, PAGE_SIZE))
-			pr_debug("Failed to write CPUID page back to userspace\n");
+		memcpy(src_vaddr, dst_vaddr, PAGE_SIZE);
 
-		kunmap_local(vaddr);
+		kunmap_local(src_vaddr);
+		kunmap_local(dst_vaddr);
 	}
 
 out:
-
pr_debug("%s: exiting with return code %d (fw_error %d)\n", - __func__, ret, sev_populate_args->fw_error); + if (ret) + pr_debug("%s: error updating GFN %llx, return code %d (fw_error %d)\n", + __func__, gfn, ret, sev_populate_args->fw_error); return ret; } diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 4fb042ce8ed1..5df9d32d2058 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -3118,34 +3118,24 @@ struct tdx_gmem_post_populate_arg { }; static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, - void __user *src, void *_arg) + struct page *src_page, void *_arg) { struct tdx_gmem_post_populate_arg *arg = _arg; struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); u64 err, entry, level_state; gpa_t gpa = gfn_to_gpa(gfn); - struct page *src_page; int ret, i; if (KVM_BUG_ON(kvm_tdx->page_add_src, kvm)) return -EIO; - /* - * Get the source page if it has been faulted in. Return failure if the - * source page has been swapped out or unmapped in primary memory. - */ - ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page); - if (ret < 0) - return ret; - if (ret != 1) - return -ENOMEM; + if (!src_page) + return -EOPNOTSUPP; kvm_tdx->page_add_src = src_page; ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn); kvm_tdx->page_add_src = NULL; - put_page(src_page); - if (ret || !(arg->flags & KVM_TDX_MEASURE_MEMORY_REGION)) return ret; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1d0cee72e560..49c0cfe24fd8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2566,7 +2566,7 @@ int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord * @gfn: starting GFN to be populated * @src: userspace-provided buffer containing data to copy into GFN range * (passed to @post_populate, and incremented on each iteration - * if not NULL) + * if not NULL). Must be page-aligned. * @npages: number of pages to copy from userspace-buffer * @post_populate: callback to issue for each gmem page that backs the GPA * range @@ -2581,7 +2581,7 @@ int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord * Returns the number of pages that were populated. 
*/ typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, - void __user *src, void *opaque); + struct page *page, void *opaque); long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages, kvm_gmem_populate_cb post_populate, void *opaque); diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index e90879322fd0..923c51a3a525 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -820,12 +820,48 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn); #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE + +static long __kvm_gmem_populate(struct kvm *kvm, struct kvm_memory_slot *slot, + struct file *file, gfn_t gfn, struct page *src_page, + kvm_gmem_populate_cb post_populate, void *opaque) +{ + pgoff_t index = kvm_gmem_get_index(slot, gfn); + struct folio *folio; + kvm_pfn_t pfn; + int ret; + + filemap_invalidate_lock(file->f_mapping); + + folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, NULL); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); + goto out_unlock; + } + + folio_unlock(folio); + + if (!kvm_range_has_memory_attributes(kvm, gfn, gfn + 1, + KVM_MEMORY_ATTRIBUTE_PRIVATE, + KVM_MEMORY_ATTRIBUTE_PRIVATE)) { + ret = -EINVAL; + goto out_put_folio; + } + + ret = post_populate(kvm, gfn, pfn, src_page, opaque); + if (!ret) + folio_mark_uptodate(folio); + +out_put_folio: + folio_put(folio); +out_unlock: + filemap_invalidate_unlock(file->f_mapping); + return ret; +} + long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages, kvm_gmem_populate_cb post_populate, void *opaque) { struct kvm_memory_slot *slot; - void __user *p; - int ret = 0; long i; @@ -834,6 +870,9 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long if (WARN_ON_ONCE(npages <= 0)) return -EINVAL; + if (WARN_ON_ONCE(!PAGE_ALIGNED(src))) + return -EINVAL; + slot = gfn_to_memslot(kvm, start_gfn); if (!kvm_slot_has_gmem(slot)) return -EINVAL; @@ -842,47 +881,37 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long if (!file) return -EFAULT; - filemap_invalidate_lock(file->f_mapping); - npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages); for (i = 0; i < npages; i++) { - struct folio *folio; - gfn_t gfn = start_gfn + i; - pgoff_t index = kvm_gmem_get_index(slot, gfn); - kvm_pfn_t pfn; + struct page *src_page = NULL; if (signal_pending(current)) { ret = -EINTR; break; } - folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, NULL); - if (IS_ERR(folio)) { - ret = PTR_ERR(folio); - break; + if (src) { + unsigned long uaddr = (unsigned long)src + i * PAGE_SIZE; + + ret = get_user_pages_fast(uaddr, 1, 0, &src_page); + if (ret < 0) + break; + if (ret != 1) { + ret = -ENOMEM; + break; + } } - folio_unlock(folio); + ret = __kvm_gmem_populate(kvm, slot, file, start_gfn + i, src_page, + post_populate, opaque); - ret = -EINVAL; - if (!kvm_range_has_memory_attributes(kvm, gfn, gfn + 1, - KVM_MEMORY_ATTRIBUTE_PRIVATE, - KVM_MEMORY_ATTRIBUTE_PRIVATE)) - goto put_folio_and_exit; + if (src_page) + put_page(src_page); - p = src ? src + i * PAGE_SIZE : NULL; - ret = post_populate(kvm, gfn, pfn, p, opaque); - if (!ret) - folio_mark_uptodate(folio); - -put_folio_and_exit: - folio_put(folio); if (ret) break; } - filemap_invalidate_unlock(file->f_mapping); - return ret && !i ? 
ret : i;
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate);
-- 
cgit v1.2.3

From fa9893fadbc245e179cb17f3c371c67471b5a8a8 Mon Sep 17 00:00:00 2001
From: Michael Roth
Date: Fri, 9 Jan 2026 17:17:32 -0600
Subject: KVM: Introduce KVM_EXIT_SNP_REQ_CERTS for SNP certificate-fetching

For SEV-SNP, the host can optionally provide a certificate table to the
guest when it issues an attestation request to firmware (see GHCB 2.0
specification regarding "SNP Extended Guest Requests"). This certificate
table can then be used to verify the endorsement key used by firmware to
sign the attestation report. While it is possible for guests to obtain
the certificates through other means, handling it via the host provides
more flexibility in being able to keep the certificate data in sync with
the endorsement key throughout host-side operations that might result in
the endorsement key changing.

In the case of KVM, userspace will be responsible for fetching the
certificate table and keeping it in sync with any modifications to the
endorsement key by other userspace management tools.

Define a new KVM_EXIT_SNP_REQ_CERTS event where userspace is given the
GPA of the buffer the guest has provided as part of the attestation
request so that userspace can write the certificate data into it while
relying on filesystem-based locking to keep the certificates up-to-date
relative to the endorsement keys installed/utilized by firmware at the
time the certificates are fetched.

[Melody: Update the documentation scheme about how file locking is
expected to happen.]

Reviewed-by: Liam Merwick
Tested-by: Liam Merwick
Tested-by: Dionna Glaze
Signed-off-by: Michael Roth
Signed-off-by: Melody Wang
Signed-off-by: Michael Roth
Link: https://patch.msgid.link/20260109231732.1160759-2-michael.roth@amd.com
Signed-off-by: Sean Christopherson
---
 Documentation/virt/kvm/api.rst | 44 ++++++++++++++++++++++++++++++
 arch/x86/kvm/svm/sev.c         | 62 ++++++++++++++++++++++++++++++++++++++----
 arch/x86/kvm/svm/svm.h         |  1 +
 include/uapi/linux/kvm.h       |  9 ++++++
 4 files changed, 110 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 01a3abef8abb..428d7d9cb4d6 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -7353,6 +7353,50 @@ Please note that the kernel is allowed to use the kvm_run structure as the
 primary storage for certain register types. Therefore, the kernel may use the
 values in kvm_run even if the corresponding bit in kvm_dirty_regs is not set.
 
+::
+
+	/* KVM_EXIT_SNP_REQ_CERTS */
+	struct kvm_exit_snp_req_certs {
+		__u64 gpa;
+		__u64 npages;
+		__u64 ret;
+	};
+
+KVM_EXIT_SNP_REQ_CERTS indicates an SEV-SNP guest with certificate-fetching
+enabled (see KVM_SEV_SNP_ENABLE_REQ_CERTS) has generated an Extended Guest
+Request NAE #VMGEXIT (SNP_GUEST_REQUEST) with message type MSG_REPORT_REQ,
+i.e. has requested an attestation report from firmware, and would like the
+certificate data corresponding to the attestation report signature to be
+provided by the hypervisor as part of the request.
+
+To allow userspace to provide the certificate, the 'gpa' and 'npages'
+are forwarded verbatim from the guest request (the RAX and RBX GHCB fields
+respectively). 'ret' is not an "output" from KVM, and is always '0' on
+exit. KVM verifies the 'gpa' is 4KiB aligned prior to exiting to userspace,
+but otherwise the information from the guest isn't validated.
+
+Upon the next KVM_RUN, e.g.
after userspace has serviced the request (or not),
+KVM will complete the #VMGEXIT, using the 'ret' field to determine whether to
+signal success or failure to the guest, and on failure, what reason code will
+be communicated via SW_EXITINFO2. If 'ret' is set to an unsupported value (see
+the table below), KVM_RUN will fail with -EINVAL. For a 'ret' of 'ENOSPC', KVM
+also consumes the 'npages' field, i.e. userspace can use the field to inform
+the guest of the number of pages needed to hold all the certificate data.
+
+The supported 'ret' values and their respective SW_EXITINFO2 encodings:
+
+  ====== =============================================================
+  0      0x0, i.e. success. KVM will emit an SNP_GUEST_REQUEST command
+         to SNP firmware.
+  ENOSPC 0x0000000100000000, i.e. not enough guest pages to hold the
+         certificate table and certificate data. KVM will also set the
+         RBX field in the GHCB to 'npages'.
+  EAGAIN 0x0000000200000000, i.e. the host is busy and the guest should
+         retry the request.
+  EIO    0xffffffff00000000, for all other errors (this return code is
+         a KVM-defined hypervisor value, as allowed by the GHCB)
+  ====== =============================================================
+
 .. _cap_enable:

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index f67525007089..9e6a78e448f2 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -41,6 +41,16 @@
 
 #define GHCB_HV_FT_SUPPORTED	(GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION)
 
+/*
+ * The GHCB spec essentially states that all non-zero error codes other than
+ * those explicitly defined above should be treated as an error by the guest.
+ * Define a generic error to cover that case, and choose a value that is not
+ * likely to overlap with new explicit error codes should more be added to
+ * the GHCB spec later. KVM will use this to report generic errors when
+ * handling SNP guest requests.
+ */
+#define SNP_GUEST_VMM_ERR_GENERIC	(~0U)
+
 /* enable/disable SEV support */
 static bool sev_enabled = true;
 module_param_named(sev, sev_enabled, bool, 0444);
@@ -4139,6 +4149,36 @@ out_unlock:
 	return ret;
 }
 
+static int snp_req_certs_err(struct vcpu_svm *svm, u32 vmm_error)
+{
+	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, SNP_GUEST_ERR(vmm_error, 0));
+
+	return 1; /* resume guest */
+}
+
+static int snp_complete_req_certs(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	struct vmcb_control_area *control = &svm->vmcb->control;
+
+	switch (READ_ONCE(vcpu->run->snp_req_certs.ret)) {
+	case 0:
+		return snp_handle_guest_req(svm, control->exit_info_1,
+					    control->exit_info_2);
+	case ENOSPC:
+		vcpu->arch.regs[VCPU_REGS_RBX] = vcpu->run->snp_req_certs.npages;
+		return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_INVALID_LEN);
+	case EAGAIN:
+		return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_BUSY);
+	case EIO:
+		return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_GENERIC);
+	default:
+		break;
+	}
+
+	return -EINVAL;
+}
+
 static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
 {
 	struct kvm *kvm = svm->vcpu.kvm;
@@ -4154,14 +4194,15 @@ static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t r
 	/*
 	 * As per GHCB spec, requests of type MSG_REPORT_REQ also allow for
 	 * additional certificate data to be provided alongside the attestation
-	 * report via the guest-provided data pages indicated by RAX/RBX.
The - * certificate data is optional and requires additional KVM enablement - * to provide an interface for userspace to provide it, but KVM still - * needs to be able to handle extended guest requests either way. So - * provide a stub implementation that will always return an empty - * certificate table in the guest-provided data pages. + * report via the guest-provided data pages indicated by RAX/RBX. If + * userspace enables KVM_EXIT_SNP_REQ_CERTS, then exit to userspace + * to give userspace an opportunity to provide the certificate data + * before issuing/completing the attestation request. Otherwise, return + * an empty certificate table in the guest-provided data pages and + * handle the attestation request immediately. */ if (msg_type == SNP_MSG_REPORT_REQ) { + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_vcpu *vcpu = &svm->vcpu; u64 data_npages; gpa_t data_gpa; @@ -4175,6 +4216,15 @@ static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t r if (!PAGE_ALIGNED(data_gpa)) goto request_invalid; + if (sev->snp_certs_enabled) { + vcpu->run->exit_reason = KVM_EXIT_SNP_REQ_CERTS; + vcpu->run->snp_req_certs.gpa = data_gpa; + vcpu->run->snp_req_certs.npages = data_npages; + vcpu->run->snp_req_certs.ret = 0; + vcpu->arch.complete_userspace_io = snp_complete_req_certs; + return 0; + } + /* * As per GHCB spec (see "SNP Extended Guest Request"), the * certificate table is terminated by 24-bytes of zeroes. diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 338fc4f5cc4c..ebd7b36b1ceb 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -115,6 +115,7 @@ struct kvm_sev_info { void *guest_resp_buf; /* Bounce buffer for SNP Guest Request output */ struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */ cpumask_var_t have_run_cpus; /* CPUs that have done VMRUN for this VM. */ + bool snp_certs_enabled; /* SNP certificate-fetching support. */ }; struct kvm_svm { diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index dddb781b0507..8cd107cdcf0b 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -135,6 +135,12 @@ struct kvm_xen_exit { } u; }; +struct kvm_exit_snp_req_certs { + __u64 gpa; + __u64 npages; + __u64 ret; +}; + #define KVM_S390_GET_SKEYS_NONE 1 #define KVM_S390_SKEYS_MAX 1048576 @@ -180,6 +186,7 @@ struct kvm_xen_exit { #define KVM_EXIT_MEMORY_FAULT 39 #define KVM_EXIT_TDX 40 #define KVM_EXIT_ARM_SEA 41 +#define KVM_EXIT_SNP_REQ_CERTS 42 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -482,6 +489,8 @@ struct kvm_run { __u64 gva; __u64 gpa; } arm_sea; + /* KVM_EXIT_SNP_REQ_CERTS */ + struct kvm_exit_snp_req_certs snp_req_certs; /* Fix the size of the union. */ char padding[256]; }; -- cgit v1.2.3 From 3227c3a89d65fe7482312b7b27038d9ebd86f210 Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Wed, 28 Jan 2026 18:07:33 +0000 Subject: irqchip/gic-v5: Check if impl is virt capable Now that there is support for creating a GICv5-based guest with KVM, check that the hardware itself supports virtualisation, skipping the setting of struct gic_kvm_info if not. Note: If native GICv5 virt is not supported, then nor is FEAT_GCIE_LEGACY, so we are able to skip altogether. 
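For background on the IDR0 decode used here: FIELD_GET() extracts and
right-justifies the bitfield described by a mask. A stand-alone
approximation of the kernel macros (illustration only, not the kernel
implementation; the IDR0 value is made up):

	#include <stdint.h>
	#include <stdio.h>

	#define BIT(n)			(1u << (n))
	#define GICV5_IRS_IDR0_VIRT	BIT(6)

	/* Extract the field selected by mask and shift it down to bit 0. */
	#define FIELD_GET(mask, reg)	(((reg) & (mask)) / ((mask) & -(mask)))

	int main(void)
	{
		uint32_t idr0 = BIT(6);	/* hypothetical IDR0 with VIRT set */

		/* Prints "virt_capable = 1". */
		printf("virt_capable = %u\n",
		       (unsigned)FIELD_GET(GICV5_IRS_IDR0_VIRT, idr0));
		return 0;
	}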
Signed-off-by: Sascha Bischoff
Reviewed-by: Lorenzo Pieralisi
Reviewed-by: Jonathan Cameron
Link: https://patch.msgid.link/20260128175919.3828384-33-sascha.bischoff@arm.com
[maz: cosmetic changes]
Signed-off-by: Marc Zyngier
---
 drivers/irqchip/irq-gic-v5-irs.c   |  2 ++
 drivers/irqchip/irq-gic-v5.c       | 10 ++++++++++
 include/linux/irqchip/arm-gic-v5.h |  4 ++++
 3 files changed, 16 insertions(+)

(limited to 'include')

diff --git a/drivers/irqchip/irq-gic-v5-irs.c b/drivers/irqchip/irq-gic-v5-irs.c
index ce2732d649a3..eeeb40fb0eaa 100644
--- a/drivers/irqchip/irq-gic-v5-irs.c
+++ b/drivers/irqchip/irq-gic-v5-irs.c
@@ -743,6 +743,8 @@ static int __init gicv5_irs_init(struct device_node *node)
 	 * be consistent across IRSes by the architecture.
 	 */
 	if (list_empty(&irs_nodes)) {
+		idr = irs_readl_relaxed(irs_data, GICV5_IRS_IDR0);
+		gicv5_global_data.virt_capable = FIELD_GET(GICV5_IRS_IDR0_VIRT, idr);
 		idr = irs_readl_relaxed(irs_data, GICV5_IRS_IDR1);
 
 		irs_setup_pri_bits(idr);
diff --git a/drivers/irqchip/irq-gic-v5.c b/drivers/irqchip/irq-gic-v5.c
index 41ef286c4d78..3c86bbc05761 100644
--- a/drivers/irqchip/irq-gic-v5.c
+++ b/drivers/irqchip/irq-gic-v5.c
@@ -1064,6 +1064,16 @@ static struct gic_kvm_info gic_v5_kvm_info __initdata;
 
 static void __init gic_of_setup_kvm_info(struct device_node *node)
 {
+	/*
+	 * If we don't have native GICv5 virtualisation support, then
+	 * we also don't have FEAT_GCIE_LEGACY - the architecture
+	 * forbids this combination.
+	 */
+	if (!gicv5_global_data.virt_capable) {
+		pr_info("GIC implementation is not virtualization capable\n");
+		return;
+	}
+
 	gic_v5_kvm_info.type = GIC_V5;
 
 	/* GIC Virtual CPU interface maintenance interrupt */
diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h
index 68ddcdb1cec5..4cb71ce6e8ad 100644
--- a/include/linux/irqchip/arm-gic-v5.h
+++ b/include/linux/irqchip/arm-gic-v5.h
@@ -43,6 +43,7 @@
 /*
  * IRS registers and tables structures
  */
+#define GICV5_IRS_IDR0			0x0000
 #define GICV5_IRS_IDR1			0x0004
 #define GICV5_IRS_IDR2			0x0008
 #define GICV5_IRS_IDR5			0x0014
@@ -63,6 +64,8 @@
 #define GICV5_IRS_IST_STATUSR		0x0194
 #define GICV5_IRS_MAP_L2_ISTR		0x01c0
 
+#define GICV5_IRS_IDR0_VIRT		BIT(6)
+
 #define GICV5_IRS_IDR1_PRIORITY_BITS	GENMASK(22, 20)
 #define GICV5_IRS_IDR1_IAFFID_BITS	GENMASK(19, 16)
 
@@ -278,6 +281,7 @@ struct gicv5_chip_data {
 	u8	cpuif_pri_bits;
 	u8	cpuif_id_bits;
 	u8	irs_pri_bits;
+	bool	virt_capable;
 	struct {
 		__le64	*l1ist_addr;
 		u32	l2_size;
-- 
cgit v1.2.3

From 5ab24969705a9adadbc1d3cff4c1c15df174eafb Mon Sep 17 00:00:00 2001
From: Fuad Tabba
Date: Mon, 2 Feb 2026 08:57:20 +0000
Subject: KVM: arm64: Reimplement vgic-debug XArray iteration

The vgic-debug interface implementation uses XArray marks
(`LPI_XA_MARK_DEBUG_ITER`) to "snapshot" LPIs at the start of iteration.
This modifies global state for a read-only operation and complicates
reference counting, leading to leaks if iteration is aborted or fails.

Reimplement the iterator to use dynamic iteration logic:

- Remove `lpi_idx` from `struct vgic_state_iter`.
- Replace the XArray marking mechanism with dynamic iteration using
  `xa_find_after(..., XA_PRESENT)` (see the sketch after this list).
- Wrap XArray traversals in `rcu_read_lock()`/`rcu_read_unlock()` to
  ensure safety against concurrent modifications (e.g., LPI unmapping).
- Handle potential races where an LPI is removed during iteration by
  gracefully skipping it in `show()`, rather than warning.
- Remove the unused `LPI_XA_MARK_DEBUG_ITER` definition.
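In shape, the dynamic-iteration pattern is roughly the following (a
generic sketch with hypothetical names, not the vgic code itself;
assume 'xa' is a struct xarray, 'iter->intid' the resume cursor, and
MAX_INDEX the largest valid index):

	unsigned long cursor = iter->intid;

	rcu_read_lock();
	/* Find the next present entry strictly after the cursor. */
	if (!xa_find_after(&xa, &cursor, MAX_INDEX, XA_PRESENT))
		cursor = MAX_INDEX + 1;	/* nothing left: terminate */
	rcu_read_unlock();

	iter->intid = cursor;

No marks are set, so there is no per-entry state to clean up if the
walk stops early.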
This simplifies the lifecycle management of the iterator and prevents resource leaks associated with the marking mechanism, and paves the way for using a standard seq_file iterator. Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260202085721.3954942-3-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-debug.c | 68 ++++++++++++---------------------------- include/kvm/arm_vgic.h | 1 - 2 files changed, 20 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c index bb92853d1fd3..ec3d0c1fe703 100644 --- a/arch/arm64/kvm/vgic/vgic-debug.c +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -25,11 +25,9 @@ struct vgic_state_iter { int nr_cpus; int nr_spis; - int nr_lpis; int dist_id; int vcpu_id; unsigned long intid; - int lpi_idx; }; static void iter_next(struct kvm *kvm, struct vgic_state_iter *iter) @@ -45,13 +43,15 @@ static void iter_next(struct kvm *kvm, struct vgic_state_iter *iter) * Let the xarray drive the iterator after the last SPI, as the iterator * has exhausted the sequentially-allocated INTID space. */ - if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS - 1) && - iter->nr_lpis) { - if (iter->lpi_idx < iter->nr_lpis) - xa_find_after(&dist->lpi_xa, &iter->intid, - VGIC_LPI_MAX_INTID, - LPI_XA_MARK_DEBUG_ITER); - iter->lpi_idx++; + if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS - 1)) { + if (iter->intid == VGIC_LPI_MAX_INTID + 1) + return; + + rcu_read_lock(); + if (!xa_find_after(&dist->lpi_xa, &iter->intid, + VGIC_LPI_MAX_INTID, XA_PRESENT)) + iter->intid = VGIC_LPI_MAX_INTID + 1; + rcu_read_unlock(); return; } @@ -61,44 +61,21 @@ static void iter_next(struct kvm *kvm, struct vgic_state_iter *iter) iter->intid = 0; } -static int iter_mark_lpis(struct kvm *kvm) +static int vgic_count_lpis(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; - unsigned long intid, flags; struct vgic_irq *irq; + unsigned long intid; int nr_lpis = 0; - xa_lock_irqsave(&dist->lpi_xa, flags); - - xa_for_each(&dist->lpi_xa, intid, irq) { - if (!vgic_try_get_irq_ref(irq)) - continue; - - __xa_set_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); + rcu_read_lock(); + xa_for_each(&dist->lpi_xa, intid, irq) nr_lpis++; - } - - xa_unlock_irqrestore(&dist->lpi_xa, flags); + rcu_read_unlock(); return nr_lpis; } -static void iter_unmark_lpis(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - unsigned long intid, flags; - struct vgic_irq *irq; - - xa_for_each_marked(&dist->lpi_xa, intid, irq, LPI_XA_MARK_DEBUG_ITER) { - xa_lock_irqsave(&dist->lpi_xa, flags); - __xa_clear_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); - xa_unlock_irqrestore(&dist->lpi_xa, flags); - - /* vgic_put_irq() expects to be called outside of the xa_lock */ - vgic_put_irq(kvm, irq); - } -} - static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter, loff_t pos) { @@ -108,8 +85,6 @@ static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter, iter->nr_cpus = nr_cpus; iter->nr_spis = kvm->arch.vgic.nr_spis; - if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) - iter->nr_lpis = iter_mark_lpis(kvm); /* Fast forward to the right position if needed */ while (pos--) @@ -121,7 +96,7 @@ static bool end_of_vgic(struct vgic_state_iter *iter) return iter->dist_id > 0 && iter->vcpu_id == iter->nr_cpus && iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS) && - (!iter->nr_lpis || iter->lpi_idx > iter->nr_lpis); + iter->intid > VGIC_LPI_MAX_INTID; } static void 
*vgic_debug_start(struct seq_file *s, loff_t *pos) @@ -178,7 +153,6 @@ static void vgic_debug_stop(struct seq_file *s, void *v) mutex_lock(&kvm->arch.config_lock); iter = kvm->arch.vgic.iter; - iter_unmark_lpis(kvm); kfree(iter); kvm->arch.vgic.iter = NULL; mutex_unlock(&kvm->arch.config_lock); @@ -188,13 +162,14 @@ static void print_dist_state(struct seq_file *s, struct vgic_dist *dist, struct vgic_state_iter *iter) { bool v3 = dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3; + struct kvm *kvm = s->private; seq_printf(s, "Distributor\n"); seq_printf(s, "===========\n"); seq_printf(s, "vgic_model:\t%s\n", v3 ? "GICv3" : "GICv2"); seq_printf(s, "nr_spis:\t%d\n", dist->nr_spis); if (v3) - seq_printf(s, "nr_lpis:\t%d\n", iter->nr_lpis); + seq_printf(s, "nr_lpis:\t%d\n", vgic_count_lpis(kvm)); seq_printf(s, "enabled:\t%d\n", dist->enabled); seq_printf(s, "\n"); @@ -291,16 +266,13 @@ static int vgic_debug_show(struct seq_file *s, void *v) if (iter->vcpu_id < iter->nr_cpus) vcpu = kvm_get_vcpu(kvm, iter->vcpu_id); - /* - * Expect this to succeed, as iter_mark_lpis() takes a reference on - * every LPI to be visited. - */ if (iter->intid < VGIC_NR_PRIVATE_IRQS) irq = vgic_get_vcpu_irq(vcpu, iter->intid); else irq = vgic_get_irq(kvm, iter->intid); - if (WARN_ON_ONCE(!irq)) - return -EINVAL; + + if (!irq) + return 0; raw_spin_lock_irqsave(&irq->irq_lock, flags); print_irq_state(s, irq, vcpu); diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index b261fb3968d0..d32fafbd2907 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -300,7 +300,6 @@ struct vgic_dist { */ u64 propbaser; -#define LPI_XA_MARK_DEBUG_ITER XA_MARK_0 struct xarray lpi_xa; /* used by vgic-debug */ -- cgit v1.2.3 From fb21cb08566ebed91d5c876db5c013cc8af83b89 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 2 Feb 2026 08:57:21 +0000 Subject: KVM: arm64: Use standard seq_file iterator for vgic-debug debugfs The current implementation uses `vgic_state_iter` in `struct vgic_dist` to track the sequence position. This effectively makes the iterator shared across all open file descriptors for the VM. This approach has significant drawbacks: - It enforces mutual exclusion, preventing concurrent reads of the debugfs file (returning -EBUSY). - It relies on storing transient iterator state in the long-lived VM structure (`vgic_dist`). Refactor the implementation to use the standard `seq_file` iterator. Instead of storing state in `kvm_arch`, rely on the `pos` argument passed to the `start` and `next` callbacks, which tracks the logical index specific to the file descriptor. This change enables concurrent access and eliminates the `vgic_state_iter` field from `struct vgic_dist`. 
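[Editor's note: the ownership contract of the refactored iterator can be sketched in plain C. The model below is illustrative only (the demo_* names and the three-entry walk are made up, not kernel API): start() allocates per-call state derived from *pos, next() advances and frees the state once the walk is exhausted, and stop() frees whatever is left, so nothing transient ever lives in a shared, long-lived structure and concurrent readers cannot collide.]

    #include <stdio.h>
    #include <stdlib.h>

    struct iter { long pos; };

    #define NR_ENTRIES 3

    static void *demo_start(long long *ppos)
    {
            struct iter *it;

            if (*ppos >= NR_ENTRIES)
                    return NULL;
            it = malloc(sizeof(*it));
            if (!it)
                    return NULL; /* seq_file would see ERR_PTR(-ENOMEM) */
            it->pos = *ppos;     /* derive state from *pos, not from the VM */
            return it;
    }

    static void *demo_next(void *v, long long *ppos)
    {
            struct iter *it = v;

            ++*ppos;
            if (++it->pos >= NR_ENTRIES) {
                    free(it); /* end of walk: drop the iterator here */
                    return NULL;
            }
            return it;
    }

    static void demo_stop(void *v)
    {
            free(v); /* NULL-safe, like kfree() */
    }

    int main(void)
    {
            long long pos = 0;
            void *v = demo_start(&pos);

            while (v) {
                    printf("show entry %lld\n", pos);
                    v = demo_next(v, &pos);
            }
            demo_stop(v);
            return 0;
    }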
Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260202085721.3954942-4-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-debug.c | 40 ++++++++++++---------------------------- include/kvm/arm_vgic.h | 3 --- 2 files changed, 12 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c index ec3d0c1fe703..2c6776a1779b 100644 --- a/arch/arm64/kvm/vgic/vgic-debug.c +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -104,58 +104,42 @@ static void *vgic_debug_start(struct seq_file *s, loff_t *pos) struct kvm *kvm = s->private; struct vgic_state_iter *iter; - mutex_lock(&kvm->arch.config_lock); - iter = kvm->arch.vgic.iter; - if (iter) { - iter = ERR_PTR(-EBUSY); - goto out; - } - iter = kmalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) { - iter = ERR_PTR(-ENOMEM); - goto out; - } + if (!iter) + return ERR_PTR(-ENOMEM); iter_init(kvm, iter, *pos); - kvm->arch.vgic.iter = iter; - if (end_of_vgic(iter)) + if (end_of_vgic(iter)) { + kfree(iter); iter = NULL; -out: - mutex_unlock(&kvm->arch.config_lock); + } + return iter; } static void *vgic_debug_next(struct seq_file *s, void *v, loff_t *pos) { struct kvm *kvm = s->private; - struct vgic_state_iter *iter = kvm->arch.vgic.iter; + struct vgic_state_iter *iter = v; ++*pos; iter_next(kvm, iter); - if (end_of_vgic(iter)) + if (end_of_vgic(iter)) { + kfree(iter); iter = NULL; + } return iter; } static void vgic_debug_stop(struct seq_file *s, void *v) { - struct kvm *kvm = s->private; - struct vgic_state_iter *iter; + struct vgic_state_iter *iter = v; - /* - * If the seq file wasn't properly opened, there's nothing to clearn - * up. - */ - if (IS_ERR(v)) + if (IS_ERR_OR_NULL(v)) return; - mutex_lock(&kvm->arch.config_lock); - iter = kvm->arch.vgic.iter; kfree(iter); - kvm->arch.vgic.iter = NULL; - mutex_unlock(&kvm->arch.config_lock); } static void print_dist_state(struct seq_file *s, struct vgic_dist *dist, diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index d32fafbd2907..f2eafc65bbf4 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -302,9 +302,6 @@ struct vgic_dist { struct xarray lpi_xa; - /* used by vgic-debug */ - struct vgic_state_iter *iter; - /* * GICv4 ITS per-VM data, containing the IRQ domain, the VPE * array, the property table pointer as well as allocation -- cgit v1.2.3 From 0ee4ddc1647b8b3b9e7a94d798a1774a764428c1 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:57 +0100 Subject: KVM: s390: Storage key manipulation IOCTL Add a new IOCTL to allow userspace to manipulate storage keys directly. This will make it easier to write selftests related to storage keys. Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- Documentation/virt/kvm/api.rst | 42 ++++++++++++++++++++++++++++++ arch/s390/kvm/kvm-s390.c | 58 ++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 11 ++++++++ 3 files changed, 111 insertions(+) (limited to 'include') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 01a3abef8abb..72e04dedb068 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6517,6 +6517,40 @@ the capability to be present. `flags` must currently be zero. 
+4.144 KVM_S390_KEYOP +-------------------- + +:Capability: KVM_CAP_S390_KEYOP +:Architectures: s390 +:Type: vm ioctl +:Parameters: struct kvm_s390_keyop (in/out) +:Returns: 0 in case of success, < 0 on error + +The specified key operation is performed on the given guest address. The +previous storage key (or the relevant part thereof) will be returned in +`key`. + +:: + + struct kvm_s390_keyop { + __u64 guest_addr; + __u8 key; + __u8 operation; + }; + +Currently supported values for ``operation``: + +KVM_S390_KEYOP_ISKE + Returns the storage key for the guest address ``guest_addr`` in ``key``. + +KVM_S390_KEYOP_RRBE + Resets the reference bit for the guest address ``guest_addr``, returning the + R and C bits of the old storage key in ``key``; the remaining fields of + the storage key will be set to 0. + +KVM_S390_KEYOP_SSKE + Sets the storage key for the guest address ``guest_addr`` to the key + specified in ``key``, returning the previous value in ``key``. .. _kvm_run: @@ -9287,6 +9321,14 @@ The presence of this capability indicates that KVM_RUN will update the KVM_RUN_X86_GUEST_MODE bit in kvm_run.flags to indicate whether the vCPU was executing nested guest code when it exited. +8.46 KVM_CAP_S390_KEYOP +----------------------- + +:Architectures: s390 + +The presence of this capability indicates that the KVM_S390_KEYOP ioctl is +available. + KVM exits with the register state of either the L1 or L2 guest depending on which executed at the time of an exit. Userspace must take care to differentiate between these cases. diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ac7b5f56f0b5..9f24252775dd 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -554,6 +554,37 @@ static void __kvm_s390_exit(void) debug_unregister(kvm_s390_dbf_uv); } +static int kvm_s390_keyop(struct kvm_s390_mmu_cache *mc, struct kvm *kvm, int op, + unsigned long addr, union skey skey) +{ + union asce asce = kvm->arch.gmap->asce; + gfn_t gfn = gpa_to_gfn(addr); + int r; + + guard(read_lock)(&kvm->mmu_lock); + + switch (op) { + case KVM_S390_KEYOP_SSKE: + r = dat_cond_set_storage_key(mc, asce, gfn, skey, &skey, 0, 0, 0); + if (r >= 0) + return skey.skey; + break; + case KVM_S390_KEYOP_ISKE: + r = dat_get_storage_key(asce, gfn, &skey); + if (!r) + return skey.skey; + break; + case KVM_S390_KEYOP_RRBE: + r = dat_reset_reference_bit(asce, gfn); + if (r > 0) + return r << 1; + break; + default: + return -EINVAL; + } + return r; +} + /* Section: device related */ long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) @@ -598,6 +629,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_DIAG318: case KVM_CAP_IRQFD_RESAMPLE: case KVM_CAP_S390_USER_OPEREXEC: + case KVM_CAP_S390_KEYOP: r = 1; break; case KVM_CAP_SET_GUEST_DEBUG2: @@ -2931,6 +2963,32 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) r = -EFAULT; break; } + case KVM_S390_KEYOP: { + struct kvm_s390_mmu_cache *mc; + struct kvm_s390_keyop kop; + union skey skey; + + if (copy_from_user(&kop, argp, sizeof(kop))) { + r = -EFAULT; + break; + } + skey.skey = kop.key; + + mc = kvm_s390_new_mmu_cache(); + if (!mc) + return -ENOMEM; + + r = kvm_s390_keyop(mc, kvm, kop.operation, kop.guest_addr, skey); + kvm_s390_free_mmu_cache(mc); + if (r < 0) + break; + + kop.key = r; + r = 0; + if (copy_to_user(argp, &kop, sizeof(kop))) + r = -EFAULT; + break; + } case KVM_S390_ZPCI_OP: { struct kvm_s390_zpci_op args; diff --git a/include/uapi/linux/kvm.h 
b/include/uapi/linux/kvm.h index dddb781b0507..ab3d3d96e75f 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -974,6 +974,7 @@ struct kvm_enable_cap { #define KVM_CAP_GUEST_MEMFD_FLAGS 244 #define KVM_CAP_ARM_SEA_TO_USER 245 #define KVM_CAP_S390_USER_OPEREXEC 246 +#define KVM_CAP_S390_KEYOP 247 struct kvm_irq_routing_irqchip { __u32 irqchip; @@ -1219,6 +1220,15 @@ struct kvm_vfio_spapr_tce { __s32 tablefd; }; +#define KVM_S390_KEYOP_ISKE 0x01 +#define KVM_S390_KEYOP_RRBE 0x02 +#define KVM_S390_KEYOP_SSKE 0x03 +struct kvm_s390_keyop { + __u64 guest_addr; + __u8 key; + __u8 operation; +}; + /* * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns * a vcpu fd. @@ -1238,6 +1248,7 @@ struct kvm_vfio_spapr_tce { #define KVM_S390_UCAS_MAP _IOW(KVMIO, 0x50, struct kvm_s390_ucas_mapping) #define KVM_S390_UCAS_UNMAP _IOW(KVMIO, 0x51, struct kvm_s390_ucas_mapping) #define KVM_S390_VCPU_FAULT _IOW(KVMIO, 0x52, unsigned long) +#define KVM_S390_KEYOP _IOWR(KVMIO, 0x53, struct kvm_s390_keyop) /* Device model IOC */ #define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) -- cgit v1.2.3 From f7ab71f178d56447e5efb55b65436feb68662f8f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 6 Feb 2026 10:17:30 +0100 Subject: KVM: s390: Add explicit padding to struct kvm_s390_keyop The newly added structure causes a warning about implicit padding: ./usr/include/linux/kvm.h:1247:1: error: padding struct size to alignment boundary with 6 bytes [-Werror=padded] The padding can lead to leaking kernel data and ABI incompatibilities when used on x86. Neither of these is a problem in this specific patch, but it's best to avoid implicit padding and use explicit padding fields in general. Fixes: 0ee4ddc1647b ("KVM: s390: Storage key manipulation IOCTL") Signed-off-by: Arnd Bergmann Reviewed-by: Claudio Imbrenda Signed-off-by: Claudio Imbrenda --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index ab3d3d96e75f..d4250ab662fe 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1227,6 +1227,7 @@ struct kvm_s390_keyop { __u64 guest_addr; __u8 key; __u8 operation; + __u8 pad[6]; }; /* -- cgit v1.2.3 From 376e2f8cca2816c489a9196e65cc904d1a907fd2 Mon Sep 17 00:00:00 2001 From: Xu Lu Date: Sun, 4 Jan 2026 21:34:57 +0800 Subject: irqchip/riscv-imsic: Adjust the number of available guest irq files Currently, KVM assumes the minimum of implemented HGEIE bits and "BIT(gc->guest_index_bits) - 1" as the number of guest files available across all CPUs. This will not work when CPUs have different numbers of guest files, because KVM may incorrectly allocate a guest file on a CPU with fewer guest files. To address this, during initialization, calculate the number of available guest interrupt files according to the MMIO resources and constrain the number of guest interrupt files that can be allocated by KVM.
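[Editor's note: the resulting clamp is simple arithmetic, sketched below with made-up region sizes. The 4 KiB page size matches IMSIC_MMIO_PAGE_SZ; everything else is illustrative, and the reloff adjustment from the real code is omitted. Each per-CPU MMIO region provides one supervisor file plus some number of guest files, and the global count must not exceed the poorest CPU.]

    #include <stdio.h>

    #define IMSIC_MMIO_PAGE_SZ 4096UL /* one interrupt file per 4 KiB page */

    int main(void)
    {
            /* Illustrative per-CPU MMIO region sizes: CPU1 has fewer pages. */
            unsigned long mmio_size[] = { 8 * IMSIC_MMIO_PAGE_SZ,
                                          4 * IMSIC_MMIO_PAGE_SZ };
            unsigned long guest_index_bits = 3;
            /* Architectural ceiling, as computed before this patch. */
            unsigned long nr_guest_files = (1UL << guest_index_bits) - 1;
            int i;

            for (i = 0; i < 2; i++) {
                    /* First page is the supervisor file; the rest are guest files. */
                    unsigned long n = mmio_size[i] / IMSIC_MMIO_PAGE_SZ - 1;

                    if (n < nr_guest_files)
                            nr_guest_files = n;
            }
            /* Prints 3: clamped by CPU1's 4-page region, not by index bits. */
            printf("nr_guest_files = %lu\n", nr_guest_files);
            return 0;
    }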
Signed-off-by: Xu Lu Reviewed-by: Nutty Liu Reviewed-by: Anup Patel Acked-by: Thomas Gleixner Link: https://lore.kernel.org/r/20260104133457.57742-1-luxu.kernel@bytedance.com Signed-off-by: Anup Patel --- arch/riscv/kvm/aia.c | 2 +- drivers/irqchip/irq-riscv-imsic-state.c | 12 +++++++++++- include/linux/irqchip/riscv-imsic.h | 3 +++ 3 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/arch/riscv/kvm/aia.c b/arch/riscv/kvm/aia.c index dad318185660..cac3c2b51d72 100644 --- a/arch/riscv/kvm/aia.c +++ b/arch/riscv/kvm/aia.c @@ -630,7 +630,7 @@ int kvm_riscv_aia_init(void) */ if (gc) kvm_riscv_aia_nr_hgei = min((ulong)kvm_riscv_aia_nr_hgei, - BIT(gc->guest_index_bits) - 1); + gc->nr_guest_files); else kvm_riscv_aia_nr_hgei = 0; diff --git a/drivers/irqchip/irq-riscv-imsic-state.c b/drivers/irqchip/irq-riscv-imsic-state.c index b6cebfee9461..7566c8aa2d48 100644 --- a/drivers/irqchip/irq-riscv-imsic-state.c +++ b/drivers/irqchip/irq-riscv-imsic-state.c @@ -784,7 +784,7 @@ static int __init imsic_parse_fwnode(struct fwnode_handle *fwnode, int __init imsic_setup_state(struct fwnode_handle *fwnode, void *opaque) { - u32 i, j, index, nr_parent_irqs, nr_mmios, nr_handlers = 0; + u32 i, j, index, nr_parent_irqs, nr_mmios, nr_guest_files, nr_handlers = 0; struct imsic_global_config *global; struct imsic_local_config *local; void __iomem **mmios_va = NULL; @@ -878,6 +878,7 @@ int __init imsic_setup_state(struct fwnode_handle *fwnode, void *opaque) } /* Configure handlers for target CPUs */ + global->nr_guest_files = BIT(global->guest_index_bits) - 1; for (i = 0; i < nr_parent_irqs; i++) { rc = imsic_get_parent_hartid(fwnode, i, &hartid); if (rc) { @@ -918,6 +919,15 @@ int __init imsic_setup_state(struct fwnode_handle *fwnode, void *opaque) local->msi_pa = mmios[index].start + reloff; local->msi_va = mmios_va[index] + reloff; + /* + * KVM uses global->nr_guest_files to determine the available guest + * interrupt files on each CPU. Take the minimum number of guest + * interrupt files across all CPUs to avoid KVM incorrectly allocating + * a nonexistent or unmapped guest interrupt file on some CPUs. + */ + nr_guest_files = (resource_size(&mmios[index]) - reloff) / IMSIC_MMIO_PAGE_SZ - 1; + global->nr_guest_files = min(global->nr_guest_files, nr_guest_files); + nr_handlers++; } diff --git a/include/linux/irqchip/riscv-imsic.h b/include/linux/irqchip/riscv-imsic.h index 7f3ff5c5ea53..4b348836de7a 100644 --- a/include/linux/irqchip/riscv-imsic.h +++ b/include/linux/irqchip/riscv-imsic.h @@ -68,6 +68,9 @@ struct imsic_global_config { /* Number of guest interrupt identities */ u32 nr_guest_ids; + /* Number of guest interrupt files per core */ + u32 nr_guest_files; + /* Per-CPU IMSIC addresses */ struct imsic_local_config __percpu *local; }; -- cgit v1.2.3 From 898885477e0fa23d2e42b65bcb7c250215ecac37 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Fri, 6 Feb 2026 15:35:51 +0100 Subject: KVM: s390: Use guest address to mark guest page dirty Stop using the userspace address to mark the guest page dirty. mark_page_dirty() expects a guest frame number, but was being passed a host virtual frame number. When slot == NULL, mark_page_dirty_in_slot() does nothing and does not complain. This means that in some circumstances the dirtiness of the guest page might have been lost. Fix by adding two fields to struct kvm_s390_adapter_int to keep the guest addresses, and use those for mark_page_dirty().
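[Editor's note: the failure mode is purely numerical and can be seen with a toy memslot lookup; all addresses and slot bounds below are made up. Shifting a guest address yields a gfn that a memslot can contain, while shifting a host virtual address yields a "frame number" that matches no slot, so the dirty bit is silently dropped.]

    #include <stdio.h>

    #define PAGE_SHIFT 12

    /* Toy memslot: guest frames [base_gfn, base_gfn + npages). */
    struct memslot { unsigned long base_gfn, npages; };

    static const struct memslot slot = { 0x100, 0x10 };

    static int gfn_in_slot(unsigned long gfn)
    {
            return gfn >= slot.base_gfn && gfn < slot.base_gfn + slot.npages;
    }

    int main(void)
    {
            unsigned long gaddr = 0x105000;       /* guest address */
            unsigned long uaddr = 0x7f2a4c000000; /* made-up host mapping */

            /* Correct: the guest address yields gfn 0x105, inside the slot. */
            printf("gfn %#lx -> slot? %d\n", gaddr >> PAGE_SHIFT,
                   gfn_in_slot(gaddr >> PAGE_SHIFT));
            /* Buggy: the host virtual "frame" misses every slot. */
            printf("hva %#lx -> slot? %d\n", uaddr >> PAGE_SHIFT,
                   gfn_in_slot(uaddr >> PAGE_SHIFT));
            return 0;
    }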
Fixes: f65470661f36 ("KVM: s390/interrupt: do not pin adapter interrupt pages") Reviewed-by: Steffen Eiden Reviewed-by: Janosch Frank Reviewed-by: Christoph Schlameuss Signed-off-by: Claudio Imbrenda --- arch/s390/kvm/interrupt.c | 6 ++++-- include/linux/kvm_host.h | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index f55eca9aa638..1c2bb5cd7e12 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2768,13 +2768,13 @@ static int adapter_indicators_set(struct kvm *kvm, bit = get_ind_bit(adapter_int->ind_addr, adapter_int->ind_offset, adapter->swap); set_bit(bit, map); - mark_page_dirty(kvm, adapter_int->ind_addr >> PAGE_SHIFT); + mark_page_dirty(kvm, adapter_int->ind_gaddr >> PAGE_SHIFT); set_page_dirty_lock(ind_page); map = page_address(summary_page); bit = get_ind_bit(adapter_int->summary_addr, adapter_int->summary_offset, adapter->swap); summary_set = test_and_set_bit(bit, map); - mark_page_dirty(kvm, adapter_int->summary_addr >> PAGE_SHIFT); + mark_page_dirty(kvm, adapter_int->summary_gaddr >> PAGE_SHIFT); set_page_dirty_lock(summary_page); srcu_read_unlock(&kvm->srcu, idx); @@ -2870,7 +2870,9 @@ int kvm_set_routing_entry(struct kvm *kvm, if (kvm_is_error_hva(uaddr_s) || kvm_is_error_hva(uaddr_i)) return -EFAULT; e->adapter.summary_addr = uaddr_s; + e->adapter.summary_gaddr = ue->u.adapter.summary_addr; e->adapter.ind_addr = uaddr_i; + e->adapter.ind_gaddr = ue->u.adapter.ind_addr; e->adapter.summary_offset = ue->u.adapter.summary_offset; e->adapter.ind_offset = ue->u.adapter.ind_offset; e->adapter.adapter_id = ue->u.adapter.adapter_id; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d93f75b05ae2..deb36007480d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -645,7 +645,9 @@ static inline unsigned long *kvm_second_dirty_bitmap(struct kvm_memory_slot *mem struct kvm_s390_adapter_int { u64 ind_addr; + u64 ind_gaddr; u64 summary_addr; + u64 summary_gaddr; u64 ind_offset; u32 summary_offset; u32 adapter_id; -- cgit v1.2.3
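[Editor's note: with both kvm_s390_keyop patches above applied, the new ioctl can be driven from userspace as in the following sketch. It assumes a <linux/kvm.h> that already carries the new definitions and an existing VM file descriptor, whose creation is omitted; error handling is minimal.]

    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Read the storage key of the first guest page via KVM_S390_KEYOP. */
    int read_storage_key(int vm_fd)
    {
            struct kvm_s390_keyop kop;

            if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_S390_KEYOP) <= 0)
                    return -1;              /* ioctl not available */

            memset(&kop, 0, sizeof(kop));   /* also clears the pad bytes */
            kop.guest_addr = 0;             /* guest address, not host */
            kop.operation = KVM_S390_KEYOP_ISKE;

            if (ioctl(vm_fd, KVM_S390_KEYOP, &kop) < 0) {
                    perror("KVM_S390_KEYOP");
                    return -1;
            }
            printf("storage key: 0x%02x\n", kop.key);
            return 0;
    }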