author     Paolo Bonzini <pbonzini@redhat.com>  2026-04-13 14:01:24 +0300
committer  Paolo Bonzini <pbonzini@redhat.com>  2026-04-13 14:01:50 +0300
commit     ea8bc95fbb75da215b7533c7c46f63423e84ff5e (patch)
tree       415958be1b51e9ec2d1accaf1ff7732f776bb5ee
parent     1b3090da8d25b1dd59744e32e6872c2831fed874 (diff)
parent     052ca584bd7c51de0de96e684631570459d46cda (diff)
download   linux-ea8bc95fbb75da215b7533c7c46f63423e84ff5e.tar.xz
Merge tag 'kvm-x86-nested-7.1' of https://github.com/kvm-x86/linux into HEAD
KVM nested SVM changes for 7.1 (with one common x86 fix):

 - To minimize the probability of corrupting guest state, defer KVM's non-architectural delivery of exception payloads (e.g. CR2 and DR6) until consumption of the payload is imminent, and force delivery of the payload in all paths where userspace saves relevant state.
 - Use vcpu->arch.cr2 when updating vmcb12's CR2 on nested #VMEXIT to fix a bug where L2's CR2 can get corrupted after a save/restore, e.g. if the VM is migrated while L2 is faulting in memory.
 - Fix a class of nSVM bugs where some fields written by the CPU are not synchronized from vmcb02 to the cached vmcb12 after VMRUN, and so are not up-to-date when saved by KVM_GET_NESTED_STATE.
 - Fix a class of bugs where the ordering between KVM_SET_NESTED_STATE and KVM_SET_{S}REGS could cause vmcb02 to be incorrectly initialized after save+restore.
 - Add a variety of missing nSVM consistency checks.
 - Fix several bugs where KVM failed to correctly update VMCB fields on nested #VMEXIT.
 - Fix several bugs where KVM failed to correctly synthesize #UD or #GP for SVM-related instructions.
 - Add support for save+restore of virtualized LBRs (on SVM).
 - Refactor various helpers and macros to improve clarity and (hopefully) make the code easier to maintain.
 - Aggressively sanitize fields when copying from vmcb12 to guard against unintentionally allowing L1 to utilize yet-to-be-defined features (see the illustrative sketch after this list).
 - Fix several bugs where KVM botched rAX legality checks when emulating SVM instructions. Note, KVM is still flawed in that it doesn't handle address size prefix overrides for 64-bit guests; this should probably be documented as a KVM erratum.
 - Fail emulation of VMRUN/VMLOAD/VMSAVE if mapping vmcb12 fails instead of somewhat arbitrarily synthesizing #GP (i.e. don't bastardize AMD's already-sketchy behavior of generating #GP for "unsupported" addresses).
 - Cache all used vmcb12 fields to further harden against TOCTOU bugs.
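As a rough illustration of the "sanitize on copy" hardening mentioned above: a minimal sketch, not the actual KVM code. The struct layout, mask names, and values below are simplified stand-ins for vmcb_control_area / vmcb_ctrl_area_cached and the series' new defines; the idea is simply to mask each cached vmcb12 field down to its architecturally defined bits at copy time, so later consumers never see bits L1 could not legally program.

#include <stdint.h>
#include <stdio.h>

/* Illustrative masks, mirroring the spirit of the new defines in the diff below. */
#define PAGE_MASK_4K         (~0xfffULL)    /* permission map bases are page aligned */
#define TLB_CONTROL_MASK     0x7u           /* tlb_ctl encodings live in bits 2:0 */
#define EVTINJ_VEC_MASK      0x000000ffu    /* vector[7:0] */
#define EVTINJ_TYPE_MASK     0x00000700u    /* type[10:8] */
#define EVTINJ_VALID_ERR     0x00000800u    /* deliver error code (bit 11) */
#define EVTINJ_VALID         0x80000000u    /* injection valid (bit 31) */
#define EVTINJ_RESERVED_BITS (~(EVTINJ_VEC_MASK | EVTINJ_TYPE_MASK | \
                                EVTINJ_VALID_ERR | EVTINJ_VALID))

/* Simplified stand-in for the cached vmcb12 control area. */
struct ctrl_cache {
	uint64_t iopm_base_pa;
	uint8_t  tlb_ctl;
	uint32_t event_inj;
};

static void cache_vmcb12_ctrl(struct ctrl_cache *to, const struct ctrl_cache *from)
{
	/* Drop anything outside the architecturally defined bits at copy time. */
	to->iopm_base_pa = from->iopm_base_pa & PAGE_MASK_4K;
	to->tlb_ctl      = from->tlb_ctl & TLB_CONTROL_MASK;
	to->event_inj    = from->event_inj & ~EVTINJ_RESERVED_BITS;
}

int main(void)
{
	struct ctrl_cache vmcb12 = {
		.iopm_base_pa = 0x123456789abcULL,  /* deliberately not page aligned */
		.tlb_ctl      = 0xff,               /* junk above bit 2 */
		.event_inj    = 0x80001234u,        /* valid NMI-type injection with a reserved bit set */
	};
	struct ctrl_cache cached;

	cache_vmcb12_ctrl(&cached, &vmcb12);
	printf("iopm=0x%llx tlb_ctl=0x%x event_inj=0x%x\n",
	       (unsigned long long)cached.iopm_base_pa, cached.tlb_ctl, cached.event_inj);
	return 0;
}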
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 15
-rw-r--r--  arch/x86/include/asm/svm.h | 20
-rw-r--r--  arch/x86/kvm/emulate.c | 3
-rw-r--r--  arch/x86/kvm/hyperv.h | 8
-rw-r--r--  arch/x86/kvm/kvm_emulate.h | 2
-rw-r--r--  arch/x86/kvm/svm/hyperv.h | 9
-rw-r--r--  arch/x86/kvm/svm/nested.c | 615
-rw-r--r--  arch/x86/kvm/svm/sev.c | 6
-rw-r--r--  arch/x86/kvm/svm/svm.c | 353
-rw-r--r--  arch/x86/kvm/svm/svm.h | 81
-rw-r--r--  arch/x86/kvm/vmx/nested.c | 50
-rw-r--r--  arch/x86/kvm/vmx/vmx.c | 16
-rw-r--r--  arch/x86/kvm/vmx/vmx.h | 3
-rw-r--r--  arch/x86/kvm/x86.c | 78
-rw-r--r--  arch/x86/kvm/x86.h | 10
-rw-r--r--  tools/testing/selftests/kvm/Makefile.kvm | 3
-rw-r--r--  tools/testing/selftests/kvm/include/x86/processor.h | 5
-rw-r--r--  tools/testing/selftests/kvm/include/x86/svm.h | 14
-rw-r--r--  tools/testing/selftests/kvm/lib/x86/svm.c | 2
-rw-r--r--  tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c | 16
-rw-r--r--  tools/testing/selftests/kvm/x86/state_test.c | 35
-rw-r--r--  tools/testing/selftests/kvm/x86/svm_lbr_nested_state.c | 145
-rw-r--r--  tools/testing/selftests/kvm/x86/svm_nested_clear_efer_svme.c | 55
-rw-r--r--  tools/testing/selftests/kvm/x86/svm_nested_vmcb12_gpa.c | 176
24 files changed, 1230 insertions, 490 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5100ec3a20bd..7cf9b9899f86 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1098,6 +1098,21 @@ struct kvm_vcpu_arch {
*/
bool pdptrs_from_userspace;
+ /*
+ * Set if an emulated nested VM-Enter to L2 is pending completion. KVM
+ * must not synthesize a VM-Exit to L1 before entering L2, as VM-Exits
+ * can only occur at instruction boundaries. The only exception is
+ * VMX's "notify" exits, which exist in large part to break the CPU out
+ * of infinite ucode loops, but can corrupt vCPU state in the process!
+ *
+ * For all intents and purposes, this is a boolean, but it's tracked as
+ * a u8 so that KVM can detect when userspace may have stuffed vCPU
+ * state and generated an architecturally-impossible VM-Exit.
+ */
+#define KVM_NESTED_RUN_PENDING 1
+#define KVM_NESTED_RUN_PENDING_UNTRUSTED 2
+ u8 nested_run_pending;
+
#if IS_ENABLED(CONFIG_HYPERV)
hpa_t hv_root_tdp;
#endif
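A quick sketch of the intent behind the two nested_run_pending values introduced above (semantics inferred from this hunk and the KVM_SET_NESTED_STATE change later in the diff; the helper functions here are hypothetical illustrations, not KVM code): any non-zero value means a nested VM-Enter is pending, while the distinct "untrusted" value tags runs restored by userspace so KVM can tell when stuffed state produced an architecturally impossible VM-Exit.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NESTED_RUN_PENDING            1  /* set when KVM itself emulates a nested VM-Enter */
#define NESTED_RUN_PENDING_UNTRUSTED  2  /* set when userspace restores "run pending" state */

/* Hypothetical helpers for illustration only. */
static bool nested_run_is_pending(uint8_t v)   { return v != 0; }
static bool nested_run_is_untrusted(uint8_t v) { return v == NESTED_RUN_PENDING_UNTRUSTED; }

int main(void)
{
	uint8_t vals[] = { 0, NESTED_RUN_PENDING, NESTED_RUN_PENDING_UNTRUSTED };

	for (unsigned int i = 0; i < sizeof(vals); i++)
		printf("value %d: pending=%d untrusted=%d\n", vals[i],
		       nested_run_is_pending(vals[i]), nested_run_is_untrusted(vals[i]));
	return 0;
}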
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index edde36097ddc..bcfeb5e7c0ed 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -142,13 +142,13 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u64 exit_info_2;
u32 exit_int_info;
u32 exit_int_info_err;
- u64 nested_ctl;
+ u64 misc_ctl;
u64 avic_vapic_bar;
u64 ghcb_gpa;
u32 event_inj;
u32 event_inj_err;
u64 nested_cr3;
- u64 virt_ext;
+ u64 misc_ctl2;
u32 clean;
u32 reserved_5;
u64 next_rip;
@@ -182,6 +182,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define TLB_CONTROL_FLUSH_ASID 3
#define TLB_CONTROL_FLUSH_ASID_LOCAL 7
+#define TLB_CONTROL_MASK GENMASK(2, 0)
+
#define ERAP_CONTROL_ALLOW_LARGER_RAP BIT(0)
#define ERAP_CONTROL_CLEAR_RAP BIT(1)
@@ -222,8 +224,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define X2APIC_MODE_SHIFT 30
#define X2APIC_MODE_MASK (1 << X2APIC_MODE_SHIFT)
-#define LBR_CTL_ENABLE_MASK BIT_ULL(0)
-#define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1)
+#define SVM_INT_VECTOR_MASK GENMASK(7, 0)
#define SVM_INTERRUPT_SHADOW_MASK BIT_ULL(0)
#define SVM_GUEST_INTERRUPT_MASK BIT_ULL(1)
@@ -239,10 +240,12 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
-#define SVM_NESTED_CTL_NP_ENABLE BIT(0)
-#define SVM_NESTED_CTL_SEV_ENABLE BIT(1)
-#define SVM_NESTED_CTL_SEV_ES_ENABLE BIT(2)
+#define SVM_MISC_ENABLE_NP BIT(0)
+#define SVM_MISC_ENABLE_SEV BIT(1)
+#define SVM_MISC_ENABLE_SEV_ES BIT(2)
+#define SVM_MISC2_ENABLE_V_LBR BIT_ULL(0)
+#define SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE BIT_ULL(1)
#define SVM_TSC_RATIO_RSVD 0xffffff0000000000ULL
#define SVM_TSC_RATIO_MIN 0x0000000000000001ULL
@@ -636,6 +639,9 @@ static inline void __unused_size_checks(void)
#define SVM_EVTINJ_VALID (1 << 31)
#define SVM_EVTINJ_VALID_ERR (1 << 11)
+#define SVM_EVTINJ_RESERVED_BITS ~(SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | \
+ SVM_EVTINJ_VALID_ERR | SVM_EVTINJ_VALID)
+
#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK
#define SVM_EXITINTINFO_TYPE_MASK SVM_EVTINJ_TYPE_MASK
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 6145dac4a605..c8c6cc0406d6 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3887,8 +3887,7 @@ static int check_svme_pa(struct x86_emulate_ctxt *ctxt)
{
u64 rax = reg_read(ctxt, VCPU_REGS_RAX);
- /* Valid physical address? */
- if (rax & 0xffff000000000000ULL)
+ if (!ctxt->ops->page_address_valid(ctxt, rax))
return emulate_gp(ctxt, 0);
return check_svme(ctxt);
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 6ce160ffa678..6301f79fcbae 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -305,14 +305,6 @@ static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
{
return false;
}
-static inline bool kvm_hv_is_tlb_flush_hcall(struct kvm_vcpu *vcpu)
-{
- return false;
-}
-static inline bool guest_hv_cpuid_has_l2_tlb_flush(struct kvm_vcpu *vcpu)
-{
- return false;
-}
static inline int kvm_hv_verify_vp_assist(struct kvm_vcpu *vcpu)
{
return 0;
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index fb3dab4b5a53..0abff36d0994 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -245,6 +245,8 @@ struct x86_emulate_ops {
bool (*is_canonical_addr)(struct x86_emulate_ctxt *ctxt, gva_t addr,
unsigned int flags);
+
+ bool (*page_address_valid)(struct x86_emulate_ctxt *ctxt, gpa_t gpa);
};
/* Type, address-of, and value of an instruction's operand. */
diff --git a/arch/x86/kvm/svm/hyperv.h b/arch/x86/kvm/svm/hyperv.h
index d3f8bfc05832..f70d076911a6 100644
--- a/arch/x86/kvm/svm/hyperv.h
+++ b/arch/x86/kvm/svm/hyperv.h
@@ -41,10 +41,17 @@ static inline bool nested_svm_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu)
return hv_vcpu->vp_assist_page.nested_control.features.directhypercall;
}
+static inline bool nested_svm_is_l2_tlb_flush_hcall(struct kvm_vcpu *vcpu)
+{
+ return guest_hv_cpuid_has_l2_tlb_flush(vcpu) &&
+ nested_svm_l2_tlb_flush_enabled(vcpu) &&
+ kvm_hv_is_tlb_flush_hcall(vcpu);
+}
+
void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu);
#else /* CONFIG_KVM_HYPERV */
static inline void nested_svm_hv_update_vm_vp_ids(struct kvm_vcpu *vcpu) {}
-static inline bool nested_svm_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu)
+static inline bool nested_svm_is_l2_tlb_flush_hcall(struct kvm_vcpu *vcpu)
{
return false;
}
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index b36c33255bed..961804df5f45 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -116,31 +116,28 @@ static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm)
if (!nested_npt_enabled(svm))
return true;
- if (!(svm->nested.ctl.virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK))
+ if (!(svm->nested.ctl.misc_ctl2 & SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE))
return true;
return false;
}
-void recalc_intercepts(struct vcpu_svm *svm)
+void nested_vmcb02_recalc_intercepts(struct vcpu_svm *svm)
{
- struct vmcb_control_area *c, *h;
- struct vmcb_ctrl_area_cached *g;
+ struct vmcb_ctrl_area_cached *vmcb12_ctrl = &svm->nested.ctl;
+ struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
unsigned int i;
- vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
-
- if (!is_guest_mode(&svm->vcpu))
+ if (WARN_ON_ONCE(svm->vmcb != vmcb02))
return;
- c = &svm->vmcb->control;
- h = &svm->vmcb01.ptr->control;
- g = &svm->nested.ctl;
+ vmcb_mark_dirty(vmcb02, VMCB_INTERCEPTS);
for (i = 0; i < MAX_INTERCEPT; i++)
- c->intercepts[i] = h->intercepts[i];
+ vmcb02->control.intercepts[i] = vmcb01->control.intercepts[i];
- if (g->int_ctl & V_INTR_MASKING_MASK) {
+ if (vmcb12_ctrl->int_ctl & V_INTR_MASKING_MASK) {
/*
* If L2 is active and V_INTR_MASKING is enabled in vmcb12,
* disable intercept of CR8 writes as L2's CR8 does not affect
@@ -151,24 +148,17 @@ void recalc_intercepts(struct vcpu_svm *svm)
* the effective RFLAGS.IF for L1 interrupts will never be set
* while L2 is running (L2's RFLAGS.IF doesn't affect L1 IRQs).
*/
- vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
- if (!(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF))
- vmcb_clr_intercept(c, INTERCEPT_VINTR);
+ vmcb_clr_intercept(&vmcb02->control, INTERCEPT_CR8_WRITE);
+ if (!(vmcb01->save.rflags & X86_EFLAGS_IF))
+ vmcb_clr_intercept(&vmcb02->control, INTERCEPT_VINTR);
}
- /*
- * We want to see VMMCALLs from a nested guest only when Hyper-V L2 TLB
- * flush feature is enabled.
- */
- if (!nested_svm_l2_tlb_flush_enabled(&svm->vcpu))
- vmcb_clr_intercept(c, INTERCEPT_VMMCALL);
-
for (i = 0; i < MAX_INTERCEPT; i++)
- c->intercepts[i] |= g->intercepts[i];
+ vmcb02->control.intercepts[i] |= vmcb12_ctrl->intercepts[i];
/* If SMI is not intercepted, ignore guest SMI intercept as well */
if (!intercept_smi)
- vmcb_clr_intercept(c, INTERCEPT_SMI);
+ vmcb_clr_intercept(&vmcb02->control, INTERCEPT_SMI);
if (nested_vmcb_needs_vls_intercept(svm)) {
/*
@@ -176,10 +166,10 @@ void recalc_intercepts(struct vcpu_svm *svm)
* we must intercept these instructions to correctly
* emulate them in case L1 doesn't intercept them.
*/
- vmcb_set_intercept(c, INTERCEPT_VMLOAD);
- vmcb_set_intercept(c, INTERCEPT_VMSAVE);
+ vmcb_set_intercept(&vmcb02->control, INTERCEPT_VMLOAD);
+ vmcb_set_intercept(&vmcb02->control, INTERCEPT_VMSAVE);
} else {
- WARN_ON(!(c->virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK));
+ WARN_ON_ONCE(!(vmcb02->control.misc_ctl2 & SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE));
}
}
@@ -339,8 +329,56 @@ static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size)
kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1);
}
-static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
- struct vmcb_ctrl_area_cached *control)
+static bool nested_svm_event_inj_valid_exept(struct kvm_vcpu *vcpu, u8 vector)
+{
+ /*
+ * Vectors that do not correspond to a defined exception are invalid
+ * (including #NMI and reserved vectors). In a best effort to define
+ * valid exceptions based on the virtual CPU, make all exceptions always
+ * valid except those obviously tied to a CPU feature.
+ */
+ switch (vector) {
+ case DE_VECTOR: case DB_VECTOR: case BP_VECTOR: case OF_VECTOR:
+ case BR_VECTOR: case UD_VECTOR: case NM_VECTOR: case DF_VECTOR:
+ case TS_VECTOR: case NP_VECTOR: case SS_VECTOR: case GP_VECTOR:
+ case PF_VECTOR: case MF_VECTOR: case AC_VECTOR: case MC_VECTOR:
+ case XM_VECTOR: case HV_VECTOR: case SX_VECTOR:
+ return true;
+ case CP_VECTOR:
+ return guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);
+ case VC_VECTOR:
+ return guest_cpu_cap_has(vcpu, X86_FEATURE_SEV_ES);
+ }
+ return false;
+}
+
+/*
+ * According to the APM, VMRUN exits with SVM_EXIT_ERR if SVM_EVTINJ_VALID is
+ * set and:
+ * - The type of event_inj is not one of the defined values.
+ * - The type is SVM_EVTINJ_TYPE_EXEPT, but the vector is not a valid exception.
+ */
+static bool nested_svm_check_event_inj(struct kvm_vcpu *vcpu, u32 event_inj)
+{
+ u32 type = event_inj & SVM_EVTINJ_TYPE_MASK;
+ u8 vector = event_inj & SVM_EVTINJ_VEC_MASK;
+
+ if (!(event_inj & SVM_EVTINJ_VALID))
+ return true;
+
+ if (type != SVM_EVTINJ_TYPE_INTR && type != SVM_EVTINJ_TYPE_NMI &&
+ type != SVM_EVTINJ_TYPE_EXEPT && type != SVM_EVTINJ_TYPE_SOFT)
+ return false;
+
+ if (type == SVM_EVTINJ_TYPE_EXEPT &&
+ !nested_svm_event_inj_valid_exept(vcpu, vector))
+ return false;
+
+ return true;
+}
+
+static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
+ struct vmcb_ctrl_area_cached *control)
{
if (CC(!vmcb12_is_intercept(control, INTERCEPT_VMRUN)))
return false;
@@ -348,7 +386,8 @@ static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
if (CC(control->asid == 0))
return false;
- if (CC((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && !npt_enabled))
+ if (CC((control->misc_ctl & SVM_MISC_ENABLE_NP) &&
+ !kvm_vcpu_is_legal_gpa(vcpu, control->nested_cr3)))
return false;
if (CC(!nested_svm_check_bitmap_pa(vcpu, control->msrpm_base_pa,
@@ -363,12 +402,15 @@ static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
return false;
}
+ if (CC(!nested_svm_check_event_inj(vcpu, control->event_inj)))
+ return false;
+
return true;
}
/* Common checks that apply to both L1 and L2 state. */
-static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu,
- struct vmcb_save_area_cached *save)
+static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu,
+ struct vmcb_save_area_cached *save)
{
if (CC(!(save->efer & EFER_SVME)))
return false;
@@ -390,6 +432,10 @@ static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu,
CC(!(save->cr0 & X86_CR0_PE)) ||
CC(!kvm_vcpu_is_legal_cr3(vcpu, save->cr3)))
return false;
+
+ if (CC((save->cs.attrib & SVM_SELECTOR_L_MASK) &&
+ (save->cs.attrib & SVM_SELECTOR_DB_MASK)))
+ return false;
}
/* Note, SVM doesn't have any additional restrictions on CR4. */
@@ -402,26 +448,12 @@ static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu,
return true;
}
-static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb_save_area_cached *save = &svm->nested.save;
-
- return __nested_vmcb_check_save(vcpu, save);
-}
-
-static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu)
+int nested_svm_check_cached_vmcb12(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb_ctrl_area_cached *ctl = &svm->nested.ctl;
- return __nested_vmcb_check_controls(vcpu, ctl);
-}
-
-int nested_svm_check_cached_vmcb12(struct kvm_vcpu *vcpu)
-{
- if (!nested_vmcb_check_save(vcpu) ||
- !nested_vmcb_check_controls(vcpu))
+ if (!nested_vmcb_check_save(vcpu, &svm->nested.save) ||
+ !nested_vmcb_check_controls(vcpu, &svm->nested.ctl))
return -EINVAL;
return 0;
@@ -456,37 +488,39 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
nested_svm_sanitize_intercept(vcpu, to, SKINIT);
nested_svm_sanitize_intercept(vcpu, to, RDPRU);
- to->iopm_base_pa = from->iopm_base_pa;
- to->msrpm_base_pa = from->msrpm_base_pa;
+ /* Always clear SVM_MISC_ENABLE_NP if the guest cannot use NPTs */
+ to->misc_ctl = from->misc_ctl;
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_NPT))
+ to->misc_ctl &= ~SVM_MISC_ENABLE_NP;
+
+ to->iopm_base_pa = from->iopm_base_pa & PAGE_MASK;
+ to->msrpm_base_pa = from->msrpm_base_pa & PAGE_MASK;
to->tsc_offset = from->tsc_offset;
- to->tlb_ctl = from->tlb_ctl;
+ to->tlb_ctl = from->tlb_ctl & TLB_CONTROL_MASK;
to->erap_ctl = from->erap_ctl;
to->int_ctl = from->int_ctl;
- to->int_vector = from->int_vector;
- to->int_state = from->int_state;
+ to->int_vector = from->int_vector & SVM_INT_VECTOR_MASK;
+ to->int_state = from->int_state & SVM_INTERRUPT_SHADOW_MASK;
to->exit_code = from->exit_code;
to->exit_info_1 = from->exit_info_1;
to->exit_info_2 = from->exit_info_2;
to->exit_int_info = from->exit_int_info;
to->exit_int_info_err = from->exit_int_info_err;
- to->nested_ctl = from->nested_ctl;
- to->event_inj = from->event_inj;
+ to->event_inj = from->event_inj & ~SVM_EVTINJ_RESERVED_BITS;
to->event_inj_err = from->event_inj_err;
to->next_rip = from->next_rip;
to->nested_cr3 = from->nested_cr3;
- to->virt_ext = from->virt_ext;
+ to->misc_ctl2 = from->misc_ctl2;
to->pause_filter_count = from->pause_filter_count;
to->pause_filter_thresh = from->pause_filter_thresh;
- /* Copy asid here because nested_vmcb_check_controls will check it. */
+ /* Copy asid here because nested_vmcb_check_controls() will check it */
to->asid = from->asid;
- to->msrpm_base_pa &= ~0x0fffULL;
- to->iopm_base_pa &= ~0x0fffULL;
+ to->clean = from->clean;
#ifdef CONFIG_KVM_HYPERV
/* Hyper-V extensions (Enlightened VMCB) */
if (kvm_hv_hypercall_enabled(vcpu)) {
- to->clean = from->clean;
memcpy(&to->hv_enlightenments, &from->hv_enlightenments,
sizeof(to->hv_enlightenments));
}
@@ -502,17 +536,34 @@ void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to,
struct vmcb_save_area *from)
{
- /*
- * Copy only fields that are validated, as we need them
- * to avoid TOC/TOU races.
- */
+ to->es = from->es;
+ to->cs = from->cs;
+ to->ss = from->ss;
+ to->ds = from->ds;
+ to->gdtr = from->gdtr;
+ to->idtr = from->idtr;
+
+ to->cpl = from->cpl;
+
to->efer = from->efer;
- to->cr0 = from->cr0;
- to->cr3 = from->cr3;
to->cr4 = from->cr4;
-
- to->dr6 = from->dr6;
+ to->cr3 = from->cr3;
+ to->cr0 = from->cr0;
to->dr7 = from->dr7;
+ to->dr6 = from->dr6;
+
+ to->rflags = from->rflags;
+ to->rip = from->rip;
+ to->rsp = from->rsp;
+
+ to->s_cet = from->s_cet;
+ to->ssp = from->ssp;
+ to->isst_addr = from->isst_addr;
+
+ to->rax = from->rax;
+ to->cr2 = from->cr2;
+
+ svm_copy_lbrs(to, from);
}
void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
@@ -530,6 +581,7 @@ void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
u32 mask;
svm->nested.ctl.event_inj = svm->vmcb->control.event_inj;
svm->nested.ctl.event_inj_err = svm->vmcb->control.event_inj_err;
+ svm->nested.ctl.int_state = svm->vmcb->control.int_state;
/* Only a few fields of int_ctl are written by the processor. */
mask = V_IRQ_MASK | V_TPR_MASK;
@@ -542,7 +594,7 @@ void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
* int_ctl (because it was never recognized while L2 was running).
*/
if (svm_is_intercept(svm, INTERCEPT_VINTR) &&
- !test_bit(INTERCEPT_VINTR, (unsigned long *)svm->nested.ctl.intercepts))
+ !vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_VINTR))
mask &= ~V_IRQ_MASK;
if (nested_vgif_enabled(svm))
@@ -648,8 +700,16 @@ void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm)
svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat;
}
-static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+static bool nested_vmcb12_has_lbrv(struct kvm_vcpu *vcpu)
+{
+ return guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
+ (to_svm(vcpu)->nested.ctl.misc_ctl2 & SVM_MISC2_ENABLE_V_LBR);
+}
+
+static void nested_vmcb02_prepare_save(struct vcpu_svm *svm)
{
+ struct vmcb_ctrl_area_cached *control = &svm->nested.ctl;
+ struct vmcb_save_area_cached *save = &svm->nested.save;
bool new_vmcb12 = false;
struct vmcb *vmcb01 = svm->vmcb01.ptr;
struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
@@ -665,64 +725,64 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
svm->nested.force_msr_bitmap_recalc = true;
}
- if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) {
- vmcb02->save.es = vmcb12->save.es;
- vmcb02->save.cs = vmcb12->save.cs;
- vmcb02->save.ss = vmcb12->save.ss;
- vmcb02->save.ds = vmcb12->save.ds;
- vmcb02->save.cpl = vmcb12->save.cpl;
+ if (unlikely(new_vmcb12 || vmcb12_is_dirty(control, VMCB_SEG))) {
+ vmcb02->save.es = save->es;
+ vmcb02->save.cs = save->cs;
+ vmcb02->save.ss = save->ss;
+ vmcb02->save.ds = save->ds;
+ vmcb02->save.cpl = save->cpl;
vmcb_mark_dirty(vmcb02, VMCB_SEG);
}
- if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) {
- vmcb02->save.gdtr = vmcb12->save.gdtr;
- vmcb02->save.idtr = vmcb12->save.idtr;
+ if (unlikely(new_vmcb12 || vmcb12_is_dirty(control, VMCB_DT))) {
+ vmcb02->save.gdtr = save->gdtr;
+ vmcb02->save.idtr = save->idtr;
vmcb_mark_dirty(vmcb02, VMCB_DT);
}
if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) &&
- (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_CET)))) {
- vmcb02->save.s_cet = vmcb12->save.s_cet;
- vmcb02->save.isst_addr = vmcb12->save.isst_addr;
- vmcb02->save.ssp = vmcb12->save.ssp;
+ (unlikely(new_vmcb12 || vmcb12_is_dirty(control, VMCB_CET)))) {
+ vmcb02->save.s_cet = save->s_cet;
+ vmcb02->save.isst_addr = save->isst_addr;
+ vmcb02->save.ssp = save->ssp;
vmcb_mark_dirty(vmcb02, VMCB_CET);
}
- kvm_set_rflags(vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
+ kvm_set_rflags(vcpu, save->rflags | X86_EFLAGS_FIXED);
svm_set_efer(vcpu, svm->nested.save.efer);
svm_set_cr0(vcpu, svm->nested.save.cr0);
svm_set_cr4(vcpu, svm->nested.save.cr4);
- svm->vcpu.arch.cr2 = vmcb12->save.cr2;
+ svm->vcpu.arch.cr2 = save->cr2;
- kvm_rax_write(vcpu, vmcb12->save.rax);
- kvm_rsp_write(vcpu, vmcb12->save.rsp);
- kvm_rip_write(vcpu, vmcb12->save.rip);
+ kvm_rax_write(vcpu, save->rax);
+ kvm_rsp_write(vcpu, save->rsp);
+ kvm_rip_write(vcpu, save->rip);
/* In case we don't even reach vcpu_run, the fields are not updated */
- vmcb02->save.rax = vmcb12->save.rax;
- vmcb02->save.rsp = vmcb12->save.rsp;
- vmcb02->save.rip = vmcb12->save.rip;
+ vmcb02->save.rax = save->rax;
+ vmcb02->save.rsp = save->rsp;
+ vmcb02->save.rip = save->rip;
- if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) {
+ if (unlikely(new_vmcb12 || vmcb12_is_dirty(control, VMCB_DR))) {
vmcb02->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1;
svm->vcpu.arch.dr6 = svm->nested.save.dr6 | DR6_ACTIVE_LOW;
vmcb_mark_dirty(vmcb02, VMCB_DR);
}
- if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
- (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
+ if (nested_vmcb12_has_lbrv(vcpu)) {
/*
* Reserved bits of DEBUGCTL are ignored. Be consistent with
* svm_set_msr's definition of reserved bits.
*/
- svm_copy_lbrs(vmcb02, vmcb12);
+ svm_copy_lbrs(&vmcb02->save, save);
vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS;
} else {
- svm_copy_lbrs(vmcb02, vmcb01);
+ svm_copy_lbrs(&vmcb02->save, &vmcb01->save);
}
+ vmcb_mark_dirty(vmcb02, VMCB_LBR);
svm_update_lbrv(&svm->vcpu);
}
@@ -750,18 +810,16 @@ static bool is_evtinj_nmi(u32 evtinj)
return type == SVM_EVTINJ_TYPE_NMI;
}
-static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
- unsigned long vmcb12_rip,
- unsigned long vmcb12_csbase)
+static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
{
u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK;
u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK;
- struct kvm_vcpu *vcpu = &svm->vcpu;
- struct vmcb *vmcb01 = svm->vmcb01.ptr;
+ struct vmcb_ctrl_area_cached *vmcb12_ctrl = &svm->nested.ctl;
struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
- u32 pause_count12;
- u32 pause_thresh12;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u32 pause_count12, pause_thresh12;
nested_svm_transition_tlb_flush(vcpu);
@@ -774,7 +832,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
*/
if (guest_cpu_cap_has(vcpu, X86_FEATURE_VGIF) &&
- (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK))
+ (vmcb12_ctrl->int_ctl & V_GIF_ENABLE_MASK))
int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
else
int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
@@ -790,8 +848,16 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
V_NMI_BLOCKING_MASK);
}
- /* Copied from vmcb01. msrpm_base can be overwritten later. */
- vmcb02->control.nested_ctl = vmcb01->control.nested_ctl;
+ /*
+ * Copied from vmcb01. msrpm_base can be overwritten later.
+ *
+ * SVM_MISC_ENABLE_NP in vmcb12 is only used for consistency checks. If
+ * L1 enables NPTs, KVM shadows L1's NPTs and uses those to run L2. If
+ * L1 disables NPT, KVM runs L2 with the same NPTs used to run L1. For
+ * the latter, L1 runs L2 with shadow page tables that translate L2 GVAs
+ * to L1 GPAs, so the same NPTs can be used for L1 and L2.
+ */
+ vmcb02->control.misc_ctl = vmcb01->control.misc_ctl & SVM_MISC_ENABLE_NP;
vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
vmcb_mark_dirty(vmcb02, VMCB_PERM_MAP);
@@ -818,7 +884,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
* L1 re-enters L2, the same instruction will trigger a VM-Exit and the
* entire cycle start over.
*/
- if (vmcb02->save.rip && (svm->nested.ctl.bus_lock_rip == vmcb02->save.rip))
+ if (vmcb02->save.rip && (svm->nested.last_bus_lock_rip == vmcb02->save.rip))
vmcb02->control.bus_lock_counter = 1;
else
vmcb02->control.bus_lock_counter = 0;
@@ -832,10 +898,9 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
if (nested_npt_enabled(svm))
nested_svm_init_mmu_context(vcpu);
- vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
- vcpu->arch.l1_tsc_offset,
- svm->nested.ctl.tsc_offset,
- svm->tsc_ratio_msr);
+ vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(vcpu->arch.l1_tsc_offset,
+ vmcb12_ctrl->tsc_offset,
+ svm->tsc_ratio_msr);
vmcb02->control.tsc_offset = vcpu->arch.tsc_offset;
@@ -844,49 +909,49 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
nested_svm_update_tsc_ratio_msr(vcpu);
vmcb02->control.int_ctl =
- (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) |
+ (vmcb12_ctrl->int_ctl & int_ctl_vmcb12_bits) |
(vmcb01->control.int_ctl & int_ctl_vmcb01_bits);
- vmcb02->control.int_vector = svm->nested.ctl.int_vector;
- vmcb02->control.int_state = svm->nested.ctl.int_state;
- vmcb02->control.event_inj = svm->nested.ctl.event_inj;
- vmcb02->control.event_inj_err = svm->nested.ctl.event_inj_err;
+ vmcb02->control.int_vector = vmcb12_ctrl->int_vector;
+ vmcb02->control.int_state = vmcb12_ctrl->int_state;
+ vmcb02->control.event_inj = vmcb12_ctrl->event_inj;
+ vmcb02->control.event_inj_err = vmcb12_ctrl->event_inj_err;
/*
- * next_rip is consumed on VMRUN as the return address pushed on the
- * stack for injected soft exceptions/interrupts. If nrips is exposed
- * to L1, take it verbatim from vmcb12. If nrips is supported in
- * hardware but not exposed to L1, stuff the actual L2 RIP to emulate
- * what a nrips=0 CPU would do (L1 is responsible for advancing RIP
- * prior to injecting the event).
+ * If nrips is exposed to L1, take NextRIP as-is. Otherwise, L1
+ * advances L2's RIP before VMRUN instead of using NextRIP. KVM will
+ * stuff the current RIP as vmcb02's NextRIP before L2 is run. After
+ * the first run of L2 (e.g. after save+restore), NextRIP is updated by
+ * the CPU and/or KVM and should be used regardless of L1's support.
*/
- if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
- vmcb02->control.next_rip = svm->nested.ctl.next_rip;
- else if (boot_cpu_has(X86_FEATURE_NRIPS))
- vmcb02->control.next_rip = vmcb12_rip;
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS) ||
+ !vcpu->arch.nested_run_pending)
+ vmcb02->control.next_rip = vmcb12_ctrl->next_rip;
svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj);
+
+ /*
+ * soft_int_csbase, soft_int_old_rip, and soft_int_next_rip (if L1
+ * doesn't have NRIPS) are initialized later, before the vCPU is run.
+ */
if (is_evtinj_soft(vmcb02->control.event_inj)) {
svm->soft_int_injected = true;
- svm->soft_int_csbase = vmcb12_csbase;
- svm->soft_int_old_rip = vmcb12_rip;
- if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
- svm->soft_int_next_rip = svm->nested.ctl.next_rip;
- else
- svm->soft_int_next_rip = vmcb12_rip;
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS) ||
+ !vcpu->arch.nested_run_pending)
+ svm->soft_int_next_rip = vmcb12_ctrl->next_rip;
}
- /* LBR_CTL_ENABLE_MASK is controlled by svm_update_lbrv() */
+ /* SVM_MISC2_ENABLE_V_LBR is controlled by svm_update_lbrv() */
if (!nested_vmcb_needs_vls_intercept(svm))
- vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+ vmcb02->control.misc_ctl2 |= SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE;
if (guest_cpu_cap_has(vcpu, X86_FEATURE_PAUSEFILTER))
- pause_count12 = svm->nested.ctl.pause_filter_count;
+ pause_count12 = vmcb12_ctrl->pause_filter_count;
else
pause_count12 = 0;
if (guest_cpu_cap_has(vcpu, X86_FEATURE_PFTHRESHOLD))
- pause_thresh12 = svm->nested.ctl.pause_filter_thresh;
+ pause_thresh12 = vmcb12_ctrl->pause_filter_thresh;
else
pause_thresh12 = 0;
if (kvm_pause_in_guest(svm->vcpu.kvm)) {
@@ -900,7 +965,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh;
/* ... but ensure filtering is disabled if so requested. */
- if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) {
+ if (vmcb12_is_intercept(vmcb12_ctrl, INTERCEPT_PAUSE)) {
if (!pause_count12)
vmcb02->control.pause_filter_count = 0;
if (!pause_thresh12)
@@ -917,7 +982,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
* L2 is the "guest").
*/
if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS))
- vmcb02->control.erap_ctl = (svm->nested.ctl.erap_ctl &
+ vmcb02->control.erap_ctl = (vmcb12_ctrl->erap_ctl &
ERAP_CONTROL_ALLOW_LARGER_RAP) |
ERAP_CONTROL_CLEAR_RAP;
@@ -925,7 +990,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
* Merge guest and host intercepts - must be called with vcpu in
* guest-mode to take effect.
*/
- recalc_intercepts(svm);
+ nested_vmcb02_recalc_intercepts(svm);
}
static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
@@ -940,28 +1005,29 @@ static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to
to_vmcb->save.spec_ctrl = from_vmcb->save.spec_ctrl;
}
-int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
- struct vmcb *vmcb12, bool from_vmrun)
+int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, bool from_vmrun)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_ctrl_area_cached *control = &svm->nested.ctl;
+ struct vmcb_save_area_cached *save = &svm->nested.save;
int ret;
trace_kvm_nested_vmenter(svm->vmcb->save.rip,
vmcb12_gpa,
- vmcb12->save.rip,
- vmcb12->control.int_ctl,
- vmcb12->control.event_inj,
- vmcb12->control.nested_ctl,
- vmcb12->control.nested_cr3,
- vmcb12->save.cr3,
+ save->rip,
+ control->int_ctl,
+ control->event_inj,
+ control->misc_ctl,
+ control->nested_cr3,
+ save->cr3,
KVM_ISA_SVM);
- trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff,
- vmcb12->control.intercepts[INTERCEPT_CR] >> 16,
- vmcb12->control.intercepts[INTERCEPT_EXCEPTION],
- vmcb12->control.intercepts[INTERCEPT_WORD3],
- vmcb12->control.intercepts[INTERCEPT_WORD4],
- vmcb12->control.intercepts[INTERCEPT_WORD5]);
+ trace_kvm_nested_intercepts(control->intercepts[INTERCEPT_CR] & 0xffff,
+ control->intercepts[INTERCEPT_CR] >> 16,
+ control->intercepts[INTERCEPT_EXCEPTION],
+ control->intercepts[INTERCEPT_WORD3],
+ control->intercepts[INTERCEPT_WORD4],
+ control->intercepts[INTERCEPT_WORD5]);
svm->nested.vmcb12_gpa = vmcb12_gpa;
@@ -971,8 +1037,8 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr);
svm_switch_vmcb(svm, &svm->nested.vmcb02);
- nested_vmcb02_prepare_control(svm, vmcb12->save.rip, vmcb12->save.cs.base);
- nested_vmcb02_prepare_save(svm, vmcb12);
+ nested_vmcb02_prepare_control(svm);
+ nested_vmcb02_prepare_save(svm);
ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3,
nested_npt_enabled(svm), from_vmrun);
@@ -992,12 +1058,38 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
return 0;
}
+static int nested_svm_copy_vmcb12_to_cache(struct kvm_vcpu *vcpu, u64 vmcb12_gpa)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_host_map map;
+ struct vmcb *vmcb12;
+ int r = 0;
+
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map))
+ return -EFAULT;
+
+ vmcb12 = map.hva;
+ nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
+ nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
+
+ if (nested_svm_check_cached_vmcb12(vcpu) < 0) {
+ vmcb12->control.exit_code = SVM_EXIT_ERR;
+ vmcb12->control.exit_info_1 = 0;
+ vmcb12->control.exit_info_2 = 0;
+ vmcb12->control.event_inj = 0;
+ vmcb12->control.event_inj_err = 0;
+ svm_set_gif(svm, false);
+ r = -EINVAL;
+ }
+
+ kvm_vcpu_unmap(vcpu, &map);
+ return r;
+}
+
int nested_svm_vmrun(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
int ret;
- struct vmcb *vmcb12;
- struct kvm_host_map map;
u64 vmcb12_gpa;
struct vmcb *vmcb01 = svm->vmcb01.ptr;
@@ -1018,32 +1110,27 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
return ret;
}
- vmcb12_gpa = svm->vmcb->save.rax;
- ret = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map);
- if (ret == -EINVAL) {
+ if (WARN_ON_ONCE(!svm->nested.initialized))
+ return -EINVAL;
+
+ vmcb12_gpa = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ if (!page_address_valid(vcpu, vmcb12_gpa)) {
kvm_inject_gp(vcpu, 0);
return 1;
- } else if (ret) {
- return kvm_skip_emulated_instruction(vcpu);
}
- ret = kvm_skip_emulated_instruction(vcpu);
-
- vmcb12 = map.hva;
-
- if (WARN_ON_ONCE(!svm->nested.initialized))
- return -EINVAL;
-
- nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
- nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
+ ret = nested_svm_copy_vmcb12_to_cache(vcpu, vmcb12_gpa);
+ if (ret) {
+ if (ret == -EFAULT)
+ return kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL);
- if (nested_svm_check_cached_vmcb12(vcpu) < 0) {
- vmcb12->control.exit_code = SVM_EXIT_ERR;
- vmcb12->control.exit_info_1 = 0;
- vmcb12->control.exit_info_2 = 0;
- goto out;
+ /* Advance RIP past VMRUN as part of the nested #VMEXIT. */
+ return kvm_skip_emulated_instruction(vcpu);
}
+ /* At this point, VMRUN is guaranteed to not fault; advance RIP. */
+ ret = kvm_skip_emulated_instruction(vcpu);
+
/*
* Since vmcb01 is not in use, we can use it to store some of the L1
* state.
@@ -1057,27 +1144,20 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
if (!npt_enabled)
vmcb01->save.cr3 = kvm_read_cr3(vcpu);
- svm->nested.nested_run_pending = 1;
-
- if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, true))
- goto out_exit_err;
-
- if (nested_svm_merge_msrpm(vcpu))
- goto out;
-
-out_exit_err:
- svm->nested.nested_run_pending = 0;
- svm->nmi_l1_to_l2 = false;
- svm->soft_int_injected = false;
+ vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING;
- svm->vmcb->control.exit_code = SVM_EXIT_ERR;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
+ if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true) ||
+ !nested_svm_merge_msrpm(vcpu)) {
+ vcpu->arch.nested_run_pending = 0;
+ svm->nmi_l1_to_l2 = false;
+ svm->soft_int_injected = false;
- nested_svm_vmexit(svm);
+ svm->vmcb->control.exit_code = SVM_EXIT_ERR;
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
-out:
- kvm_vcpu_unmap(vcpu, &map);
+ nested_svm_vmexit(svm);
+ }
return ret;
}
@@ -1107,6 +1187,11 @@ void svm_copy_vmrun_state(struct vmcb_save_area *to_save,
to_save->isst_addr = from_save->isst_addr;
to_save->ssp = from_save->ssp;
}
+
+ if (kvm_cpu_cap_has(X86_FEATURE_LBRV)) {
+ svm_copy_lbrs(to_save, from_save);
+ to_save->dbgctl &= ~DEBUGCTL_RESERVED_BITS;
+ }
}
void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
@@ -1125,36 +1210,20 @@ void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
}
-int nested_svm_vmexit(struct vcpu_svm *svm)
+static int nested_svm_vmexit_update_vmcb12(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
- struct vmcb *vmcb01 = svm->vmcb01.ptr;
+ struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
- struct vmcb *vmcb12;
struct kvm_host_map map;
+ struct vmcb *vmcb12;
int rc;
rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
- if (rc) {
- if (rc == -EINVAL)
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
+ if (rc)
+ return rc;
vmcb12 = map.hva;
- /* Exit Guest-Mode */
- leave_guest_mode(vcpu);
- svm->nested.vmcb12_gpa = 0;
- WARN_ON_ONCE(svm->nested.nested_run_pending);
-
- kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
-
- /* in case we halted in L2 */
- kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
-
- /* Give the current vmcb to the guest */
-
vmcb12->save.es = vmcb02->save.es;
vmcb12->save.cs = vmcb02->save.cs;
vmcb12->save.ss = vmcb02->save.ss;
@@ -1164,7 +1233,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb12->save.efer = svm->vcpu.arch.efer;
vmcb12->save.cr0 = kvm_read_cr0(vcpu);
vmcb12->save.cr3 = kvm_read_cr3(vcpu);
- vmcb12->save.cr2 = vmcb02->save.cr2;
+ vmcb12->save.cr2 = vcpu->arch.cr2;
vmcb12->save.cr4 = svm->vcpu.arch.cr4;
vmcb12->save.rflags = kvm_get_rflags(vcpu);
vmcb12->save.rip = kvm_rip_read(vcpu);
@@ -1191,9 +1260,43 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
vmcb12->control.next_rip = vmcb02->control.next_rip;
+ if (nested_vmcb12_has_lbrv(vcpu))
+ svm_copy_lbrs(&vmcb12->save, &vmcb02->save);
+
+ vmcb12->control.event_inj = 0;
+ vmcb12->control.event_inj_err = 0;
vmcb12->control.int_ctl = svm->nested.ctl.int_ctl;
- vmcb12->control.event_inj = svm->nested.ctl.event_inj;
- vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err;
+
+ trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
+ vmcb12->control.exit_info_1,
+ vmcb12->control.exit_info_2,
+ vmcb12->control.exit_int_info,
+ vmcb12->control.exit_int_info_err,
+ KVM_ISA_SVM);
+
+ kvm_vcpu_unmap(vcpu, &map);
+ return 0;
+}
+
+void nested_svm_vmexit(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+ struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
+
+ if (nested_svm_vmexit_update_vmcb12(vcpu))
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+
+ /* Exit Guest-Mode */
+ leave_guest_mode(vcpu);
+ svm->nested.vmcb12_gpa = 0;
+
+ kvm_warn_on_nested_run_pending(vcpu);
+
+ kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+
+ /* in case we halted in L2 */
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
if (!kvm_pause_in_guest(vcpu->kvm)) {
vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count;
@@ -1202,11 +1305,11 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
}
/*
- * Invalidate bus_lock_rip unless KVM is still waiting for the guest
- * to make forward progress before re-enabling bus lock detection.
+ * Invalidate last_bus_lock_rip unless KVM is still waiting for the
+ * guest to make forward progress before re-enabling bus lock detection.
*/
if (!vmcb02->control.bus_lock_counter)
- svm->nested.ctl.bus_lock_rip = INVALID_GPA;
+ svm->nested.last_bus_lock_rip = INVALID_GPA;
nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
@@ -1239,11 +1342,10 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
if (!nested_exit_on_intr(svm))
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
- if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
- (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)))
- svm_copy_lbrs(vmcb12, vmcb02);
- else
- svm_copy_lbrs(vmcb01, vmcb02);
+ if (!nested_vmcb12_has_lbrv(vcpu)) {
+ svm_copy_lbrs(&vmcb01->save, &vmcb02->save);
+ vmcb_mark_dirty(vmcb01, VMCB_LBR);
+ }
svm_update_lbrv(vcpu);
@@ -1296,22 +1398,16 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
svm->vcpu.arch.dr7 = DR7_FIXED_1;
kvm_update_dr7(&svm->vcpu);
- trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
- vmcb12->control.exit_info_1,
- vmcb12->control.exit_info_2,
- vmcb12->control.exit_int_info,
- vmcb12->control.exit_int_info_err,
- KVM_ISA_SVM);
-
- kvm_vcpu_unmap(vcpu, &map);
-
nested_svm_transition_tlb_flush(vcpu);
nested_svm_uninit_mmu_context(vcpu);
- rc = nested_svm_load_cr3(vcpu, vmcb01->save.cr3, false, true);
- if (rc)
- return 1;
+ if (nested_svm_load_cr3(vcpu, vmcb01->save.cr3, false, true))
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+
+ /* Drop tracking for L1->L2 injected NMIs and soft IRQs */
+ svm->nmi_l1_to_l2 = false;
+ svm->soft_int_injected = false;
/*
* Drop what we picked up for L2 via svm_complete_interrupts() so it
@@ -1336,8 +1432,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
*/
if (kvm_apicv_activated(vcpu->kvm))
__kvm_vcpu_update_apicv(vcpu);
-
- return 0;
}
static void nested_svm_triple_fault(struct kvm_vcpu *vcpu)
@@ -1407,7 +1501,7 @@ void svm_leave_nested(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
if (is_guest_mode(vcpu)) {
- svm->nested.nested_run_pending = 0;
+ vcpu->arch.nested_run_pending = 0;
svm->nested.vmcb12_gpa = INVALID_GPA;
leave_guest_mode(vcpu);
@@ -1592,7 +1686,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
* previously injected event, the pending exception occurred while said
* event was being delivered and thus needs to be handled.
*/
- bool block_nested_exceptions = svm->nested.nested_run_pending;
+ bool block_nested_exceptions = vcpu->arch.nested_run_pending;
/*
* New events (not exceptions) are only recognized at instruction
* boundaries. If an event needs reinjection, then KVM is handling a
@@ -1682,9 +1776,7 @@ int nested_svm_exit_special(struct vcpu_svm *svm)
}
case SVM_EXIT_VMMCALL:
/* Hyper-V L2 TLB flush hypercall is handled by L0 */
- if (guest_hv_cpuid_has_l2_tlb_flush(vcpu) &&
- nested_svm_l2_tlb_flush_enabled(vcpu) &&
- kvm_hv_is_tlb_flush_hcall(vcpu))
+ if (nested_svm_is_l2_tlb_flush_hcall(vcpu))
return NESTED_EXIT_HOST;
break;
default:
@@ -1729,12 +1821,12 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst,
dst->exit_info_2 = from->exit_info_2;
dst->exit_int_info = from->exit_int_info;
dst->exit_int_info_err = from->exit_int_info_err;
- dst->nested_ctl = from->nested_ctl;
+ dst->misc_ctl = from->misc_ctl;
dst->event_inj = from->event_inj;
dst->event_inj_err = from->event_inj_err;
dst->next_rip = from->next_rip;
- dst->nested_cr3 = from->nested_cr3;
- dst->virt_ext = from->virt_ext;
+ dst->nested_cr3 = from->nested_cr3;
+ dst->misc_ctl2 = from->misc_ctl2;
dst->pause_filter_count = from->pause_filter_count;
dst->pause_filter_thresh = from->pause_filter_thresh;
/* 'clean' and 'hv_enlightenments' are not changed by KVM */
@@ -1769,7 +1861,7 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu,
kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE;
kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
- if (svm->nested.nested_run_pending)
+ if (vcpu->arch.nested_run_pending)
kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
}
@@ -1869,12 +1961,12 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
ret = -EINVAL;
__nested_copy_vmcb_control_to_cache(vcpu, &ctl_cached, ctl);
- if (!__nested_vmcb_check_controls(vcpu, &ctl_cached))
+ if (!nested_vmcb_check_controls(vcpu, &ctl_cached))
goto out_free;
/*
* Processor state contains L2 state. Check that it is
- * valid for guest mode (see nested_vmcb_check_save).
+ * valid for guest mode (see nested_vmcb_check_save()).
*/
cr0 = kvm_read_cr0(vcpu);
if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW))
@@ -1888,7 +1980,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
if (!(save->cr0 & X86_CR0_PG) ||
!(save->cr0 & X86_CR0_PE) ||
(save->rflags & X86_EFLAGS_VM) ||
- !__nested_vmcb_check_save(vcpu, &save_cached))
+ !nested_vmcb_check_save(vcpu, &save_cached))
goto out_free;
@@ -1906,8 +1998,10 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
- svm->nested.nested_run_pending =
- !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
+ if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING)
+ vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING_UNTRUSTED;
+ else
+ vcpu->arch.nested_run_pending = 0;
svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
@@ -1915,7 +2009,13 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
nested_copy_vmcb_control_to_cache(svm, ctl);
svm_switch_vmcb(svm, &svm->nested.vmcb02);
- nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip, svm->vmcb->save.cs.base);
+ nested_vmcb02_prepare_control(svm);
+
+ /*
+ * Any previously restored state (e.g. KVM_SET_SREGS) would mark fields
+ * dirty in vmcb01 instead of vmcb02, so mark all of vmcb02 dirty here.
+ */
+ vmcb_mark_all_dirty(svm->vmcb);
/*
* While the nested guest CR3 is already checked and set by
@@ -1930,6 +2030,9 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
svm->nested.force_msr_bitmap_recalc = true;
+ if (kvm_vcpu_apicv_active(vcpu))
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+
kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
ret = 0;
out_free:
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 52517ef55ace..b1aa85a6ca5a 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -4591,7 +4591,7 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm, bool init_event)
struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm);
struct vmcb *vmcb = svm->vmcb01.ptr;
- svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE;
+ svm->vmcb->control.misc_ctl |= SVM_MISC_ENABLE_SEV_ES;
/*
* An SEV-ES guest requires a VMSA area that is a separate from the
@@ -4631,7 +4631,7 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm, bool init_event)
if (!sev_vcpu_has_debug_swap(svm)) {
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
- recalc_intercepts(svm);
+ svm_mark_intercepts_dirty(svm);
} else {
/*
* Disable #DB intercept iff DebugSwap is enabled. KVM doesn't
@@ -4662,7 +4662,7 @@ void sev_init_vmcb(struct vcpu_svm *svm, bool init_event)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
- svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
+ svm->vmcb->control.misc_ctl |= SVM_MISC_ENABLE_SEV;
clr_exception_intercept(svm, UD_VECTOR);
/*
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index d304568588c7..07ed964dacf5 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -52,6 +52,7 @@
#include "svm.h"
#include "svm_ops.h"
+#include "hyperv.h"
#include "kvm_onhyperv.h"
#include "svm_onhyperv.h"
@@ -216,6 +217,19 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
if (!(efer & EFER_SVME)) {
+ /*
+ * Architecturally, clearing EFER.SVME while a guest is
+ * running yields undefined behavior, i.e. KVM can do
+ * literally anything. Force the vCPU back into L1 as
+ * that is the safest option for KVM, but synthesize a
+ * triple fault (for L1!) so that KVM at least doesn't
+ * run random L2 code in the context of L1. Do so if
+ * and only if the vCPU is actively running, e.g. to
+ * avoid false positives if userspace is stuffing state.
+ */
+ if (is_guest_mode(vcpu) && vcpu->wants_to_run)
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+
svm_leave_nested(vcpu);
/* #GP intercept is still needed for vmware backdoor */
if (!enable_vmware_backdoor)
@@ -244,6 +258,8 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
set_exception_intercept(svm, GP_VECTOR);
}
+
+ kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu);
}
svm->vmcb->save.efer = efer | EFER_SVME;
@@ -635,7 +651,7 @@ static void set_dr_intercepts(struct vcpu_svm *svm)
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
- recalc_intercepts(svm);
+ svm_mark_intercepts_dirty(svm);
}
static void clr_dr_intercepts(struct vcpu_svm *svm)
@@ -644,7 +660,7 @@ static void clr_dr_intercepts(struct vcpu_svm *svm)
vmcb->control.intercepts[INTERCEPT_DR] = 0;
- recalc_intercepts(svm);
+ svm_mark_intercepts_dirty(svm);
}
static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
@@ -710,7 +726,7 @@ void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask)
static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- bool intercept = !(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK);
+ bool intercept = !(svm->vmcb->control.misc_ctl2 & SVM_MISC2_ENABLE_V_LBR);
if (intercept == svm->lbr_msrs_intercepted)
return;
@@ -841,20 +857,9 @@ static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
*/
}
-void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
-{
- to_vmcb->save.dbgctl = from_vmcb->save.dbgctl;
- to_vmcb->save.br_from = from_vmcb->save.br_from;
- to_vmcb->save.br_to = from_vmcb->save.br_to;
- to_vmcb->save.last_excp_from = from_vmcb->save.last_excp_from;
- to_vmcb->save.last_excp_to = from_vmcb->save.last_excp_to;
-
- vmcb_mark_dirty(to_vmcb, VMCB_LBR);
-}
-
static void __svm_enable_lbrv(struct kvm_vcpu *vcpu)
{
- to_svm(vcpu)->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
+ to_svm(vcpu)->vmcb->control.misc_ctl2 |= SVM_MISC2_ENABLE_V_LBR;
}
void svm_enable_lbrv(struct kvm_vcpu *vcpu)
@@ -866,16 +871,16 @@ void svm_enable_lbrv(struct kvm_vcpu *vcpu)
static void __svm_disable_lbrv(struct kvm_vcpu *vcpu)
{
KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm);
- to_svm(vcpu)->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
+ to_svm(vcpu)->vmcb->control.misc_ctl2 &= ~SVM_MISC2_ENABLE_V_LBR;
}
void svm_update_lbrv(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK;
+ bool current_enable_lbrv = svm->vmcb->control.misc_ctl2 & SVM_MISC2_ENABLE_V_LBR;
bool enable_lbrv = (svm->vmcb->save.dbgctl & DEBUGCTLMSR_LBR) ||
(is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
- (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK));
+ (svm->nested.ctl.misc_ctl2 & SVM_MISC2_ENABLE_V_LBR));
if (enable_lbrv && !current_enable_lbrv)
__svm_enable_lbrv(vcpu);
@@ -1009,6 +1014,14 @@ void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu)
preempt_enable();
}
+static bool svm_has_pending_gif_event(struct vcpu_svm *svm)
+{
+ return svm->vcpu.arch.smi_pending ||
+ svm->vcpu.arch.nmi_pending ||
+ kvm_cpu_has_injectable_intr(&svm->vcpu) ||
+ kvm_apic_has_pending_init_or_sipi(&svm->vcpu);
+}
+
/* Evaluate instruction intercepts that depend on guest CPUID features. */
static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu)
{
@@ -1034,22 +1047,50 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu)
}
/*
- * No need to toggle VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK here, it is
- * always set if vls is enabled. If the intercepts are set, the bit is
- * meaningless anyway.
+ * Intercept instructions that #UD if EFER.SVME=0, as SVME must be set
+ * even when running the guest, i.e. hardware will only ever see
+ * EFER.SVME=1.
+ *
+ * No need to toggle any of the vgif/vls/etc. enable bits here, as they
+ * are set when the VMCB is initialized and never cleared (if the
+ * relevant intercepts are set, the enablements are meaningless anyway).
+ *
+ * FIXME: When #GP is not intercepted, a #GP on these instructions (e.g.
+ * due to CPL > 0) could be injected by hardware before the instruction
+ * is intercepted, leading to #GP taking precedence over #UD from the
+ * guest's perspective.
*/
- if (guest_cpuid_is_intel_compatible(vcpu)) {
+ if (!(vcpu->arch.efer & EFER_SVME)) {
svm_set_intercept(svm, INTERCEPT_VMLOAD);
svm_set_intercept(svm, INTERCEPT_VMSAVE);
+ svm_set_intercept(svm, INTERCEPT_CLGI);
+ svm_set_intercept(svm, INTERCEPT_STGI);
} else {
/*
* If hardware supports Virtual VMLOAD VMSAVE then enable it
* in VMCB and clear intercepts to avoid #VMEXIT.
*/
- if (vls) {
+ if (guest_cpuid_is_intel_compatible(vcpu)) {
+ svm_set_intercept(svm, INTERCEPT_VMLOAD);
+ svm_set_intercept(svm, INTERCEPT_VMSAVE);
+ } else if (vls) {
svm_clr_intercept(svm, INTERCEPT_VMLOAD);
svm_clr_intercept(svm, INTERCEPT_VMSAVE);
}
+
+ /*
+ * Process pending events when clearing STGI/CLGI intercepts if
+ * there's at least one pending event that is masked by GIF, so
+ * that KVM re-evaluates if the intercept needs to be set again
+ * to track when GIF is re-enabled (e.g. for NMI injection).
+ */
+ if (vgif) {
+ svm_clr_intercept(svm, INTERCEPT_CLGI);
+ svm_clr_intercept(svm, INTERCEPT_STGI);
+
+ if (svm_has_pending_gif_event(svm))
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ }
}
if (kvm_need_rdpmc_intercept(vcpu))
@@ -1162,7 +1203,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event)
if (npt_enabled) {
/* Setup VMCB for Nested Paging */
- control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
+ control->misc_ctl |= SVM_MISC_ENABLE_NP;
svm_clr_intercept(svm, INTERCEPT_INVLPG);
clr_exception_intercept(svm, PF_VECTOR);
svm_clr_intercept(svm, INTERCEPT_CR3_READ);
@@ -1194,14 +1235,11 @@ static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event)
if (vnmi)
svm->vmcb->control.int_ctl |= V_NMI_ENABLE_MASK;
- if (vgif) {
- svm_clr_intercept(svm, INTERCEPT_STGI);
- svm_clr_intercept(svm, INTERCEPT_CLGI);
+ if (vgif)
svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
- }
if (vls)
- svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+ svm->vmcb->control.misc_ctl2 |= SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE;
if (vcpu->kvm->arch.bus_lock_detection_enabled)
svm_set_intercept(svm, INTERCEPT_BUSLOCK);
@@ -2151,6 +2189,7 @@ static int intr_interception(struct kvm_vcpu *vcpu)
static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
{
+ u64 vmcb12_gpa = kvm_register_read(vcpu, VCPU_REGS_RAX);
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb12;
struct kvm_host_map map;
@@ -2159,13 +2198,14 @@ static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
if (nested_svm_check_permissions(vcpu))
return 1;
- ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
- if (ret) {
- if (ret == -EINVAL)
- kvm_inject_gp(vcpu, 0);
+ if (!page_address_valid(vcpu, vmcb12_gpa)) {
+ kvm_inject_gp(vcpu, 0);
return 1;
}
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map))
+ return kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL);
+
vmcb12 = map.hva;
ret = kvm_skip_emulated_instruction(vcpu);
@@ -2202,58 +2242,28 @@ static int vmrun_interception(struct kvm_vcpu *vcpu)
return nested_svm_vmrun(vcpu);
}
-enum {
- NONE_SVM_INSTR,
- SVM_INSTR_VMRUN,
- SVM_INSTR_VMLOAD,
- SVM_INSTR_VMSAVE,
-};
-
-/* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */
-static int svm_instr_opcode(struct kvm_vcpu *vcpu)
+/* Return 0 if not SVM instr, otherwise return associated exit_code */
+static u64 svm_get_decoded_instr_exit_code(struct kvm_vcpu *vcpu)
{
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
- return NONE_SVM_INSTR;
+ return 0;
+
+ BUILD_BUG_ON(!SVM_EXIT_VMRUN || !SVM_EXIT_VMLOAD || !SVM_EXIT_VMSAVE);
switch (ctxt->modrm) {
case 0xd8: /* VMRUN */
- return SVM_INSTR_VMRUN;
+ return SVM_EXIT_VMRUN;
case 0xda: /* VMLOAD */
- return SVM_INSTR_VMLOAD;
+ return SVM_EXIT_VMLOAD;
case 0xdb: /* VMSAVE */
- return SVM_INSTR_VMSAVE;
+ return SVM_EXIT_VMSAVE;
default:
break;
}
- return NONE_SVM_INSTR;
-}
-
-static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
-{
- const int guest_mode_exit_codes[] = {
- [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
- [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
- [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
- };
- int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
- [SVM_INSTR_VMRUN] = vmrun_interception,
- [SVM_INSTR_VMLOAD] = vmload_interception,
- [SVM_INSTR_VMSAVE] = vmsave_interception,
- };
- struct vcpu_svm *svm = to_svm(vcpu);
- int ret;
-
- if (is_guest_mode(vcpu)) {
- /* Returns '1' or -errno on failure, '0' on success. */
- ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
- if (ret)
- return ret;
- return 1;
- }
- return svm_instr_handlers[opcode](vcpu);
+ return 0;
}
/*
@@ -2268,7 +2278,7 @@ static int gp_interception(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
u32 error_code = svm->vmcb->control.exit_info_1;
- int opcode;
+ u64 svm_exit_code;
/* Both #GP cases have zero error_code */
if (error_code)
@@ -2278,27 +2288,37 @@ static int gp_interception(struct kvm_vcpu *vcpu)
if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
goto reinject;
- opcode = svm_instr_opcode(vcpu);
+ /* FIXME: Handle SVM instructions through the emulator */
+ svm_exit_code = svm_get_decoded_instr_exit_code(vcpu);
+ if (svm_exit_code) {
+ if (!is_guest_mode(vcpu))
+ return svm_invoke_exit_handler(vcpu, svm_exit_code);
- if (opcode == NONE_SVM_INSTR) {
- if (!enable_vmware_backdoor)
+ if (nested_svm_check_permissions(vcpu))
+ return 1;
+
+ if (!page_address_valid(vcpu, kvm_register_read(vcpu, VCPU_REGS_RAX)))
goto reinject;
/*
- * VMware backdoor emulation on #GP interception only handles
- * IN{S}, OUT{S}, and RDPMC.
+ * FIXME: Only synthesize a #VMEXIT if L1 sets the intercept,
+ * but only after the VMLOAD/VMSAVE exit handlers can properly
+ * handle VMLOAD/VMSAVE from L2 with VLS enabled in L1 (i.e.
+ * RAX is an L2 GPA that needs translation through L1's NPT).
*/
- if (!is_guest_mode(vcpu))
- return kvm_emulate_instruction(vcpu,
- EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
- } else {
- /* All SVM instructions expect page aligned RAX */
- if (svm->vmcb->save.rax & ~PAGE_MASK)
- goto reinject;
-
- return emulate_svm_instr(vcpu, opcode);
+ nested_svm_simple_vmexit(svm, svm_exit_code);
+ return 1;
}
+ /*
+ * VMware backdoor emulation on #GP interception only handles
+ * IN{S}, OUT{S}, and RDPMC, and only for L1.
+ */
+ if (!enable_vmware_backdoor || is_guest_mode(vcpu))
+ goto reinject;
+
+ return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
+
reinject:
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
return 1;
@@ -2319,10 +2339,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value)
svm_clear_vintr(svm);
enable_gif(svm);
- if (svm->vcpu.arch.smi_pending ||
- svm->vcpu.arch.nmi_pending ||
- kvm_cpu_has_injectable_intr(&svm->vcpu) ||
- kvm_apic_has_pending_init_or_sipi(&svm->vcpu))
+ if (svm_has_pending_gif_event(svm))
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
} else {
disable_gif(svm);
@@ -2366,6 +2383,9 @@ static int invlpga_interception(struct kvm_vcpu *vcpu)
gva_t gva = kvm_rax_read(vcpu);
u32 asid = kvm_rcx_read(vcpu);
+ if (nested_svm_check_permissions(vcpu))
+ return 1;
+
/* FIXME: Handle an address size prefix. */
if (!is_long_mode(vcpu))
gva = (u32)gva;
@@ -2723,6 +2743,24 @@ static int svm_get_feature_msr(u32 msr, u64 *data)
return 0;
}
+static u64 *svm_vmcb_lbr(struct vcpu_svm *svm, u32 msr)
+{
+ switch (msr) {
+ case MSR_IA32_LASTBRANCHFROMIP:
+ return &svm->vmcb->save.br_from;
+ case MSR_IA32_LASTBRANCHTOIP:
+ return &svm->vmcb->save.br_to;
+ case MSR_IA32_LASTINTFROMIP:
+ return &svm->vmcb->save.last_excp_from;
+ case MSR_IA32_LASTINTTOIP:
+ return &svm->vmcb->save.last_excp_to;
+ default:
+ break;
+ }
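+	/*
+	 * Unreachable for supported LBR MSRs; return a stable pointer so that
+	 * a buggy caller doesn't dereference NULL.
+	 */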
+ KVM_BUG_ON(1, svm->vcpu.kvm);
+ return &svm->vmcb->save.br_from;
+}
+
static bool sev_es_prevent_msr_access(struct kvm_vcpu *vcpu,
struct msr_data *msr_info)
{
@@ -2796,19 +2834,13 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = svm->tsc_aux;
break;
case MSR_IA32_DEBUGCTLMSR:
- msr_info->data = svm->vmcb->save.dbgctl;
+ msr_info->data = lbrv ? svm->vmcb->save.dbgctl : 0;
break;
case MSR_IA32_LASTBRANCHFROMIP:
- msr_info->data = svm->vmcb->save.br_from;
- break;
case MSR_IA32_LASTBRANCHTOIP:
- msr_info->data = svm->vmcb->save.br_to;
- break;
case MSR_IA32_LASTINTFROMIP:
- msr_info->data = svm->vmcb->save.last_excp_from;
- break;
case MSR_IA32_LASTINTTOIP:
- msr_info->data = svm->vmcb->save.last_excp_to;
+ msr_info->data = lbrv ? *svm_vmcb_lbr(svm, msr_info->index) : 0;
break;
case MSR_VM_HSAVE_PA:
msr_info->data = svm->nested.hsave_msr;
@@ -3083,6 +3115,17 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
svm_update_lbrv(vcpu);
break;
+ case MSR_IA32_LASTBRANCHFROMIP:
+ case MSR_IA32_LASTBRANCHTOIP:
+ case MSR_IA32_LASTINTFROMIP:
+ case MSR_IA32_LASTINTTOIP:
+ if (!lbrv)
+ return KVM_MSR_RET_UNSUPPORTED;
+ if (!msr->host_initiated)
+ return 1;
+ *svm_vmcb_lbr(svm, ecx) = data;
+ vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
+ break;
case MSR_VM_HSAVE_PA:
/*
* Old kernels did not validate the value written to
@@ -3224,11 +3267,27 @@ static int bus_lock_exit(struct kvm_vcpu *vcpu)
vcpu->arch.complete_userspace_io = complete_userspace_buslock;
if (is_guest_mode(vcpu))
- svm->nested.ctl.bus_lock_rip = vcpu->arch.cui_linear_rip;
+ svm->nested.last_bus_lock_rip = vcpu->arch.cui_linear_rip;
return 0;
}
+static int vmmcall_interception(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Inject a #UD if L2 is active and the VMMCALL isn't a Hyper-V TLB
+ * hypercall, as VMMCALL #UDs if it's not intercepted, and this path is
+ * reachable if and only if L1 doesn't want to intercept VMMCALL or has
+ * enabled L0 (KVM) handling of Hyper-V L2 TLB flush hypercalls.
+ */
+ if (is_guest_mode(vcpu) && !nested_svm_is_l2_tlb_flush_hcall(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ return kvm_emulate_hypercall(vcpu);
+}
+
static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_EXIT_READ_CR0] = cr_interception,
[SVM_EXIT_READ_CR3] = cr_interception,
@@ -3279,7 +3338,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_EXIT_TASK_SWITCH] = task_switch_interception,
[SVM_EXIT_SHUTDOWN] = shutdown_interception,
[SVM_EXIT_VMRUN] = vmrun_interception,
- [SVM_EXIT_VMMCALL] = kvm_emulate_hypercall,
+ [SVM_EXIT_VMMCALL] = vmmcall_interception,
[SVM_EXIT_VMLOAD] = vmload_interception,
[SVM_EXIT_VMSAVE] = vmsave_interception,
[SVM_EXIT_STGI] = stgi_interception,
@@ -3354,13 +3413,13 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
- pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
+ pr_err("%-20s%lld\n", "misc_ctl:", control->misc_ctl);
pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
- pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
+ pr_err("%-20s%lld\n", "misc_ctl2:", control->misc_ctl2);
pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
@@ -3638,6 +3697,16 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
return svm_invoke_exit_handler(vcpu, svm->vmcb->control.exit_code);
}
+static void svm_set_nested_run_soft_int_state(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->soft_int_csbase = svm->vmcb->save.cs.base;
+ svm->soft_int_old_rip = kvm_rip_read(vcpu);
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
+ svm->soft_int_next_rip = kvm_rip_read(vcpu);
+}
+
static int pre_svm_run(struct kvm_vcpu *vcpu)
{
struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
@@ -3739,6 +3808,36 @@ static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
svm->vmcb->control.event_inj = intr->nr | SVM_EVTINJ_VALID | type;
}
+static void svm_fixup_nested_rips(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!is_guest_mode(vcpu) || !vcpu->arch.nested_run_pending)
+ return;
+
+ /*
+ * If nrips is supported in hardware but not exposed to L1, stuff the
+ * actual L2 RIP to emulate what a nrips=0 CPU would do (L1 is
+ * responsible for advancing RIP prior to injecting the event). Once L2
+ * runs after L1 executes VMRUN, NextRIP is updated by the CPU and/or
+ * KVM, and this is no longer needed.
+ *
+ * This is done here (as opposed to when preparing vmcb02) to use the
+ * most up-to-date value of RIP regardless of the order of restoring
+ * registers and nested state in the vCPU save+restore path.
+ */
+ if (boot_cpu_has(X86_FEATURE_NRIPS) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
+ svm->vmcb->control.next_rip = kvm_rip_read(vcpu);
+
+ /*
+	 * Similarly, initialize the soft int metadata here to use the most
+ * up-to-date values of RIP and CS base, regardless of restore order.
+ */
+ if (svm->soft_int_injected)
+ svm_set_nested_run_soft_int_state(vcpu);
+}
+
void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
int trig_mode, int vector)
{
@@ -3861,7 +3960,7 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (svm->nested.nested_run_pending)
+ if (vcpu->arch.nested_run_pending)
return -EBUSY;
if (svm_nmi_blocked(vcpu))
@@ -3903,7 +4002,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (svm->nested.nested_run_pending)
+ if (vcpu->arch.nested_run_pending)
return -EBUSY;
if (svm_interrupt_blocked(vcpu))
@@ -4107,6 +4206,18 @@ static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector,
struct vcpu_svm *svm = to_svm(vcpu);
/*
+ * Initialize the soft int fields *before* reading them below if KVM
+ * aborted entry to the guest with a nested VMRUN pending. To ensure
+ * KVM uses up-to-date values for RIP and CS base across save/restore,
+ * regardless of restore order, KVM waits to set the soft int fields
+ * until VMRUN is imminent. But when canceling injection, KVM requeues
+ * the soft int and will reinject it via the standard injection flow,
+ * and so KVM needs to grab the state from the pending nested VMRUN.
+ */
+ if (is_guest_mode(vcpu) && vcpu->arch.nested_run_pending)
+ svm_set_nested_run_soft_int_state(vcpu);
+
+ /*
* If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's
* associated with the original soft exception/interrupt. next_rip is
* cleared on all exits that can occur while vectoring an event, so KVM
@@ -4335,6 +4446,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
kvm_register_is_dirty(vcpu, VCPU_EXREG_ERAPS))
svm->vmcb->control.erap_ctl |= ERAP_CONTROL_CLEAR_RAP;
+ svm_fixup_nested_rips(vcpu);
+
svm_hv_update_vp_id(svm->vmcb, vcpu);
/*
@@ -4355,7 +4468,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
* VM-Exit), as running with the host's DEBUGCTL can negatively affect
* guest state and can even be fatal, e.g. due to Bus Lock Detect.
*/
- if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
+ if (!(svm->vmcb->control.misc_ctl2 & SVM_MISC2_ENABLE_V_LBR) &&
vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
update_debugctlmsr(svm->vmcb->save.dbgctl);
@@ -4386,7 +4499,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
- if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
+ if (!(svm->vmcb->control.misc_ctl2 & SVM_MISC2_ENABLE_V_LBR) &&
vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
update_debugctlmsr(vcpu->arch.host_debugctl);
@@ -4404,11 +4517,11 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
nested_sync_control_from_vmcb02(svm);
	/* Track VMRUNs that have made it past consistency checking */
- if (svm->nested.nested_run_pending &&
+ if (vcpu->arch.nested_run_pending &&
!svm_is_vmrun_failure(svm->vmcb->control.exit_code))
++vcpu->stat.nested_run;
- svm->nested.nested_run_pending = 0;
+ vcpu->arch.nested_run_pending = 0;
}
svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
@@ -4436,6 +4549,16 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
svm_complete_interrupts(vcpu);
+ /*
+ * Update the cache after completing interrupts to get an accurate
+ * NextRIP, e.g. when re-injecting a soft interrupt.
+ *
+ * FIXME: Rework svm_get_nested_state() to not pull data from the
+ * cache (except for maybe int_ctl).
+ */
+ if (is_guest_mode(vcpu))
+ svm->nested.ctl.next_rip = svm->vmcb->control.next_rip;
+
return svm_exit_handlers_fastpath(vcpu);
}
@@ -4767,7 +4890,7 @@ bool svm_smi_blocked(struct kvm_vcpu *vcpu)
static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (svm->nested.nested_run_pending)
+ if (vcpu->arch.nested_run_pending)
return -EBUSY;
if (svm_smi_blocked(vcpu))
@@ -4784,7 +4907,6 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_host_map map_save;
- int ret;
if (!is_guest_mode(vcpu))
return 0;
@@ -4804,9 +4926,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
- ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
- if (ret)
- return ret;
+ nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
/*
* KVM uses VMCB01 to store L1 host state while L2 runs but
@@ -4884,12 +5004,11 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
if (nested_svm_check_cached_vmcb12(vcpu) < 0)
goto unmap_save;
- if (enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa,
- vmcb12, false) != 0)
+ if (enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, false) != 0)
goto unmap_save;
ret = 0;
- svm->nested.nested_run_pending = 1;
+ vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING;
unmap_save:
kvm_vcpu_unmap(vcpu, &map_save);
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 6942e6b0eda6..c36802285236 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -140,12 +140,32 @@ struct kvm_vmcb_info {
};
struct vmcb_save_area_cached {
+ struct vmcb_seg es;
+ struct vmcb_seg cs;
+ struct vmcb_seg ss;
+ struct vmcb_seg ds;
+ struct vmcb_seg gdtr;
+ struct vmcb_seg idtr;
+ u8 cpl;
u64 efer;
u64 cr4;
u64 cr3;
u64 cr0;
u64 dr7;
u64 dr6;
+ u64 rflags;
+ u64 rip;
+ u64 rsp;
+ u64 s_cet;
+ u64 ssp;
+ u64 isst_addr;
+ u64 rax;
+ u64 cr2;
+ u64 dbgctl;
+ u64 br_from;
+ u64 br_to;
+ u64 last_excp_from;
+ u64 last_excp_to;
};
struct vmcb_ctrl_area_cached {
@@ -166,14 +186,13 @@ struct vmcb_ctrl_area_cached {
u64 exit_info_2;
u32 exit_int_info;
u32 exit_int_info_err;
- u64 nested_ctl;
+ u64 misc_ctl;
u32 event_inj;
u32 event_inj_err;
u64 next_rip;
u64 nested_cr3;
- u64 virt_ext;
+ u64 misc_ctl2;
u32 clean;
- u64 bus_lock_rip;
union {
#if IS_ENABLED(CONFIG_HYPERV) || IS_ENABLED(CONFIG_KVM_HYPERV)
struct hv_vmcb_enlightenments hv_enlightenments;
@@ -188,6 +207,7 @@ struct svm_nested_state {
u64 vm_cr_msr;
u64 vmcb12_gpa;
u64 last_vmcb12_gpa;
+ u64 last_bus_lock_rip;
/*
* The MSR permissions map used for vmcb02, which is the merge result
@@ -195,10 +215,6 @@ struct svm_nested_state {
*/
void *msrpm;
- /* A VMRUN has started but has not yet been performed, so
- * we cannot inject a nested vmexit yet. */
- bool nested_run_pending;
-
/* cache for control fields of the guest */
struct vmcb_ctrl_area_cached ctl;
@@ -357,8 +373,6 @@ struct svm_cpu_data {
DECLARE_PER_CPU(struct svm_cpu_data, svm_data);
-void recalc_intercepts(struct vcpu_svm *svm);
-
static __always_inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
{
return container_of(kvm, struct kvm_svm, kvm);
@@ -415,9 +429,9 @@ static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit)
vmcb->control.clean &= ~(1 << bit);
}
-static inline bool vmcb_is_dirty(struct vmcb *vmcb, int bit)
+static inline bool vmcb12_is_dirty(struct vmcb_ctrl_area_cached *control, int bit)
{
- return !test_bit(bit, (unsigned long *)&vmcb->control.clean);
+ return !test_bit(bit, (unsigned long *)&control->clean);
}
static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
@@ -486,6 +500,22 @@ static inline bool vmcb12_is_intercept(struct vmcb_ctrl_area_cached *control, u3
return __vmcb_is_intercept((unsigned long *)&control->intercepts, bit);
}
+void nested_vmcb02_recalc_intercepts(struct vcpu_svm *svm);
+
+static inline void svm_mark_intercepts_dirty(struct vcpu_svm *svm)
+{
+ vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_INTERCEPTS);
+
+ /*
+ * If L2 is active, recalculate the intercepts for vmcb02 to account
+ * for the changes made to vmcb01. All intercept configuration is done
+ * for vmcb01 and then propagated to vmcb02 to combine KVM's intercepts
+ * with L1's intercepts (from the vmcb12 snapshot).
+ */
+ if (is_guest_mode(&svm->vcpu))
+ nested_vmcb02_recalc_intercepts(svm);
+}
+
static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit)
{
struct vmcb *vmcb = svm->vmcb01.ptr;
@@ -493,7 +523,7 @@ static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit)
WARN_ON_ONCE(bit >= 32);
vmcb_set_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
- recalc_intercepts(svm);
+ svm_mark_intercepts_dirty(svm);
}
static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
@@ -503,7 +533,7 @@ static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
WARN_ON_ONCE(bit >= 32);
vmcb_clr_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
- recalc_intercepts(svm);
+ svm_mark_intercepts_dirty(svm);
}
static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
@@ -512,7 +542,7 @@ static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
vmcb_set_intercept(&vmcb->control, bit);
- recalc_intercepts(svm);
+ svm_mark_intercepts_dirty(svm);
}
static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit)
@@ -521,7 +551,7 @@ static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit)
vmcb_clr_intercept(&vmcb->control, bit);
- recalc_intercepts(svm);
+ svm_mark_intercepts_dirty(svm);
}
static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit)
@@ -578,7 +608,7 @@ static inline bool gif_set(struct vcpu_svm *svm)
static inline bool nested_npt_enabled(struct vcpu_svm *svm)
{
- return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
+ return svm->nested.ctl.misc_ctl & SVM_MISC_ENABLE_NP;
}
static inline bool nested_vnmi_enabled(struct vcpu_svm *svm)
@@ -713,8 +743,16 @@ static inline void *svm_vcpu_alloc_msrpm(void)
return svm_alloc_permissions_map(MSRPM_SIZE, GFP_KERNEL_ACCOUNT);
}
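+/*
+ * Copy the virtualized LBR state between save areas. Implemented as a macro
+ * so it can copy between any structures that define the LBR fields, e.g.
+ * struct vmcb_save_area and the cached vmcb12 save area.
+ */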
+#define svm_copy_lbrs(to, from) \
+do { \
+ (to)->dbgctl = (from)->dbgctl; \
+ (to)->br_from = (from)->br_from; \
+ (to)->br_to = (from)->br_to; \
+ (to)->last_excp_from = (from)->last_excp_from; \
+ (to)->last_excp_to = (from)->last_excp_to; \
+} while (0)
+
void svm_vcpu_free_msrpm(void *msrpm);
-void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb);
void svm_enable_lbrv(struct kvm_vcpu *vcpu);
void svm_update_lbrv(struct kvm_vcpu *vcpu);
@@ -776,8 +814,7 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
int __init nested_svm_init_msrpm_merge_offsets(void);
-int enter_svm_guest_mode(struct kvm_vcpu *vcpu,
- u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun);
+int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb_gpa, bool from_vmrun);
void svm_leave_nested(struct kvm_vcpu *vcpu);
void svm_free_nested(struct vcpu_svm *svm);
int svm_allocate_nested(struct vcpu_svm *svm);
@@ -785,14 +822,14 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu);
void svm_copy_vmrun_state(struct vmcb_save_area *to_save,
struct vmcb_save_area *from_save);
void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb);
-int nested_svm_vmexit(struct vcpu_svm *svm);
+void nested_svm_vmexit(struct vcpu_svm *svm);
-static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code)
+static inline void nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code)
{
svm->vmcb->control.exit_code = exit_code;
svm->vmcb->control.exit_info_1 = 0;
svm->vmcb->control.exit_info_2 = 0;
- return nested_svm_vmexit(svm);
+ nested_svm_vmexit(svm);
}
int nested_svm_exit_handled(struct vcpu_svm *svm);
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 937aeb474af7..3fe88f29be7a 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2273,7 +2273,7 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu,
static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
{
- if (vmx->nested.nested_run_pending &&
+ if (vmx->vcpu.arch.nested_run_pending &&
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
return vmcs12->guest_ia32_efer;
else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
@@ -2513,7 +2513,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
/*
* Interrupt/Exception Fields
*/
- if (vmx->nested.nested_run_pending) {
+ if (vmx->vcpu.arch.nested_run_pending) {
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
vmcs12->vm_entry_intr_info_field);
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
@@ -2621,7 +2621,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
}
- if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
+ if (kvm_mpx_supported() && vmx->vcpu.arch.nested_run_pending &&
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
}
@@ -2718,7 +2718,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
!(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
}
- if (vmx->nested.nested_run_pending &&
+ if (vcpu->arch.nested_run_pending &&
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl &
@@ -2728,13 +2728,13 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl);
}
- if (!vmx->nested.nested_run_pending ||
+ if (!vcpu->arch.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE))
vmcs_write_cet_state(vcpu, vmx->nested.pre_vmenter_s_cet,
vmx->nested.pre_vmenter_ssp,
vmx->nested.pre_vmenter_ssp_tbl);
- if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
+ if (kvm_mpx_supported() && (!vcpu->arch.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs);
vmx_set_rflags(vcpu, vmcs12->guest_rflags);
@@ -2747,7 +2747,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
- if (vmx->nested.nested_run_pending &&
+ if (vcpu->arch.nested_run_pending &&
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
vcpu->arch.pat = vmcs12->guest_ia32_pat;
@@ -3349,7 +3349,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
* to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
* CR0.PG) is 1.
*/
- if (to_vmx(vcpu)->nested.nested_run_pending &&
+ if (vcpu->arch.nested_run_pending &&
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
@@ -3627,15 +3627,15 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
kvm_service_local_tlb_flush_requests(vcpu);
- if (!vmx->nested.nested_run_pending ||
+ if (!vcpu->arch.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read();
if (kvm_mpx_supported() &&
- (!vmx->nested.nested_run_pending ||
+ (!vcpu->arch.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
- if (!vmx->nested.nested_run_pending ||
+ if (!vcpu->arch.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE))
vmcs_read_cet_state(vcpu, &vmx->nested.pre_vmenter_s_cet,
&vmx->nested.pre_vmenter_ssp,
@@ -3844,7 +3844,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
* We're finally done with prerequisite checking, and can start with
* the nested entry.
*/
- vmx->nested.nested_run_pending = 1;
+ vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING;
vmx->nested.has_preemption_timer_deadline = false;
status = nested_vmx_enter_non_root_mode(vcpu, true);
if (unlikely(status != NVMX_VMENTRY_SUCCESS))
@@ -3876,12 +3876,12 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
!nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) &&
!(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) &&
(vmcs12->guest_rflags & X86_EFLAGS_IF))) {
- vmx->nested.nested_run_pending = 0;
+ vcpu->arch.nested_run_pending = 0;
return kvm_emulate_halt_noskip(vcpu);
}
break;
case GUEST_ACTIVITY_WAIT_SIPI:
- vmx->nested.nested_run_pending = 0;
+ vcpu->arch.nested_run_pending = 0;
kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED);
break;
default:
@@ -3891,7 +3891,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
return 1;
vmentry_failed:
- vmx->nested.nested_run_pending = 0;
+ vcpu->arch.nested_run_pending = 0;
if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
return 0;
if (status == NVMX_VMENTRY_VMEXIT)
@@ -4288,7 +4288,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
* previously injected event, the pending exception occurred while said
* event was being delivered and thus needs to be handled.
*/
- bool block_nested_exceptions = vmx->nested.nested_run_pending;
+ bool block_nested_exceptions = vcpu->arch.nested_run_pending;
/*
* Events that don't require injection, i.e. that are virtualized by
* hardware, aren't blocked by a pending VM-Enter as KVM doesn't need
@@ -4657,7 +4657,7 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
if (nested_cpu_has_preemption_timer(vmcs12) &&
vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER &&
- !vmx->nested.nested_run_pending)
+ !vcpu->arch.nested_run_pending)
vmcs12->vmx_preemption_timer_value =
vmx_get_preemption_timer_value(vcpu);
@@ -5056,7 +5056,7 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
vmx->nested.mtf_pending = false;
/* trying to cancel vmlaunch/vmresume is a bug */
- WARN_ON_ONCE(vmx->nested.nested_run_pending);
+ kvm_warn_on_nested_run_pending(vcpu);
#ifdef CONFIG_KVM_HYPERV
if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
@@ -6679,7 +6679,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
unsigned long exit_qual;
u32 exit_intr_info;
- WARN_ON_ONCE(vmx->nested.nested_run_pending);
+ kvm_warn_on_nested_run_pending(vcpu);
/*
* Late nested VM-Fail shares the same flow as nested VM-Exit since KVM
@@ -6775,7 +6775,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
if (is_guest_mode(vcpu)) {
kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
- if (vmx->nested.nested_run_pending)
+ if (vcpu->arch.nested_run_pending)
kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
if (vmx->nested.mtf_pending)
@@ -6850,7 +6850,7 @@ out:
void vmx_leave_nested(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu)) {
- to_vmx(vcpu)->nested.nested_run_pending = 0;
+ vcpu->arch.nested_run_pending = 0;
nested_vmx_vmexit(vcpu, -1, 0, 0);
}
free_nested(vcpu);
@@ -7008,8 +7008,10 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
return 0;
- vmx->nested.nested_run_pending =
- !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
+ if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING)
+ vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING_UNTRUSTED;
+ else
+ vcpu->arch.nested_run_pending = 0;
vmx->nested.mtf_pending =
!!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);
@@ -7054,7 +7056,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
return 0;
error_guest_mode:
- vmx->nested.nested_run_pending = 0;
+ vcpu->arch.nested_run_pending = 0;
return ret;
}
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index d16427a079f6..d76a21c38506 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5279,7 +5279,7 @@ bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
- if (to_vmx(vcpu)->nested.nested_run_pending)
+ if (vcpu->arch.nested_run_pending)
return -EBUSY;
/* An NMI must not be injected into L2 if it's supposed to VM-Exit. */
@@ -5306,7 +5306,7 @@ bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
- if (to_vmx(vcpu)->nested.nested_run_pending)
+ if (vcpu->arch.nested_run_pending)
return -EBUSY;
/*
@@ -6118,7 +6118,7 @@ static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu)
* only reachable if userspace modifies L2 guest state after KVM has
* performed the nested VM-Enter consistency checks.
*/
- if (vmx->nested.nested_run_pending)
+ if (vcpu->arch.nested_run_pending)
return true;
/*
@@ -6802,7 +6802,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
* invalid guest state should never happen as that means KVM knowingly
* allowed a nested VM-Enter with an invalid vmcs12. More below.
*/
- if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
+ if (KVM_BUG_ON(vcpu->arch.nested_run_pending, vcpu->kvm))
return -EIO;
if (is_guest_mode(vcpu)) {
@@ -7730,11 +7730,11 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
		 * Track VMLAUNCH/VMRESUME that have made it past guest state
* checking.
*/
- if (vmx->nested.nested_run_pending &&
+ if (vcpu->arch.nested_run_pending &&
!vmx_get_exit_reason(vcpu).failed_vmentry)
++vcpu->stat.nested_run;
- vmx->nested.nested_run_pending = 0;
+ vcpu->arch.nested_run_pending = 0;
}
if (unlikely(vmx->fail))
@@ -8491,7 +8491,7 @@ void vmx_setup_mce(struct kvm_vcpu *vcpu)
int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
/* we need a nested vmexit to enter SMM, postpone if run is pending */
- if (to_vmx(vcpu)->nested.nested_run_pending)
+ if (vcpu->arch.nested_run_pending)
return -EBUSY;
return !is_smm(vcpu);
}
@@ -8536,7 +8536,7 @@ int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
if (ret != NVMX_VMENTRY_SUCCESS)
return 1;
- vmx->nested.nested_run_pending = 1;
+ vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING;
vmx->nested.smm.guest_mode = false;
}
return 0;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 70bfe81dea54..db84e8001da5 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -138,9 +138,6 @@ struct nested_vmx {
*/
bool enlightened_vmcs_enabled;
- /* L2 must run next, and mustn't decide to exit to L1. */
- bool nested_run_pending;
-
/* Pending MTF VM-exit into L1. */
bool mtf_pending;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 33084df3865c..d65edcf8f30d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -351,6 +351,9 @@ static const u32 msrs_to_save_base[] = {
MSR_IA32_U_CET, MSR_IA32_S_CET,
MSR_IA32_PL0_SSP, MSR_IA32_PL1_SSP, MSR_IA32_PL2_SSP,
MSR_IA32_PL3_SSP, MSR_IA32_INT_SSP_TAB,
+ MSR_IA32_DEBUGCTLMSR,
+ MSR_IA32_LASTBRANCHFROMIP, MSR_IA32_LASTBRANCHTOIP,
+ MSR_IA32_LASTINTFROMIP, MSR_IA32_LASTINTTOIP,
};
static const u32 msrs_to_save_pmu[] = {
@@ -864,9 +867,6 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned int nr,
vcpu->arch.exception.error_code = error_code;
vcpu->arch.exception.has_payload = has_payload;
vcpu->arch.exception.payload = payload;
- if (!is_guest_mode(vcpu))
- kvm_deliver_exception_payload(vcpu,
- &vcpu->arch.exception);
return;
}
@@ -5531,18 +5531,8 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
return 0;
}
-static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
- struct kvm_vcpu_events *events)
+static struct kvm_queued_exception *kvm_get_exception_to_save(struct kvm_vcpu *vcpu)
{
- struct kvm_queued_exception *ex;
-
- process_nmi(vcpu);
-
-#ifdef CONFIG_KVM_SMM
- if (kvm_check_request(KVM_REQ_SMI, vcpu))
- process_smi(vcpu);
-#endif
-
/*
* KVM's ABI only allows for one exception to be migrated. Luckily,
* the only time there can be two queued exceptions is if there's a
@@ -5553,21 +5543,46 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
if (vcpu->arch.exception_vmexit.pending &&
!vcpu->arch.exception.pending &&
!vcpu->arch.exception.injected)
- ex = &vcpu->arch.exception_vmexit;
- else
- ex = &vcpu->arch.exception;
+ return &vcpu->arch.exception_vmexit;
+
+ return &vcpu->arch.exception;
+}
+
+static void kvm_handle_exception_payload_quirk(struct kvm_vcpu *vcpu)
+{
+ struct kvm_queued_exception *ex = kvm_get_exception_to_save(vcpu);
/*
- * In guest mode, payload delivery should be deferred if the exception
- * will be intercepted by L1, e.g. KVM should not modifying CR2 if L1
- * intercepts #PF, ditto for DR6 and #DBs. If the per-VM capability,
- * KVM_CAP_EXCEPTION_PAYLOAD, is not set, userspace may or may not
- * propagate the payload and so it cannot be safely deferred. Deliver
- * the payload if the capability hasn't been requested.
+ * If KVM_CAP_EXCEPTION_PAYLOAD is disabled, then (prematurely) deliver
+ * the pending exception payload when userspace saves *any* vCPU state
+ * that interacts with exception payloads to avoid breaking userspace.
+ *
+ * Architecturally, KVM must not deliver an exception payload until the
+ * exception is actually injected, e.g. to avoid losing pending #DB
+ * information (which VMX tracks in the VMCS), and to avoid clobbering
+ * state if the exception is never injected for whatever reason. But
+ * if KVM_CAP_EXCEPTION_PAYLOAD isn't enabled, then userspace may or
+ * may not propagate the payload across save+restore, and so KVM can't
+ * safely defer delivery of the payload.
*/
if (!vcpu->kvm->arch.exception_payload_enabled &&
ex->pending && ex->has_payload)
kvm_deliver_exception_payload(vcpu, ex);
+}
+
+static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu_events *events)
+{
+ struct kvm_queued_exception *ex = kvm_get_exception_to_save(vcpu);
+
+ process_nmi(vcpu);
+
+#ifdef CONFIG_KVM_SMM
+ if (kvm_check_request(KVM_REQ_SMI, vcpu))
+ process_smi(vcpu);
+#endif
+
+ kvm_handle_exception_payload_quirk(vcpu);
memset(events, 0, sizeof(*events));
@@ -5746,6 +5761,8 @@ static int kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
vcpu->arch.guest_state_protected)
return -EINVAL;
+ kvm_handle_exception_payload_quirk(vcpu);
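+	/*
+	 * A pending #DB payload targets DR6, which is reported via debugregs,
+	 * so the payload may need to be delivered first (see the quirk helper).
+	 */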
+
memset(dbgregs, 0, sizeof(*dbgregs));
BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db));
@@ -8897,6 +8914,11 @@ static bool emulator_is_canonical_addr(struct x86_emulate_ctxt *ctxt,
return !is_noncanonical_address(addr, emul_to_vcpu(ctxt), flags);
}
+static bool emulator_page_address_valid(struct x86_emulate_ctxt *ctxt, gpa_t gpa)
+{
+ return page_address_valid(emul_to_vcpu(ctxt), gpa);
+}
+
static const struct x86_emulate_ops emulate_ops = {
.vm_bugged = emulator_vm_bugged,
.read_gpr = emulator_read_gpr,
@@ -8944,6 +8966,7 @@ static const struct x86_emulate_ops emulate_ops = {
.set_xcr = emulator_set_xcr,
.get_untagged_addr = emulator_get_untagged_addr,
.is_canonical_addr = emulator_is_canonical_addr,
+ .page_address_valid = emulator_page_address_valid,
};
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
@@ -11917,6 +11940,13 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
static int kvm_x86_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
/*
+ * Userspace may have modified vCPU state, mark nested_run_pending as
+ * "untrusted" to avoid triggering false-positive WARNs.
+ */
+ if (vcpu->arch.nested_run_pending == KVM_NESTED_RUN_PENDING)
+ vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING_UNTRUSTED;
+
+ /*
* SIPI_RECEIVED is obsolete; KVM leaves the vCPU in Wait-For-SIPI and
* tracks the pending SIPI separately. SIPI_RECEIVED is still accepted
* by KVM_SET_VCPU_EVENTS for backwards compatibility, but should be
@@ -12156,6 +12186,8 @@ static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
if (vcpu->arch.guest_state_protected)
goto skip_protected_regs;
+ kvm_handle_exception_payload_quirk(vcpu);
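+	/*
+	 * A pending #PF payload targets CR2, which is reported via sregs, so
+	 * the payload may need to be delivered first (see the quirk helper).
+	 */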
+
kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 44a28d343d40..38a905fa86de 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -188,6 +188,16 @@ static inline bool kvm_can_set_cpuid_and_feature_msrs(struct kvm_vcpu *vcpu)
return vcpu->arch.last_vmentry_cpu == -1 && !is_guest_mode(vcpu);
}
+/*
+ * WARN if a nested VM-Enter is pending completion and userspace hasn't gained
+ * control since the nested VM-Enter was initiated. If userspace has regained
+ * control, it may have modified vCPU state and legitimately induced an
+ * architecturally invalid VM-Exit, so don't WARN.
+ */
+static inline void kvm_warn_on_nested_run_pending(struct kvm_vcpu *vcpu)
+{
+ WARN_ON_ONCE(vcpu->arch.nested_run_pending == KVM_NESTED_RUN_PENDING);
+}
+
static inline void kvm_set_mp_state(struct kvm_vcpu *vcpu, int mp_state)
{
vcpu->arch.mp_state = mp_state;
diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index 8ad649a8e936..e79561deffbf 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -111,8 +111,11 @@ TEST_GEN_PROGS_x86 += x86/state_test
TEST_GEN_PROGS_x86 += x86/vmx_preemption_timer_test
TEST_GEN_PROGS_x86 += x86/svm_vmcall_test
TEST_GEN_PROGS_x86 += x86/svm_int_ctl_test
+TEST_GEN_PROGS_x86 += x86/svm_nested_clear_efer_svme
TEST_GEN_PROGS_x86 += x86/svm_nested_shutdown_test
TEST_GEN_PROGS_x86 += x86/svm_nested_soft_inject_test
+TEST_GEN_PROGS_x86 += x86/svm_nested_vmcb12_gpa
+TEST_GEN_PROGS_x86 += x86/svm_lbr_nested_state
TEST_GEN_PROGS_x86 += x86/tsc_scaling_sync
TEST_GEN_PROGS_x86 += x86/sync_regs_test
TEST_GEN_PROGS_x86 += x86/ucna_injection_test
diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h
index 732a9d816a70..d8634a760a60 100644
--- a/tools/testing/selftests/kvm/include/x86/processor.h
+++ b/tools/testing/selftests/kvm/include/x86/processor.h
@@ -1390,6 +1390,11 @@ static inline bool kvm_is_ignore_msrs(void)
return get_kvm_param_bool("ignore_msrs");
}
+static inline bool kvm_is_lbrv_enabled(void)
+{
+ return !!get_kvm_amd_param_integer("lbrv");
+}
+
uint64_t *vm_get_pte(struct kvm_vm *vm, uint64_t vaddr);
uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
diff --git a/tools/testing/selftests/kvm/include/x86/svm.h b/tools/testing/selftests/kvm/include/x86/svm.h
index 10b30b38bb3f..c8539166270e 100644
--- a/tools/testing/selftests/kvm/include/x86/svm.h
+++ b/tools/testing/selftests/kvm/include/x86/svm.h
@@ -97,13 +97,13 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u64 exit_info_2;
u32 exit_int_info;
u32 exit_int_info_err;
- u64 nested_ctl;
+ u64 misc_ctl;
u64 avic_vapic_bar;
u8 reserved_4[8];
u32 event_inj;
u32 event_inj_err;
u64 nested_cr3;
- u64 virt_ext;
+ u64 misc_ctl2;
u32 clean;
u32 reserved_5;
u64 next_rip;
@@ -155,9 +155,6 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define AVIC_ENABLE_SHIFT 31
#define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT)
-#define LBR_CTL_ENABLE_MASK BIT_ULL(0)
-#define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1)
-
#define SVM_INTERRUPT_SHADOW_MASK 1
#define SVM_IOIO_STR_SHIFT 2
@@ -175,8 +172,11 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL
#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL
-#define SVM_NESTED_CTL_NP_ENABLE BIT(0)
-#define SVM_NESTED_CTL_SEV_ENABLE BIT(1)
+#define SVM_MISC_ENABLE_NP BIT(0)
+#define SVM_MISC_ENABLE_SEV BIT(1)
+
+#define SVM_MISC2_ENABLE_V_LBR BIT_ULL(0)
+#define SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE BIT_ULL(1)
struct __attribute__ ((__packed__)) vmcb_seg {
u16 selector;
diff --git a/tools/testing/selftests/kvm/lib/x86/svm.c b/tools/testing/selftests/kvm/lib/x86/svm.c
index 2e5c480c9afd..eb20b00112c7 100644
--- a/tools/testing/selftests/kvm/lib/x86/svm.c
+++ b/tools/testing/selftests/kvm/lib/x86/svm.c
@@ -126,7 +126,7 @@ void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_r
guest_regs.rdi = (u64)svm;
if (svm->ncr3_gpa) {
- ctrl->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
+ ctrl->misc_ctl |= SVM_MISC_ENABLE_NP;
ctrl->nested_cr3 = svm->ncr3_gpa;
}
}
diff --git a/tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c b/tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c
index 6764a48f9d4d..71717118d692 100644
--- a/tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c
+++ b/tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c
@@ -79,8 +79,8 @@ static void l1_guest_code(struct svm_test_data *svm)
svm->vmcb->control.intercept |= (BIT_ULL(INTERCEPT_VMSAVE) |
BIT_ULL(INTERCEPT_VMLOAD));
- /* ..VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK cleared.. */
- svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+ /* ..SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE cleared.. */
+ svm->vmcb->control.misc_ctl2 &= ~SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE;
svm->vmcb->save.rip = (u64)l2_guest_code_vmsave;
run_guest(svm->vmcb, svm->vmcb_gpa);
@@ -90,8 +90,8 @@ static void l1_guest_code(struct svm_test_data *svm)
run_guest(svm->vmcb, svm->vmcb_gpa);
GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMLOAD);
- /* ..and VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK set */
- svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+ /* ..and SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE set */
+ svm->vmcb->control.misc_ctl2 |= SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE;
svm->vmcb->save.rip = (u64)l2_guest_code_vmsave;
run_guest(svm->vmcb, svm->vmcb_gpa);
@@ -106,20 +106,20 @@ static void l1_guest_code(struct svm_test_data *svm)
BIT_ULL(INTERCEPT_VMLOAD));
/*
- * Without VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK, the GPA will be
+ * Without SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE, the GPA will be
* interpreted as an L1 GPA, so VMCB0 should be used.
*/
svm->vmcb->save.rip = (u64)l2_guest_code_vmcb0;
- svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+ svm->vmcb->control.misc_ctl2 &= ~SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE;
run_guest(svm->vmcb, svm->vmcb_gpa);
GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL);
/*
- * With VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK, the GPA will be interpeted as
+	 * With SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE, the GPA will be interpreted as
* an L2 GPA, and translated through the NPT to VMCB1.
*/
svm->vmcb->save.rip = (u64)l2_guest_code_vmcb1;
- svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+ svm->vmcb->control.misc_ctl2 |= SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE;
run_guest(svm->vmcb, svm->vmcb_gpa);
GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL);
diff --git a/tools/testing/selftests/kvm/x86/state_test.c b/tools/testing/selftests/kvm/x86/state_test.c
index f2c7a1c297e3..992a52504a4a 100644
--- a/tools/testing/selftests/kvm/x86/state_test.c
+++ b/tools/testing/selftests/kvm/x86/state_test.c
@@ -26,7 +26,9 @@ void svm_l2_guest_code(void)
GUEST_SYNC(4);
/* Exit to L1 */
vmcall();
+ clgi();
GUEST_SYNC(6);
+ stgi();
/* Done, exit to L1 and never come back. */
vmcall();
}
@@ -41,6 +43,8 @@ static void svm_l1_guest_code(struct svm_test_data *svm)
generic_svm_setup(svm, svm_l2_guest_code,
&l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ vmcb->control.int_ctl |= (V_GIF_ENABLE_MASK | V_GIF_MASK);
+
GUEST_SYNC(3);
run_guest(vmcb, svm->vmcb_gpa);
GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL);
@@ -222,6 +226,35 @@ static void __attribute__((__flatten__)) guest_code(void *arg)
GUEST_DONE();
}
+void svm_check_nested_state(int stage, struct kvm_x86_state *state)
+{
+ struct vmcb *vmcb = (struct vmcb *)state->nested.data.svm;
+
+ if (kvm_cpu_has(X86_FEATURE_VGIF)) {
+ if (stage == 4)
+ TEST_ASSERT_EQ(!!(vmcb->control.int_ctl & V_GIF_MASK), 1);
+ if (stage == 6)
+ TEST_ASSERT_EQ(!!(vmcb->control.int_ctl & V_GIF_MASK), 0);
+ }
+
+ if (kvm_cpu_has(X86_FEATURE_NRIPS)) {
+ /*
+ * GUEST_SYNC() causes IO emulation in KVM, in which case the
+ * RIP is advanced before exiting to userspace. Hence, the RIP
+ * in the saved state should be the same as nRIP saved by the
+ * CPU in the VMCB.
+ */
+ if (stage == 6)
+ TEST_ASSERT_EQ(vmcb->control.next_rip, state->regs.rip);
+ }
+}
+
+void check_nested_state(int stage, struct kvm_x86_state *state)
+{
+ if (kvm_has_cap(KVM_CAP_NESTED_STATE) && kvm_cpu_has(X86_FEATURE_SVM))
+ svm_check_nested_state(stage, state);
+}
+
int main(int argc, char *argv[])
{
uint64_t *xstate_bv, saved_xstate_bv;
@@ -278,6 +311,8 @@ int main(int argc, char *argv[])
kvm_vm_release(vm);
+ check_nested_state(stage, state);
+
/* Restore state in a new VM. */
vcpu = vm_recreate_with_one_vcpu(vm);
vcpu_load_state(vcpu, state);
diff --git a/tools/testing/selftests/kvm/x86/svm_lbr_nested_state.c b/tools/testing/selftests/kvm/x86/svm_lbr_nested_state.c
new file mode 100644
index 000000000000..ff99438824d3
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86/svm_lbr_nested_state.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2026, Google, Inc.
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "svm_util.h"
+
+
+#define L2_GUEST_STACK_SIZE 64
+
+#define DO_BRANCH() do { asm volatile("jmp 1f\n 1: nop"); } while (0)
+
+struct lbr_branch {
+ u64 from, to;
+};
+
+volatile struct lbr_branch l2_branch;
+
+#define RECORD_AND_CHECK_BRANCH(b) \
+do { \
+ wrmsr(MSR_IA32_DEBUGCTLMSR, DEBUGCTLMSR_LBR); \
+ DO_BRANCH(); \
+ (b)->from = rdmsr(MSR_IA32_LASTBRANCHFROMIP); \
+ (b)->to = rdmsr(MSR_IA32_LASTBRANCHTOIP); \
+	/* Disable LBR right after to avoid overwriting the recorded IPs */ \
+ wrmsr(MSR_IA32_DEBUGCTLMSR, 0); \
+ \
+ GUEST_ASSERT_NE((b)->from, 0); \
+ GUEST_ASSERT_NE((b)->to, 0); \
+} while (0)
+
+#define CHECK_BRANCH_MSRS(b) \
+do { \
+ GUEST_ASSERT_EQ((b)->from, rdmsr(MSR_IA32_LASTBRANCHFROMIP)); \
+ GUEST_ASSERT_EQ((b)->to, rdmsr(MSR_IA32_LASTBRANCHTOIP)); \
+} while (0)
+
+#define CHECK_BRANCH_VMCB(b, vmcb) \
+do { \
+ GUEST_ASSERT_EQ((b)->from, vmcb->save.br_from); \
+ GUEST_ASSERT_EQ((b)->to, vmcb->save.br_to); \
+} while (0)
+
+static void l2_guest_code(struct svm_test_data *svm)
+{
+ /* Record a branch, trigger save/restore, and make sure LBRs are intact */
+ RECORD_AND_CHECK_BRANCH(&l2_branch);
+ GUEST_SYNC(true);
+ CHECK_BRANCH_MSRS(&l2_branch);
+ vmmcall();
+}
+
+static void l1_guest_code(struct svm_test_data *svm, bool nested_lbrv)
+{
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+ struct vmcb *vmcb = svm->vmcb;
+ struct lbr_branch l1_branch;
+
+ /* Record a branch, trigger save/restore, and make sure LBRs are intact */
+ RECORD_AND_CHECK_BRANCH(&l1_branch);
+ GUEST_SYNC(true);
+ CHECK_BRANCH_MSRS(&l1_branch);
+
+ /* Run L2, which will also do the same */
+ generic_svm_setup(svm, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ if (nested_lbrv)
+ vmcb->control.misc_ctl2 = SVM_MISC2_ENABLE_V_LBR;
+ else
+ vmcb->control.misc_ctl2 &= ~SVM_MISC2_ENABLE_V_LBR;
+
+ run_guest(vmcb, svm->vmcb_gpa);
+ GUEST_ASSERT(svm->vmcb->control.exit_code == SVM_EXIT_VMMCALL);
+
+ /* Trigger save/restore one more time before checking, just for kicks */
+ GUEST_SYNC(true);
+
+ /*
+	 * If SVM_MISC2_ENABLE_V_LBR is set, L1 and L2 have separate LBR MSRs, so
+	 * expect L1's LBRs to remain intact and L2's LBRs to be in the VMCB.
+	 * Otherwise, the MSRs are shared between L1 and L2, so expect L2's LBRs.
+ */
+ if (nested_lbrv) {
+ CHECK_BRANCH_MSRS(&l1_branch);
+ CHECK_BRANCH_VMCB(&l2_branch, vmcb);
+ } else {
+ CHECK_BRANCH_MSRS(&l2_branch);
+ }
+ GUEST_DONE();
+}
+
+void test_lbrv_nested_state(bool nested_lbrv)
+{
+ struct kvm_x86_state *state = NULL;
+ struct kvm_vcpu *vcpu;
+ vm_vaddr_t svm_gva;
+ struct kvm_vm *vm;
+ struct ucall uc;
+
+ pr_info("Testing with nested LBRV %s\n", nested_lbrv ? "enabled" : "disabled");
+
+ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);
+ vcpu_alloc_svm(vm, &svm_gva);
+ vcpu_args_set(vcpu, 2, svm_gva, nested_lbrv);
+
+ for (;;) {
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_SYNC:
+ /* Save the vCPU state and restore it in a new VM on sync */
+ pr_info("Guest triggered save/restore.\n");
+ state = vcpu_save_state(vcpu);
+ kvm_vm_release(vm);
+ vcpu = vm_recreate_with_one_vcpu(vm);
+ vcpu_load_state(vcpu, state);
+ kvm_x86_state_cleanup(state);
+ break;
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ /* NOT REACHED */
+ case UCALL_DONE:
+ goto done;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+ }
+done:
+ kvm_vm_free(vm);
+}
+
+int main(int argc, char *argv[])
+{
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM));
+ TEST_REQUIRE(kvm_is_lbrv_enabled());
+
+ test_lbrv_nested_state(/*nested_lbrv=*/false);
+ test_lbrv_nested_state(/*nested_lbrv=*/true);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86/svm_nested_clear_efer_svme.c b/tools/testing/selftests/kvm/x86/svm_nested_clear_efer_svme.c
new file mode 100644
index 000000000000..a521a9eed061
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86/svm_nested_clear_efer_svme.c
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2026, Google LLC.
+ */
+#include "kvm_util.h"
+#include "vmx.h"
+#include "svm_util.h"
+#include "kselftest.h"
+
+
+#define L2_GUEST_STACK_SIZE 64
+
+static void l2_guest_code(void)
+{
+ unsigned long efer = rdmsr(MSR_EFER);
+
+	/* generic_svm_setup() initializes L2's EFER with EFER_SVME set */
+ GUEST_ASSERT(efer & EFER_SVME);
+ wrmsr(MSR_EFER, efer & ~EFER_SVME);
+
+	/* Unreachable, L1 should be shut down */
+ GUEST_ASSERT(0);
+}
+
+static void l1_guest_code(struct svm_test_data *svm)
+{
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+
+ generic_svm_setup(svm, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ run_guest(svm->vmcb, svm->vmcb_gpa);
+
+	/* Unreachable, L1 should be shut down */
+ GUEST_ASSERT(0);
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+ vm_vaddr_t nested_gva = 0;
+
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM));
+
+ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);
+
+ vcpu_alloc_svm(vm, &nested_gva);
+ vcpu_args_set(vcpu, 1, nested_gva);
+
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_SHUTDOWN);
+
+ kvm_vm_free(vm);
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86/svm_nested_vmcb12_gpa.c b/tools/testing/selftests/kvm/x86/svm_nested_vmcb12_gpa.c
new file mode 100644
index 000000000000..569869bed20b
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86/svm_nested_vmcb12_gpa.c
@@ -0,0 +1,176 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2026, Google LLC.
+ */
+#include "kvm_util.h"
+#include "vmx.h"
+#include "svm_util.h"
+#include "kselftest.h"
+#include "kvm_test_harness.h"
+#include "test_util.h"
+
+
+#define L2_GUEST_STACK_SIZE 64
+
+#define SYNC_GP 101
+#define SYNC_L2_STARTED 102
+
+static unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+
+static void guest_gp_handler(struct ex_regs *regs)
+{
+ GUEST_SYNC(SYNC_GP);
+}
+
+static void l2_code(void)
+{
+ GUEST_SYNC(SYNC_L2_STARTED);
+ vmcall();
+}
+
+static void l1_vmrun(struct svm_test_data *svm, u64 gpa)
+{
+ generic_svm_setup(svm, l2_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ asm volatile ("vmrun %[gpa]" : : [gpa] "a" (gpa) : "memory");
+}
+
+static void l1_vmload(struct svm_test_data *svm, u64 gpa)
+{
+ generic_svm_setup(svm, l2_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ asm volatile ("vmload %[gpa]" : : [gpa] "a" (gpa) : "memory");
+}
+
+static void l1_vmsave(struct svm_test_data *svm, u64 gpa)
+{
+ generic_svm_setup(svm, l2_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ asm volatile ("vmsave %[gpa]" : : [gpa] "a" (gpa) : "memory");
+}
+
+static void l1_vmexit(struct svm_test_data *svm, u64 gpa)
+{
+ generic_svm_setup(svm, l2_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ run_guest(svm->vmcb, svm->vmcb_gpa);
+ GUEST_ASSERT(svm->vmcb->control.exit_code == SVM_EXIT_VMMCALL);
+ GUEST_DONE();
+}
+
+static u64 unmappable_gpa(struct kvm_vcpu *vcpu)
+{
+ struct userspace_mem_region *region;
+ u64 region_gpa_end, vm_gpa_end = 0;
+ int i;
+
+ hash_for_each(vcpu->vm->regions.slot_hash, i, region, slot_node) {
+ region_gpa_end = region->region.guest_phys_addr + region->region.memory_size;
+ vm_gpa_end = max(vm_gpa_end, region_gpa_end);
+ }
+
+ return vm_gpa_end;
+}
+
+static void test_invalid_vmcb12(struct kvm_vcpu *vcpu)
+{
+ vm_vaddr_t nested_gva = 0;
+ struct ucall uc;
+
+
+ vm_install_exception_handler(vcpu->vm, GP_VECTOR, guest_gp_handler);
+ vcpu_alloc_svm(vcpu->vm, &nested_gva);
+ vcpu_args_set(vcpu, 2, nested_gva, -1ULL);
+ vcpu_run(vcpu);
+
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+ TEST_ASSERT_EQ(get_ucall(vcpu, &uc), UCALL_SYNC);
+ TEST_ASSERT_EQ(uc.args[1], SYNC_GP);
+}
+
+static void test_unmappable_vmcb12(struct kvm_vcpu *vcpu)
+{
+ vm_vaddr_t nested_gva = 0;
+
+ vcpu_alloc_svm(vcpu->vm, &nested_gva);
+ vcpu_args_set(vcpu, 2, nested_gva, unmappable_gpa(vcpu));
+ vcpu_run(vcpu);
+
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_INTERNAL_ERROR);
+ TEST_ASSERT_EQ(vcpu->run->emulation_failure.suberror, KVM_INTERNAL_ERROR_EMULATION);
+}
+
+static void test_unmappable_vmcb12_vmexit(struct kvm_vcpu *vcpu)
+{
+ struct kvm_x86_state *state;
+ vm_vaddr_t nested_gva = 0;
+ struct ucall uc;
+
+ /*
+ * Enter L2 (with a legit vmcb12 GPA), then overwrite vmcb12 GPA with an
+ * unmappable GPA. KVM will fail to map vmcb12 on nested VM-Exit and
+ * cause a shutdown.
+ */
+ vcpu_alloc_svm(vcpu->vm, &nested_gva);
+ vcpu_args_set(vcpu, 2, nested_gva, unmappable_gpa(vcpu));
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+ TEST_ASSERT_EQ(get_ucall(vcpu, &uc), UCALL_SYNC);
+ TEST_ASSERT_EQ(uc.args[1], SYNC_L2_STARTED);
+
+ state = vcpu_save_state(vcpu);
+ state->nested.hdr.svm.vmcb_pa = unmappable_gpa(vcpu);
+ vcpu_load_state(vcpu, state);
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_SHUTDOWN);
+
+ kvm_x86_state_cleanup(state);
+}
+
+KVM_ONE_VCPU_TEST_SUITE(vmcb12_gpa);
+
+KVM_ONE_VCPU_TEST(vmcb12_gpa, vmrun_invalid, l1_vmrun)
+{
+ test_invalid_vmcb12(vcpu);
+}
+
+KVM_ONE_VCPU_TEST(vmcb12_gpa, vmload_invalid, l1_vmload)
+{
+ test_invalid_vmcb12(vcpu);
+}
+
+KVM_ONE_VCPU_TEST(vmcb12_gpa, vmsave_invalid, l1_vmsave)
+{
+ test_invalid_vmcb12(vcpu);
+}
+
+KVM_ONE_VCPU_TEST(vmcb12_gpa, vmrun_unmappable, l1_vmrun)
+{
+ test_unmappable_vmcb12(vcpu);
+}
+
+KVM_ONE_VCPU_TEST(vmcb12_gpa, vmload_unmappable, l1_vmload)
+{
+ test_unmappable_vmcb12(vcpu);
+}
+
+KVM_ONE_VCPU_TEST(vmcb12_gpa, vmsave_unmappable, l1_vmsave)
+{
+ test_unmappable_vmcb12(vcpu);
+}
+
+/*
+ * An invalid vmcb12 GPA cannot be tested for #VMEXIT, as KVM_SET_NESTED_STATE
+ * will reject it.
+ */
+KVM_ONE_VCPU_TEST(vmcb12_gpa, vmexit_unmappable, l1_vmexit)
+{
+ test_unmappable_vmcb12_vmexit(vcpu);
+}
+
+int main(int argc, char *argv[])
+{
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM));
+
+ return test_harness_run(argc, argv);
+}