author     Linus Torvalds <torvalds@linux-foundation.org>  2016-12-14 02:47:02 +0300
committer  Linus Torvalds <torvalds@linux-foundation.org>  2016-12-14 02:47:02 +0300
commit     93173b5bf2841da7e3a9b0cb1312ef5c87251524 (patch)
tree       629de2735f465ce0437f8dba85a00f766bbec31c  /arch/x86/kvm/vmx.c
parent     1c59e1edb13d60b97b7b03b332ceed5d967d4227 (diff)
parent     f673b5b2a66332da5358def524dbfb3305c76d8c (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Paolo Bonzini:
 "Small release, the most interesting stuff is x86 nested virt
  improvements.

  x86:
   - userspace can now hide nested VMX features from guests
   - nested VMX can now run Hyper-V in a guest
   - support for AVX512_4VNNIW and AVX512_FMAPS in KVM
   - infrastructure support for virtual Intel GPUs.

  PPC:
   - support for KVM guests on POWER9
   - improved support for interrupt polling
   - optimizations and cleanups.

  s390:
   - two small optimizations, more stuff is in flight and will be in 4.11.

  ARM:
   - support for the GICv3 ITS on 32bit platforms"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (94 commits)
  arm64: KVM: pmu: Reset PMSELR_EL0.SEL to a sane value before entering the guest
  KVM: arm/arm64: timer: Check for properly initialized timer on init
  KVM: arm/arm64: vgic-v2: Limit ITARGETSR bits to number of VCPUs
  KVM: x86: Handle the kthread worker using the new API
  KVM: nVMX: invvpid handling improvements
  KVM: nVMX: check host CR3 on vmentry and vmexit
  KVM: nVMX: introduce nested_vmx_load_cr3 and call it on vmentry
  KVM: nVMX: propagate errors from prepare_vmcs02
  KVM: nVMX: fix CR3 load if L2 uses PAE paging and EPT
  KVM: nVMX: load GUEST_EFER after GUEST_CR0 during emulated VM-entry
  KVM: nVMX: generate MSR_IA32_CR{0,4}_FIXED1 from guest CPUID
  KVM: nVMX: fix checks on CR{0,4} during virtual VMX operation
  KVM: nVMX: support restore of VMX capability MSRs
  KVM: nVMX: generate non-true VMX MSRs based on true versions
  KVM: x86: Do not clear RFLAGS.TF when a singlestep trap occurs.
  KVM: x86: Add kvm_skip_emulated_instruction and use it.
  KVM: VMX: Move skip_emulated_instruction out of nested_vmx_check_vmcs12
  KVM: VMX: Reorder some skip_emulated_instruction calls
  KVM: x86: Add a return value to kvm_emulate_cpuid
  KVM: PPC: Book3S: Move prototypes for KVM functions into kvm_ppc.h
  ...
Diffstat (limited to 'arch/x86/kvm/vmx.c')
-rw-r--r--  arch/x86/kvm/vmx.c  1092
1 file changed, 691 insertions(+), 401 deletions(-)
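The first x86 bullet above ("userspace can now hide nested VMX features from guests") is implemented further down by letting host-initiated MSR writes reach the new vmx_set_vmx_msr()/vmx_restore_control_msr() path. The stand-alone program below is not part of the patch; it is only a rough sketch of how a VMM might use that path, assuming an Intel host with kvm_intel loaded with nested=1 and with most error handling omitted. The MSR index 0x48b and the VPID control bit are architectural values; the entry counts and the choice of which bit to hide are arbitrary.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#define MSR_IA32_VMX_PROCBASED_CTLS2  0x48b        /* secondary proc-based controls */
#define SECONDARY_EXEC_ENABLE_VPID    (1ULL << 5)

int main(void)
{
    int kvm = open("/dev/kvm", O_RDWR);
    int vm = ioctl(kvm, KVM_CREATE_VM, 0);
    int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);

    /* vmx_set_vmx_msr() only accepts writes while nested VMX is allowed,
     * so give the vCPU a CPUID that exposes VMX first (sized generously
     * for this sketch). */
    struct kvm_cpuid2 *cpuid = calloc(1, sizeof(*cpuid) +
                                      100 * sizeof(struct kvm_cpuid_entry2));
    cpuid->nent = 100;
    ioctl(kvm, KVM_GET_SUPPORTED_CPUID, cpuid);
    ioctl(vcpu, KVM_SET_CPUID2, cpuid);

    struct kvm_msrs *msrs = calloc(1, sizeof(*msrs) +
                                   sizeof(struct kvm_msr_entry));
    msrs->nmsrs = 1;
    msrs->entries[0].index = MSR_IA32_VMX_PROCBASED_CTLS2;

    /* Read the secondary-controls capability KVM would advertise to L1. */
    if (ioctl(vcpu, KVM_GET_MSRS, msrs) != 1) {
        perror("KVM_GET_MSRS");
        return 1;
    }

    /* Clear one allowed-1 bit (bits 63:32) to hide nested VPID from L1. */
    msrs->entries[0].data &= ~(SECONDARY_EXEC_ENABLE_VPID << 32);

    /* Write it back before the first KVM_RUN.  The new restore path only
     * lets a host-initiated write tighten the capabilities, never add
     * bits the host does not support. */
    if (ioctl(vcpu, KVM_SET_MSRS, msrs) != 1) {
        perror("KVM_SET_MSRS");
        return 1;
    }

    printf("nested VPID hidden from the guest\n");
    return 0;
}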
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3980da515fd0..aae43c6f2472 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -132,6 +132,22 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
+/*
+ * Hyper-V requires all of these, so mark them as supported even though
+ * they are just treated the same as all-context.
+ */
+#define VMX_VPID_EXTENT_SUPPORTED_MASK \
+ (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \
+ VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \
+ VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \
+ VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
+
/*
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
* ple_gap: upper bound on the amount of time between two successive
@@ -446,23 +462,31 @@ struct nested_vmx {
u16 vpid02;
u16 last_vpid;
+ /*
+ * We only store the "true" versions of the VMX capability MSRs. We
+ * generate the "non-true" versions by setting the must-be-1 bits
+ * according to the SDM.
+ */
u32 nested_vmx_procbased_ctls_low;
u32 nested_vmx_procbased_ctls_high;
- u32 nested_vmx_true_procbased_ctls_low;
u32 nested_vmx_secondary_ctls_low;
u32 nested_vmx_secondary_ctls_high;
u32 nested_vmx_pinbased_ctls_low;
u32 nested_vmx_pinbased_ctls_high;
u32 nested_vmx_exit_ctls_low;
u32 nested_vmx_exit_ctls_high;
- u32 nested_vmx_true_exit_ctls_low;
u32 nested_vmx_entry_ctls_low;
u32 nested_vmx_entry_ctls_high;
- u32 nested_vmx_true_entry_ctls_low;
u32 nested_vmx_misc_low;
u32 nested_vmx_misc_high;
u32 nested_vmx_ept_caps;
u32 nested_vmx_vpid_caps;
+ u64 nested_vmx_basic;
+ u64 nested_vmx_cr0_fixed0;
+ u64 nested_vmx_cr0_fixed1;
+ u64 nested_vmx_cr4_fixed0;
+ u64 nested_vmx_cr4_fixed1;
+ u64 nested_vmx_vmcs_enum;
};
#define POSTED_INTR_ON 0
@@ -520,6 +544,12 @@ static inline void pi_set_sn(struct pi_desc *pi_desc)
(unsigned long *)&pi_desc->control);
}
+static inline void pi_clear_on(struct pi_desc *pi_desc)
+{
+ clear_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
static inline int pi_test_on(struct pi_desc *pi_desc)
{
return test_bit(POSTED_INTR_ON,
@@ -920,16 +950,32 @@ static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
-static unsigned long *vmx_io_bitmap_a;
-static unsigned long *vmx_io_bitmap_b;
-static unsigned long *vmx_msr_bitmap_legacy;
-static unsigned long *vmx_msr_bitmap_longmode;
-static unsigned long *vmx_msr_bitmap_legacy_x2apic;
-static unsigned long *vmx_msr_bitmap_longmode_x2apic;
-static unsigned long *vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
-static unsigned long *vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
-static unsigned long *vmx_vmread_bitmap;
-static unsigned long *vmx_vmwrite_bitmap;
+enum {
+ VMX_IO_BITMAP_A,
+ VMX_IO_BITMAP_B,
+ VMX_MSR_BITMAP_LEGACY,
+ VMX_MSR_BITMAP_LONGMODE,
+ VMX_MSR_BITMAP_LEGACY_X2APIC_APICV,
+ VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV,
+ VMX_MSR_BITMAP_LEGACY_X2APIC,
+ VMX_MSR_BITMAP_LONGMODE_X2APIC,
+ VMX_VMREAD_BITMAP,
+ VMX_VMWRITE_BITMAP,
+ VMX_BITMAP_NR
+};
+
+static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
+
+#define vmx_io_bitmap_a (vmx_bitmap[VMX_IO_BITMAP_A])
+#define vmx_io_bitmap_b (vmx_bitmap[VMX_IO_BITMAP_B])
+#define vmx_msr_bitmap_legacy (vmx_bitmap[VMX_MSR_BITMAP_LEGACY])
+#define vmx_msr_bitmap_longmode (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE])
+#define vmx_msr_bitmap_legacy_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC_APICV])
+#define vmx_msr_bitmap_longmode_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV])
+#define vmx_msr_bitmap_legacy_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC])
+#define vmx_msr_bitmap_longmode_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC])
+#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
+#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])
static bool cpu_has_load_ia32_efer;
static bool cpu_has_load_perf_global_ctrl;
@@ -2523,14 +2569,14 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
if (is_long_mode(vcpu))
- msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
+ msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv;
else
- msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
+ msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv;
} else {
if (is_long_mode(vcpu))
- msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
+ msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
else
- msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
+ msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
}
} else {
if (is_long_mode(vcpu))
@@ -2706,9 +2752,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
/* We support free control of debug control saving. */
- vmx->nested.nested_vmx_true_exit_ctls_low =
- vmx->nested.nested_vmx_exit_ctls_low &
- ~VM_EXIT_SAVE_DEBUG_CONTROLS;
+ vmx->nested.nested_vmx_exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
/* entry controls */
rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2727,9 +2771,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
/* We support free control of debug control loading. */
- vmx->nested.nested_vmx_true_entry_ctls_low =
- vmx->nested.nested_vmx_entry_ctls_low &
- ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
+ vmx->nested.nested_vmx_entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
/* cpu-based controls */
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
@@ -2762,8 +2804,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
CPU_BASED_USE_MSR_BITMAPS;
/* We support free control of CR3 access interception. */
- vmx->nested.nested_vmx_true_procbased_ctls_low =
- vmx->nested.nested_vmx_procbased_ctls_low &
+ vmx->nested.nested_vmx_procbased_ctls_low &=
~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
/* secondary cpu-based controls */
@@ -2774,6 +2815,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
vmx->nested.nested_vmx_secondary_ctls_high &=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_RDTSCP |
+ SECONDARY_EXEC_DESC |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_ENABLE_VPID |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
@@ -2805,8 +2847,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
*/
if (enable_vpid)
vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
- VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |
- VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
+ VMX_VPID_EXTENT_SUPPORTED_MASK;
else
vmx->nested.nested_vmx_vpid_caps = 0;
@@ -2823,14 +2864,52 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
VMX_MISC_ACTIVITY_HLT;
vmx->nested.nested_vmx_misc_high = 0;
+
+ /*
+ * This MSR reports some information about VMX support. We
+ * should return information about the VMX we emulate for the
+ * guest, and the VMCS structure we give it - not about the
+ * VMX support of the underlying hardware.
+ */
+ vmx->nested.nested_vmx_basic =
+ VMCS12_REVISION |
+ VMX_BASIC_TRUE_CTLS |
+ ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
+ (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
+
+ if (cpu_has_vmx_basic_inout())
+ vmx->nested.nested_vmx_basic |= VMX_BASIC_INOUT;
+
+ /*
+ * These MSRs specify bits which the guest must keep fixed on
+ * while L1 is in VMXON mode (in L1's root mode, or running an L2).
+ * We picked the standard core2 setting.
+ */
+#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
+#define VMXON_CR4_ALWAYSON X86_CR4_VMXE
+ vmx->nested.nested_vmx_cr0_fixed0 = VMXON_CR0_ALWAYSON;
+ vmx->nested.nested_vmx_cr4_fixed0 = VMXON_CR4_ALWAYSON;
+
+ /* These MSRs specify bits which the guest must keep fixed off. */
+ rdmsrl(MSR_IA32_VMX_CR0_FIXED1, vmx->nested.nested_vmx_cr0_fixed1);
+ rdmsrl(MSR_IA32_VMX_CR4_FIXED1, vmx->nested.nested_vmx_cr4_fixed1);
+
+ /* highest index: VMX_PREEMPTION_TIMER_VALUE */
+ vmx->nested.nested_vmx_vmcs_enum = 0x2e;
+}
+
+/*
+ * if fixed0[i] == 1: val[i] must be 1
+ * if fixed1[i] == 0: val[i] must be 0
+ */
+static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1)
+{
+ return ((val & fixed1) | fixed0) == val;
}
static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
- /*
- * Bits 0 in high must be 0, and bits 1 in low must be 1.
- */
- return ((control & high) | low) == control;
+ return fixed_bits_valid(control, low, high);
}
static inline u64 vmx_control_msr(u32 low, u32 high)
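As a side note, the fixed0/fixed1 rule encoded by fixed_bits_valid() above can be exercised in isolation. The snippet below is an illustrative userspace re-implementation, not part of the patch; the CR0 constant mirrors VMXON_CR0_ALWAYSON (PE | NE | PG) from this file, while the test values are made up.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Same rule as the kernel helper:
 * if fixed0[i] == 1, val[i] must be 1; if fixed1[i] == 0, val[i] must be 0. */
static bool fixed_bits_valid(uint64_t val, uint64_t fixed0, uint64_t fixed1)
{
    return ((val & fixed1) | fixed0) == val;
}

int main(void)
{
    const uint64_t cr0_fixed0 = 0x1 | 0x20 | 0x80000000u;  /* PE | NE | PG */
    const uint64_t cr0_fixed1 = 0xffffffffu;               /* every low bit allowed-1 */

    assert( fixed_bits_valid(0x080010021ULL, cr0_fixed0, cr0_fixed1)); /* PE|NE|WP|PG: ok */
    assert(!fixed_bits_valid(0x000000021ULL, cr0_fixed0, cr0_fixed1)); /* PG clear: must-be-1 bit missing */
    assert(!fixed_bits_valid(0x180000021ULL, cr0_fixed0, cr0_fixed1)); /* bit 32 set: not allowed-1 */
    printf("fixed-bit checks behave as expected\n");
    return 0;
}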
@@ -2838,87 +2917,285 @@ static inline u64 vmx_control_msr(u32 low, u32 high)
return low | ((u64)high << 32);
}
-/* Returns 0 on success, non-0 otherwise. */
-static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
+{
+ superset &= mask;
+ subset &= mask;
+
+ return (superset | subset) == superset;
+}
+
+static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
+{
+ const u64 feature_and_reserved =
+ /* feature (except bit 48; see below) */
+ BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
+ /* reserved */
+ BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
+ u64 vmx_basic = vmx->nested.nested_vmx_basic;
+
+ if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
+ return -EINVAL;
+
+ /*
+ * KVM does not emulate a version of VMX that constrains physical
+ * addresses of VMX structures (e.g. VMCS) to 32-bits.
+ */
+ if (data & BIT_ULL(48))
+ return -EINVAL;
+
+ if (vmx_basic_vmcs_revision_id(vmx_basic) !=
+ vmx_basic_vmcs_revision_id(data))
+ return -EINVAL;
+
+ if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
+ return -EINVAL;
+
+ vmx->nested.nested_vmx_basic = data;
+ return 0;
+}
+
+static int
+vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
+{
+ u64 supported;
+ u32 *lowp, *highp;
+
+ switch (msr_index) {
+ case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+ lowp = &vmx->nested.nested_vmx_pinbased_ctls_low;
+ highp = &vmx->nested.nested_vmx_pinbased_ctls_high;
+ break;
+ case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+ lowp = &vmx->nested.nested_vmx_procbased_ctls_low;
+ highp = &vmx->nested.nested_vmx_procbased_ctls_high;
+ break;
+ case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+ lowp = &vmx->nested.nested_vmx_exit_ctls_low;
+ highp = &vmx->nested.nested_vmx_exit_ctls_high;
+ break;
+ case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+ lowp = &vmx->nested.nested_vmx_entry_ctls_low;
+ highp = &vmx->nested.nested_vmx_entry_ctls_high;
+ break;
+ case MSR_IA32_VMX_PROCBASED_CTLS2:
+ lowp = &vmx->nested.nested_vmx_secondary_ctls_low;
+ highp = &vmx->nested.nested_vmx_secondary_ctls_high;
+ break;
+ default:
+ BUG();
+ }
+
+ supported = vmx_control_msr(*lowp, *highp);
+
+ /* Check must-be-1 bits are still 1. */
+ if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
+ return -EINVAL;
+
+ /* Check must-be-0 bits are still 0. */
+ if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
+ return -EINVAL;
+
+ *lowp = data;
+ *highp = data >> 32;
+ return 0;
+}
+
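For reference, the accept/reject rule that vmx_restore_control_msr() applies (must-be-1 bits in the low word, allowed-1 bits in the high word) can be modelled outside the kernel. The sketch below is illustrative only and uses a toy "supported" value rather than real hardware capability data.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Mirror of the kernel helper: every masked bit of subset must also be in superset. */
static bool is_bitwise_subset(uint64_t superset, uint64_t subset, uint64_t mask)
{
    superset &= mask;
    subset &= mask;
    return (superset | subset) == superset;
}

/* Accept a restored control MSR only if it tightens, never widens, the capabilities. */
static bool restore_ok(uint64_t supported, uint64_t data)
{
    /* Must-be-1 bits (low word) may not be cleared... */
    if (!is_bitwise_subset(data, supported, 0xffffffffULL))
        return false;
    /* ...and no allowed-1 bit (high word) may be added beyond what is supported. */
    return is_bitwise_subset(supported, data, 0xffffffff00000000ULL);
}

int main(void)
{
    const uint64_t supported = (0xa0ULL << 32) | 0x16;       /* toy capability value */

    assert( restore_ok(supported, (0x020ULL << 32) | 0x16)); /* dropping an allowed-1 bit: fine */
    assert(!restore_ok(supported, (0x0a0ULL << 32) | 0x06)); /* clearing a must-be-1 bit: rejected */
    assert(!restore_ok(supported, (0x1a0ULL << 32) | 0x16)); /* advertising an unsupported bit: rejected */
    printf("restore checks behave as expected\n");
    return 0;
}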
+static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
+{
+ const u64 feature_and_reserved_bits =
+ /* feature */
+ BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
+ BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
+ /* reserved */
+ GENMASK_ULL(13, 9) | BIT_ULL(31);
+ u64 vmx_misc;
+
+ vmx_misc = vmx_control_msr(vmx->nested.nested_vmx_misc_low,
+ vmx->nested.nested_vmx_misc_high);
+
+ if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
+ return -EINVAL;
+
+ if ((vmx->nested.nested_vmx_pinbased_ctls_high &
+ PIN_BASED_VMX_PREEMPTION_TIMER) &&
+ vmx_misc_preemption_timer_rate(data) !=
+ vmx_misc_preemption_timer_rate(vmx_misc))
+ return -EINVAL;
+
+ if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
+ return -EINVAL;
+
+ if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
+ return -EINVAL;
+
+ if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
+ return -EINVAL;
+
+ vmx->nested.nested_vmx_misc_low = data;
+ vmx->nested.nested_vmx_misc_high = data >> 32;
+ return 0;
+}
+
+static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
+{
+ u64 vmx_ept_vpid_cap;
+
+ vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.nested_vmx_ept_caps,
+ vmx->nested.nested_vmx_vpid_caps);
+
+ /* Every bit is either reserved or a feature bit. */
+ if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
+ return -EINVAL;
+
+ vmx->nested.nested_vmx_ept_caps = data;
+ vmx->nested.nested_vmx_vpid_caps = data >> 32;
+ return 0;
+}
+
+static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
+{
+ u64 *msr;
+
+ switch (msr_index) {
+ case MSR_IA32_VMX_CR0_FIXED0:
+ msr = &vmx->nested.nested_vmx_cr0_fixed0;
+ break;
+ case MSR_IA32_VMX_CR4_FIXED0:
+ msr = &vmx->nested.nested_vmx_cr4_fixed0;
+ break;
+ default:
+ BUG();
+ }
+
+ /*
+ * 1 bits (which indicates bits which "must-be-1" during VMX operation)
+ * must be 1 in the restored value.
+ */
+ if (!is_bitwise_subset(data, *msr, -1ULL))
+ return -EINVAL;
+
+ *msr = data;
+ return 0;
+}
+
+/*
+ * Called when userspace is restoring VMX MSRs.
+ *
+ * Returns 0 on success, non-0 otherwise.
+ */
+static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
switch (msr_index) {
case MSR_IA32_VMX_BASIC:
+ return vmx_restore_vmx_basic(vmx, data);
+ case MSR_IA32_VMX_PINBASED_CTLS:
+ case MSR_IA32_VMX_PROCBASED_CTLS:
+ case MSR_IA32_VMX_EXIT_CTLS:
+ case MSR_IA32_VMX_ENTRY_CTLS:
+ /*
+ * The "non-true" VMX capability MSRs are generated from the
+ * "true" MSRs, so we do not support restoring them directly.
+ *
+ * If userspace wants to emulate VMX_BASIC[55]=0, userspace
+ * should restore the "true" MSRs with the must-be-1 bits
+ * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
+ * DEFAULT SETTINGS".
+ */
+ return -EINVAL;
+ case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+ case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+ case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+ case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+ case MSR_IA32_VMX_PROCBASED_CTLS2:
+ return vmx_restore_control_msr(vmx, msr_index, data);
+ case MSR_IA32_VMX_MISC:
+ return vmx_restore_vmx_misc(vmx, data);
+ case MSR_IA32_VMX_CR0_FIXED0:
+ case MSR_IA32_VMX_CR4_FIXED0:
+ return vmx_restore_fixed0_msr(vmx, msr_index, data);
+ case MSR_IA32_VMX_CR0_FIXED1:
+ case MSR_IA32_VMX_CR4_FIXED1:
+ /*
+ * These MSRs are generated based on the vCPU's CPUID, so we
+ * do not support restoring them directly.
+ */
+ return -EINVAL;
+ case MSR_IA32_VMX_EPT_VPID_CAP:
+ return vmx_restore_vmx_ept_vpid_cap(vmx, data);
+ case MSR_IA32_VMX_VMCS_ENUM:
+ vmx->nested.nested_vmx_vmcs_enum = data;
+ return 0;
+ default:
/*
- * This MSR reports some information about VMX support. We
- * should return information about the VMX we emulate for the
- * guest, and the VMCS structure we give it - not about the
- * VMX support of the underlying hardware.
+ * The rest of the VMX capability MSRs do not support restore.
*/
- *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS |
- ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
- (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
- if (cpu_has_vmx_basic_inout())
- *pdata |= VMX_BASIC_INOUT;
+ return -EINVAL;
+ }
+}
+
+/* Returns 0 on success, non-0 otherwise. */
+static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ switch (msr_index) {
+ case MSR_IA32_VMX_BASIC:
+ *pdata = vmx->nested.nested_vmx_basic;
break;
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
case MSR_IA32_VMX_PINBASED_CTLS:
*pdata = vmx_control_msr(
vmx->nested.nested_vmx_pinbased_ctls_low,
vmx->nested.nested_vmx_pinbased_ctls_high);
+ if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
+ *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
break;
case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
- *pdata = vmx_control_msr(
- vmx->nested.nested_vmx_true_procbased_ctls_low,
- vmx->nested.nested_vmx_procbased_ctls_high);
- break;
case MSR_IA32_VMX_PROCBASED_CTLS:
*pdata = vmx_control_msr(
vmx->nested.nested_vmx_procbased_ctls_low,
vmx->nested.nested_vmx_procbased_ctls_high);
+ if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
+ *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
break;
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
- *pdata = vmx_control_msr(
- vmx->nested.nested_vmx_true_exit_ctls_low,
- vmx->nested.nested_vmx_exit_ctls_high);
- break;
case MSR_IA32_VMX_EXIT_CTLS:
*pdata = vmx_control_msr(
vmx->nested.nested_vmx_exit_ctls_low,
vmx->nested.nested_vmx_exit_ctls_high);
+ if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
+ *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
break;
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
- *pdata = vmx_control_msr(
- vmx->nested.nested_vmx_true_entry_ctls_low,
- vmx->nested.nested_vmx_entry_ctls_high);
- break;
case MSR_IA32_VMX_ENTRY_CTLS:
*pdata = vmx_control_msr(
vmx->nested.nested_vmx_entry_ctls_low,
vmx->nested.nested_vmx_entry_ctls_high);
+ if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
+ *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
break;
case MSR_IA32_VMX_MISC:
*pdata = vmx_control_msr(
vmx->nested.nested_vmx_misc_low,
vmx->nested.nested_vmx_misc_high);
break;
- /*
- * These MSRs specify bits which the guest must keep fixed (on or off)
- * while L1 is in VMXON mode (in L1's root mode, or running an L2).
- * We picked the standard core2 setting.
- */
-#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
-#define VMXON_CR4_ALWAYSON X86_CR4_VMXE
case MSR_IA32_VMX_CR0_FIXED0:
- *pdata = VMXON_CR0_ALWAYSON;
+ *pdata = vmx->nested.nested_vmx_cr0_fixed0;
break;
case MSR_IA32_VMX_CR0_FIXED1:
- *pdata = -1ULL;
+ *pdata = vmx->nested.nested_vmx_cr0_fixed1;
break;
case MSR_IA32_VMX_CR4_FIXED0:
- *pdata = VMXON_CR4_ALWAYSON;
+ *pdata = vmx->nested.nested_vmx_cr4_fixed0;
break;
case MSR_IA32_VMX_CR4_FIXED1:
- *pdata = -1ULL;
+ *pdata = vmx->nested.nested_vmx_cr4_fixed1;
break;
case MSR_IA32_VMX_VMCS_ENUM:
- *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */
+ *pdata = vmx->nested.nested_vmx_vmcs_enum;
break;
case MSR_IA32_VMX_PROCBASED_CTLS2:
*pdata = vmx_control_msr(
@@ -3101,7 +3378,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vmx_leave_nested(vcpu);
break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
- return 1; /* they are read-only */
+ if (!msr_info->host_initiated)
+ return 1; /* they are read-only */
+ if (!nested_vmx_allowed(vcpu))
+ return 1;
+ return vmx_set_vmx_msr(vcpu, msr_index, data);
case MSR_IA32_XSS:
if (!vmx_xsaves_supported())
return 1;
@@ -3863,6 +4144,40 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
(unsigned long *)&vcpu->arch.regs_dirty);
}
+static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0;
+ u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1;
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high &
+ SECONDARY_EXEC_UNRESTRICTED_GUEST &&
+ nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
+ fixed0 &= ~(X86_CR0_PE | X86_CR0_PG);
+
+ return fixed_bits_valid(val, fixed0, fixed1);
+}
+
+static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0;
+ u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1;
+
+ return fixed_bits_valid(val, fixed0, fixed1);
+}
+
+static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed0;
+ u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed1;
+
+ return fixed_bits_valid(val, fixed0, fixed1);
+}
+
+/* No difference in the restrictions on guest and host CR4 in VMX operation. */
+#define nested_guest_cr4_valid nested_cr4_valid
+#define nested_host_cr4_valid nested_cr4_valid
+
static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
@@ -3991,8 +4306,8 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if (!nested_vmx_allowed(vcpu))
return 1;
}
- if (to_vmx(vcpu)->nested.vmxon &&
- ((cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON))
+
+ if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
return 1;
vcpu->arch.cr4 = cr4;
@@ -4569,41 +4884,6 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
}
}
-static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
- u32 msr, int type)
-{
- int f = sizeof(unsigned long);
-
- if (!cpu_has_vmx_msr_bitmap())
- return;
-
- /*
- * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
- * have the write-low and read-high bitmap offsets the wrong way round.
- * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
- */
- if (msr <= 0x1fff) {
- if (type & MSR_TYPE_R)
- /* read-low */
- __set_bit(msr, msr_bitmap + 0x000 / f);
-
- if (type & MSR_TYPE_W)
- /* write-low */
- __set_bit(msr, msr_bitmap + 0x800 / f);
-
- } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
- msr &= 0x1fff;
- if (type & MSR_TYPE_R)
- /* read-high */
- __set_bit(msr, msr_bitmap + 0x400 / f);
-
- if (type & MSR_TYPE_W)
- /* write-high */
- __set_bit(msr, msr_bitmap + 0xc00 / f);
-
- }
-}
-
/*
* If a msr is allowed by L0, we should check whether it is allowed by L1.
* The corresponding bit will be cleared unless both of L0 and L1 allow it.
@@ -4659,48 +4939,18 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
msr, MSR_TYPE_R | MSR_TYPE_W);
}
-static void vmx_enable_intercept_msr_read_x2apic(u32 msr, bool apicv_active)
-{
- if (apicv_active) {
- __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
- msr, MSR_TYPE_R);
- __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
- msr, MSR_TYPE_R);
- } else {
- __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
- msr, MSR_TYPE_R);
- __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
- msr, MSR_TYPE_R);
- }
-}
-
-static void vmx_disable_intercept_msr_read_x2apic(u32 msr, bool apicv_active)
+static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_active)
{
if (apicv_active) {
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
- msr, MSR_TYPE_R);
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
- msr, MSR_TYPE_R);
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv,
+ msr, type);
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv,
+ msr, type);
} else {
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
- msr, MSR_TYPE_R);
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
- msr, MSR_TYPE_R);
- }
-}
-
-static void vmx_disable_intercept_msr_write_x2apic(u32 msr, bool apicv_active)
-{
- if (apicv_active) {
__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
- msr, MSR_TYPE_W);
+ msr, type);
__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
- msr, MSR_TYPE_W);
- } else {
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
- msr, MSR_TYPE_W);
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
- msr, MSR_TYPE_W);
+ msr, type);
}
}
@@ -4822,9 +5072,15 @@ static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (!pi_test_and_clear_on(&vmx->pi_desc))
+ if (!pi_test_on(&vmx->pi_desc))
return;
+ pi_clear_on(&vmx->pi_desc);
+ /*
+ * IOMMU can write to PIR.ON, so the barrier matters even on UP.
+ * But on x86 this is just a compiler barrier anyway.
+ */
+ smp_mb__after_atomic();
kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
}
@@ -5583,7 +5839,7 @@ static int handle_triple_fault(struct kvm_vcpu *vcpu)
static int handle_io(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification;
- int size, in, string;
+ int size, in, string, ret;
unsigned port;
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -5597,9 +5853,14 @@ static int handle_io(struct kvm_vcpu *vcpu)
port = exit_qualification >> 16;
size = (exit_qualification & 7) + 1;
- skip_emulated_instruction(vcpu);
- return kvm_fast_pio_out(vcpu, size, port);
+ ret = kvm_skip_emulated_instruction(vcpu);
+
+ /*
+ * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered
+ * KVM_EXIT_DEBUG here.
+ */
+ return kvm_fast_pio_out(vcpu, size, port) && ret;
}
static void
@@ -5613,18 +5874,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
hypercall[2] = 0xc1;
}
-static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
-{
- unsigned long always_on = VMXON_CR0_ALWAYSON;
- struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-
- if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high &
- SECONDARY_EXEC_UNRESTRICTED_GUEST &&
- nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
- always_on &= ~(X86_CR0_PE | X86_CR0_PG);
- return (val & always_on) == always_on;
-}
-
/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
{
@@ -5643,7 +5892,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
val = (val & ~vmcs12->cr0_guest_host_mask) |
(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
- if (!nested_cr0_valid(vcpu, val))
+ if (!nested_guest_cr0_valid(vcpu, val))
return 1;
if (kvm_set_cr0(vcpu, val))
@@ -5652,8 +5901,9 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
return 0;
} else {
if (to_vmx(vcpu)->nested.vmxon &&
- ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON))
+ !nested_host_cr0_valid(vcpu, val))
return 1;
+
return kvm_set_cr0(vcpu, val);
}
}
@@ -5697,6 +5947,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
int cr;
int reg;
int err;
+ int ret;
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
cr = exit_qualification & 15;
@@ -5708,25 +5959,27 @@ static int handle_cr(struct kvm_vcpu *vcpu)
switch (cr) {
case 0:
err = handle_set_cr0(vcpu, val);
- kvm_complete_insn_gp(vcpu, err);
- return 1;
+ return kvm_complete_insn_gp(vcpu, err);
case 3:
err = kvm_set_cr3(vcpu, val);
- kvm_complete_insn_gp(vcpu, err);
- return 1;
+ return kvm_complete_insn_gp(vcpu, err);
case 4:
err = handle_set_cr4(vcpu, val);
- kvm_complete_insn_gp(vcpu, err);
- return 1;
+ return kvm_complete_insn_gp(vcpu, err);
case 8: {
u8 cr8_prev = kvm_get_cr8(vcpu);
u8 cr8 = (u8)val;
err = kvm_set_cr8(vcpu, cr8);
- kvm_complete_insn_gp(vcpu, err);
+ ret = kvm_complete_insn_gp(vcpu, err);
if (lapic_in_kernel(vcpu))
- return 1;
+ return ret;
if (cr8_prev <= cr8)
- return 1;
+ return ret;
+ /*
+ * TODO: we might be squashing a
+ * KVM_GUESTDBG_SINGLESTEP-triggered
+ * KVM_EXIT_DEBUG here.
+ */
vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
return 0;
}
@@ -5735,23 +5988,20 @@ static int handle_cr(struct kvm_vcpu *vcpu)
case 2: /* clts */
handle_clts(vcpu);
trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
- skip_emulated_instruction(vcpu);
vmx_fpu_activate(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
case 1: /*mov from cr*/
switch (cr) {
case 3:
val = kvm_read_cr3(vcpu);
kvm_register_write(vcpu, reg, val);
trace_kvm_cr_read(cr, val);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
case 8:
val = kvm_get_cr8(vcpu);
kvm_register_write(vcpu, reg, val);
trace_kvm_cr_read(cr, val);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
break;
case 3: /* lmsw */
@@ -5759,8 +6009,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
kvm_lmsw(vcpu, val);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
default:
break;
}
@@ -5831,8 +6080,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
return 1;
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
static u64 vmx_get_dr6(struct kvm_vcpu *vcpu)
@@ -5864,8 +6112,7 @@ static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
static int handle_cpuid(struct kvm_vcpu *vcpu)
{
- kvm_emulate_cpuid(vcpu);
- return 1;
+ return kvm_emulate_cpuid(vcpu);
}
static int handle_rdmsr(struct kvm_vcpu *vcpu)
@@ -5886,8 +6133,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu)
/* FIXME: handling of bits 32:63 of rax, rdx */
vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u;
vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u;
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
static int handle_wrmsr(struct kvm_vcpu *vcpu)
@@ -5907,8 +6153,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)
}
trace_kvm_msr_write(ecx, data);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
@@ -5952,8 +6197,7 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
kvm_mmu_invlpg(vcpu, exit_qualification);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
static int handle_rdpmc(struct kvm_vcpu *vcpu)
@@ -5961,15 +6205,12 @@ static int handle_rdpmc(struct kvm_vcpu *vcpu)
int err;
err = kvm_rdpmc(vcpu);
- kvm_complete_insn_gp(vcpu, err);
-
- return 1;
+ return kvm_complete_insn_gp(vcpu, err);
}
static int handle_wbinvd(struct kvm_vcpu *vcpu)
{
- kvm_emulate_wbinvd(vcpu);
- return 1;
+ return kvm_emulate_wbinvd(vcpu);
}
static int handle_xsetbv(struct kvm_vcpu *vcpu)
@@ -5978,20 +6219,20 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
if (kvm_set_xcr(vcpu, index, new_bv) == 0)
- skip_emulated_instruction(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
return 1;
}
static int handle_xsaves(struct kvm_vcpu *vcpu)
{
- skip_emulated_instruction(vcpu);
+ kvm_skip_emulated_instruction(vcpu);
WARN(1, "this should never happen\n");
return 1;
}
static int handle_xrstors(struct kvm_vcpu *vcpu)
{
- skip_emulated_instruction(vcpu);
+ kvm_skip_emulated_instruction(vcpu);
WARN(1, "this should never happen\n");
return 1;
}
@@ -6012,8 +6253,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu)
if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
(offset == APIC_EOI)) {
kvm_lapic_set_eoi(vcpu);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
}
return emulate_instruction(vcpu, 0) == EMULATE_DONE;
@@ -6161,9 +6401,8 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
- skip_emulated_instruction(vcpu);
trace_kvm_fast_mmio(gpa);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
ret = handle_mmio_page_fault(vcpu, gpa, true);
@@ -6348,50 +6587,13 @@ static __init int hardware_setup(void)
for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
kvm_define_shared_msr(i, vmx_msr_index[i]);
- vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_io_bitmap_a)
- return r;
+ for (i = 0; i < VMX_BITMAP_NR; i++) {
+ vmx_bitmap[i] = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_bitmap[i])
+ goto out;
+ }
- vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_io_bitmap_b)
- goto out;
-
- vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_legacy)
- goto out1;
-
- vmx_msr_bitmap_legacy_x2apic =
- (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_legacy_x2apic)
- goto out2;
-
- vmx_msr_bitmap_legacy_x2apic_apicv_inactive =
- (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_legacy_x2apic_apicv_inactive)
- goto out3;
-
- vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_longmode)
- goto out4;
-
- vmx_msr_bitmap_longmode_x2apic =
- (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_longmode_x2apic)
- goto out5;
-
- vmx_msr_bitmap_longmode_x2apic_apicv_inactive =
- (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_longmode_x2apic_apicv_inactive)
- goto out6;
-
- vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_vmread_bitmap)
- goto out7;
-
- vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_vmwrite_bitmap)
- goto out8;
-
memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
@@ -6409,7 +6611,7 @@ static __init int hardware_setup(void)
if (setup_vmcs_config(&vmcs_config) < 0) {
r = -EIO;
- goto out9;
+ goto out;
}
if (boot_cpu_has(X86_FEATURE_NX))
@@ -6472,39 +6674,34 @@ static __init int hardware_setup(void)
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true);
- memcpy(vmx_msr_bitmap_legacy_x2apic,
+ memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
vmx_msr_bitmap_legacy, PAGE_SIZE);
- memcpy(vmx_msr_bitmap_longmode_x2apic,
+ memcpy(vmx_msr_bitmap_longmode_x2apic_apicv,
vmx_msr_bitmap_longmode, PAGE_SIZE);
- memcpy(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+ memcpy(vmx_msr_bitmap_legacy_x2apic,
vmx_msr_bitmap_legacy, PAGE_SIZE);
- memcpy(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+ memcpy(vmx_msr_bitmap_longmode_x2apic,
vmx_msr_bitmap_longmode, PAGE_SIZE);
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
+ for (msr = 0x800; msr <= 0x8ff; msr++) {
+ if (msr == 0x839 /* TMCCT */)
+ continue;
+ vmx_disable_intercept_msr_x2apic(msr, MSR_TYPE_R, true);
+ }
+
/*
- * enable_apicv && kvm_vcpu_apicv_active()
+ * TPR reads and writes can be virtualized even if virtual interrupt
+ * delivery is not in use.
*/
- for (msr = 0x800; msr <= 0x8ff; msr++)
- vmx_disable_intercept_msr_read_x2apic(msr, true);
+ vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_W, true);
+ vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false);
- /* TMCCT */
- vmx_enable_intercept_msr_read_x2apic(0x839, true);
- /* TPR */
- vmx_disable_intercept_msr_write_x2apic(0x808, true);
/* EOI */
- vmx_disable_intercept_msr_write_x2apic(0x80b, true);
+ vmx_disable_intercept_msr_x2apic(0x80b, MSR_TYPE_W, true);
/* SELF-IPI */
- vmx_disable_intercept_msr_write_x2apic(0x83f, true);
-
- /*
- * (enable_apicv && !kvm_vcpu_apicv_active()) ||
- * !enable_apicv
- */
- /* TPR */
- vmx_disable_intercept_msr_read_x2apic(0x808, false);
- vmx_disable_intercept_msr_write_x2apic(0x808, false);
+ vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true);
if (enable_ept) {
kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
@@ -6551,42 +6748,19 @@ static __init int hardware_setup(void)
return alloc_kvm_area();
-out9:
- free_page((unsigned long)vmx_vmwrite_bitmap);
-out8:
- free_page((unsigned long)vmx_vmread_bitmap);
-out7:
- free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive);
-out6:
- free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
-out5:
- free_page((unsigned long)vmx_msr_bitmap_longmode);
-out4:
- free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive);
-out3:
- free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
-out2:
- free_page((unsigned long)vmx_msr_bitmap_legacy);
-out1:
- free_page((unsigned long)vmx_io_bitmap_b);
out:
- free_page((unsigned long)vmx_io_bitmap_a);
+ for (i = 0; i < VMX_BITMAP_NR; i++)
+ free_page((unsigned long)vmx_bitmap[i]);
return r;
}
static __exit void hardware_unsetup(void)
{
- free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
- free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive);
- free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
- free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive);
- free_page((unsigned long)vmx_msr_bitmap_legacy);
- free_page((unsigned long)vmx_msr_bitmap_longmode);
- free_page((unsigned long)vmx_io_bitmap_b);
- free_page((unsigned long)vmx_io_bitmap_a);
- free_page((unsigned long)vmx_vmwrite_bitmap);
- free_page((unsigned long)vmx_vmread_bitmap);
+ int i;
+
+ for (i = 0; i < VMX_BITMAP_NR; i++)
+ free_page((unsigned long)vmx_bitmap[i]);
free_kvm_area();
}
@@ -6600,16 +6774,13 @@ static int handle_pause(struct kvm_vcpu *vcpu)
if (ple_gap)
grow_ple_window(vcpu);
- skip_emulated_instruction(vcpu);
kvm_vcpu_on_spin(vcpu);
-
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
static int handle_nop(struct kvm_vcpu *vcpu)
{
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
static int handle_mwait(struct kvm_vcpu *vcpu)
@@ -6916,8 +7087,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
*/
if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
nested_vmx_failInvalid(vcpu);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
page = nested_get_page(vcpu, vmptr);
@@ -6925,8 +7095,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
*(u32 *)kmap(page) != VMCS12_REVISION) {
nested_vmx_failInvalid(vcpu);
kunmap(page);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
kunmap(page);
vmx->nested.vmxon_ptr = vmptr;
@@ -6935,30 +7104,26 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
nested_vmx_failValid(vcpu,
VMXERR_VMCLEAR_INVALID_ADDRESS);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
if (vmptr == vmx->nested.vmxon_ptr) {
nested_vmx_failValid(vcpu,
VMXERR_VMCLEAR_VMXON_POINTER);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
break;
case EXIT_REASON_VMPTRLD:
if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
nested_vmx_failValid(vcpu,
VMXERR_VMPTRLD_INVALID_ADDRESS);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
if (vmptr == vmx->nested.vmxon_ptr) {
nested_vmx_failValid(vcpu,
VMXERR_VMCLEAR_VMXON_POINTER);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
break;
default:
@@ -7014,8 +7179,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
if (vmx->nested.vmxon) {
nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
@@ -7055,9 +7219,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
vmx->nested.vmxon = true;
- skip_emulated_instruction(vcpu);
nested_vmx_succeed(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
out_shadow_vmcs:
kfree(vmx->nested.cached_vmcs12);
@@ -7176,9 +7339,8 @@ static int handle_vmoff(struct kvm_vcpu *vcpu)
if (!nested_vmx_check_permission(vcpu))
return 1;
free_nested(to_vmx(vcpu));
- skip_emulated_instruction(vcpu);
nested_vmx_succeed(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
/* Emulate the VMCLEAR instruction */
@@ -7217,9 +7379,8 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
nested_free_vmcs02(vmx, vmptr);
- skip_emulated_instruction(vcpu);
nested_vmx_succeed(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
@@ -7417,7 +7578,6 @@ static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
if (vmx->nested.current_vmptr == -1ull) {
nested_vmx_failInvalid(vcpu);
- skip_emulated_instruction(vcpu);
return 0;
}
return 1;
@@ -7431,17 +7591,18 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
gva_t gva = 0;
- if (!nested_vmx_check_permission(vcpu) ||
- !nested_vmx_check_vmcs12(vcpu))
+ if (!nested_vmx_check_permission(vcpu))
return 1;
+ if (!nested_vmx_check_vmcs12(vcpu))
+ return kvm_skip_emulated_instruction(vcpu);
+
/* Decode instruction info and find the field to read */
field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
/* Read the field, zero-extended to a u64 field_value */
if (vmcs12_read_any(vcpu, field, &field_value) < 0) {
nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
/*
* Now copy part of this value to register or memory, as requested.
@@ -7461,8 +7622,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
}
nested_vmx_succeed(vcpu);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
@@ -7481,10 +7641,12 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
u64 field_value = 0;
struct x86_exception e;
- if (!nested_vmx_check_permission(vcpu) ||
- !nested_vmx_check_vmcs12(vcpu))
+ if (!nested_vmx_check_permission(vcpu))
return 1;
+ if (!nested_vmx_check_vmcs12(vcpu))
+ return kvm_skip_emulated_instruction(vcpu);
+
if (vmx_instruction_info & (1u << 10))
field_value = kvm_register_readl(vcpu,
(((vmx_instruction_info) >> 3) & 0xf));
@@ -7504,19 +7666,16 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
if (vmcs_field_readonly(field)) {
nested_vmx_failValid(vcpu,
VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
if (vmcs12_write_any(vcpu, field, field_value) < 0) {
nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
nested_vmx_succeed(vcpu);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
/* Emulate the VMPTRLD instruction */
@@ -7537,8 +7696,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
page = nested_get_page(vcpu, vmptr);
if (page == NULL) {
nested_vmx_failInvalid(vcpu);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
new_vmcs12 = kmap(page);
if (new_vmcs12->revision_id != VMCS12_REVISION) {
@@ -7546,8 +7704,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
nested_release_page_clean(page);
nested_vmx_failValid(vcpu,
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
nested_release_vmcs12(vmx);
@@ -7571,8 +7728,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
}
nested_vmx_succeed(vcpu);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
/* Emulate the VMPTRST instruction */
@@ -7597,8 +7753,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
return 1;
}
nested_vmx_succeed(vcpu);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
/* Emulate the INVEPT instruction */
@@ -7636,8 +7791,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
if (type >= 32 || !(types & (1 << type))) {
nested_vmx_failValid(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
/* According to the Intel VMX instruction reference, the memory
@@ -7668,8 +7822,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
break;
}
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
static int handle_invvpid(struct kvm_vcpu *vcpu)
@@ -7694,13 +7847,13 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
- types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7;
+ types = (vmx->nested.nested_vmx_vpid_caps &
+ VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
if (type >= 32 || !(types & (1 << type))) {
nested_vmx_failValid(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
/* according to the intel vmx instruction reference, the memory
@@ -7716,23 +7869,26 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
}
switch (type) {
+ case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
case VMX_VPID_EXTENT_SINGLE_CONTEXT:
- /*
- * Old versions of KVM use the single-context version so we
- * have to support it; just treat it the same as all-context.
- */
+ case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
+ if (!vpid) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ break;
case VMX_VPID_EXTENT_ALL_CONTEXT:
- __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
- nested_vmx_succeed(vcpu);
break;
default:
- /* Trap individual address invalidation invvpid calls */
- BUG_ON(1);
- break;
+ WARN_ON_ONCE(1);
+ return kvm_skip_emulated_instruction(vcpu);
}
- skip_emulated_instruction(vcpu);
- return 1;
+ __vmx_flush_tlb(vcpu, vmx->nested.vpid02);
+ nested_vmx_succeed(vcpu);
+
+ return kvm_skip_emulated_instruction(vcpu);
}
static int handle_pml_full(struct kvm_vcpu *vcpu)
@@ -8071,6 +8227,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
case EXIT_REASON_IO_INSTRUCTION:
return nested_vmx_exit_handled_io(vcpu, vmcs12);
+ case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
case EXIT_REASON_MSR_READ:
case EXIT_REASON_MSR_WRITE:
return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
@@ -8620,11 +8778,6 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
register void *__sp asm(_ASM_SP);
- /*
- * If external interrupt exists, IF bit is set in rflags/eflags on the
- * interrupt stack frame, and interrupt will be enabled on a return
- * from interrupt handler.
- */
if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
== (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
unsigned int vector;
@@ -8809,7 +8962,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
msrs[i].host);
}
-void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
+static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 tscl;
@@ -9279,6 +9432,50 @@ static void vmcs_set_secondary_exec_control(u32 new_ctl)
(new_ctl & ~mask) | (cur_ctl & mask));
}
+/*
+ * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
+ * (indicating "allowed-1") if they are supported in the guest's CPUID.
+ */
+static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_cpuid_entry2 *entry;
+
+ vmx->nested.nested_vmx_cr0_fixed1 = 0xffffffff;
+ vmx->nested.nested_vmx_cr4_fixed1 = X86_CR4_PCE;
+
+#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \
+ if (entry && (entry->_reg & (_cpuid_mask))) \
+ vmx->nested.nested_vmx_cr4_fixed1 |= (_cr4_mask); \
+} while (0)
+
+ entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+ cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME));
+ cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME));
+ cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC));
+ cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE));
+ cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE));
+ cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE));
+ cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE));
+ cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE));
+ cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR));
+ cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM));
+ cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX));
+ cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX));
+ cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID));
+ cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE));
+
+ entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
+ cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE));
+ cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP));
+ cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP));
+ cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU));
+ /* TODO: Use X86_CR4_UMIP and X86_FEATURE_UMIP macros */
+ cr4_fixed1_update(bit(11), ecx, bit(2));
+
+#undef cr4_fixed1_update
+}
+
static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
@@ -9320,6 +9517,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
else
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+
+ if (nested_vmx_allowed(vcpu))
+ nested_vmx_cr_fixed1_bits_update(vcpu);
}
static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -9774,6 +9974,49 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
return 0;
}
+static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ unsigned long invalid_mask;
+
+ invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
+ return (val & invalid_mask) == 0;
+}
+
+/*
+ * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are
+ * emulating VM entry into a guest with EPT enabled.
+ * Returns 0 on success, 1 on failure. Invalid state exit qualification code
+ * is assigned to entry_failure_code on failure.
+ */
+static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
+ unsigned long *entry_failure_code)
+{
+ if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
+ if (!nested_cr3_valid(vcpu, cr3)) {
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
+ return 1;
+ }
+
+ /*
+ * If PAE paging and EPT are both on, CR3 is not used by the CPU and
+ * must not be dereferenced.
+ */
+ if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) &&
+ !nested_ept) {
+ if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) {
+ *entry_failure_code = ENTRY_FAIL_PDPTE;
+ return 1;
+ }
+ }
+
+ vcpu->arch.cr3 = cr3;
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+ }
+
+ kvm_mmu_reset_context(vcpu);
+ return 0;
+}
+
/*
* prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
* L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -9782,11 +10025,15 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
* needs. In addition to modifying the active vmcs (which is vmcs02), this
* function also has additional necessary side-effects, like setting various
* vcpu->arch fields.
+ * Returns 0 on success, 1 on failure. Invalid state exit qualification code
+ * is assigned to entry_failure_code on failure.
*/
-static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ unsigned long *entry_failure_code)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 exec_control;
+ bool nested_ept_enabled = false;
vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
@@ -9951,6 +10198,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs12->guest_intr_status);
}
+ nested_ept_enabled = (exec_control & SECONDARY_EXEC_ENABLE_EPT) != 0;
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
}
@@ -9964,6 +10212,15 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmx_set_constant_host_state(vmx);
/*
+ * Set the MSR load/store lists to match L0's settings.
+ */
+ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
+ vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
+ vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
+
+ /*
* HOST_RSP is normally set correctly in vmx_vcpu_run() just before
* entry, but only if the current (host) sp changed from the value
* we wrote last (vmx->host_rsp). This cache is no longer relevant
@@ -10069,15 +10326,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
nested_ept_init_mmu_context(vcpu);
}
- if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
- vcpu->arch.efer = vmcs12->guest_ia32_efer;
- else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
- vcpu->arch.efer |= (EFER_LMA | EFER_LME);
- else
- vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
- /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
- vmx_set_efer(vcpu, vcpu->arch.efer);
-
/*
* This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified
* TS bit (for lazy fpu) and bits which we consider mandatory enabled.
@@ -10092,8 +10340,20 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmx_set_cr4(vcpu, vmcs12->guest_cr4);
vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
- /* shadow page tables on either EPT or shadow page tables */
- kvm_set_cr3(vcpu, vmcs12->guest_cr3);
+ if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
+ vcpu->arch.efer = vmcs12->guest_ia32_efer;
+ else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
+ vcpu->arch.efer |= (EFER_LMA | EFER_LME);
+ else
+ vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
+ /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
+ vmx_set_efer(vcpu, vcpu->arch.efer);
+
+ /* Shadow page tables on either EPT or shadow page tables. */
+ if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_ept_enabled,
+ entry_failure_code))
+ return 1;
+
kvm_mmu_reset_context(vcpu);
if (!enable_ept)
@@ -10111,6 +10371,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
+ return 0;
}
/*
@@ -10125,12 +10386,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
struct loaded_vmcs *vmcs02;
bool ia32e;
u32 msr_entry_idx;
+ unsigned long exit_qualification;
- if (!nested_vmx_check_permission(vcpu) ||
- !nested_vmx_check_vmcs12(vcpu))
+ if (!nested_vmx_check_permission(vcpu))
return 1;
- skip_emulated_instruction(vcpu);
+ if (!nested_vmx_check_vmcs12(vcpu))
+ goto out;
+
vmcs12 = get_vmcs12(vcpu);
if (enable_shadow_vmcs)
@@ -10150,37 +10413,37 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
nested_vmx_failValid(vcpu,
launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
: VMXERR_VMRESUME_NONLAUNCHED_VMCS);
- return 1;
+ goto out;
}
if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) {
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- return 1;
+ goto out;
}
if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- return 1;
+ goto out;
}
if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) {
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- return 1;
+ goto out;
}
if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) {
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- return 1;
+ goto out;
}
if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) {
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- return 1;
+ goto out;
}
if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
- vmx->nested.nested_vmx_true_procbased_ctls_low,
+ vmx->nested.nested_vmx_procbased_ctls_low,
vmx->nested.nested_vmx_procbased_ctls_high) ||
!vmx_control_verify(vmcs12->secondary_vm_exec_control,
vmx->nested.nested_vmx_secondary_ctls_low,
@@ -10189,33 +10452,34 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
vmx->nested.nested_vmx_pinbased_ctls_low,
vmx->nested.nested_vmx_pinbased_ctls_high) ||
!vmx_control_verify(vmcs12->vm_exit_controls,
- vmx->nested.nested_vmx_true_exit_ctls_low,
+ vmx->nested.nested_vmx_exit_ctls_low,
vmx->nested.nested_vmx_exit_ctls_high) ||
!vmx_control_verify(vmcs12->vm_entry_controls,
- vmx->nested.nested_vmx_true_entry_ctls_low,
+ vmx->nested.nested_vmx_entry_ctls_low,
vmx->nested.nested_vmx_entry_ctls_high))
{
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- return 1;
+ goto out;
}
- if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
- ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
+ if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) ||
+ !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
+ !nested_cr3_valid(vcpu, vmcs12->host_cr3)) {
nested_vmx_failValid(vcpu,
VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
- return 1;
+ goto out;
}
- if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) ||
- ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
+ if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
+ !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) {
nested_vmx_entry_failure(vcpu, vmcs12,
EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
- return 1;
+ goto out;
}
if (vmcs12->vmcs_link_pointer != -1ull) {
nested_vmx_entry_failure(vcpu, vmcs12,
EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR);
- return 1;
+ goto out;
}
/*
@@ -10235,7 +10499,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) {
nested_vmx_entry_failure(vcpu, vmcs12,
EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
- return 1;
+ goto out;
}
}
@@ -10253,7 +10517,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) {
nested_vmx_entry_failure(vcpu, vmcs12,
EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
- return 1;
+ goto out;
}
}
@@ -10266,6 +10530,12 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
if (!vmcs02)
return -ENOMEM;
+ /*
+ * After this point, the trap flag no longer triggers a singlestep trap
+ * on the vm entry instructions. Don't call
+ * kvm_skip_emulated_instruction.
+ */
+ skip_emulated_instruction(vcpu);
enter_guest_mode(vcpu);
if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
@@ -10280,7 +10550,13 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
vmx_segment_cache_clear(vmx);
- prepare_vmcs02(vcpu, vmcs12);
+ if (prepare_vmcs02(vcpu, vmcs12, &exit_qualification)) {
+ leave_guest_mode(vcpu);
+ vmx_load_vmcs01(vcpu);
+ nested_vmx_entry_failure(vcpu, vmcs12,
+ EXIT_REASON_INVALID_STATE, exit_qualification);
+ return 1;
+ }
msr_entry_idx = nested_vmx_load_msr(vcpu,
vmcs12->vm_entry_msr_load_addr,
@@ -10307,6 +10583,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
* the success flag) when L2 exits (see nested_vmx_vmexit()).
*/
return 1;
+
+out:
+ return kvm_skip_emulated_instruction(vcpu);
}
/*
@@ -10612,6 +10891,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
struct kvm_segment seg;
+ unsigned long entry_failure_code;
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
vcpu->arch.efer = vmcs12->host_ia32_efer;
@@ -10649,8 +10929,12 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
nested_ept_uninit_mmu_context(vcpu);
- kvm_set_cr3(vcpu, vmcs12->host_cr3);
- kvm_mmu_reset_context(vcpu);
+ /*
+ * Only PDPTE load can fail as the value of cr3 was checked on entry and
+ * couldn't have changed.
+ */
+ if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
+ nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
if (!enable_ept)
vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
@@ -10751,6 +11035,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ u32 vm_inst_error = 0;
/* trying to cancel vmlaunch/vmresume is a bug */
WARN_ON_ONCE(vmx->nested.nested_run_pending);
@@ -10763,6 +11048,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vmcs12->vm_exit_msr_store_count))
nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
+ if (unlikely(vmx->fail))
+ vm_inst_error = vmcs_read32(VM_INSTRUCTION_ERROR);
+
vmx_load_vmcs01(vcpu);
if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
@@ -10791,6 +11079,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
load_vmcs12_host_state(vcpu, vmcs12);
/* Update any VMCS fields that might have changed while L2 ran */
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
if (vmx->hv_deadline_tsc == -1)
vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
@@ -10839,7 +11129,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
*/
if (unlikely(vmx->fail)) {
vmx->fail = 0;
- nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR));
+ nested_vmx_failValid(vcpu, vm_inst_error);
} else
nested_vmx_succeed(vcpu);
if (enable_shadow_vmcs)