Diffstat (limited to 'arch/x86/kvm/vmx')
-rw-r--r--  arch/x86/kvm/vmx/capabilities.h                                     |  24
-rw-r--r--  arch/x86/kvm/vmx/hyperv.c (renamed from arch/x86/kvm/vmx/evmcs.c)   |  45
-rw-r--r--  arch/x86/kvm/vmx/hyperv.h (renamed from arch/x86/kvm/vmx/evmcs.h)   |  12
-rw-r--r--  arch/x86/kvm/vmx/nested.c                                           | 128
-rw-r--r--  arch/x86/kvm/vmx/nested.h                                           |   7
-rw-r--r--  arch/x86/kvm/vmx/pmu_intel.c                                        |  11
-rw-r--r--  arch/x86/kvm/vmx/sgx.c                                              |   4
-rw-r--r--  arch/x86/kvm/vmx/vmcs12.h                                           |   5
-rw-r--r--  arch/x86/kvm/vmx/vmenter.S                                          |   2
-rw-r--r--  arch/x86/kvm/vmx/vmx.c                                              | 132
-rw-r--r--  arch/x86/kvm/vmx/vmx_ops.h                                          |  20
11 files changed, 270 insertions, 120 deletions
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 07254314f3dd..cd2ac9536c99 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -395,30 +395,6 @@ static inline bool vmx_pebs_supported(void)
return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept;
}
-static inline u64 vmx_get_perf_capabilities(void)
-{
- u64 perf_cap = PMU_CAP_FW_WRITES;
- struct x86_pmu_lbr lbr;
- u64 host_perf_cap = 0;
-
- if (!enable_pmu)
- return 0;
-
- if (boot_cpu_has(X86_FEATURE_PDCM))
- rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
-
- if (x86_perf_get_lbr(&lbr) >= 0 && lbr.nr)
- perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
-
- if (vmx_pebs_supported()) {
- perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
- if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4)
- perf_cap &= ~PERF_CAP_PEBS_BASELINE;
- }
-
- return perf_cap;
-}
-
static inline bool cpu_has_notify_vmexit(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/hyperv.c
index d8b23c96d627..ae03d1fe0355 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/hyperv.c
@@ -3,9 +3,9 @@
#include <linux/errno.h>
#include <linux/smp.h>
-#include "../hyperv.h"
#include "../cpuid.h"
-#include "evmcs.h"
+#include "hyperv.h"
+#include "nested.h"
#include "vmcs.h"
#include "vmx.h"
#include "trace.h"
@@ -322,24 +322,17 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = {
};
const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1);
-bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa)
+u64 nested_get_evmptr(struct kvm_vcpu *vcpu)
{
- struct hv_vp_assist_page assist_page;
-
- *evmcs_gpa = -1ull;
-
- if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page)))
- return false;
-
- if (unlikely(!assist_page.enlighten_vmentry))
- return false;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
- if (unlikely(!evmptr_is_valid(assist_page.current_nested_vmcs)))
- return false;
+ if (unlikely(kvm_hv_get_assist_page(vcpu)))
+ return EVMPTR_INVALID;
- *evmcs_gpa = assist_page.current_nested_vmcs;
+ if (unlikely(!hv_vcpu->vp_assist_page.enlighten_vmentry))
+ return EVMPTR_INVALID;
- return true;
+ return hv_vcpu->vp_assist_page.current_nested_vmcs;
}
uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu)
@@ -507,3 +500,23 @@ int nested_enable_evmcs(struct kvm_vcpu *vcpu,
return 0;
}
+
+bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+
+ if (!hv_vcpu || !evmcs)
+ return false;
+
+ if (!evmcs->hv_enlightenments_control.nested_flush_hypercall)
+ return false;
+
+ return hv_vcpu->vp_assist_page.nested_control.features.directhypercall;
+}
+
+void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ nested_vmx_vmexit(vcpu, HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH, 0, 0);
+}
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/hyperv.h
index 6f746ef3c038..571e7929d14e 100644
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/hyperv.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __KVM_X86_VMX_EVMCS_H
-#define __KVM_X86_VMX_EVMCS_H
+#ifndef __KVM_X86_VMX_HYPERV_H
+#define __KVM_X86_VMX_HYPERV_H
#include <linux/jump_label.h>
@@ -8,6 +8,8 @@
#include <asm/mshyperv.h>
#include <asm/vmx.h>
+#include "../hyperv.h"
+
#include "capabilities.h"
#include "vmcs.h"
#include "vmcs12.h"
@@ -235,11 +237,13 @@ enum nested_evmptrld_status {
EVMPTRLD_ERROR,
};
-bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa);
+u64 nested_get_evmptr(struct kvm_vcpu *vcpu);
uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu);
int nested_enable_evmcs(struct kvm_vcpu *vcpu,
uint16_t *vmcs_version);
void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
int nested_evmcs_check_controls(struct vmcs12 *vmcs12);
+bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu);
+void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu);
-#endif /* __KVM_X86_VMX_EVMCS_H */
+#endif /* __KVM_X86_VMX_HYPERV_H */
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 5b0d4859e4b7..d93c715cda6a 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -7,7 +7,6 @@
#include <asm/mmu_context.h>
#include "cpuid.h"
-#include "evmcs.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
@@ -16,6 +15,7 @@
#include "trace.h"
#include "vmx.h"
#include "x86.h"
+#include "smm.h"
static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
@@ -225,6 +225,7 @@ static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
@@ -233,6 +234,12 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
}
vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
+
+ if (hv_vcpu) {
+ hv_vcpu->nested.pa_page_gpa = INVALID_GPA;
+ hv_vcpu->nested.vm_id = 0;
+ hv_vcpu->nested.vp_id = 0;
+ }
}
static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
@@ -1126,6 +1133,15 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
struct vcpu_vmx *vmx = to_vmx(vcpu);
/*
+ * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or
+ * L2's VP_ID upon request from the guest. Make sure we check for
+ * pending entries in the right FIFO upon L1/L2 transition as these
+ * requests are put by other vCPUs asynchronously.
+ */
+ if (to_hv_vcpu(vcpu) && enable_ept)
+ kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu);
+
+ /*
* If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
* for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a
* full TLB flush from the guest's perspective. This is required even
@@ -1557,12 +1573,20 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields
{
struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu);
/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
vmcs12->tpr_threshold = evmcs->tpr_threshold;
vmcs12->guest_rip = evmcs->guest_rip;
if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) {
+ hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page;
+ hv_vcpu->nested.vm_id = evmcs->hv_vm_id;
+ hv_vcpu->nested.vp_id = evmcs->hv_vp_id;
+ }
+
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
vmcs12->guest_rsp = evmcs->guest_rsp;
vmcs12->guest_rflags = evmcs->guest_rflags;
@@ -1977,7 +2001,8 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
if (likely(!guest_cpuid_has_evmcs(vcpu)))
return EVMPTRLD_DISABLED;
- if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) {
+ evmcs_gpa = nested_get_evmptr(vcpu);
+ if (!evmptr_is_valid(evmcs_gpa)) {
nested_release_evmcs(vcpu);
return EVMPTRLD_DISABLED;
}
@@ -2563,12 +2588,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
nested_ept_init_mmu_context(vcpu);
/*
- * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
- * bits which we consider mandatory enabled.
- * The CR0_READ_SHADOW is what L2 should have expected to read given
- * the specifications by L1; It's not enough to take
- * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we
- * have more bits than L1 expected.
+ * Override the CR0/CR4 read shadows after setting the effective guest
+ * CR0/CR4. The common helpers also set the shadows, but they don't
+ * account for vmcs12's cr0/4_guest_host_mask.
*/
vmx_set_cr0(vcpu, vmcs12->guest_cr0);
vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
@@ -3251,6 +3273,12 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
{
+ /*
+ * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy
+ * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory
+ * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post
+ * migration.
+ */
if (!nested_get_evmcs_page(vcpu)) {
pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
__func__);
@@ -4767,6 +4795,17 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+ /*
+ * If IBRS is advertised to the vCPU, KVM must flush the indirect
+ * branch predictors when transitioning from L2 to L1, as L1 expects
+ * hardware (KVM in this case) to provide separate predictor modes.
+ * Bare metal isolates VMX root (host) from VMX non-root (guest), but
+ * doesn't isolate different VMCSs, i.e. in this case, doesn't provide
+ * separate modes for L2 vs L1.
+ */
+ if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ indirect_branch_prediction_barrier();
+
/* Update any VMCS fields that might have changed while L2 ran */
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
@@ -5100,24 +5139,35 @@ static int handle_vmxon(struct kvm_vcpu *vcpu)
| FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
/*
- * Note, KVM cannot rely on hardware to perform the CR0/CR4 #UD checks
- * that have higher priority than VM-Exit (see Intel SDM's pseudocode
- * for VMXON), as KVM must load valid CR0/CR4 values into hardware while
- * running the guest, i.e. KVM needs to check the _guest_ values.
+ * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter
+ * the guest and so cannot rely on hardware to perform the check,
+ * which has higher priority than VM-Exit (see Intel SDM's pseudocode
+ * for VMXON).
*
- * Rely on hardware for the other two pre-VM-Exit checks, !VM86 and
- * !COMPATIBILITY modes. KVM may run the guest in VM86 to emulate Real
- * Mode, but KVM will never take the guest out of those modes.
+ * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86
+ * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't
+ * force any of the relevant guest state. For a restricted guest, KVM
+ * does force CR0.PE=1, but only to also force VM86 in order to emulate
+ * Real Mode, and so there's no need to check CR0.PE manually.
*/
- if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) ||
- !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) {
+ if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
/*
- * CPL=0 and all other checks that are lower priority than VM-Exit must
- * be checked manually.
+ * The CPL is checked for "not in VMX operation" and for "in VMX root",
+ * and has higher priority than the VM-Fail due to being post-VMXON,
+ * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root,
+ * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits
+ * from L2 to L1, i.e. there's no need to check for the vCPU being in
+ * VMX non-root.
+ *
+ * Forwarding the VM-Exit unconditionally, i.e. without performing the
+ * #UD checks (see above), is functionally ok because KVM doesn't allow
+ * L1 to run L2 without CR4.VMXE=1, and because KVM never modifies L2's
+ * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are
+ * missed by hardware due to shadowing CR0 and/or CR4.
*/
if (vmx_get_cpl(vcpu)) {
kvm_inject_gp(vcpu, 0);
@@ -5127,6 +5177,17 @@ static int handle_vmxon(struct kvm_vcpu *vcpu)
if (vmx->nested.vmxon)
return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
+ /*
+ * Invalid CR0/CR4 generates #GP. These checks are performed if and
+ * only if the vCPU isn't already in VMX operation, i.e. effectively
+ * have lower priority than the VM-Fail above.
+ */
+ if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) ||
+ !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
!= VMXON_NEEDED_FEATURES) {
kvm_inject_gp(vcpu, 0);
@@ -5206,7 +5267,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 zero = 0;
gpa_t vmptr;
- u64 evmcs_gpa;
int r;
if (!nested_vmx_check_permission(vcpu))
@@ -5232,14 +5292,23 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
* vmx->nested.hv_evmcs but this shouldn't be a problem.
*/
if (likely(!guest_cpuid_has_evmcs(vcpu) ||
- !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
+ !evmptr_is_valid(nested_get_evmptr(vcpu)))) {
if (vmptr == vmx->nested.current_vmptr)
nested_release_vmcs12(vcpu);
- kvm_vcpu_write_guest(vcpu,
- vmptr + offsetof(struct vmcs12,
- launch_state),
- &zero, sizeof(zero));
+ /*
+ * Silently ignore memory errors on VMCLEAR, Intel's pseudocode
+ * for VMCLEAR includes a "ensure that data for VMCS referenced
+ * by the operand is in memory" clause that guards writes to
+ * memory, i.e. doing nothing for I/O is architecturally valid.
+ *
+ * FIXME: Suppress failures if and only if no memslot is found,
+ * i.e. exit to userspace if __copy_to_user() fails.
+ */
+ (void)kvm_vcpu_write_guest(vcpu,
+ vmptr + offsetof(struct vmcs12,
+ launch_state),
+ &zero, sizeof(zero));
} else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) {
nested_release_evmcs(vcpu);
}
@@ -6129,6 +6198,11 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
* Handle L2's bus locks in L0 directly.
*/
return true;
+ case EXIT_REASON_VMCALL:
+ /* Hyper-V L2 TLB flush hypercall is handled by L0 */
+ return guest_hv_cpuid_has_l2_tlb_flush(vcpu) &&
+ nested_evmcs_l2_tlb_flush_enabled(vcpu) &&
+ kvm_hv_is_tlb_flush_hcall(vcpu);
default:
break;
}
@@ -6808,7 +6882,8 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_RDSEED_EXITING |
SECONDARY_EXEC_XSAVES |
- SECONDARY_EXEC_TSC_SCALING;
+ SECONDARY_EXEC_TSC_SCALING |
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
/*
* We can emulate "VMCS shadowing," even if the hardware
@@ -6980,4 +7055,5 @@ struct kvm_x86_nested_ops vmx_nested_ops = {
.write_log_dirty = nested_vmx_write_pml_buffer,
.enable_evmcs = nested_enable_evmcs,
.get_evmcs_version = nested_get_evmcs_version,
+ .hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush,
};
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 6312c9541c3c..96952263b029 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -79,9 +79,10 @@ static inline bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
}
/*
- * Return the cr0 value that a nested guest would read. This is a combination
- * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
- * its hypervisor (cr0_read_shadow).
+ * Return the cr0/4 value that a nested guest would read. This is a combination
+ * of L1's "real" cr0 used to run the guest (guest_cr0), and the bits shadowed
+ * by the L1 hypervisor (cr0_read_shadow). KVM must emulate CPU behavior as
+ * the value+mask loaded into vmcs02 may not match the vmcs12 fields.
*/
static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
{
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 10b33da9bd05..e5cec07ca8d9 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -52,7 +52,7 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i);
__set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use);
- reprogram_counter(pmc);
+ kvm_pmu_request_counter_reprogam(pmc);
}
}
@@ -76,7 +76,7 @@ static void reprogram_counters(struct kvm_pmu *pmu, u64 diff)
for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) {
pmc = intel_pmc_idx_to_pmc(pmu, bit);
if (pmc)
- reprogram_counter(pmc);
+ kvm_pmu_request_counter_reprogam(pmc);
}
}
@@ -477,7 +477,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
reserved_bits ^= HSW_IN_TX_CHECKPOINTED;
if (!(data & reserved_bits)) {
pmc->eventsel = data;
- reprogram_counter(pmc);
+ kvm_pmu_request_counter_reprogam(pmc);
return 0;
}
} else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false))
@@ -631,7 +631,6 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
pmu->fixed_counters[i].current_config = 0;
}
- vcpu->arch.perf_capabilities = vmx_get_perf_capabilities();
lbr_desc->records.nr = 0;
lbr_desc->event = NULL;
lbr_desc->msr_passthrough = false;
@@ -647,14 +646,14 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
pmc = &pmu->gp_counters[i];
pmc_stop_counter(pmc);
- pmc->counter = pmc->eventsel = 0;
+ pmc->counter = pmc->prev_counter = pmc->eventsel = 0;
}
for (i = 0; i < KVM_PMC_MAX_FIXED; i++) {
pmc = &pmu->fixed_counters[i];
pmc_stop_counter(pmc);
- pmc->counter = 0;
+ pmc->counter = pmc->prev_counter = 0;
}
pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index 8f95c7c01433..b12da2a6dec9 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -182,8 +182,10 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu,
/* Enforce CPUID restriction on max enclave size. */
max_size_log2 = (attributes & SGX_ATTR_MODE64BIT) ? sgx_12_0->edx >> 8 :
sgx_12_0->edx;
- if (size >= BIT_ULL(max_size_log2))
+ if (size >= BIT_ULL(max_size_log2)) {
kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
/*
* sgx_virt_ecreate() returns:
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 746129ddd5ae..01936013428b 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -208,9 +208,8 @@ struct __packed vmcs12 {
/*
* For save/restore compatibility, the vmcs12 field offsets must not change.
*/
-#define CHECK_OFFSET(field, loc) \
- BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc), \
- "Offset of " #field " in struct vmcs12 has changed.")
+#define CHECK_OFFSET(field, loc) \
+ ASSERT_STRUCT_OFFSET(struct vmcs12, field, loc)
static inline void vmx_check_vmcs12_offsets(void)
{
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 0b5db4de4d09..766c6b3ef5ed 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -269,6 +269,7 @@ SYM_FUNC_END(__vmx_vcpu_run)
.section .text, "ax"
+#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
/**
* vmread_error_trampoline - Trampoline from inline asm to vmread_error()
* @field: VMCS field encoding that failed
@@ -317,6 +318,7 @@ SYM_FUNC_START(vmread_error_trampoline)
RET
SYM_FUNC_END(vmread_error_trampoline)
+#endif
SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
/*
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 63247c57c72c..7eec0226d56a 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -51,7 +51,6 @@
#include "capabilities.h"
#include "cpuid.h"
-#include "evmcs.h"
#include "hyperv.h"
#include "kvm_onhyperv.h"
#include "irq.h"
@@ -66,6 +65,7 @@
#include "vmcs12.h"
#include "vmx.h"
#include "x86.h"
+#include "smm.h"
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
@@ -526,7 +526,7 @@ static unsigned long host_idt_base;
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);
-static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
+static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
{
struct hv_enlightened_vmcs *evmcs;
struct hv_partition_assist_pg **p_hv_pa_pg =
@@ -858,7 +858,7 @@ unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
* to change it directly without causing a vmexit. In that case read
* it after vmexit and store it in vmx->spec_ctrl.
*/
- if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)))
+ if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
flags |= VMX_RUN_SAVE_SPEC_CTRL;
return flags;
@@ -1348,8 +1348,10 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
/*
* No indirect branch prediction barrier needed when switching
- * the active VMCS within a guest, e.g. on nested VM-Enter.
- * The L1 VMM can protect itself with retpolines, IBPB or IBRS.
+ * the active VMCS within a vCPU, unless IBRS is advertised to
+ * the vCPU. To minimize the number of IBPBs executed, KVM
+ * performs IBPB on nested VM-Exit (a single nested transition
+ * may switch the active VMCS multiple times).
*/
if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
indirect_branch_prediction_barrier();
@@ -1834,12 +1836,42 @@ bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
}
-static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
- uint64_t val)
+/*
+ * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of
+ * guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain
+ * backwards compatibility even though KVM doesn't support emulating SMX. And
+ * because userspace set "VMX in SMX", the guest must also be allowed to set it,
+ * e.g. if the MSR is left unlocked and the guest does a RMW operation.
+ */
+#define KVM_SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \
+ FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \
+ FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \
+ FEAT_CTL_SGX_LC_ENABLED | \
+ FEAT_CTL_SGX_ENABLED | \
+ FEAT_CTL_LMCE_ENABLED)
+
+static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
+ struct msr_data *msr)
{
- uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
+ uint64_t valid_bits;
- return !(val & ~valid_bits);
+ /*
+ * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are
+ * exposed to the guest.
+ */
+ WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits &
+ ~KVM_SUPPORTED_FEATURE_CONTROL);
+
+ if (!msr->host_initiated &&
+ (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED))
+ return false;
+
+ if (msr->host_initiated)
+ valid_bits = KVM_SUPPORTED_FEATURE_CONTROL;
+ else
+ valid_bits = vmx->msr_ia32_feature_control_valid_bits;
+
+ return !(msr->data & ~valid_bits);
}
static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
@@ -1849,9 +1881,6 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
if (!nested)
return 1;
return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
- case MSR_IA32_PERF_CAPABILITIES:
- msr->data = vmx_get_perf_capabilities();
- return 0;
default:
return KVM_MSR_RET_INVALID;
}
@@ -2029,7 +2058,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
(host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
- if ((vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT) &&
+ if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) &&
(host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
@@ -2241,10 +2270,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.mcg_ext_ctl = data;
break;
case MSR_IA32_FEAT_CTL:
- if (!vmx_feature_control_msr_valid(vcpu, data) ||
- (to_vmx(vcpu)->msr_ia32_feature_control &
- FEAT_CTL_LOCKED && !msr_info->host_initiated))
+ if (!is_vmx_feature_control_msr_valid(vmx, msr_info))
return 1;
+
vmx->msr_ia32_feature_control = data;
if (msr_info->host_initiated && data == 0)
vmx_leave_nested(vcpu);
@@ -2342,14 +2370,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
if (data & PMU_CAP_LBR_FMT) {
if ((data & PMU_CAP_LBR_FMT) !=
- (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT))
+ (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT))
return 1;
if (!cpuid_model_is_consistent(vcpu))
return 1;
}
if (data & PERF_CAP_PEBS_FORMAT) {
if ((data & PERF_CAP_PEBS_MASK) !=
- (vmx_get_perf_capabilities() & PERF_CAP_PEBS_MASK))
+ (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK))
return 1;
if (!guest_cpuid_has(vcpu, X86_FEATURE_DS))
return 1;
@@ -3412,18 +3440,15 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
{
u32 ar;
- if (var->unusable || !var->present)
- ar = 1 << 16;
- else {
- ar = var->type & 15;
- ar |= (var->s & 1) << 4;
- ar |= (var->dpl & 3) << 5;
- ar |= (var->present & 1) << 7;
- ar |= (var->avl & 1) << 12;
- ar |= (var->l & 1) << 13;
- ar |= (var->db & 1) << 14;
- ar |= (var->g & 1) << 15;
- }
+ ar = var->type & 15;
+ ar |= (var->s & 1) << 4;
+ ar |= (var->dpl & 3) << 5;
+ ar |= (var->present & 1) << 7;
+ ar |= (var->avl & 1) << 12;
+ ar |= (var->l & 1) << 13;
+ ar |= (var->db & 1) << 14;
+ ar |= (var->g & 1) << 15;
+ ar |= (var->unusable || !var->present) << 16;
return ar;
}
@@ -4431,6 +4456,13 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
* controls for features that are/aren't exposed to the guest.
*/
if (nested) {
+ /*
+ * All features that can be added or removed to VMX MSRs must
+ * be supported in the first place for nested virtualization.
+ */
+ if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control)))
+ enabled = false;
+
if (enabled)
vmx->nested.msrs.secondary_ctls_high |= control;
else
@@ -6844,6 +6876,8 @@ static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
{
switch (index) {
case MSR_IA32_SMBASE:
+ if (!IS_ENABLED(CONFIG_KVM_SMM))
+ return false;
/*
* We cannot do SMM unless we can run the guest in big
* real mode.
@@ -7669,6 +7703,31 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vmx_update_exception_bitmap(vcpu);
}
+static u64 vmx_get_perf_capabilities(void)
+{
+ u64 perf_cap = PMU_CAP_FW_WRITES;
+ struct x86_pmu_lbr lbr;
+ u64 host_perf_cap = 0;
+
+ if (!enable_pmu)
+ return 0;
+
+ if (boot_cpu_has(X86_FEATURE_PDCM))
+ rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
+
+ x86_perf_get_lbr(&lbr);
+ if (lbr.nr)
+ perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
+
+ if (vmx_pebs_supported()) {
+ perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
+ if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4)
+ perf_cap &= ~PERF_CAP_PEBS_BASELINE;
+ }
+
+ return perf_cap;
+}
+
static __init void vmx_set_cpu_caps(void)
{
kvm_set_cpu_caps();
@@ -7691,6 +7750,7 @@ static __init void vmx_set_cpu_caps(void)
if (!enable_pmu)
kvm_cpu_cap_clear(X86_FEATURE_PDCM);
+ kvm_caps.supported_perf_cap = vmx_get_perf_capabilities();
if (!enable_sgx) {
kvm_cpu_cap_clear(X86_FEATURE_SGX);
@@ -7906,6 +7966,7 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
~FEAT_CTL_LMCE_ENABLED;
}
+#ifdef CONFIG_KVM_SMM
static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
/* we need a nested vmexit to enter SMM, postpone if run is pending */
@@ -7914,7 +7975,7 @@ static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return !is_smm(vcpu);
}
-static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7935,7 +7996,7 @@ static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
return 0;
}
-static int vmx_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
+static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int ret;
@@ -7960,6 +8021,7 @@ static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
{
/* RSM will cause a vmexit anyway. */
}
+#endif
static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
{
@@ -8127,10 +8189,12 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.setup_mce = vmx_setup_mce,
+#ifdef CONFIG_KVM_SMM
.smi_allowed = vmx_smi_allowed,
.enter_smm = vmx_enter_smm,
.leave_smm = vmx_leave_smm,
.enable_smi_window = vmx_enable_smi_window,
+#endif
.can_emulate_instruction = vmx_can_emulate_instruction,
.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
@@ -8490,8 +8554,8 @@ static int __init vmx_init(void)
}
if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
- vmx_x86_ops.enable_direct_tlbflush
- = hv_enable_direct_tlbflush;
+ vmx_x86_ops.enable_l2_tlb_flush
+ = hv_enable_l2_tlb_flush;
} else {
enlightened_vmcs = false;
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index ec268df83ed6..842dc898c972 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -6,19 +6,33 @@
#include <asm/vmx.h>
-#include "evmcs.h"
+#include "hyperv.h"
#include "vmcs.h"
#include "../x86.h"
void vmread_error(unsigned long field, bool fault);
-__attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field,
- bool fault);
void vmwrite_error(unsigned long field, unsigned long value);
void vmclear_error(struct vmcs *vmcs, u64 phys_addr);
void vmptrld_error(struct vmcs *vmcs, u64 phys_addr);
void invvpid_error(unsigned long ext, u16 vpid, gva_t gva);
void invept_error(unsigned long ext, u64 eptp, gpa_t gpa);
+#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+/*
+ * The VMREAD error trampoline _always_ uses the stack to pass parameters, even
+ * for 64-bit targets. Preserving all registers allows the VMREAD inline asm
+ * blob to avoid clobbering GPRs, which in turn allows the compiler to better
+ * optimize sequences of VMREADs.
+ *
+ * Declare the trampoline as an opaque label as it's not safe to call from C
+ * code; there is no way to tell the compiler to pass params on the stack for
+ * 64-bit targets.
+ *
+ * void vmread_error_trampoline(unsigned long field, bool fault);
+ */
+extern unsigned long vmread_error_trampoline;
+#endif
+
static __always_inline void vmcs_check16(unsigned long field)
{
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,