Diffstat (limited to 'arch/x86/kvm/vmx')
-rw-r--r-- | arch/x86/kvm/vmx/capabilities.h                                   |  24
-rw-r--r-- | arch/x86/kvm/vmx/hyperv.c (renamed from arch/x86/kvm/vmx/evmcs.c) |  45
-rw-r--r-- | arch/x86/kvm/vmx/hyperv.h (renamed from arch/x86/kvm/vmx/evmcs.h) |  12
-rw-r--r-- | arch/x86/kvm/vmx/nested.c                                         | 128
-rw-r--r-- | arch/x86/kvm/vmx/nested.h                                         |   7
-rw-r--r-- | arch/x86/kvm/vmx/pmu_intel.c                                      |  11
-rw-r--r-- | arch/x86/kvm/vmx/sgx.c                                            |   4
-rw-r--r-- | arch/x86/kvm/vmx/vmcs12.h                                         |   5
-rw-r--r-- | arch/x86/kvm/vmx/vmenter.S                                        |   2
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c                                            | 132
-rw-r--r-- | arch/x86/kvm/vmx/vmx_ops.h                                        |  20
11 files changed, 270 insertions, 120 deletions
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 07254314f3dd..cd2ac9536c99 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -395,30 +395,6 @@ static inline bool vmx_pebs_supported(void)
 	return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept;
 }
 
-static inline u64 vmx_get_perf_capabilities(void)
-{
-	u64 perf_cap = PMU_CAP_FW_WRITES;
-	struct x86_pmu_lbr lbr;
-	u64 host_perf_cap = 0;
-
-	if (!enable_pmu)
-		return 0;
-
-	if (boot_cpu_has(X86_FEATURE_PDCM))
-		rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
-
-	if (x86_perf_get_lbr(&lbr) >= 0 && lbr.nr)
-		perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
-
-	if (vmx_pebs_supported()) {
-		perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
-		if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4)
-			perf_cap &= ~PERF_CAP_PEBS_BASELINE;
-	}
-
-	return perf_cap;
-}
-
 static inline bool cpu_has_notify_vmexit(void)
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/hyperv.c
index d8b23c96d627..ae03d1fe0355 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/hyperv.c
@@ -3,9 +3,9 @@
 #include <linux/errno.h>
 #include <linux/smp.h>
 
-#include "../hyperv.h"
 #include "../cpuid.h"
-#include "evmcs.h"
+#include "hyperv.h"
+#include "nested.h"
 #include "vmcs.h"
 #include "vmx.h"
 #include "trace.h"
@@ -322,24 +322,17 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = {
 };
 const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1);
 
-bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa)
+u64 nested_get_evmptr(struct kvm_vcpu *vcpu)
 {
-	struct hv_vp_assist_page assist_page;
-
-	*evmcs_gpa = -1ull;
-
-	if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page)))
-		return false;
-
-	if (unlikely(!assist_page.enlighten_vmentry))
-		return false;
+	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
 
-	if (unlikely(!evmptr_is_valid(assist_page.current_nested_vmcs)))
-		return false;
+	if (unlikely(kvm_hv_get_assist_page(vcpu)))
+		return EVMPTR_INVALID;
 
-	*evmcs_gpa = assist_page.current_nested_vmcs;
+	if (unlikely(!hv_vcpu->vp_assist_page.enlighten_vmentry))
+		return EVMPTR_INVALID;
 
-	return true;
+	return hv_vcpu->vp_assist_page.current_nested_vmcs;
 }
 
 uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu)
@@ -507,3 +500,23 @@ int nested_enable_evmcs(struct kvm_vcpu *vcpu,
 
 	return 0;
 }
+
+bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu)
+{
+	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+
+	if (!hv_vcpu || !evmcs)
+		return false;
+
+	if (!evmcs->hv_enlightenments_control.nested_flush_hypercall)
+		return false;
+
+	return hv_vcpu->vp_assist_page.nested_control.features.directhypercall;
+}
+
+void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu)
+{
+	nested_vmx_vmexit(vcpu, HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH, 0, 0);
+}
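[Annotation] The eVMCS lookup helper above changes shape: instead of a bool return plus an out-param, nested_get_evmptr() returns the GPA directly, with EVMPTR_INVALID (-1ull) as the sentinel. A minimal sketch of the resulting caller pattern, mirroring the nested.c hunks later in this diff (evmptr_is_valid() is the pre-existing helper that compares against EVMPTR_INVALID):

	/* Sketch: one call plus one validity check replaces the old
	 * bool-and-out-param dance. */
	u64 evmcs_gpa = nested_get_evmptr(vcpu);

	if (!evmptr_is_valid(evmcs_gpa)) {
		nested_release_evmcs(vcpu);
		return EVMPTRLD_DISABLED;
	}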
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/hyperv.h
index 6f746ef3c038..571e7929d14e 100644
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/hyperv.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __KVM_X86_VMX_EVMCS_H
-#define __KVM_X86_VMX_EVMCS_H
+#ifndef __KVM_X86_VMX_HYPERV_H
+#define __KVM_X86_VMX_HYPERV_H
 
 #include <linux/jump_label.h>
 
@@ -8,6 +8,8 @@
 #include <asm/mshyperv.h>
 #include <asm/vmx.h>
 
+#include "../hyperv.h"
+
 #include "capabilities.h"
 #include "vmcs.h"
 #include "vmcs12.h"
@@ -235,11 +237,13 @@ enum nested_evmptrld_status {
 	EVMPTRLD_ERROR,
 };
 
-bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa);
+u64 nested_get_evmptr(struct kvm_vcpu *vcpu);
 uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu);
 int nested_enable_evmcs(struct kvm_vcpu *vcpu,
 			uint16_t *vmcs_version);
 void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
 int nested_evmcs_check_controls(struct vmcs12 *vmcs12);
+bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu);
+void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu);
 
-#endif /* __KVM_X86_VMX_EVMCS_H */
+#endif /* __KVM_X86_VMX_HYPERV_H */
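[Annotation] The two new declarations are the VMX half of Hyper-V's L2 TLB flush support. Pieced together from the hunks in this diff (all names below appear in the nested.c and hyperv.c changes), the intended flow is roughly:

	/*
	 * Sketch of the Hyper-V L2 TLB flush path:
	 *
	 *   L2 issues a Hyper-V TLB flush hypercall
	 *     -> EXIT_REASON_VMCALL is claimed by L0 when
	 *        guest_hv_cpuid_has_l2_tlb_flush(vcpu),
	 *        nested_evmcs_l2_tlb_flush_enabled(vcpu) and
	 *        kvm_hv_is_tlb_flush_hcall(vcpu) all hold
	 *     -> pending entries are drained via KVM_REQ_HV_TLB_FLUSH,
	 *        also raised on L1/L2 transitions (see the
	 *        nested_vmx_transition_tlb_flush() hunk below)
	 *     -> vmx_hv_inject_synthetic_vmexit_post_tlb_flush() then
	 *        injects HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH
	 *        into L1 as a synthetic nested VM-Exit
	 */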
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 5b0d4859e4b7..d93c715cda6a 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -7,7 +7,6 @@
 #include <asm/mmu_context.h>
 
 #include "cpuid.h"
-#include "evmcs.h"
 #include "hyperv.h"
 #include "mmu.h"
 #include "nested.h"
@@ -16,6 +15,7 @@
 #include "trace.h"
 #include "vmx.h"
 #include "x86.h"
+#include "smm.h"
 
 static bool __read_mostly enable_shadow_vmcs = 1;
 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
@@ -225,6 +225,7 @@ static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
 
 static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
 {
+	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
 	if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
@@ -233,6 +234,12 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
 	}
 
 	vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
+
+	if (hv_vcpu) {
+		hv_vcpu->nested.pa_page_gpa = INVALID_GPA;
+		hv_vcpu->nested.vm_id = 0;
+		hv_vcpu->nested.vp_id = 0;
+	}
 }
 
 static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
@@ -1126,6 +1133,15 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
 	/*
+	 * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or
+	 * L2's VP_ID upon request from the guest. Make sure we check for
+	 * pending entries in the right FIFO upon L1/L2 transition as these
+	 * requests are put by other vCPUs asynchronously.
+	 */
+	if (to_hv_vcpu(vcpu) && enable_ept)
+		kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu);
+
+	/*
 	 * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
 	 * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a
	 * full TLB flush from the guest's perspective.  This is required even
@@ -1557,12 +1573,20 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields
 {
 	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
 	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu);
 
 	/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
 	vmcs12->tpr_threshold = evmcs->tpr_threshold;
 	vmcs12->guest_rip = evmcs->guest_rip;
 
 	if (unlikely(!(hv_clean_fields &
+		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) {
+		hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page;
+		hv_vcpu->nested.vm_id = evmcs->hv_vm_id;
+		hv_vcpu->nested.vp_id = evmcs->hv_vp_id;
+	}
+
+	if (unlikely(!(hv_clean_fields &
 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
 		vmcs12->guest_rsp = evmcs->guest_rsp;
 		vmcs12->guest_rflags = evmcs->guest_rflags;
@@ -1977,7 +2001,8 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
 	if (likely(!guest_cpuid_has_evmcs(vcpu)))
 		return EVMPTRLD_DISABLED;
 
-	if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) {
+	evmcs_gpa = nested_get_evmptr(vcpu);
+	if (!evmptr_is_valid(evmcs_gpa)) {
 		nested_release_evmcs(vcpu);
 		return EVMPTRLD_DISABLED;
 	}
@@ -2563,12 +2588,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 		nested_ept_init_mmu_context(vcpu);
 
 	/*
-	 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
-	 * bits which we consider mandatory enabled.
-	 * The CR0_READ_SHADOW is what L2 should have expected to read given
-	 * the specifications by L1; It's not enough to take
-	 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we
-	 * have more bits than L1 expected.
+	 * Override the CR0/CR4 read shadows after setting the effective guest
+	 * CR0/CR4.  The common helpers also set the shadows, but they don't
+	 * account for vmcs12's cr0/4_guest_host_mask.
 	 */
 	vmx_set_cr0(vcpu, vmcs12->guest_cr0);
 	vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
@@ -3251,6 +3273,12 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 
 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
 {
+	/*
+	 * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy
+	 * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory
+	 * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post
+	 * migration.
+	 */
 	if (!nested_get_evmcs_page(vcpu)) {
 		pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
 				     __func__);
@@ -4767,6 +4795,17 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
 
 	vmx_switch_vmcs(vcpu, &vmx->vmcs01);
 
+	/*
+	 * If IBRS is advertised to the vCPU, KVM must flush the indirect
+	 * branch predictors when transitioning from L2 to L1, as L1 expects
+	 * hardware (KVM in this case) to provide separate predictor modes.
+	 * Bare metal isolates VMX root (host) from VMX non-root (guest), but
+	 * doesn't isolate different VMCSs, i.e. in this case, doesn't provide
+	 * separate modes for L2 vs L1.
+	 */
+	if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+		indirect_branch_prediction_barrier();
+
 	/* Update any VMCS fields that might have changed while L2 ran */
 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
 	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
@@ -5100,24 +5139,35 @@ static int handle_vmxon(struct kvm_vcpu *vcpu)
 		| FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
 
 	/*
-	 * Note, KVM cannot rely on hardware to perform the CR0/CR4 #UD checks
-	 * that have higher priority than VM-Exit (see Intel SDM's pseudocode
-	 * for VMXON), as KVM must load valid CR0/CR4 values into hardware while
-	 * running the guest, i.e. KVM needs to check the _guest_ values.
+	 * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter
+	 * the guest and so cannot rely on hardware to perform the check,
+	 * which has higher priority than VM-Exit (see Intel SDM's pseudocode
+	 * for VMXON).
 	 *
-	 * Rely on hardware for the other two pre-VM-Exit checks, !VM86 and
-	 * !COMPATIBILITY modes.  KVM may run the guest in VM86 to emulate Real
-	 * Mode, but KVM will never take the guest out of those modes.
+	 * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86
+	 * and !COMPATIBILITY modes.  For an unrestricted guest, KVM doesn't
+	 * force any of the relevant guest state.  For a restricted guest, KVM
+	 * does force CR0.PE=1, but only to also force VM86 in order to emulate
+	 * Real Mode, and so there's no need to check CR0.PE manually.
 	 */
-	if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) ||
-	    !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) {
+	if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
 		kvm_queue_exception(vcpu, UD_VECTOR);
 		return 1;
 	}
 
 	/*
-	 * CPL=0 and all other checks that are lower priority than VM-Exit must
-	 * be checked manually.
+	 * The CPL is checked for "not in VMX operation" and for "in VMX root",
+	 * and has higher priority than the VM-Fail due to being post-VMXON,
+	 * i.e. VMXON #GPs outside of VMX non-root if CPL!=0.  In VMX non-root,
+	 * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits
+	 * from L2 to L1, i.e. there's no need to check for the vCPU being in
+	 * VMX non-root.
+	 *
+	 * Forwarding the VM-Exit unconditionally, i.e. without performing the
+	 * #UD checks (see above), is functionally ok because KVM doesn't allow
+	 * L1 to run L2 without CR4.VMXE=1, and because KVM never modifies L2's
+	 * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are
+	 * missed by hardware due to shadowing CR0 and/or CR4.
 	 */
 	if (vmx_get_cpl(vcpu)) {
 		kvm_inject_gp(vcpu, 0);
@@ -5127,6 +5177,17 @@ static int handle_vmxon(struct kvm_vcpu *vcpu)
 	if (vmx->nested.vmxon)
 		return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
 
+	/*
+	 * Invalid CR0/CR4 generates #GP.  These checks are performed if and
+	 * only if the vCPU isn't already in VMX operation, i.e. effectively
+	 * have lower priority than the VM-Fail above.
+	 */
+	if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) ||
+	    !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) {
+		kvm_inject_gp(vcpu, 0);
+		return 1;
+	}
+
 	if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) !=
 	     VMXON_NEEDED_FEATURES) {
 		kvm_inject_gp(vcpu, 0);
@@ -5206,7 +5267,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 zero = 0;
 	gpa_t vmptr;
-	u64 evmcs_gpa;
 	int r;
 
 	if (!nested_vmx_check_permission(vcpu))
@@ -5232,14 +5292,23 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
 	 * vmx->nested.hv_evmcs but this shouldn't be a problem.
 	 */
 	if (likely(!guest_cpuid_has_evmcs(vcpu) ||
-		   !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
+		   !evmptr_is_valid(nested_get_evmptr(vcpu)))) {
 		if (vmptr == vmx->nested.current_vmptr)
 			nested_release_vmcs12(vcpu);
 
-		kvm_vcpu_write_guest(vcpu,
-				     vmptr + offsetof(struct vmcs12,
-						      launch_state),
-				     &zero, sizeof(zero));
+		/*
+		 * Silently ignore memory errors on VMCLEAR, Intel's pseudocode
+		 * for VMCLEAR includes a "ensure that data for VMCS referenced
+		 * by the operand is in memory" clause that guards writes to
+		 * memory, i.e. doing nothing for I/O is architecturally valid.
+		 *
+		 * FIXME: Suppress failures if and only if no memslot is found,
+		 * i.e. exit to userspace if __copy_to_user() fails.
+		 */
+		(void)kvm_vcpu_write_guest(vcpu,
+					   vmptr + offsetof(struct vmcs12,
+							    launch_state),
+					   &zero, sizeof(zero));
 	} else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) {
 		nested_release_evmcs(vcpu);
 	}
@@ -6129,6 +6198,11 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
 		 * Handle L2's bus locks in L0 directly.
 		 */
 		return true;
+	case EXIT_REASON_VMCALL:
+		/* Hyper-V L2 TLB flush hypercall is handled by L0 */
+		return guest_hv_cpuid_has_l2_tlb_flush(vcpu) &&
+			nested_evmcs_l2_tlb_flush_enabled(vcpu) &&
+			kvm_hv_is_tlb_flush_hcall(vcpu);
 	default:
 		break;
 	}
@@ -6808,7 +6882,8 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
 		SECONDARY_EXEC_ENABLE_INVPCID |
 		SECONDARY_EXEC_RDSEED_EXITING |
 		SECONDARY_EXEC_XSAVES |
-		SECONDARY_EXEC_TSC_SCALING;
+		SECONDARY_EXEC_TSC_SCALING |
+		SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
 
 	/*
 	 * We can emulate "VMCS shadowing," even if the hardware
@@ -6980,4 +7055,5 @@ struct kvm_x86_nested_ops vmx_nested_ops = {
 	.write_log_dirty = nested_vmx_write_pml_buffer,
 	.enable_evmcs = nested_enable_evmcs,
 	.get_evmcs_version = nested_get_evmcs_version,
+	.hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush,
 };
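[Annotation] The reordered handle_vmxon() hunks above make the emulated exception/VM-Fail priority explicit. Summarized as a sketch (not literal kernel code; the condition names follow the hunks above):

	/*
	 * VMXON emulation order in handle_vmxon() after this change:
	 *
	 *   1. #UD     if CR4.VMXE is 0 from the guest's point of view
	 *   2. #GP(0)  if CPL != 0
	 *   3. VM-Fail if the vCPU is already post-VMXON
	 *   4. #GP(0)  if CR0/CR4 are invalid for VMX operation
	 *              (nested_host_cr0_valid()/nested_host_cr4_valid())
	 *   5. #GP(0)  if IA32_FEATURE_CONTROL lacks VMXON_NEEDED_FEATURES
	 */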
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 6312c9541c3c..96952263b029 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -79,9 +79,10 @@ static inline bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
 }
 
 /*
- * Return the cr0 value that a nested guest would read. This is a combination
- * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
- * its hypervisor (cr0_read_shadow).
+ * Return the cr0/4 value that a nested guest would read. This is a combination
+ * of L1's "real" cr0 used to run the guest (guest_cr0), and the bits shadowed
+ * by the L1 hypervisor (cr0_read_shadow).  KVM must emulate CPU behavior as
+ * the value+mask loaded into vmcs02 may not match the vmcs12 fields.
 */
static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
{
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 10b33da9bd05..e5cec07ca8d9 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -52,7 +52,7 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
 		pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i);
 
 		__set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use);
-		reprogram_counter(pmc);
+		kvm_pmu_request_counter_reprogam(pmc);
 	}
 }
 
@@ -76,7 +76,7 @@ static void reprogram_counters(struct kvm_pmu *pmu, u64 diff)
 	for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) {
 		pmc = intel_pmc_idx_to_pmc(pmu, bit);
 
 		if (pmc)
-			reprogram_counter(pmc);
+			kvm_pmu_request_counter_reprogam(pmc);
 	}
 }
@@ -477,7 +477,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			reserved_bits ^= HSW_IN_TX_CHECKPOINTED;
 		if (!(data & reserved_bits)) {
 			pmc->eventsel = data;
-			reprogram_counter(pmc);
+			kvm_pmu_request_counter_reprogam(pmc);
 			return 0;
 		}
 	} else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false))
@@ -631,7 +631,6 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
 		pmu->fixed_counters[i].current_config = 0;
 	}
 
-	vcpu->arch.perf_capabilities = vmx_get_perf_capabilities();
 	lbr_desc->records.nr = 0;
 	lbr_desc->event = NULL;
 	lbr_desc->msr_passthrough = false;
@@ -647,14 +646,14 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
 		pmc = &pmu->gp_counters[i];
 
 		pmc_stop_counter(pmc);
-		pmc->counter = pmc->eventsel = 0;
+		pmc->counter = pmc->prev_counter = pmc->eventsel = 0;
 	}
 
 	for (i = 0; i < KVM_PMC_MAX_FIXED; i++) {
 		pmc = &pmu->fixed_counters[i];
 
 		pmc_stop_counter(pmc);
-		pmc->counter = 0;
+		pmc->counter = pmc->prev_counter = 0;
 	}
 
 	pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;
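[Annotation] The pmu_intel.c hunks above replace direct reprogram_counter() calls with kvm_pmu_request_counter_reprogam() (identifier spelled as in the tree at this point). The helper itself is common x86 PMU code outside this diff; presumably it defers the reprogramming to the next KVM_REQ_PMU servicing instead of doing it synchronously, roughly along these lines (a hedged sketch of the assumed common-code helper, not code from this diff):

	/* Assumed shape: mark the counter as needing reprogramming and
	 * let kvm_pmu_handle_event() do the work before the next VM-Enter,
	 * coalescing back-to-back updates to the same counter. */
	static inline void kvm_pmu_request_counter_reprogam(struct kvm_pmc *pmc)
	{
		set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
		kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
	}

The new pmc->prev_counter clearing in intel_pmu_reset() fits that scheme: with deferred reprogramming, emulated overflow is presumably detected by comparing counter against prev_counter.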
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index 8f95c7c01433..b12da2a6dec9 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -182,8 +182,10 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu,
 	/* Enforce CPUID restriction on max enclave size. */
 	max_size_log2 = (attributes & SGX_ATTR_MODE64BIT) ? sgx_12_0->edx >> 8 :
 							    sgx_12_0->edx;
-	if (size >= BIT_ULL(max_size_log2))
+	if (size >= BIT_ULL(max_size_log2)) {
 		kvm_inject_gp(vcpu, 0);
+		return 1;
+	}
 
 	/*
 	 * sgx_virt_ecreate() returns:
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 746129ddd5ae..01936013428b 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -208,9 +208,8 @@ struct __packed vmcs12 {
 /*
  * For save/restore compatibility, the vmcs12 field offsets must not change.
 */
-#define CHECK_OFFSET(field, loc) \
-	BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc), \
-		"Offset of " #field " in struct vmcs12 has changed.")
+#define CHECK_OFFSET(field, loc) \
+	ASSERT_STRUCT_OFFSET(struct vmcs12, field, loc)
 
 static inline void vmx_check_vmcs12_offsets(void)
 {
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 0b5db4de4d09..766c6b3ef5ed 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -269,6 +269,7 @@ SYM_FUNC_END(__vmx_vcpu_run)
 
 .section .text, "ax"
 
+#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
 /**
  * vmread_error_trampoline - Trampoline from inline asm to vmread_error()
  * @field:	VMCS field encoding that failed
@@ -317,6 +318,7 @@ SYM_FUNC_START(vmread_error_trampoline)
 
 	RET
 SYM_FUNC_END(vmread_error_trampoline)
+#endif
 
 SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
 	/*
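[Annotation] The vmenter.S hunk above and the vmx_ops.h hunk at the end of this diff both gate the VMREAD error trampoline on the compiler lacking asm-goto-with-output support. With CONFIG_CC_HAS_ASM_GOTO_OUTPUT, inline asm may combine output operands with goto labels, so a failed VMREAD can branch straight to C error handling and no register-preserving trampoline is needed. A hedged sketch of that pattern (KVM's actual vmcs_read implementation differs in detail; field/value are assumed to be declared by the caller):

	/* Output operand plus goto labels in one asm statement: on a
	 * VM-Fail ("jna") or a #GP/#UD fixup, jump to C labels instead
	 * of calling through an assembly trampoline. */
	asm_volatile_goto("1: vmread %[field], %[output]\n\t"
			  "jna %l[do_fail]\n\t"
			  _ASM_EXTABLE(1b, %l[do_exception])
			  : [output] "=r" (value)
			  : [field] "r" (field)
			  : "cc"
			  : do_fail, do_exception);
	return value;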
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 63247c57c72c..7eec0226d56a 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -51,7 +51,6 @@
 
 #include "capabilities.h"
 #include "cpuid.h"
-#include "evmcs.h"
 #include "hyperv.h"
 #include "kvm_onhyperv.h"
 #include "irq.h"
@@ -66,6 +65,7 @@
 #include "vmcs12.h"
 #include "vmx.h"
 #include "x86.h"
+#include "smm.h"
 
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
@@ -526,7 +526,7 @@ static unsigned long host_idt_base;
 static bool __read_mostly enlightened_vmcs = true;
 module_param(enlightened_vmcs, bool, 0444);
 
-static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
+static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
 {
 	struct hv_enlightened_vmcs *evmcs;
 	struct hv_partition_assist_pg **p_hv_pa_pg =
@@ -858,7 +858,7 @@ unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
 	 * to change it directly without causing a vmexit.  In that case read
 	 * it after vmexit and store it in vmx->spec_ctrl.
 	 */
-	if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)))
+	if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
 		flags |= VMX_RUN_SAVE_SPEC_CTRL;
 
 	return flags;
@@ -1348,8 +1348,10 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
 
 		/*
 		 * No indirect branch prediction barrier needed when switching
-		 * the active VMCS within a guest, e.g. on nested VM-Enter.
-		 * The L1 VMM can protect itself with retpolines, IBPB or IBRS.
+		 * the active VMCS within a vCPU, unless IBRS is advertised to
+		 * the vCPU.  To minimize the number of IBPBs executed, KVM
+		 * performs IBPB on nested VM-Exit (a single nested transition
+		 * may switch the active VMCS multiple times).
 		 */
 		if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
 			indirect_branch_prediction_barrier();
@@ -1834,12 +1836,42 @@ bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
 	return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
 }
 
-static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
-						 uint64_t val)
+/*
+ * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of
+ * guest CPUID.  Note, KVM allows userspace to set "VMX in SMX" to maintain
+ * backwards compatibility even though KVM doesn't support emulating SMX.  And
+ * because userspace set "VMX in SMX", the guest must also be allowed to set it,
+ * e.g. if the MSR is left unlocked and the guest does a RMW operation.
+ */
+#define KVM_SUPPORTED_FEATURE_CONTROL  (FEAT_CTL_LOCKED			 | \
+					FEAT_CTL_VMX_ENABLED_INSIDE_SMX	 | \
+					FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \
+					FEAT_CTL_SGX_LC_ENABLED		 | \
+					FEAT_CTL_SGX_ENABLED		 | \
+					FEAT_CTL_LMCE_ENABLED)
+
+static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
+						    struct msr_data *msr)
 {
-	uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
+	uint64_t valid_bits;
 
-	return !(val & ~valid_bits);
+	/*
+	 * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are
+	 * exposed to the guest.
+	 */
+	WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits &
+		     ~KVM_SUPPORTED_FEATURE_CONTROL);
+
+	if (!msr->host_initiated &&
+	    (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED))
+		return false;
+
+	if (msr->host_initiated)
+		valid_bits = KVM_SUPPORTED_FEATURE_CONTROL;
+	else
+		valid_bits = vmx->msr_ia32_feature_control_valid_bits;
+
+	return !(msr->data & ~valid_bits);
 }
 
 static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
@@ -1849,9 +1881,6 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
 		if (!nested)
 			return 1;
 		return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
-	case MSR_IA32_PERF_CAPABILITIES:
-		msr->data = vmx_get_perf_capabilities();
-		return 0;
 	default:
 		return KVM_MSR_RET_INVALID;
 	}
@@ -2029,7 +2058,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
 	    (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
 		debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
 
-	if ((vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT) &&
+	if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) &&
 	    (host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
 		debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
 
@@ -2241,10 +2270,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		vcpu->arch.mcg_ext_ctl = data;
 		break;
 	case MSR_IA32_FEAT_CTL:
-		if (!vmx_feature_control_msr_valid(vcpu, data) ||
-		    (to_vmx(vcpu)->msr_ia32_feature_control &
-		     FEAT_CTL_LOCKED && !msr_info->host_initiated))
+		if (!is_vmx_feature_control_msr_valid(vmx, msr_info))
 			return 1;
+
 		vmx->msr_ia32_feature_control = data;
 		if (msr_info->host_initiated && data == 0)
 			vmx_leave_nested(vcpu);
@@ -2342,14 +2370,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		if (data & PMU_CAP_LBR_FMT) {
 			if ((data & PMU_CAP_LBR_FMT) !=
-			    (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT))
+			    (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT))
 				return 1;
 			if (!cpuid_model_is_consistent(vcpu))
 				return 1;
 		}
 		if (data & PERF_CAP_PEBS_FORMAT) {
 			if ((data & PERF_CAP_PEBS_MASK) !=
-			    (vmx_get_perf_capabilities() & PERF_CAP_PEBS_MASK))
+			    (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK))
 				return 1;
 			if (!guest_cpuid_has(vcpu, X86_FEATURE_DS))
 				return 1;
@@ -3412,18 +3440,15 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
 {
 	u32 ar;
 
-	if (var->unusable || !var->present)
-		ar = 1 << 16;
-	else {
-		ar = var->type & 15;
-		ar |= (var->s & 1) << 4;
-		ar |= (var->dpl & 3) << 5;
-		ar |= (var->present & 1) << 7;
-		ar |= (var->avl & 1) << 12;
-		ar |= (var->l & 1) << 13;
-		ar |= (var->db & 1) << 14;
-		ar |= (var->g & 1) << 15;
-	}
+	ar = var->type & 15;
+	ar |= (var->s & 1) << 4;
+	ar |= (var->dpl & 3) << 5;
+	ar |= (var->present & 1) << 7;
+	ar |= (var->avl & 1) << 12;
+	ar |= (var->l & 1) << 13;
+	ar |= (var->db & 1) << 14;
+	ar |= (var->g & 1) << 15;
+	ar |= (var->unusable || !var->present) << 16;
 
 	return ar;
 }
@@ -4431,6 +4456,13 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
 	 * controls for features that are/aren't exposed to the guest.
 	 */
 	if (nested) {
+		/*
+		 * All features that can be added or removed to VMX MSRs must
+		 * be supported in the first place for nested virtualization.
+		 */
+		if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control)))
+			enabled = false;
+
 		if (enabled)
 			vmx->nested.msrs.secondary_ctls_high |= control;
 		else
@@ -6844,6 +6876,8 @@ static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
 {
 	switch (index) {
 	case MSR_IA32_SMBASE:
+		if (!IS_ENABLED(CONFIG_KVM_SMM))
+			return false;
 		/*
 		 * We cannot do SMM unless we can run the guest in big
 		 * real mode.
@@ -7669,6 +7703,31 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 	vmx_update_exception_bitmap(vcpu);
 }
 
+static u64 vmx_get_perf_capabilities(void)
+{
+	u64 perf_cap = PMU_CAP_FW_WRITES;
+	struct x86_pmu_lbr lbr;
+	u64 host_perf_cap = 0;
+
+	if (!enable_pmu)
+		return 0;
+
+	if (boot_cpu_has(X86_FEATURE_PDCM))
+		rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
+
+	x86_perf_get_lbr(&lbr);
+	if (lbr.nr)
+		perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
+
+	if (vmx_pebs_supported()) {
+		perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
+		if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4)
+			perf_cap &= ~PERF_CAP_PEBS_BASELINE;
+	}
+
+	return perf_cap;
+}
+
 static __init void vmx_set_cpu_caps(void)
 {
 	kvm_set_cpu_caps();
@@ -7691,6 +7750,7 @@ static __init void vmx_set_cpu_caps(void)
 
 	if (!enable_pmu)
 		kvm_cpu_cap_clear(X86_FEATURE_PDCM);
+	kvm_caps.supported_perf_cap = vmx_get_perf_capabilities();
 
 	if (!enable_sgx) {
 		kvm_cpu_cap_clear(X86_FEATURE_SGX);
@@ -7906,6 +7966,7 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
 			~FEAT_CTL_LMCE_ENABLED;
 }
 
+#ifdef CONFIG_KVM_SMM
 static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
 {
 	/* we need a nested vmexit to enter SMM, postpone if run is pending */
@@ -7914,7 +7975,7 @@ static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
 	return !is_smm(vcpu);
 }
 
-static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
@@ -7935,7 +7996,7 @@ static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
 	return 0;
 }
 
-static int vmx_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
+static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	int ret;
@@ -7960,6 +8021,7 @@ static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
 {
 	/* RSM will cause a vmexit anyway.  */
 }
+#endif
 
 static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
 {
@@ -8127,10 +8189,12 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 
 	.setup_mce = vmx_setup_mce,
 
+#ifdef CONFIG_KVM_SMM
 	.smi_allowed = vmx_smi_allowed,
 	.enter_smm = vmx_enter_smm,
 	.leave_smm = vmx_leave_smm,
 	.enable_smi_window = vmx_enable_smi_window,
+#endif
 
 	.can_emulate_instruction = vmx_can_emulate_instruction,
 	.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
@@ -8490,8 +8554,8 @@ static int __init vmx_init(void)
 		}
 
 		if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
-			vmx_x86_ops.enable_direct_tlbflush
-				= hv_enable_direct_tlbflush;
+			vmx_x86_ops.enable_l2_tlb_flush
+				= hv_enable_l2_tlb_flush;
 
 	} else {
 		enlightened_vmcs = false;
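[Annotation] The is_vmx_feature_control_msr_valid() hunk above splits IA32_FEAT_CTL write validity by origin. Summarized from that hunk:

	/*
	 * IA32_FEAT_CTL write acceptance after this change:
	 *
	 *                host-initiated write        guest WRMSR
	 *  MSR locked    allowed, checked against    rejected (-> #GP),
	 *                KVM_SUPPORTED_FEATURE_      the lock is honored
	 *                CONTROL
	 *  MSR unlocked  allowed, checked against    checked against the
	 *                KVM_SUPPORTED_FEATURE_      per-vCPU
	 *                CONTROL                     msr_ia32_feature_control_
	 *                                            valid_bits
	 */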
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index ec268df83ed6..842dc898c972 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -6,19 +6,33 @@
 
 #include <asm/vmx.h>
 
-#include "evmcs.h"
+#include "hyperv.h"
 #include "vmcs.h"
 #include "../x86.h"
 
 void vmread_error(unsigned long field, bool fault);
-__attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field,
-							  bool fault);
 void vmwrite_error(unsigned long field, unsigned long value);
 void vmclear_error(struct vmcs *vmcs, u64 phys_addr);
 void vmptrld_error(struct vmcs *vmcs, u64 phys_addr);
 void invvpid_error(unsigned long ext, u16 vpid, gva_t gva);
 void invept_error(unsigned long ext, u64 eptp, gpa_t gpa);
 
+#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+/*
+ * The VMREAD error trampoline _always_ uses the stack to pass parameters, even
+ * for 64-bit targets.  Preserving all registers allows the VMREAD inline asm
+ * blob to avoid clobbering GPRs, which in turn allows the compiler to better
+ * optimize sequences of VMREADs.
+ *
+ * Declare the trampoline as an opaque label as it's not safe to call from C
+ * code; there is no way to tell the compiler to pass params on the stack for
+ * 64-bit targets.
+ *
+ * void vmread_error_trampoline(unsigned long field, bool fault);
+ */
+extern unsigned long vmread_error_trampoline;
+#endif
+
 static __always_inline void vmcs_check16(unsigned long field)
 {
 	BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,