summaryrefslogtreecommitdiff
path: root/arch/x86/kvm/vmx
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/vmx')
-rw-r--r--arch/x86/kvm/vmx/capabilities.h25
-rw-r--r--arch/x86/kvm/vmx/evmcs.h7
-rw-r--r--arch/x86/kvm/vmx/nested.c191
-rw-r--r--arch/x86/kvm/vmx/nested.h8
-rw-r--r--arch/x86/kvm/vmx/ops.h27
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c8
-rw-r--r--arch/x86/kvm/vmx/vmenter.S80
-rw-r--r--arch/x86/kvm/vmx/vmx.c736
-rw-r--r--arch/x86/kvm/vmx/vmx.h10
9 files changed, 617 insertions, 475 deletions
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index f486e2606247..8903475f751e 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -101,7 +101,7 @@ static inline bool cpu_has_load_perf_global_ctrl(void)
(vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
}
-static inline bool vmx_mpx_supported(void)
+static inline bool cpu_has_vmx_mpx(void)
{
return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
(vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS);
@@ -146,11 +146,6 @@ static inline bool vmx_umip_emulated(void)
SECONDARY_EXEC_DESC;
}
-static inline bool vmx_pku_supported(void)
-{
- return boot_cpu_has(X86_FEATURE_PKU);
-}
-
static inline bool cpu_has_vmx_rdtscp(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -354,4 +349,22 @@ static inline bool cpu_has_vmx_intel_pt(void)
(vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL);
}
+/*
+ * Processor Trace can operate in one of three modes:
+ * a. system-wide: trace both host/guest and output to host buffer
+ * b. host-only: only trace host and output to host buffer
+ * c. host-guest: trace host and guest simultaneously and output to their
+ * respective buffer
+ *
+ * KVM currently only supports (a) and (c).
+ */
+static inline bool vmx_pt_mode_is_system(void)
+{
+ return pt_mode == PT_MODE_SYSTEM;
+}
+static inline bool vmx_pt_mode_is_host_guest(void)
+{
+ return pt_mode == PT_MODE_HOST_GUEST;
+}
+
#endif /* __KVM_X86_VMX_CAPS_H */
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h
index 6de47f2569c9..e5f7a7ebf27d 100644
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/evmcs.h
@@ -198,6 +198,13 @@ static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {}
static inline void evmcs_touch_msr_bitmap(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */
+enum nested_evmptrld_status {
+ EVMPTRLD_DISABLED,
+ EVMPTRLD_SUCCEEDED,
+ EVMPTRLD_VMFAIL,
+ EVMPTRLD_ERROR,
+};
+
bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa);
uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu);
int nested_enable_evmcs(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index e920d7834d73..cbc9ea2de28f 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -224,7 +224,7 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
return;
kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
- vmx->nested.hv_evmcs_vmptr = -1ull;
+ vmx->nested.hv_evmcs_vmptr = 0;
vmx->nested.hv_evmcs = NULL;
}
@@ -353,9 +353,8 @@ static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
to_vmx(vcpu)->nested.msrs.ept_caps &
VMX_EPT_EXECUTE_ONLY_BIT,
nested_ept_ad_enabled(vcpu),
- nested_ept_get_cr3(vcpu));
- vcpu->arch.mmu->set_cr3 = vmx_set_cr3;
- vcpu->arch.mmu->get_cr3 = nested_ept_get_cr3;
+ nested_ept_get_eptp(vcpu));
+ vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp;
vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;
@@ -1910,20 +1909,21 @@ static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
* This is an equivalent of the nested hypervisor executing the vmptrld
* instruction.
*/
-static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
- bool from_launch)
+static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
+ struct kvm_vcpu *vcpu, bool from_launch)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
bool evmcs_gpa_changed = false;
u64 evmcs_gpa;
if (likely(!vmx->nested.enlightened_vmcs_enabled))
- return 1;
+ return EVMPTRLD_DISABLED;
if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa))
- return 1;
+ return EVMPTRLD_DISABLED;
- if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
+ if (unlikely(!vmx->nested.hv_evmcs ||
+ evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
if (!vmx->nested.hv_evmcs)
vmx->nested.current_vmptr = -1ull;
@@ -1931,7 +1931,7 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
&vmx->nested.hv_evmcs_map))
- return 0;
+ return EVMPTRLD_ERROR;
vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
@@ -1960,7 +1960,7 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
(vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
nested_release_evmcs(vcpu);
- return 0;
+ return EVMPTRLD_VMFAIL;
}
vmx->nested.dirty_vmcs12 = true;
@@ -1989,21 +1989,13 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
vmx->nested.hv_evmcs->hv_clean_fields &=
~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
- return 1;
+ return EVMPTRLD_SUCCEEDED;
}
void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- /*
- * hv_evmcs may end up being not mapped after migration (when
- * L2 was running), map it here to make sure vmcs12 changes are
- * properly reflected.
- */
- if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs)
- nested_vmx_handle_enlightened_vmptrld(vcpu, false);
-
if (vmx->nested.hv_evmcs) {
copy_vmcs12_to_enlightened(vmx);
/* All fields are clean */
@@ -2474,9 +2466,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
* If L1 use EPT, then L0 needs to execute INVEPT on
* EPTP02 instead of EPTP01. Therefore, delay TLB
* flush until vmcs02->eptp is fully updated by
- * KVM_REQ_LOAD_CR3. Note that this assumes
+ * KVM_REQ_LOAD_MMU_PGD. Note that this assumes
* KVM_REQ_TLB_FLUSH is evaluated after
- * KVM_REQ_LOAD_CR3 in vcpu_enter_guest().
+ * KVM_REQ_LOAD_MMU_PGD in vcpu_enter_guest().
*/
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
@@ -2521,7 +2513,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
/*
* Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12
* on nested VM-Exit, which can occur without actually running L2 and
- * thus without hitting vmx_set_cr3(), e.g. if L1 is entering L2 with
+ * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
* vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
* transition to HLT instead of running L2.
*/
@@ -2563,13 +2555,13 @@ static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
return 0;
}
-static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
+static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int maxphyaddr = cpuid_maxphyaddr(vcpu);
/* Check for memory type validity */
- switch (address & VMX_EPTP_MT_MASK) {
+ switch (new_eptp & VMX_EPTP_MT_MASK) {
case VMX_EPTP_MT_UC:
if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
return false;
@@ -2582,16 +2574,26 @@ static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
return false;
}
- /* only 4 levels page-walk length are valid */
- if (CC((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4))
+ /* Page-walk levels validity. */
+ switch (new_eptp & VMX_EPTP_PWL_MASK) {
+ case VMX_EPTP_PWL_5:
+ if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT)))
+ return false;
+ break;
+ case VMX_EPTP_PWL_4:
+ if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT)))
+ return false;
+ break;
+ default:
return false;
+ }
/* Reserved bits should not be set */
- if (CC(address >> maxphyaddr || ((address >> 7) & 0x1f)))
+ if (CC(new_eptp >> maxphyaddr || ((new_eptp >> 7) & 0x1f)))
return false;
/* AD, if set, should be supported */
- if (address & VMX_EPTP_AD_ENABLE_BIT) {
+ if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) {
if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)))
return false;
}
@@ -2640,7 +2642,7 @@ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
return -EINVAL;
if (nested_cpu_has_ept(vmcs12) &&
- CC(!valid_ept_address(vcpu, vmcs12->ept_pointer)))
+ CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer)))
return -EINVAL;
if (nested_cpu_has_vmfunc(vmcs12)) {
@@ -2960,7 +2962,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
/*
* Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
* which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to
- * be written (by preparve_vmcs02()) before the "real" VMEnter, i.e.
+ * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
* there is no need to preserve other bits or save/restore the field.
*/
vmcs_writel(GUEST_RFLAGS, 0);
@@ -3052,6 +3054,27 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
struct page *page;
u64 hpa;
+ /*
+ * hv_evmcs may end up being not mapped after migration (when
+ * L2 was running), map it here to make sure vmcs12 changes are
+ * properly reflected.
+ */
+ if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) {
+ enum nested_evmptrld_status evmptrld_status =
+ nested_vmx_handle_enlightened_vmptrld(vcpu, false);
+
+ if (evmptrld_status == EVMPTRLD_VMFAIL ||
+ evmptrld_status == EVMPTRLD_ERROR) {
+ pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
+ __func__);
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
+ KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return false;
+ }
+ }
+
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
/*
* Translate L1 physical address to host physical
@@ -3315,12 +3338,18 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
enum nvmx_vmentry_status status;
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
+ enum nested_evmptrld_status evmptrld_status;
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (!nested_vmx_handle_enlightened_vmptrld(vcpu, launch))
+ evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch);
+ if (evmptrld_status == EVMPTRLD_ERROR) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
+ } else if (evmptrld_status == EVMPTRLD_VMFAIL) {
+ return nested_vmx_failInvalid(vcpu);
+ }
if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)
return nested_vmx_failInvalid(vcpu);
@@ -3498,7 +3527,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
}
-static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
+void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
gfn_t gfn;
@@ -3603,7 +3632,7 @@ static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
vcpu->arch.exception.payload);
}
-static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
+static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long exit_qual;
@@ -3616,7 +3645,8 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
* Clear the MTF state. If a higher priority VM-exit is delivered first,
* this state is discarded.
*/
- vmx->nested.mtf_pending = false;
+ if (!block_nested_events)
+ vmx->nested.mtf_pending = false;
if (lapic_in_kernel(vcpu) &&
test_bit(KVM_APIC_INIT, &apic->pending_events)) {
@@ -3679,8 +3709,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
return 0;
}
- if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
- nested_exit_on_intr(vcpu)) {
+ if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(vcpu)) {
if (block_nested_events)
return -EBUSY;
nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
@@ -4023,7 +4052,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
*
* If vmcs12 uses EPT, we need to execute this flush on EPTP01
* and therefore we request the TLB flush to happen only after VMCS EPTP
- * has been set by KVM_REQ_LOAD_CR3.
+ * has been set by KVM_REQ_LOAD_MMU_PGD.
*/
if (enable_vpid &&
(!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) {
@@ -4328,17 +4357,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
if (likely(!vmx->fail)) {
- /*
- * TODO: SDM says that with acknowledge interrupt on
- * exit, bit 31 of the VM-exit interrupt information
- * (valid interrupt) is always set to 1 on
- * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't
- * need kvm_cpu_has_interrupt(). See the commit
- * message for details.
- */
- if (nested_exit_intr_ack_set(vcpu) &&
- exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
- kvm_cpu_has_interrupt(vcpu)) {
+ if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
+ nested_exit_intr_ack_set(vcpu)) {
int irq = kvm_cpu_get_interrupt(vcpu);
WARN_ON(irq < 0);
vmcs12->vm_exit_intr_info = irq |
@@ -4382,7 +4402,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
* Decode the memory-address operand of a vmx instruction, as recorded on an
* exit caused by such an instruction (run by a guest hypervisor).
* On success, returns 0. When the operand is invalid, returns 1 and throws
- * #UD or #GP.
+ * #UD, #GP, or #SS.
*/
int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
@@ -4423,7 +4443,7 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
if (base_is_valid)
off += kvm_register_read(vcpu, base_reg);
if (index_is_valid)
- off += kvm_register_read(vcpu, index_reg)<<scaling;
+ off += kvm_register_read(vcpu, index_reg) << scaling;
vmx_get_segment(vcpu, &s, seg_reg);
/*
@@ -4516,7 +4536,7 @@ void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
return;
vmx = to_vmx(vcpu);
- if (kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) {
+ if (kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) {
vmx->nested.msrs.entry_ctls_high |=
VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
vmx->nested.msrs.exit_ctls_high |=
@@ -4602,7 +4622,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
vmx->nested.vmcs02_initialized = false;
vmx->nested.vmxon = true;
- if (pt_mode == PT_MODE_HOST_GUEST) {
+ if (vmx_pt_mode_is_host_guest()) {
vmx->pt_desc.guest.ctl = 0;
pt_update_intercept_for_msr(vmx);
}
@@ -5234,7 +5254,7 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
u32 index = kvm_rcx_read(vcpu);
- u64 address;
+ u64 new_eptp;
bool accessed_dirty;
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
@@ -5247,23 +5267,23 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
- &address, index * 8, 8))
+ &new_eptp, index * 8, 8))
return 1;
- accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT);
+ accessed_dirty = !!(new_eptp & VMX_EPTP_AD_ENABLE_BIT);
/*
* If the (L2) guest does a vmfunc to the currently
* active ept pointer, we don't have to do anything else
*/
- if (vmcs12->ept_pointer != address) {
- if (!valid_ept_address(vcpu, address))
+ if (vmcs12->ept_pointer != new_eptp) {
+ if (!nested_vmx_check_eptp(vcpu, new_eptp))
return 1;
kvm_mmu_unload(vcpu);
mmu->ept_ad = accessed_dirty;
mmu->mmu_role.base.ad_disabled = !accessed_dirty;
- vmcs12->ept_pointer = address;
+ vmcs12->ept_pointer = new_eptp;
/*
* TODO: Check what's the correct approach in case
* mmu reload fails. Currently, we just let the next
@@ -5524,8 +5544,7 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- if (vmx->nested.nested_run_pending)
- return false;
+ WARN_ON_ONCE(vmx->nested.nested_run_pending);
if (unlikely(vmx->fail)) {
trace_kvm_nested_vmenter_failed(
@@ -5534,19 +5553,6 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
return true;
}
- /*
- * The host physical addresses of some pages of guest memory
- * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
- * Page). The CPU may write to these pages via their host
- * physical address while L2 is running, bypassing any
- * address-translation-based dirty tracking (e.g. EPT write
- * protection).
- *
- * Mark them dirty on every exit from L2 to prevent them from
- * getting out of sync with dirty tracking.
- */
- nested_mark_vmcs12_pages_dirty(vcpu);
-
trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
vmcs_readl(EXIT_QUALIFICATION),
vmx->idt_vectoring_info,
@@ -5627,7 +5633,7 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
case EXIT_REASON_MWAIT_INSTRUCTION:
return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
case EXIT_REASON_MONITOR_TRAP_FLAG:
- return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
+ return nested_cpu_has_mtf(vmcs12);
case EXIT_REASON_MONITOR_INSTRUCTION:
return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
case EXIT_REASON_PAUSE_INSTRUCTION:
@@ -5904,10 +5910,12 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
} else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
/*
- * Sync eVMCS upon entry as we may not have
- * HV_X64_MSR_VP_ASSIST_PAGE set up yet.
+ * nested_vmx_handle_enlightened_vmptrld() cannot be called
+ * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be
+ * restored yet. EVMCS will be mapped from
+ * nested_get_vmcs12_pages().
*/
- vmx->nested.need_vmcs12_to_shadow_sync = true;
+ kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
} else {
return -EINVAL;
}
@@ -6129,11 +6137,13 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
/* nested EPT: emulate EPT also to L1 */
msrs->secondary_ctls_high |=
SECONDARY_EXEC_ENABLE_EPT;
- msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
- VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
- if (cpu_has_vmx_ept_execute_only())
- msrs->ept_caps |=
- VMX_EPT_EXECUTE_ONLY_BIT;
+ msrs->ept_caps =
+ VMX_EPT_PAGE_WALK_4_BIT |
+ VMX_EPT_PAGE_WALK_5_BIT |
+ VMX_EPTP_WB_BIT |
+ VMX_EPT_INVEPT_BIT |
+ VMX_EPT_EXECUTE_ONLY_BIT;
+
msrs->ept_caps &= ept_caps;
msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
@@ -6232,7 +6242,8 @@ void nested_vmx_hardware_unsetup(void)
}
}
-__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
+__init int nested_vmx_hardware_setup(struct kvm_x86_ops *ops,
+ int (*exit_handlers[])(struct kvm_vcpu *))
{
int i;
@@ -6268,12 +6279,12 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid;
exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc;
- kvm_x86_ops->check_nested_events = vmx_check_nested_events;
- kvm_x86_ops->get_nested_state = vmx_get_nested_state;
- kvm_x86_ops->set_nested_state = vmx_set_nested_state;
- kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages;
- kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs;
- kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version;
+ ops->check_nested_events = vmx_check_nested_events;
+ ops->get_nested_state = vmx_get_nested_state;
+ ops->set_nested_state = vmx_set_nested_state;
+ ops->get_vmcs12_pages = nested_get_vmcs12_pages;
+ ops->nested_enable_evmcs = nested_enable_evmcs;
+ ops->nested_get_evmcs_version = nested_get_evmcs_version;
return 0;
}
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 9aeda46f473e..ac56aefa49e3 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -19,7 +19,8 @@ enum nvmx_vmentry_status {
void vmx_leave_nested(struct kvm_vcpu *vcpu);
void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps);
void nested_vmx_hardware_unsetup(void);
-__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *));
+__init int nested_vmx_hardware_setup(struct kvm_x86_ops *ops,
+ int (*exit_handlers[])(struct kvm_vcpu *));
void nested_vmx_set_vmcs_shadowing_bitmap(void);
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu);
enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
@@ -33,6 +34,7 @@ int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata);
int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
u32 vmx_instruction_info, bool wr, int len, gva_t *ret);
void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu);
+void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu);
bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
int size);
@@ -60,7 +62,7 @@ static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu)
vmx->nested.hv_evmcs;
}
-static inline unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
+static inline unsigned long nested_ept_get_eptp(struct kvm_vcpu *vcpu)
{
/* return the page table to be shadowed - in our case, EPT12 */
return get_vmcs12(vcpu)->ept_pointer;
@@ -68,7 +70,7 @@ static inline unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
static inline bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
{
- return nested_ept_get_cr3(vcpu) & VMX_EPTP_AD_ENABLE_BIT;
+ return nested_ept_get_eptp(vcpu) & VMX_EPTP_AD_ENABLE_BIT;
}
/*
diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/ops.h
index 45eaedee2ac0..19717d0a1100 100644
--- a/arch/x86/kvm/vmx/ops.h
+++ b/arch/x86/kvm/vmx/ops.h
@@ -13,6 +13,8 @@
#define __ex(x) __kvm_handle_fault_on_reboot(x)
asmlinkage void vmread_error(unsigned long field, bool fault);
+__attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field,
+ bool fault);
void vmwrite_error(unsigned long field, unsigned long value);
void vmclear_error(struct vmcs *vmcs, u64 phys_addr);
void vmptrld_error(struct vmcs *vmcs, u64 phys_addr);
@@ -70,15 +72,28 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
asm volatile("1: vmread %2, %1\n\t"
".byte 0x3e\n\t" /* branch taken hint */
"ja 3f\n\t"
- "mov %2, %%" _ASM_ARG1 "\n\t"
- "xor %%" _ASM_ARG2 ", %%" _ASM_ARG2 "\n\t"
- "2: call vmread_error\n\t"
- "xor %k1, %k1\n\t"
+
+ /*
+ * VMREAD failed. Push '0' for @fault, push the failing
+ * @field, and bounce through the trampoline to preserve
+ * volatile registers.
+ */
+ "push $0\n\t"
+ "push %2\n\t"
+ "2:call vmread_error_trampoline\n\t"
+
+ /*
+ * Unwind the stack. Note, the trampoline zeros out the
+ * memory for @fault so that the result is '0' on error.
+ */
+ "pop %2\n\t"
+ "pop %1\n\t"
"3:\n\t"
+ /* VMREAD faulted. As above, except push '1' for @fault. */
".pushsection .fixup, \"ax\"\n\t"
- "4: mov %2, %%" _ASM_ARG1 "\n\t"
- "mov $1, %%" _ASM_ARG2 "\n\t"
+ "4: push $1\n\t"
+ "push %2\n\t"
"jmp 2b\n\t"
".popsection\n\t"
_ASM_EXTABLE(1b, 4b)
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index fd21cdb10b79..7c857737b438 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -263,9 +263,15 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!msr_info->host_initiated)
data = (s64)(s32)data;
pmc->counter += data - pmc_read_counter(pmc);
+ if (pmc->perf_event)
+ perf_event_period(pmc->perf_event,
+ get_sample_period(pmc, data));
return 0;
} else if ((pmc = get_fixed_pmc(pmu, msr))) {
pmc->counter += data - pmc_read_counter(pmc);
+ if (pmc->perf_event)
+ perf_event_period(pmc->perf_event,
+ get_sample_period(pmc, data));
return 0;
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
if (data == pmc->eventsel)
@@ -329,7 +335,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask
& ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF |
MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD);
- if (kvm_x86_ops->pt_supported())
+ if (vmx_pt_mode_is_host_guest())
pmu->global_ovf_ctrl_mask &=
~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI;
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 81ada2ce99e7..87f3f24fef37 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -58,12 +58,8 @@ SYM_FUNC_START(vmx_vmenter)
ret
4: ud2
- .pushsection .fixup, "ax"
-5: jmp 3b
- .popsection
-
- _ASM_EXTABLE(1b, 5b)
- _ASM_EXTABLE(2b, 5b)
+ _ASM_EXTABLE(1b, 3b)
+ _ASM_EXTABLE(2b, 3b)
SYM_FUNC_END(vmx_vmenter)
@@ -135,12 +131,12 @@ SYM_FUNC_START(__vmx_vcpu_run)
cmpb $0, %bl
/* Load guest registers. Don't clobber flags. */
- mov VCPU_RBX(%_ASM_AX), %_ASM_BX
mov VCPU_RCX(%_ASM_AX), %_ASM_CX
mov VCPU_RDX(%_ASM_AX), %_ASM_DX
+ mov VCPU_RBX(%_ASM_AX), %_ASM_BX
+ mov VCPU_RBP(%_ASM_AX), %_ASM_BP
mov VCPU_RSI(%_ASM_AX), %_ASM_SI
mov VCPU_RDI(%_ASM_AX), %_ASM_DI
- mov VCPU_RBP(%_ASM_AX), %_ASM_BP
#ifdef CONFIG_X86_64
mov VCPU_R8 (%_ASM_AX), %r8
mov VCPU_R9 (%_ASM_AX), %r9
@@ -168,12 +164,12 @@ SYM_FUNC_START(__vmx_vcpu_run)
/* Save all guest registers, including RAX from the stack */
__ASM_SIZE(pop) VCPU_RAX(%_ASM_AX)
- mov %_ASM_BX, VCPU_RBX(%_ASM_AX)
mov %_ASM_CX, VCPU_RCX(%_ASM_AX)
mov %_ASM_DX, VCPU_RDX(%_ASM_AX)
+ mov %_ASM_BX, VCPU_RBX(%_ASM_AX)
+ mov %_ASM_BP, VCPU_RBP(%_ASM_AX)
mov %_ASM_SI, VCPU_RSI(%_ASM_AX)
mov %_ASM_DI, VCPU_RDI(%_ASM_AX)
- mov %_ASM_BP, VCPU_RBP(%_ASM_AX)
#ifdef CONFIG_X86_64
mov %r8, VCPU_R8 (%_ASM_AX)
mov %r9, VCPU_R9 (%_ASM_AX)
@@ -197,12 +193,12 @@ SYM_FUNC_START(__vmx_vcpu_run)
* free. RSP and RAX are exempt as RSP is restored by hardware during
* VM-Exit and RAX is explicitly loaded with 0 or 1 to return VM-Fail.
*/
-1: xor %ebx, %ebx
- xor %ecx, %ecx
+1: xor %ecx, %ecx
xor %edx, %edx
+ xor %ebx, %ebx
+ xor %ebp, %ebp
xor %esi, %esi
xor %edi, %edi
- xor %ebp, %ebp
#ifdef CONFIG_X86_64
xor %r8d, %r8d
xor %r9d, %r9d
@@ -234,3 +230,61 @@ SYM_FUNC_START(__vmx_vcpu_run)
2: mov $1, %eax
jmp 1b
SYM_FUNC_END(__vmx_vcpu_run)
+
+/**
+ * vmread_error_trampoline - Trampoline from inline asm to vmread_error()
+ * @field: VMCS field encoding that failed
+ * @fault: %true if the VMREAD faulted, %false if it failed
+
+ * Save and restore volatile registers across a call to vmread_error(). Note,
+ * all parameters are passed on the stack.
+ */
+SYM_FUNC_START(vmread_error_trampoline)
+ push %_ASM_BP
+ mov %_ASM_SP, %_ASM_BP
+
+ push %_ASM_AX
+ push %_ASM_CX
+ push %_ASM_DX
+#ifdef CONFIG_X86_64
+ push %rdi
+ push %rsi
+ push %r8
+ push %r9
+ push %r10
+ push %r11
+#endif
+#ifdef CONFIG_X86_64
+ /* Load @field and @fault to arg1 and arg2 respectively. */
+ mov 3*WORD_SIZE(%rbp), %_ASM_ARG2
+ mov 2*WORD_SIZE(%rbp), %_ASM_ARG1
+#else
+ /* Parameters are passed on the stack for 32-bit (see asmlinkage). */
+ push 3*WORD_SIZE(%ebp)
+ push 2*WORD_SIZE(%ebp)
+#endif
+
+ call vmread_error
+
+#ifndef CONFIG_X86_64
+ add $8, %esp
+#endif
+
+ /* Zero out @fault, which will be popped into the result register. */
+ _ASM_MOV $0, 3*WORD_SIZE(%_ASM_BP)
+
+#ifdef CONFIG_X86_64
+ pop %r11
+ pop %r10
+ pop %r9
+ pop %r8
+ pop %rsi
+ pop %rdi
+#endif
+ pop %_ASM_DX
+ pop %_ASM_CX
+ pop %_ASM_AX
+ pop %_ASM_BP
+
+ ret
+SYM_FUNC_END(vmread_error_trampoline)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 40b1e6138cd5..83050977490c 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -31,6 +31,7 @@
#include <asm/apic.h>
#include <asm/asm.h>
#include <asm/cpu.h>
+#include <asm/cpu_device_id.h>
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/fpu/internal.h>
@@ -41,6 +42,7 @@
#include <asm/mce.h>
#include <asm/mmu_context.h>
#include <asm/mshyperv.h>
+#include <asm/mwait.h>
#include <asm/spec-ctrl.h>
#include <asm/virtext.h>
#include <asm/vmx.h>
@@ -66,7 +68,7 @@ MODULE_LICENSE("GPL");
#ifdef MODULE
static const struct x86_cpu_id vmx_cpu_id[] = {
- X86_FEATURE_MATCH(X86_FEATURE_VMX),
+ X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
{}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
@@ -435,7 +437,6 @@ static const struct kvm_vmx_segment_field {
VMX_SEGMENT_FIELD(LDTR),
};
-u64 host_efer;
static unsigned long host_idt_base;
/*
@@ -656,53 +657,16 @@ static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct shared_msr_entry *msr,
return ret;
}
-void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
-{
- vmcs_clear(loaded_vmcs->vmcs);
- if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
- vmcs_clear(loaded_vmcs->shadow_vmcs);
- loaded_vmcs->cpu = -1;
- loaded_vmcs->launched = 0;
-}
-
#ifdef CONFIG_KEXEC_CORE
-/*
- * This bitmap is used to indicate whether the vmclear
- * operation is enabled on all cpus. All disabled by
- * default.
- */
-static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
-
-static inline void crash_enable_local_vmclear(int cpu)
-{
- cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
-static inline void crash_disable_local_vmclear(int cpu)
-{
- cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
-static inline int crash_local_vmclear_enabled(int cpu)
-{
- return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
static void crash_vmclear_local_loaded_vmcss(void)
{
int cpu = raw_smp_processor_id();
struct loaded_vmcs *v;
- if (!crash_local_vmclear_enabled(cpu))
- return;
-
list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
loaded_vmcss_on_cpu_link)
vmcs_clear(v->vmcs);
}
-#else
-static inline void crash_enable_local_vmclear(int cpu) { }
-static inline void crash_disable_local_vmclear(int cpu) { }
#endif /* CONFIG_KEXEC_CORE */
static void __loaded_vmcs_clear(void *arg)
@@ -714,19 +678,24 @@ static void __loaded_vmcs_clear(void *arg)
return; /* vcpu migration can race with cpu offline */
if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
per_cpu(current_vmcs, cpu) = NULL;
- crash_disable_local_vmclear(cpu);
+
+ vmcs_clear(loaded_vmcs->vmcs);
+ if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
+ vmcs_clear(loaded_vmcs->shadow_vmcs);
+
list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
/*
- * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link
- * is before setting loaded_vmcs->vcpu to -1 which is done in
- * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist
- * then adds the vmcs into percpu list before it is deleted.
+ * Ensure all writes to loaded_vmcs, including deleting it from its
+ * current percpu list, complete before setting loaded_vmcs->vcpu to
+ * -1, otherwise a different cpu can see vcpu == -1 first and add
+ * loaded_vmcs to its percpu list before it's deleted from this cpu's
+ * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
*/
smp_wmb();
- loaded_vmcs_init(loaded_vmcs);
- crash_enable_local_vmclear(cpu);
+ loaded_vmcs->cpu = -1;
+ loaded_vmcs->launched = 0;
}
void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
@@ -810,7 +779,7 @@ void update_exception_bitmap(struct kvm_vcpu *vcpu)
if (to_vmx(vcpu)->rmode.vm86_active)
eb = ~0;
if (enable_ept)
- eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
+ eb &= ~(1u << PF_VECTOR);
/* When we are running a nested L2 guest and L1 specified for it a
* certain exception bitmap, we must trap the same exceptions and pass
@@ -1061,7 +1030,7 @@ static unsigned long segment_base(u16 selector)
static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
{
- return (pt_mode == PT_MODE_HOST_GUEST) &&
+ return vmx_pt_mode_is_host_guest() &&
!(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
}
@@ -1095,7 +1064,7 @@ static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
static void pt_guest_enter(struct vcpu_vmx *vmx)
{
- if (pt_mode == PT_MODE_SYSTEM)
+ if (vmx_pt_mode_is_system())
return;
/*
@@ -1112,7 +1081,7 @@ static void pt_guest_enter(struct vcpu_vmx *vmx)
static void pt_guest_exit(struct vcpu_vmx *vmx)
{
- if (pt_mode == PT_MODE_SYSTEM)
+ if (vmx_pt_mode_is_system())
return;
if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
@@ -1345,18 +1314,17 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
if (!already_loaded) {
loaded_vmcs_clear(vmx->loaded_vmcs);
local_irq_disable();
- crash_disable_local_vmclear(cpu);
/*
- * Read loaded_vmcs->cpu should be before fetching
- * loaded_vmcs->loaded_vmcss_on_cpu_link.
- * See the comments in __loaded_vmcs_clear().
+ * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
+ * this cpu's percpu list, otherwise it may not yet be deleted
+ * from its previous cpu's percpu list. Pairs with the
+ * smb_wmb() in __loaded_vmcs_clear().
*/
smp_rmb();
list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
&per_cpu(loaded_vmcss_on_cpu, cpu));
- crash_enable_local_vmclear(cpu);
local_irq_enable();
}
@@ -1689,16 +1657,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
vmx_clear_hlt(vcpu);
}
-static bool vmx_rdtscp_supported(void)
-{
- return cpu_has_vmx_rdtscp();
-}
-
-static bool vmx_invpcid_supported(void)
-{
- return cpu_has_vmx_invpcid();
-}
-
/*
* Swap MSR entry in host/guest MSR entry array.
*/
@@ -1906,24 +1864,24 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
&msr_info->data);
break;
case MSR_IA32_RTIT_CTL:
- if (pt_mode != PT_MODE_HOST_GUEST)
+ if (!vmx_pt_mode_is_host_guest())
return 1;
msr_info->data = vmx->pt_desc.guest.ctl;
break;
case MSR_IA32_RTIT_STATUS:
- if (pt_mode != PT_MODE_HOST_GUEST)
+ if (!vmx_pt_mode_is_host_guest())
return 1;
msr_info->data = vmx->pt_desc.guest.status;
break;
case MSR_IA32_RTIT_CR3_MATCH:
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
+ if (!vmx_pt_mode_is_host_guest() ||
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_cr3_filtering))
return 1;
msr_info->data = vmx->pt_desc.guest.cr3_match;
break;
case MSR_IA32_RTIT_OUTPUT_BASE:
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
+ if (!vmx_pt_mode_is_host_guest() ||
(!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_topa_output) &&
!intel_pt_validate_cap(vmx->pt_desc.caps,
@@ -1932,7 +1890,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vmx->pt_desc.guest.output_base;
break;
case MSR_IA32_RTIT_OUTPUT_MASK:
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
+ if (!vmx_pt_mode_is_host_guest() ||
(!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_topa_output) &&
!intel_pt_validate_cap(vmx->pt_desc.caps,
@@ -1942,7 +1900,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
+ if (!vmx_pt_mode_is_host_guest() ||
(index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_num_address_ranges)))
return 1;
@@ -2148,7 +2106,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
return vmx_set_vmx_msr(vcpu, msr_index, data);
case MSR_IA32_RTIT_CTL:
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
+ if (!vmx_pt_mode_is_host_guest() ||
vmx_rtit_ctl_check(vcpu, data) ||
vmx->nested.vmxon)
return 1;
@@ -2264,18 +2222,33 @@ static __init int vmx_disabled_by_bios(void)
!boot_cpu_has(X86_FEATURE_VMX);
}
-static void kvm_cpu_vmxon(u64 addr)
+static int kvm_cpu_vmxon(u64 vmxon_pointer)
{
+ u64 msr;
+
cr4_set_bits(X86_CR4_VMXE);
intel_pt_handle_vmx(1);
- asm volatile ("vmxon %0" : : "m"(addr));
+ asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
+ _ASM_EXTABLE(1b, %l[fault])
+ : : [vmxon_pointer] "m"(vmxon_pointer)
+ : : fault);
+ return 0;
+
+fault:
+ WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
+ rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
+ intel_pt_handle_vmx(0);
+ cr4_clear_bits(X86_CR4_VMXE);
+
+ return -EFAULT;
}
static int hardware_enable(void)
{
int cpu = raw_smp_processor_id();
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+ int r;
if (cr4_read_shadow() & X86_CR4_VMXE)
return -EBUSY;
@@ -2288,22 +2261,10 @@ static int hardware_enable(void)
!hv_get_vp_assist_page(cpu))
return -EFAULT;
- INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
- INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
- spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
-
- /*
- * Now we can enable the vmclear operation in kdump
- * since the loaded_vmcss_on_cpu list on this cpu
- * has been initialized.
- *
- * Though the cpu is not in VMX operation now, there
- * is no problem to enable the vmclear operation
- * for the loaded_vmcss_on_cpu list is empty!
- */
- crash_enable_local_vmclear(cpu);
+ r = kvm_cpu_vmxon(phys_addr);
+ if (r)
+ return r;
- kvm_cpu_vmxon(phys_addr);
if (enable_ept)
ept_sync_global();
@@ -2338,6 +2299,17 @@ static void hardware_disable(void)
kvm_cpu_vmxoff();
}
+/*
+ * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
+ * directly instead of going through cpu_has(), to ensure KVM is trapping
+ * ENCLS whenever it's supported in hardware. It does not matter whether
+ * the host OS supports or has enabled SGX.
+ */
+static bool cpu_has_sgx(void)
+{
+ return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
+}
+
static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
u32 msr, u32 *result)
{
@@ -2418,8 +2390,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
SECONDARY_EXEC_PT_USE_GPA |
SECONDARY_EXEC_PT_CONCEAL_VMX |
- SECONDARY_EXEC_ENABLE_VMFUNC |
- SECONDARY_EXEC_ENCLS_EXITING;
+ SECONDARY_EXEC_ENABLE_VMFUNC;
+ if (cpu_has_sgx())
+ opt2 |= SECONDARY_EXEC_ENCLS_EXITING;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -2603,9 +2576,12 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
if (!loaded_vmcs->vmcs)
return -ENOMEM;
+ vmcs_clear(loaded_vmcs->vmcs);
+
loaded_vmcs->shadow_vmcs = NULL;
loaded_vmcs->hv_timer_soft_disabled = false;
- loaded_vmcs_init(loaded_vmcs);
+ loaded_vmcs->cpu = -1;
+ loaded_vmcs->launched = 0;
if (cpu_has_vmx_msr_bitmap()) {
loaded_vmcs->msr_bitmap = (unsigned long *)
@@ -2987,9 +2963,8 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
static int get_ept_level(struct kvm_vcpu *vcpu)
{
- /* Nested EPT currently only supports 4-level walks. */
if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
- return 4;
+ return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu));
if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
return 5;
return 4;
@@ -3009,7 +2984,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
return eptp;
}
-void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3)
{
struct kvm *kvm = vcpu->kvm;
bool update_guest_cr3 = true;
@@ -3021,7 +2996,7 @@ void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
eptp = construct_eptp(vcpu, cr3);
vmcs_write64(EPT_POINTER, eptp);
- if (kvm_x86_ops->tlb_remote_flush) {
+ if (kvm_x86_ops.tlb_remote_flush) {
spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
to_vmx(vcpu)->ept_pointer = eptp;
to_kvm_vmx(kvm)->ept_pointers_match
@@ -4026,7 +4001,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
- if (pt_mode == PT_MODE_SYSTEM)
+ if (vmx_pt_mode_is_system())
exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
if (!cpu_need_virtualize_apic_accesses(vcpu))
exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
@@ -4081,7 +4056,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
}
}
- if (vmx_rdtscp_supported()) {
+ if (cpu_has_vmx_rdtscp()) {
bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
if (!rdtscp_enabled)
exec_control &= ~SECONDARY_EXEC_RDTSCP;
@@ -4096,7 +4071,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
}
}
- if (vmx_invpcid_supported()) {
+ if (cpu_has_vmx_invpcid()) {
/* Exposing INVPCID only when PCID is exposed */
bool invpcid_enabled =
guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
@@ -4267,7 +4242,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
if (cpu_has_vmx_encls_vmexit())
vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
- if (pt_mode == PT_MODE_HOST_GUEST) {
+ if (vmx_pt_mode_is_host_guest()) {
memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
/* Bit[6~0] are forced to 1, writes are ignored. */
vmx->pt_desc.guest.output_mask = 0x7F;
@@ -4495,8 +4470,13 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
{
- return (!to_vmx(vcpu)->nested.nested_run_pending &&
- vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+ if (to_vmx(vcpu)->nested.nested_run_pending)
+ return false;
+
+ if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+ return true;
+
+ return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
}
@@ -4552,7 +4532,6 @@ static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
case GP_VECTOR:
case MF_VECTOR:
return true;
- break;
}
return false;
}
@@ -4609,6 +4588,26 @@ static int handle_machine_check(struct kvm_vcpu *vcpu)
return 1;
}
+/*
+ * If the host has split lock detection disabled, then #AC is
+ * unconditionally injected into the guest, which is the pre split lock
+ * detection behaviour.
+ *
+ * If the host has split lock detection enabled then #AC is
+ * only injected into the guest when:
+ * - Guest CPL == 3 (user mode)
+ * - Guest has #AC detection enabled in CR0
+ * - Guest EFLAGS has AC bit set
+ */
+static inline bool guest_inject_ac(struct kvm_vcpu *vcpu)
+{
+ if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
+ return true;
+
+ return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) &&
+ (kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
+}
+
static int handle_exception_nmi(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4674,9 +4673,6 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
return handle_rmode_exception(vcpu, ex_no, error_code);
switch (ex_no) {
- case AC_VECTOR:
- kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
- return 1;
case DB_VECTOR:
dr6 = vmcs_readl(EXIT_QUALIFICATION);
if (!(vcpu->guest_debug &
@@ -4705,6 +4701,20 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
kvm_run->debug.arch.exception = ex_no;
break;
+ case AC_VECTOR:
+ if (guest_inject_ac(vcpu)) {
+ kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
+ return 1;
+ }
+
+ /*
+ * Handle split lock. Depending on detection mode this will
+ * either warn and disable split lock detection for this
+ * task or force SIGBUS on it.
+ */
+ if (handle_guest_split_lock(kvm_rip_read(vcpu)))
+ return 1;
+ fallthrough;
default:
kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
kvm_run->ex.exception = ex_no;
@@ -5329,7 +5339,6 @@ static void vmx_enable_tdp(void)
VMX_EPT_RWX_MASK, 0ull);
ept_set_mmio_spte_mask();
- kvm_enable_tdp();
}
/*
@@ -5862,8 +5871,23 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu,
if (vmx->emulation_required)
return handle_invalid_guest_state(vcpu);
- if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
- return nested_vmx_reflect_vmexit(vcpu, exit_reason);
+ if (is_guest_mode(vcpu)) {
+ /*
+ * The host physical addresses of some pages of guest memory
+ * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
+ * Page). The CPU may write to these pages via their host
+ * physical address while L2 is running, bypassing any
+ * address-translation-based dirty tracking (e.g. EPT write
+ * protection).
+ *
+ * Mark them dirty on every exit from L2 to prevent them from
+ * getting out of sync with dirty tracking.
+ */
+ nested_mark_vmcs12_pages_dirty(vcpu);
+
+ if (nested_vmx_exit_reflected(vcpu, exit_reason))
+ return nested_vmx_reflect_vmexit(vcpu, exit_reason);
+ }
if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
dump_vmcs();
@@ -6223,15 +6247,13 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
/* if exit due to PF check for async PF */
- if (is_page_fault(vmx->exit_intr_info))
+ if (is_page_fault(vmx->exit_intr_info)) {
vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
-
/* Handle machine checks before interrupts are enabled */
- if (is_machine_check(vmx->exit_intr_info))
+ } else if (is_machine_check(vmx->exit_intr_info)) {
kvm_machine_check();
-
/* We need to handle NMIs before interrupts are enabled */
- if (is_nmi(vmx->exit_intr_info)) {
+ } else if (is_nmi(vmx->exit_intr_info)) {
kvm_before_interrupt(&vmx->vcpu);
asm("int $2");
kvm_after_interrupt(&vmx->vcpu);
@@ -6275,7 +6297,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
#endif
ASM_CALL_CONSTRAINT
:
- THUNK_TARGET(entry),
+ [thunk_target]"r"(entry),
[ss]"i"(__KERNEL_DS),
[cs]"i"(__KERNEL_CS)
);
@@ -6317,11 +6339,6 @@ static bool vmx_has_emulated_msr(int index)
}
}
-static bool vmx_pt_supported(void)
-{
- return pt_mode == PT_MODE_HOST_GUEST;
-}
-
static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
{
u32 exit_intr_info;
@@ -6567,7 +6584,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
pt_guest_enter(vmx);
- atomic_switch_perf_msrs(vmx);
+ if (vcpu_to_pmu(vcpu)->version)
+ atomic_switch_perf_msrs(vmx);
atomic_switch_umwait_control_msr(vmx);
if (enable_preemption_timer)
@@ -6684,20 +6702,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx_complete_interrupts(vmx);
}
-static struct kvm *vmx_vm_alloc(void)
-{
- struct kvm_vmx *kvm_vmx = __vmalloc(sizeof(struct kvm_vmx),
- GFP_KERNEL_ACCOUNT | __GFP_ZERO,
- PAGE_KERNEL);
- return &kvm_vmx->kvm;
-}
-
-static void vmx_vm_free(struct kvm *kvm)
-{
- kfree(kvm->arch.hyperv.hv_pa_pg);
- vfree(to_kvm_vmx(kvm));
-}
-
static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6900,17 +6904,24 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
u8 cache;
u64 ipat = 0;
- /* For VT-d and EPT combination
- * 1. MMIO: always map as UC
- * 2. EPT with VT-d:
- * a. VT-d without snooping control feature: can't guarantee the
- * result, try to trust guest.
- * b. VT-d with snooping control feature: snooping control feature of
- * VT-d engine can guarantee the cache correctness. Just set it
- * to WB to keep consistent with host. So the same as item 3.
- * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
- * consistent with host MTRR
+ /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
+ * memory aliases with conflicting memory types and sometimes MCEs.
+ * We have to be careful as to what are honored and when.
+ *
+ * For MMIO, guest CD/MTRR are ignored. The EPT memory type is set to
+ * UC. The effective memory type is UC or WC depending on guest PAT.
+ * This was historically the source of MCEs and we want to be
+ * conservative.
+ *
+ * When there is no need to deal with noncoherent DMA (e.g., no VT-d
+ * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored. The
+ * EPT memory type is set to WB. The effective memory type is forced
+ * WB.
+ *
+ * Otherwise, we trust guest. Guest CD/MTRR/PAT are all honored. The
+ * EPT memory type is used to emulate guest CD/MTRR.
*/
+
if (is_mmio) {
cache = MTRR_TYPE_UNCACHABLE;
goto exit;
@@ -6937,15 +6948,6 @@ exit:
return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
}
-static int vmx_get_lpage_level(void)
-{
- if (enable_ept && !cpu_has_vmx_ept_1g_page())
- return PT_DIRECTORY_LEVEL;
- else
- /* For shadow and EPT supported 1GB page */
- return PT_PDPE_LEVEL;
-}
-
static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
{
/*
@@ -7136,10 +7138,37 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
}
}
-static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
+static __init void vmx_set_cpu_caps(void)
{
- if (func == 1 && nested)
- entry->ecx |= feature_bit(VMX);
+ kvm_set_cpu_caps();
+
+ /* CPUID 0x1 */
+ if (nested)
+ kvm_cpu_cap_set(X86_FEATURE_VMX);
+
+ /* CPUID 0x7 */
+ if (kvm_mpx_supported())
+ kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
+ if (cpu_has_vmx_invpcid())
+ kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID);
+ if (vmx_pt_mode_is_host_guest())
+ kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
+
+ /* PKU is not yet implemented for shadow paging. */
+ if (enable_ept && boot_cpu_has(X86_FEATURE_OSPKE))
+ kvm_cpu_cap_check_and_set(X86_FEATURE_PKU);
+
+ if (vmx_umip_emulated())
+ kvm_cpu_cap_set(X86_FEATURE_UMIP);
+
+ /* CPUID 0xD.1 */
+ supported_xss = 0;
+ if (!vmx_xsaves_supported())
+ kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
+
+ /* CPUID 0x80000001 */
+ if (!cpu_has_vmx_rdtscp())
+ kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
}
static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
@@ -7183,10 +7212,10 @@ static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
static int vmx_check_intercept(struct kvm_vcpu *vcpu,
struct x86_instruction_info *info,
- enum x86_intercept_stage stage)
+ enum x86_intercept_stage stage,
+ struct x86_exception *exception)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
switch (info->intercept) {
/*
@@ -7195,8 +7224,8 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
*/
case x86_intercept_rdtscp:
if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
- ctxt->exception.vector = UD_VECTOR;
- ctxt->exception.error_code_valid = false;
+ exception->vector = UD_VECTOR;
+ exception->error_code_valid = false;
return X86EMUL_PROPAGATE_FAULT;
}
break;
@@ -7307,7 +7336,8 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
static void vmx_slot_enable_log_dirty(struct kvm *kvm,
struct kvm_memory_slot *slot)
{
- kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
+ if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
+ kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
}
@@ -7490,7 +7520,7 @@ static void pi_post_block(struct kvm_vcpu *vcpu)
static void vmx_post_block(struct kvm_vcpu *vcpu)
{
- if (kvm_x86_ops->set_hv_timer)
+ if (kvm_x86_ops.set_hv_timer)
kvm_lapic_switch_to_hv_timer(vcpu);
pi_post_block(vcpu);
@@ -7657,13 +7687,164 @@ static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
return to_vmx(vcpu)->nested.vmxon;
}
+static void hardware_unsetup(void)
+{
+ if (nested)
+ nested_vmx_hardware_unsetup();
+
+ free_kvm_area();
+}
+
+static bool vmx_check_apicv_inhibit_reasons(ulong bit)
+{
+ ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
+ BIT(APICV_INHIBIT_REASON_HYPERV);
+
+ return supported & BIT(bit);
+}
+
+static struct kvm_x86_ops vmx_x86_ops __initdata = {
+ .hardware_unsetup = hardware_unsetup,
+
+ .hardware_enable = hardware_enable,
+ .hardware_disable = hardware_disable,
+ .cpu_has_accelerated_tpr = report_flexpriority,
+ .has_emulated_msr = vmx_has_emulated_msr,
+
+ .vm_size = sizeof(struct kvm_vmx),
+ .vm_init = vmx_vm_init,
+
+ .vcpu_create = vmx_create_vcpu,
+ .vcpu_free = vmx_free_vcpu,
+ .vcpu_reset = vmx_vcpu_reset,
+
+ .prepare_guest_switch = vmx_prepare_switch_to_guest,
+ .vcpu_load = vmx_vcpu_load,
+ .vcpu_put = vmx_vcpu_put,
+
+ .update_bp_intercept = update_exception_bitmap,
+ .get_msr_feature = vmx_get_msr_feature,
+ .get_msr = vmx_get_msr,
+ .set_msr = vmx_set_msr,
+ .get_segment_base = vmx_get_segment_base,
+ .get_segment = vmx_get_segment,
+ .set_segment = vmx_set_segment,
+ .get_cpl = vmx_get_cpl,
+ .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
+ .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
+ .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
+ .set_cr0 = vmx_set_cr0,
+ .set_cr4 = vmx_set_cr4,
+ .set_efer = vmx_set_efer,
+ .get_idt = vmx_get_idt,
+ .set_idt = vmx_set_idt,
+ .get_gdt = vmx_get_gdt,
+ .set_gdt = vmx_set_gdt,
+ .get_dr6 = vmx_get_dr6,
+ .set_dr6 = vmx_set_dr6,
+ .set_dr7 = vmx_set_dr7,
+ .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
+ .cache_reg = vmx_cache_reg,
+ .get_rflags = vmx_get_rflags,
+ .set_rflags = vmx_set_rflags,
+
+ .tlb_flush = vmx_flush_tlb,
+ .tlb_flush_gva = vmx_flush_tlb_gva,
+
+ .run = vmx_vcpu_run,
+ .handle_exit = vmx_handle_exit,
+ .skip_emulated_instruction = vmx_skip_emulated_instruction,
+ .update_emulated_instruction = vmx_update_emulated_instruction,
+ .set_interrupt_shadow = vmx_set_interrupt_shadow,
+ .get_interrupt_shadow = vmx_get_interrupt_shadow,
+ .patch_hypercall = vmx_patch_hypercall,
+ .set_irq = vmx_inject_irq,
+ .set_nmi = vmx_inject_nmi,
+ .queue_exception = vmx_queue_exception,
+ .cancel_injection = vmx_cancel_injection,
+ .interrupt_allowed = vmx_interrupt_allowed,
+ .nmi_allowed = vmx_nmi_allowed,
+ .get_nmi_mask = vmx_get_nmi_mask,
+ .set_nmi_mask = vmx_set_nmi_mask,
+ .enable_nmi_window = enable_nmi_window,
+ .enable_irq_window = enable_irq_window,
+ .update_cr8_intercept = update_cr8_intercept,
+ .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
+ .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
+ .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
+ .load_eoi_exitmap = vmx_load_eoi_exitmap,
+ .apicv_post_state_restore = vmx_apicv_post_state_restore,
+ .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons,
+ .hwapic_irr_update = vmx_hwapic_irr_update,
+ .hwapic_isr_update = vmx_hwapic_isr_update,
+ .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
+ .sync_pir_to_irr = vmx_sync_pir_to_irr,
+ .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
+ .dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt,
+
+ .set_tss_addr = vmx_set_tss_addr,
+ .set_identity_map_addr = vmx_set_identity_map_addr,
+ .get_tdp_level = get_ept_level,
+ .get_mt_mask = vmx_get_mt_mask,
+
+ .get_exit_info = vmx_get_exit_info,
+
+ .cpuid_update = vmx_cpuid_update,
+
+ .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
+
+ .read_l1_tsc_offset = vmx_read_l1_tsc_offset,
+ .write_l1_tsc_offset = vmx_write_l1_tsc_offset,
+
+ .load_mmu_pgd = vmx_load_mmu_pgd,
+
+ .check_intercept = vmx_check_intercept,
+ .handle_exit_irqoff = vmx_handle_exit_irqoff,
+
+ .request_immediate_exit = vmx_request_immediate_exit,
+
+ .sched_in = vmx_sched_in,
+
+ .slot_enable_log_dirty = vmx_slot_enable_log_dirty,
+ .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
+ .flush_log_dirty = vmx_flush_log_dirty,
+ .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
+ .write_log_dirty = vmx_write_pml_buffer,
+
+ .pre_block = vmx_pre_block,
+ .post_block = vmx_post_block,
+
+ .pmu_ops = &intel_pmu_ops,
+
+ .update_pi_irte = vmx_update_pi_irte,
+
+#ifdef CONFIG_X86_64
+ .set_hv_timer = vmx_set_hv_timer,
+ .cancel_hv_timer = vmx_cancel_hv_timer,
+#endif
+
+ .setup_mce = vmx_setup_mce,
+
+ .smi_allowed = vmx_smi_allowed,
+ .pre_enter_smm = vmx_pre_enter_smm,
+ .pre_leave_smm = vmx_pre_leave_smm,
+ .enable_smi_window = enable_smi_window,
+
+ .check_nested_events = NULL,
+ .get_nested_state = NULL,
+ .set_nested_state = NULL,
+ .get_vmcs12_pages = NULL,
+ .nested_enable_evmcs = NULL,
+ .nested_get_evmcs_version = NULL,
+ .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
+ .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
+};
+
static __init int hardware_setup(void)
{
unsigned long host_bndcfgs;
struct desc_ptr dt;
- int r, i;
-
- rdmsrl_safe(MSR_EFER, &host_efer);
+ int r, i, ept_lpage_level;
store_idt(&dt);
host_idt_base = dt.address;
@@ -7682,6 +7863,10 @@ static __init int hardware_setup(void)
WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
}
+ if (!cpu_has_vmx_mpx())
+ supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
+ XFEATURE_MASK_BNDCSR);
+
if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
!(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
enable_vpid = 0;
@@ -7710,19 +7895,16 @@ static __init int hardware_setup(void)
* using the APIC_ACCESS_ADDR VMCS field.
*/
if (!flexpriority_enabled)
- kvm_x86_ops->set_apic_access_page_addr = NULL;
+ vmx_x86_ops.set_apic_access_page_addr = NULL;
if (!cpu_has_vmx_tpr_shadow())
- kvm_x86_ops->update_cr8_intercept = NULL;
-
- if (enable_ept && !cpu_has_vmx_ept_2m_page())
- kvm_disable_largepages();
+ vmx_x86_ops.update_cr8_intercept = NULL;
#if IS_ENABLED(CONFIG_HYPERV)
if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
&& enable_ept) {
- kvm_x86_ops->tlb_remote_flush = hv_remote_flush_tlb;
- kvm_x86_ops->tlb_remote_flush_with_range =
+ vmx_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
+ vmx_x86_ops.tlb_remote_flush_with_range =
hv_remote_flush_tlb_with_range;
}
#endif
@@ -7737,7 +7919,7 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_apicv()) {
enable_apicv = 0;
- kvm_x86_ops->sync_pir_to_irr = NULL;
+ vmx_x86_ops.sync_pir_to_irr = NULL;
}
if (cpu_has_vmx_tsc_scaling()) {
@@ -7750,8 +7932,16 @@ static __init int hardware_setup(void)
if (enable_ept)
vmx_enable_tdp();
+
+ if (!enable_ept)
+ ept_lpage_level = 0;
+ else if (cpu_has_vmx_ept_1g_page())
+ ept_lpage_level = PT_PDPE_LEVEL;
+ else if (cpu_has_vmx_ept_2m_page())
+ ept_lpage_level = PT_DIRECTORY_LEVEL;
else
- kvm_disable_tdp();
+ ept_lpage_level = PT_PAGE_TABLE_LEVEL;
+ kvm_configure_mmu(enable_ept, ept_lpage_level);
/*
* Only enable PML when hardware supports PML feature, and both EPT
@@ -7761,10 +7951,10 @@ static __init int hardware_setup(void)
enable_pml = 0;
if (!enable_pml) {
- kvm_x86_ops->slot_enable_log_dirty = NULL;
- kvm_x86_ops->slot_disable_log_dirty = NULL;
- kvm_x86_ops->flush_log_dirty = NULL;
- kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
+ vmx_x86_ops.slot_enable_log_dirty = NULL;
+ vmx_x86_ops.slot_disable_log_dirty = NULL;
+ vmx_x86_ops.flush_log_dirty = NULL;
+ vmx_x86_ops.enable_log_dirty_pt_masked = NULL;
}
if (!cpu_has_vmx_preemption_timer())
@@ -7792,9 +7982,9 @@ static __init int hardware_setup(void)
}
if (!enable_preemption_timer) {
- kvm_x86_ops->set_hv_timer = NULL;
- kvm_x86_ops->cancel_hv_timer = NULL;
- kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit;
+ vmx_x86_ops.set_hv_timer = NULL;
+ vmx_x86_ops.cancel_hv_timer = NULL;
+ vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
}
kvm_set_posted_intr_wakeup_handler(wakeup_handler);
@@ -7810,185 +8000,27 @@ static __init int hardware_setup(void)
nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
vmx_capability.ept);
- r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
+ r = nested_vmx_hardware_setup(&vmx_x86_ops,
+ kvm_vmx_exit_handlers);
if (r)
return r;
}
+ vmx_set_cpu_caps();
+
r = alloc_kvm_area();
if (r)
nested_vmx_hardware_unsetup();
return r;
}
-static __exit void hardware_unsetup(void)
-{
- if (nested)
- nested_vmx_hardware_unsetup();
-
- free_kvm_area();
-}
-
-static bool vmx_check_apicv_inhibit_reasons(ulong bit)
-{
- ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
- BIT(APICV_INHIBIT_REASON_HYPERV);
-
- return supported & BIT(bit);
-}
-
-static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
+static struct kvm_x86_init_ops vmx_init_ops __initdata = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
- .hardware_setup = hardware_setup,
- .hardware_unsetup = hardware_unsetup,
.check_processor_compatibility = vmx_check_processor_compat,
- .hardware_enable = hardware_enable,
- .hardware_disable = hardware_disable,
- .cpu_has_accelerated_tpr = report_flexpriority,
- .has_emulated_msr = vmx_has_emulated_msr,
-
- .vm_init = vmx_vm_init,
- .vm_alloc = vmx_vm_alloc,
- .vm_free = vmx_vm_free,
-
- .vcpu_create = vmx_create_vcpu,
- .vcpu_free = vmx_free_vcpu,
- .vcpu_reset = vmx_vcpu_reset,
-
- .prepare_guest_switch = vmx_prepare_switch_to_guest,
- .vcpu_load = vmx_vcpu_load,
- .vcpu_put = vmx_vcpu_put,
-
- .update_bp_intercept = update_exception_bitmap,
- .get_msr_feature = vmx_get_msr_feature,
- .get_msr = vmx_get_msr,
- .set_msr = vmx_set_msr,
- .get_segment_base = vmx_get_segment_base,
- .get_segment = vmx_get_segment,
- .set_segment = vmx_set_segment,
- .get_cpl = vmx_get_cpl,
- .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
- .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
- .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
- .set_cr0 = vmx_set_cr0,
- .set_cr3 = vmx_set_cr3,
- .set_cr4 = vmx_set_cr4,
- .set_efer = vmx_set_efer,
- .get_idt = vmx_get_idt,
- .set_idt = vmx_set_idt,
- .get_gdt = vmx_get_gdt,
- .set_gdt = vmx_set_gdt,
- .get_dr6 = vmx_get_dr6,
- .set_dr6 = vmx_set_dr6,
- .set_dr7 = vmx_set_dr7,
- .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
- .cache_reg = vmx_cache_reg,
- .get_rflags = vmx_get_rflags,
- .set_rflags = vmx_set_rflags,
-
- .tlb_flush = vmx_flush_tlb,
- .tlb_flush_gva = vmx_flush_tlb_gva,
-
- .run = vmx_vcpu_run,
- .handle_exit = vmx_handle_exit,
- .skip_emulated_instruction = vmx_skip_emulated_instruction,
- .update_emulated_instruction = vmx_update_emulated_instruction,
- .set_interrupt_shadow = vmx_set_interrupt_shadow,
- .get_interrupt_shadow = vmx_get_interrupt_shadow,
- .patch_hypercall = vmx_patch_hypercall,
- .set_irq = vmx_inject_irq,
- .set_nmi = vmx_inject_nmi,
- .queue_exception = vmx_queue_exception,
- .cancel_injection = vmx_cancel_injection,
- .interrupt_allowed = vmx_interrupt_allowed,
- .nmi_allowed = vmx_nmi_allowed,
- .get_nmi_mask = vmx_get_nmi_mask,
- .set_nmi_mask = vmx_set_nmi_mask,
- .enable_nmi_window = enable_nmi_window,
- .enable_irq_window = enable_irq_window,
- .update_cr8_intercept = update_cr8_intercept,
- .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
- .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
- .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
- .load_eoi_exitmap = vmx_load_eoi_exitmap,
- .apicv_post_state_restore = vmx_apicv_post_state_restore,
- .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons,
- .hwapic_irr_update = vmx_hwapic_irr_update,
- .hwapic_isr_update = vmx_hwapic_isr_update,
- .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
- .sync_pir_to_irr = vmx_sync_pir_to_irr,
- .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
- .dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt,
-
- .set_tss_addr = vmx_set_tss_addr,
- .set_identity_map_addr = vmx_set_identity_map_addr,
- .get_tdp_level = get_ept_level,
- .get_mt_mask = vmx_get_mt_mask,
-
- .get_exit_info = vmx_get_exit_info,
-
- .get_lpage_level = vmx_get_lpage_level,
-
- .cpuid_update = vmx_cpuid_update,
-
- .rdtscp_supported = vmx_rdtscp_supported,
- .invpcid_supported = vmx_invpcid_supported,
-
- .set_supported_cpuid = vmx_set_supported_cpuid,
-
- .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
-
- .read_l1_tsc_offset = vmx_read_l1_tsc_offset,
- .write_l1_tsc_offset = vmx_write_l1_tsc_offset,
-
- .set_tdp_cr3 = vmx_set_cr3,
-
- .check_intercept = vmx_check_intercept,
- .handle_exit_irqoff = vmx_handle_exit_irqoff,
- .mpx_supported = vmx_mpx_supported,
- .xsaves_supported = vmx_xsaves_supported,
- .umip_emulated = vmx_umip_emulated,
- .pt_supported = vmx_pt_supported,
- .pku_supported = vmx_pku_supported,
-
- .request_immediate_exit = vmx_request_immediate_exit,
-
- .sched_in = vmx_sched_in,
-
- .slot_enable_log_dirty = vmx_slot_enable_log_dirty,
- .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
- .flush_log_dirty = vmx_flush_log_dirty,
- .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
- .write_log_dirty = vmx_write_pml_buffer,
-
- .pre_block = vmx_pre_block,
- .post_block = vmx_post_block,
-
- .pmu_ops = &intel_pmu_ops,
-
- .update_pi_irte = vmx_update_pi_irte,
-
-#ifdef CONFIG_X86_64
- .set_hv_timer = vmx_set_hv_timer,
- .cancel_hv_timer = vmx_cancel_hv_timer,
-#endif
-
- .setup_mce = vmx_setup_mce,
-
- .smi_allowed = vmx_smi_allowed,
- .pre_enter_smm = vmx_pre_enter_smm,
- .pre_leave_smm = vmx_pre_leave_smm,
- .enable_smi_window = enable_smi_window,
+ .hardware_setup = hardware_setup,
- .check_nested_events = NULL,
- .get_nested_state = NULL,
- .set_nested_state = NULL,
- .get_vmcs12_pages = NULL,
- .nested_enable_evmcs = NULL,
- .nested_get_evmcs_version = NULL,
- .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
- .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
+ .runtime_ops = &vmx_x86_ops,
};
static void vmx_cleanup_l1d_flush(void)
@@ -8039,7 +8071,7 @@ module_exit(vmx_exit);
static int __init vmx_init(void)
{
- int r;
+ int r, cpu;
#if IS_ENABLED(CONFIG_HYPERV)
/*
@@ -8075,7 +8107,7 @@ static int __init vmx_init(void)
}
#endif
- r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
+ r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
__alignof__(struct vcpu_vmx), THIS_MODULE);
if (r)
return r;
@@ -8093,6 +8125,12 @@ static int __init vmx_init(void)
return r;
}
+ for_each_possible_cpu(cpu) {
+ INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+ INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
+ spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+ }
+
#ifdef CONFIG_KEXEC_CORE
rcu_assign_pointer(crash_vmclear_loaded_vmcss,
crash_vmclear_local_loaded_vmcss);
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index e64da06c7009..aab9df55336e 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -12,9 +12,6 @@
#include "vmcs.h"
extern const u32 vmx_msr_index[];
-extern u64 host_efer;
-
-extern u32 get_umwait_control_msr(void);
#define MSR_TYPE_R 1
#define MSR_TYPE_W 2
@@ -335,9 +332,9 @@ u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu);
void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask);
void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer);
void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
-void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
void set_cr4_guest_host_mask(struct vcpu_vmx *vmx);
+void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3);
void ept_save_pdptrs(struct kvm_vcpu *vcpu);
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
@@ -452,7 +449,7 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
static inline u32 vmx_vmentry_ctrl(void)
{
u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
- if (pt_mode == PT_MODE_SYSTEM)
+ if (vmx_pt_mode_is_system())
vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
VM_ENTRY_LOAD_IA32_RTIT_CTL);
/* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
@@ -463,7 +460,7 @@ static inline u32 vmx_vmentry_ctrl(void)
static inline u32 vmx_vmexit_ctrl(void)
{
u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
- if (pt_mode == PT_MODE_SYSTEM)
+ if (vmx_pt_mode_is_system())
vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
VM_EXIT_CLEAR_IA32_RTIT_CTL);
/* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
@@ -493,7 +490,6 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags);
void free_vmcs(struct vmcs *vmcs);
int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
-void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs);
void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs);
static inline struct vmcs *alloc_vmcs(bool shadow)