diff options
Diffstat (limited to 'arch/x86/kvm/vmx')
-rw-r--r-- | arch/x86/kvm/vmx/capabilities.h | 25 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/evmcs.h | 7 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/nested.c | 191 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/nested.h | 8 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/ops.h | 27 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/pmu_intel.c | 8 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmenter.S | 80 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 736 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.h | 10 |
9 files changed, 617 insertions, 475 deletions
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index f486e2606247..8903475f751e 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -101,7 +101,7 @@ static inline bool cpu_has_load_perf_global_ctrl(void) (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); } -static inline bool vmx_mpx_supported(void) +static inline bool cpu_has_vmx_mpx(void) { return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); @@ -146,11 +146,6 @@ static inline bool vmx_umip_emulated(void) SECONDARY_EXEC_DESC; } -static inline bool vmx_pku_supported(void) -{ - return boot_cpu_has(X86_FEATURE_PKU); -} - static inline bool cpu_has_vmx_rdtscp(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -354,4 +349,22 @@ static inline bool cpu_has_vmx_intel_pt(void) (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL); } +/* + * Processor Trace can operate in one of three modes: + * a. system-wide: trace both host/guest and output to host buffer + * b. host-only: only trace host and output to host buffer + * c. host-guest: trace host and guest simultaneously and output to their + * respective buffer + * + * KVM currently only supports (a) and (c). + */ +static inline bool vmx_pt_mode_is_system(void) +{ + return pt_mode == PT_MODE_SYSTEM; +} +static inline bool vmx_pt_mode_is_host_guest(void) +{ + return pt_mode == PT_MODE_HOST_GUEST; +} + #endif /* __KVM_X86_VMX_CAPS_H */ diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 6de47f2569c9..e5f7a7ebf27d 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -198,6 +198,13 @@ static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {} static inline void evmcs_touch_msr_bitmap(void) {} #endif /* IS_ENABLED(CONFIG_HYPERV) */ +enum nested_evmptrld_status { + EVMPTRLD_DISABLED, + EVMPTRLD_SUCCEEDED, + EVMPTRLD_VMFAIL, + EVMPTRLD_ERROR, +}; + bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa); uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu); int nested_enable_evmcs(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e920d7834d73..cbc9ea2de28f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -224,7 +224,7 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) return; kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); - vmx->nested.hv_evmcs_vmptr = -1ull; + vmx->nested.hv_evmcs_vmptr = 0; vmx->nested.hv_evmcs = NULL; } @@ -353,9 +353,8 @@ static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) to_vmx(vcpu)->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT, nested_ept_ad_enabled(vcpu), - nested_ept_get_cr3(vcpu)); - vcpu->arch.mmu->set_cr3 = vmx_set_cr3; - vcpu->arch.mmu->get_cr3 = nested_ept_get_cr3; + nested_ept_get_eptp(vcpu)); + vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp; vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; @@ -1910,20 +1909,21 @@ static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) * This is an equivalent of the nested hypervisor executing the vmptrld * instruction. */ -static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, - bool from_launch) +static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( + struct kvm_vcpu *vcpu, bool from_launch) { struct vcpu_vmx *vmx = to_vmx(vcpu); bool evmcs_gpa_changed = false; u64 evmcs_gpa; if (likely(!vmx->nested.enlightened_vmcs_enabled)) - return 1; + return EVMPTRLD_DISABLED; if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) - return 1; + return EVMPTRLD_DISABLED; - if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { + if (unlikely(!vmx->nested.hv_evmcs || + evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { if (!vmx->nested.hv_evmcs) vmx->nested.current_vmptr = -1ull; @@ -1931,7 +1931,7 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa), &vmx->nested.hv_evmcs_map)) - return 0; + return EVMPTRLD_ERROR; vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva; @@ -1960,7 +1960,7 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { nested_release_evmcs(vcpu); - return 0; + return EVMPTRLD_VMFAIL; } vmx->nested.dirty_vmcs12 = true; @@ -1989,21 +1989,13 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, vmx->nested.hv_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - return 1; + return EVMPTRLD_SUCCEEDED; } void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - /* - * hv_evmcs may end up being not mapped after migration (when - * L2 was running), map it here to make sure vmcs12 changes are - * properly reflected. - */ - if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) - nested_vmx_handle_enlightened_vmptrld(vcpu, false); - if (vmx->nested.hv_evmcs) { copy_vmcs12_to_enlightened(vmx); /* All fields are clean */ @@ -2474,9 +2466,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * If L1 use EPT, then L0 needs to execute INVEPT on * EPTP02 instead of EPTP01. Therefore, delay TLB * flush until vmcs02->eptp is fully updated by - * KVM_REQ_LOAD_CR3. Note that this assumes + * KVM_REQ_LOAD_MMU_PGD. Note that this assumes * KVM_REQ_TLB_FLUSH is evaluated after - * KVM_REQ_LOAD_CR3 in vcpu_enter_guest(). + * KVM_REQ_LOAD_MMU_PGD in vcpu_enter_guest(). */ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } @@ -2521,7 +2513,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, /* * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12 * on nested VM-Exit, which can occur without actually running L2 and - * thus without hitting vmx_set_cr3(), e.g. if L1 is entering L2 with + * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the * transition to HLT instead of running L2. */ @@ -2563,13 +2555,13 @@ static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) return 0; } -static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address) +static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) { struct vcpu_vmx *vmx = to_vmx(vcpu); int maxphyaddr = cpuid_maxphyaddr(vcpu); /* Check for memory type validity */ - switch (address & VMX_EPTP_MT_MASK) { + switch (new_eptp & VMX_EPTP_MT_MASK) { case VMX_EPTP_MT_UC: if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) return false; @@ -2582,16 +2574,26 @@ static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address) return false; } - /* only 4 levels page-walk length are valid */ - if (CC((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4)) + /* Page-walk levels validity. */ + switch (new_eptp & VMX_EPTP_PWL_MASK) { + case VMX_EPTP_PWL_5: + if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) + return false; + break; + case VMX_EPTP_PWL_4: + if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) + return false; + break; + default: return false; + } /* Reserved bits should not be set */ - if (CC(address >> maxphyaddr || ((address >> 7) & 0x1f))) + if (CC(new_eptp >> maxphyaddr || ((new_eptp >> 7) & 0x1f))) return false; /* AD, if set, should be supported */ - if (address & VMX_EPTP_AD_ENABLE_BIT) { + if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) return false; } @@ -2640,7 +2642,7 @@ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, return -EINVAL; if (nested_cpu_has_ept(vmcs12) && - CC(!valid_ept_address(vcpu, vmcs12->ept_pointer))) + CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) return -EINVAL; if (nested_cpu_has_vmfunc(vmcs12)) { @@ -2960,7 +2962,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) /* * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to - * be written (by preparve_vmcs02()) before the "real" VMEnter, i.e. + * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. * there is no need to preserve other bits or save/restore the field. */ vmcs_writel(GUEST_RFLAGS, 0); @@ -3052,6 +3054,27 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) struct page *page; u64 hpa; + /* + * hv_evmcs may end up being not mapped after migration (when + * L2 was running), map it here to make sure vmcs12 changes are + * properly reflected. + */ + if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) { + enum nested_evmptrld_status evmptrld_status = + nested_vmx_handle_enlightened_vmptrld(vcpu, false); + + if (evmptrld_status == EVMPTRLD_VMFAIL || + evmptrld_status == EVMPTRLD_ERROR) { + pr_debug_ratelimited("%s: enlightened vmptrld failed\n", + __func__); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = + KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return false; + } + } + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { /* * Translate L1 physical address to host physical @@ -3315,12 +3338,18 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) enum nvmx_vmentry_status status; struct vcpu_vmx *vmx = to_vmx(vcpu); u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); + enum nested_evmptrld_status evmptrld_status; if (!nested_vmx_check_permission(vcpu)) return 1; - if (!nested_vmx_handle_enlightened_vmptrld(vcpu, launch)) + evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch); + if (evmptrld_status == EVMPTRLD_ERROR) { + kvm_queue_exception(vcpu, UD_VECTOR); return 1; + } else if (evmptrld_status == EVMPTRLD_VMFAIL) { + return nested_vmx_failInvalid(vcpu); + } if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull) return nested_vmx_failInvalid(vcpu); @@ -3498,7 +3527,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, } -static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) +void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); gfn_t gfn; @@ -3603,7 +3632,7 @@ static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) vcpu->arch.exception.payload); } -static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) +static int vmx_check_nested_events(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qual; @@ -3616,7 +3645,8 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) * Clear the MTF state. If a higher priority VM-exit is delivered first, * this state is discarded. */ - vmx->nested.mtf_pending = false; + if (!block_nested_events) + vmx->nested.mtf_pending = false; if (lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &apic->pending_events)) { @@ -3679,8 +3709,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) return 0; } - if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && - nested_exit_on_intr(vcpu)) { + if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(vcpu)) { if (block_nested_events) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); @@ -4023,7 +4052,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * * If vmcs12 uses EPT, we need to execute this flush on EPTP01 * and therefore we request the TLB flush to happen only after VMCS EPTP - * has been set by KVM_REQ_LOAD_CR3. + * has been set by KVM_REQ_LOAD_MMU_PGD. */ if (enable_vpid && (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) { @@ -4328,17 +4357,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; if (likely(!vmx->fail)) { - /* - * TODO: SDM says that with acknowledge interrupt on - * exit, bit 31 of the VM-exit interrupt information - * (valid interrupt) is always set to 1 on - * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't - * need kvm_cpu_has_interrupt(). See the commit - * message for details. - */ - if (nested_exit_intr_ack_set(vcpu) && - exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && - kvm_cpu_has_interrupt(vcpu)) { + if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && + nested_exit_intr_ack_set(vcpu)) { int irq = kvm_cpu_get_interrupt(vcpu); WARN_ON(irq < 0); vmcs12->vm_exit_intr_info = irq | @@ -4382,7 +4402,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, * Decode the memory-address operand of a vmx instruction, as recorded on an * exit caused by such an instruction (run by a guest hypervisor). * On success, returns 0. When the operand is invalid, returns 1 and throws - * #UD or #GP. + * #UD, #GP, or #SS. */ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, u32 vmx_instruction_info, bool wr, int len, gva_t *ret) @@ -4423,7 +4443,7 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, if (base_is_valid) off += kvm_register_read(vcpu, base_reg); if (index_is_valid) - off += kvm_register_read(vcpu, index_reg)<<scaling; + off += kvm_register_read(vcpu, index_reg) << scaling; vmx_get_segment(vcpu, &s, seg_reg); /* @@ -4516,7 +4536,7 @@ void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu) return; vmx = to_vmx(vcpu); - if (kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) { + if (kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) { vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; vmx->nested.msrs.exit_ctls_high |= @@ -4602,7 +4622,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) vmx->nested.vmcs02_initialized = false; vmx->nested.vmxon = true; - if (pt_mode == PT_MODE_HOST_GUEST) { + if (vmx_pt_mode_is_host_guest()) { vmx->pt_desc.guest.ctl = 0; pt_update_intercept_for_msr(vmx); } @@ -5234,7 +5254,7 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { u32 index = kvm_rcx_read(vcpu); - u64 address; + u64 new_eptp; bool accessed_dirty; struct kvm_mmu *mmu = vcpu->arch.walk_mmu; @@ -5247,23 +5267,23 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, - &address, index * 8, 8)) + &new_eptp, index * 8, 8)) return 1; - accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT); + accessed_dirty = !!(new_eptp & VMX_EPTP_AD_ENABLE_BIT); /* * If the (L2) guest does a vmfunc to the currently * active ept pointer, we don't have to do anything else */ - if (vmcs12->ept_pointer != address) { - if (!valid_ept_address(vcpu, address)) + if (vmcs12->ept_pointer != new_eptp) { + if (!nested_vmx_check_eptp(vcpu, new_eptp)) return 1; kvm_mmu_unload(vcpu); mmu->ept_ad = accessed_dirty; mmu->mmu_role.base.ad_disabled = !accessed_dirty; - vmcs12->ept_pointer = address; + vmcs12->ept_pointer = new_eptp; /* * TODO: Check what's the correct approach in case * mmu reload fails. Currently, we just let the next @@ -5524,8 +5544,7 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - if (vmx->nested.nested_run_pending) - return false; + WARN_ON_ONCE(vmx->nested.nested_run_pending); if (unlikely(vmx->fail)) { trace_kvm_nested_vmenter_failed( @@ -5534,19 +5553,6 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) return true; } - /* - * The host physical addresses of some pages of guest memory - * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC - * Page). The CPU may write to these pages via their host - * physical address while L2 is running, bypassing any - * address-translation-based dirty tracking (e.g. EPT write - * protection). - * - * Mark them dirty on every exit from L2 to prevent them from - * getting out of sync with dirty tracking. - */ - nested_mark_vmcs12_pages_dirty(vcpu); - trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, vmcs_readl(EXIT_QUALIFICATION), vmx->idt_vectoring_info, @@ -5627,7 +5633,7 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) case EXIT_REASON_MWAIT_INSTRUCTION: return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); case EXIT_REASON_MONITOR_TRAP_FLAG: - return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG); + return nested_cpu_has_mtf(vmcs12); case EXIT_REASON_MONITOR_INSTRUCTION: return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); case EXIT_REASON_PAUSE_INSTRUCTION: @@ -5904,10 +5910,12 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { /* - * Sync eVMCS upon entry as we may not have - * HV_X64_MSR_VP_ASSIST_PAGE set up yet. + * nested_vmx_handle_enlightened_vmptrld() cannot be called + * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be + * restored yet. EVMCS will be mapped from + * nested_get_vmcs12_pages(). */ - vmx->nested.need_vmcs12_to_shadow_sync = true; + kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); } else { return -EINVAL; } @@ -6129,11 +6137,13 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) /* nested EPT: emulate EPT also to L1 */ msrs->secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; - msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT | - VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; - if (cpu_has_vmx_ept_execute_only()) - msrs->ept_caps |= - VMX_EPT_EXECUTE_ONLY_BIT; + msrs->ept_caps = + VMX_EPT_PAGE_WALK_4_BIT | + VMX_EPT_PAGE_WALK_5_BIT | + VMX_EPTP_WB_BIT | + VMX_EPT_INVEPT_BIT | + VMX_EPT_EXECUTE_ONLY_BIT; + msrs->ept_caps &= ept_caps; msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | @@ -6232,7 +6242,8 @@ void nested_vmx_hardware_unsetup(void) } } -__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) +__init int nested_vmx_hardware_setup(struct kvm_x86_ops *ops, + int (*exit_handlers[])(struct kvm_vcpu *)) { int i; @@ -6268,12 +6279,12 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; - kvm_x86_ops->check_nested_events = vmx_check_nested_events; - kvm_x86_ops->get_nested_state = vmx_get_nested_state; - kvm_x86_ops->set_nested_state = vmx_set_nested_state; - kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages; - kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs; - kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version; + ops->check_nested_events = vmx_check_nested_events; + ops->get_nested_state = vmx_get_nested_state; + ops->set_nested_state = vmx_set_nested_state; + ops->get_vmcs12_pages = nested_get_vmcs12_pages; + ops->nested_enable_evmcs = nested_enable_evmcs; + ops->nested_get_evmcs_version = nested_get_evmcs_version; return 0; } diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 9aeda46f473e..ac56aefa49e3 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -19,7 +19,8 @@ enum nvmx_vmentry_status { void vmx_leave_nested(struct kvm_vcpu *vcpu); void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps); void nested_vmx_hardware_unsetup(void); -__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)); +__init int nested_vmx_hardware_setup(struct kvm_x86_ops *ops, + int (*exit_handlers[])(struct kvm_vcpu *)); void nested_vmx_set_vmcs_shadowing_bitmap(void); void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu); enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, @@ -33,6 +34,7 @@ int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, u32 vmx_instruction_info, bool wr, int len, gva_t *ret); void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu); +void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu); bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, int size); @@ -60,7 +62,7 @@ static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu) vmx->nested.hv_evmcs; } -static inline unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) +static inline unsigned long nested_ept_get_eptp(struct kvm_vcpu *vcpu) { /* return the page table to be shadowed - in our case, EPT12 */ return get_vmcs12(vcpu)->ept_pointer; @@ -68,7 +70,7 @@ static inline unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) static inline bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu) { - return nested_ept_get_cr3(vcpu) & VMX_EPTP_AD_ENABLE_BIT; + return nested_ept_get_eptp(vcpu) & VMX_EPTP_AD_ENABLE_BIT; } /* diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/ops.h index 45eaedee2ac0..19717d0a1100 100644 --- a/arch/x86/kvm/vmx/ops.h +++ b/arch/x86/kvm/vmx/ops.h @@ -13,6 +13,8 @@ #define __ex(x) __kvm_handle_fault_on_reboot(x) asmlinkage void vmread_error(unsigned long field, bool fault); +__attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field, + bool fault); void vmwrite_error(unsigned long field, unsigned long value); void vmclear_error(struct vmcs *vmcs, u64 phys_addr); void vmptrld_error(struct vmcs *vmcs, u64 phys_addr); @@ -70,15 +72,28 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) asm volatile("1: vmread %2, %1\n\t" ".byte 0x3e\n\t" /* branch taken hint */ "ja 3f\n\t" - "mov %2, %%" _ASM_ARG1 "\n\t" - "xor %%" _ASM_ARG2 ", %%" _ASM_ARG2 "\n\t" - "2: call vmread_error\n\t" - "xor %k1, %k1\n\t" + + /* + * VMREAD failed. Push '0' for @fault, push the failing + * @field, and bounce through the trampoline to preserve + * volatile registers. + */ + "push $0\n\t" + "push %2\n\t" + "2:call vmread_error_trampoline\n\t" + + /* + * Unwind the stack. Note, the trampoline zeros out the + * memory for @fault so that the result is '0' on error. + */ + "pop %2\n\t" + "pop %1\n\t" "3:\n\t" + /* VMREAD faulted. As above, except push '1' for @fault. */ ".pushsection .fixup, \"ax\"\n\t" - "4: mov %2, %%" _ASM_ARG1 "\n\t" - "mov $1, %%" _ASM_ARG2 "\n\t" + "4: push $1\n\t" + "push %2\n\t" "jmp 2b\n\t" ".popsection\n\t" _ASM_EXTABLE(1b, 4b) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index fd21cdb10b79..7c857737b438 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -263,9 +263,15 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated) data = (s64)(s32)data; pmc->counter += data - pmc_read_counter(pmc); + if (pmc->perf_event) + perf_event_period(pmc->perf_event, + get_sample_period(pmc, data)); return 0; } else if ((pmc = get_fixed_pmc(pmu, msr))) { pmc->counter += data - pmc_read_counter(pmc); + if (pmc->perf_event) + perf_event_period(pmc->perf_event, + get_sample_period(pmc, data)); return 0; } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { if (data == pmc->eventsel) @@ -329,7 +335,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF | MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD); - if (kvm_x86_ops->pt_supported()) + if (vmx_pt_mode_is_host_guest()) pmu->global_ovf_ctrl_mask &= ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 81ada2ce99e7..87f3f24fef37 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -58,12 +58,8 @@ SYM_FUNC_START(vmx_vmenter) ret 4: ud2 - .pushsection .fixup, "ax" -5: jmp 3b - .popsection - - _ASM_EXTABLE(1b, 5b) - _ASM_EXTABLE(2b, 5b) + _ASM_EXTABLE(1b, 3b) + _ASM_EXTABLE(2b, 3b) SYM_FUNC_END(vmx_vmenter) @@ -135,12 +131,12 @@ SYM_FUNC_START(__vmx_vcpu_run) cmpb $0, %bl /* Load guest registers. Don't clobber flags. */ - mov VCPU_RBX(%_ASM_AX), %_ASM_BX mov VCPU_RCX(%_ASM_AX), %_ASM_CX mov VCPU_RDX(%_ASM_AX), %_ASM_DX + mov VCPU_RBX(%_ASM_AX), %_ASM_BX + mov VCPU_RBP(%_ASM_AX), %_ASM_BP mov VCPU_RSI(%_ASM_AX), %_ASM_SI mov VCPU_RDI(%_ASM_AX), %_ASM_DI - mov VCPU_RBP(%_ASM_AX), %_ASM_BP #ifdef CONFIG_X86_64 mov VCPU_R8 (%_ASM_AX), %r8 mov VCPU_R9 (%_ASM_AX), %r9 @@ -168,12 +164,12 @@ SYM_FUNC_START(__vmx_vcpu_run) /* Save all guest registers, including RAX from the stack */ __ASM_SIZE(pop) VCPU_RAX(%_ASM_AX) - mov %_ASM_BX, VCPU_RBX(%_ASM_AX) mov %_ASM_CX, VCPU_RCX(%_ASM_AX) mov %_ASM_DX, VCPU_RDX(%_ASM_AX) + mov %_ASM_BX, VCPU_RBX(%_ASM_AX) + mov %_ASM_BP, VCPU_RBP(%_ASM_AX) mov %_ASM_SI, VCPU_RSI(%_ASM_AX) mov %_ASM_DI, VCPU_RDI(%_ASM_AX) - mov %_ASM_BP, VCPU_RBP(%_ASM_AX) #ifdef CONFIG_X86_64 mov %r8, VCPU_R8 (%_ASM_AX) mov %r9, VCPU_R9 (%_ASM_AX) @@ -197,12 +193,12 @@ SYM_FUNC_START(__vmx_vcpu_run) * free. RSP and RAX are exempt as RSP is restored by hardware during * VM-Exit and RAX is explicitly loaded with 0 or 1 to return VM-Fail. */ -1: xor %ebx, %ebx - xor %ecx, %ecx +1: xor %ecx, %ecx xor %edx, %edx + xor %ebx, %ebx + xor %ebp, %ebp xor %esi, %esi xor %edi, %edi - xor %ebp, %ebp #ifdef CONFIG_X86_64 xor %r8d, %r8d xor %r9d, %r9d @@ -234,3 +230,61 @@ SYM_FUNC_START(__vmx_vcpu_run) 2: mov $1, %eax jmp 1b SYM_FUNC_END(__vmx_vcpu_run) + +/** + * vmread_error_trampoline - Trampoline from inline asm to vmread_error() + * @field: VMCS field encoding that failed + * @fault: %true if the VMREAD faulted, %false if it failed + + * Save and restore volatile registers across a call to vmread_error(). Note, + * all parameters are passed on the stack. + */ +SYM_FUNC_START(vmread_error_trampoline) + push %_ASM_BP + mov %_ASM_SP, %_ASM_BP + + push %_ASM_AX + push %_ASM_CX + push %_ASM_DX +#ifdef CONFIG_X86_64 + push %rdi + push %rsi + push %r8 + push %r9 + push %r10 + push %r11 +#endif +#ifdef CONFIG_X86_64 + /* Load @field and @fault to arg1 and arg2 respectively. */ + mov 3*WORD_SIZE(%rbp), %_ASM_ARG2 + mov 2*WORD_SIZE(%rbp), %_ASM_ARG1 +#else + /* Parameters are passed on the stack for 32-bit (see asmlinkage). */ + push 3*WORD_SIZE(%ebp) + push 2*WORD_SIZE(%ebp) +#endif + + call vmread_error + +#ifndef CONFIG_X86_64 + add $8, %esp +#endif + + /* Zero out @fault, which will be popped into the result register. */ + _ASM_MOV $0, 3*WORD_SIZE(%_ASM_BP) + +#ifdef CONFIG_X86_64 + pop %r11 + pop %r10 + pop %r9 + pop %r8 + pop %rsi + pop %rdi +#endif + pop %_ASM_DX + pop %_ASM_CX + pop %_ASM_AX + pop %_ASM_BP + + ret +SYM_FUNC_END(vmread_error_trampoline) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 40b1e6138cd5..83050977490c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -31,6 +31,7 @@ #include <asm/apic.h> #include <asm/asm.h> #include <asm/cpu.h> +#include <asm/cpu_device_id.h> #include <asm/debugreg.h> #include <asm/desc.h> #include <asm/fpu/internal.h> @@ -41,6 +42,7 @@ #include <asm/mce.h> #include <asm/mmu_context.h> #include <asm/mshyperv.h> +#include <asm/mwait.h> #include <asm/spec-ctrl.h> #include <asm/virtext.h> #include <asm/vmx.h> @@ -66,7 +68,7 @@ MODULE_LICENSE("GPL"); #ifdef MODULE static const struct x86_cpu_id vmx_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_VMX), + X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL), {} }; MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); @@ -435,7 +437,6 @@ static const struct kvm_vmx_segment_field { VMX_SEGMENT_FIELD(LDTR), }; -u64 host_efer; static unsigned long host_idt_base; /* @@ -656,53 +657,16 @@ static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct shared_msr_entry *msr, return ret; } -void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) -{ - vmcs_clear(loaded_vmcs->vmcs); - if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) - vmcs_clear(loaded_vmcs->shadow_vmcs); - loaded_vmcs->cpu = -1; - loaded_vmcs->launched = 0; -} - #ifdef CONFIG_KEXEC_CORE -/* - * This bitmap is used to indicate whether the vmclear - * operation is enabled on all cpus. All disabled by - * default. - */ -static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE; - -static inline void crash_enable_local_vmclear(int cpu) -{ - cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap); -} - -static inline void crash_disable_local_vmclear(int cpu) -{ - cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap); -} - -static inline int crash_local_vmclear_enabled(int cpu) -{ - return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap); -} - static void crash_vmclear_local_loaded_vmcss(void) { int cpu = raw_smp_processor_id(); struct loaded_vmcs *v; - if (!crash_local_vmclear_enabled(cpu)) - return; - list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), loaded_vmcss_on_cpu_link) vmcs_clear(v->vmcs); } -#else -static inline void crash_enable_local_vmclear(int cpu) { } -static inline void crash_disable_local_vmclear(int cpu) { } #endif /* CONFIG_KEXEC_CORE */ static void __loaded_vmcs_clear(void *arg) @@ -714,19 +678,24 @@ static void __loaded_vmcs_clear(void *arg) return; /* vcpu migration can race with cpu offline */ if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) per_cpu(current_vmcs, cpu) = NULL; - crash_disable_local_vmclear(cpu); + + vmcs_clear(loaded_vmcs->vmcs); + if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) + vmcs_clear(loaded_vmcs->shadow_vmcs); + list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); /* - * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link - * is before setting loaded_vmcs->vcpu to -1 which is done in - * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist - * then adds the vmcs into percpu list before it is deleted. + * Ensure all writes to loaded_vmcs, including deleting it from its + * current percpu list, complete before setting loaded_vmcs->vcpu to + * -1, otherwise a different cpu can see vcpu == -1 first and add + * loaded_vmcs to its percpu list before it's deleted from this cpu's + * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs(). */ smp_wmb(); - loaded_vmcs_init(loaded_vmcs); - crash_enable_local_vmclear(cpu); + loaded_vmcs->cpu = -1; + loaded_vmcs->launched = 0; } void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) @@ -810,7 +779,7 @@ void update_exception_bitmap(struct kvm_vcpu *vcpu) if (to_vmx(vcpu)->rmode.vm86_active) eb = ~0; if (enable_ept) - eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ + eb &= ~(1u << PF_VECTOR); /* When we are running a nested L2 guest and L1 specified for it a * certain exception bitmap, we must trap the same exceptions and pass @@ -1061,7 +1030,7 @@ static unsigned long segment_base(u16 selector) static inline bool pt_can_write_msr(struct vcpu_vmx *vmx) { - return (pt_mode == PT_MODE_HOST_GUEST) && + return vmx_pt_mode_is_host_guest() && !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); } @@ -1095,7 +1064,7 @@ static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range) static void pt_guest_enter(struct vcpu_vmx *vmx) { - if (pt_mode == PT_MODE_SYSTEM) + if (vmx_pt_mode_is_system()) return; /* @@ -1112,7 +1081,7 @@ static void pt_guest_enter(struct vcpu_vmx *vmx) static void pt_guest_exit(struct vcpu_vmx *vmx) { - if (pt_mode == PT_MODE_SYSTEM) + if (vmx_pt_mode_is_system()) return; if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { @@ -1345,18 +1314,17 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu) if (!already_loaded) { loaded_vmcs_clear(vmx->loaded_vmcs); local_irq_disable(); - crash_disable_local_vmclear(cpu); /* - * Read loaded_vmcs->cpu should be before fetching - * loaded_vmcs->loaded_vmcss_on_cpu_link. - * See the comments in __loaded_vmcs_clear(). + * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to + * this cpu's percpu list, otherwise it may not yet be deleted + * from its previous cpu's percpu list. Pairs with the + * smb_wmb() in __loaded_vmcs_clear(). */ smp_rmb(); list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, &per_cpu(loaded_vmcss_on_cpu, cpu)); - crash_enable_local_vmclear(cpu); local_irq_enable(); } @@ -1689,16 +1657,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu) vmx_clear_hlt(vcpu); } -static bool vmx_rdtscp_supported(void) -{ - return cpu_has_vmx_rdtscp(); -} - -static bool vmx_invpcid_supported(void) -{ - return cpu_has_vmx_invpcid(); -} - /* * Swap MSR entry in host/guest MSR entry array. */ @@ -1906,24 +1864,24 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) &msr_info->data); break; case MSR_IA32_RTIT_CTL: - if (pt_mode != PT_MODE_HOST_GUEST) + if (!vmx_pt_mode_is_host_guest()) return 1; msr_info->data = vmx->pt_desc.guest.ctl; break; case MSR_IA32_RTIT_STATUS: - if (pt_mode != PT_MODE_HOST_GUEST) + if (!vmx_pt_mode_is_host_guest()) return 1; msr_info->data = vmx->pt_desc.guest.status; break; case MSR_IA32_RTIT_CR3_MATCH: - if ((pt_mode != PT_MODE_HOST_GUEST) || + if (!vmx_pt_mode_is_host_guest() || !intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering)) return 1; msr_info->data = vmx->pt_desc.guest.cr3_match; break; case MSR_IA32_RTIT_OUTPUT_BASE: - if ((pt_mode != PT_MODE_HOST_GUEST) || + if (!vmx_pt_mode_is_host_guest() || (!intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output) && !intel_pt_validate_cap(vmx->pt_desc.caps, @@ -1932,7 +1890,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vmx->pt_desc.guest.output_base; break; case MSR_IA32_RTIT_OUTPUT_MASK: - if ((pt_mode != PT_MODE_HOST_GUEST) || + if (!vmx_pt_mode_is_host_guest() || (!intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output) && !intel_pt_validate_cap(vmx->pt_desc.caps, @@ -1942,7 +1900,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; - if ((pt_mode != PT_MODE_HOST_GUEST) || + if (!vmx_pt_mode_is_host_guest() || (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_num_address_ranges))) return 1; @@ -2148,7 +2106,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; return vmx_set_vmx_msr(vcpu, msr_index, data); case MSR_IA32_RTIT_CTL: - if ((pt_mode != PT_MODE_HOST_GUEST) || + if (!vmx_pt_mode_is_host_guest() || vmx_rtit_ctl_check(vcpu, data) || vmx->nested.vmxon) return 1; @@ -2264,18 +2222,33 @@ static __init int vmx_disabled_by_bios(void) !boot_cpu_has(X86_FEATURE_VMX); } -static void kvm_cpu_vmxon(u64 addr) +static int kvm_cpu_vmxon(u64 vmxon_pointer) { + u64 msr; + cr4_set_bits(X86_CR4_VMXE); intel_pt_handle_vmx(1); - asm volatile ("vmxon %0" : : "m"(addr)); + asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t" + _ASM_EXTABLE(1b, %l[fault]) + : : [vmxon_pointer] "m"(vmxon_pointer) + : : fault); + return 0; + +fault: + WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", + rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); + intel_pt_handle_vmx(0); + cr4_clear_bits(X86_CR4_VMXE); + + return -EFAULT; } static int hardware_enable(void) { int cpu = raw_smp_processor_id(); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); + int r; if (cr4_read_shadow() & X86_CR4_VMXE) return -EBUSY; @@ -2288,22 +2261,10 @@ static int hardware_enable(void) !hv_get_vp_assist_page(cpu)) return -EFAULT; - INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); - INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); - spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); - - /* - * Now we can enable the vmclear operation in kdump - * since the loaded_vmcss_on_cpu list on this cpu - * has been initialized. - * - * Though the cpu is not in VMX operation now, there - * is no problem to enable the vmclear operation - * for the loaded_vmcss_on_cpu list is empty! - */ - crash_enable_local_vmclear(cpu); + r = kvm_cpu_vmxon(phys_addr); + if (r) + return r; - kvm_cpu_vmxon(phys_addr); if (enable_ept) ept_sync_global(); @@ -2338,6 +2299,17 @@ static void hardware_disable(void) kvm_cpu_vmxoff(); } +/* + * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID + * directly instead of going through cpu_has(), to ensure KVM is trapping + * ENCLS whenever it's supported in hardware. It does not matter whether + * the host OS supports or has enabled SGX. + */ +static bool cpu_has_sgx(void) +{ + return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0)); +} + static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) { @@ -2418,8 +2390,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX | - SECONDARY_EXEC_ENABLE_VMFUNC | - SECONDARY_EXEC_ENCLS_EXITING; + SECONDARY_EXEC_ENABLE_VMFUNC; + if (cpu_has_sgx()) + opt2 |= SECONDARY_EXEC_ENCLS_EXITING; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -2603,9 +2576,12 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) if (!loaded_vmcs->vmcs) return -ENOMEM; + vmcs_clear(loaded_vmcs->vmcs); + loaded_vmcs->shadow_vmcs = NULL; loaded_vmcs->hv_timer_soft_disabled = false; - loaded_vmcs_init(loaded_vmcs); + loaded_vmcs->cpu = -1; + loaded_vmcs->launched = 0; if (cpu_has_vmx_msr_bitmap()) { loaded_vmcs->msr_bitmap = (unsigned long *) @@ -2987,9 +2963,8 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) static int get_ept_level(struct kvm_vcpu *vcpu) { - /* Nested EPT currently only supports 4-level walks. */ if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu))) - return 4; + return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu)); if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48)) return 5; return 4; @@ -3009,7 +2984,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) return eptp; } -void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3) { struct kvm *kvm = vcpu->kvm; bool update_guest_cr3 = true; @@ -3021,7 +2996,7 @@ void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) eptp = construct_eptp(vcpu, cr3); vmcs_write64(EPT_POINTER, eptp); - if (kvm_x86_ops->tlb_remote_flush) { + if (kvm_x86_ops.tlb_remote_flush) { spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); to_vmx(vcpu)->ept_pointer = eptp; to_kvm_vmx(kvm)->ept_pointers_match @@ -4026,7 +4001,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; - if (pt_mode == PT_MODE_SYSTEM) + if (vmx_pt_mode_is_system()) exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX); if (!cpu_need_virtualize_apic_accesses(vcpu)) exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; @@ -4081,7 +4056,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) } } - if (vmx_rdtscp_supported()) { + if (cpu_has_vmx_rdtscp()) { bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP); if (!rdtscp_enabled) exec_control &= ~SECONDARY_EXEC_RDTSCP; @@ -4096,7 +4071,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) } } - if (vmx_invpcid_supported()) { + if (cpu_has_vmx_invpcid()) { /* Exposing INVPCID only when PCID is exposed */ bool invpcid_enabled = guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) && @@ -4267,7 +4242,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) if (cpu_has_vmx_encls_vmexit()) vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); - if (pt_mode == PT_MODE_HOST_GUEST) { + if (vmx_pt_mode_is_host_guest()) { memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc)); /* Bit[6~0] are forced to 1, writes are ignored. */ vmx->pt_desc.guest.output_mask = 0x7F; @@ -4495,8 +4470,13 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { - return (!to_vmx(vcpu)->nested.nested_run_pending && - vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && + if (to_vmx(vcpu)->nested.nested_run_pending) + return false; + + if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) + return true; + + return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); } @@ -4552,7 +4532,6 @@ static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) case GP_VECTOR: case MF_VECTOR: return true; - break; } return false; } @@ -4609,6 +4588,26 @@ static int handle_machine_check(struct kvm_vcpu *vcpu) return 1; } +/* + * If the host has split lock detection disabled, then #AC is + * unconditionally injected into the guest, which is the pre split lock + * detection behaviour. + * + * If the host has split lock detection enabled then #AC is + * only injected into the guest when: + * - Guest CPL == 3 (user mode) + * - Guest has #AC detection enabled in CR0 + * - Guest EFLAGS has AC bit set + */ +static inline bool guest_inject_ac(struct kvm_vcpu *vcpu) +{ + if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + return true; + + return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) && + (kvm_get_rflags(vcpu) & X86_EFLAGS_AC); +} + static int handle_exception_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4674,9 +4673,6 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) return handle_rmode_exception(vcpu, ex_no, error_code); switch (ex_no) { - case AC_VECTOR: - kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); - return 1; case DB_VECTOR: dr6 = vmcs_readl(EXIT_QUALIFICATION); if (!(vcpu->guest_debug & @@ -4705,6 +4701,20 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; kvm_run->debug.arch.exception = ex_no; break; + case AC_VECTOR: + if (guest_inject_ac(vcpu)) { + kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); + return 1; + } + + /* + * Handle split lock. Depending on detection mode this will + * either warn and disable split lock detection for this + * task or force SIGBUS on it. + */ + if (handle_guest_split_lock(kvm_rip_read(vcpu))) + return 1; + fallthrough; default: kvm_run->exit_reason = KVM_EXIT_EXCEPTION; kvm_run->ex.exception = ex_no; @@ -5329,7 +5339,6 @@ static void vmx_enable_tdp(void) VMX_EPT_RWX_MASK, 0ull); ept_set_mmio_spte_mask(); - kvm_enable_tdp(); } /* @@ -5862,8 +5871,23 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, if (vmx->emulation_required) return handle_invalid_guest_state(vcpu); - if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason)) - return nested_vmx_reflect_vmexit(vcpu, exit_reason); + if (is_guest_mode(vcpu)) { + /* + * The host physical addresses of some pages of guest memory + * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC + * Page). The CPU may write to these pages via their host + * physical address while L2 is running, bypassing any + * address-translation-based dirty tracking (e.g. EPT write + * protection). + * + * Mark them dirty on every exit from L2 to prevent them from + * getting out of sync with dirty tracking. + */ + nested_mark_vmcs12_pages_dirty(vcpu); + + if (nested_vmx_exit_reflected(vcpu, exit_reason)) + return nested_vmx_reflect_vmexit(vcpu, exit_reason); + } if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { dump_vmcs(); @@ -6223,15 +6247,13 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); /* if exit due to PF check for async PF */ - if (is_page_fault(vmx->exit_intr_info)) + if (is_page_fault(vmx->exit_intr_info)) { vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); - /* Handle machine checks before interrupts are enabled */ - if (is_machine_check(vmx->exit_intr_info)) + } else if (is_machine_check(vmx->exit_intr_info)) { kvm_machine_check(); - /* We need to handle NMIs before interrupts are enabled */ - if (is_nmi(vmx->exit_intr_info)) { + } else if (is_nmi(vmx->exit_intr_info)) { kvm_before_interrupt(&vmx->vcpu); asm("int $2"); kvm_after_interrupt(&vmx->vcpu); @@ -6275,7 +6297,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) #endif ASM_CALL_CONSTRAINT : - THUNK_TARGET(entry), + [thunk_target]"r"(entry), [ss]"i"(__KERNEL_DS), [cs]"i"(__KERNEL_CS) ); @@ -6317,11 +6339,6 @@ static bool vmx_has_emulated_msr(int index) } } -static bool vmx_pt_supported(void) -{ - return pt_mode == PT_MODE_HOST_GUEST; -} - static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -6567,7 +6584,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) pt_guest_enter(vmx); - atomic_switch_perf_msrs(vmx); + if (vcpu_to_pmu(vcpu)->version) + atomic_switch_perf_msrs(vmx); atomic_switch_umwait_control_msr(vmx); if (enable_preemption_timer) @@ -6684,20 +6702,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_complete_interrupts(vmx); } -static struct kvm *vmx_vm_alloc(void) -{ - struct kvm_vmx *kvm_vmx = __vmalloc(sizeof(struct kvm_vmx), - GFP_KERNEL_ACCOUNT | __GFP_ZERO, - PAGE_KERNEL); - return &kvm_vmx->kvm; -} - -static void vmx_vm_free(struct kvm *kvm) -{ - kfree(kvm->arch.hyperv.hv_pa_pg); - vfree(to_kvm_vmx(kvm)); -} - static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6900,17 +6904,24 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) u8 cache; u64 ipat = 0; - /* For VT-d and EPT combination - * 1. MMIO: always map as UC - * 2. EPT with VT-d: - * a. VT-d without snooping control feature: can't guarantee the - * result, try to trust guest. - * b. VT-d with snooping control feature: snooping control feature of - * VT-d engine can guarantee the cache correctness. Just set it - * to WB to keep consistent with host. So the same as item 3. - * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep - * consistent with host MTRR + /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in + * memory aliases with conflicting memory types and sometimes MCEs. + * We have to be careful as to what are honored and when. + * + * For MMIO, guest CD/MTRR are ignored. The EPT memory type is set to + * UC. The effective memory type is UC or WC depending on guest PAT. + * This was historically the source of MCEs and we want to be + * conservative. + * + * When there is no need to deal with noncoherent DMA (e.g., no VT-d + * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored. The + * EPT memory type is set to WB. The effective memory type is forced + * WB. + * + * Otherwise, we trust guest. Guest CD/MTRR/PAT are all honored. The + * EPT memory type is used to emulate guest CD/MTRR. */ + if (is_mmio) { cache = MTRR_TYPE_UNCACHABLE; goto exit; @@ -6937,15 +6948,6 @@ exit: return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; } -static int vmx_get_lpage_level(void) -{ - if (enable_ept && !cpu_has_vmx_ept_1g_page()) - return PT_DIRECTORY_LEVEL; - else - /* For shadow and EPT supported 1GB page */ - return PT_PDPE_LEVEL; -} - static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx) { /* @@ -7136,10 +7138,37 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) } } -static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) +static __init void vmx_set_cpu_caps(void) { - if (func == 1 && nested) - entry->ecx |= feature_bit(VMX); + kvm_set_cpu_caps(); + + /* CPUID 0x1 */ + if (nested) + kvm_cpu_cap_set(X86_FEATURE_VMX); + + /* CPUID 0x7 */ + if (kvm_mpx_supported()) + kvm_cpu_cap_check_and_set(X86_FEATURE_MPX); + if (cpu_has_vmx_invpcid()) + kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID); + if (vmx_pt_mode_is_host_guest()) + kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); + + /* PKU is not yet implemented for shadow paging. */ + if (enable_ept && boot_cpu_has(X86_FEATURE_OSPKE)) + kvm_cpu_cap_check_and_set(X86_FEATURE_PKU); + + if (vmx_umip_emulated()) + kvm_cpu_cap_set(X86_FEATURE_UMIP); + + /* CPUID 0xD.1 */ + supported_xss = 0; + if (!vmx_xsaves_supported()) + kvm_cpu_cap_clear(X86_FEATURE_XSAVES); + + /* CPUID 0x80000001 */ + if (!cpu_has_vmx_rdtscp()) + kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); } static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) @@ -7183,10 +7212,10 @@ static int vmx_check_intercept_io(struct kvm_vcpu *vcpu, static int vmx_check_intercept(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, - enum x86_intercept_stage stage) + enum x86_intercept_stage stage, + struct x86_exception *exception) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; switch (info->intercept) { /* @@ -7195,8 +7224,8 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, */ case x86_intercept_rdtscp: if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) { - ctxt->exception.vector = UD_VECTOR; - ctxt->exception.error_code_valid = false; + exception->vector = UD_VECTOR; + exception->error_code_valid = false; return X86EMUL_PROPAGATE_FAULT; } break; @@ -7307,7 +7336,8 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) static void vmx_slot_enable_log_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) { - kvm_mmu_slot_leaf_clear_dirty(kvm, slot); + if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) + kvm_mmu_slot_leaf_clear_dirty(kvm, slot); kvm_mmu_slot_largepage_remove_write_access(kvm, slot); } @@ -7490,7 +7520,7 @@ static void pi_post_block(struct kvm_vcpu *vcpu) static void vmx_post_block(struct kvm_vcpu *vcpu) { - if (kvm_x86_ops->set_hv_timer) + if (kvm_x86_ops.set_hv_timer) kvm_lapic_switch_to_hv_timer(vcpu); pi_post_block(vcpu); @@ -7657,13 +7687,164 @@ static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) return to_vmx(vcpu)->nested.vmxon; } +static void hardware_unsetup(void) +{ + if (nested) + nested_vmx_hardware_unsetup(); + + free_kvm_area(); +} + +static bool vmx_check_apicv_inhibit_reasons(ulong bit) +{ + ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | + BIT(APICV_INHIBIT_REASON_HYPERV); + + return supported & BIT(bit); +} + +static struct kvm_x86_ops vmx_x86_ops __initdata = { + .hardware_unsetup = hardware_unsetup, + + .hardware_enable = hardware_enable, + .hardware_disable = hardware_disable, + .cpu_has_accelerated_tpr = report_flexpriority, + .has_emulated_msr = vmx_has_emulated_msr, + + .vm_size = sizeof(struct kvm_vmx), + .vm_init = vmx_vm_init, + + .vcpu_create = vmx_create_vcpu, + .vcpu_free = vmx_free_vcpu, + .vcpu_reset = vmx_vcpu_reset, + + .prepare_guest_switch = vmx_prepare_switch_to_guest, + .vcpu_load = vmx_vcpu_load, + .vcpu_put = vmx_vcpu_put, + + .update_bp_intercept = update_exception_bitmap, + .get_msr_feature = vmx_get_msr_feature, + .get_msr = vmx_get_msr, + .set_msr = vmx_set_msr, + .get_segment_base = vmx_get_segment_base, + .get_segment = vmx_get_segment, + .set_segment = vmx_set_segment, + .get_cpl = vmx_get_cpl, + .get_cs_db_l_bits = vmx_get_cs_db_l_bits, + .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, + .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, + .set_cr0 = vmx_set_cr0, + .set_cr4 = vmx_set_cr4, + .set_efer = vmx_set_efer, + .get_idt = vmx_get_idt, + .set_idt = vmx_set_idt, + .get_gdt = vmx_get_gdt, + .set_gdt = vmx_set_gdt, + .get_dr6 = vmx_get_dr6, + .set_dr6 = vmx_set_dr6, + .set_dr7 = vmx_set_dr7, + .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, + .cache_reg = vmx_cache_reg, + .get_rflags = vmx_get_rflags, + .set_rflags = vmx_set_rflags, + + .tlb_flush = vmx_flush_tlb, + .tlb_flush_gva = vmx_flush_tlb_gva, + + .run = vmx_vcpu_run, + .handle_exit = vmx_handle_exit, + .skip_emulated_instruction = vmx_skip_emulated_instruction, + .update_emulated_instruction = vmx_update_emulated_instruction, + .set_interrupt_shadow = vmx_set_interrupt_shadow, + .get_interrupt_shadow = vmx_get_interrupt_shadow, + .patch_hypercall = vmx_patch_hypercall, + .set_irq = vmx_inject_irq, + .set_nmi = vmx_inject_nmi, + .queue_exception = vmx_queue_exception, + .cancel_injection = vmx_cancel_injection, + .interrupt_allowed = vmx_interrupt_allowed, + .nmi_allowed = vmx_nmi_allowed, + .get_nmi_mask = vmx_get_nmi_mask, + .set_nmi_mask = vmx_set_nmi_mask, + .enable_nmi_window = enable_nmi_window, + .enable_irq_window = enable_irq_window, + .update_cr8_intercept = update_cr8_intercept, + .set_virtual_apic_mode = vmx_set_virtual_apic_mode, + .set_apic_access_page_addr = vmx_set_apic_access_page_addr, + .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, + .load_eoi_exitmap = vmx_load_eoi_exitmap, + .apicv_post_state_restore = vmx_apicv_post_state_restore, + .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons, + .hwapic_irr_update = vmx_hwapic_irr_update, + .hwapic_isr_update = vmx_hwapic_isr_update, + .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, + .sync_pir_to_irr = vmx_sync_pir_to_irr, + .deliver_posted_interrupt = vmx_deliver_posted_interrupt, + .dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt, + + .set_tss_addr = vmx_set_tss_addr, + .set_identity_map_addr = vmx_set_identity_map_addr, + .get_tdp_level = get_ept_level, + .get_mt_mask = vmx_get_mt_mask, + + .get_exit_info = vmx_get_exit_info, + + .cpuid_update = vmx_cpuid_update, + + .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, + + .read_l1_tsc_offset = vmx_read_l1_tsc_offset, + .write_l1_tsc_offset = vmx_write_l1_tsc_offset, + + .load_mmu_pgd = vmx_load_mmu_pgd, + + .check_intercept = vmx_check_intercept, + .handle_exit_irqoff = vmx_handle_exit_irqoff, + + .request_immediate_exit = vmx_request_immediate_exit, + + .sched_in = vmx_sched_in, + + .slot_enable_log_dirty = vmx_slot_enable_log_dirty, + .slot_disable_log_dirty = vmx_slot_disable_log_dirty, + .flush_log_dirty = vmx_flush_log_dirty, + .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, + .write_log_dirty = vmx_write_pml_buffer, + + .pre_block = vmx_pre_block, + .post_block = vmx_post_block, + + .pmu_ops = &intel_pmu_ops, + + .update_pi_irte = vmx_update_pi_irte, + +#ifdef CONFIG_X86_64 + .set_hv_timer = vmx_set_hv_timer, + .cancel_hv_timer = vmx_cancel_hv_timer, +#endif + + .setup_mce = vmx_setup_mce, + + .smi_allowed = vmx_smi_allowed, + .pre_enter_smm = vmx_pre_enter_smm, + .pre_leave_smm = vmx_pre_leave_smm, + .enable_smi_window = enable_smi_window, + + .check_nested_events = NULL, + .get_nested_state = NULL, + .set_nested_state = NULL, + .get_vmcs12_pages = NULL, + .nested_enable_evmcs = NULL, + .nested_get_evmcs_version = NULL, + .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault, + .apic_init_signal_blocked = vmx_apic_init_signal_blocked, +}; + static __init int hardware_setup(void) { unsigned long host_bndcfgs; struct desc_ptr dt; - int r, i; - - rdmsrl_safe(MSR_EFER, &host_efer); + int r, i, ept_lpage_level; store_idt(&dt); host_idt_base = dt.address; @@ -7682,6 +7863,10 @@ static __init int hardware_setup(void) WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); } + if (!cpu_has_vmx_mpx()) + supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | + XFEATURE_MASK_BNDCSR); + if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) enable_vpid = 0; @@ -7710,19 +7895,16 @@ static __init int hardware_setup(void) * using the APIC_ACCESS_ADDR VMCS field. */ if (!flexpriority_enabled) - kvm_x86_ops->set_apic_access_page_addr = NULL; + vmx_x86_ops.set_apic_access_page_addr = NULL; if (!cpu_has_vmx_tpr_shadow()) - kvm_x86_ops->update_cr8_intercept = NULL; - - if (enable_ept && !cpu_has_vmx_ept_2m_page()) - kvm_disable_largepages(); + vmx_x86_ops.update_cr8_intercept = NULL; #if IS_ENABLED(CONFIG_HYPERV) if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH && enable_ept) { - kvm_x86_ops->tlb_remote_flush = hv_remote_flush_tlb; - kvm_x86_ops->tlb_remote_flush_with_range = + vmx_x86_ops.tlb_remote_flush = hv_remote_flush_tlb; + vmx_x86_ops.tlb_remote_flush_with_range = hv_remote_flush_tlb_with_range; } #endif @@ -7737,7 +7919,7 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_apicv()) { enable_apicv = 0; - kvm_x86_ops->sync_pir_to_irr = NULL; + vmx_x86_ops.sync_pir_to_irr = NULL; } if (cpu_has_vmx_tsc_scaling()) { @@ -7750,8 +7932,16 @@ static __init int hardware_setup(void) if (enable_ept) vmx_enable_tdp(); + + if (!enable_ept) + ept_lpage_level = 0; + else if (cpu_has_vmx_ept_1g_page()) + ept_lpage_level = PT_PDPE_LEVEL; + else if (cpu_has_vmx_ept_2m_page()) + ept_lpage_level = PT_DIRECTORY_LEVEL; else - kvm_disable_tdp(); + ept_lpage_level = PT_PAGE_TABLE_LEVEL; + kvm_configure_mmu(enable_ept, ept_lpage_level); /* * Only enable PML when hardware supports PML feature, and both EPT @@ -7761,10 +7951,10 @@ static __init int hardware_setup(void) enable_pml = 0; if (!enable_pml) { - kvm_x86_ops->slot_enable_log_dirty = NULL; - kvm_x86_ops->slot_disable_log_dirty = NULL; - kvm_x86_ops->flush_log_dirty = NULL; - kvm_x86_ops->enable_log_dirty_pt_masked = NULL; + vmx_x86_ops.slot_enable_log_dirty = NULL; + vmx_x86_ops.slot_disable_log_dirty = NULL; + vmx_x86_ops.flush_log_dirty = NULL; + vmx_x86_ops.enable_log_dirty_pt_masked = NULL; } if (!cpu_has_vmx_preemption_timer()) @@ -7792,9 +7982,9 @@ static __init int hardware_setup(void) } if (!enable_preemption_timer) { - kvm_x86_ops->set_hv_timer = NULL; - kvm_x86_ops->cancel_hv_timer = NULL; - kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit; + vmx_x86_ops.set_hv_timer = NULL; + vmx_x86_ops.cancel_hv_timer = NULL; + vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit; } kvm_set_posted_intr_wakeup_handler(wakeup_handler); @@ -7810,185 +8000,27 @@ static __init int hardware_setup(void) nested_vmx_setup_ctls_msrs(&vmcs_config.nested, vmx_capability.ept); - r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers); + r = nested_vmx_hardware_setup(&vmx_x86_ops, + kvm_vmx_exit_handlers); if (r) return r; } + vmx_set_cpu_caps(); + r = alloc_kvm_area(); if (r) nested_vmx_hardware_unsetup(); return r; } -static __exit void hardware_unsetup(void) -{ - if (nested) - nested_vmx_hardware_unsetup(); - - free_kvm_area(); -} - -static bool vmx_check_apicv_inhibit_reasons(ulong bit) -{ - ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | - BIT(APICV_INHIBIT_REASON_HYPERV); - - return supported & BIT(bit); -} - -static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { +static struct kvm_x86_init_ops vmx_init_ops __initdata = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, - .hardware_setup = hardware_setup, - .hardware_unsetup = hardware_unsetup, .check_processor_compatibility = vmx_check_processor_compat, - .hardware_enable = hardware_enable, - .hardware_disable = hardware_disable, - .cpu_has_accelerated_tpr = report_flexpriority, - .has_emulated_msr = vmx_has_emulated_msr, - - .vm_init = vmx_vm_init, - .vm_alloc = vmx_vm_alloc, - .vm_free = vmx_vm_free, - - .vcpu_create = vmx_create_vcpu, - .vcpu_free = vmx_free_vcpu, - .vcpu_reset = vmx_vcpu_reset, - - .prepare_guest_switch = vmx_prepare_switch_to_guest, - .vcpu_load = vmx_vcpu_load, - .vcpu_put = vmx_vcpu_put, - - .update_bp_intercept = update_exception_bitmap, - .get_msr_feature = vmx_get_msr_feature, - .get_msr = vmx_get_msr, - .set_msr = vmx_set_msr, - .get_segment_base = vmx_get_segment_base, - .get_segment = vmx_get_segment, - .set_segment = vmx_set_segment, - .get_cpl = vmx_get_cpl, - .get_cs_db_l_bits = vmx_get_cs_db_l_bits, - .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, - .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, - .set_cr0 = vmx_set_cr0, - .set_cr3 = vmx_set_cr3, - .set_cr4 = vmx_set_cr4, - .set_efer = vmx_set_efer, - .get_idt = vmx_get_idt, - .set_idt = vmx_set_idt, - .get_gdt = vmx_get_gdt, - .set_gdt = vmx_set_gdt, - .get_dr6 = vmx_get_dr6, - .set_dr6 = vmx_set_dr6, - .set_dr7 = vmx_set_dr7, - .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, - .cache_reg = vmx_cache_reg, - .get_rflags = vmx_get_rflags, - .set_rflags = vmx_set_rflags, - - .tlb_flush = vmx_flush_tlb, - .tlb_flush_gva = vmx_flush_tlb_gva, - - .run = vmx_vcpu_run, - .handle_exit = vmx_handle_exit, - .skip_emulated_instruction = vmx_skip_emulated_instruction, - .update_emulated_instruction = vmx_update_emulated_instruction, - .set_interrupt_shadow = vmx_set_interrupt_shadow, - .get_interrupt_shadow = vmx_get_interrupt_shadow, - .patch_hypercall = vmx_patch_hypercall, - .set_irq = vmx_inject_irq, - .set_nmi = vmx_inject_nmi, - .queue_exception = vmx_queue_exception, - .cancel_injection = vmx_cancel_injection, - .interrupt_allowed = vmx_interrupt_allowed, - .nmi_allowed = vmx_nmi_allowed, - .get_nmi_mask = vmx_get_nmi_mask, - .set_nmi_mask = vmx_set_nmi_mask, - .enable_nmi_window = enable_nmi_window, - .enable_irq_window = enable_irq_window, - .update_cr8_intercept = update_cr8_intercept, - .set_virtual_apic_mode = vmx_set_virtual_apic_mode, - .set_apic_access_page_addr = vmx_set_apic_access_page_addr, - .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, - .load_eoi_exitmap = vmx_load_eoi_exitmap, - .apicv_post_state_restore = vmx_apicv_post_state_restore, - .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons, - .hwapic_irr_update = vmx_hwapic_irr_update, - .hwapic_isr_update = vmx_hwapic_isr_update, - .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, - .sync_pir_to_irr = vmx_sync_pir_to_irr, - .deliver_posted_interrupt = vmx_deliver_posted_interrupt, - .dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt, - - .set_tss_addr = vmx_set_tss_addr, - .set_identity_map_addr = vmx_set_identity_map_addr, - .get_tdp_level = get_ept_level, - .get_mt_mask = vmx_get_mt_mask, - - .get_exit_info = vmx_get_exit_info, - - .get_lpage_level = vmx_get_lpage_level, - - .cpuid_update = vmx_cpuid_update, - - .rdtscp_supported = vmx_rdtscp_supported, - .invpcid_supported = vmx_invpcid_supported, - - .set_supported_cpuid = vmx_set_supported_cpuid, - - .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, - - .read_l1_tsc_offset = vmx_read_l1_tsc_offset, - .write_l1_tsc_offset = vmx_write_l1_tsc_offset, - - .set_tdp_cr3 = vmx_set_cr3, - - .check_intercept = vmx_check_intercept, - .handle_exit_irqoff = vmx_handle_exit_irqoff, - .mpx_supported = vmx_mpx_supported, - .xsaves_supported = vmx_xsaves_supported, - .umip_emulated = vmx_umip_emulated, - .pt_supported = vmx_pt_supported, - .pku_supported = vmx_pku_supported, - - .request_immediate_exit = vmx_request_immediate_exit, - - .sched_in = vmx_sched_in, - - .slot_enable_log_dirty = vmx_slot_enable_log_dirty, - .slot_disable_log_dirty = vmx_slot_disable_log_dirty, - .flush_log_dirty = vmx_flush_log_dirty, - .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, - .write_log_dirty = vmx_write_pml_buffer, - - .pre_block = vmx_pre_block, - .post_block = vmx_post_block, - - .pmu_ops = &intel_pmu_ops, - - .update_pi_irte = vmx_update_pi_irte, - -#ifdef CONFIG_X86_64 - .set_hv_timer = vmx_set_hv_timer, - .cancel_hv_timer = vmx_cancel_hv_timer, -#endif - - .setup_mce = vmx_setup_mce, - - .smi_allowed = vmx_smi_allowed, - .pre_enter_smm = vmx_pre_enter_smm, - .pre_leave_smm = vmx_pre_leave_smm, - .enable_smi_window = enable_smi_window, + .hardware_setup = hardware_setup, - .check_nested_events = NULL, - .get_nested_state = NULL, - .set_nested_state = NULL, - .get_vmcs12_pages = NULL, - .nested_enable_evmcs = NULL, - .nested_get_evmcs_version = NULL, - .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault, - .apic_init_signal_blocked = vmx_apic_init_signal_blocked, + .runtime_ops = &vmx_x86_ops, }; static void vmx_cleanup_l1d_flush(void) @@ -8039,7 +8071,7 @@ module_exit(vmx_exit); static int __init vmx_init(void) { - int r; + int r, cpu; #if IS_ENABLED(CONFIG_HYPERV) /* @@ -8075,7 +8107,7 @@ static int __init vmx_init(void) } #endif - r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), + r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), THIS_MODULE); if (r) return r; @@ -8093,6 +8125,12 @@ static int __init vmx_init(void) return r; } + for_each_possible_cpu(cpu) { + INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); + INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); + spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + } + #ifdef CONFIG_KEXEC_CORE rcu_assign_pointer(crash_vmclear_loaded_vmcss, crash_vmclear_local_loaded_vmcss); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index e64da06c7009..aab9df55336e 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -12,9 +12,6 @@ #include "vmcs.h" extern const u32 vmx_msr_index[]; -extern u64 host_efer; - -extern u32 get_umwait_control_msr(void); #define MSR_TYPE_R 1 #define MSR_TYPE_W 2 @@ -335,9 +332,9 @@ u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu); void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask); void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer); void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); -void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void set_cr4_guest_host_mask(struct vcpu_vmx *vmx); +void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3); void ept_save_pdptrs(struct kvm_vcpu *vcpu); void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -452,7 +449,7 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) static inline u32 vmx_vmentry_ctrl(void) { u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; - if (pt_mode == PT_MODE_SYSTEM) + if (vmx_pt_mode_is_system()) vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | VM_ENTRY_LOAD_IA32_RTIT_CTL); /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ @@ -463,7 +460,7 @@ static inline u32 vmx_vmentry_ctrl(void) static inline u32 vmx_vmexit_ctrl(void) { u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; - if (pt_mode == PT_MODE_SYSTEM) + if (vmx_pt_mode_is_system()) vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | VM_EXIT_CLEAR_IA32_RTIT_CTL); /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ @@ -493,7 +490,6 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags); void free_vmcs(struct vmcs *vmcs); int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); -void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs); void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs); static inline struct vmcs *alloc_vmcs(bool shadow) |