diff options
Diffstat (limited to 'arch/x86/kvm/vmx/nested.c')
-rw-r--r-- | arch/x86/kvm/vmx/nested.c | 330 |
1 files changed, 215 insertions, 115 deletions
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 657c2eda357c..fd78ffbde644 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -224,7 +224,7 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) return; kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); - vmx->nested.hv_evmcs_vmptr = -1ull; + vmx->nested.hv_evmcs_vmptr = 0; vmx->nested.hv_evmcs = NULL; } @@ -353,9 +353,8 @@ static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) to_vmx(vcpu)->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT, nested_ept_ad_enabled(vcpu), - nested_ept_get_cr3(vcpu)); - vcpu->arch.mmu->set_cr3 = vmx_set_cr3; - vcpu->arch.mmu->get_cr3 = nested_ept_get_cr3; + nested_ept_get_eptp(vcpu)); + vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp; vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; @@ -544,7 +543,8 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, } } -static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) { +static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) +{ int msr; for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { @@ -1909,20 +1909,21 @@ static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) * This is an equivalent of the nested hypervisor executing the vmptrld * instruction. */ -static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, - bool from_launch) +static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( + struct kvm_vcpu *vcpu, bool from_launch) { struct vcpu_vmx *vmx = to_vmx(vcpu); bool evmcs_gpa_changed = false; u64 evmcs_gpa; if (likely(!vmx->nested.enlightened_vmcs_enabled)) - return 1; + return EVMPTRLD_DISABLED; if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) - return 1; + return EVMPTRLD_DISABLED; - if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { + if (unlikely(!vmx->nested.hv_evmcs || + evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { if (!vmx->nested.hv_evmcs) vmx->nested.current_vmptr = -1ull; @@ -1930,7 +1931,7 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa), &vmx->nested.hv_evmcs_map)) - return 0; + return EVMPTRLD_ERROR; vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva; @@ -1959,7 +1960,7 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { nested_release_evmcs(vcpu); - return 0; + return EVMPTRLD_VMFAIL; } vmx->nested.dirty_vmcs12 = true; @@ -1981,28 +1982,20 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, } /* - * Clean fields data can't de used on VMLAUNCH and when we switch + * Clean fields data can't be used on VMLAUNCH and when we switch * between different L2 guests as KVM keeps a single VMCS12 per L1. */ if (from_launch || evmcs_gpa_changed) vmx->nested.hv_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - return 1; + return EVMPTRLD_SUCCEEDED; } void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - /* - * hv_evmcs may end up being not mapped after migration (when - * L2 was running), map it here to make sure vmcs12 changes are - * properly reflected. - */ - if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) - nested_vmx_handle_enlightened_vmptrld(vcpu, false); - if (vmx->nested.hv_evmcs) { copy_vmcs12_to_enlightened(vmx); /* All fields are clean */ @@ -2473,9 +2466,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * If L1 use EPT, then L0 needs to execute INVEPT on * EPTP02 instead of EPTP01. Therefore, delay TLB * flush until vmcs02->eptp is fully updated by - * KVM_REQ_LOAD_CR3. Note that this assumes + * KVM_REQ_LOAD_MMU_PGD. Note that this assumes * KVM_REQ_TLB_FLUSH is evaluated after - * KVM_REQ_LOAD_CR3 in vcpu_enter_guest(). + * KVM_REQ_LOAD_MMU_PGD in vcpu_enter_guest(). */ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } @@ -2520,7 +2513,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, /* * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12 * on nested VM-Exit, which can occur without actually running L2 and - * thus without hitting vmx_set_cr3(), e.g. if L1 is entering L2 with + * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the * transition to HLT instead of running L2. */ @@ -2562,13 +2555,13 @@ static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) return 0; } -static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address) +static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) { struct vcpu_vmx *vmx = to_vmx(vcpu); int maxphyaddr = cpuid_maxphyaddr(vcpu); /* Check for memory type validity */ - switch (address & VMX_EPTP_MT_MASK) { + switch (new_eptp & VMX_EPTP_MT_MASK) { case VMX_EPTP_MT_UC: if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) return false; @@ -2581,16 +2574,26 @@ static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address) return false; } - /* only 4 levels page-walk length are valid */ - if (CC((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4)) + /* Page-walk levels validity. */ + switch (new_eptp & VMX_EPTP_PWL_MASK) { + case VMX_EPTP_PWL_5: + if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) + return false; + break; + case VMX_EPTP_PWL_4: + if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) + return false; + break; + default: return false; + } /* Reserved bits should not be set */ - if (CC(address >> maxphyaddr || ((address >> 7) & 0x1f))) + if (CC(new_eptp >> maxphyaddr || ((new_eptp >> 7) & 0x1f))) return false; /* AD, if set, should be supported */ - if (address & VMX_EPTP_AD_ENABLE_BIT) { + if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) return false; } @@ -2639,7 +2642,7 @@ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, return -EINVAL; if (nested_cpu_has_ept(vmcs12) && - CC(!valid_ept_address(vcpu, vmcs12->ept_pointer))) + CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) return -EINVAL; if (nested_cpu_has_vmfunc(vmcs12)) { @@ -2959,7 +2962,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) /* * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to - * be written (by preparve_vmcs02()) before the "real" VMEnter, i.e. + * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. * there is no need to preserve other bits or save/restore the field. */ vmcs_writel(GUEST_RFLAGS, 0); @@ -3051,6 +3054,27 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) struct page *page; u64 hpa; + /* + * hv_evmcs may end up being not mapped after migration (when + * L2 was running), map it here to make sure vmcs12 changes are + * properly reflected. + */ + if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) { + enum nested_evmptrld_status evmptrld_status = + nested_vmx_handle_enlightened_vmptrld(vcpu, false); + + if (evmptrld_status == EVMPTRLD_VMFAIL || + evmptrld_status == EVMPTRLD_ERROR) { + pr_debug_ratelimited("%s: enlightened vmptrld failed\n", + __func__); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = + KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return false; + } + } + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { /* * Translate L1 physical address to host physical @@ -3160,10 +3184,10 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. * * Returns: - * NVMX_ENTRY_SUCCESS: Entered VMX non-root mode - * NVMX_ENTRY_VMFAIL: Consistency check VMFail - * NVMX_ENTRY_VMEXIT: Consistency check VMExit - * NVMX_ENTRY_KVM_INTERNAL_ERROR: KVM internal error + * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode + * NVMX_VMENTRY_VMFAIL: Consistency check VMFail + * NVMX_VMENTRY_VMEXIT: Consistency check VMExit + * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error */ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) @@ -3314,12 +3338,18 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) enum nvmx_vmentry_status status; struct vcpu_vmx *vmx = to_vmx(vcpu); u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); + enum nested_evmptrld_status evmptrld_status; if (!nested_vmx_check_permission(vcpu)) return 1; - if (!nested_vmx_handle_enlightened_vmptrld(vcpu, launch)) + evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch); + if (evmptrld_status == EVMPTRLD_ERROR) { + kvm_queue_exception(vcpu, UD_VECTOR); return 1; + } else if (evmptrld_status == EVMPTRLD_VMFAIL) { + return nested_vmx_failInvalid(vcpu); + } if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull) return nested_vmx_failInvalid(vcpu); @@ -3497,7 +3527,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, } -static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) +void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); gfn_t gfn; @@ -3575,25 +3605,81 @@ static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); } -static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) +/* + * Returns true if a debug trap is pending delivery. + * + * In KVM, debug traps bear an exception payload. As such, the class of a #DB + * exception may be inferred from the presence of an exception payload. + */ +static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.exception.pending && + vcpu->arch.exception.nr == DB_VECTOR && + vcpu->arch.exception.payload; +} + +/* + * Certain VM-exits set the 'pending debug exceptions' field to indicate a + * recognized #DB (data or single-step) that has yet to be delivered. Since KVM + * represents these debug traps with a payload that is said to be compatible + * with the 'pending debug exceptions' field, write the payload to the VMCS + * field if a VM-exit is delivered before the debug trap. + */ +static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) +{ + if (vmx_pending_dbg_trap(vcpu)) + vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, + vcpu->arch.exception.payload); +} + +static int vmx_check_nested_events(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qual; bool block_nested_events = vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu); + bool mtf_pending = vmx->nested.mtf_pending; struct kvm_lapic *apic = vcpu->arch.apic; + /* + * Clear the MTF state. If a higher priority VM-exit is delivered first, + * this state is discarded. + */ + if (!block_nested_events) + vmx->nested.mtf_pending = false; + if (lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &apic->pending_events)) { if (block_nested_events) return -EBUSY; + nested_vmx_update_pending_dbg(vcpu); clear_bit(KVM_APIC_INIT, &apic->pending_events); nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); return 0; } + /* + * Process any exceptions that are not debug traps before MTF. + */ if (vcpu->arch.exception.pending && - nested_vmx_check_exception(vcpu, &exit_qual)) { + !vmx_pending_dbg_trap(vcpu) && + nested_vmx_check_exception(vcpu, &exit_qual)) { + if (block_nested_events) + return -EBUSY; + nested_vmx_inject_exception_vmexit(vcpu, exit_qual); + return 0; + } + + if (mtf_pending) { + if (block_nested_events) + return -EBUSY; + nested_vmx_update_pending_dbg(vcpu); + nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0); + return 0; + } + + if (vcpu->arch.exception.pending && + nested_vmx_check_exception(vcpu, &exit_qual)) { if (block_nested_events) return -EBUSY; nested_vmx_inject_exception_vmexit(vcpu, exit_qual); @@ -3623,8 +3709,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) return 0; } - if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && - nested_exit_on_intr(vcpu)) { + if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(vcpu)) { if (block_nested_events) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); @@ -3967,7 +4052,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * * If vmcs12 uses EPT, we need to execute this flush on EPTP01 * and therefore we request the TLB flush to happen only after VMCS EPTP - * has been set by KVM_REQ_LOAD_CR3. + * has been set by KVM_REQ_LOAD_MMU_PGD. */ if (enable_vpid && (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) { @@ -4272,17 +4357,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; if (likely(!vmx->fail)) { - /* - * TODO: SDM says that with acknowledge interrupt on - * exit, bit 31 of the VM-exit interrupt information - * (valid interrupt) is always set to 1 on - * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't - * need kvm_cpu_has_interrupt(). See the commit - * message for details. - */ - if (nested_exit_intr_ack_set(vcpu) && - exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && - kvm_cpu_has_interrupt(vcpu)) { + if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && + nested_exit_intr_ack_set(vcpu)) { int irq = kvm_cpu_get_interrupt(vcpu); WARN_ON(irq < 0); vmcs12->vm_exit_intr_info = irq | @@ -4326,7 +4402,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, * Decode the memory-address operand of a vmx instruction, as recorded on an * exit caused by such an instruction (run by a guest hypervisor). * On success, returns 0. When the operand is invalid, returns 1 and throws - * #UD or #GP. + * #UD, #GP, or #SS. */ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, u32 vmx_instruction_info, bool wr, int len, gva_t *ret) @@ -4367,7 +4443,7 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, if (base_is_valid) off += kvm_register_read(vcpu, base_reg); if (index_is_valid) - off += kvm_register_read(vcpu, index_reg)<<scaling; + off += kvm_register_read(vcpu, index_reg) << scaling; vmx_get_segment(vcpu, &s, seg_reg); /* @@ -4460,7 +4536,7 @@ void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu) return; vmx = to_vmx(vcpu); - if (kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) { + if (kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) { vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; vmx->nested.msrs.exit_ctls_high |= @@ -4546,7 +4622,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) vmx->nested.vmcs02_initialized = false; vmx->nested.vmxon = true; - if (pt_mode == PT_MODE_HOST_GUEST) { + if (vmx_pt_mode_is_host_guest()) { vmx->pt_desc.guest.ctl = 0; pt_update_intercept_for_msr(vmx); } @@ -5178,7 +5254,7 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { u32 index = kvm_rcx_read(vcpu); - u64 address; + u64 new_eptp; bool accessed_dirty; struct kvm_mmu *mmu = vcpu->arch.walk_mmu; @@ -5191,23 +5267,23 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, - &address, index * 8, 8)) + &new_eptp, index * 8, 8)) return 1; - accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT); + accessed_dirty = !!(new_eptp & VMX_EPTP_AD_ENABLE_BIT); /* * If the (L2) guest does a vmfunc to the currently * active ept pointer, we don't have to do anything else */ - if (vmcs12->ept_pointer != address) { - if (!valid_ept_address(vcpu, address)) + if (vmcs12->ept_pointer != new_eptp) { + if (!nested_vmx_check_eptp(vcpu, new_eptp)) return 1; kvm_mmu_unload(vcpu); mmu->ept_ad = accessed_dirty; mmu->mmu_role.base.ad_disabled = !accessed_dirty; - vmcs12->ept_pointer = address; + vmcs12->ept_pointer = new_eptp; /* * TODO: Check what's the correct approach in case * mmu reload fails. Currently, we just let the next @@ -5256,24 +5332,17 @@ fail: return 1; } - -static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) +/* + * Return true if an IO instruction with the specified port and size should cause + * a VM-exit into L1. + */ +bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, + int size) { - unsigned long exit_qualification; + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); gpa_t bitmap, last_bitmap; - unsigned int port; - int size; u8 b; - if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) - return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); - - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - - port = exit_qualification >> 16; - size = (exit_qualification & 7) + 1; - last_bitmap = (gpa_t)-1; b = -1; @@ -5300,8 +5369,26 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, return false; } +static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + unsigned long exit_qualification; + unsigned short port; + int size; + + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) + return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + port = exit_qualification >> 16; + size = (exit_qualification & 7) + 1; + + return nested_vmx_check_io_bitmaps(vcpu, port, size); +} + /* - * Return 1 if we should exit from L2 to L1 to handle an MSR access access, + * Return 1 if we should exit from L2 to L1 to handle an MSR access, * rather than handle it ourselves in L0. I.e., check whether L1 expressed * disinterest in the current event (read or write a specific MSR) by using an * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. @@ -5446,8 +5533,25 @@ static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, return 1 & (b >> (field & 7)); } +static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) +{ + u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; + + if (nested_cpu_has_mtf(vmcs12)) + return true; + + /* + * An MTF VM-exit may be injected into the guest by setting the + * interruption-type to 7 (other event) and the vector field to 0. Such + * is the case regardless of the 'monitor trap flag' VM-execution + * control. + */ + return entry_intr_info == (INTR_INFO_VALID_MASK + | INTR_TYPE_OTHER_EVENT); +} + /* - * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we + * Return true if we should exit from L2 to L1 to handle an exit, or false if we * should handle it ourselves in L0 (and then continue L2). Only call this * when in is_guest_mode (L2). */ @@ -5457,8 +5561,7 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - if (vmx->nested.nested_run_pending) - return false; + WARN_ON_ONCE(vmx->nested.nested_run_pending); if (unlikely(vmx->fail)) { trace_kvm_nested_vmenter_failed( @@ -5467,19 +5570,6 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) return true; } - /* - * The host physical addresses of some pages of guest memory - * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC - * Page). The CPU may write to these pages via their host - * physical address while L2 is running, bypassing any - * address-translation-based dirty tracking (e.g. EPT write - * protection). - * - * Mark them dirty on every exit from L2 to prevent them from - * getting out of sync with dirty tracking. - */ - nested_mark_vmcs12_pages_dirty(vcpu); - trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, vmcs_readl(EXIT_QUALIFICATION), vmx->idt_vectoring_info, @@ -5560,7 +5650,7 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) case EXIT_REASON_MWAIT_INSTRUCTION: return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); case EXIT_REASON_MONITOR_TRAP_FLAG: - return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG); + return nested_vmx_exit_handled_mtf(vmcs12); case EXIT_REASON_MONITOR_INSTRUCTION: return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); case EXIT_REASON_PAUSE_INSTRUCTION: @@ -5683,6 +5773,9 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, if (vmx->nested.nested_run_pending) kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; + + if (vmx->nested.mtf_pending) + kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; } } @@ -5834,10 +5927,12 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { /* - * Sync eVMCS upon entry as we may not have - * HV_X64_MSR_VP_ASSIST_PAGE set up yet. + * nested_vmx_handle_enlightened_vmptrld() cannot be called + * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be + * restored yet. EVMCS will be mapped from + * nested_get_vmcs12_pages(). */ - vmx->nested.need_vmcs12_to_shadow_sync = true; + kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); } else { return -EINVAL; } @@ -5863,6 +5958,9 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, vmx->nested.nested_run_pending = !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); + vmx->nested.mtf_pending = + !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING); + ret = -EINVAL; if (nested_cpu_has_shadow_vmcs(vmcs12) && vmcs12->vmcs_link_pointer != -1ull) { @@ -5920,8 +6018,7 @@ void nested_vmx_set_vmcs_shadowing_bitmap(void) * bit in the high half is on if the corresponding bit in the control field * may be on. See also vmx_control_verify(). */ -void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, - bool apicv) +void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) { /* * Note that as a general rule, the high half of the MSRs (bits in @@ -5948,7 +6045,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS | - (apicv ? PIN_BASED_POSTED_INTR : 0); + (enable_apicv ? PIN_BASED_POSTED_INTR : 0); msrs->pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; @@ -6057,11 +6154,13 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, /* nested EPT: emulate EPT also to L1 */ msrs->secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; - msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT | - VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; - if (cpu_has_vmx_ept_execute_only()) - msrs->ept_caps |= - VMX_EPT_EXECUTE_ONLY_BIT; + msrs->ept_caps = + VMX_EPT_PAGE_WALK_4_BIT | + VMX_EPT_PAGE_WALK_5_BIT | + VMX_EPTP_WB_BIT | + VMX_EPT_INVEPT_BIT | + VMX_EPT_EXECUTE_ONLY_BIT; + msrs->ept_caps &= ept_caps; msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | @@ -6160,7 +6259,8 @@ void nested_vmx_hardware_unsetup(void) } } -__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) +__init int nested_vmx_hardware_setup(struct kvm_x86_ops *ops, + int (*exit_handlers[])(struct kvm_vcpu *)) { int i; @@ -6196,12 +6296,12 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; - kvm_x86_ops->check_nested_events = vmx_check_nested_events; - kvm_x86_ops->get_nested_state = vmx_get_nested_state; - kvm_x86_ops->set_nested_state = vmx_set_nested_state; - kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages; - kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs; - kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version; + ops->check_nested_events = vmx_check_nested_events; + ops->get_nested_state = vmx_get_nested_state; + ops->set_nested_state = vmx_set_nested_state; + ops->get_vmcs12_pages = nested_get_vmcs12_pages; + ops->nested_enable_evmcs = nested_enable_evmcs; + ops->nested_get_evmcs_version = nested_get_evmcs_version; return 0; } |