Diffstat (limited to 'arch/x86/kvm/svm/nested.c')
-rw-r--r-- | arch/x86/kvm/svm/nested.c | 198
1 files changed, 149 insertions, 49 deletions
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index b708bdf7eaff..b7fd2e869998 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -111,7 +111,7 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
 
 static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm)
 {
-	if (!guest_can_use(&svm->vcpu, X86_FEATURE_V_VMSAVE_VMLOAD))
+	if (!guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_V_VMSAVE_VMLOAD))
 		return true;
 
 	if (!nested_npt_enabled(svm))
@@ -185,12 +185,87 @@ void recalc_intercepts(struct vcpu_svm *svm)
 }
 
 /*
+ * This array (and its actual size) holds the set of offsets (indexing by chunk
+ * size) to process when merging vmcb12's MSRPM with vmcb01's MSRPM. Note, the
+ * set of MSRs for which interception is disabled in vmcb01 is per-vCPU, e.g.
+ * based on CPUID features. This array only tracks MSRs that *might* be passed
+ * through to the guest.
+ *
+ * Hardcode the capacity of the array based on the maximum number of _offsets_.
+ * MSRs are batched together, so there are fewer offsets than MSRs.
+ */
+static int nested_svm_msrpm_merge_offsets[7] __ro_after_init;
+static int nested_svm_nr_msrpm_merge_offsets __ro_after_init;
+typedef unsigned long nsvm_msrpm_merge_t;
+
+int __init nested_svm_init_msrpm_merge_offsets(void)
+{
+	static const u32 merge_msrs[] __initconst = {
+		MSR_STAR,
+		MSR_IA32_SYSENTER_CS,
+		MSR_IA32_SYSENTER_EIP,
+		MSR_IA32_SYSENTER_ESP,
+	#ifdef CONFIG_X86_64
+		MSR_GS_BASE,
+		MSR_FS_BASE,
+		MSR_KERNEL_GS_BASE,
+		MSR_LSTAR,
+		MSR_CSTAR,
+		MSR_SYSCALL_MASK,
+	#endif
+		MSR_IA32_SPEC_CTRL,
+		MSR_IA32_PRED_CMD,
+		MSR_IA32_FLUSH_CMD,
+		MSR_IA32_APERF,
+		MSR_IA32_MPERF,
+		MSR_IA32_LASTBRANCHFROMIP,
+		MSR_IA32_LASTBRANCHTOIP,
+		MSR_IA32_LASTINTFROMIP,
+		MSR_IA32_LASTINTTOIP,
+	};
+	int i, j;
+
+	for (i = 0; i < ARRAY_SIZE(merge_msrs); i++) {
+		int bit_nr = svm_msrpm_bit_nr(merge_msrs[i]);
+		u32 offset;
+
+		if (WARN_ON(bit_nr < 0))
+			return -EIO;
+
+		/*
+		 * Merging is done in chunks to reduce the number of accesses
+		 * to L1's bitmap.
+		 */
+		offset = bit_nr / BITS_PER_BYTE / sizeof(nsvm_msrpm_merge_t);
+
+		for (j = 0; j < nested_svm_nr_msrpm_merge_offsets; j++) {
+			if (nested_svm_msrpm_merge_offsets[j] == offset)
+				break;
+		}
+
+		if (j < nested_svm_nr_msrpm_merge_offsets)
+			continue;
+
+		if (WARN_ON(j >= ARRAY_SIZE(nested_svm_msrpm_merge_offsets)))
+			return -EIO;
+
+		nested_svm_msrpm_merge_offsets[j] = offset;
+		nested_svm_nr_msrpm_merge_offsets++;
+	}
+
+	return 0;
+}
+
+/*
  * Merge L0's (KVM) and L1's (Nested VMCB) MSR permission bitmaps. The function
  * is optimized in that it only merges the parts where KVM MSR permission bitmap
  * may contain zero bits.
  */
-static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
+static bool nested_svm_merge_msrpm(struct kvm_vcpu *vcpu)
 {
+	struct vcpu_svm *svm = to_svm(vcpu);
+	nsvm_msrpm_merge_t *msrpm02 = svm->nested.msrpm;
+	nsvm_msrpm_merge_t *msrpm01 = svm->msrpm;
 	int i;
 
 	/*
@@ -205,7 +280,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
 	if (!svm->nested.force_msr_bitmap_recalc) {
 		struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;
 
-		if (kvm_hv_hypercall_enabled(&svm->vcpu) &&
+		if (kvm_hv_hypercall_enabled(vcpu) &&
 		    hve->hv_enlightenments_control.msr_bitmap &&
 		    (svm->nested.ctl.clean & BIT(HV_VMCB_NESTED_ENLIGHTENMENTS)))
 			goto set_msrpm_base_pa;
@@ -215,25 +290,17 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
 	if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
 		return true;
 
-	for (i = 0; i < MSRPM_OFFSETS; i++) {
-		u32 value, p;
-		u64 offset;
-
-		if (msrpm_offsets[i] == 0xffffffff)
-			break;
-
-		p      = msrpm_offsets[i];
+	for (i = 0; i < nested_svm_nr_msrpm_merge_offsets; i++) {
+		const int p = nested_svm_msrpm_merge_offsets[i];
+		nsvm_msrpm_merge_t l1_val;
+		gpa_t gpa;
 
-		/* x2apic msrs are intercepted always for the nested guest */
-		if (is_x2apic_msrpm_offset(p))
-			continue;
-
-		offset = svm->nested.ctl.msrpm_base_pa + (p * 4);
+		gpa = svm->nested.ctl.msrpm_base_pa + (p * sizeof(l1_val));
 
-		if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
+		if (kvm_vcpu_read_guest(vcpu, gpa, &l1_val, sizeof(l1_val)))
 			return false;
 
-		svm->nested.msrpm[p] = svm->msrpm[p] | value;
+		msrpm02[p] = msrpm01[p] | l1_val;
 	}
 
 	svm->nested.force_msr_bitmap_recalc = false;
@@ -594,7 +661,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
 		vmcb_mark_dirty(vmcb02, VMCB_DR);
 	}
 
-	if (unlikely(guest_can_use(vcpu, X86_FEATURE_LBRV) &&
+	if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
 		     (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
 		/*
 		 * Reserved bits of DEBUGCTL are ignored. Be consistent with
@@ -646,12 +713,17 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
 	u32 pause_count12;
 	u32 pause_thresh12;
 
+	nested_svm_transition_tlb_flush(vcpu);
+
+	/* Enter Guest-Mode */
+	enter_guest_mode(vcpu);
+
 	/*
 	 * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
 	 * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes.
 	 */
 
-	if (guest_can_use(vcpu, X86_FEATURE_VGIF) &&
+	if (guest_cpu_cap_has(vcpu, X86_FEATURE_VGIF) &&
 	    (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK))
 		int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
 	else
@@ -673,6 +745,33 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
 	vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
 	vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
 
+	/*
+	 * Stash vmcb02's counter if the guest hasn't moved past the guilty
+	 * instruction; otherwise, reset the counter to '0'.
+	 *
+	 * In order to detect if L2 has made forward progress or not, track the
+	 * RIP at which a bus lock has occurred on a per-vmcb12 basis. If RIP
+	 * is changed, guest has clearly made forward progress, bus_lock_counter
+	 * still remained '1', so reset bus_lock_counter to '0'. Eg. In the
+	 * scenario, where a buslock happened in L1 before VMRUN, the bus lock
+	 * firmly happened on an instruction in the past. Even if vmcb01's
+	 * counter is still '1', (because the guilty instruction got patched),
+	 * the vCPU has clearly made forward progress and so KVM should reset
+	 * vmcb02's counter to '0'.
+	 *
+	 * If the RIP hasn't changed, stash the bus lock counter at nested VMRUN
+	 * to prevent the same guilty instruction from triggering a VM-Exit. Eg.
+	 * if userspace rate-limits the vCPU, then it's entirely possible that
+	 * L1's tick interrupt is pending by the time userspace re-runs the
+	 * vCPU. If KVM unconditionally clears the counter on VMRUN, then when
+	 * L1 re-enters L2, the same instruction will trigger a VM-Exit and the
+	 * entire cycle start over.
+	 */
+	if (vmcb02->save.rip && (svm->nested.ctl.bus_lock_rip == vmcb02->save.rip))
+		vmcb02->control.bus_lock_counter = 1;
+	else
+		vmcb02->control.bus_lock_counter = 0;
+
 	/* Done at vmrun: asid. */
 
 	/* Also overwritten later if necessary. */
@@ -689,7 +788,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
 
 	vmcb02->control.tsc_offset = vcpu->arch.tsc_offset;
 
-	if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) &&
+	if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR) &&
 	    svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio)
 		nested_svm_update_tsc_ratio_msr(vcpu);
 
@@ -710,7 +809,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
 	 * what a nrips=0 CPU would do (L1 is responsible for advancing RIP
 	 * prior to injecting the event).
	 */
-	if (guest_can_use(vcpu, X86_FEATURE_NRIPS))
+	if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
 		vmcb02->control.next_rip = svm->nested.ctl.next_rip;
 	else if (boot_cpu_has(X86_FEATURE_NRIPS))
 		vmcb02->control.next_rip = vmcb12_rip;
@@ -720,7 +819,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
 		svm->soft_int_injected = true;
 		svm->soft_int_csbase = vmcb12_csbase;
 		svm->soft_int_old_rip = vmcb12_rip;
-		if (guest_can_use(vcpu, X86_FEATURE_NRIPS))
+		if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
 			svm->soft_int_next_rip = svm->nested.ctl.next_rip;
 		else
 			svm->soft_int_next_rip = vmcb12_rip;
@@ -728,18 +827,18 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
 
 	vmcb02->control.virt_ext = vmcb01->control.virt_ext &
				   LBR_CTL_ENABLE_MASK;
-	if (guest_can_use(vcpu, X86_FEATURE_LBRV))
+	if (guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV))
 		vmcb02->control.virt_ext |=
			(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK);
 
 	if (!nested_vmcb_needs_vls_intercept(svm))
 		vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
 
-	if (guest_can_use(vcpu, X86_FEATURE_PAUSEFILTER))
+	if (guest_cpu_cap_has(vcpu, X86_FEATURE_PAUSEFILTER))
 		pause_count12 = svm->nested.ctl.pause_filter_count;
 	else
 		pause_count12 = 0;
-	if (guest_can_use(vcpu, X86_FEATURE_PFTHRESHOLD))
+	if (guest_cpu_cap_has(vcpu, X86_FEATURE_PFTHRESHOLD))
 		pause_thresh12 = svm->nested.ctl.pause_filter_thresh;
 	else
 		pause_thresh12 = 0;
@@ -762,11 +861,6 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
 		}
 	}
 
-	nested_svm_transition_tlb_flush(vcpu);
-
-	/* Enter Guest-Mode */
-	enter_guest_mode(vcpu);
-
 	/*
 	 * Merge guest and host intercepts - must be called with vcpu in
 	 * guest-mode to take effect.
@@ -910,7 +1004,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
 	if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, true))
 		goto out_exit_err;
 
-	if (nested_svm_vmrun_msrpm(svm))
+	if (nested_svm_merge_msrpm(vcpu))
 		goto out;
 
 out_exit_err:
@@ -994,7 +1088,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
 
 	/* in case we halted in L2 */
-	svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;
+	kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
 
 	/* Give the current vmcb to the guest */
 
@@ -1026,7 +1120,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 	if (vmcb12->control.exit_code != SVM_EXIT_ERR)
 		nested_save_pending_event_to_vmcb12(svm, vmcb12);
 
-	if (guest_can_use(vcpu, X86_FEATURE_NRIPS))
+	if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
 		vmcb12->control.next_rip = vmcb02->control.next_rip;
 
 	vmcb12->control.int_ctl = svm->nested.ctl.int_ctl;
@@ -1039,8 +1133,17 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 
 	}
 
+	/*
+	 * Invalidate bus_lock_rip unless KVM is still waiting for the guest
+	 * to make forward progress before re-enabling bus lock detection.
+	 */
+	if (!vmcb02->control.bus_lock_counter)
+		svm->nested.ctl.bus_lock_rip = INVALID_GPA;
+
 	nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
 
+	kvm_nested_vmexit_handle_ibrs(vcpu);
+
 	svm_switch_vmcb(svm, &svm->vmcb01);
 
 	/*
@@ -1065,7 +1168,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 	if (!nested_exit_on_intr(svm))
 		kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
 
-	if (unlikely(guest_can_use(vcpu, X86_FEATURE_LBRV) &&
+	if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
 		     (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
 		svm_copy_lbrs(vmcb12, vmcb02);
 		svm_update_lbrv(vcpu);
@@ -1194,7 +1297,6 @@ int svm_allocate_nested(struct vcpu_svm *svm)
 	svm->nested.msrpm = svm_vcpu_alloc_msrpm();
 	if (!svm->nested.msrpm)
 		goto err_free_vmcb02;
-	svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm);
 
 	svm->nested.initialized = true;
 	return 0;
@@ -1254,26 +1356,26 @@ void svm_leave_nested(struct kvm_vcpu *vcpu)
 
 static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 {
-	u32 offset, msr, value;
-	int write, mask;
+	gpa_t base = svm->nested.ctl.msrpm_base_pa;
+	int write, bit_nr;
+	u8 value, mask;
+	u32 msr;
 
 	if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
 		return NESTED_EXIT_HOST;
 
 	msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
-	offset = svm_msrpm_offset(msr);
+	bit_nr = svm_msrpm_bit_nr(msr);
 	write = svm->vmcb->control.exit_info_1 & 1;
-	mask = 1 << ((2 * (msr & 0xf)) + write);
 
-	if (offset == MSR_INVALID)
+	if (bit_nr < 0)
 		return NESTED_EXIT_DONE;
 
-	/* Offset is in 32 bit units but need in 8 bit units */
-	offset *= 4;
-
-	if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.ctl.msrpm_base_pa + offset, &value, 4))
+	if (kvm_vcpu_read_guest(&svm->vcpu, base + bit_nr / BITS_PER_BYTE,
				&value, sizeof(value)))
 		return NESTED_EXIT_DONE;
 
+	mask = BIT(write) << (bit_nr & (BITS_PER_BYTE - 1));
 	return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
 }
 
@@ -1783,13 +1885,11 @@ out_free:
 
 static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
 {
-	struct vcpu_svm *svm = to_svm(vcpu);
-
 	if (WARN_ON(!is_guest_mode(vcpu)))
 		return true;
 
 	if (!vcpu->arch.pdptrs_from_userspace &&
-	    !nested_npt_enabled(svm) && is_pae_paging(vcpu))
+	    !nested_npt_enabled(to_svm(vcpu)) && is_pae_paging(vcpu))
 		/*
 		 * Reload the guest's PDPTRs since after a migration
 		 * the guest CR3 might be restored prior to setting the nested
@@ -1798,7 +1898,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
 		if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
 			return false;
 
-	if (!nested_svm_vmrun_msrpm(svm)) {
+	if (!nested_svm_merge_msrpm(vcpu)) {
 		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 		vcpu->run->internal.suberror =
			KVM_INTERNAL_ERROR_EMULATION;
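
A note for readers following the MSR permission bitmap arithmetic: svm_msrpm_bit_nr() itself is introduced elsewhere in this series (in arch/x86/kvm/svm/svm.h) and does not appear in this diff. As a rough, illustrative sketch of the layout it is assumed to encode, per the AMD APM the MSRPM is made of 2 KiB ranges covering MSRs 0x0-0x1fff, 0xc0000000-0xc0001fff and 0xc0010000-0xc0011fff, with two bits per MSR: the even bit intercepts reads, the odd bit intercepts writes. The helper and macro names below are invented for the example and are not the kernel's:

/* Illustrative sketch of the assumed MSRPM layout, not the kernel helper. */
#define EXAMPLE_MSRPM_BYTES_PER_RANGE	2048

static int example_msrpm_bit_nr(unsigned int msr)
{
	unsigned int range, index = msr & 0x1fff;

	if (msr <= 0x1fff)
		range = 0;
	else if (msr >= 0xc0000000 && msr <= 0xc0001fff)
		range = 1;
	else if (msr >= 0xc0010000 && msr <= 0xc0011fff)
		range = 2;
	else
		return -1;	/* MSR is not covered by the bitmap */

	/* Two bits per MSR: the result is the read bit, result + 1 the write bit. */
	return range * EXAMPLE_MSRPM_BYTES_PER_RANGE * 8 + index * 2;
}

Under that assumed layout, the lookup in nested_svm_exit_handled_msr() follows directly: the byte read from L1's bitmap is bit_nr / BITS_PER_BYTE, and BIT(write) << (bit_nr & 7) selects the read or write intercept bit within it. It also shows why "MSRs are batched together, so there are fewer offsets than MSRs": on a 64-bit kernel an 8-byte chunk covers 32 MSRs, so for example MSR_STAR, MSR_LSTAR, MSR_CSTAR and MSR_SYSCALL_MASK (0xc0000081-0xc0000084) would share one merge offset, as would MSR_FS_BASE, MSR_GS_BASE and MSR_KERNEL_GS_BASE (0xc0000100-0xc0000102).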
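
The bus lock handling added to nested_vmcb02_prepare_control() and nested_svm_vmexit() reduces to a small forward-progress test. A toy distillation follows; it is illustrative only, the function name and the gpa_t typedef are invented for the example, and the real code operates directly on the vmcb fields shown in the diff:

#include <stdint.h>

typedef uint64_t gpa_t;

/*
 * Keep the "one bus lock already accounted for" credit only while L2 is still
 * parked on the instruction that triggered the previous bus lock exit; any
 * change of RIP means forward progress, so detection re-arms from zero.
 */
static unsigned int example_vmcb02_bus_lock_counter(gpa_t stashed_bus_lock_rip,
						    gpa_t vmcb02_rip)
{
	if (vmcb02_rip && stashed_bus_lock_rip == vmcb02_rip)
		return 1;

	return 0;
}

On the VM-Exit side, the stashed RIP is invalidated (set to INVALID_GPA) unless vmcb02's counter is still non-zero, i.e. unless KVM is still waiting for the guest to make forward progress before re-enabling bus lock detection.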