Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r--	arch/x86/kvm/x86.c	229
1 file changed, 167 insertions(+), 62 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 63bb1ee8258e..4a0b74ecd1de 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -717,7 +717,7 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu)
 	gfn_t gfn;
 	int r;
 
-	if (is_long_mode(vcpu) || !is_pae(vcpu) || !is_paging(vcpu))
+	if (!is_pae_paging(vcpu))
 		return false;
 
 	if (!test_bit(VCPU_EXREG_PDPTR,
@@ -960,8 +960,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 	if (is_long_mode(vcpu) &&
 	    (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
 		return 1;
-	else if (is_pae(vcpu) && is_paging(vcpu) &&
-		 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+	else if (is_pae_paging(vcpu) &&
+		 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
 		return 1;
 
 	kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush);
@@ -1174,7 +1174,28 @@ static u32 emulated_msrs[] = {
 	MSR_AMD64_VIRT_SPEC_CTRL,
 	MSR_IA32_POWER_CTL,
 
+	/*
+	 * The following list leaves out MSRs whose values are determined
+	 * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
+	 * We always support the "true" VMX control MSRs, even if the host
+	 * processor does not, so I am putting these registers here rather
+	 * than in msrs_to_save.
+	 */
+	MSR_IA32_VMX_BASIC,
+	MSR_IA32_VMX_TRUE_PINBASED_CTLS,
+	MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
+	MSR_IA32_VMX_TRUE_EXIT_CTLS,
+	MSR_IA32_VMX_TRUE_ENTRY_CTLS,
+	MSR_IA32_VMX_MISC,
+	MSR_IA32_VMX_CR0_FIXED0,
+	MSR_IA32_VMX_CR4_FIXED0,
+	MSR_IA32_VMX_VMCS_ENUM,
+	MSR_IA32_VMX_PROCBASED_CTLS2,
+	MSR_IA32_VMX_EPT_VPID_CAP,
+	MSR_IA32_VMX_VMFUNC,
+	MSR_K7_HWCR,
+	MSR_KVM_POLL_CONTROL,
 };
 
 static unsigned num_emulated_msrs;
@@ -1210,11 +1231,12 @@ static u32 msr_based_features[] = {
 
 static unsigned int num_msr_based_features;
 
-u64 kvm_get_arch_capabilities(void)
+static u64 kvm_get_arch_capabilities(void)
 {
-	u64 data;
+	u64 data = 0;
 
-	rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
+	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
+		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
 
 	/*
 	 * If we're doing cache flushes (either "always" or "cond")
@@ -1230,7 +1252,6 @@ u64 kvm_get_arch_capabilities(void)
 
 	return data;
 }
-EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);
 
 static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
 {
@@ -2545,13 +2566,24 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		}
 		break;
 	case MSR_IA32_MISC_ENABLE:
-		vcpu->arch.ia32_misc_enable_msr = data;
+		if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
+		    ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
+			if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
+				return 1;
+			vcpu->arch.ia32_misc_enable_msr = data;
+			kvm_update_cpuid(vcpu);
+		} else {
+			vcpu->arch.ia32_misc_enable_msr = data;
+		}
 		break;
 	case MSR_IA32_SMBASE:
 		if (!msr_info->host_initiated)
 			return 1;
 		vcpu->arch.smbase = data;
 		break;
+	case MSR_IA32_POWER_CTL:
+		vcpu->arch.msr_ia32_power_ctl = data;
+		break;
 	case MSR_IA32_TSC:
 		kvm_write_tsc(vcpu, msr_info);
 		break;
@@ -2626,6 +2658,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		break;
 
+	case MSR_KVM_POLL_CONTROL:
+		/* only enable bit supported */
+		if (data & (-1ULL << 1))
+			return 1;
+
+		vcpu->arch.msr_kvm_poll_control = data;
+		break;
+
 	case MSR_IA32_MCG_CTL:
 	case MSR_IA32_MCG_STATUS:
 	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
@@ -2803,6 +2843,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		msr_info->data = vcpu->arch.arch_capabilities;
 		break;
+	case MSR_IA32_POWER_CTL:
+		msr_info->data = vcpu->arch.msr_ia32_power_ctl;
+		break;
 	case MSR_IA32_TSC:
 		msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
 		break;
@@ -2875,6 +2918,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_KVM_PV_EOI_EN:
 		msr_info->data = vcpu->arch.pv_eoi.msr_val;
 		break;
+	case MSR_KVM_POLL_CONTROL:
+		msr_info->data = vcpu->arch.msr_kvm_poll_control;
+		break;
 	case MSR_IA32_P5_MC_ADDR:
 	case MSR_IA32_P5_MC_TYPE:
 	case MSR_IA32_MCG_CAP:
@@ -3084,6 +3130,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_SET_BOOT_CPU_ID:
 	case KVM_CAP_SPLIT_IRQCHIP:
 	case KVM_CAP_IMMEDIATE_EXIT:
+	case KVM_CAP_PMU_EVENT_FILTER:
 	case KVM_CAP_GET_MSR_FEATURES:
 	case KVM_CAP_MSR_PLATFORM_INFO:
 	case KVM_CAP_EXCEPTION_PAYLOAD:
@@ -3096,7 +3143,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		r = KVM_CLOCK_TSC_STABLE;
 		break;
 	case KVM_CAP_X86_DISABLE_EXITS:
-		r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE;
+		r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE |
+		     KVM_X86_DISABLE_EXITS_CSTATE;
 		if(kvm_can_mwait_in_guest())
 			r |= KVM_X86_DISABLE_EXITS_MWAIT;
 		break;
@@ -4613,6 +4661,8 @@ split_irqchip_unlock:
 			kvm->arch.hlt_in_guest = true;
 		if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
 			kvm->arch.pause_in_guest = true;
+		if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
+			kvm->arch.cstate_in_guest = true;
 		r = 0;
 		break;
 	case KVM_CAP_MSR_PLATFORM_INFO:
@@ -4927,6 +4977,9 @@ set_identity_unlock:
 		r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
 		break;
 	}
+	case KVM_SET_PMU_EVENT_FILTER:
+		r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);
+		break;
 	default:
 		r = -ENOTTY;
 	}
@@ -6379,7 +6432,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
 					   vcpu->arch.db);
 
 		if (dr6 != 0) {
-			vcpu->arch.dr6 &= ~15;
+			vcpu->arch.dr6 &= ~DR_TRAP_BITS;
 			vcpu->arch.dr6 |= dr6 | DR6_RTM;
 			kvm_queue_exception(vcpu, DB_VECTOR);
 			*r = EMULATE_DONE;
@@ -6706,7 +6759,7 @@ static void kvm_hyperv_tsc_notifier(void)
 	struct kvm_vcpu *vcpu;
 	int cpu;
 
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list)
 		kvm_make_mclock_inprogress_request(kvm);
@@ -6732,7 +6785,7 @@ static void kvm_hyperv_tsc_notifier(void)
 		spin_unlock(&ka->pvclock_gtod_sync_lock);
 	}
 
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 }
 #endif
@@ -6783,17 +6836,17 @@ static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
 
 	smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
 
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		kvm_for_each_vcpu(i, vcpu, kvm) {
 			if (vcpu->cpu != cpu)
 				continue;
 			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
-			if (vcpu->cpu != smp_processor_id())
+			if (vcpu->cpu != raw_smp_processor_id())
 				send_ipi = 1;
 		}
 	}
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 
 	if (freq->old < freq->new && send_ipi) {
 		/*
@@ -6908,35 +6961,6 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = {
 	.handle_intel_pt_intr	= kvm_handle_intel_pt_intr,
 };
 
-static void kvm_set_mmio_spte_mask(void)
-{
-	u64 mask;
-	int maxphyaddr = boot_cpu_data.x86_phys_bits;
-
-	/*
-	 * Set the reserved bits and the present bit of an paging-structure
-	 * entry to generate page fault with PFER.RSV = 1.
-	 */
-
-	/*
-	 * Mask the uppermost physical address bit, which would be reserved as
-	 * long as the supported physical address width is less than 52.
-	 */
-	mask = 1ull << 51;
-
-	/* Set the present bit. */
-	mask |= 1ull;
-
-	/*
-	 * If reserved bit is not supported, clear the present bit to disable
-	 * mmio page fault.
-	 */
-	if (IS_ENABLED(CONFIG_X86_64) && maxphyaddr == 52)
-		mask &= ~1ull;
-
-	kvm_mmu_set_mmio_spte_mask(mask, mask);
-}
-
 #ifdef CONFIG_X86_64
 static void pvclock_gtod_update_fn(struct work_struct *work)
 {
@@ -6945,12 +6969,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work)
 	struct kvm_vcpu *vcpu;
 	int i;
 
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list)
 		kvm_for_each_vcpu(i, vcpu, kvm)
 			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
 	atomic_set(&kvm_guest_has_master_clock, 0);
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 }
 
 static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
@@ -7033,8 +7057,6 @@ int kvm_arch_init(void *opaque)
 	if (r)
 		goto out_free_percpu;
 
-	kvm_set_mmio_spte_mask();
-
 	kvm_x86_ops = ops;
 
 	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
@@ -7173,6 +7195,23 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
 }
 
+static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id)
+{
+	struct kvm_vcpu *target = NULL;
+	struct kvm_apic_map *map;
+
+	rcu_read_lock();
+	map = rcu_dereference(kvm->arch.apic_map);
+
+	if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
+		target = map->phys_map[dest_id]->vcpu;
+
+	rcu_read_unlock();
+
+	if (target)
+		kvm_vcpu_yield_to(target);
+}
+
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 {
 	unsigned long nr, a0, a1, a2, a3, ret;
@@ -7219,6 +7258,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 	case KVM_HC_SEND_IPI:
 		ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
 		break;
+	case KVM_HC_SCHED_YIELD:
+		kvm_sched_yield(vcpu->kvm, a0);
+		ret = 0;
+		break;
 	default:
 		ret = -KVM_ENOSYS;
 		break;
@@ -7951,9 +7994,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	}
 
 	trace_kvm_entry(vcpu->vcpu_id);
-	if (lapic_in_kernel(vcpu) &&
-	    vcpu->arch.apic->lapic_timer.timer_advance_ns)
-		wait_lapic_expire(vcpu);
 	guest_enter_irqoff();
 
 	fpregs_assert_state_consistent();
@@ -8002,13 +8042,29 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	vcpu->mode = OUTSIDE_GUEST_MODE;
 	smp_wmb();
 
-	kvm_before_interrupt(vcpu);
-	kvm_x86_ops->handle_external_intr(vcpu);
-	kvm_after_interrupt(vcpu);
+	kvm_x86_ops->handle_exit_irqoff(vcpu);
 
+	/*
+	 * Consume any pending interrupts, including the possible source of
+	 * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
+	 * An instruction is required after local_irq_enable() to fully unblock
+	 * interrupts on processors that implement an interrupt shadow, the
	 * stat.exits increment will do nicely.
+	 */
+	kvm_before_interrupt(vcpu);
+	local_irq_enable();
 	++vcpu->stat.exits;
+	local_irq_disable();
+	kvm_after_interrupt(vcpu);
 
 	guest_exit_irqoff();
+	if (lapic_in_kernel(vcpu)) {
+		s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
+		if (delta != S64_MIN) {
+			trace_kvm_wait_lapic_expire(vcpu->vcpu_id, delta);
+			vcpu->arch.apic->lapic_timer.advance_expire_delta = S64_MIN;
+		}
+	}
 
 	local_irq_enable();
 	preempt_enable();
@@ -8594,7 +8650,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 	kvm_update_cpuid(vcpu);
 
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
-	if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu)) {
+	if (is_pae_paging(vcpu)) {
 		load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
 		mmu_reset_needed = 1;
 	}
@@ -8875,6 +8931,10 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 	msr.host_initiated = true;
 	kvm_write_tsc(vcpu, &msr);
 	vcpu_put(vcpu);
+
+	/* poll control enabled by default */
+	vcpu->arch.msr_kvm_poll_control = 1;
+
 	mutex_unlock(&vcpu->mutex);
 
 	if (!kvmclock_periodic_sync)
@@ -9107,9 +9167,9 @@ void kvm_arch_hardware_unsetup(void)
 	kvm_x86_ops->hardware_unsetup();
 }
 
-void kvm_arch_check_processor_compat(void *rtn)
+int kvm_arch_check_processor_compat(void)
 {
-	kvm_x86_ops->check_processor_compatibility(rtn);
+	return kvm_x86_ops->check_processor_compatibility();
 }
 
 bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
@@ -9381,6 +9441,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	kvm_ioapic_destroy(kvm);
 	kvm_free_vcpus(kvm);
 	kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+	kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
 	kvm_mmu_uninit_vm(kvm);
 	kvm_page_track_cleanup(kvm);
 	kvm_hv_destroy_vm(kvm);
@@ -9789,6 +9850,36 @@ static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val)
 				      sizeof(u32));
 }
 
+static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
+{
+	if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
+		return false;
+
+	if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
+	    (vcpu->arch.apf.send_user_only &&
+	     kvm_x86_ops->get_cpl(vcpu) == 0))
+		return false;
+
+	return true;
+}
+
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
+{
+	if (unlikely(!lapic_in_kernel(vcpu) ||
+		     kvm_event_needs_reinjection(vcpu) ||
+		     vcpu->arch.exception.pending))
+		return false;
+
+	if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
+		return false;
+
+	/*
+	 * If interrupts are off we cannot even use an artificial
+	 * halt state.
+	 */
+	return kvm_x86_ops->interrupt_allowed(vcpu);
+}
+
 void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
 				     struct kvm_async_pf *work)
 {
@@ -9797,11 +9888,8 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
 	trace_kvm_async_pf_not_present(work->arch.token, work->gva);
 	kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
 
-	if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
-	    (vcpu->arch.apf.send_user_only &&
-	     kvm_x86_ops->get_cpl(vcpu) == 0))
-		kvm_make_request(KVM_REQ_APF_HALT, vcpu);
-	else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
+	if (kvm_can_deliver_async_pf(vcpu) &&
+	    !apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
 		fault.vector = PF_VECTOR;
 		fault.error_code_valid = true;
 		fault.error_code = 0;
@@ -9809,6 +9897,16 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
 		fault.address = work->arch.token;
 		fault.async_page_fault = true;
 		kvm_inject_page_fault(vcpu, &fault);
+	} else {
+		/*
+		 * It is not possible to deliver a paravirtualized asynchronous
+		 * page fault, but putting the guest in an artificial halt state
+		 * can be beneficial nevertheless: if an interrupt arrives, we
+		 * can deliver it timely and perhaps the guest will schedule
+		 * another process. When the instruction that triggered a page
+		 * fault is retried, hopefully the page will be ready in the host.
+		 */
+		kvm_make_request(KVM_REQ_APF_HALT, vcpu);
 	}
 }
@@ -9949,6 +10047,13 @@ bool kvm_vector_hashing_enabled(void)
 }
 EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled);
 
+bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
+{
+	return (vcpu->arch.msr_kvm_poll_control & 1) == 0;
+}
+EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
+
+
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
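
A note on the is_pae_paging() helper used in the pdptrs_changed(), kvm_set_cr3() and __set_sregs() hunks: its definition is not part of this file's diff (the same series adds it to arch/x86/kvm/x86.h). Judging purely from the open-coded condition it replaces, it presumably reads along these lines (a sketch, not the verbatim upstream definition):

/*
 * Sketch of the helper relied on above, reconstructed from the condition
 * it replaces: PAE paging is active when paging is enabled, CR4.PAE is
 * set, and the vCPU is not in long mode.
 */
static inline bool is_pae_paging(struct kvm_vcpu *vcpu)
{
	return !is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu);
}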
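
The new KVM_X86_DISABLE_EXITS_CSTATE bit is reported and consumed through the existing KVM_CAP_X86_DISABLE_EXITS capability, so userspace would turn it on with KVM_ENABLE_CAP on the VM file descriptor. A minimal sketch, assuming an already created vm_fd and leaving real error handling to the VMM:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/*
 * Sketch: enable the new KVM_X86_DISABLE_EXITS_CSTATE bit on a VM fd,
 * which makes the enable_cap hunk above set kvm->arch.cstate_in_guest.
 */
static int disable_cstate_exits(int vm_fd)
{
	struct kvm_enable_cap cap;

	/* Bail out if this kernel does not advertise the new bit. */
	if (!(ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_X86_DISABLE_EXITS) &
	      KVM_X86_DISABLE_EXITS_CSTATE))
		return -1;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_X86_DISABLE_EXITS;
	cap.args[0] = KVM_X86_DISABLE_EXITS_CSTATE;
	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}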
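
kvm_arch_no_poll() above makes the generic halt-polling code skip polling whenever the guest has cleared bit 0 of MSR_KVM_POLL_CONTROL; kvm_arch_vcpu_postcreate() leaves the bit set by default. A guest-side sketch of opting out, where KVM_FEATURE_POLL_CONTROL is assumed to be the matching CPUID feature bit exposed by the same series:

#include <asm/msr.h>
#include <asm/kvm_para.h>

/*
 * Guest-side sketch: tell the host not to poll after HLT on this vCPU,
 * for example because the guest does its own idle polling. Clearing
 * bit 0 of MSR_KVM_POLL_CONTROL makes kvm_arch_no_poll() return true.
 */
static void guest_disable_host_haltpoll(void)
{
	if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
		wrmsrl(MSR_KVM_POLL_CONTROL, 0);
}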
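
Finally, KVM_HC_SCHED_YIELD carries the destination APIC ID in the first hypercall argument, and kvm_sched_yield() resolves it through the APIC map before calling kvm_vcpu_yield_to(). A guest-side sketch of issuing the hypercall, assuming the usual kvm_hypercall1() wrapper and the x86_cpu_to_apicid per-CPU map:

#include <asm/kvm_para.h>
#include <asm/smp.h>

/*
 * Guest-side sketch: donate this vCPU's time slice to the vCPU backing
 * @cpu, e.g. before busy-waiting on work that vCPU has not finished yet.
 */
static void yield_to_vcpu_of(int cpu)
{
	kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu));
}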