diff options
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/Kconfig | 2 | ||||
-rw-r--r-- | arch/x86/kvm/cpuid.h | 8 | ||||
-rw-r--r-- | arch/x86/kvm/emulate.c | 84 | ||||
-rw-r--r-- | arch/x86/kvm/hyperv.c | 74 | ||||
-rw-r--r-- | arch/x86/kvm/hyperv.h | 3 | ||||
-rw-r--r-- | arch/x86/kvm/i8254.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 116 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 190 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.h | 5 | ||||
-rw-r--r-- | arch/x86/kvm/mmutrace.h | 6 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 248 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 315 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 61 |
14 files changed, 717 insertions, 399 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 760433b2574a..2688c7dc5323 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -22,7 +22,7 @@ config KVM depends on HAVE_KVM depends on HIGH_RES_TIMERS # for TASKSTATS/TASK_DELAY_ACCT: - depends on NET + depends on NET && MULTIUSER select PREEMPT_NOTIFIERS select MMU_NOTIFIER select ANON_INODES diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index a6fd40aade7c..da6728383052 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -144,6 +144,14 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_RTM)); } +static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_MPX)); +} + static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 80890dee66ce..fb0055953fbc 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -900,7 +900,7 @@ static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, if (rc != X86EMUL_CONTINUE) \ goto done; \ ctxt->_eip += sizeof(_type); \ - _x = *(_type __aligned(1) *) ctxt->fetch.ptr; \ + memcpy(&_x, ctxt->fetch.ptr, sizeof(_type)); \ ctxt->fetch.ptr += sizeof(_type); \ _x; \ }) @@ -3942,6 +3942,25 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt) } /* + * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but does save + * and restore MXCSR. + */ +static size_t __fxstate_size(int nregs) +{ + return offsetof(struct fxregs_state, xmm_space[0]) + nregs * 16; +} + +static inline size_t fxstate_size(struct x86_emulate_ctxt *ctxt) +{ + bool cr4_osfxsr; + if (ctxt->mode == X86EMUL_MODE_PROT64) + return __fxstate_size(16); + + cr4_osfxsr = ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR; + return __fxstate_size(cr4_osfxsr ? 8 : 0); +} + +/* * FXSAVE and FXRSTOR have 4 different formats depending on execution mode, * 1) 16 bit mode * 2) 32 bit mode @@ -3962,7 +3981,6 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt) static int em_fxsave(struct x86_emulate_ctxt *ctxt) { struct fxregs_state fx_state; - size_t size; int rc; rc = check_fxsr(ctxt); @@ -3978,68 +3996,42 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR) - size = offsetof(struct fxregs_state, xmm_space[8 * 16/4]); - else - size = offsetof(struct fxregs_state, xmm_space[0]); - - return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, size); -} - -static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt, - struct fxregs_state *new) -{ - int rc = X86EMUL_CONTINUE; - struct fxregs_state old; - - rc = asm_safe("fxsave %[fx]", , [fx] "+m"(old)); - if (rc != X86EMUL_CONTINUE) - return rc; - - /* - * 64 bit host will restore XMM 8-15, which is not correct on non-64 - * bit guests. Load the current values in order to preserve 64 bit - * XMMs after fxrstor. - */ -#ifdef CONFIG_X86_64 - /* XXX: accessing XMM 8-15 very awkwardly */ - memcpy(&new->xmm_space[8 * 16/4], &old.xmm_space[8 * 16/4], 8 * 16); -#endif - - /* - * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but - * does save and restore MXCSR. - */ - if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)) - memcpy(new->xmm_space, old.xmm_space, 8 * 16); - - return rc; + return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, + fxstate_size(ctxt)); } static int em_fxrstor(struct x86_emulate_ctxt *ctxt) { struct fxregs_state fx_state; int rc; + size_t size; rc = check_fxsr(ctxt); if (rc != X86EMUL_CONTINUE) return rc; - rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, 512); - if (rc != X86EMUL_CONTINUE) - return rc; + ctxt->ops->get_fpu(ctxt); - if (fx_state.mxcsr >> 16) - return emulate_gp(ctxt, 0); + size = fxstate_size(ctxt); + if (size < __fxstate_size(16)) { + rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); + if (rc != X86EMUL_CONTINUE) + goto out; + } - ctxt->ops->get_fpu(ctxt); + rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, size); + if (rc != X86EMUL_CONTINUE) + goto out; - if (ctxt->mode < X86EMUL_MODE_PROT64) - rc = fxrstor_fixup(ctxt, &fx_state); + if (fx_state.mxcsr >> 16) { + rc = emulate_gp(ctxt, 0); + goto out; + } if (rc == X86EMUL_CONTINUE) rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state)); +out: ctxt->ops->put_fpu(ctxt); return rc; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index ebae57ac5902..337b6d2730fa 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -106,14 +106,27 @@ static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, return 0; } -static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vcpu_id) +static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx) +{ + struct kvm_vcpu *vcpu = NULL; + int i; + + if (vpidx < KVM_MAX_VCPUS) + vcpu = kvm_get_vcpu(kvm, vpidx); + if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx) + return vcpu; + kvm_for_each_vcpu(i, vcpu, kvm) + if (vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx) + return vcpu; + return NULL; +} + +static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx) { struct kvm_vcpu *vcpu; struct kvm_vcpu_hv_synic *synic; - if (vcpu_id >= atomic_read(&kvm->online_vcpus)) - return NULL; - vcpu = kvm_get_vcpu(kvm, vcpu_id); + vcpu = get_vcpu_by_vpidx(kvm, vpidx); if (!vcpu) return NULL; synic = vcpu_to_synic(vcpu); @@ -221,7 +234,8 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, synic->version = data; break; case HV_X64_MSR_SIEFP: - if (data & HV_SYNIC_SIEFP_ENABLE) + if ((data & HV_SYNIC_SIEFP_ENABLE) && !host && + !synic->dont_zero_synic_pages) if (kvm_clear_guest(vcpu->kvm, data & PAGE_MASK, PAGE_SIZE)) { ret = 1; @@ -232,7 +246,8 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, synic_exit(synic, msr); break; case HV_X64_MSR_SIMP: - if (data & HV_SYNIC_SIMP_ENABLE) + if ((data & HV_SYNIC_SIMP_ENABLE) && !host && + !synic->dont_zero_synic_pages) if (kvm_clear_guest(vcpu->kvm, data & PAGE_MASK, PAGE_SIZE)) { ret = 1; @@ -318,11 +333,11 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) return ret; } -int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint) +int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint) { struct kvm_vcpu_hv_synic *synic; - synic = synic_get(kvm, vcpu_id); + synic = synic_get(kvm, vpidx); if (!synic) return -EINVAL; @@ -341,11 +356,11 @@ void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector) kvm_hv_notify_acked_sint(vcpu, i); } -static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vcpu_id, u32 sint, int gsi) +static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vpidx, u32 sint, int gsi) { struct kvm_vcpu_hv_synic *synic; - synic = synic_get(kvm, vcpu_id); + synic = synic_get(kvm, vpidx); if (!synic) return -EINVAL; @@ -634,9 +649,10 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu) } if ((stimer->config & HV_STIMER_ENABLE) && - stimer->count) - stimer_start(stimer); - else + stimer->count) { + if (!stimer->msg_pending) + stimer_start(stimer); + } else stimer_cleanup(stimer); } } @@ -687,14 +703,24 @@ void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) stimer_init(&hv_vcpu->stimer[i], i); } -int kvm_hv_activate_synic(struct kvm_vcpu *vcpu) +void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); + + hv_vcpu->vp_index = kvm_vcpu_get_idx(vcpu); +} + +int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages) { + struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + /* * Hyper-V SynIC auto EOI SINT's are * not compatible with APICV, so deactivate APICV */ kvm_vcpu_deactivate_apicv(vcpu); - vcpu_to_synic(vcpu)->active = true; + synic->active = true; + synic->dont_zero_synic_pages = dont_zero_synic_pages; return 0; } @@ -978,6 +1004,11 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; switch (msr) { + case HV_X64_MSR_VP_INDEX: + if (!host) + return 1; + hv->vp_index = (u32)data; + break; case HV_X64_MSR_APIC_ASSIST_PAGE: { u64 gfn; unsigned long addr; @@ -1089,18 +1120,9 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; switch (msr) { - case HV_X64_MSR_VP_INDEX: { - int r; - struct kvm_vcpu *v; - - kvm_for_each_vcpu(r, v, vcpu->kvm) { - if (v == vcpu) { - data = r; - break; - } - } + case HV_X64_MSR_VP_INDEX: + data = hv->vp_index; break; - } case HV_X64_MSR_EOI: return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); case HV_X64_MSR_ICR: diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index cd1119538add..e637631a9574 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -56,9 +56,10 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu); void kvm_hv_irq_routing_update(struct kvm *kvm); int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint); void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector); -int kvm_hv_activate_synic(struct kvm_vcpu *vcpu); +int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages); void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu); +void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu); static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index a78b445ce411..af192895b1fc 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -724,8 +724,10 @@ void kvm_free_pit(struct kvm *kvm) struct kvm_pit *pit = kvm->arch.vpit; if (pit) { + mutex_lock(&kvm->slots_lock); kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev); + mutex_unlock(&kvm->slots_lock); kvm_pit_set_reinject(pit, false); hrtimer_cancel(&pit->pit_state.timer); kthread_destroy_worker(pit->worker); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d24c8742d9b0..2819d4c123eb 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1495,6 +1495,7 @@ EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use); static void cancel_hv_timer(struct kvm_lapic *apic) { + WARN_ON(!apic->lapic_timer.hv_timer_in_use); preempt_disable(); kvm_x86_ops->cancel_hv_timer(apic->vcpu); apic->lapic_timer.hv_timer_in_use = false; @@ -1503,25 +1504,56 @@ static void cancel_hv_timer(struct kvm_lapic *apic) static bool start_hv_timer(struct kvm_lapic *apic) { - u64 tscdeadline = apic->lapic_timer.tscdeadline; + struct kvm_timer *ktimer = &apic->lapic_timer; + int r; - if ((atomic_read(&apic->lapic_timer.pending) && - !apic_lvtt_period(apic)) || - kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) { - if (apic->lapic_timer.hv_timer_in_use) - cancel_hv_timer(apic); - } else { - apic->lapic_timer.hv_timer_in_use = true; - hrtimer_cancel(&apic->lapic_timer.timer); + if (!kvm_x86_ops->set_hv_timer) + return false; + + if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending)) + return false; - /* In case the sw timer triggered in the window */ - if (atomic_read(&apic->lapic_timer.pending) && - !apic_lvtt_period(apic)) - cancel_hv_timer(apic); + r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline); + if (r < 0) + return false; + + ktimer->hv_timer_in_use = true; + hrtimer_cancel(&ktimer->timer); + + /* + * Also recheck ktimer->pending, in case the sw timer triggered in + * the window. For periodic timer, leave the hv timer running for + * simplicity, and the deadline will be recomputed on the next vmexit. + */ + if (!apic_lvtt_period(apic) && (r || atomic_read(&ktimer->pending))) { + if (r) + apic_timer_expired(apic); + return false; } - trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, - apic->lapic_timer.hv_timer_in_use); - return apic->lapic_timer.hv_timer_in_use; + + trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, true); + return true; +} + +static void start_sw_timer(struct kvm_lapic *apic) +{ + struct kvm_timer *ktimer = &apic->lapic_timer; + if (apic->lapic_timer.hv_timer_in_use) + cancel_hv_timer(apic); + if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending)) + return; + + if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) + start_sw_period(apic); + else if (apic_lvtt_tscdeadline(apic)) + start_sw_tscdeadline(apic); + trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, false); +} + +static void restart_apic_timer(struct kvm_lapic *apic) +{ + if (!start_hv_timer(apic)) + start_sw_timer(apic); } void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) @@ -1535,19 +1567,14 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) if (apic_lvtt_period(apic) && apic->lapic_timer.period) { advance_periodic_target_expiration(apic); - if (!start_hv_timer(apic)) - start_sw_period(apic); + restart_apic_timer(apic); } } EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; - - WARN_ON(apic->lapic_timer.hv_timer_in_use); - - start_hv_timer(apic); + restart_apic_timer(vcpu->arch.apic); } EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer); @@ -1556,33 +1583,28 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; /* Possibly the TSC deadline timer is not enabled yet */ - if (!apic->lapic_timer.hv_timer_in_use) - return; - - cancel_hv_timer(apic); + if (apic->lapic_timer.hv_timer_in_use) + start_sw_timer(apic); +} +EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); - if (atomic_read(&apic->lapic_timer.pending)) - return; +void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; - if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) - start_sw_period(apic); - else if (apic_lvtt_tscdeadline(apic)) - start_sw_tscdeadline(apic); + WARN_ON(!apic->lapic_timer.hv_timer_in_use); + restart_apic_timer(apic); } -EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); static void start_apic_timer(struct kvm_lapic *apic) { atomic_set(&apic->lapic_timer.pending, 0); - if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { - if (set_target_expiration(apic) && - !(kvm_x86_ops->set_hv_timer && start_hv_timer(apic))) - start_sw_period(apic); - } else if (apic_lvtt_tscdeadline(apic)) { - if (!(kvm_x86_ops->set_hv_timer && start_hv_timer(apic))) - start_sw_tscdeadline(apic); - } + if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) + && !set_target_expiration(apic)) + return; + + restart_apic_timer(apic); } static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) @@ -1813,16 +1835,6 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) * LAPIC interface *---------------------------------------------------------------------- */ -u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (!lapic_in_kernel(vcpu)) - return 0; - - return apic->lapic_timer.tscdeadline; -} - u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index bcbe811f3b97..29caa2c3dff9 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -87,7 +87,6 @@ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); -u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); @@ -216,4 +215,5 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu); void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu); void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu); bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu); +void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index cb8225969255..9b1dd114956a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -46,6 +46,7 @@ #include <asm/io.h> #include <asm/vmx.h> #include <asm/kvm_page_track.h> +#include "trace.h" /* * When setting this variable to true it enables Two-Dimensional-Paging @@ -183,13 +184,13 @@ static u64 __read_mostly shadow_user_mask; static u64 __read_mostly shadow_accessed_mask; static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; +static u64 __read_mostly shadow_mmio_value; static u64 __read_mostly shadow_present_mask; /* - * The mask/value to distinguish a PTE that has been marked not-present for - * access tracking purposes. - * The mask would be either 0 if access tracking is disabled, or - * SPTE_SPECIAL_MASK|VMX_EPT_RWX_MASK if access tracking is enabled. + * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value. + * Non-present SPTEs with shadow_acc_track_value set are in place for access + * tracking. */ static u64 __read_mostly shadow_acc_track_mask; static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK; @@ -207,16 +208,40 @@ static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIF static void mmu_spte_set(u64 *sptep, u64 spte); static void mmu_free_roots(struct kvm_vcpu *vcpu); -void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) +void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value) { + BUG_ON((mmio_mask & mmio_value) != mmio_value); + shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK; shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) +{ + return sp->role.ad_disabled; +} + +static inline bool spte_ad_enabled(u64 spte) +{ + MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + return !(spte & shadow_acc_track_value); +} + +static inline u64 spte_shadow_accessed_mask(u64 spte) +{ + MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; +} + +static inline u64 spte_shadow_dirty_mask(u64 spte) +{ + MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; +} + static inline bool is_access_track_spte(u64 spte) { - /* Always false if shadow_acc_track_mask is zero. */ - return (spte & shadow_acc_track_mask) == shadow_acc_track_value; + return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; } /* @@ -270,7 +295,7 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, u64 mask = generation_mmio_spte_mask(gen); access &= ACC_WRITE_MASK | ACC_USER_MASK; - mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT; + mask |= shadow_mmio_value | access | gfn << PAGE_SHIFT; trace_mark_mmio_spte(sptep, gfn, access, gen); mmu_spte_set(sptep, mask); @@ -278,7 +303,7 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, static bool is_mmio_spte(u64 spte) { - return (spte & shadow_mmio_mask) == shadow_mmio_mask; + return (spte & shadow_mmio_mask) == shadow_mmio_value; } static gfn_t get_mmio_spte_gfn(u64 spte) @@ -315,12 +340,20 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) return likely(kvm_gen == spte_gen); } +/* + * Sets the shadow PTE masks used by the MMU. + * + * Assumptions: + * - Setting either @accessed_mask or @dirty_mask requires setting both + * - At least one of @accessed_mask or @acc_track_mask must be set + */ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, u64 acc_track_mask) { - if (acc_track_mask != 0) - acc_track_mask |= SPTE_SPECIAL_MASK; + BUG_ON(!dirty_mask != !accessed_mask); + BUG_ON(!accessed_mask && !acc_track_mask); + BUG_ON(acc_track_mask & shadow_acc_track_value); shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; @@ -329,7 +362,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, shadow_x_mask = x_mask; shadow_present_mask = p_mask; shadow_acc_track_mask = acc_track_mask; - WARN_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0); } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); @@ -549,7 +581,7 @@ static bool spte_has_volatile_bits(u64 spte) is_access_track_spte(spte)) return true; - if (shadow_accessed_mask) { + if (spte_ad_enabled(spte)) { if ((spte & shadow_accessed_mask) == 0 || (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0)) return true; @@ -560,14 +592,17 @@ static bool spte_has_volatile_bits(u64 spte) static bool is_accessed_spte(u64 spte) { - return shadow_accessed_mask ? spte & shadow_accessed_mask - : !is_access_track_spte(spte); + u64 accessed_mask = spte_shadow_accessed_mask(spte); + + return accessed_mask ? spte & accessed_mask + : !is_access_track_spte(spte); } static bool is_dirty_spte(u64 spte) { - return shadow_dirty_mask ? spte & shadow_dirty_mask - : spte & PT_WRITABLE_MASK; + u64 dirty_mask = spte_shadow_dirty_mask(spte); + + return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK; } /* Rules for using mmu_spte_set: @@ -707,10 +742,10 @@ static u64 mmu_spte_get_lockless(u64 *sptep) static u64 mark_spte_for_access_track(u64 spte) { - if (shadow_accessed_mask != 0) + if (spte_ad_enabled(spte)) return spte & ~shadow_accessed_mask; - if (shadow_acc_track_mask == 0 || is_access_track_spte(spte)) + if (is_access_track_spte(spte)) return spte; /* @@ -729,7 +764,6 @@ static u64 mark_spte_for_access_track(u64 spte) spte |= (spte & shadow_acc_track_saved_bits_mask) << shadow_acc_track_saved_bits_shift; spte &= ~shadow_acc_track_mask; - spte |= shadow_acc_track_value; return spte; } @@ -741,6 +775,7 @@ static u64 restore_acc_track_spte(u64 spte) u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift) & shadow_acc_track_saved_bits_mask; + WARN_ON_ONCE(spte_ad_enabled(spte)); WARN_ON_ONCE(!is_access_track_spte(spte)); new_spte &= ~shadow_acc_track_mask; @@ -759,7 +794,7 @@ static bool mmu_spte_age(u64 *sptep) if (!is_accessed_spte(spte)) return false; - if (shadow_accessed_mask) { + if (spte_ad_enabled(spte)) { clear_bit((ffs(shadow_accessed_mask) - 1), (unsigned long *)sptep); } else { @@ -1390,6 +1425,22 @@ static bool spte_clear_dirty(u64 *sptep) return mmu_spte_update(sptep, spte); } +static bool wrprot_ad_disabled_spte(u64 *sptep) +{ + bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT, + (unsigned long *)sptep); + if (was_writable) + kvm_set_pfn_dirty(spte_to_pfn(*sptep)); + + return was_writable; +} + +/* + * Gets the GFN ready for another round of dirty logging by clearing the + * - D bit on ad-enabled SPTEs, and + * - W bit on ad-disabled SPTEs. + * Returns true iff any D or W bits were cleared. + */ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { u64 *sptep; @@ -1397,7 +1448,10 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - flush |= spte_clear_dirty(sptep); + if (spte_ad_enabled(*sptep)) + flush |= spte_clear_dirty(sptep); + else + flush |= wrprot_ad_disabled_spte(sptep); return flush; } @@ -1420,7 +1474,8 @@ static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - flush |= spte_set_dirty(sptep); + if (spte_ad_enabled(*sptep)) + flush |= spte_set_dirty(sptep); return flush; } @@ -1452,7 +1507,8 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, } /** - * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages + * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write + * protect the page if the D-bit isn't supported. * @kvm: kvm instance * @slot: slot to clear D-bit * @gfn_offset: start of the BITS_PER_LONG pages we care about @@ -1766,18 +1822,9 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, u64 *sptep; struct rmap_iterator iter; - /* - * If there's no access bit in the secondary pte set by the hardware and - * fast access tracking is also not enabled, it's up to gup-fast/gup to - * set the access bit in the primary pte or in the page structure. - */ - if (!shadow_accessed_mask && !shadow_acc_track_mask) - goto out; - for_each_rmap_spte(rmap_head, &iter, sptep) if (is_accessed_spte(*sptep)) return 1; -out: return 0; } @@ -1798,18 +1845,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) { - /* - * In case of absence of EPT Access and Dirty Bits supports, - * emulate the accessed bit for EPT, by checking if this page has - * an EPT mapping, and clearing it if it does. On the next access, - * a new EPT mapping will be established. - * This has some overhead, but not as much as the cost of swapping - * out actively used pages or breaking up actively used hugepages. - */ - if (!shadow_accessed_mask && !shadow_acc_track_mask) - return kvm_handle_hva_range(kvm, start, end, 0, - kvm_unmap_rmapp); - return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); } @@ -2398,7 +2433,12 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK | - shadow_user_mask | shadow_x_mask | shadow_accessed_mask; + shadow_user_mask | shadow_x_mask; + + if (sp_ad_disabled(sp)) + spte |= shadow_acc_track_value; + else + spte |= shadow_accessed_mask; mmu_spte_set(sptep, spte); @@ -2666,10 +2706,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, { u64 spte = 0; int ret = 0; + struct kvm_mmu_page *sp; if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) return 0; + sp = page_header(__pa(sptep)); + if (sp_ad_disabled(sp)) + spte |= shadow_acc_track_value; + /* * For the EPT case, shadow_present_mask is 0 if hardware * supports exec-only page table entries. In that case, @@ -2678,7 +2723,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, */ spte |= shadow_present_mask; if (!speculative) - spte |= shadow_accessed_mask; + spte |= spte_shadow_accessed_mask(spte); if (pte_access & ACC_EXEC_MASK) spte |= shadow_x_mask; @@ -2735,7 +2780,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (pte_access & ACC_WRITE_MASK) { kvm_vcpu_mark_page_dirty(vcpu, gfn); - spte |= shadow_dirty_mask; + spte |= spte_shadow_dirty_mask(spte); } if (speculative) @@ -2877,16 +2922,16 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) { struct kvm_mmu_page *sp; + sp = page_header(__pa(sptep)); + /* - * Since it's no accessed bit on EPT, it's no way to - * distinguish between actually accessed translations - * and prefetched, so disable pte prefetch if EPT is - * enabled. + * Without accessed bits, there's no way to distinguish between + * actually accessed translations and prefetched, so disable pte + * prefetch if accessed bits aren't available. */ - if (!shadow_accessed_mask) + if (sp_ad_disabled(sp)) return; - sp = page_header(__pa(sptep)); if (sp->role.level > PT_PAGE_TABLE_LEVEL) return; @@ -3704,7 +3749,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) kvm_event_needs_reinjection(vcpu))) return false; - if (is_guest_mode(vcpu)) + if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu)) return false; return kvm_x86_ops->interrupt_allowed(vcpu); @@ -3736,6 +3781,38 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, return false; } +int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, + u64 fault_address, char *insn, int insn_len, + bool need_unprotect) +{ + int r = 1; + + switch (vcpu->arch.apf.host_apf_reason) { + default: + trace_kvm_page_fault(fault_address, error_code); + + if (need_unprotect && kvm_event_needs_reinjection(vcpu)) + kvm_mmu_unprotect_page_virt(vcpu, fault_address); + r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn, + insn_len); + break; + case KVM_PV_REASON_PAGE_NOT_PRESENT: + vcpu->arch.apf.host_apf_reason = 0; + local_irq_disable(); + kvm_async_pf_task_wait(fault_address); + local_irq_enable(); + break; + case KVM_PV_REASON_PAGE_READY: + vcpu->arch.apf.host_apf_reason = 0; + local_irq_disable(); + kvm_async_pf_task_wake(fault_address); + local_irq_enable(); + break; + } + return r; +} +EXPORT_SYMBOL_GPL(kvm_handle_page_fault); + static bool check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level) { @@ -4290,6 +4367,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->base_role.word = 0; context->base_role.smm = is_smm(vcpu); + context->base_role.ad_disabled = (shadow_accessed_mask == 0); context->page_fault = tdp_page_fault; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; @@ -4377,6 +4455,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->root_level = context->shadow_root_level; context->root_hpa = INVALID_PAGE; context->direct_map = false; + context->base_role.ad_disabled = !accessed_dirty; update_permission_bitmask(vcpu, context, true); update_pkru_bitmask(vcpu, context, true); @@ -4636,6 +4715,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, mask.smep_andnot_wp = 1; mask.smap_andnot_wp = 1; mask.smm = 1; + mask.ad_disabled = 1; /* * If we don't have indirect shadow pages, it means no page is diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 330bf3a811fb..d7d248a000dd 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -51,7 +51,7 @@ static inline u64 rsvd_bits(int s, int e) return ((1ULL << (e - s + 1)) - 1) << s; } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); +void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value); void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); @@ -77,6 +77,9 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, bool accessed_dirty); bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); +int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, + u64 fault_address, char *insn, int insn_len, + bool need_unprotect); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 5a24b846a1cb..8b97a6cba8d1 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -30,8 +30,9 @@ \ role.word = __entry->role; \ \ - trace_seq_printf(p, "sp gen %lx gfn %llx %u%s q%u%s %s%s" \ - " %snxe root %u %s%c", __entry->mmu_valid_gen, \ + trace_seq_printf(p, "sp gen %lx gfn %llx l%u%s q%u%s %s%s" \ + " %snxe %sad root %u %s%c", \ + __entry->mmu_valid_gen, \ __entry->gfn, role.level, \ role.cr4_pae ? " pae" : "", \ role.quadrant, \ @@ -39,6 +40,7 @@ access_str[role.access], \ role.invalid ? " invalid" : "", \ role.nxe ? "" : "!", \ + role.ad_disabled ? "!" : "", \ __entry->root_count, \ __entry->unsync ? "unsync" : "sync", 0); \ saved_ptr; \ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ba9891ac5c56..4d8141e533c3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -36,6 +36,7 @@ #include <linux/slab.h> #include <linux/amd-iommu.h> #include <linux/hashtable.h> +#include <linux/frame.h> #include <asm/apic.h> #include <asm/perf_event.h> @@ -189,10 +190,10 @@ struct vcpu_svm { struct nested_state nested; bool nmi_singlestep; + u64 nmi_singlestep_guest_rflags; unsigned int3_injected; unsigned long int3_rip; - u32 apf_reason; /* cached guest cpuid flags for faster access */ bool nrips_enabled : 1; @@ -275,6 +276,10 @@ static int avic; module_param(avic, int, S_IRUGO); #endif +/* enable/disable Virtual VMLOAD VMSAVE */ +static int vls = true; +module_param(vls, int, 0444); + /* AVIC VM ID bit masks and lock */ static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR); static DEFINE_SPINLOCK(avic_vm_id_lock); @@ -631,11 +636,13 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) svm_set_interrupt_shadow(vcpu, 0); } -static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code, - bool reinject) +static void svm_queue_exception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + unsigned nr = vcpu->arch.exception.nr; + bool has_error_code = vcpu->arch.exception.has_error_code; + bool reinject = vcpu->arch.exception.reinject; + u32 error_code = vcpu->arch.exception.error_code; /* * If we are within a nested VM we'd better #VMEXIT and let the guest @@ -945,7 +952,7 @@ static void svm_enable_lbrv(struct vcpu_svm *svm) { u32 *msrpm = svm->msrpm; - svm->vmcb->control.lbr_ctl = 1; + svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); @@ -956,13 +963,25 @@ static void svm_disable_lbrv(struct vcpu_svm *svm) { u32 *msrpm = svm->msrpm; - svm->vmcb->control.lbr_ctl = 0; + svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); } +static void disable_nmi_singlestep(struct vcpu_svm *svm) +{ + svm->nmi_singlestep = false; + if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) { + /* Clear our flags if they were not set by the guest */ + if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF)) + svm->vmcb->save.rflags &= ~X86_EFLAGS_TF; + if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF)) + svm->vmcb->save.rflags &= ~X86_EFLAGS_RF; + } +} + /* Note: * This hash table is used to map VM_ID to a struct kvm_arch, * when handling AMD IOMMU GALOG notification to schedule in @@ -1079,6 +1098,16 @@ static __init int svm_hardware_setup(void) } } + if (vls) { + if (!npt_enabled || + !boot_cpu_has(X86_FEATURE_VIRTUAL_VMLOAD_VMSAVE) || + !IS_ENABLED(CONFIG_X86_64)) { + vls = false; + } else { + pr_info("Virtual VMLOAD VMSAVE supported\n"); + } + } + return 0; err: @@ -1266,6 +1295,16 @@ static void init_vmcb(struct vcpu_svm *svm) if (avic) avic_init_vmcb(svm); + /* + * If hardware supports Virtual VMLOAD VMSAVE then enable it + * in VMCB and clear intercepts to avoid #VMEXIT. + */ + if (vls) { + clr_intercept(svm, INTERCEPT_VMLOAD); + clr_intercept(svm, INTERCEPT_VMSAVE); + svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + } + mark_all_dirty(svm->vmcb); enable_gif(svm); @@ -1712,11 +1751,24 @@ static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu) static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) { - return to_svm(vcpu)->vmcb->save.rflags; + struct vcpu_svm *svm = to_svm(vcpu); + unsigned long rflags = svm->vmcb->save.rflags; + + if (svm->nmi_singlestep) { + /* Hide our flags if they were not set by the guest */ + if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF)) + rflags &= ~X86_EFLAGS_TF; + if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF)) + rflags &= ~X86_EFLAGS_RF; + } + return rflags; } static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { + if (to_svm(vcpu)->nmi_singlestep) + rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); + /* * Any change of EFLAGS.VM is accompanied by a reload of SS * (caused by either a task switch or an inter-privilege IRET), @@ -2069,34 +2121,11 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) static int pf_interception(struct vcpu_svm *svm) { u64 fault_address = svm->vmcb->control.exit_info_2; - u64 error_code; - int r = 1; - - switch (svm->apf_reason) { - default: - error_code = svm->vmcb->control.exit_info_1; + u64 error_code = svm->vmcb->control.exit_info_1; - trace_kvm_page_fault(fault_address, error_code); - if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) - kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); - r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code, + return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address, svm->vmcb->control.insn_bytes, - svm->vmcb->control.insn_len); - break; - case KVM_PV_REASON_PAGE_NOT_PRESENT: - svm->apf_reason = 0; - local_irq_disable(); - kvm_async_pf_task_wait(fault_address); - local_irq_enable(); - break; - case KVM_PV_REASON_PAGE_READY: - svm->apf_reason = 0; - local_irq_disable(); - kvm_async_pf_task_wake(fault_address); - local_irq_enable(); - break; - } - return r; + svm->vmcb->control.insn_len, !npt_enabled); } static int db_interception(struct vcpu_svm *svm) @@ -2111,10 +2140,7 @@ static int db_interception(struct vcpu_svm *svm) } if (svm->nmi_singlestep) { - svm->nmi_singlestep = false; - if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) - svm->vmcb->save.rflags &= - ~(X86_EFLAGS_TF | X86_EFLAGS_RF); + disable_nmi_singlestep(svm); } if (svm->vcpu.guest_debug & @@ -2243,7 +2269,7 @@ static int io_interception(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ - int size, in, string; + int size, in, string, ret; unsigned port; ++svm->vcpu.stat.io_exits; @@ -2255,10 +2281,16 @@ static int io_interception(struct vcpu_svm *svm) port = io_info >> 16; size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; svm->next_rip = svm->vmcb->control.exit_info_2; - skip_emulated_instruction(&svm->vcpu); + ret = kvm_skip_emulated_instruction(&svm->vcpu); - return in ? kvm_fast_pio_in(vcpu, size, port) - : kvm_fast_pio_out(vcpu, size, port); + /* + * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered + * KVM_EXIT_DEBUG here. + */ + if (in) + return kvm_fast_pio_in(vcpu, size, port) && ret; + else + return kvm_fast_pio_out(vcpu, size, port) && ret; } static int nmi_interception(struct vcpu_svm *svm) @@ -2369,8 +2401,8 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) static int nested_svm_check_permissions(struct vcpu_svm *svm) { - if (!(svm->vcpu.arch.efer & EFER_SVME) - || !is_paging(&svm->vcpu)) { + if (!(svm->vcpu.arch.efer & EFER_SVME) || + !is_paging(&svm->vcpu)) { kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; } @@ -2380,7 +2412,7 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm) return 1; } - return 0; + return 0; } static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, @@ -2391,15 +2423,19 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, if (!is_guest_mode(&svm->vcpu)) return 0; + vmexit = nested_svm_intercept(svm); + if (vmexit != NESTED_EXIT_DONE) + return 0; + svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; svm->vmcb->control.exit_code_hi = 0; svm->vmcb->control.exit_info_1 = error_code; - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; - - vmexit = nested_svm_intercept(svm); - if (vmexit == NESTED_EXIT_DONE) - svm->nested.exit_required = true; + if (svm->vcpu.arch.exception.nested_apf) + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token; + else + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; + svm->nested.exit_required = true; return vmexit; } @@ -2533,6 +2569,31 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; } +/* DB exceptions for our internal use must not cause vmexit */ +static int nested_svm_intercept_db(struct vcpu_svm *svm) +{ + unsigned long dr6; + + /* if we're not singlestepping, it's not ours */ + if (!svm->nmi_singlestep) + return NESTED_EXIT_DONE; + + /* if it's not a singlestep exception, it's not ours */ + if (kvm_get_dr(&svm->vcpu, 6, &dr6)) + return NESTED_EXIT_DONE; + if (!(dr6 & DR6_BS)) + return NESTED_EXIT_DONE; + + /* if the guest is singlestepping, it should get the vmexit */ + if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) { + disable_nmi_singlestep(svm); + return NESTED_EXIT_DONE; + } + + /* it's ours, the nested hypervisor must not see this one */ + return NESTED_EXIT_HOST; +} + static int nested_svm_exit_special(struct vcpu_svm *svm) { u32 exit_code = svm->vmcb->control.exit_code; @@ -2549,7 +2610,7 @@ static int nested_svm_exit_special(struct vcpu_svm *svm) break; case SVM_EXIT_EXCP_BASE + PF_VECTOR: /* When we're shadowing, trap PFs, but not async PF */ - if (!npt_enabled && svm->apf_reason == 0) + if (!npt_enabled && svm->vcpu.arch.apf.host_apf_reason == 0) return NESTED_EXIT_HOST; break; default: @@ -2588,11 +2649,15 @@ static int nested_svm_intercept(struct vcpu_svm *svm) } case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); - if (svm->nested.intercept_exceptions & excp_bits) - vmexit = NESTED_EXIT_DONE; + if (svm->nested.intercept_exceptions & excp_bits) { + if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR) + vmexit = nested_svm_intercept_db(svm); + else + vmexit = NESTED_EXIT_DONE; + } /* async page fault always cause vmexit */ else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) && - svm->apf_reason != 0) + svm->vcpu.arch.exception.nested_apf != 0) vmexit = NESTED_EXIT_DONE; break; } @@ -2649,7 +2714,7 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr dst->event_inj = from->event_inj; dst->event_inj_err = from->event_inj_err; dst->nested_cr3 = from->nested_cr3; - dst->lbr_ctl = from->lbr_ctl; + dst->virt_ext = from->virt_ext; } static int nested_svm_vmexit(struct vcpu_svm *svm) @@ -2955,7 +3020,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) /* We don't want to see VMMCALLs from a nested guest */ clr_intercept(svm, INTERCEPT_VMMCALL); - svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl; + svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext; svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; svm->vmcb->control.int_state = nested_vmcb->control.int_state; svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; @@ -3002,6 +3067,7 @@ static int vmload_interception(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; struct page *page; + int ret; if (nested_svm_check_permissions(svm)) return 1; @@ -3011,18 +3077,19 @@ static int vmload_interception(struct vcpu_svm *svm) return 1; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); + ret = kvm_skip_emulated_instruction(&svm->vcpu); nested_svm_vmloadsave(nested_vmcb, svm->vmcb); nested_svm_unmap(page); - return 1; + return ret; } static int vmsave_interception(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; struct page *page; + int ret; if (nested_svm_check_permissions(svm)) return 1; @@ -3032,12 +3099,12 @@ static int vmsave_interception(struct vcpu_svm *svm) return 1; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); + ret = kvm_skip_emulated_instruction(&svm->vcpu); nested_svm_vmloadsave(svm->vmcb, nested_vmcb); nested_svm_unmap(page); - return 1; + return ret; } static int vmrun_interception(struct vcpu_svm *svm) @@ -3070,25 +3137,29 @@ failed: static int stgi_interception(struct vcpu_svm *svm) { + int ret; + if (nested_svm_check_permissions(svm)) return 1; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); + ret = kvm_skip_emulated_instruction(&svm->vcpu); kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); enable_gif(svm); - return 1; + return ret; } static int clgi_interception(struct vcpu_svm *svm) { + int ret; + if (nested_svm_check_permissions(svm)) return 1; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); + ret = kvm_skip_emulated_instruction(&svm->vcpu); disable_gif(svm); @@ -3099,7 +3170,7 @@ static int clgi_interception(struct vcpu_svm *svm) mark_dirty(svm->vmcb, VMCB_INTR); } - return 1; + return ret; } static int invlpga_interception(struct vcpu_svm *svm) @@ -3113,8 +3184,7 @@ static int invlpga_interception(struct vcpu_svm *svm) kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); - return 1; + return kvm_skip_emulated_instruction(&svm->vcpu); } static int skinit_interception(struct vcpu_svm *svm) @@ -3137,7 +3207,7 @@ static int xsetbv_interception(struct vcpu_svm *svm) if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) { svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); + return kvm_skip_emulated_instruction(&svm->vcpu); } return 1; @@ -3233,8 +3303,7 @@ static int invlpg_interception(struct vcpu_svm *svm) return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1); - skip_emulated_instruction(&svm->vcpu); - return 1; + return kvm_skip_emulated_instruction(&svm->vcpu); } static int emulate_on_interception(struct vcpu_svm *svm) @@ -3384,9 +3453,7 @@ static int dr_interception(struct vcpu_svm *svm) kvm_register_write(&svm->vcpu, reg, val); } - skip_emulated_instruction(&svm->vcpu); - - return 1; + return kvm_skip_emulated_instruction(&svm->vcpu); } static int cr8_write_interception(struct vcpu_svm *svm) @@ -3509,6 +3576,7 @@ static int rdmsr_interception(struct vcpu_svm *svm) if (svm_get_msr(&svm->vcpu, &msr_info)) { trace_kvm_msr_read_ex(ecx); kvm_inject_gp(&svm->vcpu, 0); + return 1; } else { trace_kvm_msr_read(ecx, msr_info.data); @@ -3517,9 +3585,8 @@ static int rdmsr_interception(struct vcpu_svm *svm) kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, msr_info.data >> 32); svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - skip_emulated_instruction(&svm->vcpu); + return kvm_skip_emulated_instruction(&svm->vcpu); } - return 1; } static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) @@ -3645,11 +3712,11 @@ static int wrmsr_interception(struct vcpu_svm *svm) if (kvm_set_msr(&svm->vcpu, &msr)) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(&svm->vcpu, 0); + return 1; } else { trace_kvm_msr_write(ecx, data); - skip_emulated_instruction(&svm->vcpu); + return kvm_skip_emulated_instruction(&svm->vcpu); } - return 1; } static int msr_interception(struct vcpu_svm *svm) @@ -3678,8 +3745,7 @@ static int pause_interception(struct vcpu_svm *svm) static int nop_interception(struct vcpu_svm *svm) { - skip_emulated_instruction(&(svm->vcpu)); - return 1; + return kvm_skip_emulated_instruction(&(svm->vcpu)); } static int monitor_interception(struct vcpu_svm *svm) @@ -4064,7 +4130,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar); pr_err("%-20s%08x\n", "event_inj:", control->event_inj); pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); - pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl); + pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext); pr_err("%-20s%016llx\n", "next_rip:", control->next_rip); pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page); pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); @@ -4626,10 +4692,17 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) == HF_NMI_MASK) return; /* IRET will cause a vm exit */ + if ((svm->vcpu.arch.hflags & HF_GIF_MASK) == 0) + return; /* STGI will cause a vm exit */ + + if (svm->nested.exit_required) + return; /* we're not going to run the guest yet */ + /* * Something prevents NMI from been injected. Single step over possible * problem (IRET or exception injection or interrupt shadow) */ + svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu); svm->nmi_singlestep = true; svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); } @@ -4770,6 +4843,22 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(svm->nested.exit_required)) return; + /* + * Disable singlestep if we're injecting an interrupt/exception. + * We don't want our modified rflags to be pushed on the stack where + * we might not be able to easily reset them if we disabled NMI + * singlestep later. + */ + if (svm->nmi_singlestep && svm->vmcb->control.event_inj) { + /* + * Event injection happens before external interrupts cause a + * vmexit and interrupts are disabled here, so smp_send_reschedule + * is enough to force an immediate vmexit. + */ + disable_nmi_singlestep(svm); + smp_send_reschedule(vcpu->cpu); + } + pre_svm_run(svm); sync_lapic_to_cr8(vcpu); @@ -4889,7 +4978,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) /* if exit due to PF check for async PF */ if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) - svm->apf_reason = kvm_read_and_reset_pf_reason(); + svm->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); if (npt_enabled) { vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); @@ -4906,6 +4995,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) mark_all_clean(svm->vmcb); } +STACK_FRAME_NON_STANDARD(svm_vcpu_run); static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) { diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ca5d2b93385c..29fd8af5c347 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -33,6 +33,7 @@ #include <linux/slab.h> #include <linux/tboot.h> #include <linux/hrtimer.h> +#include <linux/frame.h> #include "kvm_cache_regs.h" #include "x86.h" @@ -48,6 +49,7 @@ #include <asm/kexec.h> #include <asm/apic.h> #include <asm/irq_remapping.h> +#include <asm/mmu_context.h> #include "trace.h" #include "pmu.h" @@ -196,7 +198,8 @@ struct loaded_vmcs { struct vmcs *vmcs; struct vmcs *shadow_vmcs; int cpu; - int launched; + bool launched; + bool nmi_known_unmasked; struct list_head loaded_vmcss_on_cpu_link; }; @@ -596,6 +599,7 @@ struct vcpu_vmx { int gs_ldt_reload_needed; int fs_reload_needed; u64 msr_host_bndcfgs; + unsigned long vmcs_host_cr3; /* May not match real cr3 */ unsigned long vmcs_host_cr4; /* May not match real cr4 */ } host_state; struct { @@ -910,8 +914,9 @@ static void nested_release_page_clean(struct page *page) kvm_release_page_clean(page); } +static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu); static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); -static u64 construct_eptp(unsigned long root_hpa); +static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); static bool vmx_xsaves_supported(void); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, @@ -2322,6 +2327,11 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) __vmx_load_host_state(to_vmx(vcpu)); } +static bool emulation_required(struct kvm_vcpu *vcpu) +{ + return emulate_invalid_guest_state && !guest_state_valid(vcpu); +} + static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); /* @@ -2359,6 +2369,8 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { + unsigned long old_rflags = vmx_get_rflags(vcpu); + __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); to_vmx(vcpu)->rflags = rflags; if (to_vmx(vcpu)->rmode.vm86_active) { @@ -2366,6 +2378,9 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; } vmcs_writel(GUEST_RFLAGS, rflags); + + if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM) + to_vmx(vcpu)->emulation_required = emulation_required(vcpu); } static u32 vmx_get_pkru(struct kvm_vcpu *vcpu) @@ -2418,28 +2433,41 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) * KVM wants to inject page-faults which it got to the guest. This function * checks whether in a nested guest, we need to inject them to L1 or L2. */ -static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr) +static int nested_vmx_check_exception(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned int nr = vcpu->arch.exception.nr; - if (!(vmcs12->exception_bitmap & (1u << nr))) + if (!((vmcs12->exception_bitmap & (1u << nr)) || + (nr == PF_VECTOR && vcpu->arch.exception.nested_apf))) return 0; + if (vcpu->arch.exception.nested_apf) { + vmcs_write32(VM_EXIT_INTR_ERROR_CODE, vcpu->arch.exception.error_code); + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, + PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | + INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK, + vcpu->arch.apf.nested_apf_token); + return 1; + } + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, vmcs_read32(VM_EXIT_INTR_INFO), vmcs_readl(EXIT_QUALIFICATION)); return 1; } -static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code, - bool reinject) +static void vmx_queue_exception(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned nr = vcpu->arch.exception.nr; + bool has_error_code = vcpu->arch.exception.has_error_code; + bool reinject = vcpu->arch.exception.reinject; + u32 error_code = vcpu->arch.exception.error_code; u32 intr_info = nr | INTR_INFO_VALID_MASK; if (!reinject && is_guest_mode(vcpu) && - nested_vmx_check_exception(vcpu, nr)) + nested_vmx_check_exception(vcpu)) return; if (has_error_code) { @@ -2769,7 +2797,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) if (enable_ept_ad_bits) { vmx->nested.nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_PML; - vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT; + vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT; } } else vmx->nested.nested_vmx_ept_caps = 0; @@ -3195,7 +3223,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); break; case MSR_IA32_BNDCFGS: - if (!kvm_mpx_supported()) + if (!kvm_mpx_supported() || + (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu))) return 1; msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; @@ -3277,7 +3306,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_BNDCFGS: - if (!kvm_mpx_supported()) + if (!kvm_mpx_supported() || + (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu))) + return 1; + if (is_noncanonical_address(data & PAGE_MASK) || + (data & MSR_IA32_BNDCFGS_RSVD)) return 1; vmcs_write64(GUEST_BNDCFGS, data); break; @@ -3755,6 +3788,25 @@ static void free_kvm_area(void) } } +enum vmcs_field_type { + VMCS_FIELD_TYPE_U16 = 0, + VMCS_FIELD_TYPE_U64 = 1, + VMCS_FIELD_TYPE_U32 = 2, + VMCS_FIELD_TYPE_NATURAL_WIDTH = 3 +}; + +static inline int vmcs_field_type(unsigned long field) +{ + if (0x1 & field) /* the *_HIGH fields are all 32 bit */ + return VMCS_FIELD_TYPE_U32; + return (field >> 13) & 0x3 ; +} + +static inline int vmcs_field_readonly(unsigned long field) +{ + return (((field >> 10) & 0x3) == 1); +} + static void init_vmcs_shadow_fields(void) { int i, j; @@ -3780,14 +3832,22 @@ static void init_vmcs_shadow_fields(void) /* shadowed fields guest access without vmexit */ for (i = 0; i < max_shadow_read_write_fields; i++) { - clear_bit(shadow_read_write_fields[i], - vmx_vmwrite_bitmap); - clear_bit(shadow_read_write_fields[i], - vmx_vmread_bitmap); + unsigned long field = shadow_read_write_fields[i]; + + clear_bit(field, vmx_vmwrite_bitmap); + clear_bit(field, vmx_vmread_bitmap); + if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64) { + clear_bit(field + 1, vmx_vmwrite_bitmap); + clear_bit(field + 1, vmx_vmread_bitmap); + } + } + for (i = 0; i < max_shadow_read_only_fields; i++) { + unsigned long field = shadow_read_only_fields[i]; + + clear_bit(field, vmx_vmread_bitmap); + if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64) + clear_bit(field + 1, vmx_vmread_bitmap); } - for (i = 0; i < max_shadow_read_only_fields; i++) - clear_bit(shadow_read_only_fields[i], - vmx_vmread_bitmap); } static __init int alloc_kvm_area(void) @@ -3808,11 +3868,6 @@ static __init int alloc_kvm_area(void) return 0; } -static bool emulation_required(struct kvm_vcpu *vcpu) -{ - return emulate_invalid_guest_state && !guest_state_valid(vcpu); -} - static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save) { @@ -4010,7 +4065,7 @@ static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid) if (enable_ept) { if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; - ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); + ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa)); } else { vpid_sync_context(vpid); } @@ -4185,14 +4240,15 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmx->emulation_required = emulation_required(vcpu); } -static u64 construct_eptp(unsigned long root_hpa) +static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) { u64 eptp; /* TODO write the value reading from MSR */ eptp = VMX_EPT_DEFAULT_MT | VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; - if (enable_ept_ad_bits) + if (enable_ept_ad_bits && + (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) eptp |= VMX_EPT_AD_ENABLE_BIT; eptp |= (root_hpa & PAGE_MASK); @@ -4206,7 +4262,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) guest_cr3 = cr3; if (enable_ept) { - eptp = construct_eptp(cr3); + eptp = construct_eptp(vcpu, cr3); vmcs_write64(EPT_POINTER, eptp); if (is_paging(vcpu) || is_guest_mode(vcpu)) guest_cr3 = kvm_read_cr3(vcpu); @@ -4624,6 +4680,11 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu) return true; } +static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); +} + static int init_rmode_tss(struct kvm *kvm) { gfn_t fn; @@ -5012,12 +5073,19 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) u32 low32, high32; unsigned long tmpl; struct desc_ptr dt; - unsigned long cr0, cr4; + unsigned long cr0, cr3, cr4; cr0 = read_cr0(); WARN_ON(cr0 & X86_CR0_TS); vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ - vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ + + /* + * Save the most likely value for this task's CR3 in the VMCS. + * We can't use __get_current_cr3_fast() because we're not atomic. + */ + cr3 = __read_cr3(); + vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ + vmx->host_state.vmcs_host_cr3 = cr3; /* Save the most likely value for this task's CR4 in the VMCS. */ cr4 = cr4_read_shadow(); @@ -5160,7 +5228,8 @@ static void ept_set_mmio_spte_mask(void) * EPT Misconfigurations can be generated if the value of bits 2:0 * of an EPT paging-structure entry is 110b (write/execute). */ - kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE); + kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK, + VMX_EPT_MISCONFIG_WX_VALUE); } #define VMX_XSS_EXIT_BITMAP 0 @@ -5447,10 +5516,8 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!is_guest_mode(vcpu)) { - ++vcpu->stat.nmi_injections; - vmx->nmi_known_unmasked = false; - } + ++vcpu->stat.nmi_injections; + vmx->loaded_vmcs->nmi_known_unmasked = false; if (vmx->rmode.vm86_active) { if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE) @@ -5464,16 +5531,21 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) { - if (to_vmx(vcpu)->nmi_known_unmasked) + struct vcpu_vmx *vmx = to_vmx(vcpu); + bool masked; + + if (vmx->loaded_vmcs->nmi_known_unmasked) return false; - return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; + masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; + vmx->loaded_vmcs->nmi_known_unmasked = !masked; + return masked; } static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) { struct vcpu_vmx *vmx = to_vmx(vcpu); - vmx->nmi_known_unmasked = !masked; + vmx->loaded_vmcs->nmi_known_unmasked = !masked; if (masked) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); @@ -5646,14 +5718,11 @@ static int handle_exception(struct kvm_vcpu *vcpu) } if (is_page_fault(intr_info)) { - /* EPT won't cause page fault directly */ - BUG_ON(enable_ept); cr2 = vmcs_readl(EXIT_QUALIFICATION); - trace_kvm_page_fault(cr2, error_code); - - if (kvm_event_needs_reinjection(vcpu)) - kvm_mmu_unprotect_page_virt(vcpu, cr2); - return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0); + /* EPT won't cause page fault directly */ + WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept); + return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0, + true); } ex_no = intr_info & INTR_INFO_VECTOR_MASK; @@ -6210,17 +6279,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - if (is_guest_mode(vcpu) - && !(exit_qualification & EPT_VIOLATION_GVA_TRANSLATED)) { - /* - * Fix up exit_qualification according to whether guest - * page table accesses are reads or writes. - */ - u64 eptp = nested_ept_get_cr3(vcpu); - if (!(eptp & VMX_EPT_AD_ENABLE_BIT)) - exit_qualification &= ~EPT_VIOLATION_ACC_WRITE; - } - /* * EPT violation happened while executing iret from NMI, * "blocked by NMI" bit has to be set before next VM entry. @@ -6443,7 +6501,7 @@ void vmx_enable_tdp(void) enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, 0ull, VMX_EPT_EXECUTABLE_MASK, cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, - enable_ept_ad_bits ? 0ull : VMX_EPT_RWX_MASK); + VMX_EPT_RWX_MASK); ept_set_mmio_spte_mask(); kvm_enable_tdp(); @@ -6547,7 +6605,6 @@ static __init int hardware_setup(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); - vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); memcpy(vmx_msr_bitmap_legacy_x2apic_apicv, vmx_msr_bitmap_legacy, PAGE_SIZE); @@ -7208,25 +7265,6 @@ static int handle_vmresume(struct kvm_vcpu *vcpu) return nested_vmx_run(vcpu, false); } -enum vmcs_field_type { - VMCS_FIELD_TYPE_U16 = 0, - VMCS_FIELD_TYPE_U64 = 1, - VMCS_FIELD_TYPE_U32 = 2, - VMCS_FIELD_TYPE_NATURAL_WIDTH = 3 -}; - -static inline int vmcs_field_type(unsigned long field) -{ - if (0x1 & field) /* the *_HIGH fields are all 32 bit */ - return VMCS_FIELD_TYPE_U32; - return (field >> 13) & 0x3 ; -} - -static inline int vmcs_field_readonly(unsigned long field) -{ - return (((field >> 10) & 0x3) == 1); -} - /* * Read a vmcs12 field. Since these can have varying lengths and we return * one type, we chose the biggest type (u64) and zero-extend the return value @@ -7651,7 +7689,10 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) unsigned long type, types; gva_t gva; struct x86_exception e; - int vpid; + struct { + u64 vpid; + u64 gla; + } operand; if (!(vmx->nested.nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_VPID) || @@ -7681,17 +7722,28 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), vmx_instruction_info, false, &gva)) return 1; - if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vpid, - sizeof(u32), &e)) { + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand, + sizeof(operand), &e)) { kvm_inject_page_fault(vcpu, &e); return 1; } + if (operand.vpid >> 16) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return kvm_skip_emulated_instruction(vcpu); + } switch (type) { case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: + if (is_noncanonical_address(operand.gla)) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return kvm_skip_emulated_instruction(vcpu); + } + /* fall through */ case VMX_VPID_EXTENT_SINGLE_CONTEXT: case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: - if (!vpid) { + if (!operand.vpid) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); return kvm_skip_emulated_instruction(vcpu); @@ -7994,7 +8046,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) if (is_nmi(intr_info)) return false; else if (is_page_fault(intr_info)) - return enable_ept; + return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept; else if (is_no_device(intr_info) && !(vmcs12->guest_cr0 & X86_CR0_TS)) return false; @@ -8398,9 +8450,15 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) exit_reason != EXIT_REASON_TASK_SWITCH)) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; - vcpu->run->internal.ndata = 2; + vcpu->run->internal.ndata = 3; vcpu->run->internal.data[0] = vectoring_info; vcpu->run->internal.data[1] = exit_reason; + vcpu->run->internal.data[2] = vcpu->arch.exit_qualification; + if (exit_reason == EXIT_REASON_EPT_MISCONFIG) { + vcpu->run->internal.ndata++; + vcpu->run->internal.data[3] = + vmcs_read64(GUEST_PHYSICAL_ADDRESS); + } return 0; } @@ -8591,17 +8649,24 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { - u32 exit_intr_info; + u32 exit_intr_info = 0; + u16 basic_exit_reason = (u16)vmx->exit_reason; - if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY - || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)) + if (!(basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY + || basic_exit_reason == EXIT_REASON_EXCEPTION_NMI)) return; - vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - exit_intr_info = vmx->exit_intr_info; + if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + vmx->exit_intr_info = exit_intr_info; + + /* if exit due to PF check for async PF */ + if (is_page_fault(exit_intr_info)) + vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); /* Handle machine checks before interrupts are enabled */ - if (is_machine_check(exit_intr_info)) + if (basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY || + is_machine_check(exit_intr_info)) kvm_machine_check(); /* We need to handle NMIs before interrupts are enabled */ @@ -8652,6 +8717,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) ); } } +STACK_FRAME_NON_STANDARD(vmx_handle_external_intr); static bool vmx_has_high_real_mode_segbase(void) { @@ -8679,7 +8745,7 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; - if (vmx->nmi_known_unmasked) + if (vmx->loaded_vmcs->nmi_known_unmasked) return; /* * Can't use vmx->exit_intr_info since we're not sure what @@ -8703,7 +8769,7 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); else - vmx->nmi_known_unmasked = + vmx->loaded_vmcs->nmi_known_unmasked = !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI); } @@ -8820,7 +8886,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long debugctlmsr, cr4; + unsigned long debugctlmsr, cr3, cr4; /* Don't enter VMX if guest state is invalid, let the exit handler start emulation until we arrive back to a valid state */ @@ -8842,6 +8908,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + cr3 = __get_current_cr3_fast(); + if (unlikely(cr3 != vmx->host_state.vmcs_host_cr3)) { + vmcs_writel(HOST_CR3, cr3); + vmx->host_state.vmcs_host_cr3 = cr3; + } + cr4 = cr4_read_shadow(); if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { vmcs_writel(HOST_CR4, cr4); @@ -9028,6 +9100,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); } +STACK_FRAME_NON_STANDARD(vmx_vcpu_run); static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) { @@ -9376,6 +9449,11 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, vmcs12->guest_physical_address = fault->address; } +static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu) +{ + return nested_ept_get_cr3(vcpu) & VMX_EPT_AD_ENABLE_BIT; +} + /* Callbacks for nested_ept_init_mmu_context: */ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) @@ -9386,18 +9464,18 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) { - u64 eptp; + bool wants_ad; WARN_ON(mmu_is_nested(vcpu)); - eptp = nested_ept_get_cr3(vcpu); - if ((eptp & VMX_EPT_AD_ENABLE_BIT) && !enable_ept_ad_bits) + wants_ad = nested_ept_ad_enabled(vcpu); + if (wants_ad && !enable_ept_ad_bits) return 1; kvm_mmu_unload(vcpu); kvm_init_shadow_ept_mmu(vcpu, to_vmx(vcpu)->nested.nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT, - eptp & VMX_EPT_AD_ENABLE_BIT); + wants_ad); vcpu->arch.mmu.set_cr3 = vmx_set_cr3; vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; @@ -9556,23 +9634,26 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); } +static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) + return 0; + + if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) || + !page_address_valid(vcpu, vmcs12->io_bitmap_b)) + return -EINVAL; + + return 0; +} + static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - int maxphyaddr; - u64 addr; - if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) return 0; - if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) { - WARN_ON(1); - return -EINVAL; - } - maxphyaddr = cpuid_maxphyaddr(vcpu); - - if (!PAGE_ALIGNED(vmcs12->msr_bitmap) || - ((addr + PAGE_SIZE) >> maxphyaddr)) + if (!page_address_valid(vcpu, vmcs12->msr_bitmap)) return -EINVAL; return 0; @@ -10260,6 +10341,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; @@ -10396,8 +10480,6 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) return 1; } - vmcs12->launch_state = 1; - /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet @@ -10415,6 +10497,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) { struct vmcs12 *vmcs12; struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); u32 exit_qual; int ret; @@ -10439,6 +10522,12 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * for misconfigurations which will anyway be caught by the processor * when using the merged vmcs02. */ + if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) { + nested_vmx_failValid(vcpu, + VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); + goto out; + } + if (vmcs12->launch_state == launch) { nested_vmx_failValid(vcpu, launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS @@ -10710,8 +10799,7 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); } - if (nested_cpu_has_ept(vmcs12)) - vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); + vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); if (nested_cpu_has_vid(vmcs12)) vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); @@ -10736,8 +10824,6 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); if (kvm_mpx_supported()) vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); - if (nested_cpu_has_xsaves(vmcs12)) - vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP); } /* @@ -10774,6 +10860,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { + vmcs12->launch_state = 1; + /* vm_entry_intr_info_field is cleared on exit. Emulate this * instead of reading the real value. */ vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; @@ -11134,7 +11222,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) vmx->hv_deadline_tsc = tscl + delta_tsc; vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_VMX_PREEMPTION_TIMER); - return 0; + + return delta_tsc == 0; } static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0e846f0cb83b..82a63c59f77b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -134,8 +134,6 @@ module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); static bool __read_mostly vector_hashing = true; module_param(vector_hashing, bool, S_IRUGO); -static bool __read_mostly backwards_tsc_observed = false; - #define KVM_NR_SHARED_MSRS 16 struct kvm_shared_msrs_global { @@ -452,7 +450,12 @@ EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { ++vcpu->stat.pf_guest; - vcpu->arch.cr2 = fault->address; + vcpu->arch.exception.nested_apf = + is_guest_mode(vcpu) && fault->async_page_fault; + if (vcpu->arch.exception.nested_apf) + vcpu->arch.apf.nested_apf_token = fault->address; + else + vcpu->arch.cr2 = fault->address; kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); } EXPORT_SYMBOL_GPL(kvm_inject_page_fault); @@ -594,8 +597,8 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu) (unsigned long *)&vcpu->arch.regs_avail)) return true; - gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT; - offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1); + gfn = (kvm_read_cr3(vcpu) & ~31ul) >> PAGE_SHIFT; + offset = (kvm_read_cr3(vcpu) & ~31ul) & (PAGE_SIZE - 1); r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), PFERR_USER_MASK | PFERR_WRITE_MASK); if (r < 0) @@ -1719,7 +1722,7 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) &ka->master_cycle_now); ka->use_master_clock = host_tsc_clocksource && vcpus_matched - && !backwards_tsc_observed + && !ka->backwards_tsc_observed && !ka->boot_vcpu_runs_old_kvmclock; if (ka->use_master_clock) @@ -2060,8 +2063,8 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) { gpa_t gpa = data & ~0x3f; - /* Bits 2:5 are reserved, Should be zero */ - if (data & 0x3c) + /* Bits 3:5 are reserved, Should be zero */ + if (data & 0x38) return 1; vcpu->arch.apf.msr_val = data; @@ -2077,6 +2080,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) return 1; vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); + vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; kvm_async_pf_wakeup_all(vcpu); return 0; } @@ -2661,6 +2665,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_VAPIC: case KVM_CAP_HYPERV_SPIN: case KVM_CAP_HYPERV_SYNIC: + case KVM_CAP_HYPERV_SYNIC2: + case KVM_CAP_HYPERV_VP_INDEX: case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: @@ -2841,10 +2847,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_vcpu_write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; } - if (kvm_lapic_hv_timer_in_use(vcpu) && - kvm_x86_ops->set_hv_timer(vcpu, - kvm_get_lapic_target_expiration_tsc(vcpu))) - kvm_lapic_switch_to_sw_timer(vcpu); + + if (kvm_lapic_hv_timer_in_use(vcpu)) + kvm_lapic_restart_hv_timer(vcpu); + /* * On a host with synchronized TSC, there is no need to update * kvmclock on vcpu->cpu migration @@ -3384,10 +3390,14 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return -EINVAL; switch (cap->cap) { + case KVM_CAP_HYPERV_SYNIC2: + if (cap->args[0]) + return -EINVAL; case KVM_CAP_HYPERV_SYNIC: if (!irqchip_in_kernel(vcpu->kvm)) return -EINVAL; - return kvm_hv_activate_synic(vcpu); + return kvm_hv_activate_synic(vcpu, cap->cap == + KVM_CAP_HYPERV_SYNIC2); default: return -EINVAL; } @@ -4188,9 +4198,15 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; r = 0; + /* + * TODO: userspace has to take care of races with VCPU_RUN, so + * kvm_gen_update_masterclock() can be cut down to locked + * pvclock_update_vm_gtod_copy(). + */ + kvm_gen_update_masterclock(kvm); now_ns = get_kvmclock_ns(kvm); kvm->arch.kvmclock_offset += user_ns.clock - now_ns; - kvm_gen_update_masterclock(kvm); + kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE); break; } case KVM_GET_CLOCK: { @@ -6011,7 +6027,7 @@ static void kvm_set_mmio_spte_mask(void) mask &= ~1ull; #endif - kvm_mmu_set_mmio_spte_mask(mask); + kvm_mmu_set_mmio_spte_mask(mask, mask); } #ifdef CONFIG_X86_64 @@ -6347,10 +6363,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) kvm_update_dr7(vcpu); } - kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, - vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code, - vcpu->arch.exception.reinject); + kvm_x86_ops->queue_exception(vcpu); return 0; } @@ -6733,7 +6746,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_immediate_exit = false; - if (vcpu->requests) { + if (kvm_request_pending(vcpu)) { if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) @@ -6897,7 +6910,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops->sync_pir_to_irr(vcpu); } - if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests + if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || need_resched() || signal_pending(current)) { vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); @@ -7676,6 +7689,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) struct msr_data msr; struct kvm *kvm = vcpu->kvm; + kvm_hv_vcpu_postcreate(vcpu); + if (vcpu_load(vcpu)) return; msr.data = 0x0; @@ -7829,8 +7844,8 @@ int kvm_arch_hardware_enable(void) */ if (backwards_tsc) { u64 delta_cyc = max_tsc - local_tsc; - backwards_tsc_observed = true; list_for_each_entry(kvm, &vm_list, vm_list) { + kvm->arch.backwards_tsc_observed = true; kvm_for_each_vcpu(i, vcpu, kvm) { vcpu->arch.tsc_offset_adjustment += delta_cyc; vcpu->arch.last_host_tsc = local_tsc; @@ -8576,6 +8591,7 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, fault.error_code = 0; fault.nested_page_fault = false; fault.address = work->arch.token; + fault.async_page_fault = true; kvm_inject_page_fault(vcpu, &fault); } } @@ -8598,6 +8614,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, fault.error_code = 0; fault.nested_page_fault = false; fault.address = work->arch.token; + fault.async_page_fault = true; kvm_inject_page_fault(vcpu, &fault); } vcpu->arch.apf.halted = false; |