summaryrefslogtreecommitdiff
path: root/arch/x86/kvm/vmx.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/vmx.c')
-rw-r--r--arch/x86/kvm/vmx.c262
1 files changed, 185 insertions, 77 deletions
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e65a158dee64..579a0b51696a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -39,6 +39,7 @@
#include <asm/mce.h>
#include <asm/i387.h>
#include <asm/xcr.h>
+#include <asm/perf_event.h>
#include "trace.h"
@@ -71,6 +72,9 @@ module_param(vmm_exclusive, bool, S_IRUGO);
static int __read_mostly yield_on_hlt = 1;
module_param(yield_on_hlt, bool, S_IRUGO);
+static int __read_mostly fasteoi = 1;
+module_param(fasteoi, bool, S_IRUGO);
+
/*
* If nested=1, nested virtualization is supported, i.e., guests may use
* VMX and be a hypervisor for its own guests. If nested=0, guests may not
@@ -115,7 +119,7 @@ module_param(ple_gap, int, S_IRUGO);
static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, int, S_IRUGO);
-#define NR_AUTOLOAD_MSRS 1
+#define NR_AUTOLOAD_MSRS 8
#define VMCS02_POOL_SIZE 1
struct vmcs {
@@ -619,6 +623,7 @@ static unsigned long *vmx_msr_bitmap_legacy;
static unsigned long *vmx_msr_bitmap_longmode;
static bool cpu_has_load_ia32_efer;
+static bool cpu_has_load_perf_global_ctrl;
static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);
@@ -1188,15 +1193,34 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
vmcs_write32(EXCEPTION_BITMAP, eb);
}
+static void clear_atomic_switch_msr_special(unsigned long entry,
+ unsigned long exit)
+{
+ vmcs_clear_bits(VM_ENTRY_CONTROLS, entry);
+ vmcs_clear_bits(VM_EXIT_CONTROLS, exit);
+}
+
static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
{
unsigned i;
struct msr_autoload *m = &vmx->msr_autoload;
- if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
- vmcs_clear_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
- vmcs_clear_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
- return;
+ switch (msr) {
+ case MSR_EFER:
+ if (cpu_has_load_ia32_efer) {
+ clear_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER,
+ VM_EXIT_LOAD_IA32_EFER);
+ return;
+ }
+ break;
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ if (cpu_has_load_perf_global_ctrl) {
+ clear_atomic_switch_msr_special(
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
+ return;
+ }
+ break;
}
for (i = 0; i < m->nr; ++i)
@@ -1212,25 +1236,55 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
}
+static void add_atomic_switch_msr_special(unsigned long entry,
+ unsigned long exit, unsigned long guest_val_vmcs,
+ unsigned long host_val_vmcs, u64 guest_val, u64 host_val)
+{
+ vmcs_write64(guest_val_vmcs, guest_val);
+ vmcs_write64(host_val_vmcs, host_val);
+ vmcs_set_bits(VM_ENTRY_CONTROLS, entry);
+ vmcs_set_bits(VM_EXIT_CONTROLS, exit);
+}
+
static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
u64 guest_val, u64 host_val)
{
unsigned i;
struct msr_autoload *m = &vmx->msr_autoload;
- if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
- vmcs_write64(GUEST_IA32_EFER, guest_val);
- vmcs_write64(HOST_IA32_EFER, host_val);
- vmcs_set_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
- vmcs_set_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
- return;
+ switch (msr) {
+ case MSR_EFER:
+ if (cpu_has_load_ia32_efer) {
+ add_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER,
+ VM_EXIT_LOAD_IA32_EFER,
+ GUEST_IA32_EFER,
+ HOST_IA32_EFER,
+ guest_val, host_val);
+ return;
+ }
+ break;
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ if (cpu_has_load_perf_global_ctrl) {
+ add_atomic_switch_msr_special(
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
+ GUEST_IA32_PERF_GLOBAL_CTRL,
+ HOST_IA32_PERF_GLOBAL_CTRL,
+ guest_val, host_val);
+ return;
+ }
+ break;
}
for (i = 0; i < m->nr; ++i)
if (m->guest[i].index == msr)
break;
- if (i == m->nr) {
+ if (i == NR_AUTOLOAD_MSRS) {
+ printk_once(KERN_WARNING"Not enough mst switch entries. "
+ "Can't add msr %x\n", msr);
+ return;
+ } else if (i == m->nr) {
++m->nr;
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
@@ -1748,6 +1802,21 @@ static u64 guest_read_tsc(void)
}
/*
+ * Like guest_read_tsc, but always returns L1's notion of the timestamp
+ * counter, even if a nested guest (L2) is currently running.
+ */
+u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu)
+{
+ u64 host_tsc, tsc_offset;
+
+ rdtscll(host_tsc);
+ tsc_offset = is_guest_mode(vcpu) ?
+ to_vmx(vcpu)->nested.vmcs01_tsc_offset :
+ vmcs_read64(TSC_OFFSET);
+ return host_tsc + tsc_offset;
+}
+
+/*
* Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ
* ioctl. In this case the call-back should update internal vmx state to make
* the changes effective.
@@ -1762,15 +1831,23 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
*/
static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
- vmcs_write64(TSC_OFFSET, offset);
- if (is_guest_mode(vcpu))
+ if (is_guest_mode(vcpu)) {
/*
- * We're here if L1 chose not to trap the TSC MSR. Since
- * prepare_vmcs12() does not copy tsc_offset, we need to also
- * set the vmcs12 field here.
+ * We're here if L1 chose not to trap WRMSR to TSC. According
+ * to the spec, this should set L1's TSC; The offset that L1
+ * set for L2 remains unchanged, and still needs to be added
+ * to the newly set TSC to get L2's TSC.
*/
- get_vmcs12(vcpu)->tsc_offset = offset -
- to_vmx(vcpu)->nested.vmcs01_tsc_offset;
+ struct vmcs12 *vmcs12;
+ to_vmx(vcpu)->nested.vmcs01_tsc_offset = offset;
+ /* recalculate vmcs02.TSC_OFFSET: */
+ vmcs12 = get_vmcs12(vcpu);
+ vmcs_write64(TSC_OFFSET, offset +
+ (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
+ vmcs12->tsc_offset : 0));
+ } else {
+ vmcs_write64(TSC_OFFSET, offset);
+ }
}
static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
@@ -2429,6 +2506,42 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
&& allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
VM_EXIT_LOAD_IA32_EFER);
+ cpu_has_load_perf_global_ctrl =
+ allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
+ && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
+
+ /*
+ * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL
+ * but due to arrata below it can't be used. Workaround is to use
+ * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL.
+ *
+ * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32]
+ *
+ * AAK155 (model 26)
+ * AAP115 (model 30)
+ * AAT100 (model 37)
+ * BC86,AAY89,BD102 (model 44)
+ * BA97 (model 46)
+ *
+ */
+ if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) {
+ switch (boot_cpu_data.x86_model) {
+ case 26:
+ case 30:
+ case 37:
+ case 44:
+ case 46:
+ cpu_has_load_perf_global_ctrl = false;
+ printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
+ "does not work properly. Using workaround\n");
+ break;
+ default:
+ break;
+ }
+ }
+
return 0;
}
@@ -2736,8 +2849,8 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
- printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
- __func__);
+ pr_debug_ratelimited("%s: tss fixup for long mode. \n",
+ __func__);
vmcs_write32(GUEST_TR_AR_BYTES,
(guest_tr_ar & ~AR_TYPE_MASK)
| AR_TYPE_BUSY_64_TSS);
@@ -4115,8 +4228,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
if (is_page_fault(intr_info)) {
/* EPT won't cause page fault directly */
- if (enable_ept)
- BUG();
+ BUG_ON(enable_ept);
cr2 = vmcs_readl(EXIT_QUALIFICATION);
trace_kvm_page_fault(cr2, error_code);
@@ -4518,6 +4630,24 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
static int handle_apic_access(struct kvm_vcpu *vcpu)
{
+ if (likely(fasteoi)) {
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ int access_type, offset;
+
+ access_type = exit_qualification & APIC_ACCESS_TYPE;
+ offset = exit_qualification & APIC_ACCESS_OFFSET;
+ /*
+ * Sane guest uses MOV to write EOI, with written value
+ * not cared. So make a short-circuit here by avoiding
+ * heavy instruction emulation.
+ */
+ if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
+ (offset == APIC_EOI)) {
+ kvm_lapic_set_eoi(vcpu);
+ skip_emulated_instruction(vcpu);
+ return 1;
+ }
+ }
return emulate_instruction(vcpu, 0) == EMULATE_DONE;
}
@@ -5591,8 +5721,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
return 0;
if (unlikely(vmx->fail)) {
- printk(KERN_INFO "%s failed vm entry %x\n",
- __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
+ pr_info_ratelimited("%s failed vm entry %x\n", __func__,
+ vmcs_read32(VM_INSTRUCTION_ERROR));
return 1;
}
@@ -5696,8 +5826,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
u32 exit_reason = vmx->exit_reason;
u32 vectoring_info = vmx->idt_vectoring_info;
- trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
-
/* If guest state is invalid, start emulating */
if (vmx->emulation_required && emulate_invalid_guest_state)
return handle_invalid_guest_state(vcpu);
@@ -5927,6 +6055,24 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
}
+static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
+{
+ int i, nr_msrs;
+ struct perf_guest_switch_msr *msrs;
+
+ msrs = perf_guest_get_msrs(&nr_msrs);
+
+ if (!msrs)
+ return;
+
+ for (i = 0; i < nr_msrs; i++)
+ if (msrs[i].host == msrs[i].guest)
+ clear_atomic_switch_msr(vmx, msrs[i].msr);
+ else
+ add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
+ msrs[i].host);
+}
+
#ifdef CONFIG_X86_64
#define R "r"
#define Q "q"
@@ -5976,6 +6122,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
+ atomic_switch_perf_msrs(vmx);
+
vmx->__launched = vmx->loaded_vmcs->launched;
asm(
/* Store host registers */
@@ -6101,6 +6249,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs->launched = 1;
vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
+ trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
vmx_complete_atomic_exit(vmx);
vmx_recover_nmi_blocking(vmx);
@@ -6241,49 +6390,6 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
return ret;
}
-#define _ER(x) { EXIT_REASON_##x, #x }
-
-static const struct trace_print_flags vmx_exit_reasons_str[] = {
- _ER(EXCEPTION_NMI),
- _ER(EXTERNAL_INTERRUPT),
- _ER(TRIPLE_FAULT),
- _ER(PENDING_INTERRUPT),
- _ER(NMI_WINDOW),
- _ER(TASK_SWITCH),
- _ER(CPUID),
- _ER(HLT),
- _ER(INVLPG),
- _ER(RDPMC),
- _ER(RDTSC),
- _ER(VMCALL),
- _ER(VMCLEAR),
- _ER(VMLAUNCH),
- _ER(VMPTRLD),
- _ER(VMPTRST),
- _ER(VMREAD),
- _ER(VMRESUME),
- _ER(VMWRITE),
- _ER(VMOFF),
- _ER(VMON),
- _ER(CR_ACCESS),
- _ER(DR_ACCESS),
- _ER(IO_INSTRUCTION),
- _ER(MSR_READ),
- _ER(MSR_WRITE),
- _ER(MWAIT_INSTRUCTION),
- _ER(MONITOR_INSTRUCTION),
- _ER(PAUSE_INSTRUCTION),
- _ER(MCE_DURING_VMENTRY),
- _ER(TPR_BELOW_THRESHOLD),
- _ER(APIC_ACCESS),
- _ER(EPT_VIOLATION),
- _ER(EPT_MISCONFIG),
- _ER(WBINVD),
- { -1, NULL }
-};
-
-#undef _ER
-
static int vmx_get_lpage_level(void)
{
if (enable_ept && !cpu_has_vmx_ept_1g_page())
@@ -6514,8 +6620,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
set_cr4_guest_host_mask(vmx);
- vmcs_write64(TSC_OFFSET,
- vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
+ if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+ vmcs_write64(TSC_OFFSET,
+ vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
+ else
+ vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
if (enable_vpid) {
/*
@@ -6610,9 +6719,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
if (vmcs12->vm_entry_msr_load_count > 0 ||
vmcs12->vm_exit_msr_load_count > 0 ||
vmcs12->vm_exit_msr_store_count > 0) {
- if (printk_ratelimit())
- printk(KERN_WARNING
- "%s: VMCS MSR_{LOAD,STORE} unsupported\n", __func__);
+ pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n",
+ __func__);
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
return 1;
}
@@ -6922,7 +7030,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
load_vmcs12_host_state(vcpu, vmcs12);
- /* Update TSC_OFFSET if vmx_adjust_tsc_offset() was used while L2 ran */
+ /* Update TSC_OFFSET if TSC was changed while L2 ran */
vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
/* This is needed for same reason as it was needed in prepare_vmcs02 */
@@ -7039,7 +7147,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
.get_mt_mask = vmx_get_mt_mask,
.get_exit_info = vmx_get_exit_info,
- .exit_reasons_str = vmx_exit_reasons_str,
.get_lpage_level = vmx_get_lpage_level,
@@ -7055,6 +7162,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.write_tsc_offset = vmx_write_tsc_offset,
.adjust_tsc_offset = vmx_adjust_tsc_offset,
.compute_tsc_offset = vmx_compute_tsc_offset,
+ .read_l1_tsc = vmx_read_l1_tsc,
.set_tdp_cr3 = vmx_set_cr3,