Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r--  arch/x86/kvm/x86.c | 178
1 file changed, 105 insertions(+), 73 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 45608a7da9b3..19f9f9e05c2a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -36,7 +36,8 @@
#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
-#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/moduleparam.h>
#include <linux/mman.h>
#include <linux/highmem.h>
#include <linux/iommu.h>
@@ -70,7 +71,8 @@
#define MAX_IO_MSRS 256
#define KVM_MAX_MCE_BANKS 32
-#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
+u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
+EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
#define emul_to_vcpu(ctxt) \
container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
@@ -89,8 +91,12 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
+#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
+ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
+
static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
+static void enter_smm(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
struct kvm_x86_ops *kvm_x86_ops __read_mostly;
@@ -113,7 +119,8 @@ u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits;
EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
u64 __read_mostly kvm_max_tsc_scaling_ratio;
EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
-static u64 __read_mostly kvm_default_tsc_scaling_ratio;
+u64 __read_mostly kvm_default_tsc_scaling_ratio;
+EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
static u32 __read_mostly tsc_tolerance_ppm = 250;
@@ -537,7 +544,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
goto out;
}
for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
- if (is_present_gpte(pdpte[i]) &&
+ if ((pdpte[i] & PT_PRESENT_MASK) &&
(pdpte[i] &
vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) {
ret = 0;
@@ -982,6 +989,7 @@ static u32 emulated_msrs[] = {
MSR_IA32_MISC_ENABLE,
MSR_IA32_MCG_STATUS,
MSR_IA32_MCG_CTL,
+ MSR_IA32_MCG_EXT_CTL,
MSR_IA32_SMBASE,
};
@@ -1161,7 +1169,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
int version;
int r;
struct pvclock_wall_clock wc;
- struct timespec boot;
+ struct timespec64 boot;
if (!wall_clock)
return;
@@ -1184,13 +1192,13 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
* wall clock specified here. guest system time equals host
* system time for us, thus we must fill in host boot time here.
*/
- getboottime(&boot);
+ getboottime64(&boot);
if (kvm->arch.kvmclock_offset) {
- struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset);
- boot = timespec_sub(boot, ts);
+ struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
+ boot = timespec64_sub(boot, ts);
}
- wc.sec = boot.tv_sec;
+ wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
wc.nsec = boot.tv_nsec;
wc.version = version;
@@ -2615,6 +2623,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_TSC_CONTROL:
r = kvm_has_tsc_control;
break;
+ case KVM_CAP_X2APIC_API:
+ r = KVM_X2APIC_API_VALID_FLAGS;
+ break;
default:
r = 0;
break;
@@ -2677,11 +2688,9 @@ long kvm_arch_dev_ioctl(struct file *filp,
break;
}
case KVM_X86_GET_MCE_CAP_SUPPORTED: {
- u64 mce_cap;
-
- mce_cap = KVM_MCE_CAP_SUPPORTED;
r = -EFAULT;
- if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
+ if (copy_to_user(argp, &kvm_mce_cap_supported,
+ sizeof(kvm_mce_cap_supported)))
goto out;
r = 0;
break;
@@ -2733,6 +2742,11 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
rdtsc() - vcpu->arch.last_host_tsc;
if (tsc_delta < 0)
mark_tsc_unstable("KVM discovered backwards TSC");
+
+ if (kvm_lapic_hv_timer_in_use(vcpu) &&
+ kvm_x86_ops->set_hv_timer(vcpu,
+ kvm_get_lapic_tscdeadline_msr(vcpu)))
+ kvm_lapic_switch_to_sw_timer(vcpu);
if (check_tsc_unstable()) {
u64 offset = kvm_compute_tsc_offset(vcpu,
vcpu->arch.last_guest_tsc);
@@ -2766,15 +2780,17 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
if (vcpu->arch.apicv_active)
kvm_x86_ops->sync_pir_to_irr(vcpu);
- memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
-
- return 0;
+ return kvm_apic_get_state(vcpu, s);
}
static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
- kvm_apic_post_state_restore(vcpu, s);
+ int r;
+
+ r = kvm_apic_set_state(vcpu, s);
+ if (r)
+ return r;
update_cr8_intercept(vcpu);
return 0;
@@ -2859,7 +2875,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
r = -EINVAL;
if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
goto out;
- if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
+ if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
goto out;
r = 0;
vcpu->arch.mcg_cap = mcg_cap;
@@ -2869,6 +2885,9 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
/* Init IA32_MCi_CTL to all 1s */
for (bank = 0; bank < bank_num; bank++)
vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+
+ if (kvm_x86_ops->setup_mce)
+ kvm_x86_ops->setup_mce(vcpu);
out:
return r;
}
@@ -3767,7 +3786,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
r = -EEXIST;
if (irqchip_in_kernel(kvm))
goto split_irqchip_unlock;
- if (atomic_read(&kvm->online_vcpus))
+ if (kvm->created_vcpus)
goto split_irqchip_unlock;
r = kvm_setup_empty_irq_routing(kvm);
if (r)
@@ -3781,6 +3800,18 @@ split_irqchip_unlock:
mutex_unlock(&kvm->lock);
break;
}
+ case KVM_CAP_X2APIC_API:
+ r = -EINVAL;
+ if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
+ break;
+
+ if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
+ kvm->arch.x2apic_format = true;
+ if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
+ kvm->arch.x2apic_broadcast_quirk_disabled = true;
+
+ r = 0;
+ break;
default:
r = -EINVAL;
break;
@@ -3832,7 +3863,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (kvm->arch.vpic)
goto create_irqchip_unlock;
r = -EINVAL;
- if (atomic_read(&kvm->online_vcpus))
+ if (kvm->created_vcpus)
goto create_irqchip_unlock;
r = -ENOMEM;
vpic = kvm_create_pic(kvm);
@@ -3872,7 +3903,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
sizeof(struct kvm_pit_config)))
goto out;
create_pit:
- mutex_lock(&kvm->slots_lock);
+ mutex_lock(&kvm->lock);
r = -EEXIST;
if (kvm->arch.vpit)
goto create_pit_unlock;
@@ -3881,7 +3912,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (kvm->arch.vpit)
r = 0;
create_pit_unlock:
- mutex_unlock(&kvm->slots_lock);
+ mutex_unlock(&kvm->lock);
break;
case KVM_GET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
@@ -3988,7 +4019,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
case KVM_SET_BOOT_CPU_ID:
r = 0;
mutex_lock(&kvm->lock);
- if (atomic_read(&kvm->online_vcpus) != 0)
+ if (kvm->created_vcpus)
r = -EBUSY;
else
kvm->arch.bsp_vcpu_id = arg;
@@ -5296,13 +5327,8 @@ static void kvm_smm_changed(struct kvm_vcpu *vcpu)
/* This is a good place to trace that we are exiting SMM. */
trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
- if (unlikely(vcpu->arch.smi_pending)) {
- kvm_make_request(KVM_REQ_SMI, vcpu);
- vcpu->arch.smi_pending = 0;
- } else {
- /* Process a latched INIT, if any. */
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- }
+ /* Process a latched INIT or SMI, if any. */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
}
kvm_mmu_reset_context(vcpu);
@@ -5848,8 +5874,8 @@ int kvm_arch_init(void *opaque)
kvm_x86_ops = ops;
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
- PT_DIRTY_MASK, PT64_NX_MASK, 0);
-
+ PT_DIRTY_MASK, PT64_NX_MASK, 0,
+ PT_PRESENT_MASK);
kvm_timer_init();
perf_register_guest_info_callbacks(&kvm_guest_cbs);
@@ -6083,7 +6109,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
}
/* try to inject new event if pending */
- if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
+ if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
+ vcpu->arch.smi_pending = false;
+ enter_smm(vcpu);
+ } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
--vcpu->arch.nmi_pending;
vcpu->arch.nmi_injected = true;
kvm_x86_ops->set_nmi(vcpu);
@@ -6106,6 +6135,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
kvm_x86_ops->set_irq(vcpu);
}
}
+
return 0;
}
@@ -6129,7 +6159,7 @@ static void process_nmi(struct kvm_vcpu *vcpu)
#define put_smstate(type, buf, offset, val) \
*(type *)((buf) + (offset) - 0x7e00) = val
-static u32 process_smi_get_segment_flags(struct kvm_segment *seg)
+static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
{
u32 flags = 0;
flags |= seg->g << 23;
@@ -6143,7 +6173,7 @@ static u32 process_smi_get_segment_flags(struct kvm_segment *seg)
return flags;
}
-static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
+static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
{
struct kvm_segment seg;
int offset;
@@ -6158,11 +6188,11 @@ static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
put_smstate(u32, buf, offset + 8, seg.base);
put_smstate(u32, buf, offset + 4, seg.limit);
- put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg));
+ put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg));
}
#ifdef CONFIG_X86_64
-static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
+static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
{
struct kvm_segment seg;
int offset;
@@ -6171,7 +6201,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
kvm_get_segment(vcpu, &seg, n);
offset = 0x7e00 + n * 16;
- flags = process_smi_get_segment_flags(&seg) >> 8;
+ flags = enter_smm_get_segment_flags(&seg) >> 8;
put_smstate(u16, buf, offset, seg.selector);
put_smstate(u16, buf, offset + 2, flags);
put_smstate(u32, buf, offset + 4, seg.limit);
@@ -6179,7 +6209,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
}
#endif
-static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
+static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
{
struct desc_ptr dt;
struct kvm_segment seg;
@@ -6203,13 +6233,13 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7fc4, seg.selector);
put_smstate(u32, buf, 0x7f64, seg.base);
put_smstate(u32, buf, 0x7f60, seg.limit);
- put_smstate(u32, buf, 0x7f5c, process_smi_get_segment_flags(&seg));
+ put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg));
kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
put_smstate(u32, buf, 0x7fc0, seg.selector);
put_smstate(u32, buf, 0x7f80, seg.base);
put_smstate(u32, buf, 0x7f7c, seg.limit);
- put_smstate(u32, buf, 0x7f78, process_smi_get_segment_flags(&seg));
+ put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
kvm_x86_ops->get_gdt(vcpu, &dt);
put_smstate(u32, buf, 0x7f74, dt.address);
@@ -6220,7 +6250,7 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7f54, dt.size);
for (i = 0; i < 6; i++)
- process_smi_save_seg_32(vcpu, buf, i);
+ enter_smm_save_seg_32(vcpu, buf, i);
put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
@@ -6229,7 +6259,7 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
}
-static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
+static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
{
#ifdef CONFIG_X86_64
struct desc_ptr dt;
@@ -6261,7 +6291,7 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
put_smstate(u16, buf, 0x7e90, seg.selector);
- put_smstate(u16, buf, 0x7e92, process_smi_get_segment_flags(&seg) >> 8);
+ put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8);
put_smstate(u32, buf, 0x7e94, seg.limit);
put_smstate(u64, buf, 0x7e98, seg.base);
@@ -6271,7 +6301,7 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
put_smstate(u16, buf, 0x7e70, seg.selector);
- put_smstate(u16, buf, 0x7e72, process_smi_get_segment_flags(&seg) >> 8);
+ put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8);
put_smstate(u32, buf, 0x7e74, seg.limit);
put_smstate(u64, buf, 0x7e78, seg.base);
@@ -6280,31 +6310,26 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u64, buf, 0x7e68, dt.address);
for (i = 0; i < 6; i++)
- process_smi_save_seg_64(vcpu, buf, i);
+ enter_smm_save_seg_64(vcpu, buf, i);
#else
WARN_ON_ONCE(1);
#endif
}
-static void process_smi(struct kvm_vcpu *vcpu)
+static void enter_smm(struct kvm_vcpu *vcpu)
{
struct kvm_segment cs, ds;
struct desc_ptr dt;
char buf[512];
u32 cr0;
- if (is_smm(vcpu)) {
- vcpu->arch.smi_pending = true;
- return;
- }
-
trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
vcpu->arch.hflags |= HF_SMM_MASK;
memset(buf, 0, 512);
if (guest_cpuid_has_longmode(vcpu))
- process_smi_save_state_64(vcpu, buf);
+ enter_smm_save_state_64(vcpu, buf);
else
- process_smi_save_state_32(vcpu, buf);
+ enter_smm_save_state_32(vcpu, buf);
kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
@@ -6360,6 +6385,12 @@ static void process_smi(struct kvm_vcpu *vcpu)
kvm_mmu_reset_context(vcpu);
}
+static void process_smi(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.smi_pending = true;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
void kvm_make_scan_ioapic_request(struct kvm *kvm)
{
kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
@@ -6554,8 +6585,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (inject_pending_event(vcpu, req_int_win) != 0)
req_immediate_exit = true;
- /* enable NMI/IRQ window open exits if needed */
else {
+ /* Enable NMI/IRQ window open exits if needed.
+ *
+ * SMIs have two cases: 1) they can be nested, and
+ * then there is nothing to do here because RSM will
+ * cause a vmexit anyway; 2) or the SMI can be pending
+ * because inject_pending_event has completed the
+ * injection of an IRQ or NMI from the previous vmexit,
+ * and then we request an immediate exit to inject the SMI.
+ */
+ if (vcpu->arch.smi_pending && !is_smm(vcpu))
+ req_immediate_exit = true;
if (vcpu->arch.nmi_pending)
kvm_x86_ops->enable_nmi_window(vcpu);
if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
@@ -6606,12 +6647,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_load_guest_xcr0(vcpu);
- if (req_immediate_exit)
+ if (req_immediate_exit) {
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
smp_send_reschedule(vcpu->cpu);
+ }
trace_kvm_entry(vcpu->vcpu_id);
wait_lapic_expire(vcpu);
- __kvm_guest_enter();
+ guest_enter_irqoff();
if (unlikely(vcpu->arch.switch_db_regs)) {
set_debugreg(0, 7);
@@ -6662,16 +6705,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
++vcpu->stat.exits;
- /*
- * We must have an instruction between local_irq_enable() and
- * kvm_guest_exit(), so the timer interrupt isn't delayed by
- * the interrupt shadow. The stat.exits increment will do nicely.
- * But we need to prevent reordering, hence this barrier():
- */
- barrier();
-
- kvm_guest_exit();
+ guest_exit_irqoff();
+ local_irq_enable();
preempt_enable();
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
@@ -7408,6 +7444,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
vcpu->arch.hflags = 0;
+ vcpu->arch.smi_pending = 0;
atomic_set(&vcpu->arch.nmi_queued, 0);
vcpu->arch.nmi_pending = 0;
vcpu->arch.nmi_injected = false;
@@ -7600,11 +7637,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
}
-bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
-{
- return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu);
-}
-
struct static_key kvm_no_apic_vcpu __read_mostly;
EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
@@ -7871,7 +7903,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
kvm_free_vcpus(kvm);
- kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+ kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
kvm_mmu_uninit_vm(kvm);
}
@@ -8379,7 +8411,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
/*
* When producer of consumer is unregistered, we change back to
* remapped mode, so we can re-use the current implementation
- * when the irq is masked/disabed or the consumer side (KVM
+ * when the irq is masked/disabled or the consumer side (KVM
* int this case doesn't want to receive the interrupts.
*/
ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);