summaryrefslogtreecommitdiff
path: root/arch/x86/kvm/x86.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r--arch/x86/kvm/x86.c209
1 files changed, 127 insertions, 82 deletions
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c259814200bd..bd7a70be41b3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -108,6 +108,10 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
static u32 tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
+/* lapic timer advance (tscdeadline mode only) in nanoseconds */
+unsigned int lapic_timer_advance_ns = 0;
+module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
+
static bool backwards_tsc_observed = false;
#define KVM_NR_SHARED_MSRS 16
@@ -141,6 +145,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "irq_window", VCPU_STAT(irq_window_exits) },
{ "nmi_window", VCPU_STAT(nmi_window_exits) },
{ "halt_exits", VCPU_STAT(halt_exits) },
+ { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
{ "hypercalls", VCPU_STAT(hypercalls) },
{ "request_irq", VCPU_STAT(request_irq_exits) },
@@ -492,7 +497,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
-int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
+static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
void *data, int offset, int len, u32 access)
{
return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
@@ -643,7 +648,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
}
}
-int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
u64 xcr0 = xcr;
u64 old_xcr0 = vcpu->arch.xcr0;
@@ -1083,6 +1088,15 @@ static void update_pvclock_gtod(struct timekeeper *tk)
}
#endif
+void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
+ * vcpu_enter_guest. This function is only called from
+ * the physical CPU that is running vcpu.
+ */
+ kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+}
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
@@ -1180,7 +1194,7 @@ static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
#endif
static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
-unsigned long max_tsc_khz;
+static unsigned long max_tsc_khz;
static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
{
@@ -1234,7 +1248,7 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
return tsc;
}
-void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
+static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
bool vcpus_matched;
@@ -1529,7 +1543,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
&ka->master_cycle_now);
ka->use_master_clock = host_tsc_clocksource && vcpus_matched
- && !backwards_tsc_observed;
+ && !backwards_tsc_observed
+ && !ka->boot_vcpu_runs_old_kvmclock;
if (ka->use_master_clock)
atomic_set(&kvm_guest_has_master_clock, 1);
@@ -2161,8 +2176,20 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_KVM_SYSTEM_TIME_NEW:
case MSR_KVM_SYSTEM_TIME: {
u64 gpa_offset;
+ struct kvm_arch *ka = &vcpu->kvm->arch;
+
kvmclock_reset(vcpu);
+ if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
+ bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
+
+ if (ka->boot_vcpu_runs_old_kvmclock != tmp)
+ set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
+ &vcpu->requests);
+
+ ka->boot_vcpu_runs_old_kvmclock = tmp;
+ }
+
vcpu->arch.time = data;
kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
@@ -2324,6 +2351,7 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
{
return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
}
+EXPORT_SYMBOL_GPL(kvm_get_msr);
static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
@@ -2738,6 +2766,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_READONLY_MEM:
case KVM_CAP_HYPERV_TIME:
case KVM_CAP_IOAPIC_POLARITY_IGNORED:
+ case KVM_CAP_TSC_DEADLINE_TIMER:
#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
case KVM_CAP_ASSIGN_DEV_IRQ:
case KVM_CAP_PCI_2_3:
@@ -2776,9 +2805,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_TSC_CONTROL:
r = kvm_has_tsc_control;
break;
- case KVM_CAP_TSC_DEADLINE_TIMER:
- r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER);
- break;
default:
r = 0;
break;
@@ -3734,83 +3760,43 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
* @kvm: kvm instance
* @log: slot id and address to which we copy the log
*
- * We need to keep it in mind that VCPU threads can write to the bitmap
- * concurrently. So, to avoid losing data, we keep the following order for
- * each bit:
+ * Steps 1-4 below provide general overview of dirty page logging. See
+ * kvm_get_dirty_log_protect() function description for additional details.
+ *
+ * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
+ * always flush the TLB (step 4) even if previous step failed and the dirty
+ * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
+ * does not preclude user space subsequent dirty log read. Flushing TLB ensures
+ * writes will be marked dirty for next log read.
*
* 1. Take a snapshot of the bit and clear it if needed.
* 2. Write protect the corresponding page.
- * 3. Flush TLB's if needed.
- * 4. Copy the snapshot to the userspace.
- *
- * Between 2 and 3, the guest may write to the page using the remaining TLB
- * entry. This is not a problem because the page will be reported dirty at
- * step 4 using the snapshot taken before and step 3 ensures that successive
- * writes will be logged for the next call.
+ * 3. Copy the snapshot to the userspace.
+ * 4. Flush TLB's if needed.
*/
int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
{
- int r;
- struct kvm_memory_slot *memslot;
- unsigned long n, i;
- unsigned long *dirty_bitmap;
- unsigned long *dirty_bitmap_buffer;
bool is_dirty = false;
+ int r;
mutex_lock(&kvm->slots_lock);
- r = -EINVAL;
- if (log->slot >= KVM_USER_MEM_SLOTS)
- goto out;
-
- memslot = id_to_memslot(kvm->memslots, log->slot);
-
- dirty_bitmap = memslot->dirty_bitmap;
- r = -ENOENT;
- if (!dirty_bitmap)
- goto out;
-
- n = kvm_dirty_bitmap_bytes(memslot);
-
- dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
- memset(dirty_bitmap_buffer, 0, n);
-
- spin_lock(&kvm->mmu_lock);
-
- for (i = 0; i < n / sizeof(long); i++) {
- unsigned long mask;
- gfn_t offset;
-
- if (!dirty_bitmap[i])
- continue;
-
- is_dirty = true;
-
- mask = xchg(&dirty_bitmap[i], 0);
- dirty_bitmap_buffer[i] = mask;
-
- offset = i * BITS_PER_LONG;
- kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
- }
-
- spin_unlock(&kvm->mmu_lock);
+ /*
+ * Flush potentially hardware-cached dirty pages to dirty_bitmap.
+ */
+ if (kvm_x86_ops->flush_log_dirty)
+ kvm_x86_ops->flush_log_dirty(kvm);
- /* See the comments in kvm_mmu_slot_remove_write_access(). */
- lockdep_assert_held(&kvm->slots_lock);
+ r = kvm_get_dirty_log_protect(kvm, log, &is_dirty);
/*
* All the TLBs can be flushed out of mmu lock, see the comments in
* kvm_mmu_slot_remove_write_access().
*/
+ lockdep_assert_held(&kvm->slots_lock);
if (is_dirty)
kvm_flush_remote_tlbs(kvm);
- r = -EFAULT;
- if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
- goto out;
-
- r = 0;
-out:
mutex_unlock(&kvm->slots_lock);
return r;
}
@@ -4516,6 +4502,8 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
if (rc != X86EMUL_CONTINUE)
return rc;
addr += now;
+ if (ctxt->mode != X86EMUL_MODE_PROT64)
+ addr = (u32)addr;
val += now;
bytes -= now;
}
@@ -4984,6 +4972,11 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon
kvm_register_write(emul_to_vcpu(ctxt), reg, val);
}
+static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
+{
+ kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked);
+}
+
static const struct x86_emulate_ops emulate_ops = {
.read_gpr = emulator_read_gpr,
.write_gpr = emulator_write_gpr,
@@ -5019,6 +5012,7 @@ static const struct x86_emulate_ops emulate_ops = {
.put_fpu = emulator_put_fpu,
.intercept = emulator_intercept,
.get_cpuid = emulator_get_cpuid,
+ .set_nmi_mask = emulator_set_nmi_mask,
};
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
@@ -6311,6 +6305,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
trace_kvm_entry(vcpu->vcpu_id);
+ wait_lapic_expire(vcpu);
kvm_x86_ops->run(vcpu);
/*
@@ -7041,15 +7036,13 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
return r;
}
-int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
- int r;
struct msr_data msr;
struct kvm *kvm = vcpu->kvm;
- r = vcpu_load(vcpu);
- if (r)
- return r;
+ if (vcpu_load(vcpu))
+ return;
msr.data = 0x0;
msr.index = MSR_IA32_TSC;
msr.host_initiated = true;
@@ -7058,8 +7051,6 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
KVMCLOCK_SYNC_PERIOD);
-
- return r;
}
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
@@ -7549,12 +7540,62 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
return 0;
}
+static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
+ struct kvm_memory_slot *new)
+{
+ /* Still write protect RO slot */
+ if (new->flags & KVM_MEM_READONLY) {
+ kvm_mmu_slot_remove_write_access(kvm, new);
+ return;
+ }
+
+ /*
+ * Call kvm_x86_ops dirty logging hooks when they are valid.
+ *
+ * kvm_x86_ops->slot_disable_log_dirty is called when:
+ *
+ * - KVM_MR_CREATE with dirty logging is disabled
+ * - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag
+ *
+ * The reason is, in case of PML, we need to set D-bit for any slots
+ * with dirty logging disabled in order to eliminate unnecessary GPA
+ * logging in PML buffer (and potential PML buffer full VMEXT). This
+ * guarantees leaving PML enabled during guest's lifetime won't have
+ * any additonal overhead from PML when guest is running with dirty
+ * logging disabled for memory slots.
+ *
+ * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot
+ * to dirty logging mode.
+ *
+ * If kvm_x86_ops dirty logging hooks are invalid, use write protect.
+ *
+ * In case of write protect:
+ *
+ * Write protect all pages for dirty logging.
+ *
+ * All the sptes including the large sptes which point to this
+ * slot are set to readonly. We can not create any new large
+ * spte on this slot until the end of the logging.
+ *
+ * See the comments in fast_page_fault().
+ */
+ if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+ if (kvm_x86_ops->slot_enable_log_dirty)
+ kvm_x86_ops->slot_enable_log_dirty(kvm, new);
+ else
+ kvm_mmu_slot_remove_write_access(kvm, new);
+ } else {
+ if (kvm_x86_ops->slot_disable_log_dirty)
+ kvm_x86_ops->slot_disable_log_dirty(kvm, new);
+ }
+}
+
void kvm_arch_commit_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
const struct kvm_memory_slot *old,
enum kvm_mr_change change)
{
-
+ struct kvm_memory_slot *new;
int nr_mmu_pages = 0;
if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) {
@@ -7573,17 +7614,20 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
if (nr_mmu_pages)
kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
+
+ /* It's OK to get 'new' slot here as it has already been installed */
+ new = id_to_memslot(kvm->memslots, mem->slot);
+
/*
- * Write protect all pages for dirty logging.
+ * Set up write protection and/or dirty logging for the new slot.
*
- * All the sptes including the large sptes which point to this
- * slot are set to readonly. We can not create any new large
- * spte on this slot until the end of the logging.
- *
- * See the comments in fast_page_fault().
+ * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have
+ * been zapped so no dirty logging staff is needed for old slot. For
+ * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
+ * new and it's also covered when dealing with the new slot.
*/
- if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
- kvm_mmu_slot_remove_write_access(kvm, mem->slot);
+ if (change != KVM_MR_DELETE)
+ kvm_mmu_slot_apply_flags(kvm, new);
}
void kvm_arch_flush_shadow_all(struct kvm *kvm)
@@ -7837,3 +7881,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);