summaryrefslogtreecommitdiff
path: root/virt/kvm/kvm_main.c
diff options
context:
space:
mode:
Diffstat (limited to 'virt/kvm/kvm_main.c')
-rw-r--r--virt/kvm/kvm_main.c220
1 files changed, 155 insertions, 65 deletions
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f5283438ee05..a1093700f3a4 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -66,6 +66,9 @@
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
+unsigned int halt_poll_ns = 0;
+module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR);
+
/*
* Ordering of locks:
*
@@ -89,7 +92,7 @@ struct dentry *kvm_debugfs_dir;
static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
unsigned long arg);
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_KVM_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
unsigned long arg);
#endif
@@ -176,6 +179,7 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
return called;
}
+#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
long dirty_count = kvm->tlbs_dirty;
@@ -186,6 +190,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
}
EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
+#endif
void kvm_reload_remote_mmus(struct kvm *kvm)
{
@@ -671,7 +676,9 @@ static void update_memslots(struct kvm_memslots *slots,
WARN_ON(mslots[i].id != id);
if (!new->npages) {
+ WARN_ON(!mslots[i].npages);
new->base_gfn = 0;
+ new->flags = 0;
if (mslots[i].npages)
slots->used_slots--;
} else {
@@ -687,12 +694,25 @@ static void update_memslots(struct kvm_memslots *slots,
slots->id_to_index[mslots[i].id] = i;
i++;
}
- while (i > 0 &&
- new->base_gfn > mslots[i - 1].base_gfn) {
- mslots[i] = mslots[i - 1];
- slots->id_to_index[mslots[i].id] = i;
- i--;
- }
+
+ /*
+ * The ">=" is needed when creating a slot with base_gfn == 0,
+ * so that it moves before all those with base_gfn == npages == 0.
+ *
+ * On the other hand, if new->npages is zero, the above loop has
+ * already left i pointing to the beginning of the empty part of
+ * mslots, and the ">=" would move the hole backwards in this
+ * case---which is wrong. So skip the loop when deleting a slot.
+ */
+ if (new->npages) {
+ while (i > 0 &&
+ new->base_gfn >= mslots[i - 1].base_gfn) {
+ mslots[i] = mslots[i - 1];
+ slots->id_to_index[mslots[i].id] = i;
+ i--;
+ }
+ } else
+ WARN_ON_ONCE(i != slots->used_slots);
mslots[i] = *new;
slots->id_to_index[mslots[i].id] = i;
@@ -979,6 +999,86 @@ out:
}
EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+/**
+ * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages
+ * are dirty write protect them for next write.
+ * @kvm: pointer to kvm instance
+ * @log: slot id and address to which we copy the log
+ * @is_dirty: flag set if any page is dirty
+ *
+ * We need to keep it in mind that VCPU threads can write to the bitmap
+ * concurrently. So, to avoid losing track of dirty pages we keep the
+ * following order:
+ *
+ * 1. Take a snapshot of the bit and clear it if needed.
+ * 2. Write protect the corresponding page.
+ * 3. Copy the snapshot to the userspace.
+ * 4. Upon return caller flushes TLB's if needed.
+ *
+ * Between 2 and 4, the guest may write to the page using the remaining TLB
+ * entry. This is not a problem because the page is reported dirty using
+ * the snapshot taken before and step 4 ensures that writes done after
+ * exiting to userspace will be logged for the next call.
+ *
+ */
+int kvm_get_dirty_log_protect(struct kvm *kvm,
+ struct kvm_dirty_log *log, bool *is_dirty)
+{
+ struct kvm_memory_slot *memslot;
+ int r, i;
+ unsigned long n;
+ unsigned long *dirty_bitmap;
+ unsigned long *dirty_bitmap_buffer;
+
+ r = -EINVAL;
+ if (log->slot >= KVM_USER_MEM_SLOTS)
+ goto out;
+
+ memslot = id_to_memslot(kvm->memslots, log->slot);
+
+ dirty_bitmap = memslot->dirty_bitmap;
+ r = -ENOENT;
+ if (!dirty_bitmap)
+ goto out;
+
+ n = kvm_dirty_bitmap_bytes(memslot);
+
+ dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
+ memset(dirty_bitmap_buffer, 0, n);
+
+ spin_lock(&kvm->mmu_lock);
+ *is_dirty = false;
+ for (i = 0; i < n / sizeof(long); i++) {
+ unsigned long mask;
+ gfn_t offset;
+
+ if (!dirty_bitmap[i])
+ continue;
+
+ *is_dirty = true;
+
+ mask = xchg(&dirty_bitmap[i], 0);
+ dirty_bitmap_buffer[i] = mask;
+
+ offset = i * BITS_PER_LONG;
+ kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset,
+ mask);
+ }
+
+ spin_unlock(&kvm->mmu_lock);
+
+ r = -EFAULT;
+ if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
+ goto out;
+
+ r = 0;
+out:
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
+#endif
+
bool kvm_largepages_enabled(void)
{
return largepages_enabled;
@@ -1114,43 +1214,6 @@ static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
}
-int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long addr, bool write_fault,
- struct page **pagep)
-{
- int npages;
- int locked = 1;
- int flags = FOLL_TOUCH | FOLL_HWPOISON |
- (pagep ? FOLL_GET : 0) |
- (write_fault ? FOLL_WRITE : 0);
-
- /*
- * If retrying the fault, we get here *not* having allowed the filemap
- * to wait on the page lock. We should now allow waiting on the IO with
- * the mmap semaphore released.
- */
- down_read(&mm->mmap_sem);
- npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL,
- &locked);
- if (!locked) {
- VM_BUG_ON(npages);
-
- if (!pagep)
- return 0;
-
- /*
- * The previous call has now waited on the IO. Now we can
- * retry and complete. Pass TRIED to ensure we do not re
- * schedule async IO (see e.g. filemap_fault).
- */
- down_read(&mm->mmap_sem);
- npages = __get_user_pages(tsk, mm, addr, 1, flags | FOLL_TRIED,
- pagep, NULL, NULL);
- }
- up_read(&mm->mmap_sem);
- return npages;
-}
-
static inline int check_user_page_hwpoison(unsigned long addr)
{
int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
@@ -1213,15 +1276,10 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
npages = get_user_page_nowait(current, current->mm,
addr, write_fault, page);
up_read(&current->mm->mmap_sem);
- } else {
- /*
- * By now we have tried gup_fast, and possibly async_pf, and we
- * are certainly not atomic. Time to retry the gup, allowing
- * mmap semaphore to be relinquished in the case of IO.
- */
- npages = kvm_get_user_page_io(current, current->mm, addr,
- write_fault, page);
- }
+ } else
+ npages = __get_user_pages_unlocked(current, current->mm, addr, 1,
+ write_fault, 0, page,
+ FOLL_TOUCH|FOLL_HWPOISON);
if (npages != 1)
return npages;
@@ -1579,6 +1637,7 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
}
return 0;
}
+EXPORT_SYMBOL_GPL(kvm_write_guest);
int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
gpa_t gpa, unsigned long len)
@@ -1715,29 +1774,60 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(mark_page_dirty);
+static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
+{
+ if (kvm_arch_vcpu_runnable(vcpu)) {
+ kvm_make_request(KVM_REQ_UNHALT, vcpu);
+ return -EINTR;
+ }
+ if (kvm_cpu_has_pending_timer(vcpu))
+ return -EINTR;
+ if (signal_pending(current))
+ return -EINTR;
+
+ return 0;
+}
+
/*
* The vCPU has executed a HLT instruction with in-kernel mode enabled.
*/
void kvm_vcpu_block(struct kvm_vcpu *vcpu)
{
+ ktime_t start, cur;
DEFINE_WAIT(wait);
+ bool waited = false;
+
+ start = cur = ktime_get();
+ if (halt_poll_ns) {
+ ktime_t stop = ktime_add_ns(ktime_get(), halt_poll_ns);
+ do {
+ /*
+ * This sets KVM_REQ_UNHALT if an interrupt
+ * arrives.
+ */
+ if (kvm_vcpu_check_block(vcpu) < 0) {
+ ++vcpu->stat.halt_successful_poll;
+ goto out;
+ }
+ cur = ktime_get();
+ } while (single_task_running() && ktime_before(cur, stop));
+ }
for (;;) {
prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
- if (kvm_arch_vcpu_runnable(vcpu)) {
- kvm_make_request(KVM_REQ_UNHALT, vcpu);
- break;
- }
- if (kvm_cpu_has_pending_timer(vcpu))
- break;
- if (signal_pending(current))
+ if (kvm_vcpu_check_block(vcpu) < 0)
break;
+ waited = true;
schedule();
}
finish_wait(&vcpu->wq, &wait);
+ cur = ktime_get();
+
+out:
+ trace_kvm_vcpu_wakeup(ktime_to_ns(cur) - ktime_to_ns(start), waited);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_block);
@@ -1920,7 +2010,7 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp)
static struct file_operations kvm_vcpu_fops = {
.release = kvm_vcpu_release,
.unlocked_ioctl = kvm_vcpu_ioctl,
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_KVM_COMPAT
.compat_ioctl = kvm_vcpu_compat_ioctl,
#endif
.mmap = kvm_vcpu_mmap,
@@ -2210,7 +2300,7 @@ out:
return r;
}
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_KVM_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -2302,7 +2392,7 @@ static int kvm_device_release(struct inode *inode, struct file *filp)
static const struct file_operations kvm_device_fops = {
.unlocked_ioctl = kvm_device_ioctl,
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_KVM_COMPAT
.compat_ioctl = kvm_device_ioctl,
#endif
.release = kvm_device_release,
@@ -2589,7 +2679,7 @@ out:
return r;
}
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_KVM_COMPAT
struct compat_kvm_dirty_log {
__u32 slot;
__u32 padding1;
@@ -2636,7 +2726,7 @@ out:
static struct file_operations kvm_vm_fops = {
.release = kvm_vm_release,
.unlocked_ioctl = kvm_vm_ioctl,
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_KVM_COMPAT
.compat_ioctl = kvm_vm_compat_ioctl,
#endif
.llseek = noop_llseek,