Diffstat (limited to 'virt/kvm/kvm_main.c')
-rw-r--r-- | virt/kvm/kvm_main.c | 238
1 file changed, 195 insertions, 43 deletions
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index fd68fbe0a75d..13efc291b1c7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -50,6 +50,7 @@
 #include <linux/bsearch.h>
 #include <linux/io.h>
 #include <linux/lockdep.h>
+#include <linux/kthread.h>
 
 #include <asm/processor.h>
 #include <asm/ioctl.h>
@@ -121,9 +122,22 @@ static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
                            unsigned long arg);
 #define KVM_COMPAT(c)   .compat_ioctl   = (c)
 #else
+/*
+ * For architectures that don't implement a compat infrastructure,
+ * adopt a double line of defense:
+ * - Prevent a compat task from opening /dev/kvm
+ * - If the open has been done by a 64bit task, and the KVM fd
+ *   passed to a compat task, let the ioctls fail.
+ */
 static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
                                 unsigned long arg) { return -EINVAL; }
-#define KVM_COMPAT(c)   .compat_ioctl = kvm_no_compat_ioctl
+
+static int kvm_no_compat_open(struct inode *inode, struct file *file)
+{
+        return is_compat_task() ? -ENODEV : 0;
+}
+#define KVM_COMPAT(c)   .compat_ioctl   = kvm_no_compat_ioctl,  \
+                        .open           = kvm_no_compat_open
 #endif
 static int hardware_enable_all(void);
 static void hardware_disable_all(void);
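The double line of defense above is self-contained and reusable: any character device that must lock out 32-bit userspace can adopt the same pattern. A minimal sketch, assuming an architecture where is_compat_task() is available (the names example_open, example_no_compat_ioctl and example_fops are hypothetical, not part of this patch):

    #include <linux/compat.h>
    #include <linux/fs.h>

    /* First line of defense: a compat task cannot open the device at all. */
    static int example_open(struct inode *inode, struct file *file)
    {
            return is_compat_task() ? -ENODEV : 0;
    }

    /* Second line of defense: if a 64-bit task opened the fd and handed it
     * to a compat task, compat ioctls fail cleanly instead of having their
     * arguments misinterpreted. */
    static long example_no_compat_ioctl(struct file *file, unsigned int ioctl,
                                        unsigned long arg)
    {
            return -EINVAL;
    }

    static const struct file_operations example_fops = {
            .open          = example_open,
            .compat_ioctl  = example_no_compat_ioctl,
    };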
@@ -149,10 +163,30 @@ __weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
         return 0;
 }
 
+bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
+{
+        /*
+         * The metadata used by is_zone_device_page() to determine whether or
+         * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
+         * the device has been pinned, e.g. by get_user_pages().  WARN if the
+         * page_count() is zero to help detect bad usage of this helper.
+         */
+        if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
+                return false;
+
+        return is_zone_device_page(pfn_to_page(pfn));
+}
+
 bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
 {
+        /*
+         * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
+         * perspective they are "normal" pages, albeit with slightly different
+         * usage rules.
+         */
         if (pfn_valid(pfn))
-                return PageReserved(pfn_to_page(pfn));
+                return PageReserved(pfn_to_page(pfn)) &&
+                       !kvm_is_zone_device_pfn(pfn);
 
         return true;
 }
@@ -625,10 +659,28 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
         return 0;
 }
 
+/*
+ * Called after the VM is otherwise initialized, but just before adding it to
+ * the vm_list.
+ */
+int __weak kvm_arch_post_init_vm(struct kvm *kvm)
+{
+        return 0;
+}
+
+/*
+ * Called just after removing the VM from the vm_list, but before doing any
+ * other destruction.
+ */
+void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
+{
+}
+
 static struct kvm *kvm_create_vm(unsigned long type)
 {
-        int r, i;
         struct kvm *kvm = kvm_arch_alloc_vm();
+        int r = -ENOMEM;
+        int i;
 
         if (!kvm)
                 return ERR_PTR(-ENOMEM);
@@ -640,46 +692,51 @@ static struct kvm *kvm_create_vm(unsigned long type)
         mutex_init(&kvm->lock);
         mutex_init(&kvm->irq_lock);
         mutex_init(&kvm->slots_lock);
-        refcount_set(&kvm->users_count, 1);
         INIT_LIST_HEAD(&kvm->devices);
 
-        r = kvm_arch_init_vm(kvm, type);
-        if (r)
-                goto out_err_no_disable;
-
-        r = hardware_enable_all();
-        if (r)
-                goto out_err_no_disable;
-
-#ifdef CONFIG_HAVE_KVM_IRQFD
-        INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
-#endif
-
         BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
 
-        r = -ENOMEM;
+        if (init_srcu_struct(&kvm->srcu))
+                goto out_err_no_srcu;
+        if (init_srcu_struct(&kvm->irq_srcu))
+                goto out_err_no_irq_srcu;
+
+        refcount_set(&kvm->users_count, 1);
         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
                 struct kvm_memslots *slots = kvm_alloc_memslots();
+
                 if (!slots)
-                        goto out_err_no_srcu;
+                        goto out_err_no_arch_destroy_vm;
                 /* Generations must be different for each address space. */
                 slots->generation = i;
                 rcu_assign_pointer(kvm->memslots[i], slots);
         }
 
-        if (init_srcu_struct(&kvm->srcu))
-                goto out_err_no_srcu;
-        if (init_srcu_struct(&kvm->irq_srcu))
-                goto out_err_no_irq_srcu;
         for (i = 0; i < KVM_NR_BUSES; i++) {
                 rcu_assign_pointer(kvm->buses[i],
                         kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
                 if (!kvm->buses[i])
-                        goto out_err;
+                        goto out_err_no_arch_destroy_vm;
         }
 
+        r = kvm_arch_init_vm(kvm, type);
+        if (r)
+                goto out_err_no_arch_destroy_vm;
+
+        r = hardware_enable_all();
+        if (r)
+                goto out_err_no_disable;
+
+#ifdef CONFIG_HAVE_KVM_IRQFD
+        INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
+#endif
+
         r = kvm_init_mmu_notifier(kvm);
         if (r)
+                goto out_err_no_mmu_notifier;
+
+        r = kvm_arch_post_init_vm(kvm);
+        if (r)
                 goto out_err;
 
         mutex_lock(&kvm_lock);
@@ -691,17 +748,24 @@ static struct kvm *kvm_create_vm(unsigned long type)
         return kvm;
 
 out_err:
-        cleanup_srcu_struct(&kvm->irq_srcu);
-out_err_no_irq_srcu:
-        cleanup_srcu_struct(&kvm->srcu);
-out_err_no_srcu:
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+        if (kvm->mmu_notifier.ops)
+                mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
+#endif
+out_err_no_mmu_notifier:
         hardware_disable_all();
 out_err_no_disable:
-        refcount_set(&kvm->users_count, 0);
+        kvm_arch_destroy_vm(kvm);
+out_err_no_arch_destroy_vm:
+        WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
         for (i = 0; i < KVM_NR_BUSES; i++)
                 kfree(kvm_get_bus(kvm, i));
         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
                 kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
+        cleanup_srcu_struct(&kvm->irq_srcu);
+out_err_no_irq_srcu:
+        cleanup_srcu_struct(&kvm->srcu);
+out_err_no_srcu:
         kvm_arch_free_vm(kvm);
         mmdrop(current->mm);
         return ERR_PTR(r);
@@ -733,6 +797,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
         mutex_lock(&kvm_lock);
         list_del(&kvm->vm_list);
         mutex_unlock(&kvm_lock);
+        kvm_arch_pre_destroy_vm(kvm);
+
         kvm_free_irq_routing(kvm);
         for (i = 0; i < KVM_NR_BUSES; i++) {
                 struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
@@ -1853,7 +1919,7 @@ EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
 
 void kvm_set_pfn_dirty(kvm_pfn_t pfn)
 {
-        if (!kvm_is_reserved_pfn(pfn)) {
+        if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) {
                 struct page *page = pfn_to_page(pfn);
 
                 SetPageDirty(page);
@@ -1863,7 +1929,7 @@ EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
 
 void kvm_set_pfn_accessed(kvm_pfn_t pfn)
 {
-        if (!kvm_is_reserved_pfn(pfn))
+        if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
                 mark_page_accessed(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
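The subtle part of kvm_is_zone_device_pfn() is the pinning requirement spelled out in its comment: the result is only trustworthy while the caller still holds the reference taken when the gfn was translated, e.g. via gfn_to_pfn(). A minimal illustrative caller mirroring the two hunks above (example_age_page is a hypothetical name, not a KVM API):

    /* Age a guest page.  The pfn must still carry the reference taken at
     * translation time, otherwise the ZONE_DEVICE metadata inspected by
     * kvm_is_zone_device_pfn() may already have been torn down. */
    static void example_age_page(kvm_pfn_t pfn)
    {
            /* Reserved pfns (e.g. MMIO) have no struct page state to update;
             * ZONE_DEVICE pages are refcounted but manage their own LRU and
             * dirty tracking, so both cases must be skipped. */
            if (kvm_is_reserved_pfn(pfn) || kvm_is_zone_device_pfn(pfn))
                    return;

            mark_page_accessed(pfn_to_page(pfn));
    }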
@@ -2360,20 +2426,23 @@ out:
         kvm_arch_vcpu_unblocking(vcpu);
         block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
 
-        if (!vcpu_valid_wakeup(vcpu))
-                shrink_halt_poll_ns(vcpu);
-        else if (halt_poll_ns) {
-                if (block_ns <= vcpu->halt_poll_ns)
-                        ;
-                /* we had a long block, shrink polling */
-                else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
-                        shrink_halt_poll_ns(vcpu);
-                /* we had a short halt and our poll time is too small */
-                else if (vcpu->halt_poll_ns < halt_poll_ns &&
-                        block_ns < halt_poll_ns)
-                        grow_halt_poll_ns(vcpu);
-        } else
-                vcpu->halt_poll_ns = 0;
+        if (!kvm_arch_no_poll(vcpu)) {
+                if (!vcpu_valid_wakeup(vcpu)) {
+                        shrink_halt_poll_ns(vcpu);
+                } else if (halt_poll_ns) {
+                        if (block_ns <= vcpu->halt_poll_ns)
+                                ;
+                        /* we had a long block, shrink polling */
+                        else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
+                                shrink_halt_poll_ns(vcpu);
+                        /* we had a short halt and our poll time is too small */
+                        else if (vcpu->halt_poll_ns < halt_poll_ns &&
+                                 block_ns < halt_poll_ns)
+                                grow_halt_poll_ns(vcpu);
+                } else {
+                        vcpu->halt_poll_ns = 0;
+                }
+        }
 
         trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
         kvm_arch_vcpu_block_finish(vcpu);
@@ -4364,3 +4433,86 @@ void kvm_exit(void)
         kvm_vfio_ops_exit();
 }
 EXPORT_SYMBOL_GPL(kvm_exit);
+
+struct kvm_vm_worker_thread_context {
+        struct kvm *kvm;
+        struct task_struct *parent;
+        struct completion init_done;
+        kvm_vm_thread_fn_t thread_fn;
+        uintptr_t data;
+        int err;
+};
+
+static int kvm_vm_worker_thread(void *context)
+{
+        /*
+         * The init_context is allocated on the stack of the parent thread, so
+         * we have to locally copy anything that is needed beyond initialization
+         */
+        struct kvm_vm_worker_thread_context *init_context = context;
+        struct kvm *kvm = init_context->kvm;
+        kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
+        uintptr_t data = init_context->data;
+        int err;
+
+        err = kthread_park(current);
+        /* kthread_park(current) is never supposed to return an error */
+        WARN_ON(err != 0);
+        if (err)
+                goto init_complete;
+
+        err = cgroup_attach_task_all(init_context->parent, current);
+        if (err) {
+                kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
+                        __func__, err);
+                goto init_complete;
+        }
+
+        set_user_nice(current, task_nice(init_context->parent));
+
+init_complete:
+        init_context->err = err;
+        complete(&init_context->init_done);
+        init_context = NULL;
+
+        if (err)
+                return err;
+
+        /* Wait to be woken up by the spawner before proceeding. */
+        kthread_parkme();
+
+        if (!kthread_should_stop())
+                err = thread_fn(kvm, data);
+
+        return err;
+}
+
+int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
+                                uintptr_t data, const char *name,
+                                struct task_struct **thread_ptr)
+{
+        struct kvm_vm_worker_thread_context init_context = {};
+        struct task_struct *thread;
+
+        *thread_ptr = NULL;
+        init_context.kvm = kvm;
+        init_context.parent = current;
+        init_context.thread_fn = thread_fn;
+        init_context.data = data;
+        init_completion(&init_context.init_done);
+
+        thread = kthread_run(kvm_vm_worker_thread, &init_context,
+                             "%s-%d", name, task_pid_nr(current));
+        if (IS_ERR(thread))
+                return PTR_ERR(thread);
+
+        /* kthread_run is never supposed to return NULL */
+        WARN_ON(thread == NULL);
+
+        wait_for_completion(&init_context.init_done);
+
+        if (!init_context.err)
+                *thread_ptr = thread;
+
+        return init_context.err;
+}
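A worker created by kvm_vm_create_worker_thread() parks itself once its initialization (cgroup attachment, nice level) is done, so the spawner decides when it actually starts running; the first in-tree user of this helper was x86's NX huge page recovery thread. A minimal usage sketch with hypothetical names (example_worker_fn, "example-worker"):

    /* Thread body: runs in the new kthread once the spawner unparks it. */
    static int example_worker_fn(struct kvm *kvm, uintptr_t data)
    {
            while (!kthread_should_stop()) {
                    /* ... periodic per-VM housekeeping ... */
                    schedule_timeout_interruptible(HZ);
            }
            return 0;
    }

    static int example_start_worker(struct kvm *kvm,
                                    struct task_struct **thread)
    {
            int err;

            err = kvm_vm_create_worker_thread(kvm, example_worker_fn, 0,
                                              "example-worker", thread);
            if (err)
                    return err;

            /* The worker parked itself after init; let it run now. */
            kthread_unpark(*thread);
            return 0;
    }

Teardown pairs naturally with the new kvm_arch_pre_destroy_vm() hook: calling kthread_stop() there reaps the worker after the VM has left vm_list but before any other destruction.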