diff options
Diffstat (limited to 'virt/kvm/kvm_main.c')
-rw-r--r-- | virt/kvm/kvm_main.c | 185 |
1 files changed, 130 insertions, 55 deletions
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ba0327e2d0d3..222f0e894a0c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -143,8 +143,6 @@ static int kvm_no_compat_open(struct inode *inode, struct file *file) #define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \ .open = kvm_no_compat_open #endif -static int kvm_enable_virtualization(void); -static void kvm_disable_virtualization(void); static void kvm_io_bus_destroy(struct kvm_io_bus *bus); @@ -489,6 +487,14 @@ void kvm_destroy_vcpus(struct kvm *kvm) kvm_for_each_vcpu(i, vcpu, kvm) { kvm_vcpu_destroy(vcpu); xa_erase(&kvm->vcpu_array, i); + + /* + * Assert that the vCPU isn't visible in any way, to ensure KVM + * doesn't trigger a use-after-free if destroying vCPUs results + * in VM-wide request, e.g. to flush remote TLBs when tearing + * down MMUs, or to mark the VM dead if a KVM_BUG_ON() fires. + */ + WARN_ON_ONCE(xa_load(&kvm->vcpu_array, i) || kvm_get_vcpu(kvm, i)); } atomic_set(&kvm->online_vcpus, 0); @@ -517,6 +523,7 @@ struct kvm_mmu_notifier_range { on_lock_fn_t on_lock; bool flush_on_ret; bool may_block; + bool lockless; }; /* @@ -551,8 +558,8 @@ static void kvm_null_fn(void) node; \ node = interval_tree_iter_next(node, start, last)) \ -static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm, - const struct kvm_mmu_notifier_range *range) +static __always_inline kvm_mn_ret_t kvm_handle_hva_range(struct kvm *kvm, + const struct kvm_mmu_notifier_range *range) { struct kvm_mmu_notifier_return r = { .ret = false, @@ -571,6 +578,10 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm, IS_KVM_NULL_FN(range->handler))) return r; + /* on_lock will never be called for lockless walks */ + if (WARN_ON_ONCE(range->lockless && !IS_KVM_NULL_FN(range->on_lock))) + return r; + idx = srcu_read_lock(&kvm->srcu); for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { @@ -607,15 +618,18 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm, gfn_range.start = hva_to_gfn_memslot(hva_start, slot); gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot); gfn_range.slot = slot; + gfn_range.lockless = range->lockless; if (!r.found_memslot) { r.found_memslot = true; - KVM_MMU_LOCK(kvm); - if (!IS_KVM_NULL_FN(range->on_lock)) - range->on_lock(kvm); - - if (IS_KVM_NULL_FN(range->handler)) - goto mmu_unlock; + if (!range->lockless) { + KVM_MMU_LOCK(kvm); + if (!IS_KVM_NULL_FN(range->on_lock)) + range->on_lock(kvm); + + if (IS_KVM_NULL_FN(range->handler)) + goto mmu_unlock; + } } r.ret |= range->handler(kvm, &gfn_range); } @@ -625,7 +639,7 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm, kvm_flush_remote_tlbs(kvm); mmu_unlock: - if (r.found_memslot) + if (r.found_memslot && !range->lockless) KVM_MMU_UNLOCK(kvm); srcu_read_unlock(&kvm->srcu, idx); @@ -633,7 +647,7 @@ mmu_unlock: return r; } -static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn, +static __always_inline int kvm_age_hva_range(struct mmu_notifier *mn, unsigned long start, unsigned long end, gfn_handler_t handler, @@ -647,17 +661,18 @@ static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn, .on_lock = (void *)kvm_null_fn, .flush_on_ret = flush_on_ret, .may_block = false, + .lockless = IS_ENABLED(CONFIG_KVM_MMU_LOCKLESS_AGING), }; - return __kvm_handle_hva_range(kvm, &range).ret; + return kvm_handle_hva_range(kvm, &range).ret; } -static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn, - unsigned long start, - unsigned long end, - gfn_handler_t handler) +static __always_inline int kvm_age_hva_range_no_flush(struct mmu_notifier *mn, + unsigned long start, + unsigned long end, + gfn_handler_t handler) { - return kvm_handle_hva_range(mn, start, end, handler, false); + return kvm_age_hva_range(mn, start, end, handler, false); } void kvm_mmu_invalidate_begin(struct kvm *kvm) @@ -752,7 +767,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, * that guest memory has been reclaimed. This needs to be done *after* * dropping mmu_lock, as x86's reclaim path is slooooow. */ - if (__kvm_handle_hva_range(kvm, &hva_range).found_memslot) + if (kvm_handle_hva_range(kvm, &hva_range).found_memslot) kvm_arch_guest_memory_reclaimed(kvm); return 0; @@ -798,7 +813,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, }; bool wake; - __kvm_handle_hva_range(kvm, &hva_range); + kvm_handle_hva_range(kvm, &hva_range); /* Pairs with the increment in range_start(). */ spin_lock(&kvm->mn_invalidate_lock); @@ -822,8 +837,8 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, { trace_kvm_age_hva(start, end); - return kvm_handle_hva_range(mn, start, end, kvm_age_gfn, - !IS_ENABLED(CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG)); + return kvm_age_hva_range(mn, start, end, kvm_age_gfn, + !IS_ENABLED(CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG)); } static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, @@ -846,7 +861,7 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, * cadence. If we find this inaccurate, we might come up with a * more sophisticated heuristic later. */ - return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn); + return kvm_age_hva_range_no_flush(mn, start, end, kvm_age_gfn); } static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, @@ -855,8 +870,8 @@ static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, { trace_kvm_test_age_hva(address); - return kvm_handle_hva_range_no_flush(mn, address, address + 1, - kvm_test_age_gfn); + return kvm_age_hva_range_no_flush(mn, address, address + 1, + kvm_test_age_gfn); } static void kvm_mmu_notifier_release(struct mmu_notifier *mn, @@ -1254,7 +1269,6 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm_destroy_pm_notifier(kvm); kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); kvm_destroy_vm_debugfs(kvm); - kvm_arch_sync_events(kvm); mutex_lock(&kvm_lock); list_del(&kvm->vm_list); mutex_unlock(&kvm_lock); @@ -1354,6 +1368,65 @@ static int kvm_vm_release(struct inode *inode, struct file *filp) return 0; } +int kvm_trylock_all_vcpus(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + unsigned long i, j; + + lockdep_assert_held(&kvm->lock); + + kvm_for_each_vcpu(i, vcpu, kvm) + if (!mutex_trylock_nest_lock(&vcpu->mutex, &kvm->lock)) + goto out_unlock; + return 0; + +out_unlock: + kvm_for_each_vcpu(j, vcpu, kvm) { + if (i == j) + break; + mutex_unlock(&vcpu->mutex); + } + return -EINTR; +} +EXPORT_SYMBOL_GPL(kvm_trylock_all_vcpus); + +int kvm_lock_all_vcpus(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + unsigned long i, j; + int r; + + lockdep_assert_held(&kvm->lock); + + kvm_for_each_vcpu(i, vcpu, kvm) { + r = mutex_lock_killable_nest_lock(&vcpu->mutex, &kvm->lock); + if (r) + goto out_unlock; + } + return 0; + +out_unlock: + kvm_for_each_vcpu(j, vcpu, kvm) { + if (i == j) + break; + mutex_unlock(&vcpu->mutex); + } + return r; +} +EXPORT_SYMBOL_GPL(kvm_lock_all_vcpus); + +void kvm_unlock_all_vcpus(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + unsigned long i; + + lockdep_assert_held(&kvm->lock); + + kvm_for_each_vcpu(i, vcpu, kvm) + mutex_unlock(&vcpu->mutex); +} +EXPORT_SYMBOL_GPL(kvm_unlock_all_vcpus); + /* * Allocation size is twice as large as the actual dirty bitmap size. * See kvm_vm_ioctl_get_dirty_log() why this is needed. @@ -2499,6 +2572,8 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end, r = xa_reserve(&kvm->mem_attr_array, i, GFP_KERNEL_ACCOUNT); if (r) goto out_unlock; + + cond_resched(); } kvm_handle_gfn_range(kvm, &pre_set_range); @@ -2507,6 +2582,7 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end, r = xa_err(xa_store(&kvm->mem_attr_array, i, entry, GFP_KERNEL_ACCOUNT)); KVM_BUG_ON(r, kvm); + cond_resched(); } kvm_handle_gfn_range(kvm, &post_set_range); @@ -3725,7 +3801,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); /* * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. */ -void kvm_vcpu_kick(struct kvm_vcpu *vcpu) +void __kvm_vcpu_kick(struct kvm_vcpu *vcpu, bool wait) { int me, cpu; @@ -3754,13 +3830,24 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) */ if (kvm_arch_vcpu_should_kick(vcpu)) { cpu = READ_ONCE(vcpu->cpu); - if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) - smp_send_reschedule(cpu); + if (cpu != me && (unsigned int)cpu < nr_cpu_ids && cpu_online(cpu)) { + /* + * Use a reschedule IPI to kick the vCPU if the caller + * doesn't need to wait for a response, as KVM allows + * kicking vCPUs while IRQs are disabled, but using the + * SMP function call framework with IRQs disabled can + * deadlock due to taking cross-CPU locks. + */ + if (wait) + smp_call_function_single(cpu, ack_kick, NULL, wait); + else + smp_send_reschedule(cpu); + } } out: put_cpu(); } -EXPORT_SYMBOL_GPL(kvm_vcpu_kick); +EXPORT_SYMBOL_GPL(__kvm_vcpu_kick); #endif /* !CONFIG_S390 */ int kvm_vcpu_yield_to(struct kvm_vcpu *target) @@ -4110,7 +4197,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id) goto vcpu_free_run_page; if (kvm->dirty_ring_size) { - r = kvm_dirty_ring_alloc(&vcpu->dirty_ring, + r = kvm_dirty_ring_alloc(kvm, &vcpu->dirty_ring, id, kvm->dirty_ring_size); if (r) goto arch_vcpu_destroy; @@ -4224,15 +4311,14 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu) if (fd < 0) return fd; - file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY); + file = anon_inode_getfile_fmode(name, &kvm_vcpu_stats_fops, vcpu, + O_RDONLY, FMODE_PREAD); if (IS_ERR(file)) { put_unused_fd(fd); return PTR_ERR(file); } kvm_get_kvm(vcpu->kvm); - - file->f_mode |= FMODE_PREAD; fd_install(fd, file); return fd; @@ -4849,7 +4935,7 @@ static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size) return -EINVAL; /* Should be bigger to keep the reserved entries, or a page */ - if (size < kvm_dirty_ring_get_rsvd_entries() * + if (size < kvm_dirty_ring_get_rsvd_entries(kvm) * sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE) return -EINVAL; @@ -5020,16 +5106,14 @@ static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm) if (fd < 0) return fd; - file = anon_inode_getfile("kvm-vm-stats", - &kvm_vm_stats_fops, kvm, O_RDONLY); + file = anon_inode_getfile_fmode("kvm-vm-stats", + &kvm_vm_stats_fops, kvm, O_RDONLY, FMODE_PREAD); if (IS_ERR(file)) { put_unused_fd(fd); return PTR_ERR(file); } kvm_get_kvm(kvm); - - file->f_mode |= FMODE_PREAD; fd_install(fd, file); return fd; @@ -5466,8 +5550,9 @@ static struct miscdevice kvm_dev = { }; #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING -static bool enable_virt_at_load = true; +bool enable_virt_at_load = true; module_param(enable_virt_at_load, bool, 0444); +EXPORT_SYMBOL_GPL(enable_virt_at_load); __visible bool kvm_rebooting; EXPORT_SYMBOL_GPL(kvm_rebooting); @@ -5576,7 +5661,7 @@ static struct syscore_ops kvm_syscore_ops = { .shutdown = kvm_shutdown, }; -static int kvm_enable_virtualization(void) +int kvm_enable_virtualization(void) { int r; @@ -5621,8 +5706,9 @@ err_cpuhp: --kvm_usage_count; return r; } +EXPORT_SYMBOL_GPL(kvm_enable_virtualization); -static void kvm_disable_virtualization(void) +void kvm_disable_virtualization(void) { guard(mutex)(&kvm_usage_lock); @@ -5633,6 +5719,7 @@ static void kvm_disable_virtualization(void) cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); kvm_arch_disable_virtualization(); } +EXPORT_SYMBOL_GPL(kvm_disable_virtualization); static int kvm_init_virtualization(void) { @@ -5648,21 +5735,11 @@ static void kvm_uninit_virtualization(void) kvm_disable_virtualization(); } #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ -static int kvm_enable_virtualization(void) -{ - return 0; -} - static int kvm_init_virtualization(void) { return 0; } -static void kvm_disable_virtualization(void) -{ - -} - static void kvm_uninit_virtualization(void) { @@ -5761,7 +5838,6 @@ static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, return -EOPNOTSUPP; } -/* kvm_io_bus_write - called under kvm->slots_lock */ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val) { @@ -5782,7 +5858,6 @@ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, } EXPORT_SYMBOL_GPL(kvm_io_bus_write); -/* kvm_io_bus_write_cookie - called under kvm->slots_lock */ int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val, long cookie) { @@ -5832,7 +5907,6 @@ static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, return -EOPNOTSUPP; } -/* kvm_io_bus_read - called under kvm->slots_lock */ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, void *val) { @@ -5851,6 +5925,7 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, r = __kvm_io_bus_read(vcpu, bus, &range, val); return r < 0 ? r : 0; } +EXPORT_SYMBOL_GPL(kvm_io_bus_read); int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, struct kvm_io_device *dev) |