diff options
Diffstat (limited to 'virt/kvm/kvm_main.c')
-rw-r--r-- | virt/kvm/kvm_main.c | 217 |
1 files changed, 162 insertions, 55 deletions
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a49df8988cd6..32896c845ffe 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -168,7 +168,7 @@ __weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) { } -bool kvm_is_zone_device_pfn(kvm_pfn_t pfn) +bool kvm_is_zone_device_page(struct page *page) { /* * The metadata used by is_zone_device_page() to determine whether or @@ -176,25 +176,42 @@ bool kvm_is_zone_device_pfn(kvm_pfn_t pfn) * the device has been pinned, e.g. by get_user_pages(). WARN if the * page_count() is zero to help detect bad usage of this helper. */ - if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn)))) + if (WARN_ON_ONCE(!page_count(page))) return false; - return is_zone_device_page(pfn_to_page(pfn)); + return is_zone_device_page(page); } -bool kvm_is_reserved_pfn(kvm_pfn_t pfn) +/* + * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted + * page, NULL otherwise. Note, the list of refcounted PG_reserved page types + * is likely incomplete, it has been compiled purely through people wanting to + * back guest with a certain type of memory and encountering issues. + */ +struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn) { + struct page *page; + + if (!pfn_valid(pfn)) + return NULL; + + page = pfn_to_page(pfn); + if (!PageReserved(page)) + return page; + + /* The ZERO_PAGE(s) is marked PG_reserved, but is refcounted. */ + if (is_zero_pfn(pfn)) + return page; + /* * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting * perspective they are "normal" pages, albeit with slightly different * usage rules. */ - if (pfn_valid(pfn)) - return PageReserved(pfn_to_page(pfn)) && - !is_zero_pfn(pfn) && - !kvm_is_zone_device_pfn(pfn); + if (kvm_is_zone_device_page(page)) + return page; - return true; + return NULL; } /* @@ -239,7 +256,7 @@ static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req) return mode == IN_GUEST_MODE; } -static void ack_flush(void *_completed) +static void ack_kick(void *_completed) { } @@ -248,7 +265,7 @@ static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait) if (cpumask_empty(cpus)) return false; - smp_call_function_many(cpus, ack_flush, NULL, wait); + smp_call_function_many(cpus, ack_kick, NULL, wait); return true; } @@ -379,14 +396,31 @@ static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc, return (void *)__get_free_page(gfp_flags); } -int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min) +int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min) { + gfp_t gfp = mc->gfp_custom ? mc->gfp_custom : GFP_KERNEL_ACCOUNT; void *obj; if (mc->nobjs >= min) return 0; - while (mc->nobjs < ARRAY_SIZE(mc->objects)) { - obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT); + + if (unlikely(!mc->objects)) { + if (WARN_ON_ONCE(!capacity)) + return -EIO; + + mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp); + if (!mc->objects) + return -ENOMEM; + + mc->capacity = capacity; + } + + /* It is illegal to request a different capacity across topups. */ + if (WARN_ON_ONCE(mc->capacity != capacity)) + return -EIO; + + while (mc->nobjs < mc->capacity) { + obj = mmu_memory_cache_alloc_obj(mc, gfp); if (!obj) return mc->nobjs >= min ? 0 : -ENOMEM; mc->objects[mc->nobjs++] = obj; @@ -394,6 +428,11 @@ int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min) return 0; } +int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min) +{ + return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min); +} + int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc) { return mc->nobjs; @@ -407,6 +446,11 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) else free_page((unsigned long)mc->objects[--mc->nobjs]); } + + kvfree(mc->objects); + + mc->objects = NULL; + mc->capacity = 0; } void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) @@ -724,6 +768,15 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, kvm->mn_active_invalidate_count++; spin_unlock(&kvm->mn_invalidate_lock); + /* + * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e. + * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring + * each cache's lock. There are relatively few caches in existence at + * any given time, and the caches themselves can check for hva overlap, + * i.e. don't need to rely on memslot overlap checks for performance. + * Because this runs without holding mmu_lock, the pfn caches must use + * mn_active_invalidate_count (see above) instead of mmu_notifier_count. + */ gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end, hva_range.may_block); @@ -2492,9 +2545,12 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) static int kvm_try_get_pfn(kvm_pfn_t pfn) { - if (kvm_is_reserved_pfn(pfn)) + struct page *page = kvm_pfn_to_refcounted_page(pfn); + + if (!page) return 1; - return get_page_unless_zero(pfn_to_page(pfn)); + + return get_page_unless_zero(page); } static int hva_to_pfn_remapped(struct vm_area_struct *vma, @@ -2580,7 +2636,7 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, bool write_fault, bool *writable) { struct vm_area_struct *vma; - kvm_pfn_t pfn = 0; + kvm_pfn_t pfn; int npages, r; /* we can do it either atomically or asynchronously, not both */ @@ -2711,34 +2767,32 @@ int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, } EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); -static struct page *kvm_pfn_to_page(kvm_pfn_t pfn) -{ - if (is_error_noslot_pfn(pfn)) - return KVM_ERR_PTR_BAD_PAGE; - - if (kvm_is_reserved_pfn(pfn)) { - WARN_ON(1); - return KVM_ERR_PTR_BAD_PAGE; - } - - return pfn_to_page(pfn); -} - +/* + * Do not use this helper unless you are absolutely certain the gfn _must_ be + * backed by 'struct page'. A valid example is if the backing memslot is + * controlled by KVM. Note, if the returned page is valid, it's refcount has + * been elevated by gfn_to_pfn(). + */ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) { + struct page *page; kvm_pfn_t pfn; pfn = gfn_to_pfn(kvm, gfn); - return kvm_pfn_to_page(pfn); + if (is_error_noslot_pfn(pfn)) + return KVM_ERR_PTR_BAD_PAGE; + + page = kvm_pfn_to_refcounted_page(pfn); + if (!page) + return KVM_ERR_PTR_BAD_PAGE; + + return page; } EXPORT_SYMBOL_GPL(gfn_to_page); void kvm_release_pfn(kvm_pfn_t pfn, bool dirty) { - if (pfn == 0) - return; - if (dirty) kvm_release_pfn_dirty(pfn); else @@ -2804,28 +2858,48 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) } EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); -struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn) +static bool kvm_is_ad_tracked_page(struct page *page) { - kvm_pfn_t pfn; + /* + * Per page-flags.h, pages tagged PG_reserved "should in general not be + * touched (e.g. set dirty) except by its owner". + */ + return !PageReserved(page); +} - pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn); +static void kvm_set_page_dirty(struct page *page) +{ + if (kvm_is_ad_tracked_page(page)) + SetPageDirty(page); +} - return kvm_pfn_to_page(pfn); +static void kvm_set_page_accessed(struct page *page) +{ + if (kvm_is_ad_tracked_page(page)) + mark_page_accessed(page); } -EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page); void kvm_release_page_clean(struct page *page) { WARN_ON(is_error_page(page)); - kvm_release_pfn_clean(page_to_pfn(page)); + kvm_set_page_accessed(page); + put_page(page); } EXPORT_SYMBOL_GPL(kvm_release_page_clean); void kvm_release_pfn_clean(kvm_pfn_t pfn) { - if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn)) - put_page(pfn_to_page(pfn)); + struct page *page; + + if (is_error_noslot_pfn(pfn)) + return; + + page = kvm_pfn_to_refcounted_page(pfn); + if (!page) + return; + + kvm_release_page_clean(page); } EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); @@ -2833,28 +2907,48 @@ void kvm_release_page_dirty(struct page *page) { WARN_ON(is_error_page(page)); - kvm_release_pfn_dirty(page_to_pfn(page)); + kvm_set_page_dirty(page); + kvm_release_page_clean(page); } EXPORT_SYMBOL_GPL(kvm_release_page_dirty); void kvm_release_pfn_dirty(kvm_pfn_t pfn) { - kvm_set_pfn_dirty(pfn); - kvm_release_pfn_clean(pfn); + struct page *page; + + if (is_error_noslot_pfn(pfn)) + return; + + page = kvm_pfn_to_refcounted_page(pfn); + if (!page) + return; + + kvm_release_page_dirty(page); } EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); +/* + * Note, checking for an error/noslot pfn is the caller's responsibility when + * directly marking a page dirty/accessed. Unlike the "release" helpers, the + * "set" helpers are not to be used when the pfn might point at garbage. + */ void kvm_set_pfn_dirty(kvm_pfn_t pfn) { - if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) - SetPageDirty(pfn_to_page(pfn)); + if (WARN_ON(is_error_noslot_pfn(pfn))) + return; + + if (pfn_valid(pfn)) + kvm_set_page_dirty(pfn_to_page(pfn)); } EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); void kvm_set_pfn_accessed(kvm_pfn_t pfn) { - if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) - mark_page_accessed(pfn_to_page(pfn)); + if (WARN_ON(is_error_noslot_pfn(pfn))) + return; + + if (pfn_valid(pfn)) + kvm_set_page_accessed(pfn_to_page(pfn)); } EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); @@ -3728,9 +3822,18 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu) return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); } +#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS +static int vcpu_get_pid(void *data, u64 *val) +{ + struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data; + *val = pid_nr(rcu_access_pointer(vcpu->pid)); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(vcpu_get_pid_fops, vcpu_get_pid, NULL, "%llu\n"); + static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) { -#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS struct dentry *debugfs_dentry; char dir_name[ITOA_MAX_LEN * 2]; @@ -3740,10 +3843,12 @@ static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id); debugfs_dentry = debugfs_create_dir(dir_name, vcpu->kvm->debugfs_dentry); + debugfs_create_file("pid", 0444, debugfs_dentry, vcpu, + &vcpu_get_pid_fops); kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry); -#endif } +#endif /* * Creates some virtual cpus. Good luck creating more than one. @@ -3763,13 +3868,15 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) return -EINVAL; } + r = kvm_arch_vcpu_precreate(kvm, id); + if (r) { + mutex_unlock(&kvm->lock); + return r; + } + kvm->created_vcpus++; mutex_unlock(&kvm->lock); - r = kvm_arch_vcpu_precreate(kvm, id); - if (r) - goto vcpu_decrement; - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); if (!vcpu) { r = -ENOMEM; |