summaryrefslogtreecommitdiff
path: root/virt/kvm/kvm_main.c
diff options
context:
space:
mode:
Diffstat (limited to 'virt/kvm/kvm_main.c')
-rw-r--r--virt/kvm/kvm_main.c217
1 files changed, 162 insertions, 55 deletions
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a49df8988cd6..32896c845ffe 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -168,7 +168,7 @@ __weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
{
}
-bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
+bool kvm_is_zone_device_page(struct page *page)
{
/*
* The metadata used by is_zone_device_page() to determine whether or
@@ -176,25 +176,42 @@ bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
* the device has been pinned, e.g. by get_user_pages(). WARN if the
* page_count() is zero to help detect bad usage of this helper.
*/
- if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
+ if (WARN_ON_ONCE(!page_count(page)))
return false;
- return is_zone_device_page(pfn_to_page(pfn));
+ return is_zone_device_page(page);
}
-bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
+/*
+ * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted
+ * page, NULL otherwise. Note, the list of refcounted PG_reserved page types
+ * is likely incomplete, it has been compiled purely through people wanting to
+ * back guest with a certain type of memory and encountering issues.
+ */
+struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn)
{
+ struct page *page;
+
+ if (!pfn_valid(pfn))
+ return NULL;
+
+ page = pfn_to_page(pfn);
+ if (!PageReserved(page))
+ return page;
+
+ /* The ZERO_PAGE(s) is marked PG_reserved, but is refcounted. */
+ if (is_zero_pfn(pfn))
+ return page;
+
/*
* ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
* perspective they are "normal" pages, albeit with slightly different
* usage rules.
*/
- if (pfn_valid(pfn))
- return PageReserved(pfn_to_page(pfn)) &&
- !is_zero_pfn(pfn) &&
- !kvm_is_zone_device_pfn(pfn);
+ if (kvm_is_zone_device_page(page))
+ return page;
- return true;
+ return NULL;
}
/*
@@ -239,7 +256,7 @@ static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
return mode == IN_GUEST_MODE;
}
-static void ack_flush(void *_completed)
+static void ack_kick(void *_completed)
{
}
@@ -248,7 +265,7 @@ static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
if (cpumask_empty(cpus))
return false;
- smp_call_function_many(cpus, ack_flush, NULL, wait);
+ smp_call_function_many(cpus, ack_kick, NULL, wait);
return true;
}
@@ -379,14 +396,31 @@ static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
return (void *)__get_free_page(gfp_flags);
}
-int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
+int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
{
+ gfp_t gfp = mc->gfp_custom ? mc->gfp_custom : GFP_KERNEL_ACCOUNT;
void *obj;
if (mc->nobjs >= min)
return 0;
- while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
- obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
+
+ if (unlikely(!mc->objects)) {
+ if (WARN_ON_ONCE(!capacity))
+ return -EIO;
+
+ mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp);
+ if (!mc->objects)
+ return -ENOMEM;
+
+ mc->capacity = capacity;
+ }
+
+ /* It is illegal to request a different capacity across topups. */
+ if (WARN_ON_ONCE(mc->capacity != capacity))
+ return -EIO;
+
+ while (mc->nobjs < mc->capacity) {
+ obj = mmu_memory_cache_alloc_obj(mc, gfp);
if (!obj)
return mc->nobjs >= min ? 0 : -ENOMEM;
mc->objects[mc->nobjs++] = obj;
@@ -394,6 +428,11 @@ int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
return 0;
}
+int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
+{
+ return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
+}
+
int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
{
return mc->nobjs;
@@ -407,6 +446,11 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
else
free_page((unsigned long)mc->objects[--mc->nobjs]);
}
+
+ kvfree(mc->objects);
+
+ mc->objects = NULL;
+ mc->capacity = 0;
}
void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
@@ -724,6 +768,15 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
kvm->mn_active_invalidate_count++;
spin_unlock(&kvm->mn_invalidate_lock);
+ /*
+ * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
+ * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
+ * each cache's lock. There are relatively few caches in existence at
+ * any given time, and the caches themselves can check for hva overlap,
+ * i.e. don't need to rely on memslot overlap checks for performance.
+ * Because this runs without holding mmu_lock, the pfn caches must use
+ * mn_active_invalidate_count (see above) instead of mmu_notifier_count.
+ */
gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
hva_range.may_block);
@@ -2492,9 +2545,12 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
static int kvm_try_get_pfn(kvm_pfn_t pfn)
{
- if (kvm_is_reserved_pfn(pfn))
+ struct page *page = kvm_pfn_to_refcounted_page(pfn);
+
+ if (!page)
return 1;
- return get_page_unless_zero(pfn_to_page(pfn));
+
+ return get_page_unless_zero(page);
}
static int hva_to_pfn_remapped(struct vm_area_struct *vma,
@@ -2580,7 +2636,7 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
bool write_fault, bool *writable)
{
struct vm_area_struct *vma;
- kvm_pfn_t pfn = 0;
+ kvm_pfn_t pfn;
int npages, r;
/* we can do it either atomically or asynchronously, not both */
@@ -2711,34 +2767,32 @@ int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
}
EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
-static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
-{
- if (is_error_noslot_pfn(pfn))
- return KVM_ERR_PTR_BAD_PAGE;
-
- if (kvm_is_reserved_pfn(pfn)) {
- WARN_ON(1);
- return KVM_ERR_PTR_BAD_PAGE;
- }
-
- return pfn_to_page(pfn);
-}
-
+/*
+ * Do not use this helper unless you are absolutely certain the gfn _must_ be
+ * backed by 'struct page'. A valid example is if the backing memslot is
+ * controlled by KVM. Note, if the returned page is valid, it's refcount has
+ * been elevated by gfn_to_pfn().
+ */
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
+ struct page *page;
kvm_pfn_t pfn;
pfn = gfn_to_pfn(kvm, gfn);
- return kvm_pfn_to_page(pfn);
+ if (is_error_noslot_pfn(pfn))
+ return KVM_ERR_PTR_BAD_PAGE;
+
+ page = kvm_pfn_to_refcounted_page(pfn);
+ if (!page)
+ return KVM_ERR_PTR_BAD_PAGE;
+
+ return page;
}
EXPORT_SYMBOL_GPL(gfn_to_page);
void kvm_release_pfn(kvm_pfn_t pfn, bool dirty)
{
- if (pfn == 0)
- return;
-
if (dirty)
kvm_release_pfn_dirty(pfn);
else
@@ -2804,28 +2858,48 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
}
EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
-struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
+static bool kvm_is_ad_tracked_page(struct page *page)
{
- kvm_pfn_t pfn;
+ /*
+ * Per page-flags.h, pages tagged PG_reserved "should in general not be
+ * touched (e.g. set dirty) except by its owner".
+ */
+ return !PageReserved(page);
+}
- pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);
+static void kvm_set_page_dirty(struct page *page)
+{
+ if (kvm_is_ad_tracked_page(page))
+ SetPageDirty(page);
+}
- return kvm_pfn_to_page(pfn);
+static void kvm_set_page_accessed(struct page *page)
+{
+ if (kvm_is_ad_tracked_page(page))
+ mark_page_accessed(page);
}
-EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page);
void kvm_release_page_clean(struct page *page)
{
WARN_ON(is_error_page(page));
- kvm_release_pfn_clean(page_to_pfn(page));
+ kvm_set_page_accessed(page);
+ put_page(page);
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);
void kvm_release_pfn_clean(kvm_pfn_t pfn)
{
- if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
- put_page(pfn_to_page(pfn));
+ struct page *page;
+
+ if (is_error_noslot_pfn(pfn))
+ return;
+
+ page = kvm_pfn_to_refcounted_page(pfn);
+ if (!page)
+ return;
+
+ kvm_release_page_clean(page);
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
@@ -2833,28 +2907,48 @@ void kvm_release_page_dirty(struct page *page)
{
WARN_ON(is_error_page(page));
- kvm_release_pfn_dirty(page_to_pfn(page));
+ kvm_set_page_dirty(page);
+ kvm_release_page_clean(page);
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
void kvm_release_pfn_dirty(kvm_pfn_t pfn)
{
- kvm_set_pfn_dirty(pfn);
- kvm_release_pfn_clean(pfn);
+ struct page *page;
+
+ if (is_error_noslot_pfn(pfn))
+ return;
+
+ page = kvm_pfn_to_refcounted_page(pfn);
+ if (!page)
+ return;
+
+ kvm_release_page_dirty(page);
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
+/*
+ * Note, checking for an error/noslot pfn is the caller's responsibility when
+ * directly marking a page dirty/accessed. Unlike the "release" helpers, the
+ * "set" helpers are not to be used when the pfn might point at garbage.
+ */
void kvm_set_pfn_dirty(kvm_pfn_t pfn)
{
- if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
- SetPageDirty(pfn_to_page(pfn));
+ if (WARN_ON(is_error_noslot_pfn(pfn)))
+ return;
+
+ if (pfn_valid(pfn))
+ kvm_set_page_dirty(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
void kvm_set_pfn_accessed(kvm_pfn_t pfn)
{
- if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
- mark_page_accessed(pfn_to_page(pfn));
+ if (WARN_ON(is_error_noslot_pfn(pfn)))
+ return;
+
+ if (pfn_valid(pfn))
+ kvm_set_page_accessed(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
@@ -3728,9 +3822,18 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu)
return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
}
+#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
+static int vcpu_get_pid(void *data, u64 *val)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+ *val = pid_nr(rcu_access_pointer(vcpu->pid));
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_get_pid_fops, vcpu_get_pid, NULL, "%llu\n");
+
static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
{
-#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
struct dentry *debugfs_dentry;
char dir_name[ITOA_MAX_LEN * 2];
@@ -3740,10 +3843,12 @@ static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
debugfs_dentry = debugfs_create_dir(dir_name,
vcpu->kvm->debugfs_dentry);
+ debugfs_create_file("pid", 0444, debugfs_dentry, vcpu,
+ &vcpu_get_pid_fops);
kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
-#endif
}
+#endif
/*
* Creates some virtual cpus. Good luck creating more than one.
@@ -3763,13 +3868,15 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
return -EINVAL;
}
+ r = kvm_arch_vcpu_precreate(kvm, id);
+ if (r) {
+ mutex_unlock(&kvm->lock);
+ return r;
+ }
+
kvm->created_vcpus++;
mutex_unlock(&kvm->lock);
- r = kvm_arch_vcpu_precreate(kvm, id);
- if (r)
- goto vcpu_decrement;
-
vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
if (!vcpu) {
r = -ENOMEM;