From a86cb413f4bf273a9d341a3ab2c2ca44e12eb317 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 23 May 2019 18:43:08 +0200 Subject: KVM: s390: Do not report unusabled IDs via KVM_CAP_MAX_VCPU_ID KVM_CAP_MAX_VCPU_ID is currently always reporting KVM_MAX_VCPU_ID on all architectures. However, on s390x, the amount of usable CPUs is determined during runtime - it is depending on the features of the machine the code is running on. Since we are using the vcpu_id as an index into the SCA structures that are defined by the hardware (see e.g. the sca_add_vcpu() function), it is not only the amount of CPUs that is limited by the hard- ware, but also the range of IDs that we can use. Thus KVM_CAP_MAX_VCPU_ID must be determined during runtime on s390x, too. So the handling of KVM_CAP_MAX_VCPU_ID has to be moved from the common code into the architecture specific code, and on s390x we have to return the same value here as for KVM_CAP_MAX_VCPUS. This problem has been discovered with the kvm_create_max_vcpus selftest. With this change applied, the selftest now passes on s390x, too. Reviewed-by: Andrew Jones Reviewed-by: Cornelia Huck Reviewed-by: David Hildenbrand Signed-off-by: Thomas Huth Message-Id: <20190523164309.13345-9-thuth@redhat.com> Cc: stable@vger.kernel.org Signed-off-by: Christian Borntraeger --- arch/mips/kvm/mips.c | 3 +++ arch/powerpc/kvm/powerpc.c | 3 +++ arch/s390/kvm/kvm-s390.c | 1 + arch/x86/kvm/x86.c | 3 +++ 4 files changed, 10 insertions(+) (limited to 'arch') diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 6d0517ac18e5..0369f26ab96d 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1122,6 +1122,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; + case KVM_CAP_MAX_VCPU_ID: + r = KVM_MAX_VCPU_ID; + break; case KVM_CAP_MIPS_FPU: /* We don't handle systems with inconsistent cpu_has_fpu */ r = !!raw_cpu_has_fpu; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 3393b166817a..aa3a678711be 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -657,6 +657,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; + case KVM_CAP_MAX_VCPU_ID: + r = KVM_MAX_VCPU_ID; + break; #ifdef CONFIG_PPC_BOOK3S_64 case KVM_CAP_PPC_GET_SMMU_INFO: r = 1; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index e5e8eb29e68e..28ebd647784c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -539,6 +539,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_NR_VCPUS: case KVM_CAP_MAX_VCPUS: + case KVM_CAP_MAX_VCPU_ID: r = KVM_S390_BSCA_CPU_SLOTS; if (!kvm_s390_use_sca_entries()) r = KVM_MAX_VCPUS; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index acb179f78fdc..83aefd759846 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3122,6 +3122,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; + case KVM_CAP_MAX_VCPU_ID: + r = KVM_MAX_VCPU_ID; + break; case KVM_CAP_PV_MMU: /* obsolete */ r = 0; break; -- cgit v1.2.3 From c395fe1d8e49a5aa03504fcacfb7c95b5a4c6e04 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 23 May 2019 16:35:07 +1000 Subject: KVM: PPC: Book3S HV: Avoid touching arch.mmu_ready in XIVE release functions Currently, kvmppc_xive_release() and kvmppc_xive_native_release() clear kvm->arch.mmu_ready and call kick_all_cpus_sync() as a way of ensuring that no vcpus are executing in the guest. However, future patches will change the mutex associated with kvm->arch.mmu_ready to a new mutex that nests inside the vcpu mutexes, making it difficult to continue to use this method. In fact, taking the vcpu mutex for a vcpu excludes execution of that vcpu, and we already take the vcpu mutex around the call to kvmppc_xive_[native_]cleanup_vcpu(). Once the cleanup function is done and we release the vcpu mutex, the vcpu can execute once again, but because we have cleared vcpu->arch.xive_vcpu, vcpu->arch.irq_type, vcpu->arch.xive_esc_vaddr and vcpu->arch.xive_esc_raddr, that vcpu will not be going into XIVE code any more. Thus, once we have cleaned up all of the vcpus, we are safe to clean up the rest of the XIVE state, and we don't need to use kvm->arch.mmu_ready to hold off vcpu execution. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_xive.c | 28 ++++++++++++---------------- arch/powerpc/kvm/book3s_xive_native.c | 28 ++++++++++++---------------- 2 files changed, 24 insertions(+), 32 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 4953957333b7..f623451ec0a3 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -1859,21 +1859,10 @@ static void kvmppc_xive_release(struct kvm_device *dev) struct kvm *kvm = xive->kvm; struct kvm_vcpu *vcpu; int i; - int was_ready; pr_devel("Releasing xive device\n"); - debugfs_remove(xive->dentry); - /* - * Clearing mmu_ready temporarily while holding kvm->lock - * is a way of ensuring that no vcpus can enter the guest - * until we drop kvm->lock. Doing kick_all_cpus_sync() - * ensures that any vcpu executing inside the guest has - * exited the guest. Once kick_all_cpus_sync() has finished, - * we know that no vcpu can be executing the XIVE push or - * pull code, or executing a XICS hcall. - * * Since this is the device release function, we know that * userspace does not have any open fd referring to the * device. Therefore there can not be any of the device @@ -1881,9 +1870,8 @@ static void kvmppc_xive_release(struct kvm_device *dev) * and similarly, the connect_vcpu and set/clr_mapped * functions also cannot be being executed. */ - was_ready = kvm->arch.mmu_ready; - kvm->arch.mmu_ready = 0; - kick_all_cpus_sync(); + + debugfs_remove(xive->dentry); /* * We should clean up the vCPU interrupt presenters first. @@ -1892,12 +1880,22 @@ static void kvmppc_xive_release(struct kvm_device *dev) /* * Take vcpu->mutex to ensure that no one_reg get/set ioctl * (i.e. kvmppc_xive_[gs]et_icp) can be done concurrently. + * Holding the vcpu->mutex also means that the vcpu cannot + * be executing the KVM_RUN ioctl, and therefore it cannot + * be executing the XIVE push or pull code or accessing + * the XIVE MMIO regions. */ mutex_lock(&vcpu->mutex); kvmppc_xive_cleanup_vcpu(vcpu); mutex_unlock(&vcpu->mutex); } + /* + * Now that we have cleared vcpu->arch.xive_vcpu, vcpu->arch.irq_type + * and vcpu->arch.xive_esc_[vr]addr on each vcpu, we are safe + * against xive code getting called during vcpu execution or + * set/get one_reg operations. + */ kvm->arch.xive = NULL; /* Mask and free interrupts */ @@ -1911,8 +1909,6 @@ static void kvmppc_xive_release(struct kvm_device *dev) if (xive->vp_base != XIVE_INVALID_VP) xive_native_free_vp_block(xive->vp_base); - kvm->arch.mmu_ready = was_ready; - /* * A reference of the kvmppc_xive pointer is now kept under * the xive_devices struct of the machine for reuse. It is diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 6a8e698c4b6e..da31dd05fd72 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -973,21 +973,10 @@ static void kvmppc_xive_native_release(struct kvm_device *dev) struct kvm *kvm = xive->kvm; struct kvm_vcpu *vcpu; int i; - int was_ready; - - debugfs_remove(xive->dentry); pr_devel("Releasing xive native device\n"); /* - * Clearing mmu_ready temporarily while holding kvm->lock - * is a way of ensuring that no vcpus can enter the guest - * until we drop kvm->lock. Doing kick_all_cpus_sync() - * ensures that any vcpu executing inside the guest has - * exited the guest. Once kick_all_cpus_sync() has finished, - * we know that no vcpu can be executing the XIVE push or - * pull code or accessing the XIVE MMIO regions. - * * Since this is the device release function, we know that * userspace does not have any open fd or mmap referring to * the device. Therefore there can not be any of the @@ -996,9 +985,8 @@ static void kvmppc_xive_native_release(struct kvm_device *dev) * connect_vcpu and set/clr_mapped functions also cannot * be being executed. */ - was_ready = kvm->arch.mmu_ready; - kvm->arch.mmu_ready = 0; - kick_all_cpus_sync(); + + debugfs_remove(xive->dentry); /* * We should clean up the vCPU interrupt presenters first. @@ -1007,12 +995,22 @@ static void kvmppc_xive_native_release(struct kvm_device *dev) /* * Take vcpu->mutex to ensure that no one_reg get/set ioctl * (i.e. kvmppc_xive_native_[gs]et_vp) can be being done. + * Holding the vcpu->mutex also means that the vcpu cannot + * be executing the KVM_RUN ioctl, and therefore it cannot + * be executing the XIVE push or pull code or accessing + * the XIVE MMIO regions. */ mutex_lock(&vcpu->mutex); kvmppc_xive_native_cleanup_vcpu(vcpu); mutex_unlock(&vcpu->mutex); } + /* + * Now that we have cleared vcpu->arch.xive_vcpu, vcpu->arch.irq_type + * and vcpu->arch.xive_esc_[vr]addr on each vcpu, we are safe + * against xive code getting called during vcpu execution or + * set/get one_reg operations. + */ kvm->arch.xive = NULL; for (i = 0; i <= xive->max_sbid; i++) { @@ -1025,8 +1023,6 @@ static void kvmppc_xive_native_release(struct kvm_device *dev) if (xive->vp_base != XIVE_INVALID_VP) xive_native_free_vp_block(xive->vp_base); - kvm->arch.mmu_ready = was_ready; - /* * A reference of the kvmppc_xive pointer is now kept under * the xive_devices struct of the machine for reuse. It is -- cgit v1.2.3 From 0d4ee88d92884c661fcafd5576da243aa943dc24 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 23 May 2019 16:35:34 +1000 Subject: KVM: PPC: Book3S HV: Use new mutex to synchronize MMU setup Currently the HV KVM code uses kvm->lock in conjunction with a flag, kvm->arch.mmu_ready, to synchronize MMU setup and hold off vcpu execution until the MMU-related data structures are ready. However, this means that kvm->lock is being taken inside vcpu->mutex, which is contrary to Documentation/virtual/kvm/locking.txt and results in lockdep warnings. To fix this, we add a new mutex, kvm->arch.mmu_setup_lock, which nests inside the vcpu mutexes, and is taken in the places where kvm->lock was taken that are related to MMU setup. Additionally we take the new mutex in the vcpu creation code at the point where we are creating a new vcore, in order to provide mutual exclusion with kvmppc_update_lpcr() and ensure that an update to kvm->arch.lpcr doesn't get missed, which could otherwise lead to a stale vcore->lpcr value. Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kvm/book3s_64_mmu_hv.c | 36 ++++++++++++++++++------------------ arch/powerpc/kvm/book3s_hv.c | 31 +++++++++++++++++++++++-------- 3 files changed, 42 insertions(+), 26 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 013c76a0a03e..26b3ce487ddc 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -325,6 +325,7 @@ struct kvm_arch { #endif struct kvmppc_ops *kvm_ops; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + struct mutex mmu_setup_lock; /* nests inside vcpu mutexes */ u64 l1_ptcr; int max_nested_lpid; struct kvm_nested_guest *nested_guests[KVM_MAX_NESTED_GUESTS]; diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index ab3d484c5e2e..51971311e6c9 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -63,7 +63,7 @@ struct kvm_resize_hpt { struct work_struct work; u32 order; - /* These fields protected by kvm->lock */ + /* These fields protected by kvm->arch.mmu_setup_lock */ /* Possible values and their usage: * <0 an error occurred during allocation, @@ -73,7 +73,7 @@ struct kvm_resize_hpt { int error; /* Private to the work thread, until error != -EBUSY, - * then protected by kvm->lock. + * then protected by kvm->arch.mmu_setup_lock. */ struct kvm_hpt_info hpt; }; @@ -139,7 +139,7 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order) long err = -EBUSY; struct kvm_hpt_info info; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.mmu_setup_lock); if (kvm->arch.mmu_ready) { kvm->arch.mmu_ready = 0; /* order mmu_ready vs. vcpus_running */ @@ -183,7 +183,7 @@ out: /* Ensure that each vcpu will flush its TLB on next entry. */ cpumask_setall(&kvm->arch.need_tlb_flush); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.mmu_setup_lock); return err; } @@ -1447,7 +1447,7 @@ static void resize_hpt_pivot(struct kvm_resize_hpt *resize) static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize) { - if (WARN_ON(!mutex_is_locked(&kvm->lock))) + if (WARN_ON(!mutex_is_locked(&kvm->arch.mmu_setup_lock))) return; if (!resize) @@ -1474,14 +1474,14 @@ static void resize_hpt_prepare_work(struct work_struct *work) if (WARN_ON(resize->error != -EBUSY)) return; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.mmu_setup_lock); /* Request is still current? */ if (kvm->arch.resize_hpt == resize) { /* We may request large allocations here: - * do not sleep with kvm->lock held for a while. + * do not sleep with kvm->arch.mmu_setup_lock held for a while. */ - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.mmu_setup_lock); resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n", resize->order); @@ -1494,9 +1494,9 @@ static void resize_hpt_prepare_work(struct work_struct *work) if (WARN_ON(err == -EBUSY)) err = -EINPROGRESS; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.mmu_setup_lock); /* It is possible that kvm->arch.resize_hpt != resize - * after we grab kvm->lock again. + * after we grab kvm->arch.mmu_setup_lock again. */ } @@ -1505,7 +1505,7 @@ static void resize_hpt_prepare_work(struct work_struct *work) if (kvm->arch.resize_hpt != resize) resize_hpt_release(kvm, resize); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.mmu_setup_lock); } long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, @@ -1522,7 +1522,7 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, if (shift && ((shift < 18) || (shift > 46))) return -EINVAL; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.mmu_setup_lock); resize = kvm->arch.resize_hpt; @@ -1565,7 +1565,7 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, ret = 100; /* estimated time in ms */ out: - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.mmu_setup_lock); return ret; } @@ -1588,7 +1588,7 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, if (shift && ((shift < 18) || (shift > 46))) return -EINVAL; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.mmu_setup_lock); resize = kvm->arch.resize_hpt; @@ -1625,7 +1625,7 @@ out: smp_mb(); out_no_hpt: resize_hpt_release(kvm, resize); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.mmu_setup_lock); return ret; } @@ -1868,7 +1868,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, return -EINVAL; /* lock out vcpus from running while we're doing this */ - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.mmu_setup_lock); mmu_ready = kvm->arch.mmu_ready; if (mmu_ready) { kvm->arch.mmu_ready = 0; /* temporarily */ @@ -1876,7 +1876,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, smp_mb(); if (atomic_read(&kvm->arch.vcpus_running)) { kvm->arch.mmu_ready = 1; - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.mmu_setup_lock); return -EBUSY; } } @@ -1963,7 +1963,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, /* Order HPTE updates vs. mmu_ready */ smp_wmb(); kvm->arch.mmu_ready = mmu_ready; - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.mmu_setup_lock); if (err) return err; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index d5fc624e0655..b1c0a9bc3555 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2338,11 +2338,17 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, pr_devel("KVM: collision on id %u", id); vcore = NULL; } else if (!vcore) { + /* + * Take mmu_setup_lock for mutual exclusion + * with kvmppc_update_lpcr(). + */ err = -ENOMEM; vcore = kvmppc_vcore_create(kvm, id & ~(kvm->arch.smt_mode - 1)); + mutex_lock(&kvm->arch.mmu_setup_lock); kvm->arch.vcores[core] = vcore; kvm->arch.online_vcores++; + mutex_unlock(&kvm->arch.mmu_setup_lock); } } mutex_unlock(&kvm->lock); @@ -3859,7 +3865,7 @@ static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu) int r = 0; struct kvm *kvm = vcpu->kvm; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.mmu_setup_lock); if (!kvm->arch.mmu_ready) { if (!kvm_is_radix(kvm)) r = kvmppc_hv_setup_htab_rma(vcpu); @@ -3869,7 +3875,7 @@ static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu) kvm->arch.mmu_ready = 1; } } - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.mmu_setup_lock); return r; } @@ -4478,7 +4484,8 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, /* * Update LPCR values in kvm->arch and in vcores. - * Caller must hold kvm->lock. + * Caller must hold kvm->arch.mmu_setup_lock (for mutual exclusion + * of kvm->arch.lpcr update). */ void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask) { @@ -4530,7 +4537,7 @@ void kvmppc_setup_partition_table(struct kvm *kvm) /* * Set up HPT (hashed page table) and RMA (real-mode area). - * Must be called with kvm->lock held. + * Must be called with kvm->arch.mmu_setup_lock held. */ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) { @@ -4618,7 +4625,10 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) goto out_srcu; } -/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */ +/* + * Must be called with kvm->arch.mmu_setup_lock held and + * mmu_ready = 0 and no vcpus running. + */ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm) { if (nesting_enabled(kvm)) @@ -4635,7 +4645,10 @@ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm) return 0; } -/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */ +/* + * Must be called with kvm->arch.mmu_setup_lock held and + * mmu_ready = 0 and no vcpus running. + */ int kvmppc_switch_mmu_to_radix(struct kvm *kvm) { int err; @@ -4740,6 +4753,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) char buf[32]; int ret; + mutex_init(&kvm->arch.mmu_setup_lock); + /* Allocate the guest's logical partition ID */ lpid = kvmppc_alloc_lpid(); @@ -5265,7 +5280,7 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) if (kvmhv_on_pseries() && !radix) return -EINVAL; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.mmu_setup_lock); if (radix != kvm_is_radix(kvm)) { if (kvm->arch.mmu_ready) { kvm->arch.mmu_ready = 0; @@ -5293,7 +5308,7 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) err = 0; out_unlock: - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.mmu_setup_lock); return err; } -- cgit v1.2.3 From 1659e27d2bc1ef47b6d031abe01b467f18cb72d9 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 29 May 2019 11:54:00 +1000 Subject: KVM: PPC: Book3S: Use new mutex to synchronize access to rtas token list Currently the Book 3S KVM code uses kvm->lock to synchronize access to the kvm->arch.rtas_tokens list. Because this list is scanned inside kvmppc_rtas_hcall(), which is called with the vcpu mutex held, taking kvm->lock cause a lock inversion problem, which could lead to a deadlock. To fix this, we add a new mutex, kvm->arch.rtas_token_lock, which nests inside the vcpu mutexes, and use that instead of kvm->lock when accessing the rtas token list. This removes the lockdep_assert_held() in kvmppc_rtas_tokens_free(). At this point we don't hold the new mutex, but that is OK because kvmppc_rtas_tokens_free() is only called when the whole VM is being destroyed, and at that point nothing can be looking up a token in the list. Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kvm/book3s.c | 1 + arch/powerpc/kvm/book3s_rtas.c | 14 ++++++-------- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 26b3ce487ddc..d10df677c452 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -309,6 +309,7 @@ struct kvm_arch { #ifdef CONFIG_PPC_BOOK3S_64 struct list_head spapr_tce_tables; struct list_head rtas_tokens; + struct mutex rtas_token_lock; DECLARE_BITMAP(enabled_hcalls, MAX_HCALL_OPCODE/4 + 1); #endif #ifdef CONFIG_KVM_MPIC diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 61a212d0daf0..ac5664845aca 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -902,6 +902,7 @@ int kvmppc_core_init_vm(struct kvm *kvm) #ifdef CONFIG_PPC64 INIT_LIST_HEAD_RCU(&kvm->arch.spapr_tce_tables); INIT_LIST_HEAD(&kvm->arch.rtas_tokens); + mutex_init(&kvm->arch.rtas_token_lock); #endif return kvm->arch.kvm_ops->init_vm(kvm); diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c index 4e178c4c1ea5..b7ae3dfbf00e 100644 --- a/arch/powerpc/kvm/book3s_rtas.c +++ b/arch/powerpc/kvm/book3s_rtas.c @@ -146,7 +146,7 @@ static int rtas_token_undefine(struct kvm *kvm, char *name) { struct rtas_token_definition *d, *tmp; - lockdep_assert_held(&kvm->lock); + lockdep_assert_held(&kvm->arch.rtas_token_lock); list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) { if (rtas_name_matches(d->handler->name, name)) { @@ -167,7 +167,7 @@ static int rtas_token_define(struct kvm *kvm, char *name, u64 token) bool found; int i; - lockdep_assert_held(&kvm->lock); + lockdep_assert_held(&kvm->arch.rtas_token_lock); list_for_each_entry(d, &kvm->arch.rtas_tokens, list) { if (d->token == token) @@ -206,14 +206,14 @@ int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp) if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.rtas_token_lock); if (args.token) rc = rtas_token_define(kvm, args.name, args.token); else rc = rtas_token_undefine(kvm, args.name); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.rtas_token_lock); return rc; } @@ -245,7 +245,7 @@ int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu) orig_rets = args.rets; args.rets = &args.args[be32_to_cpu(args.nargs)]; - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->arch.rtas_token_lock); rc = -ENOENT; list_for_each_entry(d, &vcpu->kvm->arch.rtas_tokens, list) { @@ -256,7 +256,7 @@ int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu) } } - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->arch.rtas_token_lock); if (rc == 0) { args.rets = orig_rets; @@ -282,8 +282,6 @@ void kvmppc_rtas_tokens_free(struct kvm *kvm) { struct rtas_token_definition *d, *tmp; - lockdep_assert_held(&kvm->lock); - list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) { list_del(&d->list); kfree(d); -- cgit v1.2.3 From 5a3f49364c3ffa1107bd88f8292406e98c5d206c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 23 May 2019 16:36:32 +1000 Subject: KVM: PPC: Book3S HV: Don't take kvm->lock around kvm_for_each_vcpu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the HV KVM code takes the kvm->lock around calls to kvm_for_each_vcpu() and kvm_get_vcpu_by_id() (which can call kvm_for_each_vcpu() internally). However, that leads to a lock order inversion problem, because these are called in contexts where the vcpu mutex is held, but the vcpu mutexes nest within kvm->lock according to Documentation/virtual/kvm/locking.txt. Hence there is a possibility of deadlock. To fix this, we simply don't take the kvm->lock mutex around these calls. This is safe because the implementations of kvm_for_each_vcpu() and kvm_get_vcpu_by_id() have been designed to be able to be called locklessly. Signed-off-by: Paul Mackerras Reviewed-by: Cédric Le Goater Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index b1c0a9bc3555..27054d301852 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -446,12 +446,7 @@ static void kvmppc_dump_regs(struct kvm_vcpu *vcpu) static struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id) { - struct kvm_vcpu *ret; - - mutex_lock(&kvm->lock); - ret = kvm_get_vcpu_by_id(kvm, id); - mutex_unlock(&kvm->lock); - return ret; + return kvm_get_vcpu_by_id(kvm, id); } static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa) @@ -1583,7 +1578,6 @@ static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr, struct kvmppc_vcore *vc = vcpu->arch.vcore; u64 mask; - mutex_lock(&kvm->lock); spin_lock(&vc->lock); /* * If ILE (interrupt little-endian) has changed, update the @@ -1623,7 +1617,6 @@ static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr, mask &= 0xFFFFFFFF; vc->lpcr = (vc->lpcr & ~mask) | (new_lpcr & mask); spin_unlock(&vc->lock); - mutex_unlock(&kvm->lock); } static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, -- cgit v1.2.3 From d47aacdb8e0bc03dcaa1a5630a3c633cdcd4cfa7 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Mon, 20 May 2019 09:15:12 +0200 Subject: KVM: PPC: Book3S HV: XIVE: Clear file mapping when device is released MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Improve the release of the XIVE KVM device by clearing the file address_space, which is used to unmap the interrupt ESB pages when a device is passed-through. Suggested-by: Paul Mackerras Signed-off-by: Cédric Le Goater Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_xive_native.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index da31dd05fd72..9cedd04e7a44 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -976,6 +976,14 @@ static void kvmppc_xive_native_release(struct kvm_device *dev) pr_devel("Releasing xive native device\n"); + /* + * Clear the KVM device file address_space which is used to + * unmap the ESB pages when a device is passed-through. + */ + mutex_lock(&xive->mapping_lock); + xive->mapping = NULL; + mutex_unlock(&xive->mapping_lock); + /* * Since this is the device release function, we know that * userspace does not have any open fd or mmap referring to -- cgit v1.2.3 From c468bc4e8468cb4b85ad61294ddd88efb2e47d8d Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Mon, 20 May 2019 09:15:13 +0200 Subject: KVM: PPC: Book3S HV: XIVE: Do not test the EQ flag validity when resetting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a CPU is hot-unplugged, the EQ is deconfigured using a zero size and a zero address. In this case, there is no need to check the flag and queue size validity. Move the checks after the queue reset code section to fix CPU hot-unplug. Reported-by: Satheesh Rajendran Tested-by: Satheesh Rajendran Signed-off-by: Cédric Le Goater Reviewed-by: Greg Kurz Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_xive_native.c | 36 +++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 9cedd04e7a44..53b7159dc305 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -565,24 +565,6 @@ static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive, __func__, server, priority, kvm_eq.flags, kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex); - /* - * sPAPR specifies a "Unconditional Notify (n) flag" for the - * H_INT_SET_QUEUE_CONFIG hcall which forces notification - * without using the coalescing mechanisms provided by the - * XIVE END ESBs. This is required on KVM as notification - * using the END ESBs is not supported. - */ - if (kvm_eq.flags != KVM_XIVE_EQ_ALWAYS_NOTIFY) { - pr_err("invalid flags %d\n", kvm_eq.flags); - return -EINVAL; - } - - rc = xive_native_validate_queue_size(kvm_eq.qshift); - if (rc) { - pr_err("invalid queue size %d\n", kvm_eq.qshift); - return rc; - } - /* reset queue and disable queueing */ if (!kvm_eq.qshift) { q->guest_qaddr = 0; @@ -604,6 +586,24 @@ static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive, return 0; } + /* + * sPAPR specifies a "Unconditional Notify (n) flag" for the + * H_INT_SET_QUEUE_CONFIG hcall which forces notification + * without using the coalescing mechanisms provided by the + * XIVE END ESBs. This is required on KVM as notification + * using the END ESBs is not supported. + */ + if (kvm_eq.flags != KVM_XIVE_EQ_ALWAYS_NOTIFY) { + pr_err("invalid flags %d\n", kvm_eq.flags); + return -EINVAL; + } + + rc = xive_native_validate_queue_size(kvm_eq.qshift); + if (rc) { + pr_err("invalid queue size %d\n", kvm_eq.qshift); + return rc; + } + if (kvm_eq.qaddr & ((1ull << kvm_eq.qshift) - 1)) { pr_err("queue page is not aligned %llx/%llx\n", kvm_eq.qaddr, 1ull << kvm_eq.qshift); -- cgit v1.2.3 From e717d0ae10041d18fa97da1e6bb4942b5eb9f77c Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Mon, 20 May 2019 09:15:14 +0200 Subject: KVM: PPC: Book3S HV: XIVE: Fix the enforced limit on the vCPU identifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a vCPU is connected to the KVM device, it is done using its vCPU identifier in the guest. Fix the enforced limit on the vCPU identifier by taking into account the SMT mode. Reported-by: Satheesh Rajendran Tested-by: Satheesh Rajendran Signed-off-by: Cédric Le Goater Reviewed-by: Greg Kurz Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_xive_native.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 53b7159dc305..3caf64b96f7e 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -109,7 +109,7 @@ int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, return -EPERM; if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT) return -EBUSY; - if (server_num >= KVM_MAX_VCPUS) { + if (server_num >= (KVM_MAX_VCPUS * vcpu->kvm->arch.emul_smt_mode)) { pr_devel("Out of bounds !\n"); return -EINVAL; } -- cgit v1.2.3 From 7e10b9a6da619bcee243980d9c45112761a86e11 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Fri, 24 May 2019 15:20:30 +0200 Subject: KVM: PPC: Book3S HV: XIVE: Introduce a new mutex for the XIVE device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The XICS-on-XIVE KVM device needs to allocate XIVE event queues when a priority is used by the OS. This is referred as EQ provisioning and it is done under the hood when : 1. a CPU is hot-plugged in the VM 2. the "set-xive" is called at VM startup 3. sources are restored at VM restore The kvm->lock mutex is used to protect the different XIVE structures being modified but in some contexts, kvm->lock is taken under the vcpu->mutex which is not permitted by the KVM locking rules. Introduce a new mutex 'lock' for the KVM devices for them to synchronize accesses to the XIVE device structures. Reviewed-by: Greg Kurz Signed-off-by: Cédric Le Goater Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_xive.c | 23 +++++++++++++---------- arch/powerpc/kvm/book3s_xive.h | 1 + arch/powerpc/kvm/book3s_xive_native.c | 15 ++++++++------- 3 files changed, 22 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index f623451ec0a3..12c8a36dd980 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -271,14 +271,14 @@ static int xive_provision_queue(struct kvm_vcpu *vcpu, u8 prio) return rc; } -/* Called with kvm_lock held */ +/* Called with xive->lock held */ static int xive_check_provisioning(struct kvm *kvm, u8 prio) { struct kvmppc_xive *xive = kvm->arch.xive; struct kvm_vcpu *vcpu; int i, rc; - lockdep_assert_held(&kvm->lock); + lockdep_assert_held(&xive->lock); /* Already provisioned ? */ if (xive->qmap & (1 << prio)) @@ -621,9 +621,12 @@ int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server, irq, server, priority); /* First, check provisioning of queues */ - if (priority != MASKED) + if (priority != MASKED) { + mutex_lock(&xive->lock); rc = xive_check_provisioning(xive->kvm, xive_prio_from_guest(priority)); + mutex_unlock(&xive->lock); + } if (rc) { pr_devel(" provisioning failure %d !\n", rc); return rc; @@ -1199,7 +1202,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, return -ENOMEM; /* We need to synchronize with queue provisioning */ - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&xive->lock); vcpu->arch.xive_vcpu = xc; xc->xive = xive; xc->vcpu = vcpu; @@ -1283,7 +1286,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_00); bail: - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&xive->lock); if (r) { kvmppc_xive_cleanup_vcpu(vcpu); return r; @@ -1527,13 +1530,12 @@ static int xive_get_source(struct kvmppc_xive *xive, long irq, u64 addr) struct kvmppc_xive_src_block *kvmppc_xive_create_src_block( struct kvmppc_xive *xive, int irq) { - struct kvm *kvm = xive->kvm; struct kvmppc_xive_src_block *sb; int i, bid; bid = irq >> KVMPPC_XICS_ICS_SHIFT; - mutex_lock(&kvm->lock); + mutex_lock(&xive->lock); /* block already exists - somebody else got here first */ if (xive->src_blocks[bid]) @@ -1560,7 +1562,7 @@ struct kvmppc_xive_src_block *kvmppc_xive_create_src_block( xive->max_sbid = bid; out: - mutex_unlock(&kvm->lock); + mutex_unlock(&xive->lock); return xive->src_blocks[bid]; } @@ -1670,9 +1672,9 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr) /* If we have a priority target the interrupt */ if (act_prio != MASKED) { /* First, check provisioning of queues */ - mutex_lock(&xive->kvm->lock); + mutex_lock(&xive->lock); rc = xive_check_provisioning(xive->kvm, act_prio); - mutex_unlock(&xive->kvm->lock); + mutex_unlock(&xive->lock); /* Target interrupt */ if (rc == 0) @@ -1963,6 +1965,7 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) dev->private = xive; xive->dev = dev; xive->kvm = kvm; + mutex_init(&xive->lock); /* Already there ? */ if (kvm->arch.xive) diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h index 426146332984..862c2c9650ae 100644 --- a/arch/powerpc/kvm/book3s_xive.h +++ b/arch/powerpc/kvm/book3s_xive.h @@ -141,6 +141,7 @@ struct kvmppc_xive { struct kvmppc_xive_ops *ops; struct address_space *mapping; struct mutex mapping_lock; + struct mutex lock; }; #define KVMPPC_XIVE_Q_COUNT 8 diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 3caf64b96f7e..fec3b85411ef 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -114,7 +114,7 @@ int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, return -EINVAL; } - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&xive->lock); if (kvmppc_xive_find_server(vcpu->kvm, server_num)) { pr_devel("Duplicate !\n"); @@ -159,7 +159,7 @@ int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, /* TODO: reset all queues to a clean state ? */ bail: - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&xive->lock); if (rc) kvmppc_xive_native_cleanup_vcpu(vcpu); @@ -772,7 +772,7 @@ static int kvmppc_xive_reset(struct kvmppc_xive *xive) pr_devel("%s\n", __func__); - mutex_lock(&kvm->lock); + mutex_lock(&xive->lock); kvm_for_each_vcpu(i, vcpu, kvm) { struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; @@ -810,7 +810,7 @@ static int kvmppc_xive_reset(struct kvmppc_xive *xive) } } - mutex_unlock(&kvm->lock); + mutex_unlock(&xive->lock); return 0; } @@ -878,7 +878,7 @@ static int kvmppc_xive_native_eq_sync(struct kvmppc_xive *xive) pr_devel("%s\n", __func__); - mutex_lock(&kvm->lock); + mutex_lock(&xive->lock); for (i = 0; i <= xive->max_sbid; i++) { struct kvmppc_xive_src_block *sb = xive->src_blocks[i]; @@ -892,7 +892,7 @@ static int kvmppc_xive_native_eq_sync(struct kvmppc_xive *xive) kvm_for_each_vcpu(i, vcpu, kvm) { kvmppc_xive_native_vcpu_eq_sync(vcpu); } - mutex_unlock(&kvm->lock); + mutex_unlock(&xive->lock); return 0; } @@ -965,7 +965,7 @@ static int kvmppc_xive_native_has_attr(struct kvm_device *dev, } /* - * Called when device fd is closed + * Called when device fd is closed. kvm->lock is held. */ static void kvmppc_xive_native_release(struct kvm_device *dev) { @@ -1064,6 +1064,7 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type) xive->kvm = kvm; kvm->arch.xive = xive; mutex_init(&xive->mapping_lock); + mutex_init(&xive->lock); /* * Allocate a bunch of VPs. KVM_MAX_VCPUS is a large value for -- cgit v1.2.3 From ef9740204051d0e00f5402fe96cf3a43ddd2bbbf Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Tue, 28 May 2019 14:17:15 +0200 Subject: KVM: PPC: Book3S HV: XIVE: Do not clear IRQ data of passthrough interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The passthrough interrupts are defined at the host level and their IRQ data should not be cleared unless specifically deconfigured (shutdown) by the host. They differ from the IPI interrupts which are allocated by the XIVE KVM device and reserved to the guest usage only. This fixes a host crash when destroying a VM in which a PCI adapter was passed-through. In this case, the interrupt is cleared and freed by the KVM device and then shutdown by vfio at the host level. [ 1007.360265] BUG: Kernel NULL pointer dereference at 0x00000d00 [ 1007.360285] Faulting instruction address: 0xc00000000009da34 [ 1007.360296] Oops: Kernel access of bad area, sig: 7 [#1] [ 1007.360303] LE PAGE_SIZE=64K MMU=Radix MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV [ 1007.360314] Modules linked in: vhost_net vhost iptable_mangle ipt_MASQUERADE iptable_nat nf_nat xt_conntrack nf_conntrack nf_defrag_ipv4 ipt_REJECT nf_reject_ipv4 tun bridge stp llc kvm_hv kvm xt_tcpudp iptable_filter squashfs fuse binfmt_misc vmx_crypto ib_iser rdma_cm iw_cm ib_cm libiscsi scsi_transport_iscsi nfsd ip_tables x_tables autofs4 btrfs zstd_decompress zstd_compress lzo_compress raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq multipath mlx5_ib ib_uverbs ib_core crc32c_vpmsum mlx5_core [ 1007.360425] CPU: 9 PID: 15576 Comm: CPU 18/KVM Kdump: loaded Not tainted 5.1.0-gad7e7d0ef #4 [ 1007.360454] NIP: c00000000009da34 LR: c00000000009e50c CTR: c00000000009e5d0 [ 1007.360482] REGS: c000007f24ccf330 TRAP: 0300 Not tainted (5.1.0-gad7e7d0ef) [ 1007.360500] MSR: 900000000280b033 CR: 24002484 XER: 00000000 [ 1007.360532] CFAR: c00000000009da10 DAR: 0000000000000d00 DSISR: 00080000 IRQMASK: 1 [ 1007.360532] GPR00: c00000000009e62c c000007f24ccf5c0 c000000001510600 c000007fe7f947c0 [ 1007.360532] GPR04: 0000000000000d00 0000000000000000 0000000000000000 c000005eff02d200 [ 1007.360532] GPR08: 0000000000400000 0000000000000000 0000000000000000 fffffffffffffffd [ 1007.360532] GPR12: c00000000009e5d0 c000007fffff7b00 0000000000000031 000000012c345718 [ 1007.360532] GPR16: 0000000000000000 0000000000000008 0000000000418004 0000000000040100 [ 1007.360532] GPR20: 0000000000000000 0000000008430000 00000000003c0000 0000000000000027 [ 1007.360532] GPR24: 00000000000000ff 0000000000000000 00000000000000ff c000007faa90d98c [ 1007.360532] GPR28: c000007faa90da40 00000000000fe040 ffffffffffffffff c000007fe7f947c0 [ 1007.360689] NIP [c00000000009da34] xive_esb_read+0x34/0x120 [ 1007.360706] LR [c00000000009e50c] xive_do_source_set_mask.part.0+0x2c/0x50 [ 1007.360732] Call Trace: [ 1007.360738] [c000007f24ccf5c0] [c000000000a6383c] snooze_loop+0x15c/0x270 (unreliable) [ 1007.360775] [c000007f24ccf5f0] [c00000000009e62c] xive_irq_shutdown+0x5c/0xe0 [ 1007.360795] [c000007f24ccf630] [c00000000019e4a0] irq_shutdown+0x60/0xe0 [ 1007.360813] [c000007f24ccf660] [c000000000198c44] __free_irq+0x3a4/0x420 [ 1007.360831] [c000007f24ccf700] [c000000000198dc8] free_irq+0x78/0xe0 [ 1007.360849] [c000007f24ccf730] [c00000000096c5a8] vfio_msi_set_vector_signal+0xa8/0x350 [ 1007.360878] [c000007f24ccf7f0] [c00000000096c938] vfio_msi_set_block+0xe8/0x1e0 [ 1007.360899] [c000007f24ccf850] [c00000000096cae0] vfio_msi_disable+0xb0/0x110 [ 1007.360912] [c000007f24ccf8a0] [c00000000096cd04] vfio_pci_set_msi_trigger+0x1c4/0x3d0 [ 1007.360922] [c000007f24ccf910] [c00000000096d910] vfio_pci_set_irqs_ioctl+0xa0/0x170 [ 1007.360941] [c000007f24ccf930] [c00000000096b400] vfio_pci_disable+0x80/0x5e0 [ 1007.360963] [c000007f24ccfa10] [c00000000096b9bc] vfio_pci_release+0x5c/0x90 [ 1007.360991] [c000007f24ccfa40] [c000000000963a9c] vfio_device_fops_release+0x3c/0x70 [ 1007.361012] [c000007f24ccfa70] [c0000000003b5668] __fput+0xc8/0x2b0 [ 1007.361040] [c000007f24ccfac0] [c0000000001409b0] task_work_run+0x140/0x1b0 [ 1007.361059] [c000007f24ccfb20] [c000000000118f8c] do_exit+0x3ac/0xd00 [ 1007.361076] [c000007f24ccfc00] [c0000000001199b0] do_group_exit+0x60/0x100 [ 1007.361094] [c000007f24ccfc40] [c00000000012b514] get_signal+0x1a4/0x8f0 [ 1007.361112] [c000007f24ccfd30] [c000000000021cc8] do_notify_resume+0x1a8/0x430 [ 1007.361141] [c000007f24ccfe20] [c00000000000e444] ret_from_except_lite+0x70/0x74 [ 1007.361159] Instruction dump: [ 1007.361175] 38422c00 e9230000 712a0004 41820010 548a2036 7d442378 78840020 71290020 [ 1007.361194] 4082004c e9230010 7c892214 7c0004ac 0c090000 4c00012c 792a0022 Cc: stable@vger.kernel.org # v4.12+ Fixes: 5af50993850a ("KVM: PPC: Book3S HV: Native usage of the XIVE interrupt controller") Signed-off-by: Cédric Le Goater Signed-off-by: Greg Kurz Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_xive.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 12c8a36dd980..922fd62bcd2a 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -1828,7 +1828,6 @@ static void kvmppc_xive_cleanup_irq(u32 hw_num, struct xive_irq_data *xd) { xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01); xive_native_configure_irq(hw_num, 0, MASKED, 0); - xive_cleanup_irq_data(xd); } void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb) @@ -1842,9 +1841,10 @@ void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb) continue; kvmppc_xive_cleanup_irq(state->ipi_number, &state->ipi_data); + xive_cleanup_irq_data(&state->ipi_data); xive_native_free_irq(state->ipi_number); - /* Pass-through, cleanup too */ + /* Pass-through, cleanup too but keep IRQ hw data */ if (state->pt_number) kvmppc_xive_cleanup_irq(state->pt_number, state->pt_data); -- cgit v1.2.3 From aedb5b19429c807331e8387bcb98cb5dc4c2a75e Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Tue, 28 May 2019 14:17:16 +0200 Subject: KVM: PPC: Book3S HV: XIVE: Take the srcu read lock when accessing memslots MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to Documentation/virtual/kvm/locking.txt, the srcu read lock should be taken when accessing the memslots of the VM. The XIVE KVM device needs to do so when configuring the page of the OS event queue of vCPU for a given priority and when marking the same page dirty before migration. This avoids warnings such as : [ 208.224882] ============================= [ 208.224884] WARNING: suspicious RCU usage [ 208.224889] 5.2.0-rc2-xive+ #47 Not tainted [ 208.224890] ----------------------------- [ 208.224894] ../include/linux/kvm_host.h:633 suspicious rcu_dereference_check() usage! [ 208.224896] other info that might help us debug this: [ 208.224898] rcu_scheduler_active = 2, debug_locks = 1 [ 208.224901] no locks held by qemu-system-ppc/3923. [ 208.224902] stack backtrace: [ 208.224907] CPU: 64 PID: 3923 Comm: qemu-system-ppc Kdump: loaded Not tainted 5.2.0-rc2-xive+ #47 [ 208.224909] Call Trace: [ 208.224918] [c000200cdd98fa30] [c000000000be1934] dump_stack+0xe8/0x164 (unreliable) [ 208.224924] [c000200cdd98fa80] [c0000000001aec80] lockdep_rcu_suspicious+0x110/0x180 [ 208.224935] [c000200cdd98fb00] [c0080000075933a0] gfn_to_memslot+0x1c8/0x200 [kvm] [ 208.224943] [c000200cdd98fb40] [c008000007599600] gfn_to_pfn+0x28/0x60 [kvm] [ 208.224951] [c000200cdd98fb70] [c008000007599658] gfn_to_page+0x20/0x40 [kvm] [ 208.224959] [c000200cdd98fb90] [c0080000075b495c] kvmppc_xive_native_set_attr+0x8b4/0x1480 [kvm] [ 208.224967] [c000200cdd98fca0] [c00800000759261c] kvm_device_ioctl_attr+0x64/0xb0 [kvm] [ 208.224974] [c000200cdd98fcf0] [c008000007592730] kvm_device_ioctl+0xc8/0x110 [kvm] [ 208.224979] [c000200cdd98fd10] [c000000000433a24] do_vfs_ioctl+0xd4/0xcd0 [ 208.224981] [c000200cdd98fdb0] [c000000000434724] ksys_ioctl+0x104/0x120 [ 208.224984] [c000200cdd98fe00] [c000000000434768] sys_ioctl+0x28/0x80 [ 208.224988] [c000200cdd98fe20] [c00000000000b888] system_call+0x5c/0x70 legoater@boss01:~$ Fixes: 13ce3297c576 ("KVM: PPC: Book3S HV: XIVE: Add controls for the EQ configuration") Fixes: e6714bd1671d ("KVM: PPC: Book3S HV: XIVE: Add a control to dirty the XIVE EQ pages") Signed-off-by: Cédric Le Goater Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_xive_native.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index fec3b85411ef..8b762e3ebbc5 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -535,6 +535,7 @@ static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive, struct xive_q *q; gfn_t gfn; unsigned long page_size; + int srcu_idx; /* * Demangle priority/server tuple from the EQ identifier @@ -610,20 +611,24 @@ static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive, return -EINVAL; } + srcu_idx = srcu_read_lock(&kvm->srcu); gfn = gpa_to_gfn(kvm_eq.qaddr); page = gfn_to_page(kvm, gfn); if (is_error_page(page)) { + srcu_read_unlock(&kvm->srcu, srcu_idx); pr_err("Couldn't get queue page %llx!\n", kvm_eq.qaddr); return -EINVAL; } page_size = kvm_host_page_size(kvm, gfn); if (1ull << kvm_eq.qshift > page_size) { + srcu_read_unlock(&kvm->srcu, srcu_idx); pr_warn("Incompatible host page size %lx!\n", page_size); return -EINVAL; } qaddr = page_to_virt(page) + (kvm_eq.qaddr & ~PAGE_MASK); + srcu_read_unlock(&kvm->srcu, srcu_idx); /* * Backup the queue page guest address to the mark EQ page @@ -854,6 +859,7 @@ static int kvmppc_xive_native_vcpu_eq_sync(struct kvm_vcpu *vcpu) { struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; unsigned int prio; + int srcu_idx; if (!xc) return -ENOENT; @@ -865,7 +871,9 @@ static int kvmppc_xive_native_vcpu_eq_sync(struct kvm_vcpu *vcpu) continue; /* Mark EQ page dirty for migration */ + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); mark_page_dirty(vcpu->kvm, gpa_to_gfn(q->guest_qaddr)); + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); } return 0; } -- cgit v1.2.3 From bcaa3110d584f982a17e9ddbfc03e1130bca2bc9 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Tue, 28 May 2019 23:13:24 +0200 Subject: KVM: PPC: Book3S HV: XIVE: Fix page offset when clearing ESB pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Under XIVE, the ESB pages of an interrupt are used for interrupt management (EOI) and triggering. They are made available to guests through a mapping of the XIVE KVM device. When a device is passed-through, the passthru_irq helpers, kvmppc_xive_set_mapped() and kvmppc_xive_clr_mapped(), clear the ESB pages of the guest IRQ number being mapped and let the VM fault handler repopulate with the correct page. The ESB pages are mapped at offset 4 (KVM_XIVE_ESB_PAGE_OFFSET) in the KVM device mapping. Unfortunately, this offset was not taken into account when clearing the pages. This lead to issues with the passthrough devices for which the interrupts were not functional under some guest configuration (tg3 and single CPU) or in any configuration (e1000e adapter). Reviewed-by: Greg Kurz Tested-by: Greg Kurz Signed-off-by: Cédric Le Goater Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_xive_native.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 8b762e3ebbc5..5596c8ec221a 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -172,6 +172,7 @@ bail: static int kvmppc_xive_native_reset_mapped(struct kvm *kvm, unsigned long irq) { struct kvmppc_xive *xive = kvm->arch.xive; + pgoff_t esb_pgoff = KVM_XIVE_ESB_PAGE_OFFSET + irq * 2; if (irq >= KVMPPC_XIVE_NR_IRQS) return -EINVAL; @@ -185,7 +186,7 @@ static int kvmppc_xive_native_reset_mapped(struct kvm *kvm, unsigned long irq) mutex_lock(&xive->mapping_lock); if (xive->mapping) unmap_mapping_range(xive->mapping, - irq * (2ull << PAGE_SHIFT), + esb_pgoff << PAGE_SHIFT, 2ull << PAGE_SHIFT, 1); mutex_unlock(&xive->mapping_lock); return 0; -- cgit v1.2.3 From 1b28d5531e446a87bbefa5ced191c4cbd316576c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 28 May 2019 15:01:59 +1000 Subject: KVM: PPC: Book3S HV: Fix lockdep warning when entering guest on POWER9 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 3309bec85e60 ("KVM: PPC: Book3S HV: Fix lockdep warning when entering the guest") moved calls to trace_hardirqs_{on,off} in the entry path used for HPT guests. Similar code exists in the new streamlined entry path used for radix guests on POWER9. This makes the same change there, so as to avoid lockdep warnings such as this: [ 228.686461] DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled) [ 228.686480] WARNING: CPU: 116 PID: 3803 at ../kernel/locking/lockdep.c:4219 check_flags.part.23+0x21c/0x270 [ 228.686544] Modules linked in: vhost_net vhost xt_CHECKSUM iptable_mangle xt_MASQUERADE iptable_nat nf_nat +xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ipt_REJECT nf_reject_ipv4 tun bridge stp llc ebtable_filter +ebtables ip6table_filter ip6_tables iptable_filter fuse kvm_hv kvm at24 ipmi_powernv regmap_i2c ipmi_devintf +uio_pdrv_genirq ofpart ipmi_msghandler uio powernv_flash mtd ibmpowernv opal_prd ip_tables ext4 mbcache jbd2 btrfs +zstd_decompress zstd_compress raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx libcrc32c xor +raid6_pq raid1 raid0 ses sd_mod enclosure scsi_transport_sas ast i2c_opal i2c_algo_bit drm_kms_helper syscopyarea +sysfillrect sysimgblt fb_sys_fops ttm drm i40e e1000e cxl aacraid tg3 drm_panel_orientation_quirks i2c_core [ 228.686859] CPU: 116 PID: 3803 Comm: qemu-system-ppc Kdump: loaded Not tainted 5.2.0-rc1-xive+ #42 [ 228.686911] NIP: c0000000001b394c LR: c0000000001b3948 CTR: c000000000bfad20 [ 228.686963] REGS: c000200cdb50f570 TRAP: 0700 Not tainted (5.2.0-rc1-xive+) [ 228.687001] MSR: 9000000002823033 CR: 48222222 XER: 20040000 [ 228.687060] CFAR: c000000000116db0 IRQMASK: 1 [ 228.687060] GPR00: c0000000001b3948 c000200cdb50f800 c0000000015e7600 000000000000002e [ 228.687060] GPR04: 0000000000000001 c0000000001c71a0 000000006e655f73 72727563284e4f5f [ 228.687060] GPR08: 0000200e60680000 0000000000000000 c000200cdb486180 0000000000000000 [ 228.687060] GPR12: 0000000000002000 c000200fff61a680 0000000000000000 00007fffb75c0000 [ 228.687060] GPR16: 0000000000000000 0000000000000000 c0000000017d6900 c000000001124900 [ 228.687060] GPR20: 0000000000000074 c008000006916f68 0000000000000074 0000000000000074 [ 228.687060] GPR24: ffffffffffffffff ffffffffffffffff 0000000000000003 c000200d4b600000 [ 228.687060] GPR28: c000000001627e58 c000000001489908 c000000001627e58 c000000002304de0 [ 228.687377] NIP [c0000000001b394c] check_flags.part.23+0x21c/0x270 [ 228.687415] LR [c0000000001b3948] check_flags.part.23+0x218/0x270 [ 228.687466] Call Trace: [ 228.687488] [c000200cdb50f800] [c0000000001b3948] check_flags.part.23+0x218/0x270 (unreliable) [ 228.687542] [c000200cdb50f870] [c0000000001b6548] lock_is_held_type+0x188/0x1c0 [ 228.687595] [c000200cdb50f8d0] [c0000000001d939c] rcu_read_lock_sched_held+0xdc/0x100 [ 228.687646] [c000200cdb50f900] [c0000000001dd704] rcu_note_context_switch+0x304/0x340 [ 228.687701] [c000200cdb50f940] [c0080000068fcc58] kvmhv_run_single_vcpu+0xdb0/0x1120 [kvm_hv] [ 228.687756] [c000200cdb50fa20] [c0080000068fd5b0] kvmppc_vcpu_run_hv+0x5e8/0xe40 [kvm_hv] [ 228.687816] [c000200cdb50faf0] [c0080000071797dc] kvmppc_vcpu_run+0x34/0x48 [kvm] [ 228.687863] [c000200cdb50fb10] [c0080000071755dc] kvm_arch_vcpu_ioctl_run+0x244/0x420 [kvm] [ 228.687916] [c000200cdb50fba0] [c008000007165ccc] kvm_vcpu_ioctl+0x424/0x838 [kvm] [ 228.687957] [c000200cdb50fd10] [c000000000433a24] do_vfs_ioctl+0xd4/0xcd0 [ 228.687995] [c000200cdb50fdb0] [c000000000434724] ksys_ioctl+0x104/0x120 [ 228.688033] [c000200cdb50fe00] [c000000000434768] sys_ioctl+0x28/0x80 [ 228.688072] [c000200cdb50fe20] [c00000000000b888] system_call+0x5c/0x70 [ 228.688109] Instruction dump: [ 228.688142] 4bf6342d 60000000 0fe00000 e8010080 7c0803a6 4bfffe60 3c82ff87 3c62ff87 [ 228.688196] 388472d0 3863d738 4bf63405 60000000 <0fe00000> 4bffff4c 3c82ff87 3c62ff87 [ 228.688251] irq event stamp: 205 [ 228.688287] hardirqs last enabled at (205): [] kvmhv_run_single_vcpu+0x30c/0x1120 [kvm_hv] [ 228.688344] hardirqs last disabled at (204): [] kvmhv_run_single_vcpu+0x148/0x1120 [kvm_hv] [ 228.688412] softirqs last enabled at (180): [] __do_softirq+0x4ac/0x5d4 [ 228.688464] softirqs last disabled at (169): [] irq_exit+0x1f8/0x210 [ 228.688513] ---[ end trace eb16f6260022a812 ]--- [ 228.688548] possible reason: unannotated irqs-off. [ 228.688571] irq event stamp: 205 [ 228.688607] hardirqs last enabled at (205): [] kvmhv_run_single_vcpu+0x30c/0x1120 [kvm_hv] [ 228.688664] hardirqs last disabled at (204): [] kvmhv_run_single_vcpu+0x148/0x1120 [kvm_hv] [ 228.688719] softirqs last enabled at (180): [] __do_softirq+0x4ac/0x5d4 [ 228.688758] softirqs last disabled at (169): [] irq_exit+0x1f8/0x210 Cc: stable@vger.kernel.org # v4.20+ Fixes: 95a6432ce903 ("KVM: PPC: Book3S HV: Streamlined guest entry/exit path on P9 for radix guests") Signed-off-by: Paul Mackerras Reviewed-by: Cédric Le Goater Tested-by: Cédric Le Goater Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 27054d301852..9f49339c6d50 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4090,16 +4090,20 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, kvmppc_check_need_tlb_flush(kvm, pcpu, nested); } - trace_hardirqs_on(); guest_enter_irqoff(); srcu_idx = srcu_read_lock(&kvm->srcu); this_cpu_disable_ftrace(); + /* Tell lockdep that we're about to enable interrupts */ + trace_hardirqs_on(); + trap = kvmhv_p9_guest_entry(vcpu, time_limit, lpcr); vcpu->arch.trap = trap; + trace_hardirqs_off(); + this_cpu_enable_ftrace(); srcu_read_unlock(&kvm->srcu, srcu_idx); @@ -4109,7 +4113,6 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, isync(); } - trace_hardirqs_off(); set_irq_happened(trap); kvmppc_set_host_core(pcpu); -- cgit v1.2.3 From d724c9e54939a597592de3659541da11fc7aa112 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Thu, 30 May 2019 12:17:18 +1000 Subject: KVM: PPC: Book3S HV: Restore SPRG3 in kvmhv_p9_guest_entry() The sprgs are a set of 4 general purpose sprs provided for software use. SPRG3 is special in that it can also be read from userspace. Thus it is used on linux to store the cpu and numa id of the process to speed up syscall access to this information. This register is overwritten with the guest value on kvm guest entry, and so needs to be restored on exit again. Thus restore the value on the guest exit path in kvmhv_p9_guest_entry(). Cc: stable@vger.kernel.org # v4.20+ Fixes: 95a6432ce9038 ("KVM: PPC: Book3S HV: Streamlined guest entry/exit path on P9 for radix guests") Signed-off-by: Suraj Jitindar Singh Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 9f49339c6d50..5e840113eda4 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3662,6 +3662,7 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, vc->in_guest = 0; mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb()); + mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso); kvmhv_load_host_pmu(); -- cgit v1.2.3