From 5a69aec945d27e78abac9fd032533d3aaebf7c1e Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 16 Aug 2017 16:01:14 +1000 Subject: powerpc: Fix VSX enabling/flushing to also test MSR_FP and MSR_VEC VSX uses a combination of the old vector registers, the old FP registers and new "second halves" of the FP registers. Thus when we need to see the VSX state in the thread struct (flush_vsx_to_thread()) or when we'll use the VSX in the kernel (enable_kernel_vsx()) we need to ensure they are all flushed into the thread struct if either of them is individually enabled. Unfortunately we only tested if the whole VSX was enabled, not if they were individually enabled. Fixes: 72cd7b44bc99 ("powerpc: Uncomment and make enable_kernel_vsx() routine available") Cc: stable@vger.kernel.org # v4.3+ Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/process.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index ec480966f9bf..1f0fd361e09b 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -362,7 +362,8 @@ void enable_kernel_vsx(void) cpumsr = msr_check_and_set(MSR_FP|MSR_VEC|MSR_VSX); - if (current->thread.regs && (current->thread.regs->msr & MSR_VSX)) { + if (current->thread.regs && + (current->thread.regs->msr & (MSR_VSX|MSR_VEC|MSR_FP))) { check_if_tm_restore_required(current); /* * If a thread has already been reclaimed then the @@ -386,7 +387,7 @@ void flush_vsx_to_thread(struct task_struct *tsk) { if (tsk->thread.regs) { preempt_disable(); - if (tsk->thread.regs->msr & MSR_VSX) { + if (tsk->thread.regs->msr & (MSR_VSX|MSR_VEC|MSR_FP)) { BUG_ON(tsk != current); giveup_vsx(tsk); } -- cgit v1.2.3 From 1a92a80ad386a1a6e3b36d576d52a1a456394b70 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 24 Jul 2017 14:28:00 +1000 Subject: powerpc/mm: Ensure cpumask update is 
ordered There is no guarantee that the various isync's involved with the context switch will order the update of the CPU mask with the first TLB entry for the new context being loaded by the HW. Be safe here and add a memory barrier to order any subsequent load/store which may bring entries into the TLB. The corresponding barrier on the other side already exists as pte updates use pte_xchg() which uses __cmpxchg_u64 which has a sync after the atomic operation. Cc: stable@vger.kernel.org Signed-off-by: Benjamin Herrenschmidt Reviewed-by: Nicholas Piggin [mpe: Add comments in the code] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/mmu_context.h | 18 ++++++++++++++++++ arch/powerpc/include/asm/pgtable-be-types.h | 1 + arch/powerpc/include/asm/pgtable-types.h | 1 + 3 files changed, 20 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 0c76675394c5..35bec1c5bd5a 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -90,6 +90,24 @@ static inline void switch_mm_irqs_off(struct mm_struct *prev, /* Mark this context has been used on the new CPU */ if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next))) { cpumask_set_cpu(smp_processor_id(), mm_cpumask(next)); + + /* + * This full barrier orders the store to the cpumask above vs + * a subsequent operation which allows this CPU to begin loading + * translations for next. + * + * When using the radix MMU that operation is the load of the + * MMU context id, which is then moved to SPRN_PID. + * + * For the hash MMU it is either the first load from slb_cache + * in switch_slb(), and/or the store of paca->mm_ctx_id in + * copy_mm_to_paca(). + * + * On the read side the barrier is in pte_xchg(), which orders + * the store to the PTE vs the load of mm_cpumask. 
+ */ + smp_mb(); + new_on_cpu = true; } diff --git a/arch/powerpc/include/asm/pgtable-be-types.h b/arch/powerpc/include/asm/pgtable-be-types.h index 9c0f5db5cf46..67e7e3d990f4 100644 --- a/arch/powerpc/include/asm/pgtable-be-types.h +++ b/arch/powerpc/include/asm/pgtable-be-types.h @@ -87,6 +87,7 @@ static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new) unsigned long *p = (unsigned long *)ptep; __be64 prev; + /* See comment in switch_mm_irqs_off() */ prev = (__force __be64)__cmpxchg_u64(p, (__force unsigned long)pte_raw(old), (__force unsigned long)pte_raw(new)); diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h index 8bd3b13fe2fb..369a164b545c 100644 --- a/arch/powerpc/include/asm/pgtable-types.h +++ b/arch/powerpc/include/asm/pgtable-types.h @@ -62,6 +62,7 @@ static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new) { unsigned long *p = (unsigned long *)ptep; + /* See comment in switch_mm_irqs_off() */ return pte_val(old) == __cmpxchg_u64(p, pte_val(old), pte_val(new)); } #endif -- cgit v1.2.3 From 92e5aae457787d0bc6b255200d2fb116edf69794 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 18 Aug 2017 15:15:51 -0700 Subject: kernel/watchdog: fix Kconfig constraints for perf hardlockup watchdog Commit 05a4a9527931 ("kernel/watchdog: split up config options") lost the perf-based hardlockup detector's dependency on PERF_EVENTS, which can result in broken builds with some powerpc configurations. Restore the dependency. Add it in for x86 too, despite x86 always selecting PERF_EVENTS it seems reasonable to make the dependency explicit. 
Link: http://lkml.kernel.org/r/20170810114452.6673-1-npiggin@gmail.com Fixes: 05a4a9527931 ("kernel/watchdog: split up config options") Signed-off-by: Nicholas Piggin Acked-by: Don Zickus Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/Kconfig | 2 +- arch/x86/Kconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 36f858c37ca7..81b0031f909f 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -199,7 +199,7 @@ config PPC select HAVE_OPTPROBES if PPC64 select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI if PPC64 - select HAVE_HARDLOCKUP_DETECTOR_PERF if HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH + select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE if SMP diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 781521b7cf9e..29a1bf85e507 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -163,7 +163,7 @@ config X86 select HAVE_PCSPKR_PLATFORM select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI - select HAVE_HARDLOCKUP_DETECTOR_PERF if HAVE_PERF_EVENTS_NMI + select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API -- cgit v1.2.3 From bd0fdb191c8523a9126bb14ac1b22cb47698ebf5 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 13 Mar 2017 03:03:49 +1000 Subject: KVM: PPC: Book3S HV: Use msgsync with hypervisor doorbells on POWER9 When msgsnd is used for IPIs to other cores, msgsync must be executed by the target to order stores performed on the source before its msgsnd (provided the source executes the appropriate sync). 
Fixes: 1704a81ccebc ("KVM: PPC: Book3S HV: Use msgsnd for IPIs to other cores on POWER9") Signed-off-by: Nicholas Piggin Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index c52184a8efdf..9c9c983b864f 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1291,6 +1291,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) /* Hypervisor doorbell - exit only if host IPI flag set */ cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL bne 3f +BEGIN_FTR_SECTION + PPC_MSGSYNC +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) lbz r0, HSTATE_HOST_IPI(r13) cmpwi r0, 0 beq 4f -- cgit v1.2.3 From 2c4fb78f78b6e420604ee1b05bdfb5c1d637869f Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 18 Aug 2017 12:10:52 +1000 Subject: KVM: PPC: Book3S HV: Workaround POWER9 DD1.0 bug causing IPB bit loss This adds a workaround for a bug in POWER9 DD1 chips where changing the CPPR (Current Processor Priority Register) can cause bits in the IPB (Interrupt Pending Buffer) to get lost. Thankfully it only happens when manually manipulating CPPR which is quite rare. When it does happen it can cause interrupts to be delayed or lost. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_xive_template.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c index 4636ca6e7d38..150be86b1018 100644 --- a/arch/powerpc/kvm/book3s_xive_template.c +++ b/arch/powerpc/kvm/book3s_xive_template.c @@ -16,7 +16,16 @@ static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive_vcpu *xc) u8 cppr; u16 ack; - /* XXX DD1 bug workaround: Check PIPR vs. CPPR first ! 
*/ + /* + * DD1 bug workaround: If PIPR is less favored than CPPR + * ignore the interrupt or we might incorrectly lose an IPB + * bit. + */ + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { + u8 pipr = __x_readb(__x_tima + TM_QW1_OS + TM_PIPR); + if (pipr >= xc->hw_cppr) + return; + } /* Perform the acknowledge OS to register cycle. */ ack = be16_to_cpu(__x_readw(__x_tima + TM_SPC_ACK_OS_REG)); -- cgit v1.2.3 From bb9b52bd51dcb17b965a30167d0812902c1b9927 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 18 Aug 2017 12:10:58 +1000 Subject: KVM: PPC: Book3S HV: Add missing barriers to XIVE code and document them This adds missing memory barriers to order updates/tests of the virtual CPPR and MFRR, thus fixing a lost IPI problem. While at it also document all barriers in this file. This fixes a bug causing guest IPIs to occasionally get lost. The symptom then is hangs or stalls in the guest. Signed-off-by: Benjamin Herrenschmidt Tested-by: Guilherme G. Piccoli Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_xive_template.c | 57 +++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c index 150be86b1018..d1ed2c41b5d2 100644 --- a/arch/powerpc/kvm/book3s_xive_template.c +++ b/arch/powerpc/kvm/book3s_xive_template.c @@ -16,6 +16,12 @@ static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive_vcpu *xc) u8 cppr; u16 ack; + /* + * Ensure any previous store to CPPR is ordered vs. + * the subsequent loads from PIPR or ACK. + */ + eieio(); + /* * DD1 bug workaround: If PIPR is less favored than CPPR * ignore the interrupt or we might incorrectly lose an IPB @@ -244,6 +250,11 @@ skip_ipi: /* * If we found an interrupt, adjust what the guest CPPR should * be as if we had just fetched that interrupt from HW. 
+ * + * Note: This can only make xc->cppr smaller as the previous + * loop will only exit with hirq != 0 if prio is lower than + * the current xc->cppr. Thus we don't need to re-check xc->mfrr + * for pending IPIs. */ if (hirq) xc->cppr = prio; @@ -389,6 +400,12 @@ X_STATIC int GLUE(X_PFX,h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr) old_cppr = xc->cppr; xc->cppr = cppr; + /* + * Order the above update of xc->cppr with the subsequent + * read of xc->mfrr inside push_pending_to_hw() + */ + smp_mb(); + /* * We are masking less, we need to look for pending things * to deliver and set VP pending bits accordingly to trigger @@ -429,21 +446,37 @@ X_STATIC int GLUE(X_PFX,h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr) * used to signal MFRR changes is EOId when fetched from * the queue. */ - if (irq == XICS_IPI || irq == 0) + if (irq == XICS_IPI || irq == 0) { + /* + * This barrier orders the setting of xc->cppr vs. + * subsequent test of xc->mfrr done inside + * scan_interrupts and push_pending_to_hw + */ + smp_mb(); goto bail; + } /* Find interrupt source */ sb = kvmppc_xive_find_source(xive, irq, &src); if (!sb) { pr_devel(" source not found !\n"); rc = H_PARAMETER; + /* Same as above */ + smp_mb(); goto bail; } state = &sb->irq_state[src]; kvmppc_xive_select_irq(state, &hw_num, &xd); state->in_eoi = true; - mb(); + + /* + * This barrier orders both setting of in_eoi above vs. + * subsequent test of guest_priority, and the setting + * of xc->cppr vs. subsequent test of xc->mfrr done inside + * scan_interrupts and push_pending_to_hw + */ + smp_mb(); again: if (state->guest_priority == MASKED) { @@ -470,6 +503,14 @@ again: } + /* + * This barrier orders the above guest_priority check + * and spin_lock/unlock with clearing in_eoi below. + * + * It also has to be a full mb() as it must ensure + * the MMIOs done in source_eoi() are completed before + * state->in_eoi is visible. 
+ */ mb(); state->in_eoi = false; bail: @@ -504,6 +545,18 @@ X_STATIC int GLUE(X_PFX,h_ipi)(struct kvm_vcpu *vcpu, unsigned long server, /* Locklessly write over MFRR */ xc->mfrr = mfrr; + /* + * The load of xc->cppr below and the subsequent MMIO store + * to the IPI must happen after the above mfrr update is + * globally visible so that: + * + * - Synchronize with another CPU doing an H_EOI or a H_CPPR + * updating xc->cppr then reading xc->mfrr. + * + * - The target of the IPI sees the xc->mfrr update + */ + mb(); + /* Shoot the IPI if most favored than target cppr */ if (mfrr < xc->cppr) __x_writeq(0, __x_trig_page(&xc->vp_ipi_data)); -- cgit v1.2.3 From 47c5310a8dbe7c2cb9f0083daa43ceed76c257fa Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 24 Aug 2017 19:14:47 +1000 Subject: KVM: PPC: Book3S: Fix race and leak in kvm_vm_ioctl_create_spapr_tce() Nixiaoming pointed out that there is a memory leak in kvm_vm_ioctl_create_spapr_tce() if the call to anon_inode_getfd() fails; the memory allocated for the kvmppc_spapr_tce_table struct is not freed, and nor are the pages allocated for the iommu tables. In addition, we have already incremented the process's count of locked memory pages, and this doesn't get restored on error. David Hildenbrand pointed out that there is a race in that the function checks early on that there is not already an entry in the stt->iommu_tables list with the same LIOBN, but an entry with the same LIOBN could get added between then and when the new entry is added to the list. This fixes all three problems. To simplify things, we now call anon_inode_getfd() before placing the new entry in the list. The check for an existing entry is done while holding the kvm->lock mutex, immediately before adding the new entry to the list. Finally, on failure we now call kvmppc_account_memlimit to decrement the process's count of locked memory pages. 
Reported-by: Nixiaoming Reported-by: David Hildenbrand Signed-off-by: Paul Mackerras Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s_64_vio.c | 56 ++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 22 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index a160c14304eb..53766e2bc029 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -294,32 +294,26 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce_64 *args) { struct kvmppc_spapr_tce_table *stt = NULL; + struct kvmppc_spapr_tce_table *siter; unsigned long npages, size; int ret = -ENOMEM; int i; + int fd = -1; if (!args->size) return -EINVAL; - /* Check this LIOBN hasn't been previously allocated */ - list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { - if (stt->liobn == args->liobn) - return -EBUSY; - } - size = _ALIGN_UP(args->size, PAGE_SIZE >> 3); npages = kvmppc_tce_pages(size); ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true); - if (ret) { - stt = NULL; - goto fail; - } + if (ret) + return ret; ret = -ENOMEM; stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *), GFP_KERNEL); if (!stt) - goto fail; + goto fail_acct; stt->liobn = args->liobn; stt->page_shift = args->page_shift; @@ -334,24 +328,42 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, goto fail; } - kvm_get_kvm(kvm); + ret = fd = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, + stt, O_RDWR | O_CLOEXEC); + if (ret < 0) + goto fail; mutex_lock(&kvm->lock); - list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); + + /* Check this LIOBN hasn't been previously allocated */ + ret = 0; + list_for_each_entry(siter, &kvm->arch.spapr_tce_tables, list) { + if (siter->liobn == args->liobn) { + ret = -EBUSY; + break; + } + } + + if (!ret) { + list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); + kvm_get_kvm(kvm); + } mutex_unlock(&kvm->lock); - 
return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, - stt, O_RDWR | O_CLOEXEC); + if (!ret) + return fd; -fail: - if (stt) { - for (i = 0; i < npages; i++) - if (stt->pages[i]) - __free_page(stt->pages[i]); + put_unused_fd(fd); - kfree(stt); - } + fail: + for (i = 0; i < npages; i++) + if (stt->pages[i]) + __free_page(stt->pages[i]); + + kfree(stt); + fail_acct: + kvmppc_account_memlimit(kvmppc_stt_pages(npages), false); return ret; } -- cgit v1.2.3