diff options
author | Paolo Bonzini <pbonzini@redhat.com> | 2019-09-10 17:51:17 +0300 |
---|---|---|
committer | Paolo Bonzini <pbonzini@redhat.com> | 2019-09-10 17:51:17 +0300 |
commit | 8146856b0a2693d5f7334f720c9b39a987f7571d (patch) | |
tree | e51fd6831162103d32e8a2ad83bf4077768a51b2 /arch/powerpc | |
parent | 16cfacc8085782dab8e365979356ce1ca87fd6cc (diff) | |
parent | ff42df49e75f053a8a6b4c2533100cdcc23afe69 (diff) | |
download | linux-8146856b0a2693d5f7334f720c9b39a987f7571d.tar.xz |
Merge tag 'kvm-ppc-next-5.4-1' of git://git.kernel.org/pub/scm/linux/kernel/git/paulus/powerpc into HEAD
PPC KVM update for 5.4
- Some prep for extending the uses of the rmap array
- Various minor fixes
- Commits from the powerpc topic/ppc-kvm branch, which fix a problem
with interrupts arriving after free_irq, causing host hangs and crashes.
Diffstat (limited to 'arch/powerpc')
28 files changed, 328 insertions, 98 deletions
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index d8dcd8820369..77f6ebf97113 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -121,6 +121,7 @@ config PPC select ARCH_32BIT_OFF_T if PPC32 select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEVMEM_IS_ALLOWED + select ARCH_HAS_DMA_MMAP_PGPROT select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 463c63a9fcf1..11112023e327 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -302,9 +302,14 @@ #define H_SCM_UNBIND_MEM 0x3F0 #define H_SCM_QUERY_BLOCK_MEM_BINDING 0x3F4 #define H_SCM_QUERY_LOGICAL_MEM_BINDING 0x3F8 -#define H_SCM_MEM_QUERY 0x3FC -#define H_SCM_BLOCK_CLEAR 0x400 -#define MAX_HCALL_OPCODE H_SCM_BLOCK_CLEAR +#define H_SCM_UNBIND_ALL 0x3FC +#define H_SCM_HEALTH 0x400 +#define H_SCM_PERFORMANCE_STATS 0x418 +#define MAX_HCALL_OPCODE H_SCM_PERFORMANCE_STATS + +/* Scope args for H_SCM_UNBIND_ALL */ +#define H_UNBIND_SCOPE_ALL (0x1) +#define H_UNBIND_SCOPE_DRC (0x2) /* H_VIOCTL functions */ #define H_GET_VIOA_DUMP_SIZE 0x01 diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index e6e5f59aaa97..6fb5fb4779e0 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -232,11 +232,25 @@ struct revmap_entry { }; /* - * We use the top bit of each memslot->arch.rmap entry as a lock bit, - * and bit 32 as a present flag. The bottom 32 bits are the - * index in the guest HPT of a HPTE that points to the page. + * The rmap array of size number of guest pages is allocated for each memslot. + * This array is used to store usage specific information about the guest page. + * Below are the encodings of the various possible usage types. */ -#define KVMPPC_RMAP_LOCK_BIT 63 +/* Free bits which can be used to define a new usage */ +#define KVMPPC_RMAP_TYPE_MASK 0xff00000000000000 +#define KVMPPC_RMAP_NESTED 0xc000000000000000 /* Nested rmap array */ +#define KVMPPC_RMAP_HPT 0x0100000000000000 /* HPT guest */ + +/* + * rmap usage definition for a hash page table (hpt) guest: + * 0x0000080000000000 Lock bit + * 0x0000018000000000 RC bits + * 0x0000000100000000 Present bit + * 0x00000000ffffffff HPT index bits + * The bottom 32 bits are the index in the guest HPT of a HPTE that points to + * the page. + */ +#define KVMPPC_RMAP_LOCK_BIT 43 #define KVMPPC_RMAP_RC_SHIFT 32 #define KVMPPC_RMAP_REFERENCED (HPTE_R_R << KVMPPC_RMAP_RC_SHIFT) #define KVMPPC_RMAP_PRESENT 0x100000000ul diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2484e6a8f5ca..8e8514efb124 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -598,6 +598,7 @@ extern int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val); extern int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val); +extern bool kvmppc_xive_native_supported(void); #else static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server, diff --git a/arch/powerpc/include/asm/pmc.h b/arch/powerpc/include/asm/pmc.h index dc9a1ca70edf..c6bbe9778d3c 100644 --- a/arch/powerpc/include/asm/pmc.h +++ b/arch/powerpc/include/asm/pmc.h @@ -27,11 +27,10 @@ static inline void ppc_set_pmu_inuse(int inuse) #ifdef CONFIG_PPC_PSERIES get_lppaca()->pmcregs_in_use = inuse; #endif - } else { + } #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - get_paca()->pmcregs_in_use = inuse; + get_paca()->pmcregs_in_use = inuse; #endif - } #endif } diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index e4016985764e..818989e11678 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -46,7 +46,15 @@ struct xive_irq_data { /* Setup/used by frontend */ int target; + /* + * saved_p means that there is a queue entry for this interrupt + * in some CPU's queue (not including guest vcpu queues), even + * if P is not set in the source ESB. + * stale_p means that there is no queue entry for this interrupt + * in some CPU's queue, even if P is set in the source ESB. + */ bool saved_p; + bool stale_p; }; #define XIVE_IRQ_FLAG_STORE_EOI 0x01 #define XIVE_IRQ_FLAG_LSI 0x02 @@ -127,6 +135,7 @@ extern int xive_native_get_queue_state(u32 vp_id, uint32_t prio, u32 *qtoggle, extern int xive_native_set_queue_state(u32 vp_id, uint32_t prio, u32 qtoggle, u32 qindex); extern int xive_native_get_vp_state(u32 vp_id, u64 *out_state); +extern bool xive_native_has_queue_state_support(void); #else diff --git a/arch/powerpc/include/uapi/asm/bpf_perf_event.h b/arch/powerpc/include/uapi/asm/bpf_perf_event.h index b551b741653d..5e1e648aeec4 100644 --- a/arch/powerpc/include/uapi/asm/bpf_perf_event.h +++ b/arch/powerpc/include/uapi/asm/bpf_perf_event.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI__ASM_BPF_PERF_EVENT_H__ #define _UAPI__ASM_BPF_PERF_EVENT_H__ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 56dfa7a2a6f2..ea0c69236789 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -49,7 +49,8 @@ obj-y := cputable.o ptrace.o syscalls.o \ signal.o sysfs.o cacheinfo.o time.o \ prom.o traps.o setup-common.o \ udbg.o misc.o io.o misc_$(BITS).o \ - of_platform.o prom_parse.o + of_platform.o prom_parse.o \ + dma-common.o obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \ signal_64.o ptrace32.o \ paca.o nvram_64.o firmware.o diff --git a/arch/powerpc/kernel/dma-common.c b/arch/powerpc/kernel/dma-common.c new file mode 100644 index 000000000000..dc7ef6b17b69 --- /dev/null +++ b/arch/powerpc/kernel/dma-common.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Contains common dma routines for all powerpc platforms. + * + * Copyright (C) 2019 Shawn Anastasio. + */ + +#include <linux/mm.h> +#include <linux/dma-noncoherent.h> + +pgprot_t arch_dma_mmap_pgprot(struct device *dev, pgprot_t prot, + unsigned long attrs) +{ + if (!dev_is_dma_coherent(dev)) + return pgprot_noncached(prot); + return prot; +} diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index eee5bef736c8..6ba3cc2ef8ab 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1531,7 +1531,7 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) * * Call convention: * - * syscall register convention is in Documentation/powerpc/syscall64-abi.txt + * syscall register convention is in Documentation/powerpc/syscall64-abi.rst * * For hypercalls, the register convention is as follows: * r0 volatile diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index f50b708d6d77..98600b276f76 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -1198,6 +1198,9 @@ SYSCALL_DEFINE0(rt_sigreturn) goto bad; if (MSR_TM_ACTIVE(msr_hi<<32)) { + /* Trying to start TM on non TM system */ + if (!cpu_has_feature(CPU_FTR_TM)) + goto bad; /* We only recheckpoint on return if we're * transaction. */ diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index 2f80e270c7b0..117515564ec7 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -771,6 +771,11 @@ SYSCALL_DEFINE0(rt_sigreturn) if (MSR_TM_ACTIVE(msr)) { /* We recheckpoint on return. */ struct ucontext __user *uc_transact; + + /* Trying to start TM on non TM system */ + if (!cpu_has_feature(CPU_FTR_TM)) + goto badframe; + if (__get_user(uc_transact, &uc->uc_link)) goto badframe; if (restore_tm_sigcontexts(current, &uc->uc_mcontext, diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 9524d92bc45d..d7fcdfa7fee4 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -1083,9 +1083,11 @@ static int kvmppc_book3s_init(void) if (xics_on_xive()) { kvmppc_xive_init_module(); kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS); - kvmppc_xive_native_init_module(); - kvm_register_device_ops(&kvm_xive_native_ops, - KVM_DEV_TYPE_XIVE); + if (kvmppc_xive_native_supported()) { + kvmppc_xive_native_init_module(); + kvm_register_device_ops(&kvm_xive_native_ops, + KVM_DEV_TYPE_XIVE); + } } else #endif kvm_register_device_ops(&kvm_xics_ops, KVM_DEV_TYPE_XICS); diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c index 653936177857..18f244aad7aa 100644 --- a/arch/powerpc/kvm/book3s_32_mmu.c +++ b/arch/powerpc/kvm/book3s_32_mmu.c @@ -239,6 +239,7 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr, case 2: case 6: pte->may_write = true; + /* fall through */ case 3: case 5: case 7: diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index ec1804f822af..f8975c620f41 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1678,7 +1678,14 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, *val = get_reg_val(id, vcpu->arch.pspb); break; case KVM_REG_PPC_DPDES: - *val = get_reg_val(id, vcpu->arch.vcore->dpdes); + /* + * On POWER9, where we are emulating msgsndp etc., + * we return 1 bit for each vcpu, which can come from + * either vcore->dpdes or doorbell_request. + * On POWER8, doorbell_request is 0. + */ + *val = get_reg_val(id, vcpu->arch.vcore->dpdes | + vcpu->arch.doorbell_request); break; case KVM_REG_PPC_VTB: *val = get_reg_val(id, vcpu->arch.vcore->vtb); @@ -2860,7 +2867,7 @@ static void collect_piggybacks(struct core_info *cip, int target_threads) if (!spin_trylock(&pvc->lock)) continue; prepare_threads(pvc); - if (!pvc->n_runnable) { + if (!pvc->n_runnable || !pvc->kvm->arch.mmu_ready) { list_del_init(&pvc->preempt_list); if (pvc->runner == NULL) { pvc->vcore_state = VCORE_INACTIVE; @@ -2881,15 +2888,20 @@ static void collect_piggybacks(struct core_info *cip, int target_threads) spin_unlock(&lp->lock); } -static bool recheck_signals(struct core_info *cip) +static bool recheck_signals_and_mmu(struct core_info *cip) { int sub, i; struct kvm_vcpu *vcpu; + struct kvmppc_vcore *vc; - for (sub = 0; sub < cip->n_subcores; ++sub) - for_each_runnable_thread(i, vcpu, cip->vc[sub]) + for (sub = 0; sub < cip->n_subcores; ++sub) { + vc = cip->vc[sub]; + if (!vc->kvm->arch.mmu_ready) + return true; + for_each_runnable_thread(i, vcpu, vc) if (signal_pending(vcpu->arch.run_task)) return true; + } return false; } @@ -3119,7 +3131,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) local_irq_disable(); hard_irq_disable(); if (lazy_irq_pending() || need_resched() || - recheck_signals(&core_info) || !vc->kvm->arch.mmu_ready) { + recheck_signals_and_mmu(&core_info)) { local_irq_enable(); vc->vcore_state = VCORE_INACTIVE; /* Unlock all except the primary vcore */ @@ -3569,9 +3581,18 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb()); if (kvmhv_on_pseries()) { + /* + * We need to save and restore the guest visible part of the + * psscr (i.e. using SPRN_PSSCR_PR) since the hypervisor + * doesn't do this for us. Note only required if pseries since + * this is done in kvmhv_load_hv_regs_and_go() below otherwise. + */ + unsigned long host_psscr; /* call our hypervisor to load up HV regs and go */ struct hv_guest_state hvregs; + host_psscr = mfspr(SPRN_PSSCR_PR); + mtspr(SPRN_PSSCR_PR, vcpu->arch.psscr); kvmhv_save_hv_regs(vcpu, &hvregs); hvregs.lpcr = lpcr; vcpu->arch.regs.msr = vcpu->arch.shregs.msr; @@ -3590,6 +3611,8 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, vcpu->arch.shregs.msr = vcpu->arch.regs.msr; vcpu->arch.shregs.dar = mfspr(SPRN_DAR); vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR); + vcpu->arch.psscr = mfspr(SPRN_PSSCR_PR); + mtspr(SPRN_PSSCR_PR, host_psscr); /* H_CEDE has to be handled now, not later */ if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested && @@ -3654,6 +3677,8 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, vcpu->arch.vpa.dirty = 1; save_pmu = lp->pmcregs_in_use; } + /* Must save pmu if this guest is capable of running nested guests */ + save_pmu |= nesting_enabled(vcpu->kvm); kvmhv_save_guest_pmu(vcpu, save_pmu); diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 63e0ce91e29d..7186c65c61c9 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -99,7 +99,7 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, } else { rev->forw = rev->back = pte_index; *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | - pte_index | KVMPPC_RMAP_PRESENT; + pte_index | KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_HPT; } unlock_rmap(rmap); } diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 337e64468d78..07181d0dfcb7 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -942,6 +942,8 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) ld r11, VCPU_XIVE_SAVED_STATE(r4) li r9, TM_QW1_OS lwz r8, VCPU_XIVE_CAM_WORD(r4) + cmpwi r8, 0 + beq no_xive li r7, TM_QW1_OS + TM_WORD2 mfmsr r0 andi. r0, r0, MSR_DR /* in real mode? */ @@ -2831,29 +2833,39 @@ kvm_cede_prodded: kvm_cede_exit: ld r9, HSTATE_KVM_VCPU(r13) #ifdef CONFIG_KVM_XICS - /* Abort if we still have a pending escalation */ + /* are we using XIVE with single escalation? */ + ld r10, VCPU_XIVE_ESC_VADDR(r9) + cmpdi r10, 0 + beq 3f + li r6, XIVE_ESB_SET_PQ_00 + /* + * If we still have a pending escalation, abort the cede, + * and we must set PQ to 10 rather than 00 so that we don't + * potentially end up with two entries for the escalation + * interrupt in the XIVE interrupt queue. In that case + * we also don't want to set xive_esc_on to 1 here in + * case we race with xive_esc_irq(). + */ lbz r5, VCPU_XIVE_ESC_ON(r9) cmpwi r5, 0 - beq 1f + beq 4f li r0, 0 stb r0, VCPU_CEDED(r9) -1: /* Enable XIVE escalation */ - li r5, XIVE_ESB_SET_PQ_00 + li r6, XIVE_ESB_SET_PQ_10 + b 5f +4: li r0, 1 + stb r0, VCPU_XIVE_ESC_ON(r9) + /* make sure store to xive_esc_on is seen before xive_esc_irq runs */ + sync +5: /* Enable XIVE escalation */ mfmsr r0 andi. r0, r0, MSR_DR /* in real mode? */ beq 1f - ld r10, VCPU_XIVE_ESC_VADDR(r9) - cmpdi r10, 0 - beq 3f - ldx r0, r10, r5 + ldx r0, r10, r6 b 2f 1: ld r10, VCPU_XIVE_ESC_RADDR(r9) - cmpdi r10, 0 - beq 3f - ldcix r0, r10, r5 + ldcix r0, r10, r6 2: sync - li r0, 1 - stb r0, VCPU_XIVE_ESC_ON(r9) #endif /* CONFIG_KVM_XICS */ 3: b guest_exit_cont diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 6ca0d7376a9f..591bfb4bfd0f 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -67,8 +67,14 @@ void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt; u64 pq; - if (!tima) + /* + * Nothing to do if the platform doesn't have a XIVE + * or this vCPU doesn't have its own XIVE context + * (e.g. because it's not using an in-kernel interrupt controller). + */ + if (!tima || !vcpu->arch.xive_cam_word) return; + eieio(); __raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS); __raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2); @@ -160,6 +166,9 @@ static irqreturn_t xive_esc_irq(int irq, void *data) */ vcpu->arch.xive_esc_on = false; + /* This orders xive_esc_on = false vs. subsequent stale_p = true */ + smp_wmb(); /* goes with smp_mb() in cleanup_single_escalation */ + return IRQ_HANDLED; } @@ -1113,6 +1122,31 @@ void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu) vcpu->arch.xive_esc_raddr = 0; } +/* + * In single escalation mode, the escalation interrupt is marked so + * that EOI doesn't re-enable it, but just sets the stale_p flag to + * indicate that the P bit has already been dealt with. However, the + * assembly code that enters the guest sets PQ to 00 without clearing + * stale_p (because it has no easy way to address it). Hence we have + * to adjust stale_p before shutting down the interrupt. + */ +void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu, + struct kvmppc_xive_vcpu *xc, int irq) +{ + struct irq_data *d = irq_get_irq_data(irq); + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + + /* + * This slightly odd sequence gives the right result + * (i.e. stale_p set if xive_esc_on is false) even if + * we race with xive_esc_irq() and xive_irq_eoi(). + */ + xd->stale_p = false; + smp_mb(); /* paired with smb_wmb in xive_esc_irq */ + if (!vcpu->arch.xive_esc_on) + xd->stale_p = true; +} + void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) { struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; @@ -1134,20 +1168,28 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) /* Mask the VP IPI */ xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_01); - /* Disable the VP */ - xive_native_disable_vp(xc->vp_id); - - /* Free the queues & associated interrupts */ + /* Free escalations */ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { - struct xive_q *q = &xc->queues[i]; - - /* Free the escalation irq */ if (xc->esc_virq[i]) { + if (xc->xive->single_escalation) + xive_cleanup_single_escalation(vcpu, xc, + xc->esc_virq[i]); free_irq(xc->esc_virq[i], vcpu); irq_dispose_mapping(xc->esc_virq[i]); kfree(xc->esc_virq_names[i]); } - /* Free the queue */ + } + + /* Disable the VP */ + xive_native_disable_vp(xc->vp_id); + + /* Clear the cam word so guest entry won't try to push context */ + vcpu->arch.xive_cam_word = 0; + + /* Free the queues */ + for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { + struct xive_q *q = &xc->queues[i]; + xive_native_disable_queue(xc->vp_id, q, i); if (q->qpage) { free_pages((unsigned long)q->qpage, @@ -1986,10 +2028,8 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) xive->single_escalation = xive_native_has_single_escalation(); - if (ret) { - kfree(xive); + if (ret) return ret; - } return 0; } diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h index 50494d0ee375..955b820ffd6d 100644 --- a/arch/powerpc/kvm/book3s_xive.h +++ b/arch/powerpc/kvm/book3s_xive.h @@ -282,6 +282,8 @@ int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio); int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio, bool single_escalation); struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type); +void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu, + struct kvmppc_xive_vcpu *xc, int irq); #endif /* CONFIG_KVM_XICS */ #endif /* _KVM_PPC_BOOK3S_XICS_H */ diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 5596c8ec221a..248c1ea9e788 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -67,20 +67,28 @@ void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) xc->valid = false; kvmppc_xive_disable_vcpu_interrupts(vcpu); - /* Disable the VP */ - xive_native_disable_vp(xc->vp_id); - - /* Free the queues & associated interrupts */ + /* Free escalations */ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { /* Free the escalation irq */ if (xc->esc_virq[i]) { + if (xc->xive->single_escalation) + xive_cleanup_single_escalation(vcpu, xc, + xc->esc_virq[i]); free_irq(xc->esc_virq[i], vcpu); irq_dispose_mapping(xc->esc_virq[i]); kfree(xc->esc_virq_names[i]); xc->esc_virq[i] = 0; } + } + + /* Disable the VP */ + xive_native_disable_vp(xc->vp_id); - /* Free the queue */ + /* Clear the cam word so guest entry won't try to push context */ + vcpu->arch.xive_cam_word = 0; + + /* Free the queues */ + for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { kvmppc_xive_native_cleanup_queue(vcpu, i); } @@ -1090,9 +1098,9 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type) xive->ops = &kvmppc_xive_native_ops; if (ret) - kfree(xive); + return ret; - return ret; + return 0; } /* @@ -1171,6 +1179,11 @@ int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val) return 0; } +bool kvmppc_xive_native_supported(void) +{ + return xive_native_has_queue_state_support(); +} + static int xive_native_debug_show(struct seq_file *m, void *private) { struct kvmppc_xive *xive = m->private; diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index bb4d09c1ad56..6fca38ca791f 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -271,6 +271,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) */ if (inst == KVMPPC_INST_SW_BREAKPOINT) { run->exit_reason = KVM_EXIT_DEBUG; + run->debug.arch.status = 0; run->debug.arch.address = kvmppc_get_pc(vcpu); emulated = EMULATE_EXIT_USER; advance = 0; diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c index 9208c82ed08d..2e496eb86e94 100644 --- a/arch/powerpc/kvm/emulate_loadstore.c +++ b/arch/powerpc/kvm/emulate_loadstore.c @@ -89,12 +89,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) rs = get_rs(inst); rt = get_rt(inst); - /* - * if mmio_vsx_tx_sx_enabled == 0, copy data between - * VSR[0..31] and memory - * if mmio_vsx_tx_sx_enabled == 1, copy data between - * VSR[32..63] and memory - */ vcpu->arch.mmio_vsx_copy_nums = 0; vcpu->arch.mmio_vsx_offset = 0; vcpu->arch.mmio_copy_type = KVMPPC_VSX_COPY_NONE; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 3e566c2e6066..3a77bb643452 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -561,7 +561,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) * a POWER9 processor) and the PowerNV platform, as * nested is not yet supported. */ - r = xive_enabled() && !!cpu_has_feature(CPU_FTR_HVMODE); + r = xive_enabled() && !!cpu_has_feature(CPU_FTR_HVMODE) && + kvmppc_xive_native_supported(); break; #endif diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 9a5963e07a82..b8ad14bb1170 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1899,11 +1899,20 @@ void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, * * For guests on platforms before POWER9, we clamp the it limit to 1G * to avoid some funky things such as RTAS bugs etc... + * + * On POWER9 we limit to 1TB in case the host erroneously told us that + * the RMA was >1TB. Effective address bits 0:23 are treated as zero + * (meaning the access is aliased to zero i.e. addr = addr % 1TB) + * for virtual real mode addressing and so it doesn't make sense to + * have an area larger than 1TB as it can't be addressed. */ if (!early_cpu_has_feature(CPU_FTR_HVMODE)) { ppc64_rma_size = first_memblock_size; if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) ppc64_rma_size = min_t(u64, ppc64_rma_size, 0x40000000); + else + ppc64_rma_size = min_t(u64, ppc64_rma_size, + 1UL << SID_SHIFT_1T); /* Finally limit subsequent allocations */ memblock_set_current_limit(ppc64_rma_size); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 9259337d7374..9191a66b3bc5 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -239,7 +239,7 @@ void __init paging_init(void) #ifdef CONFIG_ZONE_DMA max_zone_pfns[ZONE_DMA] = min(max_low_pfn, - ((1UL << ARCH_ZONE_DMA_BITS) - 1) >> PAGE_SHIFT); + 1UL << (ARCH_ZONE_DMA_BITS - PAGE_SHIFT)); #endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index c8ec670ee924..2c07908359b2 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -11,6 +11,7 @@ #include <linux/sched.h> #include <linux/libnvdimm.h> #include <linux/platform_device.h> +#include <linux/delay.h> #include <asm/plpar_wrappers.h> @@ -43,8 +44,9 @@ struct papr_scm_priv { static int drc_pmem_bind(struct papr_scm_priv *p) { unsigned long ret[PLPAR_HCALL_BUFSIZE]; - uint64_t rc, token; uint64_t saved = 0; + uint64_t token; + int64_t rc; /* * When the hypervisor cannot map all the requested memory in a single @@ -64,6 +66,10 @@ static int drc_pmem_bind(struct papr_scm_priv *p) } while (rc == H_BUSY); if (rc) { + /* H_OVERLAP needs a separate error path */ + if (rc == H_OVERLAP) + return -EBUSY; + dev_err(&p->pdev->dev, "bind err: %lld\n", rc); return -ENXIO; } @@ -78,22 +84,36 @@ static int drc_pmem_bind(struct papr_scm_priv *p) static int drc_pmem_unbind(struct papr_scm_priv *p) { unsigned long ret[PLPAR_HCALL_BUFSIZE]; - uint64_t rc, token; + uint64_t token = 0; + int64_t rc; - token = 0; + dev_dbg(&p->pdev->dev, "unbind drc %x\n", p->drc_index); - /* NB: unbind has the same retry requirements mentioned above */ + /* NB: unbind has the same retry requirements as drc_pmem_bind() */ do { - rc = plpar_hcall(H_SCM_UNBIND_MEM, ret, p->drc_index, - p->bound_addr, p->blocks, token); + + /* Unbind of all SCM resources associated with drcIndex */ + rc = plpar_hcall(H_SCM_UNBIND_ALL, ret, H_UNBIND_SCOPE_DRC, + p->drc_index, token); token = ret[0]; - cond_resched(); + + /* Check if we are stalled for some time */ + if (H_IS_LONG_BUSY(rc)) { + msleep(get_longbusy_msecs(rc)); + rc = H_BUSY; + } else if (rc == H_BUSY) { + cond_resched(); + } + } while (rc == H_BUSY); if (rc) dev_err(&p->pdev->dev, "unbind error: %lld\n", rc); + else + dev_dbg(&p->pdev->dev, "unbind drc %x complete\n", + p->drc_index); - return !!rc; + return rc == H_SUCCESS ? 0 : -ENXIO; } static int papr_scm_meta_get(struct papr_scm_priv *p, @@ -389,6 +409,14 @@ static int papr_scm_probe(struct platform_device *pdev) /* request the hypervisor to bind this region to somewhere in memory */ rc = drc_pmem_bind(p); + + /* If phyp says drc memory still bound then force unbound and retry */ + if (rc == -EBUSY) { + dev_warn(&pdev->dev, "Retrying bind after unbinding\n"); + drc_pmem_unbind(p); + rc = drc_pmem_bind(p); + } + if (rc) goto err; diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 082c7e1c20f0..be86fce1a84e 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -135,7 +135,7 @@ static u32 xive_read_eq(struct xive_q *q, bool just_peek) static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek) { u32 irq = 0; - u8 prio; + u8 prio = 0; /* Find highest pending priority */ while (xc->pending_prio != 0) { @@ -148,8 +148,19 @@ static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek) irq = xive_read_eq(&xc->queue[prio], just_peek); /* Found something ? That's it */ - if (irq) - break; + if (irq) { + if (just_peek || irq_to_desc(irq)) + break; + /* + * We should never get here; if we do then we must + * have failed to synchronize the interrupt properly + * when shutting it down. + */ + pr_crit("xive: got interrupt %d without descriptor, dropping\n", + irq); + WARN_ON(1); + continue; + } /* Clear pending bits */ xc->pending_prio &= ~(1 << prio); @@ -307,6 +318,7 @@ static void xive_do_queue_eoi(struct xive_cpu *xc) */ static void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd) { + xd->stale_p = false; /* If the XIVE supports the new "store EOI facility, use it */ if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI) xive_esb_write(xd, XIVE_ESB_STORE_EOI, 0); @@ -350,7 +362,7 @@ static void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd) } } -/* irq_chip eoi callback */ +/* irq_chip eoi callback, called with irq descriptor lock held */ static void xive_irq_eoi(struct irq_data *d) { struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); @@ -366,6 +378,8 @@ static void xive_irq_eoi(struct irq_data *d) if (!irqd_irq_disabled(d) && !irqd_is_forwarded_to_vcpu(d) && !(xd->flags & XIVE_IRQ_NO_EOI)) xive_do_source_eoi(irqd_to_hwirq(d), xd); + else + xd->stale_p = true; /* * Clear saved_p to indicate that it's no longer occupying @@ -397,11 +411,16 @@ static void xive_do_source_set_mask(struct xive_irq_data *xd, */ if (mask) { val = xive_esb_read(xd, XIVE_ESB_SET_PQ_01); - xd->saved_p = !!(val & XIVE_ESB_VAL_P); - } else if (xd->saved_p) + if (!xd->stale_p && !!(val & XIVE_ESB_VAL_P)) + xd->saved_p = true; + xd->stale_p = false; + } else if (xd->saved_p) { xive_esb_read(xd, XIVE_ESB_SET_PQ_10); - else + xd->saved_p = false; + } else { xive_esb_read(xd, XIVE_ESB_SET_PQ_00); + xd->stale_p = false; + } } /* @@ -479,7 +498,7 @@ static int xive_find_target_in_mask(const struct cpumask *mask, * Now go through the entire mask until we find a valid * target. */ - for (;;) { + do { /* * We re-check online as the fallback case passes us * an untested affinity mask @@ -487,12 +506,11 @@ static int xive_find_target_in_mask(const struct cpumask *mask, if (cpu_online(cpu) && xive_try_pick_target(cpu)) return cpu; cpu = cpumask_next(cpu, mask); - if (cpu == first) - break; /* Wrap around */ if (cpu >= nr_cpu_ids) cpu = cpumask_first(mask); - } + } while (cpu != first); + return -1; } @@ -542,6 +560,8 @@ static unsigned int xive_irq_startup(struct irq_data *d) unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); int target, rc; + xd->saved_p = false; + xd->stale_p = false; pr_devel("xive_irq_startup: irq %d [0x%x] data @%p\n", d->irq, hw_irq, d); @@ -588,6 +608,7 @@ static unsigned int xive_irq_startup(struct irq_data *d) return 0; } +/* called with irq descriptor lock held */ static void xive_irq_shutdown(struct irq_data *d) { struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); @@ -603,16 +624,6 @@ static void xive_irq_shutdown(struct irq_data *d) xive_do_source_set_mask(xd, true); /* - * The above may have set saved_p. We clear it otherwise it - * will prevent re-enabling later on. It is ok to forget the - * fact that the interrupt might be in a queue because we are - * accounting that already in xive_dec_target_count() and will - * be re-routing it to a new queue with proper accounting when - * it's started up again - */ - xd->saved_p = false; - - /* * Mask the interrupt in HW in the IVT/EAS and set the number * to be the "bad" IRQ number */ @@ -798,6 +809,10 @@ static int xive_irq_retrigger(struct irq_data *d) return 1; } +/* + * Caller holds the irq descriptor lock, so this won't be called + * concurrently with xive_get_irqchip_state on the same interrupt. + */ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) { struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); @@ -821,6 +836,10 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) /* Set it to PQ=10 state to prevent further sends */ pq = xive_esb_read(xd, XIVE_ESB_SET_PQ_10); + if (!xd->stale_p) { + xd->saved_p = !!(pq & XIVE_ESB_VAL_P); + xd->stale_p = !xd->saved_p; + } /* No target ? nothing to do */ if (xd->target == XIVE_INVALID_TARGET) { @@ -828,7 +847,7 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) * An untargetted interrupt should have been * also masked at the source */ - WARN_ON(pq & 2); + WARN_ON(xd->saved_p); return 0; } @@ -848,9 +867,8 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) * This saved_p is cleared by the host EOI, when we know * for sure the queue slot is no longer in use. */ - if (pq & 2) { - pq = xive_esb_read(xd, XIVE_ESB_SET_PQ_11); - xd->saved_p = true; + if (xd->saved_p) { + xive_esb_read(xd, XIVE_ESB_SET_PQ_11); /* * Sync the XIVE source HW to ensure the interrupt @@ -863,8 +881,7 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) */ if (xive_ops->sync_source) xive_ops->sync_source(hw_irq); - } else - xd->saved_p = false; + } } else { irqd_clr_forwarded_to_vcpu(d); @@ -915,6 +932,23 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) return 0; } +/* Called with irq descriptor lock held. */ +static int xive_get_irqchip_state(struct irq_data *data, + enum irqchip_irq_state which, bool *state) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(data); + + switch (which) { + case IRQCHIP_STATE_ACTIVE: + *state = !xd->stale_p && + (xd->saved_p || + !!(xive_esb_read(xd, XIVE_ESB_GET) & XIVE_ESB_VAL_P)); + return 0; + default: + return -EINVAL; + } +} + static struct irq_chip xive_irq_chip = { .name = "XIVE-IRQ", .irq_startup = xive_irq_startup, @@ -926,6 +960,7 @@ static struct irq_chip xive_irq_chip = { .irq_set_type = xive_irq_set_type, .irq_retrigger = xive_irq_retrigger, .irq_set_vcpu_affinity = xive_irq_set_vcpu_affinity, + .irq_get_irqchip_state = xive_get_irqchip_state, }; bool is_xive_irq(struct irq_chip *chip) @@ -1339,6 +1374,11 @@ static void xive_flush_cpu_queue(unsigned int cpu, struct xive_cpu *xc) xd = irq_desc_get_handler_data(desc); /* + * Clear saved_p to indicate that it's no longer pending + */ + xd->saved_p = false; + + /* * For LSIs, we EOI, this will cause a resend if it's * still asserted. Otherwise do an MSI retrigger. */ diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index 2f26b74f6cfa..37987c815913 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -800,6 +800,13 @@ int xive_native_set_queue_state(u32 vp_id, u32 prio, u32 qtoggle, u32 qindex) } EXPORT_SYMBOL_GPL(xive_native_set_queue_state); +bool xive_native_has_queue_state_support(void) +{ + return opal_check_token(OPAL_XIVE_GET_QUEUE_STATE) && + opal_check_token(OPAL_XIVE_SET_QUEUE_STATE); +} +EXPORT_SYMBOL_GPL(xive_native_has_queue_state_support); + int xive_native_get_vp_state(u32 vp_id, u64 *out_state) { __be64 state; |