diff options
author | Tejun Heo <tj@kernel.org> | 2011-11-28 21:46:22 +0400 |
---|---|---|
committer | Tejun Heo <tj@kernel.org> | 2011-11-28 21:46:22 +0400 |
commit | d4bbf7e7759afc172e2bfbc5c416324590049cdd (patch) | |
tree | 7eab5ee5481cd3dcf1162329fec827177640018a /arch/powerpc/kvm | |
parent | a150439c4a97db379f0ed6faa46fbbb6e7bf3cb2 (diff) | |
parent | 401d0069cb344f401bc9d264c31db55876ff78c0 (diff) | |
download | linux-d4bbf7e7759afc172e2bfbc5c416324590049cdd.tar.xz |
Merge branch 'master' into x86/memblock
Conflicts & resolutions:
* arch/x86/xen/setup.c
dc91c728fd "xen: allow extra memory to be in multiple regions"
24aa07882b "memblock, x86: Replace memblock_x86_reserve/free..."
conflicted on xen_add_extra_mem() updates. The resolution is
trivial as the latter just want to replace
memblock_x86_reserve_range() with memblock_reserve().
* drivers/pci/intel-iommu.c
166e9278a3f "x86/ia64: intel-iommu: move to drivers/iommu/"
5dfe8660a3d "bootmem: Replace work_with_active_regions() with..."
conflicted as the former moved the file under drivers/iommu/.
Resolved by applying the chnages from the latter on the moved
file.
* mm/Kconfig
6661672053a "memblock: add NO_BOOTMEM config symbol"
c378ddd53f9 "memblock, x86: Make ARCH_DISCARD_MEMBLOCK a config option"
conflicted trivially. Both added config options. Just
letting both add their own options resolves the conflict.
* mm/memblock.c
d1f0ece6cdc "mm/memblock.c: small function definition fixes"
ed7b56a799c "memblock: Remove memblock_memory_can_coalesce()"
confliected. The former updates function removed by the
latter. Resolution is trivial.
Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'arch/powerpc/kvm')
33 files changed, 6325 insertions, 1663 deletions
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c index da3a1225c0ac..7b612a76c701 100644 --- a/arch/powerpc/kvm/44x.c +++ b/arch/powerpc/kvm/44x.c @@ -20,6 +20,7 @@ #include <linux/kvm_host.h> #include <linux/slab.h> #include <linux/err.h> +#include <linux/export.h> #include <asm/reg.h> #include <asm/cputable.h> @@ -78,6 +79,8 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) for (i = 0; i < ARRAY_SIZE(vcpu_44x->shadow_refs); i++) vcpu_44x->shadow_refs[i].gtlb_index = -1; + vcpu->arch.cpu_type = KVM_CPU_440; + return 0; } diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c index 5f3cff83e089..33aa715dab28 100644 --- a/arch/powerpc/kvm/44x_tlb.c +++ b/arch/powerpc/kvm/44x_tlb.c @@ -387,8 +387,10 @@ static void kvmppc_44x_invalidate(struct kvm_vcpu *vcpu, } } -void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode) +void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr) { + int usermode = vcpu->arch.shared->msr & MSR_PR; + vcpu->arch.shadow_pid = !usermode; } diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index b7baff78f90c..78133deb4b64 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -20,7 +20,6 @@ config KVM bool select PREEMPT_NOTIFIERS select ANON_INODES - select KVM_MMIO config KVM_BOOK3S_HANDLER bool @@ -28,16 +27,22 @@ config KVM_BOOK3S_HANDLER config KVM_BOOK3S_32_HANDLER bool select KVM_BOOK3S_HANDLER + select KVM_MMIO config KVM_BOOK3S_64_HANDLER bool select KVM_BOOK3S_HANDLER +config KVM_BOOK3S_PR + bool + select KVM_MMIO + config KVM_BOOK3S_32 tristate "KVM support for PowerPC book3s_32 processors" depends on EXPERIMENTAL && PPC_BOOK3S_32 && !SMP && !PTE_64BIT select KVM select KVM_BOOK3S_32_HANDLER + select KVM_BOOK3S_PR ---help--- Support running unmodified book3s_32 guest kernels in virtual machines on book3s_32 host processors. @@ -50,8 +55,8 @@ config KVM_BOOK3S_32 config KVM_BOOK3S_64 tristate "KVM support for PowerPC book3s_64 processors" depends on EXPERIMENTAL && PPC_BOOK3S_64 - select KVM select KVM_BOOK3S_64_HANDLER + select KVM ---help--- Support running unmodified book3s_64 and book3s_32 guest kernels in virtual machines on book3s_64 host processors. @@ -61,10 +66,34 @@ config KVM_BOOK3S_64 If unsure, say N. +config KVM_BOOK3S_64_HV + bool "KVM support for POWER7 and PPC970 using hypervisor mode in host" + depends on KVM_BOOK3S_64 + ---help--- + Support running unmodified book3s_64 guest kernels in + virtual machines on POWER7 and PPC970 processors that have + hypervisor mode available to the host. + + If you say Y here, KVM will use the hardware virtualization + facilities of POWER7 (and later) processors, meaning that + guest operating systems will run at full hardware speed + using supervisor and user modes. However, this also means + that KVM is not usable under PowerVM (pHyp), is only usable + on POWER7 (or later) processors and PPC970-family processors, + and cannot emulate a different processor from the host processor. + + If unsure, say N. + +config KVM_BOOK3S_64_PR + def_bool y + depends on KVM_BOOK3S_64 && !KVM_BOOK3S_64_HV + select KVM_BOOK3S_PR + config KVM_440 bool "KVM support for PowerPC 440 processors" depends on EXPERIMENTAL && 44x select KVM + select KVM_MMIO ---help--- Support running unmodified 440 guest kernels in virtual machines on 440 host processors. @@ -89,6 +118,7 @@ config KVM_E500 bool "KVM support for PowerPC E500 processors" depends on EXPERIMENTAL && E500 select KVM + select KVM_MMIO ---help--- Support running unmodified E500 guest kernels in virtual machines on E500 host processors. @@ -99,6 +129,5 @@ config KVM_E500 If unsure, say N. source drivers/vhost/Kconfig -source drivers/virtio/Kconfig endif # VIRTUALIZATION diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 4d6863823f69..3688aeecc4b2 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -38,24 +38,46 @@ kvm-e500-objs := \ e500_emulate.o kvm-objs-$(CONFIG_KVM_E500) := $(kvm-e500-objs) -kvm-book3s_64-objs := \ - $(common-objs-y) \ +kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \ + ../../../virt/kvm/coalesced_mmio.o \ fpu.o \ book3s_paired_singles.o \ - book3s.o \ + book3s_pr.o \ + book3s_pr_papr.o \ book3s_emulate.o \ book3s_interrupts.o \ book3s_mmu_hpte.o \ book3s_64_mmu_host.o \ book3s_64_mmu.o \ book3s_32_mmu.o -kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-objs) +kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \ + book3s_rmhandlers.o + +kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ + book3s_hv.o \ + book3s_hv_interrupts.o \ + book3s_64_mmu_hv.o +kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ + book3s_hv_rmhandlers.o \ + book3s_hv_rm_mmu.o \ + book3s_64_vio_hv.o \ + book3s_hv_builtin.o + +kvm-book3s_64-module-objs := \ + ../../../virt/kvm/kvm_main.o \ + powerpc.o \ + emulate.o \ + book3s.o \ + $(kvm-book3s_64-objs-y) + +kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs) kvm-book3s_32-objs := \ $(common-objs-y) \ fpu.o \ book3s_paired_singles.o \ book3s.o \ + book3s_pr.o \ book3s_emulate.o \ book3s_interrupts.o \ book3s_mmu_hpte.o \ @@ -70,3 +92,4 @@ obj-$(CONFIG_KVM_E500) += kvm.o obj-$(CONFIG_KVM_BOOK3S_64) += kvm.o obj-$(CONFIG_KVM_BOOK3S_32) += kvm.o +obj-y += $(kvm-book3s_64-builtin-objs-y) diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 0f95b5cce033..a459479995c6 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -16,8 +16,8 @@ #include <linux/kvm_host.h> #include <linux/err.h> +#include <linux/export.h> #include <linux/slab.h> -#include "trace.h" #include <asm/reg.h> #include <asm/cputable.h> @@ -28,25 +28,17 @@ #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> #include <asm/mmu_context.h> +#include <asm/page.h> #include <linux/gfp.h> #include <linux/sched.h> #include <linux/vmalloc.h> #include <linux/highmem.h> +#include "trace.h" + #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU /* #define EXIT_DEBUG */ -/* #define DEBUG_EXT */ - -static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, - ulong msr); - -/* Some compatibility defines */ -#ifdef CONFIG_PPC_BOOK3S_32 -#define MSR_USER32 MSR_USER -#define MSR_USER64 MSR_USER -#define HW_PAGE_SIZE PAGE_SIZE -#endif struct kvm_stats_debugfs_item debugfs_entries[] = { { "exits", VCPU_STAT(sum_exits) }, @@ -77,100 +69,11 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu) { } -void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) -{ -#ifdef CONFIG_PPC_BOOK3S_64 - memcpy(to_svcpu(vcpu)->slb, to_book3s(vcpu)->slb_shadow, sizeof(to_svcpu(vcpu)->slb)); - memcpy(&get_paca()->shadow_vcpu, to_book3s(vcpu)->shadow_vcpu, - sizeof(get_paca()->shadow_vcpu)); - to_svcpu(vcpu)->slb_max = to_book3s(vcpu)->slb_shadow_max; -#endif - -#ifdef CONFIG_PPC_BOOK3S_32 - current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu; -#endif -} - -void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) -{ -#ifdef CONFIG_PPC_BOOK3S_64 - memcpy(to_book3s(vcpu)->slb_shadow, to_svcpu(vcpu)->slb, sizeof(to_svcpu(vcpu)->slb)); - memcpy(to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu, - sizeof(get_paca()->shadow_vcpu)); - to_book3s(vcpu)->slb_shadow_max = to_svcpu(vcpu)->slb_max; -#endif - - kvmppc_giveup_ext(vcpu, MSR_FP); - kvmppc_giveup_ext(vcpu, MSR_VEC); - kvmppc_giveup_ext(vcpu, MSR_VSX); -} - -static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) -{ - ulong smsr = vcpu->arch.shared->msr; - - /* Guest MSR values */ - smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_DE; - /* Process MSR values */ - smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE; - /* External providers the guest reserved */ - smsr |= (vcpu->arch.shared->msr & vcpu->arch.guest_owned_ext); - /* 64-bit Process MSR values */ -#ifdef CONFIG_PPC_BOOK3S_64 - smsr |= MSR_ISF | MSR_HV; -#endif - vcpu->arch.shadow_msr = smsr; -} - -void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) -{ - ulong old_msr = vcpu->arch.shared->msr; - -#ifdef EXIT_DEBUG - printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr); -#endif - - msr &= to_book3s(vcpu)->msr_mask; - vcpu->arch.shared->msr = msr; - kvmppc_recalc_shadow_msr(vcpu); - - if (msr & MSR_POW) { - if (!vcpu->arch.pending_exceptions) { - kvm_vcpu_block(vcpu); - vcpu->stat.halt_wakeup++; - - /* Unset POW bit after we woke up */ - msr &= ~MSR_POW; - vcpu->arch.shared->msr = msr; - } - } - - if ((vcpu->arch.shared->msr & (MSR_PR|MSR_IR|MSR_DR)) != - (old_msr & (MSR_PR|MSR_IR|MSR_DR))) { - kvmppc_mmu_flush_segments(vcpu); - kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); - - /* Preload magic page segment when in kernel mode */ - if (!(msr & MSR_PR) && vcpu->arch.magic_page_pa) { - struct kvm_vcpu_arch *a = &vcpu->arch; - - if (msr & MSR_DR) - kvmppc_mmu_map_segment(vcpu, a->magic_page_ea); - else - kvmppc_mmu_map_segment(vcpu, a->magic_page_pa); - } - } - - /* Preload FPU if it's enabled */ - if (vcpu->arch.shared->msr & MSR_FP) - kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); -} - void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags) { vcpu->arch.shared->srr0 = kvmppc_get_pc(vcpu); vcpu->arch.shared->srr1 = vcpu->arch.shared->msr | flags; - kvmppc_set_pc(vcpu, to_book3s(vcpu)->hior + vec); + kvmppc_set_pc(vcpu, kvmppc_interrupt_offset(vcpu) + vec); vcpu->arch.mmu.reset_msr(vcpu); } @@ -204,11 +107,13 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec) static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) { + unsigned long old_pending = vcpu->arch.pending_exceptions; + clear_bit(kvmppc_book3s_vec2irqprio(vec), &vcpu->arch.pending_exceptions); - if (!vcpu->arch.pending_exceptions) - vcpu->arch.shared->int_pending = 0; + kvmppc_update_int_pending(vcpu, vcpu->arch.pending_exceptions, + old_pending); } void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) @@ -225,8 +130,8 @@ void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags) { - to_book3s(vcpu)->prog_flags = flags; - kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_PROGRAM); + /* might as well deliver this straight away */ + kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_PROGRAM, flags); } void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu) @@ -266,21 +171,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) { int deliver = 1; int vec = 0; - ulong flags = 0ULL; - ulong crit_raw = vcpu->arch.shared->critical; - ulong crit_r1 = kvmppc_get_gpr(vcpu, 1); - bool crit; - - /* Truncate crit indicators in 32 bit mode */ - if (!(vcpu->arch.shared->msr & MSR_SF)) { - crit_raw &= 0xffffffff; - crit_r1 &= 0xffffffff; - } - - /* Critical section when crit == r1 */ - crit = (crit_raw == crit_r1); - /* ... and we're in supervisor mode */ - crit = crit && !(vcpu->arch.shared->msr & MSR_PR); + bool crit = kvmppc_critical_section(vcpu); switch (priority) { case BOOK3S_IRQPRIO_DECREMENTER: @@ -315,7 +206,6 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) break; case BOOK3S_IRQPRIO_PROGRAM: vec = BOOK3S_INTERRUPT_PROGRAM; - flags = to_book3s(vcpu)->prog_flags; break; case BOOK3S_IRQPRIO_VSX: vec = BOOK3S_INTERRUPT_VSX; @@ -346,7 +236,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) #endif if (deliver) - kvmppc_inject_interrupt(vcpu, vec, flags); + kvmppc_inject_interrupt(vcpu, vec, 0); return deliver; } @@ -392,64 +282,7 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) } /* Tell the guest about our interrupt status */ - if (*pending) - vcpu->arch.shared->int_pending = 1; - else if (old_pending) - vcpu->arch.shared->int_pending = 0; -} - -void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) -{ - u32 host_pvr; - - vcpu->arch.hflags &= ~BOOK3S_HFLAG_SLB; - vcpu->arch.pvr = pvr; -#ifdef CONFIG_PPC_BOOK3S_64 - if ((pvr >= 0x330000) && (pvr < 0x70330000)) { - kvmppc_mmu_book3s_64_init(vcpu); - to_book3s(vcpu)->hior = 0xfff00000; - to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL; - } else -#endif - { - kvmppc_mmu_book3s_32_init(vcpu); - to_book3s(vcpu)->hior = 0; - to_book3s(vcpu)->msr_mask = 0xffffffffULL; - } - - /* If we are in hypervisor level on 970, we can tell the CPU to - * treat DCBZ as 32 bytes store */ - vcpu->arch.hflags &= ~BOOK3S_HFLAG_DCBZ32; - if (vcpu->arch.mmu.is_dcbz32(vcpu) && (mfmsr() & MSR_HV) && - !strcmp(cur_cpu_spec->platform, "ppc970")) - vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32; - - /* Cell performs badly if MSR_FEx are set. So let's hope nobody - really needs them in a VM on Cell and force disable them. */ - if (!strcmp(cur_cpu_spec->platform, "ppc-cell-be")) - to_book3s(vcpu)->msr_mask &= ~(MSR_FE0 | MSR_FE1); - -#ifdef CONFIG_PPC_BOOK3S_32 - /* 32 bit Book3S always has 32 byte dcbz */ - vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32; -#endif - - /* On some CPUs we can execute paired single operations natively */ - asm ( "mfpvr %0" : "=r"(host_pvr)); - switch (host_pvr) { - case 0x00080200: /* lonestar 2.0 */ - case 0x00088202: /* lonestar 2.2 */ - case 0x70000100: /* gekko 1.0 */ - case 0x00080100: /* gekko 2.0 */ - case 0x00083203: /* gekko 2.3a */ - case 0x00083213: /* gekko 2.3b */ - case 0x00083204: /* gekko 2.4 */ - case 0x00083214: /* gekko 2.4e (8SE) - retail HW2 */ - case 0x00087200: /* broadway */ - vcpu->arch.hflags |= BOOK3S_HFLAG_NATIVE_PS; - /* Enable HID2.PSE - in case we need it later */ - mtspr(SPRN_HID2_GEKKO, mfspr(SPRN_HID2_GEKKO) | (1 << 29)); - } + kvmppc_update_int_pending(vcpu, *pending, old_pending); } pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) @@ -471,44 +304,6 @@ pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) return gfn_to_pfn(vcpu->kvm, gfn); } -/* Book3s_32 CPUs always have 32 bytes cache line size, which Linux assumes. To - * make Book3s_32 Linux work on Book3s_64, we have to make sure we trap dcbz to - * emulate 32 bytes dcbz length. - * - * The Book3s_64 inventors also realized this case and implemented a special bit - * in the HID5 register, which is a hypervisor ressource. Thus we can't use it. - * - * My approach here is to patch the dcbz instruction on executing pages. - */ -static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) -{ - struct page *hpage; - u64 hpage_offset; - u32 *page; - int i; - - hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT); - if (is_error_page(hpage)) { - kvm_release_page_clean(hpage); - return; - } - - hpage_offset = pte->raddr & ~PAGE_MASK; - hpage_offset &= ~0xFFFULL; - hpage_offset /= 4; - - get_page(hpage); - page = kmap_atomic(hpage, KM_USER0); - - /* patch dcbz into reserved instruction, so we trap */ - for (i=hpage_offset; i < hpage_offset + (HW_PAGE_SIZE / 4); i++) - if ((page[i] & 0xff0007ff) == INS_DCBZ) - page[i] &= 0xfffffff7; - - kunmap_atomic(page, KM_USER0); - put_page(hpage); -} - static int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, bool data, struct kvmppc_pte *pte) { @@ -606,519 +401,6 @@ mmio: return EMULATE_DO_MMIO; } -static int kvmppc_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - ulong mp_pa = vcpu->arch.magic_page_pa; - - if (unlikely(mp_pa) && - unlikely((mp_pa & KVM_PAM) >> PAGE_SHIFT == gfn)) { - return 1; - } - - return kvm_is_visible_gfn(vcpu->kvm, gfn); -} - -int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, - ulong eaddr, int vec) -{ - bool data = (vec == BOOK3S_INTERRUPT_DATA_STORAGE); - int r = RESUME_GUEST; - int relocated; - int page_found = 0; - struct kvmppc_pte pte; - bool is_mmio = false; - bool dr = (vcpu->arch.shared->msr & MSR_DR) ? true : false; - bool ir = (vcpu->arch.shared->msr & MSR_IR) ? true : false; - u64 vsid; - - relocated = data ? dr : ir; - - /* Resolve real address if translation turned on */ - if (relocated) { - page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data); - } else { - pte.may_execute = true; - pte.may_read = true; - pte.may_write = true; - pte.raddr = eaddr & KVM_PAM; - pte.eaddr = eaddr; - pte.vpage = eaddr >> 12; - } - - switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { - case 0: - pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12)); - break; - case MSR_DR: - case MSR_IR: - vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid); - - if ((vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) == MSR_DR) - pte.vpage |= ((u64)VSID_REAL_DR << (SID_SHIFT - 12)); - else - pte.vpage |= ((u64)VSID_REAL_IR << (SID_SHIFT - 12)); - pte.vpage |= vsid; - - if (vsid == -1) - page_found = -EINVAL; - break; - } - - if (vcpu->arch.mmu.is_dcbz32(vcpu) && - (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) { - /* - * If we do the dcbz hack, we have to NX on every execution, - * so we can patch the executing code. This renders our guest - * NX-less. - */ - pte.may_execute = !data; - } - - if (page_found == -ENOENT) { - /* Page not found in guest PTE entries */ - vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); - vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr; - vcpu->arch.shared->msr |= - (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL); - kvmppc_book3s_queue_irqprio(vcpu, vec); - } else if (page_found == -EPERM) { - /* Storage protection */ - vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); - vcpu->arch.shared->dsisr = - to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE; - vcpu->arch.shared->dsisr |= DSISR_PROTFAULT; - vcpu->arch.shared->msr |= - (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL); - kvmppc_book3s_queue_irqprio(vcpu, vec); - } else if (page_found == -EINVAL) { - /* Page not found in guest SLB */ - vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); - kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80); - } else if (!is_mmio && - kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) { - /* The guest's PTE is not mapped yet. Map on the host */ - kvmppc_mmu_map_page(vcpu, &pte); - if (data) - vcpu->stat.sp_storage++; - else if (vcpu->arch.mmu.is_dcbz32(vcpu) && - (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) - kvmppc_patch_dcbz(vcpu, &pte); - } else { - /* MMIO */ - vcpu->stat.mmio_exits++; - vcpu->arch.paddr_accessed = pte.raddr; - r = kvmppc_emulate_mmio(run, vcpu); - if ( r == RESUME_HOST_NV ) - r = RESUME_HOST; - } - - return r; -} - -static inline int get_fpr_index(int i) -{ -#ifdef CONFIG_VSX - i *= 2; -#endif - return i; -} - -/* Give up external provider (FPU, Altivec, VSX) */ -void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr) -{ - struct thread_struct *t = ¤t->thread; - u64 *vcpu_fpr = vcpu->arch.fpr; -#ifdef CONFIG_VSX - u64 *vcpu_vsx = vcpu->arch.vsr; -#endif - u64 *thread_fpr = (u64*)t->fpr; - int i; - - if (!(vcpu->arch.guest_owned_ext & msr)) - return; - -#ifdef DEBUG_EXT - printk(KERN_INFO "Giving up ext 0x%lx\n", msr); -#endif - - switch (msr) { - case MSR_FP: - giveup_fpu(current); - for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) - vcpu_fpr[i] = thread_fpr[get_fpr_index(i)]; - - vcpu->arch.fpscr = t->fpscr.val; - break; - case MSR_VEC: -#ifdef CONFIG_ALTIVEC - giveup_altivec(current); - memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr)); - vcpu->arch.vscr = t->vscr; -#endif - break; - case MSR_VSX: -#ifdef CONFIG_VSX - __giveup_vsx(current); - for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++) - vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1]; -#endif - break; - default: - BUG(); - } - - vcpu->arch.guest_owned_ext &= ~msr; - current->thread.regs->msr &= ~msr; - kvmppc_recalc_shadow_msr(vcpu); -} - -static int kvmppc_read_inst(struct kvm_vcpu *vcpu) -{ - ulong srr0 = kvmppc_get_pc(vcpu); - u32 last_inst = kvmppc_get_last_inst(vcpu); - int ret; - - ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false); - if (ret == -ENOENT) { - ulong msr = vcpu->arch.shared->msr; - - msr = kvmppc_set_field(msr, 33, 33, 1); - msr = kvmppc_set_field(msr, 34, 36, 0); - vcpu->arch.shared->msr = kvmppc_set_field(msr, 42, 47, 0); - kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE); - return EMULATE_AGAIN; - } - - return EMULATE_DONE; -} - -static int kvmppc_check_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr) -{ - - /* Need to do paired single emulation? */ - if (!(vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE)) - return EMULATE_DONE; - - /* Read out the instruction */ - if (kvmppc_read_inst(vcpu) == EMULATE_DONE) - /* Need to emulate */ - return EMULATE_FAIL; - - return EMULATE_AGAIN; -} - -/* Handle external providers (FPU, Altivec, VSX) */ -static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, - ulong msr) -{ - struct thread_struct *t = ¤t->thread; - u64 *vcpu_fpr = vcpu->arch.fpr; -#ifdef CONFIG_VSX - u64 *vcpu_vsx = vcpu->arch.vsr; -#endif - u64 *thread_fpr = (u64*)t->fpr; - int i; - - /* When we have paired singles, we emulate in software */ - if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE) - return RESUME_GUEST; - - if (!(vcpu->arch.shared->msr & msr)) { - kvmppc_book3s_queue_irqprio(vcpu, exit_nr); - return RESUME_GUEST; - } - - /* We already own the ext */ - if (vcpu->arch.guest_owned_ext & msr) { - return RESUME_GUEST; - } - -#ifdef DEBUG_EXT - printk(KERN_INFO "Loading up ext 0x%lx\n", msr); -#endif - - current->thread.regs->msr |= msr; - - switch (msr) { - case MSR_FP: - for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) - thread_fpr[get_fpr_index(i)] = vcpu_fpr[i]; - - t->fpscr.val = vcpu->arch.fpscr; - t->fpexc_mode = 0; - kvmppc_load_up_fpu(); - break; - case MSR_VEC: -#ifdef CONFIG_ALTIVEC - memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr)); - t->vscr = vcpu->arch.vscr; - t->vrsave = -1; - kvmppc_load_up_altivec(); -#endif - break; - case MSR_VSX: -#ifdef CONFIG_VSX - for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++) - thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i]; - kvmppc_load_up_vsx(); -#endif - break; - default: - BUG(); - } - - vcpu->arch.guest_owned_ext |= msr; - - kvmppc_recalc_shadow_msr(vcpu); - - return RESUME_GUEST; -} - -int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned int exit_nr) -{ - int r = RESUME_HOST; - - vcpu->stat.sum_exits++; - - run->exit_reason = KVM_EXIT_UNKNOWN; - run->ready_for_interrupt_injection = 1; - - trace_kvm_book3s_exit(exit_nr, vcpu); - kvm_resched(vcpu); - switch (exit_nr) { - case BOOK3S_INTERRUPT_INST_STORAGE: - vcpu->stat.pf_instruc++; - -#ifdef CONFIG_PPC_BOOK3S_32 - /* We set segments as unused segments when invalidating them. So - * treat the respective fault as segment fault. */ - if (to_svcpu(vcpu)->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT] - == SR_INVALID) { - kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); - r = RESUME_GUEST; - break; - } -#endif - - /* only care about PTEG not found errors, but leave NX alone */ - if (to_svcpu(vcpu)->shadow_srr1 & 0x40000000) { - r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr); - vcpu->stat.sp_instruc++; - } else if (vcpu->arch.mmu.is_dcbz32(vcpu) && - (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) { - /* - * XXX If we do the dcbz hack we use the NX bit to flush&patch the page, - * so we can't use the NX bit inside the guest. Let's cross our fingers, - * that no guest that needs the dcbz hack does NX. - */ - kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL); - r = RESUME_GUEST; - } else { - vcpu->arch.shared->msr |= - to_svcpu(vcpu)->shadow_srr1 & 0x58000000; - kvmppc_book3s_queue_irqprio(vcpu, exit_nr); - r = RESUME_GUEST; - } - break; - case BOOK3S_INTERRUPT_DATA_STORAGE: - { - ulong dar = kvmppc_get_fault_dar(vcpu); - vcpu->stat.pf_storage++; - -#ifdef CONFIG_PPC_BOOK3S_32 - /* We set segments as unused segments when invalidating them. So - * treat the respective fault as segment fault. */ - if ((to_svcpu(vcpu)->sr[dar >> SID_SHIFT]) == SR_INVALID) { - kvmppc_mmu_map_segment(vcpu, dar); - r = RESUME_GUEST; - break; - } -#endif - - /* The only case we need to handle is missing shadow PTEs */ - if (to_svcpu(vcpu)->fault_dsisr & DSISR_NOHPTE) { - r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr); - } else { - vcpu->arch.shared->dar = dar; - vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr; - kvmppc_book3s_queue_irqprio(vcpu, exit_nr); - r = RESUME_GUEST; - } - break; - } - case BOOK3S_INTERRUPT_DATA_SEGMENT: - if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_fault_dar(vcpu)) < 0) { - vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); - kvmppc_book3s_queue_irqprio(vcpu, - BOOK3S_INTERRUPT_DATA_SEGMENT); - } - r = RESUME_GUEST; - break; - case BOOK3S_INTERRUPT_INST_SEGMENT: - if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)) < 0) { - kvmppc_book3s_queue_irqprio(vcpu, - BOOK3S_INTERRUPT_INST_SEGMENT); - } - r = RESUME_GUEST; - break; - /* We're good on these - the host merely wanted to get our attention */ - case BOOK3S_INTERRUPT_DECREMENTER: - vcpu->stat.dec_exits++; - r = RESUME_GUEST; - break; - case BOOK3S_INTERRUPT_EXTERNAL: - vcpu->stat.ext_intr_exits++; - r = RESUME_GUEST; - break; - case BOOK3S_INTERRUPT_PERFMON: - r = RESUME_GUEST; - break; - case BOOK3S_INTERRUPT_PROGRAM: - { - enum emulation_result er; - ulong flags; - -program_interrupt: - flags = to_svcpu(vcpu)->shadow_srr1 & 0x1f0000ull; - - if (vcpu->arch.shared->msr & MSR_PR) { -#ifdef EXIT_DEBUG - printk(KERN_INFO "Userspace triggered 0x700 exception at 0x%lx (0x%x)\n", kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu)); -#endif - if ((kvmppc_get_last_inst(vcpu) & 0xff0007ff) != - (INS_DCBZ & 0xfffffff7)) { - kvmppc_core_queue_program(vcpu, flags); - r = RESUME_GUEST; - break; - } - } - - vcpu->stat.emulated_inst_exits++; - er = kvmppc_emulate_instruction(run, vcpu); - switch (er) { - case EMULATE_DONE: - r = RESUME_GUEST_NV; - break; - case EMULATE_AGAIN: - r = RESUME_GUEST; - break; - case EMULATE_FAIL: - printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", - __func__, kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu)); - kvmppc_core_queue_program(vcpu, flags); - r = RESUME_GUEST; - break; - case EMULATE_DO_MMIO: - run->exit_reason = KVM_EXIT_MMIO; - r = RESUME_HOST_NV; - break; - default: - BUG(); - } - break; - } - case BOOK3S_INTERRUPT_SYSCALL: - if (vcpu->arch.osi_enabled && - (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) && - (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) { - /* MOL hypercalls */ - u64 *gprs = run->osi.gprs; - int i; - - run->exit_reason = KVM_EXIT_OSI; - for (i = 0; i < 32; i++) - gprs[i] = kvmppc_get_gpr(vcpu, i); - vcpu->arch.osi_needed = 1; - r = RESUME_HOST_NV; - } else if (!(vcpu->arch.shared->msr & MSR_PR) && - (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) { - /* KVM PV hypercalls */ - kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu)); - r = RESUME_GUEST; - } else { - /* Guest syscalls */ - vcpu->stat.syscall_exits++; - kvmppc_book3s_queue_irqprio(vcpu, exit_nr); - r = RESUME_GUEST; - } - break; - case BOOK3S_INTERRUPT_FP_UNAVAIL: - case BOOK3S_INTERRUPT_ALTIVEC: - case BOOK3S_INTERRUPT_VSX: - { - int ext_msr = 0; - - switch (exit_nr) { - case BOOK3S_INTERRUPT_FP_UNAVAIL: ext_msr = MSR_FP; break; - case BOOK3S_INTERRUPT_ALTIVEC: ext_msr = MSR_VEC; break; - case BOOK3S_INTERRUPT_VSX: ext_msr = MSR_VSX; break; - } - - switch (kvmppc_check_ext(vcpu, exit_nr)) { - case EMULATE_DONE: - /* everything ok - let's enable the ext */ - r = kvmppc_handle_ext(vcpu, exit_nr, ext_msr); - break; - case EMULATE_FAIL: - /* we need to emulate this instruction */ - goto program_interrupt; - break; - default: - /* nothing to worry about - go again */ - break; - } - break; - } - case BOOK3S_INTERRUPT_ALIGNMENT: - if (kvmppc_read_inst(vcpu) == EMULATE_DONE) { - vcpu->arch.shared->dsisr = kvmppc_alignment_dsisr(vcpu, - kvmppc_get_last_inst(vcpu)); - vcpu->arch.shared->dar = kvmppc_alignment_dar(vcpu, - kvmppc_get_last_inst(vcpu)); - kvmppc_book3s_queue_irqprio(vcpu, exit_nr); - } - r = RESUME_GUEST; - break; - case BOOK3S_INTERRUPT_MACHINE_CHECK: - case BOOK3S_INTERRUPT_TRACE: - kvmppc_book3s_queue_irqprio(vcpu, exit_nr); - r = RESUME_GUEST; - break; - default: - /* Ugh - bork here! What did we get? */ - printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n", - exit_nr, kvmppc_get_pc(vcpu), to_svcpu(vcpu)->shadow_srr1); - r = RESUME_HOST; - BUG(); - break; - } - - - if (!(r & RESUME_HOST)) { - /* To avoid clobbering exit_reason, only check for signals if - * we aren't already exiting to userspace for some other - * reason. */ - if (signal_pending(current)) { -#ifdef EXIT_DEBUG - printk(KERN_EMERG "KVM: Going back to host\n"); -#endif - vcpu->stat.signal_exits++; - run->exit_reason = KVM_EXIT_INTR; - r = -EINTR; - } else { - /* In case an interrupt came in that was triggered - * from userspace (like DEC), we need to check what - * to inject now! */ - kvmppc_core_deliver_interrupts(vcpu); - } - } - - trace_kvm_book3s_reenter(r, vcpu); - - return r; -} - int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { return 0; @@ -1179,69 +461,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) return 0; } -int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) -{ - struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); - int i; - - sregs->pvr = vcpu->arch.pvr; - - sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1; - if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) { - for (i = 0; i < 64; i++) { - sregs->u.s.ppc64.slb[i].slbe = vcpu3s->slb[i].orige | i; - sregs->u.s.ppc64.slb[i].slbv = vcpu3s->slb[i].origv; - } - } else { - for (i = 0; i < 16; i++) - sregs->u.s.ppc32.sr[i] = vcpu->arch.shared->sr[i]; - - for (i = 0; i < 8; i++) { - sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw; - sregs->u.s.ppc32.dbat[i] = vcpu3s->dbat[i].raw; - } - } - - return 0; -} - -int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) -{ - struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); - int i; - - kvmppc_set_pvr(vcpu, sregs->pvr); - - vcpu3s->sdr1 = sregs->u.s.sdr1; - if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) { - for (i = 0; i < 64; i++) { - vcpu->arch.mmu.slbmte(vcpu, sregs->u.s.ppc64.slb[i].slbv, - sregs->u.s.ppc64.slb[i].slbe); - } - } else { - for (i = 0; i < 16; i++) { - vcpu->arch.mmu.mtsrin(vcpu, i, sregs->u.s.ppc32.sr[i]); - } - for (i = 0; i < 8; i++) { - kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), false, - (u32)sregs->u.s.ppc32.ibat[i]); - kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), true, - (u32)(sregs->u.s.ppc32.ibat[i] >> 32)); - kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), false, - (u32)sregs->u.s.ppc32.dbat[i]); - kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), true, - (u32)(sregs->u.s.ppc32.dbat[i] >> 32)); - } - } - - /* Flush the MMU after messing with the segments */ - kvmppc_mmu_pte_flush(vcpu, 0, 0); - - return 0; -} - int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { return -ENOTSUPP; @@ -1296,202 +515,3 @@ out: mutex_unlock(&kvm->slots_lock); return r; } - -int kvmppc_core_check_processor_compat(void) -{ - return 0; -} - -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) -{ - struct kvmppc_vcpu_book3s *vcpu_book3s; - struct kvm_vcpu *vcpu; - int err = -ENOMEM; - unsigned long p; - - vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s)); - if (!vcpu_book3s) - goto out; - - vcpu_book3s->shadow_vcpu = (struct kvmppc_book3s_shadow_vcpu *) - kzalloc(sizeof(*vcpu_book3s->shadow_vcpu), GFP_KERNEL); - if (!vcpu_book3s->shadow_vcpu) - goto free_vcpu; - - vcpu = &vcpu_book3s->vcpu; - err = kvm_vcpu_init(vcpu, kvm, id); - if (err) - goto free_shadow_vcpu; - - p = __get_free_page(GFP_KERNEL|__GFP_ZERO); - /* the real shared page fills the last 4k of our page */ - vcpu->arch.shared = (void*)(p + PAGE_SIZE - 4096); - if (!p) - goto uninit_vcpu; - - vcpu->arch.host_retip = kvm_return_point; - vcpu->arch.host_msr = mfmsr(); -#ifdef CONFIG_PPC_BOOK3S_64 - /* default to book3s_64 (970fx) */ - vcpu->arch.pvr = 0x3C0301; -#else - /* default to book3s_32 (750) */ - vcpu->arch.pvr = 0x84202; -#endif - kvmppc_set_pvr(vcpu, vcpu->arch.pvr); - vcpu_book3s->slb_nr = 64; - - /* remember where some real-mode handlers are */ - vcpu->arch.trampoline_lowmem = kvmppc_trampoline_lowmem; - vcpu->arch.trampoline_enter = kvmppc_trampoline_enter; - vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem; -#ifdef CONFIG_PPC_BOOK3S_64 - vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall; -#else - vcpu->arch.rmcall = (ulong)kvmppc_rmcall; -#endif - - vcpu->arch.shadow_msr = MSR_USER64; - - err = kvmppc_mmu_init(vcpu); - if (err < 0) - goto uninit_vcpu; - - return vcpu; - -uninit_vcpu: - kvm_vcpu_uninit(vcpu); -free_shadow_vcpu: - kfree(vcpu_book3s->shadow_vcpu); -free_vcpu: - vfree(vcpu_book3s); -out: - return ERR_PTR(err); -} - -void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) -{ - struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); - - free_page((unsigned long)vcpu->arch.shared & PAGE_MASK); - kvm_vcpu_uninit(vcpu); - kfree(vcpu_book3s->shadow_vcpu); - vfree(vcpu_book3s); -} - -extern int __kvmppc_vcpu_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); -int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) -{ - int ret; - double fpr[32][TS_FPRWIDTH]; - unsigned int fpscr; - int fpexc_mode; -#ifdef CONFIG_ALTIVEC - vector128 vr[32]; - vector128 vscr; - unsigned long uninitialized_var(vrsave); - int used_vr; -#endif -#ifdef CONFIG_VSX - int used_vsr; -#endif - ulong ext_msr; - - /* No need to go into the guest when all we do is going out */ - if (signal_pending(current)) { - kvm_run->exit_reason = KVM_EXIT_INTR; - return -EINTR; - } - - /* Save FPU state in stack */ - if (current->thread.regs->msr & MSR_FP) - giveup_fpu(current); - memcpy(fpr, current->thread.fpr, sizeof(current->thread.fpr)); - fpscr = current->thread.fpscr.val; - fpexc_mode = current->thread.fpexc_mode; - -#ifdef CONFIG_ALTIVEC - /* Save Altivec state in stack */ - used_vr = current->thread.used_vr; - if (used_vr) { - if (current->thread.regs->msr & MSR_VEC) - giveup_altivec(current); - memcpy(vr, current->thread.vr, sizeof(current->thread.vr)); - vscr = current->thread.vscr; - vrsave = current->thread.vrsave; - } -#endif - -#ifdef CONFIG_VSX - /* Save VSX state in stack */ - used_vsr = current->thread.used_vsr; - if (used_vsr && (current->thread.regs->msr & MSR_VSX)) - __giveup_vsx(current); -#endif - - /* Remember the MSR with disabled extensions */ - ext_msr = current->thread.regs->msr; - - /* XXX we get called with irq disabled - change that! */ - local_irq_enable(); - - /* Preload FPU if it's enabled */ - if (vcpu->arch.shared->msr & MSR_FP) - kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); - - ret = __kvmppc_vcpu_entry(kvm_run, vcpu); - - local_irq_disable(); - - current->thread.regs->msr = ext_msr; - - /* Make sure we save the guest FPU/Altivec/VSX state */ - kvmppc_giveup_ext(vcpu, MSR_FP); - kvmppc_giveup_ext(vcpu, MSR_VEC); - kvmppc_giveup_ext(vcpu, MSR_VSX); - - /* Restore FPU state from stack */ - memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr)); - current->thread.fpscr.val = fpscr; - current->thread.fpexc_mode = fpexc_mode; - -#ifdef CONFIG_ALTIVEC - /* Restore Altivec state from stack */ - if (used_vr && current->thread.used_vr) { - memcpy(current->thread.vr, vr, sizeof(current->thread.vr)); - current->thread.vscr = vscr; - current->thread.vrsave = vrsave; - } - current->thread.used_vr = used_vr; -#endif - -#ifdef CONFIG_VSX - current->thread.used_vsr = used_vsr; -#endif - - return ret; -} - -static int kvmppc_book3s_init(void) -{ - int r; - - r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0, - THIS_MODULE); - - if (r) - return r; - - r = kvmppc_mmu_hpte_sysinit(); - - return r; -} - -static void kvmppc_book3s_exit(void) -{ - kvmppc_mmu_hpte_sysexit(); - kvm_exit(); -} - -module_init(kvmppc_book3s_init); -module_exit(kvmppc_book3s_exit); diff --git a/arch/powerpc/kvm/book3s_32_sr.S b/arch/powerpc/kvm/book3s_32_sr.S index 3608471ad2d8..7e06a6fc8d07 100644 --- a/arch/powerpc/kvm/book3s_32_sr.S +++ b/arch/powerpc/kvm/book3s_32_sr.S @@ -31,7 +31,7 @@ * R1 = host R1 * R2 = host R2 * R3 = shadow vcpu - * all other volatile GPRS = free + * all other volatile GPRS = free except R4, R6 * SVCPU[CR] = guest CR * SVCPU[XER] = guest XER * SVCPU[CTR] = guest CTR diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index d7889ef3211e..b871721c0050 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -41,36 +41,36 @@ static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu) } static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe( - struct kvmppc_vcpu_book3s *vcpu_book3s, + struct kvm_vcpu *vcpu, gva_t eaddr) { int i; u64 esid = GET_ESID(eaddr); u64 esid_1t = GET_ESID_1T(eaddr); - for (i = 0; i < vcpu_book3s->slb_nr; i++) { + for (i = 0; i < vcpu->arch.slb_nr; i++) { u64 cmp_esid = esid; - if (!vcpu_book3s->slb[i].valid) + if (!vcpu->arch.slb[i].valid) continue; - if (vcpu_book3s->slb[i].tb) + if (vcpu->arch.slb[i].tb) cmp_esid = esid_1t; - if (vcpu_book3s->slb[i].esid == cmp_esid) - return &vcpu_book3s->slb[i]; + if (vcpu->arch.slb[i].esid == cmp_esid) + return &vcpu->arch.slb[i]; } dprintk("KVM: No SLB entry found for 0x%lx [%llx | %llx]\n", eaddr, esid, esid_1t); - for (i = 0; i < vcpu_book3s->slb_nr; i++) { - if (vcpu_book3s->slb[i].vsid) + for (i = 0; i < vcpu->arch.slb_nr; i++) { + if (vcpu->arch.slb[i].vsid) dprintk(" %d: %c%c%c %llx %llx\n", i, - vcpu_book3s->slb[i].valid ? 'v' : ' ', - vcpu_book3s->slb[i].large ? 'l' : ' ', - vcpu_book3s->slb[i].tb ? 't' : ' ', - vcpu_book3s->slb[i].esid, - vcpu_book3s->slb[i].vsid); + vcpu->arch.slb[i].valid ? 'v' : ' ', + vcpu->arch.slb[i].large ? 'l' : ' ', + vcpu->arch.slb[i].tb ? 't' : ' ', + vcpu->arch.slb[i].esid, + vcpu->arch.slb[i].vsid); } return NULL; @@ -81,7 +81,7 @@ static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, { struct kvmppc_slb *slb; - slb = kvmppc_mmu_book3s_64_find_slbe(to_book3s(vcpu), eaddr); + slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, eaddr); if (!slb) return 0; @@ -128,7 +128,13 @@ static hva_t kvmppc_mmu_book3s_64_get_pteg( dprintk("MMU: page=0x%x sdr1=0x%llx pteg=0x%llx vsid=0x%llx\n", page, vcpu_book3s->sdr1, pteg, slbe->vsid); - r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT); + /* When running a PAPR guest, SDR1 contains a HVA address instead + of a GPA */ + if (vcpu_book3s->vcpu.arch.papr_enabled) + r = pteg; + else + r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT); + if (kvm_is_error_hva(r)) return r; return r | (pteg & ~PAGE_MASK); @@ -180,7 +186,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, return 0; } - slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu_book3s, eaddr); + slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu, eaddr); if (!slbe) goto no_seg_found; @@ -320,10 +326,10 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb) esid_1t = GET_ESID_1T(rb); slb_nr = rb & 0xfff; - if (slb_nr > vcpu_book3s->slb_nr) + if (slb_nr > vcpu->arch.slb_nr) return; - slbe = &vcpu_book3s->slb[slb_nr]; + slbe = &vcpu->arch.slb[slb_nr]; slbe->large = (rs & SLB_VSID_L) ? 1 : 0; slbe->tb = (rs & SLB_VSID_B_1T) ? 1 : 0; @@ -344,38 +350,35 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb) static u64 kvmppc_mmu_book3s_64_slbmfee(struct kvm_vcpu *vcpu, u64 slb_nr) { - struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); struct kvmppc_slb *slbe; - if (slb_nr > vcpu_book3s->slb_nr) + if (slb_nr > vcpu->arch.slb_nr) return 0; - slbe = &vcpu_book3s->slb[slb_nr]; + slbe = &vcpu->arch.slb[slb_nr]; return slbe->orige; } static u64 kvmppc_mmu_book3s_64_slbmfev(struct kvm_vcpu *vcpu, u64 slb_nr) { - struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); struct kvmppc_slb *slbe; - if (slb_nr > vcpu_book3s->slb_nr) + if (slb_nr > vcpu->arch.slb_nr) return 0; - slbe = &vcpu_book3s->slb[slb_nr]; + slbe = &vcpu->arch.slb[slb_nr]; return slbe->origv; } static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea) { - struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); struct kvmppc_slb *slbe; dprintk("KVM MMU: slbie(0x%llx)\n", ea); - slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu_book3s, ea); + slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea); if (!slbe) return; @@ -389,13 +392,12 @@ static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea) static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu) { - struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); int i; dprintk("KVM MMU: slbia()\n"); - for (i = 1; i < vcpu_book3s->slb_nr; i++) - vcpu_book3s->slb[i].valid = false; + for (i = 1; i < vcpu->arch.slb_nr; i++) + vcpu->arch.slb[i].valid = false; if (vcpu->arch.shared->msr & MSR_IR) { kvmppc_mmu_flush_segments(vcpu); @@ -464,7 +466,7 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, ulong mp_ea = vcpu->arch.magic_page_ea; if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { - slb = kvmppc_mmu_book3s_64_find_slbe(to_book3s(vcpu), ea); + slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea); if (slb) gvsid = slb->vsid; } diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c new file mode 100644 index 000000000000..bc3a2ea94217 --- /dev/null +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -0,0 +1,180 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/highmem.h> +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/hugetlb.h> + +#include <asm/tlbflush.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/mmu-hash64.h> +#include <asm/hvcall.h> +#include <asm/synch.h> +#include <asm/ppc-opcode.h> +#include <asm/cputable.h> + +/* For now use fixed-size 16MB page table */ +#define HPT_ORDER 24 +#define HPT_NPTEG (1ul << (HPT_ORDER - 7)) /* 128B per pteg */ +#define HPT_HASH_MASK (HPT_NPTEG - 1) + +/* Pages in the VRMA are 16MB pages */ +#define VRMA_PAGE_ORDER 24 +#define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */ + +/* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ +#define MAX_LPID_970 63 +#define NR_LPIDS (LPID_RSVD + 1) +unsigned long lpid_inuse[BITS_TO_LONGS(NR_LPIDS)]; + +long kvmppc_alloc_hpt(struct kvm *kvm) +{ + unsigned long hpt; + unsigned long lpid; + + hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|__GFP_NOWARN, + HPT_ORDER - PAGE_SHIFT); + if (!hpt) { + pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n"); + return -ENOMEM; + } + kvm->arch.hpt_virt = hpt; + + do { + lpid = find_first_zero_bit(lpid_inuse, NR_LPIDS); + if (lpid >= NR_LPIDS) { + pr_err("kvm_alloc_hpt: No LPIDs free\n"); + free_pages(hpt, HPT_ORDER - PAGE_SHIFT); + return -ENOMEM; + } + } while (test_and_set_bit(lpid, lpid_inuse)); + + kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18); + kvm->arch.lpid = lpid; + + pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid); + return 0; +} + +void kvmppc_free_hpt(struct kvm *kvm) +{ + clear_bit(kvm->arch.lpid, lpid_inuse); + free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT); +} + +void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem) +{ + unsigned long i; + unsigned long npages = kvm->arch.ram_npages; + unsigned long pfn; + unsigned long *hpte; + unsigned long hash; + struct kvmppc_pginfo *pginfo = kvm->arch.ram_pginfo; + + if (!pginfo) + return; + + /* VRMA can't be > 1TB */ + if (npages > 1ul << (40 - kvm->arch.ram_porder)) + npages = 1ul << (40 - kvm->arch.ram_porder); + /* Can't use more than 1 HPTE per HPTEG */ + if (npages > HPT_NPTEG) + npages = HPT_NPTEG; + + for (i = 0; i < npages; ++i) { + pfn = pginfo[i].pfn; + if (!pfn) + break; + /* can't use hpt_hash since va > 64 bits */ + hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK; + /* + * We assume that the hash table is empty and no + * vcpus are using it at this stage. Since we create + * at most one HPTE per HPTEG, we just assume entry 7 + * is available and use it. + */ + hpte = (unsigned long *) (kvm->arch.hpt_virt + (hash << 7)); + hpte += 7 * 2; + /* HPTE low word - RPN, protection, etc. */ + hpte[1] = (pfn << PAGE_SHIFT) | HPTE_R_R | HPTE_R_C | + HPTE_R_M | PP_RWXX; + wmb(); + hpte[0] = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | + (i << (VRMA_PAGE_ORDER - 16)) | HPTE_V_BOLTED | + HPTE_V_LARGE | HPTE_V_VALID; + } +} + +int kvmppc_mmu_hv_init(void) +{ + unsigned long host_lpid, rsvd_lpid; + + if (!cpu_has_feature(CPU_FTR_HVMODE)) + return -EINVAL; + + memset(lpid_inuse, 0, sizeof(lpid_inuse)); + + if (cpu_has_feature(CPU_FTR_ARCH_206)) { + host_lpid = mfspr(SPRN_LPID); /* POWER7 */ + rsvd_lpid = LPID_RSVD; + } else { + host_lpid = 0; /* PPC970 */ + rsvd_lpid = MAX_LPID_970; + } + + set_bit(host_lpid, lpid_inuse); + /* rsvd_lpid is reserved for use in partition switching */ + set_bit(rsvd_lpid, lpid_inuse); + + return 0; +} + +void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +{ +} + +static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu) +{ + kvmppc_set_msr(vcpu, MSR_SF | MSR_ME); +} + +static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *gpte, bool data) +{ + return -ENOENT; +} + +void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) +{ + struct kvmppc_mmu *mmu = &vcpu->arch.mmu; + + if (cpu_has_feature(CPU_FTR_ARCH_206)) + vcpu->arch.slb_nr = 32; /* POWER7 */ + else + vcpu->arch.slb_nr = 64; + + mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate; + mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr; + + vcpu->arch.hflags |= BOOK3S_HFLAG_SLB; +} diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S index 04e7d3bbfe8b..f2e6e48ea463 100644 --- a/arch/powerpc/kvm/book3s_64_slb.S +++ b/arch/powerpc/kvm/book3s_64_slb.S @@ -53,7 +53,7 @@ slb_exit_skip_ ## num: * R1 = host R1 * R2 = host R2 * R3 = shadow vcpu - * all other volatile GPRS = free + * all other volatile GPRS = free except R4, R6 * SVCPU[CR] = guest CR * SVCPU[XER] = guest XER * SVCPU[CTR] = guest CTR diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c new file mode 100644 index 000000000000..ea0f8c537c28 --- /dev/null +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -0,0 +1,73 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com> + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/highmem.h> +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/hugetlb.h> +#include <linux/list.h> + +#include <asm/tlbflush.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/mmu-hash64.h> +#include <asm/hvcall.h> +#include <asm/synch.h> +#include <asm/ppc-opcode.h> +#include <asm/kvm_host.h> +#include <asm/udbg.h> + +#define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) + +long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, + unsigned long ioba, unsigned long tce) +{ + struct kvm *kvm = vcpu->kvm; + struct kvmppc_spapr_tce_table *stt; + + /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ + /* liobn, ioba, tce); */ + + list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { + if (stt->liobn == liobn) { + unsigned long idx = ioba >> SPAPR_TCE_SHIFT; + struct page *page; + u64 *tbl; + + /* udbg_printf("H_PUT_TCE: liobn 0x%lx => stt=%p window_size=0x%x\n", */ + /* liobn, stt, stt->window_size); */ + if (ioba >= stt->window_size) + return H_PARAMETER; + + page = stt->pages[idx / TCES_PER_PAGE]; + tbl = (u64 *)page_address(page); + + /* FIXME: Need to validate the TCE itself */ + /* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */ + tbl[idx % TCES_PER_PAGE] = tce; + return H_SUCCESS; + } + } + + /* Didn't find the liobn, punt it to userspace */ + return H_TOO_HARD; +} diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 466846557089..0c9dc62532d0 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -63,6 +63,25 @@ * function pointers, so let's just disable the define. */ #undef mfsrin +enum priv_level { + PRIV_PROBLEM = 0, + PRIV_SUPER = 1, + PRIV_HYPER = 2, +}; + +static bool spr_allowed(struct kvm_vcpu *vcpu, enum priv_level level) +{ + /* PAPR VMs only access supervisor SPRs */ + if (vcpu->arch.papr_enabled && (level > PRIV_SUPER)) + return false; + + /* Limit user space to its own small SPR set */ + if ((vcpu->arch.shared->msr & MSR_PR) && level > PRIV_PROBLEM) + return false; + + return true; +} + int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int inst, int *advance) { @@ -296,6 +315,8 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) switch (sprn) { case SPRN_SDR1: + if (!spr_allowed(vcpu, PRIV_HYPER)) + goto unprivileged; to_book3s(vcpu)->sdr1 = spr_val; break; case SPRN_DSISR: @@ -390,6 +411,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) case SPRN_PMC4_GEKKO: case SPRN_WPAR_GEKKO: break; +unprivileged: default: printk(KERN_INFO "KVM: invalid SPR write: %d\n", sprn); #ifndef DEBUG_SPR @@ -421,6 +443,8 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) break; } case SPRN_SDR1: + if (!spr_allowed(vcpu, PRIV_HYPER)) + goto unprivileged; kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->sdr1); break; case SPRN_DSISR: @@ -449,6 +473,10 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) case SPRN_HID5: kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[5]); break; + case SPRN_CFAR: + case SPRN_PURR: + kvmppc_set_gpr(vcpu, rt, 0); + break; case SPRN_GQR0: case SPRN_GQR1: case SPRN_GQR2: @@ -476,6 +504,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) kvmppc_set_gpr(vcpu, rt, 0); break; default: +unprivileged: printk(KERN_INFO "KVM: invalid SPR read: %d\n", sprn); #ifndef DEBUG_SPR emulated = EMULATE_FAIL; diff --git a/arch/powerpc/kvm/book3s_exports.c b/arch/powerpc/kvm/book3s_exports.c index 1dd5a1ddfd0d..a150817d6d4c 100644 --- a/arch/powerpc/kvm/book3s_exports.c +++ b/arch/powerpc/kvm/book3s_exports.c @@ -17,12 +17,13 @@ * Authors: Alexander Graf <agraf@suse.de> */ -#include <linux/module.h> +#include <linux/export.h> #include <asm/kvm_book3s.h> -EXPORT_SYMBOL_GPL(kvmppc_trampoline_enter); -EXPORT_SYMBOL_GPL(kvmppc_trampoline_lowmem); -EXPORT_SYMBOL_GPL(kvmppc_rmcall); +#ifdef CONFIG_KVM_BOOK3S_64_HV +EXPORT_SYMBOL_GPL(kvmppc_hv_entry_trampoline); +#else +EXPORT_SYMBOL_GPL(kvmppc_entry_trampoline); EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu); #ifdef CONFIG_ALTIVEC EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec); @@ -30,3 +31,5 @@ EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec); #ifdef CONFIG_VSX EXPORT_SYMBOL_GPL(kvmppc_load_up_vsx); #endif +#endif + diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c new file mode 100644 index 000000000000..0cb137a9b038 --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv.c @@ -0,0 +1,1320 @@ +/* + * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved. + * + * Authors: + * Paul Mackerras <paulus@au1.ibm.com> + * Alexander Graf <agraf@suse.de> + * Kevin Wolf <mail@kevin-wolf.de> + * + * Description: KVM functions specific to running on Book 3S + * processors in hypervisor mode (specifically POWER7 and later). + * + * This file is derived from arch/powerpc/kvm/book3s.c, + * by Alexander Graf <agraf@suse.de>. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <linux/kvm_host.h> +#include <linux/err.h> +#include <linux/slab.h> +#include <linux/preempt.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <linux/export.h> +#include <linux/fs.h> +#include <linux/anon_inodes.h> +#include <linux/cpumask.h> +#include <linux/spinlock.h> +#include <linux/page-flags.h> + +#include <asm/reg.h> +#include <asm/cputable.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> +#include <asm/uaccess.h> +#include <asm/io.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/mmu_context.h> +#include <asm/lppaca.h> +#include <asm/processor.h> +#include <asm/cputhreads.h> +#include <asm/page.h> +#include <asm/hvcall.h> +#include <linux/gfp.h> +#include <linux/sched.h> +#include <linux/vmalloc.h> +#include <linux/highmem.h> + +/* + * For now, limit memory to 64GB and require it to be large pages. + * This value is chosen because it makes the ram_pginfo array be + * 64kB in size, which is about as large as we want to be trying + * to allocate with kmalloc. + */ +#define MAX_MEM_ORDER 36 + +#define LARGE_PAGE_ORDER 24 /* 16MB pages */ + +/* #define EXIT_DEBUG */ +/* #define EXIT_DEBUG_SIMPLE */ +/* #define EXIT_DEBUG_INT */ + +static void kvmppc_end_cede(struct kvm_vcpu *vcpu); + +void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + local_paca->kvm_hstate.kvm_vcpu = vcpu; + local_paca->kvm_hstate.kvm_vcore = vcpu->arch.vcore; +} + +void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +{ +} + +void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) +{ + vcpu->arch.shregs.msr = msr; + kvmppc_end_cede(vcpu); +} + +void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) +{ + vcpu->arch.pvr = pvr; +} + +void kvmppc_dump_regs(struct kvm_vcpu *vcpu) +{ + int r; + + pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id); + pr_err("pc = %.16lx msr = %.16llx trap = %x\n", + vcpu->arch.pc, vcpu->arch.shregs.msr, vcpu->arch.trap); + for (r = 0; r < 16; ++r) + pr_err("r%2d = %.16lx r%d = %.16lx\n", + r, kvmppc_get_gpr(vcpu, r), + r+16, kvmppc_get_gpr(vcpu, r+16)); + pr_err("ctr = %.16lx lr = %.16lx\n", + vcpu->arch.ctr, vcpu->arch.lr); + pr_err("srr0 = %.16llx srr1 = %.16llx\n", + vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1); + pr_err("sprg0 = %.16llx sprg1 = %.16llx\n", + vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1); + pr_err("sprg2 = %.16llx sprg3 = %.16llx\n", + vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3); + pr_err("cr = %.8x xer = %.16lx dsisr = %.8x\n", + vcpu->arch.cr, vcpu->arch.xer, vcpu->arch.shregs.dsisr); + pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar); + pr_err("fault dar = %.16lx dsisr = %.8x\n", + vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); + pr_err("SLB (%d entries):\n", vcpu->arch.slb_max); + for (r = 0; r < vcpu->arch.slb_max; ++r) + pr_err(" ESID = %.16llx VSID = %.16llx\n", + vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv); + pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n", + vcpu->kvm->arch.lpcr, vcpu->kvm->arch.sdr1, + vcpu->arch.last_inst); +} + +struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id) +{ + int r; + struct kvm_vcpu *v, *ret = NULL; + + mutex_lock(&kvm->lock); + kvm_for_each_vcpu(r, v, kvm) { + if (v->vcpu_id == id) { + ret = v; + break; + } + } + mutex_unlock(&kvm->lock); + return ret; +} + +static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa) +{ + vpa->shared_proc = 1; + vpa->yield_count = 1; +} + +static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu, + unsigned long flags, + unsigned long vcpuid, unsigned long vpa) +{ + struct kvm *kvm = vcpu->kvm; + unsigned long pg_index, ra, len; + unsigned long pg_offset; + void *va; + struct kvm_vcpu *tvcpu; + + tvcpu = kvmppc_find_vcpu(kvm, vcpuid); + if (!tvcpu) + return H_PARAMETER; + + flags >>= 63 - 18; + flags &= 7; + if (flags == 0 || flags == 4) + return H_PARAMETER; + if (flags < 4) { + if (vpa & 0x7f) + return H_PARAMETER; + /* registering new area; convert logical addr to real */ + pg_index = vpa >> kvm->arch.ram_porder; + pg_offset = vpa & (kvm->arch.ram_psize - 1); + if (pg_index >= kvm->arch.ram_npages) + return H_PARAMETER; + if (kvm->arch.ram_pginfo[pg_index].pfn == 0) + return H_PARAMETER; + ra = kvm->arch.ram_pginfo[pg_index].pfn << PAGE_SHIFT; + ra |= pg_offset; + va = __va(ra); + if (flags <= 1) + len = *(unsigned short *)(va + 4); + else + len = *(unsigned int *)(va + 4); + if (pg_offset + len > kvm->arch.ram_psize) + return H_PARAMETER; + switch (flags) { + case 1: /* register VPA */ + if (len < 640) + return H_PARAMETER; + tvcpu->arch.vpa = va; + init_vpa(vcpu, va); + break; + case 2: /* register DTL */ + if (len < 48) + return H_PARAMETER; + if (!tvcpu->arch.vpa) + return H_RESOURCE; + len -= len % 48; + tvcpu->arch.dtl = va; + tvcpu->arch.dtl_end = va + len; + break; + case 3: /* register SLB shadow buffer */ + if (len < 8) + return H_PARAMETER; + if (!tvcpu->arch.vpa) + return H_RESOURCE; + tvcpu->arch.slb_shadow = va; + len = (len - 16) / 16; + tvcpu->arch.slb_shadow = va; + break; + } + } else { + switch (flags) { + case 5: /* unregister VPA */ + if (tvcpu->arch.slb_shadow || tvcpu->arch.dtl) + return H_RESOURCE; + tvcpu->arch.vpa = NULL; + break; + case 6: /* unregister DTL */ + tvcpu->arch.dtl = NULL; + break; + case 7: /* unregister SLB shadow buffer */ + tvcpu->arch.slb_shadow = NULL; + break; + } + } + return H_SUCCESS; +} + +int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) +{ + unsigned long req = kvmppc_get_gpr(vcpu, 3); + unsigned long target, ret = H_SUCCESS; + struct kvm_vcpu *tvcpu; + + switch (req) { + case H_CEDE: + break; + case H_PROD: + target = kvmppc_get_gpr(vcpu, 4); + tvcpu = kvmppc_find_vcpu(vcpu->kvm, target); + if (!tvcpu) { + ret = H_PARAMETER; + break; + } + tvcpu->arch.prodded = 1; + smp_mb(); + if (vcpu->arch.ceded) { + if (waitqueue_active(&vcpu->wq)) { + wake_up_interruptible(&vcpu->wq); + vcpu->stat.halt_wakeup++; + } + } + break; + case H_CONFER: + break; + case H_REGISTER_VPA: + ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), + kvmppc_get_gpr(vcpu, 6)); + break; + default: + return RESUME_HOST; + } + kvmppc_set_gpr(vcpu, 3, ret); + vcpu->arch.hcall_needed = 0; + return RESUME_GUEST; +} + +static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, + struct task_struct *tsk) +{ + int r = RESUME_HOST; + + vcpu->stat.sum_exits++; + + run->exit_reason = KVM_EXIT_UNKNOWN; + run->ready_for_interrupt_injection = 1; + switch (vcpu->arch.trap) { + /* We're good on these - the host merely wanted to get our attention */ + case BOOK3S_INTERRUPT_HV_DECREMENTER: + vcpu->stat.dec_exits++; + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_EXTERNAL: + vcpu->stat.ext_intr_exits++; + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_PERFMON: + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_PROGRAM: + { + ulong flags; + /* + * Normally program interrupts are delivered directly + * to the guest by the hardware, but we can get here + * as a result of a hypervisor emulation interrupt + * (e40) getting turned into a 700 by BML RTAS. + */ + flags = vcpu->arch.shregs.msr & 0x1f0000ull; + kvmppc_core_queue_program(vcpu, flags); + r = RESUME_GUEST; + break; + } + case BOOK3S_INTERRUPT_SYSCALL: + { + /* hcall - punt to userspace */ + int i; + + if (vcpu->arch.shregs.msr & MSR_PR) { + /* sc 1 from userspace - reflect to guest syscall */ + kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_SYSCALL); + r = RESUME_GUEST; + break; + } + run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3); + for (i = 0; i < 9; ++i) + run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i); + run->exit_reason = KVM_EXIT_PAPR_HCALL; + vcpu->arch.hcall_needed = 1; + r = RESUME_HOST; + break; + } + /* + * We get these next two if the guest does a bad real-mode access, + * as we have enabled VRMA (virtualized real mode area) mode in the + * LPCR. We just generate an appropriate DSI/ISI to the guest. + */ + case BOOK3S_INTERRUPT_H_DATA_STORAGE: + vcpu->arch.shregs.dsisr = vcpu->arch.fault_dsisr; + vcpu->arch.shregs.dar = vcpu->arch.fault_dar; + kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0); + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_H_INST_STORAGE: + kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE, + 0x08000000); + r = RESUME_GUEST; + break; + /* + * This occurs if the guest executes an illegal instruction. + * We just generate a program interrupt to the guest, since + * we don't emulate any guest instructions at this stage. + */ + case BOOK3S_INTERRUPT_H_EMUL_ASSIST: + kvmppc_core_queue_program(vcpu, 0x80000); + r = RESUME_GUEST; + break; + default: + kvmppc_dump_regs(vcpu); + printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n", + vcpu->arch.trap, kvmppc_get_pc(vcpu), + vcpu->arch.shregs.msr); + r = RESUME_HOST; + BUG(); + break; + } + + return r; +} + +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + int i; + + sregs->pvr = vcpu->arch.pvr; + + memset(sregs, 0, sizeof(struct kvm_sregs)); + for (i = 0; i < vcpu->arch.slb_max; i++) { + sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige; + sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv; + } + + return 0; +} + +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + int i, j; + + kvmppc_set_pvr(vcpu, sregs->pvr); + + j = 0; + for (i = 0; i < vcpu->arch.slb_nr; i++) { + if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) { + vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe; + vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv; + ++j; + } + } + vcpu->arch.slb_max = j; + + return 0; +} + +int kvmppc_core_check_processor_compat(void) +{ + if (cpu_has_feature(CPU_FTR_HVMODE)) + return 0; + return -EIO; +} + +struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +{ + struct kvm_vcpu *vcpu; + int err = -EINVAL; + int core; + struct kvmppc_vcore *vcore; + + core = id / threads_per_core; + if (core >= KVM_MAX_VCORES) + goto out; + + err = -ENOMEM; + vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); + if (!vcpu) + goto out; + + err = kvm_vcpu_init(vcpu, kvm, id); + if (err) + goto free_vcpu; + + vcpu->arch.shared = &vcpu->arch.shregs; + vcpu->arch.last_cpu = -1; + vcpu->arch.mmcr[0] = MMCR0_FC; + vcpu->arch.ctrl = CTRL_RUNLATCH; + /* default to host PVR, since we can't spoof it */ + vcpu->arch.pvr = mfspr(SPRN_PVR); + kvmppc_set_pvr(vcpu, vcpu->arch.pvr); + + kvmppc_mmu_book3s_hv_init(vcpu); + + /* + * We consider the vcpu stopped until we see the first run ioctl for it. + */ + vcpu->arch.state = KVMPPC_VCPU_STOPPED; + + init_waitqueue_head(&vcpu->arch.cpu_run); + + mutex_lock(&kvm->lock); + vcore = kvm->arch.vcores[core]; + if (!vcore) { + vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL); + if (vcore) { + INIT_LIST_HEAD(&vcore->runnable_threads); + spin_lock_init(&vcore->lock); + init_waitqueue_head(&vcore->wq); + } + kvm->arch.vcores[core] = vcore; + } + mutex_unlock(&kvm->lock); + + if (!vcore) + goto free_vcpu; + + spin_lock(&vcore->lock); + ++vcore->num_threads; + spin_unlock(&vcore->lock); + vcpu->arch.vcore = vcore; + + vcpu->arch.cpu_type = KVM_CPU_3S_64; + kvmppc_sanity_check(vcpu); + + return vcpu; + +free_vcpu: + kfree(vcpu); +out: + return ERR_PTR(err); +} + +void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +{ + kvm_vcpu_uninit(vcpu); + kfree(vcpu); +} + +static void kvmppc_set_timer(struct kvm_vcpu *vcpu) +{ + unsigned long dec_nsec, now; + + now = get_tb(); + if (now > vcpu->arch.dec_expires) { + /* decrementer has already gone negative */ + kvmppc_core_queue_dec(vcpu); + kvmppc_core_deliver_interrupts(vcpu); + return; + } + dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC + / tb_ticks_per_sec; + hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec), + HRTIMER_MODE_REL); + vcpu->arch.timer_running = 1; +} + +static void kvmppc_end_cede(struct kvm_vcpu *vcpu) +{ + vcpu->arch.ceded = 0; + if (vcpu->arch.timer_running) { + hrtimer_try_to_cancel(&vcpu->arch.dec_timer); + vcpu->arch.timer_running = 0; + } +} + +extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); +extern void xics_wake_cpu(int cpu); + +static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, + struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu *v; + + if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) + return; + vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; + --vc->n_runnable; + ++vc->n_busy; + /* decrement the physical thread id of each following vcpu */ + v = vcpu; + list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list) + --v->arch.ptid; + list_del(&vcpu->arch.run_list); +} + +static void kvmppc_start_thread(struct kvm_vcpu *vcpu) +{ + int cpu; + struct paca_struct *tpaca; + struct kvmppc_vcore *vc = vcpu->arch.vcore; + + if (vcpu->arch.timer_running) { + hrtimer_try_to_cancel(&vcpu->arch.dec_timer); + vcpu->arch.timer_running = 0; + } + cpu = vc->pcpu + vcpu->arch.ptid; + tpaca = &paca[cpu]; + tpaca->kvm_hstate.kvm_vcpu = vcpu; + tpaca->kvm_hstate.kvm_vcore = vc; + tpaca->kvm_hstate.napping = 0; + vcpu->cpu = vc->pcpu; + smp_wmb(); +#ifdef CONFIG_PPC_ICP_NATIVE + if (vcpu->arch.ptid) { + tpaca->cpu_start = 0x80; + wmb(); + xics_wake_cpu(cpu); + ++vc->n_woken; + } +#endif +} + +static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc) +{ + int i; + + HMT_low(); + i = 0; + while (vc->nap_count < vc->n_woken) { + if (++i >= 1000000) { + pr_err("kvmppc_wait_for_nap timeout %d %d\n", + vc->nap_count, vc->n_woken); + break; + } + cpu_relax(); + } + HMT_medium(); +} + +/* + * Check that we are on thread 0 and that any other threads in + * this core are off-line. + */ +static int on_primary_thread(void) +{ + int cpu = smp_processor_id(); + int thr = cpu_thread_in_core(cpu); + + if (thr) + return 0; + while (++thr < threads_per_core) + if (cpu_online(cpu + thr)) + return 0; + return 1; +} + +/* + * Run a set of guest threads on a physical core. + * Called with vc->lock held. + */ +static int kvmppc_run_core(struct kvmppc_vcore *vc) +{ + struct kvm_vcpu *vcpu, *vcpu0, *vnext; + long ret; + u64 now; + int ptid; + + /* don't start if any threads have a signal pending */ + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) + if (signal_pending(vcpu->arch.run_task)) + return 0; + + /* + * Make sure we are running on thread 0, and that + * secondary threads are offline. + * XXX we should also block attempts to bring any + * secondary threads online. + */ + if (threads_per_core > 1 && !on_primary_thread()) { + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) + vcpu->arch.ret = -EBUSY; + goto out; + } + + /* + * Assign physical thread IDs, first to non-ceded vcpus + * and then to ceded ones. + */ + ptid = 0; + vcpu0 = NULL; + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { + if (!vcpu->arch.ceded) { + if (!ptid) + vcpu0 = vcpu; + vcpu->arch.ptid = ptid++; + } + } + if (!vcpu0) + return 0; /* nothing to run */ + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) + if (vcpu->arch.ceded) + vcpu->arch.ptid = ptid++; + + vc->n_woken = 0; + vc->nap_count = 0; + vc->entry_exit_count = 0; + vc->vcore_state = VCORE_RUNNING; + vc->in_guest = 0; + vc->pcpu = smp_processor_id(); + vc->napping_threads = 0; + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) + kvmppc_start_thread(vcpu); + + preempt_disable(); + spin_unlock(&vc->lock); + + kvm_guest_enter(); + __kvmppc_vcore_entry(NULL, vcpu0); + + spin_lock(&vc->lock); + /* disable sending of IPIs on virtual external irqs */ + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) + vcpu->cpu = -1; + /* wait for secondary threads to finish writing their state to memory */ + if (vc->nap_count < vc->n_woken) + kvmppc_wait_for_nap(vc); + /* prevent other vcpu threads from doing kvmppc_start_thread() now */ + vc->vcore_state = VCORE_EXITING; + spin_unlock(&vc->lock); + + /* make sure updates to secondary vcpu structs are visible now */ + smp_mb(); + kvm_guest_exit(); + + preempt_enable(); + kvm_resched(vcpu); + + now = get_tb(); + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { + /* cancel pending dec exception if dec is positive */ + if (now < vcpu->arch.dec_expires && + kvmppc_core_pending_dec(vcpu)) + kvmppc_core_dequeue_dec(vcpu); + + ret = RESUME_GUEST; + if (vcpu->arch.trap) + ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu, + vcpu->arch.run_task); + + vcpu->arch.ret = ret; + vcpu->arch.trap = 0; + + if (vcpu->arch.ceded) { + if (ret != RESUME_GUEST) + kvmppc_end_cede(vcpu); + else + kvmppc_set_timer(vcpu); + } + } + + spin_lock(&vc->lock); + out: + vc->vcore_state = VCORE_INACTIVE; + list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, + arch.run_list) { + if (vcpu->arch.ret != RESUME_GUEST) { + kvmppc_remove_runnable(vc, vcpu); + wake_up(&vcpu->arch.cpu_run); + } + } + + return 1; +} + +/* + * Wait for some other vcpu thread to execute us, and + * wake us up when we need to handle something in the host. + */ +static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state) +{ + DEFINE_WAIT(wait); + + prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state); + if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) + schedule(); + finish_wait(&vcpu->arch.cpu_run, &wait); +} + +/* + * All the vcpus in this vcore are idle, so wait for a decrementer + * or external interrupt to one of the vcpus. vc->lock is held. + */ +static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) +{ + DEFINE_WAIT(wait); + struct kvm_vcpu *v; + int all_idle = 1; + + prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE); + vc->vcore_state = VCORE_SLEEPING; + spin_unlock(&vc->lock); + list_for_each_entry(v, &vc->runnable_threads, arch.run_list) { + if (!v->arch.ceded || v->arch.pending_exceptions) { + all_idle = 0; + break; + } + } + if (all_idle) + schedule(); + finish_wait(&vc->wq, &wait); + spin_lock(&vc->lock); + vc->vcore_state = VCORE_INACTIVE; +} + +static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +{ + int n_ceded; + int prev_state; + struct kvmppc_vcore *vc; + struct kvm_vcpu *v, *vn; + + kvm_run->exit_reason = 0; + vcpu->arch.ret = RESUME_GUEST; + vcpu->arch.trap = 0; + + /* + * Synchronize with other threads in this virtual core + */ + vc = vcpu->arch.vcore; + spin_lock(&vc->lock); + vcpu->arch.ceded = 0; + vcpu->arch.run_task = current; + vcpu->arch.kvm_run = kvm_run; + prev_state = vcpu->arch.state; + vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; + list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads); + ++vc->n_runnable; + + /* + * This happens the first time this is called for a vcpu. + * If the vcore is already running, we may be able to start + * this thread straight away and have it join in. + */ + if (prev_state == KVMPPC_VCPU_STOPPED) { + if (vc->vcore_state == VCORE_RUNNING && + VCORE_EXIT_COUNT(vc) == 0) { + vcpu->arch.ptid = vc->n_runnable - 1; + kvmppc_start_thread(vcpu); + } + + } else if (prev_state == KVMPPC_VCPU_BUSY_IN_HOST) + --vc->n_busy; + + while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && + !signal_pending(current)) { + if (vc->n_busy || vc->vcore_state != VCORE_INACTIVE) { + spin_unlock(&vc->lock); + kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE); + spin_lock(&vc->lock); + continue; + } + n_ceded = 0; + list_for_each_entry(v, &vc->runnable_threads, arch.run_list) + n_ceded += v->arch.ceded; + if (n_ceded == vc->n_runnable) + kvmppc_vcore_blocked(vc); + else + kvmppc_run_core(vc); + + list_for_each_entry_safe(v, vn, &vc->runnable_threads, + arch.run_list) { + kvmppc_core_deliver_interrupts(v); + if (signal_pending(v->arch.run_task)) { + kvmppc_remove_runnable(vc, v); + v->stat.signal_exits++; + v->arch.kvm_run->exit_reason = KVM_EXIT_INTR; + v->arch.ret = -EINTR; + wake_up(&v->arch.cpu_run); + } + } + } + + if (signal_pending(current)) { + if (vc->vcore_state == VCORE_RUNNING || + vc->vcore_state == VCORE_EXITING) { + spin_unlock(&vc->lock); + kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE); + spin_lock(&vc->lock); + } + if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { + kvmppc_remove_runnable(vc, vcpu); + vcpu->stat.signal_exits++; + kvm_run->exit_reason = KVM_EXIT_INTR; + vcpu->arch.ret = -EINTR; + } + } + + spin_unlock(&vc->lock); + return vcpu->arch.ret; +} + +int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) +{ + int r; + + if (!vcpu->arch.sane) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return -EINVAL; + } + + /* No need to go into the guest when all we'll do is come back out */ + if (signal_pending(current)) { + run->exit_reason = KVM_EXIT_INTR; + return -EINTR; + } + + /* On PPC970, check that we have an RMA region */ + if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201)) + return -EPERM; + + flush_fp_to_thread(current); + flush_altivec_to_thread(current); + flush_vsx_to_thread(current); + vcpu->arch.wqp = &vcpu->arch.vcore->wq; + + do { + r = kvmppc_run_vcpu(run, vcpu); + + if (run->exit_reason == KVM_EXIT_PAPR_HCALL && + !(vcpu->arch.shregs.msr & MSR_PR)) { + r = kvmppc_pseries_do_hcall(vcpu); + kvmppc_core_deliver_interrupts(vcpu); + } + } while (r == RESUME_GUEST); + return r; +} + +static long kvmppc_stt_npages(unsigned long window_size) +{ + return ALIGN((window_size >> SPAPR_TCE_SHIFT) + * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; +} + +static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt) +{ + struct kvm *kvm = stt->kvm; + int i; + + mutex_lock(&kvm->lock); + list_del(&stt->list); + for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++) + __free_page(stt->pages[i]); + kfree(stt); + mutex_unlock(&kvm->lock); + + kvm_put_kvm(kvm); +} + +static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data; + struct page *page; + + if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size)) + return VM_FAULT_SIGBUS; + + page = stt->pages[vmf->pgoff]; + get_page(page); + vmf->page = page; + return 0; +} + +static const struct vm_operations_struct kvm_spapr_tce_vm_ops = { + .fault = kvm_spapr_tce_fault, +}; + +static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_ops = &kvm_spapr_tce_vm_ops; + return 0; +} + +static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) +{ + struct kvmppc_spapr_tce_table *stt = filp->private_data; + + release_spapr_tce_table(stt); + return 0; +} + +static struct file_operations kvm_spapr_tce_fops = { + .mmap = kvm_spapr_tce_mmap, + .release = kvm_spapr_tce_release, +}; + +long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, + struct kvm_create_spapr_tce *args) +{ + struct kvmppc_spapr_tce_table *stt = NULL; + long npages; + int ret = -ENOMEM; + int i; + + /* Check this LIOBN hasn't been previously allocated */ + list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { + if (stt->liobn == args->liobn) + return -EBUSY; + } + + npages = kvmppc_stt_npages(args->window_size); + + stt = kzalloc(sizeof(*stt) + npages* sizeof(struct page *), + GFP_KERNEL); + if (!stt) + goto fail; + + stt->liobn = args->liobn; + stt->window_size = args->window_size; + stt->kvm = kvm; + + for (i = 0; i < npages; i++) { + stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!stt->pages[i]) + goto fail; + } + + kvm_get_kvm(kvm); + + mutex_lock(&kvm->lock); + list_add(&stt->list, &kvm->arch.spapr_tce_tables); + + mutex_unlock(&kvm->lock); + + return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, + stt, O_RDWR); + +fail: + if (stt) { + for (i = 0; i < npages; i++) + if (stt->pages[i]) + __free_page(stt->pages[i]); + + kfree(stt); + } + return ret; +} + +/* Work out RMLS (real mode limit selector) field value for a given RMA size. + Assumes POWER7 or PPC970. */ +static inline int lpcr_rmls(unsigned long rma_size) +{ + switch (rma_size) { + case 32ul << 20: /* 32 MB */ + if (cpu_has_feature(CPU_FTR_ARCH_206)) + return 8; /* only supported on POWER7 */ + return -1; + case 64ul << 20: /* 64 MB */ + return 3; + case 128ul << 20: /* 128 MB */ + return 7; + case 256ul << 20: /* 256 MB */ + return 4; + case 1ul << 30: /* 1 GB */ + return 2; + case 16ul << 30: /* 16 GB */ + return 1; + case 256ul << 30: /* 256 GB */ + return 0; + default: + return -1; + } +} + +static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct kvmppc_rma_info *ri = vma->vm_file->private_data; + struct page *page; + + if (vmf->pgoff >= ri->npages) + return VM_FAULT_SIGBUS; + + page = pfn_to_page(ri->base_pfn + vmf->pgoff); + get_page(page); + vmf->page = page; + return 0; +} + +static const struct vm_operations_struct kvm_rma_vm_ops = { + .fault = kvm_rma_fault, +}; + +static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &kvm_rma_vm_ops; + return 0; +} + +static int kvm_rma_release(struct inode *inode, struct file *filp) +{ + struct kvmppc_rma_info *ri = filp->private_data; + + kvm_release_rma(ri); + return 0; +} + +static struct file_operations kvm_rma_fops = { + .mmap = kvm_rma_mmap, + .release = kvm_rma_release, +}; + +long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret) +{ + struct kvmppc_rma_info *ri; + long fd; + + ri = kvm_alloc_rma(); + if (!ri) + return -ENOMEM; + + fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR); + if (fd < 0) + kvm_release_rma(ri); + + ret->rma_size = ri->npages << PAGE_SHIFT; + return fd; +} + +static struct page *hva_to_page(unsigned long addr) +{ + struct page *page[1]; + int npages; + + might_sleep(); + + npages = get_user_pages_fast(addr, 1, 1, page); + + if (unlikely(npages != 1)) + return 0; + + return page[0]; +} + +int kvmppc_core_prepare_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem) +{ + unsigned long psize, porder; + unsigned long i, npages, totalpages; + unsigned long pg_ix; + struct kvmppc_pginfo *pginfo; + unsigned long hva; + struct kvmppc_rma_info *ri = NULL; + struct page *page; + + /* For now, only allow 16MB pages */ + porder = LARGE_PAGE_ORDER; + psize = 1ul << porder; + if ((mem->memory_size & (psize - 1)) || + (mem->guest_phys_addr & (psize - 1))) { + pr_err("bad memory_size=%llx @ %llx\n", + mem->memory_size, mem->guest_phys_addr); + return -EINVAL; + } + + npages = mem->memory_size >> porder; + totalpages = (mem->guest_phys_addr + mem->memory_size) >> porder; + + /* More memory than we have space to track? */ + if (totalpages > (1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER))) + return -EINVAL; + + /* Do we already have an RMA registered? */ + if (mem->guest_phys_addr == 0 && kvm->arch.rma) + return -EINVAL; + + if (totalpages > kvm->arch.ram_npages) + kvm->arch.ram_npages = totalpages; + + /* Is this one of our preallocated RMAs? */ + if (mem->guest_phys_addr == 0) { + struct vm_area_struct *vma; + + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, mem->userspace_addr); + if (vma && vma->vm_file && + vma->vm_file->f_op == &kvm_rma_fops && + mem->userspace_addr == vma->vm_start) + ri = vma->vm_file->private_data; + up_read(¤t->mm->mmap_sem); + if (!ri && cpu_has_feature(CPU_FTR_ARCH_201)) { + pr_err("CPU requires an RMO\n"); + return -EINVAL; + } + } + + if (ri) { + unsigned long rma_size; + unsigned long lpcr; + long rmls; + + rma_size = ri->npages << PAGE_SHIFT; + if (rma_size > mem->memory_size) + rma_size = mem->memory_size; + rmls = lpcr_rmls(rma_size); + if (rmls < 0) { + pr_err("Can't use RMA of 0x%lx bytes\n", rma_size); + return -EINVAL; + } + atomic_inc(&ri->use_count); + kvm->arch.rma = ri; + kvm->arch.n_rma_pages = rma_size >> porder; + + /* Update LPCR and RMOR */ + lpcr = kvm->arch.lpcr; + if (cpu_has_feature(CPU_FTR_ARCH_201)) { + /* PPC970; insert RMLS value (split field) in HID4 */ + lpcr &= ~((1ul << HID4_RMLS0_SH) | + (3ul << HID4_RMLS2_SH)); + lpcr |= ((rmls >> 2) << HID4_RMLS0_SH) | + ((rmls & 3) << HID4_RMLS2_SH); + /* RMOR is also in HID4 */ + lpcr |= ((ri->base_pfn >> (26 - PAGE_SHIFT)) & 0xffff) + << HID4_RMOR_SH; + } else { + /* POWER7 */ + lpcr &= ~(LPCR_VPM0 | LPCR_VRMA_L); + lpcr |= rmls << LPCR_RMLS_SH; + kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT; + } + kvm->arch.lpcr = lpcr; + pr_info("Using RMO at %lx size %lx (LPCR = %lx)\n", + ri->base_pfn << PAGE_SHIFT, rma_size, lpcr); + } + + pg_ix = mem->guest_phys_addr >> porder; + pginfo = kvm->arch.ram_pginfo + pg_ix; + for (i = 0; i < npages; ++i, ++pg_ix) { + if (ri && pg_ix < kvm->arch.n_rma_pages) { + pginfo[i].pfn = ri->base_pfn + + (pg_ix << (porder - PAGE_SHIFT)); + continue; + } + hva = mem->userspace_addr + (i << porder); + page = hva_to_page(hva); + if (!page) { + pr_err("oops, no pfn for hva %lx\n", hva); + goto err; + } + /* Check it's a 16MB page */ + if (!PageHead(page) || + compound_order(page) != (LARGE_PAGE_ORDER - PAGE_SHIFT)) { + pr_err("page at %lx isn't 16MB (o=%d)\n", + hva, compound_order(page)); + goto err; + } + pginfo[i].pfn = page_to_pfn(page); + } + + return 0; + + err: + return -EINVAL; +} + +void kvmppc_core_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem) +{ + if (mem->guest_phys_addr == 0 && mem->memory_size != 0 && + !kvm->arch.rma) + kvmppc_map_vrma(kvm, mem); +} + +int kvmppc_core_init_vm(struct kvm *kvm) +{ + long r; + unsigned long npages = 1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER); + long err = -ENOMEM; + unsigned long lpcr; + + /* Allocate hashed page table */ + r = kvmppc_alloc_hpt(kvm); + if (r) + return r; + + INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); + + kvm->arch.ram_pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo), + GFP_KERNEL); + if (!kvm->arch.ram_pginfo) { + pr_err("kvmppc_core_init_vm: couldn't alloc %lu bytes\n", + npages * sizeof(struct kvmppc_pginfo)); + goto out_free; + } + + kvm->arch.ram_npages = 0; + kvm->arch.ram_psize = 1ul << LARGE_PAGE_ORDER; + kvm->arch.ram_porder = LARGE_PAGE_ORDER; + kvm->arch.rma = NULL; + kvm->arch.n_rma_pages = 0; + + kvm->arch.host_sdr1 = mfspr(SPRN_SDR1); + + if (cpu_has_feature(CPU_FTR_ARCH_201)) { + /* PPC970; HID4 is effectively the LPCR */ + unsigned long lpid = kvm->arch.lpid; + kvm->arch.host_lpid = 0; + kvm->arch.host_lpcr = lpcr = mfspr(SPRN_HID4); + lpcr &= ~((3 << HID4_LPID1_SH) | (0xful << HID4_LPID5_SH)); + lpcr |= ((lpid >> 4) << HID4_LPID1_SH) | + ((lpid & 0xf) << HID4_LPID5_SH); + } else { + /* POWER7; init LPCR for virtual RMA mode */ + kvm->arch.host_lpid = mfspr(SPRN_LPID); + kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR); + lpcr &= LPCR_PECE | LPCR_LPES; + lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE | + LPCR_VPM0 | LPCR_VRMA_L; + } + kvm->arch.lpcr = lpcr; + + return 0; + + out_free: + kvmppc_free_hpt(kvm); + return err; +} + +void kvmppc_core_destroy_vm(struct kvm *kvm) +{ + struct kvmppc_pginfo *pginfo; + unsigned long i; + + if (kvm->arch.ram_pginfo) { + pginfo = kvm->arch.ram_pginfo; + kvm->arch.ram_pginfo = NULL; + for (i = kvm->arch.n_rma_pages; i < kvm->arch.ram_npages; ++i) + if (pginfo[i].pfn) + put_page(pfn_to_page(pginfo[i].pfn)); + kfree(pginfo); + } + if (kvm->arch.rma) { + kvm_release_rma(kvm->arch.rma); + kvm->arch.rma = NULL; + } + + kvmppc_free_hpt(kvm); + WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables)); +} + +/* These are stubs for now */ +void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end) +{ +} + +/* We don't need to emulate any privileged instructions or dcbz */ +int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) +{ + return EMULATE_FAIL; +} + +int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) +{ + return EMULATE_FAIL; +} + +int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) +{ + return EMULATE_FAIL; +} + +static int kvmppc_book3s_hv_init(void) +{ + int r; + + r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + + if (r) + return r; + + r = kvmppc_mmu_hv_init(); + + return r; +} + +static void kvmppc_book3s_hv_exit(void) +{ + kvm_exit(); +} + +module_init(kvmppc_book3s_hv_init); +module_exit(kvmppc_book3s_hv_exit); diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c new file mode 100644 index 000000000000..286f13d601cf --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -0,0 +1,156 @@ +/* + * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <linux/kvm_host.h> +#include <linux/preempt.h> +#include <linux/export.h> +#include <linux/sched.h> +#include <linux/spinlock.h> +#include <linux/bootmem.h> +#include <linux/init.h> + +#include <asm/cputable.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> + +/* + * This maintains a list of RMAs (real mode areas) for KVM guests to use. + * Each RMA has to be physically contiguous and of a size that the + * hardware supports. PPC970 and POWER7 support 64MB, 128MB and 256MB, + * and other larger sizes. Since we are unlikely to be allocate that + * much physically contiguous memory after the system is up and running, + * we preallocate a set of RMAs in early boot for KVM to use. + */ +static unsigned long kvm_rma_size = 64 << 20; /* 64MB */ +static unsigned long kvm_rma_count; + +static int __init early_parse_rma_size(char *p) +{ + if (!p) + return 1; + + kvm_rma_size = memparse(p, &p); + + return 0; +} +early_param("kvm_rma_size", early_parse_rma_size); + +static int __init early_parse_rma_count(char *p) +{ + if (!p) + return 1; + + kvm_rma_count = simple_strtoul(p, NULL, 0); + + return 0; +} +early_param("kvm_rma_count", early_parse_rma_count); + +static struct kvmppc_rma_info *rma_info; +static LIST_HEAD(free_rmas); +static DEFINE_SPINLOCK(rma_lock); + +/* Work out RMLS (real mode limit selector) field value for a given RMA size. + Assumes POWER7 or PPC970. */ +static inline int lpcr_rmls(unsigned long rma_size) +{ + switch (rma_size) { + case 32ul << 20: /* 32 MB */ + if (cpu_has_feature(CPU_FTR_ARCH_206)) + return 8; /* only supported on POWER7 */ + return -1; + case 64ul << 20: /* 64 MB */ + return 3; + case 128ul << 20: /* 128 MB */ + return 7; + case 256ul << 20: /* 256 MB */ + return 4; + case 1ul << 30: /* 1 GB */ + return 2; + case 16ul << 30: /* 16 GB */ + return 1; + case 256ul << 30: /* 256 GB */ + return 0; + default: + return -1; + } +} + +/* + * Called at boot time while the bootmem allocator is active, + * to allocate contiguous physical memory for the real memory + * areas for guests. + */ +void kvm_rma_init(void) +{ + unsigned long i; + unsigned long j, npages; + void *rma; + struct page *pg; + + /* Only do this on PPC970 in HV mode */ + if (!cpu_has_feature(CPU_FTR_HVMODE) || + !cpu_has_feature(CPU_FTR_ARCH_201)) + return; + + if (!kvm_rma_size || !kvm_rma_count) + return; + + /* Check that the requested size is one supported in hardware */ + if (lpcr_rmls(kvm_rma_size) < 0) { + pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size); + return; + } + + npages = kvm_rma_size >> PAGE_SHIFT; + rma_info = alloc_bootmem(kvm_rma_count * sizeof(struct kvmppc_rma_info)); + for (i = 0; i < kvm_rma_count; ++i) { + rma = alloc_bootmem_align(kvm_rma_size, kvm_rma_size); + pr_info("Allocated KVM RMA at %p (%ld MB)\n", rma, + kvm_rma_size >> 20); + rma_info[i].base_virt = rma; + rma_info[i].base_pfn = __pa(rma) >> PAGE_SHIFT; + rma_info[i].npages = npages; + list_add_tail(&rma_info[i].list, &free_rmas); + atomic_set(&rma_info[i].use_count, 0); + + pg = pfn_to_page(rma_info[i].base_pfn); + for (j = 0; j < npages; ++j) { + atomic_inc(&pg->_count); + ++pg; + } + } +} + +struct kvmppc_rma_info *kvm_alloc_rma(void) +{ + struct kvmppc_rma_info *ri; + + ri = NULL; + spin_lock(&rma_lock); + if (!list_empty(&free_rmas)) { + ri = list_first_entry(&free_rmas, struct kvmppc_rma_info, list); + list_del(&ri->list); + atomic_inc(&ri->use_count); + } + spin_unlock(&rma_lock); + return ri; +} +EXPORT_SYMBOL_GPL(kvm_alloc_rma); + +void kvm_release_rma(struct kvmppc_rma_info *ri) +{ + if (atomic_dec_and_test(&ri->use_count)) { + spin_lock(&rma_lock); + list_add_tail(&ri->list, &free_rmas); + spin_unlock(&rma_lock); + + } +} +EXPORT_SYMBOL_GPL(kvm_release_rma); + diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S new file mode 100644 index 000000000000..3f7b674dd4bf --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_interrupts.S @@ -0,0 +1,166 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * + * Derived from book3s_interrupts.S, which is: + * Copyright SUSE Linux Products GmbH 2009 + * + * Authors: Alexander Graf <agraf@suse.de> + */ + +#include <asm/ppc_asm.h> +#include <asm/kvm_asm.h> +#include <asm/reg.h> +#include <asm/page.h> +#include <asm/asm-offsets.h> +#include <asm/exception-64s.h> +#include <asm/ppc-opcode.h> + +/***************************************************************************** + * * + * Guest entry / exit code that is in kernel module memory (vmalloc) * + * * + ****************************************************************************/ + +/* Registers: + * r4: vcpu pointer + */ +_GLOBAL(__kvmppc_vcore_entry) + + /* Write correct stack frame */ + mflr r0 + std r0,PPC_LR_STKOFF(r1) + + /* Save host state to the stack */ + stdu r1, -SWITCH_FRAME_SIZE(r1) + + /* Save non-volatile registers (r14 - r31) */ + SAVE_NVGPRS(r1) + + /* Save host DSCR */ +BEGIN_FTR_SECTION + mfspr r3, SPRN_DSCR + std r3, HSTATE_DSCR(r13) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + + /* Save host DABR */ + mfspr r3, SPRN_DABR + std r3, HSTATE_DABR(r13) + + /* Hard-disable interrupts */ + mfmsr r10 + std r10, HSTATE_HOST_MSR(r13) + rldicl r10,r10,48,1 + rotldi r10,r10,16 + mtmsrd r10,1 + + /* Save host PMU registers and load guest PMU registers */ + /* R4 is live here (vcpu pointer) but not r3 or r5 */ + li r3, 1 + sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ + mfspr r7, SPRN_MMCR0 /* save MMCR0 */ + mtspr SPRN_MMCR0, r3 /* freeze all counters, disable interrupts */ + isync + ld r3, PACALPPACAPTR(r13) /* is the host using the PMU? */ + lbz r5, LPPACA_PMCINUSE(r3) + cmpwi r5, 0 + beq 31f /* skip if not */ + mfspr r5, SPRN_MMCR1 + mfspr r6, SPRN_MMCRA + std r7, HSTATE_MMCR(r13) + std r5, HSTATE_MMCR + 8(r13) + std r6, HSTATE_MMCR + 16(r13) + mfspr r3, SPRN_PMC1 + mfspr r5, SPRN_PMC2 + mfspr r6, SPRN_PMC3 + mfspr r7, SPRN_PMC4 + mfspr r8, SPRN_PMC5 + mfspr r9, SPRN_PMC6 +BEGIN_FTR_SECTION + mfspr r10, SPRN_PMC7 + mfspr r11, SPRN_PMC8 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + stw r3, HSTATE_PMC(r13) + stw r5, HSTATE_PMC + 4(r13) + stw r6, HSTATE_PMC + 8(r13) + stw r7, HSTATE_PMC + 12(r13) + stw r8, HSTATE_PMC + 16(r13) + stw r9, HSTATE_PMC + 20(r13) +BEGIN_FTR_SECTION + stw r10, HSTATE_PMC + 24(r13) + stw r11, HSTATE_PMC + 28(r13) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) +31: + + /* + * Put whatever is in the decrementer into the + * hypervisor decrementer. + */ + mfspr r8,SPRN_DEC + mftb r7 + mtspr SPRN_HDEC,r8 + extsw r8,r8 + add r8,r8,r7 + std r8,HSTATE_DECEXP(r13) + + /* + * On PPC970, if the guest vcpu has an external interrupt pending, + * send ourselves an IPI so as to interrupt the guest once it + * enables interrupts. (It must have interrupts disabled, + * otherwise we would already have delivered the interrupt.) + */ +BEGIN_FTR_SECTION + ld r0, VCPU_PENDING_EXC(r4) + li r7, (1 << BOOK3S_IRQPRIO_EXTERNAL) + oris r7, r7, (1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h + and. r0, r0, r7 + beq 32f + mr r31, r4 + lhz r3, PACAPACAINDEX(r13) + bl smp_send_reschedule + nop + mr r4, r31 +32: +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + + /* Jump to partition switch code */ + bl .kvmppc_hv_entry_trampoline + nop + +/* + * We return here in virtual mode after the guest exits + * with something that we can't handle in real mode. + * Interrupts are enabled again at this point. + */ + +.global kvmppc_handler_highmem +kvmppc_handler_highmem: + + /* + * Register usage at this point: + * + * R1 = host R1 + * R2 = host R2 + * R12 = exit handler id + * R13 = PACA + */ + + /* Restore non-volatile host registers (r14 - r31) */ + REST_NVGPRS(r1) + + addi r1, r1, SWITCH_FRAME_SIZE + ld r0, PPC_LR_STKOFF(r1) + mtlr r0 + blr diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c new file mode 100644 index 000000000000..bacb0cfa3602 --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -0,0 +1,337 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * Copyright 2010-2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/hugetlb.h> + +#include <asm/tlbflush.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/mmu-hash64.h> +#include <asm/hvcall.h> +#include <asm/synch.h> +#include <asm/ppc-opcode.h> + +/* For now use fixed-size 16MB page table */ +#define HPT_ORDER 24 +#define HPT_NPTEG (1ul << (HPT_ORDER - 7)) /* 128B per pteg */ +#define HPT_HASH_MASK (HPT_NPTEG - 1) + +#define HPTE_V_HVLOCK 0x40UL + +static inline long lock_hpte(unsigned long *hpte, unsigned long bits) +{ + unsigned long tmp, old; + + asm volatile(" ldarx %0,0,%2\n" + " and. %1,%0,%3\n" + " bne 2f\n" + " ori %0,%0,%4\n" + " stdcx. %0,0,%2\n" + " beq+ 2f\n" + " li %1,%3\n" + "2: isync" + : "=&r" (tmp), "=&r" (old) + : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK) + : "cc", "memory"); + return old == 0; +} + +long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, + long pte_index, unsigned long pteh, unsigned long ptel) +{ + unsigned long porder; + struct kvm *kvm = vcpu->kvm; + unsigned long i, lpn, pa; + unsigned long *hpte; + + /* only handle 4k, 64k and 16M pages for now */ + porder = 12; + if (pteh & HPTE_V_LARGE) { + if (cpu_has_feature(CPU_FTR_ARCH_206) && + (ptel & 0xf000) == 0x1000) { + /* 64k page */ + porder = 16; + } else if ((ptel & 0xff000) == 0) { + /* 16M page */ + porder = 24; + /* lowest AVA bit must be 0 for 16M pages */ + if (pteh & 0x80) + return H_PARAMETER; + } else + return H_PARAMETER; + } + lpn = (ptel & HPTE_R_RPN) >> kvm->arch.ram_porder; + if (lpn >= kvm->arch.ram_npages || porder > kvm->arch.ram_porder) + return H_PARAMETER; + pa = kvm->arch.ram_pginfo[lpn].pfn << PAGE_SHIFT; + if (!pa) + return H_PARAMETER; + /* Check WIMG */ + if ((ptel & HPTE_R_WIMG) != HPTE_R_M && + (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M)) + return H_PARAMETER; + pteh &= ~0x60UL; + ptel &= ~(HPTE_R_PP0 - kvm->arch.ram_psize); + ptel |= pa; + if (pte_index >= (HPT_NPTEG << 3)) + return H_PARAMETER; + if (likely((flags & H_EXACT) == 0)) { + pte_index &= ~7UL; + hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); + for (i = 0; ; ++i) { + if (i == 8) + return H_PTEG_FULL; + if ((*hpte & HPTE_V_VALID) == 0 && + lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) + break; + hpte += 2; + } + } else { + i = 0; + hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); + if (!lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) + return H_PTEG_FULL; + } + hpte[1] = ptel; + eieio(); + hpte[0] = pteh; + asm volatile("ptesync" : : : "memory"); + atomic_inc(&kvm->arch.ram_pginfo[lpn].refcnt); + vcpu->arch.gpr[4] = pte_index + i; + return H_SUCCESS; +} + +#define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token)) + +static inline int try_lock_tlbie(unsigned int *lock) +{ + unsigned int tmp, old; + unsigned int token = LOCK_TOKEN; + + asm volatile("1:lwarx %1,0,%2\n" + " cmpwi cr0,%1,0\n" + " bne 2f\n" + " stwcx. %3,0,%2\n" + " bne- 1b\n" + " isync\n" + "2:" + : "=&r" (tmp), "=&r" (old) + : "r" (lock), "r" (token) + : "cc", "memory"); + return old == 0; +} + +long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, + unsigned long pte_index, unsigned long avpn, + unsigned long va) +{ + struct kvm *kvm = vcpu->kvm; + unsigned long *hpte; + unsigned long v, r, rb; + + if (pte_index >= (HPT_NPTEG << 3)) + return H_PARAMETER; + hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); + while (!lock_hpte(hpte, HPTE_V_HVLOCK)) + cpu_relax(); + if ((hpte[0] & HPTE_V_VALID) == 0 || + ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn) || + ((flags & H_ANDCOND) && (hpte[0] & avpn) != 0)) { + hpte[0] &= ~HPTE_V_HVLOCK; + return H_NOT_FOUND; + } + if (atomic_read(&kvm->online_vcpus) == 1) + flags |= H_LOCAL; + vcpu->arch.gpr[4] = v = hpte[0] & ~HPTE_V_HVLOCK; + vcpu->arch.gpr[5] = r = hpte[1]; + rb = compute_tlbie_rb(v, r, pte_index); + hpte[0] = 0; + if (!(flags & H_LOCAL)) { + while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) + cpu_relax(); + asm volatile("ptesync" : : : "memory"); + asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" + : : "r" (rb), "r" (kvm->arch.lpid)); + asm volatile("ptesync" : : : "memory"); + kvm->arch.tlbie_lock = 0; + } else { + asm volatile("ptesync" : : : "memory"); + asm volatile("tlbiel %0" : : "r" (rb)); + asm volatile("ptesync" : : : "memory"); + } + return H_SUCCESS; +} + +long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + unsigned long *args = &vcpu->arch.gpr[4]; + unsigned long *hp, tlbrb[4]; + long int i, found; + long int n_inval = 0; + unsigned long flags, req, pte_index; + long int local = 0; + long int ret = H_SUCCESS; + + if (atomic_read(&kvm->online_vcpus) == 1) + local = 1; + for (i = 0; i < 4; ++i) { + pte_index = args[i * 2]; + flags = pte_index >> 56; + pte_index &= ((1ul << 56) - 1); + req = flags >> 6; + flags &= 3; + if (req == 3) + break; + if (req != 1 || flags == 3 || + pte_index >= (HPT_NPTEG << 3)) { + /* parameter error */ + args[i * 2] = ((0xa0 | flags) << 56) + pte_index; + ret = H_PARAMETER; + break; + } + hp = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); + while (!lock_hpte(hp, HPTE_V_HVLOCK)) + cpu_relax(); + found = 0; + if (hp[0] & HPTE_V_VALID) { + switch (flags & 3) { + case 0: /* absolute */ + found = 1; + break; + case 1: /* andcond */ + if (!(hp[0] & args[i * 2 + 1])) + found = 1; + break; + case 2: /* AVPN */ + if ((hp[0] & ~0x7fUL) == args[i * 2 + 1]) + found = 1; + break; + } + } + if (!found) { + hp[0] &= ~HPTE_V_HVLOCK; + args[i * 2] = ((0x90 | flags) << 56) + pte_index; + continue; + } + /* insert R and C bits from PTE */ + flags |= (hp[1] >> 5) & 0x0c; + args[i * 2] = ((0x80 | flags) << 56) + pte_index; + tlbrb[n_inval++] = compute_tlbie_rb(hp[0], hp[1], pte_index); + hp[0] = 0; + } + if (n_inval == 0) + return ret; + + if (!local) { + while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) + cpu_relax(); + asm volatile("ptesync" : : : "memory"); + for (i = 0; i < n_inval; ++i) + asm volatile(PPC_TLBIE(%1,%0) + : : "r" (tlbrb[i]), "r" (kvm->arch.lpid)); + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); + kvm->arch.tlbie_lock = 0; + } else { + asm volatile("ptesync" : : : "memory"); + for (i = 0; i < n_inval; ++i) + asm volatile("tlbiel %0" : : "r" (tlbrb[i])); + asm volatile("ptesync" : : : "memory"); + } + return ret; +} + +long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, + unsigned long pte_index, unsigned long avpn, + unsigned long va) +{ + struct kvm *kvm = vcpu->kvm; + unsigned long *hpte; + unsigned long v, r, rb; + + if (pte_index >= (HPT_NPTEG << 3)) + return H_PARAMETER; + hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); + while (!lock_hpte(hpte, HPTE_V_HVLOCK)) + cpu_relax(); + if ((hpte[0] & HPTE_V_VALID) == 0 || + ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn)) { + hpte[0] &= ~HPTE_V_HVLOCK; + return H_NOT_FOUND; + } + if (atomic_read(&kvm->online_vcpus) == 1) + flags |= H_LOCAL; + v = hpte[0]; + r = hpte[1] & ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | + HPTE_R_KEY_HI | HPTE_R_KEY_LO); + r |= (flags << 55) & HPTE_R_PP0; + r |= (flags << 48) & HPTE_R_KEY_HI; + r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO); + rb = compute_tlbie_rb(v, r, pte_index); + hpte[0] = v & ~HPTE_V_VALID; + if (!(flags & H_LOCAL)) { + while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) + cpu_relax(); + asm volatile("ptesync" : : : "memory"); + asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" + : : "r" (rb), "r" (kvm->arch.lpid)); + asm volatile("ptesync" : : : "memory"); + kvm->arch.tlbie_lock = 0; + } else { + asm volatile("ptesync" : : : "memory"); + asm volatile("tlbiel %0" : : "r" (rb)); + asm volatile("ptesync" : : : "memory"); + } + hpte[1] = r; + eieio(); + hpte[0] = v & ~HPTE_V_HVLOCK; + asm volatile("ptesync" : : : "memory"); + return H_SUCCESS; +} + +static unsigned long reverse_xlate(struct kvm *kvm, unsigned long realaddr) +{ + long int i; + unsigned long offset, rpn; + + offset = realaddr & (kvm->arch.ram_psize - 1); + rpn = (realaddr - offset) >> PAGE_SHIFT; + for (i = 0; i < kvm->arch.ram_npages; ++i) + if (rpn == kvm->arch.ram_pginfo[i].pfn) + return (i << PAGE_SHIFT) + offset; + return HPTE_R_RPN; /* all 1s in the RPN field */ +} + +long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, + unsigned long pte_index) +{ + struct kvm *kvm = vcpu->kvm; + unsigned long *hpte, r; + int i, n = 1; + + if (pte_index >= (HPT_NPTEG << 3)) + return H_PARAMETER; + if (flags & H_READ_4) { + pte_index &= ~3; + n = 4; + } + for (i = 0; i < n; ++i, ++pte_index) { + hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); + r = hpte[1]; + if ((flags & H_R_XLATE) && (hpte[0] & HPTE_V_VALID)) + r = reverse_xlate(kvm, r & HPTE_R_RPN) | + (r & ~HPTE_R_RPN); + vcpu->arch.gpr[4 + i * 2] = hpte[0]; + vcpu->arch.gpr[5 + i * 2] = r; + } + return H_SUCCESS; +} diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S new file mode 100644 index 000000000000..44d8829334ab --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -0,0 +1,1577 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * + * Derived from book3s_rmhandlers.S and other files, which are: + * + * Copyright SUSE Linux Products GmbH 2009 + * + * Authors: Alexander Graf <agraf@suse.de> + */ + +#include <asm/ppc_asm.h> +#include <asm/kvm_asm.h> +#include <asm/reg.h> +#include <asm/mmu.h> +#include <asm/page.h> +#include <asm/ptrace.h> +#include <asm/hvcall.h> +#include <asm/asm-offsets.h> +#include <asm/exception-64s.h> + +/***************************************************************************** + * * + * Real Mode handlers that need to be in the linear mapping * + * * + ****************************************************************************/ + + .globl kvmppc_skip_interrupt +kvmppc_skip_interrupt: + mfspr r13,SPRN_SRR0 + addi r13,r13,4 + mtspr SPRN_SRR0,r13 + GET_SCRATCH0(r13) + rfid + b . + + .globl kvmppc_skip_Hinterrupt +kvmppc_skip_Hinterrupt: + mfspr r13,SPRN_HSRR0 + addi r13,r13,4 + mtspr SPRN_HSRR0,r13 + GET_SCRATCH0(r13) + hrfid + b . + +/* + * Call kvmppc_hv_entry in real mode. + * Must be called with interrupts hard-disabled. + * + * Input Registers: + * + * LR = return address to continue at after eventually re-enabling MMU + */ +_GLOBAL(kvmppc_hv_entry_trampoline) + mfmsr r10 + LOAD_REG_ADDR(r5, kvmppc_hv_entry) + li r0,MSR_RI + andc r0,r10,r0 + li r6,MSR_IR | MSR_DR + andc r6,r10,r6 + mtmsrd r0,1 /* clear RI in MSR */ + mtsrr0 r5 + mtsrr1 r6 + RFI + +#define ULONG_SIZE 8 +#define VCPU_GPR(n) (VCPU_GPRS + (n * ULONG_SIZE)) + +/****************************************************************************** + * * + * Entry code * + * * + *****************************************************************************/ + +#define XICS_XIRR 4 +#define XICS_QIRR 0xc + +/* + * We come in here when wakened from nap mode on a secondary hw thread. + * Relocation is off and most register values are lost. + * r13 points to the PACA. + */ + .globl kvm_start_guest +kvm_start_guest: + ld r1,PACAEMERGSP(r13) + subi r1,r1,STACK_FRAME_OVERHEAD + ld r2,PACATOC(r13) + + /* were we napping due to cede? */ + lbz r0,HSTATE_NAPPING(r13) + cmpwi r0,0 + bne kvm_end_cede + + /* get vcpu pointer */ + ld r4, HSTATE_KVM_VCPU(r13) + + /* We got here with an IPI; clear it */ + ld r5, HSTATE_XICS_PHYS(r13) + li r0, 0xff + li r6, XICS_QIRR + li r7, XICS_XIRR + lwzcix r8, r5, r7 /* ack the interrupt */ + sync + stbcix r0, r5, r6 /* clear it */ + stwcix r8, r5, r7 /* EOI it */ + +.global kvmppc_hv_entry +kvmppc_hv_entry: + + /* Required state: + * + * R4 = vcpu pointer + * MSR = ~IR|DR + * R13 = PACA + * R1 = host R1 + * all other volatile GPRS = free + */ + mflr r0 + std r0, HSTATE_VMHANDLER(r13) + + ld r14, VCPU_GPR(r14)(r4) + ld r15, VCPU_GPR(r15)(r4) + ld r16, VCPU_GPR(r16)(r4) + ld r17, VCPU_GPR(r17)(r4) + ld r18, VCPU_GPR(r18)(r4) + ld r19, VCPU_GPR(r19)(r4) + ld r20, VCPU_GPR(r20)(r4) + ld r21, VCPU_GPR(r21)(r4) + ld r22, VCPU_GPR(r22)(r4) + ld r23, VCPU_GPR(r23)(r4) + ld r24, VCPU_GPR(r24)(r4) + ld r25, VCPU_GPR(r25)(r4) + ld r26, VCPU_GPR(r26)(r4) + ld r27, VCPU_GPR(r27)(r4) + ld r28, VCPU_GPR(r28)(r4) + ld r29, VCPU_GPR(r29)(r4) + ld r30, VCPU_GPR(r30)(r4) + ld r31, VCPU_GPR(r31)(r4) + + /* Load guest PMU registers */ + /* R4 is live here (vcpu pointer) */ + li r3, 1 + sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ + mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ + isync + lwz r3, VCPU_PMC(r4) /* always load up guest PMU registers */ + lwz r5, VCPU_PMC + 4(r4) /* to prevent information leak */ + lwz r6, VCPU_PMC + 8(r4) + lwz r7, VCPU_PMC + 12(r4) + lwz r8, VCPU_PMC + 16(r4) + lwz r9, VCPU_PMC + 20(r4) +BEGIN_FTR_SECTION + lwz r10, VCPU_PMC + 24(r4) + lwz r11, VCPU_PMC + 28(r4) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + mtspr SPRN_PMC1, r3 + mtspr SPRN_PMC2, r5 + mtspr SPRN_PMC3, r6 + mtspr SPRN_PMC4, r7 + mtspr SPRN_PMC5, r8 + mtspr SPRN_PMC6, r9 +BEGIN_FTR_SECTION + mtspr SPRN_PMC7, r10 + mtspr SPRN_PMC8, r11 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + ld r3, VCPU_MMCR(r4) + ld r5, VCPU_MMCR + 8(r4) + ld r6, VCPU_MMCR + 16(r4) + mtspr SPRN_MMCR1, r5 + mtspr SPRN_MMCRA, r6 + mtspr SPRN_MMCR0, r3 + isync + + /* Load up FP, VMX and VSX registers */ + bl kvmppc_load_fp + +BEGIN_FTR_SECTION + /* Switch DSCR to guest value */ + ld r5, VCPU_DSCR(r4) + mtspr SPRN_DSCR, r5 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + + /* + * Set the decrementer to the guest decrementer. + */ + ld r8,VCPU_DEC_EXPIRES(r4) + mftb r7 + subf r3,r7,r8 + mtspr SPRN_DEC,r3 + stw r3,VCPU_DEC(r4) + + ld r5, VCPU_SPRG0(r4) + ld r6, VCPU_SPRG1(r4) + ld r7, VCPU_SPRG2(r4) + ld r8, VCPU_SPRG3(r4) + mtspr SPRN_SPRG0, r5 + mtspr SPRN_SPRG1, r6 + mtspr SPRN_SPRG2, r7 + mtspr SPRN_SPRG3, r8 + + /* Save R1 in the PACA */ + std r1, HSTATE_HOST_R1(r13) + + /* Increment yield count if they have a VPA */ + ld r3, VCPU_VPA(r4) + cmpdi r3, 0 + beq 25f + lwz r5, LPPACA_YIELDCOUNT(r3) + addi r5, r5, 1 + stw r5, LPPACA_YIELDCOUNT(r3) +25: + /* Load up DAR and DSISR */ + ld r5, VCPU_DAR(r4) + lwz r6, VCPU_DSISR(r4) + mtspr SPRN_DAR, r5 + mtspr SPRN_DSISR, r6 + + /* Set partition DABR */ + li r5,3 + ld r6,VCPU_DABR(r4) + mtspr SPRN_DABRX,r5 + mtspr SPRN_DABR,r6 + +BEGIN_FTR_SECTION + /* Restore AMR and UAMOR, set AMOR to all 1s */ + ld r5,VCPU_AMR(r4) + ld r6,VCPU_UAMOR(r4) + li r7,-1 + mtspr SPRN_AMR,r5 + mtspr SPRN_UAMOR,r6 + mtspr SPRN_AMOR,r7 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + + /* Clear out SLB */ + li r6,0 + slbmte r6,r6 + slbia + ptesync + +BEGIN_FTR_SECTION + b 30f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + /* + * POWER7 host -> guest partition switch code. + * We don't have to lock against concurrent tlbies, + * but we do have to coordinate across hardware threads. + */ + /* Increment entry count iff exit count is zero. */ + ld r5,HSTATE_KVM_VCORE(r13) + addi r9,r5,VCORE_ENTRY_EXIT +21: lwarx r3,0,r9 + cmpwi r3,0x100 /* any threads starting to exit? */ + bge secondary_too_late /* if so we're too late to the party */ + addi r3,r3,1 + stwcx. r3,0,r9 + bne 21b + + /* Primary thread switches to guest partition. */ + ld r9,VCPU_KVM(r4) /* pointer to struct kvm */ + lwz r6,VCPU_PTID(r4) + cmpwi r6,0 + bne 20f + ld r6,KVM_SDR1(r9) + lwz r7,KVM_LPID(r9) + li r0,LPID_RSVD /* switch to reserved LPID */ + mtspr SPRN_LPID,r0 + ptesync + mtspr SPRN_SDR1,r6 /* switch to partition page table */ + mtspr SPRN_LPID,r7 + isync + li r0,1 + stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */ + b 10f + + /* Secondary threads wait for primary to have done partition switch */ +20: lbz r0,VCORE_IN_GUEST(r5) + cmpwi r0,0 + beq 20b + + /* Set LPCR and RMOR. */ +10: ld r8,KVM_LPCR(r9) + mtspr SPRN_LPCR,r8 + ld r8,KVM_RMOR(r9) + mtspr SPRN_RMOR,r8 + isync + + /* Check if HDEC expires soon */ + mfspr r3,SPRN_HDEC + cmpwi r3,10 + li r12,BOOK3S_INTERRUPT_HV_DECREMENTER + mr r9,r4 + blt hdec_soon + + /* + * Invalidate the TLB if we could possibly have stale TLB + * entries for this partition on this core due to the use + * of tlbiel. + * XXX maybe only need this on primary thread? + */ + ld r9,VCPU_KVM(r4) /* pointer to struct kvm */ + lwz r5,VCPU_VCPUID(r4) + lhz r6,PACAPACAINDEX(r13) + rldimi r6,r5,0,62 /* XXX map as if threads 1:1 p:v */ + lhz r8,VCPU_LAST_CPU(r4) + sldi r7,r6,1 /* see if this is the same vcpu */ + add r7,r7,r9 /* as last ran on this pcpu */ + lhz r0,KVM_LAST_VCPU(r7) + cmpw r6,r8 /* on the same cpu core as last time? */ + bne 3f + cmpw r0,r5 /* same vcpu as this core last ran? */ + beq 1f +3: sth r6,VCPU_LAST_CPU(r4) /* if not, invalidate partition TLB */ + sth r5,KVM_LAST_VCPU(r7) + li r6,128 + mtctr r6 + li r7,0x800 /* IS field = 0b10 */ + ptesync +2: tlbiel r7 + addi r7,r7,0x1000 + bdnz 2b + ptesync +1: + + /* Save purr/spurr */ + mfspr r5,SPRN_PURR + mfspr r6,SPRN_SPURR + std r5,HSTATE_PURR(r13) + std r6,HSTATE_SPURR(r13) + ld r7,VCPU_PURR(r4) + ld r8,VCPU_SPURR(r4) + mtspr SPRN_PURR,r7 + mtspr SPRN_SPURR,r8 + b 31f + + /* + * PPC970 host -> guest partition switch code. + * We have to lock against concurrent tlbies, + * using native_tlbie_lock to lock against host tlbies + * and kvm->arch.tlbie_lock to lock against guest tlbies. + * We also have to invalidate the TLB since its + * entries aren't tagged with the LPID. + */ +30: ld r9,VCPU_KVM(r4) /* pointer to struct kvm */ + + /* first take native_tlbie_lock */ + .section ".toc","aw" +toc_tlbie_lock: + .tc native_tlbie_lock[TC],native_tlbie_lock + .previous + ld r3,toc_tlbie_lock@toc(2) + lwz r8,PACA_LOCK_TOKEN(r13) +24: lwarx r0,0,r3 + cmpwi r0,0 + bne 24b + stwcx. r8,0,r3 + bne 24b + isync + + ld r7,KVM_LPCR(r9) /* use kvm->arch.lpcr to store HID4 */ + li r0,0x18f + rotldi r0,r0,HID4_LPID5_SH /* all lpid bits in HID4 = 1 */ + or r0,r7,r0 + ptesync + sync + mtspr SPRN_HID4,r0 /* switch to reserved LPID */ + isync + li r0,0 + stw r0,0(r3) /* drop native_tlbie_lock */ + + /* invalidate the whole TLB */ + li r0,256 + mtctr r0 + li r6,0 +25: tlbiel r6 + addi r6,r6,0x1000 + bdnz 25b + ptesync + + /* Take the guest's tlbie_lock */ + addi r3,r9,KVM_TLBIE_LOCK +24: lwarx r0,0,r3 + cmpwi r0,0 + bne 24b + stwcx. r8,0,r3 + bne 24b + isync + ld r6,KVM_SDR1(r9) + mtspr SPRN_SDR1,r6 /* switch to partition page table */ + + /* Set up HID4 with the guest's LPID etc. */ + sync + mtspr SPRN_HID4,r7 + isync + + /* drop the guest's tlbie_lock */ + li r0,0 + stw r0,0(r3) + + /* Check if HDEC expires soon */ + mfspr r3,SPRN_HDEC + cmpwi r3,10 + li r12,BOOK3S_INTERRUPT_HV_DECREMENTER + mr r9,r4 + blt hdec_soon + + /* Enable HDEC interrupts */ + mfspr r0,SPRN_HID0 + li r3,1 + rldimi r0,r3, HID0_HDICE_SH, 64-HID0_HDICE_SH-1 + sync + mtspr SPRN_HID0,r0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + + /* Load up guest SLB entries */ +31: lwz r5,VCPU_SLB_MAX(r4) + cmpwi r5,0 + beq 9f + mtctr r5 + addi r6,r4,VCPU_SLB +1: ld r8,VCPU_SLB_E(r6) + ld r9,VCPU_SLB_V(r6) + slbmte r9,r8 + addi r6,r6,VCPU_SLB_SIZE + bdnz 1b +9: + + /* Restore state of CTRL run bit; assume 1 on entry */ + lwz r5,VCPU_CTRL(r4) + andi. r5,r5,1 + bne 4f + mfspr r6,SPRN_CTRLF + clrrdi r6,r6,1 + mtspr SPRN_CTRLT,r6 +4: + ld r6, VCPU_CTR(r4) + lwz r7, VCPU_XER(r4) + + mtctr r6 + mtxer r7 + +kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */ + ld r6, VCPU_SRR0(r4) + ld r7, VCPU_SRR1(r4) + ld r10, VCPU_PC(r4) + ld r11, VCPU_MSR(r4) /* r11 = vcpu->arch.msr & ~MSR_HV */ + + rldicl r11, r11, 63 - MSR_HV_LG, 1 + rotldi r11, r11, 1 + MSR_HV_LG + ori r11, r11, MSR_ME + + /* Check if we can deliver an external or decrementer interrupt now */ + ld r0,VCPU_PENDING_EXC(r4) + li r8,(1 << BOOK3S_IRQPRIO_EXTERNAL) + oris r8,r8,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h + and r0,r0,r8 + cmpdi cr1,r0,0 + andi. r0,r11,MSR_EE + beq cr1,11f +BEGIN_FTR_SECTION + mfspr r8,SPRN_LPCR + ori r8,r8,LPCR_MER + mtspr SPRN_LPCR,r8 + isync +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + beq 5f + li r0,BOOK3S_INTERRUPT_EXTERNAL +12: mr r6,r10 + mr r10,r0 + mr r7,r11 + li r11,(MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ + rotldi r11,r11,63 + b 5f +11: beq 5f + mfspr r0,SPRN_DEC + cmpwi r0,0 + li r0,BOOK3S_INTERRUPT_DECREMENTER + blt 12b + + /* Move SRR0 and SRR1 into the respective regs */ +5: mtspr SPRN_SRR0, r6 + mtspr SPRN_SRR1, r7 + li r0,0 + stb r0,VCPU_CEDED(r4) /* cancel cede */ + +fast_guest_return: + mtspr SPRN_HSRR0,r10 + mtspr SPRN_HSRR1,r11 + + /* Activate guest mode, so faults get handled by KVM */ + li r9, KVM_GUEST_MODE_GUEST + stb r9, HSTATE_IN_GUEST(r13) + + /* Enter guest */ + + ld r5, VCPU_LR(r4) + lwz r6, VCPU_CR(r4) + mtlr r5 + mtcr r6 + + ld r0, VCPU_GPR(r0)(r4) + ld r1, VCPU_GPR(r1)(r4) + ld r2, VCPU_GPR(r2)(r4) + ld r3, VCPU_GPR(r3)(r4) + ld r5, VCPU_GPR(r5)(r4) + ld r6, VCPU_GPR(r6)(r4) + ld r7, VCPU_GPR(r7)(r4) + ld r8, VCPU_GPR(r8)(r4) + ld r9, VCPU_GPR(r9)(r4) + ld r10, VCPU_GPR(r10)(r4) + ld r11, VCPU_GPR(r11)(r4) + ld r12, VCPU_GPR(r12)(r4) + ld r13, VCPU_GPR(r13)(r4) + + ld r4, VCPU_GPR(r4)(r4) + + hrfid + b . + +/****************************************************************************** + * * + * Exit code * + * * + *****************************************************************************/ + +/* + * We come here from the first-level interrupt handlers. + */ + .globl kvmppc_interrupt +kvmppc_interrupt: + /* + * Register contents: + * R12 = interrupt vector + * R13 = PACA + * guest CR, R12 saved in shadow VCPU SCRATCH1/0 + * guest R13 saved in SPRN_SCRATCH0 + */ + /* abuse host_r2 as third scratch area; we get r2 from PACATOC(r13) */ + std r9, HSTATE_HOST_R2(r13) + ld r9, HSTATE_KVM_VCPU(r13) + + /* Save registers */ + + std r0, VCPU_GPR(r0)(r9) + std r1, VCPU_GPR(r1)(r9) + std r2, VCPU_GPR(r2)(r9) + std r3, VCPU_GPR(r3)(r9) + std r4, VCPU_GPR(r4)(r9) + std r5, VCPU_GPR(r5)(r9) + std r6, VCPU_GPR(r6)(r9) + std r7, VCPU_GPR(r7)(r9) + std r8, VCPU_GPR(r8)(r9) + ld r0, HSTATE_HOST_R2(r13) + std r0, VCPU_GPR(r9)(r9) + std r10, VCPU_GPR(r10)(r9) + std r11, VCPU_GPR(r11)(r9) + ld r3, HSTATE_SCRATCH0(r13) + lwz r4, HSTATE_SCRATCH1(r13) + std r3, VCPU_GPR(r12)(r9) + stw r4, VCPU_CR(r9) + + /* Restore R1/R2 so we can handle faults */ + ld r1, HSTATE_HOST_R1(r13) + ld r2, PACATOC(r13) + + mfspr r10, SPRN_SRR0 + mfspr r11, SPRN_SRR1 + std r10, VCPU_SRR0(r9) + std r11, VCPU_SRR1(r9) + andi. r0, r12, 2 /* need to read HSRR0/1? */ + beq 1f + mfspr r10, SPRN_HSRR0 + mfspr r11, SPRN_HSRR1 + clrrdi r12, r12, 2 +1: std r10, VCPU_PC(r9) + std r11, VCPU_MSR(r9) + + GET_SCRATCH0(r3) + mflr r4 + std r3, VCPU_GPR(r13)(r9) + std r4, VCPU_LR(r9) + + /* Unset guest mode */ + li r0, KVM_GUEST_MODE_NONE + stb r0, HSTATE_IN_GUEST(r13) + + stw r12,VCPU_TRAP(r9) + + /* See if this is a leftover HDEC interrupt */ + cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER + bne 2f + mfspr r3,SPRN_HDEC + cmpwi r3,0 + bge ignore_hdec +2: + /* See if this is something we can handle in real mode */ + cmpwi r12,BOOK3S_INTERRUPT_SYSCALL + beq hcall_try_real_mode + + /* Check for mediated interrupts (could be done earlier really ...) */ +BEGIN_FTR_SECTION + cmpwi r12,BOOK3S_INTERRUPT_EXTERNAL + bne+ 1f + andi. r0,r11,MSR_EE + beq 1f + mfspr r5,SPRN_LPCR + andi. r0,r5,LPCR_MER + bne bounce_ext_interrupt +1: +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + +hcall_real_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ + /* Save DEC */ + mfspr r5,SPRN_DEC + mftb r6 + extsw r5,r5 + add r5,r5,r6 + std r5,VCPU_DEC_EXPIRES(r9) + + /* Save HEIR (HV emulation assist reg) in last_inst + if this is an HEI (HV emulation interrupt, e40) */ + li r3,-1 +BEGIN_FTR_SECTION + cmpwi r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST + bne 11f + mfspr r3,SPRN_HEIR +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) +11: stw r3,VCPU_LAST_INST(r9) + + /* Save more register state */ + mfxer r5 + mfdar r6 + mfdsisr r7 + mfctr r8 + + stw r5, VCPU_XER(r9) + std r6, VCPU_DAR(r9) + stw r7, VCPU_DSISR(r9) + std r8, VCPU_CTR(r9) + /* grab HDAR & HDSISR if HV data storage interrupt (HDSI) */ +BEGIN_FTR_SECTION + cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE + beq 6f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) +7: std r6, VCPU_FAULT_DAR(r9) + stw r7, VCPU_FAULT_DSISR(r9) + + /* Save guest CTRL register, set runlatch to 1 */ + mfspr r6,SPRN_CTRLF + stw r6,VCPU_CTRL(r9) + andi. r0,r6,1 + bne 4f + ori r6,r6,1 + mtspr SPRN_CTRLT,r6 +4: + /* Read the guest SLB and save it away */ + lwz r0,VCPU_SLB_NR(r9) /* number of entries in SLB */ + mtctr r0 + li r6,0 + addi r7,r9,VCPU_SLB + li r5,0 +1: slbmfee r8,r6 + andis. r0,r8,SLB_ESID_V@h + beq 2f + add r8,r8,r6 /* put index in */ + slbmfev r3,r6 + std r8,VCPU_SLB_E(r7) + std r3,VCPU_SLB_V(r7) + addi r7,r7,VCPU_SLB_SIZE + addi r5,r5,1 +2: addi r6,r6,1 + bdnz 1b + stw r5,VCPU_SLB_MAX(r9) + + /* + * Save the guest PURR/SPURR + */ +BEGIN_FTR_SECTION + mfspr r5,SPRN_PURR + mfspr r6,SPRN_SPURR + ld r7,VCPU_PURR(r9) + ld r8,VCPU_SPURR(r9) + std r5,VCPU_PURR(r9) + std r6,VCPU_SPURR(r9) + subf r5,r7,r5 + subf r6,r8,r6 + + /* + * Restore host PURR/SPURR and add guest times + * so that the time in the guest gets accounted. + */ + ld r3,HSTATE_PURR(r13) + ld r4,HSTATE_SPURR(r13) + add r3,r3,r5 + add r4,r4,r6 + mtspr SPRN_PURR,r3 + mtspr SPRN_SPURR,r4 +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_201) + + /* Clear out SLB */ + li r5,0 + slbmte r5,r5 + slbia + ptesync + +hdec_soon: /* r9 = vcpu, r12 = trap, r13 = paca */ +BEGIN_FTR_SECTION + b 32f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + /* + * POWER7 guest -> host partition switch code. + * We don't have to lock against tlbies but we do + * have to coordinate the hardware threads. + */ + /* Increment the threads-exiting-guest count in the 0xff00 + bits of vcore->entry_exit_count */ + lwsync + ld r5,HSTATE_KVM_VCORE(r13) + addi r6,r5,VCORE_ENTRY_EXIT +41: lwarx r3,0,r6 + addi r0,r3,0x100 + stwcx. r0,0,r6 + bne 41b + lwsync + + /* + * At this point we have an interrupt that we have to pass + * up to the kernel or qemu; we can't handle it in real mode. + * Thus we have to do a partition switch, so we have to + * collect the other threads, if we are the first thread + * to take an interrupt. To do this, we set the HDEC to 0, + * which causes an HDEC interrupt in all threads within 2ns + * because the HDEC register is shared between all 4 threads. + * However, we don't need to bother if this is an HDEC + * interrupt, since the other threads will already be on their + * way here in that case. + */ + cmpwi r3,0x100 /* Are we the first here? */ + bge 43f + cmpwi r3,1 /* Are any other threads in the guest? */ + ble 43f + cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER + beq 40f + li r0,0 + mtspr SPRN_HDEC,r0 +40: + /* + * Send an IPI to any napping threads, since an HDEC interrupt + * doesn't wake CPUs up from nap. + */ + lwz r3,VCORE_NAPPING_THREADS(r5) + lwz r4,VCPU_PTID(r9) + li r0,1 + sldi r0,r0,r4 + andc. r3,r3,r0 /* no sense IPI'ing ourselves */ + beq 43f + mulli r4,r4,PACA_SIZE /* get paca for thread 0 */ + subf r6,r4,r13 +42: andi. r0,r3,1 + beq 44f + ld r8,HSTATE_XICS_PHYS(r6) /* get thread's XICS reg addr */ + li r0,IPI_PRIORITY + li r7,XICS_QIRR + stbcix r0,r7,r8 /* trigger the IPI */ +44: srdi. r3,r3,1 + addi r6,r6,PACA_SIZE + bne 42b + + /* Secondary threads wait for primary to do partition switch */ +43: ld r4,VCPU_KVM(r9) /* pointer to struct kvm */ + ld r5,HSTATE_KVM_VCORE(r13) + lwz r3,VCPU_PTID(r9) + cmpwi r3,0 + beq 15f + HMT_LOW +13: lbz r3,VCORE_IN_GUEST(r5) + cmpwi r3,0 + bne 13b + HMT_MEDIUM + b 16f + + /* Primary thread waits for all the secondaries to exit guest */ +15: lwz r3,VCORE_ENTRY_EXIT(r5) + srwi r0,r3,8 + clrldi r3,r3,56 + cmpw r3,r0 + bne 15b + isync + + /* Primary thread switches back to host partition */ + ld r6,KVM_HOST_SDR1(r4) + lwz r7,KVM_HOST_LPID(r4) + li r8,LPID_RSVD /* switch to reserved LPID */ + mtspr SPRN_LPID,r8 + ptesync + mtspr SPRN_SDR1,r6 /* switch to partition page table */ + mtspr SPRN_LPID,r7 + isync + li r0,0 + stb r0,VCORE_IN_GUEST(r5) + lis r8,0x7fff /* MAX_INT@h */ + mtspr SPRN_HDEC,r8 + +16: ld r8,KVM_HOST_LPCR(r4) + mtspr SPRN_LPCR,r8 + isync + b 33f + + /* + * PPC970 guest -> host partition switch code. + * We have to lock against concurrent tlbies, and + * we have to flush the whole TLB. + */ +32: ld r4,VCPU_KVM(r9) /* pointer to struct kvm */ + + /* Take the guest's tlbie_lock */ + lwz r8,PACA_LOCK_TOKEN(r13) + addi r3,r4,KVM_TLBIE_LOCK +24: lwarx r0,0,r3 + cmpwi r0,0 + bne 24b + stwcx. r8,0,r3 + bne 24b + isync + + ld r7,KVM_HOST_LPCR(r4) /* use kvm->arch.host_lpcr for HID4 */ + li r0,0x18f + rotldi r0,r0,HID4_LPID5_SH /* all lpid bits in HID4 = 1 */ + or r0,r7,r0 + ptesync + sync + mtspr SPRN_HID4,r0 /* switch to reserved LPID */ + isync + li r0,0 + stw r0,0(r3) /* drop guest tlbie_lock */ + + /* invalidate the whole TLB */ + li r0,256 + mtctr r0 + li r6,0 +25: tlbiel r6 + addi r6,r6,0x1000 + bdnz 25b + ptesync + + /* take native_tlbie_lock */ + ld r3,toc_tlbie_lock@toc(2) +24: lwarx r0,0,r3 + cmpwi r0,0 + bne 24b + stwcx. r8,0,r3 + bne 24b + isync + + ld r6,KVM_HOST_SDR1(r4) + mtspr SPRN_SDR1,r6 /* switch to host page table */ + + /* Set up host HID4 value */ + sync + mtspr SPRN_HID4,r7 + isync + li r0,0 + stw r0,0(r3) /* drop native_tlbie_lock */ + + lis r8,0x7fff /* MAX_INT@h */ + mtspr SPRN_HDEC,r8 + + /* Disable HDEC interrupts */ + mfspr r0,SPRN_HID0 + li r3,0 + rldimi r0,r3, HID0_HDICE_SH, 64-HID0_HDICE_SH-1 + sync + mtspr SPRN_HID0,r0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + + /* load host SLB entries */ +33: ld r8,PACA_SLBSHADOWPTR(r13) + + .rept SLB_NUM_BOLTED + ld r5,SLBSHADOW_SAVEAREA(r8) + ld r6,SLBSHADOW_SAVEAREA+8(r8) + andis. r7,r5,SLB_ESID_V@h + beq 1f + slbmte r6,r5 +1: addi r8,r8,16 + .endr + + /* Save and reset AMR and UAMOR before turning on the MMU */ +BEGIN_FTR_SECTION + mfspr r5,SPRN_AMR + mfspr r6,SPRN_UAMOR + std r5,VCPU_AMR(r9) + std r6,VCPU_UAMOR(r9) + li r6,0 + mtspr SPRN_AMR,r6 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + + /* Restore host DABR and DABRX */ + ld r5,HSTATE_DABR(r13) + li r6,7 + mtspr SPRN_DABR,r5 + mtspr SPRN_DABRX,r6 + + /* Switch DSCR back to host value */ +BEGIN_FTR_SECTION + mfspr r8, SPRN_DSCR + ld r7, HSTATE_DSCR(r13) + std r8, VCPU_DSCR(r7) + mtspr SPRN_DSCR, r7 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + + /* Save non-volatile GPRs */ + std r14, VCPU_GPR(r14)(r9) + std r15, VCPU_GPR(r15)(r9) + std r16, VCPU_GPR(r16)(r9) + std r17, VCPU_GPR(r17)(r9) + std r18, VCPU_GPR(r18)(r9) + std r19, VCPU_GPR(r19)(r9) + std r20, VCPU_GPR(r20)(r9) + std r21, VCPU_GPR(r21)(r9) + std r22, VCPU_GPR(r22)(r9) + std r23, VCPU_GPR(r23)(r9) + std r24, VCPU_GPR(r24)(r9) + std r25, VCPU_GPR(r25)(r9) + std r26, VCPU_GPR(r26)(r9) + std r27, VCPU_GPR(r27)(r9) + std r28, VCPU_GPR(r28)(r9) + std r29, VCPU_GPR(r29)(r9) + std r30, VCPU_GPR(r30)(r9) + std r31, VCPU_GPR(r31)(r9) + + /* Save SPRGs */ + mfspr r3, SPRN_SPRG0 + mfspr r4, SPRN_SPRG1 + mfspr r5, SPRN_SPRG2 + mfspr r6, SPRN_SPRG3 + std r3, VCPU_SPRG0(r9) + std r4, VCPU_SPRG1(r9) + std r5, VCPU_SPRG2(r9) + std r6, VCPU_SPRG3(r9) + + /* Increment yield count if they have a VPA */ + ld r8, VCPU_VPA(r9) /* do they have a VPA? */ + cmpdi r8, 0 + beq 25f + lwz r3, LPPACA_YIELDCOUNT(r8) + addi r3, r3, 1 + stw r3, LPPACA_YIELDCOUNT(r8) +25: + /* Save PMU registers if requested */ + /* r8 and cr0.eq are live here */ + li r3, 1 + sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ + mfspr r4, SPRN_MMCR0 /* save MMCR0 */ + mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ + isync + beq 21f /* if no VPA, save PMU stuff anyway */ + lbz r7, LPPACA_PMCINUSE(r8) + cmpwi r7, 0 /* did they ask for PMU stuff to be saved? */ + bne 21f + std r3, VCPU_MMCR(r9) /* if not, set saved MMCR0 to FC */ + b 22f +21: mfspr r5, SPRN_MMCR1 + mfspr r6, SPRN_MMCRA + std r4, VCPU_MMCR(r9) + std r5, VCPU_MMCR + 8(r9) + std r6, VCPU_MMCR + 16(r9) + mfspr r3, SPRN_PMC1 + mfspr r4, SPRN_PMC2 + mfspr r5, SPRN_PMC3 + mfspr r6, SPRN_PMC4 + mfspr r7, SPRN_PMC5 + mfspr r8, SPRN_PMC6 +BEGIN_FTR_SECTION + mfspr r10, SPRN_PMC7 + mfspr r11, SPRN_PMC8 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + stw r3, VCPU_PMC(r9) + stw r4, VCPU_PMC + 4(r9) + stw r5, VCPU_PMC + 8(r9) + stw r6, VCPU_PMC + 12(r9) + stw r7, VCPU_PMC + 16(r9) + stw r8, VCPU_PMC + 20(r9) +BEGIN_FTR_SECTION + stw r10, VCPU_PMC + 24(r9) + stw r11, VCPU_PMC + 28(r9) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) +22: + /* save FP state */ + mr r3, r9 + bl .kvmppc_save_fp + + /* Secondary threads go off to take a nap on POWER7 */ +BEGIN_FTR_SECTION + lwz r0,VCPU_PTID(r3) + cmpwi r0,0 + bne secondary_nap +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + + /* + * Reload DEC. HDEC interrupts were disabled when + * we reloaded the host's LPCR value. + */ + ld r3, HSTATE_DECEXP(r13) + mftb r4 + subf r4, r4, r3 + mtspr SPRN_DEC, r4 + + /* Reload the host's PMU registers */ + ld r3, PACALPPACAPTR(r13) /* is the host using the PMU? */ + lbz r4, LPPACA_PMCINUSE(r3) + cmpwi r4, 0 + beq 23f /* skip if not */ + lwz r3, HSTATE_PMC(r13) + lwz r4, HSTATE_PMC + 4(r13) + lwz r5, HSTATE_PMC + 8(r13) + lwz r6, HSTATE_PMC + 12(r13) + lwz r8, HSTATE_PMC + 16(r13) + lwz r9, HSTATE_PMC + 20(r13) +BEGIN_FTR_SECTION + lwz r10, HSTATE_PMC + 24(r13) + lwz r11, HSTATE_PMC + 28(r13) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + mtspr SPRN_PMC1, r3 + mtspr SPRN_PMC2, r4 + mtspr SPRN_PMC3, r5 + mtspr SPRN_PMC4, r6 + mtspr SPRN_PMC5, r8 + mtspr SPRN_PMC6, r9 +BEGIN_FTR_SECTION + mtspr SPRN_PMC7, r10 + mtspr SPRN_PMC8, r11 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + ld r3, HSTATE_MMCR(r13) + ld r4, HSTATE_MMCR + 8(r13) + ld r5, HSTATE_MMCR + 16(r13) + mtspr SPRN_MMCR1, r4 + mtspr SPRN_MMCRA, r5 + mtspr SPRN_MMCR0, r3 + isync +23: + /* + * For external and machine check interrupts, we need + * to call the Linux handler to process the interrupt. + * We do that by jumping to the interrupt vector address + * which we have in r12. The [h]rfid at the end of the + * handler will return to the book3s_hv_interrupts.S code. + * For other interrupts we do the rfid to get back + * to the book3s_interrupts.S code here. + */ + ld r8, HSTATE_VMHANDLER(r13) + ld r7, HSTATE_HOST_MSR(r13) + + cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL + beq 11f + cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK + + /* RFI into the highmem handler, or branch to interrupt handler */ +12: mfmsr r6 + mtctr r12 + li r0, MSR_RI + andc r6, r6, r0 + mtmsrd r6, 1 /* Clear RI in MSR */ + mtsrr0 r8 + mtsrr1 r7 + beqctr + RFI + +11: +BEGIN_FTR_SECTION + b 12b +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + mtspr SPRN_HSRR0, r8 + mtspr SPRN_HSRR1, r7 + ba 0x500 + +6: mfspr r6,SPRN_HDAR + mfspr r7,SPRN_HDSISR + b 7b + +/* + * Try to handle an hcall in real mode. + * Returns to the guest if we handle it, or continues on up to + * the kernel if we can't (i.e. if we don't have a handler for + * it, or if the handler returns H_TOO_HARD). + */ + .globl hcall_try_real_mode +hcall_try_real_mode: + ld r3,VCPU_GPR(r3)(r9) + andi. r0,r11,MSR_PR + bne hcall_real_cont + clrrdi r3,r3,2 + cmpldi r3,hcall_real_table_end - hcall_real_table + bge hcall_real_cont + LOAD_REG_ADDR(r4, hcall_real_table) + lwzx r3,r3,r4 + cmpwi r3,0 + beq hcall_real_cont + add r3,r3,r4 + mtctr r3 + mr r3,r9 /* get vcpu pointer */ + ld r4,VCPU_GPR(r4)(r9) + bctrl + cmpdi r3,H_TOO_HARD + beq hcall_real_fallback + ld r4,HSTATE_KVM_VCPU(r13) + std r3,VCPU_GPR(r3)(r4) + ld r10,VCPU_PC(r4) + ld r11,VCPU_MSR(r4) + b fast_guest_return + + /* We've attempted a real mode hcall, but it's punted it back + * to userspace. We need to restore some clobbered volatiles + * before resuming the pass-it-to-qemu path */ +hcall_real_fallback: + li r12,BOOK3S_INTERRUPT_SYSCALL + ld r9, HSTATE_KVM_VCPU(r13) + + b hcall_real_cont + + .globl hcall_real_table +hcall_real_table: + .long 0 /* 0 - unused */ + .long .kvmppc_h_remove - hcall_real_table + .long .kvmppc_h_enter - hcall_real_table + .long .kvmppc_h_read - hcall_real_table + .long 0 /* 0x10 - H_CLEAR_MOD */ + .long 0 /* 0x14 - H_CLEAR_REF */ + .long .kvmppc_h_protect - hcall_real_table + .long 0 /* 0x1c - H_GET_TCE */ + .long .kvmppc_h_put_tce - hcall_real_table + .long 0 /* 0x24 - H_SET_SPRG0 */ + .long .kvmppc_h_set_dabr - hcall_real_table + .long 0 /* 0x2c */ + .long 0 /* 0x30 */ + .long 0 /* 0x34 */ + .long 0 /* 0x38 */ + .long 0 /* 0x3c */ + .long 0 /* 0x40 */ + .long 0 /* 0x44 */ + .long 0 /* 0x48 */ + .long 0 /* 0x4c */ + .long 0 /* 0x50 */ + .long 0 /* 0x54 */ + .long 0 /* 0x58 */ + .long 0 /* 0x5c */ + .long 0 /* 0x60 */ + .long 0 /* 0x64 */ + .long 0 /* 0x68 */ + .long 0 /* 0x6c */ + .long 0 /* 0x70 */ + .long 0 /* 0x74 */ + .long 0 /* 0x78 */ + .long 0 /* 0x7c */ + .long 0 /* 0x80 */ + .long 0 /* 0x84 */ + .long 0 /* 0x88 */ + .long 0 /* 0x8c */ + .long 0 /* 0x90 */ + .long 0 /* 0x94 */ + .long 0 /* 0x98 */ + .long 0 /* 0x9c */ + .long 0 /* 0xa0 */ + .long 0 /* 0xa4 */ + .long 0 /* 0xa8 */ + .long 0 /* 0xac */ + .long 0 /* 0xb0 */ + .long 0 /* 0xb4 */ + .long 0 /* 0xb8 */ + .long 0 /* 0xbc */ + .long 0 /* 0xc0 */ + .long 0 /* 0xc4 */ + .long 0 /* 0xc8 */ + .long 0 /* 0xcc */ + .long 0 /* 0xd0 */ + .long 0 /* 0xd4 */ + .long 0 /* 0xd8 */ + .long 0 /* 0xdc */ + .long .kvmppc_h_cede - hcall_real_table + .long 0 /* 0xe4 */ + .long 0 /* 0xe8 */ + .long 0 /* 0xec */ + .long 0 /* 0xf0 */ + .long 0 /* 0xf4 */ + .long 0 /* 0xf8 */ + .long 0 /* 0xfc */ + .long 0 /* 0x100 */ + .long 0 /* 0x104 */ + .long 0 /* 0x108 */ + .long 0 /* 0x10c */ + .long 0 /* 0x110 */ + .long 0 /* 0x114 */ + .long 0 /* 0x118 */ + .long 0 /* 0x11c */ + .long 0 /* 0x120 */ + .long .kvmppc_h_bulk_remove - hcall_real_table +hcall_real_table_end: + +ignore_hdec: + mr r4,r9 + b fast_guest_return + +bounce_ext_interrupt: + mr r4,r9 + mtspr SPRN_SRR0,r10 + mtspr SPRN_SRR1,r11 + li r10,BOOK3S_INTERRUPT_EXTERNAL + li r11,(MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ + rotldi r11,r11,63 + b fast_guest_return + +_GLOBAL(kvmppc_h_set_dabr) + std r4,VCPU_DABR(r3) + mtspr SPRN_DABR,r4 + li r3,0 + blr + +_GLOBAL(kvmppc_h_cede) + ori r11,r11,MSR_EE + std r11,VCPU_MSR(r3) + li r0,1 + stb r0,VCPU_CEDED(r3) + sync /* order setting ceded vs. testing prodded */ + lbz r5,VCPU_PRODDED(r3) + cmpwi r5,0 + bne 1f + li r0,0 /* set trap to 0 to say hcall is handled */ + stw r0,VCPU_TRAP(r3) + li r0,H_SUCCESS + std r0,VCPU_GPR(r3)(r3) +BEGIN_FTR_SECTION + b 2f /* just send it up to host on 970 */ +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206) + + /* + * Set our bit in the bitmask of napping threads unless all the + * other threads are already napping, in which case we send this + * up to the host. + */ + ld r5,HSTATE_KVM_VCORE(r13) + lwz r6,VCPU_PTID(r3) + lwz r8,VCORE_ENTRY_EXIT(r5) + clrldi r8,r8,56 + li r0,1 + sld r0,r0,r6 + addi r6,r5,VCORE_NAPPING_THREADS +31: lwarx r4,0,r6 + or r4,r4,r0 + PPC_POPCNTW(r7,r4) + cmpw r7,r8 + bge 2f + stwcx. r4,0,r6 + bne 31b + li r0,1 + stb r0,HSTATE_NAPPING(r13) + /* order napping_threads update vs testing entry_exit_count */ + lwsync + mr r4,r3 + lwz r7,VCORE_ENTRY_EXIT(r5) + cmpwi r7,0x100 + bge 33f /* another thread already exiting */ + +/* + * Although not specifically required by the architecture, POWER7 + * preserves the following registers in nap mode, even if an SMT mode + * switch occurs: SLB entries, PURR, SPURR, AMOR, UAMOR, AMR, SPRG0-3, + * DAR, DSISR, DABR, DABRX, DSCR, PMCx, MMCRx, SIAR, SDAR. + */ + /* Save non-volatile GPRs */ + std r14, VCPU_GPR(r14)(r3) + std r15, VCPU_GPR(r15)(r3) + std r16, VCPU_GPR(r16)(r3) + std r17, VCPU_GPR(r17)(r3) + std r18, VCPU_GPR(r18)(r3) + std r19, VCPU_GPR(r19)(r3) + std r20, VCPU_GPR(r20)(r3) + std r21, VCPU_GPR(r21)(r3) + std r22, VCPU_GPR(r22)(r3) + std r23, VCPU_GPR(r23)(r3) + std r24, VCPU_GPR(r24)(r3) + std r25, VCPU_GPR(r25)(r3) + std r26, VCPU_GPR(r26)(r3) + std r27, VCPU_GPR(r27)(r3) + std r28, VCPU_GPR(r28)(r3) + std r29, VCPU_GPR(r29)(r3) + std r30, VCPU_GPR(r30)(r3) + std r31, VCPU_GPR(r31)(r3) + + /* save FP state */ + bl .kvmppc_save_fp + + /* + * Take a nap until a decrementer or external interrupt occurs, + * with PECE1 (wake on decr) and PECE0 (wake on external) set in LPCR + */ + li r0,0x80 + stb r0,PACAPROCSTART(r13) + mfspr r5,SPRN_LPCR + ori r5,r5,LPCR_PECE0 | LPCR_PECE1 + mtspr SPRN_LPCR,r5 + isync + li r0, 0 + std r0, HSTATE_SCRATCH0(r13) + ptesync + ld r0, HSTATE_SCRATCH0(r13) +1: cmpd r0, r0 + bne 1b + nap + b . + +kvm_end_cede: + /* Woken by external or decrementer interrupt */ + ld r1, HSTATE_HOST_R1(r13) + ld r2, PACATOC(r13) + + /* If we're a secondary thread and we got here by an IPI, ack it */ + ld r4,HSTATE_KVM_VCPU(r13) + lwz r3,VCPU_PTID(r4) + cmpwi r3,0 + beq 27f + mfspr r3,SPRN_SRR1 + rlwinm r3,r3,44-31,0x7 /* extract wake reason field */ + cmpwi r3,4 /* was it an external interrupt? */ + bne 27f + ld r5, HSTATE_XICS_PHYS(r13) + li r0,0xff + li r6,XICS_QIRR + li r7,XICS_XIRR + lwzcix r8,r5,r7 /* ack the interrupt */ + sync + stbcix r0,r5,r6 /* clear it */ + stwcix r8,r5,r7 /* EOI it */ +27: + /* load up FP state */ + bl kvmppc_load_fp + + /* Load NV GPRS */ + ld r14, VCPU_GPR(r14)(r4) + ld r15, VCPU_GPR(r15)(r4) + ld r16, VCPU_GPR(r16)(r4) + ld r17, VCPU_GPR(r17)(r4) + ld r18, VCPU_GPR(r18)(r4) + ld r19, VCPU_GPR(r19)(r4) + ld r20, VCPU_GPR(r20)(r4) + ld r21, VCPU_GPR(r21)(r4) + ld r22, VCPU_GPR(r22)(r4) + ld r23, VCPU_GPR(r23)(r4) + ld r24, VCPU_GPR(r24)(r4) + ld r25, VCPU_GPR(r25)(r4) + ld r26, VCPU_GPR(r26)(r4) + ld r27, VCPU_GPR(r27)(r4) + ld r28, VCPU_GPR(r28)(r4) + ld r29, VCPU_GPR(r29)(r4) + ld r30, VCPU_GPR(r30)(r4) + ld r31, VCPU_GPR(r31)(r4) + + /* clear our bit in vcore->napping_threads */ +33: ld r5,HSTATE_KVM_VCORE(r13) + lwz r3,VCPU_PTID(r4) + li r0,1 + sld r0,r0,r3 + addi r6,r5,VCORE_NAPPING_THREADS +32: lwarx r7,0,r6 + andc r7,r7,r0 + stwcx. r7,0,r6 + bne 32b + li r0,0 + stb r0,HSTATE_NAPPING(r13) + + /* see if any other thread is already exiting */ + lwz r0,VCORE_ENTRY_EXIT(r5) + cmpwi r0,0x100 + blt kvmppc_cede_reentry /* if not go back to guest */ + + /* some threads are exiting, so go to the guest exit path */ + b hcall_real_fallback + + /* cede when already previously prodded case */ +1: li r0,0 + stb r0,VCPU_PRODDED(r3) + sync /* order testing prodded vs. clearing ceded */ + stb r0,VCPU_CEDED(r3) + li r3,H_SUCCESS + blr + + /* we've ceded but we want to give control to the host */ +2: li r3,H_TOO_HARD + blr + +secondary_too_late: + ld r5,HSTATE_KVM_VCORE(r13) + HMT_LOW +13: lbz r3,VCORE_IN_GUEST(r5) + cmpwi r3,0 + bne 13b + HMT_MEDIUM + ld r11,PACA_SLBSHADOWPTR(r13) + + .rept SLB_NUM_BOLTED + ld r5,SLBSHADOW_SAVEAREA(r11) + ld r6,SLBSHADOW_SAVEAREA+8(r11) + andis. r7,r5,SLB_ESID_V@h + beq 1f + slbmte r6,r5 +1: addi r11,r11,16 + .endr + +secondary_nap: + /* Clear any pending IPI - assume we're a secondary thread */ + ld r5, HSTATE_XICS_PHYS(r13) + li r7, XICS_XIRR + lwzcix r3, r5, r7 /* ack any pending interrupt */ + rlwinm. r0, r3, 0, 0xffffff /* any pending? */ + beq 37f + sync + li r0, 0xff + li r6, XICS_QIRR + stbcix r0, r5, r6 /* clear the IPI */ + stwcix r3, r5, r7 /* EOI it */ +37: sync + + /* increment the nap count and then go to nap mode */ + ld r4, HSTATE_KVM_VCORE(r13) + addi r4, r4, VCORE_NAP_COUNT + lwsync /* make previous updates visible */ +51: lwarx r3, 0, r4 + addi r3, r3, 1 + stwcx. r3, 0, r4 + bne 51b + + li r3, LPCR_PECE0 + mfspr r4, SPRN_LPCR + rlwimi r4, r3, 0, LPCR_PECE0 | LPCR_PECE1 + mtspr SPRN_LPCR, r4 + isync + li r0, 0 + std r0, HSTATE_SCRATCH0(r13) + ptesync + ld r0, HSTATE_SCRATCH0(r13) +1: cmpd r0, r0 + bne 1b + nap + b . + +/* + * Save away FP, VMX and VSX registers. + * r3 = vcpu pointer + */ +_GLOBAL(kvmppc_save_fp) + mfmsr r9 + ori r8,r9,MSR_FP +#ifdef CONFIG_ALTIVEC +BEGIN_FTR_SECTION + oris r8,r8,MSR_VEC@h +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) +#endif +#ifdef CONFIG_VSX +BEGIN_FTR_SECTION + oris r8,r8,MSR_VSX@h +END_FTR_SECTION_IFSET(CPU_FTR_VSX) +#endif + mtmsrd r8 + isync +#ifdef CONFIG_VSX +BEGIN_FTR_SECTION + reg = 0 + .rept 32 + li r6,reg*16+VCPU_VSRS + STXVD2X(reg,r6,r3) + reg = reg + 1 + .endr +FTR_SECTION_ELSE +#endif + reg = 0 + .rept 32 + stfd reg,reg*8+VCPU_FPRS(r3) + reg = reg + 1 + .endr +#ifdef CONFIG_VSX +ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX) +#endif + mffs fr0 + stfd fr0,VCPU_FPSCR(r3) + +#ifdef CONFIG_ALTIVEC +BEGIN_FTR_SECTION + reg = 0 + .rept 32 + li r6,reg*16+VCPU_VRS + stvx reg,r6,r3 + reg = reg + 1 + .endr + mfvscr vr0 + li r6,VCPU_VSCR + stvx vr0,r6,r3 +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) +#endif + mfspr r6,SPRN_VRSAVE + stw r6,VCPU_VRSAVE(r3) + mtmsrd r9 + isync + blr + +/* + * Load up FP, VMX and VSX registers + * r4 = vcpu pointer + */ + .globl kvmppc_load_fp +kvmppc_load_fp: + mfmsr r9 + ori r8,r9,MSR_FP +#ifdef CONFIG_ALTIVEC +BEGIN_FTR_SECTION + oris r8,r8,MSR_VEC@h +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) +#endif +#ifdef CONFIG_VSX +BEGIN_FTR_SECTION + oris r8,r8,MSR_VSX@h +END_FTR_SECTION_IFSET(CPU_FTR_VSX) +#endif + mtmsrd r8 + isync + lfd fr0,VCPU_FPSCR(r4) + MTFSF_L(fr0) +#ifdef CONFIG_VSX +BEGIN_FTR_SECTION + reg = 0 + .rept 32 + li r7,reg*16+VCPU_VSRS + LXVD2X(reg,r7,r4) + reg = reg + 1 + .endr +FTR_SECTION_ELSE +#endif + reg = 0 + .rept 32 + lfd reg,reg*8+VCPU_FPRS(r4) + reg = reg + 1 + .endr +#ifdef CONFIG_VSX +ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX) +#endif + +#ifdef CONFIG_ALTIVEC +BEGIN_FTR_SECTION + li r7,VCPU_VSCR + lvx vr0,r7,r4 + mtvscr vr0 + reg = 0 + .rept 32 + li r7,reg*16+VCPU_VRS + lvx reg,r7,r4 + reg = reg + 1 + .endr +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) +#endif + lwz r7,VCPU_VRSAVE(r4) + mtspr SPRN_VRSAVE,r7 + blr diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S index 2f0bc928b08a..0a8515a5c042 100644 --- a/arch/powerpc/kvm/book3s_interrupts.S +++ b/arch/powerpc/kvm/book3s_interrupts.S @@ -29,28 +29,11 @@ #define ULONG_SIZE 8 #define FUNC(name) GLUE(.,name) -#define GET_SHADOW_VCPU(reg) \ - addi reg, r13, PACA_KVM_SVCPU - -#define DISABLE_INTERRUPTS \ - mfmsr r0; \ - rldicl r0,r0,48,1; \ - rotldi r0,r0,16; \ - mtmsrd r0,1; \ - #elif defined(CONFIG_PPC_BOOK3S_32) #define ULONG_SIZE 4 #define FUNC(name) name -#define GET_SHADOW_VCPU(reg) \ - lwz reg, (THREAD + THREAD_KVM_SVCPU)(r2) - -#define DISABLE_INTERRUPTS \ - mfmsr r0; \ - rlwinm r0,r0,0,17,15; \ - mtmsr r0; \ - #endif /* CONFIG_PPC_BOOK3S_XX */ @@ -85,7 +68,7 @@ * r3: kvm_run pointer * r4: vcpu pointer */ -_GLOBAL(__kvmppc_vcpu_entry) +_GLOBAL(__kvmppc_vcpu_run) kvm_start_entry: /* Write correct stack frame */ @@ -107,52 +90,19 @@ kvm_start_entry: /* Load non-volatile guest state from the vcpu */ VCPU_LOAD_NVGPRS(r4) - GET_SHADOW_VCPU(r5) - - /* Save R1/R2 in the PACA */ - PPC_STL r1, SVCPU_HOST_R1(r5) - PPC_STL r2, SVCPU_HOST_R2(r5) - - /* XXX swap in/out on load? */ - PPC_LL r3, VCPU_HIGHMEM_HANDLER(r4) - PPC_STL r3, SVCPU_VMHANDLER(r5) - kvm_start_lightweight: - PPC_LL r10, VCPU_SHADOW_MSR(r4) /* r10 = vcpu->arch.shadow_msr */ - - DISABLE_INTERRUPTS - #ifdef CONFIG_PPC_BOOK3S_64 - /* Some guests may need to have dcbz set to 32 byte length. - * - * Usually we ensure that by patching the guest's instructions - * to trap on dcbz and emulate it in the hypervisor. - * - * If we can, we should tell the CPU to use 32 byte dcbz though, - * because that's a lot faster. - */ - PPC_LL r3, VCPU_HFLAGS(r4) - rldicl. r3, r3, 0, 63 /* CR = ((r3 & 1) == 0) */ - beq no_dcbz32_on - - mfspr r3,SPRN_HID5 - ori r3, r3, 0x80 /* XXX HID5_dcbz32 = 0x80 */ - mtspr SPRN_HID5,r3 - -no_dcbz32_on: - + rldicl r3, r3, 0, 63 /* r3 &= 1 */ + stb r3, HSTATE_RESTORE_HID5(r13) #endif /* CONFIG_PPC_BOOK3S_64 */ - PPC_LL r6, VCPU_RMCALL(r4) - mtctr r6 - - PPC_LL r3, VCPU_TRAMPOLINE_ENTER(r4) - LOAD_REG_IMMEDIATE(r4, MSR_KERNEL & ~(MSR_IR | MSR_DR)) + PPC_LL r4, VCPU_SHADOW_MSR(r4) /* get shadow_msr */ /* Jump to segment patching handler and into our guest */ - bctr + bl FUNC(kvmppc_entry_trampoline) + nop /* * This is the handler in module memory. It gets jumped at from the @@ -177,21 +127,6 @@ kvmppc_handler_highmem: /* R7 = vcpu */ PPC_LL r7, GPR4(r1) -#ifdef CONFIG_PPC_BOOK3S_64 - - PPC_LL r5, VCPU_HFLAGS(r7) - rldicl. r5, r5, 0, 63 /* CR = ((r5 & 1) == 0) */ - beq no_dcbz32_off - - li r4, 0 - mfspr r5,SPRN_HID5 - rldimi r5,r4,6,56 - mtspr SPRN_HID5,r5 - -no_dcbz32_off: - -#endif /* CONFIG_PPC_BOOK3S_64 */ - PPC_STL r14, VCPU_GPR(r14)(r7) PPC_STL r15, VCPU_GPR(r15)(r7) PPC_STL r16, VCPU_GPR(r16)(r7) @@ -211,67 +146,6 @@ no_dcbz32_off: PPC_STL r30, VCPU_GPR(r30)(r7) PPC_STL r31, VCPU_GPR(r31)(r7) - /* Restore host msr -> SRR1 */ - PPC_LL r6, VCPU_HOST_MSR(r7) - - /* - * For some interrupts, we need to call the real Linux - * handler, so it can do work for us. This has to happen - * as if the interrupt arrived from the kernel though, - * so let's fake it here where most state is restored. - * - * Call Linux for hardware interrupts/decrementer - * r3 = address of interrupt handler (exit reason) - */ - - cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL - beq call_linux_handler - cmpwi r12, BOOK3S_INTERRUPT_DECREMENTER - beq call_linux_handler - cmpwi r12, BOOK3S_INTERRUPT_PERFMON - beq call_linux_handler - - /* Back to EE=1 */ - mtmsr r6 - sync - b kvm_return_point - -call_linux_handler: - - /* - * If we land here we need to jump back to the handler we - * came from. - * - * We have a page that we can access from real mode, so let's - * jump back to that and use it as a trampoline to get back into the - * interrupt handler! - * - * R3 still contains the exit code, - * R5 VCPU_HOST_RETIP and - * R6 VCPU_HOST_MSR - */ - - /* Restore host IP -> SRR0 */ - PPC_LL r5, VCPU_HOST_RETIP(r7) - - /* XXX Better move to a safe function? - * What if we get an HTAB flush in between mtsrr0 and mtsrr1? */ - - mtlr r12 - - PPC_LL r4, VCPU_TRAMPOLINE_LOWMEM(r7) - mtsrr0 r4 - LOAD_REG_IMMEDIATE(r3, MSR_KERNEL & ~(MSR_IR | MSR_DR)) - mtsrr1 r3 - - RFI - -.global kvm_return_point -kvm_return_point: - - /* Jump back to lightweight entry if we're supposed to */ - /* go back into the guest */ - /* Pass the exit number as 3rd argument to kvmppc_handle_exit */ mr r5, r12 diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c index 79751d8dd131..41cb0017e757 100644 --- a/arch/powerpc/kvm/book3s_mmu_hpte.c +++ b/arch/powerpc/kvm/book3s_mmu_hpte.c @@ -21,7 +21,6 @@ #include <linux/kvm_host.h> #include <linux/hash.h> #include <linux/slab.h> -#include "trace.h" #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> @@ -29,6 +28,8 @@ #include <asm/mmu_context.h> #include <asm/hw_irq.h> +#include "trace.h" + #define PTE_SIZE 12 static struct kmem_cache *hpte_cache; @@ -58,30 +59,31 @@ static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage) void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte) { u64 index; + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); trace_kvm_book3s_mmu_map(pte); - spin_lock(&vcpu->arch.mmu_lock); + spin_lock(&vcpu3s->mmu_lock); /* Add to ePTE list */ index = kvmppc_mmu_hash_pte(pte->pte.eaddr); - hlist_add_head_rcu(&pte->list_pte, &vcpu->arch.hpte_hash_pte[index]); + hlist_add_head_rcu(&pte->list_pte, &vcpu3s->hpte_hash_pte[index]); /* Add to ePTE_long list */ index = kvmppc_mmu_hash_pte_long(pte->pte.eaddr); hlist_add_head_rcu(&pte->list_pte_long, - &vcpu->arch.hpte_hash_pte_long[index]); + &vcpu3s->hpte_hash_pte_long[index]); /* Add to vPTE list */ index = kvmppc_mmu_hash_vpte(pte->pte.vpage); - hlist_add_head_rcu(&pte->list_vpte, &vcpu->arch.hpte_hash_vpte[index]); + hlist_add_head_rcu(&pte->list_vpte, &vcpu3s->hpte_hash_vpte[index]); /* Add to vPTE_long list */ index = kvmppc_mmu_hash_vpte_long(pte->pte.vpage); hlist_add_head_rcu(&pte->list_vpte_long, - &vcpu->arch.hpte_hash_vpte_long[index]); + &vcpu3s->hpte_hash_vpte_long[index]); - spin_unlock(&vcpu->arch.mmu_lock); + spin_unlock(&vcpu3s->mmu_lock); } static void free_pte_rcu(struct rcu_head *head) @@ -92,16 +94,18 @@ static void free_pte_rcu(struct rcu_head *head) static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) { + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); + trace_kvm_book3s_mmu_invalidate(pte); /* Different for 32 and 64 bit */ kvmppc_mmu_invalidate_pte(vcpu, pte); - spin_lock(&vcpu->arch.mmu_lock); + spin_lock(&vcpu3s->mmu_lock); /* pte already invalidated in between? */ if (hlist_unhashed(&pte->list_pte)) { - spin_unlock(&vcpu->arch.mmu_lock); + spin_unlock(&vcpu3s->mmu_lock); return; } @@ -115,14 +119,15 @@ static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) else kvm_release_pfn_clean(pte->pfn); - spin_unlock(&vcpu->arch.mmu_lock); + spin_unlock(&vcpu3s->mmu_lock); - vcpu->arch.hpte_cache_count--; + vcpu3s->hpte_cache_count--; call_rcu(&pte->rcu_head, free_pte_rcu); } static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu) { + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); struct hpte_cache *pte; struct hlist_node *node; int i; @@ -130,7 +135,7 @@ static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu) rcu_read_lock(); for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) { - struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i]; + struct hlist_head *list = &vcpu3s->hpte_hash_vpte_long[i]; hlist_for_each_entry_rcu(pte, node, list, list_vpte_long) invalidate_pte(vcpu, pte); @@ -141,12 +146,13 @@ static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu) static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea) { + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); struct hlist_head *list; struct hlist_node *node; struct hpte_cache *pte; /* Find the list of entries in the map */ - list = &vcpu->arch.hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)]; + list = &vcpu3s->hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)]; rcu_read_lock(); @@ -160,12 +166,13 @@ static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea) static void kvmppc_mmu_pte_flush_long(struct kvm_vcpu *vcpu, ulong guest_ea) { + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); struct hlist_head *list; struct hlist_node *node; struct hpte_cache *pte; /* Find the list of entries in the map */ - list = &vcpu->arch.hpte_hash_pte_long[ + list = &vcpu3s->hpte_hash_pte_long[ kvmppc_mmu_hash_pte_long(guest_ea)]; rcu_read_lock(); @@ -203,12 +210,13 @@ void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask) /* Flush with mask 0xfffffffff */ static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp) { + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); struct hlist_head *list; struct hlist_node *node; struct hpte_cache *pte; u64 vp_mask = 0xfffffffffULL; - list = &vcpu->arch.hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)]; + list = &vcpu3s->hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)]; rcu_read_lock(); @@ -223,12 +231,13 @@ static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp) /* Flush with mask 0xffffff000 */ static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp) { + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); struct hlist_head *list; struct hlist_node *node; struct hpte_cache *pte; u64 vp_mask = 0xffffff000ULL; - list = &vcpu->arch.hpte_hash_vpte_long[ + list = &vcpu3s->hpte_hash_vpte_long[ kvmppc_mmu_hash_vpte_long(guest_vp)]; rcu_read_lock(); @@ -261,6 +270,7 @@ void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask) void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end) { + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); struct hlist_node *node; struct hpte_cache *pte; int i; @@ -270,7 +280,7 @@ void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end) rcu_read_lock(); for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) { - struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i]; + struct hlist_head *list = &vcpu3s->hpte_hash_vpte_long[i]; hlist_for_each_entry_rcu(pte, node, list, list_vpte_long) if ((pte->pte.raddr >= pa_start) && @@ -283,12 +293,13 @@ void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end) struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu) { + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); struct hpte_cache *pte; pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL); - vcpu->arch.hpte_cache_count++; + vcpu3s->hpte_cache_count++; - if (vcpu->arch.hpte_cache_count == HPTEG_CACHE_NUM) + if (vcpu3s->hpte_cache_count == HPTEG_CACHE_NUM) kvmppc_mmu_pte_flush_all(vcpu); return pte; @@ -309,17 +320,19 @@ static void kvmppc_mmu_hpte_init_hash(struct hlist_head *hash_list, int len) int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu) { + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); + /* init hpte lookup hashes */ - kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte, - ARRAY_SIZE(vcpu->arch.hpte_hash_pte)); - kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte_long, - ARRAY_SIZE(vcpu->arch.hpte_hash_pte_long)); - kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte, - ARRAY_SIZE(vcpu->arch.hpte_hash_vpte)); - kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte_long, - ARRAY_SIZE(vcpu->arch.hpte_hash_vpte_long)); - - spin_lock_init(&vcpu->arch.mmu_lock); + kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_pte, + ARRAY_SIZE(vcpu3s->hpte_hash_pte)); + kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_pte_long, + ARRAY_SIZE(vcpu3s->hpte_hash_pte_long)); + kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte, + ARRAY_SIZE(vcpu3s->hpte_hash_vpte)); + kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte_long, + ARRAY_SIZE(vcpu3s->hpte_hash_vpte_long)); + + spin_lock_init(&vcpu3s->mmu_lock); return 0; } diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c new file mode 100644 index 000000000000..3c791e1eb675 --- /dev/null +++ b/arch/powerpc/kvm/book3s_pr.c @@ -0,0 +1,1048 @@ +/* + * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved. + * + * Authors: + * Alexander Graf <agraf@suse.de> + * Kevin Wolf <mail@kevin-wolf.de> + * Paul Mackerras <paulus@samba.org> + * + * Description: + * Functions relating to running KVM on Book 3S processors where + * we don't have access to hypervisor mode, and we run the guest + * in problem state (user mode). + * + * This file is derived from arch/powerpc/kvm/44x.c, + * by Hollis Blanchard <hollisb@us.ibm.com>. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <linux/kvm_host.h> +#include <linux/export.h> +#include <linux/err.h> +#include <linux/slab.h> + +#include <asm/reg.h> +#include <asm/cputable.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> +#include <asm/uaccess.h> +#include <asm/io.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/mmu_context.h> +#include <linux/gfp.h> +#include <linux/sched.h> +#include <linux/vmalloc.h> +#include <linux/highmem.h> + +#include "trace.h" + +/* #define EXIT_DEBUG */ +/* #define DEBUG_EXT */ + +static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, + ulong msr); + +/* Some compatibility defines */ +#ifdef CONFIG_PPC_BOOK3S_32 +#define MSR_USER32 MSR_USER +#define MSR_USER64 MSR_USER +#define HW_PAGE_SIZE PAGE_SIZE +#endif + +void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + memcpy(to_svcpu(vcpu)->slb, to_book3s(vcpu)->slb_shadow, sizeof(to_svcpu(vcpu)->slb)); + memcpy(&get_paca()->shadow_vcpu, to_book3s(vcpu)->shadow_vcpu, + sizeof(get_paca()->shadow_vcpu)); + to_svcpu(vcpu)->slb_max = to_book3s(vcpu)->slb_shadow_max; +#endif + +#ifdef CONFIG_PPC_BOOK3S_32 + current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu; +#endif +} + +void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + memcpy(to_book3s(vcpu)->slb_shadow, to_svcpu(vcpu)->slb, sizeof(to_svcpu(vcpu)->slb)); + memcpy(to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu, + sizeof(get_paca()->shadow_vcpu)); + to_book3s(vcpu)->slb_shadow_max = to_svcpu(vcpu)->slb_max; +#endif + + kvmppc_giveup_ext(vcpu, MSR_FP); + kvmppc_giveup_ext(vcpu, MSR_VEC); + kvmppc_giveup_ext(vcpu, MSR_VSX); +} + +static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) +{ + ulong smsr = vcpu->arch.shared->msr; + + /* Guest MSR values */ + smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_DE; + /* Process MSR values */ + smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE; + /* External providers the guest reserved */ + smsr |= (vcpu->arch.shared->msr & vcpu->arch.guest_owned_ext); + /* 64-bit Process MSR values */ +#ifdef CONFIG_PPC_BOOK3S_64 + smsr |= MSR_ISF | MSR_HV; +#endif + vcpu->arch.shadow_msr = smsr; +} + +void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) +{ + ulong old_msr = vcpu->arch.shared->msr; + +#ifdef EXIT_DEBUG + printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr); +#endif + + msr &= to_book3s(vcpu)->msr_mask; + vcpu->arch.shared->msr = msr; + kvmppc_recalc_shadow_msr(vcpu); + + if (msr & MSR_POW) { + if (!vcpu->arch.pending_exceptions) { + kvm_vcpu_block(vcpu); + vcpu->stat.halt_wakeup++; + + /* Unset POW bit after we woke up */ + msr &= ~MSR_POW; + vcpu->arch.shared->msr = msr; + } + } + + if ((vcpu->arch.shared->msr & (MSR_PR|MSR_IR|MSR_DR)) != + (old_msr & (MSR_PR|MSR_IR|MSR_DR))) { + kvmppc_mmu_flush_segments(vcpu); + kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); + + /* Preload magic page segment when in kernel mode */ + if (!(msr & MSR_PR) && vcpu->arch.magic_page_pa) { + struct kvm_vcpu_arch *a = &vcpu->arch; + + if (msr & MSR_DR) + kvmppc_mmu_map_segment(vcpu, a->magic_page_ea); + else + kvmppc_mmu_map_segment(vcpu, a->magic_page_pa); + } + } + + /* Preload FPU if it's enabled */ + if (vcpu->arch.shared->msr & MSR_FP) + kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); +} + +void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) +{ + u32 host_pvr; + + vcpu->arch.hflags &= ~BOOK3S_HFLAG_SLB; + vcpu->arch.pvr = pvr; +#ifdef CONFIG_PPC_BOOK3S_64 + if ((pvr >= 0x330000) && (pvr < 0x70330000)) { + kvmppc_mmu_book3s_64_init(vcpu); + to_book3s(vcpu)->hior = 0xfff00000; + to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL; + vcpu->arch.cpu_type = KVM_CPU_3S_64; + } else +#endif + { + kvmppc_mmu_book3s_32_init(vcpu); + to_book3s(vcpu)->hior = 0; + to_book3s(vcpu)->msr_mask = 0xffffffffULL; + vcpu->arch.cpu_type = KVM_CPU_3S_32; + } + + kvmppc_sanity_check(vcpu); + + /* If we are in hypervisor level on 970, we can tell the CPU to + * treat DCBZ as 32 bytes store */ + vcpu->arch.hflags &= ~BOOK3S_HFLAG_DCBZ32; + if (vcpu->arch.mmu.is_dcbz32(vcpu) && (mfmsr() & MSR_HV) && + !strcmp(cur_cpu_spec->platform, "ppc970")) + vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32; + + /* Cell performs badly if MSR_FEx are set. So let's hope nobody + really needs them in a VM on Cell and force disable them. */ + if (!strcmp(cur_cpu_spec->platform, "ppc-cell-be")) + to_book3s(vcpu)->msr_mask &= ~(MSR_FE0 | MSR_FE1); + +#ifdef CONFIG_PPC_BOOK3S_32 + /* 32 bit Book3S always has 32 byte dcbz */ + vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32; +#endif + + /* On some CPUs we can execute paired single operations natively */ + asm ( "mfpvr %0" : "=r"(host_pvr)); + switch (host_pvr) { + case 0x00080200: /* lonestar 2.0 */ + case 0x00088202: /* lonestar 2.2 */ + case 0x70000100: /* gekko 1.0 */ + case 0x00080100: /* gekko 2.0 */ + case 0x00083203: /* gekko 2.3a */ + case 0x00083213: /* gekko 2.3b */ + case 0x00083204: /* gekko 2.4 */ + case 0x00083214: /* gekko 2.4e (8SE) - retail HW2 */ + case 0x00087200: /* broadway */ + vcpu->arch.hflags |= BOOK3S_HFLAG_NATIVE_PS; + /* Enable HID2.PSE - in case we need it later */ + mtspr(SPRN_HID2_GEKKO, mfspr(SPRN_HID2_GEKKO) | (1 << 29)); + } +} + +/* Book3s_32 CPUs always have 32 bytes cache line size, which Linux assumes. To + * make Book3s_32 Linux work on Book3s_64, we have to make sure we trap dcbz to + * emulate 32 bytes dcbz length. + * + * The Book3s_64 inventors also realized this case and implemented a special bit + * in the HID5 register, which is a hypervisor ressource. Thus we can't use it. + * + * My approach here is to patch the dcbz instruction on executing pages. + */ +static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) +{ + struct page *hpage; + u64 hpage_offset; + u32 *page; + int i; + + hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT); + if (is_error_page(hpage)) { + kvm_release_page_clean(hpage); + return; + } + + hpage_offset = pte->raddr & ~PAGE_MASK; + hpage_offset &= ~0xFFFULL; + hpage_offset /= 4; + + get_page(hpage); + page = kmap_atomic(hpage, KM_USER0); + + /* patch dcbz into reserved instruction, so we trap */ + for (i=hpage_offset; i < hpage_offset + (HW_PAGE_SIZE / 4); i++) + if ((page[i] & 0xff0007ff) == INS_DCBZ) + page[i] &= 0xfffffff7; + + kunmap_atomic(page, KM_USER0); + put_page(hpage); +} + +static int kvmppc_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + ulong mp_pa = vcpu->arch.magic_page_pa; + + if (unlikely(mp_pa) && + unlikely((mp_pa & KVM_PAM) >> PAGE_SHIFT == gfn)) { + return 1; + } + + return kvm_is_visible_gfn(vcpu->kvm, gfn); +} + +int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, + ulong eaddr, int vec) +{ + bool data = (vec == BOOK3S_INTERRUPT_DATA_STORAGE); + int r = RESUME_GUEST; + int relocated; + int page_found = 0; + struct kvmppc_pte pte; + bool is_mmio = false; + bool dr = (vcpu->arch.shared->msr & MSR_DR) ? true : false; + bool ir = (vcpu->arch.shared->msr & MSR_IR) ? true : false; + u64 vsid; + + relocated = data ? dr : ir; + + /* Resolve real address if translation turned on */ + if (relocated) { + page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data); + } else { + pte.may_execute = true; + pte.may_read = true; + pte.may_write = true; + pte.raddr = eaddr & KVM_PAM; + pte.eaddr = eaddr; + pte.vpage = eaddr >> 12; + } + + switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { + case 0: + pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12)); + break; + case MSR_DR: + case MSR_IR: + vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid); + + if ((vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) == MSR_DR) + pte.vpage |= ((u64)VSID_REAL_DR << (SID_SHIFT - 12)); + else + pte.vpage |= ((u64)VSID_REAL_IR << (SID_SHIFT - 12)); + pte.vpage |= vsid; + + if (vsid == -1) + page_found = -EINVAL; + break; + } + + if (vcpu->arch.mmu.is_dcbz32(vcpu) && + (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) { + /* + * If we do the dcbz hack, we have to NX on every execution, + * so we can patch the executing code. This renders our guest + * NX-less. + */ + pte.may_execute = !data; + } + + if (page_found == -ENOENT) { + /* Page not found in guest PTE entries */ + vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); + vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr; + vcpu->arch.shared->msr |= + (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL); + kvmppc_book3s_queue_irqprio(vcpu, vec); + } else if (page_found == -EPERM) { + /* Storage protection */ + vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); + vcpu->arch.shared->dsisr = + to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE; + vcpu->arch.shared->dsisr |= DSISR_PROTFAULT; + vcpu->arch.shared->msr |= + (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL); + kvmppc_book3s_queue_irqprio(vcpu, vec); + } else if (page_found == -EINVAL) { + /* Page not found in guest SLB */ + vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); + kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80); + } else if (!is_mmio && + kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) { + /* The guest's PTE is not mapped yet. Map on the host */ + kvmppc_mmu_map_page(vcpu, &pte); + if (data) + vcpu->stat.sp_storage++; + else if (vcpu->arch.mmu.is_dcbz32(vcpu) && + (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) + kvmppc_patch_dcbz(vcpu, &pte); + } else { + /* MMIO */ + vcpu->stat.mmio_exits++; + vcpu->arch.paddr_accessed = pte.raddr; + r = kvmppc_emulate_mmio(run, vcpu); + if ( r == RESUME_HOST_NV ) + r = RESUME_HOST; + } + + return r; +} + +static inline int get_fpr_index(int i) +{ +#ifdef CONFIG_VSX + i *= 2; +#endif + return i; +} + +/* Give up external provider (FPU, Altivec, VSX) */ +void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr) +{ + struct thread_struct *t = ¤t->thread; + u64 *vcpu_fpr = vcpu->arch.fpr; +#ifdef CONFIG_VSX + u64 *vcpu_vsx = vcpu->arch.vsr; +#endif + u64 *thread_fpr = (u64*)t->fpr; + int i; + + if (!(vcpu->arch.guest_owned_ext & msr)) + return; + +#ifdef DEBUG_EXT + printk(KERN_INFO "Giving up ext 0x%lx\n", msr); +#endif + + switch (msr) { + case MSR_FP: + giveup_fpu(current); + for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) + vcpu_fpr[i] = thread_fpr[get_fpr_index(i)]; + + vcpu->arch.fpscr = t->fpscr.val; + break; + case MSR_VEC: +#ifdef CONFIG_ALTIVEC + giveup_altivec(current); + memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr)); + vcpu->arch.vscr = t->vscr; +#endif + break; + case MSR_VSX: +#ifdef CONFIG_VSX + __giveup_vsx(current); + for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++) + vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1]; +#endif + break; + default: + BUG(); + } + + vcpu->arch.guest_owned_ext &= ~msr; + current->thread.regs->msr &= ~msr; + kvmppc_recalc_shadow_msr(vcpu); +} + +static int kvmppc_read_inst(struct kvm_vcpu *vcpu) +{ + ulong srr0 = kvmppc_get_pc(vcpu); + u32 last_inst = kvmppc_get_last_inst(vcpu); + int ret; + + ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false); + if (ret == -ENOENT) { + ulong msr = vcpu->arch.shared->msr; + + msr = kvmppc_set_field(msr, 33, 33, 1); + msr = kvmppc_set_field(msr, 34, 36, 0); + vcpu->arch.shared->msr = kvmppc_set_field(msr, 42, 47, 0); + kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE); + return EMULATE_AGAIN; + } + + return EMULATE_DONE; +} + +static int kvmppc_check_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr) +{ + + /* Need to do paired single emulation? */ + if (!(vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE)) + return EMULATE_DONE; + + /* Read out the instruction */ + if (kvmppc_read_inst(vcpu) == EMULATE_DONE) + /* Need to emulate */ + return EMULATE_FAIL; + + return EMULATE_AGAIN; +} + +/* Handle external providers (FPU, Altivec, VSX) */ +static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, + ulong msr) +{ + struct thread_struct *t = ¤t->thread; + u64 *vcpu_fpr = vcpu->arch.fpr; +#ifdef CONFIG_VSX + u64 *vcpu_vsx = vcpu->arch.vsr; +#endif + u64 *thread_fpr = (u64*)t->fpr; + int i; + + /* When we have paired singles, we emulate in software */ + if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE) + return RESUME_GUEST; + + if (!(vcpu->arch.shared->msr & msr)) { + kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + return RESUME_GUEST; + } + + /* We already own the ext */ + if (vcpu->arch.guest_owned_ext & msr) { + return RESUME_GUEST; + } + +#ifdef DEBUG_EXT + printk(KERN_INFO "Loading up ext 0x%lx\n", msr); +#endif + + current->thread.regs->msr |= msr; + + switch (msr) { + case MSR_FP: + for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) + thread_fpr[get_fpr_index(i)] = vcpu_fpr[i]; + + t->fpscr.val = vcpu->arch.fpscr; + t->fpexc_mode = 0; + kvmppc_load_up_fpu(); + break; + case MSR_VEC: +#ifdef CONFIG_ALTIVEC + memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr)); + t->vscr = vcpu->arch.vscr; + t->vrsave = -1; + kvmppc_load_up_altivec(); +#endif + break; + case MSR_VSX: +#ifdef CONFIG_VSX + for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++) + thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i]; + kvmppc_load_up_vsx(); +#endif + break; + default: + BUG(); + } + + vcpu->arch.guest_owned_ext |= msr; + + kvmppc_recalc_shadow_msr(vcpu); + + return RESUME_GUEST; +} + +int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int exit_nr) +{ + int r = RESUME_HOST; + + vcpu->stat.sum_exits++; + + run->exit_reason = KVM_EXIT_UNKNOWN; + run->ready_for_interrupt_injection = 1; + + trace_kvm_book3s_exit(exit_nr, vcpu); + kvm_resched(vcpu); + switch (exit_nr) { + case BOOK3S_INTERRUPT_INST_STORAGE: + vcpu->stat.pf_instruc++; + +#ifdef CONFIG_PPC_BOOK3S_32 + /* We set segments as unused segments when invalidating them. So + * treat the respective fault as segment fault. */ + if (to_svcpu(vcpu)->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT] + == SR_INVALID) { + kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); + r = RESUME_GUEST; + break; + } +#endif + + /* only care about PTEG not found errors, but leave NX alone */ + if (to_svcpu(vcpu)->shadow_srr1 & 0x40000000) { + r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr); + vcpu->stat.sp_instruc++; + } else if (vcpu->arch.mmu.is_dcbz32(vcpu) && + (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) { + /* + * XXX If we do the dcbz hack we use the NX bit to flush&patch the page, + * so we can't use the NX bit inside the guest. Let's cross our fingers, + * that no guest that needs the dcbz hack does NX. + */ + kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL); + r = RESUME_GUEST; + } else { + vcpu->arch.shared->msr |= + to_svcpu(vcpu)->shadow_srr1 & 0x58000000; + kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + r = RESUME_GUEST; + } + break; + case BOOK3S_INTERRUPT_DATA_STORAGE: + { + ulong dar = kvmppc_get_fault_dar(vcpu); + vcpu->stat.pf_storage++; + +#ifdef CONFIG_PPC_BOOK3S_32 + /* We set segments as unused segments when invalidating them. So + * treat the respective fault as segment fault. */ + if ((to_svcpu(vcpu)->sr[dar >> SID_SHIFT]) == SR_INVALID) { + kvmppc_mmu_map_segment(vcpu, dar); + r = RESUME_GUEST; + break; + } +#endif + + /* The only case we need to handle is missing shadow PTEs */ + if (to_svcpu(vcpu)->fault_dsisr & DSISR_NOHPTE) { + r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr); + } else { + vcpu->arch.shared->dar = dar; + vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr; + kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + r = RESUME_GUEST; + } + break; + } + case BOOK3S_INTERRUPT_DATA_SEGMENT: + if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_fault_dar(vcpu)) < 0) { + vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); + kvmppc_book3s_queue_irqprio(vcpu, + BOOK3S_INTERRUPT_DATA_SEGMENT); + } + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_INST_SEGMENT: + if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)) < 0) { + kvmppc_book3s_queue_irqprio(vcpu, + BOOK3S_INTERRUPT_INST_SEGMENT); + } + r = RESUME_GUEST; + break; + /* We're good on these - the host merely wanted to get our attention */ + case BOOK3S_INTERRUPT_DECREMENTER: + vcpu->stat.dec_exits++; + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_EXTERNAL: + vcpu->stat.ext_intr_exits++; + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_PERFMON: + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_PROGRAM: + { + enum emulation_result er; + ulong flags; + +program_interrupt: + flags = to_svcpu(vcpu)->shadow_srr1 & 0x1f0000ull; + + if (vcpu->arch.shared->msr & MSR_PR) { +#ifdef EXIT_DEBUG + printk(KERN_INFO "Userspace triggered 0x700 exception at 0x%lx (0x%x)\n", kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu)); +#endif + if ((kvmppc_get_last_inst(vcpu) & 0xff0007ff) != + (INS_DCBZ & 0xfffffff7)) { + kvmppc_core_queue_program(vcpu, flags); + r = RESUME_GUEST; + break; + } + } + + vcpu->stat.emulated_inst_exits++; + er = kvmppc_emulate_instruction(run, vcpu); + switch (er) { + case EMULATE_DONE: + r = RESUME_GUEST_NV; + break; + case EMULATE_AGAIN: + r = RESUME_GUEST; + break; + case EMULATE_FAIL: + printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", + __func__, kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu)); + kvmppc_core_queue_program(vcpu, flags); + r = RESUME_GUEST; + break; + case EMULATE_DO_MMIO: + run->exit_reason = KVM_EXIT_MMIO; + r = RESUME_HOST_NV; + break; + default: + BUG(); + } + break; + } + case BOOK3S_INTERRUPT_SYSCALL: + if (vcpu->arch.papr_enabled && + (kvmppc_get_last_inst(vcpu) == 0x44000022) && + !(vcpu->arch.shared->msr & MSR_PR)) { + /* SC 1 papr hypercalls */ + ulong cmd = kvmppc_get_gpr(vcpu, 3); + int i; + + if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE) { + r = RESUME_GUEST; + break; + } + + run->papr_hcall.nr = cmd; + for (i = 0; i < 9; ++i) { + ulong gpr = kvmppc_get_gpr(vcpu, 4 + i); + run->papr_hcall.args[i] = gpr; + } + run->exit_reason = KVM_EXIT_PAPR_HCALL; + vcpu->arch.hcall_needed = 1; + r = RESUME_HOST; + } else if (vcpu->arch.osi_enabled && + (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) && + (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) { + /* MOL hypercalls */ + u64 *gprs = run->osi.gprs; + int i; + + run->exit_reason = KVM_EXIT_OSI; + for (i = 0; i < 32; i++) + gprs[i] = kvmppc_get_gpr(vcpu, i); + vcpu->arch.osi_needed = 1; + r = RESUME_HOST_NV; + } else if (!(vcpu->arch.shared->msr & MSR_PR) && + (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) { + /* KVM PV hypercalls */ + kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu)); + r = RESUME_GUEST; + } else { + /* Guest syscalls */ + vcpu->stat.syscall_exits++; + kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + r = RESUME_GUEST; + } + break; + case BOOK3S_INTERRUPT_FP_UNAVAIL: + case BOOK3S_INTERRUPT_ALTIVEC: + case BOOK3S_INTERRUPT_VSX: + { + int ext_msr = 0; + + switch (exit_nr) { + case BOOK3S_INTERRUPT_FP_UNAVAIL: ext_msr = MSR_FP; break; + case BOOK3S_INTERRUPT_ALTIVEC: ext_msr = MSR_VEC; break; + case BOOK3S_INTERRUPT_VSX: ext_msr = MSR_VSX; break; + } + + switch (kvmppc_check_ext(vcpu, exit_nr)) { + case EMULATE_DONE: + /* everything ok - let's enable the ext */ + r = kvmppc_handle_ext(vcpu, exit_nr, ext_msr); + break; + case EMULATE_FAIL: + /* we need to emulate this instruction */ + goto program_interrupt; + break; + default: + /* nothing to worry about - go again */ + break; + } + break; + } + case BOOK3S_INTERRUPT_ALIGNMENT: + if (kvmppc_read_inst(vcpu) == EMULATE_DONE) { + vcpu->arch.shared->dsisr = kvmppc_alignment_dsisr(vcpu, + kvmppc_get_last_inst(vcpu)); + vcpu->arch.shared->dar = kvmppc_alignment_dar(vcpu, + kvmppc_get_last_inst(vcpu)); + kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + } + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_MACHINE_CHECK: + case BOOK3S_INTERRUPT_TRACE: + kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + r = RESUME_GUEST; + break; + default: + /* Ugh - bork here! What did we get? */ + printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n", + exit_nr, kvmppc_get_pc(vcpu), to_svcpu(vcpu)->shadow_srr1); + r = RESUME_HOST; + BUG(); + break; + } + + + if (!(r & RESUME_HOST)) { + /* To avoid clobbering exit_reason, only check for signals if + * we aren't already exiting to userspace for some other + * reason. */ + if (signal_pending(current)) { +#ifdef EXIT_DEBUG + printk(KERN_EMERG "KVM: Going back to host\n"); +#endif + vcpu->stat.signal_exits++; + run->exit_reason = KVM_EXIT_INTR; + r = -EINTR; + } else { + /* In case an interrupt came in that was triggered + * from userspace (like DEC), we need to check what + * to inject now! */ + kvmppc_core_deliver_interrupts(vcpu); + } + } + + trace_kvm_book3s_reenter(r, vcpu); + + return r; +} + +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); + int i; + + sregs->pvr = vcpu->arch.pvr; + + sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1; + if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) { + for (i = 0; i < 64; i++) { + sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige | i; + sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv; + } + } else { + for (i = 0; i < 16; i++) + sregs->u.s.ppc32.sr[i] = vcpu->arch.shared->sr[i]; + + for (i = 0; i < 8; i++) { + sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw; + sregs->u.s.ppc32.dbat[i] = vcpu3s->dbat[i].raw; + } + } + + return 0; +} + +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); + int i; + + kvmppc_set_pvr(vcpu, sregs->pvr); + + vcpu3s->sdr1 = sregs->u.s.sdr1; + if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) { + for (i = 0; i < 64; i++) { + vcpu->arch.mmu.slbmte(vcpu, sregs->u.s.ppc64.slb[i].slbv, + sregs->u.s.ppc64.slb[i].slbe); + } + } else { + for (i = 0; i < 16; i++) { + vcpu->arch.mmu.mtsrin(vcpu, i, sregs->u.s.ppc32.sr[i]); + } + for (i = 0; i < 8; i++) { + kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), false, + (u32)sregs->u.s.ppc32.ibat[i]); + kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), true, + (u32)(sregs->u.s.ppc32.ibat[i] >> 32)); + kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), false, + (u32)sregs->u.s.ppc32.dbat[i]); + kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), true, + (u32)(sregs->u.s.ppc32.dbat[i] >> 32)); + } + } + + /* Flush the MMU after messing with the segments */ + kvmppc_mmu_pte_flush(vcpu, 0, 0); + + return 0; +} + +int kvmppc_core_check_processor_compat(void) +{ + return 0; +} + +struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +{ + struct kvmppc_vcpu_book3s *vcpu_book3s; + struct kvm_vcpu *vcpu; + int err = -ENOMEM; + unsigned long p; + + vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s)); + if (!vcpu_book3s) + goto out; + + vcpu_book3s->shadow_vcpu = (struct kvmppc_book3s_shadow_vcpu *) + kzalloc(sizeof(*vcpu_book3s->shadow_vcpu), GFP_KERNEL); + if (!vcpu_book3s->shadow_vcpu) + goto free_vcpu; + + vcpu = &vcpu_book3s->vcpu; + err = kvm_vcpu_init(vcpu, kvm, id); + if (err) + goto free_shadow_vcpu; + + p = __get_free_page(GFP_KERNEL|__GFP_ZERO); + /* the real shared page fills the last 4k of our page */ + vcpu->arch.shared = (void*)(p + PAGE_SIZE - 4096); + if (!p) + goto uninit_vcpu; + +#ifdef CONFIG_PPC_BOOK3S_64 + /* default to book3s_64 (970fx) */ + vcpu->arch.pvr = 0x3C0301; +#else + /* default to book3s_32 (750) */ + vcpu->arch.pvr = 0x84202; +#endif + kvmppc_set_pvr(vcpu, vcpu->arch.pvr); + vcpu->arch.slb_nr = 64; + + vcpu->arch.shadow_msr = MSR_USER64; + + err = kvmppc_mmu_init(vcpu); + if (err < 0) + goto uninit_vcpu; + + return vcpu; + +uninit_vcpu: + kvm_vcpu_uninit(vcpu); +free_shadow_vcpu: + kfree(vcpu_book3s->shadow_vcpu); +free_vcpu: + vfree(vcpu_book3s); +out: + return ERR_PTR(err); +} + +void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); + + free_page((unsigned long)vcpu->arch.shared & PAGE_MASK); + kvm_vcpu_uninit(vcpu); + kfree(vcpu_book3s->shadow_vcpu); + vfree(vcpu_book3s); +} + +int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +{ + int ret; + double fpr[32][TS_FPRWIDTH]; + unsigned int fpscr; + int fpexc_mode; +#ifdef CONFIG_ALTIVEC + vector128 vr[32]; + vector128 vscr; + unsigned long uninitialized_var(vrsave); + int used_vr; +#endif +#ifdef CONFIG_VSX + int used_vsr; +#endif + ulong ext_msr; + + /* Check if we can run the vcpu at all */ + if (!vcpu->arch.sane) { + kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return -EINVAL; + } + + /* No need to go into the guest when all we do is going out */ + if (signal_pending(current)) { + kvm_run->exit_reason = KVM_EXIT_INTR; + return -EINTR; + } + + /* Save FPU state in stack */ + if (current->thread.regs->msr & MSR_FP) + giveup_fpu(current); + memcpy(fpr, current->thread.fpr, sizeof(current->thread.fpr)); + fpscr = current->thread.fpscr.val; + fpexc_mode = current->thread.fpexc_mode; + +#ifdef CONFIG_ALTIVEC + /* Save Altivec state in stack */ + used_vr = current->thread.used_vr; + if (used_vr) { + if (current->thread.regs->msr & MSR_VEC) + giveup_altivec(current); + memcpy(vr, current->thread.vr, sizeof(current->thread.vr)); + vscr = current->thread.vscr; + vrsave = current->thread.vrsave; + } +#endif + +#ifdef CONFIG_VSX + /* Save VSX state in stack */ + used_vsr = current->thread.used_vsr; + if (used_vsr && (current->thread.regs->msr & MSR_VSX)) + __giveup_vsx(current); +#endif + + /* Remember the MSR with disabled extensions */ + ext_msr = current->thread.regs->msr; + + /* Preload FPU if it's enabled */ + if (vcpu->arch.shared->msr & MSR_FP) + kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); + + kvm_guest_enter(); + + ret = __kvmppc_vcpu_run(kvm_run, vcpu); + + kvm_guest_exit(); + + local_irq_disable(); + + current->thread.regs->msr = ext_msr; + + /* Make sure we save the guest FPU/Altivec/VSX state */ + kvmppc_giveup_ext(vcpu, MSR_FP); + kvmppc_giveup_ext(vcpu, MSR_VEC); + kvmppc_giveup_ext(vcpu, MSR_VSX); + + /* Restore FPU state from stack */ + memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr)); + current->thread.fpscr.val = fpscr; + current->thread.fpexc_mode = fpexc_mode; + +#ifdef CONFIG_ALTIVEC + /* Restore Altivec state from stack */ + if (used_vr && current->thread.used_vr) { + memcpy(current->thread.vr, vr, sizeof(current->thread.vr)); + current->thread.vscr = vscr; + current->thread.vrsave = vrsave; + } + current->thread.used_vr = used_vr; +#endif + +#ifdef CONFIG_VSX + current->thread.used_vsr = used_vsr; +#endif + + return ret; +} + +int kvmppc_core_prepare_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem) +{ + return 0; +} + +void kvmppc_core_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem) +{ +} + +int kvmppc_core_init_vm(struct kvm *kvm) +{ + return 0; +} + +void kvmppc_core_destroy_vm(struct kvm *kvm) +{ +} + +static int kvmppc_book3s_init(void) +{ + int r; + + r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0, + THIS_MODULE); + + if (r) + return r; + + r = kvmppc_mmu_hpte_sysinit(); + + return r; +} + +static void kvmppc_book3s_exit(void) +{ + kvmppc_mmu_hpte_sysexit(); + kvm_exit(); +} + +module_init(kvmppc_book3s_init); +module_exit(kvmppc_book3s_exit); diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c new file mode 100644 index 000000000000..b9589324797b --- /dev/null +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -0,0 +1,158 @@ +/* + * Copyright (C) 2011. Freescale Inc. All rights reserved. + * + * Authors: + * Alexander Graf <agraf@suse.de> + * Paul Mackerras <paulus@samba.org> + * + * Description: + * + * Hypercall handling for running PAPR guests in PR KVM on Book 3S + * processors. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <asm/uaccess.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> + +static unsigned long get_pteg_addr(struct kvm_vcpu *vcpu, long pte_index) +{ + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); + unsigned long pteg_addr; + + pte_index <<= 4; + pte_index &= ((1 << ((vcpu_book3s->sdr1 & 0x1f) + 11)) - 1) << 7 | 0x70; + pteg_addr = vcpu_book3s->sdr1 & 0xfffffffffffc0000ULL; + pteg_addr |= pte_index; + + return pteg_addr; +} + +static int kvmppc_h_pr_enter(struct kvm_vcpu *vcpu) +{ + long flags = kvmppc_get_gpr(vcpu, 4); + long pte_index = kvmppc_get_gpr(vcpu, 5); + unsigned long pteg[2 * 8]; + unsigned long pteg_addr, i, *hpte; + + pte_index &= ~7UL; + pteg_addr = get_pteg_addr(vcpu, pte_index); + + copy_from_user(pteg, (void __user *)pteg_addr, sizeof(pteg)); + hpte = pteg; + + if (likely((flags & H_EXACT) == 0)) { + pte_index &= ~7UL; + for (i = 0; ; ++i) { + if (i == 8) + return H_PTEG_FULL; + if ((*hpte & HPTE_V_VALID) == 0) + break; + hpte += 2; + } + } else { + i = kvmppc_get_gpr(vcpu, 5) & 7UL; + hpte += i * 2; + } + + hpte[0] = kvmppc_get_gpr(vcpu, 6); + hpte[1] = kvmppc_get_gpr(vcpu, 7); + copy_to_user((void __user *)pteg_addr, pteg, sizeof(pteg)); + kvmppc_set_gpr(vcpu, 3, H_SUCCESS); + kvmppc_set_gpr(vcpu, 4, pte_index | i); + + return EMULATE_DONE; +} + +static int kvmppc_h_pr_remove(struct kvm_vcpu *vcpu) +{ + unsigned long flags= kvmppc_get_gpr(vcpu, 4); + unsigned long pte_index = kvmppc_get_gpr(vcpu, 5); + unsigned long avpn = kvmppc_get_gpr(vcpu, 6); + unsigned long v = 0, pteg, rb; + unsigned long pte[2]; + + pteg = get_pteg_addr(vcpu, pte_index); + copy_from_user(pte, (void __user *)pteg, sizeof(pte)); + + if ((pte[0] & HPTE_V_VALID) == 0 || + ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn) || + ((flags & H_ANDCOND) && (pte[0] & avpn) != 0)) { + kvmppc_set_gpr(vcpu, 3, H_NOT_FOUND); + return EMULATE_DONE; + } + + copy_to_user((void __user *)pteg, &v, sizeof(v)); + + rb = compute_tlbie_rb(pte[0], pte[1], pte_index); + vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false); + + kvmppc_set_gpr(vcpu, 3, H_SUCCESS); + kvmppc_set_gpr(vcpu, 4, pte[0]); + kvmppc_set_gpr(vcpu, 5, pte[1]); + + return EMULATE_DONE; +} + +static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu) +{ + unsigned long flags = kvmppc_get_gpr(vcpu, 4); + unsigned long pte_index = kvmppc_get_gpr(vcpu, 5); + unsigned long avpn = kvmppc_get_gpr(vcpu, 6); + unsigned long rb, pteg, r, v; + unsigned long pte[2]; + + pteg = get_pteg_addr(vcpu, pte_index); + copy_from_user(pte, (void __user *)pteg, sizeof(pte)); + + if ((pte[0] & HPTE_V_VALID) == 0 || + ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn)) { + kvmppc_set_gpr(vcpu, 3, H_NOT_FOUND); + return EMULATE_DONE; + } + + v = pte[0]; + r = pte[1]; + r &= ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_HI | + HPTE_R_KEY_LO); + r |= (flags << 55) & HPTE_R_PP0; + r |= (flags << 48) & HPTE_R_KEY_HI; + r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO); + + pte[1] = r; + + rb = compute_tlbie_rb(v, r, pte_index); + vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false); + copy_to_user((void __user *)pteg, pte, sizeof(pte)); + + kvmppc_set_gpr(vcpu, 3, H_SUCCESS); + + return EMULATE_DONE; +} + +int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) +{ + switch (cmd) { + case H_ENTER: + return kvmppc_h_pr_enter(vcpu); + case H_REMOVE: + return kvmppc_h_pr_remove(vcpu); + case H_PROTECT: + return kvmppc_h_pr_protect(vcpu); + case H_BULK_REMOVE: + /* We just flush all PTEs, so user space can + handle the HPT modifications */ + kvmppc_mmu_pte_flush(vcpu, 0, 0); + break; + case H_CEDE: + kvm_vcpu_block(vcpu); + vcpu->stat.halt_wakeup++; + return EMULATE_DONE; + } + + return EMULATE_FAIL; +} diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S index 1a1b34487e71..34187585c507 100644 --- a/arch/powerpc/kvm/book3s_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_rmhandlers.S @@ -20,6 +20,7 @@ #include <asm/ppc_asm.h> #include <asm/kvm_asm.h> #include <asm/reg.h> +#include <asm/mmu.h> #include <asm/page.h> #include <asm/asm-offsets.h> @@ -35,42 +36,46 @@ #if defined(CONFIG_PPC_BOOK3S_64) -#define LOAD_SHADOW_VCPU(reg) GET_PACA(reg) -#define SHADOW_VCPU_OFF PACA_KVM_SVCPU -#define MSR_NOIRQ MSR_KERNEL & ~(MSR_IR | MSR_DR) #define FUNC(name) GLUE(.,name) +#define MTMSR_EERI(reg) mtmsrd (reg),1 + + .globl kvmppc_skip_interrupt +kvmppc_skip_interrupt: + /* + * Here all GPRs are unchanged from when the interrupt happened + * except for r13, which is saved in SPRG_SCRATCH0. + */ + mfspr r13, SPRN_SRR0 + addi r13, r13, 4 + mtspr SPRN_SRR0, r13 + GET_SCRATCH0(r13) + rfid + b . + + .globl kvmppc_skip_Hinterrupt +kvmppc_skip_Hinterrupt: + /* + * Here all GPRs are unchanged from when the interrupt happened + * except for r13, which is saved in SPRG_SCRATCH0. + */ + mfspr r13, SPRN_HSRR0 + addi r13, r13, 4 + mtspr SPRN_HSRR0, r13 + GET_SCRATCH0(r13) + hrfid + b . #elif defined(CONFIG_PPC_BOOK3S_32) -#define LOAD_SHADOW_VCPU(reg) \ - mfspr reg, SPRN_SPRG_THREAD; \ - lwz reg, THREAD_KVM_SVCPU(reg); \ - /* PPC32 can have a NULL pointer - let's check for that */ \ - mtspr SPRN_SPRG_SCRATCH1, r12; /* Save r12 */ \ - mfcr r12; \ - cmpwi reg, 0; \ - bne 1f; \ - mfspr reg, SPRN_SPRG_SCRATCH0; \ - mtcr r12; \ - mfspr r12, SPRN_SPRG_SCRATCH1; \ - b kvmppc_resume_\intno; \ -1:; \ - mtcr r12; \ - mfspr r12, SPRN_SPRG_SCRATCH1; \ - tophys(reg, reg) - -#define SHADOW_VCPU_OFF 0 -#define MSR_NOIRQ MSR_KERNEL #define FUNC(name) name - -#endif +#define MTMSR_EERI(reg) mtmsr (reg) .macro INTERRUPT_TRAMPOLINE intno .global kvmppc_trampoline_\intno kvmppc_trampoline_\intno: - SET_SCRATCH0(r13) /* Save r13 */ + mtspr SPRN_SPRG_SCRATCH0, r13 /* Save r13 */ /* * First thing to do is to find out if we're coming @@ -78,19 +83,28 @@ kvmppc_trampoline_\intno: * * To distinguish, we check a magic byte in the PACA/current */ - LOAD_SHADOW_VCPU(r13) - PPC_STL r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13) + mfspr r13, SPRN_SPRG_THREAD + lwz r13, THREAD_KVM_SVCPU(r13) + /* PPC32 can have a NULL pointer - let's check for that */ + mtspr SPRN_SPRG_SCRATCH1, r12 /* Save r12 */ mfcr r12 - stw r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13) - lbz r12, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13) + cmpwi r13, 0 + bne 1f +2: mtcr r12 + mfspr r12, SPRN_SPRG_SCRATCH1 + mfspr r13, SPRN_SPRG_SCRATCH0 /* r13 = original r13 */ + b kvmppc_resume_\intno /* Get back original handler */ + +1: tophys(r13, r13) + stw r12, HSTATE_SCRATCH1(r13) + mfspr r12, SPRN_SPRG_SCRATCH1 + stw r12, HSTATE_SCRATCH0(r13) + lbz r12, HSTATE_IN_GUEST(r13) cmpwi r12, KVM_GUEST_MODE_NONE bne ..kvmppc_handler_hasmagic_\intno /* No KVM guest? Then jump back to the Linux handler! */ - lwz r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13) - mtcr r12 - PPC_LL r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13) - GET_SCRATCH0(r13) /* r13 = original r13 */ - b kvmppc_resume_\intno /* Get back original handler */ + lwz r12, HSTATE_SCRATCH1(r13) + b 2b /* Now we know we're handling a KVM guest */ ..kvmppc_handler_hasmagic_\intno: @@ -112,9 +126,6 @@ INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_MACHINE_CHECK INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_DATA_STORAGE INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_INST_STORAGE INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_EXTERNAL -#ifdef CONFIG_PPC_BOOK3S_64 -INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_EXTERNAL_HV -#endif INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_ALIGNMENT INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_PROGRAM INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_FP_UNAVAIL @@ -124,14 +135,6 @@ INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_TRACE INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_PERFMON INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_ALTIVEC -/* Those are only available on 64 bit machines */ - -#ifdef CONFIG_PPC_BOOK3S_64 -INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_DATA_SEGMENT -INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_INST_SEGMENT -INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_VSX -#endif - /* * Bring us back to the faulting code, but skip the * faulting instruction. @@ -143,8 +146,8 @@ INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_VSX * * R12 = free * R13 = Shadow VCPU (PACA) - * SVCPU.SCRATCH0 = guest R12 - * SVCPU.SCRATCH1 = guest CR + * HSTATE.SCRATCH0 = guest R12 + * HSTATE.SCRATCH1 = guest CR * SPRG_SCRATCH0 = guest R13 * */ @@ -156,49 +159,34 @@ kvmppc_handler_skip_ins: mtsrr0 r12 /* Clean up all state */ - lwz r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13) + lwz r12, HSTATE_SCRATCH1(r13) mtcr r12 - PPC_LL r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13) + PPC_LL r12, HSTATE_SCRATCH0(r13) GET_SCRATCH0(r13) /* And get back into the code */ RFI +#endif /* - * This trampoline brings us back to a real mode handler - * - * Input Registers: - * - * R5 = SRR0 - * R6 = SRR1 - * LR = real-mode IP + * Call kvmppc_handler_trampoline_enter in real mode * + * On entry, r4 contains the guest shadow MSR */ -.global kvmppc_handler_lowmem_trampoline -kvmppc_handler_lowmem_trampoline: - - mtsrr0 r5 +_GLOBAL(kvmppc_entry_trampoline) + mfmsr r5 + LOAD_REG_ADDR(r7, kvmppc_handler_trampoline_enter) + toreal(r7) + + li r9, MSR_RI + ori r9, r9, MSR_EE + andc r9, r5, r9 /* Clear EE and RI in MSR value */ + li r6, MSR_IR | MSR_DR + ori r6, r6, MSR_EE + andc r6, r5, r6 /* Clear EE, DR and IR in MSR value */ + MTMSR_EERI(r9) /* Clear EE and RI in MSR */ + mtsrr0 r7 /* before we set srr0/1 */ mtsrr1 r6 - blr -kvmppc_handler_lowmem_trampoline_end: - -/* - * Call a function in real mode - * - * Input Registers: - * - * R3 = function - * R4 = MSR - * R5 = scratch register - * - */ -_GLOBAL(kvmppc_rmcall) - LOAD_REG_IMMEDIATE(r5, MSR_NOIRQ) - mtmsr r5 /* Disable relocation and interrupts, so mtsrr - doesn't get interrupted */ - sync - mtsrr0 r3 - mtsrr1 r4 RFI #if defined(CONFIG_PPC_BOOK3S_32) @@ -251,12 +239,4 @@ define_load_up(altivec) define_load_up(vsx) #endif -.global kvmppc_trampoline_lowmem -kvmppc_trampoline_lowmem: - PPC_LONG kvmppc_handler_lowmem_trampoline - CONFIG_KERNEL_START - -.global kvmppc_trampoline_enter -kvmppc_trampoline_enter: - PPC_LONG kvmppc_handler_trampoline_enter - CONFIG_KERNEL_START - #include "book3s_segment.S" diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S index 451264274b8c..0676ae249b9f 100644 --- a/arch/powerpc/kvm/book3s_segment.S +++ b/arch/powerpc/kvm/book3s_segment.S @@ -22,7 +22,8 @@ #if defined(CONFIG_PPC_BOOK3S_64) #define GET_SHADOW_VCPU(reg) \ - addi reg, r13, PACA_KVM_SVCPU + mr reg, r13 +#define MTMSR_EERI(reg) mtmsrd (reg),1 #elif defined(CONFIG_PPC_BOOK3S_32) @@ -30,6 +31,7 @@ tophys(reg, r2); \ lwz reg, (THREAD + THREAD_KVM_SVCPU)(reg); \ tophys(reg, reg) +#define MTMSR_EERI(reg) mtmsr (reg) #endif @@ -57,10 +59,12 @@ kvmppc_handler_trampoline_enter: /* Required state: * * MSR = ~IR|DR - * R13 = PACA * R1 = host R1 * R2 = host R2 - * R10 = guest MSR + * R4 = guest shadow MSR + * R5 = normal host MSR + * R6 = current host MSR (EE, IR, DR off) + * LR = highmem guest exit code * all other volatile GPRS = free * SVCPU[CR] = guest CR * SVCPU[XER] = guest XER @@ -71,43 +75,76 @@ kvmppc_handler_trampoline_enter: /* r3 = shadow vcpu */ GET_SHADOW_VCPU(r3) - /* Move SRR0 and SRR1 into the respective regs */ - PPC_LL r9, SVCPU_PC(r3) - mtsrr0 r9 - mtsrr1 r10 + /* Save guest exit handler address and MSR */ + mflr r0 + PPC_STL r0, HSTATE_VMHANDLER(r3) + PPC_STL r5, HSTATE_HOST_MSR(r3) + + /* Save R1/R2 in the PACA (64-bit) or shadow_vcpu (32-bit) */ + PPC_STL r1, HSTATE_HOST_R1(r3) + PPC_STL r2, HSTATE_HOST_R2(r3) /* Activate guest mode, so faults get handled by KVM */ li r11, KVM_GUEST_MODE_GUEST - stb r11, SVCPU_IN_GUEST(r3) + stb r11, HSTATE_IN_GUEST(r3) /* Switch to guest segment. This is subarch specific. */ LOAD_GUEST_SEGMENTS +#ifdef CONFIG_PPC_BOOK3S_64 + /* Some guests may need to have dcbz set to 32 byte length. + * + * Usually we ensure that by patching the guest's instructions + * to trap on dcbz and emulate it in the hypervisor. + * + * If we can, we should tell the CPU to use 32 byte dcbz though, + * because that's a lot faster. + */ + lbz r0, HSTATE_RESTORE_HID5(r3) + cmpwi r0, 0 + beq no_dcbz32_on + + mfspr r0,SPRN_HID5 + ori r0, r0, 0x80 /* XXX HID5_dcbz32 = 0x80 */ + mtspr SPRN_HID5,r0 +no_dcbz32_on: + +#endif /* CONFIG_PPC_BOOK3S_64 */ + /* Enter guest */ - PPC_LL r4, (SVCPU_CTR)(r3) - PPC_LL r5, (SVCPU_LR)(r3) - lwz r6, (SVCPU_CR)(r3) - lwz r7, (SVCPU_XER)(r3) - - mtctr r4 - mtlr r5 - mtcr r6 - mtxer r7 - - PPC_LL r0, (SVCPU_R0)(r3) - PPC_LL r1, (SVCPU_R1)(r3) - PPC_LL r2, (SVCPU_R2)(r3) - PPC_LL r4, (SVCPU_R4)(r3) - PPC_LL r5, (SVCPU_R5)(r3) - PPC_LL r6, (SVCPU_R6)(r3) - PPC_LL r7, (SVCPU_R7)(r3) - PPC_LL r8, (SVCPU_R8)(r3) - PPC_LL r9, (SVCPU_R9)(r3) - PPC_LL r10, (SVCPU_R10)(r3) - PPC_LL r11, (SVCPU_R11)(r3) - PPC_LL r12, (SVCPU_R12)(r3) - PPC_LL r13, (SVCPU_R13)(r3) + PPC_LL r8, SVCPU_CTR(r3) + PPC_LL r9, SVCPU_LR(r3) + lwz r10, SVCPU_CR(r3) + lwz r11, SVCPU_XER(r3) + + mtctr r8 + mtlr r9 + mtcr r10 + mtxer r11 + + /* Move SRR0 and SRR1 into the respective regs */ + PPC_LL r9, SVCPU_PC(r3) + /* First clear RI in our current MSR value */ + li r0, MSR_RI + andc r6, r6, r0 + MTMSR_EERI(r6) + mtsrr0 r9 + mtsrr1 r4 + + PPC_LL r0, SVCPU_R0(r3) + PPC_LL r1, SVCPU_R1(r3) + PPC_LL r2, SVCPU_R2(r3) + PPC_LL r4, SVCPU_R4(r3) + PPC_LL r5, SVCPU_R5(r3) + PPC_LL r6, SVCPU_R6(r3) + PPC_LL r7, SVCPU_R7(r3) + PPC_LL r8, SVCPU_R8(r3) + PPC_LL r9, SVCPU_R9(r3) + PPC_LL r10, SVCPU_R10(r3) + PPC_LL r11, SVCPU_R11(r3) + PPC_LL r12, SVCPU_R12(r3) + PPC_LL r13, SVCPU_R13(r3) PPC_LL r3, (SVCPU_R3)(r3) @@ -125,56 +162,63 @@ kvmppc_handler_trampoline_enter_end: .global kvmppc_handler_trampoline_exit kvmppc_handler_trampoline_exit: +.global kvmppc_interrupt +kvmppc_interrupt: + /* Register usage at this point: * * SPRG_SCRATCH0 = guest R13 * R12 = exit handler id - * R13 = shadow vcpu - SHADOW_VCPU_OFF [=PACA on PPC64] - * SVCPU.SCRATCH0 = guest R12 - * SVCPU.SCRATCH1 = guest CR + * R13 = shadow vcpu (32-bit) or PACA (64-bit) + * HSTATE.SCRATCH0 = guest R12 + * HSTATE.SCRATCH1 = guest CR * */ /* Save registers */ - PPC_STL r0, (SHADOW_VCPU_OFF + SVCPU_R0)(r13) - PPC_STL r1, (SHADOW_VCPU_OFF + SVCPU_R1)(r13) - PPC_STL r2, (SHADOW_VCPU_OFF + SVCPU_R2)(r13) - PPC_STL r3, (SHADOW_VCPU_OFF + SVCPU_R3)(r13) - PPC_STL r4, (SHADOW_VCPU_OFF + SVCPU_R4)(r13) - PPC_STL r5, (SHADOW_VCPU_OFF + SVCPU_R5)(r13) - PPC_STL r6, (SHADOW_VCPU_OFF + SVCPU_R6)(r13) - PPC_STL r7, (SHADOW_VCPU_OFF + SVCPU_R7)(r13) - PPC_STL r8, (SHADOW_VCPU_OFF + SVCPU_R8)(r13) - PPC_STL r9, (SHADOW_VCPU_OFF + SVCPU_R9)(r13) - PPC_STL r10, (SHADOW_VCPU_OFF + SVCPU_R10)(r13) - PPC_STL r11, (SHADOW_VCPU_OFF + SVCPU_R11)(r13) + PPC_STL r0, SVCPU_R0(r13) + PPC_STL r1, SVCPU_R1(r13) + PPC_STL r2, SVCPU_R2(r13) + PPC_STL r3, SVCPU_R3(r13) + PPC_STL r4, SVCPU_R4(r13) + PPC_STL r5, SVCPU_R5(r13) + PPC_STL r6, SVCPU_R6(r13) + PPC_STL r7, SVCPU_R7(r13) + PPC_STL r8, SVCPU_R8(r13) + PPC_STL r9, SVCPU_R9(r13) + PPC_STL r10, SVCPU_R10(r13) + PPC_STL r11, SVCPU_R11(r13) /* Restore R1/R2 so we can handle faults */ - PPC_LL r1, (SHADOW_VCPU_OFF + SVCPU_HOST_R1)(r13) - PPC_LL r2, (SHADOW_VCPU_OFF + SVCPU_HOST_R2)(r13) + PPC_LL r1, HSTATE_HOST_R1(r13) + PPC_LL r2, HSTATE_HOST_R2(r13) /* Save guest PC and MSR */ +#ifdef CONFIG_PPC64 +BEGIN_FTR_SECTION andi. r0,r12,0x2 beq 1f mfspr r3,SPRN_HSRR0 mfspr r4,SPRN_HSRR1 andi. r12,r12,0x3ffd b 2f +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) +#endif 1: mfsrr0 r3 mfsrr1 r4 2: - PPC_STL r3, (SHADOW_VCPU_OFF + SVCPU_PC)(r13) - PPC_STL r4, (SHADOW_VCPU_OFF + SVCPU_SHADOW_SRR1)(r13) + PPC_STL r3, SVCPU_PC(r13) + PPC_STL r4, SVCPU_SHADOW_SRR1(r13) /* Get scratch'ed off registers */ GET_SCRATCH0(r9) - PPC_LL r8, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13) - lwz r7, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13) + PPC_LL r8, HSTATE_SCRATCH0(r13) + lwz r7, HSTATE_SCRATCH1(r13) - PPC_STL r9, (SHADOW_VCPU_OFF + SVCPU_R13)(r13) - PPC_STL r8, (SHADOW_VCPU_OFF + SVCPU_R12)(r13) - stw r7, (SHADOW_VCPU_OFF + SVCPU_CR)(r13) + PPC_STL r9, SVCPU_R13(r13) + PPC_STL r8, SVCPU_R12(r13) + stw r7, SVCPU_CR(r13) /* Save more register state */ @@ -184,11 +228,11 @@ kvmppc_handler_trampoline_exit: mfctr r8 mflr r9 - stw r5, (SHADOW_VCPU_OFF + SVCPU_XER)(r13) - PPC_STL r6, (SHADOW_VCPU_OFF + SVCPU_FAULT_DAR)(r13) - stw r7, (SHADOW_VCPU_OFF + SVCPU_FAULT_DSISR)(r13) - PPC_STL r8, (SHADOW_VCPU_OFF + SVCPU_CTR)(r13) - PPC_STL r9, (SHADOW_VCPU_OFF + SVCPU_LR)(r13) + stw r5, SVCPU_XER(r13) + PPC_STL r6, SVCPU_FAULT_DAR(r13) + stw r7, SVCPU_FAULT_DSISR(r13) + PPC_STL r8, SVCPU_CTR(r13) + PPC_STL r9, SVCPU_LR(r13) /* * In order for us to easily get the last instruction, @@ -202,11 +246,16 @@ kvmppc_handler_trampoline_exit: beq ld_last_inst cmpwi r12, BOOK3S_INTERRUPT_PROGRAM beq ld_last_inst + cmpwi r12, BOOK3S_INTERRUPT_SYSCALL + beq ld_last_prev_inst cmpwi r12, BOOK3S_INTERRUPT_ALIGNMENT beq- ld_last_inst b no_ld_last_inst +ld_last_prev_inst: + addi r3, r3, -4 + ld_last_inst: /* Save off the guest instruction we're at */ @@ -218,7 +267,7 @@ ld_last_inst: /* Set guest mode to 'jump over instruction' so if lwz faults * we'll just continue at the next IP. */ li r9, KVM_GUEST_MODE_SKIP - stb r9, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13) + stb r9, HSTATE_IN_GUEST(r13) /* 1) enable paging for data */ mfmsr r9 @@ -232,34 +281,73 @@ ld_last_inst: sync #endif - stw r0, (SHADOW_VCPU_OFF + SVCPU_LAST_INST)(r13) + stw r0, SVCPU_LAST_INST(r13) no_ld_last_inst: /* Unset guest mode */ li r9, KVM_GUEST_MODE_NONE - stb r9, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13) + stb r9, HSTATE_IN_GUEST(r13) /* Switch back to host MMU */ LOAD_HOST_SEGMENTS +#ifdef CONFIG_PPC_BOOK3S_64 + + lbz r5, HSTATE_RESTORE_HID5(r13) + cmpwi r5, 0 + beq no_dcbz32_off + + li r4, 0 + mfspr r5,SPRN_HID5 + rldimi r5,r4,6,56 + mtspr SPRN_HID5,r5 + +no_dcbz32_off: + +#endif /* CONFIG_PPC_BOOK3S_64 */ + + /* + * For some interrupts, we need to call the real Linux + * handler, so it can do work for us. This has to happen + * as if the interrupt arrived from the kernel though, + * so let's fake it here where most state is restored. + * + * Having set up SRR0/1 with the address where we want + * to continue with relocation on (potentially in module + * space), we either just go straight there with rfi[d], + * or we jump to an interrupt handler with bctr if there + * is an interrupt to be handled first. In the latter + * case, the rfi[d] at the end of the interrupt handler + * will get us back to where we want to continue. + */ + + cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL + beq 1f + cmpwi r12, BOOK3S_INTERRUPT_DECREMENTER + beq 1f + cmpwi r12, BOOK3S_INTERRUPT_PERFMON +1: mtctr r12 + /* Register usage at this point: * * R1 = host R1 * R2 = host R2 * R12 = exit handler id - * R13 = shadow vcpu - SHADOW_VCPU_OFF [=PACA on PPC64] + * R13 = shadow vcpu (32-bit) or PACA (64-bit) * SVCPU.* = guest * * */ - /* RFI into the highmem handler */ - mfmsr r7 - ori r7, r7, MSR_IR|MSR_DR|MSR_RI|MSR_ME /* Enable paging */ - mtsrr1 r7 + PPC_LL r6, HSTATE_HOST_MSR(r13) + PPC_LL r8, HSTATE_VMHANDLER(r13) + + /* Restore host msr -> SRR1 */ + mtsrr1 r6 /* Load highmem handler address */ - PPC_LL r8, (SHADOW_VCPU_OFF + SVCPU_VMHANDLER)(r13) mtsrr0 r8 + /* RFI into the highmem handler, or jump to interrupt handler */ + beqctr RFI kvmppc_handler_trampoline_exit_end: diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 8462b3a1c1c7..bb6c988f010a 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -13,6 +13,7 @@ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * * Copyright IBM Corp. 2007 + * Copyright 2010-2011 Freescale Semiconductor, Inc. * * Authors: Hollis Blanchard <hollisb@us.ibm.com> * Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com> @@ -78,6 +79,60 @@ void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu) } } +#ifdef CONFIG_SPE +void kvmppc_vcpu_disable_spe(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + enable_kernel_spe(); + kvmppc_save_guest_spe(vcpu); + vcpu->arch.shadow_msr &= ~MSR_SPE; + preempt_enable(); +} + +static void kvmppc_vcpu_enable_spe(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + enable_kernel_spe(); + kvmppc_load_guest_spe(vcpu); + vcpu->arch.shadow_msr |= MSR_SPE; + preempt_enable(); +} + +static void kvmppc_vcpu_sync_spe(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.shared->msr & MSR_SPE) { + if (!(vcpu->arch.shadow_msr & MSR_SPE)) + kvmppc_vcpu_enable_spe(vcpu); + } else if (vcpu->arch.shadow_msr & MSR_SPE) { + kvmppc_vcpu_disable_spe(vcpu); + } +} +#else +static void kvmppc_vcpu_sync_spe(struct kvm_vcpu *vcpu) +{ +} +#endif + +/* + * Helper function for "full" MSR writes. No need to call this if only + * EE/CE/ME/DE/RI are changing. + */ +void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) +{ + u32 old_msr = vcpu->arch.shared->msr; + + vcpu->arch.shared->msr = new_msr; + + kvmppc_mmu_msr_notify(vcpu, old_msr); + + if (vcpu->arch.shared->msr & MSR_WE) { + kvm_vcpu_block(vcpu); + kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS); + }; + + kvmppc_vcpu_sync_spe(vcpu); +} + static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int priority) { @@ -257,6 +312,24 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) vcpu->arch.shared->int_pending = 0; } +int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +{ + int ret; + + if (!vcpu->arch.sane) { + kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return -EINVAL; + } + + local_irq_disable(); + kvm_guest_enter(); + ret = __kvmppc_vcpu_run(kvm_run, vcpu); + kvm_guest_exit(); + local_irq_enable(); + + return ret; +} + /** * kvmppc_handle_exit * @@ -344,10 +417,16 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, r = RESUME_GUEST; break; - case BOOKE_INTERRUPT_SPE_UNAVAIL: - kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_UNAVAIL); +#ifdef CONFIG_SPE + case BOOKE_INTERRUPT_SPE_UNAVAIL: { + if (vcpu->arch.shared->msr & MSR_SPE) + kvmppc_vcpu_enable_spe(vcpu); + else + kvmppc_booke_queue_irqprio(vcpu, + BOOKE_IRQPRIO_SPE_UNAVAIL); r = RESUME_GUEST; break; + } case BOOKE_INTERRUPT_SPE_FP_DATA: kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_DATA); @@ -358,6 +437,28 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_ROUND); r = RESUME_GUEST; break; +#else + case BOOKE_INTERRUPT_SPE_UNAVAIL: + /* + * Guest wants SPE, but host kernel doesn't support it. Send + * an "unimplemented operation" program check to the guest. + */ + kvmppc_core_queue_program(vcpu, ESR_PUO | ESR_SPV); + r = RESUME_GUEST; + break; + + /* + * These really should never happen without CONFIG_SPE, + * as we should never enable the real MSR[SPE] in the guest. + */ + case BOOKE_INTERRUPT_SPE_FP_DATA: + case BOOKE_INTERRUPT_SPE_FP_ROUND: + printk(KERN_CRIT "%s: unexpected SPE interrupt %u at %08lx\n", + __func__, exit_nr, vcpu->arch.pc); + run->hw.hardware_exit_reason = exit_nr; + r = RESUME_HOST; + break; +#endif case BOOKE_INTERRUPT_DATA_STORAGE: kvmppc_core_queue_data_storage(vcpu, vcpu->arch.fault_dear, @@ -392,6 +493,17 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, gpa_t gpaddr; gfn_t gfn; +#ifdef CONFIG_KVM_E500 + if (!(vcpu->arch.shared->msr & MSR_PR) && + (eaddr & PAGE_MASK) == vcpu->arch.magic_page_ea) { + kvmppc_map_magic(vcpu); + kvmppc_account_exit(vcpu, DTLB_VIRT_MISS_EXITS); + r = RESUME_GUEST; + + break; + } +#endif + /* Check the guest TLB. */ gtlb_index = kvmppc_mmu_dtlb_index(vcpu, eaddr); if (gtlb_index < 0) { @@ -511,9 +623,11 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { int i; + int r; vcpu->arch.pc = 0; vcpu->arch.shared->msr = 0; + vcpu->arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS; kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */ vcpu->arch.shadow_pid = 1; @@ -526,7 +640,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) kvmppc_init_timing_stats(vcpu); - return kvmppc_core_vcpu_setup(vcpu); + r = kvmppc_core_vcpu_setup(vcpu); + kvmppc_sanity_check(vcpu); + return r; } int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) @@ -770,6 +886,26 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) return -ENOTSUPP; } +int kvmppc_core_prepare_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem) +{ + return 0; +} + +void kvmppc_core_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem) +{ +} + +int kvmppc_core_init_vm(struct kvm *kvm) +{ + return 0; +} + +void kvmppc_core_destroy_vm(struct kvm *kvm) +{ +} + int __init kvmppc_booke_init(void) { unsigned long ivor[16]; diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h index 492bb7030358..8e1fe33d64e5 100644 --- a/arch/powerpc/kvm/booke.h +++ b/arch/powerpc/kvm/booke.h @@ -52,24 +52,19 @@ extern unsigned long kvmppc_booke_handlers; -/* Helper function for "full" MSR writes. No need to call this if only EE is - * changing. */ -static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) -{ - if ((new_msr & MSR_PR) != (vcpu->arch.shared->msr & MSR_PR)) - kvmppc_mmu_priv_switch(vcpu, new_msr & MSR_PR); - - vcpu->arch.shared->msr = new_msr; - - if (vcpu->arch.shared->msr & MSR_WE) { - kvm_vcpu_block(vcpu); - kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS); - }; -} +void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr); +void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr); int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int inst, int *advance); int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt); int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs); +/* low-level asm code to transfer guest state */ +void kvmppc_load_guest_spe(struct kvm_vcpu *vcpu); +void kvmppc_save_guest_spe(struct kvm_vcpu *vcpu); + +/* high-level function, manages flags, host state */ +void kvmppc_vcpu_disable_spe(struct kvm_vcpu *vcpu); + #endif /* __KVM_BOOKE_H__ */ diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S index b58ccae95904..42f2fb1f66e9 100644 --- a/arch/powerpc/kvm/booke_interrupts.S +++ b/arch/powerpc/kvm/booke_interrupts.S @@ -13,6 +13,7 @@ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * * Copyright IBM Corp. 2007 + * Copyright 2011 Freescale Semiconductor, Inc. * * Authors: Hollis Blanchard <hollisb@us.ibm.com> */ @@ -24,8 +25,6 @@ #include <asm/page.h> #include <asm/asm-offsets.h> -#define KVMPPC_MSR_MASK (MSR_CE|MSR_EE|MSR_PR|MSR_DE|MSR_ME|MSR_IS|MSR_DS) - #define VCPU_GPR(n) (VCPU_GPRS + (n * 4)) /* The host stack layout: */ @@ -192,6 +191,12 @@ _GLOBAL(kvmppc_resume_host) lwz r3, VCPU_HOST_PID(r4) mtspr SPRN_PID, r3 +#ifdef CONFIG_FSL_BOOKE + /* we cheat and know that Linux doesn't use PID1 which is always 0 */ + lis r3, 0 + mtspr SPRN_PID1, r3 +#endif + /* Restore host IVPR before re-enabling interrupts. We cheat and know * that Linux IVPR is always 0xc0000000. */ lis r3, 0xc000 @@ -241,6 +246,14 @@ _GLOBAL(kvmppc_resume_host) heavyweight_exit: /* Not returning to guest. */ +#ifdef CONFIG_SPE + /* save guest SPEFSCR and load host SPEFSCR */ + mfspr r9, SPRN_SPEFSCR + stw r9, VCPU_SPEFSCR(r4) + lwz r9, VCPU_HOST_SPEFSCR(r4) + mtspr SPRN_SPEFSCR, r9 +#endif + /* We already saved guest volatile register state; now save the * non-volatiles. */ stw r15, VCPU_GPR(r15)(r4) @@ -342,6 +355,14 @@ _GLOBAL(__kvmppc_vcpu_run) lwz r30, VCPU_GPR(r30)(r4) lwz r31, VCPU_GPR(r31)(r4) +#ifdef CONFIG_SPE + /* save host SPEFSCR and load guest SPEFSCR */ + mfspr r3, SPRN_SPEFSCR + stw r3, VCPU_HOST_SPEFSCR(r4) + lwz r3, VCPU_SPEFSCR(r4) + mtspr SPRN_SPEFSCR, r3 +#endif + lightweight_exit: stw r2, HOST_R2(r1) @@ -350,6 +371,11 @@ lightweight_exit: lwz r3, VCPU_SHADOW_PID(r4) mtspr SPRN_PID, r3 +#ifdef CONFIG_FSL_BOOKE + lwz r3, VCPU_SHADOW_PID1(r4) + mtspr SPRN_PID1, r3 +#endif + #ifdef CONFIG_44x iccci 0, 0 /* XXX hack */ #endif @@ -405,20 +431,17 @@ lightweight_exit: /* Finish loading guest volatiles and jump to guest. */ lwz r3, VCPU_CTR(r4) + lwz r5, VCPU_CR(r4) + lwz r6, VCPU_PC(r4) + lwz r7, VCPU_SHADOW_MSR(r4) mtctr r3 - lwz r3, VCPU_CR(r4) - mtcr r3 + mtcr r5 + mtsrr0 r6 + mtsrr1 r7 lwz r5, VCPU_GPR(r5)(r4) lwz r6, VCPU_GPR(r6)(r4) lwz r7, VCPU_GPR(r7)(r4) lwz r8, VCPU_GPR(r8)(r4) - lwz r3, VCPU_PC(r4) - mtsrr0 r3 - lwz r3, VCPU_SHARED(r4) - lwz r3, (VCPU_SHARED_MSR + 4)(r3) - oris r3, r3, KVMPPC_MSR_MASK@h - ori r3, r3, KVMPPC_MSR_MASK@l - mtsrr1 r3 /* Clear any debug events which occurred since we disabled MSR[DE]. * XXX This gives us a 3-instruction window in which a breakpoint @@ -430,3 +453,24 @@ lightweight_exit: lwz r3, VCPU_GPR(r3)(r4) lwz r4, VCPU_GPR(r4)(r4) rfi + +#ifdef CONFIG_SPE +_GLOBAL(kvmppc_save_guest_spe) + cmpi 0,r3,0 + beqlr- + SAVE_32EVRS(0, r4, r3, VCPU_EVR) + evxor evr6, evr6, evr6 + evmwumiaa evr6, evr6, evr6 + li r4,VCPU_ACC + evstddx evr6, r4, r3 /* save acc */ + blr + +_GLOBAL(kvmppc_load_guest_spe) + cmpi 0,r3,0 + beqlr- + li r4,VCPU_ACC + evlddx evr6,r4,r3 + evmra evr6,evr6 /* load acc */ + REST_32EVRS(0, r4, r3, VCPU_EVR) + blr +#endif diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index 318dbc61ba44..26d20903f2bc 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved. + * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved. * * Author: Yu Liu, <yu.liu@freescale.com> * @@ -41,6 +41,11 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) { kvmppc_e500_tlb_put(vcpu); + +#ifdef CONFIG_SPE + if (vcpu->arch.shadow_msr & MSR_SPE) + kvmppc_vcpu_disable_spe(vcpu); +#endif } int kvmppc_core_check_processor_compat(void) @@ -68,6 +73,8 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) /* Since booke kvm only support one core, update all vcpus' PIR to 0 */ vcpu->vcpu_id = 0; + vcpu->arch.cpu_type = KVM_CPU_E500V2; + return 0; } diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index 69cd665a0caf..d48ae396f41e 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -81,8 +81,12 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) kvmppc_set_pid(vcpu, spr_val); break; case SPRN_PID1: + if (spr_val != 0) + return EMULATE_FAIL; vcpu_e500->pid[1] = spr_val; break; case SPRN_PID2: + if (spr_val != 0) + return EMULATE_FAIL; vcpu_e500->pid[2] = spr_val; break; case SPRN_MAS0: vcpu_e500->mas0 = spr_val; break; diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index b18fe353397d..13c432ea2fa8 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -28,8 +28,196 @@ #define to_htlb1_esel(esel) (tlb1_entry_num - (esel) - 1) +struct id { + unsigned long val; + struct id **pentry; +}; + +#define NUM_TIDS 256 + +/* + * This table provide mappings from: + * (guestAS,guestTID,guestPR) --> ID of physical cpu + * guestAS [0..1] + * guestTID [0..255] + * guestPR [0..1] + * ID [1..255] + * Each vcpu keeps one vcpu_id_table. + */ +struct vcpu_id_table { + struct id id[2][NUM_TIDS][2]; +}; + +/* + * This table provide reversed mappings of vcpu_id_table: + * ID --> address of vcpu_id_table item. + * Each physical core has one pcpu_id_table. + */ +struct pcpu_id_table { + struct id *entry[NUM_TIDS]; +}; + +static DEFINE_PER_CPU(struct pcpu_id_table, pcpu_sids); + +/* This variable keeps last used shadow ID on local core. + * The valid range of shadow ID is [1..255] */ +static DEFINE_PER_CPU(unsigned long, pcpu_last_used_sid); + static unsigned int tlb1_entry_num; +/* + * Allocate a free shadow id and setup a valid sid mapping in given entry. + * A mapping is only valid when vcpu_id_table and pcpu_id_table are match. + * + * The caller must have preemption disabled, and keep it that way until + * it has finished with the returned shadow id (either written into the + * TLB or arch.shadow_pid, or discarded). + */ +static inline int local_sid_setup_one(struct id *entry) +{ + unsigned long sid; + int ret = -1; + + sid = ++(__get_cpu_var(pcpu_last_used_sid)); + if (sid < NUM_TIDS) { + __get_cpu_var(pcpu_sids).entry[sid] = entry; + entry->val = sid; + entry->pentry = &__get_cpu_var(pcpu_sids).entry[sid]; + ret = sid; + } + + /* + * If sid == NUM_TIDS, we've run out of sids. We return -1, and + * the caller will invalidate everything and start over. + * + * sid > NUM_TIDS indicates a race, which we disable preemption to + * avoid. + */ + WARN_ON(sid > NUM_TIDS); + + return ret; +} + +/* + * Check if given entry contain a valid shadow id mapping. + * An ID mapping is considered valid only if + * both vcpu and pcpu know this mapping. + * + * The caller must have preemption disabled, and keep it that way until + * it has finished with the returned shadow id (either written into the + * TLB or arch.shadow_pid, or discarded). + */ +static inline int local_sid_lookup(struct id *entry) +{ + if (entry && entry->val != 0 && + __get_cpu_var(pcpu_sids).entry[entry->val] == entry && + entry->pentry == &__get_cpu_var(pcpu_sids).entry[entry->val]) + return entry->val; + return -1; +} + +/* Invalidate all id mappings on local core */ +static inline void local_sid_destroy_all(void) +{ + preempt_disable(); + __get_cpu_var(pcpu_last_used_sid) = 0; + memset(&__get_cpu_var(pcpu_sids), 0, sizeof(__get_cpu_var(pcpu_sids))); + preempt_enable(); +} + +static void *kvmppc_e500_id_table_alloc(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + vcpu_e500->idt = kzalloc(sizeof(struct vcpu_id_table), GFP_KERNEL); + return vcpu_e500->idt; +} + +static void kvmppc_e500_id_table_free(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + kfree(vcpu_e500->idt); +} + +/* Invalidate all mappings on vcpu */ +static void kvmppc_e500_id_table_reset_all(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + memset(vcpu_e500->idt, 0, sizeof(struct vcpu_id_table)); + + /* Update shadow pid when mappings are changed */ + kvmppc_e500_recalc_shadow_pid(vcpu_e500); +} + +/* Invalidate one ID mapping on vcpu */ +static inline void kvmppc_e500_id_table_reset_one( + struct kvmppc_vcpu_e500 *vcpu_e500, + int as, int pid, int pr) +{ + struct vcpu_id_table *idt = vcpu_e500->idt; + + BUG_ON(as >= 2); + BUG_ON(pid >= NUM_TIDS); + BUG_ON(pr >= 2); + + idt->id[as][pid][pr].val = 0; + idt->id[as][pid][pr].pentry = NULL; + + /* Update shadow pid when mappings are changed */ + kvmppc_e500_recalc_shadow_pid(vcpu_e500); +} + +/* + * Map guest (vcpu,AS,ID,PR) to physical core shadow id. + * This function first lookup if a valid mapping exists, + * if not, then creates a new one. + * + * The caller must have preemption disabled, and keep it that way until + * it has finished with the returned shadow id (either written into the + * TLB or arch.shadow_pid, or discarded). + */ +static unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500, + unsigned int as, unsigned int gid, + unsigned int pr, int avoid_recursion) +{ + struct vcpu_id_table *idt = vcpu_e500->idt; + int sid; + + BUG_ON(as >= 2); + BUG_ON(gid >= NUM_TIDS); + BUG_ON(pr >= 2); + + sid = local_sid_lookup(&idt->id[as][gid][pr]); + + while (sid <= 0) { + /* No mapping yet */ + sid = local_sid_setup_one(&idt->id[as][gid][pr]); + if (sid <= 0) { + _tlbil_all(); + local_sid_destroy_all(); + } + + /* Update shadow pid when mappings are changed */ + if (!avoid_recursion) + kvmppc_e500_recalc_shadow_pid(vcpu_e500); + } + + return sid; +} + +/* Map guest pid to shadow. + * We use PID to keep shadow of current guest non-zero PID, + * and use PID1 to keep shadow of guest zero PID. + * So that guest tlbe with TID=0 can be accessed at any time */ +void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + preempt_disable(); + vcpu_e500->vcpu.arch.shadow_pid = kvmppc_e500_get_sid(vcpu_e500, + get_cur_as(&vcpu_e500->vcpu), + get_cur_pid(&vcpu_e500->vcpu), + get_cur_pr(&vcpu_e500->vcpu), 1); + vcpu_e500->vcpu.arch.shadow_pid1 = kvmppc_e500_get_sid(vcpu_e500, + get_cur_as(&vcpu_e500->vcpu), 0, + get_cur_pr(&vcpu_e500->vcpu), 1); + preempt_enable(); +} + void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); @@ -41,25 +229,14 @@ void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu) for (tlbsel = 0; tlbsel < 2; tlbsel++) { printk("Guest TLB%d:\n", tlbsel); - for (i = 0; i < vcpu_e500->guest_tlb_size[tlbsel]; i++) { - tlbe = &vcpu_e500->guest_tlb[tlbsel][i]; + for (i = 0; i < vcpu_e500->gtlb_size[tlbsel]; i++) { + tlbe = &vcpu_e500->gtlb_arch[tlbsel][i]; if (tlbe->mas1 & MAS1_VALID) printk(" G[%d][%3d] | %08X | %08X | %08X | %08X |\n", tlbsel, i, tlbe->mas1, tlbe->mas2, tlbe->mas3, tlbe->mas7); } } - - for (tlbsel = 0; tlbsel < 2; tlbsel++) { - printk("Shadow TLB%d:\n", tlbsel); - for (i = 0; i < vcpu_e500->shadow_tlb_size[tlbsel]; i++) { - tlbe = &vcpu_e500->shadow_tlb[tlbsel][i]; - if (tlbe->mas1 & MAS1_VALID) - printk(" S[%d][%3d] | %08X | %08X | %08X | %08X |\n", - tlbsel, i, tlbe->mas1, tlbe->mas2, - tlbe->mas3, tlbe->mas7); - } - } } static inline unsigned int tlb0_get_next_victim( @@ -67,16 +244,17 @@ static inline unsigned int tlb0_get_next_victim( { unsigned int victim; - victim = vcpu_e500->guest_tlb_nv[0]++; - if (unlikely(vcpu_e500->guest_tlb_nv[0] >= KVM_E500_TLB0_WAY_NUM)) - vcpu_e500->guest_tlb_nv[0] = 0; + victim = vcpu_e500->gtlb_nv[0]++; + if (unlikely(vcpu_e500->gtlb_nv[0] >= KVM_E500_TLB0_WAY_NUM)) + vcpu_e500->gtlb_nv[0] = 0; return victim; } static inline unsigned int tlb1_max_shadow_size(void) { - return tlb1_entry_num - tlbcam_index; + /* reserve one entry for magic page */ + return tlb1_entry_num - tlbcam_index - 1; } static inline int tlbe_is_writable(struct tlbe *tlbe) @@ -112,72 +290,149 @@ static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode) /* * writing shadow tlb entry to host TLB */ -static inline void __write_host_tlbe(struct tlbe *stlbe) +static inline void __write_host_tlbe(struct tlbe *stlbe, uint32_t mas0) { + unsigned long flags; + + local_irq_save(flags); + mtspr(SPRN_MAS0, mas0); mtspr(SPRN_MAS1, stlbe->mas1); mtspr(SPRN_MAS2, stlbe->mas2); mtspr(SPRN_MAS3, stlbe->mas3); mtspr(SPRN_MAS7, stlbe->mas7); - __asm__ __volatile__ ("tlbwe\n" : : ); + asm volatile("isync; tlbwe" : : : "memory"); + local_irq_restore(flags); } static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500, - int tlbsel, int esel) + int tlbsel, int esel, struct tlbe *stlbe) { - struct tlbe *stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel]; - - local_irq_disable(); if (tlbsel == 0) { - __write_host_tlbe(stlbe); + __write_host_tlbe(stlbe, + MAS0_TLBSEL(0) | + MAS0_ESEL(esel & (KVM_E500_TLB0_WAY_NUM - 1))); } else { - unsigned register mas0; - - mas0 = mfspr(SPRN_MAS0); - - mtspr(SPRN_MAS0, MAS0_TLBSEL(1) | MAS0_ESEL(to_htlb1_esel(esel))); - __write_host_tlbe(stlbe); - - mtspr(SPRN_MAS0, mas0); + __write_host_tlbe(stlbe, + MAS0_TLBSEL(1) | + MAS0_ESEL(to_htlb1_esel(esel))); } - local_irq_enable(); + trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2, + stlbe->mas3, stlbe->mas7); +} + +void kvmppc_map_magic(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + struct tlbe magic; + ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK; + unsigned int stid; + pfn_t pfn; + + pfn = (pfn_t)virt_to_phys((void *)shared_page) >> PAGE_SHIFT; + get_page(pfn_to_page(pfn)); + + preempt_disable(); + stid = kvmppc_e500_get_sid(vcpu_e500, 0, 0, 0, 0); + + magic.mas1 = MAS1_VALID | MAS1_TS | MAS1_TID(stid) | + MAS1_TSIZE(BOOK3E_PAGESZ_4K); + magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M; + magic.mas3 = (pfn << PAGE_SHIFT) | + MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR; + magic.mas7 = pfn >> (32 - PAGE_SHIFT); + + __write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index)); + preempt_enable(); } void kvmppc_e500_tlb_load(struct kvm_vcpu *vcpu, int cpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - int i; - unsigned register mas0; - - /* Load all valid TLB1 entries to reduce guest tlb miss fault */ - local_irq_disable(); - mas0 = mfspr(SPRN_MAS0); - for (i = 0; i < tlb1_max_shadow_size(); i++) { - struct tlbe *stlbe = &vcpu_e500->shadow_tlb[1][i]; - - if (get_tlb_v(stlbe)) { - mtspr(SPRN_MAS0, MAS0_TLBSEL(1) - | MAS0_ESEL(to_htlb1_esel(i))); - __write_host_tlbe(stlbe); - } - } - mtspr(SPRN_MAS0, mas0); - local_irq_enable(); + + /* Shadow PID may be expired on local core */ + kvmppc_e500_recalc_shadow_pid(vcpu_e500); } void kvmppc_e500_tlb_put(struct kvm_vcpu *vcpu) { - _tlbil_all(); +} + +static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, + int tlbsel, int esel) +{ + struct tlbe *gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; + struct vcpu_id_table *idt = vcpu_e500->idt; + unsigned int pr, tid, ts, pid; + u32 val, eaddr; + unsigned long flags; + + ts = get_tlb_ts(gtlbe); + tid = get_tlb_tid(gtlbe); + + preempt_disable(); + + /* One guest ID may be mapped to two shadow IDs */ + for (pr = 0; pr < 2; pr++) { + /* + * The shadow PID can have a valid mapping on at most one + * host CPU. In the common case, it will be valid on this + * CPU, in which case (for TLB0) we do a local invalidation + * of the specific address. + * + * If the shadow PID is not valid on the current host CPU, or + * if we're invalidating a TLB1 entry, we invalidate the + * entire shadow PID. + */ + if (tlbsel == 1 || + (pid = local_sid_lookup(&idt->id[ts][tid][pr])) <= 0) { + kvmppc_e500_id_table_reset_one(vcpu_e500, ts, tid, pr); + continue; + } + + /* + * The guest is invalidating a TLB0 entry which is in a PID + * that has a valid shadow mapping on this host CPU. We + * search host TLB0 to invalidate it's shadow TLB entry, + * similar to __tlbil_va except that we need to look in AS1. + */ + val = (pid << MAS6_SPID_SHIFT) | MAS6_SAS; + eaddr = get_tlb_eaddr(gtlbe); + + local_irq_save(flags); + + mtspr(SPRN_MAS6, val); + asm volatile("tlbsx 0, %[eaddr]" : : [eaddr] "r" (eaddr)); + val = mfspr(SPRN_MAS1); + if (val & MAS1_VALID) { + mtspr(SPRN_MAS1, val & ~MAS1_VALID); + asm volatile("tlbwe"); + } + + local_irq_restore(flags); + } + + preempt_enable(); } /* Search the guest TLB for a matching entry. */ static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500, gva_t eaddr, int tlbsel, unsigned int pid, int as) { + int size = vcpu_e500->gtlb_size[tlbsel]; + int set_base; int i; - /* XXX Replace loop with fancy data structures. */ - for (i = 0; i < vcpu_e500->guest_tlb_size[tlbsel]; i++) { - struct tlbe *tlbe = &vcpu_e500->guest_tlb[tlbsel][i]; + if (tlbsel == 0) { + int mask = size / KVM_E500_TLB0_WAY_NUM - 1; + set_base = (eaddr >> PAGE_SHIFT) & mask; + set_base *= KVM_E500_TLB0_WAY_NUM; + size = KVM_E500_TLB0_WAY_NUM; + } else { + set_base = 0; + } + + for (i = 0; i < size; i++) { + struct tlbe *tlbe = &vcpu_e500->gtlb_arch[tlbsel][set_base + i]; unsigned int tid; if (eaddr < get_tlb_eaddr(tlbe)) @@ -196,66 +451,32 @@ static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500, if (get_tlb_ts(tlbe) != as && as != -1) continue; - return i; + return set_base + i; } return -1; } -static void kvmppc_e500_shadow_release(struct kvmppc_vcpu_e500 *vcpu_e500, - int tlbsel, int esel) -{ - struct tlbe *stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel]; - struct page *page = vcpu_e500->shadow_pages[tlbsel][esel]; - - if (page) { - vcpu_e500->shadow_pages[tlbsel][esel] = NULL; - - if (get_tlb_v(stlbe)) { - if (tlbe_is_writable(stlbe)) - kvm_release_page_dirty(page); - else - kvm_release_page_clean(page); - } - } -} - -static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, - int tlbsel, int esel) +static inline void kvmppc_e500_priv_setup(struct tlbe_priv *priv, + struct tlbe *gtlbe, + pfn_t pfn) { - struct tlbe *stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel]; + priv->pfn = pfn; + priv->flags = E500_TLB_VALID; - kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel); - stlbe->mas1 = 0; - trace_kvm_stlb_inval(index_of(tlbsel, esel)); + if (tlbe_is_writable(gtlbe)) + priv->flags |= E500_TLB_DIRTY; } -static void kvmppc_e500_tlb1_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, - gva_t eaddr, gva_t eend, u32 tid) +static inline void kvmppc_e500_priv_release(struct tlbe_priv *priv) { - unsigned int pid = tid & 0xff; - unsigned int i; - - /* XXX Replace loop with fancy data structures. */ - for (i = 0; i < vcpu_e500->guest_tlb_size[1]; i++) { - struct tlbe *stlbe = &vcpu_e500->shadow_tlb[1][i]; - unsigned int tid; - - if (!get_tlb_v(stlbe)) - continue; - - if (eend < get_tlb_eaddr(stlbe)) - continue; + if (priv->flags & E500_TLB_VALID) { + if (priv->flags & E500_TLB_DIRTY) + kvm_release_pfn_dirty(priv->pfn); + else + kvm_release_pfn_clean(priv->pfn); - if (eaddr > get_tlb_end(stlbe)) - continue; - - tid = get_tlb_tid(stlbe); - if (tid && (tid != pid)) - continue; - - kvmppc_e500_stlbe_invalidate(vcpu_e500, 1, i); - write_host_tlbe(vcpu_e500, 1, i); + priv->flags = 0; } } @@ -273,7 +494,7 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, tsized = (vcpu_e500->mas4 >> 7) & 0x1f; vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) - | MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]); + | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); vcpu_e500->mas1 = MAS1_VALID | (as ? MAS1_TS : 0) | MAS1_TID(vcpu_e500->pid[pidsel]) | MAS1_TSIZE(tsized); @@ -286,56 +507,154 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, vcpu_e500->mas7 = 0; } -static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, - u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, int tlbsel, int esel) +static inline void kvmppc_e500_setup_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500, + struct tlbe *gtlbe, int tsize, + struct tlbe_priv *priv, + u64 gvaddr, struct tlbe *stlbe) { - struct page *new_page; - struct tlbe *stlbe; - hpa_t hpaddr; - - stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel]; - - /* Get reference to new page. */ - new_page = gfn_to_page(vcpu_e500->vcpu.kvm, gfn); - if (is_error_page(new_page)) { - printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", - (long)gfn); - kvm_release_page_clean(new_page); - return; - } - hpaddr = page_to_phys(new_page); - - /* Drop reference to old page. */ - kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel); + pfn_t pfn = priv->pfn; + unsigned int stid; - vcpu_e500->shadow_pages[tlbsel][esel] = new_page; + stid = kvmppc_e500_get_sid(vcpu_e500, get_tlb_ts(gtlbe), + get_tlb_tid(gtlbe), + get_cur_pr(&vcpu_e500->vcpu), 0); - /* Force TS=1 IPROT=0 TSIZE=4KB for all guest mappings. */ - stlbe->mas1 = MAS1_TSIZE(BOOK3E_PAGESZ_4K) - | MAS1_TID(get_tlb_tid(gtlbe)) | MAS1_TS | MAS1_VALID; + /* Force TS=1 IPROT=0 for all guest mappings. */ + stlbe->mas1 = MAS1_TSIZE(tsize) + | MAS1_TID(stid) | MAS1_TS | MAS1_VALID; stlbe->mas2 = (gvaddr & MAS2_EPN) | e500_shadow_mas2_attrib(gtlbe->mas2, vcpu_e500->vcpu.arch.shared->msr & MSR_PR); - stlbe->mas3 = (hpaddr & MAS3_RPN) + stlbe->mas3 = ((pfn << PAGE_SHIFT) & MAS3_RPN) | e500_shadow_mas3_attrib(gtlbe->mas3, vcpu_e500->vcpu.arch.shared->msr & MSR_PR); - stlbe->mas7 = (hpaddr >> 32) & MAS7_RPN; + stlbe->mas7 = (pfn >> (32 - PAGE_SHIFT)) & MAS7_RPN; +} - trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2, - stlbe->mas3, stlbe->mas7); + +static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, + u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, int tlbsel, int esel, + struct tlbe *stlbe) +{ + struct kvm_memory_slot *slot; + unsigned long pfn, hva; + int pfnmap = 0; + int tsize = BOOK3E_PAGESZ_4K; + struct tlbe_priv *priv; + + /* + * Translate guest physical to true physical, acquiring + * a page reference if it is normal, non-reserved memory. + * + * gfn_to_memslot() must succeed because otherwise we wouldn't + * have gotten this far. Eventually we should just pass the slot + * pointer through from the first lookup. + */ + slot = gfn_to_memslot(vcpu_e500->vcpu.kvm, gfn); + hva = gfn_to_hva_memslot(slot, gfn); + + if (tlbsel == 1) { + struct vm_area_struct *vma; + down_read(¤t->mm->mmap_sem); + + vma = find_vma(current->mm, hva); + if (vma && hva >= vma->vm_start && + (vma->vm_flags & VM_PFNMAP)) { + /* + * This VMA is a physically contiguous region (e.g. + * /dev/mem) that bypasses normal Linux page + * management. Find the overlap between the + * vma and the memslot. + */ + + unsigned long start, end; + unsigned long slot_start, slot_end; + + pfnmap = 1; + + start = vma->vm_pgoff; + end = start + + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); + + pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT); + + slot_start = pfn - (gfn - slot->base_gfn); + slot_end = slot_start + slot->npages; + + if (start < slot_start) + start = slot_start; + if (end > slot_end) + end = slot_end; + + tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >> + MAS1_TSIZE_SHIFT; + + /* + * e500 doesn't implement the lowest tsize bit, + * or 1K pages. + */ + tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1); + + /* + * Now find the largest tsize (up to what the guest + * requested) that will cover gfn, stay within the + * range, and for which gfn and pfn are mutually + * aligned. + */ + + for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) { + unsigned long gfn_start, gfn_end, tsize_pages; + tsize_pages = 1 << (tsize - 2); + + gfn_start = gfn & ~(tsize_pages - 1); + gfn_end = gfn_start + tsize_pages; + + if (gfn_start + pfn - gfn < start) + continue; + if (gfn_end + pfn - gfn > end) + continue; + if ((gfn & (tsize_pages - 1)) != + (pfn & (tsize_pages - 1))) + continue; + + gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1); + pfn &= ~(tsize_pages - 1); + break; + } + } + + up_read(¤t->mm->mmap_sem); + } + + if (likely(!pfnmap)) { + pfn = gfn_to_pfn_memslot(vcpu_e500->vcpu.kvm, slot, gfn); + if (is_error_pfn(pfn)) { + printk(KERN_ERR "Couldn't get real page for gfn %lx!\n", + (long)gfn); + kvm_release_pfn_clean(pfn); + return; + } + } + + /* Drop old priv and setup new one. */ + priv = &vcpu_e500->gtlb_priv[tlbsel][esel]; + kvmppc_e500_priv_release(priv); + kvmppc_e500_priv_setup(priv, gtlbe, pfn); + + kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, tsize, priv, gvaddr, stlbe); } /* XXX only map the one-one case, for now use TLB0 */ -static int kvmppc_e500_stlbe_map(struct kvmppc_vcpu_e500 *vcpu_e500, - int tlbsel, int esel) +static int kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500, + int esel, struct tlbe *stlbe) { struct tlbe *gtlbe; - gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel]; + gtlbe = &vcpu_e500->gtlb_arch[0][esel]; kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe), get_tlb_raddr(gtlbe) >> PAGE_SHIFT, - gtlbe, tlbsel, esel); + gtlbe, 0, esel, stlbe); return esel; } @@ -344,53 +663,37 @@ static int kvmppc_e500_stlbe_map(struct kvmppc_vcpu_e500 *vcpu_e500, * the shadow TLB. */ /* XXX for both one-one and one-to-many , for now use TLB1 */ static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500, - u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe) + u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, struct tlbe *stlbe) { unsigned int victim; - victim = vcpu_e500->guest_tlb_nv[1]++; + victim = vcpu_e500->gtlb_nv[1]++; - if (unlikely(vcpu_e500->guest_tlb_nv[1] >= tlb1_max_shadow_size())) - vcpu_e500->guest_tlb_nv[1] = 0; + if (unlikely(vcpu_e500->gtlb_nv[1] >= tlb1_max_shadow_size())) + vcpu_e500->gtlb_nv[1] = 0; - kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, victim); + kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, victim, stlbe); return victim; } -/* Invalidate all guest kernel mappings when enter usermode, - * so that when they fault back in they will get the - * proper permission bits. */ -void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode) +void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr) { - if (usermode) { - struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - int i; - - /* XXX Replace loop with fancy data structures. */ - for (i = 0; i < tlb1_max_shadow_size(); i++) - kvmppc_e500_stlbe_invalidate(vcpu_e500, 1, i); + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - _tlbil_all(); - } + /* Recalc shadow pid since MSR changes */ + kvmppc_e500_recalc_shadow_pid(vcpu_e500); } -static int kvmppc_e500_gtlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, - int tlbsel, int esel) +static inline int kvmppc_e500_gtlbe_invalidate( + struct kvmppc_vcpu_e500 *vcpu_e500, + int tlbsel, int esel) { - struct tlbe *gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel]; + struct tlbe *gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; if (unlikely(get_tlb_iprot(gtlbe))) return -1; - if (tlbsel == 1) { - kvmppc_e500_tlb1_invalidate(vcpu_e500, get_tlb_eaddr(gtlbe), - get_tlb_end(gtlbe), - get_tlb_tid(gtlbe)); - } else { - kvmppc_e500_stlbe_invalidate(vcpu_e500, tlbsel, esel); - } - gtlbe->mas1 = 0; return 0; @@ -401,13 +704,14 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value) int esel; if (value & MMUCSR0_TLB0FI) - for (esel = 0; esel < vcpu_e500->guest_tlb_size[0]; esel++) + for (esel = 0; esel < vcpu_e500->gtlb_size[0]; esel++) kvmppc_e500_gtlbe_invalidate(vcpu_e500, 0, esel); if (value & MMUCSR0_TLB1FI) - for (esel = 0; esel < vcpu_e500->guest_tlb_size[1]; esel++) + for (esel = 0; esel < vcpu_e500->gtlb_size[1]; esel++) kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel); - _tlbil_all(); + /* Invalidate all vcpu id mappings */ + kvmppc_e500_id_table_reset_all(vcpu_e500); return EMULATE_DONE; } @@ -428,7 +732,7 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb) if (ia) { /* invalidate all entries */ - for (esel = 0; esel < vcpu_e500->guest_tlb_size[tlbsel]; esel++) + for (esel = 0; esel < vcpu_e500->gtlb_size[tlbsel]; esel++) kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); } else { ea &= 0xfffff000; @@ -438,7 +742,8 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb) kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); } - _tlbil_all(); + /* Invalidate all vcpu id mappings */ + kvmppc_e500_id_table_reset_all(vcpu_e500); return EMULATE_DONE; } @@ -452,9 +757,9 @@ int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu) tlbsel = get_tlb_tlbsel(vcpu_e500); esel = get_tlb_esel(vcpu_e500, tlbsel); - gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel]; + gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; vcpu_e500->mas0 &= ~MAS0_NV(~0); - vcpu_e500->mas0 |= MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]); + vcpu_e500->mas0 |= MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); vcpu_e500->mas1 = gtlbe->mas1; vcpu_e500->mas2 = gtlbe->mas2; vcpu_e500->mas3 = gtlbe->mas3; @@ -477,14 +782,14 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) for (tlbsel = 0; tlbsel < 2; tlbsel++) { esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as); if (esel >= 0) { - gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel]; + gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; break; } } if (gtlbe) { vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(esel) - | MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]); + | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); vcpu_e500->mas1 = gtlbe->mas1; vcpu_e500->mas2 = gtlbe->mas2; vcpu_e500->mas3 = gtlbe->mas3; @@ -497,7 +802,7 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0; vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) - | MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]); + | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); vcpu_e500->mas1 = (vcpu_e500->mas6 & MAS6_SPID0) | (vcpu_e500->mas6 & (MAS6_SAS ? MAS1_TS : 0)) | (vcpu_e500->mas4 & MAS4_TSIZED(~0)); @@ -514,23 +819,16 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - u64 eaddr; - u64 raddr; - u32 tid; struct tlbe *gtlbe; - int tlbsel, esel, stlbsel, sesel; + int tlbsel, esel; tlbsel = get_tlb_tlbsel(vcpu_e500); esel = get_tlb_esel(vcpu_e500, tlbsel); - gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel]; + gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; - if (get_tlb_v(gtlbe) && tlbsel == 1) { - eaddr = get_tlb_eaddr(gtlbe); - tid = get_tlb_tid(gtlbe); - kvmppc_e500_tlb1_invalidate(vcpu_e500, eaddr, - get_tlb_end(gtlbe), tid); - } + if (get_tlb_v(gtlbe)) + kvmppc_e500_stlbe_invalidate(vcpu_e500, tlbsel, esel); gtlbe->mas1 = vcpu_e500->mas1; gtlbe->mas2 = vcpu_e500->mas2; @@ -542,6 +840,12 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */ if (tlbe_is_host_safe(vcpu, gtlbe)) { + struct tlbe stlbe; + int stlbsel, sesel; + u64 eaddr; + u64 raddr; + + preempt_disable(); switch (tlbsel) { case 0: /* TLB0 */ @@ -549,7 +853,7 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K); stlbsel = 0; - sesel = kvmppc_e500_stlbe_map(vcpu_e500, 0, esel); + sesel = kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe); break; @@ -564,13 +868,14 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) * are mapped on the fly. */ stlbsel = 1; sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, - raddr >> PAGE_SHIFT, gtlbe); + raddr >> PAGE_SHIFT, gtlbe, &stlbe); break; default: BUG(); } - write_host_tlbe(vcpu_e500, stlbsel, sesel); + write_host_tlbe(vcpu_e500, stlbsel, sesel, &stlbe); + preempt_enable(); } kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS); @@ -610,7 +915,7 @@ gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index, { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); struct tlbe *gtlbe = - &vcpu_e500->guest_tlb[tlbsel_of(index)][esel_of(index)]; + &vcpu_e500->gtlb_arch[tlbsel_of(index)][esel_of(index)]; u64 pgmask = get_tlb_bytes(gtlbe) - 1; return get_tlb_raddr(gtlbe) | (eaddr & pgmask); @@ -618,38 +923,37 @@ gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index, void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) { - struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - int tlbsel, i; - - for (tlbsel = 0; tlbsel < 2; tlbsel++) - for (i = 0; i < vcpu_e500->guest_tlb_size[tlbsel]; i++) - kvmppc_e500_shadow_release(vcpu_e500, tlbsel, i); - - /* discard all guest mapping */ - _tlbil_all(); } void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, unsigned int index) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + struct tlbe_priv *priv; + struct tlbe *gtlbe, stlbe; int tlbsel = tlbsel_of(index); int esel = esel_of(index); int stlbsel, sesel; + gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; + + preempt_disable(); switch (tlbsel) { case 0: stlbsel = 0; sesel = esel; + priv = &vcpu_e500->gtlb_priv[stlbsel][sesel]; + + kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, BOOK3E_PAGESZ_4K, + priv, eaddr, &stlbe); break; case 1: { gfn_t gfn = gpaddr >> PAGE_SHIFT; - struct tlbe *gtlbe - = &vcpu_e500->guest_tlb[tlbsel][esel]; stlbsel = 1; - sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn, gtlbe); + sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn, + gtlbe, &stlbe); break; } @@ -657,7 +961,9 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, BUG(); break; } - write_host_tlbe(vcpu_e500, stlbsel, sesel); + + write_host_tlbe(vcpu_e500, stlbsel, sesel, &stlbe); + preempt_enable(); } int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu, @@ -679,8 +985,10 @@ void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - vcpu_e500->pid[0] = vcpu->arch.shadow_pid = - vcpu->arch.pid = pid; + if (vcpu->arch.pid != pid) { + vcpu_e500->pid[0] = vcpu->arch.pid = pid; + kvmppc_e500_recalc_shadow_pid(vcpu_e500); + } } void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500) @@ -688,14 +996,14 @@ void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500) struct tlbe *tlbe; /* Insert large initial mapping for guest. */ - tlbe = &vcpu_e500->guest_tlb[1][0]; + tlbe = &vcpu_e500->gtlb_arch[1][0]; tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M); tlbe->mas2 = 0; tlbe->mas3 = E500_TLB_SUPER_PERM_MASK; tlbe->mas7 = 0; /* 4K map for serial output. Used by kernel wrapper. */ - tlbe = &vcpu_e500->guest_tlb[1][1]; + tlbe = &vcpu_e500->gtlb_arch[1][1]; tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K); tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G; tlbe->mas3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK; @@ -706,68 +1014,64 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) { tlb1_entry_num = mfspr(SPRN_TLB1CFG) & 0xFFF; - vcpu_e500->guest_tlb_size[0] = KVM_E500_TLB0_SIZE; - vcpu_e500->guest_tlb[0] = + vcpu_e500->gtlb_size[0] = KVM_E500_TLB0_SIZE; + vcpu_e500->gtlb_arch[0] = kzalloc(sizeof(struct tlbe) * KVM_E500_TLB0_SIZE, GFP_KERNEL); - if (vcpu_e500->guest_tlb[0] == NULL) + if (vcpu_e500->gtlb_arch[0] == NULL) goto err_out; - vcpu_e500->shadow_tlb_size[0] = KVM_E500_TLB0_SIZE; - vcpu_e500->shadow_tlb[0] = - kzalloc(sizeof(struct tlbe) * KVM_E500_TLB0_SIZE, GFP_KERNEL); - if (vcpu_e500->shadow_tlb[0] == NULL) - goto err_out_guest0; - - vcpu_e500->guest_tlb_size[1] = KVM_E500_TLB1_SIZE; - vcpu_e500->guest_tlb[1] = + vcpu_e500->gtlb_size[1] = KVM_E500_TLB1_SIZE; + vcpu_e500->gtlb_arch[1] = kzalloc(sizeof(struct tlbe) * KVM_E500_TLB1_SIZE, GFP_KERNEL); - if (vcpu_e500->guest_tlb[1] == NULL) - goto err_out_shadow0; + if (vcpu_e500->gtlb_arch[1] == NULL) + goto err_out_guest0; - vcpu_e500->shadow_tlb_size[1] = tlb1_entry_num; - vcpu_e500->shadow_tlb[1] = - kzalloc(sizeof(struct tlbe) * tlb1_entry_num, GFP_KERNEL); - if (vcpu_e500->shadow_tlb[1] == NULL) + vcpu_e500->gtlb_priv[0] = (struct tlbe_priv *) + kzalloc(sizeof(struct tlbe_priv) * KVM_E500_TLB0_SIZE, GFP_KERNEL); + if (vcpu_e500->gtlb_priv[0] == NULL) goto err_out_guest1; + vcpu_e500->gtlb_priv[1] = (struct tlbe_priv *) + kzalloc(sizeof(struct tlbe_priv) * KVM_E500_TLB1_SIZE, GFP_KERNEL); - vcpu_e500->shadow_pages[0] = (struct page **) - kzalloc(sizeof(struct page *) * KVM_E500_TLB0_SIZE, GFP_KERNEL); - if (vcpu_e500->shadow_pages[0] == NULL) - goto err_out_shadow1; + if (vcpu_e500->gtlb_priv[1] == NULL) + goto err_out_priv0; - vcpu_e500->shadow_pages[1] = (struct page **) - kzalloc(sizeof(struct page *) * tlb1_entry_num, GFP_KERNEL); - if (vcpu_e500->shadow_pages[1] == NULL) - goto err_out_page0; + if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL) + goto err_out_priv1; /* Init TLB configuration register */ vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & ~0xfffUL; - vcpu_e500->tlb0cfg |= vcpu_e500->guest_tlb_size[0]; + vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_size[0]; vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) & ~0xfffUL; - vcpu_e500->tlb1cfg |= vcpu_e500->guest_tlb_size[1]; + vcpu_e500->tlb1cfg |= vcpu_e500->gtlb_size[1]; return 0; -err_out_page0: - kfree(vcpu_e500->shadow_pages[0]); -err_out_shadow1: - kfree(vcpu_e500->shadow_tlb[1]); +err_out_priv1: + kfree(vcpu_e500->gtlb_priv[1]); +err_out_priv0: + kfree(vcpu_e500->gtlb_priv[0]); err_out_guest1: - kfree(vcpu_e500->guest_tlb[1]); -err_out_shadow0: - kfree(vcpu_e500->shadow_tlb[0]); + kfree(vcpu_e500->gtlb_arch[1]); err_out_guest0: - kfree(vcpu_e500->guest_tlb[0]); + kfree(vcpu_e500->gtlb_arch[0]); err_out: return -1; } void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500) { - kfree(vcpu_e500->shadow_pages[1]); - kfree(vcpu_e500->shadow_pages[0]); - kfree(vcpu_e500->shadow_tlb[1]); - kfree(vcpu_e500->guest_tlb[1]); - kfree(vcpu_e500->shadow_tlb[0]); - kfree(vcpu_e500->guest_tlb[0]); + int stlbsel, i; + + /* release all privs */ + for (stlbsel = 0; stlbsel < 2; stlbsel++) + for (i = 0; i < vcpu_e500->gtlb_size[stlbsel]; i++) { + struct tlbe_priv *priv = + &vcpu_e500->gtlb_priv[stlbsel][i]; + kvmppc_e500_priv_release(priv); + } + + kvmppc_e500_id_table_free(vcpu_e500); + kfree(vcpu_e500->gtlb_arch[1]); + kfree(vcpu_e500->gtlb_arch[0]); } diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500_tlb.h index 458946b4775d..59b88e99a235 100644 --- a/arch/powerpc/kvm/e500_tlb.h +++ b/arch/powerpc/kvm/e500_tlb.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved. + * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved. * * Author: Yu Liu, yu.liu@freescale.com * @@ -55,6 +55,7 @@ extern void kvmppc_e500_tlb_load(struct kvm_vcpu *, int); extern int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *); extern void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *); extern void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *); +extern void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *); /* TLB helper functions */ static inline unsigned int get_tlb_size(const struct tlbe *tlbe) @@ -110,6 +111,16 @@ static inline unsigned int get_cur_pid(struct kvm_vcpu *vcpu) return vcpu->arch.pid & 0xff; } +static inline unsigned int get_cur_as(struct kvm_vcpu *vcpu) +{ + return !!(vcpu->arch.shared->msr & (MSR_IS | MSR_DS)); +} + +static inline unsigned int get_cur_pr(struct kvm_vcpu *vcpu) +{ + return !!(vcpu->arch.shared->msr & MSR_PR); +} + static inline unsigned int get_cur_spid( const struct kvmppc_vcpu_e500 *vcpu_e500) { diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 616dd516ca1f..607fbdf24b84 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -21,7 +21,6 @@ #include <linux/errno.h> #include <linux/err.h> #include <linux/kvm_host.h> -#include <linux/module.h> #include <linux/vmalloc.h> #include <linux/hrtimer.h> #include <linux/fs.h> @@ -30,6 +29,7 @@ #include <asm/uaccess.h> #include <asm/kvm_ppc.h> #include <asm/tlbflush.h> +#include <asm/cputhreads.h> #include "timing.h" #include "../mm/mmu_decl.h" @@ -73,7 +73,8 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) } case HC_VENDOR_KVM | KVM_HC_FEATURES: r = HC_EV_SUCCESS; -#if defined(CONFIG_PPC_BOOK3S) /* XXX Missing magic page on BookE */ +#if defined(CONFIG_PPC_BOOK3S) || defined(CONFIG_KVM_E500) + /* XXX Missing magic page on 44x */ r2 |= (1 << KVM_FEATURE_MAGIC_PAGE); #endif @@ -89,6 +90,31 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) return r; } +int kvmppc_sanity_check(struct kvm_vcpu *vcpu) +{ + int r = false; + + /* We have to know what CPU to virtualize */ + if (!vcpu->arch.pvr) + goto out; + + /* PAPR only works with book3s_64 */ + if ((vcpu->arch.cpu_type != KVM_CPU_3S_64) && vcpu->arch.papr_enabled) + goto out; + +#ifdef CONFIG_KVM_BOOK3S_64_HV + /* HV KVM can only do PAPR mode for now */ + if (!vcpu->arch.papr_enabled) + goto out; +#endif + + r = true; + +out: + vcpu->arch.sane = r; + return r ? 0 : -EINVAL; +} + int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu) { enum emulation_result er; @@ -147,7 +173,7 @@ void kvm_arch_check_processor_compat(void *rtn) int kvm_arch_init_vm(struct kvm *kvm) { - return 0; + return kvmppc_core_init_vm(kvm); } void kvm_arch_destroy_vm(struct kvm *kvm) @@ -163,6 +189,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm->vcpus[i] = NULL; atomic_set(&kvm->online_vcpus, 0); + + kvmppc_core_destroy_vm(kvm); + mutex_unlock(&kvm->lock); } @@ -179,11 +208,15 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PPC_BOOKE_SREGS: #else case KVM_CAP_PPC_SEGSTATE: + case KVM_CAP_PPC_PAPR: #endif - case KVM_CAP_PPC_PAIRED_SINGLES: case KVM_CAP_PPC_UNSET_IRQ: case KVM_CAP_PPC_IRQ_LEVEL: case KVM_CAP_ENABLE_CAP: + r = 1; + break; +#ifndef CONFIG_KVM_BOOK3S_64_HV + case KVM_CAP_PPC_PAIRED_SINGLES: case KVM_CAP_PPC_OSI: case KVM_CAP_PPC_GET_PVINFO: r = 1; @@ -191,6 +224,21 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_COALESCED_MMIO: r = KVM_COALESCED_MMIO_PAGE_OFFSET; break; +#endif +#ifdef CONFIG_KVM_BOOK3S_64_HV + case KVM_CAP_SPAPR_TCE: + r = 1; + break; + case KVM_CAP_PPC_SMT: + r = threads_per_core; + break; + case KVM_CAP_PPC_RMA: + r = 1; + /* PPC970 requires an RMA */ + if (cpu_has_feature(CPU_FTR_ARCH_201)) + r = 2; + break; +#endif default: r = 0; break; @@ -211,7 +259,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, int user_alloc) { - return 0; + return kvmppc_core_prepare_memory_region(kvm, mem); } void kvm_arch_commit_memory_region(struct kvm *kvm, @@ -219,7 +267,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_memory_slot old, int user_alloc) { - return; + kvmppc_core_commit_memory_region(kvm, mem); } @@ -231,6 +279,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { struct kvm_vcpu *vcpu; vcpu = kvmppc_core_vcpu_create(kvm, id); + vcpu->arch.wqp = &vcpu->wq; if (!IS_ERR(vcpu)) kvmppc_create_vcpu_debugfs(vcpu, id); return vcpu; @@ -262,8 +311,8 @@ static void kvmppc_decrementer_func(unsigned long data) kvmppc_core_queue_dec(vcpu); - if (waitqueue_active(&vcpu->wq)) { - wake_up_interruptible(&vcpu->wq); + if (waitqueue_active(vcpu->arch.wqp)) { + wake_up_interruptible(vcpu->arch.wqp); vcpu->stat.halt_wakeup++; } } @@ -287,6 +336,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu); vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; + vcpu->arch.dec_expires = ~(u64)0; #ifdef CONFIG_KVM_EXIT_TIMING mutex_init(&vcpu->arch.exit_timing_lock); @@ -313,6 +363,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); #endif kvmppc_core_vcpu_load(vcpu, cpu); + vcpu->cpu = smp_processor_id(); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -321,6 +372,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) #ifdef CONFIG_BOOKE vcpu->arch.vrsave = mfspr(SPRN_VRSAVE); #endif + vcpu->cpu = -1; } int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, @@ -492,15 +544,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) for (i = 0; i < 32; i++) kvmppc_set_gpr(vcpu, i, gprs[i]); vcpu->arch.osi_needed = 0; + } else if (vcpu->arch.hcall_needed) { + int i; + + kvmppc_set_gpr(vcpu, 3, run->papr_hcall.ret); + for (i = 0; i < 9; ++i) + kvmppc_set_gpr(vcpu, 4 + i, run->papr_hcall.args[i]); + vcpu->arch.hcall_needed = 0; } kvmppc_core_deliver_interrupts(vcpu); - local_irq_disable(); - kvm_guest_enter(); - r = __kvmppc_vcpu_run(run, vcpu); - kvm_guest_exit(); - local_irq_enable(); + r = kvmppc_vcpu_run(run, vcpu); if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); @@ -510,14 +565,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { - if (irq->irq == KVM_INTERRUPT_UNSET) + if (irq->irq == KVM_INTERRUPT_UNSET) { kvmppc_core_dequeue_external(vcpu, irq); - else - kvmppc_core_queue_external(vcpu, irq); + return 0; + } + + kvmppc_core_queue_external(vcpu, irq); - if (waitqueue_active(&vcpu->wq)) { - wake_up_interruptible(&vcpu->wq); + if (waitqueue_active(vcpu->arch.wqp)) { + wake_up_interruptible(vcpu->arch.wqp); vcpu->stat.halt_wakeup++; + } else if (vcpu->cpu != -1) { + smp_send_reschedule(vcpu->cpu); } return 0; @@ -536,11 +595,18 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, r = 0; vcpu->arch.osi_enabled = true; break; + case KVM_CAP_PPC_PAPR: + r = 0; + vcpu->arch.papr_enabled = true; + break; default: r = -EINVAL; break; } + if (!r) + r = kvmppc_sanity_check(vcpu); + return r; } @@ -633,6 +699,29 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } +#ifdef CONFIG_KVM_BOOK3S_64_HV + case KVM_CREATE_SPAPR_TCE: { + struct kvm_create_spapr_tce create_tce; + struct kvm *kvm = filp->private_data; + + r = -EFAULT; + if (copy_from_user(&create_tce, argp, sizeof(create_tce))) + goto out; + r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce); + goto out; + } + + case KVM_ALLOCATE_RMA: { + struct kvm *kvm = filp->private_data; + struct kvm_allocate_rma rma; + + r = kvm_vm_ioctl_allocate_rma(kvm, &rma); + if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma))) + r = -EFAULT; + break; + } +#endif /* CONFIG_KVM_BOOK3S_64_HV */ + default: r = -ENOTTY; } diff --git a/arch/powerpc/kvm/timing.c b/arch/powerpc/kvm/timing.c index 319177df9587..07b6110a4bb7 100644 --- a/arch/powerpc/kvm/timing.c +++ b/arch/powerpc/kvm/timing.c @@ -56,15 +56,6 @@ static void add_exit_timing(struct kvm_vcpu *vcpu, u64 duration, int type) { u64 old; - do_div(duration, tb_ticks_per_usec); - if (unlikely(duration > 0xFFFFFFFF)) { - printk(KERN_ERR"%s - duration too big -> overflow" - " duration %lld type %d exit #%d\n", - __func__, duration, type, - vcpu->arch.timing_count_type[type]); - return; - } - mutex_lock(&vcpu->arch.exit_timing_lock); vcpu->arch.timing_count_type[type]++; diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h index 3aca1b042b8c..b135d3d397db 100644 --- a/arch/powerpc/kvm/trace.h +++ b/arch/powerpc/kvm/trace.h @@ -103,7 +103,7 @@ TRACE_EVENT(kvm_gtlb_write, * Book3S trace points * *************************************************************************/ -#ifdef CONFIG_PPC_BOOK3S +#ifdef CONFIG_KVM_BOOK3S_PR TRACE_EVENT(kvm_book3s_exit, TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), @@ -252,7 +252,7 @@ TRACE_EVENT(kvm_book3s_mmu_flush, ), TP_fast_assign( - __entry->count = vcpu->arch.hpte_cache_count; + __entry->count = to_book3s(vcpu)->hpte_cache_count; __entry->p1 = p1; __entry->p2 = p2; __entry->type = type; |