From 37178b8bf00137dbf28a9b291af4fbc1b8f91dcc Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 29 Nov 2011 14:02:45 +0900 Subject: KVM: MMU: Remove for_each_unsync_children() macro There is only one user of it and for_each_set_bit() does the same. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 224b02c3cda9..8a9b27cb4449 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1391,11 +1391,6 @@ struct kvm_mmu_pages { unsigned int nr; }; -#define for_each_unsync_children(bitmap, idx) \ - for (idx = find_first_bit(bitmap, 512); \ - idx < 512; \ - idx = find_next_bit(bitmap, 512, idx+1)) - static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, int idx) { @@ -1417,7 +1412,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, { int i, ret, nr_unsync_leaf = 0; - for_each_unsync_children(sp->unsync_child_bitmap, i) { + for_each_set_bit(i, sp->unsync_child_bitmap, 512) { struct kvm_mmu_page *child; u64 ent = sp->spt[i]; -- cgit v1.2.3 From 6addd1aa2ca28c054820ef2966ad372f118c3f31 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 29 Nov 2011 14:03:36 +0900 Subject: KVM: MMU: Add missing large page accounting to drop_large_spte() Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8a9b27cb4449..9270e0d93c31 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1798,6 +1798,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) { if (is_large_pte(*sptep)) { drop_spte(vcpu->kvm, sptep); + --vcpu->kvm->stat.lpages; kvm_flush_remote_tlbs(vcpu->kvm); } } -- cgit v1.2.3 From a138fe7535c0ec778465c7b54b1aaaf4cfd885b7 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 16 Dec 2011 18:18:10 +0800 Subject: KVM: MMU: remove the redundant get_written_sptes get_written_sptes is called twice in kvm_mmu_pte_write, one of them can be removed Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9270e0d93c31..34da43086952 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3551,7 +3551,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, * If we're seeing too many writes to a page, it may no longer be a page table, * or we may be forking, in which case it is better to unmap the page. */ -static bool detect_write_flooding(struct kvm_mmu_page *sp, u64 *spte) +static bool detect_write_flooding(struct kvm_mmu_page *sp) { /* * Skip write-flooding detected for the sp whose level is 1, because @@ -3660,10 +3660,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { - spte = get_written_sptes(sp, gpa, &npte); - if (detect_write_misaligned(sp, gpa, bytes) || - detect_write_flooding(sp, spte)) { + detect_write_flooding(sp)) { zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); ++vcpu->kvm->stat.mmu_flooded; -- cgit v1.2.3 From e08b96371625aaa84cb03f51acc4c8e0be27403a Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Wed, 4 Jan 2012 10:25:20 +0100 Subject: KVM: s390: add parameter for KVM_CREATE_VM This patch introduces a new config option for user controlled kernel virtual machines. It introduces a parameter to KVM_CREATE_VM that allows to set bits that alter the capabilities of the newly created virtual machine. The parameter is passed to kvm_arch_init_vm for all architectures. The only valid modifier bit for now is KVM_VM_S390_UCONTROL. This requires CAP_SYS_ADMIN privileges and creates a user controlled virtual machine on s390 architectures. Signed-off-by: Carsten Otte Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- Documentation/virtual/kvm/api.txt | 7 ++++++- arch/ia64/kvm/kvm-ia64.c | 5 ++++- arch/powerpc/kvm/powerpc.c | 5 ++++- arch/s390/kvm/Kconfig | 9 +++++++++ arch/s390/kvm/kvm-s390.c | 24 +++++++++++++++++++----- arch/s390/kvm/kvm-s390.h | 10 ++++++++++ arch/x86/kvm/x86.c | 5 ++++- include/linux/kvm.h | 3 +++ include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 13 +++++-------- 10 files changed, 65 insertions(+), 18 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index e1d94bf4056e..579d40b26a5a 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -95,7 +95,7 @@ described as 'basic' will be available. Capability: basic Architectures: all Type: system ioctl -Parameters: none +Parameters: machine type identifier (KVM_VM_*) Returns: a VM fd that can be used to control the new virtual machine. The new VM has no virtual cpus and no memory. An mmap() of a VM fd @@ -103,6 +103,11 @@ will access the virtual machine's physical address space; offset zero corresponds to guest physical address zero. Use of mmap() on a VM fd is discouraged if userspace memory allocation (KVM_CAP_USER_MEMORY) is available. +You most certainly want to use 0 as machine type. + +In order to create user controlled virtual machines on S390, check +KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL as +privileged user (CAP_SYS_ADMIN). 4.3 KVM_GET_MSR_INDEX_LIST diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 405052002493..df6b14194051 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -809,10 +809,13 @@ static void kvm_build_io_pmt(struct kvm *kvm) #define GUEST_PHYSICAL_RR4 0x2739 #define VMM_INIT_RR 0x1660 -int kvm_arch_init_vm(struct kvm *kvm) +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { BUG_ON(!kvm); + if (type) + return -EINVAL; + kvm->arch.is_sn2 = ia64_platform_is("sn2"); kvm->arch.metaphysical_rr0 = GUEST_PHYSICAL_RR0; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 607fbdf24b84..83f244569874 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -171,8 +171,11 @@ void kvm_arch_check_processor_compat(void *rtn) *(int *)rtn = kvmppc_core_check_processor_compat(); } -int kvm_arch_init_vm(struct kvm *kvm) +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { + if (type) + return -EINVAL; + return kvmppc_core_init_vm(kvm); } diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index a21634173a66..78eb9847008f 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -34,6 +34,15 @@ config KVM If unsure, say N. +config KVM_S390_UCONTROL + bool "Userspace controlled virtual machines" + depends on KVM + ---help--- + Allow CAP_SYS_ADMIN users to create KVM virtual machines that are + controlled by userspace. + + If unsure, say N. + # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/vhost/Kconfig diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d1c445732451..f0937552175b 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -171,11 +171,22 @@ long kvm_arch_vm_ioctl(struct file *filp, return r; } -int kvm_arch_init_vm(struct kvm *kvm) +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int rc; char debug_name[16]; + rc = -EINVAL; +#ifdef CONFIG_KVM_S390_UCONTROL + if (type & ~KVM_VM_S390_UCONTROL) + goto out_err; + if ((type & KVM_VM_S390_UCONTROL) && (!capable(CAP_SYS_ADMIN))) + goto out_err; +#else + if (type) + goto out_err; +#endif + rc = s390_enable_sie(); if (rc) goto out_err; @@ -198,10 +209,13 @@ int kvm_arch_init_vm(struct kvm *kvm) debug_register_view(kvm->arch.dbf, &debug_sprintf_view); VM_EVENT(kvm, 3, "%s", "vm created"); - kvm->arch.gmap = gmap_alloc(current->mm); - if (!kvm->arch.gmap) - goto out_nogmap; - + if (type & KVM_VM_S390_UCONTROL) { + kvm->arch.gmap = NULL; + } else { + kvm->arch.gmap = gmap_alloc(current->mm); + if (!kvm->arch.gmap) + goto out_nogmap; + } return 0; out_nogmap: debug_unregister(kvm->arch.dbf); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 99b0b7597115..45b236a7c730 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -47,6 +47,16 @@ static inline int __cpu_is_stopped(struct kvm_vcpu *vcpu) return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_STOP_INT; } +static inline int kvm_is_ucontrol(struct kvm *kvm) +{ +#ifdef CONFIG_KVM_S390_UCONTROL + if (kvm->arch.gmap) + return 0; + return 1; +#else + return 0; +#endif +} int kvm_s390_handle_wait(struct kvm_vcpu *vcpu); enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer); void kvm_s390_tasklet(unsigned long parm); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9cbfc0698118..06925b4bcc27 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6031,8 +6031,11 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) free_page((unsigned long)vcpu->arch.pio_data); } -int kvm_arch_init_vm(struct kvm *kvm) +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { + if (type) + return -EINVAL; + INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 68e67e50d028..bba393a6760f 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -431,6 +431,9 @@ struct kvm_ppc_pvinfo { #define KVMIO 0xAE +/* machine type bits, to be used as argument to KVM_CREATE_VM */ +#define KVM_VM_S390_UCONTROL 1 + /* * ioctls for /dev/kvm fds: */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 900c76337e8f..82375e145e64 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -520,7 +520,7 @@ static inline void kvm_arch_free_vm(struct kvm *kvm) } #endif -int kvm_arch_init_vm(struct kvm *kvm); +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type); void kvm_arch_destroy_vm(struct kvm *kvm); void kvm_free_all_assigned_devices(struct kvm *kvm); void kvm_arch_sync_events(struct kvm *kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a91f980077d8..32e3b048a6cf 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -449,7 +449,7 @@ static void kvm_init_memslots_id(struct kvm *kvm) slots->id_to_index[i] = slots->memslots[i].id = i; } -static struct kvm *kvm_create_vm(void) +static struct kvm *kvm_create_vm(unsigned long type) { int r, i; struct kvm *kvm = kvm_arch_alloc_vm(); @@ -457,7 +457,7 @@ static struct kvm *kvm_create_vm(void) if (!kvm) return ERR_PTR(-ENOMEM); - r = kvm_arch_init_vm(kvm); + r = kvm_arch_init_vm(kvm, type); if (r) goto out_err_nodisable; @@ -2198,12 +2198,12 @@ static struct file_operations kvm_vm_fops = { .llseek = noop_llseek, }; -static int kvm_dev_ioctl_create_vm(void) +static int kvm_dev_ioctl_create_vm(unsigned long type) { int r; struct kvm *kvm; - kvm = kvm_create_vm(); + kvm = kvm_create_vm(type); if (IS_ERR(kvm)) return PTR_ERR(kvm); #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET @@ -2254,10 +2254,7 @@ static long kvm_dev_ioctl(struct file *filp, r = KVM_API_VERSION; break; case KVM_CREATE_VM: - r = -EINVAL; - if (arg) - goto out; - r = kvm_dev_ioctl_create_vm(); + r = kvm_dev_ioctl_create_vm(arg); break; case KVM_CHECK_EXTENSION: r = kvm_dev_ioctl_check_extension_generic(arg); -- cgit v1.2.3 From 27e0393f15fc8bc855c6a888387ff5ffd2181089 Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Wed, 4 Jan 2012 10:25:21 +0100 Subject: KVM: s390: ucontrol: per vcpu address spaces This patch introduces two ioctls for virtual cpus, that are only valid for kernel virtual machines that are controlled by userspace. Each virtual cpu has its individual address space in this mode of operation, and each address space is backed by the gmap implementation just like the address space for regular KVM guests. KVM_S390_UCAS_MAP allows to map a part of the user's virtual address space to the vcpu. Starting offset and length in both the user and the vcpu address space need to be aligned to 1M. KVM_S390_UCAS_UNMAP can be used to unmap a range of memory from a virtual cpu in a similar way. Signed-off-by: Carsten Otte Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- Documentation/virtual/kvm/api.txt | 38 +++++++++++++++++++++++++++++ arch/s390/kvm/kvm-s390.c | 50 ++++++++++++++++++++++++++++++++++++++- include/linux/kvm.h | 10 ++++++++ 3 files changed, 97 insertions(+), 1 deletion(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 579d40b26a5a..ee394b263261 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1496,6 +1496,44 @@ following algorithm: Some guests configure the LINT1 NMI input to cause a panic, aiding in debugging. +4.64 KVM_S390_UCAS_MAP + +Capability: KVM_CAP_S390_UCONTROL +Architectures: s390 +Type: vcpu ioctl +Parameters: struct kvm_s390_ucas_mapping (in) +Returns: 0 in case of success + +The parameter is defined like this: + struct kvm_s390_ucas_mapping { + __u64 user_addr; + __u64 vcpu_addr; + __u64 length; + }; + +This ioctl maps the memory at "user_addr" with the length "length" to +the vcpu's address space starting at "vcpu_addr". All parameters need to +be alligned by 1 megabyte. + +4.65 KVM_S390_UCAS_UNMAP + +Capability: KVM_CAP_S390_UCONTROL +Architectures: s390 +Type: vcpu ioctl +Parameters: struct kvm_s390_ucas_mapping (in) +Returns: 0 in case of success + +The parameter is defined like this: + struct kvm_s390_ucas_mapping { + __u64 user_addr; + __u64 vcpu_addr; + __u64 length; + }; + +This ioctl unmaps the memory in the vcpu's address space starting at +"vcpu_addr" with the length "length". The field "user_addr" is ignored. +All parameters need to be alligned by 1 megabyte. + 5. The kvm_run structure Application code obtains a pointer to the kvm_run structure by diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index f0937552175b..2d3248895def 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -233,6 +233,10 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) (__u64) vcpu->arch.sie_block) vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda = 0; smp_mb(); + + if (kvm_is_ucontrol(vcpu->kvm)) + gmap_free(vcpu->arch.gmap); + free_page((unsigned long)(vcpu->arch.sie_block)); kvm_vcpu_uninit(vcpu); kfree(vcpu); @@ -263,12 +267,20 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_free_vcpus(kvm); free_page((unsigned long)(kvm->arch.sca)); debug_unregister(kvm->arch.dbf); - gmap_free(kvm->arch.gmap); + if (!kvm_is_ucontrol(kvm)) + gmap_free(kvm->arch.gmap); } /* Section: vcpu related */ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { + if (kvm_is_ucontrol(vcpu->kvm)) { + vcpu->arch.gmap = gmap_alloc(current->mm); + if (!vcpu->arch.gmap) + return -ENOMEM; + return 0; + } + vcpu->arch.gmap = vcpu->kvm->arch.gmap; return 0; } @@ -687,6 +699,42 @@ long kvm_arch_vcpu_ioctl(struct file *filp, case KVM_S390_INITIAL_RESET: r = kvm_arch_vcpu_ioctl_initial_reset(vcpu); break; +#ifdef CONFIG_KVM_S390_UCONTROL + case KVM_S390_UCAS_MAP: { + struct kvm_s390_ucas_mapping ucasmap; + + if (copy_from_user(&ucasmap, argp, sizeof(ucasmap))) { + r = -EFAULT; + break; + } + + if (!kvm_is_ucontrol(vcpu->kvm)) { + r = -EINVAL; + break; + } + + r = gmap_map_segment(vcpu->arch.gmap, ucasmap.user_addr, + ucasmap.vcpu_addr, ucasmap.length); + break; + } + case KVM_S390_UCAS_UNMAP: { + struct kvm_s390_ucas_mapping ucasmap; + + if (copy_from_user(&ucasmap, argp, sizeof(ucasmap))) { + r = -EFAULT; + break; + } + + if (!kvm_is_ucontrol(vcpu->kvm)) { + r = -EINVAL; + break; + } + + r = gmap_unmap_segment(vcpu->arch.gmap, ucasmap.vcpu_addr, + ucasmap.length); + break; + } +#endif default: r = -EINVAL; } diff --git a/include/linux/kvm.h b/include/linux/kvm.h index bba393a6760f..0a66c1072691 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -658,6 +658,16 @@ struct kvm_clock_data { struct kvm_userspace_memory_region) #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64) + +/* enable ucontrol for s390 */ +struct kvm_s390_ucas_mapping { + __u64 user_addr; + __u64 vcpu_addr; + __u64 length; +}; +#define KVM_S390_UCAS_MAP _IOW(KVMIO, 0x50, struct kvm_s390_ucas_mapping) +#define KVM_S390_UCAS_UNMAP _IOW(KVMIO, 0x51, struct kvm_s390_ucas_mapping) + /* Device model IOC */ #define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) #define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level) -- cgit v1.2.3 From e168bf8de33e16a909df2401af1f7d419c5780de Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Wed, 4 Jan 2012 10:25:22 +0100 Subject: KVM: s390: ucontrol: export page faults to user This patch introduces a new exit reason in the kvm_run structure named KVM_EXIT_S390_UCONTROL. This exit indicates, that a virtual cpu has regognized a fault on the host page table. The idea is that userspace can handle this fault by mapping memory at the fault location into the cpu's address space and then continue to run the virtual cpu. Signed-off-by: Carsten Otte Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- Documentation/virtual/kvm/api.txt | 14 ++++++++++++++ arch/s390/kvm/kvm-s390.c | 32 +++++++++++++++++++++++++++----- arch/s390/kvm/kvm-s390.h | 1 + include/linux/kvm.h | 6 ++++++ 4 files changed, 48 insertions(+), 5 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index ee394b263261..6e53ff51422f 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1694,6 +1694,20 @@ s390 specific. s390 specific. + /* KVM_EXIT_S390_UCONTROL */ + struct { + __u64 trans_exc_code; + __u32 pgm_code; + } s390_ucontrol; + +s390 specific. A page fault has occurred for a user controlled virtual +machine (KVM_VM_S390_UNCONTROL) on it's host page table that cannot be +resolved by the kernel. +The program code and the translation exception code that were placed +in the cpu's lowcore are presented here as defined by the z Architecture +Principles of Operation Book in the Chapter for Dynamic Address Translation +(DAT) + /* KVM_EXIT_DCR */ struct { __u32 dcrn; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 2d3248895def..af05328aca25 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -493,8 +493,10 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, return -EINVAL; /* not implemented yet */ } -static void __vcpu_run(struct kvm_vcpu *vcpu) +static int __vcpu_run(struct kvm_vcpu *vcpu) { + int rc; + memcpy(&vcpu->arch.sie_block->gg14, &vcpu->arch.guest_gprs[14], 16); if (need_resched()) @@ -511,9 +513,15 @@ static void __vcpu_run(struct kvm_vcpu *vcpu) local_irq_enable(); VCPU_EVENT(vcpu, 6, "entering sie flags %x", atomic_read(&vcpu->arch.sie_block->cpuflags)); - if (sie64a(vcpu->arch.sie_block, vcpu->arch.guest_gprs)) { - VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction"); - kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + rc = sie64a(vcpu->arch.sie_block, vcpu->arch.guest_gprs); + if (rc) { + if (kvm_is_ucontrol(vcpu->kvm)) { + rc = SIE_INTERCEPT_UCONTROL; + } else { + VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction"); + kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + rc = 0; + } } VCPU_EVENT(vcpu, 6, "exit sie icptcode %d", vcpu->arch.sie_block->icptcode); @@ -522,6 +530,7 @@ static void __vcpu_run(struct kvm_vcpu *vcpu) local_irq_enable(); memcpy(&vcpu->arch.guest_gprs[14], &vcpu->arch.sie_block->gg14, 16); + return rc; } int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) @@ -542,6 +551,7 @@ rerun_vcpu: case KVM_EXIT_UNKNOWN: case KVM_EXIT_INTR: case KVM_EXIT_S390_RESET: + case KVM_EXIT_S390_UCONTROL: break; default: BUG(); @@ -553,7 +563,9 @@ rerun_vcpu: might_fault(); do { - __vcpu_run(vcpu); + rc = __vcpu_run(vcpu); + if (rc) + break; rc = kvm_handle_sie_intercept(vcpu); } while (!signal_pending(current) && !rc); @@ -565,6 +577,16 @@ rerun_vcpu: rc = -EINTR; } +#ifdef CONFIG_KVM_S390_UCONTROL + if (rc == SIE_INTERCEPT_UCONTROL) { + kvm_run->exit_reason = KVM_EXIT_S390_UCONTROL; + kvm_run->s390_ucontrol.trans_exc_code = + current->thread.gmap_addr; + kvm_run->s390_ucontrol.pgm_code = 0x10; + rc = 0; + } +#endif + if (rc == -EOPNOTSUPP) { /* intercept cannot be handled in-kernel, prepare kvm-run */ kvm_run->exit_reason = KVM_EXIT_S390_SIEIC; diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 45b236a7c730..62aa5f19bb98 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -26,6 +26,7 @@ typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu); /* negativ values are error codes, positive values for internal conditions */ #define SIE_INTERCEPT_RERUNVCPU (1<<0) +#define SIE_INTERCEPT_UCONTROL (1<<1) int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu); #define VM_EVENT(d_kvm, d_loglevel, d_string, d_args...)\ diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 0a66c1072691..7f686f6708b0 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -162,6 +162,7 @@ struct kvm_pit_config { #define KVM_EXIT_INTERNAL_ERROR 17 #define KVM_EXIT_OSI 18 #define KVM_EXIT_PAPR_HCALL 19 +#define KVM_EXIT_S390_UCONTROL 20 /* For KVM_EXIT_INTERNAL_ERROR */ #define KVM_INTERNAL_ERROR_EMULATION 1 @@ -249,6 +250,11 @@ struct kvm_run { #define KVM_S390_RESET_CPU_INIT 8 #define KVM_S390_RESET_IPL 16 __u64 s390_reset_flags; + /* KVM_EXIT_S390_UCONTROL */ + struct { + __u64 trans_exc_code; + __u32 pgm_code; + } s390_ucontrol; /* KVM_EXIT_DCR */ struct { __u32 dcrn; -- cgit v1.2.3 From 5b1c1493afe8d69909f9df3221bb2fffdf479f4a Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Wed, 4 Jan 2012 10:25:23 +0100 Subject: KVM: s390: ucontrol: export SIE control block to user This patch exports the s390 SIE hardware control block to userspace via the mapping of the vcpu file descriptor. In order to do so, a new arch callback named kvm_arch_vcpu_fault is introduced for all architectures. It allows to map architecture specific pages. Signed-off-by: Carsten Otte Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- Documentation/virtual/kvm/api.txt | 5 +++++ arch/ia64/kvm/kvm-ia64.c | 5 +++++ arch/powerpc/kvm/powerpc.c | 5 +++++ arch/s390/kvm/kvm-s390.c | 13 +++++++++++++ arch/x86/kvm/x86.c | 5 +++++ include/linux/kvm.h | 2 ++ include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 2 +- 8 files changed, 37 insertions(+), 1 deletion(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 6e53ff51422f..5ebf47d99e56 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -218,6 +218,11 @@ allocation of vcpu ids. For example, if userspace wants single-threaded guest vcpus, it should make all vcpu ids be a multiple of the number of vcpus per vcore. +For virtual cpus that have been created with S390 user controlled virtual +machines, the resulting vcpu fd can be memory mapped at page offset +KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of the virtual +cpu's hardware control block. + 4.8 KVM_GET_DIRTY_LOG (vm ioctl) Capability: basic diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index df6b14194051..8ca7261e7b3d 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1566,6 +1566,11 @@ out: return r; } +int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 83f244569874..a5671616af86 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -659,6 +659,11 @@ out: return r; } +int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo) { u32 inst_lis = 0x3c000000; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index af05328aca25..d6bc65aeb950 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -763,6 +763,19 @@ long kvm_arch_vcpu_ioctl(struct file *filp, return r; } +int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) +{ +#ifdef CONFIG_KVM_S390_UCONTROL + if ((vmf->pgoff == KVM_S390_SIE_PAGE_OFFSET) + && (kvm_is_ucontrol(vcpu->kvm))) { + vmf->page = virt_to_page(vcpu->arch.sie_block); + get_page(vmf->page); + return 0; + } +#endif + return VM_FAULT_SIGBUS; +} + /* Section: memory related */ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 06925b4bcc27..a3ce196d21fe 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2814,6 +2814,11 @@ out: return r; } +int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) { int ret; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 7f686f6708b0..8f888df206a2 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -440,6 +440,8 @@ struct kvm_ppc_pvinfo { /* machine type bits, to be used as argument to KVM_CREATE_VM */ #define KVM_VM_S390_UCONTROL 1 +#define KVM_S390_SIE_PAGE_OFFSET 1 + /* * ioctls for /dev/kvm fds: */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 82375e145e64..d4d4d7092110 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -450,6 +450,7 @@ long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); +int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf); int kvm_dev_ioctl_check_extension(long ext); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 32e3b048a6cf..64be836f3348 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1657,7 +1657,7 @@ static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); #endif else - return VM_FAULT_SIGBUS; + return kvm_arch_vcpu_fault(vcpu, vmf); get_page(page); vmf->page = page; return 0; -- cgit v1.2.3 From c0d744a9c8aae81b624e0650e6fbbbb83db1a145 Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Wed, 4 Jan 2012 10:25:24 +0100 Subject: KVM: s390: ucontrol: disable in-kernel handling of SIE intercepts This patch disables in-kernel handling of SIE intercepts for user controlled virtual machines. All intercepts are passed to userspace via KVM_EXIT_SIE exit reason just like SIE intercepts that cannot be handled in-kernel for regular KVM guests. Signed-off-by: Carsten Otte Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/s390/kvm/kvm-s390.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d6bc65aeb950..02510946864e 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -566,7 +566,10 @@ rerun_vcpu: rc = __vcpu_run(vcpu); if (rc) break; - rc = kvm_handle_sie_intercept(vcpu); + if (kvm_is_ucontrol(vcpu->kvm)) + rc = -EOPNOTSUPP; + else + rc = kvm_handle_sie_intercept(vcpu); } while (!signal_pending(current) && !rc); if (rc == SIE_INTERCEPT_RERUNVCPU) -- cgit v1.2.3 From d6b6d166864fa97ca3b1ed1a5c62fd3b53d4606f Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Wed, 4 Jan 2012 10:25:25 +0100 Subject: KVM: s390: ucontrol: disable in-kernel irq stack This patch disables the in-kernel interrupt stack for KVM virtual machines that are controlled by user. Userspace has to take care of handling interrupts on its own. Signed-off-by: Carsten Otte Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/s390/kvm/kvm-s390.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 02510946864e..5b5c28e471df 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -505,7 +505,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) if (test_thread_flag(TIF_MCCK_PENDING)) s390_handle_mcck(); - kvm_s390_deliver_pending_interrupts(vcpu); + if (!kvm_is_ucontrol(vcpu->kvm)) + kvm_s390_deliver_pending_interrupts(vcpu); vcpu->arch.sie_block->icptcode = 0; local_irq_disable(); -- cgit v1.2.3 From ccc7910fe564d99415def7c041fa261e62a43011 Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Wed, 4 Jan 2012 10:25:26 +0100 Subject: KVM: s390: ucontrol: interface to inject faults on a vcpu page table This patch allows the user to fault in pages on a virtual cpus address space for user controlled virtual machines. Typically this is superfluous because userspace can just create a mapping and let the kernel's page fault logic take are of it. There is one exception: SIE won't start if the lowcore is not present. Normally the kernel takes care of this [handle_validity() in arch/s390/kvm/intercept.c] but since the kernel does not handle intercepts for user controlled virtual machines, userspace needs to be able to handle this condition. Signed-off-by: Carsten Otte Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- Documentation/virtual/kvm/api.txt | 16 ++++++++++++++++ arch/s390/kvm/kvm-s390.c | 6 ++++++ include/linux/kvm.h | 1 + 3 files changed, 23 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 5ebf47d99e56..a67fb35993fa 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1539,6 +1539,22 @@ This ioctl unmaps the memory in the vcpu's address space starting at "vcpu_addr" with the length "length". The field "user_addr" is ignored. All parameters need to be alligned by 1 megabyte. +4.66 KVM_S390_VCPU_FAULT + +Capability: KVM_CAP_S390_UCONTROL +Architectures: s390 +Type: vcpu ioctl +Parameters: vcpu absolute address (in) +Returns: 0 in case of success + +This call creates a page table entry on the virtual cpu's address space +(for user controlled virtual machines) or the virtual machine's address +space (for regular virtual machines). This only works for minor faults, +thus it's recommended to access subject memory page via the user page +table upfront. This is useful to handle validity intercepts for user +controlled virtual machines to fault in the virtual cpu's lowcore pages +prior to calling the KVM_RUN ioctl. + 5. The kvm_run structure Application code obtains a pointer to the kvm_run structure by diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 5b5c28e471df..8489edf80c89 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -761,6 +761,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } #endif + case KVM_S390_VCPU_FAULT: { + r = gmap_fault(arg, vcpu->arch.gmap); + if (!IS_ERR_VALUE(r)) + r = 0; + break; + } default: r = -EINVAL; } diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 8f888df206a2..778e748927b4 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -675,6 +675,7 @@ struct kvm_s390_ucas_mapping { }; #define KVM_S390_UCAS_MAP _IOW(KVMIO, 0x50, struct kvm_s390_ucas_mapping) #define KVM_S390_UCAS_UNMAP _IOW(KVMIO, 0x51, struct kvm_s390_ucas_mapping) +#define KVM_S390_VCPU_FAULT _IOW(KVMIO, 0x52, unsigned long) /* Device model IOC */ #define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) -- cgit v1.2.3 From 58f9460ba1cc9de67e6591bfd08dccded43d27bd Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Wed, 4 Jan 2012 10:25:27 +0100 Subject: KVM: s390: ucontrol: disable sca This patch makes sure user controlled virtual machines do not use a system control area (sca). This is needed in order to create virtual machines with more cpus than the size of the sca [64]. Signed-off-by: Carsten Otte Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/s390/kvm/kvm-s390.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 8489edf80c89..abf784d8c680 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -228,10 +228,13 @@ out_err: void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { VCPU_EVENT(vcpu, 3, "%s", "free cpu"); - clear_bit(63 - vcpu->vcpu_id, (unsigned long *) &vcpu->kvm->arch.sca->mcn); - if (vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda == - (__u64) vcpu->arch.sie_block) - vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda = 0; + if (!kvm_is_ucontrol(vcpu->kvm)) { + clear_bit(63 - vcpu->vcpu_id, + (unsigned long *) &vcpu->kvm->arch.sca->mcn); + if (vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda == + (__u64) vcpu->arch.sie_block) + vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda = 0; + } smp_mb(); if (kvm_is_ucontrol(vcpu->kvm)) @@ -368,12 +371,19 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, goto out_free_cpu; vcpu->arch.sie_block->icpua = id; - BUG_ON(!kvm->arch.sca); - if (!kvm->arch.sca->cpu[id].sda) - kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block; - vcpu->arch.sie_block->scaoh = (__u32)(((__u64)kvm->arch.sca) >> 32); - vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca; - set_bit(63 - id, (unsigned long *) &kvm->arch.sca->mcn); + if (!kvm_is_ucontrol(kvm)) { + if (!kvm->arch.sca) { + WARN_ON_ONCE(1); + goto out_free_cpu; + } + if (!kvm->arch.sca->cpu[id].sda) + kvm->arch.sca->cpu[id].sda = + (__u64) vcpu->arch.sie_block; + vcpu->arch.sie_block->scaoh = + (__u32)(((__u64)kvm->arch.sca) >> 32); + vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca; + set_bit(63 - id, (unsigned long *) &kvm->arch.sca->mcn); + } spin_lock_init(&vcpu->arch.local_int.lock); INIT_LIST_HEAD(&vcpu->arch.local_int.list); -- cgit v1.2.3 From 3777594d5a75b704312544a59094beecd820e12b Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Wed, 4 Jan 2012 10:25:28 +0100 Subject: KVM: s390: fix assumption for KVM_MAX_VCPUS This patch fixes definition of the idle_mask and the local_int array in kvm_s390_float_interrupt. Previous definition had 64 cpus max hardcoded instead of using KVM_MAX_VCPUS. Signed-off-by: Carsten Otte Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/s390/include/asm/kvm_host.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index b0c235cb6ad5..e34fb2ba76c1 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -220,8 +220,9 @@ struct kvm_s390_float_interrupt { struct list_head list; atomic_t active; int next_rr_cpu; - unsigned long idle_mask [(64 + sizeof(long) - 1) / sizeof(long)]; - struct kvm_s390_local_interrupt *local_int[64]; + unsigned long idle_mask[(KVM_MAX_VCPUS + sizeof(long) - 1) + / sizeof(long)]; + struct kvm_s390_local_interrupt *local_int[KVM_MAX_VCPUS]; }; -- cgit v1.2.3 From 1efd0f595ab9d10fef1486dfdef952107c91f3db Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Wed, 4 Jan 2012 10:25:29 +0100 Subject: KVM: s390: ucontrol: announce capability for user controlled vms This patch announces a new capability KVM_CAP_S390_UCONTROL that indicates that kvm can now support virtual machines that are controlled by userspace. Signed-off-by: Carsten Otte Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/s390/kvm/kvm-s390.c | 3 +++ include/linux/kvm.h | 1 + 2 files changed, 4 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index abf784d8c680..a1061b361c43 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -129,6 +129,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_S390_PSW: case KVM_CAP_S390_GMAP: case KVM_CAP_SYNC_MMU: +#ifdef CONFIG_KVM_S390_UCONTROL + case KVM_CAP_S390_UCONTROL: +#endif r = 1; break; default: diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 778e748927b4..6cf048d9604b 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -569,6 +569,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_PPC_PAPR 68 #define KVM_CAP_S390_GMAP 71 #define KVM_CAP_TSC_DEADLINE_TIMER 72 +#define KVM_CAP_S390_UCONTROL 73 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 3e6afcf1d8ce6bca6bfa62a49c1d70d2f16162eb Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Wed, 4 Jan 2012 10:25:30 +0100 Subject: KVM: s390: Fix return code for unknown ioctl numbers This patch fixes the return code of kvm_arch_vcpu_ioctl in case of an unkown ioctl number. Signed-off-by: Carsten Otte Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/s390/kvm/kvm-s390.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index a1061b361c43..a33b44487540 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -781,7 +781,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } default: - r = -EINVAL; + r = -ENOTTY; } return r; } -- cgit v1.2.3 From 4a58ae614a28b1ae3bea1c74a307cdfb7c77dab8 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 6 Jan 2012 15:06:18 +0100 Subject: KVM: MMU: unnecessary NX state assignment We can remove the first ->nx state assignment since it is assigned afterwards anyways. Signed-off-by: Davidlohr Bueso Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 34da43086952..0a11468d853f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3321,7 +3321,6 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->get_cr3 = get_cr3; context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; - context->nx = is_nx(vcpu); if (!is_paging(vcpu)) { context->nx = false; -- cgit v1.2.3 From 2b036c6b861dc5da295c6fe19a3edcff7093fdeb Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 9 Jan 2012 14:00:35 -0500 Subject: KVM: SVM: Add support for AMD's OSVW feature in guests In some cases guests should not provide workarounds for errata even when the physical processor is affected. For example, because of erratum 400 on family 10h processors a Linux guest will read an MSR (resulting in VMEXIT) before going to idle in order to avoid getting stuck in a non-C0 state. This is not necessary: HLT and IO instructions are intercepted and therefore there is no reason for erratum 400 workaround in the guest. This patch allows us to present a guest with certain errata as fixed, regardless of the state of actual hardware. Signed-off-by: Boris Ostrovsky Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 6 +++++ arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/cpuid.h | 8 ++++++ arch/x86/kvm/svm.c | 59 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 20 ++++++++++++++ 5 files changed, 94 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 52d6640a5ca1..bd69c93da8fa 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -478,6 +478,12 @@ struct kvm_vcpu_arch { u32 id; bool send_user_only; } apf; + + /* OSVW MSRs (AMD only) */ + struct { + u64 length; + u64 status; + } osvw; }; struct kvm_arch { diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 89b02bfaaca5..9fed5bedaad6 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -236,7 +236,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, const u32 kvm_supported_word6_x86_features = F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | - F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | + F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); /* cpuid 0xC0000001.edx */ diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 5b97e1797a6d..26d1fb437eb5 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -43,4 +43,12 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); } +static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + return best && (best->ecx & bit(X86_FEATURE_OSVW)); +} + #endif diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5fa553babe56..fce3ba0f2079 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -110,6 +110,12 @@ struct nested_state { #define MSRPM_OFFSETS 16 static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; +/* + * Set osvw_len to higher value when updated Revision Guides + * are published and we know what the new status bits are + */ +static uint64_t osvw_len = 4, osvw_status; + struct vcpu_svm { struct kvm_vcpu vcpu; struct vmcb *vmcb; @@ -556,6 +562,27 @@ static void svm_init_erratum_383(void) erratum_383_found = true; } +static void svm_init_osvw(struct kvm_vcpu *vcpu) +{ + /* + * Guests should see errata 400 and 415 as fixed (assuming that + * HLT and IO instructions are intercepted). + */ + vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3; + vcpu->arch.osvw.status = osvw_status & ~(6ULL); + + /* + * By increasing VCPU's osvw.length to 3 we are telling the guest that + * all osvw.status bits inside that length, including bit 0 (which is + * reserved for erratum 298), are valid. However, if host processor's + * osvw_len is 0 then osvw_status[0] carries no information. We need to + * be conservative here and therefore we tell the guest that erratum 298 + * is present (because we really don't know). + */ + if (osvw_len == 0 && boot_cpu_data.x86 == 0x10) + vcpu->arch.osvw.status |= 1; +} + static int has_svm(void) { const char *msg; @@ -620,6 +647,36 @@ static int svm_hardware_enable(void *garbage) __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT; } + + /* + * Get OSVW bits. + * + * Note that it is possible to have a system with mixed processor + * revisions and therefore different OSVW bits. If bits are not the same + * on different processors then choose the worst case (i.e. if erratum + * is present on one processor and not on another then assume that the + * erratum is present everywhere). + */ + if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) { + uint64_t len, status = 0; + int err; + + len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err); + if (!err) + status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, + &err); + + if (err) + osvw_status = osvw_len = 0; + else { + if (len < osvw_len) + osvw_len = len; + osvw_status |= status; + osvw_status &= (1ULL << osvw_len) - 1; + } + } else + osvw_status = osvw_len = 0; + svm_init_erratum_383(); return 0; @@ -1186,6 +1243,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) if (kvm_vcpu_is_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; + svm_init_osvw(&svm->vcpu); + return &svm->vcpu; free_page4: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a3ce196d21fe..2bd77a3a41ed 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1675,6 +1675,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) */ pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); break; + case MSR_AMD64_OSVW_ID_LENGTH: + if (!guest_cpuid_has_osvw(vcpu)) + return 1; + vcpu->arch.osvw.length = data; + break; + case MSR_AMD64_OSVW_STATUS: + if (!guest_cpuid_has_osvw(vcpu)) + return 1; + vcpu->arch.osvw.status = data; + break; default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); @@ -1959,6 +1969,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) */ data = 0xbe702111; break; + case MSR_AMD64_OSVW_ID_LENGTH: + if (!guest_cpuid_has_osvw(vcpu)) + return 1; + data = vcpu->arch.osvw.length; + break; + case MSR_AMD64_OSVW_STATUS: + if (!guest_cpuid_has_osvw(vcpu)) + return 1; + data = vcpu->arch.osvw.status; + break; default: if (kvm_pmu_msr(vcpu, msr)) return kvm_pmu_get_msr(vcpu, msr, pdata); -- cgit v1.2.3 From 8d26cf7b40b1648c39e77a113dac07ad31363120 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 11 Jan 2012 11:19:32 +0100 Subject: KVM: s390: rework code that sets the prefix There are several places in the kvm module, which set the prefix register. Since we need to flush the cpu, lets combine this operation into a helper function. This helper will also explicitely mask out the unused bits. Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/s390/kvm/interrupt.c | 3 +-- arch/s390/kvm/kvm-s390.c | 3 +-- arch/s390/kvm/kvm-s390.h | 7 +++++++ arch/s390/kvm/priv.c | 3 +-- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 278ee009ce65..c6366cfb3bf0 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -236,8 +236,7 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, VCPU_EVENT(vcpu, 4, "interrupt: set prefix to %x", inti->prefix.address); vcpu->stat.deliver_prefix_signal++; - vcpu->arch.sie_block->prefix = inti->prefix.address; - vcpu->arch.sie_block->ihcpu = 0xffff; + kvm_s390_set_prefix(vcpu, inti->prefix.address); break; case KVM_S390_RESTART: diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index a33b44487540..1868b89a840f 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -322,8 +322,7 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu) /* this equals initial cpu reset in pop, but we don't switch to ESA */ vcpu->arch.sie_block->gpsw.mask = 0UL; vcpu->arch.sie_block->gpsw.addr = 0UL; - vcpu->arch.sie_block->prefix = 0UL; - vcpu->arch.sie_block->ihcpu = 0xffff; + kvm_s390_set_prefix(vcpu, 0); vcpu->arch.sie_block->cputm = 0UL; vcpu->arch.sie_block->ckc = 0UL; vcpu->arch.sie_block->todpr = 0; diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 62aa5f19bb98..ff28f9d1c9eb 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -58,6 +58,13 @@ static inline int kvm_is_ucontrol(struct kvm *kvm) return 0; #endif } + +static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix) +{ + vcpu->arch.sie_block->prefix = prefix & 0x7fffe000u; + vcpu->arch.sie_block->ihcpu = 0xffff; +} + int kvm_s390_handle_wait(struct kvm_vcpu *vcpu); enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer); void kvm_s390_tasklet(unsigned long parm); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index d02638959922..9c83b8a53843 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -56,8 +56,7 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu) goto out; } - vcpu->arch.sie_block->prefix = address; - vcpu->arch.sie_block->ihcpu = 0xffff; + kvm_s390_set_prefix(vcpu, address); VCPU_EVENT(vcpu, 5, "setting prefix to %x", address); out: -- cgit v1.2.3 From b9e5dc8d4511e6a00862a795319569e7fe7f60f4 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 11 Jan 2012 11:20:30 +0100 Subject: KVM: provide synchronous registers in kvm_run On some cpus the overhead for virtualization instructions is in the same range as a system call. Having to call multiple ioctls to get set registers will make certain userspace handled exits more expensive than necessary. Lets provide a section in kvm_run that works as a shared save area for guest registers. We also provide two 64bit flags fields (architecture specific), that will specify 1. which parts of these fields are valid. 2. which registers were modified by userspace Each bit for these flag fields will define a group of registers (like general purpose) or a single register. Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- Documentation/virtual/kvm/api.txt | 23 +++++++++++++++++++++++ arch/ia64/include/asm/kvm.h | 4 ++++ arch/powerpc/include/asm/kvm.h | 4 ++++ arch/s390/include/asm/kvm.h | 3 +++ arch/x86/include/asm/kvm.h | 4 ++++ include/linux/kvm.h | 15 +++++++++++++++ 6 files changed, 53 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index a67fb35993fa..7ca696227d3a 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1771,6 +1771,29 @@ developer registration required to access it). /* Fix the size of the union. */ char padding[256]; }; + + /* + * shared registers between kvm and userspace. + * kvm_valid_regs specifies the register classes set by the host + * kvm_dirty_regs specified the register classes dirtied by userspace + * struct kvm_sync_regs is architecture specific, as well as the + * bits for kvm_valid_regs and kvm_dirty_regs + */ + __u64 kvm_valid_regs; + __u64 kvm_dirty_regs; + union { + struct kvm_sync_regs regs; + char padding[1024]; + } s; + +If KVM_CAP_SYNC_REGS is defined, these fields allow userspace to access +certain guest registers without having to call SET/GET_*REGS. Thus we can +avoid some system call overhead if userspace has to handle the exit. +Userspace can query the validity of the structure by checking +kvm_valid_regs for specific bits. These bits are architecture specific +and usually define the validity of a groups of registers. (e.g. one bit + for general purpose registers) + }; 6. Capabilities that can be enabled diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h index bc90c75adf67..b9f82c84f093 100644 --- a/arch/ia64/include/asm/kvm.h +++ b/arch/ia64/include/asm/kvm.h @@ -261,4 +261,8 @@ struct kvm_debug_exit_arch { struct kvm_guest_debug_arch { }; +/* definition of registers in kvm_run */ +struct kvm_sync_regs { +}; + #endif diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index f7727d91ac6b..7d9d4de057ef 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -265,6 +265,10 @@ struct kvm_debug_exit_arch { struct kvm_guest_debug_arch { }; +/* definition of registers in kvm_run */ +struct kvm_sync_regs { +}; + #define KVM_REG_MASK 0x001f #define KVM_REG_EXT_MASK 0xffe0 #define KVM_REG_GPR 0x0000 diff --git a/arch/s390/include/asm/kvm.h b/arch/s390/include/asm/kvm.h index 82b32a100c7d..325560afb77e 100644 --- a/arch/s390/include/asm/kvm.h +++ b/arch/s390/include/asm/kvm.h @@ -41,4 +41,7 @@ struct kvm_debug_exit_arch { struct kvm_guest_debug_arch { }; +/* definition of registers in kvm_run */ +struct kvm_sync_regs { +}; #endif diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 4d8dcbdfc120..e7d1c194d272 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -321,4 +321,8 @@ struct kvm_xcrs { __u64 padding[16]; }; +/* definition of registers in kvm_run */ +struct kvm_sync_regs { +}; + #endif /* _ASM_X86_KVM_H */ diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 6cf048d9604b..245bcb3a0fcd 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -279,6 +279,20 @@ struct kvm_run { /* Fix the size of the union. */ char padding[256]; }; + + /* + * shared registers between kvm and userspace. + * kvm_valid_regs specifies the register classes set by the host + * kvm_dirty_regs specified the register classes dirtied by userspace + * struct kvm_sync_regs is architecture specific, as well as the + * bits for kvm_valid_regs and kvm_dirty_regs + */ + __u64 kvm_valid_regs; + __u64 kvm_dirty_regs; + union { + struct kvm_sync_regs regs; + char padding[1024]; + } s; }; /* for KVM_REGISTER_COALESCED_MMIO / KVM_UNREGISTER_COALESCED_MMIO */ @@ -570,6 +584,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_S390_GMAP 71 #define KVM_CAP_TSC_DEADLINE_TIMER 72 #define KVM_CAP_S390_UCONTROL 73 +#define KVM_CAP_SYNC_REGS 74 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 60b413c9248495ea400e80e08e4d1e28ed7ee05e Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 11 Jan 2012 11:20:31 +0100 Subject: KVM: s390: provide the prefix register via kvm_run Add the prefix register to the synced register field in kvm_run. While we need the prefix register most of the time read-only, this patch also adds handling for guest dirtying of the prefix register. Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/s390/include/asm/kvm.h | 2 ++ arch/s390/kvm/kvm-s390.c | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/arch/s390/include/asm/kvm.h b/arch/s390/include/asm/kvm.h index 325560afb77e..9fc328c26159 100644 --- a/arch/s390/include/asm/kvm.h +++ b/arch/s390/include/asm/kvm.h @@ -41,7 +41,9 @@ struct kvm_debug_exit_arch { struct kvm_guest_debug_arch { }; +#define KVM_SYNC_PREFIX (1UL << 0) /* definition of registers in kvm_run */ struct kvm_sync_regs { + __u64 prefix; /* prefix register */ }; #endif diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 1868b89a840f..6962c1b9eec6 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -132,6 +132,7 @@ int kvm_dev_ioctl_check_extension(long ext) #ifdef CONFIG_KVM_S390_UCONTROL case KVM_CAP_S390_UCONTROL: #endif + case KVM_CAP_SYNC_REGS: r = 1; break; default: @@ -288,6 +289,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } vcpu->arch.gmap = vcpu->kvm->arch.gmap; + vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX; return 0; } @@ -572,6 +574,10 @@ rerun_vcpu: vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask; vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr; + if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX) { + kvm_run->kvm_dirty_regs &= ~KVM_SYNC_PREFIX; + kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix); + } might_fault(); @@ -620,6 +626,7 @@ rerun_vcpu: kvm_run->psw_mask = vcpu->arch.sie_block->gpsw.mask; kvm_run->psw_addr = vcpu->arch.sie_block->gpsw.addr; + kvm_run->s.regs.prefix = vcpu->arch.sie_block->prefix; if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); -- cgit v1.2.3 From 5a32c1af56b3c74212b1de2a1d1658c303dd3516 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 11 Jan 2012 11:20:32 +0100 Subject: KVM: s390: provide general purpose guest registers via kvm_run This patch adds the general purpose registers to the kvm_run structure. Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/s390/include/asm/kvm.h | 2 ++ arch/s390/include/asm/kvm_host.h | 3 +-- arch/s390/kvm/diag.c | 6 +++--- arch/s390/kvm/intercept.c | 4 ++-- arch/s390/kvm/kvm-s390.c | 14 +++++++------- arch/s390/kvm/priv.c | 24 ++++++++++++------------ arch/s390/kvm/sigp.c | 20 ++++++++++---------- 7 files changed, 37 insertions(+), 36 deletions(-) diff --git a/arch/s390/include/asm/kvm.h b/arch/s390/include/asm/kvm.h index 9fc328c26159..420dbb7db8dd 100644 --- a/arch/s390/include/asm/kvm.h +++ b/arch/s390/include/asm/kvm.h @@ -42,8 +42,10 @@ struct kvm_guest_debug_arch { }; #define KVM_SYNC_PREFIX (1UL << 0) +#define KVM_SYNC_GPRS (1UL << 1) /* definition of registers in kvm_run */ struct kvm_sync_regs { __u64 prefix; /* prefix register */ + __u64 gprs[16]; /* general purpose registers */ }; #endif diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index e34fb2ba76c1..ed843cad4194 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -228,7 +228,6 @@ struct kvm_s390_float_interrupt { struct kvm_vcpu_arch { struct kvm_s390_sie_block *sie_block; - unsigned long guest_gprs[16]; s390_fp_regs host_fpregs; unsigned int host_acrs[NUM_ACRS]; s390_fp_regs guest_fpregs; @@ -254,5 +253,5 @@ struct kvm_arch{ struct gmap *gmap; }; -extern int sie64a(struct kvm_s390_sie_block *, unsigned long *); +extern int sie64a(struct kvm_s390_sie_block *, u64 *); #endif diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 8943e82cd4d9..a353f0ea45c2 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -20,8 +20,8 @@ static int diag_release_pages(struct kvm_vcpu *vcpu) unsigned long start, end; unsigned long prefix = vcpu->arch.sie_block->prefix; - start = vcpu->arch.guest_gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4]; - end = vcpu->arch.guest_gprs[vcpu->arch.sie_block->ipa & 0xf] + 4096; + start = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4]; + end = vcpu->run->s.regs.gprs[vcpu->arch.sie_block->ipa & 0xf] + 4096; if (start & ~PAGE_MASK || end & ~PAGE_MASK || start > end || start < 2 * PAGE_SIZE) @@ -56,7 +56,7 @@ static int __diag_time_slice_end(struct kvm_vcpu *vcpu) static int __diag_ipl_functions(struct kvm_vcpu *vcpu) { unsigned int reg = vcpu->arch.sie_block->ipa & 0xf; - unsigned long subcode = vcpu->arch.guest_gprs[reg] & 0xffff; + unsigned long subcode = vcpu->run->s.regs.gprs[reg] & 0xffff; VCPU_EVENT(vcpu, 5, "diag ipl functions, subcode %lx", subcode); switch (subcode) { diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 02434543eabb..776ef83c2771 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -36,7 +36,7 @@ static int handle_lctlg(struct kvm_vcpu *vcpu) useraddr = disp2; if (base2) - useraddr += vcpu->arch.guest_gprs[base2]; + useraddr += vcpu->run->s.regs.gprs[base2]; if (useraddr & 7) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); @@ -75,7 +75,7 @@ static int handle_lctl(struct kvm_vcpu *vcpu) useraddr = disp2; if (base2) - useraddr += vcpu->arch.guest_gprs[base2]; + useraddr += vcpu->run->s.regs.gprs[base2]; if (useraddr & 3) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 6962c1b9eec6..80b12bac6a5b 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -289,7 +289,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } vcpu->arch.gmap = vcpu->kvm->arch.gmap; - vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX; + vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX | KVM_SYNC_GPRS; return 0; } @@ -428,13 +428,13 @@ static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { - memcpy(&vcpu->arch.guest_gprs, ®s->gprs, sizeof(regs->gprs)); + memcpy(&vcpu->run->s.regs.gprs, ®s->gprs, sizeof(regs->gprs)); return 0; } int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { - memcpy(®s->gprs, &vcpu->arch.guest_gprs, sizeof(regs->gprs)); + memcpy(®s->gprs, &vcpu->run->s.regs.gprs, sizeof(regs->gprs)); return 0; } @@ -511,7 +511,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) { int rc; - memcpy(&vcpu->arch.sie_block->gg14, &vcpu->arch.guest_gprs[14], 16); + memcpy(&vcpu->arch.sie_block->gg14, &vcpu->run->s.regs.gprs[14], 16); if (need_resched()) schedule(); @@ -528,7 +528,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) local_irq_enable(); VCPU_EVENT(vcpu, 6, "entering sie flags %x", atomic_read(&vcpu->arch.sie_block->cpuflags)); - rc = sie64a(vcpu->arch.sie_block, vcpu->arch.guest_gprs); + rc = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs); if (rc) { if (kvm_is_ucontrol(vcpu->kvm)) { rc = SIE_INTERCEPT_UCONTROL; @@ -544,7 +544,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) kvm_guest_exit(); local_irq_enable(); - memcpy(&vcpu->arch.guest_gprs[14], &vcpu->arch.sie_block->gg14, 16); + memcpy(&vcpu->run->s.regs.gprs[14], &vcpu->arch.sie_block->gg14, 16); return rc; } @@ -673,7 +673,7 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr) return -EFAULT; if (__guestcopy(vcpu, addr + offsetof(struct save_area, gp_regs), - vcpu->arch.guest_gprs, 128, prefix)) + vcpu->run->s.regs.gprs, 128, prefix)) return -EFAULT; if (__guestcopy(vcpu, addr + offsetof(struct save_area, psw), diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 9c83b8a53843..e5a45dbd26ac 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -33,7 +33,7 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu) operand2 = disp2; if (base2) - operand2 += vcpu->arch.guest_gprs[base2]; + operand2 += vcpu->run->s.regs.gprs[base2]; /* must be word boundary */ if (operand2 & 3) { @@ -73,7 +73,7 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu) vcpu->stat.instruction_stpx++; operand2 = disp2; if (base2) - operand2 += vcpu->arch.guest_gprs[base2]; + operand2 += vcpu->run->s.regs.gprs[base2]; /* must be word boundary */ if (operand2 & 3) { @@ -105,7 +105,7 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu) vcpu->stat.instruction_stap++; useraddr = disp2; if (base2) - useraddr += vcpu->arch.guest_gprs[base2]; + useraddr += vcpu->run->s.regs.gprs[base2]; if (useraddr & 1) { kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); @@ -180,7 +180,7 @@ static int handle_stidp(struct kvm_vcpu *vcpu) vcpu->stat.instruction_stidp++; operand2 = disp2; if (base2) - operand2 += vcpu->arch.guest_gprs[base2]; + operand2 += vcpu->run->s.regs.gprs[base2]; if (operand2 & 7) { kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); @@ -231,9 +231,9 @@ static void handle_stsi_3_2_2(struct kvm_vcpu *vcpu, struct sysinfo_3_2_2 *mem) static int handle_stsi(struct kvm_vcpu *vcpu) { - int fc = (vcpu->arch.guest_gprs[0] & 0xf0000000) >> 28; - int sel1 = vcpu->arch.guest_gprs[0] & 0xff; - int sel2 = vcpu->arch.guest_gprs[1] & 0xffff; + int fc = (vcpu->run->s.regs.gprs[0] & 0xf0000000) >> 28; + int sel1 = vcpu->run->s.regs.gprs[0] & 0xff; + int sel2 = vcpu->run->s.regs.gprs[1] & 0xffff; int base2 = vcpu->arch.sie_block->ipb >> 28; int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16); u64 operand2; @@ -244,14 +244,14 @@ static int handle_stsi(struct kvm_vcpu *vcpu) operand2 = disp2; if (base2) - operand2 += vcpu->arch.guest_gprs[base2]; + operand2 += vcpu->run->s.regs.gprs[base2]; if (operand2 & 0xfff && fc > 0) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); switch (fc) { case 0: - vcpu->arch.guest_gprs[0] = 3 << 28; + vcpu->run->s.regs.gprs[0] = 3 << 28; vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); return 0; case 1: /* same handling for 1 and 2 */ @@ -280,7 +280,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu) } free_page(mem); vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); - vcpu->arch.guest_gprs[0] = 0; + vcpu->run->s.regs.gprs[0] = 0; return 0; out_mem: free_page(mem); @@ -332,8 +332,8 @@ static int handle_tprot(struct kvm_vcpu *vcpu) int disp1 = (vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16; int base2 = (vcpu->arch.sie_block->ipb & 0xf000) >> 12; int disp2 = vcpu->arch.sie_block->ipb & 0x0fff; - u64 address1 = disp1 + base1 ? vcpu->arch.guest_gprs[base1] : 0; - u64 address2 = disp2 + base2 ? vcpu->arch.guest_gprs[base2] : 0; + u64 address1 = disp1 + base1 ? vcpu->run->s.regs.gprs[base1] : 0; + u64 address2 = disp2 + base2 ? vcpu->run->s.regs.gprs[base2] : 0; struct vm_area_struct *vma; unsigned long user_address; diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 0a7941d74bc6..30eb0f73f9d5 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -48,7 +48,7 @@ static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr, - unsigned long *reg) + u64 *reg) { struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; int rc; @@ -220,7 +220,7 @@ static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter) } static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address, - unsigned long *reg) + u64 *reg) { struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; struct kvm_s390_local_interrupt *li = NULL; @@ -278,7 +278,7 @@ out_fi: } static int __sigp_sense_running(struct kvm_vcpu *vcpu, u16 cpu_addr, - unsigned long *reg) + u64 *reg) { int rc; struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; @@ -316,7 +316,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) int base2 = vcpu->arch.sie_block->ipb >> 28; int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16); u32 parameter; - u16 cpu_addr = vcpu->arch.guest_gprs[r3]; + u16 cpu_addr = vcpu->run->s.regs.gprs[r3]; u8 order_code; int rc; @@ -327,18 +327,18 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) order_code = disp2; if (base2) - order_code += vcpu->arch.guest_gprs[base2]; + order_code += vcpu->run->s.regs.gprs[base2]; if (r1 % 2) - parameter = vcpu->arch.guest_gprs[r1]; + parameter = vcpu->run->s.regs.gprs[r1]; else - parameter = vcpu->arch.guest_gprs[r1 + 1]; + parameter = vcpu->run->s.regs.gprs[r1 + 1]; switch (order_code) { case SIGP_SENSE: vcpu->stat.instruction_sigp_sense++; rc = __sigp_sense(vcpu, cpu_addr, - &vcpu->arch.guest_gprs[r1]); + &vcpu->run->s.regs.gprs[r1]); break; case SIGP_EXTERNAL_CALL: vcpu->stat.instruction_sigp_external_call++; @@ -363,12 +363,12 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) case SIGP_SET_PREFIX: vcpu->stat.instruction_sigp_prefix++; rc = __sigp_set_prefix(vcpu, cpu_addr, parameter, - &vcpu->arch.guest_gprs[r1]); + &vcpu->run->s.regs.gprs[r1]); break; case SIGP_SENSE_RUNNING: vcpu->stat.instruction_sigp_sense_running++; rc = __sigp_sense_running(vcpu, cpu_addr, - &vcpu->arch.guest_gprs[r1]); + &vcpu->run->s.regs.gprs[r1]); break; case SIGP_RESTART: vcpu->stat.instruction_sigp_restart++; -- cgit v1.2.3 From 59674c1a6a35d56ae5197cbc9abe7bfec6762ba9 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 11 Jan 2012 11:20:33 +0100 Subject: KVM: s390: provide access guest registers via kvm_run This patch adds the access registers to the kvm_run structure. Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/s390/include/asm/kvm.h | 2 ++ arch/s390/include/asm/kvm_host.h | 1 - arch/s390/kvm/kvm-s390.c | 16 +++++++++------- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/arch/s390/include/asm/kvm.h b/arch/s390/include/asm/kvm.h index 420dbb7db8dd..9acbde4af297 100644 --- a/arch/s390/include/asm/kvm.h +++ b/arch/s390/include/asm/kvm.h @@ -43,9 +43,11 @@ struct kvm_guest_debug_arch { #define KVM_SYNC_PREFIX (1UL << 0) #define KVM_SYNC_GPRS (1UL << 1) +#define KVM_SYNC_ACRS (1UL << 2) /* definition of registers in kvm_run */ struct kvm_sync_regs { __u64 prefix; /* prefix register */ __u64 gprs[16]; /* general purpose registers */ + __u32 acrs[16]; /* access registers */ }; #endif diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index ed843cad4194..e6304268ea28 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -231,7 +231,6 @@ struct kvm_vcpu_arch { s390_fp_regs host_fpregs; unsigned int host_acrs[NUM_ACRS]; s390_fp_regs guest_fpregs; - unsigned int guest_acrs[NUM_ACRS]; struct kvm_s390_local_interrupt local_int; struct hrtimer ckc_timer; struct tasklet_struct tasklet; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 80b12bac6a5b..0b91679369bc 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -289,7 +289,9 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } vcpu->arch.gmap = vcpu->kvm->arch.gmap; - vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX | KVM_SYNC_GPRS; + vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX | + KVM_SYNC_GPRS | + KVM_SYNC_ACRS; return 0; } @@ -304,7 +306,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) save_access_regs(vcpu->arch.host_acrs); vcpu->arch.guest_fpregs.fpc &= FPC_VALID_MASK; restore_fp_regs(&vcpu->arch.guest_fpregs); - restore_access_regs(vcpu->arch.guest_acrs); + restore_access_regs(vcpu->run->s.regs.acrs); gmap_enable(vcpu->arch.gmap); atomic_set_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags); } @@ -314,7 +316,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) atomic_clear_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags); gmap_disable(vcpu->arch.gmap); save_fp_regs(&vcpu->arch.guest_fpregs); - save_access_regs(vcpu->arch.guest_acrs); + save_access_regs(vcpu->run->s.regs.acrs); restore_fp_regs(&vcpu->arch.host_fpregs); restore_access_regs(vcpu->arch.host_acrs); } @@ -441,16 +443,16 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { - memcpy(&vcpu->arch.guest_acrs, &sregs->acrs, sizeof(sregs->acrs)); + memcpy(&vcpu->run->s.regs.acrs, &sregs->acrs, sizeof(sregs->acrs)); memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs)); - restore_access_regs(vcpu->arch.guest_acrs); + restore_access_regs(vcpu->run->s.regs.acrs); return 0; } int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { - memcpy(&sregs->acrs, &vcpu->arch.guest_acrs, sizeof(sregs->acrs)); + memcpy(&sregs->acrs, &vcpu->run->s.regs.acrs, sizeof(sregs->acrs)); memcpy(&sregs->crs, &vcpu->arch.sie_block->gcr, sizeof(sregs->crs)); return 0; } @@ -702,7 +704,7 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr) return -EFAULT; if (__guestcopy(vcpu, addr + offsetof(struct save_area, acc_regs), - &vcpu->arch.guest_acrs, 64, prefix)) + &vcpu->run->s.regs.acrs, 64, prefix)) return -EFAULT; if (__guestcopy(vcpu, -- cgit v1.2.3 From 3bf3cdcc148abcd0e9d398393d390ff136d6eb9a Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Thu, 18 Aug 2011 15:25:14 -0500 Subject: KVM: PPC: e500: don't translate gfn to pfn with preemption disabled Delay allocation of the shadow pid until we're ready to disable preemption and write the entry. Signed-off-by: Scott Wood Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kvm/e500_tlb.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index 13c432ea2fa8..22624a7ae821 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -507,21 +507,16 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, vcpu_e500->mas7 = 0; } +/* TID must be supplied by the caller */ static inline void kvmppc_e500_setup_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500, struct tlbe *gtlbe, int tsize, struct tlbe_priv *priv, u64 gvaddr, struct tlbe *stlbe) { pfn_t pfn = priv->pfn; - unsigned int stid; - - stid = kvmppc_e500_get_sid(vcpu_e500, get_tlb_ts(gtlbe), - get_tlb_tid(gtlbe), - get_cur_pr(&vcpu_e500->vcpu), 0); /* Force TS=1 IPROT=0 for all guest mappings. */ - stlbe->mas1 = MAS1_TSIZE(tsize) - | MAS1_TID(stid) | MAS1_TS | MAS1_VALID; + stlbe->mas1 = MAS1_TSIZE(tsize) | MAS1_TS | MAS1_VALID; stlbe->mas2 = (gvaddr & MAS2_EPN) | e500_shadow_mas2_attrib(gtlbe->mas2, vcpu_e500->vcpu.arch.shared->msr & MSR_PR); @@ -816,6 +811,24 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) return EMULATE_DONE; } +/* sesel is index into the set, not the whole array */ +static void write_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500, + struct tlbe *gtlbe, + struct tlbe *stlbe, + int stlbsel, int sesel) +{ + int stid; + + preempt_disable(); + stid = kvmppc_e500_get_sid(vcpu_e500, get_tlb_ts(gtlbe), + get_tlb_tid(gtlbe), + get_cur_pr(&vcpu_e500->vcpu), 0); + + stlbe->mas1 |= MAS1_TID(stid); + write_host_tlbe(vcpu_e500, stlbsel, sesel, stlbe); + preempt_enable(); +} + int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); @@ -845,7 +858,6 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) u64 eaddr; u64 raddr; - preempt_disable(); switch (tlbsel) { case 0: /* TLB0 */ @@ -874,8 +886,8 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) default: BUG(); } - write_host_tlbe(vcpu_e500, stlbsel, sesel, &stlbe); - preempt_enable(); + + write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel); } kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS); @@ -937,7 +949,6 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; - preempt_disable(); switch (tlbsel) { case 0: stlbsel = 0; @@ -962,8 +973,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, break; } - write_host_tlbe(vcpu_e500, stlbsel, sesel, &stlbe); - preempt_enable(); + write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel); } int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu, -- cgit v1.2.3 From 90b92a6f51af9adf8c44e8ab3f435b336e5ba6ff Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Thu, 18 Aug 2011 15:25:16 -0500 Subject: KVM: PPC: e500: Eliminate preempt_disable in local_sid_destroy_all The only place it makes sense to call this function already needs to have preemption disabled. Signed-off-by: Scott Wood Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kvm/e500_tlb.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index 22624a7ae821..b976d8025e58 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -116,13 +116,11 @@ static inline int local_sid_lookup(struct id *entry) return -1; } -/* Invalidate all id mappings on local core */ +/* Invalidate all id mappings on local core -- call with preempt disabled */ static inline void local_sid_destroy_all(void) { - preempt_disable(); __get_cpu_var(pcpu_last_used_sid) = 0; memset(&__get_cpu_var(pcpu_sids), 0, sizeof(__get_cpu_var(pcpu_sids))); - preempt_enable(); } static void *kvmppc_e500_id_table_alloc(struct kvmppc_vcpu_e500 *vcpu_e500) -- cgit v1.2.3 From 0164c0f0c404017fb04defb0ceb23fd1c3c3a53e Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Thu, 18 Aug 2011 15:25:18 -0500 Subject: KVM: PPC: e500: clear up confusion between host and guest entries Split out the portions of tlbe_priv that should be associated with host entries into tlbe_ref. Base victim selection on the number of hardware entries, not guest entries. For TLB1, where one guest entry can be mapped by multiple host entries, we use the host tlbe_ref for tracking page references. For the guest TLB0 entries, we still track it with gtlb_priv, to avoid having to retranslate if the entry is evicted from the host TLB but not the guest TLB. Signed-off-by: Scott Wood Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_e500.h | 24 ++- arch/powerpc/include/asm/mmu-book3e.h | 1 + arch/powerpc/kvm/e500_tlb.c | 267 ++++++++++++++++++++++++---------- arch/powerpc/kvm/e500_tlb.h | 17 --- 4 files changed, 213 insertions(+), 96 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_e500.h b/arch/powerpc/include/asm/kvm_e500.h index adbfca9dd100..a5197d816ec7 100644 --- a/arch/powerpc/include/asm/kvm_e500.h +++ b/arch/powerpc/include/asm/kvm_e500.h @@ -32,13 +32,21 @@ struct tlbe{ #define E500_TLB_VALID 1 #define E500_TLB_DIRTY 2 -struct tlbe_priv { +struct tlbe_ref { pfn_t pfn; unsigned int flags; /* E500_TLB_* */ }; +struct tlbe_priv { + struct tlbe_ref ref; /* TLB0 only -- TLB1 uses tlb_refs */ +}; + struct vcpu_id_table; +struct kvmppc_e500_tlb_params { + int entries, ways, sets; +}; + struct kvmppc_vcpu_e500 { /* Unmodified copy of the guest's TLB. */ struct tlbe *gtlb_arch[E500_TLB_NUM]; @@ -49,6 +57,20 @@ struct kvmppc_vcpu_e500 { unsigned int gtlb_size[E500_TLB_NUM]; unsigned int gtlb_nv[E500_TLB_NUM]; + /* + * information associated with each host TLB entry -- + * TLB1 only for now. If/when guest TLB1 entries can be + * mapped with host TLB0, this will be used for that too. + * + * We don't want to use this for guest TLB0 because then we'd + * have the overhead of doing the translation again even if + * the entry is still in the guest TLB (e.g. we swapped out + * and back, and our host TLB entries got evicted). + */ + struct tlbe_ref *tlb_refs[E500_TLB_NUM]; + + unsigned int host_tlb1_nv; + u32 host_pid[E500_PID_NUM]; u32 pid[E500_PID_NUM]; u32 svr; diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h index f5f89cafebd0..a0128232d9cc 100644 --- a/arch/powerpc/include/asm/mmu-book3e.h +++ b/arch/powerpc/include/asm/mmu-book3e.h @@ -167,6 +167,7 @@ #define TLBnCFG_MAXSIZE 0x000f0000 /* Maximum Page Size (v1.0) */ #define TLBnCFG_MAXSIZE_SHIFT 16 #define TLBnCFG_ASSOC 0xff000000 /* Associativity */ +#define TLBnCFG_ASSOC_SHIFT 24 /* TLBnPS encoding */ #define TLBnPS_4K 0x00000004 diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index b976d8025e58..59221bb1e00e 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -12,6 +12,7 @@ * published by the Free Software Foundation. */ +#include #include #include #include @@ -26,7 +27,7 @@ #include "trace.h" #include "timing.h" -#define to_htlb1_esel(esel) (tlb1_entry_num - (esel) - 1) +#define to_htlb1_esel(esel) (host_tlb_params[1].entries - (esel) - 1) struct id { unsigned long val; @@ -63,7 +64,7 @@ static DEFINE_PER_CPU(struct pcpu_id_table, pcpu_sids); * The valid range of shadow ID is [1..255] */ static DEFINE_PER_CPU(unsigned long, pcpu_last_used_sid); -static unsigned int tlb1_entry_num; +static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM]; /* * Allocate a free shadow id and setup a valid sid mapping in given entry. @@ -237,7 +238,7 @@ void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu) } } -static inline unsigned int tlb0_get_next_victim( +static inline unsigned int gtlb0_get_next_victim( struct kvmppc_vcpu_e500 *vcpu_e500) { unsigned int victim; @@ -252,7 +253,7 @@ static inline unsigned int tlb0_get_next_victim( static inline unsigned int tlb1_max_shadow_size(void) { /* reserve one entry for magic page */ - return tlb1_entry_num - tlbcam_index - 1; + return host_tlb_params[1].entries - tlbcam_index - 1; } static inline int tlbe_is_writable(struct tlbe *tlbe) @@ -302,13 +303,12 @@ static inline void __write_host_tlbe(struct tlbe *stlbe, uint32_t mas0) local_irq_restore(flags); } +/* esel is index into set, not whole array */ static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, int esel, struct tlbe *stlbe) { if (tlbsel == 0) { - __write_host_tlbe(stlbe, - MAS0_TLBSEL(0) | - MAS0_ESEL(esel & (KVM_E500_TLB0_WAY_NUM - 1))); + __write_host_tlbe(stlbe, MAS0_TLBSEL(0) | MAS0_ESEL(esel)); } else { __write_host_tlbe(stlbe, MAS0_TLBSEL(1) | @@ -355,8 +355,8 @@ void kvmppc_e500_tlb_put(struct kvm_vcpu *vcpu) { } -static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, - int tlbsel, int esel) +static void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500, + int tlbsel, int esel) { struct tlbe *gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; struct vcpu_id_table *idt = vcpu_e500->idt; @@ -412,18 +412,53 @@ static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, preempt_enable(); } +static int tlb0_set_base(gva_t addr, int sets, int ways) +{ + int set_base; + + set_base = (addr >> PAGE_SHIFT) & (sets - 1); + set_base *= ways; + + return set_base; +} + +static int gtlb0_set_base(struct kvmppc_vcpu_e500 *vcpu_e500, gva_t addr) +{ + int sets = KVM_E500_TLB0_SIZE / KVM_E500_TLB0_WAY_NUM; + + return tlb0_set_base(addr, sets, KVM_E500_TLB0_WAY_NUM); +} + +static int htlb0_set_base(gva_t addr) +{ + return tlb0_set_base(addr, host_tlb_params[0].sets, + host_tlb_params[0].ways); +} + +static unsigned int get_tlb_esel(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel) +{ + unsigned int esel = get_tlb_esel_bit(vcpu_e500); + + if (tlbsel == 0) { + esel &= KVM_E500_TLB0_WAY_NUM_MASK; + esel += gtlb0_set_base(vcpu_e500, vcpu_e500->mas2); + } else { + esel &= vcpu_e500->gtlb_size[tlbsel] - 1; + } + + return esel; +} + /* Search the guest TLB for a matching entry. */ static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500, gva_t eaddr, int tlbsel, unsigned int pid, int as) { int size = vcpu_e500->gtlb_size[tlbsel]; - int set_base; + unsigned int set_base; int i; if (tlbsel == 0) { - int mask = size / KVM_E500_TLB0_WAY_NUM - 1; - set_base = (eaddr >> PAGE_SHIFT) & mask; - set_base *= KVM_E500_TLB0_WAY_NUM; + set_base = gtlb0_set_base(vcpu_e500, eaddr); size = KVM_E500_TLB0_WAY_NUM; } else { set_base = 0; @@ -455,29 +490,55 @@ static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500, return -1; } -static inline void kvmppc_e500_priv_setup(struct tlbe_priv *priv, - struct tlbe *gtlbe, - pfn_t pfn) +static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, + struct tlbe *gtlbe, + pfn_t pfn) { - priv->pfn = pfn; - priv->flags = E500_TLB_VALID; + ref->pfn = pfn; + ref->flags = E500_TLB_VALID; if (tlbe_is_writable(gtlbe)) - priv->flags |= E500_TLB_DIRTY; + ref->flags |= E500_TLB_DIRTY; } -static inline void kvmppc_e500_priv_release(struct tlbe_priv *priv) +static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref) { - if (priv->flags & E500_TLB_VALID) { - if (priv->flags & E500_TLB_DIRTY) - kvm_release_pfn_dirty(priv->pfn); + if (ref->flags & E500_TLB_VALID) { + if (ref->flags & E500_TLB_DIRTY) + kvm_release_pfn_dirty(ref->pfn); else - kvm_release_pfn_clean(priv->pfn); + kvm_release_pfn_clean(ref->pfn); + + ref->flags = 0; + } +} + +static void clear_tlb_privs(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + int tlbsel = 0; + int i; - priv->flags = 0; + for (i = 0; i < vcpu_e500->gtlb_size[tlbsel]; i++) { + struct tlbe_ref *ref = + &vcpu_e500->gtlb_priv[tlbsel][i].ref; + kvmppc_e500_ref_release(ref); } } +static void clear_tlb_refs(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + int stlbsel = 1; + int i; + + for (i = 0; i < host_tlb_params[stlbsel].entries; i++) { + struct tlbe_ref *ref = + &vcpu_e500->tlb_refs[stlbsel][i]; + kvmppc_e500_ref_release(ref); + } + + clear_tlb_privs(vcpu_e500); +} + static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, unsigned int eaddr, int as) { @@ -487,7 +548,7 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, /* since we only have two TLBs, only lower bit is used. */ tlbsel = (vcpu_e500->mas4 >> 28) & 0x1; - victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0; + victim = (tlbsel == 0) ? gtlb0_get_next_victim(vcpu_e500) : 0; pidsel = (vcpu_e500->mas4 >> 16) & 0xf; tsized = (vcpu_e500->mas4 >> 7) & 0x1f; @@ -508,10 +569,12 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, /* TID must be supplied by the caller */ static inline void kvmppc_e500_setup_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500, struct tlbe *gtlbe, int tsize, - struct tlbe_priv *priv, + struct tlbe_ref *ref, u64 gvaddr, struct tlbe *stlbe) { - pfn_t pfn = priv->pfn; + pfn_t pfn = ref->pfn; + + BUG_ON(!(ref->flags & E500_TLB_VALID)); /* Force TS=1 IPROT=0 for all guest mappings. */ stlbe->mas1 = MAS1_TSIZE(tsize) | MAS1_TS | MAS1_VALID; @@ -524,16 +587,15 @@ static inline void kvmppc_e500_setup_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500, stlbe->mas7 = (pfn >> (32 - PAGE_SHIFT)) & MAS7_RPN; } - +/* sesel is an index into the entire array, not just the set */ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, - u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, int tlbsel, int esel, - struct tlbe *stlbe) + u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, int tlbsel, int sesel, + struct tlbe *stlbe, struct tlbe_ref *ref) { struct kvm_memory_slot *slot; unsigned long pfn, hva; int pfnmap = 0; int tsize = BOOK3E_PAGESZ_4K; - struct tlbe_priv *priv; /* * Translate guest physical to true physical, acquiring @@ -629,12 +691,11 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, } } - /* Drop old priv and setup new one. */ - priv = &vcpu_e500->gtlb_priv[tlbsel][esel]; - kvmppc_e500_priv_release(priv); - kvmppc_e500_priv_setup(priv, gtlbe, pfn); + /* Drop old ref and setup new one. */ + kvmppc_e500_ref_release(ref); + kvmppc_e500_ref_setup(ref, gtlbe, pfn); - kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, tsize, priv, gvaddr, stlbe); + kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, tsize, ref, gvaddr, stlbe); } /* XXX only map the one-one case, for now use TLB0 */ @@ -642,14 +703,22 @@ static int kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500, int esel, struct tlbe *stlbe) { struct tlbe *gtlbe; + struct tlbe_ref *ref; + int sesel = esel & (host_tlb_params[0].ways - 1); + int sesel_base; + gva_t ea; gtlbe = &vcpu_e500->gtlb_arch[0][esel]; + ref = &vcpu_e500->gtlb_priv[0][esel].ref; + + ea = get_tlb_eaddr(gtlbe); + sesel_base = htlb0_set_base(ea); kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe), get_tlb_raddr(gtlbe) >> PAGE_SHIFT, - gtlbe, 0, esel, stlbe); + gtlbe, 0, sesel_base + sesel, stlbe, ref); - return esel; + return sesel; } /* Caller must ensure that the specified guest TLB entry is safe to insert into @@ -658,14 +727,17 @@ static int kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500, static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500, u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, struct tlbe *stlbe) { + struct tlbe_ref *ref; unsigned int victim; - victim = vcpu_e500->gtlb_nv[1]++; + victim = vcpu_e500->host_tlb1_nv++; - if (unlikely(vcpu_e500->gtlb_nv[1] >= tlb1_max_shadow_size())) - vcpu_e500->gtlb_nv[1] = 0; + if (unlikely(vcpu_e500->host_tlb1_nv >= tlb1_max_shadow_size())) + vcpu_e500->host_tlb1_nv = 0; - kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, victim, stlbe); + ref = &vcpu_e500->tlb_refs[1][victim]; + kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, + victim, stlbe, ref); return victim; } @@ -792,7 +864,7 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) /* since we only have two TLBs, only lower bit is used. */ tlbsel = vcpu_e500->mas4 >> 28 & 0x1; - victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0; + victim = (tlbsel == 0) ? gtlb0_get_next_victim(vcpu_e500) : 0; vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); @@ -839,7 +911,7 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; if (get_tlb_v(gtlbe)) - kvmppc_e500_stlbe_invalidate(vcpu_e500, tlbsel, esel); + inval_gtlbe_on_host(vcpu_e500, tlbsel, esel); gtlbe->mas1 = vcpu_e500->mas1; gtlbe->mas2 = vcpu_e500->mas2; @@ -950,11 +1022,11 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, switch (tlbsel) { case 0: stlbsel = 0; - sesel = esel; - priv = &vcpu_e500->gtlb_priv[stlbsel][sesel]; + sesel = esel & (host_tlb_params[0].ways - 1); + priv = &vcpu_e500->gtlb_priv[tlbsel][esel]; kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, BOOK3E_PAGESZ_4K, - priv, eaddr, &stlbe); + &priv->ref, eaddr, &stlbe); break; case 1: { @@ -1020,32 +1092,76 @@ void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500) int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) { - tlb1_entry_num = mfspr(SPRN_TLB1CFG) & 0xFFF; + host_tlb_params[0].entries = mfspr(SPRN_TLB0CFG) & TLBnCFG_N_ENTRY; + host_tlb_params[1].entries = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; + + /* + * This should never happen on real e500 hardware, but is + * architecturally possible -- e.g. in some weird nested + * virtualization case. + */ + if (host_tlb_params[0].entries == 0 || + host_tlb_params[1].entries == 0) { + pr_err("%s: need to know host tlb size\n", __func__); + return -ENODEV; + } + + host_tlb_params[0].ways = (mfspr(SPRN_TLB0CFG) & TLBnCFG_ASSOC) >> + TLBnCFG_ASSOC_SHIFT; + host_tlb_params[1].ways = host_tlb_params[1].entries; + + if (!is_power_of_2(host_tlb_params[0].entries) || + !is_power_of_2(host_tlb_params[0].ways) || + host_tlb_params[0].entries < host_tlb_params[0].ways || + host_tlb_params[0].ways == 0) { + pr_err("%s: bad tlb0 host config: %u entries %u ways\n", + __func__, host_tlb_params[0].entries, + host_tlb_params[0].ways); + return -ENODEV; + } + + host_tlb_params[0].sets = + host_tlb_params[0].entries / host_tlb_params[0].ways; + host_tlb_params[1].sets = 1; vcpu_e500->gtlb_size[0] = KVM_E500_TLB0_SIZE; vcpu_e500->gtlb_arch[0] = kzalloc(sizeof(struct tlbe) * KVM_E500_TLB0_SIZE, GFP_KERNEL); if (vcpu_e500->gtlb_arch[0] == NULL) - goto err_out; + goto err; vcpu_e500->gtlb_size[1] = KVM_E500_TLB1_SIZE; vcpu_e500->gtlb_arch[1] = kzalloc(sizeof(struct tlbe) * KVM_E500_TLB1_SIZE, GFP_KERNEL); if (vcpu_e500->gtlb_arch[1] == NULL) - goto err_out_guest0; - - vcpu_e500->gtlb_priv[0] = (struct tlbe_priv *) - kzalloc(sizeof(struct tlbe_priv) * KVM_E500_TLB0_SIZE, GFP_KERNEL); - if (vcpu_e500->gtlb_priv[0] == NULL) - goto err_out_guest1; - vcpu_e500->gtlb_priv[1] = (struct tlbe_priv *) - kzalloc(sizeof(struct tlbe_priv) * KVM_E500_TLB1_SIZE, GFP_KERNEL); - - if (vcpu_e500->gtlb_priv[1] == NULL) - goto err_out_priv0; + goto err; + + vcpu_e500->tlb_refs[0] = + kzalloc(sizeof(struct tlbe_ref) * host_tlb_params[0].entries, + GFP_KERNEL); + if (!vcpu_e500->tlb_refs[0]) + goto err; + + vcpu_e500->tlb_refs[1] = + kzalloc(sizeof(struct tlbe_ref) * host_tlb_params[1].entries, + GFP_KERNEL); + if (!vcpu_e500->tlb_refs[1]) + goto err; + + vcpu_e500->gtlb_priv[0] = + kzalloc(sizeof(struct tlbe_ref) * vcpu_e500->gtlb_size[0], + GFP_KERNEL); + if (!vcpu_e500->gtlb_priv[0]) + goto err; + + vcpu_e500->gtlb_priv[1] = + kzalloc(sizeof(struct tlbe_ref) * vcpu_e500->gtlb_size[1], + GFP_KERNEL); + if (!vcpu_e500->gtlb_priv[1]) + goto err; if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL) - goto err_out_priv1; + goto err; /* Init TLB configuration register */ vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & ~0xfffUL; @@ -1055,31 +1171,26 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) return 0; -err_out_priv1: - kfree(vcpu_e500->gtlb_priv[1]); -err_out_priv0: +err: + kfree(vcpu_e500->tlb_refs[0]); + kfree(vcpu_e500->tlb_refs[1]); kfree(vcpu_e500->gtlb_priv[0]); -err_out_guest1: - kfree(vcpu_e500->gtlb_arch[1]); -err_out_guest0: + kfree(vcpu_e500->gtlb_priv[1]); kfree(vcpu_e500->gtlb_arch[0]); -err_out: + kfree(vcpu_e500->gtlb_arch[1]); return -1; } void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500) { - int stlbsel, i; - - /* release all privs */ - for (stlbsel = 0; stlbsel < 2; stlbsel++) - for (i = 0; i < vcpu_e500->gtlb_size[stlbsel]; i++) { - struct tlbe_priv *priv = - &vcpu_e500->gtlb_priv[stlbsel][i]; - kvmppc_e500_priv_release(priv); - } + clear_tlb_refs(vcpu_e500); kvmppc_e500_id_table_free(vcpu_e500); + + kfree(vcpu_e500->tlb_refs[0]); + kfree(vcpu_e500->tlb_refs[1]); + kfree(vcpu_e500->gtlb_priv[0]); + kfree(vcpu_e500->gtlb_priv[1]); kfree(vcpu_e500->gtlb_arch[1]); kfree(vcpu_e500->gtlb_arch[0]); } diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500_tlb.h index 59b88e99a235..b587f691459a 100644 --- a/arch/powerpc/kvm/e500_tlb.h +++ b/arch/powerpc/kvm/e500_tlb.h @@ -155,23 +155,6 @@ static inline unsigned int get_tlb_esel_bit( return (vcpu_e500->mas0 >> 16) & 0xfff; } -static inline unsigned int get_tlb_esel( - const struct kvmppc_vcpu_e500 *vcpu_e500, - int tlbsel) -{ - unsigned int esel = get_tlb_esel_bit(vcpu_e500); - - if (tlbsel == 0) { - esel &= KVM_E500_TLB0_WAY_NUM_MASK; - esel |= ((vcpu_e500->mas2 >> 12) & KVM_E500_TLB0_WAY_SIZE_MASK) - << KVM_E500_TLB0_WAY_NUM_BIT; - } else { - esel &= KVM_E500_TLB1_SIZE - 1; - } - - return esel; -} - static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu, const struct tlbe *tlbe) { -- cgit v1.2.3 From dc83b8bc0256ee682506ed83853a98eaba529c6f Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Thu, 18 Aug 2011 15:25:21 -0500 Subject: KVM: PPC: e500: MMU API This implements a shared-memory API for giving host userspace access to the guest's TLB. Signed-off-by: Scott Wood Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- Documentation/virtual/kvm/api.txt | 74 +++++++ arch/powerpc/include/asm/kvm.h | 35 ++++ arch/powerpc/include/asm/kvm_e500.h | 24 +-- arch/powerpc/include/asm/kvm_ppc.h | 5 + arch/powerpc/kvm/e500.c | 5 +- arch/powerpc/kvm/e500_emulate.c | 12 +- arch/powerpc/kvm/e500_tlb.c | 393 ++++++++++++++++++++++++------------ arch/powerpc/kvm/e500_tlb.h | 38 ++-- arch/powerpc/kvm/powerpc.c | 28 +++ include/linux/kvm.h | 18 ++ 10 files changed, 469 insertions(+), 163 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 7ca696227d3a..bcd45d5afcab 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1409,6 +1409,38 @@ The following flags are defined: If datamatch flag is set, the event will be signaled only if the written value to the registered address is equal to datamatch in struct kvm_ioeventfd. +4.59 KVM_DIRTY_TLB + +Capability: KVM_CAP_SW_TLB +Architectures: ppc +Type: vcpu ioctl +Parameters: struct kvm_dirty_tlb (in) +Returns: 0 on success, -1 on error + +struct kvm_dirty_tlb { + __u64 bitmap; + __u32 num_dirty; +}; + +This must be called whenever userspace has changed an entry in the shared +TLB, prior to calling KVM_RUN on the associated vcpu. + +The "bitmap" field is the userspace address of an array. This array +consists of a number of bits, equal to the total number of TLB entries as +determined by the last successful call to KVM_CONFIG_TLB, rounded up to the +nearest multiple of 64. + +Each bit corresponds to one TLB entry, ordered the same as in the shared TLB +array. + +The array is little-endian: the bit 0 is the least significant bit of the +first byte, bit 8 is the least significant bit of the second byte, etc. +This avoids any complications with differing word sizes. + +The "num_dirty" field is a performance hint for KVM to determine whether it +should skip processing the bitmap and just invalidate everything. It must +be set to the number of set bits in the bitmap. + 4.62 KVM_CREATE_SPAPR_TCE Capability: KVM_CAP_SPAPR_TCE @@ -1842,3 +1874,45 @@ HTAB address part of SDR1 contains an HVA instead of a GPA, as PAPR keeps the HTAB invisible to the guest. When this capability is enabled, KVM_EXIT_PAPR_HCALL can occur. + +6.3 KVM_CAP_SW_TLB + +Architectures: ppc +Parameters: args[0] is the address of a struct kvm_config_tlb +Returns: 0 on success; -1 on error + +struct kvm_config_tlb { + __u64 params; + __u64 array; + __u32 mmu_type; + __u32 array_len; +}; + +Configures the virtual CPU's TLB array, establishing a shared memory area +between userspace and KVM. The "params" and "array" fields are userspace +addresses of mmu-type-specific data structures. The "array_len" field is an +safety mechanism, and should be set to the size in bytes of the memory that +userspace has reserved for the array. It must be at least the size dictated +by "mmu_type" and "params". + +While KVM_RUN is active, the shared region is under control of KVM. Its +contents are undefined, and any modification by userspace results in +boundedly undefined behavior. + +On return from KVM_RUN, the shared region will reflect the current state of +the guest's TLB. If userspace makes any changes, it must call KVM_DIRTY_TLB +to tell KVM which entries have been changed, prior to calling KVM_RUN again +on this vcpu. + +For mmu types KVM_MMU_FSL_BOOKE_NOHV and KVM_MMU_FSL_BOOKE_HV: + - The "params" field is of type "struct kvm_book3e_206_tlb_params". + - The "array" field points to an array of type "struct + kvm_book3e_206_tlb_entry". + - The array consists of all entries in the first TLB, followed by all + entries in the second TLB. + - Within a TLB, entries are ordered first by increasing set number. Within a + set, entries are ordered by way (increasing ESEL). + - The hash for determining set number in TLB0 is: (MAS2 >> 12) & (num_sets - 1) + where "num_sets" is the tlb_sizes[] value divided by the tlb_ways[] value. + - The tsize field of mas1 shall be set to 4K on TLB0, even though the + hardware ignores this value for TLB0. diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index 7d9d4de057ef..663c57f87165 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -296,4 +296,39 @@ struct kvm_allocate_rma { __u64 rma_size; }; +struct kvm_book3e_206_tlb_entry { + __u32 mas8; + __u32 mas1; + __u64 mas2; + __u64 mas7_3; +}; + +struct kvm_book3e_206_tlb_params { + /* + * For mmu types KVM_MMU_FSL_BOOKE_NOHV and KVM_MMU_FSL_BOOKE_HV: + * + * - The number of ways of TLB0 must be a power of two between 2 and + * 16. + * - TLB1 must be fully associative. + * - The size of TLB0 must be a multiple of the number of ways, and + * the number of sets must be a power of two. + * - The size of TLB1 may not exceed 64 entries. + * - TLB0 supports 4 KiB pages. + * - The page sizes supported by TLB1 are as indicated by + * TLB1CFG (if MMUCFG[MAVN] = 0) or TLB1PS (if MMUCFG[MAVN] = 1) + * as returned by KVM_GET_SREGS. + * - TLB2 and TLB3 are reserved, and their entries in tlb_sizes[] + * and tlb_ways[] must be zero. + * + * tlb_ways[n] = tlb_sizes[n] means the array is fully associative. + * + * KVM will adjust TLBnCFG based on the sizes configured here, + * though arrays greater than 2048 entries will have TLBnCFG[NENTRY] + * set to zero. + */ + __u32 tlb_sizes[4]; + __u32 tlb_ways[4]; + __u32 reserved[8]; +}; + #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/include/asm/kvm_e500.h b/arch/powerpc/include/asm/kvm_e500.h index a5197d816ec7..bc17441535f2 100644 --- a/arch/powerpc/include/asm/kvm_e500.h +++ b/arch/powerpc/include/asm/kvm_e500.h @@ -22,13 +22,6 @@ #define E500_PID_NUM 3 #define E500_TLB_NUM 2 -struct tlbe{ - u32 mas1; - u32 mas2; - u32 mas3; - u32 mas7; -}; - #define E500_TLB_VALID 1 #define E500_TLB_DIRTY 2 @@ -48,13 +41,17 @@ struct kvmppc_e500_tlb_params { }; struct kvmppc_vcpu_e500 { - /* Unmodified copy of the guest's TLB. */ - struct tlbe *gtlb_arch[E500_TLB_NUM]; + /* Unmodified copy of the guest's TLB -- shared with host userspace. */ + struct kvm_book3e_206_tlb_entry *gtlb_arch; + + /* Starting entry number in gtlb_arch[] */ + int gtlb_offset[E500_TLB_NUM]; /* KVM internal information associated with each guest TLB entry */ struct tlbe_priv *gtlb_priv[E500_TLB_NUM]; - unsigned int gtlb_size[E500_TLB_NUM]; + struct kvmppc_e500_tlb_params gtlb_params[E500_TLB_NUM]; + unsigned int gtlb_nv[E500_TLB_NUM]; /* @@ -68,7 +65,6 @@ struct kvmppc_vcpu_e500 { * and back, and our host TLB entries got evicted). */ struct tlbe_ref *tlb_refs[E500_TLB_NUM]; - unsigned int host_tlb1_nv; u32 host_pid[E500_PID_NUM]; @@ -78,11 +74,10 @@ struct kvmppc_vcpu_e500 { u32 mas0; u32 mas1; u32 mas2; - u32 mas3; + u64 mas7_3; u32 mas4; u32 mas5; u32 mas6; - u32 mas7; /* vcpu id table */ struct vcpu_id_table *idt; @@ -95,6 +90,9 @@ struct kvmppc_vcpu_e500 { u32 tlb1cfg; u64 mcar; + struct page **shared_tlb_pages; + int num_shared_tlb_pages; + struct kvm_vcpu vcpu; }; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 46efd1a265c9..a284f209e2df 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -193,4 +193,9 @@ static inline void kvm_rma_init(void) {} #endif +int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, + struct kvm_config_tlb *cfg); +int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu, + struct kvm_dirty_tlb *cfg); + #endif /* __POWERPC_KVM_PPC_H__ */ diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index 8c0d45a6faf7..f17d7e732a1e 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -121,7 +121,7 @@ void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) sregs->u.e.mas0 = vcpu_e500->mas0; sregs->u.e.mas1 = vcpu_e500->mas1; sregs->u.e.mas2 = vcpu_e500->mas2; - sregs->u.e.mas7_3 = ((u64)vcpu_e500->mas7 << 32) | vcpu_e500->mas3; + sregs->u.e.mas7_3 = vcpu_e500->mas7_3; sregs->u.e.mas4 = vcpu_e500->mas4; sregs->u.e.mas6 = vcpu_e500->mas6; @@ -154,8 +154,7 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) vcpu_e500->mas0 = sregs->u.e.mas0; vcpu_e500->mas1 = sregs->u.e.mas1; vcpu_e500->mas2 = sregs->u.e.mas2; - vcpu_e500->mas7 = sregs->u.e.mas7_3 >> 32; - vcpu_e500->mas3 = (u32)sregs->u.e.mas7_3; + vcpu_e500->mas7_3 = sregs->u.e.mas7_3; vcpu_e500->mas4 = sregs->u.e.mas4; vcpu_e500->mas6 = sregs->u.e.mas6; } diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index d48ae396f41e..e0d36099c756 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -95,13 +95,17 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) case SPRN_MAS2: vcpu_e500->mas2 = spr_val; break; case SPRN_MAS3: - vcpu_e500->mas3 = spr_val; break; + vcpu_e500->mas7_3 &= ~(u64)0xffffffff; + vcpu_e500->mas7_3 |= spr_val; + break; case SPRN_MAS4: vcpu_e500->mas4 = spr_val; break; case SPRN_MAS6: vcpu_e500->mas6 = spr_val; break; case SPRN_MAS7: - vcpu_e500->mas7 = spr_val; break; + vcpu_e500->mas7_3 &= (u64)0xffffffff; + vcpu_e500->mas7_3 |= (u64)spr_val << 32; + break; case SPRN_L1CSR0: vcpu_e500->l1csr0 = spr_val; vcpu_e500->l1csr0 &= ~(L1CSR0_DCFI | L1CSR0_CLFC); @@ -158,13 +162,13 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) case SPRN_MAS2: kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas2); break; case SPRN_MAS3: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas3); break; + kvmppc_set_gpr(vcpu, rt, (u32)vcpu_e500->mas7_3); break; case SPRN_MAS4: kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas4); break; case SPRN_MAS6: kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas6); break; case SPRN_MAS7: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas7); break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas7_3 >> 32); break; case SPRN_TLB0CFG: kvmppc_set_gpr(vcpu, rt, vcpu_e500->tlb0cfg); break; diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index 59221bb1e00e..f19ae2f61521 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -19,6 +19,11 @@ #include #include #include +#include +#include +#include +#include +#include #include #include @@ -66,6 +71,13 @@ static DEFINE_PER_CPU(unsigned long, pcpu_last_used_sid); static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM]; +static struct kvm_book3e_206_tlb_entry *get_entry( + struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, int entry) +{ + int offset = vcpu_e500->gtlb_offset[tlbsel]; + return &vcpu_e500->gtlb_arch[offset + entry]; +} + /* * Allocate a free shadow id and setup a valid sid mapping in given entry. * A mapping is only valid when vcpu_id_table and pcpu_id_table are match. @@ -217,34 +229,13 @@ void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *vcpu_e500) preempt_enable(); } -void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu) -{ - struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - struct tlbe *tlbe; - int i, tlbsel; - - printk("| %8s | %8s | %8s | %8s | %8s |\n", - "nr", "mas1", "mas2", "mas3", "mas7"); - - for (tlbsel = 0; tlbsel < 2; tlbsel++) { - printk("Guest TLB%d:\n", tlbsel); - for (i = 0; i < vcpu_e500->gtlb_size[tlbsel]; i++) { - tlbe = &vcpu_e500->gtlb_arch[tlbsel][i]; - if (tlbe->mas1 & MAS1_VALID) - printk(" G[%d][%3d] | %08X | %08X | %08X | %08X |\n", - tlbsel, i, tlbe->mas1, tlbe->mas2, - tlbe->mas3, tlbe->mas7); - } - } -} - static inline unsigned int gtlb0_get_next_victim( struct kvmppc_vcpu_e500 *vcpu_e500) { unsigned int victim; victim = vcpu_e500->gtlb_nv[0]++; - if (unlikely(vcpu_e500->gtlb_nv[0] >= KVM_E500_TLB0_WAY_NUM)) + if (unlikely(vcpu_e500->gtlb_nv[0] >= vcpu_e500->gtlb_params[0].ways)) vcpu_e500->gtlb_nv[0] = 0; return victim; @@ -256,9 +247,9 @@ static inline unsigned int tlb1_max_shadow_size(void) return host_tlb_params[1].entries - tlbcam_index - 1; } -static inline int tlbe_is_writable(struct tlbe *tlbe) +static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe) { - return tlbe->mas3 & (MAS3_SW|MAS3_UW); + return tlbe->mas7_3 & (MAS3_SW|MAS3_UW); } static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode) @@ -289,39 +280,41 @@ static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode) /* * writing shadow tlb entry to host TLB */ -static inline void __write_host_tlbe(struct tlbe *stlbe, uint32_t mas0) +static inline void __write_host_tlbe(struct kvm_book3e_206_tlb_entry *stlbe, + uint32_t mas0) { unsigned long flags; local_irq_save(flags); mtspr(SPRN_MAS0, mas0); mtspr(SPRN_MAS1, stlbe->mas1); - mtspr(SPRN_MAS2, stlbe->mas2); - mtspr(SPRN_MAS3, stlbe->mas3); - mtspr(SPRN_MAS7, stlbe->mas7); + mtspr(SPRN_MAS2, (unsigned long)stlbe->mas2); + mtspr(SPRN_MAS3, (u32)stlbe->mas7_3); + mtspr(SPRN_MAS7, (u32)(stlbe->mas7_3 >> 32)); asm volatile("isync; tlbwe" : : : "memory"); local_irq_restore(flags); } /* esel is index into set, not whole array */ static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500, - int tlbsel, int esel, struct tlbe *stlbe) + int tlbsel, int esel, struct kvm_book3e_206_tlb_entry *stlbe) { if (tlbsel == 0) { - __write_host_tlbe(stlbe, MAS0_TLBSEL(0) | MAS0_ESEL(esel)); + int way = esel & (vcpu_e500->gtlb_params[0].ways - 1); + __write_host_tlbe(stlbe, MAS0_TLBSEL(0) | MAS0_ESEL(way)); } else { __write_host_tlbe(stlbe, MAS0_TLBSEL(1) | MAS0_ESEL(to_htlb1_esel(esel))); } trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2, - stlbe->mas3, stlbe->mas7); + (u32)stlbe->mas7_3, (u32)(stlbe->mas7_3 >> 32)); } void kvmppc_map_magic(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - struct tlbe magic; + struct kvm_book3e_206_tlb_entry magic; ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK; unsigned int stid; pfn_t pfn; @@ -335,9 +328,8 @@ void kvmppc_map_magic(struct kvm_vcpu *vcpu) magic.mas1 = MAS1_VALID | MAS1_TS | MAS1_TID(stid) | MAS1_TSIZE(BOOK3E_PAGESZ_4K); magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M; - magic.mas3 = (pfn << PAGE_SHIFT) | - MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR; - magic.mas7 = pfn >> (32 - PAGE_SHIFT); + magic.mas7_3 = ((u64)pfn << PAGE_SHIFT) | + MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR; __write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index)); preempt_enable(); @@ -358,7 +350,8 @@ void kvmppc_e500_tlb_put(struct kvm_vcpu *vcpu) static void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, int esel) { - struct tlbe *gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; + struct kvm_book3e_206_tlb_entry *gtlbe = + get_entry(vcpu_e500, tlbsel, esel); struct vcpu_id_table *idt = vcpu_e500->idt; unsigned int pr, tid, ts, pid; u32 val, eaddr; @@ -424,9 +417,8 @@ static int tlb0_set_base(gva_t addr, int sets, int ways) static int gtlb0_set_base(struct kvmppc_vcpu_e500 *vcpu_e500, gva_t addr) { - int sets = KVM_E500_TLB0_SIZE / KVM_E500_TLB0_WAY_NUM; - - return tlb0_set_base(addr, sets, KVM_E500_TLB0_WAY_NUM); + return tlb0_set_base(addr, vcpu_e500->gtlb_params[0].sets, + vcpu_e500->gtlb_params[0].ways); } static int htlb0_set_base(gva_t addr) @@ -440,10 +432,10 @@ static unsigned int get_tlb_esel(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel) unsigned int esel = get_tlb_esel_bit(vcpu_e500); if (tlbsel == 0) { - esel &= KVM_E500_TLB0_WAY_NUM_MASK; + esel &= vcpu_e500->gtlb_params[0].ways - 1; esel += gtlb0_set_base(vcpu_e500, vcpu_e500->mas2); } else { - esel &= vcpu_e500->gtlb_size[tlbsel] - 1; + esel &= vcpu_e500->gtlb_params[tlbsel].entries - 1; } return esel; @@ -453,19 +445,22 @@ static unsigned int get_tlb_esel(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel) static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500, gva_t eaddr, int tlbsel, unsigned int pid, int as) { - int size = vcpu_e500->gtlb_size[tlbsel]; - unsigned int set_base; + int size = vcpu_e500->gtlb_params[tlbsel].entries; + unsigned int set_base, offset; int i; if (tlbsel == 0) { set_base = gtlb0_set_base(vcpu_e500, eaddr); - size = KVM_E500_TLB0_WAY_NUM; + size = vcpu_e500->gtlb_params[0].ways; } else { set_base = 0; } + offset = vcpu_e500->gtlb_offset[tlbsel]; + for (i = 0; i < size; i++) { - struct tlbe *tlbe = &vcpu_e500->gtlb_arch[tlbsel][set_base + i]; + struct kvm_book3e_206_tlb_entry *tlbe = + &vcpu_e500->gtlb_arch[offset + set_base + i]; unsigned int tid; if (eaddr < get_tlb_eaddr(tlbe)) @@ -491,7 +486,7 @@ static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500, } static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, - struct tlbe *gtlbe, + struct kvm_book3e_206_tlb_entry *gtlbe, pfn_t pfn) { ref->pfn = pfn; @@ -518,7 +513,7 @@ static void clear_tlb_privs(struct kvmppc_vcpu_e500 *vcpu_e500) int tlbsel = 0; int i; - for (i = 0; i < vcpu_e500->gtlb_size[tlbsel]; i++) { + for (i = 0; i < vcpu_e500->gtlb_params[tlbsel].entries; i++) { struct tlbe_ref *ref = &vcpu_e500->gtlb_priv[tlbsel][i].ref; kvmppc_e500_ref_release(ref); @@ -530,6 +525,8 @@ static void clear_tlb_refs(struct kvmppc_vcpu_e500 *vcpu_e500) int stlbsel = 1; int i; + kvmppc_e500_id_table_reset_all(vcpu_e500); + for (i = 0; i < host_tlb_params[stlbsel].entries; i++) { struct tlbe_ref *ref = &vcpu_e500->tlb_refs[stlbsel][i]; @@ -559,18 +556,18 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, | MAS1_TSIZE(tsized); vcpu_e500->mas2 = (eaddr & MAS2_EPN) | (vcpu_e500->mas4 & MAS2_ATTRIB_MASK); - vcpu_e500->mas3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3; + vcpu_e500->mas7_3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3; vcpu_e500->mas6 = (vcpu_e500->mas6 & MAS6_SPID1) | (get_cur_pid(vcpu) << 16) | (as ? MAS6_SAS : 0); - vcpu_e500->mas7 = 0; } /* TID must be supplied by the caller */ -static inline void kvmppc_e500_setup_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500, - struct tlbe *gtlbe, int tsize, - struct tlbe_ref *ref, - u64 gvaddr, struct tlbe *stlbe) +static inline void kvmppc_e500_setup_stlbe( + struct kvmppc_vcpu_e500 *vcpu_e500, + struct kvm_book3e_206_tlb_entry *gtlbe, + int tsize, struct tlbe_ref *ref, u64 gvaddr, + struct kvm_book3e_206_tlb_entry *stlbe) { pfn_t pfn = ref->pfn; @@ -581,16 +578,16 @@ static inline void kvmppc_e500_setup_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500, stlbe->mas2 = (gvaddr & MAS2_EPN) | e500_shadow_mas2_attrib(gtlbe->mas2, vcpu_e500->vcpu.arch.shared->msr & MSR_PR); - stlbe->mas3 = ((pfn << PAGE_SHIFT) & MAS3_RPN) - | e500_shadow_mas3_attrib(gtlbe->mas3, + stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) + | e500_shadow_mas3_attrib(gtlbe->mas7_3, vcpu_e500->vcpu.arch.shared->msr & MSR_PR); - stlbe->mas7 = (pfn >> (32 - PAGE_SHIFT)) & MAS7_RPN; } /* sesel is an index into the entire array, not just the set */ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, - u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, int tlbsel, int sesel, - struct tlbe *stlbe, struct tlbe_ref *ref) + u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe, + int tlbsel, int sesel, struct kvm_book3e_206_tlb_entry *stlbe, + struct tlbe_ref *ref) { struct kvm_memory_slot *slot; unsigned long pfn, hva; @@ -700,15 +697,16 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, /* XXX only map the one-one case, for now use TLB0 */ static int kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500, - int esel, struct tlbe *stlbe) + int esel, + struct kvm_book3e_206_tlb_entry *stlbe) { - struct tlbe *gtlbe; + struct kvm_book3e_206_tlb_entry *gtlbe; struct tlbe_ref *ref; int sesel = esel & (host_tlb_params[0].ways - 1); int sesel_base; gva_t ea; - gtlbe = &vcpu_e500->gtlb_arch[0][esel]; + gtlbe = get_entry(vcpu_e500, 0, esel); ref = &vcpu_e500->gtlb_priv[0][esel].ref; ea = get_tlb_eaddr(gtlbe); @@ -725,7 +723,8 @@ static int kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500, * the shadow TLB. */ /* XXX for both one-one and one-to-many , for now use TLB1 */ static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500, - u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, struct tlbe *stlbe) + u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe, + struct kvm_book3e_206_tlb_entry *stlbe) { struct tlbe_ref *ref; unsigned int victim; @@ -754,7 +753,8 @@ static inline int kvmppc_e500_gtlbe_invalidate( struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, int esel) { - struct tlbe *gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; + struct kvm_book3e_206_tlb_entry *gtlbe = + get_entry(vcpu_e500, tlbsel, esel); if (unlikely(get_tlb_iprot(gtlbe))) return -1; @@ -769,10 +769,10 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value) int esel; if (value & MMUCSR0_TLB0FI) - for (esel = 0; esel < vcpu_e500->gtlb_size[0]; esel++) + for (esel = 0; esel < vcpu_e500->gtlb_params[0].entries; esel++) kvmppc_e500_gtlbe_invalidate(vcpu_e500, 0, esel); if (value & MMUCSR0_TLB1FI) - for (esel = 0; esel < vcpu_e500->gtlb_size[1]; esel++) + for (esel = 0; esel < vcpu_e500->gtlb_params[1].entries; esel++) kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel); /* Invalidate all vcpu id mappings */ @@ -797,7 +797,8 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb) if (ia) { /* invalidate all entries */ - for (esel = 0; esel < vcpu_e500->gtlb_size[tlbsel]; esel++) + for (esel = 0; esel < vcpu_e500->gtlb_params[tlbsel].entries; + esel++) kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); } else { ea &= 0xfffff000; @@ -817,18 +818,17 @@ int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int tlbsel, esel; - struct tlbe *gtlbe; + struct kvm_book3e_206_tlb_entry *gtlbe; tlbsel = get_tlb_tlbsel(vcpu_e500); esel = get_tlb_esel(vcpu_e500, tlbsel); - gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; + gtlbe = get_entry(vcpu_e500, tlbsel, esel); vcpu_e500->mas0 &= ~MAS0_NV(~0); vcpu_e500->mas0 |= MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); vcpu_e500->mas1 = gtlbe->mas1; vcpu_e500->mas2 = gtlbe->mas2; - vcpu_e500->mas3 = gtlbe->mas3; - vcpu_e500->mas7 = gtlbe->mas7; + vcpu_e500->mas7_3 = gtlbe->mas7_3; return EMULATE_DONE; } @@ -839,7 +839,7 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) int as = !!get_cur_sas(vcpu_e500); unsigned int pid = get_cur_spid(vcpu_e500); int esel, tlbsel; - struct tlbe *gtlbe = NULL; + struct kvm_book3e_206_tlb_entry *gtlbe = NULL; gva_t ea; ea = kvmppc_get_gpr(vcpu, rb); @@ -847,7 +847,7 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) for (tlbsel = 0; tlbsel < 2; tlbsel++) { esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as); if (esel >= 0) { - gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; + gtlbe = get_entry(vcpu_e500, tlbsel, esel); break; } } @@ -857,8 +857,7 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); vcpu_e500->mas1 = gtlbe->mas1; vcpu_e500->mas2 = gtlbe->mas2; - vcpu_e500->mas3 = gtlbe->mas3; - vcpu_e500->mas7 = gtlbe->mas7; + vcpu_e500->mas7_3 = gtlbe->mas7_3; } else { int victim; @@ -873,8 +872,7 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) | (vcpu_e500->mas4 & MAS4_TSIZED(~0)); vcpu_e500->mas2 &= MAS2_EPN; vcpu_e500->mas2 |= vcpu_e500->mas4 & MAS2_ATTRIB_MASK; - vcpu_e500->mas3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3; - vcpu_e500->mas7 = 0; + vcpu_e500->mas7_3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3; } kvmppc_set_exit_type(vcpu, EMULATED_TLBSX_EXITS); @@ -883,8 +881,8 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) /* sesel is index into the set, not the whole array */ static void write_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500, - struct tlbe *gtlbe, - struct tlbe *stlbe, + struct kvm_book3e_206_tlb_entry *gtlbe, + struct kvm_book3e_206_tlb_entry *stlbe, int stlbsel, int sesel) { int stid; @@ -902,28 +900,27 @@ static void write_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500, int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - struct tlbe *gtlbe; + struct kvm_book3e_206_tlb_entry *gtlbe; int tlbsel, esel; tlbsel = get_tlb_tlbsel(vcpu_e500); esel = get_tlb_esel(vcpu_e500, tlbsel); - gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; + gtlbe = get_entry(vcpu_e500, tlbsel, esel); if (get_tlb_v(gtlbe)) inval_gtlbe_on_host(vcpu_e500, tlbsel, esel); gtlbe->mas1 = vcpu_e500->mas1; gtlbe->mas2 = vcpu_e500->mas2; - gtlbe->mas3 = vcpu_e500->mas3; - gtlbe->mas7 = vcpu_e500->mas7; + gtlbe->mas7_3 = vcpu_e500->mas7_3; trace_kvm_gtlb_write(vcpu_e500->mas0, gtlbe->mas1, gtlbe->mas2, - gtlbe->mas3, gtlbe->mas7); + (u32)gtlbe->mas7_3, (u32)(gtlbe->mas7_3 >> 32)); /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */ if (tlbe_is_host_safe(vcpu, gtlbe)) { - struct tlbe stlbe; + struct kvm_book3e_206_tlb_entry stlbe; int stlbsel, sesel; u64 eaddr; u64 raddr; @@ -996,9 +993,11 @@ gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index, gva_t eaddr) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - struct tlbe *gtlbe = - &vcpu_e500->gtlb_arch[tlbsel_of(index)][esel_of(index)]; - u64 pgmask = get_tlb_bytes(gtlbe) - 1; + struct kvm_book3e_206_tlb_entry *gtlbe; + u64 pgmask; + + gtlbe = get_entry(vcpu_e500, tlbsel_of(index), esel_of(index)); + pgmask = get_tlb_bytes(gtlbe) - 1; return get_tlb_raddr(gtlbe) | (eaddr & pgmask); } @@ -1012,12 +1011,12 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); struct tlbe_priv *priv; - struct tlbe *gtlbe, stlbe; + struct kvm_book3e_206_tlb_entry *gtlbe, stlbe; int tlbsel = tlbsel_of(index); int esel = esel_of(index); int stlbsel, sesel; - gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; + gtlbe = get_entry(vcpu_e500, tlbsel, esel); switch (tlbsel) { case 0: @@ -1073,25 +1072,174 @@ void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid) void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500) { - struct tlbe *tlbe; + struct kvm_book3e_206_tlb_entry *tlbe; /* Insert large initial mapping for guest. */ - tlbe = &vcpu_e500->gtlb_arch[1][0]; + tlbe = get_entry(vcpu_e500, 1, 0); tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M); tlbe->mas2 = 0; - tlbe->mas3 = E500_TLB_SUPER_PERM_MASK; - tlbe->mas7 = 0; + tlbe->mas7_3 = E500_TLB_SUPER_PERM_MASK; /* 4K map for serial output. Used by kernel wrapper. */ - tlbe = &vcpu_e500->gtlb_arch[1][1]; + tlbe = get_entry(vcpu_e500, 1, 1); tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K); tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G; - tlbe->mas3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK; - tlbe->mas7 = 0; + tlbe->mas7_3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK; +} + +static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + int i; + + clear_tlb_refs(vcpu_e500); + kfree(vcpu_e500->gtlb_priv[0]); + kfree(vcpu_e500->gtlb_priv[1]); + + if (vcpu_e500->shared_tlb_pages) { + vfree((void *)(round_down((uintptr_t)vcpu_e500->gtlb_arch, + PAGE_SIZE))); + + for (i = 0; i < vcpu_e500->num_shared_tlb_pages; i++) { + set_page_dirty_lock(vcpu_e500->shared_tlb_pages[i]); + put_page(vcpu_e500->shared_tlb_pages[i]); + } + + vcpu_e500->num_shared_tlb_pages = 0; + vcpu_e500->shared_tlb_pages = NULL; + } else { + kfree(vcpu_e500->gtlb_arch); + } + + vcpu_e500->gtlb_arch = NULL; +} + +int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, + struct kvm_config_tlb *cfg) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + struct kvm_book3e_206_tlb_params params; + char *virt; + struct page **pages; + struct tlbe_priv *privs[2] = {}; + size_t array_len; + u32 sets; + int num_pages, ret, i; + + if (cfg->mmu_type != KVM_MMU_FSL_BOOKE_NOHV) + return -EINVAL; + + if (copy_from_user(¶ms, (void __user *)(uintptr_t)cfg->params, + sizeof(params))) + return -EFAULT; + + if (params.tlb_sizes[1] > 64) + return -EINVAL; + if (params.tlb_ways[1] != params.tlb_sizes[1]) + return -EINVAL; + if (params.tlb_sizes[2] != 0 || params.tlb_sizes[3] != 0) + return -EINVAL; + if (params.tlb_ways[2] != 0 || params.tlb_ways[3] != 0) + return -EINVAL; + + if (!is_power_of_2(params.tlb_ways[0])) + return -EINVAL; + + sets = params.tlb_sizes[0] >> ilog2(params.tlb_ways[0]); + if (!is_power_of_2(sets)) + return -EINVAL; + + array_len = params.tlb_sizes[0] + params.tlb_sizes[1]; + array_len *= sizeof(struct kvm_book3e_206_tlb_entry); + + if (cfg->array_len < array_len) + return -EINVAL; + + num_pages = DIV_ROUND_UP(cfg->array + array_len - 1, PAGE_SIZE) - + cfg->array / PAGE_SIZE; + pages = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL); + if (!pages) + return -ENOMEM; + + ret = get_user_pages_fast(cfg->array, num_pages, 1, pages); + if (ret < 0) + goto err_pages; + + if (ret != num_pages) { + num_pages = ret; + ret = -EFAULT; + goto err_put_page; + } + + virt = vmap(pages, num_pages, VM_MAP, PAGE_KERNEL); + if (!virt) + goto err_put_page; + + privs[0] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[0], + GFP_KERNEL); + privs[1] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[1], + GFP_KERNEL); + + if (!privs[0] || !privs[1]) + goto err_put_page; + + free_gtlb(vcpu_e500); + + vcpu_e500->gtlb_priv[0] = privs[0]; + vcpu_e500->gtlb_priv[1] = privs[1]; + + vcpu_e500->gtlb_arch = (struct kvm_book3e_206_tlb_entry *) + (virt + (cfg->array & (PAGE_SIZE - 1))); + + vcpu_e500->gtlb_params[0].entries = params.tlb_sizes[0]; + vcpu_e500->gtlb_params[1].entries = params.tlb_sizes[1]; + + vcpu_e500->gtlb_offset[0] = 0; + vcpu_e500->gtlb_offset[1] = params.tlb_sizes[0]; + + vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & ~0xfffUL; + if (params.tlb_sizes[0] <= 2048) + vcpu_e500->tlb0cfg |= params.tlb_sizes[0]; + + vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) & ~0xfffUL; + vcpu_e500->tlb1cfg |= params.tlb_sizes[1]; + + vcpu_e500->shared_tlb_pages = pages; + vcpu_e500->num_shared_tlb_pages = num_pages; + + vcpu_e500->gtlb_params[0].ways = params.tlb_ways[0]; + vcpu_e500->gtlb_params[0].sets = sets; + + vcpu_e500->gtlb_params[1].ways = params.tlb_sizes[1]; + vcpu_e500->gtlb_params[1].sets = 1; + + return 0; + +err_put_page: + kfree(privs[0]); + kfree(privs[1]); + + for (i = 0; i < num_pages; i++) + put_page(pages[i]); + +err_pages: + kfree(pages); + return ret; +} + +int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu, + struct kvm_dirty_tlb *dirty) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + clear_tlb_refs(vcpu_e500); + return 0; } int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) { + int entry_size = sizeof(struct kvm_book3e_206_tlb_entry); + int entries = KVM_E500_TLB0_SIZE + KVM_E500_TLB1_SIZE; + host_tlb_params[0].entries = mfspr(SPRN_TLB0CFG) & TLBnCFG_N_ENTRY; host_tlb_params[1].entries = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; @@ -1124,17 +1272,22 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) host_tlb_params[0].entries / host_tlb_params[0].ways; host_tlb_params[1].sets = 1; - vcpu_e500->gtlb_size[0] = KVM_E500_TLB0_SIZE; - vcpu_e500->gtlb_arch[0] = - kzalloc(sizeof(struct tlbe) * KVM_E500_TLB0_SIZE, GFP_KERNEL); - if (vcpu_e500->gtlb_arch[0] == NULL) - goto err; + vcpu_e500->gtlb_params[0].entries = KVM_E500_TLB0_SIZE; + vcpu_e500->gtlb_params[1].entries = KVM_E500_TLB1_SIZE; - vcpu_e500->gtlb_size[1] = KVM_E500_TLB1_SIZE; - vcpu_e500->gtlb_arch[1] = - kzalloc(sizeof(struct tlbe) * KVM_E500_TLB1_SIZE, GFP_KERNEL); - if (vcpu_e500->gtlb_arch[1] == NULL) - goto err; + vcpu_e500->gtlb_params[0].ways = KVM_E500_TLB0_WAY_NUM; + vcpu_e500->gtlb_params[0].sets = + KVM_E500_TLB0_SIZE / KVM_E500_TLB0_WAY_NUM; + + vcpu_e500->gtlb_params[1].ways = KVM_E500_TLB1_SIZE; + vcpu_e500->gtlb_params[1].sets = 1; + + vcpu_e500->gtlb_arch = kmalloc(entries * entry_size, GFP_KERNEL); + if (!vcpu_e500->gtlb_arch) + return -ENOMEM; + + vcpu_e500->gtlb_offset[0] = 0; + vcpu_e500->gtlb_offset[1] = KVM_E500_TLB0_SIZE; vcpu_e500->tlb_refs[0] = kzalloc(sizeof(struct tlbe_ref) * host_tlb_params[0].entries, @@ -1148,15 +1301,15 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) if (!vcpu_e500->tlb_refs[1]) goto err; - vcpu_e500->gtlb_priv[0] = - kzalloc(sizeof(struct tlbe_ref) * vcpu_e500->gtlb_size[0], - GFP_KERNEL); + vcpu_e500->gtlb_priv[0] = kzalloc(sizeof(struct tlbe_ref) * + vcpu_e500->gtlb_params[0].entries, + GFP_KERNEL); if (!vcpu_e500->gtlb_priv[0]) goto err; - vcpu_e500->gtlb_priv[1] = - kzalloc(sizeof(struct tlbe_ref) * vcpu_e500->gtlb_size[1], - GFP_KERNEL); + vcpu_e500->gtlb_priv[1] = kzalloc(sizeof(struct tlbe_ref) * + vcpu_e500->gtlb_params[1].entries, + GFP_KERNEL); if (!vcpu_e500->gtlb_priv[1]) goto err; @@ -1165,32 +1318,24 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) /* Init TLB configuration register */ vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & ~0xfffUL; - vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_size[0]; + vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_params[0].entries; vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) & ~0xfffUL; - vcpu_e500->tlb1cfg |= vcpu_e500->gtlb_size[1]; + vcpu_e500->tlb1cfg |= vcpu_e500->gtlb_params[1].entries; return 0; err: + free_gtlb(vcpu_e500); kfree(vcpu_e500->tlb_refs[0]); kfree(vcpu_e500->tlb_refs[1]); - kfree(vcpu_e500->gtlb_priv[0]); - kfree(vcpu_e500->gtlb_priv[1]); - kfree(vcpu_e500->gtlb_arch[0]); - kfree(vcpu_e500->gtlb_arch[1]); return -1; } void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500) { - clear_tlb_refs(vcpu_e500); - + free_gtlb(vcpu_e500); kvmppc_e500_id_table_free(vcpu_e500); kfree(vcpu_e500->tlb_refs[0]); kfree(vcpu_e500->tlb_refs[1]); - kfree(vcpu_e500->gtlb_priv[0]); - kfree(vcpu_e500->gtlb_priv[1]); - kfree(vcpu_e500->gtlb_arch[1]); - kfree(vcpu_e500->gtlb_arch[0]); } diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500_tlb.h index b587f691459a..2c296407e759 100644 --- a/arch/powerpc/kvm/e500_tlb.h +++ b/arch/powerpc/kvm/e500_tlb.h @@ -20,13 +20,9 @@ #include #include -#define KVM_E500_TLB0_WAY_SIZE_BIT 7 /* Fixed */ -#define KVM_E500_TLB0_WAY_SIZE (1UL << KVM_E500_TLB0_WAY_SIZE_BIT) -#define KVM_E500_TLB0_WAY_SIZE_MASK (KVM_E500_TLB0_WAY_SIZE - 1) - -#define KVM_E500_TLB0_WAY_NUM_BIT 1 /* No greater than 7 */ -#define KVM_E500_TLB0_WAY_NUM (1UL << KVM_E500_TLB0_WAY_NUM_BIT) -#define KVM_E500_TLB0_WAY_NUM_MASK (KVM_E500_TLB0_WAY_NUM - 1) +/* This geometry is the legacy default -- can be overridden by userspace */ +#define KVM_E500_TLB0_WAY_SIZE 128 +#define KVM_E500_TLB0_WAY_NUM 2 #define KVM_E500_TLB0_SIZE (KVM_E500_TLB0_WAY_SIZE * KVM_E500_TLB0_WAY_NUM) #define KVM_E500_TLB1_SIZE 16 @@ -58,50 +54,54 @@ extern void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *); extern void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *); /* TLB helper functions */ -static inline unsigned int get_tlb_size(const struct tlbe *tlbe) +static inline unsigned int +get_tlb_size(const struct kvm_book3e_206_tlb_entry *tlbe) { return (tlbe->mas1 >> 7) & 0x1f; } -static inline gva_t get_tlb_eaddr(const struct tlbe *tlbe) +static inline gva_t get_tlb_eaddr(const struct kvm_book3e_206_tlb_entry *tlbe) { return tlbe->mas2 & 0xfffff000; } -static inline u64 get_tlb_bytes(const struct tlbe *tlbe) +static inline u64 get_tlb_bytes(const struct kvm_book3e_206_tlb_entry *tlbe) { unsigned int pgsize = get_tlb_size(tlbe); return 1ULL << 10 << pgsize; } -static inline gva_t get_tlb_end(const struct tlbe *tlbe) +static inline gva_t get_tlb_end(const struct kvm_book3e_206_tlb_entry *tlbe) { u64 bytes = get_tlb_bytes(tlbe); return get_tlb_eaddr(tlbe) + bytes - 1; } -static inline u64 get_tlb_raddr(const struct tlbe *tlbe) +static inline u64 get_tlb_raddr(const struct kvm_book3e_206_tlb_entry *tlbe) { - u64 rpn = tlbe->mas7; - return (rpn << 32) | (tlbe->mas3 & 0xfffff000); + return tlbe->mas7_3 & ~0xfffULL; } -static inline unsigned int get_tlb_tid(const struct tlbe *tlbe) +static inline unsigned int +get_tlb_tid(const struct kvm_book3e_206_tlb_entry *tlbe) { return (tlbe->mas1 >> 16) & 0xff; } -static inline unsigned int get_tlb_ts(const struct tlbe *tlbe) +static inline unsigned int +get_tlb_ts(const struct kvm_book3e_206_tlb_entry *tlbe) { return (tlbe->mas1 >> 12) & 0x1; } -static inline unsigned int get_tlb_v(const struct tlbe *tlbe) +static inline unsigned int +get_tlb_v(const struct kvm_book3e_206_tlb_entry *tlbe) { return (tlbe->mas1 >> 31) & 0x1; } -static inline unsigned int get_tlb_iprot(const struct tlbe *tlbe) +static inline unsigned int +get_tlb_iprot(const struct kvm_book3e_206_tlb_entry *tlbe) { return (tlbe->mas1 >> 30) & 0x1; } @@ -156,7 +156,7 @@ static inline unsigned int get_tlb_esel_bit( } static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu, - const struct tlbe *tlbe) + const struct kvm_book3e_206_tlb_entry *tlbe) { gpa_t gpa; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index a5671616af86..3cf6fba513ac 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -222,6 +222,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PPC_PAIRED_SINGLES: case KVM_CAP_PPC_OSI: case KVM_CAP_PPC_GET_PVINFO: +#ifdef CONFIG_KVM_E500 + case KVM_CAP_SW_TLB: +#endif r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -602,6 +605,19 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, r = 0; vcpu->arch.papr_enabled = true; break; +#ifdef CONFIG_KVM_E500 + case KVM_CAP_SW_TLB: { + struct kvm_config_tlb cfg; + void __user *user_ptr = (void __user *)(uintptr_t)cap->args[0]; + + r = -EFAULT; + if (copy_from_user(&cfg, user_ptr, sizeof(cfg))) + break; + + r = kvm_vcpu_ioctl_config_tlb(vcpu, &cfg); + break; + } +#endif default: r = -EINVAL; break; @@ -651,6 +667,18 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); break; } + +#ifdef CONFIG_KVM_E500 + case KVM_DIRTY_TLB: { + struct kvm_dirty_tlb dirty; + r = -EFAULT; + if (copy_from_user(&dirty, argp, sizeof(dirty))) + goto out; + r = kvm_vcpu_ioctl_dirty_tlb(vcpu, &dirty); + break; + } +#endif + default: r = -EINVAL; } diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 245bcb3a0fcd..fa029ced4bd5 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -581,6 +581,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_PPC_RMA 65 #define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */ #define KVM_CAP_PPC_PAPR 68 +#define KVM_CAP_SW_TLB 69 #define KVM_CAP_S390_GMAP 71 #define KVM_CAP_TSC_DEADLINE_TIMER 72 #define KVM_CAP_S390_UCONTROL 73 @@ -664,6 +665,21 @@ struct kvm_clock_data { __u32 pad[9]; }; +#define KVM_MMU_FSL_BOOKE_NOHV 0 +#define KVM_MMU_FSL_BOOKE_HV 1 + +struct kvm_config_tlb { + __u64 params; + __u64 array; + __u32 mmu_type; + __u32 array_len; +}; + +struct kvm_dirty_tlb { + __u64 bitmap; + __u32 num_dirty; +}; + /* * ioctls for VM fds */ @@ -801,6 +817,8 @@ struct kvm_s390_ucas_mapping { #define KVM_CREATE_SPAPR_TCE _IOW(KVMIO, 0xa8, struct kvm_create_spapr_tce) /* Available with KVM_CAP_RMA */ #define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma) +/* Available with KVM_CAP_SW_TLB */ +#define KVM_DIRTY_TLB _IOW(KVMIO, 0xaa, struct kvm_dirty_tlb) #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) -- cgit v1.2.3 From 303b7c97e369ec3d7ab7ba0551030160ce3f838a Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Thu, 18 Aug 2011 15:25:23 -0500 Subject: KVM: PPC: e500: tlbsx: fix tlb0 esel It should contain the way, not the absolute TLB0 index. Signed-off-by: Scott Wood Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kvm/e500_tlb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index f19ae2f61521..ec17148392b4 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -853,6 +853,8 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) } if (gtlbe) { + esel &= vcpu_e500->gtlb_params[tlbsel].ways - 1; + vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(esel) | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); vcpu_e500->mas1 = gtlbe->mas1; -- cgit v1.2.3 From 841741f23b91088810e657a535b8aa683136d870 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Fri, 2 Sep 2011 17:39:37 -0500 Subject: KVM: PPC: e500: Don't hardcode PIR=0 The hardcoded behavior prevents proper SMP support. user space shall specify the vcpu's PIR as the vcpu id. Signed-off-by: Scott Wood Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kvm/booke.c | 4 ++-- arch/powerpc/kvm/e500.c | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index bb6c988f010a..b64220079d56 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -761,7 +761,7 @@ static void get_sregs_arch206(struct kvm_vcpu *vcpu, { sregs->u.e.features |= KVM_SREGS_E_ARCH206; - sregs->u.e.pir = 0; + sregs->u.e.pir = vcpu->vcpu_id; sregs->u.e.mcsrr0 = vcpu->arch.mcsrr0; sregs->u.e.mcsrr1 = vcpu->arch.mcsrr1; sregs->u.e.decar = vcpu->arch.decar; @@ -774,7 +774,7 @@ static int set_sregs_arch206(struct kvm_vcpu *vcpu, if (!(sregs->u.e.features & KVM_SREGS_E_ARCH206)) return 0; - if (sregs->u.e.pir != 0) + if (sregs->u.e.pir != vcpu->vcpu_id) return -EINVAL; vcpu->arch.mcsrr0 = sregs->u.e.mcsrr0; diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index f17d7e732a1e..ac3c4bf21677 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -71,9 +71,6 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.pvr = mfspr(SPRN_PVR); vcpu_e500->svr = mfspr(SPRN_SVR); - /* Since booke kvm only support one core, update all vcpus' PIR to 0 */ - vcpu->vcpu_id = 0; - vcpu->arch.cpu_type = KVM_CPU_E500V2; return 0; -- cgit v1.2.3 From 95325e6b190bb4ec3383aa1241d10675057bff45 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 20 Sep 2011 01:31:48 +0200 Subject: KVM: PPC: E500: Support hugetlbfs With hugetlbfs support emerging on e500, we should also support KVM backing its guest memory by it. This patch adds support for hugetlbfs into the e500 shadow mmu code. Signed-off-by: Alexander Graf Acked-by: Scott Wood Signed-off-by: Avi Kivity --- arch/powerpc/kvm/e500_tlb.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index ec17148392b4..6fefb9144f23 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -673,12 +674,31 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, pfn &= ~(tsize_pages - 1); break; } + } else if (vma && hva >= vma->vm_start && + (vma->vm_flags & VM_HUGETLB)) { + unsigned long psize = vma_kernel_pagesize(vma); + + tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >> + MAS1_TSIZE_SHIFT; + + /* + * Take the largest page size that satisfies both host + * and guest mapping + */ + tsize = min(__ilog2(psize) - 10, tsize); + + /* + * e500 doesn't implement the lowest tsize bit, + * or 1K pages. + */ + tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1); } up_read(¤t->mm->mmap_sem); } if (likely(!pfnmap)) { + unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT); pfn = gfn_to_pfn_memslot(vcpu_e500->vcpu.kvm, slot, gfn); if (is_error_pfn(pfn)) { printk(KERN_ERR "Couldn't get real page for gfn %lx!\n", @@ -686,6 +706,10 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, kvm_release_pfn_clean(pfn); return; } + + /* Align guest and physical address to page map boundaries */ + pfn &= ~(tsize_pages - 1); + gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1); } /* Drop old ref and setup new one. */ -- cgit v1.2.3 From f9208427f72e6cb52c71767af3bf8c14c43c27ac Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Thu, 13 Oct 2011 15:17:08 +0530 Subject: PPC: Fix race in mtmsr paravirt implementation The current implementation of mtmsr and mtmsrd are racy in that it does: * check (int_pending == 0) ---> host sets int_pending = 1 <--- * write shared page * done while instead we should check for int_pending after the shared page is written. Signed-off-by: Bharat Bhushan Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kernel/kvm_emul.S | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S index f2b1b2523e61..3d64c5704fd5 100644 --- a/arch/powerpc/kernel/kvm_emul.S +++ b/arch/powerpc/kernel/kvm_emul.S @@ -167,6 +167,9 @@ maybe_stay_in_guest: kvm_emulate_mtmsr_reg2: ori r30, r0, 0 + /* Put MSR into magic page because we don't call mtmsr */ + STL64(r30, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) + /* Check if we have to fetch an interrupt */ lwz r31, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0) cmpwi r31, 0 @@ -174,15 +177,10 @@ kvm_emulate_mtmsr_reg2: /* Check if we may trigger an interrupt */ andi. r31, r30, MSR_EE - beq no_mtmsr - - b do_mtmsr + bne do_mtmsr no_mtmsr: - /* Put MSR into magic page because we don't call mtmsr */ - STL64(r30, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) - SCRATCH_RESTORE /* Go back to caller */ -- cgit v1.2.3 From dc2babfea592e633cf3890294af6545609b466be Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Wed, 19 Oct 2011 09:46:06 +0530 Subject: KVM: PPC: Fix DEC truncation for greater than 0xffff_ffff/1000 kvmppc_emulate_dec() uses dec_nsec of type unsigned long and does below calculation: dec_nsec = vcpu->arch.dec; dec_nsec *= 1000; This will truncate if DEC value "vcpu->arch.dec" is greater than 0xffff_ffff/1000. For example : For tb_ticks_per_usec = 4a, we can not set decrementer more than ~58ms. Signed-off-by: Bharat Bhushan Acked-by: Liu Yu Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kvm/emulate.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index 141dce3c6810..4337f99fa0fa 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -84,6 +84,7 @@ static int kvmppc_dec_enabled(struct kvm_vcpu *vcpu) void kvmppc_emulate_dec(struct kvm_vcpu *vcpu) { unsigned long dec_nsec; + unsigned long long dec_time; pr_debug("mtDEC: %x\n", vcpu->arch.dec); #ifdef CONFIG_PPC_BOOK3S @@ -103,11 +104,12 @@ void kvmppc_emulate_dec(struct kvm_vcpu *vcpu) * host ticks. */ hrtimer_try_to_cancel(&vcpu->arch.dec_timer); - dec_nsec = vcpu->arch.dec; - dec_nsec *= 1000; - dec_nsec /= tb_ticks_per_usec; - hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec), - HRTIMER_MODE_REL); + dec_time = vcpu->arch.dec; + dec_time *= 1000; + do_div(dec_time, tb_ticks_per_usec); + dec_nsec = do_div(dec_time, NSEC_PER_SEC); + hrtimer_start(&vcpu->arch.dec_timer, + ktime_set(dec_time, dec_nsec), HRTIMER_MODE_REL); vcpu->arch.dec_jiffies = get_tb(); } else { hrtimer_try_to_cancel(&vcpu->arch.dec_timer); -- cgit v1.2.3 From 7401f6266de021990efc7bc212289219ba8f9a7a Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Mon, 31 Oct 2011 14:22:08 +0530 Subject: KVM: PPC: booke: Do Not start decrementer when SPRN_DEC set 0 As per specification the decrementer interrupt not happen when DEC is written with 0. Also when DEC is zero, no decrementer running. So we should not start hrtimer for decrementer when DEC = 0. Signed-off-by: Bharat Bhushan Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kvm/emulate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index 4337f99fa0fa..b6df56dd93ba 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -77,7 +77,8 @@ static int kvmppc_dec_enabled(struct kvm_vcpu *vcpu) #else static int kvmppc_dec_enabled(struct kvm_vcpu *vcpu) { - return vcpu->arch.tcr & TCR_DIE; + /* On BOOKE, DEC = 0 is as good as decrementer not enabled */ + return (vcpu->arch.tcr & TCR_DIE) && vcpu->arch.dec; } #endif -- cgit v1.2.3 From 1d1ef22208876511224a8797657e40e287e1f93d Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Tue, 8 Nov 2011 16:11:59 -0600 Subject: KVM: PPC: booke: check for signals in kvmppc_vcpu_run Currently we check prior to returning from a lightweight exit, but not prior to initial entry. book3s already does a similar test. Signed-off-by: Scott Wood Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kvm/booke.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index b64220079d56..9c785896e867 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -322,11 +322,19 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) } local_irq_disable(); + + if (signal_pending(current)) { + kvm_run->exit_reason = KVM_EXIT_INTR; + ret = -EINTR; + goto out; + } + kvm_guest_enter(); ret = __kvmppc_vcpu_run(kvm_run, vcpu); kvm_guest_exit(); - local_irq_enable(); +out: + local_irq_enable(); return ret; } -- cgit v1.2.3 From 7e28e60ef974d0eeb43112ef264d8c130f7b7bf4 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Tue, 8 Nov 2011 18:23:20 -0600 Subject: KVM: PPC: Rename deliver_interrupts to prepare_to_enter This function also updates paravirt int_pending, so rename it to be more obvious that this is a collection of checks run prior to (re)entering a guest. Signed-off-by: Scott Wood Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_ppc.h | 2 +- arch/powerpc/kvm/book3s.c | 2 +- arch/powerpc/kvm/book3s_hv.c | 6 +++--- arch/powerpc/kvm/book3s_pr.c | 2 +- arch/powerpc/kvm/booke.c | 4 ++-- arch/powerpc/kvm/powerpc.c | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index a284f209e2df..c089927f64cc 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -94,7 +94,7 @@ extern int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu, extern void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu); extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu); -extern void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu); +extern void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu); extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu); extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags); extern void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu); diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index e41ac6f7dcf1..73fc9f046107 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -258,7 +258,7 @@ static bool clear_irqprio(struct kvm_vcpu *vcpu, unsigned int priority) return true; } -void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) +void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) { unsigned long *pending = &vcpu->arch.pending_exceptions; unsigned long old_pending = vcpu->arch.pending_exceptions; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 336983da9e72..536adee59c07 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -482,7 +482,7 @@ static void kvmppc_set_timer(struct kvm_vcpu *vcpu) if (now > vcpu->arch.dec_expires) { /* decrementer has already gone negative */ kvmppc_core_queue_dec(vcpu); - kvmppc_core_deliver_interrupts(vcpu); + kvmppc_core_prepare_to_enter(vcpu); return; } dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC @@ -797,7 +797,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) list_for_each_entry_safe(v, vn, &vc->runnable_threads, arch.run_list) { - kvmppc_core_deliver_interrupts(v); + kvmppc_core_prepare_to_enter(v); if (signal_pending(v->arch.run_task)) { kvmppc_remove_runnable(vc, v); v->stat.signal_exits++; @@ -857,7 +857,7 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) if (run->exit_reason == KVM_EXIT_PAPR_HCALL && !(vcpu->arch.shregs.msr & MSR_PR)) { r = kvmppc_pseries_do_hcall(vcpu); - kvmppc_core_deliver_interrupts(vcpu); + kvmppc_core_prepare_to_enter(vcpu); } } while (r == RESUME_GUEST); return r; diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index e2cfb9e1e20e..f3628581fb7c 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -764,7 +764,7 @@ program_interrupt: /* In case an interrupt came in that was triggered * from userspace (like DEC), we need to check what * to inject now! */ - kvmppc_core_deliver_interrupts(vcpu); + kvmppc_core_prepare_to_enter(vcpu); } } diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 9c785896e867..e082e348c882 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -289,7 +289,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, } /* Check pending exceptions and deliver one, if possible. */ -void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) +void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) { unsigned long *pending = &vcpu->arch.pending_exceptions; unsigned long old_pending = vcpu->arch.pending_exceptions; @@ -611,7 +611,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, local_irq_disable(); - kvmppc_core_deliver_interrupts(vcpu); + kvmppc_core_prepare_to_enter(vcpu); if (!(r & RESUME_HOST)) { /* To avoid clobbering exit_reason, only check for signals if diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 3cf6fba513ac..6186ec0d939b 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -559,7 +559,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) vcpu->arch.hcall_needed = 0; } - kvmppc_core_deliver_interrupts(vcpu); + kvmppc_core_prepare_to_enter(vcpu); r = kvmppc_vcpu_run(run, vcpu); -- cgit v1.2.3 From 25051b5a5aff0bb71435421b4b80279b789fa0dc Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Tue, 8 Nov 2011 18:23:23 -0600 Subject: KVM: PPC: Move prepare_to_enter call site into subarch code This function should be called with interrupts disabled, to avoid a race where an exception is delivered after we check, but the resched kick is received before we disable interrupts (and thus doesn't actually trigger the exit code that would recheck exceptions). booke already does this properly in the lightweight exit case, but not on initial entry. For now, move the call of prepare_to_enter into subarch-specific code so that booke can do the right thing here. Ideally book3s would do the same thing, but I'm having a hard time seeing where it does any interrupt disabling of this sort (plus it has several additional call sites), so I'm deferring the book3s fix to someone more familiar with that code. book3s behavior should be unchanged by this patch. Signed-off-by: Scott Wood Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kvm/book3s_hv.c | 2 ++ arch/powerpc/kvm/book3s_pr.c | 2 ++ arch/powerpc/kvm/booke.c | 4 ++++ arch/powerpc/kvm/powerpc.c | 2 -- 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 536adee59c07..b1e3b9c1326a 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -836,6 +836,8 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) return -EINVAL; } + kvmppc_core_prepare_to_enter(vcpu); + /* No need to go into the guest when all we'll do is come back out */ if (signal_pending(current)) { run->exit_reason = KVM_EXIT_INTR; diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index f3628581fb7c..203a7b7b58b9 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -929,6 +929,8 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return -EINVAL; } + kvmppc_core_prepare_to_enter(vcpu); + /* No need to go into the guest when all we do is going out */ if (signal_pending(current)) { kvm_run->exit_reason = KVM_EXIT_INTR; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index e082e348c882..feaefc433276 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -295,6 +295,8 @@ void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) unsigned long old_pending = vcpu->arch.pending_exceptions; unsigned int priority; + WARN_ON_ONCE(!irqs_disabled()); + priority = __ffs(*pending); while (priority <= BOOKE_IRQPRIO_MAX) { if (kvmppc_booke_irqprio_deliver(vcpu, priority)) @@ -323,6 +325,8 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) local_irq_disable(); + kvmppc_core_prepare_to_enter(vcpu); + if (signal_pending(current)) { kvm_run->exit_reason = KVM_EXIT_INTR; ret = -EINTR; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 6186ec0d939b..7411bdd8ff6f 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -559,8 +559,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) vcpu->arch.hcall_needed = 0; } - kvmppc_core_prepare_to_enter(vcpu); - r = kvmppc_vcpu_run(run, vcpu); if (vcpu->sigset_active) -- cgit v1.2.3 From c59a6a3e4e5976a938e21faf3da65a2784187aa7 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Tue, 8 Nov 2011 18:23:25 -0600 Subject: KVM: PPC: booke: Check for MSR[WE] in prepare_to_enter This prevents us from inappropriately blocking in a KVM_SET_REGS ioctl -- the MSR[WE] will take effect when the guest is next entered. It also causes SRR1[WE] to be set when we enter the guest's interrupt handler, which is what e500 hardware is documented to do. Signed-off-by: Scott Wood Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kvm/booke.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index feaefc433276..557f028a9ad4 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -124,12 +124,6 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) vcpu->arch.shared->msr = new_msr; kvmppc_mmu_msr_notify(vcpu, old_msr); - - if (vcpu->arch.shared->msr & MSR_WE) { - kvm_vcpu_block(vcpu); - kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS); - }; - kvmppc_vcpu_sync_spe(vcpu); } @@ -288,15 +282,12 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, return allowed; } -/* Check pending exceptions and deliver one, if possible. */ -void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) +static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu) { unsigned long *pending = &vcpu->arch.pending_exceptions; unsigned long old_pending = vcpu->arch.pending_exceptions; unsigned int priority; - WARN_ON_ONCE(!irqs_disabled()); - priority = __ffs(*pending); while (priority <= BOOKE_IRQPRIO_MAX) { if (kvmppc_booke_irqprio_deliver(vcpu, priority)) @@ -314,6 +305,23 @@ void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) vcpu->arch.shared->int_pending = 0; } +/* Check pending exceptions and deliver one, if possible. */ +void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) +{ + WARN_ON_ONCE(!irqs_disabled()); + + kvmppc_core_check_exceptions(vcpu); + + if (vcpu->arch.shared->msr & MSR_WE) { + local_irq_enable(); + kvm_vcpu_block(vcpu); + local_irq_disable(); + + kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS); + kvmppc_core_check_exceptions(vcpu); + }; +} + int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { int ret; -- cgit v1.2.3 From 29ac26efbdc651303643025c83009ce5766c1676 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Tue, 8 Nov 2011 18:23:27 -0600 Subject: KVM: PPC: booke: Fix int_pending calculation for MSR[EE] paravirt int_pending was only being lowered if a bit in pending_exceptions was cleared during exception delivery -- but for interrupts, we clear it during IACK/TSR emulation. This caused paravirt for enabling MSR[EE] to be ineffective. Signed-off-by: Scott Wood Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kvm/booke.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 557f028a9ad4..8dfc59a8a715 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -285,7 +285,6 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu) { unsigned long *pending = &vcpu->arch.pending_exceptions; - unsigned long old_pending = vcpu->arch.pending_exceptions; unsigned int priority; priority = __ffs(*pending); @@ -299,10 +298,7 @@ static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu) } /* Tell the guest about our interrupt status */ - if (*pending) - vcpu->arch.shared->int_pending = 1; - else if (old_pending) - vcpu->arch.shared->int_pending = 0; + vcpu->arch.shared->int_pending = !!*pending; } /* Check pending exceptions and deliver one, if possible. */ -- cgit v1.2.3 From 940b45ec18cf00046b8b28299d97066a2c43d559 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Tue, 8 Nov 2011 18:23:28 -0600 Subject: KVM: PPC: booke: Paravirtualize wrtee Also fix wrteei 1 paravirt to check for a pending interrupt. Signed-off-by: Scott Wood Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kernel/kvm.c | 92 ++++++++++++++++++++++++++++++++++------ arch/powerpc/kernel/kvm_emul.S | 96 ++++++++++++++++++++++++++++++++---------- 2 files changed, 154 insertions(+), 34 deletions(-) diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c index 2985338d0e10..06b15ee997f7 100644 --- a/arch/powerpc/kernel/kvm.c +++ b/arch/powerpc/kernel/kvm.c @@ -1,5 +1,6 @@ /* * Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved. + * Copyright 2010-2011 Freescale Semiconductor, Inc. * * Authors: * Alexander Graf @@ -29,6 +30,7 @@ #include #include #include +#include #define KVM_MAGIC_PAGE (-4096L) #define magic_var(x) KVM_MAGIC_PAGE + offsetof(struct kvm_vcpu_arch_shared, x) @@ -41,6 +43,7 @@ #define KVM_INST_B 0x48000000 #define KVM_INST_B_MASK 0x03ffffff #define KVM_INST_B_MAX 0x01ffffff +#define KVM_INST_LI 0x38000000 #define KVM_MASK_RT 0x03e00000 #define KVM_RT_30 0x03c00000 @@ -69,6 +72,7 @@ #define KVM_INST_MTMSRD_L1 0x7c010164 #define KVM_INST_MTMSR 0x7c000124 +#define KVM_INST_WRTEE 0x7c000106 #define KVM_INST_WRTEEI_0 0x7c000146 #define KVM_INST_WRTEEI_1 0x7c008146 @@ -270,26 +274,27 @@ static void kvm_patch_ins_mtmsr(u32 *inst, u32 rt) #ifdef CONFIG_BOOKE -extern u32 kvm_emulate_wrteei_branch_offs; -extern u32 kvm_emulate_wrteei_ee_offs; -extern u32 kvm_emulate_wrteei_len; -extern u32 kvm_emulate_wrteei[]; +extern u32 kvm_emulate_wrtee_branch_offs; +extern u32 kvm_emulate_wrtee_reg_offs; +extern u32 kvm_emulate_wrtee_orig_ins_offs; +extern u32 kvm_emulate_wrtee_len; +extern u32 kvm_emulate_wrtee[]; -static void kvm_patch_ins_wrteei(u32 *inst) +static void kvm_patch_ins_wrtee(u32 *inst, u32 rt, int imm_one) { u32 *p; int distance_start; int distance_end; ulong next_inst; - p = kvm_alloc(kvm_emulate_wrteei_len * 4); + p = kvm_alloc(kvm_emulate_wrtee_len * 4); if (!p) return; /* Find out where we are and put everything there */ distance_start = (ulong)p - (ulong)inst; next_inst = ((ulong)inst + 4); - distance_end = next_inst - (ulong)&p[kvm_emulate_wrteei_branch_offs]; + distance_end = next_inst - (ulong)&p[kvm_emulate_wrtee_branch_offs]; /* Make sure we only write valid b instructions */ if (distance_start > KVM_INST_B_MAX) { @@ -298,10 +303,65 @@ static void kvm_patch_ins_wrteei(u32 *inst) } /* Modify the chunk to fit the invocation */ - memcpy(p, kvm_emulate_wrteei, kvm_emulate_wrteei_len * 4); - p[kvm_emulate_wrteei_branch_offs] |= distance_end & KVM_INST_B_MASK; - p[kvm_emulate_wrteei_ee_offs] |= (*inst & MSR_EE); - flush_icache_range((ulong)p, (ulong)p + kvm_emulate_wrteei_len * 4); + memcpy(p, kvm_emulate_wrtee, kvm_emulate_wrtee_len * 4); + p[kvm_emulate_wrtee_branch_offs] |= distance_end & KVM_INST_B_MASK; + + if (imm_one) { + p[kvm_emulate_wrtee_reg_offs] = + KVM_INST_LI | __PPC_RT(30) | MSR_EE; + } else { + /* Make clobbered registers work too */ + switch (get_rt(rt)) { + case 30: + kvm_patch_ins_ll(&p[kvm_emulate_wrtee_reg_offs], + magic_var(scratch2), KVM_RT_30); + break; + case 31: + kvm_patch_ins_ll(&p[kvm_emulate_wrtee_reg_offs], + magic_var(scratch1), KVM_RT_30); + break; + default: + p[kvm_emulate_wrtee_reg_offs] |= rt; + break; + } + } + + p[kvm_emulate_wrtee_orig_ins_offs] = *inst; + flush_icache_range((ulong)p, (ulong)p + kvm_emulate_wrtee_len * 4); + + /* Patch the invocation */ + kvm_patch_ins_b(inst, distance_start); +} + +extern u32 kvm_emulate_wrteei_0_branch_offs; +extern u32 kvm_emulate_wrteei_0_len; +extern u32 kvm_emulate_wrteei_0[]; + +static void kvm_patch_ins_wrteei_0(u32 *inst) +{ + u32 *p; + int distance_start; + int distance_end; + ulong next_inst; + + p = kvm_alloc(kvm_emulate_wrteei_0_len * 4); + if (!p) + return; + + /* Find out where we are and put everything there */ + distance_start = (ulong)p - (ulong)inst; + next_inst = ((ulong)inst + 4); + distance_end = next_inst - (ulong)&p[kvm_emulate_wrteei_0_branch_offs]; + + /* Make sure we only write valid b instructions */ + if (distance_start > KVM_INST_B_MAX) { + kvm_patching_worked = false; + return; + } + + memcpy(p, kvm_emulate_wrteei_0, kvm_emulate_wrteei_0_len * 4); + p[kvm_emulate_wrteei_0_branch_offs] |= distance_end & KVM_INST_B_MASK; + flush_icache_range((ulong)p, (ulong)p + kvm_emulate_wrteei_0_len * 4); /* Patch the invocation */ kvm_patch_ins_b(inst, distance_start); @@ -444,6 +504,11 @@ static void kvm_check_ins(u32 *inst, u32 features) case KVM_INST_MTMSRD_L0: kvm_patch_ins_mtmsr(inst, inst_rt); break; +#ifdef CONFIG_BOOKE + case KVM_INST_WRTEE: + kvm_patch_ins_wrtee(inst, inst_rt, 0); + break; +#endif } switch (inst_no_rt & ~KVM_MASK_RB) { @@ -461,8 +526,11 @@ static void kvm_check_ins(u32 *inst, u32 features) switch (_inst) { #ifdef CONFIG_BOOKE case KVM_INST_WRTEEI_0: + kvm_patch_ins_wrteei_0(inst); + break; + case KVM_INST_WRTEEI_1: - kvm_patch_ins_wrteei(inst); + kvm_patch_ins_wrtee(inst, 0, 1); break; #endif } diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S index 3d64c5704fd5..801058dd74db 100644 --- a/arch/powerpc/kernel/kvm_emul.S +++ b/arch/powerpc/kernel/kvm_emul.S @@ -13,6 +13,7 @@ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * * Copyright SUSE Linux Products GmbH 2010 + * Copyright 2010-2011 Freescale Semiconductor, Inc. * * Authors: Alexander Graf */ @@ -208,24 +209,80 @@ kvm_emulate_mtmsr_orig_ins_offs: kvm_emulate_mtmsr_len: .long (kvm_emulate_mtmsr_end - kvm_emulate_mtmsr) / 4 +/* also used for wrteei 1 */ +.global kvm_emulate_wrtee +kvm_emulate_wrtee: + SCRATCH_SAVE + + /* Fetch old MSR in r31 */ + LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) -.global kvm_emulate_wrteei -kvm_emulate_wrteei: + /* Insert new MSR[EE] */ +kvm_emulate_wrtee_reg: + ori r30, r0, 0 + rlwimi r31, r30, 0, MSR_EE + + /* + * If MSR[EE] is now set, check for a pending interrupt. + * We could skip this if MSR[EE] was already on, but that + * should be rare, so don't bother. + */ + andi. r30, r30, MSR_EE + /* Put MSR into magic page because we don't call wrtee */ + STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) + + beq no_wrtee + + /* Check if we have to fetch an interrupt */ + lwz r30, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0) + cmpwi r30, 0 + bne do_wrtee + +no_wrtee: + SCRATCH_RESTORE + + /* Go back to caller */ +kvm_emulate_wrtee_branch: + b . + +do_wrtee: + SCRATCH_RESTORE + + /* Just fire off the wrtee if it's critical */ +kvm_emulate_wrtee_orig_ins: + wrtee r0 + + b kvm_emulate_wrtee_branch + +kvm_emulate_wrtee_end: + +.global kvm_emulate_wrtee_branch_offs +kvm_emulate_wrtee_branch_offs: + .long (kvm_emulate_wrtee_branch - kvm_emulate_wrtee) / 4 + +.global kvm_emulate_wrtee_reg_offs +kvm_emulate_wrtee_reg_offs: + .long (kvm_emulate_wrtee_reg - kvm_emulate_wrtee) / 4 + +.global kvm_emulate_wrtee_orig_ins_offs +kvm_emulate_wrtee_orig_ins_offs: + .long (kvm_emulate_wrtee_orig_ins - kvm_emulate_wrtee) / 4 + +.global kvm_emulate_wrtee_len +kvm_emulate_wrtee_len: + .long (kvm_emulate_wrtee_end - kvm_emulate_wrtee) / 4 + +.global kvm_emulate_wrteei_0 +kvm_emulate_wrteei_0: SCRATCH_SAVE /* Fetch old MSR in r31 */ LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) /* Remove MSR_EE from old MSR */ - li r30, 0 - ori r30, r30, MSR_EE - andc r31, r31, r30 - - /* OR new MSR_EE onto the old MSR */ -kvm_emulate_wrteei_ee: - ori r31, r31, 0 + rlwinm r31, r31, 0, ~MSR_EE /* Write new MSR value back */ STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) @@ -233,22 +290,17 @@ kvm_emulate_wrteei_ee: SCRATCH_RESTORE /* Go back to caller */ -kvm_emulate_wrteei_branch: +kvm_emulate_wrteei_0_branch: b . -kvm_emulate_wrteei_end: - -.global kvm_emulate_wrteei_branch_offs -kvm_emulate_wrteei_branch_offs: - .long (kvm_emulate_wrteei_branch - kvm_emulate_wrteei) / 4 - -.global kvm_emulate_wrteei_ee_offs -kvm_emulate_wrteei_ee_offs: - .long (kvm_emulate_wrteei_ee - kvm_emulate_wrteei) / 4 +kvm_emulate_wrteei_0_end: -.global kvm_emulate_wrteei_len -kvm_emulate_wrteei_len: - .long (kvm_emulate_wrteei_end - kvm_emulate_wrteei) / 4 +.global kvm_emulate_wrteei_0_branch_offs +kvm_emulate_wrteei_0_branch_offs: + .long (kvm_emulate_wrteei_0_branch - kvm_emulate_wrteei_0) / 4 +.global kvm_emulate_wrteei_0_len +kvm_emulate_wrteei_0_len: + .long (kvm_emulate_wrteei_0_end - kvm_emulate_wrteei_0) / 4 .global kvm_emulate_mtsrin kvm_emulate_mtsrin: -- cgit v1.2.3 From b59049720dd95021dfe0d9f4e1fa9458a67cfe29 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Tue, 8 Nov 2011 18:23:30 -0600 Subject: KVM: PPC: Paravirtualize SPRG4-7, ESR, PIR, MASn This allows additional registers to be accessed by the guest in PR-mode KVM without trapping. SPRG4-7 are readable from userspace. On booke, KVM will sync these registers when it enters the guest, so that accesses from guest userspace will work. The guest kernel, OTOH, must consistently use either the real registers or the shared area between exits. This also applies to the already-paravirted SPRG3. On non-booke, it's not clear to what extent SPRG4-7 are supported (they're not architected for book3s, but exist on at least some classic chips). They are copied in the get/set regs ioctls, but I do not see any non-booke emulation. I also do not see any syncing with real registers (in PR-mode) including the user-readable SPRG3. This patch should not make that situation any worse. Signed-off-by: Scott Wood Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_e500.h | 8 -- arch/powerpc/include/asm/kvm_host.h | 6 -- arch/powerpc/include/asm/kvm_para.h | 31 +++++- arch/powerpc/kernel/asm-offsets.c | 15 ++- arch/powerpc/kernel/kvm.c | 204 ++++++++++++++++++++++++++++++------ arch/powerpc/kvm/book3s.c | 16 +-- arch/powerpc/kvm/booke.c | 23 ++-- arch/powerpc/kvm/booke_emulate.c | 12 +-- arch/powerpc/kvm/booke_interrupts.S | 18 ++-- arch/powerpc/kvm/e500.c | 24 ++--- arch/powerpc/kvm/e500_emulate.c | 38 ++++--- arch/powerpc/kvm/e500_tlb.c | 83 ++++++++------- arch/powerpc/kvm/e500_tlb.h | 25 ++--- arch/powerpc/kvm/emulate.c | 3 +- arch/powerpc/kvm/powerpc.c | 2 +- 15 files changed, 339 insertions(+), 169 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_e500.h b/arch/powerpc/include/asm/kvm_e500.h index bc17441535f2..8cd50a514271 100644 --- a/arch/powerpc/include/asm/kvm_e500.h +++ b/arch/powerpc/include/asm/kvm_e500.h @@ -71,14 +71,6 @@ struct kvmppc_vcpu_e500 { u32 pid[E500_PID_NUM]; u32 svr; - u32 mas0; - u32 mas1; - u32 mas2; - u64 mas7_3; - u32 mas4; - u32 mas5; - u32 mas6; - /* vcpu id table */ struct vcpu_id_table *idt; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index bf8af5d5d5dc..bfd0c9912da5 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -318,10 +318,6 @@ struct kvm_vcpu_arch { u32 vrsave; /* also USPRG0 */ u32 mmucr; ulong shadow_msr; - ulong sprg4; - ulong sprg5; - ulong sprg6; - ulong sprg7; ulong csrr0; ulong csrr1; ulong dsrr0; @@ -329,7 +325,6 @@ struct kvm_vcpu_arch { ulong mcsrr0; ulong mcsrr1; ulong mcsr; - ulong esr; u32 dec; u32 decar; u32 tbl; @@ -338,7 +333,6 @@ struct kvm_vcpu_arch { u32 tsr; u32 ivor[64]; ulong ivpr; - u32 pir; u32 pvr; u32 shadow_pid; diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h index 50533f9adf40..ece70fb36513 100644 --- a/arch/powerpc/include/asm/kvm_para.h +++ b/arch/powerpc/include/asm/kvm_para.h @@ -33,11 +33,35 @@ struct kvm_vcpu_arch_shared { __u64 sprg3; __u64 srr0; __u64 srr1; - __u64 dar; + __u64 dar; /* dear on BookE */ __u64 msr; __u32 dsisr; __u32 int_pending; /* Tells the guest if we have an interrupt */ __u32 sr[16]; + __u32 mas0; + __u32 mas1; + __u64 mas7_3; + __u64 mas2; + __u32 mas4; + __u32 mas6; + __u32 esr; + __u32 pir; + + /* + * SPRG4-7 are user-readable, so we can only keep these consistent + * between the shared area and the real registers when there's an + * intervening exit to KVM. This also applies to SPRG3 on some + * chips. + * + * This suffices for access by guest userspace, since in PR-mode + * KVM, an exit must occur when changing the guest's MSR[PR]. + * If the guest kernel writes to SPRG3-7 via the shared area, it + * must also use the shared area for reading while in kernel space. + */ + __u64 sprg4; + __u64 sprg5; + __u64 sprg6; + __u64 sprg7; }; #define KVM_SC_MAGIC_R0 0x4b564d21 /* "KVM!" */ @@ -47,7 +71,10 @@ struct kvm_vcpu_arch_shared { #define KVM_FEATURE_MAGIC_PAGE 1 -#define KVM_MAGIC_FEAT_SR (1 << 0) +#define KVM_MAGIC_FEAT_SR (1 << 0) + +/* MASn, ESR, PIR, and high SPRGs */ +#define KVM_MAGIC_FEAT_MAS0_TO_SPRG7 (1 << 1) #ifdef __KERNEL__ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 04caee7d9bc1..e7bfcf81b746 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -426,16 +426,23 @@ int main(void) DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2)); DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3)); #endif - DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4)); - DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5)); - DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6)); - DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7)); + DEFINE(VCPU_SHARED_SPRG4, offsetof(struct kvm_vcpu_arch_shared, sprg4)); + DEFINE(VCPU_SHARED_SPRG5, offsetof(struct kvm_vcpu_arch_shared, sprg5)); + DEFINE(VCPU_SHARED_SPRG6, offsetof(struct kvm_vcpu_arch_shared, sprg6)); + DEFINE(VCPU_SHARED_SPRG7, offsetof(struct kvm_vcpu_arch_shared, sprg7)); DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid)); DEFINE(VCPU_SHADOW_PID1, offsetof(struct kvm_vcpu, arch.shadow_pid1)); DEFINE(VCPU_SHARED, offsetof(struct kvm_vcpu, arch.shared)); DEFINE(VCPU_SHARED_MSR, offsetof(struct kvm_vcpu_arch_shared, msr)); DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr)); + DEFINE(VCPU_SHARED_MAS0, offsetof(struct kvm_vcpu_arch_shared, mas0)); + DEFINE(VCPU_SHARED_MAS1, offsetof(struct kvm_vcpu_arch_shared, mas1)); + DEFINE(VCPU_SHARED_MAS2, offsetof(struct kvm_vcpu_arch_shared, mas2)); + DEFINE(VCPU_SHARED_MAS7_3, offsetof(struct kvm_vcpu_arch_shared, mas7_3)); + DEFINE(VCPU_SHARED_MAS4, offsetof(struct kvm_vcpu_arch_shared, mas4)); + DEFINE(VCPU_SHARED_MAS6, offsetof(struct kvm_vcpu_arch_shared, mas6)); + /* book3s */ #ifdef CONFIG_KVM_BOOK3S_64_HV DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid)); diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c index 06b15ee997f7..04d4b5aa6dca 100644 --- a/arch/powerpc/kernel/kvm.c +++ b/arch/powerpc/kernel/kvm.c @@ -49,23 +49,17 @@ #define KVM_RT_30 0x03c00000 #define KVM_MASK_RB 0x0000f800 #define KVM_INST_MFMSR 0x7c0000a6 -#define KVM_INST_MFSPR_SPRG0 0x7c1042a6 -#define KVM_INST_MFSPR_SPRG1 0x7c1142a6 -#define KVM_INST_MFSPR_SPRG2 0x7c1242a6 -#define KVM_INST_MFSPR_SPRG3 0x7c1342a6 -#define KVM_INST_MFSPR_SRR0 0x7c1a02a6 -#define KVM_INST_MFSPR_SRR1 0x7c1b02a6 -#define KVM_INST_MFSPR_DAR 0x7c1302a6 -#define KVM_INST_MFSPR_DSISR 0x7c1202a6 - -#define KVM_INST_MTSPR_SPRG0 0x7c1043a6 -#define KVM_INST_MTSPR_SPRG1 0x7c1143a6 -#define KVM_INST_MTSPR_SPRG2 0x7c1243a6 -#define KVM_INST_MTSPR_SPRG3 0x7c1343a6 -#define KVM_INST_MTSPR_SRR0 0x7c1a03a6 -#define KVM_INST_MTSPR_SRR1 0x7c1b03a6 -#define KVM_INST_MTSPR_DAR 0x7c1303a6 -#define KVM_INST_MTSPR_DSISR 0x7c1203a6 + +#define SPR_FROM 0 +#define SPR_TO 0x100 + +#define KVM_INST_SPR(sprn, moveto) (0x7c0002a6 | \ + (((sprn) & 0x1f) << 16) | \ + (((sprn) & 0x3e0) << 6) | \ + (moveto)) + +#define KVM_INST_MFSPR(sprn) KVM_INST_SPR(sprn, SPR_FROM) +#define KVM_INST_MTSPR(sprn) KVM_INST_SPR(sprn, SPR_TO) #define KVM_INST_TLBSYNC 0x7c00046c #define KVM_INST_MTMSRD_L0 0x7c000164 @@ -440,56 +434,191 @@ static void kvm_check_ins(u32 *inst, u32 features) case KVM_INST_MFMSR: kvm_patch_ins_ld(inst, magic_var(msr), inst_rt); break; - case KVM_INST_MFSPR_SPRG0: + case KVM_INST_MFSPR(SPRN_SPRG0): kvm_patch_ins_ld(inst, magic_var(sprg0), inst_rt); break; - case KVM_INST_MFSPR_SPRG1: + case KVM_INST_MFSPR(SPRN_SPRG1): kvm_patch_ins_ld(inst, magic_var(sprg1), inst_rt); break; - case KVM_INST_MFSPR_SPRG2: + case KVM_INST_MFSPR(SPRN_SPRG2): kvm_patch_ins_ld(inst, magic_var(sprg2), inst_rt); break; - case KVM_INST_MFSPR_SPRG3: + case KVM_INST_MFSPR(SPRN_SPRG3): kvm_patch_ins_ld(inst, magic_var(sprg3), inst_rt); break; - case KVM_INST_MFSPR_SRR0: + case KVM_INST_MFSPR(SPRN_SRR0): kvm_patch_ins_ld(inst, magic_var(srr0), inst_rt); break; - case KVM_INST_MFSPR_SRR1: + case KVM_INST_MFSPR(SPRN_SRR1): kvm_patch_ins_ld(inst, magic_var(srr1), inst_rt); break; - case KVM_INST_MFSPR_DAR: +#ifdef CONFIG_BOOKE + case KVM_INST_MFSPR(SPRN_DEAR): +#else + case KVM_INST_MFSPR(SPRN_DAR): +#endif kvm_patch_ins_ld(inst, magic_var(dar), inst_rt); break; - case KVM_INST_MFSPR_DSISR: + case KVM_INST_MFSPR(SPRN_DSISR): kvm_patch_ins_lwz(inst, magic_var(dsisr), inst_rt); break; +#ifdef CONFIG_PPC_BOOK3E_MMU + case KVM_INST_MFSPR(SPRN_MAS0): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_lwz(inst, magic_var(mas0), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_MAS1): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_lwz(inst, magic_var(mas1), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_MAS2): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_ld(inst, magic_var(mas2), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_MAS3): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_lwz(inst, magic_var(mas7_3) + 4, inst_rt); + break; + case KVM_INST_MFSPR(SPRN_MAS4): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_lwz(inst, magic_var(mas4), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_MAS6): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_lwz(inst, magic_var(mas6), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_MAS7): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_lwz(inst, magic_var(mas7_3), inst_rt); + break; +#endif /* CONFIG_PPC_BOOK3E_MMU */ + + case KVM_INST_MFSPR(SPRN_SPRG4): +#ifdef CONFIG_BOOKE + case KVM_INST_MFSPR(SPRN_SPRG4R): +#endif + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_ld(inst, magic_var(sprg4), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_SPRG5): +#ifdef CONFIG_BOOKE + case KVM_INST_MFSPR(SPRN_SPRG5R): +#endif + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_ld(inst, magic_var(sprg5), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_SPRG6): +#ifdef CONFIG_BOOKE + case KVM_INST_MFSPR(SPRN_SPRG6R): +#endif + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_ld(inst, magic_var(sprg6), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_SPRG7): +#ifdef CONFIG_BOOKE + case KVM_INST_MFSPR(SPRN_SPRG7R): +#endif + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_ld(inst, magic_var(sprg7), inst_rt); + break; + +#ifdef CONFIG_BOOKE + case KVM_INST_MFSPR(SPRN_ESR): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_lwz(inst, magic_var(esr), inst_rt); + break; +#endif + + case KVM_INST_MFSPR(SPRN_PIR): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_lwz(inst, magic_var(pir), inst_rt); + break; + + /* Stores */ - case KVM_INST_MTSPR_SPRG0: + case KVM_INST_MTSPR(SPRN_SPRG0): kvm_patch_ins_std(inst, magic_var(sprg0), inst_rt); break; - case KVM_INST_MTSPR_SPRG1: + case KVM_INST_MTSPR(SPRN_SPRG1): kvm_patch_ins_std(inst, magic_var(sprg1), inst_rt); break; - case KVM_INST_MTSPR_SPRG2: + case KVM_INST_MTSPR(SPRN_SPRG2): kvm_patch_ins_std(inst, magic_var(sprg2), inst_rt); break; - case KVM_INST_MTSPR_SPRG3: + case KVM_INST_MTSPR(SPRN_SPRG3): kvm_patch_ins_std(inst, magic_var(sprg3), inst_rt); break; - case KVM_INST_MTSPR_SRR0: + case KVM_INST_MTSPR(SPRN_SRR0): kvm_patch_ins_std(inst, magic_var(srr0), inst_rt); break; - case KVM_INST_MTSPR_SRR1: + case KVM_INST_MTSPR(SPRN_SRR1): kvm_patch_ins_std(inst, magic_var(srr1), inst_rt); break; - case KVM_INST_MTSPR_DAR: +#ifdef CONFIG_BOOKE + case KVM_INST_MTSPR(SPRN_DEAR): +#else + case KVM_INST_MTSPR(SPRN_DAR): +#endif kvm_patch_ins_std(inst, magic_var(dar), inst_rt); break; - case KVM_INST_MTSPR_DSISR: + case KVM_INST_MTSPR(SPRN_DSISR): kvm_patch_ins_stw(inst, magic_var(dsisr), inst_rt); break; +#ifdef CONFIG_PPC_BOOK3E_MMU + case KVM_INST_MTSPR(SPRN_MAS0): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_stw(inst, magic_var(mas0), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_MAS1): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_stw(inst, magic_var(mas1), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_MAS2): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_std(inst, magic_var(mas2), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_MAS3): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_stw(inst, magic_var(mas7_3) + 4, inst_rt); + break; + case KVM_INST_MTSPR(SPRN_MAS4): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_stw(inst, magic_var(mas4), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_MAS6): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_stw(inst, magic_var(mas6), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_MAS7): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_stw(inst, magic_var(mas7_3), inst_rt); + break; +#endif /* CONFIG_PPC_BOOK3E_MMU */ + + case KVM_INST_MTSPR(SPRN_SPRG4): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_std(inst, magic_var(sprg4), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_SPRG5): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_std(inst, magic_var(sprg5), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_SPRG6): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_std(inst, magic_var(sprg6), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_SPRG7): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_std(inst, magic_var(sprg7), inst_rt); + break; + +#ifdef CONFIG_BOOKE + case KVM_INST_MTSPR(SPRN_ESR): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_stw(inst, magic_var(esr), inst_rt); + break; +#endif /* Nops */ case KVM_INST_TLBSYNC: @@ -556,9 +685,18 @@ static void kvm_use_magic_page(void) start = (void*)_stext; end = (void*)_etext; + /* + * Being interrupted in the middle of patching would + * be bad for SPRG4-7, which KVM can't keep in sync + * with emulated accesses because reads don't trap. + */ + local_irq_disable(); + for (p = start; p < end; p++) kvm_check_ins(p, features); + local_irq_enable(); + printk(KERN_INFO "KVM: Live patching for a fast VM %s\n", kvm_patching_worked ? "worked" : "failed"); } diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 73fc9f046107..5398744cd773 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -423,10 +423,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->sprg1 = vcpu->arch.shared->sprg1; regs->sprg2 = vcpu->arch.shared->sprg2; regs->sprg3 = vcpu->arch.shared->sprg3; - regs->sprg4 = vcpu->arch.sprg4; - regs->sprg5 = vcpu->arch.sprg5; - regs->sprg6 = vcpu->arch.sprg6; - regs->sprg7 = vcpu->arch.sprg7; + regs->sprg4 = vcpu->arch.shared->sprg4; + regs->sprg5 = vcpu->arch.shared->sprg5; + regs->sprg6 = vcpu->arch.shared->sprg6; + regs->sprg7 = vcpu->arch.shared->sprg7; for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) regs->gpr[i] = kvmppc_get_gpr(vcpu, i); @@ -450,10 +450,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.shared->sprg1 = regs->sprg1; vcpu->arch.shared->sprg2 = regs->sprg2; vcpu->arch.shared->sprg3 = regs->sprg3; - vcpu->arch.sprg4 = regs->sprg4; - vcpu->arch.sprg5 = regs->sprg5; - vcpu->arch.sprg6 = regs->sprg6; - vcpu->arch.sprg7 = regs->sprg7; + vcpu->arch.shared->sprg4 = regs->sprg4; + vcpu->arch.shared->sprg5 = regs->sprg5; + vcpu->arch.shared->sprg6 = regs->sprg6; + vcpu->arch.shared->sprg7 = regs->sprg7; for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) kvmppc_set_gpr(vcpu, i, regs->gpr[i]); diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 8dfc59a8a715..50803dd0b8f2 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -270,7 +270,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, vcpu->arch.shared->srr1 = vcpu->arch.shared->msr; vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[priority]; if (update_esr == true) - vcpu->arch.esr = vcpu->arch.queued_esr; + vcpu->arch.shared->esr = vcpu->arch.queued_esr; if (update_dear == true) vcpu->arch.shared->dar = vcpu->arch.queued_dear; kvmppc_set_msr(vcpu, vcpu->arch.shared->msr & msr_mask); @@ -644,6 +644,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.pc = 0; vcpu->arch.shared->msr = 0; vcpu->arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS; + vcpu->arch.shared->pir = vcpu->vcpu_id; kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */ vcpu->arch.shadow_pid = 1; @@ -678,10 +679,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->sprg1 = vcpu->arch.shared->sprg1; regs->sprg2 = vcpu->arch.shared->sprg2; regs->sprg3 = vcpu->arch.shared->sprg3; - regs->sprg4 = vcpu->arch.sprg4; - regs->sprg5 = vcpu->arch.sprg5; - regs->sprg6 = vcpu->arch.sprg6; - regs->sprg7 = vcpu->arch.sprg7; + regs->sprg4 = vcpu->arch.shared->sprg4; + regs->sprg5 = vcpu->arch.shared->sprg5; + regs->sprg6 = vcpu->arch.shared->sprg6; + regs->sprg7 = vcpu->arch.shared->sprg7; for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) regs->gpr[i] = kvmppc_get_gpr(vcpu, i); @@ -706,10 +707,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.shared->sprg1 = regs->sprg1; vcpu->arch.shared->sprg2 = regs->sprg2; vcpu->arch.shared->sprg3 = regs->sprg3; - vcpu->arch.sprg4 = regs->sprg4; - vcpu->arch.sprg5 = regs->sprg5; - vcpu->arch.sprg6 = regs->sprg6; - vcpu->arch.sprg7 = regs->sprg7; + vcpu->arch.shared->sprg4 = regs->sprg4; + vcpu->arch.shared->sprg5 = regs->sprg5; + vcpu->arch.shared->sprg6 = regs->sprg6; + vcpu->arch.shared->sprg7 = regs->sprg7; for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) kvmppc_set_gpr(vcpu, i, regs->gpr[i]); @@ -727,7 +728,7 @@ static void get_sregs_base(struct kvm_vcpu *vcpu, sregs->u.e.csrr0 = vcpu->arch.csrr0; sregs->u.e.csrr1 = vcpu->arch.csrr1; sregs->u.e.mcsr = vcpu->arch.mcsr; - sregs->u.e.esr = vcpu->arch.esr; + sregs->u.e.esr = vcpu->arch.shared->esr; sregs->u.e.dear = vcpu->arch.shared->dar; sregs->u.e.tsr = vcpu->arch.tsr; sregs->u.e.tcr = vcpu->arch.tcr; @@ -745,7 +746,7 @@ static int set_sregs_base(struct kvm_vcpu *vcpu, vcpu->arch.csrr0 = sregs->u.e.csrr0; vcpu->arch.csrr1 = sregs->u.e.csrr1; vcpu->arch.mcsr = sregs->u.e.mcsr; - vcpu->arch.esr = sregs->u.e.esr; + vcpu->arch.shared->esr = sregs->u.e.esr; vcpu->arch.shared->dar = sregs->u.e.dear; vcpu->arch.vrsave = sregs->u.e.vrsave; vcpu->arch.tcr = sregs->u.e.tcr; diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c index 1260f5f24c0c..bae9288ac1e1 100644 --- a/arch/powerpc/kvm/booke_emulate.c +++ b/arch/powerpc/kvm/booke_emulate.c @@ -107,7 +107,7 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) case SPRN_DEAR: vcpu->arch.shared->dar = spr_val; break; case SPRN_ESR: - vcpu->arch.esr = spr_val; break; + vcpu->arch.shared->esr = spr_val; break; case SPRN_DBCR0: vcpu->arch.dbcr0 = spr_val; break; case SPRN_DBCR1: @@ -125,13 +125,13 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) * loaded into the real SPRGs when resuming the * guest. */ case SPRN_SPRG4: - vcpu->arch.sprg4 = spr_val; break; + vcpu->arch.shared->sprg4 = spr_val; break; case SPRN_SPRG5: - vcpu->arch.sprg5 = spr_val; break; + vcpu->arch.shared->sprg5 = spr_val; break; case SPRN_SPRG6: - vcpu->arch.sprg6 = spr_val; break; + vcpu->arch.shared->sprg6 = spr_val; break; case SPRN_SPRG7: - vcpu->arch.sprg7 = spr_val; break; + vcpu->arch.shared->sprg7 = spr_val; break; case SPRN_IVPR: vcpu->arch.ivpr = spr_val; @@ -202,7 +202,7 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) case SPRN_DEAR: kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dar); break; case SPRN_ESR: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.esr); break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->esr); break; case SPRN_DBCR0: kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr0); break; case SPRN_DBCR1: diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S index 42f2fb1f66e9..10d8ef602e5c 100644 --- a/arch/powerpc/kvm/booke_interrupts.S +++ b/arch/powerpc/kvm/booke_interrupts.S @@ -402,19 +402,25 @@ lightweight_exit: /* Save vcpu pointer for the exception handlers. */ mtspr SPRN_SPRG_WVCPU, r4 + lwz r5, VCPU_SHARED(r4) + /* Can't switch the stack pointer until after IVPR is switched, * because host interrupt handlers would get confused. */ lwz r1, VCPU_GPR(r1)(r4) - /* Host interrupt handlers may have clobbered these guest-readable - * SPRGs, so we need to reload them here with the guest's values. */ - lwz r3, VCPU_SPRG4(r4) + /* + * Host interrupt handlers may have clobbered these + * guest-readable SPRGs, or the guest kernel may have + * written directly to the shared area, so we + * need to reload them here with the guest's values. + */ + lwz r3, VCPU_SHARED_SPRG4(r5) mtspr SPRN_SPRG4W, r3 - lwz r3, VCPU_SPRG5(r4) + lwz r3, VCPU_SHARED_SPRG5(r5) mtspr SPRN_SPRG5W, r3 - lwz r3, VCPU_SPRG6(r4) + lwz r3, VCPU_SHARED_SPRG6(r5) mtspr SPRN_SPRG6W, r3 - lwz r3, VCPU_SPRG7(r4) + lwz r3, VCPU_SHARED_SPRG7(r5) mtspr SPRN_SPRG7W, r3 #ifdef CONFIG_KVM_EXIT_TIMING diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index ac3c4bf21677..709d82f956e3 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -115,12 +115,12 @@ void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) sregs->u.e.impl.fsl.hid0 = vcpu_e500->hid0; sregs->u.e.impl.fsl.mcar = vcpu_e500->mcar; - sregs->u.e.mas0 = vcpu_e500->mas0; - sregs->u.e.mas1 = vcpu_e500->mas1; - sregs->u.e.mas2 = vcpu_e500->mas2; - sregs->u.e.mas7_3 = vcpu_e500->mas7_3; - sregs->u.e.mas4 = vcpu_e500->mas4; - sregs->u.e.mas6 = vcpu_e500->mas6; + sregs->u.e.mas0 = vcpu->arch.shared->mas0; + sregs->u.e.mas1 = vcpu->arch.shared->mas1; + sregs->u.e.mas2 = vcpu->arch.shared->mas2; + sregs->u.e.mas7_3 = vcpu->arch.shared->mas7_3; + sregs->u.e.mas4 = vcpu->arch.shared->mas4; + sregs->u.e.mas6 = vcpu->arch.shared->mas6; sregs->u.e.mmucfg = mfspr(SPRN_MMUCFG); sregs->u.e.tlbcfg[0] = vcpu_e500->tlb0cfg; @@ -148,12 +148,12 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) } if (sregs->u.e.features & KVM_SREGS_E_ARCH206_MMU) { - vcpu_e500->mas0 = sregs->u.e.mas0; - vcpu_e500->mas1 = sregs->u.e.mas1; - vcpu_e500->mas2 = sregs->u.e.mas2; - vcpu_e500->mas7_3 = sregs->u.e.mas7_3; - vcpu_e500->mas4 = sregs->u.e.mas4; - vcpu_e500->mas6 = sregs->u.e.mas6; + vcpu->arch.shared->mas0 = sregs->u.e.mas0; + vcpu->arch.shared->mas1 = sregs->u.e.mas1; + vcpu->arch.shared->mas2 = sregs->u.e.mas2; + vcpu->arch.shared->mas7_3 = sregs->u.e.mas7_3; + vcpu->arch.shared->mas4 = sregs->u.e.mas4; + vcpu->arch.shared->mas6 = sregs->u.e.mas6; } if (!(sregs->u.e.features & KVM_SREGS_E_IVOR)) diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index e0d36099c756..6d0b2bd54fb0 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -89,22 +89,22 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) return EMULATE_FAIL; vcpu_e500->pid[2] = spr_val; break; case SPRN_MAS0: - vcpu_e500->mas0 = spr_val; break; + vcpu->arch.shared->mas0 = spr_val; break; case SPRN_MAS1: - vcpu_e500->mas1 = spr_val; break; + vcpu->arch.shared->mas1 = spr_val; break; case SPRN_MAS2: - vcpu_e500->mas2 = spr_val; break; + vcpu->arch.shared->mas2 = spr_val; break; case SPRN_MAS3: - vcpu_e500->mas7_3 &= ~(u64)0xffffffff; - vcpu_e500->mas7_3 |= spr_val; + vcpu->arch.shared->mas7_3 &= ~(u64)0xffffffff; + vcpu->arch.shared->mas7_3 |= spr_val; break; case SPRN_MAS4: - vcpu_e500->mas4 = spr_val; break; + vcpu->arch.shared->mas4 = spr_val; break; case SPRN_MAS6: - vcpu_e500->mas6 = spr_val; break; + vcpu->arch.shared->mas6 = spr_val; break; case SPRN_MAS7: - vcpu_e500->mas7_3 &= (u64)0xffffffff; - vcpu_e500->mas7_3 |= (u64)spr_val << 32; + vcpu->arch.shared->mas7_3 &= (u64)0xffffffff; + vcpu->arch.shared->mas7_3 |= (u64)spr_val << 32; break; case SPRN_L1CSR0: vcpu_e500->l1csr0 = spr_val; @@ -147,6 +147,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int emulated = EMULATE_DONE; + unsigned long val; switch (sprn) { case SPRN_PID: @@ -156,20 +157,23 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) case SPRN_PID2: kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[2]); break; case SPRN_MAS0: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas0); break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas0); break; case SPRN_MAS1: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas1); break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas1); break; case SPRN_MAS2: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas2); break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas2); break; case SPRN_MAS3: - kvmppc_set_gpr(vcpu, rt, (u32)vcpu_e500->mas7_3); break; + val = (u32)vcpu->arch.shared->mas7_3; + kvmppc_set_gpr(vcpu, rt, val); + break; case SPRN_MAS4: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas4); break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas4); break; case SPRN_MAS6: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas6); break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas6); break; case SPRN_MAS7: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas7_3 >> 32); break; - + val = vcpu->arch.shared->mas7_3 >> 32; + kvmppc_set_gpr(vcpu, rt, val); + break; case SPRN_TLB0CFG: kvmppc_set_gpr(vcpu, rt, vcpu_e500->tlb0cfg); break; case SPRN_TLB1CFG: diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index 6fefb9144f23..9cd124a11acd 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -428,13 +428,14 @@ static int htlb0_set_base(gva_t addr) host_tlb_params[0].ways); } -static unsigned int get_tlb_esel(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel) +static unsigned int get_tlb_esel(struct kvm_vcpu *vcpu, int tlbsel) { - unsigned int esel = get_tlb_esel_bit(vcpu_e500); + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int esel = get_tlb_esel_bit(vcpu); if (tlbsel == 0) { esel &= vcpu_e500->gtlb_params[0].ways - 1; - esel += gtlb0_set_base(vcpu_e500, vcpu_e500->mas2); + esel += gtlb0_set_base(vcpu_e500, vcpu->arch.shared->mas2); } else { esel &= vcpu_e500->gtlb_params[tlbsel].entries - 1; } @@ -545,20 +546,20 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, int tlbsel; /* since we only have two TLBs, only lower bit is used. */ - tlbsel = (vcpu_e500->mas4 >> 28) & 0x1; + tlbsel = (vcpu->arch.shared->mas4 >> 28) & 0x1; victim = (tlbsel == 0) ? gtlb0_get_next_victim(vcpu_e500) : 0; - pidsel = (vcpu_e500->mas4 >> 16) & 0xf; - tsized = (vcpu_e500->mas4 >> 7) & 0x1f; + pidsel = (vcpu->arch.shared->mas4 >> 16) & 0xf; + tsized = (vcpu->arch.shared->mas4 >> 7) & 0x1f; - vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) + vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); - vcpu_e500->mas1 = MAS1_VALID | (as ? MAS1_TS : 0) + vcpu->arch.shared->mas1 = MAS1_VALID | (as ? MAS1_TS : 0) | MAS1_TID(vcpu_e500->pid[pidsel]) | MAS1_TSIZE(tsized); - vcpu_e500->mas2 = (eaddr & MAS2_EPN) - | (vcpu_e500->mas4 & MAS2_ATTRIB_MASK); - vcpu_e500->mas7_3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3; - vcpu_e500->mas6 = (vcpu_e500->mas6 & MAS6_SPID1) + vcpu->arch.shared->mas2 = (eaddr & MAS2_EPN) + | (vcpu->arch.shared->mas4 & MAS2_ATTRIB_MASK); + vcpu->arch.shared->mas7_3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3; + vcpu->arch.shared->mas6 = (vcpu->arch.shared->mas6 & MAS6_SPID1) | (get_cur_pid(vcpu) << 16) | (as ? MAS6_SAS : 0); } @@ -844,15 +845,15 @@ int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu) int tlbsel, esel; struct kvm_book3e_206_tlb_entry *gtlbe; - tlbsel = get_tlb_tlbsel(vcpu_e500); - esel = get_tlb_esel(vcpu_e500, tlbsel); + tlbsel = get_tlb_tlbsel(vcpu); + esel = get_tlb_esel(vcpu, tlbsel); gtlbe = get_entry(vcpu_e500, tlbsel, esel); - vcpu_e500->mas0 &= ~MAS0_NV(~0); - vcpu_e500->mas0 |= MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); - vcpu_e500->mas1 = gtlbe->mas1; - vcpu_e500->mas2 = gtlbe->mas2; - vcpu_e500->mas7_3 = gtlbe->mas7_3; + vcpu->arch.shared->mas0 &= ~MAS0_NV(~0); + vcpu->arch.shared->mas0 |= MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); + vcpu->arch.shared->mas1 = gtlbe->mas1; + vcpu->arch.shared->mas2 = gtlbe->mas2; + vcpu->arch.shared->mas7_3 = gtlbe->mas7_3; return EMULATE_DONE; } @@ -860,8 +861,8 @@ int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu) int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - int as = !!get_cur_sas(vcpu_e500); - unsigned int pid = get_cur_spid(vcpu_e500); + int as = !!get_cur_sas(vcpu); + unsigned int pid = get_cur_spid(vcpu); int esel, tlbsel; struct kvm_book3e_206_tlb_entry *gtlbe = NULL; gva_t ea; @@ -879,26 +880,30 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) if (gtlbe) { esel &= vcpu_e500->gtlb_params[tlbsel].ways - 1; - vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(esel) + vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(esel) | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); - vcpu_e500->mas1 = gtlbe->mas1; - vcpu_e500->mas2 = gtlbe->mas2; - vcpu_e500->mas7_3 = gtlbe->mas7_3; + vcpu->arch.shared->mas1 = gtlbe->mas1; + vcpu->arch.shared->mas2 = gtlbe->mas2; + vcpu->arch.shared->mas7_3 = gtlbe->mas7_3; } else { int victim; /* since we only have two TLBs, only lower bit is used. */ - tlbsel = vcpu_e500->mas4 >> 28 & 0x1; + tlbsel = vcpu->arch.shared->mas4 >> 28 & 0x1; victim = (tlbsel == 0) ? gtlb0_get_next_victim(vcpu_e500) : 0; - vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) + vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) + | MAS0_ESEL(victim) | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); - vcpu_e500->mas1 = (vcpu_e500->mas6 & MAS6_SPID0) - | (vcpu_e500->mas6 & (MAS6_SAS ? MAS1_TS : 0)) - | (vcpu_e500->mas4 & MAS4_TSIZED(~0)); - vcpu_e500->mas2 &= MAS2_EPN; - vcpu_e500->mas2 |= vcpu_e500->mas4 & MAS2_ATTRIB_MASK; - vcpu_e500->mas7_3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3; + vcpu->arch.shared->mas1 = + (vcpu->arch.shared->mas6 & MAS6_SPID0) + | (vcpu->arch.shared->mas6 & (MAS6_SAS ? MAS1_TS : 0)) + | (vcpu->arch.shared->mas4 & MAS4_TSIZED(~0)); + vcpu->arch.shared->mas2 &= MAS2_EPN; + vcpu->arch.shared->mas2 |= vcpu->arch.shared->mas4 & + MAS2_ATTRIB_MASK; + vcpu->arch.shared->mas7_3 &= MAS3_U0 | MAS3_U1 | + MAS3_U2 | MAS3_U3; } kvmppc_set_exit_type(vcpu, EMULATED_TLBSX_EXITS); @@ -929,19 +934,19 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) struct kvm_book3e_206_tlb_entry *gtlbe; int tlbsel, esel; - tlbsel = get_tlb_tlbsel(vcpu_e500); - esel = get_tlb_esel(vcpu_e500, tlbsel); + tlbsel = get_tlb_tlbsel(vcpu); + esel = get_tlb_esel(vcpu, tlbsel); gtlbe = get_entry(vcpu_e500, tlbsel, esel); if (get_tlb_v(gtlbe)) inval_gtlbe_on_host(vcpu_e500, tlbsel, esel); - gtlbe->mas1 = vcpu_e500->mas1; - gtlbe->mas2 = vcpu_e500->mas2; - gtlbe->mas7_3 = vcpu_e500->mas7_3; + gtlbe->mas1 = vcpu->arch.shared->mas1; + gtlbe->mas2 = vcpu->arch.shared->mas2; + gtlbe->mas7_3 = vcpu->arch.shared->mas7_3; - trace_kvm_gtlb_write(vcpu_e500->mas0, gtlbe->mas1, gtlbe->mas2, + trace_kvm_gtlb_write(vcpu->arch.shared->mas0, gtlbe->mas1, gtlbe->mas2, (u32)gtlbe->mas7_3, (u32)(gtlbe->mas7_3 >> 32)); /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */ diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500_tlb.h index 2c296407e759..5c6d2d7bf058 100644 --- a/arch/powerpc/kvm/e500_tlb.h +++ b/arch/powerpc/kvm/e500_tlb.h @@ -121,38 +121,33 @@ static inline unsigned int get_cur_pr(struct kvm_vcpu *vcpu) return !!(vcpu->arch.shared->msr & MSR_PR); } -static inline unsigned int get_cur_spid( - const struct kvmppc_vcpu_e500 *vcpu_e500) +static inline unsigned int get_cur_spid(const struct kvm_vcpu *vcpu) { - return (vcpu_e500->mas6 >> 16) & 0xff; + return (vcpu->arch.shared->mas6 >> 16) & 0xff; } -static inline unsigned int get_cur_sas( - const struct kvmppc_vcpu_e500 *vcpu_e500) +static inline unsigned int get_cur_sas(const struct kvm_vcpu *vcpu) { - return vcpu_e500->mas6 & 0x1; + return vcpu->arch.shared->mas6 & 0x1; } -static inline unsigned int get_tlb_tlbsel( - const struct kvmppc_vcpu_e500 *vcpu_e500) +static inline unsigned int get_tlb_tlbsel(const struct kvm_vcpu *vcpu) { /* * Manual says that tlbsel has 2 bits wide. * Since we only have two TLBs, only lower bit is used. */ - return (vcpu_e500->mas0 >> 28) & 0x1; + return (vcpu->arch.shared->mas0 >> 28) & 0x1; } -static inline unsigned int get_tlb_nv_bit( - const struct kvmppc_vcpu_e500 *vcpu_e500) +static inline unsigned int get_tlb_nv_bit(const struct kvm_vcpu *vcpu) { - return vcpu_e500->mas0 & 0xfff; + return vcpu->arch.shared->mas0 & 0xfff; } -static inline unsigned int get_tlb_esel_bit( - const struct kvmppc_vcpu_e500 *vcpu_e500) +static inline unsigned int get_tlb_esel_bit(const struct kvm_vcpu *vcpu) { - return (vcpu_e500->mas0 >> 16) & 0xfff; + return (vcpu->arch.shared->mas0 >> 16) & 0xfff; } static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu, diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index b6df56dd93ba..bda052e2264b 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -162,7 +162,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) case OP_TRAP_64: kvmppc_core_queue_program(vcpu, SRR1_PROGTRAP); #else - kvmppc_core_queue_program(vcpu, vcpu->arch.esr | ESR_PTR); + kvmppc_core_queue_program(vcpu, + vcpu->arch.shared->esr | ESR_PTR); #endif advance = 0; break; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 7411bdd8ff6f..d02e4c84e213 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -66,7 +66,7 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) vcpu->arch.magic_page_pa = param1; vcpu->arch.magic_page_ea = param2; - r2 = KVM_MAGIC_FEAT_SR; + r2 = KVM_MAGIC_FEAT_SR | KVM_MAGIC_FEAT_MAS0_TO_SPRG7; r = HC_EV_SUCCESS; break; -- cgit v1.2.3 From dfd4d47e9a71c5a35eb67a44cd311efbe1846b7e Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Thu, 17 Nov 2011 12:39:59 +0000 Subject: KVM: PPC: booke: Improve timer register emulation Decrementers are now properly driven by TCR/TSR, and the guest has full read/write access to these registers. The decrementer keeps ticking (and setting the TSR bit) regardless of whether the interrupts are enabled with TCR. The decrementer stops at zero, rather than going negative. Decrementers (and FITs, once implemented) are delivered as level-triggered interrupts -- dequeued when the TSR bit is cleared, not on delivery. Signed-off-by: Liu Yu [scottwood@freescale.com: significant changes] Signed-off-by: Scott Wood Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_host.h | 2 +- arch/powerpc/include/asm/kvm_ppc.h | 1 + arch/powerpc/kvm/book3s.c | 8 +++++ arch/powerpc/kvm/booke.c | 67 ++++++++++++++++++++++++++++--------- arch/powerpc/kvm/booke.h | 4 +++ arch/powerpc/kvm/booke_emulate.c | 11 ++++-- arch/powerpc/kvm/emulate.c | 59 ++++++++++++++++---------------- arch/powerpc/kvm/powerpc.c | 33 +++++++----------- 8 files changed, 115 insertions(+), 70 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index bfd0c9912da5..66c75cddaec6 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -330,7 +330,7 @@ struct kvm_vcpu_arch { u32 tbl; u32 tbu; u32 tcr; - u32 tsr; + ulong tsr; /* we need to perform set/clr_bits() which requires ulong */ u32 ivor[64]; ulong ivpr; u32 pvr; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index c089927f64cc..5192c2e70583 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -66,6 +66,7 @@ extern int kvmppc_emulate_instruction(struct kvm_run *run, extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu); extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu); extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); +extern void kvmppc_decrementer_func(unsigned long data); extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu); /* Core-specific hooks */ diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 5398744cd773..6bf7e0582c5a 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -515,3 +515,11 @@ out: mutex_unlock(&kvm->slots_lock); return r; } + +void kvmppc_decrementer_func(unsigned long data) +{ + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; + + kvmppc_core_queue_dec(vcpu); + kvm_vcpu_kick(vcpu); +} diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 50803dd0b8f2..9e41f45d07ed 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -252,9 +252,11 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, allowed = vcpu->arch.shared->msr & MSR_ME; msr_mask = 0; break; - case BOOKE_IRQPRIO_EXTERNAL: case BOOKE_IRQPRIO_DECREMENTER: case BOOKE_IRQPRIO_FIT: + keep_irq = true; + /* fall through */ + case BOOKE_IRQPRIO_EXTERNAL: allowed = vcpu->arch.shared->msr & MSR_EE; allowed = allowed && !crit; msr_mask = MSR_CE|MSR_ME|MSR_DE; @@ -282,11 +284,26 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, return allowed; } +static void update_timer_ints(struct kvm_vcpu *vcpu) +{ + if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS)) + kvmppc_core_queue_dec(vcpu); + else + kvmppc_core_dequeue_dec(vcpu); +} + static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu) { unsigned long *pending = &vcpu->arch.pending_exceptions; unsigned int priority; + if (vcpu->requests) { + if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu)) { + smp_mb(); + update_timer_ints(vcpu); + } + } + priority = __ffs(*pending); while (priority <= BOOKE_IRQPRIO_MAX) { if (kvmppc_booke_irqprio_deliver(vcpu, priority)) @@ -749,25 +766,16 @@ static int set_sregs_base(struct kvm_vcpu *vcpu, vcpu->arch.shared->esr = sregs->u.e.esr; vcpu->arch.shared->dar = sregs->u.e.dear; vcpu->arch.vrsave = sregs->u.e.vrsave; - vcpu->arch.tcr = sregs->u.e.tcr; + kvmppc_set_tcr(vcpu, sregs->u.e.tcr); - if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_DEC) + if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_DEC) { vcpu->arch.dec = sregs->u.e.dec; - - kvmppc_emulate_dec(vcpu); + kvmppc_emulate_dec(vcpu); + } if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) { - /* - * FIXME: existing KVM timer handling is incomplete. - * TSR cannot be read by the guest, and its value in - * vcpu->arch is always zero. For now, just handle - * the case where the caller is trying to inject a - * decrementer interrupt. - */ - - if ((sregs->u.e.tsr & TSR_DIS) && - (vcpu->arch.tcr & TCR_DIE)) - kvmppc_core_queue_dec(vcpu); + vcpu->arch.tsr = sregs->u.e.tsr; + update_timer_ints(vcpu); } return 0; @@ -923,6 +931,33 @@ void kvmppc_core_destroy_vm(struct kvm *kvm) { } +void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr) +{ + vcpu->arch.tcr = new_tcr; + update_timer_ints(vcpu); +} + +void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits) +{ + set_bits(tsr_bits, &vcpu->arch.tsr); + smp_wmb(); + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + kvm_vcpu_kick(vcpu); +} + +void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits) +{ + clear_bits(tsr_bits, &vcpu->arch.tsr); + update_timer_ints(vcpu); +} + +void kvmppc_decrementer_func(unsigned long data) +{ + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; + + kvmppc_set_tsr_bits(vcpu, TSR_DIS); +} + int __init kvmppc_booke_init(void) { unsigned long ivor[16]; diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h index 8e1fe33d64e5..2fe202705a3f 100644 --- a/arch/powerpc/kvm/booke.h +++ b/arch/powerpc/kvm/booke.h @@ -55,6 +55,10 @@ extern unsigned long kvmppc_booke_handlers; void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr); void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr); +void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr); +void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits); +void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits); + int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int inst, int *advance); int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt); diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c index bae9288ac1e1..3e652da36534 100644 --- a/arch/powerpc/kvm/booke_emulate.c +++ b/arch/powerpc/kvm/booke_emulate.c @@ -13,6 +13,7 @@ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * * Copyright IBM Corp. 2008 + * Copyright 2011 Freescale Semiconductor, Inc. * * Authors: Hollis Blanchard */ @@ -115,10 +116,10 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) case SPRN_DBSR: vcpu->arch.dbsr &= ~spr_val; break; case SPRN_TSR: - vcpu->arch.tsr &= ~spr_val; break; + kvmppc_clr_tsr_bits(vcpu, spr_val); + break; case SPRN_TCR: - vcpu->arch.tcr = spr_val; - kvmppc_emulate_dec(vcpu); + kvmppc_set_tcr(vcpu, spr_val); break; /* Note: SPRG4-7 are user-readable. These values are @@ -209,6 +210,10 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr1); break; case SPRN_DBSR: kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbsr); break; + case SPRN_TSR: + kvmppc_set_gpr(vcpu, rt, vcpu->arch.tsr); break; + case SPRN_TCR: + kvmppc_set_gpr(vcpu, rt, vcpu->arch.tcr); break; case SPRN_IVOR0: kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL]); diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index bda052e2264b..968f40101883 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -13,6 +13,7 @@ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * * Copyright IBM Corp. 2007 + * Copyright 2011 Freescale Semiconductor, Inc. * * Authors: Hollis Blanchard */ @@ -69,57 +70,55 @@ #define OP_STH 44 #define OP_STHU 45 -#ifdef CONFIG_PPC_BOOK3S -static int kvmppc_dec_enabled(struct kvm_vcpu *vcpu) -{ - return 1; -} -#else -static int kvmppc_dec_enabled(struct kvm_vcpu *vcpu) -{ - /* On BOOKE, DEC = 0 is as good as decrementer not enabled */ - return (vcpu->arch.tcr & TCR_DIE) && vcpu->arch.dec; -} -#endif - void kvmppc_emulate_dec(struct kvm_vcpu *vcpu) { unsigned long dec_nsec; unsigned long long dec_time; pr_debug("mtDEC: %x\n", vcpu->arch.dec); + hrtimer_try_to_cancel(&vcpu->arch.dec_timer); + #ifdef CONFIG_PPC_BOOK3S /* mtdec lowers the interrupt line when positive. */ kvmppc_core_dequeue_dec(vcpu); /* POWER4+ triggers a dec interrupt if the value is < 0 */ if (vcpu->arch.dec & 0x80000000) { - hrtimer_try_to_cancel(&vcpu->arch.dec_timer); kvmppc_core_queue_dec(vcpu); return; } #endif - if (kvmppc_dec_enabled(vcpu)) { - /* The decrementer ticks at the same rate as the timebase, so - * that's how we convert the guest DEC value to the number of - * host ticks. */ - - hrtimer_try_to_cancel(&vcpu->arch.dec_timer); - dec_time = vcpu->arch.dec; - dec_time *= 1000; - do_div(dec_time, tb_ticks_per_usec); - dec_nsec = do_div(dec_time, NSEC_PER_SEC); - hrtimer_start(&vcpu->arch.dec_timer, - ktime_set(dec_time, dec_nsec), HRTIMER_MODE_REL); - vcpu->arch.dec_jiffies = get_tb(); - } else { - hrtimer_try_to_cancel(&vcpu->arch.dec_timer); - } + +#ifdef CONFIG_BOOKE + /* On BOOKE, DEC = 0 is as good as decrementer not enabled */ + if (vcpu->arch.dec == 0) + return; +#endif + + /* + * The decrementer ticks at the same rate as the timebase, so + * that's how we convert the guest DEC value to the number of + * host ticks. + */ + + dec_time = vcpu->arch.dec; + dec_time *= 1000; + do_div(dec_time, tb_ticks_per_usec); + dec_nsec = do_div(dec_time, NSEC_PER_SEC); + hrtimer_start(&vcpu->arch.dec_timer, + ktime_set(dec_time, dec_nsec), HRTIMER_MODE_REL); + vcpu->arch.dec_jiffies = get_tb(); } u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb) { u64 jd = tb - vcpu->arch.dec_jiffies; + +#ifdef CONFIG_BOOKE + if (vcpu->arch.dec < jd) + return 0; +#endif + return vcpu->arch.dec - jd; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index d02e4c84e213..fd8d3b16eaf3 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -39,7 +39,8 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { return !(v->arch.shared->msr & MSR_WE) || - !!(v->arch.pending_exceptions); + !!(v->arch.pending_exceptions) || + v->requests; } int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) @@ -311,18 +312,6 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) return kvmppc_core_pending_dec(vcpu); } -static void kvmppc_decrementer_func(unsigned long data) -{ - struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; - - kvmppc_core_queue_dec(vcpu); - - if (waitqueue_active(vcpu->arch.wqp)) { - wake_up_interruptible(vcpu->arch.wqp); - vcpu->stat.halt_wakeup++; - } -} - /* * low level hrtimer wake routine. Because this runs in hardirq context * we schedule a tasklet to do the real work. @@ -567,6 +556,16 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) return r; } +void kvm_vcpu_kick(struct kvm_vcpu *vcpu) +{ + if (waitqueue_active(&vcpu->wq)) { + wake_up_interruptible(vcpu->arch.wqp); + vcpu->stat.halt_wakeup++; + } else if (vcpu->cpu != -1) { + smp_send_reschedule(vcpu->cpu); + } +} + int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { if (irq->irq == KVM_INTERRUPT_UNSET) { @@ -575,13 +574,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) } kvmppc_core_queue_external(vcpu, irq); - - if (waitqueue_active(vcpu->arch.wqp)) { - wake_up_interruptible(vcpu->arch.wqp); - vcpu->stat.halt_wakeup++; - } else if (vcpu->cpu != -1) { - smp_send_reschedule(vcpu->cpu); - } + kvm_vcpu_kick(vcpu); return 0; } -- cgit v1.2.3 From 7d82714d4d1293edc57439c796750310866624b2 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 9 Dec 2011 15:46:21 +0100 Subject: KVM: PPC: Book3s: PR: Disable preemption in vcpu_run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When entering the guest, we want to make sure we're not getting preempted away, so let's disable preemption on entry, but enable it again while handling guest exits. Reported-by: Jörg Sommer Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kvm/book3s_pr.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 203a7b7b58b9..19af2bf2b87d 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -517,6 +517,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, run->ready_for_interrupt_injection = 1; trace_kvm_book3s_exit(exit_nr, vcpu); + preempt_enable(); kvm_resched(vcpu); switch (exit_nr) { case BOOK3S_INTERRUPT_INST_STORAGE: @@ -761,6 +762,8 @@ program_interrupt: run->exit_reason = KVM_EXIT_INTR; r = -EINTR; } else { + preempt_disable(); + /* In case an interrupt came in that was triggered * from userspace (like DEC), we need to check what * to inject now! */ @@ -923,10 +926,13 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) #endif ulong ext_msr; + preempt_disable(); + /* Check if we can run the vcpu at all */ if (!vcpu->arch.sane) { kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - return -EINVAL; + ret = -EINVAL; + goto out; } kvmppc_core_prepare_to_enter(vcpu); @@ -934,7 +940,8 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) /* No need to go into the guest when all we do is going out */ if (signal_pending(current)) { kvm_run->exit_reason = KVM_EXIT_INTR; - return -EINTR; + ret = -EINTR; + goto out; } /* Save FPU state in stack */ @@ -1004,6 +1011,8 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) current->thread.used_vsr = used_vsr; #endif +out: + preempt_enable(); return ret; } -- cgit v1.2.3 From d33ad328c0025c45f4688a769aeebddc342222c1 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 9 Dec 2011 15:47:53 +0100 Subject: KVM: PPC: Book3s: PR: No irq_disable in vcpu_run Somewhere during merges we ended up from local_irq_enable() foo(); local_irq_disable() to always keeping irqs enabled during that part. However, we now have the following code: foo(); local_irq_disable() which disables interrupts without the surrounding code enabling them again! So let's remove that disable and be happy. Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kvm/book3s_pr.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 19af2bf2b87d..857ecde0cfdf 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -983,8 +983,6 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) kvm_guest_exit(); - local_irq_disable(); - current->thread.regs->msr = ext_msr; /* Make sure we save the guest FPU/Altivec/VSX state */ -- cgit v1.2.3 From 468a12c2b53776721ff83517d4a195b85c5fce54 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 9 Dec 2011 14:44:13 +0100 Subject: KVM: PPC: Use get/set for to_svcpu to help preemption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When running the 64-bit Book3s PR code without CONFIG_PREEMPT_NONE, we were doing a few things wrong, most notably access to PACA fields without making sure that the pointers stay stable accross the access (preempt_disable()). This patch moves to_svcpu towards a get/put model which allows us to disable preemption while accessing the shadow vcpu fields in the PACA. That way we can run preemptible and everyone's happy! Reported-by: Jörg Sommer Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_book3s.h | 76 +++++++++++++++++++++++++------- arch/powerpc/include/asm/kvm_book3s_32.h | 6 ++- arch/powerpc/include/asm/kvm_book3s_64.h | 8 +++- arch/powerpc/kvm/book3s_32_mmu_host.c | 21 ++++++--- arch/powerpc/kvm/book3s_64_mmu_host.c | 66 +++++++++++++++++---------- arch/powerpc/kvm/book3s_emulate.c | 8 +++- arch/powerpc/kvm/book3s_pr.c | 62 +++++++++++++++++--------- arch/powerpc/kvm/trace.h | 5 ++- 8 files changed, 181 insertions(+), 71 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 69c7377d2071..c941c21a1893 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -183,7 +183,9 @@ static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu, static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val) { if ( num < 14 ) { - to_svcpu(vcpu)->gpr[num] = val; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + svcpu->gpr[num] = val; + svcpu_put(svcpu); to_book3s(vcpu)->shadow_vcpu->gpr[num] = val; } else vcpu->arch.gpr[num] = val; @@ -191,80 +193,120 @@ static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val) static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num) { - if ( num < 14 ) - return to_svcpu(vcpu)->gpr[num]; - else + if ( num < 14 ) { + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + ulong r = svcpu->gpr[num]; + svcpu_put(svcpu); + return r; + } else return vcpu->arch.gpr[num]; } static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val) { - to_svcpu(vcpu)->cr = val; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + svcpu->cr = val; + svcpu_put(svcpu); to_book3s(vcpu)->shadow_vcpu->cr = val; } static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu) { - return to_svcpu(vcpu)->cr; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + u32 r; + r = svcpu->cr; + svcpu_put(svcpu); + return r; } static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val) { - to_svcpu(vcpu)->xer = val; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + svcpu->xer = val; to_book3s(vcpu)->shadow_vcpu->xer = val; + svcpu_put(svcpu); } static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu) { - return to_svcpu(vcpu)->xer; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + u32 r; + r = svcpu->xer; + svcpu_put(svcpu); + return r; } static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val) { - to_svcpu(vcpu)->ctr = val; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + svcpu->ctr = val; + svcpu_put(svcpu); } static inline ulong kvmppc_get_ctr(struct kvm_vcpu *vcpu) { - return to_svcpu(vcpu)->ctr; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + ulong r; + r = svcpu->ctr; + svcpu_put(svcpu); + return r; } static inline void kvmppc_set_lr(struct kvm_vcpu *vcpu, ulong val) { - to_svcpu(vcpu)->lr = val; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + svcpu->lr = val; + svcpu_put(svcpu); } static inline ulong kvmppc_get_lr(struct kvm_vcpu *vcpu) { - return to_svcpu(vcpu)->lr; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + ulong r; + r = svcpu->lr; + svcpu_put(svcpu); + return r; } static inline void kvmppc_set_pc(struct kvm_vcpu *vcpu, ulong val) { - to_svcpu(vcpu)->pc = val; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + svcpu->pc = val; + svcpu_put(svcpu); } static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu) { - return to_svcpu(vcpu)->pc; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + ulong r; + r = svcpu->pc; + svcpu_put(svcpu); + return r; } static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu) { ulong pc = kvmppc_get_pc(vcpu); - struct kvmppc_book3s_shadow_vcpu *svcpu = to_svcpu(vcpu); + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + u32 r; /* Load the instruction manually if it failed to do so in the * exit path */ if (svcpu->last_inst == KVM_INST_FETCH_FAILED) kvmppc_ld(vcpu, &pc, sizeof(u32), &svcpu->last_inst, false); - return svcpu->last_inst; + r = svcpu->last_inst; + svcpu_put(svcpu); + return r; } static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu) { - return to_svcpu(vcpu)->fault_dar; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + ulong r; + r = svcpu->fault_dar; + svcpu_put(svcpu); + return r; } static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/include/asm/kvm_book3s_32.h b/arch/powerpc/include/asm/kvm_book3s_32.h index de604db135f5..38040ff82063 100644 --- a/arch/powerpc/include/asm/kvm_book3s_32.h +++ b/arch/powerpc/include/asm/kvm_book3s_32.h @@ -20,11 +20,15 @@ #ifndef __ASM_KVM_BOOK3S_32_H__ #define __ASM_KVM_BOOK3S_32_H__ -static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu) +static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu) { return to_book3s(vcpu)->shadow_vcpu; } +static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu) +{ +} + #define PTE_SIZE 12 #define VSID_ALL 0 #define SR_INVALID 0x00000001 /* VSID 1 should always be unused */ diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index d0ac94f98f9e..2054e4726ba2 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -21,10 +21,16 @@ #define __ASM_KVM_BOOK3S_64_H__ #ifdef CONFIG_KVM_BOOK3S_PR -static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu) +static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu) { + preempt_disable(); return &get_paca()->shadow_vcpu; } + +static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu) +{ + preempt_enable(); +} #endif #define SPAPR_TCE_SHIFT 12 diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c index 9fecbfbce773..f922c29bb234 100644 --- a/arch/powerpc/kvm/book3s_32_mmu_host.c +++ b/arch/powerpc/kvm/book3s_32_mmu_host.c @@ -151,13 +151,15 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) bool primary = false; bool evict = false; struct hpte_cache *pte; + int r = 0; /* Get host physical address for gpa */ hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); if (is_error_pfn(hpaddr)) { printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr); - return -EINVAL; + r = -EINVAL; + goto out; } hpaddr <<= PAGE_SHIFT; @@ -249,7 +251,8 @@ next_pteg: kvmppc_mmu_hpte_cache_map(vcpu, pte); - return 0; +out: + return r; } static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) @@ -297,12 +300,14 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr) u64 gvsid; u32 sr; struct kvmppc_sid_map *map; - struct kvmppc_book3s_shadow_vcpu *svcpu = to_svcpu(vcpu); + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + int r = 0; if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) { /* Invalidate an entry */ svcpu->sr[esid] = SR_INVALID; - return -ENOENT; + r = -ENOENT; + goto out; } map = find_sid_vsid(vcpu, gvsid); @@ -315,17 +320,21 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr) dprintk_sr("MMU: mtsr %d, 0x%x\n", esid, sr); - return 0; +out: + svcpu_put(svcpu); + return r; } void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu) { int i; - struct kvmppc_book3s_shadow_vcpu *svcpu = to_svcpu(vcpu); + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); dprintk_sr("MMU: flushing all segments (%d)\n", ARRAY_SIZE(svcpu->sr)); for (i = 0; i < ARRAY_SIZE(svcpu->sr); i++) svcpu->sr[i] = SR_INVALID; + + svcpu_put(svcpu); } void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index fa2f08434ba5..6f87f39a1ac2 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -88,12 +88,14 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) int vflags = 0; int attempt = 0; struct kvmppc_sid_map *map; + int r = 0; /* Get host physical address for gpa */ hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); if (is_error_pfn(hpaddr)) { printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr); - return -EINVAL; + r = -EINVAL; + goto out; } hpaddr <<= PAGE_SHIFT; hpaddr |= orig_pte->raddr & (~0xfffULL & ~PAGE_MASK); @@ -110,7 +112,8 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) printk(KERN_ERR "KVM: Segment map for 0x%llx (0x%lx) failed\n", vsid, orig_pte->eaddr); WARN_ON(true); - return -EINVAL; + r = -EINVAL; + goto out; } vsid = map->host_vsid; @@ -131,8 +134,10 @@ map_again: /* In case we tried normal mapping already, let's nuke old entries */ if (attempt > 1) - if (ppc_md.hpte_remove(hpteg) < 0) - return -1; + if (ppc_md.hpte_remove(hpteg) < 0) { + r = -1; + goto out; + } ret = ppc_md.hpte_insert(hpteg, va, hpaddr, rflags, vflags, MMU_PAGE_4K, MMU_SEGSIZE_256M); @@ -162,7 +167,8 @@ map_again: kvmppc_mmu_hpte_cache_map(vcpu, pte); } - return 0; +out: + return r; } static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) @@ -207,25 +213,30 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid) { + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); int i; int max_slb_size = 64; int found_inval = -1; int r; - if (!to_svcpu(vcpu)->slb_max) - to_svcpu(vcpu)->slb_max = 1; + if (!svcpu->slb_max) + svcpu->slb_max = 1; /* Are we overwriting? */ - for (i = 1; i < to_svcpu(vcpu)->slb_max; i++) { - if (!(to_svcpu(vcpu)->slb[i].esid & SLB_ESID_V)) + for (i = 1; i < svcpu->slb_max; i++) { + if (!(svcpu->slb[i].esid & SLB_ESID_V)) found_inval = i; - else if ((to_svcpu(vcpu)->slb[i].esid & ESID_MASK) == esid) - return i; + else if ((svcpu->slb[i].esid & ESID_MASK) == esid) { + r = i; + goto out; + } } /* Found a spare entry that was invalidated before */ - if (found_inval > 0) - return found_inval; + if (found_inval > 0) { + r = found_inval; + goto out; + } /* No spare invalid entry, so create one */ @@ -233,30 +244,35 @@ static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid) max_slb_size = mmu_slb_size; /* Overflowing -> purge */ - if ((to_svcpu(vcpu)->slb_max) == max_slb_size) + if ((svcpu->slb_max) == max_slb_size) kvmppc_mmu_flush_segments(vcpu); - r = to_svcpu(vcpu)->slb_max; - to_svcpu(vcpu)->slb_max++; + r = svcpu->slb_max; + svcpu->slb_max++; +out: + svcpu_put(svcpu); return r; } int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr) { + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); u64 esid = eaddr >> SID_SHIFT; u64 slb_esid = (eaddr & ESID_MASK) | SLB_ESID_V; u64 slb_vsid = SLB_VSID_USER; u64 gvsid; int slb_index; struct kvmppc_sid_map *map; + int r = 0; slb_index = kvmppc_mmu_next_segment(vcpu, eaddr & ESID_MASK); if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) { /* Invalidate an entry */ - to_svcpu(vcpu)->slb[slb_index].esid = 0; - return -ENOENT; + svcpu->slb[slb_index].esid = 0; + r = -ENOENT; + goto out; } map = find_sid_vsid(vcpu, gvsid); @@ -269,18 +285,22 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr) slb_vsid &= ~SLB_VSID_KP; slb_esid |= slb_index; - to_svcpu(vcpu)->slb[slb_index].esid = slb_esid; - to_svcpu(vcpu)->slb[slb_index].vsid = slb_vsid; + svcpu->slb[slb_index].esid = slb_esid; + svcpu->slb[slb_index].vsid = slb_vsid; trace_kvm_book3s_slbmte(slb_vsid, slb_esid); - return 0; +out: + svcpu_put(svcpu); + return r; } void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu) { - to_svcpu(vcpu)->slb_max = 1; - to_svcpu(vcpu)->slb[0].esid = 0; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + svcpu->slb_max = 1; + svcpu->slb[0].esid = 0; + svcpu_put(svcpu); } void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 0c9dc62532d0..f1950d131827 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -230,9 +230,12 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, r = kvmppc_st(vcpu, &addr, 32, zeros, true); if ((r == -ENOENT) || (r == -EPERM)) { + struct kvmppc_book3s_shadow_vcpu *svcpu; + + svcpu = svcpu_get(vcpu); *advance = 0; vcpu->arch.shared->dar = vaddr; - to_svcpu(vcpu)->fault_dar = vaddr; + svcpu->fault_dar = vaddr; dsisr = DSISR_ISSTORE; if (r == -ENOENT) @@ -241,7 +244,8 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, dsisr |= DSISR_PROTFAULT; vcpu->arch.shared->dsisr = dsisr; - to_svcpu(vcpu)->fault_dsisr = dsisr; + svcpu->fault_dsisr = dsisr; + svcpu_put(svcpu); kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE); diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 857ecde0cfdf..0c31507be908 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -56,10 +56,12 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { #ifdef CONFIG_PPC_BOOK3S_64 - memcpy(to_svcpu(vcpu)->slb, to_book3s(vcpu)->slb_shadow, sizeof(to_svcpu(vcpu)->slb)); + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + memcpy(svcpu->slb, to_book3s(vcpu)->slb_shadow, sizeof(svcpu->slb)); memcpy(&get_paca()->shadow_vcpu, to_book3s(vcpu)->shadow_vcpu, sizeof(get_paca()->shadow_vcpu)); - to_svcpu(vcpu)->slb_max = to_book3s(vcpu)->slb_shadow_max; + svcpu->slb_max = to_book3s(vcpu)->slb_shadow_max; + svcpu_put(svcpu); #endif #ifdef CONFIG_PPC_BOOK3S_32 @@ -70,10 +72,12 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) { #ifdef CONFIG_PPC_BOOK3S_64 - memcpy(to_book3s(vcpu)->slb_shadow, to_svcpu(vcpu)->slb, sizeof(to_svcpu(vcpu)->slb)); + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + memcpy(to_book3s(vcpu)->slb_shadow, svcpu->slb, sizeof(svcpu->slb)); memcpy(to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu, sizeof(get_paca()->shadow_vcpu)); - to_book3s(vcpu)->slb_shadow_max = to_svcpu(vcpu)->slb_max; + to_book3s(vcpu)->slb_shadow_max = svcpu->slb_max; + svcpu_put(svcpu); #endif kvmppc_giveup_ext(vcpu, MSR_FP); @@ -308,19 +312,22 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, if (page_found == -ENOENT) { /* Page not found in guest PTE entries */ + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); - vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr; + vcpu->arch.shared->dsisr = svcpu->fault_dsisr; vcpu->arch.shared->msr |= - (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL); + (svcpu->shadow_srr1 & 0x00000000f8000000ULL); + svcpu_put(svcpu); kvmppc_book3s_queue_irqprio(vcpu, vec); } else if (page_found == -EPERM) { /* Storage protection */ + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); - vcpu->arch.shared->dsisr = - to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE; + vcpu->arch.shared->dsisr = svcpu->fault_dsisr & ~DSISR_NOHPTE; vcpu->arch.shared->dsisr |= DSISR_PROTFAULT; vcpu->arch.shared->msr |= - (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL); + svcpu->shadow_srr1 & 0x00000000f8000000ULL; + svcpu_put(svcpu); kvmppc_book3s_queue_irqprio(vcpu, vec); } else if (page_found == -EINVAL) { /* Page not found in guest SLB */ @@ -521,21 +528,25 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, kvm_resched(vcpu); switch (exit_nr) { case BOOK3S_INTERRUPT_INST_STORAGE: + { + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + ulong shadow_srr1 = svcpu->shadow_srr1; vcpu->stat.pf_instruc++; #ifdef CONFIG_PPC_BOOK3S_32 /* We set segments as unused segments when invalidating them. So * treat the respective fault as segment fault. */ - if (to_svcpu(vcpu)->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT] - == SR_INVALID) { + if (svcpu->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT] == SR_INVALID) { kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); r = RESUME_GUEST; + svcpu_put(svcpu); break; } #endif + svcpu_put(svcpu); /* only care about PTEG not found errors, but leave NX alone */ - if (to_svcpu(vcpu)->shadow_srr1 & 0x40000000) { + if (shadow_srr1 & 0x40000000) { r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr); vcpu->stat.sp_instruc++; } else if (vcpu->arch.mmu.is_dcbz32(vcpu) && @@ -548,33 +559,37 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL); r = RESUME_GUEST; } else { - vcpu->arch.shared->msr |= - to_svcpu(vcpu)->shadow_srr1 & 0x58000000; + vcpu->arch.shared->msr |= shadow_srr1 & 0x58000000; kvmppc_book3s_queue_irqprio(vcpu, exit_nr); r = RESUME_GUEST; } break; + } case BOOK3S_INTERRUPT_DATA_STORAGE: { ulong dar = kvmppc_get_fault_dar(vcpu); + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + u32 fault_dsisr = svcpu->fault_dsisr; vcpu->stat.pf_storage++; #ifdef CONFIG_PPC_BOOK3S_32 /* We set segments as unused segments when invalidating them. So * treat the respective fault as segment fault. */ - if ((to_svcpu(vcpu)->sr[dar >> SID_SHIFT]) == SR_INVALID) { + if ((svcpu->sr[dar >> SID_SHIFT]) == SR_INVALID) { kvmppc_mmu_map_segment(vcpu, dar); r = RESUME_GUEST; + svcpu_put(svcpu); break; } #endif + svcpu_put(svcpu); /* The only case we need to handle is missing shadow PTEs */ - if (to_svcpu(vcpu)->fault_dsisr & DSISR_NOHPTE) { + if (fault_dsisr & DSISR_NOHPTE) { r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr); } else { vcpu->arch.shared->dar = dar; - vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr; + vcpu->arch.shared->dsisr = fault_dsisr; kvmppc_book3s_queue_irqprio(vcpu, exit_nr); r = RESUME_GUEST; } @@ -610,10 +625,13 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, case BOOK3S_INTERRUPT_PROGRAM: { enum emulation_result er; + struct kvmppc_book3s_shadow_vcpu *svcpu; ulong flags; program_interrupt: - flags = to_svcpu(vcpu)->shadow_srr1 & 0x1f0000ull; + svcpu = svcpu_get(vcpu); + flags = svcpu->shadow_srr1 & 0x1f0000ull; + svcpu_put(svcpu); if (vcpu->arch.shared->msr & MSR_PR) { #ifdef EXIT_DEBUG @@ -741,14 +759,18 @@ program_interrupt: r = RESUME_GUEST; break; default: + { + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + ulong shadow_srr1 = svcpu->shadow_srr1; + svcpu_put(svcpu); /* Ugh - bork here! What did we get? */ printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n", - exit_nr, kvmppc_get_pc(vcpu), to_svcpu(vcpu)->shadow_srr1); + exit_nr, kvmppc_get_pc(vcpu), shadow_srr1); r = RESUME_HOST; BUG(); break; } - + } if (!(r & RESUME_HOST)) { /* To avoid clobbering exit_reason, only check for signals if diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h index b135d3d397db..609d8bfb54e3 100644 --- a/arch/powerpc/kvm/trace.h +++ b/arch/powerpc/kvm/trace.h @@ -118,11 +118,14 @@ TRACE_EVENT(kvm_book3s_exit, ), TP_fast_assign( + struct kvmppc_book3s_shadow_vcpu *svcpu; __entry->exit_nr = exit_nr; __entry->pc = kvmppc_get_pc(vcpu); __entry->dar = kvmppc_get_fault_dar(vcpu); __entry->msr = vcpu->arch.shared->msr; - __entry->srr1 = to_svcpu(vcpu)->shadow_srr1; + svcpu = svcpu_get(vcpu); + __entry->srr1 = svcpu->shadow_srr1; + svcpu_put(svcpu); ), TP_printk("exit=0x%x | pc=0x%lx | msr=0x%lx | dar=0x%lx | srr1=0x%lx", -- cgit v1.2.3 From ae21216bece0a623d09980c120b9c98790a860b9 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 9 Dec 2011 15:20:46 +0100 Subject: KVM: PPC: align vcpu_kick with x86 Our vcpu kick implementation differs a bit from x86 which resulted in us not disabling preemption during the kick. Get it a bit closer to what x86 does. Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kvm/powerpc.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index fd8d3b16eaf3..e1ef4d6d972a 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -558,12 +558,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) void kvm_vcpu_kick(struct kvm_vcpu *vcpu) { + int me; + int cpu = vcpu->cpu; + + me = get_cpu(); if (waitqueue_active(&vcpu->wq)) { wake_up_interruptible(vcpu->arch.wqp); vcpu->stat.halt_wakeup++; - } else if (vcpu->cpu != -1) { + } else if (cpu != me && cpu != -1) { smp_send_reschedule(vcpu->cpu); } + put_cpu(); } int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) -- cgit v1.2.3 From e371f713db6523d99d8ffae8f9da564055e6de17 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 19 Dec 2011 13:36:55 +0100 Subject: KVM: PPC: Book3S: PR: Fix signal check race As Scott put it: > If we get a signal after the check, we want to be sure that we don't > receive the reschedule IPI until after we're in the guest, so that it > will cause another signal check. we need to have interrupts disabled from the point we do signal_check() all the way until we actually enter the guest. This patch fixes potential signal loss races. Reported-by: Scott Wood Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kvm/book3s_pr.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 0c31507be908..2da670405727 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -51,6 +51,8 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, #define MSR_USER32 MSR_USER #define MSR_USER64 MSR_USER #define HW_PAGE_SIZE PAGE_SIZE +#define __hard_irq_disable local_irq_disable +#define __hard_irq_enable local_irq_enable #endif void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -776,7 +778,16 @@ program_interrupt: /* To avoid clobbering exit_reason, only check for signals if * we aren't already exiting to userspace for some other * reason. */ + + /* + * Interrupts could be timers for the guest which we have to + * inject again, so let's postpone them until we're in the guest + * and if we really did time things so badly, then we just exit + * again due to a host external interrupt. + */ + __hard_irq_disable(); if (signal_pending(current)) { + __hard_irq_enable(); #ifdef EXIT_DEBUG printk(KERN_EMERG "KVM: Going back to host\n"); #endif @@ -959,8 +970,17 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) kvmppc_core_prepare_to_enter(vcpu); + /* + * Interrupts could be timers for the guest which we have to inject + * again, so let's postpone them until we're in the guest and if we + * really did time things so badly, then we just exit again due to + * a host external interrupt. + */ + __hard_irq_disable(); + /* No need to go into the guest when all we do is going out */ if (signal_pending(current)) { + __hard_irq_enable(); kvm_run->exit_reason = KVM_EXIT_INTR; ret = -EINTR; goto out; -- cgit v1.2.3 From 7b11dc993841f1643c0932cf0faf010406502572 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Mon, 28 Nov 2011 15:20:02 +0000 Subject: KVM: PPC: e500: Fix TLBnCFG in KVM_CONFIG_TLB The associativity, not just total size, can differ from the host hardware. Signed-off-by: Scott Wood Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kvm/e500_tlb.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index 9cd124a11acd..507376890cf8 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -1227,12 +1227,14 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, vcpu_e500->gtlb_offset[0] = 0; vcpu_e500->gtlb_offset[1] = params.tlb_sizes[0]; - vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & ~0xfffUL; + vcpu_e500->tlb0cfg &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); if (params.tlb_sizes[0] <= 2048) vcpu_e500->tlb0cfg |= params.tlb_sizes[0]; + vcpu_e500->tlb0cfg |= params.tlb_ways[0] << TLBnCFG_ASSOC_SHIFT; - vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) & ~0xfffUL; + vcpu_e500->tlb1cfg &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); vcpu_e500->tlb1cfg |= params.tlb_sizes[1]; + vcpu_e500->tlb1cfg |= params.tlb_ways[1] << TLBnCFG_ASSOC_SHIFT; vcpu_e500->shared_tlb_pages = pages; vcpu_e500->num_shared_tlb_pages = num_pages; @@ -1348,10 +1350,17 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) goto err; /* Init TLB configuration register */ - vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & ~0xfffUL; + vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & + ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_params[0].entries; - vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) & ~0xfffUL; - vcpu_e500->tlb1cfg |= vcpu_e500->gtlb_params[1].entries; + vcpu_e500->tlb0cfg |= + vcpu_e500->gtlb_params[0].ways << TLBnCFG_ASSOC_SHIFT; + + vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) & + ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_params[1].entries; + vcpu_e500->tlb0cfg |= + vcpu_e500->gtlb_params[1].ways << TLBnCFG_ASSOC_SHIFT; return 0; -- cgit v1.2.3 From 570135243a33174a5d74641de693b6c0233d1181 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Tue, 29 Nov 2011 10:40:23 +0000 Subject: KVM: PPC: e500: use hardware hint when loading TLB0 entries The hardware maintains a per-set next victim hint. Using this reduces conflicts, especially on e500v2 where a single guest TLB entry is mapped to two shadow TLB entries (user and kernel). We want those two entries to go to different TLB ways. sesel is now only used for TLB1. Reported-by: Liu Yu Signed-off-by: Scott Wood Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/mmu-book3e.h | 5 ++- arch/powerpc/kvm/e500_tlb.c | 69 ++++++++++++++++++++--------------- 2 files changed, 42 insertions(+), 32 deletions(-) diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h index a0128232d9cc..cdb5421877e2 100644 --- a/arch/powerpc/include/asm/mmu-book3e.h +++ b/arch/powerpc/include/asm/mmu-book3e.h @@ -41,9 +41,10 @@ /* MAS registers bit definitions */ #define MAS0_TLBSEL(x) (((x) << 28) & 0x30000000) -#define MAS0_ESEL(x) (((x) << 16) & 0x0FFF0000) -#define MAS0_NV(x) ((x) & 0x00000FFF) #define MAS0_ESEL_MASK 0x0FFF0000 +#define MAS0_ESEL_SHIFT 16 +#define MAS0_ESEL(x) (((x) << MAS0_ESEL_SHIFT) & MAS0_ESEL_MASK) +#define MAS0_NV(x) ((x) & 0x00000FFF) #define MAS0_HES 0x00004000 #define MAS0_WQ_ALLWAYS 0x00000000 #define MAS0_WQ_COND 0x00001000 diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index 507376890cf8..1746e677bf37 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -296,17 +296,41 @@ static inline void __write_host_tlbe(struct kvm_book3e_206_tlb_entry *stlbe, local_irq_restore(flags); } -/* esel is index into set, not whole array */ +/* + * Acquire a mas0 with victim hint, as if we just took a TLB miss. + * + * We don't care about the address we're searching for, other than that it's + * in the right set and is not present in the TLB. Using a zero PID and a + * userspace address means we don't have to set and then restore MAS5, or + * calculate a proper MAS6 value. + */ +static u32 get_host_mas0(unsigned long eaddr) +{ + unsigned long flags; + u32 mas0; + + local_irq_save(flags); + mtspr(SPRN_MAS6, 0); + asm volatile("tlbsx 0, %0" : : "b" (eaddr & ~CONFIG_PAGE_OFFSET)); + mas0 = mfspr(SPRN_MAS0); + local_irq_restore(flags); + + return mas0; +} + +/* sesel is for tlb1 only */ static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500, - int tlbsel, int esel, struct kvm_book3e_206_tlb_entry *stlbe) + int tlbsel, int sesel, struct kvm_book3e_206_tlb_entry *stlbe) { + u32 mas0; + if (tlbsel == 0) { - int way = esel & (vcpu_e500->gtlb_params[0].ways - 1); - __write_host_tlbe(stlbe, MAS0_TLBSEL(0) | MAS0_ESEL(way)); + mas0 = get_host_mas0(stlbe->mas2); + __write_host_tlbe(stlbe, mas0); } else { __write_host_tlbe(stlbe, MAS0_TLBSEL(1) | - MAS0_ESEL(to_htlb1_esel(esel))); + MAS0_ESEL(to_htlb1_esel(sesel))); } trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2, (u32)stlbe->mas7_3, (u32)(stlbe->mas7_3 >> 32)); @@ -422,12 +446,6 @@ static int gtlb0_set_base(struct kvmppc_vcpu_e500 *vcpu_e500, gva_t addr) vcpu_e500->gtlb_params[0].ways); } -static int htlb0_set_base(gva_t addr) -{ - return tlb0_set_base(addr, host_tlb_params[0].sets, - host_tlb_params[0].ways); -} - static unsigned int get_tlb_esel(struct kvm_vcpu *vcpu, int tlbsel) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); @@ -585,10 +603,9 @@ static inline void kvmppc_e500_setup_stlbe( vcpu_e500->vcpu.arch.shared->msr & MSR_PR); } -/* sesel is an index into the entire array, not just the set */ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe, - int tlbsel, int sesel, struct kvm_book3e_206_tlb_entry *stlbe, + int tlbsel, struct kvm_book3e_206_tlb_entry *stlbe, struct tlbe_ref *ref) { struct kvm_memory_slot *slot; @@ -721,27 +738,19 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, } /* XXX only map the one-one case, for now use TLB0 */ -static int kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500, - int esel, - struct kvm_book3e_206_tlb_entry *stlbe) +static void kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500, + int esel, + struct kvm_book3e_206_tlb_entry *stlbe) { struct kvm_book3e_206_tlb_entry *gtlbe; struct tlbe_ref *ref; - int sesel = esel & (host_tlb_params[0].ways - 1); - int sesel_base; - gva_t ea; gtlbe = get_entry(vcpu_e500, 0, esel); ref = &vcpu_e500->gtlb_priv[0][esel].ref; - ea = get_tlb_eaddr(gtlbe); - sesel_base = htlb0_set_base(ea); - kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe), get_tlb_raddr(gtlbe) >> PAGE_SHIFT, - gtlbe, 0, sesel_base + sesel, stlbe, ref); - - return sesel; + gtlbe, 0, stlbe, ref); } /* Caller must ensure that the specified guest TLB entry is safe to insert into @@ -760,8 +769,7 @@ static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500, vcpu_e500->host_tlb1_nv = 0; ref = &vcpu_e500->tlb_refs[1][victim]; - kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, - victim, stlbe, ref); + kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, stlbe, ref); return victim; } @@ -910,7 +918,7 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) return EMULATE_DONE; } -/* sesel is index into the set, not the whole array */ +/* sesel is for tlb1 only */ static void write_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500, struct kvm_book3e_206_tlb_entry *gtlbe, struct kvm_book3e_206_tlb_entry *stlbe, @@ -963,7 +971,8 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K); stlbsel = 0; - sesel = kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe); + kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe); + sesel = 0; /* unused */ break; @@ -1052,7 +1061,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, switch (tlbsel) { case 0: stlbsel = 0; - sesel = esel & (host_tlb_params[0].ways - 1); + sesel = 0; /* unused */ priv = &vcpu_e500->gtlb_priv[tlbsel][esel]; kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, BOOK3E_PAGESZ_4K, -- cgit v1.2.3 From befdc0a65afd17181392eff3d43c63407f266a9f Mon Sep 17 00:00:00 2001 From: Liu Yu-B13201 Date: Thu, 1 Dec 2011 20:22:53 +0000 Subject: KVM: PPC: Avoid patching paravirt template code Currently we patch the whole code include paravirt template code. This isn't safe for scratch area and has impact to performance. Signed-off-by: Liu Yu Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kernel/kvm.c | 11 ++++++++++- arch/powerpc/kernel/kvm_emul.S | 6 ++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c index 04d4b5aa6dca..62bdf2389669 100644 --- a/arch/powerpc/kernel/kvm.c +++ b/arch/powerpc/kernel/kvm.c @@ -665,6 +665,9 @@ static void kvm_check_ins(u32 *inst, u32 features) } } +extern u32 kvm_template_start[]; +extern u32 kvm_template_end[]; + static void kvm_use_magic_page(void) { u32 *p; @@ -692,8 +695,14 @@ static void kvm_use_magic_page(void) */ local_irq_disable(); - for (p = start; p < end; p++) + for (p = start; p < end; p++) { + /* Avoid patching the template code */ + if (p >= kvm_template_start && p < kvm_template_end) { + p = kvm_template_end - 1; + continue; + } kvm_check_ins(p, features); + } local_irq_enable(); diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S index 801058dd74db..e291cf3cf954 100644 --- a/arch/powerpc/kernel/kvm_emul.S +++ b/arch/powerpc/kernel/kvm_emul.S @@ -66,6 +66,9 @@ kvm_hypercall_start: shared->critical == r1 and r2 is always != r1 */ \ STL64(r2, KVM_MAGIC_PAGE + KVM_MAGIC_CRITICAL, 0); +.global kvm_template_start +kvm_template_start: + .global kvm_emulate_mtmsrd kvm_emulate_mtmsrd: @@ -350,3 +353,6 @@ kvm_emulate_mtsrin_orig_ins_offs: .global kvm_emulate_mtsrin_len kvm_emulate_mtsrin_len: .long (kvm_emulate_mtsrin_end - kvm_emulate_mtsrin) / 4 + +.global kvm_template_end +kvm_template_end: -- cgit v1.2.3 From 4e72dbe13528394a413889d73e5025dbdf6cab70 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 12 Dec 2011 12:24:48 +0000 Subject: KVM: PPC: Make wakeups work again for Book3S HV guests When commit f43fdc15fa ("KVM: PPC: booke: Improve timer register emulation") factored out some code in arch/powerpc/kvm/powerpc.c into a new helper function, kvm_vcpu_kick(), an error crept in which causes Book3s HV guest vcpus to stall. This fixes it. On POWER7 machines, guest vcpus are grouped together into virtual CPU cores that share a single waitqueue, so it's important to use vcpu->arch.wqp rather than &vcpu->wq. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kvm/powerpc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index e1ef4d6d972a..4f85ac32258a 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -562,7 +562,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) int cpu = vcpu->cpu; me = get_cpu(); - if (waitqueue_active(&vcpu->wq)) { + if (waitqueue_active(vcpu->arch.wqp)) { wake_up_interruptible(vcpu->arch.wqp); vcpu->stat.halt_wakeup++; } else if (cpu != me && cpu != -1) { -- cgit v1.2.3 From 8936dda4c2ed070ecebd786baf35b08584accf4a Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 12 Dec 2011 12:27:39 +0000 Subject: KVM: PPC: Keep a record of HV guest view of hashed page table entries This adds an array that parallels the guest hashed page table (HPT), that is, it has one entry per HPTE, used to store the guest's view of the second doubleword of the corresponding HPTE. The first doubleword in the HPTE is the same as the guest's idea of it, so we don't need to store a copy, but the second doubleword in the HPTE has the real page number rather than the guest's logical page number. This allows us to remove the back_translate() and reverse_xlate() functions. This "reverse mapping" array is vmalloc'd, meaning that to access it in real mode we have to walk the kernel's page tables explicitly. That is done by the new real_vmalloc_addr() function. (In fact this returns an address in the linear mapping, so the result is usable both in real mode and in virtual mode.) There are also some minor cleanups here: moving the definitions of HPT_ORDER etc. to a header file and defining HPT_NPTE for HPT_NPTEG << 3. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_book3s_64.h | 8 +++ arch/powerpc/include/asm/kvm_host.h | 10 ++++ arch/powerpc/kvm/book3s_64_mmu_hv.c | 44 +++++++++++----- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 87 +++++++++++++++++++------------- 4 files changed, 103 insertions(+), 46 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 2054e4726ba2..fa3dc79af702 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -35,6 +35,14 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu) #define SPAPR_TCE_SHIFT 12 +#ifdef CONFIG_KVM_BOOK3S_64_HV +/* For now use fixed-size 16MB page table */ +#define HPT_ORDER 24 +#define HPT_NPTEG (1ul << (HPT_ORDER - 7)) /* 128B per pteg */ +#define HPT_NPTE (HPT_NPTEG << 3) /* 8 PTEs per PTEG */ +#define HPT_HASH_MASK (HPT_NPTEG - 1) +#endif + static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, unsigned long pte_index) { diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 66c75cddaec6..629df2ed22f7 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -166,9 +166,19 @@ struct kvmppc_rma_info { atomic_t use_count; }; +/* + * The reverse mapping array has one entry for each HPTE, + * which stores the guest's view of the second word of the HPTE + * (including the guest physical address of the mapping). + */ +struct revmap_entry { + unsigned long guest_rpte; +}; + struct kvm_arch { #ifdef CONFIG_KVM_BOOK3S_64_HV unsigned long hpt_virt; + struct revmap_entry *revmap; unsigned long ram_npages; unsigned long ram_psize; unsigned long ram_porder; diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index bc3a2ea94217..80ece8de4070 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -33,11 +34,6 @@ #include #include -/* For now use fixed-size 16MB page table */ -#define HPT_ORDER 24 -#define HPT_NPTEG (1ul << (HPT_ORDER - 7)) /* 128B per pteg */ -#define HPT_HASH_MASK (HPT_NPTEG - 1) - /* Pages in the VRMA are 16MB pages */ #define VRMA_PAGE_ORDER 24 #define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */ @@ -51,7 +47,9 @@ long kvmppc_alloc_hpt(struct kvm *kvm) { unsigned long hpt; unsigned long lpid; + struct revmap_entry *rev; + /* Allocate guest's hashed page table */ hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|__GFP_NOWARN, HPT_ORDER - PAGE_SHIFT); if (!hpt) { @@ -60,12 +58,20 @@ long kvmppc_alloc_hpt(struct kvm *kvm) } kvm->arch.hpt_virt = hpt; + /* Allocate reverse map array */ + rev = vmalloc(sizeof(struct revmap_entry) * HPT_NPTE); + if (!rev) { + pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n"); + goto out_freehpt; + } + kvm->arch.revmap = rev; + + /* Allocate the guest's logical partition ID */ do { lpid = find_first_zero_bit(lpid_inuse, NR_LPIDS); if (lpid >= NR_LPIDS) { pr_err("kvm_alloc_hpt: No LPIDs free\n"); - free_pages(hpt, HPT_ORDER - PAGE_SHIFT); - return -ENOMEM; + goto out_freeboth; } } while (test_and_set_bit(lpid, lpid_inuse)); @@ -74,11 +80,18 @@ long kvmppc_alloc_hpt(struct kvm *kvm) pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid); return 0; + + out_freeboth: + vfree(rev); + out_freehpt: + free_pages(hpt, HPT_ORDER - PAGE_SHIFT); + return -ENOMEM; } void kvmppc_free_hpt(struct kvm *kvm) { clear_bit(kvm->arch.lpid, lpid_inuse); + vfree(kvm->arch.revmap); free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT); } @@ -89,14 +102,16 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem) unsigned long pfn; unsigned long *hpte; unsigned long hash; + unsigned long porder = kvm->arch.ram_porder; + struct revmap_entry *rev; struct kvmppc_pginfo *pginfo = kvm->arch.ram_pginfo; if (!pginfo) return; /* VRMA can't be > 1TB */ - if (npages > 1ul << (40 - kvm->arch.ram_porder)) - npages = 1ul << (40 - kvm->arch.ram_porder); + if (npages > 1ul << (40 - porder)) + npages = 1ul << (40 - porder); /* Can't use more than 1 HPTE per HPTEG */ if (npages > HPT_NPTEG) npages = HPT_NPTEG; @@ -113,15 +128,20 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem) * at most one HPTE per HPTEG, we just assume entry 7 * is available and use it. */ - hpte = (unsigned long *) (kvm->arch.hpt_virt + (hash << 7)); - hpte += 7 * 2; + hash = (hash << 3) + 7; + hpte = (unsigned long *) (kvm->arch.hpt_virt + (hash << 4)); /* HPTE low word - RPN, protection, etc. */ hpte[1] = (pfn << PAGE_SHIFT) | HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX; - wmb(); + smp_wmb(); hpte[0] = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | (i << (VRMA_PAGE_ORDER - 16)) | HPTE_V_BOLTED | HPTE_V_LARGE | HPTE_V_VALID; + + /* Reverse map info */ + rev = &kvm->arch.revmap[hash]; + rev->guest_rpte = (i << porder) | HPTE_R_R | HPTE_R_C | + HPTE_R_M | PP_RWXX; } } diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index bacb0cfa3602..614849360a0a 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -20,10 +20,19 @@ #include #include -/* For now use fixed-size 16MB page table */ -#define HPT_ORDER 24 -#define HPT_NPTEG (1ul << (HPT_ORDER - 7)) /* 128B per pteg */ -#define HPT_HASH_MASK (HPT_NPTEG - 1) +/* Translate address of a vmalloc'd thing to a linear map address */ +static void *real_vmalloc_addr(void *x) +{ + unsigned long addr = (unsigned long) x; + pte_t *p; + + p = find_linux_pte(swapper_pg_dir, addr); + if (!p || !pte_present(*p)) + return NULL; + /* assume we don't have huge pages in vmalloc space... */ + addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK); + return __va(addr); +} #define HPTE_V_HVLOCK 0x40UL @@ -52,6 +61,8 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, struct kvm *kvm = vcpu->kvm; unsigned long i, lpn, pa; unsigned long *hpte; + struct revmap_entry *rev; + unsigned long g_ptel = ptel; /* only handle 4k, 64k and 16M pages for now */ porder = 12; @@ -82,7 +93,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, pteh &= ~0x60UL; ptel &= ~(HPTE_R_PP0 - kvm->arch.ram_psize); ptel |= pa; - if (pte_index >= (HPT_NPTEG << 3)) + if (pte_index >= HPT_NPTE) return H_PARAMETER; if (likely((flags & H_EXACT) == 0)) { pte_index &= ~7UL; @@ -95,18 +106,22 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, break; hpte += 2; } + pte_index += i; } else { - i = 0; hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); if (!lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) return H_PTEG_FULL; } + + /* Save away the guest's idea of the second HPTE dword */ + rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + if (rev) + rev->guest_rpte = g_ptel; hpte[1] = ptel; eieio(); hpte[0] = pteh; asm volatile("ptesync" : : : "memory"); - atomic_inc(&kvm->arch.ram_pginfo[lpn].refcnt); - vcpu->arch.gpr[4] = pte_index + i; + vcpu->arch.gpr[4] = pte_index; return H_SUCCESS; } @@ -138,7 +153,7 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long *hpte; unsigned long v, r, rb; - if (pte_index >= (HPT_NPTEG << 3)) + if (pte_index >= HPT_NPTE) return H_PARAMETER; hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); while (!lock_hpte(hpte, HPTE_V_HVLOCK)) @@ -193,7 +208,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) if (req == 3) break; if (req != 1 || flags == 3 || - pte_index >= (HPT_NPTEG << 3)) { + pte_index >= HPT_NPTE) { /* parameter error */ args[i * 2] = ((0xa0 | flags) << 56) + pte_index; ret = H_PARAMETER; @@ -256,9 +271,10 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, { struct kvm *kvm = vcpu->kvm; unsigned long *hpte; - unsigned long v, r, rb; + struct revmap_entry *rev; + unsigned long v, r, rb, mask, bits; - if (pte_index >= (HPT_NPTEG << 3)) + if (pte_index >= HPT_NPTE) return H_PARAMETER; hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); while (!lock_hpte(hpte, HPTE_V_HVLOCK)) @@ -271,11 +287,21 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, if (atomic_read(&kvm->online_vcpus) == 1) flags |= H_LOCAL; v = hpte[0]; - r = hpte[1] & ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | - HPTE_R_KEY_HI | HPTE_R_KEY_LO); - r |= (flags << 55) & HPTE_R_PP0; - r |= (flags << 48) & HPTE_R_KEY_HI; - r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO); + bits = (flags << 55) & HPTE_R_PP0; + bits |= (flags << 48) & HPTE_R_KEY_HI; + bits |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO); + + /* Update guest view of 2nd HPTE dword */ + mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | + HPTE_R_KEY_HI | HPTE_R_KEY_LO; + rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + if (rev) { + r = (rev->guest_rpte & ~mask) | bits; + rev->guest_rpte = r; + } + r = (hpte[1] & ~mask) | bits; + + /* Update HPTE */ rb = compute_tlbie_rb(v, r, pte_index); hpte[0] = v & ~HPTE_V_VALID; if (!(flags & H_LOCAL)) { @@ -298,38 +324,31 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, return H_SUCCESS; } -static unsigned long reverse_xlate(struct kvm *kvm, unsigned long realaddr) -{ - long int i; - unsigned long offset, rpn; - - offset = realaddr & (kvm->arch.ram_psize - 1); - rpn = (realaddr - offset) >> PAGE_SHIFT; - for (i = 0; i < kvm->arch.ram_npages; ++i) - if (rpn == kvm->arch.ram_pginfo[i].pfn) - return (i << PAGE_SHIFT) + offset; - return HPTE_R_RPN; /* all 1s in the RPN field */ -} - long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long pte_index) { struct kvm *kvm = vcpu->kvm; unsigned long *hpte, r; int i, n = 1; + struct revmap_entry *rev = NULL; - if (pte_index >= (HPT_NPTEG << 3)) + if (pte_index >= HPT_NPTE) return H_PARAMETER; if (flags & H_READ_4) { pte_index &= ~3; n = 4; } + if (flags & H_R_XLATE) + rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); for (i = 0; i < n; ++i, ++pte_index) { hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); r = hpte[1]; - if ((flags & H_R_XLATE) && (hpte[0] & HPTE_V_VALID)) - r = reverse_xlate(kvm, r & HPTE_R_RPN) | - (r & ~HPTE_R_RPN); + if (hpte[0] & HPTE_V_VALID) { + if (rev) + r = rev[i].guest_rpte; + else + r = hpte[1] | HPTE_R_RPN; + } vcpu->arch.gpr[4 + i * 2] = hpte[0]; vcpu->arch.gpr[5 + i * 2] = r; } -- cgit v1.2.3 From b2b2f16508de10bb1863bdd4ec1fa212111df5b4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 12 Dec 2011 12:28:21 +0000 Subject: KVM: PPC: Keep page physical addresses in per-slot arrays This allocates an array for each memory slot that is added to store the physical addresses of the pages in the slot. This array is vmalloc'd and accessed in kvmppc_h_enter using real_vmalloc_addr(). This allows us to remove the ram_pginfo field from the kvm_arch struct, and removes the 64GB guest RAM limit that we had. We use the low-order bits of the array entries to store a flag indicating that we have done get_page on the corresponding page, and therefore need to call put_page when we are finished with the page. Currently this is set for all pages except those in our special RMO regions. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_host.h | 9 ++- arch/powerpc/kvm/book3s_64_mmu_hv.c | 18 +++--- arch/powerpc/kvm/book3s_hv.c | 114 +++++++++++++++++------------------- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 41 +++++++++++-- 4 files changed, 107 insertions(+), 75 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 629df2ed22f7..7a17ab5b9058 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -38,6 +38,7 @@ #define KVM_MEMORY_SLOTS 32 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 4 +#define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) #ifdef CONFIG_KVM_MMIO #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 @@ -175,25 +176,27 @@ struct revmap_entry { unsigned long guest_rpte; }; +/* Low-order bits in kvm->arch.slot_phys[][] */ +#define KVMPPC_GOT_PAGE 0x80 + struct kvm_arch { #ifdef CONFIG_KVM_BOOK3S_64_HV unsigned long hpt_virt; struct revmap_entry *revmap; - unsigned long ram_npages; unsigned long ram_psize; unsigned long ram_porder; - struct kvmppc_pginfo *ram_pginfo; unsigned int lpid; unsigned int host_lpid; unsigned long host_lpcr; unsigned long sdr1; unsigned long host_sdr1; int tlbie_lock; - int n_rma_pages; unsigned long lpcr; unsigned long rmor; struct kvmppc_rma_info *rma; struct list_head spapr_tce_tables; + unsigned long *slot_phys[KVM_MEM_SLOTS_NUM]; + int slot_npages[KVM_MEM_SLOTS_NUM]; unsigned short last_vcpu[NR_CPUS]; struct kvmppc_vcore *vcores[KVM_MAX_VCORES]; #endif /* CONFIG_KVM_BOOK3S_64_HV */ diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 80ece8de4070..e4c60698f41a 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -98,16 +98,16 @@ void kvmppc_free_hpt(struct kvm *kvm) void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem) { unsigned long i; - unsigned long npages = kvm->arch.ram_npages; - unsigned long pfn; + unsigned long npages; + unsigned long pa; unsigned long *hpte; unsigned long hash; unsigned long porder = kvm->arch.ram_porder; struct revmap_entry *rev; - struct kvmppc_pginfo *pginfo = kvm->arch.ram_pginfo; + unsigned long *physp; - if (!pginfo) - return; + physp = kvm->arch.slot_phys[mem->slot]; + npages = kvm->arch.slot_npages[mem->slot]; /* VRMA can't be > 1TB */ if (npages > 1ul << (40 - porder)) @@ -117,9 +117,10 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem) npages = HPT_NPTEG; for (i = 0; i < npages; ++i) { - pfn = pginfo[i].pfn; - if (!pfn) + pa = physp[i]; + if (!pa) break; + pa &= PAGE_MASK; /* can't use hpt_hash since va > 64 bits */ hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK; /* @@ -131,8 +132,7 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem) hash = (hash << 3) + 7; hpte = (unsigned long *) (kvm->arch.hpt_virt + (hash << 4)); /* HPTE low word - RPN, protection, etc. */ - hpte[1] = (pfn << PAGE_SHIFT) | HPTE_R_R | HPTE_R_C | - HPTE_R_M | PP_RWXX; + hpte[1] = pa | HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX; smp_wmb(); hpte[0] = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | (i << (VRMA_PAGE_ORDER - 16)) | HPTE_V_BOLTED | diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index b1e3b9c1326a..a84fedc48d99 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -50,14 +50,6 @@ #include #include -/* - * For now, limit memory to 64GB and require it to be large pages. - * This value is chosen because it makes the ram_pginfo array be - * 64kB in size, which is about as large as we want to be trying - * to allocate with kmalloc. - */ -#define MAX_MEM_ORDER 36 - #define LARGE_PAGE_ORDER 24 /* 16MB pages */ /* #define EXIT_DEBUG */ @@ -147,10 +139,12 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu, unsigned long vcpuid, unsigned long vpa) { struct kvm *kvm = vcpu->kvm; - unsigned long pg_index, ra, len; + unsigned long gfn, pg_index, ra, len; unsigned long pg_offset; void *va; struct kvm_vcpu *tvcpu; + struct kvm_memory_slot *memslot; + unsigned long *physp; tvcpu = kvmppc_find_vcpu(kvm, vcpuid); if (!tvcpu) @@ -164,14 +158,20 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu, if (vpa & 0x7f) return H_PARAMETER; /* registering new area; convert logical addr to real */ - pg_index = vpa >> kvm->arch.ram_porder; - pg_offset = vpa & (kvm->arch.ram_psize - 1); - if (pg_index >= kvm->arch.ram_npages) + gfn = vpa >> PAGE_SHIFT; + memslot = gfn_to_memslot(kvm, gfn); + if (!memslot || !(memslot->flags & KVM_MEMSLOT_INVALID)) + return H_PARAMETER; + physp = kvm->arch.slot_phys[memslot->id]; + if (!physp) return H_PARAMETER; - if (kvm->arch.ram_pginfo[pg_index].pfn == 0) + pg_index = (gfn - memslot->base_gfn) >> + (kvm->arch.ram_porder - PAGE_SHIFT); + pg_offset = vpa & (kvm->arch.ram_psize - 1); + ra = physp[pg_index]; + if (!ra) return H_PARAMETER; - ra = kvm->arch.ram_pginfo[pg_index].pfn << PAGE_SHIFT; - ra |= pg_offset; + ra = (ra & PAGE_MASK) | pg_offset; va = __va(ra); if (flags <= 1) len = *(unsigned short *)(va + 4); @@ -1075,12 +1075,11 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem) { unsigned long psize, porder; - unsigned long i, npages, totalpages; - unsigned long pg_ix; - struct kvmppc_pginfo *pginfo; + unsigned long i, npages; unsigned long hva; struct kvmppc_rma_info *ri = NULL; struct page *page; + unsigned long *phys; /* For now, only allow 16MB pages */ porder = LARGE_PAGE_ORDER; @@ -1092,20 +1091,21 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, return -EINVAL; } + /* Allocate a slot_phys array */ npages = mem->memory_size >> porder; - totalpages = (mem->guest_phys_addr + mem->memory_size) >> porder; - - /* More memory than we have space to track? */ - if (totalpages > (1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER))) - return -EINVAL; + phys = kvm->arch.slot_phys[mem->slot]; + if (!phys) { + phys = vzalloc(npages * sizeof(unsigned long)); + if (!phys) + return -ENOMEM; + kvm->arch.slot_phys[mem->slot] = phys; + kvm->arch.slot_npages[mem->slot] = npages; + } /* Do we already have an RMA registered? */ if (mem->guest_phys_addr == 0 && kvm->arch.rma) return -EINVAL; - if (totalpages > kvm->arch.ram_npages) - kvm->arch.ram_npages = totalpages; - /* Is this one of our preallocated RMAs? */ if (mem->guest_phys_addr == 0) { struct vm_area_struct *vma; @@ -1138,7 +1138,6 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, } atomic_inc(&ri->use_count); kvm->arch.rma = ri; - kvm->arch.n_rma_pages = rma_size >> porder; /* Update LPCR and RMOR */ lpcr = kvm->arch.lpcr; @@ -1162,12 +1161,9 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, ri->base_pfn << PAGE_SHIFT, rma_size, lpcr); } - pg_ix = mem->guest_phys_addr >> porder; - pginfo = kvm->arch.ram_pginfo + pg_ix; - for (i = 0; i < npages; ++i, ++pg_ix) { - if (ri && pg_ix < kvm->arch.n_rma_pages) { - pginfo[i].pfn = ri->base_pfn + - (pg_ix << (porder - PAGE_SHIFT)); + for (i = 0; i < npages; ++i) { + if (ri && i < ri->npages) { + phys[i] = (ri->base_pfn << PAGE_SHIFT) + (i << porder); continue; } hva = mem->userspace_addr + (i << porder); @@ -1183,7 +1179,7 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, hva, compound_order(page)); goto err; } - pginfo[i].pfn = page_to_pfn(page); + phys[i] = (page_to_pfn(page) << PAGE_SHIFT) | KVMPPC_GOT_PAGE; } return 0; @@ -1192,6 +1188,28 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, return -EINVAL; } +static void unpin_slot(struct kvm *kvm, int slot_id) +{ + unsigned long *physp; + unsigned long j, npages, pfn; + struct page *page; + + physp = kvm->arch.slot_phys[slot_id]; + npages = kvm->arch.slot_npages[slot_id]; + if (physp) { + for (j = 0; j < npages; j++) { + if (!(physp[j] & KVMPPC_GOT_PAGE)) + continue; + pfn = physp[j] >> PAGE_SHIFT; + page = pfn_to_page(pfn); + SetPageDirty(page); + put_page(page); + } + vfree(physp); + kvm->arch.slot_phys[slot_id] = NULL; + } +} + void kvmppc_core_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem) { @@ -1203,8 +1221,6 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm, int kvmppc_core_init_vm(struct kvm *kvm) { long r; - unsigned long npages = 1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER); - long err = -ENOMEM; unsigned long lpcr; /* Allocate hashed page table */ @@ -1214,19 +1230,9 @@ int kvmppc_core_init_vm(struct kvm *kvm) INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); - kvm->arch.ram_pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo), - GFP_KERNEL); - if (!kvm->arch.ram_pginfo) { - pr_err("kvmppc_core_init_vm: couldn't alloc %lu bytes\n", - npages * sizeof(struct kvmppc_pginfo)); - goto out_free; - } - - kvm->arch.ram_npages = 0; kvm->arch.ram_psize = 1ul << LARGE_PAGE_ORDER; kvm->arch.ram_porder = LARGE_PAGE_ORDER; kvm->arch.rma = NULL; - kvm->arch.n_rma_pages = 0; kvm->arch.host_sdr1 = mfspr(SPRN_SDR1); @@ -1249,25 +1255,15 @@ int kvmppc_core_init_vm(struct kvm *kvm) kvm->arch.lpcr = lpcr; return 0; - - out_free: - kvmppc_free_hpt(kvm); - return err; } void kvmppc_core_destroy_vm(struct kvm *kvm) { - struct kvmppc_pginfo *pginfo; unsigned long i; - if (kvm->arch.ram_pginfo) { - pginfo = kvm->arch.ram_pginfo; - kvm->arch.ram_pginfo = NULL; - for (i = kvm->arch.n_rma_pages; i < kvm->arch.ram_npages; ++i) - if (pginfo[i].pfn) - put_page(pfn_to_page(pginfo[i].pfn)); - kfree(pginfo); - } + for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) + unpin_slot(kvm, i); + if (kvm->arch.rma) { kvm_release_rma(kvm->arch.rma); kvm->arch.rma = NULL; diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 614849360a0a..84dae821b230 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -20,6 +20,25 @@ #include #include +/* + * Since this file is built in even if KVM is a module, we need + * a local copy of this function for the case where kvm_main.c is + * modular. + */ +static struct kvm_memory_slot *builtin_gfn_to_memslot(struct kvm *kvm, + gfn_t gfn) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + + slots = kvm_memslots(kvm); + kvm_for_each_memslot(memslot, slots) + if (gfn >= memslot->base_gfn && + gfn < memslot->base_gfn + memslot->npages) + return memslot; + return NULL; +} + /* Translate address of a vmalloc'd thing to a linear map address */ static void *real_vmalloc_addr(void *x) { @@ -59,10 +78,12 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, { unsigned long porder; struct kvm *kvm = vcpu->kvm; - unsigned long i, lpn, pa; + unsigned long i, gfn, lpn, pa; unsigned long *hpte; struct revmap_entry *rev; unsigned long g_ptel = ptel; + struct kvm_memory_slot *memslot; + unsigned long *physp; /* only handle 4k, 64k and 16M pages for now */ porder = 12; @@ -80,12 +101,24 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, } else return H_PARAMETER; } - lpn = (ptel & HPTE_R_RPN) >> kvm->arch.ram_porder; - if (lpn >= kvm->arch.ram_npages || porder > kvm->arch.ram_porder) + if (porder > kvm->arch.ram_porder) return H_PARAMETER; - pa = kvm->arch.ram_pginfo[lpn].pfn << PAGE_SHIFT; + + gfn = ((ptel & HPTE_R_RPN) & ~((1ul << porder) - 1)) >> PAGE_SHIFT; + memslot = builtin_gfn_to_memslot(kvm, gfn); + if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) + return H_PARAMETER; + physp = kvm->arch.slot_phys[memslot->id]; + if (!physp) + return H_PARAMETER; + + lpn = (gfn - memslot->base_gfn) >> (kvm->arch.ram_porder - PAGE_SHIFT); + physp = real_vmalloc_addr(physp + lpn); + pa = *physp; if (!pa) return H_PARAMETER; + pa &= PAGE_MASK; + /* Check WIMG */ if ((ptel & HPTE_R_WIMG) != HPTE_R_M && (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M)) -- cgit v1.2.3 From 93e602490c1da83162a8b6ba86b4b48a7a0f0c9e Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 12 Dec 2011 12:28:55 +0000 Subject: KVM: PPC: Add an interface for pinning guest pages in Book3s HV guests This adds two new functions, kvmppc_pin_guest_page() and kvmppc_unpin_guest_page(), and uses them to pin the guest pages where the guest has registered areas of memory for the hypervisor to update, (i.e. the per-cpu virtual processor areas, SLB shadow buffers and dispatch trace logs) and then unpin them when they are no longer required. Although it is not strictly necessary to pin the pages at this point, since all guest pages are already pinned, later commits in this series will mean that guest pages aren't all pinned. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_book3s.h | 3 ++ arch/powerpc/kvm/book3s_64_mmu_hv.c | 38 ++++++++++++++++++++ arch/powerpc/kvm/book3s_hv.c | 67 +++++++++++++++++++---------------- 3 files changed, 78 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index c941c21a1893..bcf6f4f52a22 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -138,6 +138,9 @@ extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat, extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr); extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu); extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); +extern void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long addr, + unsigned long *nb_ret); +extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr); extern void kvmppc_entry_trampoline(void); extern void kvmppc_hv_entry_trampoline(void); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index e4c60698f41a..dcd39dc64f07 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -184,6 +184,44 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, return -ENOENT; } +void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, + unsigned long *nb_ret) +{ + struct kvm_memory_slot *memslot; + unsigned long gfn = gpa >> PAGE_SHIFT; + struct page *page; + unsigned long offset; + unsigned long pfn, pa; + unsigned long *physp; + + memslot = gfn_to_memslot(kvm, gfn); + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) + return NULL; + physp = kvm->arch.slot_phys[memslot->id]; + if (!physp) + return NULL; + physp += (gfn - memslot->base_gfn) >> + (kvm->arch.ram_porder - PAGE_SHIFT); + pa = *physp; + if (!pa) + return NULL; + pfn = pa >> PAGE_SHIFT; + page = pfn_to_page(pfn); + get_page(page); + offset = gpa & (kvm->arch.ram_psize - 1); + if (nb_ret) + *nb_ret = kvm->arch.ram_psize - offset; + return page_address(page) + offset; +} + +void kvmppc_unpin_guest_page(struct kvm *kvm, void *va) +{ + struct page *page = virt_to_page(va); + + page = compound_head(page); + put_page(page); +} + void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) { struct kvmppc_mmu *mmu = &vcpu->arch.mmu; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index a84fedc48d99..82d71388eace 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -139,12 +139,10 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu, unsigned long vcpuid, unsigned long vpa) { struct kvm *kvm = vcpu->kvm; - unsigned long gfn, pg_index, ra, len; - unsigned long pg_offset; + unsigned long len, nb; void *va; struct kvm_vcpu *tvcpu; - struct kvm_memory_slot *memslot; - unsigned long *physp; + int err = H_PARAMETER; tvcpu = kvmppc_find_vcpu(kvm, vcpuid); if (!tvcpu) @@ -157,51 +155,41 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu, if (flags < 4) { if (vpa & 0x7f) return H_PARAMETER; + if (flags >= 2 && !tvcpu->arch.vpa) + return H_RESOURCE; /* registering new area; convert logical addr to real */ - gfn = vpa >> PAGE_SHIFT; - memslot = gfn_to_memslot(kvm, gfn); - if (!memslot || !(memslot->flags & KVM_MEMSLOT_INVALID)) - return H_PARAMETER; - physp = kvm->arch.slot_phys[memslot->id]; - if (!physp) - return H_PARAMETER; - pg_index = (gfn - memslot->base_gfn) >> - (kvm->arch.ram_porder - PAGE_SHIFT); - pg_offset = vpa & (kvm->arch.ram_psize - 1); - ra = physp[pg_index]; - if (!ra) + va = kvmppc_pin_guest_page(kvm, vpa, &nb); + if (va == NULL) return H_PARAMETER; - ra = (ra & PAGE_MASK) | pg_offset; - va = __va(ra); if (flags <= 1) len = *(unsigned short *)(va + 4); else len = *(unsigned int *)(va + 4); - if (pg_offset + len > kvm->arch.ram_psize) - return H_PARAMETER; + if (len > nb) + goto out_unpin; switch (flags) { case 1: /* register VPA */ if (len < 640) - return H_PARAMETER; + goto out_unpin; + if (tvcpu->arch.vpa) + kvmppc_unpin_guest_page(kvm, vcpu->arch.vpa); tvcpu->arch.vpa = va; init_vpa(vcpu, va); break; case 2: /* register DTL */ if (len < 48) - return H_PARAMETER; - if (!tvcpu->arch.vpa) - return H_RESOURCE; + goto out_unpin; len -= len % 48; + if (tvcpu->arch.dtl) + kvmppc_unpin_guest_page(kvm, vcpu->arch.dtl); tvcpu->arch.dtl = va; tvcpu->arch.dtl_end = va + len; break; case 3: /* register SLB shadow buffer */ - if (len < 8) - return H_PARAMETER; - if (!tvcpu->arch.vpa) - return H_RESOURCE; - tvcpu->arch.slb_shadow = va; - len = (len - 16) / 16; + if (len < 16) + goto out_unpin; + if (tvcpu->arch.slb_shadow) + kvmppc_unpin_guest_page(kvm, vcpu->arch.slb_shadow); tvcpu->arch.slb_shadow = va; break; } @@ -210,17 +198,30 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu, case 5: /* unregister VPA */ if (tvcpu->arch.slb_shadow || tvcpu->arch.dtl) return H_RESOURCE; + if (!tvcpu->arch.vpa) + break; + kvmppc_unpin_guest_page(kvm, tvcpu->arch.vpa); tvcpu->arch.vpa = NULL; break; case 6: /* unregister DTL */ + if (!tvcpu->arch.dtl) + break; + kvmppc_unpin_guest_page(kvm, tvcpu->arch.dtl); tvcpu->arch.dtl = NULL; break; case 7: /* unregister SLB shadow buffer */ + if (!tvcpu->arch.slb_shadow) + break; + kvmppc_unpin_guest_page(kvm, tvcpu->arch.slb_shadow); tvcpu->arch.slb_shadow = NULL; break; } } return H_SUCCESS; + + out_unpin: + kvmppc_unpin_guest_page(kvm, va); + return err; } int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) @@ -470,6 +471,12 @@ out: void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) { + if (vcpu->arch.dtl) + kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.dtl); + if (vcpu->arch.slb_shadow) + kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.slb_shadow); + if (vcpu->arch.vpa) + kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.vpa); kvm_vcpu_uninit(vcpu); kfree(vcpu); } -- cgit v1.2.3 From 075295dd322b0c0de0c9ecf8e0cb19ee813438ed Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 12 Dec 2011 12:30:16 +0000 Subject: KVM: PPC: Make the H_ENTER hcall more reliable At present, our implementation of H_ENTER only makes one try at locking each slot that it looks at, and doesn't even retry the ldarx/stdcx. atomic update sequence that it uses to attempt to lock the slot. Thus it can return the H_PTEG_FULL error unnecessarily, particularly when the H_EXACT flag is set, meaning that the caller wants a specific PTEG slot. This improves the situation by making a second pass when no free HPTE slot is found, where we spin until we succeed in locking each slot in turn and then check whether it is full while we hold the lock. If the second pass fails, then we return H_PTEG_FULL. This also moves lock_hpte to a header file (since later commits in this series will need to use it from other source files) and renames it to try_lock_hpte, which is a somewhat less misleading name. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_book3s_64.h | 25 +++++++++++++ arch/powerpc/kvm/book3s_hv_rm_mmu.c | 63 +++++++++++++++++--------------- 2 files changed, 59 insertions(+), 29 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index fa3dc79af702..300ec04a8381 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -43,6 +43,31 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu) #define HPT_HASH_MASK (HPT_NPTEG - 1) #endif +/* + * We use a lock bit in HPTE dword 0 to synchronize updates and + * accesses to each HPTE, and another bit to indicate non-present + * HPTEs. + */ +#define HPTE_V_HVLOCK 0x40UL + +static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits) +{ + unsigned long tmp, old; + + asm volatile(" ldarx %0,0,%2\n" + " and. %1,%0,%3\n" + " bne 2f\n" + " ori %0,%0,%4\n" + " stdcx. %0,0,%2\n" + " beq+ 2f\n" + " li %1,%3\n" + "2: isync" + : "=&r" (tmp), "=&r" (old) + : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK) + : "cc", "memory"); + return old == 0; +} + static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, unsigned long pte_index) { diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 84dae821b230..a28a6030ec90 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -53,26 +53,6 @@ static void *real_vmalloc_addr(void *x) return __va(addr); } -#define HPTE_V_HVLOCK 0x40UL - -static inline long lock_hpte(unsigned long *hpte, unsigned long bits) -{ - unsigned long tmp, old; - - asm volatile(" ldarx %0,0,%2\n" - " and. %1,%0,%3\n" - " bne 2f\n" - " ori %0,%0,%4\n" - " stdcx. %0,0,%2\n" - " beq+ 2f\n" - " li %1,%3\n" - "2: isync" - : "=&r" (tmp), "=&r" (old) - : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK) - : "cc", "memory"); - return old == 0; -} - long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel) { @@ -126,24 +106,49 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, pteh &= ~0x60UL; ptel &= ~(HPTE_R_PP0 - kvm->arch.ram_psize); ptel |= pa; + if (pte_index >= HPT_NPTE) return H_PARAMETER; if (likely((flags & H_EXACT) == 0)) { pte_index &= ~7UL; hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); - for (i = 0; ; ++i) { - if (i == 8) - return H_PTEG_FULL; + for (i = 0; i < 8; ++i) { if ((*hpte & HPTE_V_VALID) == 0 && - lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) + try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) break; hpte += 2; } + if (i == 8) { + /* + * Since try_lock_hpte doesn't retry (not even stdcx. + * failures), it could be that there is a free slot + * but we transiently failed to lock it. Try again, + * actually locking each slot and checking it. + */ + hpte -= 16; + for (i = 0; i < 8; ++i) { + while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) + cpu_relax(); + if ((*hpte & HPTE_V_VALID) == 0) + break; + *hpte &= ~HPTE_V_HVLOCK; + hpte += 2; + } + if (i == 8) + return H_PTEG_FULL; + } pte_index += i; } else { hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); - if (!lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) - return H_PTEG_FULL; + if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) { + /* Lock the slot and check again */ + while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) + cpu_relax(); + if (*hpte & HPTE_V_VALID) { + *hpte &= ~HPTE_V_HVLOCK; + return H_PTEG_FULL; + } + } } /* Save away the guest's idea of the second HPTE dword */ @@ -189,7 +194,7 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, if (pte_index >= HPT_NPTE) return H_PARAMETER; hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); - while (!lock_hpte(hpte, HPTE_V_HVLOCK)) + while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); if ((hpte[0] & HPTE_V_VALID) == 0 || ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn) || @@ -248,7 +253,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) break; } hp = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); - while (!lock_hpte(hp, HPTE_V_HVLOCK)) + while (!try_lock_hpte(hp, HPTE_V_HVLOCK)) cpu_relax(); found = 0; if (hp[0] & HPTE_V_VALID) { @@ -310,7 +315,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, if (pte_index >= HPT_NPTE) return H_PARAMETER; hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); - while (!lock_hpte(hpte, HPTE_V_HVLOCK)) + while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); if ((hpte[0] & HPTE_V_VALID) == 0 || ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn)) { -- cgit v1.2.3 From c77162dee7aff6ab5f075da9b60f649cbbeb86cc Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 12 Dec 2011 12:31:00 +0000 Subject: KVM: PPC: Only get pages when actually needed, not in prepare_memory_region() This removes the code from kvmppc_core_prepare_memory_region() that looked up the VMA for the region being added and called hva_to_page to get the pfns for the memory. We have no guarantee that there will be anything mapped there at the time of the KVM_SET_USER_MEMORY_REGION ioctl call; userspace can do that ioctl and then map memory into the region later. Instead we defer looking up the pfn for each memory page until it is needed, which generally means when the guest does an H_ENTER hcall on the page. Since we can't call get_user_pages in real mode, if we don't already have the pfn for the page, kvmppc_h_enter() will return H_TOO_HARD and we then call kvmppc_virtmode_h_enter() once we get back to kernel context. That calls kvmppc_get_guest_page() to get the pfn for the page, and then calls back to kvmppc_h_enter() to redo the HPTE insertion. When the first vcpu starts executing, we need to have the RMO or VRMA region mapped so that the guest's real mode accesses will work. Thus we now have a check in kvmppc_vcpu_run() to see if the RMO/VRMA is set up and if not, call kvmppc_hv_setup_rma(). It checks if the memslot starting at guest physical 0 now has RMO memory mapped there; if so it sets it up for the guest, otherwise on POWER7 it sets up the VRMA. The function that does that, kvmppc_map_vrma, is now a bit simpler, as it calls kvmppc_virtmode_h_enter instead of creating the HPTE itself. Since we are now potentially updating entries in the slot_phys[] arrays from multiple vcpu threads, we now have a spinlock protecting those updates to ensure that we don't lose track of any references to pages. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_book3s.h | 4 + arch/powerpc/include/asm/kvm_book3s_64.h | 12 ++ arch/powerpc/include/asm/kvm_host.h | 2 + arch/powerpc/include/asm/kvm_ppc.h | 4 +- arch/powerpc/kvm/book3s_64_mmu_hv.c | 130 +++++++++++++--- arch/powerpc/kvm/book3s_hv.c | 244 +++++++++++++++++-------------- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 54 +++---- 7 files changed, 290 insertions(+), 160 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index bcf6f4f52a22..c700f43ba178 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -141,6 +141,10 @@ extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); extern void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long addr, unsigned long *nb_ret); extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr); +extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, + long pte_index, unsigned long pteh, unsigned long ptel); +extern long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, + long pte_index, unsigned long pteh, unsigned long ptel); extern void kvmppc_entry_trampoline(void); extern void kvmppc_hv_entry_trampoline(void); diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 300ec04a8381..7e6f2ede44ac 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -101,4 +101,16 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, return rb; } +static inline unsigned long hpte_page_size(unsigned long h, unsigned long l) +{ + /* only handle 4k, 64k and 16M pages for now */ + if (!(h & HPTE_V_LARGE)) + return 1ul << 12; /* 4k page */ + if ((l & 0xf000) == 0x1000 && cpu_has_feature(CPU_FTR_ARCH_206)) + return 1ul << 16; /* 64k page */ + if ((l & 0xff000) == 0) + return 1ul << 24; /* 16M page */ + return 0; /* error */ +} + #endif /* __ASM_KVM_BOOK3S_64_H__ */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 7a17ab5b9058..beb22ba71e26 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -194,7 +194,9 @@ struct kvm_arch { unsigned long lpcr; unsigned long rmor; struct kvmppc_rma_info *rma; + int rma_setup_done; struct list_head spapr_tce_tables; + spinlock_t slot_phys_lock; unsigned long *slot_phys[KVM_MEM_SLOTS_NUM]; int slot_npages[KVM_MEM_SLOTS_NUM]; unsigned short last_vcpu[NR_CPUS]; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 5192c2e70583..1458c6740ea3 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -121,8 +121,8 @@ extern long kvmppc_alloc_hpt(struct kvm *kvm); extern void kvmppc_free_hpt(struct kvm *kvm); extern long kvmppc_prepare_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem); -extern void kvmppc_map_vrma(struct kvm *kvm, - struct kvm_userspace_memory_region *mem); +extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu, + struct kvm_memory_slot *memslot); extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index dcd39dc64f07..87016ccd8648 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -95,19 +95,17 @@ void kvmppc_free_hpt(struct kvm *kvm) free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT); } -void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem) +void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot) { + struct kvm *kvm = vcpu->kvm; unsigned long i; unsigned long npages; - unsigned long pa; - unsigned long *hpte; - unsigned long hash; + unsigned long hp_v, hp_r; + unsigned long addr, hash; unsigned long porder = kvm->arch.ram_porder; - struct revmap_entry *rev; - unsigned long *physp; + long ret; - physp = kvm->arch.slot_phys[mem->slot]; - npages = kvm->arch.slot_npages[mem->slot]; + npages = kvm->arch.slot_npages[memslot->id]; /* VRMA can't be > 1TB */ if (npages > 1ul << (40 - porder)) @@ -117,10 +115,7 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem) npages = HPT_NPTEG; for (i = 0; i < npages; ++i) { - pa = physp[i]; - if (!pa) - break; - pa &= PAGE_MASK; + addr = i << porder; /* can't use hpt_hash since va > 64 bits */ hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK; /* @@ -130,18 +125,16 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem) * is available and use it. */ hash = (hash << 3) + 7; - hpte = (unsigned long *) (kvm->arch.hpt_virt + (hash << 4)); - /* HPTE low word - RPN, protection, etc. */ - hpte[1] = pa | HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX; - smp_wmb(); - hpte[0] = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | + hp_v = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | (i << (VRMA_PAGE_ORDER - 16)) | HPTE_V_BOLTED | HPTE_V_LARGE | HPTE_V_VALID; - - /* Reverse map info */ - rev = &kvm->arch.revmap[hash]; - rev->guest_rpte = (i << porder) | HPTE_R_R | HPTE_R_C | - HPTE_R_M | PP_RWXX; + hp_r = addr | HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX; + ret = kvmppc_virtmode_h_enter(vcpu, H_EXACT, hash, hp_v, hp_r); + if (ret != H_SUCCESS) { + pr_err("KVM: map_vrma at %lx failed, ret=%ld\n", + addr, ret); + break; + } } } @@ -178,6 +171,92 @@ static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu) kvmppc_set_msr(vcpu, MSR_SF | MSR_ME); } +/* + * This is called to get a reference to a guest page if there isn't + * one already in the kvm->arch.slot_phys[][] arrays. + */ +static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn, + struct kvm_memory_slot *memslot) +{ + unsigned long start; + long np; + struct page *page, *pages[1]; + unsigned long *physp; + unsigned long pfn, i; + + physp = kvm->arch.slot_phys[memslot->id]; + if (!physp) + return -EINVAL; + i = (gfn - memslot->base_gfn) >> (kvm->arch.ram_porder - PAGE_SHIFT); + if (physp[i]) + return 0; + + page = NULL; + start = gfn_to_hva_memslot(memslot, gfn); + + /* Instantiate and get the page we want access to */ + np = get_user_pages_fast(start, 1, 1, pages); + if (np != 1) + return -EINVAL; + page = pages[0]; + + /* Check it's a 16MB page */ + if (!PageHead(page) || + compound_order(page) != (kvm->arch.ram_porder - PAGE_SHIFT)) { + pr_err("page at %lx isn't 16MB (o=%d)\n", + start, compound_order(page)); + put_page(page); + return -EINVAL; + } + pfn = page_to_pfn(page); + + spin_lock(&kvm->arch.slot_phys_lock); + if (!physp[i]) + physp[i] = (pfn << PAGE_SHIFT) | KVMPPC_GOT_PAGE; + else + put_page(page); + spin_unlock(&kvm->arch.slot_phys_lock); + + return 0; +} + +/* + * We come here on a H_ENTER call from the guest when + * we don't have the requested page pinned already. + */ +long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, + long pte_index, unsigned long pteh, unsigned long ptel) +{ + struct kvm *kvm = vcpu->kvm; + unsigned long psize, gpa, gfn; + struct kvm_memory_slot *memslot; + long ret; + + psize = hpte_page_size(pteh, ptel); + if (!psize) + return H_PARAMETER; + + /* Find the memslot (if any) for this address */ + gpa = (ptel & HPTE_R_RPN) & ~(psize - 1); + gfn = gpa >> PAGE_SHIFT; + memslot = gfn_to_memslot(kvm, gfn); + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) + return H_PARAMETER; + if (kvmppc_get_guest_page(kvm, gfn, memslot) < 0) + return H_PARAMETER; + + preempt_disable(); + ret = kvmppc_h_enter(vcpu, flags, pte_index, pteh, ptel); + preempt_enable(); + if (ret == H_TOO_HARD) { + /* this can't happen */ + pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n"); + ret = H_RESOURCE; /* or something */ + } + return ret; + +} + static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *gpte, bool data) { @@ -203,8 +282,11 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, physp += (gfn - memslot->base_gfn) >> (kvm->arch.ram_porder - PAGE_SHIFT); pa = *physp; - if (!pa) - return NULL; + if (!pa) { + if (kvmppc_get_guest_page(kvm, gfn, memslot) < 0) + return NULL; + pa = *physp; + } pfn = pa >> PAGE_SHIFT; page = pfn_to_page(pfn); get_page(page); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 82d71388eace..ce5a13fb974b 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -49,6 +49,7 @@ #include #include #include +#include #define LARGE_PAGE_ORDER 24 /* 16MB pages */ @@ -57,6 +58,7 @@ /* #define EXIT_DEBUG_INT */ static void kvmppc_end_cede(struct kvm_vcpu *vcpu); +static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu); void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { @@ -231,6 +233,12 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) struct kvm_vcpu *tvcpu; switch (req) { + case H_ENTER: + ret = kvmppc_virtmode_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), + kvmppc_get_gpr(vcpu, 6), + kvmppc_get_gpr(vcpu, 7)); + break; case H_CEDE: break; case H_PROD: @@ -851,9 +859,12 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) return -EINTR; } - /* On PPC970, check that we have an RMA region */ - if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201)) - return -EPERM; + /* On the first time here, set up VRMA or RMA */ + if (!vcpu->kvm->arch.rma_setup_done) { + r = kvmppc_hv_setup_rma(vcpu); + if (r) + return r; + } flush_fp_to_thread(current); flush_altivec_to_thread(current); @@ -1063,34 +1074,15 @@ long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret) return fd; } -static struct page *hva_to_page(unsigned long addr) -{ - struct page *page[1]; - int npages; - - might_sleep(); - - npages = get_user_pages_fast(addr, 1, 1, page); - - if (unlikely(npages != 1)) - return 0; - - return page[0]; -} - int kvmppc_core_prepare_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem) { - unsigned long psize, porder; - unsigned long i, npages; - unsigned long hva; - struct kvmppc_rma_info *ri = NULL; - struct page *page; + unsigned long psize; + unsigned long npages; unsigned long *phys; - /* For now, only allow 16MB pages */ - porder = LARGE_PAGE_ORDER; - psize = 1ul << porder; + /* For now, only allow 16MB-aligned slots */ + psize = kvm->arch.ram_psize; if ((mem->memory_size & (psize - 1)) || (mem->guest_phys_addr & (psize - 1))) { pr_err("bad memory_size=%llx @ %llx\n", @@ -1099,7 +1091,7 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, } /* Allocate a slot_phys array */ - npages = mem->memory_size >> porder; + npages = mem->memory_size >> kvm->arch.ram_porder; phys = kvm->arch.slot_phys[mem->slot]; if (!phys) { phys = vzalloc(npages * sizeof(unsigned long)); @@ -1109,39 +1101,110 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, kvm->arch.slot_npages[mem->slot] = npages; } - /* Do we already have an RMA registered? */ - if (mem->guest_phys_addr == 0 && kvm->arch.rma) - return -EINVAL; + return 0; +} - /* Is this one of our preallocated RMAs? */ - if (mem->guest_phys_addr == 0) { - struct vm_area_struct *vma; - - down_read(¤t->mm->mmap_sem); - vma = find_vma(current->mm, mem->userspace_addr); - if (vma && vma->vm_file && - vma->vm_file->f_op == &kvm_rma_fops && - mem->userspace_addr == vma->vm_start) - ri = vma->vm_file->private_data; - up_read(¤t->mm->mmap_sem); - if (!ri && cpu_has_feature(CPU_FTR_ARCH_201)) { - pr_err("CPU requires an RMO\n"); - return -EINVAL; +static void unpin_slot(struct kvm *kvm, int slot_id) +{ + unsigned long *physp; + unsigned long j, npages, pfn; + struct page *page; + + physp = kvm->arch.slot_phys[slot_id]; + npages = kvm->arch.slot_npages[slot_id]; + if (physp) { + spin_lock(&kvm->arch.slot_phys_lock); + for (j = 0; j < npages; j++) { + if (!(physp[j] & KVMPPC_GOT_PAGE)) + continue; + pfn = physp[j] >> PAGE_SHIFT; + page = pfn_to_page(pfn); + SetPageDirty(page); + put_page(page); } + kvm->arch.slot_phys[slot_id] = NULL; + spin_unlock(&kvm->arch.slot_phys_lock); + vfree(physp); } +} + +void kvmppc_core_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem) +{ +} + +static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu) +{ + int err = 0; + struct kvm *kvm = vcpu->kvm; + struct kvmppc_rma_info *ri = NULL; + unsigned long hva; + struct kvm_memory_slot *memslot; + struct vm_area_struct *vma; + unsigned long lpcr; + unsigned long psize, porder; + unsigned long rma_size; + unsigned long rmls; + unsigned long *physp; + unsigned long i, npages, pa; + + mutex_lock(&kvm->lock); + if (kvm->arch.rma_setup_done) + goto out; /* another vcpu beat us to it */ - if (ri) { - unsigned long rma_size; - unsigned long lpcr; - long rmls; + /* Look up the memslot for guest physical address 0 */ + memslot = gfn_to_memslot(kvm, 0); - rma_size = ri->npages << PAGE_SHIFT; - if (rma_size > mem->memory_size) - rma_size = mem->memory_size; + /* We must have some memory at 0 by now */ + err = -EINVAL; + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) + goto out; + + /* Look up the VMA for the start of this memory slot */ + hva = memslot->userspace_addr; + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, hva); + if (!vma || vma->vm_start > hva || (vma->vm_flags & VM_IO)) + goto up_out; + + psize = vma_kernel_pagesize(vma); + if (psize != kvm->arch.ram_psize) + goto up_out; + + /* Is this one of our preallocated RMAs? */ + if (vma->vm_file && vma->vm_file->f_op == &kvm_rma_fops && + hva == vma->vm_start) + ri = vma->vm_file->private_data; + + up_read(¤t->mm->mmap_sem); + + if (!ri) { + /* On POWER7, use VRMA; on PPC970, give up */ + err = -EPERM; + if (cpu_has_feature(CPU_FTR_ARCH_201)) { + pr_err("KVM: CPU requires an RMO\n"); + goto out; + } + + /* Update VRMASD field in the LPCR */ + lpcr = kvm->arch.lpcr & ~(0x1fUL << LPCR_VRMASD_SH); + lpcr |= LPCR_VRMA_L; + kvm->arch.lpcr = lpcr; + + /* Create HPTEs in the hash page table for the VRMA */ + kvmppc_map_vrma(vcpu, memslot); + + } else { + /* Set up to use an RMO region */ + rma_size = ri->npages; + if (rma_size > memslot->npages) + rma_size = memslot->npages; + rma_size <<= PAGE_SHIFT; rmls = lpcr_rmls(rma_size); + err = -EINVAL; if (rmls < 0) { - pr_err("Can't use RMA of 0x%lx bytes\n", rma_size); - return -EINVAL; + pr_err("KVM: Can't use RMA of 0x%lx bytes\n", rma_size); + goto out; } atomic_inc(&ri->use_count); kvm->arch.rma = ri; @@ -1164,65 +1227,31 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT; } kvm->arch.lpcr = lpcr; - pr_info("Using RMO at %lx size %lx (LPCR = %lx)\n", + pr_info("KVM: Using RMO at %lx size %lx (LPCR = %lx)\n", ri->base_pfn << PAGE_SHIFT, rma_size, lpcr); - } - for (i = 0; i < npages; ++i) { - if (ri && i < ri->npages) { - phys[i] = (ri->base_pfn << PAGE_SHIFT) + (i << porder); - continue; - } - hva = mem->userspace_addr + (i << porder); - page = hva_to_page(hva); - if (!page) { - pr_err("oops, no pfn for hva %lx\n", hva); - goto err; - } - /* Check it's a 16MB page */ - if (!PageHead(page) || - compound_order(page) != (LARGE_PAGE_ORDER - PAGE_SHIFT)) { - pr_err("page at %lx isn't 16MB (o=%d)\n", - hva, compound_order(page)); - goto err; - } - phys[i] = (page_to_pfn(page) << PAGE_SHIFT) | KVMPPC_GOT_PAGE; + /* Initialize phys addrs of pages in RMO */ + porder = kvm->arch.ram_porder; + npages = rma_size >> porder; + pa = ri->base_pfn << PAGE_SHIFT; + physp = kvm->arch.slot_phys[memslot->id]; + spin_lock(&kvm->arch.slot_phys_lock); + for (i = 0; i < npages; ++i) + physp[i] = pa + (i << porder); + spin_unlock(&kvm->arch.slot_phys_lock); } - return 0; - - err: - return -EINVAL; -} - -static void unpin_slot(struct kvm *kvm, int slot_id) -{ - unsigned long *physp; - unsigned long j, npages, pfn; - struct page *page; - - physp = kvm->arch.slot_phys[slot_id]; - npages = kvm->arch.slot_npages[slot_id]; - if (physp) { - for (j = 0; j < npages; j++) { - if (!(physp[j] & KVMPPC_GOT_PAGE)) - continue; - pfn = physp[j] >> PAGE_SHIFT; - page = pfn_to_page(pfn); - SetPageDirty(page); - put_page(page); - } - vfree(physp); - kvm->arch.slot_phys[slot_id] = NULL; - } -} + /* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */ + smp_wmb(); + kvm->arch.rma_setup_done = 1; + err = 0; + out: + mutex_unlock(&kvm->lock); + return err; -void kvmppc_core_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem) -{ - if (mem->guest_phys_addr == 0 && mem->memory_size != 0 && - !kvm->arch.rma) - kvmppc_map_vrma(kvm, mem); + up_out: + up_read(¤t->mm->mmap_sem); + goto out; } int kvmppc_core_init_vm(struct kvm *kvm) @@ -1261,6 +1290,7 @@ int kvmppc_core_init_vm(struct kvm *kvm) } kvm->arch.lpcr = lpcr; + spin_lock_init(&kvm->arch.slot_phys_lock); return 0; } diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index a28a6030ec90..047c5e1fd70f 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -56,56 +57,54 @@ static void *real_vmalloc_addr(void *x) long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel) { - unsigned long porder; struct kvm *kvm = vcpu->kvm; - unsigned long i, gfn, lpn, pa; + unsigned long i, pa, gpa, gfn, psize; + unsigned long slot_fn; unsigned long *hpte; struct revmap_entry *rev; unsigned long g_ptel = ptel; struct kvm_memory_slot *memslot; - unsigned long *physp; + unsigned long *physp, pte_size; + bool realmode = vcpu->arch.vcore->vcore_state == VCORE_RUNNING; - /* only handle 4k, 64k and 16M pages for now */ - porder = 12; - if (pteh & HPTE_V_LARGE) { - if (cpu_has_feature(CPU_FTR_ARCH_206) && - (ptel & 0xf000) == 0x1000) { - /* 64k page */ - porder = 16; - } else if ((ptel & 0xff000) == 0) { - /* 16M page */ - porder = 24; - /* lowest AVA bit must be 0 for 16M pages */ - if (pteh & 0x80) - return H_PARAMETER; - } else - return H_PARAMETER; - } - if (porder > kvm->arch.ram_porder) + psize = hpte_page_size(pteh, ptel); + if (!psize) return H_PARAMETER; - gfn = ((ptel & HPTE_R_RPN) & ~((1ul << porder) - 1)) >> PAGE_SHIFT; + /* Find the memslot (if any) for this address */ + gpa = (ptel & HPTE_R_RPN) & ~(psize - 1); + gfn = gpa >> PAGE_SHIFT; memslot = builtin_gfn_to_memslot(kvm, gfn); if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) return H_PARAMETER; + slot_fn = gfn - memslot->base_gfn; + physp = kvm->arch.slot_phys[memslot->id]; if (!physp) return H_PARAMETER; - - lpn = (gfn - memslot->base_gfn) >> (kvm->arch.ram_porder - PAGE_SHIFT); - physp = real_vmalloc_addr(physp + lpn); + physp += slot_fn; + if (realmode) + physp = real_vmalloc_addr(physp); pa = *physp; if (!pa) - return H_PARAMETER; + return H_TOO_HARD; pa &= PAGE_MASK; + pte_size = kvm->arch.ram_psize; + if (pte_size < psize) + return H_PARAMETER; + if (pa && pte_size > psize) + pa |= gpa & (pte_size - 1); + + ptel &= ~(HPTE_R_PP0 - psize); + ptel |= pa; + /* Check WIMG */ if ((ptel & HPTE_R_WIMG) != HPTE_R_M && (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M)) return H_PARAMETER; pteh &= ~0x60UL; - ptel &= ~(HPTE_R_PP0 - kvm->arch.ram_psize); - ptel |= pa; + pteh |= HPTE_V_VALID; if (pte_index >= HPT_NPTE) return H_PARAMETER; @@ -162,6 +161,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, vcpu->arch.gpr[4] = pte_index; return H_SUCCESS; } +EXPORT_SYMBOL_GPL(kvmppc_h_enter); #define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token)) -- cgit v1.2.3 From da9d1d7f2875cc8c1ffbce8f3501d0b33f4e7a4d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 12 Dec 2011 12:31:41 +0000 Subject: KVM: PPC: Allow use of small pages to back Book3S HV guests This relaxes the requirement that the guest memory be provided as 16MB huge pages, allowing it to be provided as normal memory, i.e. in pages of PAGE_SIZE bytes (4k or 64k). To allow this, we index the kvm->arch.slot_phys[] arrays with a small page index, even if huge pages are being used, and use the low-order 5 bits of each entry to store the order of the enclosing page with respect to normal pages, i.e. log_2(enclosing_page_size / PAGE_SIZE). Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_book3s_64.h | 10 +++ arch/powerpc/include/asm/kvm_host.h | 3 +- arch/powerpc/include/asm/kvm_ppc.h | 2 +- arch/powerpc/include/asm/reg.h | 1 + arch/powerpc/kvm/book3s_64_mmu_hv.c | 122 +++++++++++++++++++++---------- arch/powerpc/kvm/book3s_hv.c | 57 ++++++++------- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 6 +- 7 files changed, 132 insertions(+), 69 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 7e6f2ede44ac..10920f7a2b8d 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -113,4 +113,14 @@ static inline unsigned long hpte_page_size(unsigned long h, unsigned long l) return 0; /* error */ } +static inline bool slot_is_aligned(struct kvm_memory_slot *memslot, + unsigned long pagesize) +{ + unsigned long mask = (pagesize >> PAGE_SHIFT) - 1; + + if (pagesize <= PAGE_SIZE) + return 1; + return !(memslot->base_gfn & mask) && !(memslot->npages & mask); +} + #endif /* __ASM_KVM_BOOK3S_64_H__ */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index beb22ba71e26..9252d5e3758d 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -177,14 +177,13 @@ struct revmap_entry { }; /* Low-order bits in kvm->arch.slot_phys[][] */ +#define KVMPPC_PAGE_ORDER_MASK 0x1f #define KVMPPC_GOT_PAGE 0x80 struct kvm_arch { #ifdef CONFIG_KVM_BOOK3S_64_HV unsigned long hpt_virt; struct revmap_entry *revmap; - unsigned long ram_psize; - unsigned long ram_porder; unsigned int lpid; unsigned int host_lpid; unsigned long host_lpcr; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 1458c6740ea3..fb70414db90c 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -122,7 +122,7 @@ extern void kvmppc_free_hpt(struct kvm *kvm); extern long kvmppc_prepare_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem); extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu, - struct kvm_memory_slot *memslot); + struct kvm_memory_slot *memslot, unsigned long porder); extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args); diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 7fdc2c0b7fa0..64447f6c049a 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -237,6 +237,7 @@ #define LPCR_ISL (1ul << (63-2)) #define LPCR_VC_SH (63-2) #define LPCR_DPFD_SH (63-11) +#define LPCR_VRMASD (0x1ful << (63-16)) #define LPCR_VRMA_L (1ul << (63-12)) #define LPCR_VRMA_LP0 (1ul << (63-15)) #define LPCR_VRMA_LP1 (1ul << (63-16)) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 87016ccd8648..cc18f3d67a57 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -34,8 +34,6 @@ #include #include -/* Pages in the VRMA are 16MB pages */ -#define VRMA_PAGE_ORDER 24 #define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */ /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ @@ -95,17 +93,31 @@ void kvmppc_free_hpt(struct kvm *kvm) free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT); } -void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot) +/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */ +static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize) +{ + return (pgsize > 0x1000) ? HPTE_V_LARGE : 0; +} + +/* Bits in second HPTE dword for pagesize 4k, 64k or 16M */ +static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize) +{ + return (pgsize == 0x10000) ? 0x1000 : 0; +} + +void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, + unsigned long porder) { - struct kvm *kvm = vcpu->kvm; unsigned long i; unsigned long npages; unsigned long hp_v, hp_r; unsigned long addr, hash; - unsigned long porder = kvm->arch.ram_porder; + unsigned long psize; + unsigned long hp0, hp1; long ret; - npages = kvm->arch.slot_npages[memslot->id]; + psize = 1ul << porder; + npages = memslot->npages >> (porder - PAGE_SHIFT); /* VRMA can't be > 1TB */ if (npages > 1ul << (40 - porder)) @@ -114,6 +126,11 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot) if (npages > HPT_NPTEG) npages = HPT_NPTEG; + hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | + HPTE_V_BOLTED | hpte0_pgsize_encoding(psize); + hp1 = hpte1_pgsize_encoding(psize) | + HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX; + for (i = 0; i < npages; ++i) { addr = i << porder; /* can't use hpt_hash since va > 64 bits */ @@ -125,10 +142,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot) * is available and use it. */ hash = (hash << 3) + 7; - hp_v = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | - (i << (VRMA_PAGE_ORDER - 16)) | HPTE_V_BOLTED | - HPTE_V_LARGE | HPTE_V_VALID; - hp_r = addr | HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX; + hp_v = hp0 | ((addr >> 16) & ~0x7fUL); + hp_r = hp1 | addr; ret = kvmppc_virtmode_h_enter(vcpu, H_EXACT, hash, hp_v, hp_r); if (ret != H_SUCCESS) { pr_err("KVM: map_vrma at %lx failed, ret=%ld\n", @@ -176,22 +191,25 @@ static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu) * one already in the kvm->arch.slot_phys[][] arrays. */ static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn, - struct kvm_memory_slot *memslot) + struct kvm_memory_slot *memslot, + unsigned long psize) { unsigned long start; - long np; - struct page *page, *pages[1]; + long np, err; + struct page *page, *hpage, *pages[1]; + unsigned long s, pgsize; unsigned long *physp; - unsigned long pfn, i; + unsigned int got, pgorder; + unsigned long pfn, i, npages; physp = kvm->arch.slot_phys[memslot->id]; if (!physp) return -EINVAL; - i = (gfn - memslot->base_gfn) >> (kvm->arch.ram_porder - PAGE_SHIFT); - if (physp[i]) + if (physp[gfn - memslot->base_gfn]) return 0; page = NULL; + pgsize = psize; start = gfn_to_hva_memslot(memslot, gfn); /* Instantiate and get the page we want access to */ @@ -199,25 +217,46 @@ static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn, if (np != 1) return -EINVAL; page = pages[0]; - - /* Check it's a 16MB page */ - if (!PageHead(page) || - compound_order(page) != (kvm->arch.ram_porder - PAGE_SHIFT)) { - pr_err("page at %lx isn't 16MB (o=%d)\n", - start, compound_order(page)); - put_page(page); - return -EINVAL; + got = KVMPPC_GOT_PAGE; + + /* See if this is a large page */ + s = PAGE_SIZE; + if (PageHuge(page)) { + hpage = compound_head(page); + s <<= compound_order(hpage); + /* Get the whole large page if slot alignment is ok */ + if (s > psize && slot_is_aligned(memslot, s) && + !(memslot->userspace_addr & (s - 1))) { + start &= ~(s - 1); + pgsize = s; + page = hpage; + } } + err = -EINVAL; + if (s < psize) + goto out; pfn = page_to_pfn(page); + npages = pgsize >> PAGE_SHIFT; + pgorder = __ilog2(npages); + physp += (gfn - memslot->base_gfn) & ~(npages - 1); spin_lock(&kvm->arch.slot_phys_lock); - if (!physp[i]) - physp[i] = (pfn << PAGE_SHIFT) | KVMPPC_GOT_PAGE; - else - put_page(page); + for (i = 0; i < npages; ++i) { + if (!physp[i]) { + physp[i] = ((pfn + i) << PAGE_SHIFT) + got + pgorder; + got = 0; + } + } spin_unlock(&kvm->arch.slot_phys_lock); + err = 0; - return 0; + out: + if (got) { + if (PageHuge(page)) + page = compound_head(page); + put_page(page); + } + return err; } /* @@ -242,7 +281,9 @@ long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, memslot = gfn_to_memslot(kvm, gfn); if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) return H_PARAMETER; - if (kvmppc_get_guest_page(kvm, gfn, memslot) < 0) + if (!slot_is_aligned(memslot, psize)) + return H_PARAMETER; + if (kvmppc_get_guest_page(kvm, gfn, memslot, psize) < 0) return H_PARAMETER; preempt_disable(); @@ -269,8 +310,8 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, struct kvm_memory_slot *memslot; unsigned long gfn = gpa >> PAGE_SHIFT; struct page *page; - unsigned long offset; - unsigned long pfn, pa; + unsigned long psize, offset; + unsigned long pa; unsigned long *physp; memslot = gfn_to_memslot(kvm, gfn); @@ -279,20 +320,23 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, physp = kvm->arch.slot_phys[memslot->id]; if (!physp) return NULL; - physp += (gfn - memslot->base_gfn) >> - (kvm->arch.ram_porder - PAGE_SHIFT); + physp += gfn - memslot->base_gfn; pa = *physp; if (!pa) { - if (kvmppc_get_guest_page(kvm, gfn, memslot) < 0) + if (kvmppc_get_guest_page(kvm, gfn, memslot, PAGE_SIZE) < 0) return NULL; pa = *physp; } - pfn = pa >> PAGE_SHIFT; - page = pfn_to_page(pfn); + page = pfn_to_page(pa >> PAGE_SHIFT); + psize = PAGE_SIZE; + if (PageHuge(page)) { + page = compound_head(page); + psize <<= compound_order(page); + } get_page(page); - offset = gpa & (kvm->arch.ram_psize - 1); + offset = gpa & (psize - 1); if (nb_ret) - *nb_ret = kvm->arch.ram_psize - offset; + *nb_ret = psize - offset; return page_address(page) + offset; } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index ce5a13fb974b..6ed0a84ef91c 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -51,8 +51,6 @@ #include #include -#define LARGE_PAGE_ORDER 24 /* 16MB pages */ - /* #define EXIT_DEBUG */ /* #define EXIT_DEBUG_SIMPLE */ /* #define EXIT_DEBUG_INT */ @@ -1074,24 +1072,26 @@ long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret) return fd; } +static unsigned long slb_pgsize_encoding(unsigned long psize) +{ + unsigned long senc = 0; + + if (psize > 0x1000) { + senc = SLB_VSID_L; + if (psize == 0x10000) + senc |= SLB_VSID_LP_01; + } + return senc; +} + int kvmppc_core_prepare_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem) { - unsigned long psize; unsigned long npages; unsigned long *phys; - /* For now, only allow 16MB-aligned slots */ - psize = kvm->arch.ram_psize; - if ((mem->memory_size & (psize - 1)) || - (mem->guest_phys_addr & (psize - 1))) { - pr_err("bad memory_size=%llx @ %llx\n", - mem->memory_size, mem->guest_phys_addr); - return -EINVAL; - } - /* Allocate a slot_phys array */ - npages = mem->memory_size >> kvm->arch.ram_porder; + npages = mem->memory_size >> PAGE_SHIFT; phys = kvm->arch.slot_phys[mem->slot]; if (!phys) { phys = vzalloc(npages * sizeof(unsigned long)); @@ -1119,6 +1119,8 @@ static void unpin_slot(struct kvm *kvm, int slot_id) continue; pfn = physp[j] >> PAGE_SHIFT; page = pfn_to_page(pfn); + if (PageHuge(page)) + page = compound_head(page); SetPageDirty(page); put_page(page); } @@ -1141,12 +1143,12 @@ static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu) unsigned long hva; struct kvm_memory_slot *memslot; struct vm_area_struct *vma; - unsigned long lpcr; + unsigned long lpcr, senc; unsigned long psize, porder; unsigned long rma_size; unsigned long rmls; unsigned long *physp; - unsigned long i, npages, pa; + unsigned long i, npages; mutex_lock(&kvm->lock); if (kvm->arch.rma_setup_done) @@ -1168,8 +1170,7 @@ static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu) goto up_out; psize = vma_kernel_pagesize(vma); - if (psize != kvm->arch.ram_psize) - goto up_out; + porder = __ilog2(psize); /* Is this one of our preallocated RMAs? */ if (vma->vm_file && vma->vm_file->f_op == &kvm_rma_fops && @@ -1186,13 +1187,20 @@ static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu) goto out; } + /* We can handle 4k, 64k or 16M pages in the VRMA */ + err = -EINVAL; + if (!(psize == 0x1000 || psize == 0x10000 || + psize == 0x1000000)) + goto out; + /* Update VRMASD field in the LPCR */ - lpcr = kvm->arch.lpcr & ~(0x1fUL << LPCR_VRMASD_SH); - lpcr |= LPCR_VRMA_L; + senc = slb_pgsize_encoding(psize); + lpcr = kvm->arch.lpcr & ~LPCR_VRMASD; + lpcr |= senc << (LPCR_VRMASD_SH - 4); kvm->arch.lpcr = lpcr; /* Create HPTEs in the hash page table for the VRMA */ - kvmppc_map_vrma(vcpu, memslot); + kvmppc_map_vrma(vcpu, memslot, porder); } else { /* Set up to use an RMO region */ @@ -1231,13 +1239,12 @@ static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu) ri->base_pfn << PAGE_SHIFT, rma_size, lpcr); /* Initialize phys addrs of pages in RMO */ - porder = kvm->arch.ram_porder; - npages = rma_size >> porder; - pa = ri->base_pfn << PAGE_SHIFT; + npages = ri->npages; + porder = __ilog2(npages); physp = kvm->arch.slot_phys[memslot->id]; spin_lock(&kvm->arch.slot_phys_lock); for (i = 0; i < npages; ++i) - physp[i] = pa + (i << porder); + physp[i] = ((ri->base_pfn + i) << PAGE_SHIFT) + porder; spin_unlock(&kvm->arch.slot_phys_lock); } @@ -1266,8 +1273,6 @@ int kvmppc_core_init_vm(struct kvm *kvm) INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); - kvm->arch.ram_psize = 1ul << LARGE_PAGE_ORDER; - kvm->arch.ram_porder = LARGE_PAGE_ORDER; kvm->arch.rma = NULL; kvm->arch.host_sdr1 = mfspr(SPRN_SDR1); diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 047c5e1fd70f..c086eb0fa992 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -77,6 +77,10 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, memslot = builtin_gfn_to_memslot(kvm, gfn); if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) return H_PARAMETER; + + /* Check if the requested page fits entirely in the memslot. */ + if (!slot_is_aligned(memslot, psize)) + return H_PARAMETER; slot_fn = gfn - memslot->base_gfn; physp = kvm->arch.slot_phys[memslot->id]; @@ -88,9 +92,9 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, pa = *physp; if (!pa) return H_TOO_HARD; + pte_size = PAGE_SIZE << (pa & KVMPPC_PAGE_ORDER_MASK); pa &= PAGE_MASK; - pte_size = kvm->arch.ram_psize; if (pte_size < psize) return H_PARAMETER; if (pa && pte_size > psize) -- cgit v1.2.3 From 9d0ef5ea043d1242897d15c71bd1a15da79b4a5d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 12 Dec 2011 12:32:27 +0000 Subject: KVM: PPC: Allow I/O mappings in memory slots This provides for the case where userspace maps an I/O device into the address range of a memory slot using a VM_PFNMAP mapping. In that case, we work out the pfn from vma->vm_pgoff, and record the cache enable bits from vma->vm_page_prot in two low-order bits in the slot_phys array entries. Then, in kvmppc_h_enter() we check that the cache bits in the HPTE that the guest wants to insert match the cache bits in the slot_phys array entry. However, we do allow the guest to create what it thinks is a non-cacheable or write-through mapping to memory that is actually cacheable, so that we can use normal system memory as part of an emulated device later on. In that case the actual HPTE we insert is a cacheable HPTE. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_book3s_64.h | 26 +++++++++++++ arch/powerpc/include/asm/kvm_host.h | 2 + arch/powerpc/kvm/book3s_64_mmu_hv.c | 65 +++++++++++++++++++++----------- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 15 ++++++-- 4 files changed, 84 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 10920f7a2b8d..18b590d261ff 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -113,6 +113,32 @@ static inline unsigned long hpte_page_size(unsigned long h, unsigned long l) return 0; /* error */ } +static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type) +{ + unsigned int wimg = ptel & HPTE_R_WIMG; + + /* Handle SAO */ + if (wimg == (HPTE_R_W | HPTE_R_I | HPTE_R_M) && + cpu_has_feature(CPU_FTR_ARCH_206)) + wimg = HPTE_R_M; + + if (!io_type) + return wimg == HPTE_R_M; + + return (wimg & (HPTE_R_W | HPTE_R_I)) == io_type; +} + +/* Return HPTE cache control bits corresponding to Linux pte bits */ +static inline unsigned long hpte_cache_bits(unsigned long pte_val) +{ +#if _PAGE_NO_CACHE == HPTE_R_I && _PAGE_WRITETHRU == HPTE_R_W + return pte_val & (HPTE_R_W | HPTE_R_I); +#else + return ((pte_val & _PAGE_NO_CACHE) ? HPTE_R_I : 0) + + ((pte_val & _PAGE_WRITETHRU) ? HPTE_R_W : 0); +#endif +} + static inline bool slot_is_aligned(struct kvm_memory_slot *memslot, unsigned long pagesize) { diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 9252d5e3758d..243bc8038572 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -178,6 +178,8 @@ struct revmap_entry { /* Low-order bits in kvm->arch.slot_phys[][] */ #define KVMPPC_PAGE_ORDER_MASK 0x1f +#define KVMPPC_PAGE_NO_CACHE HPTE_R_I /* 0x20 */ +#define KVMPPC_PAGE_WRITETHRU HPTE_R_W /* 0x40 */ #define KVMPPC_GOT_PAGE 0x80 struct kvm_arch { diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index cc18f3d67a57..b904c40a17bc 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -199,7 +199,8 @@ static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn, struct page *page, *hpage, *pages[1]; unsigned long s, pgsize; unsigned long *physp; - unsigned int got, pgorder; + unsigned int is_io, got, pgorder; + struct vm_area_struct *vma; unsigned long pfn, i, npages; physp = kvm->arch.slot_phys[memslot->id]; @@ -208,34 +209,51 @@ static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn, if (physp[gfn - memslot->base_gfn]) return 0; + is_io = 0; + got = 0; page = NULL; pgsize = psize; + err = -EINVAL; start = gfn_to_hva_memslot(memslot, gfn); /* Instantiate and get the page we want access to */ np = get_user_pages_fast(start, 1, 1, pages); - if (np != 1) - return -EINVAL; - page = pages[0]; - got = KVMPPC_GOT_PAGE; + if (np != 1) { + /* Look up the vma for the page */ + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, start); + if (!vma || vma->vm_start > start || + start + psize > vma->vm_end || + !(vma->vm_flags & VM_PFNMAP)) + goto up_err; + is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot)); + pfn = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + /* check alignment of pfn vs. requested page size */ + if (psize > PAGE_SIZE && (pfn & ((psize >> PAGE_SHIFT) - 1))) + goto up_err; + up_read(¤t->mm->mmap_sem); - /* See if this is a large page */ - s = PAGE_SIZE; - if (PageHuge(page)) { - hpage = compound_head(page); - s <<= compound_order(hpage); - /* Get the whole large page if slot alignment is ok */ - if (s > psize && slot_is_aligned(memslot, s) && - !(memslot->userspace_addr & (s - 1))) { - start &= ~(s - 1); - pgsize = s; - page = hpage; + } else { + page = pages[0]; + got = KVMPPC_GOT_PAGE; + + /* See if this is a large page */ + s = PAGE_SIZE; + if (PageHuge(page)) { + hpage = compound_head(page); + s <<= compound_order(hpage); + /* Get the whole large page if slot alignment is ok */ + if (s > psize && slot_is_aligned(memslot, s) && + !(memslot->userspace_addr & (s - 1))) { + start &= ~(s - 1); + pgsize = s; + page = hpage; + } } + if (s < psize) + goto out; + pfn = page_to_pfn(page); } - err = -EINVAL; - if (s < psize) - goto out; - pfn = page_to_pfn(page); npages = pgsize >> PAGE_SHIFT; pgorder = __ilog2(npages); @@ -243,7 +261,8 @@ static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn, spin_lock(&kvm->arch.slot_phys_lock); for (i = 0; i < npages; ++i) { if (!physp[i]) { - physp[i] = ((pfn + i) << PAGE_SHIFT) + got + pgorder; + physp[i] = ((pfn + i) << PAGE_SHIFT) + + got + is_io + pgorder; got = 0; } } @@ -257,6 +276,10 @@ static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn, put_page(page); } return err; + + up_err: + up_read(¤t->mm->mmap_sem); + return err; } /* diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index c086eb0fa992..3f5b016490d0 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -65,6 +65,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long g_ptel = ptel; struct kvm_memory_slot *memslot; unsigned long *physp, pte_size; + unsigned long is_io; bool realmode = vcpu->arch.vcore->vcore_state == VCORE_RUNNING; psize = hpte_page_size(pteh, ptel); @@ -92,6 +93,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, pa = *physp; if (!pa) return H_TOO_HARD; + is_io = pa & (HPTE_R_I | HPTE_R_W); pte_size = PAGE_SIZE << (pa & KVMPPC_PAGE_ORDER_MASK); pa &= PAGE_MASK; @@ -104,9 +106,16 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, ptel |= pa; /* Check WIMG */ - if ((ptel & HPTE_R_WIMG) != HPTE_R_M && - (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M)) - return H_PARAMETER; + if (!hpte_cache_flags_ok(ptel, is_io)) { + if (is_io) + return H_PARAMETER; + /* + * Allow guest to map emulated device memory as + * uncacheable, but actually make it cacheable. + */ + ptel &= ~(HPTE_R_W|HPTE_R_I|HPTE_R_G); + ptel |= HPTE_R_M; + } pteh &= ~0x60UL; pteh |= HPTE_V_VALID; -- cgit v1.2.3 From 06ce2c63d933e347f8a199f123a8a293619ab3d2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 12 Dec 2011 12:33:07 +0000 Subject: KVM: PPC: Maintain a doubly-linked list of guest HPTEs for each gfn This expands the reverse mapping array to contain two links for each HPTE which are used to link together HPTEs that correspond to the same guest logical page. Each circular list of HPTEs is pointed to by the rmap array entry for the guest logical page, pointed to by the relevant memslot. Links are 32-bit HPT entry indexes rather than full 64-bit pointers, to save space. We use 3 of the remaining 32 bits in the rmap array entries as a lock bit, a referenced bit and a present bit (the present bit is needed since HPTE index 0 is valid). The bit lock for the rmap chain nests inside the HPTE lock bit. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_book3s_64.h | 18 +++++++ arch/powerpc/include/asm/kvm_host.h | 17 ++++++- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 84 +++++++++++++++++++++++++++++++- 3 files changed, 117 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 18b590d261ff..9508c03e6671 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -113,6 +113,11 @@ static inline unsigned long hpte_page_size(unsigned long h, unsigned long l) return 0; /* error */ } +static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize) +{ + return ((ptel & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT; +} + static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type) { unsigned int wimg = ptel & HPTE_R_WIMG; @@ -139,6 +144,19 @@ static inline unsigned long hpte_cache_bits(unsigned long pte_val) #endif } +static inline void lock_rmap(unsigned long *rmap) +{ + do { + while (test_bit(KVMPPC_RMAP_LOCK_BIT, rmap)) + cpu_relax(); + } while (test_and_set_bit_lock(KVMPPC_RMAP_LOCK_BIT, rmap)); +} + +static inline void unlock_rmap(unsigned long *rmap) +{ + __clear_bit_unlock(KVMPPC_RMAP_LOCK_BIT, rmap); +} + static inline bool slot_is_aligned(struct kvm_memory_slot *memslot, unsigned long pagesize) { diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 243bc8038572..97cb2d7865f3 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -170,12 +170,27 @@ struct kvmppc_rma_info { /* * The reverse mapping array has one entry for each HPTE, * which stores the guest's view of the second word of the HPTE - * (including the guest physical address of the mapping). + * (including the guest physical address of the mapping), + * plus forward and backward pointers in a doubly-linked ring + * of HPTEs that map the same host page. The pointers in this + * ring are 32-bit HPTE indexes, to save space. */ struct revmap_entry { unsigned long guest_rpte; + unsigned int forw, back; }; +/* + * We use the top bit of each memslot->rmap entry as a lock bit, + * and bit 32 as a present flag. The bottom 32 bits are the + * index in the guest HPT of a HPTE that points to the page. + */ +#define KVMPPC_RMAP_LOCK_BIT 63 +#define KVMPPC_RMAP_REF_BIT 33 +#define KVMPPC_RMAP_REFERENCED (1ul << KVMPPC_RMAP_REF_BIT) +#define KVMPPC_RMAP_PRESENT 0x100000000ul +#define KVMPPC_RMAP_INDEX 0xfffffffful + /* Low-order bits in kvm->arch.slot_phys[][] */ #define KVMPPC_PAGE_ORDER_MASK 0x1f #define KVMPPC_PAGE_NO_CACHE HPTE_R_I /* 0x20 */ diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 3f5b016490d0..5b31caa4b314 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -54,6 +54,70 @@ static void *real_vmalloc_addr(void *x) return __va(addr); } +/* + * Add this HPTE into the chain for the real page. + * Must be called with the chain locked; it unlocks the chain. + */ +static void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, + unsigned long *rmap, long pte_index, int realmode) +{ + struct revmap_entry *head, *tail; + unsigned long i; + + if (*rmap & KVMPPC_RMAP_PRESENT) { + i = *rmap & KVMPPC_RMAP_INDEX; + head = &kvm->arch.revmap[i]; + if (realmode) + head = real_vmalloc_addr(head); + tail = &kvm->arch.revmap[head->back]; + if (realmode) + tail = real_vmalloc_addr(tail); + rev->forw = i; + rev->back = head->back; + tail->forw = pte_index; + head->back = pte_index; + } else { + rev->forw = rev->back = pte_index; + i = pte_index; + } + smp_wmb(); + *rmap = i | KVMPPC_RMAP_REFERENCED | KVMPPC_RMAP_PRESENT; /* unlock */ +} + +/* Remove this HPTE from the chain for a real page */ +static void remove_revmap_chain(struct kvm *kvm, long pte_index, + unsigned long hpte_v) +{ + struct revmap_entry *rev, *next, *prev; + unsigned long gfn, ptel, head; + struct kvm_memory_slot *memslot; + unsigned long *rmap; + + rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + ptel = rev->guest_rpte; + gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel)); + memslot = builtin_gfn_to_memslot(kvm, gfn); + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) + return; + + rmap = real_vmalloc_addr(&memslot->rmap[gfn - memslot->base_gfn]); + lock_rmap(rmap); + + head = *rmap & KVMPPC_RMAP_INDEX; + next = real_vmalloc_addr(&kvm->arch.revmap[rev->forw]); + prev = real_vmalloc_addr(&kvm->arch.revmap[rev->back]); + next->back = rev->back; + prev->forw = rev->forw; + if (head == pte_index) { + head = rev->forw; + if (head == pte_index) + *rmap &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX); + else + *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | head; + } + unlock_rmap(rmap); +} + long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel) { @@ -66,6 +130,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, struct kvm_memory_slot *memslot; unsigned long *physp, pte_size; unsigned long is_io; + unsigned long *rmap; bool realmode = vcpu->arch.vcore->vcore_state == VCORE_RUNNING; psize = hpte_page_size(pteh, ptel); @@ -83,6 +148,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, if (!slot_is_aligned(memslot, psize)) return H_PARAMETER; slot_fn = gfn - memslot->base_gfn; + rmap = &memslot->rmap[slot_fn]; physp = kvm->arch.slot_phys[memslot->id]; if (!physp) @@ -164,13 +230,25 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, } /* Save away the guest's idea of the second HPTE dword */ - rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + rev = &kvm->arch.revmap[pte_index]; + if (realmode) + rev = real_vmalloc_addr(rev); if (rev) rev->guest_rpte = g_ptel; + + /* Link HPTE into reverse-map chain */ + if (realmode) + rmap = real_vmalloc_addr(rmap); + lock_rmap(rmap); + kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index, realmode); + hpte[1] = ptel; + + /* Write the first HPTE dword, unlocking the HPTE and making it valid */ eieio(); hpte[0] = pteh; asm volatile("ptesync" : : : "memory"); + vcpu->arch.gpr[4] = pte_index; return H_SUCCESS; } @@ -220,6 +298,8 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, vcpu->arch.gpr[4] = v = hpte[0] & ~HPTE_V_HVLOCK; vcpu->arch.gpr[5] = r = hpte[1]; rb = compute_tlbie_rb(v, r, pte_index); + remove_revmap_chain(kvm, pte_index, v); + smp_wmb(); hpte[0] = 0; if (!(flags & H_LOCAL)) { while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) @@ -293,6 +373,8 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) flags |= (hp[1] >> 5) & 0x0c; args[i * 2] = ((0x80 | flags) << 56) + pte_index; tlbrb[n_inval++] = compute_tlbie_rb(hp[0], hp[1], pte_index); + remove_revmap_chain(kvm, pte_index, hp[0]); + smp_wmb(); hp[0] = 0; } if (n_inval == 0) -- cgit v1.2.3 From 697d3899dcb4bcd918d060a92db57b794e56b077 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 12 Dec 2011 12:36:37 +0000 Subject: KVM: PPC: Implement MMIO emulation support for Book3S HV guests This provides the low-level support for MMIO emulation in Book3S HV guests. When the guest tries to map a page which is not covered by any memslot, that page is taken to be an MMIO emulation page. Instead of inserting a valid HPTE, we insert an HPTE that has the valid bit clear but another hypervisor software-use bit set, which we call HPTE_V_ABSENT, to indicate that this is an absent page. An absent page is treated much like a valid page as far as guest hcalls (H_ENTER, H_REMOVE, H_READ etc.) are concerned, except of course that an absent HPTE doesn't need to be invalidated with tlbie since it was never valid as far as the hardware is concerned. When the guest accesses a page for which there is an absent HPTE, it will take a hypervisor data storage interrupt (HDSI) since we now set the VPM1 bit in the LPCR. Our HDSI handler for HPTE-not-present faults looks up the hash table and if it finds an absent HPTE mapping the requested virtual address, will switch to kernel mode and handle the fault in kvmppc_book3s_hv_page_fault(), which at present just calls kvmppc_hv_emulate_mmio() to set up the MMIO emulation. This is based on an earlier patch by Benjamin Herrenschmidt, but since heavily reworked. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_book3s.h | 5 + arch/powerpc/include/asm/kvm_book3s_64.h | 26 +++ arch/powerpc/include/asm/kvm_host.h | 5 + arch/powerpc/include/asm/mmu-hash64.h | 2 +- arch/powerpc/include/asm/ppc-opcode.h | 4 +- arch/powerpc/include/asm/reg.h | 1 + arch/powerpc/kernel/asm-offsets.c | 1 + arch/powerpc/kernel/exceptions-64s.S | 8 +- arch/powerpc/kvm/book3s_64_mmu_hv.c | 228 +++++++++++++++++++++++++-- arch/powerpc/kvm/book3s_hv.c | 21 +-- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 262 ++++++++++++++++++++++++++----- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 127 ++++++++++++--- 12 files changed, 607 insertions(+), 83 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index c700f43ba178..3a9e51f43397 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -119,6 +119,11 @@ extern void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu); extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte); extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr); extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu); +extern int kvmppc_book3s_hv_page_fault(struct kvm_run *run, + struct kvm_vcpu *vcpu, unsigned long addr, + unsigned long status); +extern long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, + unsigned long slb_v, unsigned long valid); extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte); extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu); diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 9508c03e6671..79dc37fb86b5 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -43,12 +43,15 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu) #define HPT_HASH_MASK (HPT_NPTEG - 1) #endif +#define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */ + /* * We use a lock bit in HPTE dword 0 to synchronize updates and * accesses to each HPTE, and another bit to indicate non-present * HPTEs. */ #define HPTE_V_HVLOCK 0x40UL +#define HPTE_V_ABSENT 0x20UL static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits) { @@ -144,6 +147,29 @@ static inline unsigned long hpte_cache_bits(unsigned long pte_val) #endif } +static inline bool hpte_read_permission(unsigned long pp, unsigned long key) +{ + if (key) + return PP_RWRX <= pp && pp <= PP_RXRX; + return 1; +} + +static inline bool hpte_write_permission(unsigned long pp, unsigned long key) +{ + if (key) + return pp == PP_RWRW; + return pp <= PP_RWRW; +} + +static inline int hpte_get_skey_perm(unsigned long hpte_r, unsigned long amr) +{ + unsigned long skey; + + skey = ((hpte_r & HPTE_R_KEY_HI) >> 57) | + ((hpte_r & HPTE_R_KEY_LO) >> 9); + return (amr >> (62 - 2 * skey)) & 3; +} + static inline void lock_rmap(unsigned long *rmap) { do { diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 97cb2d7865f3..937cacaaf236 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -210,6 +210,7 @@ struct kvm_arch { unsigned long lpcr; unsigned long rmor; struct kvmppc_rma_info *rma; + unsigned long vrma_slb_v; int rma_setup_done; struct list_head spapr_tce_tables; spinlock_t slot_phys_lock; @@ -452,6 +453,10 @@ struct kvm_vcpu_arch { #ifdef CONFIG_KVM_BOOK3S_64_HV struct kvm_vcpu_arch_shared shregs; + unsigned long pgfault_addr; + long pgfault_index; + unsigned long pgfault_hpte[2]; + struct list_head run_list; struct task_struct *run_task; struct kvm_run *kvm_run; diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h index 412ba493cb98..0759dd8bf5aa 100644 --- a/arch/powerpc/include/asm/mmu-hash64.h +++ b/arch/powerpc/include/asm/mmu-hash64.h @@ -108,11 +108,11 @@ extern char initial_stab[]; #define HPTE_V_VRMA_MASK ASM_CONST(0x4001ffffff000000) /* Values for PP (assumes Ks=0, Kp=1) */ -/* pp0 will always be 0 for linux */ #define PP_RWXX 0 /* Supervisor read/write, User none */ #define PP_RWRX 1 /* Supervisor read/write, User read */ #define PP_RWRW 2 /* Supervisor read/write, User read/write */ #define PP_RXRX 3 /* Supervisor read, User read */ +#define PP_RXXX (HPTE_R_PP0 | 2) /* Supervisor read, user none */ #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index e980faae4225..d81f99430fe7 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -45,6 +45,7 @@ #define PPC_INST_MFSPR_DSCR_MASK 0xfc1fffff #define PPC_INST_MTSPR_DSCR 0x7c1103a6 #define PPC_INST_MTSPR_DSCR_MASK 0xfc1fffff +#define PPC_INST_SLBFEE 0x7c0007a7 #define PPC_INST_STRING 0x7c00042a #define PPC_INST_STRING_MASK 0xfc0007fe @@ -183,7 +184,8 @@ __PPC_RS(t) | __PPC_RA(a) | __PPC_RB(b)) #define PPC_ERATSX_DOT(t, a, w) stringify_in_c(.long PPC_INST_ERATSX_DOT | \ __PPC_RS(t) | __PPC_RA(a) | __PPC_RB(b)) - +#define PPC_SLBFEE_DOT(t, b) stringify_in_c(.long PPC_INST_SLBFEE | \ + __PPC_RT(t) | __PPC_RB(b)) /* * Define what the VSX XX1 form instructions will look like, then add diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 64447f6c049a..16efb3151c20 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -216,6 +216,7 @@ #define DSISR_ISSTORE 0x02000000 /* access was a store */ #define DSISR_DABRMATCH 0x00400000 /* hit data breakpoint */ #define DSISR_NOSEGMENT 0x00200000 /* STAB/SLB miss */ +#define DSISR_KEYFAULT 0x00200000 /* Key fault */ #define SPRN_TBRL 0x10C /* Time Base Read Lower Register (user, R/O) */ #define SPRN_TBRU 0x10D /* Time Base Read Upper Register (user, R/O) */ #define SPRN_TBWL 0x11C /* Time Base Lower Register (super, R/W) */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index e7bfcf81b746..8e0db0b12dd0 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -455,6 +455,7 @@ int main(void) DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu)); DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr)); DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor)); + DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v)); DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr)); DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar)); #endif diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 15c5a4f6de01..8bea12086b67 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -100,14 +100,14 @@ data_access_not_stab: END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB) #endif EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD, - KVMTEST_PR, 0x300) + KVMTEST, 0x300) . = 0x380 .globl data_access_slb_pSeries data_access_slb_pSeries: HMT_MEDIUM SET_SCRATCH0(r13) - EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380) + EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x380) std r3,PACA_EXSLB+EX_R3(r13) mfspr r3,SPRN_DAR #ifdef __DISABLED__ @@ -329,8 +329,8 @@ do_stab_bolted_pSeries: EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD) #endif /* CONFIG_POWER4_ONLY */ - KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_STD, 0x300) - KVM_HANDLER_PR_SKIP(PACA_EXSLB, EXC_STD, 0x380) + KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x300) + KVM_HANDLER_SKIP(PACA_EXSLB, EXC_STD, 0x380) KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x400) KVM_HANDLER_PR(PACA_EXSLB, EXC_STD, 0x480) KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x900) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index b904c40a17bc..2d31519b8637 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -34,8 +34,6 @@ #include #include -#define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */ - /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ #define MAX_LPID_970 63 #define NR_LPIDS (LPID_RSVD + 1) @@ -298,16 +296,18 @@ long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, if (!psize) return H_PARAMETER; + pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID); + /* Find the memslot (if any) for this address */ gpa = (ptel & HPTE_R_RPN) & ~(psize - 1); gfn = gpa >> PAGE_SHIFT; memslot = gfn_to_memslot(kvm, gfn); - if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) - return H_PARAMETER; - if (!slot_is_aligned(memslot, psize)) - return H_PARAMETER; - if (kvmppc_get_guest_page(kvm, gfn, memslot, psize) < 0) - return H_PARAMETER; + if (memslot && !(memslot->flags & KVM_MEMSLOT_INVALID)) { + if (!slot_is_aligned(memslot, psize)) + return H_PARAMETER; + if (kvmppc_get_guest_page(kvm, gfn, memslot, psize) < 0) + return H_PARAMETER; + } preempt_disable(); ret = kvmppc_h_enter(vcpu, flags, pte_index, pteh, ptel); @@ -321,10 +321,218 @@ long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, } +static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu, + gva_t eaddr) +{ + u64 mask; + int i; + + for (i = 0; i < vcpu->arch.slb_nr; i++) { + if (!(vcpu->arch.slb[i].orige & SLB_ESID_V)) + continue; + + if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T) + mask = ESID_MASK_1T; + else + mask = ESID_MASK; + + if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0) + return &vcpu->arch.slb[i]; + } + return NULL; +} + +static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r, + unsigned long ea) +{ + unsigned long ra_mask; + + ra_mask = hpte_page_size(v, r) - 1; + return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask); +} + static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, - struct kvmppc_pte *gpte, bool data) + struct kvmppc_pte *gpte, bool data) { - return -ENOENT; + struct kvm *kvm = vcpu->kvm; + struct kvmppc_slb *slbe; + unsigned long slb_v; + unsigned long pp, key; + unsigned long v, gr; + unsigned long *hptep; + int index; + int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR); + + /* Get SLB entry */ + if (virtmode) { + slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr); + if (!slbe) + return -EINVAL; + slb_v = slbe->origv; + } else { + /* real mode access */ + slb_v = vcpu->kvm->arch.vrma_slb_v; + } + + /* Find the HPTE in the hash table */ + index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v, + HPTE_V_VALID | HPTE_V_ABSENT); + if (index < 0) + return -ENOENT; + hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); + v = hptep[0] & ~HPTE_V_HVLOCK; + gr = kvm->arch.revmap[index].guest_rpte; + + /* Unlock the HPTE */ + asm volatile("lwsync" : : : "memory"); + hptep[0] = v; + + gpte->eaddr = eaddr; + gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff); + + /* Get PP bits and key for permission check */ + pp = gr & (HPTE_R_PP0 | HPTE_R_PP); + key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS; + key &= slb_v; + + /* Calculate permissions */ + gpte->may_read = hpte_read_permission(pp, key); + gpte->may_write = hpte_write_permission(pp, key); + gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G)); + + /* Storage key permission check for POWER7 */ + if (data && virtmode && cpu_has_feature(CPU_FTR_ARCH_206)) { + int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr); + if (amrfield & 1) + gpte->may_read = 0; + if (amrfield & 2) + gpte->may_write = 0; + } + + /* Get the guest physical address */ + gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr); + return 0; +} + +/* + * Quick test for whether an instruction is a load or a store. + * If the instruction is a load or a store, then this will indicate + * which it is, at least on server processors. (Embedded processors + * have some external PID instructions that don't follow the rule + * embodied here.) If the instruction isn't a load or store, then + * this doesn't return anything useful. + */ +static int instruction_is_store(unsigned int instr) +{ + unsigned int mask; + + mask = 0x10000000; + if ((instr & 0xfc000000) == 0x7c000000) + mask = 0x100; /* major opcode 31 */ + return (instr & mask) != 0; +} + +static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned long gpa, int is_store) +{ + int ret; + u32 last_inst; + unsigned long srr0 = kvmppc_get_pc(vcpu); + + /* We try to load the last instruction. We don't let + * emulate_instruction do it as it doesn't check what + * kvmppc_ld returns. + * If we fail, we just return to the guest and try executing it again. + */ + if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED) { + ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false); + if (ret != EMULATE_DONE || last_inst == KVM_INST_FETCH_FAILED) + return RESUME_GUEST; + vcpu->arch.last_inst = last_inst; + } + + /* + * WARNING: We do not know for sure whether the instruction we just + * read from memory is the same that caused the fault in the first + * place. If the instruction we read is neither an load or a store, + * then it can't access memory, so we don't need to worry about + * enforcing access permissions. So, assuming it is a load or + * store, we just check that its direction (load or store) is + * consistent with the original fault, since that's what we + * checked the access permissions against. If there is a mismatch + * we just return and retry the instruction. + */ + + if (instruction_is_store(vcpu->arch.last_inst) != !!is_store) + return RESUME_GUEST; + + /* + * Emulated accesses are emulated by looking at the hash for + * translation once, then performing the access later. The + * translation could be invalidated in the meantime in which + * point performing the subsequent memory access on the old + * physical address could possibly be a security hole for the + * guest (but not the host). + * + * This is less of an issue for MMIO stores since they aren't + * globally visible. It could be an issue for MMIO loads to + * a certain extent but we'll ignore it for now. + */ + + vcpu->arch.paddr_accessed = gpa; + return kvmppc_emulate_mmio(run, vcpu); +} + +int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned long ea, unsigned long dsisr) +{ + struct kvm *kvm = vcpu->kvm; + unsigned long *hptep, hpte[3]; + unsigned long psize; + unsigned long gfn; + struct kvm_memory_slot *memslot; + struct revmap_entry *rev; + long index; + + /* + * Real-mode code has already searched the HPT and found the + * entry we're interested in. Lock the entry and check that + * it hasn't changed. If it has, just return and re-execute the + * instruction. + */ + if (ea != vcpu->arch.pgfault_addr) + return RESUME_GUEST; + index = vcpu->arch.pgfault_index; + hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); + rev = &kvm->arch.revmap[index]; + preempt_disable(); + while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) + cpu_relax(); + hpte[0] = hptep[0] & ~HPTE_V_HVLOCK; + hpte[1] = hptep[1]; + hpte[2] = rev->guest_rpte; + asm volatile("lwsync" : : : "memory"); + hptep[0] = hpte[0]; + preempt_enable(); + + if (hpte[0] != vcpu->arch.pgfault_hpte[0] || + hpte[1] != vcpu->arch.pgfault_hpte[1]) + return RESUME_GUEST; + + /* Translate the logical address and get the page */ + psize = hpte_page_size(hpte[0], hpte[1]); + gfn = hpte_rpn(hpte[2], psize); + memslot = gfn_to_memslot(kvm, gfn); + + /* No memslot means it's an emulated MMIO region */ + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) { + unsigned long gpa = (gfn << PAGE_SHIFT) | (ea & (psize - 1)); + return kvmppc_hv_emulate_mmio(run, vcpu, gpa, + dsisr & DSISR_ISSTORE); + } + + /* should never get here otherwise */ + return -EFAULT; } void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 6ed0a84ef91c..45aabb9a527f 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -326,19 +326,18 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, break; } /* - * We get these next two if the guest does a bad real-mode access, - * as we have enabled VRMA (virtualized real mode area) mode in the - * LPCR. We just generate an appropriate DSI/ISI to the guest. + * We get this if the guest accesses a page which it thinks + * it has mapped but which is not actually present, because + * it is for an emulated I/O device. + * Any other HDSI interrupt has been handled already. */ case BOOK3S_INTERRUPT_H_DATA_STORAGE: - vcpu->arch.shregs.dsisr = vcpu->arch.fault_dsisr; - vcpu->arch.shregs.dar = vcpu->arch.fault_dar; - kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0); - r = RESUME_GUEST; + r = kvmppc_book3s_hv_page_fault(run, vcpu, + vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); break; case BOOK3S_INTERRUPT_H_INST_STORAGE: kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE, - 0x08000000); + vcpu->arch.shregs.msr & 0x58000000); r = RESUME_GUEST; break; /* @@ -1195,6 +1194,8 @@ static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu) /* Update VRMASD field in the LPCR */ senc = slb_pgsize_encoding(psize); + kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | + (VRMA_VSID << SLB_VSID_SHIFT_1T); lpcr = kvm->arch.lpcr & ~LPCR_VRMASD; lpcr |= senc << (LPCR_VRMASD_SH - 4); kvm->arch.lpcr = lpcr; @@ -1291,7 +1292,9 @@ int kvmppc_core_init_vm(struct kvm *kvm) kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR); lpcr &= LPCR_PECE | LPCR_LPES; lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE | - LPCR_VPM0 | LPCR_VRMA_L; + LPCR_VPM0 | LPCR_VPM1; + kvm->arch.vrma_slb_v = SLB_VSID_B_1T | + (VRMA_VSID << SLB_VSID_SHIFT_1T); } kvm->arch.lpcr = lpcr; diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 5b31caa4b314..a5176dc37e7e 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -136,13 +136,23 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, psize = hpte_page_size(pteh, ptel); if (!psize) return H_PARAMETER; + pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID); /* Find the memslot (if any) for this address */ gpa = (ptel & HPTE_R_RPN) & ~(psize - 1); gfn = gpa >> PAGE_SHIFT; memslot = builtin_gfn_to_memslot(kvm, gfn); - if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) - return H_PARAMETER; + pa = 0; + rmap = NULL; + if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) { + /* PPC970 can't do emulated MMIO */ + if (!cpu_has_feature(CPU_FTR_ARCH_206)) + return H_PARAMETER; + /* Emulated MMIO - mark this with key=31 */ + pteh |= HPTE_V_ABSENT; + ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO; + goto do_insert; + } /* Check if the requested page fits entirely in the memslot. */ if (!slot_is_aligned(memslot, psize)) @@ -170,6 +180,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, ptel &= ~(HPTE_R_PP0 - psize); ptel |= pa; + pteh |= HPTE_V_VALID; /* Check WIMG */ if (!hpte_cache_flags_ok(ptel, is_io)) { @@ -182,9 +193,8 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, ptel &= ~(HPTE_R_W|HPTE_R_I|HPTE_R_G); ptel |= HPTE_R_M; } - pteh &= ~0x60UL; - pteh |= HPTE_V_VALID; + do_insert: if (pte_index >= HPT_NPTE) return H_PARAMETER; if (likely((flags & H_EXACT) == 0)) { @@ -192,7 +202,8 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); for (i = 0; i < 8; ++i) { if ((*hpte & HPTE_V_VALID) == 0 && - try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) + try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID | + HPTE_V_ABSENT)) break; hpte += 2; } @@ -207,7 +218,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, for (i = 0; i < 8; ++i) { while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); - if ((*hpte & HPTE_V_VALID) == 0) + if (!(*hpte & (HPTE_V_VALID | HPTE_V_ABSENT))) break; *hpte &= ~HPTE_V_HVLOCK; hpte += 2; @@ -218,11 +229,12 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, pte_index += i; } else { hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); - if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) { + if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID | + HPTE_V_ABSENT)) { /* Lock the slot and check again */ while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); - if (*hpte & HPTE_V_VALID) { + if (*hpte & (HPTE_V_VALID | HPTE_V_ABSENT)) { *hpte &= ~HPTE_V_HVLOCK; return H_PTEG_FULL; } @@ -237,10 +249,12 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, rev->guest_rpte = g_ptel; /* Link HPTE into reverse-map chain */ - if (realmode) - rmap = real_vmalloc_addr(rmap); - lock_rmap(rmap); - kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index, realmode); + if (pteh & HPTE_V_VALID) { + if (realmode) + rmap = real_vmalloc_addr(rmap); + lock_rmap(rmap); + kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index, realmode); + } hpte[1] = ptel; @@ -287,7 +301,7 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); - if ((hpte[0] & HPTE_V_VALID) == 0 || + if ((hpte[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn) || ((flags & H_ANDCOND) && (hpte[0] & avpn) != 0)) { hpte[0] &= ~HPTE_V_HVLOCK; @@ -298,11 +312,14 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, vcpu->arch.gpr[4] = v = hpte[0] & ~HPTE_V_HVLOCK; vcpu->arch.gpr[5] = r = hpte[1]; rb = compute_tlbie_rb(v, r, pte_index); - remove_revmap_chain(kvm, pte_index, v); + if (v & HPTE_V_VALID) + remove_revmap_chain(kvm, pte_index, v); smp_wmb(); hpte[0] = 0; + if (!(v & HPTE_V_VALID)) + return H_SUCCESS; if (!(flags & H_LOCAL)) { - while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) + while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) cpu_relax(); asm volatile("ptesync" : : : "memory"); asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" @@ -349,7 +366,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) while (!try_lock_hpte(hp, HPTE_V_HVLOCK)) cpu_relax(); found = 0; - if (hp[0] & HPTE_V_VALID) { + if (hp[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) { switch (flags & 3) { case 0: /* absolute */ found = 1; @@ -372,8 +389,10 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) /* insert R and C bits from PTE */ flags |= (hp[1] >> 5) & 0x0c; args[i * 2] = ((0x80 | flags) << 56) + pte_index; - tlbrb[n_inval++] = compute_tlbie_rb(hp[0], hp[1], pte_index); - remove_revmap_chain(kvm, pte_index, hp[0]); + if (hp[0] & HPTE_V_VALID) { + tlbrb[n_inval++] = compute_tlbie_rb(hp[0], hp[1], pte_index); + remove_revmap_chain(kvm, pte_index, hp[0]); + } smp_wmb(); hp[0] = 0; } @@ -409,14 +428,16 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, if (pte_index >= HPT_NPTE) return H_PARAMETER; + hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); - if ((hpte[0] & HPTE_V_VALID) == 0 || + if ((hpte[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn)) { hpte[0] &= ~HPTE_V_HVLOCK; return H_NOT_FOUND; } + if (atomic_read(&kvm->online_vcpus) == 1) flags |= H_LOCAL; v = hpte[0]; @@ -435,20 +456,22 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, r = (hpte[1] & ~mask) | bits; /* Update HPTE */ - rb = compute_tlbie_rb(v, r, pte_index); - hpte[0] = v & ~HPTE_V_VALID; - if (!(flags & H_LOCAL)) { - while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) - cpu_relax(); - asm volatile("ptesync" : : : "memory"); - asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" - : : "r" (rb), "r" (kvm->arch.lpid)); - asm volatile("ptesync" : : : "memory"); - kvm->arch.tlbie_lock = 0; - } else { - asm volatile("ptesync" : : : "memory"); - asm volatile("tlbiel %0" : : "r" (rb)); - asm volatile("ptesync" : : : "memory"); + if (v & HPTE_V_VALID) { + rb = compute_tlbie_rb(v, r, pte_index); + hpte[0] = v & ~HPTE_V_VALID; + if (!(flags & H_LOCAL)) { + while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) + cpu_relax(); + asm volatile("ptesync" : : : "memory"); + asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" + : : "r" (rb), "r" (kvm->arch.lpid)); + asm volatile("ptesync" : : : "memory"); + kvm->arch.tlbie_lock = 0; + } else { + asm volatile("ptesync" : : : "memory"); + asm volatile("tlbiel %0" : : "r" (rb)); + asm volatile("ptesync" : : : "memory"); + } } hpte[1] = r; eieio(); @@ -461,7 +484,7 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long pte_index) { struct kvm *kvm = vcpu->kvm; - unsigned long *hpte, r; + unsigned long *hpte, v, r; int i, n = 1; struct revmap_entry *rev = NULL; @@ -475,15 +498,182 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); for (i = 0; i < n; ++i, ++pte_index) { hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); + v = hpte[0] & ~HPTE_V_HVLOCK; r = hpte[1]; - if (hpte[0] & HPTE_V_VALID) { + if (v & HPTE_V_ABSENT) { + v &= ~HPTE_V_ABSENT; + v |= HPTE_V_VALID; + } + if (v & HPTE_V_VALID) { if (rev) r = rev[i].guest_rpte; else r = hpte[1] | HPTE_R_RPN; } - vcpu->arch.gpr[4 + i * 2] = hpte[0]; + vcpu->arch.gpr[4 + i * 2] = v; vcpu->arch.gpr[5 + i * 2] = r; } return H_SUCCESS; } + +static int slb_base_page_shift[4] = { + 24, /* 16M */ + 16, /* 64k */ + 34, /* 16G */ + 20, /* 1M, unsupported */ +}; + +long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, + unsigned long valid) +{ + unsigned int i; + unsigned int pshift; + unsigned long somask; + unsigned long vsid, hash; + unsigned long avpn; + unsigned long *hpte; + unsigned long mask, val; + unsigned long v, r; + + /* Get page shift, work out hash and AVPN etc. */ + mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_SECONDARY; + val = 0; + pshift = 12; + if (slb_v & SLB_VSID_L) { + mask |= HPTE_V_LARGE; + val |= HPTE_V_LARGE; + pshift = slb_base_page_shift[(slb_v & SLB_VSID_LP) >> 4]; + } + if (slb_v & SLB_VSID_B_1T) { + somask = (1UL << 40) - 1; + vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T; + vsid ^= vsid << 25; + } else { + somask = (1UL << 28) - 1; + vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT; + } + hash = (vsid ^ ((eaddr & somask) >> pshift)) & HPT_HASH_MASK; + avpn = slb_v & ~(somask >> 16); /* also includes B */ + avpn |= (eaddr & somask) >> 16; + + if (pshift >= 24) + avpn &= ~((1UL << (pshift - 16)) - 1); + else + avpn &= ~0x7fUL; + val |= avpn; + + for (;;) { + hpte = (unsigned long *)(kvm->arch.hpt_virt + (hash << 7)); + + for (i = 0; i < 16; i += 2) { + /* Read the PTE racily */ + v = hpte[i] & ~HPTE_V_HVLOCK; + + /* Check valid/absent, hash, segment size and AVPN */ + if (!(v & valid) || (v & mask) != val) + continue; + + /* Lock the PTE and read it under the lock */ + while (!try_lock_hpte(&hpte[i], HPTE_V_HVLOCK)) + cpu_relax(); + v = hpte[i] & ~HPTE_V_HVLOCK; + r = hpte[i+1]; + + /* + * Check the HPTE again, including large page size + * Since we don't currently allow any MPSS (mixed + * page-size segment) page sizes, it is sufficient + * to check against the actual page size. + */ + if ((v & valid) && (v & mask) == val && + hpte_page_size(v, r) == (1ul << pshift)) + /* Return with the HPTE still locked */ + return (hash << 3) + (i >> 1); + + /* Unlock and move on */ + hpte[i] = v; + } + + if (val & HPTE_V_SECONDARY) + break; + val |= HPTE_V_SECONDARY; + hash = hash ^ HPT_HASH_MASK; + } + return -1; +} +EXPORT_SYMBOL(kvmppc_hv_find_lock_hpte); + +/* + * Called in real mode to check whether an HPTE not found fault + * is due to accessing an emulated MMIO page. + * Returns a possibly modified status (DSISR) value if not + * (i.e. pass the interrupt to the guest), + * -1 to pass the fault up to host kernel mode code, -2 to do that + * and also load the instruction word, + * or 0 if we should make the guest retry the access. + */ +long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, + unsigned long slb_v, unsigned int status) +{ + struct kvm *kvm = vcpu->kvm; + long int index; + unsigned long v, r, gr; + unsigned long *hpte; + unsigned long valid; + struct revmap_entry *rev; + unsigned long pp, key; + + valid = HPTE_V_VALID | HPTE_V_ABSENT; + index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid); + if (index < 0) + return status; /* there really was no HPTE */ + + hpte = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); + v = hpte[0] & ~HPTE_V_HVLOCK; + r = hpte[1]; + rev = real_vmalloc_addr(&kvm->arch.revmap[index]); + gr = rev->guest_rpte; + + /* Unlock the HPTE */ + asm volatile("lwsync" : : : "memory"); + hpte[0] = v; + + /* If the HPTE is valid by now, retry the instruction */ + if (v & HPTE_V_VALID) + return 0; + + /* Check access permissions to the page */ + pp = gr & (HPTE_R_PP0 | HPTE_R_PP); + key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS; + if (status & DSISR_ISSTORE) { + /* check write permission */ + if (!hpte_write_permission(pp, slb_v & key)) + goto protfault; + } else { + if (!hpte_read_permission(pp, slb_v & key)) + goto protfault; + } + + /* Check storage key, if applicable */ + if (vcpu->arch.shregs.msr & MSR_DR) { + unsigned int perm = hpte_get_skey_perm(gr, vcpu->arch.amr); + if (status & DSISR_ISSTORE) + perm >>= 1; + if (perm & 1) + return (status & ~DSISR_NOHPTE) | DSISR_KEYFAULT; + } + + /* Save HPTE info for virtual-mode handler */ + vcpu->arch.pgfault_addr = addr; + vcpu->arch.pgfault_index = index; + vcpu->arch.pgfault_hpte[0] = v; + vcpu->arch.pgfault_hpte[1] = r; + + if (vcpu->arch.shregs.msr & MSR_IR) + return -2; /* MMIO emulation - load instr word */ + + return -1; /* send fault up to host kernel mode */ + + protfault: + return (status & ~DSISR_NOHPTE) | DSISR_PROTFAULT; +} diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 5c8b26183f50..d07b64d5f37e 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -601,6 +601,28 @@ kvmppc_interrupt: stw r12,VCPU_TRAP(r9) + /* Save HEIR (HV emulation assist reg) in last_inst + if this is an HEI (HV emulation interrupt, e40) */ + li r3,KVM_INST_FETCH_FAILED +BEGIN_FTR_SECTION + cmpwi r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST + bne 11f + mfspr r3,SPRN_HEIR +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) +11: stw r3,VCPU_LAST_INST(r9) + + /* these are volatile across C function calls */ + mfctr r3 + mfxer r4 + std r3, VCPU_CTR(r9) + stw r4, VCPU_XER(r9) + +BEGIN_FTR_SECTION + /* If this is a page table miss then see if it's theirs or ours */ + cmpwi r12, BOOK3S_INTERRUPT_H_DATA_STORAGE + beq kvmppc_hdsi +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + /* See if this is a leftover HDEC interrupt */ cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER bne 2f @@ -608,7 +630,7 @@ kvmppc_interrupt: cmpwi r3,0 bge ignore_hdec 2: - /* See if this is something we can handle in real mode */ + /* See if this is an hcall we can handle in real mode */ cmpwi r12,BOOK3S_INTERRUPT_SYSCALL beq hcall_try_real_mode @@ -624,6 +646,7 @@ BEGIN_FTR_SECTION 1: END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) +nohpte_cont: hcall_real_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ /* Save DEC */ mfspr r5,SPRN_DEC @@ -632,36 +655,21 @@ hcall_real_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ add r5,r5,r6 std r5,VCPU_DEC_EXPIRES(r9) - /* Save HEIR (HV emulation assist reg) in last_inst - if this is an HEI (HV emulation interrupt, e40) */ - li r3,-1 -BEGIN_FTR_SECTION - cmpwi r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST - bne 11f - mfspr r3,SPRN_HEIR -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) -11: stw r3,VCPU_LAST_INST(r9) - /* Save more register state */ - mfxer r5 mfdar r6 mfdsisr r7 - mfctr r8 - - stw r5, VCPU_XER(r9) std r6, VCPU_DAR(r9) stw r7, VCPU_DSISR(r9) - std r8, VCPU_CTR(r9) - /* grab HDAR & HDSISR if HV data storage interrupt (HDSI) */ BEGIN_FTR_SECTION + /* don't overwrite fault_dar/fault_dsisr if HDSI */ cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE beq 6f END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) -7: std r6, VCPU_FAULT_DAR(r9) + std r6, VCPU_FAULT_DAR(r9) stw r7, VCPU_FAULT_DSISR(r9) /* Save guest CTRL register, set runlatch to 1 */ - mfspr r6,SPRN_CTRLF +6: mfspr r6,SPRN_CTRLF stw r6,VCPU_CTRL(r9) andi. r0,r6,1 bne 4f @@ -1094,9 +1102,84 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) mtspr SPRN_HSRR1, r7 ba 0x500 -6: mfspr r6,SPRN_HDAR - mfspr r7,SPRN_HDSISR - b 7b +/* + * Check whether an HDSI is an HPTE not found fault or something else. + * If it is an HPTE not found fault that is due to the guest accessing + * a page that they have mapped but which we have paged out, then + * we continue on with the guest exit path. In all other cases, + * reflect the HDSI to the guest as a DSI. + */ +kvmppc_hdsi: + mfspr r4, SPRN_HDAR + mfspr r6, SPRN_HDSISR + /* HPTE not found fault? */ + andis. r0, r6, DSISR_NOHPTE@h + beq 1f /* if not, send it to the guest */ + andi. r0, r11, MSR_DR /* data relocation enabled? */ + beq 3f + clrrdi r0, r4, 28 + PPC_SLBFEE_DOT(r5, r0) /* if so, look up SLB */ + bne 1f /* if no SLB entry found */ +4: std r4, VCPU_FAULT_DAR(r9) + stw r6, VCPU_FAULT_DSISR(r9) + + /* Search the hash table. */ + mr r3, r9 /* vcpu pointer */ + bl .kvmppc_hpte_hv_fault + ld r9, HSTATE_KVM_VCPU(r13) + ld r10, VCPU_PC(r9) + ld r11, VCPU_MSR(r9) + li r12, BOOK3S_INTERRUPT_H_DATA_STORAGE + cmpdi r3, 0 /* retry the instruction */ + beq 6f + cmpdi r3, -1 /* handle in kernel mode */ + beq nohpte_cont + cmpdi r3, -2 /* MMIO emulation; need instr word */ + beq 2f + + /* Synthesize a DSI for the guest */ + ld r4, VCPU_FAULT_DAR(r9) + mr r6, r3 +1: mtspr SPRN_DAR, r4 + mtspr SPRN_DSISR, r6 + mtspr SPRN_SRR0, r10 + mtspr SPRN_SRR1, r11 + li r10, BOOK3S_INTERRUPT_DATA_STORAGE + li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ + rotldi r11, r11, 63 +6: ld r7, VCPU_CTR(r9) + lwz r8, VCPU_XER(r9) + mtctr r7 + mtxer r8 + mr r4, r9 + b fast_guest_return + +3: ld r5, VCPU_KVM(r9) /* not relocated, use VRMA */ + ld r5, KVM_VRMA_SLB_V(r5) + b 4b + + /* If this is for emulated MMIO, load the instruction word */ +2: li r8, KVM_INST_FETCH_FAILED /* In case lwz faults */ + + /* Set guest mode to 'jump over instruction' so if lwz faults + * we'll just continue at the next IP. */ + li r0, KVM_GUEST_MODE_SKIP + stb r0, HSTATE_IN_GUEST(r13) + + /* Do the access with MSR:DR enabled */ + mfmsr r3 + ori r4, r3, MSR_DR /* Enable paging for data */ + mtmsrd r4 + lwz r8, 0(r10) + mtmsrd r3 + + /* Store the result */ + stw r8, VCPU_LAST_INST(r9) + + /* Unset guest mode. */ + li r0, KVM_GUEST_MODE_NONE + stb r0, HSTATE_IN_GUEST(r13) + b nohpte_cont /* * Try to handle an hcall in real mode. -- cgit v1.2.3 From 342d3db763f2621ed4546ebf8f6c61cb29d7fbdb Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 12 Dec 2011 12:38:05 +0000 Subject: KVM: PPC: Implement MMU notifiers for Book3S HV guests This adds the infrastructure to enable us to page out pages underneath a Book3S HV guest, on processors that support virtualized partition memory, that is, POWER7. Instead of pinning all the guest's pages, we now look in the host userspace Linux page tables to find the mapping for a given guest page. Then, if the userspace Linux PTE gets invalidated, kvm_unmap_hva() gets called for that address, and we replace all the guest HPTEs that refer to that page with absent HPTEs, i.e. ones with the valid bit clear and the HPTE_V_ABSENT bit set, which will cause an HDSI when the guest tries to access them. Finally, the page fault handler is extended to reinstantiate the guest HPTE when the guest tries to access a page which has been paged out. Since we can't intercept the guest DSI and ISI interrupts on PPC970, we still have to pin all the guest pages on PPC970. We have a new flag, kvm->arch.using_mmu_notifiers, that indicates whether we can page guest pages out. If it is not set, the MMU notifier callbacks do nothing and everything operates as before. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_book3s.h | 4 + arch/powerpc/include/asm/kvm_book3s_64.h | 31 ++++ arch/powerpc/include/asm/kvm_host.h | 16 ++ arch/powerpc/include/asm/reg.h | 3 + arch/powerpc/kvm/Kconfig | 1 + arch/powerpc/kvm/book3s_64_mmu_hv.c | 290 ++++++++++++++++++++++++++++--- arch/powerpc/kvm/book3s_hv.c | 25 +-- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 140 +++++++++++---- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 49 ++++++ arch/powerpc/kvm/powerpc.c | 3 + arch/powerpc/mm/hugetlbpage.c | 2 + 11 files changed, 499 insertions(+), 65 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 3a9e51f43397..9240cebf8bad 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -143,6 +143,10 @@ extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat, extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr); extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu); extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); +extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, + unsigned long *rmap, long pte_index, int realmode); +extern void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep, + unsigned long pte_index); extern void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long addr, unsigned long *nb_ret); extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr); diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 79dc37fb86b5..c21e46da4a3b 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -136,6 +136,37 @@ static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type) return (wimg & (HPTE_R_W | HPTE_R_I)) == io_type; } +/* + * Lock and read a linux PTE. If it's present and writable, atomically + * set dirty and referenced bits and return the PTE, otherwise return 0. + */ +static inline pte_t kvmppc_read_update_linux_pte(pte_t *p) +{ + pte_t pte, tmp; + + /* wait until _PAGE_BUSY is clear then set it atomically */ + __asm__ __volatile__ ( + "1: ldarx %0,0,%3\n" + " andi. %1,%0,%4\n" + " bne- 1b\n" + " ori %1,%0,%4\n" + " stdcx. %1,0,%3\n" + " bne- 1b" + : "=&r" (pte), "=&r" (tmp), "=m" (*p) + : "r" (p), "i" (_PAGE_BUSY) + : "cc"); + + if (pte_present(pte)) { + pte = pte_mkyoung(pte); + if (pte_write(pte)) + pte = pte_mkdirty(pte); + } + + *p = pte; /* clears _PAGE_BUSY */ + + return pte; +} + /* Return HPTE cache control bits corresponding to Linux pte bits */ static inline unsigned long hpte_cache_bits(unsigned long pte_val) { diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 937cacaaf236..968f3aa61cd1 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -32,6 +32,7 @@ #include #include #include +#include #define KVM_MAX_VCPUS NR_CPUS #define KVM_MAX_VCORES NR_CPUS @@ -44,6 +45,19 @@ #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #endif +#ifdef CONFIG_KVM_BOOK3S_64_HV +#include + +#define KVM_ARCH_WANT_MMU_NOTIFIER + +struct kvm; +extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); +extern int kvm_age_hva(struct kvm *kvm, unsigned long hva); +extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); +extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); + +#endif + /* We don't currently support large pages. */ #define KVM_HPAGE_GFN_SHIFT(x) 0 #define KVM_NR_PAGE_SIZES 1 @@ -212,6 +226,7 @@ struct kvm_arch { struct kvmppc_rma_info *rma; unsigned long vrma_slb_v; int rma_setup_done; + int using_mmu_notifiers; struct list_head spapr_tce_tables; spinlock_t slot_phys_lock; unsigned long *slot_phys[KVM_MEM_SLOTS_NUM]; @@ -460,6 +475,7 @@ struct kvm_vcpu_arch { struct list_head run_list; struct task_struct *run_task; struct kvm_run *kvm_run; + pgd_t *pgdir; #endif }; diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 16efb3151c20..35c9309bf038 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -495,6 +495,9 @@ #define SPRN_SPRG7 0x117 /* Special Purpose Register General 7 */ #define SPRN_SRR0 0x01A /* Save/Restore Register 0 */ #define SPRN_SRR1 0x01B /* Save/Restore Register 1 */ +#define SRR1_ISI_NOPT 0x40000000 /* ISI: Not found in hash */ +#define SRR1_ISI_N_OR_G 0x10000000 /* ISI: Access is no-exec or G */ +#define SRR1_ISI_PROT 0x08000000 /* ISI: Other protection fault */ #define SRR1_WAKEMASK 0x00380000 /* reason for wakeup */ #define SRR1_WAKESYSERR 0x00300000 /* System error */ #define SRR1_WAKEEE 0x00200000 /* External interrupt */ diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 78133deb4b64..8f64709ae331 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -69,6 +69,7 @@ config KVM_BOOK3S_64 config KVM_BOOK3S_64_HV bool "KVM support for POWER7 and PPC970 using hypervisor mode in host" depends on KVM_BOOK3S_64 + select MMU_NOTIFIER ---help--- Support running unmodified book3s_64 guest kernels in virtual machines on POWER7 and PPC970 processors that have diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 2d31519b8637..83761dd8a924 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -281,8 +281,9 @@ static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn, } /* - * We come here on a H_ENTER call from the guest when - * we don't have the requested page pinned already. + * We come here on a H_ENTER call from the guest when we are not + * using mmu notifiers and we don't have the requested page pinned + * already. */ long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel) @@ -292,6 +293,9 @@ long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, struct kvm_memory_slot *memslot; long ret; + if (kvm->arch.using_mmu_notifiers) + goto do_insert; + psize = hpte_page_size(pteh, ptel); if (!psize) return H_PARAMETER; @@ -309,9 +313,12 @@ long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, return H_PARAMETER; } - preempt_disable(); + do_insert: + /* Protect linux PTE lookup from page table destruction */ + rcu_read_lock_sched(); /* this disables preemption too */ + vcpu->arch.pgdir = current->mm->pgd; ret = kvmppc_h_enter(vcpu, flags, pte_index, pteh, ptel); - preempt_enable(); + rcu_read_unlock_sched(); if (ret == H_TOO_HARD) { /* this can't happen */ pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n"); @@ -487,12 +494,16 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned long ea, unsigned long dsisr) { struct kvm *kvm = vcpu->kvm; - unsigned long *hptep, hpte[3]; - unsigned long psize; - unsigned long gfn; + unsigned long *hptep, hpte[3], r; + unsigned long mmu_seq, psize, pte_size; + unsigned long gfn, hva, pfn; struct kvm_memory_slot *memslot; + unsigned long *rmap; struct revmap_entry *rev; - long index; + struct page *page, *pages[1]; + long index, ret, npages; + unsigned long is_io; + struct vm_area_struct *vma; /* * Real-mode code has already searched the HPT and found the @@ -510,7 +521,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, cpu_relax(); hpte[0] = hptep[0] & ~HPTE_V_HVLOCK; hpte[1] = hptep[1]; - hpte[2] = rev->guest_rpte; + hpte[2] = r = rev->guest_rpte; asm volatile("lwsync" : : : "memory"); hptep[0] = hpte[0]; preempt_enable(); @@ -520,8 +531,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, return RESUME_GUEST; /* Translate the logical address and get the page */ - psize = hpte_page_size(hpte[0], hpte[1]); - gfn = hpte_rpn(hpte[2], psize); + psize = hpte_page_size(hpte[0], r); + gfn = hpte_rpn(r, psize); memslot = gfn_to_memslot(kvm, gfn); /* No memslot means it's an emulated MMIO region */ @@ -531,8 +542,228 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, dsisr & DSISR_ISSTORE); } - /* should never get here otherwise */ - return -EFAULT; + if (!kvm->arch.using_mmu_notifiers) + return -EFAULT; /* should never get here */ + + /* used to check for invalidations in progress */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + + is_io = 0; + pfn = 0; + page = NULL; + pte_size = PAGE_SIZE; + hva = gfn_to_hva_memslot(memslot, gfn); + npages = get_user_pages_fast(hva, 1, 1, pages); + if (npages < 1) { + /* Check if it's an I/O mapping */ + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, hva); + if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end && + (vma->vm_flags & VM_PFNMAP)) { + pfn = vma->vm_pgoff + + ((hva - vma->vm_start) >> PAGE_SHIFT); + pte_size = psize; + is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot)); + } + up_read(¤t->mm->mmap_sem); + if (!pfn) + return -EFAULT; + } else { + page = pages[0]; + if (PageHuge(page)) { + page = compound_head(page); + pte_size <<= compound_order(page); + } + pfn = page_to_pfn(page); + } + + ret = -EFAULT; + if (psize > pte_size) + goto out_put; + + /* Check WIMG vs. the actual page we're accessing */ + if (!hpte_cache_flags_ok(r, is_io)) { + if (is_io) + return -EFAULT; + /* + * Allow guest to map emulated device memory as + * uncacheable, but actually make it cacheable. + */ + r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M; + } + + /* Set the HPTE to point to pfn */ + r = (r & ~(HPTE_R_PP0 - pte_size)) | (pfn << PAGE_SHIFT); + ret = RESUME_GUEST; + preempt_disable(); + while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) + cpu_relax(); + if ((hptep[0] & ~HPTE_V_HVLOCK) != hpte[0] || hptep[1] != hpte[1] || + rev->guest_rpte != hpte[2]) + /* HPTE has been changed under us; let the guest retry */ + goto out_unlock; + hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; + + rmap = &memslot->rmap[gfn - memslot->base_gfn]; + lock_rmap(rmap); + + /* Check if we might have been invalidated; let the guest retry if so */ + ret = RESUME_GUEST; + if (mmu_notifier_retry(vcpu, mmu_seq)) { + unlock_rmap(rmap); + goto out_unlock; + } + kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0); + + hptep[1] = r; + eieio(); + hptep[0] = hpte[0]; + asm volatile("ptesync" : : : "memory"); + preempt_enable(); + if (page) + SetPageDirty(page); + + out_put: + if (page) + put_page(page); + return ret; + + out_unlock: + hptep[0] &= ~HPTE_V_HVLOCK; + preempt_enable(); + goto out_put; +} + +static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, + int (*handler)(struct kvm *kvm, unsigned long *rmapp, + unsigned long gfn)) +{ + int ret; + int retval = 0; + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + + slots = kvm_memslots(kvm); + kvm_for_each_memslot(memslot, slots) { + unsigned long start = memslot->userspace_addr; + unsigned long end; + + end = start + (memslot->npages << PAGE_SHIFT); + if (hva >= start && hva < end) { + gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; + + ret = handler(kvm, &memslot->rmap[gfn_offset], + memslot->base_gfn + gfn_offset); + retval |= ret; + } + } + + return retval; +} + +static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long gfn) +{ + struct revmap_entry *rev = kvm->arch.revmap; + unsigned long h, i, j; + unsigned long *hptep; + unsigned long ptel, psize; + + for (;;) { + while (test_and_set_bit_lock(KVMPPC_RMAP_LOCK_BIT, rmapp)) + cpu_relax(); + if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { + __clear_bit_unlock(KVMPPC_RMAP_LOCK_BIT, rmapp); + break; + } + + /* + * To avoid an ABBA deadlock with the HPTE lock bit, + * we have to unlock the rmap chain before locking the HPTE. + * Thus we remove the first entry, unlock the rmap chain, + * lock the HPTE and then check that it is for the + * page we're unmapping before changing it to non-present. + */ + i = *rmapp & KVMPPC_RMAP_INDEX; + j = rev[i].forw; + if (j == i) { + /* chain is now empty */ + j = 0; + } else { + /* remove i from chain */ + h = rev[i].back; + rev[h].forw = j; + rev[j].back = h; + rev[i].forw = rev[i].back = i; + j |= KVMPPC_RMAP_PRESENT; + } + smp_wmb(); + *rmapp = j | (1ul << KVMPPC_RMAP_REF_BIT); + + /* Now lock, check and modify the HPTE */ + hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); + while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) + cpu_relax(); + ptel = rev[i].guest_rpte; + psize = hpte_page_size(hptep[0], ptel); + if ((hptep[0] & HPTE_V_VALID) && + hpte_rpn(ptel, psize) == gfn) { + kvmppc_invalidate_hpte(kvm, hptep, i); + hptep[0] |= HPTE_V_ABSENT; + } + hptep[0] &= ~HPTE_V_HVLOCK; + } + return 0; +} + +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + if (kvm->arch.using_mmu_notifiers) + kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); + return 0; +} + +static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long gfn) +{ + if (!kvm->arch.using_mmu_notifiers) + return 0; + if (!(*rmapp & KVMPPC_RMAP_REFERENCED)) + return 0; + kvm_unmap_rmapp(kvm, rmapp, gfn); + while (test_and_set_bit_lock(KVMPPC_RMAP_LOCK_BIT, rmapp)) + cpu_relax(); + __clear_bit(KVMPPC_RMAP_REF_BIT, rmapp); + __clear_bit_unlock(KVMPPC_RMAP_LOCK_BIT, rmapp); + return 1; +} + +int kvm_age_hva(struct kvm *kvm, unsigned long hva) +{ + if (!kvm->arch.using_mmu_notifiers) + return 0; + return kvm_handle_hva(kvm, hva, kvm_age_rmapp); +} + +static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long gfn) +{ + return !!(*rmapp & KVMPPC_RMAP_REFERENCED); +} + +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + if (!kvm->arch.using_mmu_notifiers) + return 0; + return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp); +} + +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + if (!kvm->arch.using_mmu_notifiers) + return; + kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); } void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, @@ -540,31 +771,42 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, { struct kvm_memory_slot *memslot; unsigned long gfn = gpa >> PAGE_SHIFT; - struct page *page; - unsigned long psize, offset; + struct page *page, *pages[1]; + int npages; + unsigned long hva, psize, offset; unsigned long pa; unsigned long *physp; memslot = gfn_to_memslot(kvm, gfn); if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) return NULL; - physp = kvm->arch.slot_phys[memslot->id]; - if (!physp) - return NULL; - physp += gfn - memslot->base_gfn; - pa = *physp; - if (!pa) { - if (kvmppc_get_guest_page(kvm, gfn, memslot, PAGE_SIZE) < 0) + if (!kvm->arch.using_mmu_notifiers) { + physp = kvm->arch.slot_phys[memslot->id]; + if (!physp) return NULL; + physp += gfn - memslot->base_gfn; pa = *physp; + if (!pa) { + if (kvmppc_get_guest_page(kvm, gfn, memslot, + PAGE_SIZE) < 0) + return NULL; + pa = *physp; + } + page = pfn_to_page(pa >> PAGE_SHIFT); + } else { + hva = gfn_to_hva_memslot(memslot, gfn); + npages = get_user_pages_fast(hva, 1, 1, pages); + if (npages < 1) + return NULL; + page = pages[0]; } - page = pfn_to_page(pa >> PAGE_SHIFT); psize = PAGE_SIZE; if (PageHuge(page)) { page = compound_head(page); psize <<= compound_order(page); } - get_page(page); + if (!kvm->arch.using_mmu_notifiers) + get_page(page); offset = gpa & (psize - 1); if (nb_ret) *nb_ret = psize - offset; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 45aabb9a527f..86c4191cb75b 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -326,19 +326,19 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, break; } /* - * We get this if the guest accesses a page which it thinks - * it has mapped but which is not actually present, because - * it is for an emulated I/O device. - * Any other HDSI interrupt has been handled already. + * We get these next two if the guest accesses a page which it thinks + * it has mapped but which is not actually present, either because + * it is for an emulated I/O device or because the corresonding + * host page has been paged out. Any other HDSI/HISI interrupts + * have been handled already. */ case BOOK3S_INTERRUPT_H_DATA_STORAGE: r = kvmppc_book3s_hv_page_fault(run, vcpu, vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); break; case BOOK3S_INTERRUPT_H_INST_STORAGE: - kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE, - vcpu->arch.shregs.msr & 0x58000000); - r = RESUME_GUEST; + r = kvmppc_book3s_hv_page_fault(run, vcpu, + kvmppc_get_pc(vcpu), 0); break; /* * This occurs if the guest executes an illegal instruction. @@ -867,6 +867,7 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) flush_altivec_to_thread(current); flush_vsx_to_thread(current); vcpu->arch.wqp = &vcpu->arch.vcore->wq; + vcpu->arch.pgdir = current->mm->pgd; do { r = kvmppc_run_vcpu(run, vcpu); @@ -1090,9 +1091,9 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, unsigned long *phys; /* Allocate a slot_phys array */ - npages = mem->memory_size >> PAGE_SHIFT; phys = kvm->arch.slot_phys[mem->slot]; - if (!phys) { + if (!kvm->arch.using_mmu_notifiers && !phys) { + npages = mem->memory_size >> PAGE_SHIFT; phys = vzalloc(npages * sizeof(unsigned long)); if (!phys) return -ENOMEM; @@ -1298,6 +1299,7 @@ int kvmppc_core_init_vm(struct kvm *kvm) } kvm->arch.lpcr = lpcr; + kvm->arch.using_mmu_notifiers = !!cpu_has_feature(CPU_FTR_ARCH_206); spin_lock_init(&kvm->arch.slot_phys_lock); return 0; } @@ -1306,8 +1308,9 @@ void kvmppc_core_destroy_vm(struct kvm *kvm) { unsigned long i; - for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) - unpin_slot(kvm, i); + if (!kvm->arch.using_mmu_notifiers) + for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) + unpin_slot(kvm, i); if (kvm->arch.rma) { kvm_release_rma(kvm->arch.rma); diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index a5176dc37e7e..81d16ed9767d 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -58,7 +58,7 @@ static void *real_vmalloc_addr(void *x) * Add this HPTE into the chain for the real page. * Must be called with the chain locked; it unlocks the chain. */ -static void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, +void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, unsigned long *rmap, long pte_index, int realmode) { struct revmap_entry *head, *tail; @@ -83,6 +83,7 @@ static void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, smp_wmb(); *rmap = i | KVMPPC_RMAP_REFERENCED | KVMPPC_RMAP_PRESENT; /* unlock */ } +EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); /* Remove this HPTE from the chain for a real page */ static void remove_revmap_chain(struct kvm *kvm, long pte_index, @@ -118,12 +119,33 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, unlock_rmap(rmap); } +static pte_t lookup_linux_pte(struct kvm_vcpu *vcpu, unsigned long hva, + unsigned long *pte_sizep) +{ + pte_t *ptep; + unsigned long ps = *pte_sizep; + unsigned int shift; + + ptep = find_linux_pte_or_hugepte(vcpu->arch.pgdir, hva, &shift); + if (!ptep) + return __pte(0); + if (shift) + *pte_sizep = 1ul << shift; + else + *pte_sizep = PAGE_SIZE; + if (ps > *pte_sizep) + return __pte(0); + if (!pte_present(*ptep)) + return __pte(0); + return kvmppc_read_update_linux_pte(ptep); +} + long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel) { struct kvm *kvm = vcpu->kvm; unsigned long i, pa, gpa, gfn, psize; - unsigned long slot_fn; + unsigned long slot_fn, hva; unsigned long *hpte; struct revmap_entry *rev; unsigned long g_ptel = ptel; @@ -131,6 +153,8 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long *physp, pte_size; unsigned long is_io; unsigned long *rmap; + pte_t pte; + unsigned long mmu_seq; bool realmode = vcpu->arch.vcore->vcore_state == VCORE_RUNNING; psize = hpte_page_size(pteh, ptel); @@ -138,11 +162,16 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, return H_PARAMETER; pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID); + /* used later to detect if we might have been invalidated */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + /* Find the memslot (if any) for this address */ gpa = (ptel & HPTE_R_RPN) & ~(psize - 1); gfn = gpa >> PAGE_SHIFT; memslot = builtin_gfn_to_memslot(kvm, gfn); pa = 0; + is_io = ~0ul; rmap = NULL; if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) { /* PPC970 can't do emulated MMIO */ @@ -160,19 +189,31 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, slot_fn = gfn - memslot->base_gfn; rmap = &memslot->rmap[slot_fn]; - physp = kvm->arch.slot_phys[memslot->id]; - if (!physp) - return H_PARAMETER; - physp += slot_fn; - if (realmode) - physp = real_vmalloc_addr(physp); - pa = *physp; - if (!pa) - return H_TOO_HARD; - is_io = pa & (HPTE_R_I | HPTE_R_W); - pte_size = PAGE_SIZE << (pa & KVMPPC_PAGE_ORDER_MASK); - pa &= PAGE_MASK; - + if (!kvm->arch.using_mmu_notifiers) { + physp = kvm->arch.slot_phys[memslot->id]; + if (!physp) + return H_PARAMETER; + physp += slot_fn; + if (realmode) + physp = real_vmalloc_addr(physp); + pa = *physp; + if (!pa) + return H_TOO_HARD; + is_io = pa & (HPTE_R_I | HPTE_R_W); + pte_size = PAGE_SIZE << (pa & KVMPPC_PAGE_ORDER_MASK); + pa &= PAGE_MASK; + } else { + /* Translate to host virtual address */ + hva = gfn_to_hva_memslot(memslot, gfn); + + /* Look up the Linux PTE for the backing page */ + pte_size = psize; + pte = lookup_linux_pte(vcpu, hva, &pte_size); + if (pte_present(pte)) { + is_io = hpte_cache_bits(pte_val(pte)); + pa = pte_pfn(pte) << PAGE_SHIFT; + } + } if (pte_size < psize) return H_PARAMETER; if (pa && pte_size > psize) @@ -180,10 +221,14 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, ptel &= ~(HPTE_R_PP0 - psize); ptel |= pa; - pteh |= HPTE_V_VALID; + + if (pa) + pteh |= HPTE_V_VALID; + else + pteh |= HPTE_V_ABSENT; /* Check WIMG */ - if (!hpte_cache_flags_ok(ptel, is_io)) { + if (is_io != ~0ul && !hpte_cache_flags_ok(ptel, is_io)) { if (is_io) return H_PARAMETER; /* @@ -194,6 +239,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, ptel |= HPTE_R_M; } + /* Find and lock the HPTEG slot to use */ do_insert: if (pte_index >= HPT_NPTE) return H_PARAMETER; @@ -253,7 +299,17 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, if (realmode) rmap = real_vmalloc_addr(rmap); lock_rmap(rmap); - kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index, realmode); + /* Check for pending invalidations under the rmap chain lock */ + if (kvm->arch.using_mmu_notifiers && + mmu_notifier_retry(vcpu, mmu_seq)) { + /* inval in progress, write a non-present HPTE */ + pteh |= HPTE_V_ABSENT; + pteh &= ~HPTE_V_VALID; + unlock_rmap(rmap); + } else { + kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index, + realmode); + } } hpte[1] = ptel; @@ -516,6 +572,23 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, return H_SUCCESS; } +void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep, + unsigned long pte_index) +{ + unsigned long rb; + + hptep[0] &= ~HPTE_V_VALID; + rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index); + while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) + cpu_relax(); + asm volatile("ptesync" : : : "memory"); + asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" + : : "r" (rb), "r" (kvm->arch.lpid)); + asm volatile("ptesync" : : : "memory"); + kvm->arch.tlbie_lock = 0; +} +EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte); + static int slb_base_page_shift[4] = { 24, /* 16M */ 16, /* 64k */ @@ -605,15 +678,15 @@ EXPORT_SYMBOL(kvmppc_hv_find_lock_hpte); /* * Called in real mode to check whether an HPTE not found fault - * is due to accessing an emulated MMIO page. + * is due to accessing a paged-out page or an emulated MMIO page. * Returns a possibly modified status (DSISR) value if not * (i.e. pass the interrupt to the guest), * -1 to pass the fault up to host kernel mode code, -2 to do that - * and also load the instruction word, + * and also load the instruction word (for MMIO emulation), * or 0 if we should make the guest retry the access. */ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, - unsigned long slb_v, unsigned int status) + unsigned long slb_v, unsigned int status, bool data) { struct kvm *kvm = vcpu->kvm; long int index; @@ -624,6 +697,7 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, unsigned long pp, key; valid = HPTE_V_VALID | HPTE_V_ABSENT; + index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid); if (index < 0) return status; /* there really was no HPTE */ @@ -645,22 +719,28 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, /* Check access permissions to the page */ pp = gr & (HPTE_R_PP0 | HPTE_R_PP); key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS; - if (status & DSISR_ISSTORE) { + status &= ~DSISR_NOHPTE; /* DSISR_NOHPTE == SRR1_ISI_NOPT */ + if (!data) { + if (gr & (HPTE_R_N | HPTE_R_G)) + return status | SRR1_ISI_N_OR_G; + if (!hpte_read_permission(pp, slb_v & key)) + return status | SRR1_ISI_PROT; + } else if (status & DSISR_ISSTORE) { /* check write permission */ if (!hpte_write_permission(pp, slb_v & key)) - goto protfault; + return status | DSISR_PROTFAULT; } else { if (!hpte_read_permission(pp, slb_v & key)) - goto protfault; + return status | DSISR_PROTFAULT; } /* Check storage key, if applicable */ - if (vcpu->arch.shregs.msr & MSR_DR) { + if (data && (vcpu->arch.shregs.msr & MSR_DR)) { unsigned int perm = hpte_get_skey_perm(gr, vcpu->arch.amr); if (status & DSISR_ISSTORE) perm >>= 1; if (perm & 1) - return (status & ~DSISR_NOHPTE) | DSISR_KEYFAULT; + return status | DSISR_KEYFAULT; } /* Save HPTE info for virtual-mode handler */ @@ -669,11 +749,11 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, vcpu->arch.pgfault_hpte[0] = v; vcpu->arch.pgfault_hpte[1] = r; - if (vcpu->arch.shregs.msr & MSR_IR) + /* Check the storage key to see if it is possibly emulated MMIO */ + if (data && (vcpu->arch.shregs.msr & MSR_IR) && + (r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) == + (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) return -2; /* MMIO emulation - load instr word */ return -1; /* send fault up to host kernel mode */ - - protfault: - return (status & ~DSISR_NOHPTE) | DSISR_PROTFAULT; } diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index d07b64d5f37e..7d4990665d00 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -621,6 +621,8 @@ BEGIN_FTR_SECTION /* If this is a page table miss then see if it's theirs or ours */ cmpwi r12, BOOK3S_INTERRUPT_H_DATA_STORAGE beq kvmppc_hdsi + cmpwi r12, BOOK3S_INTERRUPT_H_INST_STORAGE + beq kvmppc_hisi END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) /* See if this is a leftover HDEC interrupt */ @@ -1125,6 +1127,7 @@ kvmppc_hdsi: /* Search the hash table. */ mr r3, r9 /* vcpu pointer */ + li r7, 1 /* data fault */ bl .kvmppc_hpte_hv_fault ld r9, HSTATE_KVM_VCPU(r13) ld r10, VCPU_PC(r9) @@ -1181,6 +1184,52 @@ kvmppc_hdsi: stb r0, HSTATE_IN_GUEST(r13) b nohpte_cont +/* + * Similarly for an HISI, reflect it to the guest as an ISI unless + * it is an HPTE not found fault for a page that we have paged out. + */ +kvmppc_hisi: + andis. r0, r11, SRR1_ISI_NOPT@h + beq 1f + andi. r0, r11, MSR_IR /* instruction relocation enabled? */ + beq 3f + clrrdi r0, r10, 28 + PPC_SLBFEE_DOT(r5, r0) /* if so, look up SLB */ + bne 1f /* if no SLB entry found */ +4: + /* Search the hash table. */ + mr r3, r9 /* vcpu pointer */ + mr r4, r10 + mr r6, r11 + li r7, 0 /* instruction fault */ + bl .kvmppc_hpte_hv_fault + ld r9, HSTATE_KVM_VCPU(r13) + ld r10, VCPU_PC(r9) + ld r11, VCPU_MSR(r9) + li r12, BOOK3S_INTERRUPT_H_INST_STORAGE + cmpdi r3, 0 /* retry the instruction */ + beq 6f + cmpdi r3, -1 /* handle in kernel mode */ + beq nohpte_cont + + /* Synthesize an ISI for the guest */ + mr r11, r3 +1: mtspr SPRN_SRR0, r10 + mtspr SPRN_SRR1, r11 + li r10, BOOK3S_INTERRUPT_INST_STORAGE + li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ + rotldi r11, r11, 63 +6: ld r7, VCPU_CTR(r9) + lwz r8, VCPU_XER(r9) + mtctr r7 + mtxer r8 + mr r4, r9 + b fast_guest_return + +3: ld r6, VCPU_KVM(r9) /* not relocated, use VRMA */ + ld r5, KVM_VRMA_SLB_V(r6) + b 4b + /* * Try to handle an hcall in real mode. * Returns to the guest if we handle it, or continues on up to diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 4f85ac32258a..06e955b5837e 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -245,6 +245,9 @@ int kvm_dev_ioctl_check_extension(long ext) if (cpu_has_feature(CPU_FTR_ARCH_201)) r = 2; break; + case KVM_CAP_SYNC_MMU: + r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0; + break; #endif default: r = 0; diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index a8b3cc7d90fe..f348c3d90404 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -103,6 +104,7 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift *shift = hugepd_shift(*hpdp); return hugepte_offset(hpdp, ea, pdshift); } +EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte); pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { -- cgit v1.2.3 From a355aa54f1d25dff83c0feef8863d83a76988fdb Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 12 Dec 2011 12:37:21 +0000 Subject: KVM: Add barriers to allow mmu_notifier_retry to be used locklessly This adds an smp_wmb in kvm_mmu_notifier_invalidate_range_end() and an smp_rmb in mmu_notifier_retry() so that mmu_notifier_retry() will give the correct answer when called without kvm->mmu_lock being held. PowerPC Book3S HV KVM wants to use a bitlock per guest page rather than a single global spinlock in order to improve the scalability of updates to the guest MMU hashed page table, and so needs this. Signed-off-by: Paul Mackerras Acked-by: Avi Kivity Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 14 +++++++++----- virt/kvm/kvm_main.c | 6 +++--- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d4d4d7092110..eada8e69fe58 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -702,12 +702,16 @@ static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_se if (unlikely(vcpu->kvm->mmu_notifier_count)) return 1; /* - * Both reads happen under the mmu_lock and both values are - * modified under mmu_lock, so there's no need of smb_rmb() - * here in between, otherwise mmu_notifier_count should be - * read before mmu_notifier_seq, see - * mmu_notifier_invalidate_range_end write side. + * Ensure the read of mmu_notifier_count happens before the read + * of mmu_notifier_seq. This interacts with the smp_wmb() in + * mmu_notifier_invalidate_range_end to make sure that the caller + * either sees the old (non-zero) value of mmu_notifier_count or + * the new (incremented) value of mmu_notifier_seq. + * PowerPC Book3s HV KVM calls this under a per-page lock + * rather than under kvm->mmu_lock, for scalability, so + * can't rely on kvm->mmu_lock to keep things ordered. */ + smp_rmb(); if (vcpu->kvm->mmu_notifier_seq != mmu_seq) return 1; return 0; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 64be836f3348..9f32bffd37c0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -357,11 +357,11 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, * been freed. */ kvm->mmu_notifier_seq++; + smp_wmb(); /* * The above sequence increase must be visible before the - * below count decrease but both values are read by the kvm - * page fault under mmu_lock spinlock so we don't need to add - * a smb_wmb() here in between the two. + * below count decrease, which is ensured by the smp_wmb above + * in conjunction with the smp_rmb in mmu_notifier_retry(). */ kvm->mmu_notifier_count--; spin_unlock(&kvm->mmu_lock); -- cgit v1.2.3 From 4cf302bc106566c5bad523337296ea8b72df63f5 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 12 Dec 2011 12:38:51 +0000 Subject: KVM: PPC: Allow for read-only pages backing a Book3S HV guest With this, if a guest does an H_ENTER with a read/write HPTE on a page which is currently read-only, we make the actual HPTE inserted be a read-only version of the HPTE. We now intercept protection faults as well as HPTE not found faults, and for a protection fault we work out whether it should be reflected to the guest (e.g. because the guest HPTE didn't allow write access to usermode) or handled by switching to kernel context and calling kvmppc_book3s_hv_page_fault, which will then request write access to the page and update the actual HPTE. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_book3s_64.h | 20 ++++++++++++++-- arch/powerpc/kvm/book3s_64_mmu_hv.c | 39 +++++++++++++++++++++++++++++--- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 32 ++++++++++++++++++-------- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 4 ++-- 4 files changed, 78 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index c21e46da4a3b..b0c08b142770 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -121,6 +121,22 @@ static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize) return ((ptel & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT; } +static inline int hpte_is_writable(unsigned long ptel) +{ + unsigned long pp = ptel & (HPTE_R_PP0 | HPTE_R_PP); + + return pp != PP_RXRX && pp != PP_RXXX; +} + +static inline unsigned long hpte_make_readonly(unsigned long ptel) +{ + if ((ptel & HPTE_R_PP0) || (ptel & HPTE_R_PP) == PP_RWXX) + ptel = (ptel & ~HPTE_R_PP) | PP_RXXX; + else + ptel |= PP_RXRX; + return ptel; +} + static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type) { unsigned int wimg = ptel & HPTE_R_WIMG; @@ -140,7 +156,7 @@ static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type) * Lock and read a linux PTE. If it's present and writable, atomically * set dirty and referenced bits and return the PTE, otherwise return 0. */ -static inline pte_t kvmppc_read_update_linux_pte(pte_t *p) +static inline pte_t kvmppc_read_update_linux_pte(pte_t *p, int writing) { pte_t pte, tmp; @@ -158,7 +174,7 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t *p) if (pte_present(pte)) { pte = pte_mkyoung(pte); - if (pte_write(pte)) + if (writing && pte_write(pte)) pte = pte_mkdirty(pte); } diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 83761dd8a924..66d6452c1081 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -503,6 +503,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, struct page *page, *pages[1]; long index, ret, npages; unsigned long is_io; + unsigned int writing, write_ok; struct vm_area_struct *vma; /* @@ -553,8 +554,11 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, pfn = 0; page = NULL; pte_size = PAGE_SIZE; + writing = (dsisr & DSISR_ISSTORE) != 0; + /* If writing != 0, then the HPTE must allow writing, if we get here */ + write_ok = writing; hva = gfn_to_hva_memslot(memslot, gfn); - npages = get_user_pages_fast(hva, 1, 1, pages); + npages = get_user_pages_fast(hva, 1, writing, pages); if (npages < 1) { /* Check if it's an I/O mapping */ down_read(¤t->mm->mmap_sem); @@ -565,6 +569,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, ((hva - vma->vm_start) >> PAGE_SHIFT); pte_size = psize; is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot)); + write_ok = vma->vm_flags & VM_WRITE; } up_read(¤t->mm->mmap_sem); if (!pfn) @@ -575,6 +580,24 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, page = compound_head(page); pte_size <<= compound_order(page); } + /* if the guest wants write access, see if that is OK */ + if (!writing && hpte_is_writable(r)) { + pte_t *ptep, pte; + + /* + * We need to protect against page table destruction + * while looking up and updating the pte. + */ + rcu_read_lock_sched(); + ptep = find_linux_pte_or_hugepte(current->mm->pgd, + hva, NULL); + if (ptep && pte_present(*ptep)) { + pte = kvmppc_read_update_linux_pte(ptep, 1); + if (pte_write(pte)) + write_ok = 1; + } + rcu_read_unlock_sched(); + } pfn = page_to_pfn(page); } @@ -595,6 +618,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, /* Set the HPTE to point to pfn */ r = (r & ~(HPTE_R_PP0 - pte_size)) | (pfn << PAGE_SHIFT); + if (hpte_is_writable(r) && !write_ok) + r = hpte_make_readonly(r); ret = RESUME_GUEST; preempt_disable(); while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) @@ -614,14 +639,22 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, unlock_rmap(rmap); goto out_unlock; } - kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0); + + if (hptep[0] & HPTE_V_VALID) { + /* HPTE was previously valid, so we need to invalidate it */ + unlock_rmap(rmap); + hptep[0] |= HPTE_V_ABSENT; + kvmppc_invalidate_hpte(kvm, hptep, index); + } else { + kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0); + } hptep[1] = r; eieio(); hptep[0] = hpte[0]; asm volatile("ptesync" : : : "memory"); preempt_enable(); - if (page) + if (page && hpte_is_writable(r)) SetPageDirty(page); out_put: diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 81d16ed9767d..d3e36fc77e2b 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -120,7 +120,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, } static pte_t lookup_linux_pte(struct kvm_vcpu *vcpu, unsigned long hva, - unsigned long *pte_sizep) + int writing, unsigned long *pte_sizep) { pte_t *ptep; unsigned long ps = *pte_sizep; @@ -137,7 +137,7 @@ static pte_t lookup_linux_pte(struct kvm_vcpu *vcpu, unsigned long hva, return __pte(0); if (!pte_present(*ptep)) return __pte(0); - return kvmppc_read_update_linux_pte(ptep); + return kvmppc_read_update_linux_pte(ptep, writing); } long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, @@ -154,12 +154,14 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long is_io; unsigned long *rmap; pte_t pte; + unsigned int writing; unsigned long mmu_seq; bool realmode = vcpu->arch.vcore->vcore_state == VCORE_RUNNING; psize = hpte_page_size(pteh, ptel); if (!psize) return H_PARAMETER; + writing = hpte_is_writable(ptel); pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID); /* used later to detect if we might have been invalidated */ @@ -208,8 +210,11 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, /* Look up the Linux PTE for the backing page */ pte_size = psize; - pte = lookup_linux_pte(vcpu, hva, &pte_size); + pte = lookup_linux_pte(vcpu, hva, writing, &pte_size); if (pte_present(pte)) { + if (writing && !pte_write(pte)) + /* make the actual HPTE be read-only */ + ptel = hpte_make_readonly(ptel); is_io = hpte_cache_bits(pte_val(pte)); pa = pte_pfn(pte) << PAGE_SHIFT; } @@ -678,7 +683,9 @@ EXPORT_SYMBOL(kvmppc_hv_find_lock_hpte); /* * Called in real mode to check whether an HPTE not found fault - * is due to accessing a paged-out page or an emulated MMIO page. + * is due to accessing a paged-out page or an emulated MMIO page, + * or if a protection fault is due to accessing a page that the + * guest wanted read/write access to but which we made read-only. * Returns a possibly modified status (DSISR) value if not * (i.e. pass the interrupt to the guest), * -1 to pass the fault up to host kernel mode code, -2 to do that @@ -696,12 +703,17 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, struct revmap_entry *rev; unsigned long pp, key; - valid = HPTE_V_VALID | HPTE_V_ABSENT; + /* For protection fault, expect to find a valid HPTE */ + valid = HPTE_V_VALID; + if (status & DSISR_NOHPTE) + valid |= HPTE_V_ABSENT; index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid); - if (index < 0) - return status; /* there really was no HPTE */ - + if (index < 0) { + if (status & DSISR_NOHPTE) + return status; /* there really was no HPTE */ + return 0; /* for prot fault, HPTE disappeared */ + } hpte = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); v = hpte[0] & ~HPTE_V_HVLOCK; r = hpte[1]; @@ -712,8 +724,8 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, asm volatile("lwsync" : : : "memory"); hpte[0] = v; - /* If the HPTE is valid by now, retry the instruction */ - if (v & HPTE_V_VALID) + /* For not found, if the HPTE is valid by now, retry the instruction */ + if ((status & DSISR_NOHPTE) && (v & HPTE_V_VALID)) return 0; /* Check access permissions to the page */ diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 7d4990665d00..b70bf22a3ff3 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1114,8 +1114,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) kvmppc_hdsi: mfspr r4, SPRN_HDAR mfspr r6, SPRN_HDSISR - /* HPTE not found fault? */ - andis. r0, r6, DSISR_NOHPTE@h + /* HPTE not found fault or protection fault? */ + andis. r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h beq 1f /* if not, send it to the guest */ andi. r0, r11, MSR_DR /* data relocation enabled? */ beq 3f -- cgit v1.2.3 From 03cdab5340b423ec88fc18eb158a62a8a7b94d7f Mon Sep 17 00:00:00 2001 From: Matt Evans Date: Tue, 6 Dec 2011 21:19:42 +0000 Subject: KVM: PPC: Fix vcpu_create dereference before validity check. Fix usage of vcpu struct before check that it's actually valid. Signed-off-by: Matt Evans Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kvm/powerpc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 06e955b5837e..25171037a36c 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -289,9 +289,10 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { struct kvm_vcpu *vcpu; vcpu = kvmppc_core_vcpu_create(kvm, id); - vcpu->arch.wqp = &vcpu->wq; - if (!IS_ERR(vcpu)) + if (!IS_ERR(vcpu)) { + vcpu->arch.wqp = &vcpu->wq; kvmppc_create_vcpu_debugfs(vcpu, id); + } return vcpu; } -- cgit v1.2.3 From b5434032fcfd7490c6453feb397fb781762b6f09 Mon Sep 17 00:00:00 2001 From: Matt Evans Date: Wed, 7 Dec 2011 16:55:57 +0000 Subject: KVM: PPC: Add KVM_CAP_NR_VCPUS and KVM_CAP_MAX_VCPUS PPC KVM lacks these two capabilities, and as such a userland system must assume a max of 4 VCPUs (following api.txt). With these, a userland can determine a more realistic limit. Signed-off-by: Matt Evans Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kvm/powerpc.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 25171037a36c..f4380cb264e0 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -249,6 +249,22 @@ int kvm_dev_ioctl_check_extension(long ext) r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0; break; #endif + case KVM_CAP_NR_VCPUS: + /* + * Recommending a number of CPUs is somewhat arbitrary; we + * return the number of present CPUs for -HV (since a host + * will have secondary threads "offline"), and for other KVM + * implementations just count online CPUs. + */ +#ifdef CONFIG_KVM_BOOK3S_64_HV + r = num_present_cpus(); +#else + r = num_online_cpus(); +#endif + break; + case KVM_CAP_MAX_VCPUS: + r = KVM_MAX_VCPUS; + break; default: r = 0; break; -- cgit v1.2.3 From a92bce95f0f967dfa6205527d7143d276b0be6a7 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 15 Dec 2011 02:01:10 +0000 Subject: KVM: PPC: Book3S HV: Keep HPTE locked when invalidating This reworks the implementations of the H_REMOVE and H_BULK_REMOVE hcalls to make sure that we keep the HPTE locked and in the reverse- mapping chain until we have finished invalidating it. Previously we would remove it from the chain and unlock it before invalidating it, leaving a tiny window when the guest could access the page even though we believe we have removed it from the guest (e.g., kvm_unmap_hva() has been called for the page and has found no HPTEs in the chain). In addition, we'll need this for future patches where we will need to read the R and C bits in the HPTE after invalidating it. Doing this required restructuring kvmppc_h_bulk_remove() substantially. Since we want to batch up the tlbies, we now need to keep several HPTEs locked simultaneously. In order to avoid possible deadlocks, we don't spin on the HPTE bitlock for any except the first HPTE in a batch. If we can't acquire the HPTE bitlock for the second or subsequent HPTE, we terminate the batch at that point, do the tlbies that we have accumulated so far, unlock those HPTEs, and then start a new batch to do the remaining invalidations. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 212 +++++++++++++++++++++--------------- 1 file changed, 125 insertions(+), 87 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index d3e36fc77e2b..ba4a1376b331 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -140,6 +140,12 @@ static pte_t lookup_linux_pte(struct kvm_vcpu *vcpu, unsigned long hva, return kvmppc_read_update_linux_pte(ptep, writing); } +static inline void unlock_hpte(unsigned long *hpte, unsigned long hpte_v) +{ + asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); + hpte[0] = hpte_v; +} + long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel) { @@ -356,6 +362,7 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, struct kvm *kvm = vcpu->kvm; unsigned long *hpte; unsigned long v, r, rb; + struct revmap_entry *rev; if (pte_index >= HPT_NPTE) return H_PARAMETER; @@ -368,30 +375,32 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, hpte[0] &= ~HPTE_V_HVLOCK; return H_NOT_FOUND; } - if (atomic_read(&kvm->online_vcpus) == 1) - flags |= H_LOCAL; - vcpu->arch.gpr[4] = v = hpte[0] & ~HPTE_V_HVLOCK; - vcpu->arch.gpr[5] = r = hpte[1]; - rb = compute_tlbie_rb(v, r, pte_index); - if (v & HPTE_V_VALID) + + rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + v = hpte[0] & ~HPTE_V_HVLOCK; + if (v & HPTE_V_VALID) { + hpte[0] &= ~HPTE_V_VALID; + rb = compute_tlbie_rb(v, hpte[1], pte_index); + if (!(flags & H_LOCAL) && atomic_read(&kvm->online_vcpus) > 1) { + while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) + cpu_relax(); + asm volatile("ptesync" : : : "memory"); + asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" + : : "r" (rb), "r" (kvm->arch.lpid)); + asm volatile("ptesync" : : : "memory"); + kvm->arch.tlbie_lock = 0; + } else { + asm volatile("ptesync" : : : "memory"); + asm volatile("tlbiel %0" : : "r" (rb)); + asm volatile("ptesync" : : : "memory"); + } remove_revmap_chain(kvm, pte_index, v); - smp_wmb(); - hpte[0] = 0; - if (!(v & HPTE_V_VALID)) - return H_SUCCESS; - if (!(flags & H_LOCAL)) { - while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) - cpu_relax(); - asm volatile("ptesync" : : : "memory"); - asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" - : : "r" (rb), "r" (kvm->arch.lpid)); - asm volatile("ptesync" : : : "memory"); - kvm->arch.tlbie_lock = 0; - } else { - asm volatile("ptesync" : : : "memory"); - asm volatile("tlbiel %0" : : "r" (rb)); - asm volatile("ptesync" : : : "memory"); } + r = rev->guest_rpte; + unlock_hpte(hpte, 0); + + vcpu->arch.gpr[4] = v; + vcpu->arch.gpr[5] = r; return H_SUCCESS; } @@ -399,82 +408,113 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; unsigned long *args = &vcpu->arch.gpr[4]; - unsigned long *hp, tlbrb[4]; - long int i, found; - long int n_inval = 0; - unsigned long flags, req, pte_index; + unsigned long *hp, *hptes[4], tlbrb[4]; + long int i, j, k, n, found, indexes[4]; + unsigned long flags, req, pte_index, rcbits; long int local = 0; long int ret = H_SUCCESS; + struct revmap_entry *rev, *revs[4]; if (atomic_read(&kvm->online_vcpus) == 1) local = 1; - for (i = 0; i < 4; ++i) { - pte_index = args[i * 2]; - flags = pte_index >> 56; - pte_index &= ((1ul << 56) - 1); - req = flags >> 6; - flags &= 3; - if (req == 3) - break; - if (req != 1 || flags == 3 || - pte_index >= HPT_NPTE) { - /* parameter error */ - args[i * 2] = ((0xa0 | flags) << 56) + pte_index; - ret = H_PARAMETER; - break; - } - hp = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); - while (!try_lock_hpte(hp, HPTE_V_HVLOCK)) - cpu_relax(); - found = 0; - if (hp[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) { - switch (flags & 3) { - case 0: /* absolute */ - found = 1; + for (i = 0; i < 4 && ret == H_SUCCESS; ) { + n = 0; + for (; i < 4; ++i) { + j = i * 2; + pte_index = args[j]; + flags = pte_index >> 56; + pte_index &= ((1ul << 56) - 1); + req = flags >> 6; + flags &= 3; + if (req == 3) { /* no more requests */ + i = 4; break; - case 1: /* andcond */ - if (!(hp[0] & args[i * 2 + 1])) - found = 1; + } + if (req != 1 || flags == 3 || pte_index >= HPT_NPTE) { + /* parameter error */ + args[j] = ((0xa0 | flags) << 56) + pte_index; + ret = H_PARAMETER; break; - case 2: /* AVPN */ - if ((hp[0] & ~0x7fUL) == args[i * 2 + 1]) + } + hp = (unsigned long *) + (kvm->arch.hpt_virt + (pte_index << 4)); + /* to avoid deadlock, don't spin except for first */ + if (!try_lock_hpte(hp, HPTE_V_HVLOCK)) { + if (n) + break; + while (!try_lock_hpte(hp, HPTE_V_HVLOCK)) + cpu_relax(); + } + found = 0; + if (hp[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) { + switch (flags & 3) { + case 0: /* absolute */ found = 1; - break; + break; + case 1: /* andcond */ + if (!(hp[0] & args[j + 1])) + found = 1; + break; + case 2: /* AVPN */ + if ((hp[0] & ~0x7fUL) == args[j + 1]) + found = 1; + break; + } + } + if (!found) { + hp[0] &= ~HPTE_V_HVLOCK; + args[j] = ((0x90 | flags) << 56) + pte_index; + continue; } + + args[j] = ((0x80 | flags) << 56) + pte_index; + rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + /* insert R and C bits from guest PTE */ + rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C); + args[j] |= rcbits << (56 - 5); + + if (!(hp[0] & HPTE_V_VALID)) + continue; + + hp[0] &= ~HPTE_V_VALID; /* leave it locked */ + tlbrb[n] = compute_tlbie_rb(hp[0], hp[1], pte_index); + indexes[n] = j; + hptes[n] = hp; + revs[n] = rev; + ++n; } - if (!found) { - hp[0] &= ~HPTE_V_HVLOCK; - args[i * 2] = ((0x90 | flags) << 56) + pte_index; - continue; + + if (!n) + break; + + /* Now that we've collected a batch, do the tlbies */ + if (!local) { + while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) + cpu_relax(); + asm volatile("ptesync" : : : "memory"); + for (k = 0; k < n; ++k) + asm volatile(PPC_TLBIE(%1,%0) : : + "r" (tlbrb[k]), + "r" (kvm->arch.lpid)); + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); + kvm->arch.tlbie_lock = 0; + } else { + asm volatile("ptesync" : : : "memory"); + for (k = 0; k < n; ++k) + asm volatile("tlbiel %0" : : "r" (tlbrb[k])); + asm volatile("ptesync" : : : "memory"); } - /* insert R and C bits from PTE */ - flags |= (hp[1] >> 5) & 0x0c; - args[i * 2] = ((0x80 | flags) << 56) + pte_index; - if (hp[0] & HPTE_V_VALID) { - tlbrb[n_inval++] = compute_tlbie_rb(hp[0], hp[1], pte_index); + + for (k = 0; k < n; ++k) { + j = indexes[k]; + pte_index = args[j] & ((1ul << 56) - 1); + hp = hptes[k]; + rev = revs[k]; remove_revmap_chain(kvm, pte_index, hp[0]); + unlock_hpte(hp, 0); } - smp_wmb(); - hp[0] = 0; - } - if (n_inval == 0) - return ret; - - if (!local) { - while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) - cpu_relax(); - asm volatile("ptesync" : : : "memory"); - for (i = 0; i < n_inval; ++i) - asm volatile(PPC_TLBIE(%1,%0) - : : "r" (tlbrb[i]), "r" (kvm->arch.lpid)); - asm volatile("eieio; tlbsync; ptesync" : : : "memory"); - kvm->arch.tlbie_lock = 0; - } else { - asm volatile("ptesync" : : : "memory"); - for (i = 0; i < n_inval; ++i) - asm volatile("tlbiel %0" : : "r" (tlbrb[i])); - asm volatile("ptesync" : : : "memory"); } + return ret; } @@ -720,9 +760,7 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, rev = real_vmalloc_addr(&kvm->arch.revmap[index]); gr = rev->guest_rpte; - /* Unlock the HPTE */ - asm volatile("lwsync" : : : "memory"); - hpte[0] = v; + unlock_hpte(hpte, v); /* For not found, if the HPTE is valid by now, retry the instruction */ if ((status & DSISR_NOHPTE) && (v & HPTE_V_VALID)) -- cgit v1.2.3 From bad3b5075eeb18cb1641b4171618add638bc0fa7 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 15 Dec 2011 02:02:02 +0000 Subject: KVM: PPC: Book3s HV: Maintain separate guest and host views of R and C bits This allows both the guest and the host to use the referenced (R) and changed (C) bits in the guest hashed page table. The guest has a view of R and C that is maintained in the guest_rpte field of the revmap entry for the HPTE, and the host has a view that is maintained in the rmap entry for the associated gfn. Both view are updated from the guest HPT. If a bit (R or C) is zero in either view, it will be initially set to zero in the HPTE (or HPTEs), until set to 1 by hardware. When an HPTE is removed for any reason, the R and C bits from the HPTE are ORed into both views. We have to be careful to read the R and C bits from the HPTE after invalidating it, but before unlocking it, in case of any late updates by the hardware. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_host.h | 5 ++-- arch/powerpc/kvm/book3s_64_mmu_hv.c | 48 +++++++++++++++++++++++-------------- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 45 +++++++++++++++++++--------------- 3 files changed, 59 insertions(+), 39 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 968f3aa61cd1..1cb6e522485b 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -200,8 +200,9 @@ struct revmap_entry { * index in the guest HPT of a HPTE that points to the page. */ #define KVMPPC_RMAP_LOCK_BIT 63 -#define KVMPPC_RMAP_REF_BIT 33 -#define KVMPPC_RMAP_REFERENCED (1ul << KVMPPC_RMAP_REF_BIT) +#define KVMPPC_RMAP_RC_SHIFT 32 +#define KVMPPC_RMAP_REFERENCED (HPTE_R_R << KVMPPC_RMAP_RC_SHIFT) +#define KVMPPC_RMAP_CHANGED (HPTE_R_C << KVMPPC_RMAP_RC_SHIFT) #define KVMPPC_RMAP_PRESENT 0x100000000ul #define KVMPPC_RMAP_INDEX 0xfffffffful diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 66d6452c1081..aa51ddef468e 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -505,6 +505,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned long is_io; unsigned int writing, write_ok; struct vm_area_struct *vma; + unsigned long rcbits; /* * Real-mode code has already searched the HPT and found the @@ -640,11 +641,17 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, goto out_unlock; } + /* Only set R/C in real HPTE if set in both *rmap and guest_rpte */ + rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT; + r &= rcbits | ~(HPTE_R_R | HPTE_R_C); + if (hptep[0] & HPTE_V_VALID) { /* HPTE was previously valid, so we need to invalidate it */ unlock_rmap(rmap); hptep[0] |= HPTE_V_ABSENT; kvmppc_invalidate_hpte(kvm, hptep, index); + /* don't lose previous R and C bits */ + r |= hptep[1] & (HPTE_R_R | HPTE_R_C); } else { kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0); } @@ -701,50 +708,55 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, struct revmap_entry *rev = kvm->arch.revmap; unsigned long h, i, j; unsigned long *hptep; - unsigned long ptel, psize; + unsigned long ptel, psize, rcbits; for (;;) { - while (test_and_set_bit_lock(KVMPPC_RMAP_LOCK_BIT, rmapp)) - cpu_relax(); + lock_rmap(rmapp); if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { - __clear_bit_unlock(KVMPPC_RMAP_LOCK_BIT, rmapp); + unlock_rmap(rmapp); break; } /* * To avoid an ABBA deadlock with the HPTE lock bit, - * we have to unlock the rmap chain before locking the HPTE. - * Thus we remove the first entry, unlock the rmap chain, - * lock the HPTE and then check that it is for the - * page we're unmapping before changing it to non-present. + * we can't spin on the HPTE lock while holding the + * rmap chain lock. */ i = *rmapp & KVMPPC_RMAP_INDEX; + hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); + if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { + /* unlock rmap before spinning on the HPTE lock */ + unlock_rmap(rmapp); + while (hptep[0] & HPTE_V_HVLOCK) + cpu_relax(); + continue; + } j = rev[i].forw; if (j == i) { /* chain is now empty */ - j = 0; + *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX); } else { /* remove i from chain */ h = rev[i].back; rev[h].forw = j; rev[j].back = h; rev[i].forw = rev[i].back = i; - j |= KVMPPC_RMAP_PRESENT; + *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j; } - smp_wmb(); - *rmapp = j | (1ul << KVMPPC_RMAP_REF_BIT); - /* Now lock, check and modify the HPTE */ - hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); - while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) - cpu_relax(); + /* Now check and modify the HPTE */ ptel = rev[i].guest_rpte; psize = hpte_page_size(hptep[0], ptel); if ((hptep[0] & HPTE_V_VALID) && hpte_rpn(ptel, psize) == gfn) { - kvmppc_invalidate_hpte(kvm, hptep, i); hptep[0] |= HPTE_V_ABSENT; + kvmppc_invalidate_hpte(kvm, hptep, i); + /* Harvest R and C */ + rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C); + *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; + rev[i].guest_rpte = ptel | rcbits; } + unlock_rmap(rmapp); hptep[0] &= ~HPTE_V_HVLOCK; } return 0; @@ -767,7 +779,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, kvm_unmap_rmapp(kvm, rmapp, gfn); while (test_and_set_bit_lock(KVMPPC_RMAP_LOCK_BIT, rmapp)) cpu_relax(); - __clear_bit(KVMPPC_RMAP_REF_BIT, rmapp); + *rmapp &= ~KVMPPC_RMAP_REFERENCED; __clear_bit_unlock(KVMPPC_RMAP_LOCK_BIT, rmapp); return 1; } diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index ba4a1376b331..91b45a03f438 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -87,15 +87,17 @@ EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); /* Remove this HPTE from the chain for a real page */ static void remove_revmap_chain(struct kvm *kvm, long pte_index, - unsigned long hpte_v) + struct revmap_entry *rev, + unsigned long hpte_v, unsigned long hpte_r) { - struct revmap_entry *rev, *next, *prev; + struct revmap_entry *next, *prev; unsigned long gfn, ptel, head; struct kvm_memory_slot *memslot; unsigned long *rmap; + unsigned long rcbits; - rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); - ptel = rev->guest_rpte; + rcbits = hpte_r & (HPTE_R_R | HPTE_R_C); + ptel = rev->guest_rpte |= rcbits; gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel)); memslot = builtin_gfn_to_memslot(kvm, gfn); if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) @@ -116,6 +118,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, else *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | head; } + *rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT; unlock_rmap(rmap); } @@ -162,6 +165,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, pte_t pte; unsigned int writing; unsigned long mmu_seq; + unsigned long rcbits; bool realmode = vcpu->arch.vcore->vcore_state == VCORE_RUNNING; psize = hpte_page_size(pteh, ptel); @@ -320,6 +324,9 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, } else { kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index, realmode); + /* Only set R/C in real HPTE if already set in *rmap */ + rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT; + ptel &= rcbits | ~(HPTE_R_R | HPTE_R_C); } } @@ -394,7 +401,8 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, asm volatile("tlbiel %0" : : "r" (rb)); asm volatile("ptesync" : : : "memory"); } - remove_revmap_chain(kvm, pte_index, v); + /* Read PTE low word after tlbie to get final R/C values */ + remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]); } r = rev->guest_rpte; unlock_hpte(hpte, 0); @@ -469,12 +477,13 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) args[j] = ((0x80 | flags) << 56) + pte_index; rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); - /* insert R and C bits from guest PTE */ - rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C); - args[j] |= rcbits << (56 - 5); - if (!(hp[0] & HPTE_V_VALID)) + if (!(hp[0] & HPTE_V_VALID)) { + /* insert R and C bits from PTE */ + rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C); + args[j] |= rcbits << (56 - 5); continue; + } hp[0] &= ~HPTE_V_VALID; /* leave it locked */ tlbrb[n] = compute_tlbie_rb(hp[0], hp[1], pte_index); @@ -505,13 +514,16 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) asm volatile("ptesync" : : : "memory"); } + /* Read PTE low words after tlbie to get final R/C values */ for (k = 0; k < n; ++k) { j = indexes[k]; pte_index = args[j] & ((1ul << 56) - 1); hp = hptes[k]; rev = revs[k]; - remove_revmap_chain(kvm, pte_index, hp[0]); - unlock_hpte(hp, 0); + remove_revmap_chain(kvm, pte_index, rev, hp[0], hp[1]); + rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C); + args[j] |= rcbits << (56 - 5); + hp[0] = 0; } } @@ -595,8 +607,7 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, pte_index &= ~3; n = 4; } - if (flags & H_R_XLATE) - rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); for (i = 0; i < n; ++i, ++pte_index) { hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); v = hpte[0] & ~HPTE_V_HVLOCK; @@ -605,12 +616,8 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, v &= ~HPTE_V_ABSENT; v |= HPTE_V_VALID; } - if (v & HPTE_V_VALID) { - if (rev) - r = rev[i].guest_rpte; - else - r = hpte[1] | HPTE_R_RPN; - } + if (v & HPTE_V_VALID) + r = rev[i].guest_rpte | (r & (HPTE_R_R | HPTE_R_C)); vcpu->arch.gpr[4 + i * 2] = v; vcpu->arch.gpr[5 + i * 2] = r; } -- cgit v1.2.3 From 55514893739d28f095f19b012133eea4cb4a9390 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 15 Dec 2011 02:02:47 +0000 Subject: KVM: PPC: Book3S HV: Use the hardware referenced bit for kvm_age_hva This uses the host view of the hardware R (referenced) bit to speed up kvm_age_hva() and kvm_test_age_hva(). Instead of removing all the relevant HPTEs in kvm_age_hva(), we now just reset their R bits if set. Also, kvm_test_age_hva() now scans the relevant HPTEs to see if any of them have R set. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_book3s.h | 2 + arch/powerpc/kvm/book3s_64_mmu_hv.c | 81 ++++++++++++++++++++++++++++++----- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 19 ++++++++ 3 files changed, 91 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 9240cebf8bad..33fdc09508a1 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -147,6 +147,8 @@ extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, unsigned long *rmap, long pte_index, int realmode); extern void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep, unsigned long pte_index); +void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep, + unsigned long pte_index); extern void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long addr, unsigned long *nb_ret); extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index aa51ddef468e..926e2b92bdab 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -772,16 +772,50 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long gfn) { - if (!kvm->arch.using_mmu_notifiers) - return 0; - if (!(*rmapp & KVMPPC_RMAP_REFERENCED)) - return 0; - kvm_unmap_rmapp(kvm, rmapp, gfn); - while (test_and_set_bit_lock(KVMPPC_RMAP_LOCK_BIT, rmapp)) - cpu_relax(); - *rmapp &= ~KVMPPC_RMAP_REFERENCED; - __clear_bit_unlock(KVMPPC_RMAP_LOCK_BIT, rmapp); - return 1; + struct revmap_entry *rev = kvm->arch.revmap; + unsigned long head, i, j; + unsigned long *hptep; + int ret = 0; + + retry: + lock_rmap(rmapp); + if (*rmapp & KVMPPC_RMAP_REFERENCED) { + *rmapp &= ~KVMPPC_RMAP_REFERENCED; + ret = 1; + } + if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { + unlock_rmap(rmapp); + return ret; + } + + i = head = *rmapp & KVMPPC_RMAP_INDEX; + do { + hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); + j = rev[i].forw; + + /* If this HPTE isn't referenced, ignore it */ + if (!(hptep[1] & HPTE_R_R)) + continue; + + if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { + /* unlock rmap before spinning on the HPTE lock */ + unlock_rmap(rmapp); + while (hptep[0] & HPTE_V_HVLOCK) + cpu_relax(); + goto retry; + } + + /* Now check and modify the HPTE */ + if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_R)) { + kvmppc_clear_ref_hpte(kvm, hptep, i); + rev[i].guest_rpte |= HPTE_R_R; + ret = 1; + } + hptep[0] &= ~HPTE_V_HVLOCK; + } while ((i = j) != head); + + unlock_rmap(rmapp); + return ret; } int kvm_age_hva(struct kvm *kvm, unsigned long hva) @@ -794,7 +828,32 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva) static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long gfn) { - return !!(*rmapp & KVMPPC_RMAP_REFERENCED); + struct revmap_entry *rev = kvm->arch.revmap; + unsigned long head, i, j; + unsigned long *hp; + int ret = 1; + + if (*rmapp & KVMPPC_RMAP_REFERENCED) + return 1; + + lock_rmap(rmapp); + if (*rmapp & KVMPPC_RMAP_REFERENCED) + goto out; + + if (*rmapp & KVMPPC_RMAP_PRESENT) { + i = head = *rmapp & KVMPPC_RMAP_INDEX; + do { + hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4)); + j = rev[i].forw; + if (hp[1] & HPTE_R_R) + goto out; + } while ((i = j) != head); + } + ret = 0; + + out: + unlock_rmap(rmapp); + return ret; } int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 91b45a03f438..5f3c60b89faf 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -641,6 +641,25 @@ void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep, } EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte); +void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep, + unsigned long pte_index) +{ + unsigned long rb; + unsigned char rbyte; + + rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index); + rbyte = (hptep[1] & ~HPTE_R_R) >> 8; + /* modify only the second-last byte, which contains the ref bit */ + *((char *)hptep + 14) = rbyte; + while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) + cpu_relax(); + asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" + : : "r" (rb), "r" (kvm->arch.lpid)); + asm volatile("ptesync" : : : "memory"); + kvm->arch.tlbie_lock = 0; +} +EXPORT_SYMBOL_GPL(kvmppc_clear_ref_hpte); + static int slb_base_page_shift[4] = { 24, /* 16M */ 16, /* 64k */ -- cgit v1.2.3 From 82ed36164c8a8ee685ea3fb3c4f741214ac070ca Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 15 Dec 2011 02:03:22 +0000 Subject: KVM: PPC: Book3s HV: Implement get_dirty_log using hardware changed bit This changes the implementation of kvm_vm_ioctl_get_dirty_log() for Book3s HV guests to use the hardware C (changed) bits in the guest hashed page table. Since this makes the implementation quite different from the Book3s PR case, this moves the existing implementation from book3s.c to book3s_pr.c and creates a new implementation in book3s_hv.c. That implementation calls kvmppc_hv_get_dirty_log() to do the actual work by calling kvm_test_clear_dirty on each page. It iterates over the HPTEs, clearing the C bit if set, and returns 1 if any C bit was set (including the saved C bit in the rmap entry). Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_book3s.h | 2 + arch/powerpc/kvm/book3s.c | 39 -------------------- arch/powerpc/kvm/book3s_64_mmu_hv.c | 69 +++++++++++++++++++++++++++++++++++ arch/powerpc/kvm/book3s_hv.c | 37 +++++++++++++++++++ arch/powerpc/kvm/book3s_pr.c | 39 ++++++++++++++++++++ 5 files changed, 147 insertions(+), 39 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 33fdc09508a1..3c3edee672aa 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -156,6 +156,8 @@ extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel); extern long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel); +extern long kvmppc_hv_get_dirty_log(struct kvm *kvm, + struct kvm_memory_slot *memslot); extern void kvmppc_entry_trampoline(void); extern void kvmppc_hv_entry_trampoline(void); diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 6bf7e0582c5a..7d54f4ed6d96 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -477,45 +477,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, return 0; } -/* - * Get (and clear) the dirty memory log for a memory slot. - */ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log) -{ - struct kvm_memory_slot *memslot; - struct kvm_vcpu *vcpu; - ulong ga, ga_end; - int is_dirty = 0; - int r; - unsigned long n; - - mutex_lock(&kvm->slots_lock); - - r = kvm_get_dirty_log(kvm, log, &is_dirty); - if (r) - goto out; - - /* If nothing is dirty, don't bother messing with page tables. */ - if (is_dirty) { - memslot = id_to_memslot(kvm->memslots, log->slot); - - ga = memslot->base_gfn << PAGE_SHIFT; - ga_end = ga + (memslot->npages << PAGE_SHIFT); - - kvm_for_each_vcpu(n, vcpu, kvm) - kvmppc_mmu_pte_pflush(vcpu, ga, ga_end); - - n = kvm_dirty_bitmap_bytes(memslot); - memset(memslot->dirty_bitmap, 0, n); - } - - r = 0; -out: - mutex_unlock(&kvm->slots_lock); - return r; -} - void kvmppc_decrementer_func(unsigned long data) { struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 926e2b92bdab..783cd3510c93 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -870,6 +870,75 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); } +static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp) +{ + struct revmap_entry *rev = kvm->arch.revmap; + unsigned long head, i, j; + unsigned long *hptep; + int ret = 0; + + retry: + lock_rmap(rmapp); + if (*rmapp & KVMPPC_RMAP_CHANGED) { + *rmapp &= ~KVMPPC_RMAP_CHANGED; + ret = 1; + } + if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { + unlock_rmap(rmapp); + return ret; + } + + i = head = *rmapp & KVMPPC_RMAP_INDEX; + do { + hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); + j = rev[i].forw; + + if (!(hptep[1] & HPTE_R_C)) + continue; + + if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { + /* unlock rmap before spinning on the HPTE lock */ + unlock_rmap(rmapp); + while (hptep[0] & HPTE_V_HVLOCK) + cpu_relax(); + goto retry; + } + + /* Now check and modify the HPTE */ + if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_C)) { + /* need to make it temporarily absent to clear C */ + hptep[0] |= HPTE_V_ABSENT; + kvmppc_invalidate_hpte(kvm, hptep, i); + hptep[1] &= ~HPTE_R_C; + eieio(); + hptep[0] = (hptep[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; + rev[i].guest_rpte |= HPTE_R_C; + ret = 1; + } + hptep[0] &= ~HPTE_V_HVLOCK; + } while ((i = j) != head); + + unlock_rmap(rmapp); + return ret; +} + +long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) +{ + unsigned long i; + unsigned long *rmapp, *map; + + preempt_disable(); + rmapp = memslot->rmap; + map = memslot->dirty_bitmap; + for (i = 0; i < memslot->npages; ++i) { + if (kvm_test_clear_dirty(kvm, rmapp)) + __set_bit_le(i, map); + ++rmapp; + } + preempt_enable(); + return 0; +} + void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, unsigned long *nb_ret) { diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 86c4191cb75b..0f1ddf0ec032 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1072,6 +1072,43 @@ long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret) return fd; } +/* + * Get (and clear) the dirty memory log for a memory slot. + */ +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) +{ + struct kvm_memory_slot *memslot; + int r; + unsigned long n; + + mutex_lock(&kvm->slots_lock); + + r = -EINVAL; + if (log->slot >= KVM_MEMORY_SLOTS) + goto out; + + memslot = id_to_memslot(kvm->memslots, log->slot); + r = -ENOENT; + if (!memslot->dirty_bitmap) + goto out; + + n = kvm_dirty_bitmap_bytes(memslot); + memset(memslot->dirty_bitmap, 0, n); + + r = kvmppc_hv_get_dirty_log(kvm, memslot); + if (r) + goto out; + + r = -EFAULT; + if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) + goto out; + + r = 0; +out: + mutex_unlock(&kvm->slots_lock); + return r; +} + static unsigned long slb_pgsize_encoding(unsigned long psize) { unsigned long senc = 0; diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 2da670405727..c193625d5289 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -1056,6 +1056,45 @@ out: return ret; } +/* + * Get (and clear) the dirty memory log for a memory slot. + */ +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log) +{ + struct kvm_memory_slot *memslot; + struct kvm_vcpu *vcpu; + ulong ga, ga_end; + int is_dirty = 0; + int r; + unsigned long n; + + mutex_lock(&kvm->slots_lock); + + r = kvm_get_dirty_log(kvm, log, &is_dirty); + if (r) + goto out; + + /* If nothing is dirty, don't bother messing with page tables. */ + if (is_dirty) { + memslot = id_to_memslot(kvm->memslots, log->slot); + + ga = memslot->base_gfn << PAGE_SHIFT; + ga_end = ga + (memslot->npages << PAGE_SHIFT); + + kvm_for_each_vcpu(n, vcpu, kvm) + kvmppc_mmu_pte_pflush(vcpu, ga, ga_end); + + n = kvm_dirty_bitmap_bytes(memslot); + memset(memslot->dirty_bitmap, 0, n); + } + + r = 0; +out: + mutex_unlock(&kvm->slots_lock); + return r; +} + int kvmppc_core_prepare_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem) { -- cgit v1.2.3 From d37b1a037cae725e69e5bf96f58544b69d7c93a6 Mon Sep 17 00:00:00 2001 From: Liu Yu Date: Tue, 20 Dec 2011 14:42:56 +0000 Subject: KVM: PPC: booke: Add booke206 TLB trace The existing kvm_stlb_write/kvm_gtlb_write were a poor match for the e500/book3e MMU -- mas1 was passed as "tid", mas2 was limited to "unsigned int" which will be a problem on 64-bit, mas3/7 got split up rather than treated as a single 64-bit word, etc. Signed-off-by: Liu Yu [scottwood@freescale.com: made mas2 64-bit, and added mas8 init] Signed-off-by: Scott Wood Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kvm/e500_tlb.c | 10 ++++---- arch/powerpc/kvm/trace.h | 57 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index 1746e677bf37..6e53e4164de1 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -294,6 +294,9 @@ static inline void __write_host_tlbe(struct kvm_book3e_206_tlb_entry *stlbe, mtspr(SPRN_MAS7, (u32)(stlbe->mas7_3 >> 32)); asm volatile("isync; tlbwe" : : : "memory"); local_irq_restore(flags); + + trace_kvm_booke206_stlb_write(mas0, stlbe->mas8, stlbe->mas1, + stlbe->mas2, stlbe->mas7_3); } /* @@ -332,8 +335,6 @@ static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500, MAS0_TLBSEL(1) | MAS0_ESEL(to_htlb1_esel(sesel))); } - trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2, - (u32)stlbe->mas7_3, (u32)(stlbe->mas7_3 >> 32)); } void kvmppc_map_magic(struct kvm_vcpu *vcpu) @@ -355,6 +356,7 @@ void kvmppc_map_magic(struct kvm_vcpu *vcpu) magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M; magic.mas7_3 = ((u64)pfn << PAGE_SHIFT) | MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR; + magic.mas8 = 0; __write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index)); preempt_enable(); @@ -954,8 +956,8 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) gtlbe->mas2 = vcpu->arch.shared->mas2; gtlbe->mas7_3 = vcpu->arch.shared->mas7_3; - trace_kvm_gtlb_write(vcpu->arch.shared->mas0, gtlbe->mas1, gtlbe->mas2, - (u32)gtlbe->mas7_3, (u32)(gtlbe->mas7_3 >> 32)); + trace_kvm_booke206_gtlb_write(vcpu->arch.shared->mas0, gtlbe->mas1, + gtlbe->mas2, gtlbe->mas7_3); /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */ if (tlbe_is_host_safe(vcpu, gtlbe)) { diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h index 609d8bfb54e3..877186b7b1c3 100644 --- a/arch/powerpc/kvm/trace.h +++ b/arch/powerpc/kvm/trace.h @@ -340,6 +340,63 @@ TRACE_EVENT(kvm_book3s_slbmte, #endif /* CONFIG_PPC_BOOK3S */ + +/************************************************************************* + * Book3E trace points * + *************************************************************************/ + +#ifdef CONFIG_BOOKE + +TRACE_EVENT(kvm_booke206_stlb_write, + TP_PROTO(__u32 mas0, __u32 mas8, __u32 mas1, __u64 mas2, __u64 mas7_3), + TP_ARGS(mas0, mas8, mas1, mas2, mas7_3), + + TP_STRUCT__entry( + __field( __u32, mas0 ) + __field( __u32, mas8 ) + __field( __u32, mas1 ) + __field( __u64, mas2 ) + __field( __u64, mas7_3 ) + ), + + TP_fast_assign( + __entry->mas0 = mas0; + __entry->mas8 = mas8; + __entry->mas1 = mas1; + __entry->mas2 = mas2; + __entry->mas7_3 = mas7_3; + ), + + TP_printk("mas0=%x mas8=%x mas1=%x mas2=%llx mas7_3=%llx", + __entry->mas0, __entry->mas8, __entry->mas1, + __entry->mas2, __entry->mas7_3) +); + +TRACE_EVENT(kvm_booke206_gtlb_write, + TP_PROTO(__u32 mas0, __u32 mas1, __u64 mas2, __u64 mas7_3), + TP_ARGS(mas0, mas1, mas2, mas7_3), + + TP_STRUCT__entry( + __field( __u32, mas0 ) + __field( __u32, mas1 ) + __field( __u64, mas2 ) + __field( __u64, mas7_3 ) + ), + + TP_fast_assign( + __entry->mas0 = mas0; + __entry->mas1 = mas1; + __entry->mas2 = mas2; + __entry->mas7_3 = mas7_3; + ), + + TP_printk("mas0=%x mas1=%x mas2=%llx mas7_3=%llx", + __entry->mas0, __entry->mas1, + __entry->mas2, __entry->mas7_3) +); + +#endif + #endif /* _TRACE_KVM_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 6b75e6bfef7ba108ac3df0d430d80dea68fde4bf Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 7 Dec 2011 10:24:56 +0200 Subject: KVM: PPC: Use the vcpu kmem_cache when allocating new VCPUs Currently the code kzalloc()s new VCPUs instead of using the kmem_cache which is created when KVM is initialized. Modify it to allocate VCPUs from that kmem_cache. Signed-off-by: Sasha Levin Acked-by: Paul Mackerras Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kvm/book3s_hv.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 0f1ddf0ec032..fdc804c83938 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -417,7 +417,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) goto out; err = -ENOMEM; - vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); if (!vcpu) goto out; @@ -469,7 +469,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) return vcpu; free_vcpu: - kfree(vcpu); + kmem_cache_free(kvm_vcpu_cache, vcpu); out: return ERR_PTR(err); } @@ -483,7 +483,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) if (vcpu->arch.vpa) kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.vpa); kvm_vcpu_uninit(vcpu); - kfree(vcpu); + kmem_cache_free(kvm_vcpu_cache, vcpu); } static void kvmppc_set_timer(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From e24ed81fedd551e80378be62fa0b0532480ea7d4 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Wed, 14 Sep 2011 10:02:41 +0200 Subject: KVM: PPC: Add generic single register ioctls Right now we transfer a static struct every time we want to get or set registers. Unfortunately, over time we realize that there are more of these than we thought of before and the extensibility and flexibility of transferring a full struct every time is limited. So this is a new approach to the problem. With these new ioctls, we can get and set a single register that is identified by an ID. This allows for very precise and limited transmittal of data. When we later realize that it's a better idea to shove over multiple registers at once, we can reuse most of the infrastructure and simply implement a GET_MANY_REGS / SET_MANY_REGS interface. Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- Documentation/virtual/kvm/api.txt | 46 ++++++++++++++++++++++++++++++++++++--- arch/powerpc/kvm/powerpc.c | 41 ++++++++++++++++++++++++++++++++++ include/linux/kvm.h | 35 +++++++++++++++++++++++++++++ 3 files changed, 119 insertions(+), 3 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index bcd45d5afcab..e0c75dcb2ca3 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1533,7 +1533,7 @@ following algorithm: Some guests configure the LINT1 NMI input to cause a panic, aiding in debugging. -4.64 KVM_S390_UCAS_MAP +4.65 KVM_S390_UCAS_MAP Capability: KVM_CAP_S390_UCONTROL Architectures: s390 @@ -1552,7 +1552,7 @@ This ioctl maps the memory at "user_addr" with the length "length" to the vcpu's address space starting at "vcpu_addr". All parameters need to be alligned by 1 megabyte. -4.65 KVM_S390_UCAS_UNMAP +4.66 KVM_S390_UCAS_UNMAP Capability: KVM_CAP_S390_UCONTROL Architectures: s390 @@ -1571,7 +1571,7 @@ This ioctl unmaps the memory in the vcpu's address space starting at "vcpu_addr" with the length "length". The field "user_addr" is ignored. All parameters need to be alligned by 1 megabyte. -4.66 KVM_S390_VCPU_FAULT +4.67 KVM_S390_VCPU_FAULT Capability: KVM_CAP_S390_UCONTROL Architectures: s390 @@ -1587,6 +1587,46 @@ table upfront. This is useful to handle validity intercepts for user controlled virtual machines to fault in the virtual cpu's lowcore pages prior to calling the KVM_RUN ioctl. +4.68 KVM_SET_ONE_REG + +Capability: KVM_CAP_ONE_REG +Architectures: all +Type: vcpu ioctl +Parameters: struct kvm_one_reg (in) +Returns: 0 on success, negative value on failure + +struct kvm_one_reg { + __u64 id; + __u64 addr; +}; + +Using this ioctl, a single vcpu register can be set to a specific value +defined by user space with the passed in struct kvm_one_reg, where id +refers to the register identifier as described below and addr is a pointer +to a variable with the respective size. There can be architecture agnostic +and architecture specific registers. Each have their own range of operation +and their own constants and width. To keep track of the implemented +registers, find a list below: + + Arch | Register | Width (bits) + | | + +4.69 KVM_GET_ONE_REG + +Capability: KVM_CAP_ONE_REG +Architectures: all +Type: vcpu ioctl +Parameters: struct kvm_one_reg (in and out) +Returns: 0 on success, negative value on failure + +This ioctl allows to receive the value of a single register implemented +in a vcpu. The register to read is indicated by the "id" field of the +kvm_one_reg struct passed in. On success, the register value can be found +at the memory location pointed to by "addr". + +The list of registers accessible using this interface is identical to the +list in 4.64. + 5. The kvm_run structure Application code obtains a pointer to the kvm_run structure by diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index f4380cb264e0..089c61bf0e16 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -217,6 +217,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PPC_UNSET_IRQ: case KVM_CAP_PPC_IRQ_LEVEL: case KVM_CAP_ENABLE_CAP: + case KVM_CAP_ONE_REG: r = 1; break; #ifndef CONFIG_KVM_BOOK3S_64_HV @@ -645,6 +646,32 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return r; } +static int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, + struct kvm_one_reg *reg) +{ + int r = -EINVAL; + + switch (reg->id) { + default: + break; + } + + return r; +} + +static int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, + struct kvm_one_reg *reg) +{ + int r = -EINVAL; + + switch (reg->id) { + default: + break; + } + + return r; +} + int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { @@ -684,6 +711,20 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } + case KVM_SET_ONE_REG: + case KVM_GET_ONE_REG: + { + struct kvm_one_reg reg; + r = -EFAULT; + if (copy_from_user(®, argp, sizeof(reg))) + goto out; + if (ioctl == KVM_SET_ONE_REG) + r = kvm_vcpu_ioctl_set_one_reg(vcpu, ®); + else + r = kvm_vcpu_ioctl_get_one_reg(vcpu, ®); + break; + } + #ifdef CONFIG_KVM_E500 case KVM_DIRTY_TLB: { struct kvm_dirty_tlb dirty; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index fa029ced4bd5..4f7a9fb8ab06 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -582,6 +582,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */ #define KVM_CAP_PPC_PAPR 68 #define KVM_CAP_SW_TLB 69 +#define KVM_CAP_ONE_REG 70 #define KVM_CAP_S390_GMAP 71 #define KVM_CAP_TSC_DEADLINE_TIMER 72 #define KVM_CAP_S390_UCONTROL 73 @@ -680,6 +681,37 @@ struct kvm_dirty_tlb { __u32 num_dirty; }; +/* Available with KVM_CAP_ONE_REG */ + +#define KVM_REG_ARCH_MASK 0xff00000000000000ULL +#define KVM_REG_GENERIC 0x0000000000000000ULL + +/* + * Architecture specific registers are to be defined in arch headers and + * ORed with the arch identifier. + */ +#define KVM_REG_PPC 0x1000000000000000ULL +#define KVM_REG_X86 0x2000000000000000ULL +#define KVM_REG_IA64 0x3000000000000000ULL +#define KVM_REG_ARM 0x4000000000000000ULL +#define KVM_REG_S390 0x5000000000000000ULL + +#define KVM_REG_SIZE_SHIFT 52 +#define KVM_REG_SIZE_MASK 0x00f0000000000000ULL +#define KVM_REG_SIZE_U8 0x0000000000000000ULL +#define KVM_REG_SIZE_U16 0x0010000000000000ULL +#define KVM_REG_SIZE_U32 0x0020000000000000ULL +#define KVM_REG_SIZE_U64 0x0030000000000000ULL +#define KVM_REG_SIZE_U128 0x0040000000000000ULL +#define KVM_REG_SIZE_U256 0x0050000000000000ULL +#define KVM_REG_SIZE_U512 0x0060000000000000ULL +#define KVM_REG_SIZE_U1024 0x0070000000000000ULL + +struct kvm_one_reg { + __u64 id; + __u64 addr; +}; + /* * ioctls for VM fds */ @@ -819,6 +851,9 @@ struct kvm_s390_ucas_mapping { #define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma) /* Available with KVM_CAP_SW_TLB */ #define KVM_DIRTY_TLB _IOW(KVMIO, 0xaa, struct kvm_dirty_tlb) +/* Available with KVM_CAP_ONE_REG */ +#define KVM_GET_ONE_REG _IOW(KVMIO, 0xab, struct kvm_one_reg) +#define KVM_SET_ONE_REG _IOW(KVMIO, 0xac, struct kvm_one_reg) #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) -- cgit v1.2.3 From 1022fc3d3bfaca09d5d6bfcc93a168de16840814 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Wed, 14 Sep 2011 21:45:23 +0200 Subject: KVM: PPC: Add support for explicit HIOR setting Until now, we always set HIOR based on the PVR, but this is just wrong. Instead, we should be setting HIOR explicitly, so user space can decide what the initial HIOR value is - just like on real hardware. We keep the old PVR based way around for backwards compatibility, but once user space uses the SET_ONE_REG based method, we drop the PVR logic. Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- Documentation/virtual/kvm/api.txt | 1 + arch/powerpc/include/asm/kvm.h | 2 ++ arch/powerpc/include/asm/kvm_book3s.h | 2 ++ arch/powerpc/kvm/book3s_pr.c | 6 ++++-- arch/powerpc/kvm/powerpc.c | 13 +++++++++++++ include/linux/kvm.h | 1 + 6 files changed, 23 insertions(+), 2 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index e0c75dcb2ca3..59a38264a0ed 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1610,6 +1610,7 @@ registers, find a list below: Arch | Register | Width (bits) | | + PPC | KVM_REG_PPC_HIOR | 64 4.69 KVM_GET_ONE_REG diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index 663c57f87165..f41adcda1468 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -331,4 +331,6 @@ struct kvm_book3e_206_tlb_params { __u32 reserved[8]; }; +#define KVM_REG_PPC_HIOR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x1) + #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 3c3edee672aa..aa795ccef294 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -90,6 +90,8 @@ struct kvmppc_vcpu_book3s { #endif int context_id[SID_CONTEXTS]; + bool hior_explicit; /* HIOR is set by ioctl, not PVR */ + struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE]; struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG]; struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE]; diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index c193625d5289..00efda6dc0e2 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -157,14 +157,16 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) #ifdef CONFIG_PPC_BOOK3S_64 if ((pvr >= 0x330000) && (pvr < 0x70330000)) { kvmppc_mmu_book3s_64_init(vcpu); - to_book3s(vcpu)->hior = 0xfff00000; + if (!to_book3s(vcpu)->hior_explicit) + to_book3s(vcpu)->hior = 0xfff00000; to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL; vcpu->arch.cpu_type = KVM_CPU_3S_64; } else #endif { kvmppc_mmu_book3s_32_init(vcpu); - to_book3s(vcpu)->hior = 0; + if (!to_book3s(vcpu)->hior_explicit) + to_book3s(vcpu)->hior = 0; to_book3s(vcpu)->msr_mask = 0xffffffffULL; vcpu->arch.cpu_type = KVM_CPU_3S_32; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 089c61bf0e16..59852091b389 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -212,6 +212,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PPC_BOOKE_SREGS: #else case KVM_CAP_PPC_SEGSTATE: + case KVM_CAP_PPC_HIOR: case KVM_CAP_PPC_PAPR: #endif case KVM_CAP_PPC_UNSET_IRQ: @@ -652,6 +653,11 @@ static int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, int r = -EINVAL; switch (reg->id) { +#ifdef CONFIG_PPC_BOOK3S + case KVM_REG_PPC_HIOR: + r = put_user(to_book3s(vcpu)->hior, (u64 __user *)reg->addr); + break; +#endif default: break; } @@ -665,6 +671,13 @@ static int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, int r = -EINVAL; switch (reg->id) { +#ifdef CONFIG_PPC_BOOK3S + case KVM_ONE_REG_PPC_HIOR: + r = get_user(to_book3s(vcpu)->hior, (u64 __user *)reg->addr); + if (!r) + to_book3s(vcpu)->hior_explicit = true; + break; +#endif default: break; } diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 4f7a9fb8ab06..acbe42939089 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -580,6 +580,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_PPC_SMT 64 #define KVM_CAP_PPC_RMA 65 #define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */ +#define KVM_CAP_PPC_HIOR 67 #define KVM_CAP_PPC_PAPR 68 #define KVM_CAP_SW_TLB 69 #define KVM_CAP_ONE_REG 70 -- cgit v1.2.3 From 31f3438eca2fc90dc892e0e9963ba4b93a2c8383 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 12 Dec 2011 12:26:50 +0000 Subject: KVM: PPC: Move kvm_vcpu_ioctl_[gs]et_one_reg down to platform-specific code This moves the get/set_one_reg implementation down from powerpc.c into booke.c, book3s_pr.c and book3s_hv.c. This avoids #ifdefs in C code, but more importantly, it fixes a bug on Book3s HV where we were accessing beyond the end of the kvm_vcpu struct (via the to_book3s() macro) and corrupting memory, causing random crashes and file corruption. On Book3s HV we only accept setting the HIOR to zero, since the guest runs in supervisor mode and its vectors are never offset from zero. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf [agraf update to apply on top of changed ONE_REG patches] Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_ppc.h | 3 +++ arch/powerpc/kvm/book3s_hv.c | 36 ++++++++++++++++++++++++++++++++++++ arch/powerpc/kvm/book3s_pr.c | 32 ++++++++++++++++++++++++++++++++ arch/powerpc/kvm/booke.c | 10 ++++++++++ arch/powerpc/kvm/powerpc.c | 38 -------------------------------------- 5 files changed, 81 insertions(+), 38 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index fb70414db90c..a61b5b5047d6 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -176,6 +176,9 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); void kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); +int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg); +int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg); + void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid); #ifdef CONFIG_KVM_BOOK3S_64_HV diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index fdc804c83938..3580db8a2326 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -398,6 +398,42 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return 0; } +int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + int r = -EINVAL; + + switch (reg->id) { + case KVM_REG_PPC_HIOR: + r = put_user(0, (u64 __user *)reg->addr); + break; + default: + break; + } + + return r; +} + +int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + int r = -EINVAL; + + switch (reg->id) { + case KVM_REG_PPC_HIOR: + { + u64 hior; + /* Only allow this to be set to zero */ + r = get_user(hior, (u64 __user *)reg->addr); + if (!r && (hior != 0)) + r = -EINVAL; + break; + } + default: + break; + } + + return r; +} + int kvmppc_core_check_processor_compat(void) { if (cpu_has_feature(CPU_FTR_HVMODE)) diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 00efda6dc0e2..ee222ec7c95c 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -874,6 +874,38 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return 0; } +int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + int r = -EINVAL; + + switch (reg->id) { + case KVM_REG_PPC_HIOR: + r = put_user(to_book3s(vcpu)->hior, (u64 __user *)reg->addr); + break; + default: + break; + } + + return r; +} + +int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + int r = -EINVAL; + + switch (reg->id) { + case KVM_REG_PPC_HIOR: + r = get_user(to_book3s(vcpu)->hior, (u64 __user *)reg->addr); + if (!r) + to_book3s(vcpu)->hior_explicit = true; + break; + default: + break; + } + + return r; +} + int kvmppc_core_check_processor_compat(void) { return 0; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 9e41f45d07ed..ee9e1ee9c858 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -887,6 +887,16 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return kvmppc_core_set_sregs(vcpu, sregs); } +int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + return -EINVAL; +} + +int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + return -EINVAL; +} + int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { return -ENOTSUPP; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 59852091b389..e23270779ff5 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -647,44 +647,6 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return r; } -static int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, - struct kvm_one_reg *reg) -{ - int r = -EINVAL; - - switch (reg->id) { -#ifdef CONFIG_PPC_BOOK3S - case KVM_REG_PPC_HIOR: - r = put_user(to_book3s(vcpu)->hior, (u64 __user *)reg->addr); - break; -#endif - default: - break; - } - - return r; -} - -static int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, - struct kvm_one_reg *reg) -{ - int r = -EINVAL; - - switch (reg->id) { -#ifdef CONFIG_PPC_BOOK3S - case KVM_ONE_REG_PPC_HIOR: - r = get_user(to_book3s(vcpu)->hior, (u64 __user *)reg->addr); - if (!r) - to_book3s(vcpu)->hior_explicit = true; - break; -#endif - default: - break; - } - - return r; -} - int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { -- cgit v1.2.3 From b3c5d3c2a49602c370de6d02fdb923bc48cd1abc Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Sat, 7 Jan 2012 02:07:38 +0100 Subject: KVM: PPC: Rename MMIO register identifiers We need the KVM_REG namespace for generic register settings now, so let's rename the existing users to something different, enabling us to reuse the namespace for more visible interfaces. While at it, also move these private constants to a private header. Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm.h | 7 ------- arch/powerpc/include/asm/kvm_host.h | 8 ++++++++ arch/powerpc/kvm/book3s_paired_singles.c | 9 ++++++--- arch/powerpc/kvm/powerpc.c | 18 +++++++++--------- 4 files changed, 23 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index f41adcda1468..b921c3f48928 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -269,13 +269,6 @@ struct kvm_guest_debug_arch { struct kvm_sync_regs { }; -#define KVM_REG_MASK 0x001f -#define KVM_REG_EXT_MASK 0xffe0 -#define KVM_REG_GPR 0x0000 -#define KVM_REG_FPR 0x0020 -#define KVM_REG_QPR 0x0040 -#define KVM_REG_FQPR 0x0060 - #define KVM_INTERRUPT_SET -1U #define KVM_INTERRUPT_UNSET -2U #define KVM_INTERRUPT_SET_LEVEL -3U diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 1cb6e522485b..af438b1e8a3c 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -485,4 +485,12 @@ struct kvm_vcpu_arch { #define KVMPPC_VCPU_BUSY_IN_HOST 1 #define KVMPPC_VCPU_RUNNABLE 2 +/* Values for vcpu->arch.io_gpr */ +#define KVM_MMIO_REG_MASK 0x001f +#define KVM_MMIO_REG_EXT_MASK 0xffe0 +#define KVM_MMIO_REG_GPR 0x0000 +#define KVM_MMIO_REG_FPR 0x0020 +#define KVM_MMIO_REG_QPR 0x0040 +#define KVM_MMIO_REG_FQPR 0x0060 + #endif /* __POWERPC_KVM_HOST_H__ */ diff --git a/arch/powerpc/kvm/book3s_paired_singles.c b/arch/powerpc/kvm/book3s_paired_singles.c index 7b0ee96c1bed..e70ef2d86431 100644 --- a/arch/powerpc/kvm/book3s_paired_singles.c +++ b/arch/powerpc/kvm/book3s_paired_singles.c @@ -196,7 +196,8 @@ static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_inject_pf(vcpu, addr, false); goto done_load; } else if (r == EMULATE_DO_MMIO) { - emulated = kvmppc_handle_load(run, vcpu, KVM_REG_FPR | rs, len, 1); + emulated = kvmppc_handle_load(run, vcpu, KVM_MMIO_REG_FPR | rs, + len, 1); goto done_load; } @@ -286,11 +287,13 @@ static int kvmppc_emulate_psq_load(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_inject_pf(vcpu, addr, false); goto done_load; } else if ((r == EMULATE_DO_MMIO) && w) { - emulated = kvmppc_handle_load(run, vcpu, KVM_REG_FPR | rs, 4, 1); + emulated = kvmppc_handle_load(run, vcpu, KVM_MMIO_REG_FPR | rs, + 4, 1); vcpu->arch.qpr[rs] = tmp[1]; goto done_load; } else if (r == EMULATE_DO_MMIO) { - emulated = kvmppc_handle_load(run, vcpu, KVM_REG_FQPR | rs, 8, 1); + emulated = kvmppc_handle_load(run, vcpu, KVM_MMIO_REG_FQPR | rs, + 8, 1); goto done_load; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index e23270779ff5..0e21d155eea7 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -448,20 +448,20 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr); - switch (vcpu->arch.io_gpr & KVM_REG_EXT_MASK) { - case KVM_REG_GPR: + switch (vcpu->arch.io_gpr & KVM_MMIO_REG_EXT_MASK) { + case KVM_MMIO_REG_GPR: kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr); break; - case KVM_REG_FPR: - vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr; + case KVM_MMIO_REG_FPR: + vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr; break; #ifdef CONFIG_PPC_BOOK3S - case KVM_REG_QPR: - vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr; + case KVM_MMIO_REG_QPR: + vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr; break; - case KVM_REG_FQPR: - vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr; - vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr; + case KVM_MMIO_REG_FQPR: + vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr; + vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr; break; #endif default: -- cgit v1.2.3 From 54f65795c8f6e192540756085d738e66ab0917a0 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Wed, 11 Jan 2012 13:37:35 +0000 Subject: KVM: PPC: refer to paravirt docs in header file Instead of keeping separate copies of struct kvm_vcpu_arch_shared (one in the code, one in the docs) that inevitably fail to be kept in sync (already sr[] is missing from the doc version), just point to the header file as the source of documentation on the contents of the magic page. Signed-off-by: Scott Wood Acked-by: Avi Kivity Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- Documentation/virtual/kvm/ppc-pv.txt | 24 ++---------------------- arch/powerpc/include/asm/kvm_para.h | 10 ++++++++++ 2 files changed, 12 insertions(+), 22 deletions(-) diff --git a/Documentation/virtual/kvm/ppc-pv.txt b/Documentation/virtual/kvm/ppc-pv.txt index 2b7ce190cde4..6e7c37050930 100644 --- a/Documentation/virtual/kvm/ppc-pv.txt +++ b/Documentation/virtual/kvm/ppc-pv.txt @@ -81,28 +81,8 @@ additional registers to the magic page. If you add fields to the magic page, also define a new hypercall feature to indicate that the host can give you more registers. Only if the host supports the additional features, make use of them. -The magic page has the following layout as described in -arch/powerpc/include/asm/kvm_para.h: - -struct kvm_vcpu_arch_shared { - __u64 scratch1; - __u64 scratch2; - __u64 scratch3; - __u64 critical; /* Guest may not get interrupts if == r1 */ - __u64 sprg0; - __u64 sprg1; - __u64 sprg2; - __u64 sprg3; - __u64 srr0; - __u64 srr1; - __u64 dar; - __u64 msr; - __u32 dsisr; - __u32 int_pending; /* Tells the guest if we have an interrupt */ -}; - -Additions to the page must only occur at the end. Struct fields are always 32 -or 64 bit aligned, depending on them being 32 or 64 bit wide respectively. +The magic page layout is described by struct kvm_vcpu_arch_shared +in arch/powerpc/include/asm/kvm_para.h. Magic page features =================== diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h index ece70fb36513..7b754e743003 100644 --- a/arch/powerpc/include/asm/kvm_para.h +++ b/arch/powerpc/include/asm/kvm_para.h @@ -22,6 +22,16 @@ #include +/* + * Additions to this struct must only occur at the end, and should be + * accompanied by a KVM_MAGIC_FEAT flag to advertise that they are present + * (albeit not necessarily relevant to the current target hardware platform). + * + * Struct fields are always 32 or 64 bit aligned, depending on them being 32 + * or 64 bit wide respectively. + * + * See Documentation/virtual/kvm/ppc-pv.txt + */ struct kvm_vcpu_arch_shared { __u64 scratch1; __u64 scratch2; -- cgit v1.2.3 From 28867cee754c07b3fa0a679ed2ea394843130217 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 16 Jan 2012 15:08:44 +0200 Subject: KVM: x86 emulator: add 8-bit memory operands Useful for MOVSX/MOVZX. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0982507b962a..5da6b3619201 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -57,6 +57,7 @@ #define OpDS 23ull /* DS */ #define OpFS 24ull /* FS */ #define OpGS 25ull /* GS */ +#define OpMem8 26ull /* 8-bit zero extended memory operand */ #define OpBits 5 /* Width of operand field */ #define OpMask ((1ull << OpBits) - 1) @@ -101,6 +102,7 @@ #define SrcAcc (OpAcc << SrcShift) #define SrcImmU16 (OpImmU16 << SrcShift) #define SrcDX (OpDX << SrcShift) +#define SrcMem8 (OpMem8 << SrcShift) #define SrcMask (OpMask << SrcShift) #define BitOp (1<<11) #define MemAbs (1<<12) /* Memory operand is absolute displacement */ @@ -3656,6 +3658,9 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, case OpImm: rc = decode_imm(ctxt, op, imm_size(ctxt), true); break; + case OpMem8: + ctxt->memop.bytes = 1; + goto mem_common; case OpMem16: ctxt->memop.bytes = 2; goto mem_common; -- cgit v1.2.3 From 2adb5ad9fe1b44d0ae8b00d2bd6568e6163215b3 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 16 Jan 2012 15:08:45 +0200 Subject: KVM: x86 emulator: Remove byte-sized MOVSX/MOVZX hack Currently we treat MOVSX/MOVZX with a byte source as a byte instruction, and change the destination operand size with a hack. Change it to be a word instruction, so the destination receives its natural size, and change the source to be SrcMem8. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5da6b3619201..6eaedac7cf6a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -860,8 +860,7 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, } static void decode_register_operand(struct x86_emulate_ctxt *ctxt, - struct operand *op, - int inhibit_bytereg) + struct operand *op) { unsigned reg = ctxt->modrm_reg; int highbyte_regs = ctxt->rex_prefix == 0; @@ -878,7 +877,7 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, } op->type = OP_REG; - if ((ctxt->d & ByteOp) && !inhibit_bytereg) { + if (ctxt->d & ByteOp) { op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs); op->bytes = 1; } else { @@ -3516,13 +3515,13 @@ static struct opcode twobyte_table[256] = { I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr), I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg), - D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), + D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xB8 - 0xBF */ N, N, G(BitOp, group8), I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr), - D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), + D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xCF */ D2bv(DstMem | SrcReg | ModRM | Lock), N, D(DstMem | SrcReg | ModRM | Mov), @@ -3604,9 +3603,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, switch (d) { case OpReg: - decode_register_operand(ctxt, op, - op == &ctxt->dst && - ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7)); + decode_register_operand(ctxt, op); break; case OpImmUByte: rc = decode_imm(ctxt, op, 1, false); -- cgit v1.2.3 From 3ea8b75e47ac70bdd0a2c0492102682d43bfa3c4 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 17 Jan 2012 19:50:08 +0900 Subject: KVM: MMU: Remove unused kvm_pte_chain Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index bd69c93da8fa..461016614324 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -181,13 +181,6 @@ struct kvm_mmu_memory_cache { void *objects[KVM_NR_MEM_OBJS]; }; -#define NR_PTE_CHAIN_ENTRIES 5 - -struct kvm_pte_chain { - u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES]; - struct hlist_node link; -}; - /* * kvm_mmu_page_role, below, is defined as: * -- cgit v1.2.3 From 9373e2c0576ee15b13e93bc5c5b3ef31d0612992 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 17 Jan 2012 19:51:20 +0900 Subject: KVM: MMU: Remove unused kvm parameter from __gfn_to_rmap() Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0a11468d853f..75b8f579b2a6 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -946,7 +946,7 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn) } } -static unsigned long *__gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level, +static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, struct kvm_memory_slot *slot) { struct kvm_lpage_info *linfo; @@ -966,7 +966,7 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) struct kvm_memory_slot *slot; slot = gfn_to_memslot(kvm, gfn); - return __gfn_to_rmap(kvm, gfn, level, slot); + return __gfn_to_rmap(gfn, level, slot); } static bool rmap_can_add(struct kvm_vcpu *vcpu) @@ -1018,7 +1018,7 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, u64 *spte; int i, write_protected = 0; - rmapp = __gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL, slot); + rmapp = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot); spte = rmap_next(kvm, rmapp, NULL); while (spte) { BUG_ON(!(*spte & PT_PRESENT_MASK)); @@ -1033,7 +1033,7 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, /* check for huge page mappings */ for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - rmapp = __gfn_to_rmap(kvm, gfn, i, slot); + rmapp = __gfn_to_rmap(gfn, i, slot); spte = rmap_next(kvm, rmapp, NULL); while (spte) { BUG_ON(!(*spte & PT_PRESENT_MASK)); -- cgit v1.2.3 From e4b35cc960bf216548516d8e39f5e364cfbbc86b Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 17 Jan 2012 19:52:15 +0900 Subject: KVM: MMU: Remove unused kvm parameter from rmap_next() Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 26 +++++++++++++------------- arch/x86/kvm/mmu_audit.c | 4 ++-- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 75b8f579b2a6..ae76cc3392e1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -988,7 +988,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) return pte_list_add(vcpu, spte, rmapp); } -static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) +static u64 *rmap_next(unsigned long *rmapp, u64 *spte) { return pte_list_next(rmapp, spte); } @@ -1019,7 +1019,7 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, int i, write_protected = 0; rmapp = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot); - spte = rmap_next(kvm, rmapp, NULL); + spte = rmap_next(rmapp, NULL); while (spte) { BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); @@ -1027,14 +1027,14 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK); write_protected = 1; } - spte = rmap_next(kvm, rmapp, spte); + spte = rmap_next(rmapp, spte); } /* check for huge page mappings */ for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { rmapp = __gfn_to_rmap(gfn, i, slot); - spte = rmap_next(kvm, rmapp, NULL); + spte = rmap_next(rmapp, NULL); while (spte) { BUG_ON(!(*spte & PT_PRESENT_MASK)); BUG_ON(!is_large_pte(*spte)); @@ -1045,7 +1045,7 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, spte = NULL; write_protected = 1; } - spte = rmap_next(kvm, rmapp, spte); + spte = rmap_next(rmapp, spte); } } @@ -1066,7 +1066,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 *spte; int need_tlb_flush = 0; - while ((spte = rmap_next(kvm, rmapp, NULL))) { + while ((spte = rmap_next(rmapp, NULL))) { BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); drop_spte(kvm, spte); @@ -1085,14 +1085,14 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, WARN_ON(pte_huge(*ptep)); new_pfn = pte_pfn(*ptep); - spte = rmap_next(kvm, rmapp, NULL); + spte = rmap_next(rmapp, NULL); while (spte) { BUG_ON(!is_shadow_present_pte(*spte)); rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); need_flush = 1; if (pte_write(*ptep)) { drop_spte(kvm, spte); - spte = rmap_next(kvm, rmapp, NULL); + spte = rmap_next(rmapp, NULL); } else { new_spte = *spte &~ (PT64_BASE_ADDR_MASK); new_spte |= (u64)new_pfn << PAGE_SHIFT; @@ -1102,7 +1102,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, new_spte &= ~shadow_accessed_mask; mmu_spte_clear_track_bits(spte); mmu_spte_set(spte, new_spte); - spte = rmap_next(kvm, rmapp, spte); + spte = rmap_next(rmapp, spte); } } if (need_flush) @@ -1176,7 +1176,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, if (!shadow_accessed_mask) return kvm_unmap_rmapp(kvm, rmapp, data); - spte = rmap_next(kvm, rmapp, NULL); + spte = rmap_next(rmapp, NULL); while (spte) { int _young; u64 _spte = *spte; @@ -1186,7 +1186,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, young = 1; clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); } - spte = rmap_next(kvm, rmapp, spte); + spte = rmap_next(rmapp, spte); } return young; } @@ -1205,7 +1205,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, if (!shadow_accessed_mask) goto out; - spte = rmap_next(kvm, rmapp, NULL); + spte = rmap_next(rmapp, NULL); while (spte) { u64 _spte = *spte; BUG_ON(!(_spte & PT_PRESENT_MASK)); @@ -1214,7 +1214,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, young = 1; break; } - spte = rmap_next(kvm, rmapp, spte); + spte = rmap_next(rmapp, spte); } out: return young; diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index fe15dcc07a6b..6eabae3d77ff 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -200,13 +200,13 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) slot = gfn_to_memslot(kvm, sp->gfn); rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; - spte = rmap_next(kvm, rmapp, NULL); + spte = rmap_next(rmapp, NULL); while (spte) { if (is_writable_pte(*spte)) audit_printk(kvm, "shadow page has writable " "mappings: gfn %llx role %x\n", sp->gfn, sp->role.word); - spte = rmap_next(kvm, rmapp, spte); + spte = rmap_next(rmapp, spte); } } -- cgit v1.2.3 From e2358851efbcdc34583ee11971a6e4d587ea8bf9 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 17 Jan 2012 14:09:50 +0100 Subject: KVM: SVM: comment nested paging and virtualization module parameters Also use true instead of 1 for enabling by default. Signed-off-by: Davidlohr Bueso Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index fce3ba0f2079..7bbd17cc3488 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -182,11 +182,13 @@ static bool npt_enabled = true; #else static bool npt_enabled; #endif -static int npt = 1; +/* allow nested paging (virtualized MMU) for all guests */ +static int npt = true; module_param(npt, int, S_IRUGO); -static int nested = 1; +/* allow nested virtualization in KVM/SVM */ +static int nested = true; module_param(nested, int, S_IRUGO); static void svm_flush_tlb(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From b93a35532767a2cf78bdbc88730d5c28aa66b941 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 18 Jan 2012 20:07:09 +0200 Subject: KVM: fix error handling for out of range irq find_index_from_host_irq returns 0 on error but callers assume < 0 on error. This should not matter much: an out of range irq should never happen since irq handler was registered with this irq #, and even if it does we get a spurious msix irq in guest and typically nothing terrible happens. Still, better to make it consistent. Signed-off-by: Michael S. Tsirkin Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- virt/kvm/assigned-dev.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index 758e3b36d4cf..ece80612b594 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -49,10 +49,8 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel index = i; break; } - if (index < 0) { + if (index < 0) printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n"); - return 0; - } return index; } -- cgit v1.2.3 From a52315e1d549dad80ff443151927226c11fd8c2b Mon Sep 17 00:00:00 2001 From: Julian Stecklina Date: Mon, 16 Jan 2012 14:02:20 +0100 Subject: KVM: Don't mistreat edge-triggered INIT IPI as INIT de-assert. (LAPIC) If the guest programs an IPI with level=0 (de-assert) and trig_mode=0 (edge), it is erroneously treated as INIT de-assert and ignored, but to quote the spec: "For this delivery mode [INIT de-assert], the level flag must be set to 0 and trigger mode flag to 1." Signed-off-by: Julian Stecklina Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index cfdc6e0ef002..3ee1d83c695d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -433,7 +433,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; case APIC_DM_INIT: - if (level) { + if (!trig_mode || level) { result = 1; vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; kvm_make_request(KVM_REQ_EVENT, vcpu); -- cgit v1.2.3 From 1a18a69b762374c423305772500f36eb8984ca52 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 1 Feb 2012 12:23:21 +0200 Subject: KVM: x86 emulator: reject SYSENTER in compatibility mode on AMD guests If the guest thinks it's an AMD, it will not have prepared the SYSENTER MSRs, and if the guest executes SYSENTER in compatibility mode, it will fails. Detect this condition and #UD instead, like the spec says. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 6eaedac7cf6a..71450aca3b86 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1892,6 +1892,17 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, ss->p = 1; } +static bool vendor_intel(struct x86_emulate_ctxt *ctxt) +{ + u32 eax, ebx, ecx, edx; + + eax = ecx = 0; + return ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx) + && ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx + && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx + && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx; +} + static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) { struct x86_emulate_ops *ops = ctxt->ops; @@ -2008,6 +2019,14 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) if (ctxt->mode == X86EMUL_MODE_REAL) return emulate_gp(ctxt, 0); + /* + * Not recognized on AMD in compat mode (but is recognized in legacy + * mode). + */ + if ((ctxt->mode == X86EMUL_MODE_PROT32) && (efer & EFER_LMA) + && !vendor_intel(ctxt)) + return emulate_ud(ctxt); + /* XXX sysenter/sysexit have not been tested in 64bit mode. * Therefore, we inject an #UD. */ -- cgit v1.2.3 From 9d4cba7f93c52d4121ab9c6f289e582d368a6979 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 12 Jan 2012 20:09:51 +0000 Subject: KVM: Move gfn_to_memslot() to kvm_host.h This moves __gfn_to_memslot() and search_memslots() from kvm_main.c to kvm_host.h to reduce the code duplication caused by the need for non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c to call gfn_to_memslot() in real mode. Rather than putting gfn_to_memslot() itself in a header, which would lead to increased code size, this puts __gfn_to_memslot() in a header. Then, the non-modular uses of gfn_to_memslot() are changed to call __gfn_to_memslot() instead. This way there is only one place in the source code that needs to be changed should the gfn_to_memslot() implementation need to be modified. On powerpc, the Book3S HV style of KVM has code that is called from real mode which needs to call gfn_to_memslot() and thus needs this. (Module code is allocated in the vmalloc region, which can't be accessed in real mode.) With this, we can remove builtin_gfn_to_memslot() from book3s_hv_rm_mmu.c. Signed-off-by: Paul Mackerras Acked-by: Avi Kivity Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 23 ++--------------------- include/linux/kvm_host.h | 25 +++++++++++++++++++++++++ virt/kvm/kvm_main.c | 21 +-------------------- 3 files changed, 28 insertions(+), 41 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 5f3c60b89faf..def880aea63a 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -21,25 +21,6 @@ #include #include -/* - * Since this file is built in even if KVM is a module, we need - * a local copy of this function for the case where kvm_main.c is - * modular. - */ -static struct kvm_memory_slot *builtin_gfn_to_memslot(struct kvm *kvm, - gfn_t gfn) -{ - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - - slots = kvm_memslots(kvm); - kvm_for_each_memslot(memslot, slots) - if (gfn >= memslot->base_gfn && - gfn < memslot->base_gfn + memslot->npages) - return memslot; - return NULL; -} - /* Translate address of a vmalloc'd thing to a linear map address */ static void *real_vmalloc_addr(void *x) { @@ -99,7 +80,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, rcbits = hpte_r & (HPTE_R_R | HPTE_R_C); ptel = rev->guest_rpte |= rcbits; gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel)); - memslot = builtin_gfn_to_memslot(kvm, gfn); + memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn); if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) return; @@ -181,7 +162,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, /* Find the memslot (if any) for this address */ gpa = (ptel & HPTE_R_RPN) & ~(psize - 1); gfn = gpa >> PAGE_SHIFT; - memslot = builtin_gfn_to_memslot(kvm, gfn); + memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn); pa = 0; is_io = ~0ul; rmap = NULL; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index eada8e69fe58..9698080c902b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -651,6 +651,31 @@ static inline void kvm_guest_exit(void) current->flags &= ~PF_VCPU; } +/* + * search_memslots() and __gfn_to_memslot() are here because they are + * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c. + * gfn_to_memslot() itself isn't here as an inline because that would + * bloat other code too much. + */ +static inline struct kvm_memory_slot * +search_memslots(struct kvm_memslots *slots, gfn_t gfn) +{ + struct kvm_memory_slot *memslot; + + kvm_for_each_memslot(memslot, slots) + if (gfn >= memslot->base_gfn && + gfn < memslot->base_gfn + memslot->npages) + return memslot; + + return NULL; +} + +static inline struct kvm_memory_slot * +__gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn) +{ + return search_memslots(slots, gfn); +} + static inline int memslot_id(struct kvm *kvm, gfn_t gfn) { return gfn_to_memslot(kvm, gfn)->id; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9f32bffd37c0..470e30520fe8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -640,19 +640,6 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) } #endif /* !CONFIG_S390 */ -static struct kvm_memory_slot * -search_memslots(struct kvm_memslots *slots, gfn_t gfn) -{ - struct kvm_memory_slot *memslot; - - kvm_for_each_memslot(memslot, slots) - if (gfn >= memslot->base_gfn && - gfn < memslot->base_gfn + memslot->npages) - return memslot; - - return NULL; -} - static int cmp_memslot(const void *slot1, const void *slot2) { struct kvm_memory_slot *s1, *s2; @@ -1031,12 +1018,6 @@ int kvm_is_error_hva(unsigned long addr) } EXPORT_SYMBOL_GPL(kvm_is_error_hva); -static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots, - gfn_t gfn) -{ - return search_memslots(slots, gfn); -} - struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) { return __gfn_to_memslot(kvm_memslots(kvm), gfn); @@ -1459,7 +1440,7 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, ghc->gpa = gpa; ghc->generation = slots->generation; - ghc->memslot = __gfn_to_memslot(slots, gfn); + ghc->memslot = gfn_to_memslot(kvm, gfn); ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL); if (!kvm_is_error_hva(ghc->hva)) ghc->hva += offset; -- cgit v1.2.3 From 9cf7c0e465197fa97972428e93162318e917f8ed Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Thu, 19 Jan 2012 00:23:46 +0100 Subject: KVM: PPC: E500: Fail init when not on e500v2 When enabling the current KVM code on e500mc, I get the following oops: Oops: Exception in kernel mode, sig: 4 [#1] SMP NR_CPUS=8 P2041 RDB Modules linked in: NIP: c067df4c LR: c067df44 CTR: 00000000 REGS: ee055ed0 TRAP: 0700 Not tainted (3.2.0-10391-g36c5afe) MSR: 00029002 CR: 24042022 XER: 00000000 TASK = ee0429b0[1] 'swapper/0' THREAD: ee054000 CPU: 2 GPR00: c067df44 ee055f80 ee0429b0 00000000 00000058 0000003f ee211600 60c6b864 GPR08: 7cc903a6 0000002c 00000000 00000001 44042082 2d180088 00000000 00000000 GPR16: c0000a00 00000014 3fffffff 03fe9000 00000015 7ff3be68 c06e0000 00000000 GPR24: 00000000 00000000 00001720 c067df1c c06e0000 00000000 ee054000 c06ab51c NIP [c067df4c] kvmppc_e500_init+0x30/0xf8 LR [c067df44] kvmppc_e500_init+0x28/0xf8 Call Trace: [ee055f80] [c067df44] kvmppc_e500_init+0x28/0xf8 (unreliable) [ee055fb0] [c0001d30] do_one_initcall+0x50/0x1f0 [ee055fe0] [c06721dc] kernel_init+0xa4/0x14c [ee055ff0] [c000e910] kernel_thread+0x4c/0x68 Instruction dump: 9421ffd0 7c0802a6 93410018 9361001c 90010034 93810020 93a10024 93c10028 93e1002c 4bfffe7d 2c030000 408200a4 <7c1082a6> 90010008 7c1182a6 9001000c ---[ end trace b8ef4903fcbf9dd3 ]--- Since it doesn't make sense to run the init function on any non-supported platform, we can just call our "is this platform supported?" function and bail out of init() if it's not. Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kvm/e500.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index 709d82f956e3..ddcd896fa2ff 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -229,6 +229,10 @@ static int __init kvmppc_e500_init(void) unsigned long ivor[3]; unsigned long max_ivor = 0; + r = kvmppc_core_check_processor_compat(); + if (r) + return r; + r = kvmppc_booke_init(); if (r) return r; -- cgit v1.2.3 From b4e706111d501991c59d2af23a299ab52a06b03d Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 16 Jan 2012 16:50:10 +0100 Subject: KVM: PPC: Convert RMA allocation into generic code We have code to allocate big chunks of linear memory on bootup for later use. This code is currently used for RMA allocation, but can be useful beyond that extent. Make it generic so we can reuse it for other stuff later. Signed-off-by: Alexander Graf Acked-by: Paul Mackerras Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_host.h | 7 +- arch/powerpc/include/asm/kvm_ppc.h | 8 +- arch/powerpc/kernel/setup_64.c | 2 +- arch/powerpc/kvm/book3s_hv.c | 8 +- arch/powerpc/kvm/book3s_hv_builtin.c | 175 +++++++++++++++++++++-------------- 5 files changed, 118 insertions(+), 82 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index af438b1e8a3c..8221e717bbce 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -173,12 +173,13 @@ struct kvmppc_spapr_tce_table { struct page *pages[0]; }; -struct kvmppc_rma_info { +struct kvmppc_linear_info { void *base_virt; unsigned long base_pfn; unsigned long npages; struct list_head list; - atomic_t use_count; + atomic_t use_count; + int type; }; /* @@ -224,7 +225,7 @@ struct kvm_arch { int tlbie_lock; unsigned long lpcr; unsigned long rmor; - struct kvmppc_rma_info *rma; + struct kvmppc_linear_info *rma; unsigned long vrma_slb_v; int rma_setup_done; int using_mmu_notifiers; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index a61b5b5047d6..1c37a2f8d0f4 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -128,8 +128,8 @@ extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args); extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *rma); -extern struct kvmppc_rma_info *kvm_alloc_rma(void); -extern void kvm_release_rma(struct kvmppc_rma_info *ri); +extern struct kvmppc_linear_info *kvm_alloc_rma(void); +extern void kvm_release_rma(struct kvmppc_linear_info *ri); extern int kvmppc_core_init_vm(struct kvm *kvm); extern void kvmppc_core_destroy_vm(struct kvm *kvm); extern int kvmppc_core_prepare_memory_region(struct kvm *kvm, @@ -187,13 +187,13 @@ static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) paca[cpu].kvm_hstate.xics_phys = addr; } -extern void kvm_rma_init(void); +extern void kvm_linear_init(void); #else static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) {} -static inline void kvm_rma_init(void) +static inline void kvm_linear_init(void) {} #endif diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 4cb8f1e9d044..4721b0c8d7b7 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -598,7 +598,7 @@ void __init setup_arch(char **cmdline_p) /* Initialize the MMU context management stuff */ mmu_context_init(); - kvm_rma_init(); + kvm_linear_init(); ppc64_boot_msg(0x15, "Setup Done"); } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 3580db8a2326..ce1cac765193 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1055,7 +1055,7 @@ static inline int lpcr_rmls(unsigned long rma_size) static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - struct kvmppc_rma_info *ri = vma->vm_file->private_data; + struct kvmppc_linear_info *ri = vma->vm_file->private_data; struct page *page; if (vmf->pgoff >= ri->npages) @@ -1080,7 +1080,7 @@ static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma) static int kvm_rma_release(struct inode *inode, struct file *filp) { - struct kvmppc_rma_info *ri = filp->private_data; + struct kvmppc_linear_info *ri = filp->private_data; kvm_release_rma(ri); return 0; @@ -1093,7 +1093,7 @@ static struct file_operations kvm_rma_fops = { long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret) { - struct kvmppc_rma_info *ri; + struct kvmppc_linear_info *ri; long fd; ri = kvm_alloc_rma(); @@ -1212,7 +1212,7 @@ static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu) { int err = 0; struct kvm *kvm = vcpu->kvm; - struct kvmppc_rma_info *ri = NULL; + struct kvmppc_linear_info *ri = NULL; unsigned long hva; struct kvm_memory_slot *memslot; struct vm_area_struct *vma; diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index a795a13f4a70..1c7e6ab5f9de 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -18,6 +18,14 @@ #include #include +#define KVM_LINEAR_RMA 0 + +static void __init kvm_linear_init_one(ulong size, int count, int type); +static struct kvmppc_linear_info *kvm_alloc_linear(int type); +static void kvm_release_linear(struct kvmppc_linear_info *ri); + +/*************** RMA *************/ + /* * This maintains a list of RMAs (real mode areas) for KVM guests to use. * Each RMA has to be physically contiguous and of a size that the @@ -29,32 +37,6 @@ static unsigned long kvm_rma_size = 64 << 20; /* 64MB */ static unsigned long kvm_rma_count; -static int __init early_parse_rma_size(char *p) -{ - if (!p) - return 1; - - kvm_rma_size = memparse(p, &p); - - return 0; -} -early_param("kvm_rma_size", early_parse_rma_size); - -static int __init early_parse_rma_count(char *p) -{ - if (!p) - return 1; - - kvm_rma_count = simple_strtoul(p, NULL, 0); - - return 0; -} -early_param("kvm_rma_count", early_parse_rma_count); - -static struct kvmppc_rma_info *rma_info; -static LIST_HEAD(free_rmas); -static DEFINE_SPINLOCK(rma_lock); - /* Work out RMLS (real mode limit selector) field value for a given RMA size. Assumes POWER7 or PPC970. */ static inline int lpcr_rmls(unsigned long rma_size) @@ -81,45 +63,73 @@ static inline int lpcr_rmls(unsigned long rma_size) } } -/* - * Called at boot time while the bootmem allocator is active, - * to allocate contiguous physical memory for the real memory - * areas for guests. - */ -void __init kvm_rma_init(void) +static int __init early_parse_rma_size(char *p) +{ + if (!p) + return 1; + + kvm_rma_size = memparse(p, &p); + + return 0; +} +early_param("kvm_rma_size", early_parse_rma_size); + +static int __init early_parse_rma_count(char *p) +{ + if (!p) + return 1; + + kvm_rma_count = simple_strtoul(p, NULL, 0); + + return 0; +} +early_param("kvm_rma_count", early_parse_rma_count); + +struct kvmppc_linear_info *kvm_alloc_rma(void) +{ + return kvm_alloc_linear(KVM_LINEAR_RMA); +} +EXPORT_SYMBOL_GPL(kvm_alloc_rma); + +void kvm_release_rma(struct kvmppc_linear_info *ri) +{ + kvm_release_linear(ri); +} +EXPORT_SYMBOL_GPL(kvm_release_rma); + +/*************** generic *************/ + +static LIST_HEAD(free_linears); +static DEFINE_SPINLOCK(linear_lock); + +static void __init kvm_linear_init_one(ulong size, int count, int type) { unsigned long i; unsigned long j, npages; - void *rma; + void *linear; struct page *pg; + const char *typestr; + struct kvmppc_linear_info *linear_info; - /* Only do this on PPC970 in HV mode */ - if (!cpu_has_feature(CPU_FTR_HVMODE) || - !cpu_has_feature(CPU_FTR_ARCH_201)) - return; - - if (!kvm_rma_size || !kvm_rma_count) + if (!count) return; - /* Check that the requested size is one supported in hardware */ - if (lpcr_rmls(kvm_rma_size) < 0) { - pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size); - return; - } - - npages = kvm_rma_size >> PAGE_SHIFT; - rma_info = alloc_bootmem(kvm_rma_count * sizeof(struct kvmppc_rma_info)); - for (i = 0; i < kvm_rma_count; ++i) { - rma = alloc_bootmem_align(kvm_rma_size, kvm_rma_size); - pr_info("Allocated KVM RMA at %p (%ld MB)\n", rma, - kvm_rma_size >> 20); - rma_info[i].base_virt = rma; - rma_info[i].base_pfn = __pa(rma) >> PAGE_SHIFT; - rma_info[i].npages = npages; - list_add_tail(&rma_info[i].list, &free_rmas); - atomic_set(&rma_info[i].use_count, 0); - - pg = pfn_to_page(rma_info[i].base_pfn); + typestr = (type == KVM_LINEAR_RMA) ? "RMA" : ""; + + npages = size >> PAGE_SHIFT; + linear_info = alloc_bootmem(count * sizeof(struct kvmppc_linear_info)); + for (i = 0; i < count; ++i) { + linear = alloc_bootmem_align(size, size); + pr_info("Allocated KVM %s at %p (%ld MB)\n", typestr, linear, + size >> 20); + linear_info[i].base_virt = linear; + linear_info[i].base_pfn = __pa(linear) >> PAGE_SHIFT; + linear_info[i].npages = npages; + linear_info[i].type = type; + list_add_tail(&linear_info[i].list, &free_linears); + atomic_set(&linear_info[i].use_count, 0); + + pg = pfn_to_page(linear_info[i].base_pfn); for (j = 0; j < npages; ++j) { atomic_inc(&pg->_count); ++pg; @@ -127,30 +137,55 @@ void __init kvm_rma_init(void) } } -struct kvmppc_rma_info *kvm_alloc_rma(void) +static struct kvmppc_linear_info *kvm_alloc_linear(int type) { - struct kvmppc_rma_info *ri; + struct kvmppc_linear_info *ri; ri = NULL; - spin_lock(&rma_lock); - if (!list_empty(&free_rmas)) { - ri = list_first_entry(&free_rmas, struct kvmppc_rma_info, list); + spin_lock(&linear_lock); + list_for_each_entry(ri, &free_linears, list) { + if (ri->type != type) + continue; + list_del(&ri->list); atomic_inc(&ri->use_count); + break; } - spin_unlock(&rma_lock); + spin_unlock(&linear_lock); return ri; } -EXPORT_SYMBOL_GPL(kvm_alloc_rma); -void kvm_release_rma(struct kvmppc_rma_info *ri) +static void kvm_release_linear(struct kvmppc_linear_info *ri) { if (atomic_dec_and_test(&ri->use_count)) { - spin_lock(&rma_lock); - list_add_tail(&ri->list, &free_rmas); - spin_unlock(&rma_lock); + spin_lock(&linear_lock); + list_add_tail(&ri->list, &free_linears); + spin_unlock(&linear_lock); } } -EXPORT_SYMBOL_GPL(kvm_release_rma); +/* + * Called at boot time while the bootmem allocator is active, + * to allocate contiguous physical memory for the hash page + * tables for guests. + */ +void __init kvm_linear_init(void) +{ + /* RMA */ + /* Only do this on PPC970 in HV mode */ + if (!cpu_has_feature(CPU_FTR_HVMODE) || + !cpu_has_feature(CPU_FTR_ARCH_201)) + return; + + if (!kvm_rma_size || !kvm_rma_count) + return; + + /* Check that the requested size is one supported in hardware */ + if (lpcr_rmls(kvm_rma_size) < 0) { + pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size); + return; + } + + kvm_linear_init_one(kvm_rma_size, kvm_rma_count, KVM_LINEAR_RMA); +} -- cgit v1.2.3 From b7f5d0114c708d6efd264a2c5e5a31cf292a9cec Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 17 Jan 2012 15:11:28 +0100 Subject: KVM: PPC: Initialize linears with zeros RMAs and HPT preallocated spaces should be zeroed, so we don't accidently leak information from previous VM executions. Signed-off-by: Alexander Graf Acked-by: Paul Mackerras Signed-off-by: Avi Kivity --- arch/powerpc/kvm/book3s_hv_builtin.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 1c7e6ab5f9de..7caed1dfd7a4 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -152,6 +152,7 @@ static struct kvmppc_linear_info *kvm_alloc_linear(int type) break; } spin_unlock(&linear_lock); + memset(ri->base_virt, 0, ri->npages << PAGE_SHIFT); return ri; } -- cgit v1.2.3 From d2a1b483a4a3f4bbb5fec1877f716c15ac7fa405 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 16 Jan 2012 19:12:11 +0100 Subject: KVM: PPC: Add HPT preallocator We're currently allocating 16MB of linear memory on demand when creating a guest. That does work some times, but finding 16MB of linear memory available in the system at runtime is definitely not a given. So let's add another command line option similar to the RMA preallocator, that we can use to keep a pool of page tables around. Now, when a guest gets created it has a pretty low chance of receiving an OOM. Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/include/asm/kvm_ppc.h | 2 ++ arch/powerpc/kvm/book3s_64_mmu_hv.c | 19 +++++++++++++++--- arch/powerpc/kvm/book3s_hv_builtin.c | 39 +++++++++++++++++++++++++++++++++++- 4 files changed, 57 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 8221e717bbce..1843d5d2a3be 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -235,6 +235,7 @@ struct kvm_arch { int slot_npages[KVM_MEM_SLOTS_NUM]; unsigned short last_vcpu[NR_CPUS]; struct kvmppc_vcore *vcores[KVM_MAX_VCORES]; + struct kvmppc_linear_info *hpt_li; #endif /* CONFIG_KVM_BOOK3S_64_HV */ }; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 1c37a2f8d0f4..9d6dee0f7d48 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -130,6 +130,8 @@ extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *rma); extern struct kvmppc_linear_info *kvm_alloc_rma(void); extern void kvm_release_rma(struct kvmppc_linear_info *ri); +extern struct kvmppc_linear_info *kvm_alloc_hpt(void); +extern void kvm_release_hpt(struct kvmppc_linear_info *li); extern int kvmppc_core_init_vm(struct kvm *kvm); extern void kvmppc_core_destroy_vm(struct kvm *kvm); extern int kvmppc_core_prepare_memory_region(struct kvm *kvm, diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 783cd3510c93..ddc485a529f2 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -44,10 +44,20 @@ long kvmppc_alloc_hpt(struct kvm *kvm) unsigned long hpt; unsigned long lpid; struct revmap_entry *rev; + struct kvmppc_linear_info *li; /* Allocate guest's hashed page table */ - hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|__GFP_NOWARN, - HPT_ORDER - PAGE_SHIFT); + li = kvm_alloc_hpt(); + if (li) { + /* using preallocated memory */ + hpt = (ulong)li->base_virt; + kvm->arch.hpt_li = li; + } else { + /* using dynamic memory */ + hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT| + __GFP_NOWARN, HPT_ORDER - PAGE_SHIFT); + } + if (!hpt) { pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n"); return -ENOMEM; @@ -88,7 +98,10 @@ void kvmppc_free_hpt(struct kvm *kvm) { clear_bit(kvm->arch.lpid, lpid_inuse); vfree(kvm->arch.revmap); - free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT); + if (kvm->arch.hpt_li) + kvm_release_hpt(kvm->arch.hpt_li); + else + free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT); } /* Bits in first HPTE dword for pagesize 4k, 64k or 16M */ diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 7caed1dfd7a4..bed1279aa6a8 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -19,6 +19,7 @@ #include #define KVM_LINEAR_RMA 0 +#define KVM_LINEAR_HPT 1 static void __init kvm_linear_init_one(ulong size, int count, int type); static struct kvmppc_linear_info *kvm_alloc_linear(int type); @@ -97,6 +98,39 @@ void kvm_release_rma(struct kvmppc_linear_info *ri) } EXPORT_SYMBOL_GPL(kvm_release_rma); +/*************** HPT *************/ + +/* + * This maintains a list of big linear HPT tables that contain the GVA->HPA + * memory mappings. If we don't reserve those early on, we might not be able + * to get a big (usually 16MB) linear memory region from the kernel anymore. + */ + +static unsigned long kvm_hpt_count; + +static int __init early_parse_hpt_count(char *p) +{ + if (!p) + return 1; + + kvm_hpt_count = simple_strtoul(p, NULL, 0); + + return 0; +} +early_param("kvm_hpt_count", early_parse_hpt_count); + +struct kvmppc_linear_info *kvm_alloc_hpt(void) +{ + return kvm_alloc_linear(KVM_LINEAR_HPT); +} +EXPORT_SYMBOL_GPL(kvm_alloc_hpt); + +void kvm_release_hpt(struct kvmppc_linear_info *li) +{ + kvm_release_linear(li); +} +EXPORT_SYMBOL_GPL(kvm_release_hpt); + /*************** generic *************/ static LIST_HEAD(free_linears); @@ -114,7 +148,7 @@ static void __init kvm_linear_init_one(ulong size, int count, int type) if (!count) return; - typestr = (type == KVM_LINEAR_RMA) ? "RMA" : ""; + typestr = (type == KVM_LINEAR_RMA) ? "RMA" : "HPT"; npages = size >> PAGE_SHIFT; linear_info = alloc_bootmem(count * sizeof(struct kvmppc_linear_info)); @@ -173,6 +207,9 @@ static void kvm_release_linear(struct kvmppc_linear_info *ri) */ void __init kvm_linear_init(void) { + /* HPT */ + kvm_linear_init_one(1 << HPT_ORDER, kvm_hpt_count, KVM_LINEAR_HPT); + /* RMA */ /* Only do this on PPC970 in HV mode */ if (!cpu_has_feature(CPU_FTR_HVMODE) || -- cgit v1.2.3 From 242ec97c358256ad6e62dab869f63a03cd244122 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 24 Jan 2012 15:06:05 +0200 Subject: KVM: x86: reset edge sense circuit of i8259 on init The spec says that during initialization "The edge sense circuit is reset which means that following initialization an interrupt request (IR) input must make a low-to-high transition to generate an interrupt", but currently if edge triggered interrupt is in IRR it is delivered after i8259 initialization. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index b6a73537e1ef..81cf4fa4a2be 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -307,6 +307,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) if (val & 0x10) { s->init4 = val & 1; s->last_irr = 0; + s->irr &= s->elcr; s->imr = 0; s->priority_add = 0; s->special_mask = 0; -- cgit v1.2.3 From df156f90a0f90649dd38b7667901ef85478f3d2b Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Tue, 7 Feb 2012 15:52:44 +0100 Subject: x86: Introduce x86_cpuinit.early_percpu_clock_init hook When kvm guest uses kvmclock, it may hang on vcpu hot-plug. This is caused by an overflow in pvclock_get_nsec_offset, u64 delta = tsc - shadow->tsc_timestamp; which in turn is caused by an undefined values from percpu hv_clock that hasn't been initialized yet. Uninitialized clock on being booted cpu is accessed from start_secondary -> smp_callin -> smp_store_cpu_info -> identify_secondary_cpu -> mtrr_ap_init -> mtrr_restore -> stop_machine_from_inactive_cpu -> queue_stop_cpus_work ... -> sched_clock -> kvm_clock_read which is well before x86_cpuinit.setup_percpu_clockev call in start_secondary, where percpu clock is initialized. This patch introduces a hook that allows to setup/initialize per_cpu clock early and avoid overflow due to reading - undefined values - old values if cpu was offlined and then onlined again Another possible early user of this clock source is ftrace that accesses it to get timestamps for ring buffer entries. So if mtrr_ap_init is moved from identify_secondary_cpu to past x86_cpuinit.setup_percpu_clockev in start_secondary, ftrace may cause the same overflow/hang on cpu hot-plug anyway. More complete description of the problem: https://lkml.org/lkml/2012/2/2/101 Credits to Marcelo Tosatti for hook idea. Acked-by: Thomas Gleixner Signed-off-by: Igor Mammedov Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/kvmclock.c | 4 +--- arch/x86/kernel/smpboot.c | 1 + arch/x86/kernel/x86_init.c | 1 + 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 517d4767ffdd..5d0afac2962c 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -145,9 +145,11 @@ struct x86_init_ops { /** * struct x86_cpuinit_ops - platform specific cpu hotplug setups * @setup_percpu_clockev: set up the per cpu clock event device + * @early_percpu_clock_init: early init of the per cpu clock event device */ struct x86_cpuinit_ops { void (*setup_percpu_clockev)(void); + void (*early_percpu_clock_init)(void); void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node); }; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 44842d756b29..ca4e735adc54 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -144,8 +144,6 @@ static void __cpuinit kvm_setup_secondary_clock(void) * we shouldn't fail. */ WARN_ON(kvm_register_clock("secondary cpu clock")); - /* ok, done with our trickery, call native */ - setup_secondary_APIC_clock(); } #endif @@ -194,7 +192,7 @@ void __init kvmclock_init(void) x86_platform.get_wallclock = kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - x86_cpuinit.setup_percpu_clockev = + x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock; #endif machine_ops.shutdown = kvm_shutdown; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 66d250c00d11..a05d6fd5e06d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -255,6 +255,7 @@ notrace static void __cpuinit start_secondary(void *unused) * most necessary things. */ cpu_init(); + x86_cpuinit.early_percpu_clock_init(); preempt_disable(); smp_callin(); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 947a06ccc673..6f2ec53deed0 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -91,6 +91,7 @@ struct x86_init_ops x86_init __initdata = { }; struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { + .early_percpu_clock_init = x86_init_noop, .setup_percpu_clockev = setup_secondary_APIC_clock, .fixup_cpu_id = x86_default_fixup_cpu_id, }; -- cgit v1.2.3 From a59cb29e4d81e025192550c2703f305637f016f6 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 3 Feb 2012 12:28:31 -0200 Subject: KVM: x86: increase recommended max vcpus to 160 Increase recommended max vcpus from 64 to 160 (tested internally at Red Hat). Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 461016614324..782d973b0719 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -29,7 +29,7 @@ #include #define KVM_MAX_VCPUS 254 -#define KVM_SOFT_MAX_VCPUS 64 +#define KVM_SOFT_MAX_VCPUS 160 #define KVM_MEMORY_SLOTS 32 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 4 -- cgit v1.2.3 From cc578287e3224d0da196cc1d226bdae6b068faa7 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 3 Feb 2012 15:43:50 -0200 Subject: KVM: Infrastructure for software and hardware based TSC rate scaling This requires some restructuring; rather than use 'virtual_tsc_khz' to indicate whether hardware rate scaling is in effect, we consider each VCPU to always have a virtual TSC rate. Instead, there is new logic above the vendor-specific hardware scaling that decides whether it is even necessary to use and updates all rate variables used by common code. This means we can simply query the virtual rate at any point, which is needed for software rate scaling. There is also now a threshold added to the TSC rate scaling; minor differences and variations of measured TSC rate can accidentally provoke rate scaling to be used when it is not needed. Instead, we have a tolerance variable called tsc_tolerance_ppm, which is the maximum variation from user requested rate at which scaling will be used. The default is 250ppm, which is the half the threshold for NTP adjustment, allowing for some hardware variation. In the event that hardware rate scaling is not available, we can kludge a bit by forcing TSC catchup to turn on when a faster than hardware speed has been requested, but there is nothing available yet for the reverse case; this requires a trap and emulate software implementation for RDTSC, which is still forthcoming. [avi: fix 64-bit division on i386] Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 9 +++-- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/svm.c | 20 ++++++---- arch/x86/kvm/vmx.c | 16 +++++--- arch/x86/kvm/x86.c | 82 +++++++++++++++++++++-------------------- 5 files changed, 71 insertions(+), 58 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 782d973b0719..ddebbe01fff9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -422,10 +422,11 @@ struct kvm_vcpu_arch { u64 last_kernel_ns; u64 last_tsc_nsec; u64 last_tsc_write; - u32 virtual_tsc_khz; bool tsc_catchup; - u32 tsc_catchup_mult; - s8 tsc_catchup_shift; + bool tsc_always_catchup; + s8 virtual_tsc_shift; + u32 virtual_tsc_mult; + u32 virtual_tsc_khz; atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ unsigned nmi_pending; /* NMI queued after currently running handler */ @@ -651,7 +652,7 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); - void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz); + void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale); void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3ee1d83c695d..72975f758c83 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -731,7 +731,7 @@ static void start_apic_timer(struct kvm_lapic *apic) u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; u64 ns = 0; struct kvm_vcpu *vcpu = apic->vcpu; - unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu); + unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; unsigned long flags; if (unlikely(!tscdeadline || !this_tsc_khz)) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 7bbd17cc3488..e12026e5244e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -964,20 +964,25 @@ static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) return _tsc; } -static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) +static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) { struct vcpu_svm *svm = to_svm(vcpu); u64 ratio; u64 khz; - /* TSC scaling supported? */ - if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) + /* Guest TSC same frequency as host TSC? */ + if (!scale) { + svm->tsc_ratio = TSC_RATIO_DEFAULT; return; + } - /* TSC-Scaling disabled or guest TSC same frequency as host TSC? */ - if (user_tsc_khz == 0) { - vcpu->arch.virtual_tsc_khz = 0; - svm->tsc_ratio = TSC_RATIO_DEFAULT; + /* TSC scaling supported? */ + if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { + if (user_tsc_khz > tsc_khz) { + vcpu->arch.tsc_catchup = 1; + vcpu->arch.tsc_always_catchup = 1; + } else + WARN(1, "user requested TSC rate below hardware speed\n"); return; } @@ -992,7 +997,6 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) user_tsc_khz); return; } - vcpu->arch.virtual_tsc_khz = user_tsc_khz; svm->tsc_ratio = ratio; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3b4c8d8ad906..e6bf61fa1c03 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1817,13 +1817,19 @@ u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu) } /* - * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ - * ioctl. In this case the call-back should update internal vmx state to make - * the changes effective. + * Engage any workarounds for mis-matched TSC rates. Currently limited to + * software catchup for faster rates on slower CPUs. */ -static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) +static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) { - /* Nothing to do here */ + if (!scale) + return; + + if (user_tsc_khz > tsc_khz) { + vcpu->arch.tsc_catchup = 1; + vcpu->arch.tsc_always_catchup = 1; + } else + WARN(1, "user requested TSC rate below hardware speed\n"); } /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2bd77a3a41ed..41bb90acb238 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -96,6 +96,10 @@ EXPORT_SYMBOL_GPL(kvm_has_tsc_control); u32 kvm_max_guest_tsc_khz; EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); +/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ +static u32 tsc_tolerance_ppm = 250; +module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); + #define KVM_NR_SHARED_MSRS 16 struct kvm_shared_msrs_global { @@ -968,49 +972,50 @@ static inline u64 get_kernel_ns(void) static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); unsigned long max_tsc_khz; -static inline int kvm_tsc_changes_freq(void) +static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) { - int cpu = get_cpu(); - int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && - cpufreq_quick_get(cpu) != 0; - put_cpu(); - return ret; + return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, + vcpu->arch.virtual_tsc_shift); } -u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu) +static u32 adjust_tsc_khz(u32 khz, s32 ppm) { - if (vcpu->arch.virtual_tsc_khz) - return vcpu->arch.virtual_tsc_khz; - else - return __this_cpu_read(cpu_tsc_khz); + u64 v = (u64)khz * (1000000 + ppm); + do_div(v, 1000000); + return v; } -static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) +static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) { - u64 ret; + u32 thresh_lo, thresh_hi; + int use_scaling = 0; - WARN_ON(preemptible()); - if (kvm_tsc_changes_freq()) - printk_once(KERN_WARNING - "kvm: unreliable cycle conversion on adjustable rate TSC\n"); - ret = nsec * vcpu_tsc_khz(vcpu); - do_div(ret, USEC_PER_SEC); - return ret; -} - -static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz) -{ /* Compute a scale to convert nanoseconds in TSC cycles */ kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, - &vcpu->arch.tsc_catchup_shift, - &vcpu->arch.tsc_catchup_mult); + &vcpu->arch.virtual_tsc_shift, + &vcpu->arch.virtual_tsc_mult); + vcpu->arch.virtual_tsc_khz = this_tsc_khz; + + /* + * Compute the variation in TSC rate which is acceptable + * within the range of tolerance and decide if the + * rate being applied is within that bounds of the hardware + * rate. If so, no scaling or compensation need be done. + */ + thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm); + thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm); + if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) { + pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi); + use_scaling = 1; + } + kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling); } static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) { u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, - vcpu->arch.tsc_catchup_mult, - vcpu->arch.tsc_catchup_shift); + vcpu->arch.virtual_tsc_mult, + vcpu->arch.virtual_tsc_shift); tsc += vcpu->arch.last_tsc_write; return tsc; } @@ -1077,7 +1082,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) local_irq_save(flags); tsc_timestamp = kvm_x86_ops->read_l1_tsc(v); kernel_ns = get_kernel_ns(); - this_tsc_khz = vcpu_tsc_khz(v); + this_tsc_khz = __get_cpu_var(cpu_tsc_khz); if (unlikely(this_tsc_khz == 0)) { local_irq_restore(flags); kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); @@ -2804,26 +2809,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp, u32 user_tsc_khz; r = -EINVAL; - if (!kvm_has_tsc_control) - break; - user_tsc_khz = (u32)arg; if (user_tsc_khz >= kvm_max_guest_tsc_khz) goto out; - kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz); + if (user_tsc_khz == 0) + user_tsc_khz = tsc_khz; + + kvm_set_tsc_khz(vcpu, user_tsc_khz); r = 0; goto out; } case KVM_GET_TSC_KHZ: { - r = -EIO; - if (check_tsc_unstable()) - goto out; - - r = vcpu_tsc_khz(vcpu); - + r = vcpu->arch.virtual_tsc_khz; goto out; } default: @@ -5312,6 +5312,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) profile_hit(KVM_PROFILING, (void *)rip); } + if (unlikely(vcpu->arch.tsc_always_catchup)) + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); kvm_lapic_sync_from_vapic(vcpu); @@ -6004,7 +6006,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } vcpu->arch.pio_data = page_address(page); - kvm_init_tsc_catchup(vcpu, max_tsc_khz); + kvm_set_tsc_khz(vcpu, max_tsc_khz); r = kvm_mmu_create(vcpu); if (r < 0) -- cgit v1.2.3 From 5d3cb0f6a8e3af018a522ae8d36f8f7d2511b5d8 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 3 Feb 2012 15:43:51 -0200 Subject: KVM: Improve TSC offset matching There are a few improvements that can be made to the TSC offset matching code. First, we don't need to call the 128-bit multiply (especially on a constant number), the code works much nicer to do computation in nanosecond units. Second, the way everything is setup with software TSC rate scaling, we currently have per-cpu rates. Obviously this isn't too desirable to use in practice, but if for some reason we do change the rate of all VCPUs at runtime, then reset the TSCs, we will only want to match offsets for VCPUs running at the same rate. Finally, for the case where we have an unstable host TSC, but rate scaling is being done in hardware, we should call the platform code to compute the TSC offset, so the math is reorganized to recompute the base instead, then transform the base into an offset using the existing API. [avi: fix 64-bit division on i386] Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti KVM: Fix 64-bit division in kvm_write_tsc() Breaks i386 build. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 44 +++++++++++++++++++++++++++-------------- 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ddebbe01fff9..8a34fca6c572 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -513,6 +513,7 @@ struct kvm_arch { u64 last_tsc_nsec; u64 last_tsc_offset; u64 last_tsc_write; + u32 last_tsc_khz; struct kvm_xen_hvm_config xen_hvm_config; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 41bb90acb238..4390f42b371f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1025,33 +1025,46 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; - s64 sdiff; + s64 nsdiff; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); ns = get_kernel_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; - sdiff = data - kvm->arch.last_tsc_write; - if (sdiff < 0) - sdiff = -sdiff; + + /* n.b - signed multiplication and division required */ + nsdiff = data - kvm->arch.last_tsc_write; +#ifdef CONFIG_X86_64 + nsdiff = (nsdiff * 1000) / vcpu->arch.virtual_tsc_khz; +#else + /* do_div() only does unsigned */ + asm("idivl %2; xor %%edx, %%edx" + : "=A"(nsdiff) + : "A"(nsdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz)); +#endif + nsdiff -= elapsed; + if (nsdiff < 0) + nsdiff = -nsdiff; /* - * Special case: close write to TSC within 5 seconds of - * another CPU is interpreted as an attempt to synchronize - * The 5 seconds is to accommodate host load / swapping as - * well as any reset of TSC during the boot process. - * - * In that case, for a reliable TSC, we can match TSC offsets, - * or make a best guest using elapsed value. - */ - if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) && - elapsed < 5ULL * NSEC_PER_SEC) { + * Special case: TSC write with a small delta (1 second) of virtual + * cycle time against real time is interpreted as an attempt to + * synchronize the CPU. + * + * For a reliable TSC, we can match TSC offsets, and for an unstable + * TSC, we add elapsed time in this computation. We could let the + * compensation code attempt to catch up if we fall behind, but + * it's better to try to match offsets from the beginning. + */ + if (nsdiff < NSEC_PER_SEC && + vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { if (!check_tsc_unstable()) { offset = kvm->arch.last_tsc_offset; pr_debug("kvm: matched tsc offset for %llu\n", data); } else { u64 delta = nsec_to_cycles(vcpu, elapsed); - offset += delta; + data += delta; + offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } ns = kvm->arch.last_tsc_nsec; @@ -1059,6 +1072,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) kvm->arch.last_tsc_nsec = ns; kvm->arch.last_tsc_write = data; kvm->arch.last_tsc_offset = offset; + kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; kvm_x86_ops->write_tsc_offset(vcpu, offset); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); -- cgit v1.2.3 From 4dd7980b21408624e9b6f3df05719c3c61db6e9f Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 3 Feb 2012 15:43:52 -0200 Subject: KVM: Leave TSC synchronization window open with each new sync Currently, when the TSC is written by the guest, the variable ns is updated to force the current write to appear to have taken place at the time of the first write in this sync phase. This leaves a cliff at the end of the match window where updates will fall of the end. There are two scenarios where this can be a problem in practe - first, on a system with a large number of VCPUs, the sync period may last for an extended period of time. The second way this can happen is if the VM reboots very rapidly and we catch a VCPU TSC synchronization just around the edge. We may be unaware of the reboot, and thus the first VCPU might synchronize with an old set of the timer (at, say 0.97 seconds ago, when first powered on). The second VCPU can come in 0.04 seconds later to try to synchronize, but it misses the window because it is just over the threshold. Instead, stop doing this artificial setback of the ns variable and just update it with every write of the TSC. It may be observed that doing so causes values computed by compute_guest_tsc to diverge slightly across CPUs - note that the last_tsc_ns and last_tsc_write variable are used here, and now they last_tsc_ns will be different for each VCPU, reflecting the actual time of the update. However, compute_guest_tsc is used only for guests which already have TSC stability issues, and further, note that the previous patch has caused last_tsc_write to be incremented by the difference in nanoseconds, converted back into guest cycles. As such, only boundary rounding errors should be visible, which given the resolution in nanoseconds, is going to only be a few cycles and only visible in cross-CPU consistency tests. The problem can be fixed by adding a new set of variables to track the start offset and start write value for the current sync cycle. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4390f42b371f..030d495e5c78 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1067,7 +1067,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } - ns = kvm->arch.last_tsc_nsec; } kvm->arch.last_tsc_nsec = ns; kvm->arch.last_tsc_write = data; -- cgit v1.2.3 From b183aa580a3a09b5d79224a9022418508532c778 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 3 Feb 2012 15:43:53 -0200 Subject: KVM: Fix last_guest_tsc / tsc_offset semantics The variable last_guest_tsc was being used as an ad-hoc indicator that guest TSC has been initialized and recorded correctly. However, it may not have been, it could be that guest TSC has been set to some large value, the back to a small value (by, say, a software reboot). This defeats the logic and causes KVM to falsely assume that the guest TSC has gone backwards, marking the host TSC unstable, which is undesirable behavior. In addition, rather than try to compute an offset adjustment for the TSC on unstable platforms, just recompute the whole offset. This allows us to get rid of one callsite for adjust_tsc_offset, which is problematic because the units it takes are in guest units, but here, the computation was originally being done in host units. Doing this, and also recording last_guest_tsc when the TSC is written allow us to remove the tricky logic which depended on last_guest_tsc being zero to indicate a reset of uninitialized value. Instead, we now have the guarantee that the guest TSC offset is always at least something which will get us last_guest_tsc. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 030d495e5c78..2a59f76d96f1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1079,6 +1079,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) vcpu->arch.hv_clock.tsc_timestamp = 0; vcpu->arch.last_tsc_write = data; vcpu->arch.last_tsc_nsec = ns; + vcpu->arch.last_guest_tsc = data; } EXPORT_SYMBOL_GPL(kvm_write_tsc); @@ -1147,7 +1148,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) * observed by the guest and ensure the new system time is greater. */ max_kernel_ns = 0; - if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) { + if (vcpu->hv_clock.tsc_timestamp) { max_kernel_ns = vcpu->last_guest_tsc - vcpu->hv_clock.tsc_timestamp; max_kernel_ns = pvclock_scale_delta(max_kernel_ns, @@ -2257,13 +2258,14 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) u64 tsc; tsc = kvm_x86_ops->read_l1_tsc(vcpu); - tsc_delta = !vcpu->arch.last_guest_tsc ? 0 : - tsc - vcpu->arch.last_guest_tsc; + tsc_delta = tsc - vcpu->arch.last_guest_tsc; if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); if (check_tsc_unstable()) { - kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); + u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu, + vcpu->arch.last_guest_tsc); + kvm_x86_ops->write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; } kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); -- cgit v1.2.3 From 6f526ec5383dcd5fa5ffc7b3ac1d62099a0b46ad Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 3 Feb 2012 15:43:54 -0200 Subject: KVM: Add last_host_tsc tracking back to KVM The variable last_host_tsc was removed from upstream code. I am adding it back for two reasons. First, it is unnecessary to use guest TSC computation to conclude information about the host TSC. The guest may set the TSC backwards (this case handled by the previous patch), but the computation of guest TSC (and fetching an MSR) is significanlty more work and complexity than simply reading the hardware counter. In addition, we don't actually need the guest TSC for any part of the computation, by always recomputing the offset, we can eliminate the need to deal with the current offset and any scaling factors that may apply. The second reason is that later on, we are going to be using the host TSC value to restore TSC offsets after a host S4 suspend, so we need to be reading the host values, not the guest values here. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 11 +++-------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8a34fca6c572..b23682900f41 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -422,6 +422,7 @@ struct kvm_vcpu_arch { u64 last_kernel_ns; u64 last_tsc_nsec; u64 last_tsc_write; + u64 last_host_tsc; bool tsc_catchup; bool tsc_always_catchup; s8 virtual_tsc_shift; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2a59f76d96f1..39a57dac884a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2253,13 +2253,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_x86_ops->vcpu_load(vcpu, cpu); if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { - /* Make sure TSC doesn't go backwards */ - s64 tsc_delta; - u64 tsc; - - tsc = kvm_x86_ops->read_l1_tsc(vcpu); - tsc_delta = tsc - vcpu->arch.last_guest_tsc; - + s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : + native_read_tsc() - vcpu->arch.last_host_tsc; if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); if (check_tsc_unstable()) { @@ -2282,7 +2277,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); - vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); + vcpu->arch.last_host_tsc = native_read_tsc(); } static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, -- cgit v1.2.3 From f1e2b26003c41e581243c09ceed7567677449468 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 3 Feb 2012 15:43:55 -0200 Subject: KVM: Allow adjust_tsc_offset to be in host or guest cycles Redefine the API to take a parameter indicating whether an adjustment is in host or guest cycles. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 13 ++++++++++++- arch/x86/kvm/svm.c | 6 +++++- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 2 +- 4 files changed, 19 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b23682900f41..dd439f13df84 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -646,7 +646,7 @@ struct kvm_x86_ops { u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); int (*get_lpage_level)(void); bool (*rdtscp_supported)(void); - void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment); + void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host); void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); @@ -676,6 +676,17 @@ struct kvm_arch_async_pf { extern struct kvm_x86_ops *kvm_x86_ops; +static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, + s64 adjustment) +{ + kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, false); +} + +static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) +{ + kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, true); +} + int kvm_mmu_module_init(void); void kvm_mmu_module_exit(void); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e12026e5244e..0b7690ee20bd 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1016,10 +1016,14 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } -static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) +static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) { struct vcpu_svm *svm = to_svm(vcpu); + WARN_ON(adjustment < 0); + if (host) + adjustment = svm_scale_tsc(vcpu, adjustment); + svm->vmcb->control.tsc_offset += adjustment; if (is_guest_mode(vcpu)) svm->nested.hsave->control.tsc_offset += adjustment; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e6bf61fa1c03..575fb742a6fc 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1856,7 +1856,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) } } -static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) +static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) { u64 offset = vmcs_read64(TSC_OFFSET); vmcs_write64(TSC_OFFSET, offset + adjustment); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 39a57dac884a..3b931302fa55 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1116,7 +1116,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) if (vcpu->tsc_catchup) { u64 tsc = compute_guest_tsc(v, kernel_ns); if (tsc > tsc_timestamp) { - kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp); + adjust_tsc_offset_guest(v, tsc - tsc_timestamp); tsc_timestamp = tsc; } } -- cgit v1.2.3 From 0dd6a6edb0124e6c71931ff575b18e15ed6e8603 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 3 Feb 2012 15:43:56 -0200 Subject: KVM: Dont mark TSC unstable due to S4 suspend During a host suspend, TSC may go backwards, which KVM interprets as an unstable TSC. Technically, KVM should not be marking the TSC unstable, which causes the TSC clocksource to go bad, but we need to be adjusting the TSC offsets in such a case. Dealing with this issue is a little tricky as the only place we can reliably do it is before much of the timekeeping infrastructure is up and running. On top of this, we are not in a KVM thread context, so we may not be able to safely access VCPU fields. Instead, we compute our best known hardware offset at power-up and stash it to be applied to all VCPUs when they actually start running. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 93 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 89 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dd439f13df84..4fbeb84b1818 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -423,6 +423,7 @@ struct kvm_vcpu_arch { u64 last_tsc_nsec; u64 last_tsc_write; u64 last_host_tsc; + u64 tsc_offset_adjustment; bool tsc_catchup; bool tsc_always_catchup; s8 virtual_tsc_shift; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3b931302fa55..4e9bd23d522d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2252,6 +2252,14 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } kvm_x86_ops->vcpu_load(vcpu, cpu); + + /* Apply any externally detected TSC adjustments (due to suspend) */ + if (unlikely(vcpu->arch.tsc_offset_adjustment)) { + adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); + vcpu->arch.tsc_offset_adjustment = 0; + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + } + if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : native_read_tsc() - vcpu->arch.last_host_tsc; @@ -5964,13 +5972,88 @@ int kvm_arch_hardware_enable(void *garbage) struct kvm *kvm; struct kvm_vcpu *vcpu; int i; + int ret; + u64 local_tsc; + u64 max_tsc = 0; + bool stable, backwards_tsc = false; kvm_shared_msr_cpu_online(); - list_for_each_entry(kvm, &vm_list, vm_list) - kvm_for_each_vcpu(i, vcpu, kvm) - if (vcpu->cpu == smp_processor_id()) - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - return kvm_x86_ops->hardware_enable(garbage); + ret = kvm_x86_ops->hardware_enable(garbage); + if (ret != 0) + return ret; + + local_tsc = native_read_tsc(); + stable = !check_tsc_unstable(); + list_for_each_entry(kvm, &vm_list, vm_list) { + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!stable && vcpu->cpu == smp_processor_id()) + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + if (stable && vcpu->arch.last_host_tsc > local_tsc) { + backwards_tsc = true; + if (vcpu->arch.last_host_tsc > max_tsc) + max_tsc = vcpu->arch.last_host_tsc; + } + } + } + + /* + * Sometimes, even reliable TSCs go backwards. This happens on + * platforms that reset TSC during suspend or hibernate actions, but + * maintain synchronization. We must compensate. Fortunately, we can + * detect that condition here, which happens early in CPU bringup, + * before any KVM threads can be running. Unfortunately, we can't + * bring the TSCs fully up to date with real time, as we aren't yet far + * enough into CPU bringup that we know how much real time has actually + * elapsed; our helper function, get_kernel_ns() will be using boot + * variables that haven't been updated yet. + * + * So we simply find the maximum observed TSC above, then record the + * adjustment to TSC in each VCPU. When the VCPU later gets loaded, + * the adjustment will be applied. Note that we accumulate + * adjustments, in case multiple suspend cycles happen before some VCPU + * gets a chance to run again. In the event that no KVM threads get a + * chance to run, we will miss the entire elapsed period, as we'll have + * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may + * loose cycle time. This isn't too big a deal, since the loss will be + * uniform across all VCPUs (not to mention the scenario is extremely + * unlikely). It is possible that a second hibernate recovery happens + * much faster than a first, causing the observed TSC here to be + * smaller; this would require additional padding adjustment, which is + * why we set last_host_tsc to the local tsc observed here. + * + * N.B. - this code below runs only on platforms with reliable TSC, + * as that is the only way backwards_tsc is set above. Also note + * that this runs for ALL vcpus, which is not a bug; all VCPUs should + * have the same delta_cyc adjustment applied if backwards_tsc + * is detected. Note further, this adjustment is only done once, + * as we reset last_host_tsc on all VCPUs to stop this from being + * called multiple times (one for each physical CPU bringup). + * + * Platforms with unnreliable TSCs don't have to deal with this, they + * will be compensated by the logic in vcpu_load, which sets the TSC to + * catchup mode. This will catchup all VCPUs to real time, but cannot + * guarantee that they stay in perfect synchronization. + */ + if (backwards_tsc) { + u64 delta_cyc = max_tsc - local_tsc; + list_for_each_entry(kvm, &vm_list, vm_list) { + kvm_for_each_vcpu(i, vcpu, kvm) { + vcpu->arch.tsc_offset_adjustment += delta_cyc; + vcpu->arch.last_host_tsc = local_tsc; + } + + /* + * We have to disable TSC offset matching.. if you were + * booting a VM while issuing an S4 host suspend.... + * you may have some problem. Solving this issue is + * left as an exercise to the reader. + */ + kvm->arch.last_tsc_nsec = 0; + kvm->arch.last_tsc_write = 0; + } + + } + return 0; } void kvm_arch_hardware_disable(void *garbage) -- cgit v1.2.3 From e26101b116a6235bcd80b3a4c38c9fe91286cd79 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 3 Feb 2012 15:43:57 -0200 Subject: KVM: Track TSC synchronization in generations This allows us to track the original nanosecond and counter values at each phase of TSC writing by the guest. This gets us perfect offset matching for stable TSC systems, and perfect software computed TSC matching for machines with unstable TSC. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 10 +++++++--- arch/x86/kvm/x86.c | 41 +++++++++++++++++++++++++++++++++-------- 2 files changed, 40 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4fbeb84b1818..c24125cd0c63 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -420,10 +420,11 @@ struct kvm_vcpu_arch { u64 last_guest_tsc; u64 last_kernel_ns; - u64 last_tsc_nsec; - u64 last_tsc_write; u64 last_host_tsc; u64 tsc_offset_adjustment; + u64 this_tsc_nsec; + u64 this_tsc_write; + u8 this_tsc_generation; bool tsc_catchup; bool tsc_always_catchup; s8 virtual_tsc_shift; @@ -513,9 +514,12 @@ struct kvm_arch { s64 kvmclock_offset; raw_spinlock_t tsc_write_lock; u64 last_tsc_nsec; - u64 last_tsc_offset; u64 last_tsc_write; u32 last_tsc_khz; + u64 cur_tsc_nsec; + u64 cur_tsc_write; + u64 cur_tsc_offset; + u8 cur_tsc_generation; struct kvm_xen_hvm_config xen_hvm_config; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4e9bd23d522d..e86f9b22eaca 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1013,10 +1013,10 @@ static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) { - u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, + u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec, vcpu->arch.virtual_tsc_mult, vcpu->arch.virtual_tsc_shift); - tsc += vcpu->arch.last_tsc_write; + tsc += vcpu->arch.this_tsc_write; return tsc; } @@ -1059,7 +1059,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) if (nsdiff < NSEC_PER_SEC && vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { if (!check_tsc_unstable()) { - offset = kvm->arch.last_tsc_offset; + offset = kvm->arch.cur_tsc_offset; pr_debug("kvm: matched tsc offset for %llu\n", data); } else { u64 delta = nsec_to_cycles(vcpu, elapsed); @@ -1067,20 +1067,45 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } + } else { + /* + * We split periods of matched TSC writes into generations. + * For each generation, we track the original measured + * nanosecond time, offset, and write, so if TSCs are in + * sync, we can match exact offset, and if not, we can match + * exact software computaion in compute_guest_tsc() + * + * These values are tracked in kvm->arch.cur_xxx variables. + */ + kvm->arch.cur_tsc_generation++; + kvm->arch.cur_tsc_nsec = ns; + kvm->arch.cur_tsc_write = data; + kvm->arch.cur_tsc_offset = offset; + pr_debug("kvm: new tsc generation %u, clock %llu\n", + kvm->arch.cur_tsc_generation, data); } + + /* + * We also track th most recent recorded KHZ, write and time to + * allow the matching interval to be extended at each write. + */ kvm->arch.last_tsc_nsec = ns; kvm->arch.last_tsc_write = data; - kvm->arch.last_tsc_offset = offset; kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; - kvm_x86_ops->write_tsc_offset(vcpu, offset); - raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); /* Reset of TSC must disable overshoot protection below */ vcpu->arch.hv_clock.tsc_timestamp = 0; - vcpu->arch.last_tsc_write = data; - vcpu->arch.last_tsc_nsec = ns; vcpu->arch.last_guest_tsc = data; + + /* Keep track of which generation this VCPU has synchronized to */ + vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; + vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; + vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; + + kvm_x86_ops->write_tsc_offset(vcpu, offset); + raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); } + EXPORT_SYMBOL_GPL(kvm_write_tsc); static int kvm_guest_time_update(struct kvm_vcpu *v) -- cgit v1.2.3 From 10166744b80a41c30d82bc6e11140f5b28d257ab Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Tue, 7 Feb 2012 23:19:20 +0530 Subject: KVM: VMX: remove yield_on_hlt yield_on_hlt was introduced for CPU bandwidth capping. Now it is redundant with CFS hardlimit. yield_on_hlt also complicates the scenario in paravirtual environment, that needs to trap halt. for e.g. paravirtualized ticket spinlocks. Acked-by: Anthony Liguori Signed-off-by: Raghavendra K T Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 575fb742a6fc..d2bd719925a6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -70,9 +70,6 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO); static bool __read_mostly vmm_exclusive = 1; module_param(vmm_exclusive, bool, S_IRUGO); -static bool __read_mostly yield_on_hlt = 1; -module_param(yield_on_hlt, bool, S_IRUGO); - static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, S_IRUGO); @@ -1655,17 +1652,6 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) vmx_set_interrupt_shadow(vcpu, 0); } -static void vmx_clear_hlt(struct kvm_vcpu *vcpu) -{ - /* Ensure that we clear the HLT state in the VMCS. We don't need to - * explicitly skip the instruction because if the HLT state is set, then - * the instruction is already executing and RIP has already been - * advanced. */ - if (!yield_on_hlt && - vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) - vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); -} - /* * KVM wants to inject page-faults which it got to the guest. This function * checks whether in a nested guest, we need to inject them to L1 or L2. @@ -1718,7 +1704,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, intr_info |= INTR_TYPE_HARD_EXCEPTION; vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); - vmx_clear_hlt(vcpu); } static bool vmx_rdtscp_supported(void) @@ -2405,7 +2390,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) &_pin_based_exec_control) < 0) return -EIO; - min = + min = CPU_BASED_HLT_EXITING | #ifdef CONFIG_X86_64 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | @@ -2420,9 +2405,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_INVLPG_EXITING | CPU_BASED_RDPMC_EXITING; - if (yield_on_hlt) - min |= CPU_BASED_HLT_EXITING; - opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; @@ -4009,7 +3991,6 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) } else intr |= INTR_TYPE_EXT_INTR; vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); - vmx_clear_hlt(vcpu); } static void vmx_inject_nmi(struct kvm_vcpu *vcpu) @@ -4041,7 +4022,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) } vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); - vmx_clear_hlt(vcpu); } static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 6dbf79e7164e9a86c1e466062c48498142ae6128 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sun, 5 Feb 2012 20:42:41 +0900 Subject: KVM: Fix write protection race during dirty logging This patch fixes a race introduced by: commit 95d4c16ce78cb6b7549a09159c409d52ddd18dae KVM: Optimize dirty logging by rmap_write_protect() During protecting pages for dirty logging, other threads may also try to protect a page in mmu_sync_children() or kvm_mmu_get_page(). In such a case, because get_dirty_log releases mmu_lock before flushing TLB's, the following race condition can happen: A (get_dirty_log) B (another thread) lock(mmu_lock) clear pte.w unlock(mmu_lock) lock(mmu_lock) pte.w is already cleared unlock(mmu_lock) skip TLB flush return ... TLB flush Though thread B assumes the page has already been protected when it returns, the remaining TLB entry will break that assumption. This patch fixes this problem by making get_dirty_log hold the mmu_lock until it flushes the TLB's. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e86f9b22eaca..3df0b7a140b0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3065,6 +3065,8 @@ static void write_protect_slot(struct kvm *kvm, unsigned long *dirty_bitmap, unsigned long nr_dirty_pages) { + spin_lock(&kvm->mmu_lock); + /* Not many dirty pages compared to # of shadow pages. */ if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) { unsigned long gfn_offset; @@ -3072,16 +3074,13 @@ static void write_protect_slot(struct kvm *kvm, for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) { unsigned long gfn = memslot->base_gfn + gfn_offset; - spin_lock(&kvm->mmu_lock); kvm_mmu_rmap_write_protect(kvm, gfn, memslot); - spin_unlock(&kvm->mmu_lock); } kvm_flush_remote_tlbs(kvm); - } else { - spin_lock(&kvm->mmu_lock); + } else kvm_mmu_slot_remove_write_access(kvm, memslot->id); - spin_unlock(&kvm->mmu_lock); - } + + spin_unlock(&kvm->mmu_lock); } /* -- cgit v1.2.3 From 851755871c1f3184f4124c466e85881f17fa3226 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 6 Feb 2012 10:59:02 +0100 Subject: KVM: s390: Sanitize fpc registers for KVM_SET_FPU commit 7eef87dc99e419b1cc051e4417c37e4744d7b661 (KVM: s390: fix register setting) added a load of the floating point control register to the KVM_SET_FPU path. Lets make sure that the fpc is valid. Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/s390/kvm/kvm-s390.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 0b91679369bc..121316ef2e15 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -460,7 +460,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { memcpy(&vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs)); - vcpu->arch.guest_fpregs.fpc = fpu->fpc; + vcpu->arch.guest_fpregs.fpc = fpu->fpc & FPC_VALID_MASK; restore_fp_regs(&vcpu->arch.guest_fpregs); return 0; } -- cgit v1.2.3 From 9e0d5473e2f0ba2d2fe9dab9408edef3060b710e Mon Sep 17 00:00:00 2001 From: Jens Freimann Date: Mon, 6 Feb 2012 10:59:03 +0100 Subject: KVM: s390: do store status after handling STOP_ON_STOP bit In handle_stop() handle the stop bit before doing the store status as described for "Stop and Store Status" in the Principles of Operation. We have to give up the local_int.lock before calling kvm store status since it calls gmap_fault() which might sleep. Since local_int.lock only protects local_int.* and not guest memory we can give up the lock. Signed-off-by: Jens Freimann Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/s390/kvm/intercept.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 776ef83c2771..361456577c6f 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -133,13 +133,6 @@ static int handle_stop(struct kvm_vcpu *vcpu) vcpu->stat.exit_stop_request++; spin_lock_bh(&vcpu->arch.local_int.lock); - if (vcpu->arch.local_int.action_bits & ACTION_STORE_ON_STOP) { - vcpu->arch.local_int.action_bits &= ~ACTION_STORE_ON_STOP; - rc = kvm_s390_vcpu_store_status(vcpu, - KVM_S390_STORE_STATUS_NOADDR); - if (rc >= 0) - rc = -EOPNOTSUPP; - } if (vcpu->arch.local_int.action_bits & ACTION_RELOADVCPU_ON_STOP) { vcpu->arch.local_int.action_bits &= ~ACTION_RELOADVCPU_ON_STOP; @@ -155,7 +148,18 @@ static int handle_stop(struct kvm_vcpu *vcpu) rc = -EOPNOTSUPP; } - spin_unlock_bh(&vcpu->arch.local_int.lock); + if (vcpu->arch.local_int.action_bits & ACTION_STORE_ON_STOP) { + vcpu->arch.local_int.action_bits &= ~ACTION_STORE_ON_STOP; + /* store status must be called unlocked. Since local_int.lock + * only protects local_int.* and not guest memory we can give + * up the lock here */ + spin_unlock_bh(&vcpu->arch.local_int.lock); + rc = kvm_s390_vcpu_store_status(vcpu, + KVM_S390_STORE_STATUS_NOADDR); + if (rc >= 0) + rc = -EOPNOTSUPP; + } else + spin_unlock_bh(&vcpu->arch.local_int.lock); return rc; } -- cgit v1.2.3 From 151104a7b3a82f9c56d636595ae58084049d2559 Mon Sep 17 00:00:00 2001 From: Jens Freimann Date: Wed, 8 Feb 2012 08:28:29 +0100 Subject: KVM: s390: make sigp restart return busy when stop pending On reboot the guest sends in smp_send_stop() a sigp stop to all CPUs except for current CPU. Then the guest switches to the IPL cpu by sending a restart to the IPL CPU, followed by a sigp stop to the current cpu. Since restart is handled by userspace it's possible that the restart is delivered before the old stop. This means that the IPL CPU isn't restarted and we have no running CPUs. So let's make sure that there is no stop action pending when we do the restart. Signed-off-by: Jens Freimann Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/s390/kvm/sigp.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 30eb0f73f9d5..c703b1cbb0aa 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -309,6 +309,34 @@ static int __sigp_sense_running(struct kvm_vcpu *vcpu, u16 cpu_addr, return rc; } +static int __sigp_restart(struct kvm_vcpu *vcpu, u16 cpu_addr) +{ + int rc = 0; + struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; + struct kvm_s390_local_interrupt *li; + + if (cpu_addr >= KVM_MAX_VCPUS) + return 3; /* not operational */ + + spin_lock(&fi->lock); + li = fi->local_int[cpu_addr]; + if (li == NULL) { + rc = 3; /* not operational */ + goto out; + } + + spin_lock_bh(&li->lock); + if (li->action_bits & ACTION_STOP_ON_STOP) + rc = 2; /* busy */ + else + VCPU_EVENT(vcpu, 4, "sigp restart %x to handle userspace", + cpu_addr); + spin_unlock_bh(&li->lock); +out: + spin_unlock(&fi->lock); + return rc; +} + int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) { int r1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4; @@ -372,6 +400,9 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) break; case SIGP_RESTART: vcpu->stat.instruction_sigp_restart++; + rc = __sigp_restart(vcpu, cpu_addr); + if (rc == 2) /* busy */ + break; /* user space must know about restart */ default: return -EOPNOTSUPP; -- cgit v1.2.3 From 24a13044a84be51a6a4885a72ac9d5f4ed0742d0 Mon Sep 17 00:00:00 2001 From: Jens Freimann Date: Mon, 6 Feb 2012 10:59:05 +0100 Subject: KVM: s390: ignore sigp stop overinitiative In __inject_sigp_stop() do nothing when the CPU is already in stopped state. Signed-off-by: Jens Freimann Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/s390/kvm/sigp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index c703b1cbb0aa..f3d5cc297012 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -160,12 +160,15 @@ static int __inject_sigp_stop(struct kvm_s390_local_interrupt *li, int action) inti->type = KVM_S390_SIGP_STOP; spin_lock_bh(&li->lock); + if ((atomic_read(li->cpuflags) & CPUSTAT_STOPPED)) + goto out; list_add_tail(&inti->list, &li->list); atomic_set(&li->active, 1); atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags); li->action_bits |= action; if (waitqueue_active(&li->wq)) wake_up_interruptible(&li->wq); +out: spin_unlock_bh(&li->lock); return 0; /* order accepted */ -- cgit v1.2.3 From 9ec2d6dc6c4d4792b85dec2f09d39387fb7157d1 Mon Sep 17 00:00:00 2001 From: Jens Freimann Date: Mon, 6 Feb 2012 10:59:06 +0100 Subject: KVM: s390: add stop_on_stop flag when doing stop and store When we do a stop and store status we need to pass ACTION_STOP_ON_STOP flag to __sigp_stop(). Signed-off-by: Jens Freimann Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/s390/kvm/sigp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index f3d5cc297012..0ad4cf238391 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -385,7 +385,8 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) break; case SIGP_STOP_STORE_STATUS: vcpu->stat.instruction_sigp_stop++; - rc = __sigp_stop(vcpu, cpu_addr, ACTION_STORE_ON_STOP); + rc = __sigp_stop(vcpu, cpu_addr, ACTION_STORE_ON_STOP | + ACTION_STOP_ON_STOP); break; case SIGP_SET_ARCH: vcpu->stat.instruction_sigp_arch++; -- cgit v1.2.3 From 9eed0735ca6a5cf386a4998ad4b6d52d1e29353f Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 6 Feb 2012 10:59:07 +0100 Subject: KVM: s390: provide control registers via kvm_run There are several cases were we need the control registers for userspace. Lets also provide those in kvm_run. Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/s390/include/asm/kvm.h | 2 ++ arch/s390/kvm/kvm-s390.c | 9 ++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/kvm.h b/arch/s390/include/asm/kvm.h index 9acbde4af297..96076676e224 100644 --- a/arch/s390/include/asm/kvm.h +++ b/arch/s390/include/asm/kvm.h @@ -44,10 +44,12 @@ struct kvm_guest_debug_arch { #define KVM_SYNC_PREFIX (1UL << 0) #define KVM_SYNC_GPRS (1UL << 1) #define KVM_SYNC_ACRS (1UL << 2) +#define KVM_SYNC_CRS (1UL << 3) /* definition of registers in kvm_run */ struct kvm_sync_regs { __u64 prefix; /* prefix register */ __u64 gprs[16]; /* general purpose registers */ __u32 acrs[16]; /* access registers */ + __u64 crs[16]; /* control registers */ }; #endif diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 121316ef2e15..cf3c0a91d046 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -291,7 +291,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) vcpu->arch.gmap = vcpu->kvm->arch.gmap; vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX | KVM_SYNC_GPRS | - KVM_SYNC_ACRS; + KVM_SYNC_ACRS | + KVM_SYNC_CRS; return 0; } @@ -580,6 +581,11 @@ rerun_vcpu: kvm_run->kvm_dirty_regs &= ~KVM_SYNC_PREFIX; kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix); } + if (kvm_run->kvm_dirty_regs & KVM_SYNC_CRS) { + kvm_run->kvm_dirty_regs &= ~KVM_SYNC_CRS; + memcpy(&vcpu->arch.sie_block->gcr, &kvm_run->s.regs.crs, 128); + kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix); + } might_fault(); @@ -629,6 +635,7 @@ rerun_vcpu: kvm_run->psw_mask = vcpu->arch.sie_block->gpsw.mask; kvm_run->psw_addr = vcpu->arch.sie_block->gpsw.addr; kvm_run->s.regs.prefix = vcpu->arch.sie_block->prefix; + memcpy(&kvm_run->s.regs.crs, &vcpu->arch.sie_block->gcr, 128); if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); -- cgit v1.2.3 From fb03cb6f44236f4bef62a0dda8e025ff5ca51417 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 8 Feb 2012 12:59:10 +0900 Subject: KVM: Introduce gfn_to_index() which returns the index for a given level This patch cleans up the code and removes the "(void)level;" warning suppressor. Note that we can also use this for PT_PAGE_TABLE_LEVEL to treat every level uniformly later. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 3 +-- include/linux/kvm_host.h | 7 +++++++ virt/kvm/kvm_main.c | 7 +------ 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ae76cc3392e1..37e7f100a0e0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -688,8 +688,7 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, { unsigned long idx; - idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - - (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); + idx = gfn_to_index(gfn, slot->base_gfn, level); return &slot->lpage_info[level - 2][idx]; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9698080c902b..7a08496b974a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -681,6 +681,13 @@ static inline int memslot_id(struct kvm *kvm, gfn_t gfn) return gfn_to_memslot(kvm, gfn)->id; } +static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) +{ + /* KVM_HPAGE_GFN_SHIFT(PT_PAGE_TABLE_LEVEL) must be 0. */ + return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - + (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); +} + static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 470e30520fe8..415fe816fc15 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -784,15 +784,10 @@ int __kvm_set_memory_region(struct kvm *kvm, int lpages; int level = i + 2; - /* Avoid unused variable warning if no large pages */ - (void)level; - if (new.lpage_info[i]) continue; - lpages = 1 + ((base_gfn + npages - 1) - >> KVM_HPAGE_GFN_SHIFT(level)); - lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level); + lpages = gfn_to_index(base_gfn + npages - 1, base_gfn, level) + 1; new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i])); -- cgit v1.2.3 From a64f273a08d16bc66ccc5546bd28b1bba554ec81 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 8 Feb 2012 13:00:13 +0900 Subject: KVM: Split lpage_info creation out from __kvm_set_memory_region() This makes it easy to make lpage_info architecture specific. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- virt/kvm/kvm_main.c | 83 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 52 insertions(+), 31 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 415fe816fc15..7adaa2063415 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -685,6 +685,56 @@ void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new) slots->generation++; } +#ifndef CONFIG_S390 +static int create_lpage_info(struct kvm_memory_slot *slot, unsigned long npages) +{ + int i; + + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + unsigned long ugfn; + int lpages; + int level = i + 2; + + if (slot->lpage_info[i]) + continue; + + lpages = gfn_to_index(slot->base_gfn + npages - 1, + slot->base_gfn, level) + 1; + + slot->lpage_info[i] = vzalloc(lpages * sizeof(*slot->lpage_info[i])); + if (!slot->lpage_info[i]) + goto out_free; + + if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) + slot->lpage_info[i][0].write_count = 1; + if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) + slot->lpage_info[i][lpages - 1].write_count = 1; + ugfn = slot->userspace_addr >> PAGE_SHIFT; + /* + * If the gfn and userspace address are not aligned wrt each + * other, or if explicitly asked to, disable large page + * support for this slot + */ + if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || + !largepages_enabled) { + unsigned long j; + + for (j = 0; j < lpages; ++j) + slot->lpage_info[i][j].write_count = 1; + } + } + + return 0; + +out_free: + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + vfree(slot->lpage_info[i]); + slot->lpage_info[i] = NULL; + } + return -ENOMEM; +} +#endif /* not defined CONFIG_S390 */ + /* * Allocate some memory and give it an address in the guest physical address * space. @@ -778,37 +828,8 @@ int __kvm_set_memory_region(struct kvm *kvm, if (!npages) goto skip_lpage; - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { - unsigned long ugfn; - unsigned long j; - int lpages; - int level = i + 2; - - if (new.lpage_info[i]) - continue; - - lpages = gfn_to_index(base_gfn + npages - 1, base_gfn, level) + 1; - - new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i])); - - if (!new.lpage_info[i]) - goto out_free; - - if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) - new.lpage_info[i][0].write_count = 1; - if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) - new.lpage_info[i][lpages - 1].write_count = 1; - ugfn = new.userspace_addr >> PAGE_SHIFT; - /* - * If the gfn and userspace address are not aligned wrt each - * other, or if explicitly asked to, disable large page - * support for this slot - */ - if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || - !largepages_enabled) - for (j = 0; j < lpages; ++j) - new.lpage_info[i][j].write_count = 1; - } + if (create_lpage_info(&new, npages)) + goto out_free; skip_lpage: -- cgit v1.2.3 From 189a2f7b24677deced3d2a9803969ba69f4b75f6 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 8 Feb 2012 13:01:09 +0900 Subject: KVM: Simplify ifndef conditional usage in __kvm_set_memory_region() Narrow down the controlled text inside the conditional so that it will include lpage_info and rmap stuff only. For this we change the way we check whether the slot is being created from "if (npages && !new.rmap)" to "if (npages && !old.npages)". We also stop checking if lpage_info is NULL when we create lpage_info because we do it from inside the slot creation code block. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- virt/kvm/kvm_main.c | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7adaa2063415..a30447c5eb4a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -616,7 +616,6 @@ static int kvm_vm_release(struct inode *inode, struct file *filp) return 0; } -#ifndef CONFIG_S390 /* * Allocation size is twice as large as the actual dirty bitmap size. * This makes it possible to do double buffering: see x86's @@ -624,6 +623,7 @@ static int kvm_vm_release(struct inode *inode, struct file *filp) */ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) { +#ifndef CONFIG_S390 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); if (dirty_bytes > PAGE_SIZE) @@ -636,9 +636,9 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) memslot->dirty_bitmap_head = memslot->dirty_bitmap; memslot->nr_dirty_pages = 0; +#endif /* !CONFIG_S390 */ return 0; } -#endif /* !CONFIG_S390 */ static int cmp_memslot(const void *slot1, const void *slot2) { @@ -695,9 +695,6 @@ static int create_lpage_info(struct kvm_memory_slot *slot, unsigned long npages) int lpages; int level = i + 2; - if (slot->lpage_info[i]) - continue; - lpages = gfn_to_index(slot->base_gfn + npages - 1, slot->base_gfn, level) + 1; @@ -815,23 +812,18 @@ int __kvm_set_memory_region(struct kvm *kvm, r = -ENOMEM; /* Allocate if a slot is being created */ + if (npages && !old.npages) { + new.user_alloc = user_alloc; + new.userspace_addr = mem->userspace_addr; #ifndef CONFIG_S390 - if (npages && !new.rmap) { new.rmap = vzalloc(npages * sizeof(*new.rmap)); - if (!new.rmap) goto out_free; - new.user_alloc = user_alloc; - new.userspace_addr = mem->userspace_addr; + if (create_lpage_info(&new, npages)) + goto out_free; +#endif /* not defined CONFIG_S390 */ } - if (!npages) - goto skip_lpage; - - if (create_lpage_info(&new, npages)) - goto out_free; - -skip_lpage: /* Allocate page dirty bitmap if needed */ if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { @@ -839,11 +831,6 @@ skip_lpage: goto out_free; /* destroy any largepage mappings for dirty tracking */ } -#else /* not defined CONFIG_S390 */ - new.user_alloc = user_alloc; - if (user_alloc) - new.userspace_addr = mem->userspace_addr; -#endif /* not defined CONFIG_S390 */ if (!npages) { struct kvm_memory_slot *slot; -- cgit v1.2.3 From db3fe4eb45f3555d91a7124e18cf3a2f2a30eb90 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 8 Feb 2012 13:02:18 +0900 Subject: KVM: Introduce kvm_memory_slot::arch and move lpage_info into it Some members of kvm_memory_slot are not used by every architecture. This patch is the first step to make this difference clear by introducing kvm_memory_slot::arch; lpage_info is moved into it. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/ia64/include/asm/kvm_host.h | 3 ++ arch/ia64/kvm/kvm-ia64.c | 10 ++++++ arch/powerpc/include/asm/kvm_host.h | 3 ++ arch/powerpc/kvm/powerpc.c | 10 ++++++ arch/s390/include/asm/kvm_host.h | 3 ++ arch/s390/kvm/kvm-s390.c | 10 ++++++ arch/x86/include/asm/kvm_host.h | 9 +++++ arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/x86.c | 59 +++++++++++++++++++++++++++++++ include/linux/kvm_host.h | 11 +++--- virt/kvm/kvm_main.c | 70 +++++-------------------------------- 11 files changed, 122 insertions(+), 68 deletions(-) diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index 2689ee54a1c9..e35b3a84a40b 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h @@ -459,6 +459,9 @@ struct kvm_sal_data { unsigned long boot_gp; }; +struct kvm_arch_memory_slot { +}; + struct kvm_arch { spinlock_t dirty_log_lock; diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 8ca7261e7b3d..d8ddbba6fe7d 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1571,6 +1571,16 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } +void kvm_arch_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ +} + +int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +{ + return 0; +} + int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 1843d5d2a3be..52eb9c1f4fe0 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -213,6 +213,9 @@ struct revmap_entry { #define KVMPPC_PAGE_WRITETHRU HPTE_R_W /* 0x40 */ #define KVMPPC_GOT_PAGE 0x80 +struct kvm_arch_memory_slot { +}; + struct kvm_arch { #ifdef CONFIG_KVM_BOOK3S_64_HV unsigned long hpt_virt; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 0e21d155eea7..00d7e345b3fe 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -281,6 +281,16 @@ long kvm_arch_dev_ioctl(struct file *filp, return -EINVAL; } +void kvm_arch_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ +} + +int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +{ + return 0; +} + int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index e6304268ea28..7343872890a2 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -245,6 +245,9 @@ struct kvm_vm_stat { u32 remote_tlb_flush; }; +struct kvm_arch_memory_slot { +}; + struct kvm_arch{ struct sca_block *sca; debug_info_t *dbf; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index cf3c0a91d046..17ad69d596fd 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -814,6 +814,16 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } +void kvm_arch_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ +} + +int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +{ + return 0; +} + /* Section: memory related */ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c24125cd0c63..74c9edf2bb18 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -483,6 +483,15 @@ struct kvm_vcpu_arch { } osvw; }; +struct kvm_lpage_info { + unsigned long rmap_pde; + int write_count; +}; + +struct kvm_arch_memory_slot { + struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; +}; + struct kvm_arch { unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 37e7f100a0e0..ff053ca32303 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -689,7 +689,7 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, unsigned long idx; idx = gfn_to_index(gfn, slot->base_gfn, level); - return &slot->lpage_info[level - 2][idx]; + return &slot->arch.lpage_info[level - 2][idx]; } static void account_shadowed(struct kvm *kvm, gfn_t gfn) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3df0b7a140b0..ca74c1dadf3a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6239,6 +6239,65 @@ void kvm_arch_destroy_vm(struct kvm *kvm) put_page(kvm->arch.ept_identity_pagetable); } +void kvm_arch_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + int i; + + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { + vfree(free->arch.lpage_info[i]); + free->arch.lpage_info[i] = NULL; + } + } +} + +int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +{ + int i; + + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + unsigned long ugfn; + int lpages; + int level = i + 2; + + lpages = gfn_to_index(slot->base_gfn + npages - 1, + slot->base_gfn, level) + 1; + + slot->arch.lpage_info[i] = + vzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); + if (!slot->arch.lpage_info[i]) + goto out_free; + + if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) + slot->arch.lpage_info[i][0].write_count = 1; + if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) + slot->arch.lpage_info[i][lpages - 1].write_count = 1; + ugfn = slot->userspace_addr >> PAGE_SHIFT; + /* + * If the gfn and userspace address are not aligned wrt each + * other, or if explicitly asked to, disable large page + * support for this slot + */ + if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || + !kvm_largepages_enabled()) { + unsigned long j; + + for (j = 0; j < lpages; ++j) + slot->arch.lpage_info[i][j].write_count = 1; + } + } + + return 0; + +out_free: + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + vfree(slot->arch.lpage_info[i]); + slot->arch.lpage_info[i] = NULL; + } + return -ENOMEM; +} + int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7a08496b974a..355e44555c39 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -171,11 +171,6 @@ static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) */ #define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1) -struct kvm_lpage_info { - unsigned long rmap_pde; - int write_count; -}; - struct kvm_memory_slot { gfn_t base_gfn; unsigned long npages; @@ -184,7 +179,7 @@ struct kvm_memory_slot { unsigned long *dirty_bitmap; unsigned long *dirty_bitmap_head; unsigned long nr_dirty_pages; - struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; + struct kvm_arch_memory_slot arch; unsigned long userspace_addr; int user_alloc; int id; @@ -376,6 +371,9 @@ int kvm_set_memory_region(struct kvm *kvm, int __kvm_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, int user_alloc); +void kvm_arch_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont); +int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages); int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, @@ -385,6 +383,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, struct kvm_memory_slot old, int user_alloc); +bool kvm_largepages_enabled(void); void kvm_disable_largepages(void); void kvm_arch_flush_shadow(struct kvm *kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a30447c5eb4a..8340e0e62034 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -535,21 +535,13 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) static void kvm_free_physmem_slot(struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { - int i; - if (!dont || free->rmap != dont->rmap) vfree(free->rmap); if (!dont || free->dirty_bitmap != dont->dirty_bitmap) kvm_destroy_dirty_bitmap(free); - - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { - if (!dont || free->lpage_info[i] != dont->lpage_info[i]) { - vfree(free->lpage_info[i]); - free->lpage_info[i] = NULL; - } - } + kvm_arch_free_memslot(free, dont); free->npages = 0; free->rmap = NULL; @@ -685,53 +677,6 @@ void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new) slots->generation++; } -#ifndef CONFIG_S390 -static int create_lpage_info(struct kvm_memory_slot *slot, unsigned long npages) -{ - int i; - - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { - unsigned long ugfn; - int lpages; - int level = i + 2; - - lpages = gfn_to_index(slot->base_gfn + npages - 1, - slot->base_gfn, level) + 1; - - slot->lpage_info[i] = vzalloc(lpages * sizeof(*slot->lpage_info[i])); - if (!slot->lpage_info[i]) - goto out_free; - - if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) - slot->lpage_info[i][0].write_count = 1; - if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) - slot->lpage_info[i][lpages - 1].write_count = 1; - ugfn = slot->userspace_addr >> PAGE_SHIFT; - /* - * If the gfn and userspace address are not aligned wrt each - * other, or if explicitly asked to, disable large page - * support for this slot - */ - if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || - !largepages_enabled) { - unsigned long j; - - for (j = 0; j < lpages; ++j) - slot->lpage_info[i][j].write_count = 1; - } - } - - return 0; - -out_free: - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { - vfree(slot->lpage_info[i]); - slot->lpage_info[i] = NULL; - } - return -ENOMEM; -} -#endif /* not defined CONFIG_S390 */ - /* * Allocate some memory and give it an address in the guest physical address * space. @@ -819,10 +764,9 @@ int __kvm_set_memory_region(struct kvm *kvm, new.rmap = vzalloc(npages * sizeof(*new.rmap)); if (!new.rmap) goto out_free; - - if (create_lpage_info(&new, npages)) - goto out_free; #endif /* not defined CONFIG_S390 */ + if (kvm_arch_create_memslot(&new, npages)) + goto out_free; } /* Allocate page dirty bitmap if needed */ @@ -880,8 +824,7 @@ int __kvm_set_memory_region(struct kvm *kvm, if (!npages) { new.rmap = NULL; new.dirty_bitmap = NULL; - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) - new.lpage_info[i] = NULL; + memset(&new.arch, 0, sizeof(new.arch)); } update_memslots(slots, &new); @@ -968,6 +911,11 @@ out: return r; } +bool kvm_largepages_enabled(void) +{ + return largepages_enabled; +} + void kvm_disable_largepages(void) { largepages_enabled = false; -- cgit v1.2.3 From 565f3be2174611f364405bbea2d86e153c2e7e78 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 10 Feb 2012 15:28:31 +0900 Subject: KVM: mmu_notifier: Flush TLBs before releasing mmu_lock Other threads may process the same page in that small window and skip TLB flush and then return before these functions do flush. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- virt/kvm/kvm_main.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8340e0e62034..e4431ada5947 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -289,15 +289,15 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, */ idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); + kvm->mmu_notifier_seq++; need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty; - spin_unlock(&kvm->mmu_lock); - srcu_read_unlock(&kvm->srcu, idx); - /* we've to flush the tlb before the pages can be freed */ if (need_tlb_flush) kvm_flush_remote_tlbs(kvm); + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); } static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, @@ -335,12 +335,12 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, for (; start < end; start += PAGE_SIZE) need_tlb_flush |= kvm_unmap_hva(kvm, start); need_tlb_flush |= kvm->tlbs_dirty; - spin_unlock(&kvm->mmu_lock); - srcu_read_unlock(&kvm->srcu, idx); - /* we've to flush the tlb before the pages can be freed */ if (need_tlb_flush) kvm_flush_remote_tlbs(kvm); + + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); } static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, @@ -378,13 +378,14 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); - young = kvm_age_hva(kvm, address); - spin_unlock(&kvm->mmu_lock); - srcu_read_unlock(&kvm->srcu, idx); + young = kvm_age_hva(kvm, address); if (young) kvm_flush_remote_tlbs(kvm); + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); + return young; } -- cgit v1.2.3 From 270c6c79f4e15e599f47174ecedad932463af7a2 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 16 Feb 2012 14:44:11 +0200 Subject: KVM: x86 emulator: correctly mask pmc index bits in RDPMC instruction emulation Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 7aad5446f393..3e48c1d3edcd 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -413,7 +413,7 @@ int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data) struct kvm_pmc *counters; u64 ctr; - pmc &= (3u << 30) - 1; + pmc &= ~(3u << 30); if (!fixed && pmc >= pmu->nr_arch_gp_counters) return 1; if (fixed && pmc >= pmu->nr_arch_fixed_counters) -- cgit v1.2.3 From 9cc815e46911486f52bec60517d0f7b40d323bbc Mon Sep 17 00:00:00 2001 From: Danny Kukawka Date: Thu, 16 Feb 2012 14:55:54 +0100 Subject: arch/powerpc/kvm/book3s_hv.c: included linux/sched.h twice arch/powerpc/kvm/book3s_hv.c: included 'linux/sched.h' twice, remove the duplicate. Signed-off-by: Danny Kukawka Signed-off-by: Avi Kivity --- arch/powerpc/kvm/book3s_hv.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index ce1cac765193..d386b6198bc7 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 7f3d35fddd173e52886d03bc34b5b5d6f5bea343 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 8 Feb 2012 14:34:38 +0100 Subject: KVM: x86 emulator: Fix task switch privilege checks Currently, all task switches check privileges against the DPL of the TSS. This is only correct for jmp/call to a TSS. If a task gate is used, the DPL of this take gate is used for the check instead. Exceptions, external interrupts and iret shouldn't perform any check. [avi: kill kvm-kmod remnants] Signed-off-by: Kevin Wolf Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 2 +- arch/x86/include/asm/kvm_host.h | 4 +-- arch/x86/kvm/emulate.c | 53 +++++++++++++++++++++++++++++++++----- arch/x86/kvm/svm.c | 5 +++- arch/x86/kvm/vmx.c | 8 +++--- arch/x86/kvm/x86.c | 6 ++--- 6 files changed, 61 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 7b9cfc4878af..df437b68f42b 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -388,7 +388,7 @@ bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt); #define EMULATION_INTERCEPTED 2 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); int emulator_task_switch(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, int reason, + u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code); int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq); #endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 74c9edf2bb18..e216ba066e79 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -768,8 +768,8 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); -int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, - bool has_error_code, u32 error_code); +int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, + int reason, bool has_error_code, u32 error_code); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 71450aca3b86..fa310a48591c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1152,6 +1152,22 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, return 1; } +static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt, + u16 index, struct desc_struct *desc) +{ + struct desc_ptr dt; + ulong addr; + + ctxt->ops->get_idt(ctxt, &dt); + + if (dt.size < index * 8 + 7) + return emulate_gp(ctxt, index << 3 | 0x2); + + addr = dt.address + index * 8; + return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, + &ctxt->exception); +} + static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, u16 selector, struct desc_ptr *dt) { @@ -2421,7 +2437,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, } static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, int reason, + u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code) { struct x86_emulate_ops *ops = ctxt->ops; @@ -2443,12 +2459,35 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, /* FIXME: check that next_tss_desc is tss */ - if (reason != TASK_SWITCH_IRET) { - if ((tss_selector & 3) > next_tss_desc.dpl || - ops->cpl(ctxt) > next_tss_desc.dpl) - return emulate_gp(ctxt, 0); + /* + * Check privileges. The three cases are task switch caused by... + * + * 1. jmp/call/int to task gate: Check against DPL of the task gate + * 2. Exception/IRQ/iret: No check is performed + * 3. jmp/call to TSS: Check agains DPL of the TSS + */ + if (reason == TASK_SWITCH_GATE) { + if (idt_index != -1) { + /* Software interrupts */ + struct desc_struct task_gate_desc; + int dpl; + + ret = read_interrupt_descriptor(ctxt, idt_index, + &task_gate_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + + dpl = task_gate_desc.dpl; + if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl) + return emulate_gp(ctxt, (idt_index << 3) | 0x2); + } + } else if (reason != TASK_SWITCH_IRET) { + int dpl = next_tss_desc.dpl; + if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl) + return emulate_gp(ctxt, tss_selector); } + desc_limit = desc_limit_scaled(&next_tss_desc); if (!next_tss_desc.p || ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || @@ -2501,7 +2540,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, } int emulator_task_switch(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, int reason, + u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code) { int rc; @@ -2509,7 +2548,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, ctxt->_eip = ctxt->eip; ctxt->dst.type = OP_NONE; - rc = emulator_do_task_switch(ctxt, tss_selector, reason, + rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason, has_error_code, error_code); if (rc == X86EMUL_CONTINUE) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 0b7690ee20bd..95cdeaf9c718 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2799,7 +2799,10 @@ static int task_switch_interception(struct vcpu_svm *svm) (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) skip_emulated_instruction(&svm->vcpu); - if (kvm_task_switch(&svm->vcpu, tss_selector, reason, + if (int_type != SVM_EXITINTINFO_TYPE_SOFT) + int_vec = -1; + + if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason, has_error_code, error_code) == EMULATE_FAIL) { svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR; svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d2bd719925a6..124a0952a040 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4658,9 +4658,10 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) bool has_error_code = false; u32 error_code = 0; u16 tss_selector; - int reason, type, idt_v; + int reason, type, idt_v, idt_index; idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); + idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -4698,8 +4699,9 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) type != INTR_TYPE_NMI_INTR)) skip_emulated_instruction(vcpu); - if (kvm_task_switch(vcpu, tss_selector, reason, - has_error_code, error_code) == EMULATE_FAIL) { + if (kvm_task_switch(vcpu, tss_selector, + type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason, + has_error_code, error_code) == EMULATE_FAIL) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ca74c1dadf3a..490a1b1a255f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5655,15 +5655,15 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, return 0; } -int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, - bool has_error_code, u32 error_code) +int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, + int reason, bool has_error_code, u32 error_code) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; int ret; init_emulate_ctxt(vcpu); - ret = emulator_task_switch(ctxt, tss_selector, reason, + ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason, has_error_code, error_code); if (ret) -- cgit v1.2.3 From 66b0ab8fac1031ffc70eb77491048339f2717a54 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 8 Feb 2012 14:34:39 +0100 Subject: KVM: x86 emulator: VM86 segments must have DPL 3 Setting the segment DPL to 0 for at least the VM86 code segment makes the VM entry fail on VMX. Signed-off-by: Kevin Wolf Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index fa310a48591c..b19e9fffe582 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1244,6 +1244,8 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, seg_desc.type = 3; seg_desc.p = 1; seg_desc.s = 1; + if (ctxt->mode == X86EMUL_MODE_VM86) + seg_desc.dpl = 3; goto load; } -- cgit v1.2.3 From ea5e97e8bf1d56a4d9461c39e082b9c31a7be4ff Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 8 Feb 2012 14:34:40 +0100 Subject: KVM: SVM: Fix CPL updates Keep CPL at 0 in real mode and at 3 in VM86. In protected/long mode, use RPL rather than DPL of the code segment. Signed-off-by: Kevin Wolf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 95cdeaf9c718..ab39d84dee00 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1332,6 +1332,21 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); } +static void svm_update_cpl(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + int cpl; + + if (!is_protmode(vcpu)) + cpl = 0; + else if (svm->vmcb->save.rflags & X86_EFLAGS_VM) + cpl = 3; + else + cpl = svm->vmcb->save.cs.selector & 0x3; + + svm->vmcb->save.cpl = cpl; +} + static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) { return to_svm(vcpu)->vmcb->save.rflags; @@ -1607,9 +1622,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; } if (seg == VCPU_SREG_CS) - svm->vmcb->save.cpl - = (svm->vmcb->save.cs.attrib - >> SVM_SELECTOR_DPL_SHIFT) & 3; + svm_update_cpl(vcpu); mark_dirty(svm->vmcb, VMCB_SEG); } -- cgit v1.2.3 From 4cee4798a304ee1ea579423ca048f16ceaccdfb5 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 8 Feb 2012 14:34:41 +0100 Subject: KVM: x86 emulator: Allow PM/VM86 switch during task switch Task switches can switch between Protected Mode and VM86. The current mode must be updated during the task switch emulation so that the new segment selectors are interpreted correctly. In order to let privilege checks succeed, rflags needs to be updated in the vcpu struct as this causes a CPL update. Signed-off-by: Kevin Wolf Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 20 ++++++++++++++++++++ arch/x86/kvm/svm.c | 4 ++++ arch/x86/kvm/x86.c | 6 ++++++ 4 files changed, 31 insertions(+) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index df437b68f42b..c222e1a1b12a 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -176,6 +176,7 @@ struct x86_emulate_ops { void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr); int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val); + void (*set_rflags)(struct x86_emulate_ctxt *ctxt, ulong val); int (*cpl)(struct x86_emulate_ctxt *ctxt); int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b19e9fffe582..83756223f8aa 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2344,6 +2344,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, return emulate_gp(ctxt, 0); ctxt->_eip = tss->eip; ctxt->eflags = tss->eflags | 2; + + /* General purpose registers */ ctxt->regs[VCPU_REGS_RAX] = tss->eax; ctxt->regs[VCPU_REGS_RCX] = tss->ecx; ctxt->regs[VCPU_REGS_RDX] = tss->edx; @@ -2365,6 +2367,24 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, set_segment_selector(ctxt, tss->fs, VCPU_SREG_FS); set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS); + /* + * If we're switching between Protected Mode and VM86, we need to make + * sure to update the mode before loading the segment descriptors so + * that the selectors are interpreted correctly. + * + * Need to get rflags to the vcpu struct immediately because it + * influences the CPL which is checked at least when loading the segment + * descriptors and when pushing an error code to the new kernel stack. + * + * TODO Introduce a separate ctxt->ops->set_cpl callback + */ + if (ctxt->eflags & X86_EFLAGS_VM) + ctxt->mode = X86EMUL_MODE_VM86; + else + ctxt->mode = X86EMUL_MODE_PROT32; + + ctxt->ops->set_rflags(ctxt, ctxt->eflags); + /* * Now load segment descriptors. If fault happenes at this stage * it is handled in a context of new task diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ab39d84dee00..53efd597f39e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1354,7 +1354,11 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { + unsigned long old_rflags = to_svm(vcpu)->vmcb->save.rflags; + to_svm(vcpu)->vmcb->save.rflags = rflags; + if ((old_rflags ^ rflags) & X86_EFLAGS_VM) + svm_update_cpl(vcpu); } static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 490a1b1a255f..03a1fd47a6d3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4129,6 +4129,11 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) return res; } +static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val) +{ + kvm_set_rflags(emul_to_vcpu(ctxt), val); +} + static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) { return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); @@ -4310,6 +4315,7 @@ static struct x86_emulate_ops emulate_ops = { .set_idt = emulator_set_idt, .get_cr = emulator_get_cr, .set_cr = emulator_set_cr, + .set_rflags = emulator_set_rflags, .cpl = emulator_get_cpl, .get_dr = emulator_get_dr, .set_dr = emulator_set_dr, -- cgit v1.2.3 From 3e515705a1f46beb1c942bb8043c16f8ac7b1e9e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 5 Mar 2012 14:23:29 +0200 Subject: KVM: Ensure all vcpus are consistent with in-kernel irqchip settings If some vcpus are created before KVM_CREATE_IRQCHIP, then irqchip_in_kernel() and vcpu->arch.apic will be inconsistent, leading to potential NULL pointer dereferences. Fix by: - ensuring that no vcpus are installed when KVM_CREATE_IRQCHIP is called - ensuring that a vcpu has an apic if it is installed after KVM_CREATE_IRQCHIP This is somewhat long winded because vcpu->arch.apic is created without kvm->lock held. Based on earlier patch by Michael Ellerman. Signed-off-by: Michael Ellerman Signed-off-by: Avi Kivity --- arch/ia64/kvm/kvm-ia64.c | 5 +++++ arch/x86/kvm/x86.c | 8 ++++++++ include/linux/kvm_host.h | 7 +++++++ virt/kvm/kvm_main.c | 4 ++++ 4 files changed, 24 insertions(+) diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index d8ddbba6fe7d..f5104b7c52cd 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1172,6 +1172,11 @@ out: #define PALE_RESET_ENTRY 0x80000000ffffffb0UL +bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) +{ + return irqchip_in_kernel(vcpu->kcm) == (vcpu->arch.apic != NULL); +} + int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { struct kvm_vcpu *v; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 03a1fd47a6d3..9477dc6cccae 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3199,6 +3199,9 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EEXIST; if (kvm->arch.vpic) goto create_irqchip_unlock; + r = -EINVAL; + if (atomic_read(&kvm->online_vcpus)) + goto create_irqchip_unlock; r = -ENOMEM; vpic = kvm_create_pic(kvm); if (vpic) { @@ -6107,6 +6110,11 @@ void kvm_arch_check_processor_compat(void *rtn) kvm_x86_ops->check_processor_compatibility(rtn); } +bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) +{ + return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); +} + int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { struct page *page; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 355e44555c39..e42d85ae8541 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -805,6 +805,13 @@ static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) { return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id; } + +bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu); + +#else + +static inline bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { return true; } + #endif #ifdef __KVM_HAVE_DEVICE_ASSIGNMENT diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e4431ada5947..94e148e38719 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1651,6 +1651,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) goto vcpu_destroy; mutex_lock(&kvm->lock); + if (!kvm_vcpu_compatible(vcpu)) { + r = -EINVAL; + goto unlock_vcpu_destroy; + } if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { r = -EINVAL; goto unlock_vcpu_destroy; -- cgit v1.2.3 From 07700a94b00a4fcbbfb07d1b72dc112a0e036735 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 28 Feb 2012 14:19:54 +0100 Subject: KVM: Allow host IRQ sharing for assigned PCI 2.3 devices PCI 2.3 allows to generically disable IRQ sources at device level. This enables us to share legacy IRQs of such devices with other host devices when passing them to a guest. The new IRQ sharing feature introduced here is optional, user space has to request it explicitly. Moreover, user space can inform us about its view of PCI_COMMAND_INTX_DISABLE so that we can avoid unmasking the interrupt and signaling it if the guest masked it via the virtualized PCI config space. Signed-off-by: Jan Kiszka Acked-by: Alex Williamson Acked-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- Documentation/virtual/kvm/api.txt | 41 ++++++++ arch/x86/kvm/x86.c | 1 + include/linux/kvm.h | 6 ++ include/linux/kvm_host.h | 2 + virt/kvm/assigned-dev.c | 209 ++++++++++++++++++++++++++++++++------ 5 files changed, 230 insertions(+), 29 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 59a38264a0ed..6386f8c0482e 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1169,6 +1169,14 @@ following flags are specified: /* Depends on KVM_CAP_IOMMU */ #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) +/* The following two depend on KVM_CAP_PCI_2_3 */ +#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) +#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) + +If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts +via the PCI-2.3-compliant device-level mask, thus enable IRQ sharing with other +assigned devices or host devices. KVM_DEV_ASSIGN_MASK_INTX specifies the +guest's view on the INTx mask, see KVM_ASSIGN_SET_INTX_MASK for details. The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure isolation of the device. Usages not specifying this flag are deprecated. @@ -1441,6 +1449,39 @@ The "num_dirty" field is a performance hint for KVM to determine whether it should skip processing the bitmap and just invalidate everything. It must be set to the number of set bits in the bitmap. +4.60 KVM_ASSIGN_SET_INTX_MASK + +Capability: KVM_CAP_PCI_2_3 +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_assigned_pci_dev (in) +Returns: 0 on success, -1 on error + +Allows userspace to mask PCI INTx interrupts from the assigned device. The +kernel will not deliver INTx interrupts to the guest between setting and +clearing of KVM_ASSIGN_SET_INTX_MASK via this interface. This enables use of +and emulation of PCI 2.3 INTx disable command register behavior. + +This may be used for both PCI 2.3 devices supporting INTx disable natively and +older devices lacking this support. Userspace is responsible for emulating the +read value of the INTx disable bit in the guest visible PCI command register. +When modifying the INTx disable state, userspace should precede updating the +physical device command register by calling this ioctl to inform the kernel of +the new intended INTx mask state. + +Note that the kernel uses the device INTx disable bit to internally manage the +device interrupt state for PCI 2.3 devices. Reads of this register may +therefore not match the expected value. Writes should always use the guest +intended INTx disable value rather than attempting to read-copy-update the +current physical device state. Races between user and kernel updates to the +INTx disable bit are handled lazily in the kernel. It's possible the device +may generate unintended interrupts, but they will not be injected into the +guest. + +See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified +by assigned_dev_id. In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is +evaluated. + 4.62 KVM_CREATE_SPAPR_TCE Capability: KVM_CAP_SPAPR_TCE diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9477dc6cccae..6866083a48c1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2143,6 +2143,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: case KVM_CAP_GET_TSC_KHZ: + case KVM_CAP_PCI_2_3: r = 1; break; case KVM_CAP_COALESCED_MMIO: diff --git a/include/linux/kvm.h b/include/linux/kvm.h index acbe42939089..6c322a90b92f 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -588,6 +588,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_TSC_DEADLINE_TIMER 72 #define KVM_CAP_S390_UCONTROL 73 #define KVM_CAP_SYNC_REGS 74 +#define KVM_CAP_PCI_2_3 75 #ifdef KVM_CAP_IRQ_ROUTING @@ -784,6 +785,9 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_TSC_CONTROL */ #define KVM_SET_TSC_KHZ _IO(KVMIO, 0xa2) #define KVM_GET_TSC_KHZ _IO(KVMIO, 0xa3) +/* Available with KVM_CAP_PCI_2_3 */ +#define KVM_ASSIGN_SET_INTX_MASK _IOW(KVMIO, 0xa4, \ + struct kvm_assigned_pci_dev) /* * ioctls for vcpu fds @@ -857,6 +861,8 @@ struct kvm_s390_ucas_mapping { #define KVM_SET_ONE_REG _IOW(KVMIO, 0xac, struct kvm_one_reg) #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) +#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) +#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) struct kvm_assigned_pci_dev { __u32 assigned_dev_id; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index e42d85ae8541..ec171c1d0878 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -546,6 +546,7 @@ struct kvm_assigned_dev_kernel { unsigned int entries_nr; int host_irq; bool host_irq_disabled; + bool pci_2_3; struct msix_entry *host_msix_entries; int guest_irq; struct msix_entry *guest_msix_entries; @@ -555,6 +556,7 @@ struct kvm_assigned_dev_kernel { struct pci_dev *dev; struct kvm *kvm; spinlock_t intx_lock; + struct mutex intx_mask_lock; char irq_name[32]; struct pci_saved_state *pci_saved_state; }; diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index ece80612b594..08e05715df72 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -55,22 +55,66 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel return index; } -static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id) +static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id) { struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + int ret; - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) { - spin_lock(&assigned_dev->intx_lock); + spin_lock(&assigned_dev->intx_lock); + if (pci_check_and_mask_intx(assigned_dev->dev)) { + assigned_dev->host_irq_disabled = true; + ret = IRQ_WAKE_THREAD; + } else + ret = IRQ_NONE; + spin_unlock(&assigned_dev->intx_lock); + + return ret; +} + +static void +kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev, + int vector) +{ + if (unlikely(assigned_dev->irq_requested_type & + KVM_DEV_IRQ_GUEST_INTX)) { + mutex_lock(&assigned_dev->intx_mask_lock); + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) + kvm_set_irq(assigned_dev->kvm, + assigned_dev->irq_source_id, vector, 1); + mutex_unlock(&assigned_dev->intx_mask_lock); + } else + kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, + vector, 1); +} + +static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) +{ + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { + spin_lock_irq(&assigned_dev->intx_lock); disable_irq_nosync(irq); assigned_dev->host_irq_disabled = true; - spin_unlock(&assigned_dev->intx_lock); + spin_unlock_irq(&assigned_dev->intx_lock); } - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - assigned_dev->guest_irq, 1); + kvm_assigned_dev_raise_guest_irq(assigned_dev, + assigned_dev->guest_irq); + + return IRQ_HANDLED; +} + +#ifdef __KVM_HAVE_MSI +static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id) +{ + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + + kvm_assigned_dev_raise_guest_irq(assigned_dev, + assigned_dev->guest_irq); return IRQ_HANDLED; } +#endif #ifdef __KVM_HAVE_MSIX static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) @@ -81,8 +125,7 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) if (index >= 0) { vector = assigned_dev->guest_msix_entries[index].vector; - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - vector, 1); + kvm_assigned_dev_raise_guest_irq(assigned_dev, vector); } return IRQ_HANDLED; @@ -98,15 +141,31 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0); - /* The guest irq may be shared so this ack may be - * from another device. - */ - spin_lock(&dev->intx_lock); - if (dev->host_irq_disabled) { - enable_irq(dev->host_irq); - dev->host_irq_disabled = false; + mutex_lock(&dev->intx_mask_lock); + + if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) { + bool reassert = false; + + spin_lock_irq(&dev->intx_lock); + /* + * The guest IRQ may be shared so this ack can come from an + * IRQ for another guest device. + */ + if (dev->host_irq_disabled) { + if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) + enable_irq(dev->host_irq); + else if (!pci_check_and_unmask_intx(dev->dev)) + reassert = true; + dev->host_irq_disabled = reassert; + } + spin_unlock_irq(&dev->intx_lock); + + if (reassert) + kvm_set_irq(dev->kvm, dev->irq_source_id, + dev->guest_irq, 1); } - spin_unlock(&dev->intx_lock); + + mutex_unlock(&dev->intx_mask_lock); } static void deassign_guest_irq(struct kvm *kvm, @@ -154,7 +213,15 @@ static void deassign_host_irq(struct kvm *kvm, pci_disable_msix(assigned_dev->dev); } else { /* Deal with MSI and INTx */ - disable_irq(assigned_dev->host_irq); + if ((assigned_dev->irq_requested_type & + KVM_DEV_IRQ_HOST_INTX) && + (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { + spin_lock_irq(&assigned_dev->intx_lock); + pci_intx(assigned_dev->dev, false); + spin_unlock_irq(&assigned_dev->intx_lock); + synchronize_irq(assigned_dev->host_irq); + } else + disable_irq(assigned_dev->host_irq); free_irq(assigned_dev->host_irq, assigned_dev); @@ -235,15 +302,34 @@ void kvm_free_all_assigned_devices(struct kvm *kvm) static int assigned_device_enable_host_intx(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev) { + irq_handler_t irq_handler; + unsigned long flags; + dev->host_irq = dev->dev->irq; - /* Even though this is PCI, we don't want to use shared - * interrupts. Sharing host devices with guest-assigned devices - * on the same interrupt line is not a happy situation: there - * are going to be long delays in accepting, acking, etc. + + /* + * We can only share the IRQ line with other host devices if we are + * able to disable the IRQ source at device-level - independently of + * the guest driver. Otherwise host devices may suffer from unbounded + * IRQ latencies when the guest keeps the line asserted. */ - if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, - IRQF_ONESHOT, dev->irq_name, dev)) + if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { + irq_handler = kvm_assigned_dev_intx; + flags = IRQF_SHARED; + } else { + irq_handler = NULL; + flags = IRQF_ONESHOT; + } + if (request_threaded_irq(dev->host_irq, irq_handler, + kvm_assigned_dev_thread_intx, flags, + dev->irq_name, dev)) return -EIO; + + if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { + spin_lock_irq(&dev->intx_lock); + pci_intx(dev->dev, true); + spin_unlock_irq(&dev->intx_lock); + } return 0; } @@ -260,8 +346,9 @@ static int assigned_device_enable_host_msi(struct kvm *kvm, } dev->host_irq = dev->dev->irq; - if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, - 0, dev->irq_name, dev)) { + if (request_threaded_irq(dev->host_irq, NULL, + kvm_assigned_dev_thread_msi, 0, + dev->irq_name, dev)) { pci_disable_msi(dev->dev); return -EIO; } @@ -319,7 +406,6 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm, { dev->guest_irq = irq->guest_irq; dev->ack_notifier.gsi = -1; - dev->host_irq_disabled = false; return 0; } #endif @@ -331,7 +417,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm, { dev->guest_irq = irq->guest_irq; dev->ack_notifier.gsi = -1; - dev->host_irq_disabled = false; return 0; } #endif @@ -365,6 +450,7 @@ static int assign_host_irq(struct kvm *kvm, default: r = -EINVAL; } + dev->host_irq_disabled = false; if (!r) dev->irq_requested_type |= host_irq_type; @@ -466,6 +552,7 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, { int r = -ENODEV; struct kvm_assigned_dev_kernel *match; + unsigned long irq_type; mutex_lock(&kvm->lock); @@ -474,7 +561,9 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, if (!match) goto out; - r = kvm_deassign_irq(kvm, match, assigned_irq->flags); + irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK | + KVM_DEV_IRQ_GUEST_MASK); + r = kvm_deassign_irq(kvm, match, irq_type); out: mutex_unlock(&kvm->lock); return r; @@ -607,6 +696,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, if (!match->pci_saved_state) printk(KERN_DEBUG "%s: Couldn't store %s saved state\n", __func__, dev_name(&dev->dev)); + + if (!pci_intx_mask_supported(dev)) + assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3; + match->assigned_dev_id = assigned_dev->assigned_dev_id; match->host_segnr = assigned_dev->segnr; match->host_busnr = assigned_dev->busnr; @@ -614,6 +707,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, match->flags = assigned_dev->flags; match->dev = dev; spin_lock_init(&match->intx_lock); + mutex_init(&match->intx_mask_lock); match->irq_source_id = -1; match->kvm = kvm; match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; @@ -759,6 +853,55 @@ msix_entry_out: } #endif +static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm, + struct kvm_assigned_pci_dev *assigned_dev) +{ + int r = 0; + struct kvm_assigned_dev_kernel *match; + + mutex_lock(&kvm->lock); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_dev->assigned_dev_id); + if (!match) { + r = -ENODEV; + goto out; + } + + mutex_lock(&match->intx_mask_lock); + + match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX; + match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX; + + if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { + if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) { + kvm_set_irq(match->kvm, match->irq_source_id, + match->guest_irq, 0); + /* + * Masking at hardware-level is performed on demand, + * i.e. when an IRQ actually arrives at the host. + */ + } else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { + /* + * Unmask the IRQ line if required. Unmasking at + * device level will be performed by user space. + */ + spin_lock_irq(&match->intx_lock); + if (match->host_irq_disabled) { + enable_irq(match->host_irq); + match->host_irq_disabled = false; + } + spin_unlock_irq(&match->intx_lock); + } + } + + mutex_unlock(&match->intx_mask_lock); + +out: + mutex_unlock(&kvm->lock); + return r; +} + long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, unsigned long arg) { @@ -866,6 +1009,15 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, break; } #endif + case KVM_ASSIGN_SET_INTX_MASK: { + struct kvm_assigned_pci_dev assigned_dev; + + r = -EFAULT; + if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) + goto out; + r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev); + break; + } default: r = -ENOTTY; break; @@ -873,4 +1025,3 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, out: return r; } - -- cgit v1.2.3 From bec87d6e34de459a559fc326aa4e56bee4ba7423 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Sun, 4 Mar 2012 13:13:38 +0800 Subject: KVM: use correct tlbs dirty type in cmpxchg Using 'int' type is not suitable for a 'long' object. So, correct it. Signed-off-by: Alex Shi Signed-off-by: Avi Kivity --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 94e148e38719..42b73930a6de 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -203,7 +203,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) void kvm_flush_remote_tlbs(struct kvm *kvm) { - int dirty_count = kvm->tlbs_dirty; + long dirty_count = kvm->tlbs_dirty; smp_mb(); if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) -- cgit v1.2.3 From 9ee73970c03edb68146ceb1ba2a7033c99a5e017 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 6 Mar 2012 14:16:33 +0200 Subject: KVM: VMX: Fix delayed load of shared MSRs Shared MSRs (MSR_*STAR and related) are stored in both vmx->guest_msrs and in the CPU registers, but vmx_set_msr() only updated memory. Prior to 46199f33c2953, this didn't matter, since we called vmx_load_host_state(), which scheduled a vmx_save_host_state(), which re-synchronized the CPU state, but now we don't, so the CPU state will not be synchronized until the next exit to host userspace. This mostly affects nested vmx workloads, which play with these MSRs a lot. Fix by loading the MSR eagerly. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 124a0952a040..4a722a0b8e13 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2210,6 +2210,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) msr = find_msr_entry(vmx, msr_index); if (msr) { msr->data = data; + if (msr - vmx->guest_msrs < vmx->save_nmsrs) + kvm_set_shared_msr(msr->index, msr->data, + msr->mask); break; } ret = kvm_set_msr_common(vcpu, msr_index, data); -- cgit v1.2.3 From a7b9d2ccc3d86303ee9314612d301966e04011c7 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 26 Feb 2012 16:55:40 +0200 Subject: KVM: PMU: warn when pin control is set in eventsel msr Print warning once if pin control bit is set in eventsel msr since emulation does not support it yet. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/perf_event.h | 1 + arch/x86/kvm/pmu.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 096c975e099f..f1f71823f682 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -23,6 +23,7 @@ #define ARCH_PERFMON_EVENTSEL_USR (1ULL << 16) #define ARCH_PERFMON_EVENTSEL_OS (1ULL << 17) #define ARCH_PERFMON_EVENTSEL_EDGE (1ULL << 18) +#define ARCH_PERFMON_EVENTSEL_PIN_CONTROL (1ULL << 19) #define ARCH_PERFMON_EVENTSEL_INT (1ULL << 20) #define ARCH_PERFMON_EVENTSEL_ANY (1ULL << 21) #define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 3e48c1d3edcd..6af9a542e541 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -210,6 +210,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) unsigned config, type = PERF_TYPE_RAW; u8 event_select, unit_mask; + if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) + printk_once("kvm pmu: pin control bit is ignored\n"); + pmc->eventsel = eventsel; stop_counter(pmc); -- cgit v1.2.3 From fac3368310765ade6bbdf07c9acdb04210e8b5b0 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 26 Feb 2012 16:55:41 +0200 Subject: KVM: PMU: Fix raw event check If eventsel has EDGE, INV or CMASK set we should create raw counter for it, but the check is done on a wrong variable. Fix it. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 6af9a542e541..b52a8ed283b2 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -223,7 +223,7 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT; unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; - if (!(event_select & (ARCH_PERFMON_EVENTSEL_EDGE | + if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | ARCH_PERFMON_EVENTSEL_INV | ARCH_PERFMON_EVENTSEL_CMASK))) { config = find_arch_event(&pmc->vcpu->arch.pmu, event_select, -- cgit v1.2.3 From 62079d8a431287a4da81db64e002c71f0e06ca83 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 26 Feb 2012 16:55:42 +0200 Subject: KVM: PMU: add proper support for fixed counter 2 Currently pmu emulation emulates fixed counter 2 as bus cycles architectural counter, but since commit 9c1497ea591b25d perf has pseudo encoding for it. Use it. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/pmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index b52a8ed283b2..a73f0c104813 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -33,10 +33,11 @@ static struct kvm_arch_event_perf_mapping { [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, + [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES }, }; /* mapping between fixed pmc index and arch_events array */ -int fixed_pmc_events[] = {1, 0, 2}; +int fixed_pmc_events[] = {1, 0, 7}; static bool pmc_is_gp(struct kvm_pmc *pmc) { -- cgit v1.2.3 From 4d6931c380a976753f7566a96b58690010ef1413 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 5 Mar 2012 16:53:06 +0100 Subject: KVM: MMU: make use of ->root_level in reset_rsvds_bits_mask The reset_rsvds_bits_mask() function can use the guest walker's root level number instead of using a separate 'level' variable. Signed-off-by: Davidlohr Bueso Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ff053ca32303..4cb164268846 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3185,15 +3185,14 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, #undef PTTYPE static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, - struct kvm_mmu *context, - int level) + struct kvm_mmu *context) { int maxphyaddr = cpuid_maxphyaddr(vcpu); u64 exb_bit_rsvd = 0; if (!context->nx) exb_bit_rsvd = rsvd_bits(63, 63); - switch (level) { + switch (context->root_level) { case PT32_ROOT_LEVEL: /* no rsvd bits for 2 level 4K page table entries */ context->rsvd_bits_mask[0][1] = 0; @@ -3251,8 +3250,9 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) { context->nx = is_nx(vcpu); + context->root_level = level; - reset_rsvds_bits_mask(vcpu, context, level); + reset_rsvds_bits_mask(vcpu, context); ASSERT(is_pae(vcpu)); context->new_cr3 = paging_new_cr3; @@ -3262,7 +3262,6 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, context->invlpg = paging64_invlpg; context->update_pte = paging64_update_pte; context->free = paging_free; - context->root_level = level; context->shadow_root_level = level; context->root_hpa = INVALID_PAGE; context->direct_map = false; @@ -3279,8 +3278,9 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { context->nx = false; + context->root_level = PT32_ROOT_LEVEL; - reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); + reset_rsvds_bits_mask(vcpu, context); context->new_cr3 = paging_new_cr3; context->page_fault = paging32_page_fault; @@ -3289,7 +3289,6 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, context->sync_page = paging32_sync_page; context->invlpg = paging32_invlpg; context->update_pte = paging32_update_pte; - context->root_level = PT32_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; context->direct_map = false; @@ -3327,19 +3326,19 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->root_level = 0; } else if (is_long_mode(vcpu)) { context->nx = is_nx(vcpu); - reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL); - context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT64_ROOT_LEVEL; + reset_rsvds_bits_mask(vcpu, context); + context->gva_to_gpa = paging64_gva_to_gpa; } else if (is_pae(vcpu)) { context->nx = is_nx(vcpu); - reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL); - context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT32E_ROOT_LEVEL; + reset_rsvds_bits_mask(vcpu, context); + context->gva_to_gpa = paging64_gva_to_gpa; } else { context->nx = false; - reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); - context->gva_to_gpa = paging32_gva_to_gpa; context->root_level = PT32_ROOT_LEVEL; + reset_rsvds_bits_mask(vcpu, context); + context->gva_to_gpa = paging32_gva_to_gpa; } return 0; @@ -3402,18 +3401,18 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; } else if (is_long_mode(vcpu)) { g_context->nx = is_nx(vcpu); - reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL); g_context->root_level = PT64_ROOT_LEVEL; + reset_rsvds_bits_mask(vcpu, g_context); g_context->gva_to_gpa = paging64_gva_to_gpa_nested; } else if (is_pae(vcpu)) { g_context->nx = is_nx(vcpu); - reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL); g_context->root_level = PT32E_ROOT_LEVEL; + reset_rsvds_bits_mask(vcpu, g_context); g_context->gva_to_gpa = paging64_gva_to_gpa_nested; } else { g_context->nx = false; - reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL); g_context->root_level = PT32_ROOT_LEVEL; + reset_rsvds_bits_mask(vcpu, g_context); g_context->gva_to_gpa = paging32_gva_to_gpa_nested; } -- cgit v1.2.3 From a223c313cb13e9ab71051fc5b70610a2829a4082 Mon Sep 17 00:00:00 2001 From: Nicolae Mogoreanu Date: Tue, 21 Feb 2012 13:44:21 -0800 Subject: KVM: Ignore the writes to MSR_K7_HWCR(3) When CPUID Fn8000_0001_EAX reports 0x00100f22 Windows 7 x64 guest tries to set bit 3 in MSRC001_0015 in nt!KiDisableCacheErrataSource and fails. This patch will ignore this step and allow things to move on without having to fake CPUID value. Signed-off-by: Nicolae Mogoreanu Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6866083a48c1..32096cf6c6c9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1547,6 +1547,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_K7_HWCR: data &= ~(u64)0x40; /* ignore flush filter disable */ data &= ~(u64)0x100; /* ignore ignne emulation enable */ + data &= ~(u64)0x8; /* ignore TLB cache disable */ if (data != 0) { pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", data); -- cgit v1.2.3 From 9587190107d0c0cbaccbf7bf6b0245d29095a9ae Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Tue, 6 Mar 2012 16:39:22 +0200 Subject: KVM: nVMX: Fix erroneous exception bitmap check The code which checks whether to inject a pagefault to L1 or L2 (in nested VMX) was wrong, incorrect in how it checked the PF_VECTOR bit. Thanks to Dan Carpenter for spotting this. Signed-off-by: Nadav Har'El Reported-by: Dan Carpenter Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4a722a0b8e13..2c22fc788da2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1664,7 +1664,7 @@ static int nested_pf_handled(struct kvm_vcpu *vcpu) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ - if (!(vmcs12->exception_bitmap & PF_VECTOR)) + if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR))) return 0; nested_vmx_vmexit(vcpu); -- cgit v1.2.3 From b74f05d61b73af584d0c39121980171389ecfaaa Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 13 Feb 2012 11:07:27 -0200 Subject: x86: kvmclock: abstract save/restore sched_clock_state Upon resume from hibernation, CPU 0's hvclock area contains the old values for system_time and tsc_timestamp. It is necessary for the hypervisor to update these values with uptodate ones before the CPU uses them. Abstract TSC's save/restore sched_clock_state functions and use restore_state to write to KVM_SYSTEM_TIME MSR, forcing an update. Also move restore_sched_clock_state before __restore_processor_state, since the later calls CONFIG_LOCK_STAT's lockstat_clock (also for TSC). Thanks to Igor Mammedov for tracking it down. Fixes suspend-to-disk with kvmclock. Reviewed-by: Thomas Gleixner Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/tsc.h | 4 ++-- arch/x86/include/asm/x86_init.h | 4 ++++ arch/x86/kernel/kvmclock.c | 11 +++++++++++ arch/x86/kernel/tsc.c | 4 ++-- arch/x86/kernel/x86_init.c | 4 +++- arch/x86/power/cpu.c | 4 ++-- 6 files changed, 24 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 15d99153a96d..c91e8b9d588b 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -61,7 +61,7 @@ extern void check_tsc_sync_source(int cpu); extern void check_tsc_sync_target(void); extern int notsc_setup(char *); -extern void save_sched_clock_state(void); -extern void restore_sched_clock_state(void); +extern void tsc_save_sched_clock_state(void); +extern void tsc_restore_sched_clock_state(void); #endif /* _ASM_X86_TSC_H */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 5d0afac2962c..baaca8defec8 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -162,6 +162,8 @@ struct x86_cpuinit_ops { * @is_untracked_pat_range exclude from PAT logic * @nmi_init enable NMI on cpus * @i8042_detect pre-detect if i8042 controller exists + * @save_sched_clock_state: save state for sched_clock() on suspend + * @restore_sched_clock_state: restore state for sched_clock() on resume */ struct x86_platform_ops { unsigned long (*calibrate_tsc)(void); @@ -173,6 +175,8 @@ struct x86_platform_ops { void (*nmi_init)(void); unsigned char (*get_nmi_reason)(void); int (*i8042_detect)(void); + void (*save_sched_clock_state)(void); + void (*restore_sched_clock_state)(void); }; struct pci_dev; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index ca4e735adc54..f8492da65bfc 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -136,6 +136,15 @@ int kvm_register_clock(char *txt) return ret; } +static void kvm_save_sched_clock_state(void) +{ +} + +static void kvm_restore_sched_clock_state(void) +{ + kvm_register_clock("primary cpu clock, resume"); +} + #ifdef CONFIG_X86_LOCAL_APIC static void __cpuinit kvm_setup_secondary_clock(void) { @@ -195,6 +204,8 @@ void __init kvmclock_init(void) x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock; #endif + x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; + x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; machine_ops.shutdown = kvm_shutdown; #ifdef CONFIG_KEXEC machine_ops.crash_shutdown = kvm_crash_shutdown; diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index a62c201c97ec..aed2aa1088f1 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -629,7 +629,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) static unsigned long long cyc2ns_suspend; -void save_sched_clock_state(void) +void tsc_save_sched_clock_state(void) { if (!sched_clock_stable) return; @@ -645,7 +645,7 @@ void save_sched_clock_state(void) * that sched_clock() continues from the point where it was left off during * suspend. */ -void restore_sched_clock_state(void) +void tsc_restore_sched_clock_state(void) { unsigned long long offset; unsigned long flags; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 6f2ec53deed0..e9f265fd79ae 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -108,7 +108,9 @@ struct x86_platform_ops x86_platform = { .is_untracked_pat_range = is_ISA_range, .nmi_init = default_nmi_init, .get_nmi_reason = default_get_nmi_reason, - .i8042_detect = default_i8042_detect + .i8042_detect = default_i8042_detect, + .save_sched_clock_state = tsc_save_sched_clock_state, + .restore_sched_clock_state = tsc_restore_sched_clock_state, }; EXPORT_SYMBOL_GPL(x86_platform); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index f10c0afa1cb4..0e76a2814127 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -114,7 +114,7 @@ static void __save_processor_state(struct saved_context *ctxt) void save_processor_state(void) { __save_processor_state(&saved_context); - save_sched_clock_state(); + x86_platform.save_sched_clock_state(); } #ifdef CONFIG_X86_32 EXPORT_SYMBOL(save_processor_state); @@ -230,8 +230,8 @@ static void __restore_processor_state(struct saved_context *ctxt) /* Needed by apm.c */ void restore_processor_state(void) { + x86_platform.restore_sched_clock_state(); __restore_processor_state(&saved_context); - restore_sched_clock_state(); } #ifdef CONFIG_X86_32 EXPORT_SYMBOL(restore_processor_state); -- cgit v1.2.3 From 02626b6af5d2bc62db3bb85fc2891b2725535d44 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 8 Mar 2012 18:46:57 -0300 Subject: KVM: x86: fix kvm_write_tsc() TSC matching thinko kvm_write_tsc() converts from guest TSC to microseconds, not nanoseconds as intended. The result is that the window for matching is 1000 seconds, not 1 second. Microsecond precision is enough for checking whether the TSC write delta is within the heuristic values, so use it instead of nanoseconds. Noted by Avi Kivity. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 32096cf6c6c9..7287812eeb72 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1025,7 +1025,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; - s64 nsdiff; + s64 usdiff; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); @@ -1033,18 +1033,19 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) elapsed = ns - kvm->arch.last_tsc_nsec; /* n.b - signed multiplication and division required */ - nsdiff = data - kvm->arch.last_tsc_write; + usdiff = data - kvm->arch.last_tsc_write; #ifdef CONFIG_X86_64 - nsdiff = (nsdiff * 1000) / vcpu->arch.virtual_tsc_khz; + usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; #else /* do_div() only does unsigned */ asm("idivl %2; xor %%edx, %%edx" - : "=A"(nsdiff) - : "A"(nsdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz)); + : "=A"(usdiff) + : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz)); #endif - nsdiff -= elapsed; - if (nsdiff < 0) - nsdiff = -nsdiff; + do_div(elapsed, 1000); + usdiff -= elapsed; + if (usdiff < 0) + usdiff = -usdiff; /* * Special case: TSC write with a small delta (1 second) of virtual @@ -1056,7 +1057,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) * compensation code attempt to catch up if we fall behind, but * it's better to try to match offsets from the beginning. */ - if (nsdiff < NSEC_PER_SEC && + if (usdiff < USEC_PER_SEC && vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { if (!check_tsc_unstable()) { offset = kvm->arch.cur_tsc_offset; -- cgit v1.2.3 From cf9eeac46350b8b43730b7dc5e999757bed089a4 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 14 Mar 2012 11:02:11 +0100 Subject: KVM: Convert intx_mask_lock to spin lock As kvm_notify_acked_irq calls kvm_assigned_dev_ack_irq under rcu_read_lock, we cannot use a mutex in the latter function. Switch to a spin lock to address this. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 2 +- virt/kvm/assigned-dev.c | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ec171c1d0878..40bb1c661a6e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -556,7 +556,7 @@ struct kvm_assigned_dev_kernel { struct pci_dev *dev; struct kvm *kvm; spinlock_t intx_lock; - struct mutex intx_mask_lock; + spinlock_t intx_mask_lock; char irq_name[32]; struct pci_saved_state *pci_saved_state; }; diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index 08e05715df72..01f572c10c71 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -77,11 +77,11 @@ kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev, { if (unlikely(assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX)) { - mutex_lock(&assigned_dev->intx_mask_lock); + spin_lock(&assigned_dev->intx_mask_lock); if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, vector, 1); - mutex_unlock(&assigned_dev->intx_mask_lock); + spin_unlock(&assigned_dev->intx_mask_lock); } else kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, vector, 1); @@ -141,7 +141,7 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0); - mutex_lock(&dev->intx_mask_lock); + spin_lock(&dev->intx_mask_lock); if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) { bool reassert = false; @@ -165,7 +165,7 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) dev->guest_irq, 1); } - mutex_unlock(&dev->intx_mask_lock); + spin_unlock(&dev->intx_mask_lock); } static void deassign_guest_irq(struct kvm *kvm, @@ -707,7 +707,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, match->flags = assigned_dev->flags; match->dev = dev; spin_lock_init(&match->intx_lock); - mutex_init(&match->intx_mask_lock); + spin_lock_init(&match->intx_mask_lock); match->irq_source_id = -1; match->kvm = kvm; match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; @@ -868,7 +868,7 @@ static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm, goto out; } - mutex_lock(&match->intx_mask_lock); + spin_lock(&match->intx_mask_lock); match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX; match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX; @@ -895,7 +895,7 @@ static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm, } } - mutex_unlock(&match->intx_mask_lock); + spin_unlock(&match->intx_mask_lock); out: mutex_unlock(&kvm->lock); -- cgit v1.2.3