From f0b85051d0f0891378a83db22c0786f1d756fbff Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 25 Nov 2008 20:17:01 +0100 Subject: KVM: SVM: Clean up VINTR setting The current VINTR intercept setters don't look clean to me. To make the code easier to read and enable the possibilty to trap on a VINTR set, this uses a helper function to set the VINTR intercept. v2 uses two distinct functions for setting and clearing the bit Acked-by: Joerg Roedel Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a9e769e4e251..33407d95761f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -718,6 +718,16 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) to_svm(vcpu)->vmcb->save.rflags = rflags; } +static void svm_set_vintr(struct vcpu_svm *svm) +{ + svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR; +} + +static void svm_clear_vintr(struct vcpu_svm *svm) +{ + svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); +} + static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) { struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; @@ -1380,7 +1390,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm, { KVMTRACE_0D(PEND_INTR, &svm->vcpu, handler); - svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); + svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; /* * If the user space waits to inject interrupts, exit as soon as @@ -1593,7 +1603,7 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu) (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || (vmcb->control.event_inj & SVM_EVTINJ_VALID)) { /* unable to deliver irq, set pending irq */ - vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR); + svm_set_vintr(svm); svm_inject_irq(svm, 0x0); goto out; } @@ -1652,9 +1662,9 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, */ if (!svm->vcpu.arch.interrupt_window_open && (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window)) - control->intercept |= 1ULL << INTERCEPT_VINTR; - else - control->intercept &= ~(1ULL << INTERCEPT_VINTR); + svm_set_vintr(svm); + else + svm_clear_vintr(svm); } static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) -- cgit v1.2.3 From 9962d032bbff0268f22068787831405f8468c8b4 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 25 Nov 2008 20:17:02 +0100 Subject: KVM: SVM: Move EFER and MSR constants to generic x86 code MSR_EFER_SVME_MASK, MSR_VM_CR and MSR_VM_HSAVE_PA are set in KVM specific headers. Linux does have nice header files to collect EFER bits and MSR IDs, so IMHO we should put them there. While at it, I also changed the naming scheme to match that of the other defines. (introduced in v6) Acked-by: Joerg Roedel Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/asm/msr-index.h | 7 +++++++ arch/x86/include/asm/svm.h | 4 ---- arch/x86/include/asm/virtext.h | 2 +- arch/x86/kvm/svm.c | 6 +++--- 5 files changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 730843d1d2fb..2998efe89278 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -22,6 +22,7 @@ #include #include #include +#include #define KVM_MAX_VCPUS 16 #define KVM_MEMORY_SLOTS 32 diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 358acc59ae04..46e9646e7a66 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -18,11 +18,13 @@ #define _EFER_LME 8 /* Long mode enable */ #define _EFER_LMA 10 /* Long mode active (read-only) */ #define _EFER_NX 11 /* No execute enable */ +#define _EFER_SVME 12 /* Enable virtualization */ #define EFER_SCE (1<<_EFER_SCE) #define EFER_LME (1<<_EFER_LME) #define EFER_LMA (1<<_EFER_LMA) #define EFER_NX (1<<_EFER_NX) +#define EFER_SVME (1<<_EFER_SVME) /* Intel MSRs. Some also available on other CPUs */ #define MSR_IA32_PERFCTR0 0x000000c1 @@ -360,4 +362,9 @@ #define MSR_IA32_VMX_PROCBASED_CTLS2 0x0000048b #define MSR_IA32_VMX_EPT_VPID_CAP 0x0000048c +/* AMD-V MSRs */ + +#define MSR_VM_CR 0xc0010114 +#define MSR_VM_HSAVE_PA 0xc0010117 + #endif /* _ASM_X86_MSR_INDEX_H */ diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 1b8afa78e869..82ada75f3ebf 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -174,10 +174,6 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_CPUID_FEATURE_SHIFT 2 #define SVM_CPUID_FUNC 0x8000000a -#define MSR_EFER_SVME_MASK (1ULL << 12) -#define MSR_VM_CR 0xc0010114 -#define MSR_VM_HSAVE_PA 0xc0010117ULL - #define SVM_VM_CR_SVM_DISABLE 4 #define SVM_SELECTOR_S_SHIFT 4 diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index 593636275238..e0f9aa16358b 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -118,7 +118,7 @@ static inline void cpu_svm_disable(void) wrmsrl(MSR_VM_HSAVE_PA, 0); rdmsrl(MSR_EFER, efer); - wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); + wrmsrl(MSR_EFER, efer & ~EFER_SVME); } /** Makes sure SVM is disabled, if it is supported on the CPU diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 33407d95761f..e4eb3fd91b90 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -198,7 +198,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) if (!npt_enabled && !(efer & EFER_LMA)) efer &= ~EFER_LME; - to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; + to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; vcpu->arch.shadow_efer = efer; } @@ -292,7 +292,7 @@ static void svm_hardware_enable(void *garbage) svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); rdmsrl(MSR_EFER, efer); - wrmsrl(MSR_EFER, efer | MSR_EFER_SVME_MASK); + wrmsrl(MSR_EFER, efer | EFER_SVME); wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(svm_data->save_area) << PAGE_SHIFT); @@ -559,7 +559,7 @@ static void init_vmcb(struct vcpu_svm *svm) init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - save->efer = MSR_EFER_SVME_MASK; + save->efer = EFER_SVME; save->dr6 = 0xffff0ff0; save->dr7 = 0x400; save->rflags = 2; -- cgit v1.2.3 From c0725420cfdcf6dd9705b164a8c6cba86684080d Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 25 Nov 2008 20:17:03 +0100 Subject: KVM: SVM: Add helper functions for nested SVM These are helpers for the nested SVM implementation. - nsvm_printk implements a debug printk variant - nested_svm_do calls a handler that can accesses gpa-based memory v3 makes use of the new permission checker v6 changes: - streamline nsvm_debug() - remove printk(KERN_ERR) - SVME check before CPL check - give GP error code - use new EFER constant Acked-by: Joerg Roedel Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e4eb3fd91b90..87debdcd1b90 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -50,6 +50,15 @@ MODULE_LICENSE("GPL"); #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) +/* Turn on to get debugging output*/ +/* #define NESTED_DEBUG */ + +#ifdef NESTED_DEBUG +#define nsvm_printk(fmt, args...) printk(KERN_INFO fmt, ## args) +#else +#define nsvm_printk(fmt, args...) do {} while(0) +#endif + /* enable NPT for AMD64 and X86 with PAE */ #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) static bool npt_enabled = true; @@ -1149,6 +1158,82 @@ static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } +static int nested_svm_check_permissions(struct vcpu_svm *svm) +{ + if (!(svm->vcpu.arch.shadow_efer & EFER_SVME) + || !is_paging(&svm->vcpu)) { + kvm_queue_exception(&svm->vcpu, UD_VECTOR); + return 1; + } + + if (svm->vmcb->save.cpl) { + kvm_inject_gp(&svm->vcpu, 0); + return 1; + } + + return 0; +} + +static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa) +{ + struct page *page; + + down_read(¤t->mm->mmap_sem); + page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); + up_read(¤t->mm->mmap_sem); + + if (is_error_page(page)) { + printk(KERN_INFO "%s: could not find page at 0x%llx\n", + __func__, gpa); + kvm_release_page_clean(page); + kvm_inject_gp(&svm->vcpu, 0); + return NULL; + } + return page; +} + +static int nested_svm_do(struct vcpu_svm *svm, + u64 arg1_gpa, u64 arg2_gpa, void *opaque, + int (*handler)(struct vcpu_svm *svm, + void *arg1, + void *arg2, + void *opaque)) +{ + struct page *arg1_page; + struct page *arg2_page = NULL; + void *arg1; + void *arg2 = NULL; + int retval; + + arg1_page = nested_svm_get_page(svm, arg1_gpa); + if(arg1_page == NULL) + return 1; + + if (arg2_gpa) { + arg2_page = nested_svm_get_page(svm, arg2_gpa); + if(arg2_page == NULL) { + kvm_release_page_clean(arg1_page); + return 1; + } + } + + arg1 = kmap_atomic(arg1_page, KM_USER0); + if (arg2_gpa) + arg2 = kmap_atomic(arg2_page, KM_USER1); + + retval = handler(svm, arg1, arg2, opaque); + + kunmap_atomic(arg1, KM_USER0); + if (arg2_gpa) + kunmap_atomic(arg2, KM_USER1); + + kvm_release_page_dirty(arg1_page); + if (arg2_gpa) + kvm_release_page_dirty(arg2_page); + + return retval; +} + static int invalid_op_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { -- cgit v1.2.3 From 1371d90460189d02bf1bcca19dbfe6bd10dc6031 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 25 Nov 2008 20:17:04 +0100 Subject: KVM: SVM: Implement GIF, clgi and stgi This patch implements the GIF flag and the clgi and stgi instructions that set this flag. Only if the flag is set (default), interrupts can be received by the CPU. To keep the information about that somewhere, this patch adds a new hidden flags vector. that is used to store information that does not go into the vmcb, but is SVM specific. I tried to write some code to make -no-kvm-irqchip work too, but the first level guest won't even boot with that atm, so I ditched it. v2 moves the hflags to x86 generic code v3 makes use of the new permission helper v6 only enables interrupt_window if GIF=1 Acked-by: Joerg Roedel Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/svm.c | 47 ++++++++++++++++++++++++++++++++++++----- 2 files changed, 45 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2998efe89278..29e4157732db 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -259,6 +259,7 @@ struct kvm_vcpu_arch { unsigned long cr3; unsigned long cr4; unsigned long cr8; + u32 hflags; u64 pdptrs[4]; /* pae */ u64 shadow_efer; u64 apic_base; @@ -738,6 +739,8 @@ enum { TASK_SWITCH_GATE = 3, }; +#define HF_GIF_MASK (1 << 0) + /* * Hardware virtualization extension instructions may fault if a * reboot turns off virtualization while processes are running. diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 87debdcd1b90..79cc06bfe57c 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -251,7 +251,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) kvm_rip_write(vcpu, svm->next_rip); svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; - vcpu->arch.interrupt_window_open = 1; + vcpu->arch.interrupt_window_open = (svm->vcpu.arch.hflags & HF_GIF_MASK); } static int has_svm(void) @@ -600,6 +600,8 @@ static void init_vmcb(struct vcpu_svm *svm) save->cr4 = 0; } force_new_asid(&svm->vcpu); + + svm->vcpu.arch.hflags = HF_GIF_MASK; } static int svm_vcpu_reset(struct kvm_vcpu *vcpu) @@ -1234,6 +1236,36 @@ static int nested_svm_do(struct vcpu_svm *svm, return retval; } +static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + if (nested_svm_check_permissions(svm)) + return 1; + + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + + svm->vcpu.arch.hflags |= HF_GIF_MASK; + + return 1; +} + +static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + if (nested_svm_check_permissions(svm)) + return 1; + + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + + svm->vcpu.arch.hflags &= ~HF_GIF_MASK; + + /* After a CLGI no interrupts should come */ + svm_clear_vintr(svm); + svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; + + return 1; +} + static int invalid_op_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { @@ -1535,8 +1567,8 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_VMMCALL] = vmmcall_interception, [SVM_EXIT_VMLOAD] = invalid_op_interception, [SVM_EXIT_VMSAVE] = invalid_op_interception, - [SVM_EXIT_STGI] = invalid_op_interception, - [SVM_EXIT_CLGI] = invalid_op_interception, + [SVM_EXIT_STGI] = stgi_interception, + [SVM_EXIT_CLGI] = clgi_interception, [SVM_EXIT_SKINIT] = invalid_op_interception, [SVM_EXIT_WBINVD] = emulate_on_interception, [SVM_EXIT_MONITOR] = invalid_op_interception, @@ -1684,6 +1716,9 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu) if (!kvm_cpu_has_interrupt(vcpu)) goto out; + if (!(svm->vcpu.arch.hflags & HF_GIF_MASK)) + goto out; + if (!(vmcb->save.rflags & X86_EFLAGS_IF) || (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || (vmcb->control.event_inj & SVM_EVTINJ_VALID)) { @@ -1710,7 +1745,8 @@ static void kvm_reput_irq(struct vcpu_svm *svm) } svm->vcpu.arch.interrupt_window_open = - !(control->int_state & SVM_INTERRUPT_SHADOW_MASK); + !(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && + (svm->vcpu.arch.hflags & HF_GIF_MASK); } static void svm_do_inject_vector(struct vcpu_svm *svm) @@ -1734,7 +1770,8 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, svm->vcpu.arch.interrupt_window_open = (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && - (svm->vmcb->save.rflags & X86_EFLAGS_IF)); + (svm->vmcb->save.rflags & X86_EFLAGS_IF) && + (svm->vcpu.arch.hflags & HF_GIF_MASK)); if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary) /* -- cgit v1.2.3 From b286d5d8b0836e76832dafcc5a18b0e8e5a3bc5e Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 25 Nov 2008 20:17:05 +0100 Subject: KVM: SVM: Implement hsave Implement the hsave MSR, that gives the VCPU a GPA to save the old guest state in. v2 allows userspace to save/restore hsave v4 dummys out the hsave MSR, so we use a host page v6 remembers the guest's hsave and exports the MSR Acked-by: Joerg Roedel Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/kvm_svm.h | 2 ++ arch/x86/kvm/svm.c | 13 +++++++++++++ arch/x86/kvm/x86.c | 2 +- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h index 8e5ee99551f6..a0877cac7b9c 100644 --- a/arch/x86/kvm/kvm_svm.h +++ b/arch/x86/kvm/kvm_svm.h @@ -41,6 +41,8 @@ struct vcpu_svm { unsigned long host_dr7; u32 *msrpm; + struct vmcb *hsave; + u64 hsave_msr; }; #endif diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 79cc06bfe57c..59aaff1c9597 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -626,6 +626,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) struct vcpu_svm *svm; struct page *page; struct page *msrpm_pages; + struct page *hsave_page; int err; svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); @@ -651,6 +652,11 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->msrpm = page_address(msrpm_pages); svm_vcpu_init_msrpm(svm->msrpm); + hsave_page = alloc_page(GFP_KERNEL); + if (!hsave_page) + goto uninit; + svm->hsave = page_address(hsave_page); + svm->vmcb = page_address(page); clear_page(svm->vmcb); svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; @@ -680,6 +686,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); + __free_page(virt_to_page(svm->hsave)); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, svm); } @@ -1377,6 +1384,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) case MSR_IA32_LASTINTTOIP: *data = svm->vmcb->save.last_excp_to; break; + case MSR_VM_HSAVE_PA: + *data = svm->hsave_msr; + break; default: return kvm_get_msr_common(vcpu, ecx, data); } @@ -1470,6 +1480,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) */ pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", ecx, data); + break; + case MSR_VM_HSAVE_PA: + svm->hsave_msr = data; break; default: return kvm_set_msr_common(vcpu, ecx, data); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 758b7a155ae9..99165a961f08 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -456,7 +456,7 @@ static u32 msrs_to_save[] = { MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, - MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT + MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA }; static unsigned num_msrs_to_save; -- cgit v1.2.3 From 5542675baa7e62ca4d18278c8758b6a4ec410639 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 25 Nov 2008 20:17:06 +0100 Subject: KVM: SVM: Add VMLOAD and VMSAVE handlers This implements the VMLOAD and VMSAVE instructions, that usually surround the VMRUN instructions. Both instructions load / restore the same elements, so we only need to implement them once. v2 fixes CPL checking and replaces memcpy by assignments v3 makes use of the new permission checking Acked-by: Joerg Roedel Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 59aaff1c9597..a83c94eb5771 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1243,6 +1243,62 @@ static int nested_svm_do(struct vcpu_svm *svm, return retval; } +static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) +{ + to_vmcb->save.fs = from_vmcb->save.fs; + to_vmcb->save.gs = from_vmcb->save.gs; + to_vmcb->save.tr = from_vmcb->save.tr; + to_vmcb->save.ldtr = from_vmcb->save.ldtr; + to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base; + to_vmcb->save.star = from_vmcb->save.star; + to_vmcb->save.lstar = from_vmcb->save.lstar; + to_vmcb->save.cstar = from_vmcb->save.cstar; + to_vmcb->save.sfmask = from_vmcb->save.sfmask; + to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs; + to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp; + to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; + + return 1; +} + +static int nested_svm_vmload(struct vcpu_svm *svm, void *nested_vmcb, + void *arg2, void *opaque) +{ + return nested_svm_vmloadsave((struct vmcb *)nested_vmcb, svm->vmcb); +} + +static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb, + void *arg2, void *opaque) +{ + return nested_svm_vmloadsave(svm->vmcb, (struct vmcb *)nested_vmcb); +} + +static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + if (nested_svm_check_permissions(svm)) + return 1; + + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + + nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmload); + + return 1; +} + +static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + if (nested_svm_check_permissions(svm)) + return 1; + + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + + nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmsave); + + return 1; +} + static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { if (nested_svm_check_permissions(svm)) @@ -1578,8 +1634,8 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_SHUTDOWN] = shutdown_interception, [SVM_EXIT_VMRUN] = invalid_op_interception, [SVM_EXIT_VMMCALL] = vmmcall_interception, - [SVM_EXIT_VMLOAD] = invalid_op_interception, - [SVM_EXIT_VMSAVE] = invalid_op_interception, + [SVM_EXIT_VMLOAD] = vmload_interception, + [SVM_EXIT_VMSAVE] = vmsave_interception, [SVM_EXIT_STGI] = stgi_interception, [SVM_EXIT_CLGI] = clgi_interception, [SVM_EXIT_SKINIT] = invalid_op_interception, -- cgit v1.2.3 From 3d6368ef580a4dff012960834bba4e28d3c1430c Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 25 Nov 2008 20:17:07 +0100 Subject: KVM: SVM: Add VMRUN handler This patch implements VMRUN. VMRUN enters a virtual CPU and runs that in the same context as the normal guest CPU would run. So basically it is implemented the same way, a normal CPU would do it. We also prepare all intercepts that get OR'ed with the original intercepts, as we do not allow a level 2 guest to be intercepted less than the first level guest. v2 implements the following improvements: - fixes the CPL check - does not allocate iopm when not used - remembers the host's IF in the HIF bit in the hflags v3: - make use of the new permission checking - add support for V_INTR_MASKING_MASK v4: - use host page backed hsave v5: - remove IOPM merging code v6: - save cr4 so PAE l1 guests work v7: - return 0 on vmrun so we check the MSRs too - fix MSR check to use the correct variable Acked-by: Joerg Roedel Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 + arch/x86/kvm/kvm_svm.h | 8 ++ arch/x86/kvm/svm.c | 157 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 165 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 29e4157732db..53779309514a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -740,6 +740,8 @@ enum { }; #define HF_GIF_MASK (1 << 0) +#define HF_HIF_MASK (1 << 1) +#define HF_VINTR_MASK (1 << 2) /* * Hardware virtualization extension instructions may fault if a diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h index a0877cac7b9c..91673413d8f7 100644 --- a/arch/x86/kvm/kvm_svm.h +++ b/arch/x86/kvm/kvm_svm.h @@ -43,6 +43,14 @@ struct vcpu_svm { u32 *msrpm; struct vmcb *hsave; u64 hsave_msr; + + u64 nested_vmcb; + + /* These are the merged vectors */ + u32 *nested_msrpm; + + /* gpa pointers to the real vectors */ + u64 nested_vmcb_msrpm; }; #endif diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a83c94eb5771..fad187cbfabe 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -77,6 +77,11 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_svm, vcpu); } +static inline bool is_nested(struct vcpu_svm *svm) +{ + return svm->nested_vmcb; +} + static unsigned long iopm_base; struct kvm_ldttss_desc { @@ -601,6 +606,7 @@ static void init_vmcb(struct vcpu_svm *svm) } force_new_asid(&svm->vcpu); + svm->nested_vmcb = 0; svm->vcpu.arch.hflags = HF_GIF_MASK; } @@ -627,6 +633,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) struct page *page; struct page *msrpm_pages; struct page *hsave_page; + struct page *nested_msrpm_pages; int err; svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); @@ -649,6 +656,11 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); if (!msrpm_pages) goto uninit; + + nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); + if (!nested_msrpm_pages) + goto uninit; + svm->msrpm = page_address(msrpm_pages); svm_vcpu_init_msrpm(svm->msrpm); @@ -657,6 +669,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) goto uninit; svm->hsave = page_address(hsave_page); + svm->nested_msrpm = page_address(nested_msrpm_pages); + svm->vmcb = page_address(page); clear_page(svm->vmcb); svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; @@ -687,6 +701,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); __free_page(virt_to_page(svm->hsave)); + __free_pages(virt_to_page(svm->nested_msrpm), MSRPM_ALLOC_ORDER); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, svm); } @@ -1243,6 +1258,123 @@ static int nested_svm_do(struct vcpu_svm *svm, return retval; } + +static int nested_svm_vmrun_msrpm(struct vcpu_svm *svm, void *arg1, + void *arg2, void *opaque) +{ + int i; + u32 *nested_msrpm = (u32*)arg1; + for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++) + svm->nested_msrpm[i] = svm->msrpm[i] | nested_msrpm[i]; + svm->vmcb->control.msrpm_base_pa = __pa(svm->nested_msrpm); + + return 0; +} + +static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, + void *arg2, void *opaque) +{ + struct vmcb *nested_vmcb = (struct vmcb *)arg1; + struct vmcb *hsave = svm->hsave; + + /* nested_vmcb is our indicator if nested SVM is activated */ + svm->nested_vmcb = svm->vmcb->save.rax; + + /* Clear internal status */ + svm->vcpu.arch.exception.pending = false; + + /* Save the old vmcb, so we don't need to pick what we save, but + can restore everything when a VMEXIT occurs */ + memcpy(hsave, svm->vmcb, sizeof(struct vmcb)); + /* We need to remember the original CR3 in the SPT case */ + if (!npt_enabled) + hsave->save.cr3 = svm->vcpu.arch.cr3; + hsave->save.cr4 = svm->vcpu.arch.cr4; + hsave->save.rip = svm->next_rip; + + if (svm->vmcb->save.rflags & X86_EFLAGS_IF) + svm->vcpu.arch.hflags |= HF_HIF_MASK; + else + svm->vcpu.arch.hflags &= ~HF_HIF_MASK; + + /* Load the nested guest state */ + svm->vmcb->save.es = nested_vmcb->save.es; + svm->vmcb->save.cs = nested_vmcb->save.cs; + svm->vmcb->save.ss = nested_vmcb->save.ss; + svm->vmcb->save.ds = nested_vmcb->save.ds; + svm->vmcb->save.gdtr = nested_vmcb->save.gdtr; + svm->vmcb->save.idtr = nested_vmcb->save.idtr; + svm->vmcb->save.rflags = nested_vmcb->save.rflags; + svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); + svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0); + svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); + if (npt_enabled) { + svm->vmcb->save.cr3 = nested_vmcb->save.cr3; + svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; + } else { + kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); + kvm_mmu_reset_context(&svm->vcpu); + } + svm->vmcb->save.cr2 = nested_vmcb->save.cr2; + kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax); + kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp); + kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip); + /* In case we don't even reach vcpu_run, the fields are not updated */ + svm->vmcb->save.rax = nested_vmcb->save.rax; + svm->vmcb->save.rsp = nested_vmcb->save.rsp; + svm->vmcb->save.rip = nested_vmcb->save.rip; + svm->vmcb->save.dr7 = nested_vmcb->save.dr7; + svm->vmcb->save.dr6 = nested_vmcb->save.dr6; + svm->vmcb->save.cpl = nested_vmcb->save.cpl; + + /* We don't want a nested guest to be more powerful than the guest, + so all intercepts are ORed */ + svm->vmcb->control.intercept_cr_read |= + nested_vmcb->control.intercept_cr_read; + svm->vmcb->control.intercept_cr_write |= + nested_vmcb->control.intercept_cr_write; + svm->vmcb->control.intercept_dr_read |= + nested_vmcb->control.intercept_dr_read; + svm->vmcb->control.intercept_dr_write |= + nested_vmcb->control.intercept_dr_write; + svm->vmcb->control.intercept_exceptions |= + nested_vmcb->control.intercept_exceptions; + + svm->vmcb->control.intercept |= nested_vmcb->control.intercept; + + svm->nested_vmcb_msrpm = nested_vmcb->control.msrpm_base_pa; + + force_new_asid(&svm->vcpu); + svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info; + svm->vmcb->control.exit_int_info_err = nested_vmcb->control.exit_int_info_err; + svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; + if (nested_vmcb->control.int_ctl & V_IRQ_MASK) { + nsvm_printk("nSVM Injecting Interrupt: 0x%x\n", + nested_vmcb->control.int_ctl); + } + if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) + svm->vcpu.arch.hflags |= HF_VINTR_MASK; + else + svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; + + nsvm_printk("nSVM exit_int_info: 0x%x | int_state: 0x%x\n", + nested_vmcb->control.exit_int_info, + nested_vmcb->control.int_state); + + svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; + svm->vmcb->control.int_state = nested_vmcb->control.int_state; + svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; + if (nested_vmcb->control.event_inj & SVM_EVTINJ_VALID) + nsvm_printk("Injecting Event: 0x%x\n", + nested_vmcb->control.event_inj); + svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; + svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; + + svm->vcpu.arch.hflags |= HF_GIF_MASK; + + return 0; +} + static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) { to_vmcb->save.fs = from_vmcb->save.fs; @@ -1299,6 +1431,26 @@ static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } +static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + nsvm_printk("VMrun\n"); + if (nested_svm_check_permissions(svm)) + return 1; + + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + + if (nested_svm_do(svm, svm->vmcb->save.rax, 0, + NULL, nested_svm_vmrun)) + return 1; + + if (nested_svm_do(svm, svm->nested_vmcb_msrpm, 0, + NULL, nested_svm_vmrun_msrpm)) + return 1; + + return 1; +} + static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { if (nested_svm_check_permissions(svm)) @@ -1632,7 +1784,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_MSR] = msr_interception, [SVM_EXIT_TASK_SWITCH] = task_switch_interception, [SVM_EXIT_SHUTDOWN] = shutdown_interception, - [SVM_EXIT_VMRUN] = invalid_op_interception, + [SVM_EXIT_VMRUN] = vmrun_interception, [SVM_EXIT_VMMCALL] = vmmcall_interception, [SVM_EXIT_VMLOAD] = vmload_interception, [SVM_EXIT_VMSAVE] = vmsave_interception, @@ -1939,7 +2091,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) svm->host_cr2 = kvm_read_cr2(); svm->host_dr6 = read_dr6(); svm->host_dr7 = read_dr7(); - svm->vmcb->save.cr2 = vcpu->arch.cr2; + if (!is_nested(svm)) + svm->vmcb->save.cr2 = vcpu->arch.cr2; /* required for live migration with NPT */ if (npt_enabled) svm->vmcb->save.cr3 = vcpu->arch.cr3; -- cgit v1.2.3 From cf74a78b229d07f77416d2fe1f029f183a8a31df Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 25 Nov 2008 20:17:08 +0100 Subject: KVM: SVM: Add VMEXIT handler and intercepts This adds the #VMEXIT intercept, so we return to the level 1 guest when something happens in the level 2 guest that should return to the level 1 guest. v2 implements HIF handling and cleans up exception interception v3 adds support for V_INTR_MASKING_MASK v4 uses the host page hsave v5 removes IOPM merging code v6 moves mmu code out of the atomic section Acked-by: Joerg Roedel Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 293 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 293 insertions(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index fad187cbfabe..4cb2920b1527 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -72,6 +72,13 @@ module_param(npt, int, S_IRUGO); static void kvm_reput_irq(struct vcpu_svm *svm); static void svm_flush_tlb(struct kvm_vcpu *vcpu); +static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override); +static int nested_svm_vmexit(struct vcpu_svm *svm); +static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb, + void *arg2, void *opaque); +static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, + bool has_error_code, u32 error_code); + static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) { return container_of(vcpu, struct vcpu_svm, vcpu); @@ -221,6 +228,11 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, { struct vcpu_svm *svm = to_svm(vcpu); + /* If we are within a nested VM we'd better #VMEXIT and let the + guest handle the exception */ + if (nested_svm_check_exception(svm, nr, has_error_code, error_code)) + return; + svm->vmcb->control.event_inj = nr | SVM_EVTINJ_VALID | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0) @@ -1198,6 +1210,46 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm) return 0; } +static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, + bool has_error_code, u32 error_code) +{ + if (is_nested(svm)) { + svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = error_code; + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; + if (nested_svm_exit_handled(svm, false)) { + nsvm_printk("VMexit -> EXCP 0x%x\n", nr); + + nested_svm_vmexit(svm); + return 1; + } + } + + return 0; +} + +static inline int nested_svm_intr(struct vcpu_svm *svm) +{ + if (is_nested(svm)) { + if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) + return 0; + + if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) + return 0; + + svm->vmcb->control.exit_code = SVM_EXIT_INTR; + + if (nested_svm_exit_handled(svm, false)) { + nsvm_printk("VMexit -> INTR\n"); + nested_svm_vmexit(svm); + return 1; + } + } + + return 0; +} + static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa) { struct page *page; @@ -1258,6 +1310,228 @@ static int nested_svm_do(struct vcpu_svm *svm, return retval; } +static int nested_svm_exit_handled_real(struct vcpu_svm *svm, + void *arg1, + void *arg2, + void *opaque) +{ + struct vmcb *nested_vmcb = (struct vmcb *)arg1; + bool kvm_overrides = *(bool *)opaque; + u32 exit_code = svm->vmcb->control.exit_code; + + if (kvm_overrides) { + switch (exit_code) { + case SVM_EXIT_INTR: + case SVM_EXIT_NMI: + return 0; + /* For now we are always handling NPFs when using them */ + case SVM_EXIT_NPF: + if (npt_enabled) + return 0; + break; + /* When we're shadowing, trap PFs */ + case SVM_EXIT_EXCP_BASE + PF_VECTOR: + if (!npt_enabled) + return 0; + break; + default: + break; + } + } + + switch (exit_code) { + case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { + u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); + if (nested_vmcb->control.intercept_cr_read & cr_bits) + return 1; + break; + } + case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: { + u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0); + if (nested_vmcb->control.intercept_cr_write & cr_bits) + return 1; + break; + } + case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: { + u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0); + if (nested_vmcb->control.intercept_dr_read & dr_bits) + return 1; + break; + } + case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: { + u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0); + if (nested_vmcb->control.intercept_dr_write & dr_bits) + return 1; + break; + } + case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { + u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); + if (nested_vmcb->control.intercept_exceptions & excp_bits) + return 1; + break; + } + default: { + u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); + nsvm_printk("exit code: 0x%x\n", exit_code); + if (nested_vmcb->control.intercept & exit_bits) + return 1; + } + } + + return 0; +} + +static int nested_svm_exit_handled_msr(struct vcpu_svm *svm, + void *arg1, void *arg2, + void *opaque) +{ + struct vmcb *nested_vmcb = (struct vmcb *)arg1; + u8 *msrpm = (u8 *)arg2; + u32 t0, t1; + u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; + u32 param = svm->vmcb->control.exit_info_1 & 1; + + if (!(nested_vmcb->control.intercept & (1ULL << INTERCEPT_MSR_PROT))) + return 0; + + switch(msr) { + case 0 ... 0x1fff: + t0 = (msr * 2) % 8; + t1 = msr / 8; + break; + case 0xc0000000 ... 0xc0001fff: + t0 = (8192 + msr - 0xc0000000) * 2; + t1 = (t0 / 8); + t0 %= 8; + break; + case 0xc0010000 ... 0xc0011fff: + t0 = (16384 + msr - 0xc0010000) * 2; + t1 = (t0 / 8); + t0 %= 8; + break; + default: + return 1; + break; + } + if (msrpm[t1] & ((1 << param) << t0)) + return 1; + + return 0; +} + +static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) +{ + bool k = kvm_override; + + switch (svm->vmcb->control.exit_code) { + case SVM_EXIT_MSR: + return nested_svm_do(svm, svm->nested_vmcb, + svm->nested_vmcb_msrpm, NULL, + nested_svm_exit_handled_msr); + default: break; + } + + return nested_svm_do(svm, svm->nested_vmcb, 0, &k, + nested_svm_exit_handled_real); +} + +static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, + void *arg2, void *opaque) +{ + struct vmcb *nested_vmcb = (struct vmcb *)arg1; + struct vmcb *hsave = svm->hsave; + u64 nested_save[] = { nested_vmcb->save.cr0, + nested_vmcb->save.cr3, + nested_vmcb->save.cr4, + nested_vmcb->save.efer, + nested_vmcb->control.intercept_cr_read, + nested_vmcb->control.intercept_cr_write, + nested_vmcb->control.intercept_dr_read, + nested_vmcb->control.intercept_dr_write, + nested_vmcb->control.intercept_exceptions, + nested_vmcb->control.intercept, + nested_vmcb->control.msrpm_base_pa, + nested_vmcb->control.iopm_base_pa, + nested_vmcb->control.tsc_offset }; + + /* Give the current vmcb to the guest */ + memcpy(nested_vmcb, svm->vmcb, sizeof(struct vmcb)); + nested_vmcb->save.cr0 = nested_save[0]; + if (!npt_enabled) + nested_vmcb->save.cr3 = nested_save[1]; + nested_vmcb->save.cr4 = nested_save[2]; + nested_vmcb->save.efer = nested_save[3]; + nested_vmcb->control.intercept_cr_read = nested_save[4]; + nested_vmcb->control.intercept_cr_write = nested_save[5]; + nested_vmcb->control.intercept_dr_read = nested_save[6]; + nested_vmcb->control.intercept_dr_write = nested_save[7]; + nested_vmcb->control.intercept_exceptions = nested_save[8]; + nested_vmcb->control.intercept = nested_save[9]; + nested_vmcb->control.msrpm_base_pa = nested_save[10]; + nested_vmcb->control.iopm_base_pa = nested_save[11]; + nested_vmcb->control.tsc_offset = nested_save[12]; + + /* We always set V_INTR_MASKING and remember the old value in hflags */ + if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) + nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; + + if ((nested_vmcb->control.int_ctl & V_IRQ_MASK) && + (nested_vmcb->control.int_vector)) { + nsvm_printk("WARNING: IRQ 0x%x still enabled on #VMEXIT\n", + nested_vmcb->control.int_vector); + } + + /* Restore the original control entries */ + svm->vmcb->control = hsave->control; + + /* Kill any pending exceptions */ + if (svm->vcpu.arch.exception.pending == true) + nsvm_printk("WARNING: Pending Exception\n"); + svm->vcpu.arch.exception.pending = false; + + /* Restore selected save entries */ + svm->vmcb->save.es = hsave->save.es; + svm->vmcb->save.cs = hsave->save.cs; + svm->vmcb->save.ss = hsave->save.ss; + svm->vmcb->save.ds = hsave->save.ds; + svm->vmcb->save.gdtr = hsave->save.gdtr; + svm->vmcb->save.idtr = hsave->save.idtr; + svm->vmcb->save.rflags = hsave->save.rflags; + svm_set_efer(&svm->vcpu, hsave->save.efer); + svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE); + svm_set_cr4(&svm->vcpu, hsave->save.cr4); + if (npt_enabled) { + svm->vmcb->save.cr3 = hsave->save.cr3; + svm->vcpu.arch.cr3 = hsave->save.cr3; + } else { + kvm_set_cr3(&svm->vcpu, hsave->save.cr3); + } + kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax); + kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp); + kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip); + svm->vmcb->save.dr7 = 0; + svm->vmcb->save.cpl = 0; + svm->vmcb->control.exit_int_info = 0; + + svm->vcpu.arch.hflags &= ~HF_GIF_MASK; + /* Exit nested SVM mode */ + svm->nested_vmcb = 0; + + return 0; +} + +static int nested_svm_vmexit(struct vcpu_svm *svm) +{ + nsvm_printk("VMexit\n"); + if (nested_svm_do(svm, svm->nested_vmcb, 0, + NULL, nested_svm_vmexit_real)) + return 1; + + kvm_mmu_reset_context(&svm->vcpu); + kvm_mmu_load(&svm->vcpu); + + return 0; +} static int nested_svm_vmrun_msrpm(struct vcpu_svm *svm, void *arg1, void *arg2, void *opaque) @@ -1805,6 +2079,17 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) KVMTRACE_3D(VMEXIT, vcpu, exit_code, (u32)svm->vmcb->save.rip, (u32)((u64)svm->vmcb->save.rip >> 32), entryexit); + if (is_nested(svm)) { + nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n", + exit_code, svm->vmcb->control.exit_info_1, + svm->vmcb->control.exit_info_2, svm->vmcb->save.rip); + if (nested_svm_exit_handled(svm, true)) { + nested_svm_vmexit(svm); + nsvm_printk("-> #VMEXIT\n"); + return 1; + } + } + if (npt_enabled) { int mmu_reload = 0; if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { @@ -1892,6 +2177,8 @@ static void svm_set_irq(struct kvm_vcpu *vcpu, int irq) { struct vcpu_svm *svm = to_svm(vcpu); + nested_svm_intr(svm); + svm_inject_irq(svm, irq); } @@ -1937,6 +2224,9 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu) if (!kvm_cpu_has_interrupt(vcpu)) goto out; + if (nested_svm_intr(svm)) + goto out; + if (!(svm->vcpu.arch.hflags & HF_GIF_MASK)) goto out; @@ -1989,6 +2279,9 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, struct vcpu_svm *svm = to_svm(vcpu); struct vmcb_control_area *control = &svm->vmcb->control; + if (nested_svm_intr(svm)) + return; + svm->vcpu.arch.interrupt_window_open = (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && (svm->vmcb->save.rflags & X86_EFLAGS_IF) && -- cgit v1.2.3 From eb6f302edff9da963e687bf6c15dcf88843b1c3b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 25 Nov 2008 20:17:09 +0100 Subject: KVM: SVM: Allow read access to MSR_VM_VR KVM tries to read the VM_CR MSR to find out if SVM was disabled by the BIOS. So implement read support for this MSR to make nested SVM running. Signed-off-by: Joerg Roedel Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 4cb2920b1527..df5b41192661 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1869,6 +1869,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) case MSR_VM_HSAVE_PA: *data = svm->hsave_msr; break; + case MSR_VM_CR: + *data = 0; + break; default: return kvm_get_msr_common(vcpu, ecx, data); } -- cgit v1.2.3 From 236de05553a7ef8f6940de8686ae9bf1272cd2cf Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 25 Nov 2008 20:17:10 +0100 Subject: KVM: SVM: Allow setting the SVME bit Normally setting the SVME bit in EFER is not allowed, as we did not support SVM. Not since we do, we should also allow enabling SVM mode. v2 comes as last patch, so we don't enable half-ready code v4 introduces a module option to enable SVM v6 warns that nesting is enabled Acked-by: Joerg Roedel Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index df5b41192661..0fbbde54ecae 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -69,6 +69,9 @@ static int npt = 1; module_param(npt, int, S_IRUGO); +static int nested = 0; +module_param(nested, int, S_IRUGO); + static void kvm_reput_irq(struct vcpu_svm *svm); static void svm_flush_tlb(struct kvm_vcpu *vcpu); @@ -443,6 +446,11 @@ static __init int svm_hardware_setup(void) if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); + if (nested) { + printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); + kvm_enable_efer_bits(EFER_SVME); + } + for_each_online_cpu(cpu) { r = svm_cpu_init(cpu); if (r) -- cgit v1.2.3 From d80174745ba3938bc6abb8f95ed670ac0631a182 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 25 Nov 2008 20:17:11 +0100 Subject: KVM: SVM: Only allow setting of EFER_SVME when CPUID SVM is set Userspace has to tell the kernel module somehow that nested SVM should be used. The easiest way that doesn't break anything I could think of is to implement if (cpuid & svm) allow write to efer else deny write to efer Old userspaces mask the SVM capability bit, so they don't break. In order to find out that the SVM capability is set, I had to split the kvm_emulate_cpuid into a finding and an emulating part. (introduced in v6) Acked-by: Joerg Roedel Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 58 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 99165a961f08..b5e9932e0f62 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -69,6 +69,8 @@ static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function, u32 index); struct kvm_x86_ops *kvm_x86_ops; EXPORT_SYMBOL_GPL(kvm_x86_ops); @@ -173,6 +175,7 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, u32 error_code) { ++vcpu->stat.pf_guest; + if (vcpu->arch.exception.pending) { if (vcpu->arch.exception.nr == PF_VECTOR) { printk(KERN_DEBUG "kvm: inject_page_fault:" @@ -442,6 +445,11 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_get_cr8); +static inline u32 bit(int bitno) +{ + return 1 << (bitno & 31); +} + /* * List of msr numbers which we expose to userspace through KVM_GET_MSRS * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. @@ -481,6 +489,17 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) return; } + if (efer & EFER_SVME) { + struct kvm_cpuid_entry2 *feat; + + feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { + printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n"); + kvm_inject_gp(vcpu, 0); + return; + } + } + kvm_x86_ops->set_efer(vcpu, efer); efer &= ~EFER_LMA; @@ -1181,11 +1200,6 @@ out: return r; } -static inline u32 bit(int bitno) -{ - return 1 << (bitno & 31); -} - static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, u32 index) { @@ -1228,7 +1242,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, const u32 kvm_supported_word3_x86_features = bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16); const u32 kvm_supported_word6_x86_features = - bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY); + bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY) | + bit(X86_FEATURE_SVM); /* all func 2 cpuid_count() should be called on the same cpu */ get_cpu(); @@ -2832,20 +2847,15 @@ static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, return 1; } -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function, u32 index) { int i; - u32 function, index; - struct kvm_cpuid_entry2 *e, *best; + struct kvm_cpuid_entry2 *best = NULL; - function = kvm_register_read(vcpu, VCPU_REGS_RAX); - index = kvm_register_read(vcpu, VCPU_REGS_RCX); - kvm_register_write(vcpu, VCPU_REGS_RAX, 0); - kvm_register_write(vcpu, VCPU_REGS_RBX, 0); - kvm_register_write(vcpu, VCPU_REGS_RCX, 0); - kvm_register_write(vcpu, VCPU_REGS_RDX, 0); - best = NULL; for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { + struct kvm_cpuid_entry2 *e; + e = &vcpu->arch.cpuid_entries[i]; if (is_matching_cpuid_entry(e, function, index)) { if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) @@ -2860,6 +2870,22 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) if (!best || e->function > best->function) best = e; } + + return best; +} + +void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +{ + u32 function, index; + struct kvm_cpuid_entry2 *best; + + function = kvm_register_read(vcpu, VCPU_REGS_RAX); + index = kvm_register_read(vcpu, VCPU_REGS_RCX); + kvm_register_write(vcpu, VCPU_REGS_RAX, 0); + kvm_register_write(vcpu, VCPU_REGS_RBX, 0); + kvm_register_write(vcpu, VCPU_REGS_RCX, 0); + kvm_register_write(vcpu, VCPU_REGS_RDX, 0); + best = kvm_find_cpuid_entry(vcpu, function, index); if (best) { kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); -- cgit v1.2.3 From 8ab2d2e231062814bd89bba2d6d92563190aa2bb Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 15 Dec 2008 13:52:10 +0100 Subject: KVM: VMX: Support for injecting software exceptions VMX differentiates between processor and software generated exceptions when injecting them into the guest. Extend vmx_queue_exception accordingly (and refactor related constants) so that we can use this service reliably for the new guest debugging framework. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/include/asm/vmx.h | 3 ++- arch/x86/kvm/vmx.c | 35 ++++++++++++++++++++--------------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index d0238e6151d8..32159f034efc 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -270,8 +270,9 @@ enum vmcs_field { #define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ #define INTR_TYPE_NMI_INTR (2 << 8) /* NMI */ -#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */ +#define INTR_TYPE_HARD_EXCEPTION (3 << 8) /* processor exception */ #define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ +#define INTR_TYPE_SOFT_EXCEPTION (6 << 8) /* software exception */ /* GUEST_INTERRUPTIBILITY_INFO flags. */ #define GUEST_INTR_STATE_STI 0x00000001 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7611af576829..1d974c1eaa7d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -189,21 +189,21 @@ static inline int is_page_fault(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | INTR_INFO_VALID_MASK)) == - (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); + (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); } static inline int is_no_device(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | INTR_INFO_VALID_MASK)) == - (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); + (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); } static inline int is_invalid_opcode(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | INTR_INFO_VALID_MASK)) == - (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); + (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); } static inline int is_external_interrupt(u32 intr_info) @@ -747,29 +747,33 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, bool has_error_code, u32 error_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 intr_info = nr | INTR_INFO_VALID_MASK; - if (has_error_code) + if (has_error_code) { vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); + intr_info |= INTR_INFO_DELIVER_CODE_MASK; + } if (vcpu->arch.rmode.active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = nr; vmx->rmode.irq.rip = kvm_rip_read(vcpu); - if (nr == BP_VECTOR) + if (nr == BP_VECTOR || nr == OF_VECTOR) vmx->rmode.irq.rip++; - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - nr | INTR_TYPE_SOFT_INTR - | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) - | INTR_INFO_VALID_MASK); + intr_info |= INTR_TYPE_SOFT_INTR; + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); return; } - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - nr | INTR_TYPE_EXCEPTION - | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) - | INTR_INFO_VALID_MASK); + if (nr == BP_VECTOR || nr == OF_VECTOR) { + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); + intr_info |= INTR_TYPE_SOFT_EXCEPTION; + } else + intr_info |= INTR_TYPE_HARD_EXCEPTION; + + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); } static bool vmx_exception_injected(struct kvm_vcpu *vcpu) @@ -2650,7 +2654,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == - (INTR_TYPE_EXCEPTION | 1)) { + (INTR_TYPE_HARD_EXCEPTION | 1)) { kvm_run->exit_reason = KVM_EXIT_DEBUG; return 0; } @@ -3238,7 +3242,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) vmx->vcpu.arch.nmi_injected = false; } kvm_clear_exception_queue(&vmx->vcpu); - if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) { + if (idtv_info_valid && (type == INTR_TYPE_HARD_EXCEPTION || + type == INTR_TYPE_SOFT_EXCEPTION)) { if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { error = vmcs_read32(IDT_VECTORING_ERROR_CODE); kvm_queue_exception_e(&vmx->vcpu, vector, error); -- cgit v1.2.3 From d0bfb940ecabf0b44fb1fd80d8d60594e569e5ec Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 15 Dec 2008 13:52:10 +0100 Subject: KVM: New guest debug interface This rips out the support for KVM_DEBUG_GUEST and introduces a new IOCTL instead: KVM_SET_GUEST_DEBUG. The IOCTL payload consists of a generic part, controlling the "main switch" and the single-step feature. The arch specific part adds an x86 interface for intercepting both types of debug exceptions separately and re-injecting them when the host was not interested. Moveover, the foundation for guest debugging via debug registers is layed. To signal breakpoint events properly back to userland, an arch-specific data block is now returned along KVM_EXIT_DEBUG. For x86, the arch block contains the PC, the debug exception, and relevant debug registers to tell debug events properly apart. The availability of this new interface is signaled by KVM_CAP_SET_GUEST_DEBUG. Empty stubs for not yet supported archs are provided. Note that both SVM and VTX are supported, but only the latter was tested yet. Based on the experience with all those VTX corner case, I would be fairly surprised if SVM will work out of the box. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/ia64/include/asm/kvm.h | 7 ++++ arch/ia64/kvm/kvm-ia64.c | 4 +- arch/powerpc/include/asm/kvm.h | 7 ++++ arch/powerpc/kvm/powerpc.c | 4 +- arch/s390/include/asm/kvm.h | 7 ++++ arch/s390/kvm/kvm-s390.c | 4 +- arch/x86/include/asm/kvm.h | 18 ++++++++ arch/x86/include/asm/kvm_host.h | 9 +--- arch/x86/kvm/svm.c | 50 +++++++++++++++++++++- arch/x86/kvm/vmx.c | 93 ++++++++++++++++------------------------- arch/x86/kvm/x86.c | 14 ++++--- include/linux/kvm.h | 51 +++++++++++++++------- include/linux/kvm_host.h | 6 +-- virt/kvm/kvm_main.c | 6 +-- 14 files changed, 179 insertions(+), 101 deletions(-) diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h index bfa86b6af7cd..be3fdb891214 100644 --- a/arch/ia64/include/asm/kvm.h +++ b/arch/ia64/include/asm/kvm.h @@ -214,4 +214,11 @@ struct kvm_sregs { struct kvm_fpu { }; +struct kvm_debug_exit_arch { +}; + +/* for KVM_SET_GUEST_DEBUG */ +struct kvm_guest_debug_arch { +}; + #endif diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 28f982045f29..de47467a0e61 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1303,8 +1303,8 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) return -EINVAL; } -int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, - struct kvm_debug_guest *dbg) +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) { return -EINVAL; } diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index f993e4198d5c..755f1b1948c5 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -52,4 +52,11 @@ struct kvm_fpu { __u64 fpr[32]; }; +struct kvm_debug_exit_arch { +}; + +/* for KVM_SET_GUEST_DEBUG */ +struct kvm_guest_debug_arch { +}; + #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 5f81256287f5..7c2ad4017d6a 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -240,8 +240,8 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) kvmppc_core_vcpu_put(vcpu); } -int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, - struct kvm_debug_guest *dbg) +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) { int i; diff --git a/arch/s390/include/asm/kvm.h b/arch/s390/include/asm/kvm.h index e1f54654e3ae..0b2f829f6d50 100644 --- a/arch/s390/include/asm/kvm.h +++ b/arch/s390/include/asm/kvm.h @@ -42,4 +42,11 @@ struct kvm_fpu { __u64 fprs[16]; }; +struct kvm_debug_exit_arch { +}; + +/* for KVM_SET_GUEST_DEBUG */ +struct kvm_guest_debug_arch { +}; + #endif diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 0d33893e1e89..cbfe91e10120 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -422,8 +422,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, return -EINVAL; /* not implemented yet */ } -int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, - struct kvm_debug_guest *dbg) +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) { return -EINVAL; /* not implemented yet */ } diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 886c9402ec45..32eb96c7ca27 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -212,6 +212,24 @@ struct kvm_pit_channel_state { __s64 count_load_time; }; +struct kvm_debug_exit_arch { + __u32 exception; + __u32 pad; + __u64 pc; + __u64 dr6; + __u64 dr7; +}; + +#define KVM_GUESTDBG_USE_SW_BP 0x00010000 +#define KVM_GUESTDBG_USE_HW_BP 0x00020000 +#define KVM_GUESTDBG_INJECT_DB 0x00040000 +#define KVM_GUESTDBG_INJECT_BP 0x00080000 + +/* for KVM_SET_GUEST_DEBUG */ +struct kvm_guest_debug_arch { + __u64 debugreg[8]; +}; + struct kvm_pit_state { struct kvm_pit_channel_state channels[3]; }; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 53779309514a..c430cd580ee2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -135,12 +135,6 @@ enum { #define KVM_NR_MEM_OBJS 40 -struct kvm_guest_debug { - int enabled; - unsigned long bp[4]; - int singlestep; -}; - /* * We don't want allocation failures within the mmu code, so we preallocate * enough memory for a single page fault in a cache. @@ -448,8 +442,7 @@ struct kvm_x86_ops { void (*vcpu_put)(struct kvm_vcpu *vcpu); int (*set_guest_debug)(struct kvm_vcpu *vcpu, - struct kvm_debug_guest *dbg); - void (*guest_debug_pre)(struct kvm_vcpu *vcpu); + struct kvm_guest_debug *dbg); int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 0fbbde54ecae..88d9062f4545 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -968,9 +968,32 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, } -static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) +static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - return -EOPNOTSUPP; + int old_debug = vcpu->guest_debug; + struct vcpu_svm *svm = to_svm(vcpu); + + vcpu->guest_debug = dbg->control; + + svm->vmcb->control.intercept_exceptions &= + ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); + if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { + if (vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + svm->vmcb->control.intercept_exceptions |= + 1 << DB_VECTOR; + if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + svm->vmcb->control.intercept_exceptions |= + 1 << BP_VECTOR; + } else + vcpu->guest_debug = 0; + + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + svm->vmcb->save.rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF; + else if (old_debug & KVM_GUESTDBG_SINGLESTEP) + svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); + + return 0; } static int svm_get_irq(struct kvm_vcpu *vcpu) @@ -1094,6 +1117,27 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); } +static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + if (!(svm->vcpu.guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { + kvm_queue_exception(&svm->vcpu, DB_VECTOR); + return 1; + } + kvm_run->exit_reason = KVM_EXIT_DEBUG; + kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; + kvm_run->debug.arch.exception = DB_VECTOR; + return 0; +} + +static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + kvm_run->exit_reason = KVM_EXIT_DEBUG; + kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; + kvm_run->debug.arch.exception = BP_VECTOR; + return 0; +} + static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { int er; @@ -2050,6 +2094,8 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_WRITE_DR3] = emulate_on_interception, [SVM_EXIT_WRITE_DR5] = emulate_on_interception, [SVM_EXIT_WRITE_DR7] = emulate_on_interception, + [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, + [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1d974c1eaa7d..f55690ddb3ac 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -480,8 +480,13 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) eb = (1u << PF_VECTOR) | (1u << UD_VECTOR); if (!vcpu->fpu_active) eb |= 1u << NM_VECTOR; - if (vcpu->guest_debug.enabled) - eb |= 1u << DB_VECTOR; + if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { + if (vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + eb |= 1u << DB_VECTOR; + if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + eb |= 1u << BP_VECTOR; + } if (vcpu->arch.rmode.active) eb = ~0; if (vm_need_ept()) @@ -1003,40 +1008,23 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) } } -static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) +static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - unsigned long dr7 = 0x400; - int old_singlestep; - - old_singlestep = vcpu->guest_debug.singlestep; - - vcpu->guest_debug.enabled = dbg->enabled; - if (vcpu->guest_debug.enabled) { - int i; - - dr7 |= 0x200; /* exact */ - for (i = 0; i < 4; ++i) { - if (!dbg->breakpoints[i].enabled) - continue; - vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address; - dr7 |= 2 << (i*2); /* global enable */ - dr7 |= 0 << (i*4+16); /* execution breakpoint */ - } - - vcpu->guest_debug.singlestep = dbg->singlestep; - } else - vcpu->guest_debug.singlestep = 0; + int old_debug = vcpu->guest_debug; + unsigned long flags; - if (old_singlestep && !vcpu->guest_debug.singlestep) { - unsigned long flags; + vcpu->guest_debug = dbg->control; + if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE)) + vcpu->guest_debug = 0; - flags = vmcs_readl(GUEST_RFLAGS); + flags = vmcs_readl(GUEST_RFLAGS); + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + flags |= X86_EFLAGS_TF | X86_EFLAGS_RF; + else if (old_debug & KVM_GUESTDBG_SINGLESTEP) flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); - vmcs_writel(GUEST_RFLAGS, flags); - } + vmcs_writel(GUEST_RFLAGS, flags); update_exception_bitmap(vcpu); - vmcs_writel(GUEST_DR7, dr7); return 0; } @@ -2540,24 +2528,6 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) return 0; } -static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) -{ - struct kvm_guest_debug *dbg = &vcpu->guest_debug; - - set_debugreg(dbg->bp[0], 0); - set_debugreg(dbg->bp[1], 1); - set_debugreg(dbg->bp[2], 2); - set_debugreg(dbg->bp[3], 3); - - if (dbg->singlestep) { - unsigned long flags; - - flags = vmcs_readl(GUEST_RFLAGS); - flags |= X86_EFLAGS_TF | X86_EFLAGS_RF; - vmcs_writel(GUEST_RFLAGS, flags); - } -} - static int handle_rmode_exception(struct kvm_vcpu *vcpu, int vec, u32 err_code) { @@ -2574,9 +2544,17 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, * the required debugging infrastructure rework. */ switch (vec) { - case DE_VECTOR: case DB_VECTOR: + if (vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + return 0; + kvm_queue_exception(vcpu, vec); + return 1; case BP_VECTOR: + if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + return 0; + /* fall through */ + case DE_VECTOR: case OF_VECTOR: case BR_VECTOR: case UD_VECTOR: @@ -2593,7 +2571,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 intr_info, error_code; + u32 intr_info, ex_no, error_code; unsigned long cr2, rip; u32 vect_info; enum emulation_result er; @@ -2653,14 +2631,16 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } - if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == - (INTR_TYPE_HARD_EXCEPTION | 1)) { + ex_no = intr_info & INTR_INFO_VECTOR_MASK; + if (ex_no == DB_VECTOR || ex_no == BP_VECTOR) { kvm_run->exit_reason = KVM_EXIT_DEBUG; - return 0; + kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; + kvm_run->debug.arch.exception = ex_no; + } else { + kvm_run->exit_reason = KVM_EXIT_EXCEPTION; + kvm_run->ex.exception = ex_no; + kvm_run->ex.error_code = error_code; } - kvm_run->exit_reason = KVM_EXIT_EXCEPTION; - kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK; - kvm_run->ex.error_code = error_code; return 0; } @@ -3600,7 +3580,6 @@ static struct kvm_x86_ops vmx_x86_ops = { .vcpu_put = vmx_vcpu_put, .set_guest_debug = set_guest_debug, - .guest_debug_pre = kvm_guest_debug_pre, .get_msr = vmx_get_msr, .set_msr = vmx_set_msr, .get_segment_base = vmx_get_segment_base, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b5e9932e0f62..e990d164b56d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3005,9 +3005,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) goto out; } - if (vcpu->guest_debug.enabled) - kvm_x86_ops->guest_debug_pre(vcpu); - vcpu->guest_mode = 1; /* * Make sure that guest_mode assignment won't happen after @@ -3218,7 +3215,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) /* * Don't leak debug flags in case they were set for guest debugging */ - if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep) + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); vcpu_put(vcpu); @@ -3837,8 +3834,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return 0; } -int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, - struct kvm_debug_guest *dbg) +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) { int r; @@ -3846,6 +3843,11 @@ int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, r = kvm_x86_ops->set_guest_debug(vcpu, dbg); + if (dbg->control & KVM_GUESTDBG_INJECT_DB) + kvm_queue_exception(vcpu, DB_VECTOR); + else if (dbg->control & KVM_GUESTDBG_INJECT_BP) + kvm_queue_exception(vcpu, BP_VECTOR); + vcpu_put(vcpu); return r; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 0424326f1679..429a2ce202f9 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -126,6 +126,7 @@ struct kvm_run { __u64 data_offset; /* relative to kvm_run start */ } io; struct { + struct kvm_debug_exit_arch arch; } debug; /* KVM_EXIT_MMIO */ struct { @@ -217,21 +218,6 @@ struct kvm_interrupt { __u32 irq; }; -struct kvm_breakpoint { - __u32 enabled; - __u32 padding; - __u64 address; -}; - -/* for KVM_DEBUG_GUEST */ -struct kvm_debug_guest { - /* int */ - __u32 enabled; - __u32 pad; - struct kvm_breakpoint breakpoints[4]; - __u32 singlestep; -}; - /* for KVM_GET_DIRTY_LOG */ struct kvm_dirty_log { __u32 slot; @@ -292,6 +278,17 @@ struct kvm_s390_interrupt { __u64 parm64; }; +/* for KVM_SET_GUEST_DEBUG */ + +#define KVM_GUESTDBG_ENABLE 0x00000001 +#define KVM_GUESTDBG_SINGLESTEP 0x00000002 + +struct kvm_guest_debug { + __u32 control; + __u32 pad; + struct kvm_guest_debug_arch arch; +}; + #define KVM_TRC_SHIFT 16 /* * kvm trace categories @@ -396,6 +393,7 @@ struct kvm_trace_rec { #ifdef __KVM_HAVE_USER_NMI #define KVM_CAP_USER_NMI 22 #endif +#define KVM_CAP_SET_GUEST_DEBUG 23 /* * ioctls for VM fds @@ -440,7 +438,8 @@ struct kvm_trace_rec { #define KVM_SET_SREGS _IOW(KVMIO, 0x84, struct kvm_sregs) #define KVM_TRANSLATE _IOWR(KVMIO, 0x85, struct kvm_translation) #define KVM_INTERRUPT _IOW(KVMIO, 0x86, struct kvm_interrupt) -#define KVM_DEBUG_GUEST _IOW(KVMIO, 0x87, struct kvm_debug_guest) +/* KVM_DEBUG_GUEST is no longer supported, use KVM_SET_GUEST_DEBUG instead */ +#define KVM_DEBUG_GUEST __KVM_DEPRECATED_DEBUG_GUEST #define KVM_GET_MSRS _IOWR(KVMIO, 0x88, struct kvm_msrs) #define KVM_SET_MSRS _IOW(KVMIO, 0x89, struct kvm_msrs) #define KVM_SET_CPUID _IOW(KVMIO, 0x8a, struct kvm_cpuid) @@ -469,6 +468,26 @@ struct kvm_trace_rec { #define KVM_SET_MP_STATE _IOW(KVMIO, 0x99, struct kvm_mp_state) /* Available with KVM_CAP_NMI */ #define KVM_NMI _IO(KVMIO, 0x9a) +/* Available with KVM_CAP_SET_GUEST_DEBUG */ +#define KVM_SET_GUEST_DEBUG _IOW(KVMIO, 0x9b, struct kvm_guest_debug) + +/* + * Deprecated interfaces + */ +struct kvm_breakpoint { + __u32 enabled; + __u32 padding; + __u64 address; +}; + +struct kvm_debug_guest { + __u32 enabled; + __u32 pad; + struct kvm_breakpoint breakpoints[4]; + __u32 singlestep; +}; + +#define __KVM_DEPRECATED_DEBUG_GUEST _IOW(KVMIO, 0x87, struct kvm_debug_guest) #define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02) #define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index bf6f703642fc..e92212f970db 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -73,7 +73,7 @@ struct kvm_vcpu { struct kvm_run *run; int guest_mode; unsigned long requests; - struct kvm_guest_debug guest_debug; + unsigned long guest_debug; int fpu_active; int guest_fpu_loaded; wait_queue_head_t wq; @@ -255,8 +255,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state); int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state); -int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, - struct kvm_debug_guest *dbg); +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg); int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run); int kvm_arch_init(void *opaque); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 29a667ce35b0..f83ef9c7e89b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1755,13 +1755,13 @@ out_free2: r = 0; break; } - case KVM_DEBUG_GUEST: { - struct kvm_debug_guest dbg; + case KVM_SET_GUEST_DEBUG: { + struct kvm_guest_debug dbg; r = -EFAULT; if (copy_from_user(&dbg, argp, sizeof dbg)) goto out; - r = kvm_arch_vcpu_ioctl_debug_guest(vcpu, &dbg); + r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); if (r) goto out; r = 0; -- cgit v1.2.3 From 55934c0bd3bb232a9cf902820dd63ad18ed65e49 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 15 Dec 2008 13:52:10 +0100 Subject: KVM: VMX: Allow single-stepping when uninterruptible When single-stepping over STI and MOV SS, we must clear the corresponding interruptibility bits in the guest state. Otherwise vmentry fails as it then expects bit 14 (BS) in pending debug exceptions being set, but that's not correct for the guest debugging case. Note that clearing those bits is safe as we check for interruptibility based on the original state and do not inject interrupts or NMIs if guest interruptibility was blocked. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f55690ddb3ac..c776868ffe41 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2478,6 +2478,11 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, { vmx_update_window_states(vcpu); + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_STI | + GUEST_INTR_STATE_MOV_SS); + if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { if (vcpu->arch.interrupt.pending) { enable_nmi_window(vcpu); @@ -3244,6 +3249,11 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) vmx_update_window_states(vcpu); + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_STI | + GUEST_INTR_STATE_MOV_SS); + if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { if (vcpu->arch.interrupt.pending) { enable_nmi_window(vcpu); -- cgit v1.2.3 From 42dbaa5a057736bf8b5c22aa42dbe975bf1080e5 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 15 Dec 2008 13:52:10 +0100 Subject: KVM: x86: Virtualize debug registers So far KVM only had basic x86 debug register support, once introduced to realize guest debugging that way. The guest itself was not able to use those registers. This patch now adds (almost) full support for guest self-debugging via hardware registers. It refactors the code, moving generic parts out of SVM (VMX was already cleaned up by the KVM_SET_GUEST_DEBUG patches), and it ensures that the registers are properly switched between host and guest. This patch also prepares debug register usage by the host. The latter will (once wired-up by the following patch) allow for hardware breakpoints/watchpoints in guest code. If this is enabled, the guest will only see faked debug registers without functionality, but with content reflecting the guest's modifications. Tested on Intel only, but SVM /should/ work as well, but who knows... Known limitations: Trapping on tss switch won't work - most probably on Intel. Credits also go to Joerg Roedel - I used his once posted debugging series as platform for this patch. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 22 ++++++++ arch/x86/include/asm/vmx.h | 2 +- arch/x86/kvm/kvm_svm.h | 6 --- arch/x86/kvm/svm.c | 116 +++++++++++++++------------------------- arch/x86/kvm/vmx.c | 114 +++++++++++++++++++++++++++++++++------ arch/x86/kvm/x86.c | 29 ++++++++++ 6 files changed, 193 insertions(+), 96 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c430cd580ee2..0a4dab25a919 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -135,6 +135,19 @@ enum { #define KVM_NR_MEM_OBJS 40 +#define KVM_NR_DB_REGS 4 + +#define DR6_BD (1 << 13) +#define DR6_BS (1 << 14) +#define DR6_FIXED_1 0xffff0ff0 +#define DR6_VOLATILE 0x0000e00f + +#define DR7_BP_EN_MASK 0x000000ff +#define DR7_GE (1 << 9) +#define DR7_GD (1 << 13) +#define DR7_FIXED_1 0x00000400 +#define DR7_VOLATILE 0xffff23ff + /* * We don't want allocation failures within the mmu code, so we preallocate * enough memory for a single page fault in a cache. @@ -334,6 +347,15 @@ struct kvm_vcpu_arch { struct mtrr_state_type mtrr_state; u32 pat; + + int switch_db_regs; + unsigned long host_db[KVM_NR_DB_REGS]; + unsigned long host_dr6; + unsigned long host_dr7; + unsigned long db[KVM_NR_DB_REGS]; + unsigned long dr6; + unsigned long dr7; + unsigned long eff_db[KVM_NR_DB_REGS]; }; struct kvm_mem_alias { diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 32159f034efc..498f944010b9 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -312,7 +312,7 @@ enum vmcs_field { #define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */ #define TYPE_MOV_TO_DR (0 << 4) #define TYPE_MOV_FROM_DR (1 << 4) -#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose reg. */ +#define DEBUG_REG_ACCESS_REG(eq) (((eq) >> 8) & 0xf) /* 11:8, general purpose reg. */ /* segment AR */ diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h index 91673413d8f7..ed66e4c078dc 100644 --- a/arch/x86/kvm/kvm_svm.h +++ b/arch/x86/kvm/kvm_svm.h @@ -18,7 +18,6 @@ static const u32 host_save_user_msrs[] = { }; #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) -#define NUM_DB_REGS 4 struct kvm_vcpu; @@ -29,16 +28,11 @@ struct vcpu_svm { struct svm_cpu_data *svm_data; uint64_t asid_generation; - unsigned long db_regs[NUM_DB_REGS]; - u64 next_rip; u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; u64 host_gs_base; unsigned long host_cr2; - unsigned long host_db_regs[NUM_DB_REGS]; - unsigned long host_dr6; - unsigned long host_dr7; u32 *msrpm; struct vmcb *hsave; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 88d9062f4545..815f50e425ac 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -38,9 +38,6 @@ MODULE_LICENSE("GPL"); #define IOPM_ALLOC_ORDER 2 #define MSRPM_ALLOC_ORDER 1 -#define DR7_GD_MASK (1 << 13) -#define DR6_BD_MASK (1 << 13) - #define SEG_TYPE_LDT 2 #define SEG_TYPE_BUSY_TSS16 3 @@ -181,32 +178,6 @@ static inline void kvm_write_cr2(unsigned long val) asm volatile ("mov %0, %%cr2" :: "r" (val)); } -static inline unsigned long read_dr6(void) -{ - unsigned long dr6; - - asm volatile ("mov %%dr6, %0" : "=r" (dr6)); - return dr6; -} - -static inline void write_dr6(unsigned long val) -{ - asm volatile ("mov %0, %%dr6" :: "r" (val)); -} - -static inline unsigned long read_dr7(void) -{ - unsigned long dr7; - - asm volatile ("mov %%dr7, %0" : "=r" (dr7)); - return dr7; -} - -static inline void write_dr7(unsigned long val) -{ - asm volatile ("mov %0, %%dr7" :: "r" (val)); -} - static inline void force_new_asid(struct kvm_vcpu *vcpu) { to_svm(vcpu)->asid_generation--; @@ -695,7 +666,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) clear_page(svm->vmcb); svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; - memset(svm->db_regs, 0, sizeof(svm->db_regs)); init_vmcb(svm); fx_init(&svm->vcpu); @@ -1035,7 +1005,29 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data) static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) { - unsigned long val = to_svm(vcpu)->db_regs[dr]; + struct vcpu_svm *svm = to_svm(vcpu); + unsigned long val; + + switch (dr) { + case 0 ... 3: + val = vcpu->arch.db[dr]; + break; + case 6: + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + val = vcpu->arch.dr6; + else + val = svm->vmcb->save.dr6; + break; + case 7: + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + val = vcpu->arch.dr7; + else + val = svm->vmcb->save.dr7; + break; + default: + val = 0; + } + KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); return val; } @@ -1045,33 +1037,40 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, { struct vcpu_svm *svm = to_svm(vcpu); - *exception = 0; + KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)value, handler); - if (svm->vmcb->save.dr7 & DR7_GD_MASK) { - svm->vmcb->save.dr7 &= ~DR7_GD_MASK; - svm->vmcb->save.dr6 |= DR6_BD_MASK; - *exception = DB_VECTOR; - return; - } + *exception = 0; switch (dr) { case 0 ... 3: - svm->db_regs[dr] = value; + vcpu->arch.db[dr] = value; + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) + vcpu->arch.eff_db[dr] = value; return; case 4 ... 5: - if (vcpu->arch.cr4 & X86_CR4_DE) { + if (vcpu->arch.cr4 & X86_CR4_DE) *exception = UD_VECTOR; + return; + case 6: + if (value & 0xffffffff00000000ULL) { + *exception = GP_VECTOR; return; } - case 7: { - if (value & ~((1ULL << 32) - 1)) { + vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1; + return; + case 7: + if (value & 0xffffffff00000000ULL) { *exception = GP_VECTOR; return; } - svm->vmcb->save.dr7 = value; + vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1; + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { + svm->vmcb->save.dr7 = vcpu->arch.dr7; + vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK); + } return; - } default: + /* FIXME: Possible case? */ printk(KERN_DEBUG "%s: unexpected dr %u\n", __func__, dr); *exception = UD_VECTOR; @@ -2365,22 +2364,6 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) return 0; } -static void save_db_regs(unsigned long *db_regs) -{ - asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0])); - asm volatile ("mov %%dr1, %0" : "=r"(db_regs[1])); - asm volatile ("mov %%dr2, %0" : "=r"(db_regs[2])); - asm volatile ("mov %%dr3, %0" : "=r"(db_regs[3])); -} - -static void load_db_regs(unsigned long *db_regs) -{ - asm volatile ("mov %0, %%dr0" : : "r"(db_regs[0])); - asm volatile ("mov %0, %%dr1" : : "r"(db_regs[1])); - asm volatile ("mov %0, %%dr2" : : "r"(db_regs[2])); - asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3])); -} - static void svm_flush_tlb(struct kvm_vcpu *vcpu) { force_new_asid(vcpu); @@ -2439,20 +2422,12 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) gs_selector = kvm_read_gs(); ldt_selector = kvm_read_ldt(); svm->host_cr2 = kvm_read_cr2(); - svm->host_dr6 = read_dr6(); - svm->host_dr7 = read_dr7(); if (!is_nested(svm)) svm->vmcb->save.cr2 = vcpu->arch.cr2; /* required for live migration with NPT */ if (npt_enabled) svm->vmcb->save.cr3 = vcpu->arch.cr3; - if (svm->vmcb->save.dr7 & 0xff) { - write_dr7(0); - save_db_regs(svm->host_db_regs); - load_db_regs(svm->db_regs); - } - clgi(); local_irq_enable(); @@ -2528,16 +2503,11 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) #endif ); - if ((svm->vmcb->save.dr7 & 0xff)) - load_db_regs(svm->host_db_regs); - vcpu->arch.cr2 = svm->vmcb->save.cr2; vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; - write_dr6(svm->host_dr6); - write_dr7(svm->host_dr7); kvm_write_cr2(svm->host_cr2); kvm_load_fs(fs_selector); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c776868ffe41..0989776ee7b0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2311,7 +2311,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) kvm_rip_write(vcpu, 0); kvm_register_write(vcpu, VCPU_REGS_RSP, 0); - /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ vmcs_writel(GUEST_DR7, 0x400); vmcs_writel(GUEST_GDTR_BASE, 0); @@ -2577,7 +2576,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 intr_info, ex_no, error_code; - unsigned long cr2, rip; + unsigned long cr2, rip, dr6; u32 vect_info; enum emulation_result er; @@ -2637,14 +2636,28 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } ex_no = intr_info & INTR_INFO_VECTOR_MASK; - if (ex_no == DB_VECTOR || ex_no == BP_VECTOR) { + switch (ex_no) { + case DB_VECTOR: + dr6 = vmcs_readl(EXIT_QUALIFICATION); + if (!(vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { + vcpu->arch.dr6 = dr6 | DR6_FIXED_1; + kvm_queue_exception(vcpu, DB_VECTOR); + return 1; + } + kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; + kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); + /* fall through */ + case BP_VECTOR: kvm_run->exit_reason = KVM_EXIT_DEBUG; kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; kvm_run->debug.arch.exception = ex_no; - } else { + break; + default: kvm_run->exit_reason = KVM_EXIT_EXCEPTION; kvm_run->ex.exception = ex_no; kvm_run->ex.error_code = error_code; + break; } return 0; } @@ -2784,21 +2797,44 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) unsigned long val; int dr, reg; - /* - * FIXME: this code assumes the host is debugging the guest. - * need to deal with guest debugging itself too. - */ + dr = vmcs_readl(GUEST_DR7); + if (dr & DR7_GD) { + /* + * As the vm-exit takes precedence over the debug trap, we + * need to emulate the latter, either for the host or the + * guest debugging itself. + */ + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { + kvm_run->debug.arch.dr6 = vcpu->arch.dr6; + kvm_run->debug.arch.dr7 = dr; + kvm_run->debug.arch.pc = + vmcs_readl(GUEST_CS_BASE) + + vmcs_readl(GUEST_RIP); + kvm_run->debug.arch.exception = DB_VECTOR; + kvm_run->exit_reason = KVM_EXIT_DEBUG; + return 0; + } else { + vcpu->arch.dr7 &= ~DR7_GD; + vcpu->arch.dr6 |= DR6_BD; + vmcs_writel(GUEST_DR7, vcpu->arch.dr7); + kvm_queue_exception(vcpu, DB_VECTOR); + return 1; + } + } + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - dr = exit_qualification & 7; - reg = (exit_qualification >> 8) & 15; - if (exit_qualification & 16) { - /* mov from dr */ + dr = exit_qualification & DEBUG_REG_ACCESS_NUM; + reg = DEBUG_REG_ACCESS_REG(exit_qualification); + if (exit_qualification & TYPE_MOV_FROM_DR) { switch (dr) { + case 0 ... 3: + val = vcpu->arch.db[dr]; + break; case 6: - val = 0xffff0ff0; + val = vcpu->arch.dr6; break; case 7: - val = 0x400; + val = vcpu->arch.dr7; break; default: val = 0; @@ -2806,7 +2842,38 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_register_write(vcpu, reg, val); KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); } else { - /* mov to dr */ + val = vcpu->arch.regs[reg]; + switch (dr) { + case 0 ... 3: + vcpu->arch.db[dr] = val; + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) + vcpu->arch.eff_db[dr] = val; + break; + case 4 ... 5: + if (vcpu->arch.cr4 & X86_CR4_DE) + kvm_queue_exception(vcpu, UD_VECTOR); + break; + case 6: + if (val & 0xffffffff00000000ULL) { + kvm_queue_exception(vcpu, GP_VECTOR); + break; + } + vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; + break; + case 7: + if (val & 0xffffffff00000000ULL) { + kvm_queue_exception(vcpu, GP_VECTOR); + break; + } + vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { + vmcs_writel(GUEST_DR7, vcpu->arch.dr7); + vcpu->arch.switch_db_regs = + (val & DR7_BP_EN_MASK); + } + break; + } + KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)val, handler); } skip_emulated_instruction(vcpu); return 1; @@ -2957,7 +3024,18 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } tss_selector = exit_qualification; - return kvm_task_switch(vcpu, tss_selector, reason); + if (!kvm_task_switch(vcpu, tss_selector, reason)) + return 0; + + /* clear all local breakpoint enable flags */ + vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55); + + /* + * TODO: What about debug traps on tss switch? + * Are we supposed to inject them and update dr6? + */ + + return 1; } static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) @@ -3342,6 +3420,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) */ vmcs_writel(HOST_CR0, read_cr0()); + set_debugreg(vcpu->arch.dr6, 6); + asm( /* Store host registers */ "push %%"R"dx; push %%"R"bp;" @@ -3436,6 +3516,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); vcpu->arch.regs_dirty = 0; + get_debugreg(vcpu->arch.dr6, 6); + vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); if (vmx->rmode.irq.pending) fixup_rmode_irq(vmx); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e990d164b56d..300bc4d42abc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3025,10 +3025,34 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_guest_enter(); + get_debugreg(vcpu->arch.host_dr6, 6); + get_debugreg(vcpu->arch.host_dr7, 7); + if (unlikely(vcpu->arch.switch_db_regs)) { + get_debugreg(vcpu->arch.host_db[0], 0); + get_debugreg(vcpu->arch.host_db[1], 1); + get_debugreg(vcpu->arch.host_db[2], 2); + get_debugreg(vcpu->arch.host_db[3], 3); + + set_debugreg(0, 7); + set_debugreg(vcpu->arch.eff_db[0], 0); + set_debugreg(vcpu->arch.eff_db[1], 1); + set_debugreg(vcpu->arch.eff_db[2], 2); + set_debugreg(vcpu->arch.eff_db[3], 3); + } KVMTRACE_0D(VMENTRY, vcpu, entryexit); kvm_x86_ops->run(vcpu, kvm_run); + if (unlikely(vcpu->arch.switch_db_regs)) { + set_debugreg(0, 7); + set_debugreg(vcpu->arch.host_db[0], 0); + set_debugreg(vcpu->arch.host_db[1], 1); + set_debugreg(vcpu->arch.host_db[2], 2); + set_debugreg(vcpu->arch.host_db[3], 3); + } + set_debugreg(vcpu->arch.host_dr6, 6); + set_debugreg(vcpu->arch.host_dr7, 7); + vcpu->guest_mode = 0; local_irq_enable(); @@ -4035,6 +4059,11 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) vcpu->arch.nmi_pending = false; vcpu->arch.nmi_injected = false; + vcpu->arch.switch_db_regs = 0; + memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); + vcpu->arch.dr6 = DR6_FIXED_1; + vcpu->arch.dr7 = DR7_FIXED_1; + return kvm_x86_ops->vcpu_reset(vcpu); } -- cgit v1.2.3 From ae675ef01cd86014acf8da5dee87876b71122495 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 15 Dec 2008 13:52:10 +0100 Subject: KVM: x86: Wire-up hardware breakpoints for guest debugging Add the remaining bits to make use of debug registers also for guest debugging, thus enabling the use of hardware breakpoints and watchpoints. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 5 +++++ arch/x86/kvm/vmx.c | 5 +++++ arch/x86/kvm/x86.c | 14 +++++++++++++- 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 815f50e425ac..b3a7a314d55c 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -958,6 +958,11 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) } else vcpu->guest_debug = 0; + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + svm->vmcb->save.dr7 = dbg->arch.debugreg[7]; + else + svm->vmcb->save.dr7 = vcpu->arch.dr7; + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) svm->vmcb->save.rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF; else if (old_debug & KVM_GUESTDBG_SINGLESTEP) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0989776ee7b0..cee81c9a6653 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1017,6 +1017,11 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE)) vcpu->guest_debug = 0; + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]); + else + vmcs_writel(GUEST_DR7, vcpu->arch.dr7); + flags = vmcs_readl(GUEST_RFLAGS); if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) flags |= X86_EFLAGS_TF | X86_EFLAGS_RF; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 300bc4d42abc..2477e87b2f84 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3861,10 +3861,22 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - int r; + int i, r; vcpu_load(vcpu); + if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) == + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) { + for (i = 0; i < KVM_NR_DB_REGS; ++i) + vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; + vcpu->arch.switch_db_regs = + (dbg->arch.debugreg[7] & DR7_BP_EN_MASK); + } else { + for (i = 0; i < KVM_NR_DB_REGS; i++) + vcpu->arch.eff_db[i] = vcpu->arch.db[i]; + vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); + } + r = kvm_x86_ops->set_guest_debug(vcpu, dbg); if (dbg->control & KVM_GUESTDBG_INJECT_DB) -- cgit v1.2.3 From e9a999fe1feaddb71bffbacbbd68e0da8ca8b50b Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Thu, 18 Dec 2008 12:17:51 +0100 Subject: KVM: ia64: stack get/restore patch Implement KVM_IA64_VCPU_[GS]ET_STACK ioctl calls. This is required for live migrations. Patch is based on previous implementation that was part of old GET/SET_REGS ioctl calls. Signed-off-by: Jes Sorensen Signed-off-by: Avi Kivity --- arch/ia64/include/asm/kvm.h | 7 +++ arch/ia64/include/asm/kvm_host.h | 6 ++- arch/ia64/kvm/kvm-ia64.c | 92 ++++++++++++++++++++++++++++++++++++++-- include/linux/kvm.h | 3 ++ 4 files changed, 104 insertions(+), 4 deletions(-) diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h index be3fdb891214..b5145784233e 100644 --- a/arch/ia64/include/asm/kvm.h +++ b/arch/ia64/include/asm/kvm.h @@ -214,6 +214,13 @@ struct kvm_sregs { struct kvm_fpu { }; +#define KVM_IA64_VCPU_STACK_SHIFT 16 +#define KVM_IA64_VCPU_STACK_SIZE (1UL << KVM_IA64_VCPU_STACK_SHIFT) + +struct kvm_ia64_vcpu_stack { + unsigned char stack[KVM_IA64_VCPU_STACK_SIZE]; +}; + struct kvm_debug_exit_arch { }; diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index 348663661659..7da0c0963226 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h @@ -112,7 +112,11 @@ #define VCPU_STRUCT_SHIFT 16 #define VCPU_STRUCT_SIZE (__IA64_UL_CONST(1) << VCPU_STRUCT_SHIFT) -#define KVM_STK_OFFSET VCPU_STRUCT_SIZE +/* + * This must match KVM_IA64_VCPU_STACK_{SHIFT,SIZE} arch/ia64/include/asm/kvm.h + */ +#define KVM_STK_SHIFT 16 +#define KVM_STK_OFFSET (__IA64_UL_CONST(1)<< KVM_STK_SHIFT) #define KVM_VM_STRUCT_SHIFT 19 #define KVM_VM_STRUCT_SIZE (__IA64_UL_CONST(1) << KVM_VM_STRUCT_SHIFT) diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index de47467a0e61..1477f91617a5 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1421,6 +1421,23 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) return 0; } +int kvm_arch_vcpu_ioctl_get_stack(struct kvm_vcpu *vcpu, + struct kvm_ia64_vcpu_stack *stack) +{ + memcpy(stack, vcpu, sizeof(struct kvm_ia64_vcpu_stack)); + return 0; +} + +int kvm_arch_vcpu_ioctl_set_stack(struct kvm_vcpu *vcpu, + struct kvm_ia64_vcpu_stack *stack) +{ + memcpy(vcpu + 1, &stack->stack[0] + sizeof(struct kvm_vcpu), + sizeof(struct kvm_ia64_vcpu_stack) - sizeof(struct kvm_vcpu)); + + vcpu->arch.exit_data = ((struct kvm_vcpu *)stack)->arch.exit_data; + return 0; +} + void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { @@ -1430,9 +1447,78 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) long kvm_arch_vcpu_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) + unsigned int ioctl, unsigned long arg) { - return -EINVAL; + struct kvm_vcpu *vcpu = filp->private_data; + void __user *argp = (void __user *)arg; + struct kvm_ia64_vcpu_stack *stack = NULL; + long r; + + switch (ioctl) { + case KVM_IA64_VCPU_GET_STACK: { + struct kvm_ia64_vcpu_stack __user *user_stack; + void __user *first_p = argp; + + r = -EFAULT; + if (copy_from_user(&user_stack, first_p, sizeof(void *))) + goto out; + + if (!access_ok(VERIFY_WRITE, user_stack, + sizeof(struct kvm_ia64_vcpu_stack))) { + printk(KERN_INFO "KVM_IA64_VCPU_GET_STACK: " + "Illegal user destination address for stack\n"); + goto out; + } + stack = kzalloc(sizeof(struct kvm_ia64_vcpu_stack), GFP_KERNEL); + if (!stack) { + r = -ENOMEM; + goto out; + } + + r = kvm_arch_vcpu_ioctl_get_stack(vcpu, stack); + if (r) + goto out; + + if (copy_to_user(user_stack, stack, + sizeof(struct kvm_ia64_vcpu_stack))) + goto out; + + break; + } + case KVM_IA64_VCPU_SET_STACK: { + struct kvm_ia64_vcpu_stack __user *user_stack; + void __user *first_p = argp; + + r = -EFAULT; + if (copy_from_user(&user_stack, first_p, sizeof(void *))) + goto out; + + if (!access_ok(VERIFY_READ, user_stack, + sizeof(struct kvm_ia64_vcpu_stack))) { + printk(KERN_INFO "KVM_IA64_VCPU_SET_STACK: " + "Illegal user address for stack\n"); + goto out; + } + stack = kmalloc(sizeof(struct kvm_ia64_vcpu_stack), GFP_KERNEL); + if (!stack) { + r = -ENOMEM; + goto out; + } + if (copy_from_user(stack, user_stack, + sizeof(struct kvm_ia64_vcpu_stack))) + goto out; + + r = kvm_arch_vcpu_ioctl_set_stack(vcpu, stack); + break; + } + + default: + r = -EINVAL; + } + +out: + kfree(stack); + return r; } int kvm_arch_set_memory_region(struct kvm *kvm, @@ -1472,7 +1558,7 @@ void kvm_arch_flush_shadow(struct kvm *kvm) } long kvm_arch_dev_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) + unsigned int ioctl, unsigned long arg) { return -EINVAL; } diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 429a2ce202f9..28582fcd3f79 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -489,6 +489,9 @@ struct kvm_debug_guest { #define __KVM_DEPRECATED_DEBUG_GUEST _IOW(KVMIO, 0x87, struct kvm_debug_guest) +#define KVM_IA64_VCPU_GET_STACK _IOR(KVMIO, 0x9a, void *) +#define KVM_IA64_VCPU_SET_STACK _IOW(KVMIO, 0x9b, void *) + #define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02) #define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03) #define KVM_TRC_PEND_INTR (KVM_TRC_HANDLER + 0x04) -- cgit v1.2.3 From 989c0f0ed56468a4aa48711cef5acf122a40d1dd Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Thu, 18 Dec 2008 12:33:18 +0100 Subject: KVM: Remove old kvm_guest_debug structs Remove the remaining arch fragments of the old guest debug interface that now break non-x86 builds. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/ia64/include/asm/kvm_host.h | 2 -- arch/powerpc/include/asm/kvm_host.h | 6 ------ arch/s390/include/asm/kvm_host.h | 3 --- 3 files changed, 11 deletions(-) diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index 7da0c0963226..46f8b0eb5684 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h @@ -239,8 +239,6 @@ struct kvm_vm_data { struct kvm; struct kvm_vcpu; -struct kvm_guest_debug{ -}; struct kvm_mmio_req { uint64_t addr; /* physical address */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index c1e436fe7738..a22600537a8c 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -200,10 +200,4 @@ struct kvm_vcpu_arch { unsigned long pending_exceptions; }; -struct kvm_guest_debug { - int enabled; - unsigned long bp[4]; - int singlestep; -}; - #endif /* __POWERPC_KVM_HOST_H__ */ diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 3c55e4107dcc..c6e674f5fca9 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -21,9 +21,6 @@ /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 4 -struct kvm_guest_debug { -}; - struct sca_entry { atomic_t scn; __u64 reserved; -- cgit v1.2.3 From 22ccb14203d59a8bcf6f3fea76b3594d710569fa Mon Sep 17 00:00:00 2001 From: Xiantao Zhang Date: Thu, 18 Dec 2008 10:23:58 +0800 Subject: KVM: ia64: Code cleanup Remove some unnecessary blank lines to accord with Kernel's coding style. Also remove vcpu_get_itir_on_fault due to no reference to it. Signed-off-by: Xiantao Zhang Signed-off-by: Avi Kivity --- arch/ia64/kvm/process.c | 15 --------------- arch/ia64/kvm/vcpu.c | 39 ++------------------------------------- arch/ia64/kvm/vtlb.c | 5 ----- 3 files changed, 2 insertions(+), 57 deletions(-) diff --git a/arch/ia64/kvm/process.c b/arch/ia64/kvm/process.c index 230eae482f32..f9c9504144fa 100644 --- a/arch/ia64/kvm/process.c +++ b/arch/ia64/kvm/process.c @@ -167,7 +167,6 @@ static u64 vcpu_get_itir_on_fault(struct kvm_vcpu *vcpu, u64 ifa) return (rr1.val); } - /* * Set vIFA & vITIR & vIHA, when vPSR.ic =1 * Parameter: @@ -222,8 +221,6 @@ void itlb_fault(struct kvm_vcpu *vcpu, u64 vadr) inject_guest_interruption(vcpu, IA64_INST_TLB_VECTOR); } - - /* * Data Nested TLB Fault * @ Data Nested TLB Vector @@ -245,7 +242,6 @@ void alt_dtlb(struct kvm_vcpu *vcpu, u64 vadr) inject_guest_interruption(vcpu, IA64_ALT_DATA_TLB_VECTOR); } - /* * Data TLB Fault * @ Data TLB vector @@ -265,8 +261,6 @@ static void _vhpt_fault(struct kvm_vcpu *vcpu, u64 vadr) /* If vPSR.ic, IFA, ITIR, IHA*/ set_ifa_itir_iha(vcpu, vadr, 1, 1, 1); inject_guest_interruption(vcpu, IA64_VHPT_TRANS_VECTOR); - - } /* @@ -279,7 +273,6 @@ void ivhpt_fault(struct kvm_vcpu *vcpu, u64 vadr) _vhpt_fault(vcpu, vadr); } - /* * VHPT Data Fault * @ VHPT Translation vector @@ -290,8 +283,6 @@ void dvhpt_fault(struct kvm_vcpu *vcpu, u64 vadr) _vhpt_fault(vcpu, vadr); } - - /* * Deal with: * General Exception vector @@ -301,7 +292,6 @@ void _general_exception(struct kvm_vcpu *vcpu) inject_guest_interruption(vcpu, IA64_GENEX_VECTOR); } - /* * Illegal Operation Fault * @ General Exception Vector @@ -419,19 +409,16 @@ static void __page_not_present(struct kvm_vcpu *vcpu, u64 vadr) inject_guest_interruption(vcpu, IA64_PAGE_NOT_PRESENT_VECTOR); } - void data_page_not_present(struct kvm_vcpu *vcpu, u64 vadr) { __page_not_present(vcpu, vadr); } - void inst_page_not_present(struct kvm_vcpu *vcpu, u64 vadr) { __page_not_present(vcpu, vadr); } - /* Deal with * Data access rights vector */ @@ -703,7 +690,6 @@ void vhpi_detection(struct kvm_vcpu *vcpu) } } - void leave_hypervisor_tail(void) { struct kvm_vcpu *v = current_vcpu; @@ -737,7 +723,6 @@ void leave_hypervisor_tail(void) } } - static inline void handle_lds(struct kvm_pt_regs *regs) { regs->cr_ipsr |= IA64_PSR_ED; diff --git a/arch/ia64/kvm/vcpu.c b/arch/ia64/kvm/vcpu.c index ecd526b55323..4d8be4c252fa 100644 --- a/arch/ia64/kvm/vcpu.c +++ b/arch/ia64/kvm/vcpu.c @@ -112,7 +112,6 @@ void switch_to_physical_rid(struct kvm_vcpu *vcpu) return; } - void switch_to_virtual_rid(struct kvm_vcpu *vcpu) { unsigned long psr; @@ -166,8 +165,6 @@ void switch_mm_mode(struct kvm_vcpu *vcpu, struct ia64_psr old_psr, return; } - - /* * In physical mode, insert tc/tr for region 0 and 4 uses * RID[0] and RID[4] which is for physical mode emulation. @@ -269,7 +266,6 @@ static inline unsigned long fph_index(struct kvm_pt_regs *regs, return rotate_reg(96, rrb_fr, (regnum - IA64_FIRST_ROTATING_FR)); } - /* * The inverse of the above: given bspstore and the number of * registers, calculate ar.bsp. @@ -1039,8 +1035,6 @@ u64 vcpu_tak(struct kvm_vcpu *vcpu, u64 vadr) return key; } - - void kvm_thash(struct kvm_vcpu *vcpu, INST64 inst) { unsigned long thash, vadr; @@ -1050,7 +1044,6 @@ void kvm_thash(struct kvm_vcpu *vcpu, INST64 inst) vcpu_set_gr(vcpu, inst.M46.r1, thash, 0); } - void kvm_ttag(struct kvm_vcpu *vcpu, INST64 inst) { unsigned long tag, vadr; @@ -1131,7 +1124,6 @@ int vcpu_tpa(struct kvm_vcpu *vcpu, u64 vadr, u64 *padr) return IA64_NO_FAULT; } - int kvm_tpa(struct kvm_vcpu *vcpu, INST64 inst) { unsigned long r1, r3; @@ -1154,7 +1146,6 @@ void kvm_tak(struct kvm_vcpu *vcpu, INST64 inst) vcpu_set_gr(vcpu, inst.M46.r1, r1, 0); } - /************************************ * Insert/Purge translation register/cache ************************************/ @@ -1385,7 +1376,6 @@ void kvm_mov_to_ar_reg(struct kvm_vcpu *vcpu, INST64 inst) vcpu_set_itc(vcpu, r2); } - void kvm_mov_from_ar_reg(struct kvm_vcpu *vcpu, INST64 inst) { unsigned long r1; @@ -1393,8 +1383,9 @@ void kvm_mov_from_ar_reg(struct kvm_vcpu *vcpu, INST64 inst) r1 = vcpu_get_itc(vcpu); vcpu_set_gr(vcpu, inst.M31.r1, r1, 0); } + /************************************************************************** - struct kvm_vcpu*protection key register access routines + struct kvm_vcpu protection key register access routines **************************************************************************/ unsigned long vcpu_get_pkr(struct kvm_vcpu *vcpu, unsigned long reg) @@ -1407,20 +1398,6 @@ void vcpu_set_pkr(struct kvm_vcpu *vcpu, unsigned long reg, unsigned long val) ia64_set_pkr(reg, val); } - -unsigned long vcpu_get_itir_on_fault(struct kvm_vcpu *vcpu, unsigned long ifa) -{ - union ia64_rr rr, rr1; - - rr.val = vcpu_get_rr(vcpu, ifa); - rr1.val = 0; - rr1.ps = rr.ps; - rr1.rid = rr.rid; - return (rr1.val); -} - - - /******************************** * Moves to privileged registers ********************************/ @@ -1464,8 +1441,6 @@ unsigned long vcpu_set_rr(struct kvm_vcpu *vcpu, unsigned long reg, return (IA64_NO_FAULT); } - - void kvm_mov_to_rr(struct kvm_vcpu *vcpu, INST64 inst) { unsigned long r3, r2; @@ -1510,8 +1485,6 @@ void kvm_mov_to_pkr(struct kvm_vcpu *vcpu, INST64 inst) vcpu_set_pkr(vcpu, r3, r2); } - - void kvm_mov_from_rr(struct kvm_vcpu *vcpu, INST64 inst) { unsigned long r3, r1; @@ -1557,7 +1530,6 @@ void kvm_mov_from_pmc(struct kvm_vcpu *vcpu, INST64 inst) vcpu_set_gr(vcpu, inst.M43.r1, r1, 0); } - unsigned long vcpu_get_cpuid(struct kvm_vcpu *vcpu, unsigned long reg) { /* FIXME: This could get called as a result of a rsvd-reg fault */ @@ -1609,7 +1581,6 @@ unsigned long kvm_mov_to_cr(struct kvm_vcpu *vcpu, INST64 inst) return 0; } - unsigned long kvm_mov_from_cr(struct kvm_vcpu *vcpu, INST64 inst) { unsigned long tgt = inst.M33.r1; @@ -1633,8 +1604,6 @@ unsigned long kvm_mov_from_cr(struct kvm_vcpu *vcpu, INST64 inst) return 0; } - - void vcpu_set_psr(struct kvm_vcpu *vcpu, unsigned long val) { @@ -1776,9 +1745,6 @@ void vcpu_bsw1(struct kvm_vcpu *vcpu) } } - - - void vcpu_rfi(struct kvm_vcpu *vcpu) { unsigned long ifs, psr; @@ -1796,7 +1762,6 @@ void vcpu_rfi(struct kvm_vcpu *vcpu) regs->cr_iip = VCPU(vcpu, iip); } - /* VPSR can't keep track of below bits of guest PSR This function gets guest PSR diff --git a/arch/ia64/kvm/vtlb.c b/arch/ia64/kvm/vtlb.c index 6b6307a3bd55..ac94867f8267 100644 --- a/arch/ia64/kvm/vtlb.c +++ b/arch/ia64/kvm/vtlb.c @@ -509,7 +509,6 @@ void thash_purge_all(struct kvm_vcpu *v) local_flush_tlb_all(); } - /* * Lookup the hash table and its collision chain to find an entry * covering this address rid:va or the entry. @@ -517,7 +516,6 @@ void thash_purge_all(struct kvm_vcpu *v) * INPUT: * in: TLB format for both VHPT & TLB. */ - struct thash_data *vtlb_lookup(struct kvm_vcpu *v, u64 va, int is_data) { struct thash_data *cch; @@ -547,7 +545,6 @@ struct thash_data *vtlb_lookup(struct kvm_vcpu *v, u64 va, int is_data) return NULL; } - /* * Initialize internal control data before service. */ @@ -589,7 +586,6 @@ u64 kvm_gpa_to_mpa(u64 gpa) return (pte >> PAGE_SHIFT << PAGE_SHIFT) | (gpa & ~PAGE_MASK); } - /* * Fetch guest bundle code. * INPUT: @@ -631,7 +627,6 @@ int fetch_code(struct kvm_vcpu *vcpu, u64 gip, IA64_BUNDLE *pbundle) return IA64_NO_FAULT; } - void kvm_init_vhpt(struct kvm_vcpu *v) { v->arch.vhpt.num = VHPT_NUM_ENTRIES; -- cgit v1.2.3 From a770f6f28b1a9287189f3dc8333eb694d9a2f0ab Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 21 Dec 2008 19:20:09 +0200 Subject: KVM: MMU: Inherit a shadow page's guest level count from vcpu setup Instead of "calculating" it on every shadow page allocation, set it once when switching modes, and copy it when allocating pages. This doesn't buy us much, but sets up the stage for inheriting more information related to the mmu setup. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 17 +++++++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0a4dab25a919..28f875f28f58 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -244,6 +244,7 @@ struct kvm_mmu { hpa_t root_hpa; int root_level; int shadow_root_level; + union kvm_mmu_page_role base_role; u64 *pae_root; }; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2d4477c71473..f15023c11fea 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1204,8 +1204,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp; struct hlist_node *node, *tmp; - role.word = 0; - role.glevels = vcpu->arch.mmu.root_level; + role = vcpu->arch.mmu.base_role; role.level = level; role.metaphysical = metaphysical; role.access = access; @@ -2251,17 +2250,23 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) static int init_kvm_softmmu(struct kvm_vcpu *vcpu) { + int r; + ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (!is_paging(vcpu)) - return nonpaging_init_context(vcpu); + r = nonpaging_init_context(vcpu); else if (is_long_mode(vcpu)) - return paging64_init_context(vcpu); + r = paging64_init_context(vcpu); else if (is_pae(vcpu)) - return paging32E_init_context(vcpu); + r = paging32E_init_context(vcpu); else - return paging32_init_context(vcpu); + r = paging32_init_context(vcpu); + + vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level; + + return r; } static int init_kvm_mmu(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 2f0b3d60b2c43aef7cd10169c425c052169c622a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 21 Dec 2008 19:27:36 +0200 Subject: KVM: MMU: Segregate mmu pages created with different cr4.pge settings Don't allow a vcpu with cr4.pge cleared to use a shadow page created with cr4.pge set; this might cause a cr3 switch not to sync ptes that have the global bit set (the global bit has no effect if !cr4.pge). This can only occur on smp with different cr4.pge settings for different vcpus (since a cr4 change will resync the shadow ptes), but there's no cost to being correct here. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 28f875f28f58..c2a01d0513f5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -183,6 +183,7 @@ union kvm_mmu_page_role { unsigned metaphysical:1; unsigned access:3; unsigned invalid:1; + unsigned cr4_pge:1; }; }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2477e87b2f84..873602b5edfd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -364,6 +364,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) } kvm_x86_ops->set_cr4(vcpu, cr4); vcpu->arch.cr4 = cr4; + vcpu->arch.mmu.base_role.cr4_pge = !!(cr4 & X86_CR4_PGE); kvm_mmu_sync_global(vcpu); kvm_mmu_reset_context(vcpu); } -- cgit v1.2.3 From e2078318042569682d0496cfd7bd962a766e82b1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 21 Dec 2008 19:36:59 +0200 Subject: KVM: MMU: Initialize a shadow page's global attribute from cr4.pge If cr4.pge is cleared, we ought to treat any ptes in the page as non-global. This allows us to remove the check from set_spte(). Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f15023c11fea..5f03ec324e35 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -797,7 +797,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, ASSERT(is_empty_shadow_page(sp->spt)); bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); sp->multimapped = 0; - sp->global = 1; sp->parent_pte = parent_pte; --vcpu->kvm->arch.n_free_mmu_pages; return sp; @@ -1241,6 +1240,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word); sp->gfn = gfn; sp->role = role; + sp->global = role.cr4_pge; hlist_add_head(&sp->hash_link, bucket); if (!metaphysical) { if (rmap_write_protect(vcpu->kvm, gfn)) @@ -1668,8 +1668,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, u64 mt_mask = shadow_mt_mask; struct kvm_mmu_page *sp = page_header(__pa(shadow_pte)); - if (!(vcpu->arch.cr4 & X86_CR4_PGE)) - global = 0; if (!global && sp->global) { sp->global = 0; if (sp->unsync) { -- cgit v1.2.3 From 2b3d2a206037b1471de6a6dc51427af034cfdb47 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 23 Dec 2008 19:46:01 +0200 Subject: KVM: Fix vmload and friends misinterpreted as lidt The AMD SVM instruction family all overload the 0f 01 /3 opcode, further multiplexing on the three r/m bits. But the code decided that anything that isn't a vmmcall must be an lidt (which shares the 0f 01 /3 opcode, for the case that mod = 3). Fix by aborting emulation if this isn't a vmmcall. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index d174db7a3370..54fb09889a80 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -1908,11 +1908,16 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 3: /* lidt/vmmcall */ - if (c->modrm_mod == 3 && c->modrm_rm == 1) { - rc = kvm_fix_hypercall(ctxt->vcpu); - if (rc) - goto done; - kvm_emulate_hypercall(ctxt->vcpu); + if (c->modrm_mod == 3) { + switch (c->modrm_rm) { + case 1: + rc = kvm_fix_hypercall(ctxt->vcpu); + if (rc) + goto done; + break; + default: + goto cannot_emulate; + } } else { rc = read_descriptor(ctxt, ops, c->src.ptr, &size, &address, -- cgit v1.2.3 From 971cc3dcbc0e020b82f568e61a47b72be03307dd Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 19 Dec 2008 18:13:54 +0100 Subject: KVM: Advertise guest debug capability per-arch Limit KVM_CAP_SET_GUEST_DEBUG only to those archs (currently x86) that support it. This simplifies user space stub implementations. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- include/linux/kvm.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 28582fcd3f79..11e3e6197c83 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -393,7 +393,9 @@ struct kvm_trace_rec { #ifdef __KVM_HAVE_USER_NMI #define KVM_CAP_USER_NMI 22 #endif +#if defined(CONFIG_X86) #define KVM_CAP_SET_GUEST_DEBUG 23 +#endif /* * ioctls for VM fds -- cgit v1.2.3 From 2d11123a77e54d5cea262c958e8498f4a08bce3d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 25 Dec 2008 14:39:47 +0200 Subject: KVM: MMU: Add for_each_shadow_entry(), a simpler alternative to walk_shadow() Using a for_each loop style removes the need to write callback and nasty casts. Implement the walk_shadow() using the for_each_shadow_entry(). Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 69 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5f03ec324e35..c669f2af1d12 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -150,6 +150,20 @@ struct kvm_shadow_walk { u64 addr, u64 *spte, int level); }; +struct kvm_shadow_walk_iterator { + u64 addr; + hpa_t shadow_addr; + int level; + u64 *sptep; + unsigned index; +}; + +#define for_each_shadow_entry(_vcpu, _addr, _walker) \ + for (shadow_walk_init(&(_walker), _vcpu, _addr); \ + shadow_walk_okay(&(_walker)); \ + shadow_walk_next(&(_walker))) + + struct kvm_unsync_walk { int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk); }; @@ -1254,33 +1268,48 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, return sp; } +static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, + struct kvm_vcpu *vcpu, u64 addr) +{ + iterator->addr = addr; + iterator->shadow_addr = vcpu->arch.mmu.root_hpa; + iterator->level = vcpu->arch.mmu.shadow_root_level; + if (iterator->level == PT32E_ROOT_LEVEL) { + iterator->shadow_addr + = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; + iterator->shadow_addr &= PT64_BASE_ADDR_MASK; + --iterator->level; + if (!iterator->shadow_addr) + iterator->level = 0; + } +} + +static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) +{ + if (iterator->level < PT_PAGE_TABLE_LEVEL) + return false; + iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); + iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; + return true; +} + +static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) +{ + iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK; + --iterator->level; +} + static int walk_shadow(struct kvm_shadow_walk *walker, struct kvm_vcpu *vcpu, u64 addr) { - hpa_t shadow_addr; - int level; + struct kvm_shadow_walk_iterator iterator; int r; - u64 *sptep; - unsigned index; - - shadow_addr = vcpu->arch.mmu.root_hpa; - level = vcpu->arch.mmu.shadow_root_level; - if (level == PT32E_ROOT_LEVEL) { - shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; - shadow_addr &= PT64_BASE_ADDR_MASK; - if (!shadow_addr) - return 1; - --level; - } - while (level >= PT_PAGE_TABLE_LEVEL) { - index = SHADOW_PT_INDEX(addr, level); - sptep = ((u64 *)__va(shadow_addr)) + index; - r = walker->entry(walker, vcpu, addr, sptep, level); + for_each_shadow_entry(vcpu, addr, iterator) { + r = walker->entry(walker, vcpu, addr, + iterator.sptep, iterator.level); if (r) return r; - shadow_addr = *sptep & PT64_BASE_ADDR_MASK; - --level; } return 0; } -- cgit v1.2.3 From 9f652d21c3f887075a33abae85bf53fec64e67b1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 25 Dec 2008 14:54:25 +0200 Subject: KVM: MMU: Use for_each_shadow_entry() in __direct_map() Eliminating a callback and a useless structure. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 83 +++++++++++++++++++----------------------------------- 1 file changed, 29 insertions(+), 54 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c669f2af1d12..a25e1adb5cff 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1846,67 +1846,42 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } -struct direct_shadow_walk { - struct kvm_shadow_walk walker; - pfn_t pfn; - int write; - int largepage; - int pt_write; -}; - -static int direct_map_entry(struct kvm_shadow_walk *_walk, - struct kvm_vcpu *vcpu, - u64 addr, u64 *sptep, int level) +static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, + int largepage, gfn_t gfn, pfn_t pfn) { - struct direct_shadow_walk *walk = - container_of(_walk, struct direct_shadow_walk, walker); + struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; + int pt_write = 0; gfn_t pseudo_gfn; - gfn_t gfn = addr >> PAGE_SHIFT; - - if (level == PT_PAGE_TABLE_LEVEL - || (walk->largepage && level == PT_DIRECTORY_LEVEL)) { - mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL, - 0, walk->write, 1, &walk->pt_write, - walk->largepage, 0, gfn, walk->pfn, false); - ++vcpu->stat.pf_fixed; - return 1; - } - if (*sptep == shadow_trap_nonpresent_pte) { - pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; - sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1, - 1, ACC_ALL, sptep); - if (!sp) { - pgprintk("nonpaging_map: ENOMEM\n"); - kvm_release_pfn_clean(walk->pfn); - return -ENOMEM; + for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { + if (iterator.level == PT_PAGE_TABLE_LEVEL + || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) { + mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, + 0, write, 1, &pt_write, + largepage, 0, gfn, pfn, false); + ++vcpu->stat.pf_fixed; + break; } - set_shadow_pte(sptep, - __pa(sp->spt) - | PT_PRESENT_MASK | PT_WRITABLE_MASK - | shadow_user_mask | shadow_x_mask); - } - return 0; -} + if (*iterator.sptep == shadow_trap_nonpresent_pte) { + pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; + sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, + iterator.level - 1, + 1, ACC_ALL, iterator.sptep); + if (!sp) { + pgprintk("nonpaging_map: ENOMEM\n"); + kvm_release_pfn_clean(pfn); + return -ENOMEM; + } -static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, - int largepage, gfn_t gfn, pfn_t pfn) -{ - int r; - struct direct_shadow_walk walker = { - .walker = { .entry = direct_map_entry, }, - .pfn = pfn, - .largepage = largepage, - .write = write, - .pt_write = 0, - }; - - r = walk_shadow(&walker.walker, vcpu, gfn << PAGE_SHIFT); - if (r < 0) - return r; - return walker.pt_write; + set_shadow_pte(iterator.sptep, + __pa(sp->spt) + | PT_PRESENT_MASK | PT_WRITABLE_MASK + | shadow_user_mask | shadow_x_mask); + } + } + return pt_write; } static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) -- cgit v1.2.3 From e7a04c99b54ad9acb98a56113ec3163bc1039e13 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 25 Dec 2008 15:10:50 +0200 Subject: KVM: MMU: Replace walk_shadow() by for_each_shadow_entry() in fetch() Effectively reverting to the pre walk_shadow() version -- but now with the reusable for_each(). Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 128 ++++++++++++++++++++------------------------- 1 file changed, 58 insertions(+), 70 deletions(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 9fd78b6e17ad..69c7e3311b8a 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -283,91 +283,79 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, /* * Fetch a shadow pte for a specific level in the paging hierarchy. */ -static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw, - struct kvm_vcpu *vcpu, u64 addr, - u64 *sptep, int level) +static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, + struct guest_walker *gw, + int user_fault, int write_fault, int largepage, + int *ptwrite, pfn_t pfn) { - struct shadow_walker *sw = - container_of(_sw, struct shadow_walker, walker); - struct guest_walker *gw = sw->guest_walker; unsigned access = gw->pt_access; struct kvm_mmu_page *shadow_page; - u64 spte; + u64 spte, *sptep; int metaphysical; gfn_t table_gfn; int r; + int level; pt_element_t curr_pte; + struct kvm_shadow_walk_iterator iterator; - if (level == PT_PAGE_TABLE_LEVEL - || (sw->largepage && level == PT_DIRECTORY_LEVEL)) { - mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, - sw->user_fault, sw->write_fault, - gw->ptes[gw->level-1] & PT_DIRTY_MASK, - sw->ptwrite, sw->largepage, - gw->ptes[gw->level-1] & PT_GLOBAL_MASK, - gw->gfn, sw->pfn, false); - sw->sptep = sptep; - return 1; - } - - if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) - return 0; - - if (is_large_pte(*sptep)) { - set_shadow_pte(sptep, shadow_trap_nonpresent_pte); - kvm_flush_remote_tlbs(vcpu->kvm); - rmap_remove(vcpu->kvm, sptep); - } + if (!is_present_pte(gw->ptes[gw->level - 1])) + return NULL; - if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) { - metaphysical = 1; - if (!is_dirty_pte(gw->ptes[level - 1])) - access &= ~ACC_WRITE_MASK; - table_gfn = gpte_to_gfn(gw->ptes[level - 1]); - } else { - metaphysical = 0; - table_gfn = gw->table_gfn[level - 2]; - } - shadow_page = kvm_mmu_get_page(vcpu, table_gfn, (gva_t)addr, level-1, - metaphysical, access, sptep); - if (!metaphysical) { - r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2], - &curr_pte, sizeof(curr_pte)); - if (r || curr_pte != gw->ptes[level - 2]) { - kvm_mmu_put_page(shadow_page, sptep); - kvm_release_pfn_clean(sw->pfn); - sw->sptep = NULL; - return 1; + for_each_shadow_entry(vcpu, addr, iterator) { + level = iterator.level; + sptep = iterator.sptep; + if (level == PT_PAGE_TABLE_LEVEL + || (largepage && level == PT_DIRECTORY_LEVEL)) { + mmu_set_spte(vcpu, sptep, access, + gw->pte_access & access, + user_fault, write_fault, + gw->ptes[gw->level-1] & PT_DIRTY_MASK, + ptwrite, largepage, + gw->ptes[gw->level-1] & PT_GLOBAL_MASK, + gw->gfn, pfn, false); + break; } - } - spte = __pa(shadow_page->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK - | PT_WRITABLE_MASK | PT_USER_MASK; - *sptep = spte; - return 0; -} + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) + continue; -static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, - struct guest_walker *guest_walker, - int user_fault, int write_fault, int largepage, - int *ptwrite, pfn_t pfn) -{ - struct shadow_walker walker = { - .walker = { .entry = FNAME(shadow_walk_entry), }, - .guest_walker = guest_walker, - .user_fault = user_fault, - .write_fault = write_fault, - .largepage = largepage, - .ptwrite = ptwrite, - .pfn = pfn, - }; + if (is_large_pte(*sptep)) { + set_shadow_pte(sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); + rmap_remove(vcpu->kvm, sptep); + } - if (!is_present_pte(guest_walker->ptes[guest_walker->level - 1])) - return NULL; + if (level == PT_DIRECTORY_LEVEL + && gw->level == PT_DIRECTORY_LEVEL) { + metaphysical = 1; + if (!is_dirty_pte(gw->ptes[level - 1])) + access &= ~ACC_WRITE_MASK; + table_gfn = gpte_to_gfn(gw->ptes[level - 1]); + } else { + metaphysical = 0; + table_gfn = gw->table_gfn[level - 2]; + } + shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, + metaphysical, access, sptep); + if (!metaphysical) { + r = kvm_read_guest_atomic(vcpu->kvm, + gw->pte_gpa[level - 2], + &curr_pte, sizeof(curr_pte)); + if (r || curr_pte != gw->ptes[level - 2]) { + kvm_mmu_put_page(shadow_page, sptep); + kvm_release_pfn_clean(pfn); + sptep = NULL; + break; + } + } - walk_shadow(&walker.walker, vcpu, addr); + spte = __pa(shadow_page->spt) + | PT_PRESENT_MASK | PT_ACCESSED_MASK + | PT_WRITABLE_MASK | PT_USER_MASK; + *sptep = spte; + } - return walker.sptep; + return sptep; } /* -- cgit v1.2.3 From a461930bc3cece021f8f89a80dcc1d0691a92b52 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 25 Dec 2008 15:19:00 +0200 Subject: KVM: MMU: Replace walk_shadow() by for_each_shadow_entry() in invlpg() Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 81 ++++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 49 deletions(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 69c7e3311b8a..46b68f941f60 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -25,7 +25,6 @@ #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 - #define shadow_walker shadow_walker64 #define FNAME(name) paging##64_##name #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK @@ -42,7 +41,6 @@ #elif PTTYPE == 32 #define pt_element_t u32 #define guest_walker guest_walker32 - #define shadow_walker shadow_walker32 #define FNAME(name) paging##32_##name #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK @@ -73,18 +71,6 @@ struct guest_walker { u32 error_code; }; -struct shadow_walker { - struct kvm_shadow_walk walker; - struct guest_walker *guest_walker; - int user_fault; - int write_fault; - int largepage; - int *ptwrite; - pfn_t pfn; - u64 *sptep; - gpa_t pte_gpa; -}; - static gfn_t gpte_to_gfn(pt_element_t gpte) { return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; @@ -453,54 +439,52 @@ out_unlock: return 0; } -static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw, - struct kvm_vcpu *vcpu, u64 addr, - u64 *sptep, int level) +static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) { - struct shadow_walker *sw = - container_of(_sw, struct shadow_walker, walker); + struct kvm_shadow_walk_iterator iterator; + pt_element_t gpte; + gpa_t pte_gpa = -1; + int level; + u64 *sptep; + + spin_lock(&vcpu->kvm->mmu_lock); - /* FIXME: properly handle invlpg on large guest pages */ - if (level == PT_PAGE_TABLE_LEVEL || - ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); + for_each_shadow_entry(vcpu, gva, iterator) { + level = iterator.level; + sptep = iterator.sptep; - sw->pte_gpa = (sp->gfn << PAGE_SHIFT); - sw->pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); + /* FIXME: properly handle invlpg on large guest pages */ + if (level == PT_PAGE_TABLE_LEVEL || + ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) { + struct kvm_mmu_page *sp = page_header(__pa(sptep)); - if (is_shadow_present_pte(*sptep)) { - rmap_remove(vcpu->kvm, sptep); - if (is_large_pte(*sptep)) - --vcpu->kvm->stat.lpages; + pte_gpa = (sp->gfn << PAGE_SHIFT); + pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); + + if (is_shadow_present_pte(*sptep)) { + rmap_remove(vcpu->kvm, sptep); + if (is_large_pte(*sptep)) + --vcpu->kvm->stat.lpages; + } + set_shadow_pte(sptep, shadow_trap_nonpresent_pte); + break; } - set_shadow_pte(sptep, shadow_trap_nonpresent_pte); - return 1; - } - if (!is_shadow_present_pte(*sptep)) - return 1; - return 0; -} -static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) -{ - pt_element_t gpte; - struct shadow_walker walker = { - .walker = { .entry = FNAME(shadow_invlpg_entry), }, - .pte_gpa = -1, - }; + if (!is_shadow_present_pte(*sptep)) + break; + } - spin_lock(&vcpu->kvm->mmu_lock); - walk_shadow(&walker.walker, vcpu, gva); spin_unlock(&vcpu->kvm->mmu_lock); - if (walker.pte_gpa == -1) + + if (pte_gpa == -1) return; - if (kvm_read_guest_atomic(vcpu->kvm, walker.pte_gpa, &gpte, + if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, sizeof(pt_element_t))) return; if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) { if (mmu_topup_memory_caches(vcpu)) return; - kvm_mmu_pte_write(vcpu, walker.pte_gpa, (const u8 *)&gpte, + kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte, sizeof(pt_element_t), 0); } } @@ -607,7 +591,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef pt_element_t #undef guest_walker -#undef shadow_walker #undef FNAME #undef PT_BASE_ADDR_MASK #undef PT_INDEX -- cgit v1.2.3 From e8c4a4e8a7cc047dfb3c26b2cbc8599ad3460364 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 25 Dec 2008 15:20:07 +0200 Subject: KVM: MMU: Drop walk_shadow() No longer used. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a25e1adb5cff..15850809b55b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -145,11 +145,6 @@ struct kvm_rmap_desc { struct kvm_rmap_desc *more; }; -struct kvm_shadow_walk { - int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu, - u64 addr, u64 *spte, int level); -}; - struct kvm_shadow_walk_iterator { u64 addr; hpa_t shadow_addr; @@ -1299,21 +1294,6 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) --iterator->level; } -static int walk_shadow(struct kvm_shadow_walk *walker, - struct kvm_vcpu *vcpu, u64 addr) -{ - struct kvm_shadow_walk_iterator iterator; - int r; - - for_each_shadow_entry(vcpu, addr, iterator) { - r = walker->entry(walker, vcpu, addr, - iterator.sptep, iterator.level); - if (r) - return r; - } - return 0; -} - static void kvm_mmu_page_unlink_children(struct kvm *kvm, struct kvm_mmu_page *sp) { -- cgit v1.2.3 From 53f658b3c33616a4997ee254311b335e59063289 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 11 Dec 2008 20:45:05 +0100 Subject: KVM: VMX: initialize TSC offset relative to vm creation time VMX initializes the TSC offset for each vcpu at different times, and also reinitializes it for vcpus other than 0 on APIC SIPI message. This bug causes the TSC's to appear unsynchronized in the guest, even if the host is good. Older Linux kernels don't handle the situation very well, so gettimeofday is likely to go backwards in time: http://www.mail-archive.com/kvm@vger.kernel.org/msg02955.html http://sourceforge.net/tracker/index.php?func=detail&aid=2025534&group_id=180599&atid=893831 Fix it by initializating the offset of each vcpu relative to vm creation time, and moving it from vmx_vcpu_reset to vmx_vcpu_setup, out of the APIC MP init path. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx.c | 19 +++++++++++-------- arch/x86/kvm/x86.c | 2 ++ 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c2a01d0513f5..9efc446b5ac6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -398,6 +398,7 @@ struct kvm_arch{ unsigned long irq_sources_bitmap; unsigned long irq_states[KVM_IOAPIC_NUM_PINS]; + u64 vm_init_tsc; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index cee81c9a6653..3312047664a4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -865,11 +865,8 @@ static u64 guest_read_tsc(void) * writes 'guest_tsc' into guest's timestamp counter "register" * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc */ -static void guest_write_tsc(u64 guest_tsc) +static void guest_write_tsc(u64 guest_tsc, u64 host_tsc) { - u64 host_tsc; - - rdtscll(host_tsc); vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); } @@ -934,6 +931,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_msr_entry *msr; + u64 host_tsc; int ret = 0; switch (msr_index) { @@ -959,7 +957,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_TIME_STAMP_COUNTER: - guest_write_tsc(data); + rdtscll(host_tsc); + guest_write_tsc(data, host_tsc); break; case MSR_P6_PERFCTR0: case MSR_P6_PERFCTR1: @@ -2109,7 +2108,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) { u32 host_sysenter_cs, msr_low, msr_high; u32 junk; - u64 host_pat; + u64 host_pat, tsc_this, tsc_base; unsigned long a; struct descriptor_table dt; int i; @@ -2237,6 +2236,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); + tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; + rdtscll(tsc_this); + if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc) + tsc_base = tsc_this; + + guest_write_tsc(0, tsc_base); return 0; } @@ -2328,8 +2333,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); - guest_write_tsc(0); - /* Special registers */ vmcs_write64(GUEST_IA32_DEBUGCTL, 0); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 873602b5edfd..3b2acfd72d7f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4170,6 +4170,8 @@ struct kvm *kvm_arch_create_vm(void) /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); + rdtscll(kvm->arch.vm_init_tsc); + return kvm; } -- cgit v1.2.3 From 77c2002e7c6f019f59a6f3cc5f8b16b41748dbe1 Mon Sep 17 00:00:00 2001 From: Izik Eidus Date: Mon, 29 Dec 2008 01:42:19 +0200 Subject: KVM: introduce kvm_read_guest_virt, kvm_write_guest_virt This commit change the name of emulator_read_std into kvm_read_guest_virt, and add new function name kvm_write_guest_virt that allow writing into a guest virtual address. Signed-off-by: Izik Eidus Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 4 --- arch/x86/kvm/x86.c | 56 ++++++++++++++++++++++++++++++----------- 2 files changed, 42 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9efc446b5ac6..b74576aec19a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -609,10 +609,6 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu); void fx_init(struct kvm_vcpu *vcpu); -int emulator_read_std(unsigned long addr, - void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu); int emulator_write_emulated(unsigned long addr, const void *val, unsigned int bytes, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3b2acfd72d7f..67f91764e99b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1976,10 +1976,8 @@ static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, return dev; } -int emulator_read_std(unsigned long addr, - void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu) +int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu) { void *data = val; int r = X86EMUL_CONTINUE; @@ -1987,27 +1985,57 @@ int emulator_read_std(unsigned long addr, while (bytes) { gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); unsigned offset = addr & (PAGE_SIZE-1); - unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); + unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; if (gpa == UNMAPPED_GVA) { r = X86EMUL_PROPAGATE_FAULT; goto out; } - ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy); + ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); if (ret < 0) { r = X86EMUL_UNHANDLEABLE; goto out; } - bytes -= tocopy; - data += tocopy; - addr += tocopy; + bytes -= toread; + data += toread; + addr += toread; } out: return r; } -EXPORT_SYMBOL_GPL(emulator_read_std); + +int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu) +{ + void *data = val; + int r = X86EMUL_CONTINUE; + + while (bytes) { + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + unsigned offset = addr & (PAGE_SIZE-1); + unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); + int ret; + + if (gpa == UNMAPPED_GVA) { + r = X86EMUL_PROPAGATE_FAULT; + goto out; + } + ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); + if (ret < 0) { + r = X86EMUL_UNHANDLEABLE; + goto out; + } + + bytes -= towrite; + data += towrite; + addr += towrite; + } +out: + return r; +} + static int emulator_read_emulated(unsigned long addr, void *val, @@ -2029,8 +2057,8 @@ static int emulator_read_emulated(unsigned long addr, if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto mmio; - if (emulator_read_std(addr, val, bytes, vcpu) - == X86EMUL_CONTINUE) + if (kvm_read_guest_virt(addr, val, bytes, vcpu) + == X86EMUL_CONTINUE) return X86EMUL_CONTINUE; if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; @@ -2233,7 +2261,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu); + kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu); printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); @@ -2241,7 +2269,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); static struct x86_emulate_ops emulate_ops = { - .read_std = emulator_read_std, + .read_std = kvm_read_guest_virt, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, .cmpxchg_emulated = emulator_cmpxchg_emulated, -- cgit v1.2.3 From 0f346074403bc109f9569f14b45cb09e83729032 Mon Sep 17 00:00:00 2001 From: Izik Eidus Date: Mon, 29 Dec 2008 01:42:20 +0200 Subject: KVM: remove the vmap usage vmap() on guest pages hides those pages from the Linux mm for an extended (userspace determined) amount of time. Get rid of it. Signed-off-by: Izik Eidus Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 62 ++++++++++------------------------------------- include/linux/kvm_types.h | 3 +-- 2 files changed, 14 insertions(+), 51 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 67f91764e99b..2a4f3a697343 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2371,40 +2371,19 @@ int emulate_instruction(struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(emulate_instruction); -static void free_pio_guest_pages(struct kvm_vcpu *vcpu) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i) - if (vcpu->arch.pio.guest_pages[i]) { - kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]); - vcpu->arch.pio.guest_pages[i] = NULL; - } -} - static int pio_copy_data(struct kvm_vcpu *vcpu) { void *p = vcpu->arch.pio_data; - void *q; + gva_t q = vcpu->arch.pio.guest_gva; unsigned bytes; - int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1; + int ret; - q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE, - PAGE_KERNEL); - if (!q) { - free_pio_guest_pages(vcpu); - return -ENOMEM; - } - q += vcpu->arch.pio.guest_page_offset; bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; if (vcpu->arch.pio.in) - memcpy(q, p, bytes); + ret = kvm_write_guest_virt(q, p, bytes, vcpu); else - memcpy(p, q, bytes); - q -= vcpu->arch.pio.guest_page_offset; - vunmap(q); - free_pio_guest_pages(vcpu); - return 0; + ret = kvm_read_guest_virt(q, p, bytes, vcpu); + return ret; } int complete_pio(struct kvm_vcpu *vcpu) @@ -2515,7 +2494,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->arch.pio.in = in; vcpu->arch.pio.string = 0; vcpu->arch.pio.down = 0; - vcpu->arch.pio.guest_page_offset = 0; vcpu->arch.pio.rep = 0; if (vcpu->run->io.direction == KVM_EXIT_IO_IN) @@ -2543,9 +2521,7 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, gva_t address, int rep, unsigned port) { unsigned now, in_page; - int i, ret = 0; - int nr_pages = 1; - struct page *page; + int ret = 0; struct kvm_io_device *pio_dev; vcpu->run->exit_reason = KVM_EXIT_IO; @@ -2557,7 +2533,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->arch.pio.in = in; vcpu->arch.pio.string = 1; vcpu->arch.pio.down = down; - vcpu->arch.pio.guest_page_offset = offset_in_page(address); vcpu->arch.pio.rep = rep; if (vcpu->run->io.direction == KVM_EXIT_IO_IN) @@ -2577,15 +2552,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, else in_page = offset_in_page(address) + size; now = min(count, (unsigned long)in_page / size); - if (!now) { - /* - * String I/O straddles page boundary. Pin two guest pages - * so that we satisfy atomicity constraints. Do just one - * transaction to avoid complexity. - */ - nr_pages = 2; + if (!now) now = 1; - } if (down) { /* * String I/O in reverse. Yuck. Kill the guest, fix later. @@ -2600,15 +2568,7 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count) kvm_x86_ops->skip_emulated_instruction(vcpu); - for (i = 0; i < nr_pages; ++i) { - page = gva_to_page(vcpu, address + i * PAGE_SIZE); - vcpu->arch.pio.guest_pages[i] = page; - if (!page) { - kvm_inject_gp(vcpu, 0); - free_pio_guest_pages(vcpu); - return 1; - } - } + vcpu->arch.pio.guest_gva = address; pio_dev = vcpu_find_pio_dev(vcpu, port, vcpu->arch.pio.cur_count, @@ -2616,7 +2576,11 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, if (!vcpu->arch.pio.in) { /* string PIO write */ ret = pio_copy_data(vcpu); - if (ret >= 0 && pio_dev) { + if (ret == X86EMUL_PROPAGATE_FAULT) { + kvm_inject_gp(vcpu, 0); + return 1; + } + if (ret == 0 && pio_dev) { pio_string_write(pio_dev, vcpu); complete_pio(vcpu); if (vcpu->arch.pio.count == 0) diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 9b6f395c9625..5f4a18cae26b 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -43,8 +43,7 @@ typedef hfn_t pfn_t; struct kvm_pio_request { unsigned long count; int cur_count; - struct page *guest_pages[2]; - unsigned guest_page_offset; + gva_t guest_gva; int in; int port; int size; -- cgit v1.2.3 From 61a6bd672bda3b9468bf5895c1be085c4e481138 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 29 Dec 2008 17:32:28 +0200 Subject: KVM: Fallback support for MSR_VM_HSAVE_PA Since we advertise MSR_VM_HSAVE_PA, userspace will attempt to read it even on Intel. Implement fake support for this MSR to avoid the warnings. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2a4f3a697343..c3fbe8c55c13 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -742,6 +742,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) break; case MSR_IA32_UCODE_REV: case MSR_IA32_UCODE_WRITE: + case MSR_VM_HSAVE_PA: break; case 0x200 ... 0x2ff: return set_msr_mtrr(vcpu, msr, data); @@ -863,6 +864,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_LASTBRANCHTOIP: case MSR_IA32_LASTINTFROMIP: case MSR_IA32_LASTINTTOIP: + case MSR_VM_HSAVE_PA: data = 0; break; case MSR_MTRRcap: -- cgit v1.2.3 From 52d939a0bf44081bc9f69b4fbdc9e7f416df27c7 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 30 Dec 2008 15:55:06 -0200 Subject: KVM: PIT: provide an option to disable interrupt reinjection Certain clocks (such as TSC) in older 2.6 guests overaccount for lost ticks, causing severe time drift. Interrupt reinjection magnifies the problem. Provide an option to disable it. [avi: allow room for expansion in case we want to disable reinjection of other timers] Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm.h | 5 +++++ arch/x86/kvm/i8254.c | 4 ++++ arch/x86/kvm/i8254.h | 1 + arch/x86/kvm/x86.c | 21 +++++++++++++++++++++ include/linux/kvm.h | 4 ++++ 5 files changed, 35 insertions(+) diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 32eb96c7ca27..54bcf2281526 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -233,4 +233,9 @@ struct kvm_guest_debug_arch { struct kvm_pit_state { struct kvm_pit_channel_state channels[3]; }; + +struct kvm_reinject_control { + __u8 pit_reinject; + __u8 reserved[31]; +}; #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 72bd275a9b5c..528daadeba49 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -201,6 +201,9 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps) if (!atomic_inc_and_test(&pt->pending)) set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); + if (!pt->reinject) + atomic_set(&pt->pending, 1); + if (vcpu0 && waitqueue_active(&vcpu0->wq)) wake_up_interruptible(&vcpu0->wq); @@ -580,6 +583,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) pit_state->irq_ack_notifier.gsi = 0; pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); + pit_state->pit_timer.reinject = true; mutex_unlock(&pit->pit_state.lock); kvm_pit_reset(pit); diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 4178022b97aa..76959c4b500e 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -9,6 +9,7 @@ struct kvm_kpit_timer { s64 period; /* unit: ns */ s64 scheduled; atomic_t pending; + bool reinject; }; struct kvm_kpit_channel_state { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c3fbe8c55c13..a1f14611f4b9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -993,6 +993,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_NOP_IO_DELAY: case KVM_CAP_MP_STATE: case KVM_CAP_SYNC_MMU: + case KVM_CAP_REINJECT_CONTROL: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1728,6 +1729,15 @@ static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) return r; } +static int kvm_vm_ioctl_reinject(struct kvm *kvm, + struct kvm_reinject_control *control) +{ + if (!kvm->arch.vpit) + return -ENXIO; + kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; + return 0; +} + /* * Get (and clear) the dirty memory log for a memory slot. */ @@ -1925,6 +1935,17 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; break; } + case KVM_REINJECT_CONTROL: { + struct kvm_reinject_control control; + r = -EFAULT; + if (copy_from_user(&control, argp, sizeof(control))) + goto out; + r = kvm_vm_ioctl_reinject(kvm, &control); + if (r) + goto out; + r = 0; + break; + } default: ; } diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 11e3e6197c83..ae7a12c77427 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -396,6 +396,9 @@ struct kvm_trace_rec { #if defined(CONFIG_X86) #define KVM_CAP_SET_GUEST_DEBUG 23 #endif +#if defined(CONFIG_X86) +#define KVM_CAP_REINJECT_CONTROL 24 +#endif /* * ioctls for VM fds @@ -429,6 +432,7 @@ struct kvm_trace_rec { struct kvm_assigned_pci_dev) #define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \ struct kvm_assigned_irq) +#define KVM_REINJECT_CONTROL _IO(KVMIO, 0x71) /* * ioctls for vcpu fds -- cgit v1.2.3 From 1c08364c3565242f1e1bd585bc2ce458967941af Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 4 Jan 2009 12:39:07 +0200 Subject: KVM: Move struct kvm_pio_request into x86 kvm_host.h This is an x86 specific stucture and has no business living in common code. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 12 ++++++++++++ include/linux/kvm_types.h | 12 ------------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b74576aec19a..863ea73431ad 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -227,6 +227,18 @@ struct kvm_pv_mmu_op_buffer { char buf[512] __aligned(sizeof(long)); }; +struct kvm_pio_request { + unsigned long count; + int cur_count; + gva_t guest_gva; + int in; + int port; + int size; + int string; + int down; + int rep; +}; + /* * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level * 32-bit). The kvm_mmu structure abstracts the details of the current mmu diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 5f4a18cae26b..2b8318c83e53 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -40,16 +40,4 @@ typedef unsigned long hfn_t; typedef hfn_t pfn_t; -struct kvm_pio_request { - unsigned long count; - int cur_count; - gva_t guest_gva; - int in; - int port; - int size; - int string; - int down; - int rep; -}; - #endif /* __KVM_TYPES_H__ */ -- cgit v1.2.3 From c46fb0211f8b93de45400bff814c0f29335af5fc Mon Sep 17 00:00:00 2001 From: Hollis Blanchard Date: Sat, 3 Jan 2009 16:22:58 -0600 Subject: KVM: ppc: move struct kvmppc_44x_tlbe into 44x-specific header Signed-off-by: Hollis Blanchard Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_44x.h | 7 +++++++ arch/powerpc/include/asm/kvm_host.h | 7 ------- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_44x.h b/arch/powerpc/include/asm/kvm_44x.h index f49031b632ca..d22d39942a92 100644 --- a/arch/powerpc/include/asm/kvm_44x.h +++ b/arch/powerpc/include/asm/kvm_44x.h @@ -28,6 +28,13 @@ * need to find some way of advertising it. */ #define KVM44x_GUEST_TLB_SIZE 64 +struct kvmppc_44x_tlbe { + u32 tid; /* Only the low 8 bits are used. */ + u32 word0; + u32 word1; + u32 word2; +}; + struct kvmppc_44x_shadow_ref { struct page *page; u16 gtlb_index; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index a22600537a8c..65c4d492d722 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -64,13 +64,6 @@ struct kvm_vcpu_stat { u32 halt_wakeup; }; -struct kvmppc_44x_tlbe { - u32 tid; /* Only the low 8 bits are used. */ - u32 word0; - u32 word1; - u32 word2; -}; - enum kvm_exit_types { MMIO_EXITS, DCR_EXITS, -- cgit v1.2.3 From ecc0981ff07cbe7cdf95de20be5b24fee8e49cb5 Mon Sep 17 00:00:00 2001 From: Hollis Blanchard Date: Sat, 3 Jan 2009 16:22:59 -0600 Subject: KVM: ppc: cosmetic changes to mmu hook names Signed-off-by: Hollis Blanchard Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_ppc.h | 5 +++-- arch/powerpc/kvm/44x_tlb.c | 2 +- arch/powerpc/kvm/powerpc.c | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 36d2a50a8487..7ba95d28b837 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -52,13 +52,14 @@ extern int kvmppc_emulate_instruction(struct kvm_run *run, extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu); extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu); +/* Core-specific hooks */ + extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr, u64 asid, u32 flags, u32 max_bytes, unsigned int gtlb_idx); extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode); extern void kvmppc_mmu_switch_pid(struct kvm_vcpu *vcpu, u32 pid); - -/* Core-specific hooks */ +extern void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu); extern struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id); diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c index 9a34b8edb9e2..8f9c09cbb833 100644 --- a/arch/powerpc/kvm/44x_tlb.c +++ b/arch/powerpc/kvm/44x_tlb.c @@ -248,7 +248,7 @@ static void kvmppc_44x_shadow_release(struct kvmppc_vcpu_44x *vcpu_44x, KVMTRACE_1D(STLB_INVAL, &vcpu_44x->vcpu, stlb_index, handler); } -void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu) +void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu); int i; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 7c2ad4017d6a..f30be9ef2314 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -216,7 +216,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { - kvmppc_core_destroy_mmu(vcpu); + kvmppc_mmu_destroy(vcpu); } void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) -- cgit v1.2.3 From 475e7cdd69101939006659a63c2e4a32d5b71389 Mon Sep 17 00:00:00 2001 From: Hollis Blanchard Date: Sat, 3 Jan 2009 16:23:00 -0600 Subject: KVM: ppc: small cosmetic changes to Book E DTLB miss handler Signed-off-by: Hollis Blanchard Signed-off-by: Avi Kivity --- arch/powerpc/kvm/booke.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 35485dd6927e..d196ae619303 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -290,6 +290,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, struct kvmppc_44x_tlbe *gtlbe; unsigned long eaddr = vcpu->arch.fault_dear; int gtlb_index; + gpa_t gpaddr; gfn_t gfn; /* Check the guest TLB. */ @@ -305,8 +306,8 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, } gtlbe = &vcpu_44x->guest_tlb[gtlb_index]; - vcpu->arch.paddr_accessed = tlb_xlate(gtlbe, eaddr); - gfn = vcpu->arch.paddr_accessed >> PAGE_SHIFT; + gpaddr = tlb_xlate(gtlbe, eaddr); + gfn = gpaddr >> PAGE_SHIFT; if (kvm_is_visible_gfn(vcpu->kvm, gfn)) { /* The guest TLB had a mapping, but the shadow TLB @@ -315,13 +316,14 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, * b) the guest used a large mapping which we're faking * Either way, we need to satisfy the fault without * invoking the guest. */ - kvmppc_mmu_map(vcpu, eaddr, vcpu->arch.paddr_accessed, gtlbe->tid, + kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlbe->tid, gtlbe->word2, get_tlb_bytes(gtlbe), gtlb_index); kvmppc_account_exit(vcpu, DTLB_VIRT_MISS_EXITS); r = RESUME_GUEST; } else { /* Guest has mapped and accessed a page which is not * actually RAM. */ + vcpu->arch.paddr_accessed = gpaddr; r = kvmppc_emulate_mmio(run, vcpu); kvmppc_account_exit(vcpu, MMIO_EXITS); } -- cgit v1.2.3 From 58a96214a306fc7fc66105097eea9c4f3bfa35bc Mon Sep 17 00:00:00 2001 From: Hollis Blanchard Date: Sat, 3 Jan 2009 16:23:01 -0600 Subject: KVM: ppc: change kvmppc_mmu_map() parameters Passing just the TLB index will ease an e500 implementation. Signed-off-by: Hollis Blanchard Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_ppc.h | 1 - arch/powerpc/kvm/44x_tlb.c | 15 +++++++-------- arch/powerpc/kvm/booke.c | 6 ++---- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 7ba95d28b837..f661f8ba3ab8 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -55,7 +55,6 @@ extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu); /* Core-specific hooks */ extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr, - u64 asid, u32 flags, u32 max_bytes, unsigned int gtlb_idx); extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode); extern void kvmppc_mmu_switch_pid(struct kvm_vcpu *vcpu, u32 pid); diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c index 8f9c09cbb833..e8ed22f28eae 100644 --- a/arch/powerpc/kvm/44x_tlb.c +++ b/arch/powerpc/kvm/44x_tlb.c @@ -269,15 +269,19 @@ void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) * Caller must ensure that the specified guest TLB entry is safe to insert into * the shadow TLB. */ -void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr, u64 asid, - u32 flags, u32 max_bytes, unsigned int gtlb_index) +void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr, + unsigned int gtlb_index) { struct kvmppc_44x_tlbe stlbe; struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu); + struct kvmppc_44x_tlbe *gtlbe = &vcpu_44x->guest_tlb[gtlb_index]; struct kvmppc_44x_shadow_ref *ref; struct page *new_page; hpa_t hpaddr; gfn_t gfn; + u32 asid = gtlbe->tid; + u32 flags = gtlbe->word2; + u32 max_bytes = get_tlb_bytes(gtlbe); unsigned int victim; /* Select TLB entry to clobber. Indirectly guard against races with the TLB @@ -448,10 +452,8 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws) } if (tlbe_is_host_safe(vcpu, tlbe)) { - u64 asid; gva_t eaddr; gpa_t gpaddr; - u32 flags; u32 bytes; eaddr = get_tlb_eaddr(tlbe); @@ -462,10 +464,7 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws) eaddr &= ~(bytes - 1); gpaddr &= ~(bytes - 1); - asid = (tlbe->word0 & PPC44x_TLB_TS) | tlbe->tid; - flags = tlbe->word2 & 0xffff; - - kvmppc_mmu_map(vcpu, eaddr, gpaddr, asid, flags, bytes, gtlb_index); + kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index); } KVMTRACE_5D(GTLB_WRITE, vcpu, gtlb_index, tlbe->tid, tlbe->word0, diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index d196ae619303..85b9e2fc6c6b 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -316,8 +316,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, * b) the guest used a large mapping which we're faking * Either way, we need to satisfy the fault without * invoking the guest. */ - kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlbe->tid, - gtlbe->word2, get_tlb_bytes(gtlbe), gtlb_index); + kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index); kvmppc_account_exit(vcpu, DTLB_VIRT_MISS_EXITS); r = RESUME_GUEST; } else { @@ -364,8 +363,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, * b) the guest used a large mapping which we're faking * Either way, we need to satisfy the fault without * invoking the guest. */ - kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlbe->tid, - gtlbe->word2, get_tlb_bytes(gtlbe), gtlb_index); + kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index); } else { /* Guest mapped and leaped at non-RAM! */ kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_MACHINE_CHECK); -- cgit v1.2.3 From be8d1cae07d5acf4a61046d7def5eda40f0981e1 Mon Sep 17 00:00:00 2001 From: Hollis Blanchard Date: Sat, 3 Jan 2009 16:23:02 -0600 Subject: KVM: ppc: turn tlb_xlate() into a per-core hook (and give it a better name) Signed-off-by: Hollis Blanchard Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_ppc.h | 2 ++ arch/powerpc/kvm/44x.c | 6 +----- arch/powerpc/kvm/44x_tlb.c | 10 ++++++++++ arch/powerpc/kvm/44x_tlb.h | 7 ------- arch/powerpc/kvm/booke.c | 10 ++-------- 5 files changed, 15 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index f661f8ba3ab8..9fd70aa916d9 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -59,6 +59,8 @@ extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr, extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode); extern void kvmppc_mmu_switch_pid(struct kvm_vcpu *vcpu, u32 pid); extern void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu); +extern gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int gtlb_index, + gva_t eaddr); extern struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id); diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c index a66bec57265a..8383603dd8a4 100644 --- a/arch/powerpc/kvm/44x.c +++ b/arch/powerpc/kvm/44x.c @@ -149,8 +149,6 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu, struct kvm_translation *tr) { - struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu); - struct kvmppc_44x_tlbe *gtlbe; int index; gva_t eaddr; u8 pid; @@ -166,9 +164,7 @@ int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu, return 0; } - gtlbe = &vcpu_44x->guest_tlb[index]; - - tr->physical_address = tlb_xlate(gtlbe, eaddr); + tr->physical_address = kvmppc_mmu_xlate(vcpu, index, eaddr); /* XXX what does "writeable" and "usermode" even mean? */ tr->valid = 1; diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c index e8ed22f28eae..2f14671e8a15 100644 --- a/arch/powerpc/kvm/44x_tlb.c +++ b/arch/powerpc/kvm/44x_tlb.c @@ -208,6 +208,16 @@ int kvmppc_44x_tlb_index(struct kvm_vcpu *vcpu, gva_t eaddr, unsigned int pid, return -1; } +gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int gtlb_index, + gva_t eaddr) +{ + struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu); + struct kvmppc_44x_tlbe *gtlbe = &vcpu_44x->guest_tlb[gtlb_index]; + unsigned int pgmask = get_tlb_bytes(gtlbe) - 1; + + return get_tlb_raddr(gtlbe) | (eaddr & pgmask); +} + int kvmppc_44x_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr) { unsigned int as = !!(vcpu->arch.msr & MSR_IS); diff --git a/arch/powerpc/kvm/44x_tlb.h b/arch/powerpc/kvm/44x_tlb.h index 772191f29e62..05b6f7eef5b6 100644 --- a/arch/powerpc/kvm/44x_tlb.h +++ b/arch/powerpc/kvm/44x_tlb.h @@ -85,11 +85,4 @@ static inline unsigned int get_mmucr_sts(const struct kvm_vcpu *vcpu) return (vcpu->arch.mmucr >> 16) & 0x1; } -static inline gpa_t tlb_xlate(struct kvmppc_44x_tlbe *tlbe, gva_t eaddr) -{ - unsigned int pgmask = get_tlb_bytes(tlbe) - 1; - - return get_tlb_raddr(tlbe) | (eaddr & pgmask); -} - #endif /* __KVM_POWERPC_TLB_H__ */ diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 85b9e2fc6c6b..56d6ed69ed60 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -286,8 +286,6 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, /* XXX move to a 440-specific file. */ case BOOKE_INTERRUPT_DTLB_MISS: { - struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu); - struct kvmppc_44x_tlbe *gtlbe; unsigned long eaddr = vcpu->arch.fault_dear; int gtlb_index; gpa_t gpaddr; @@ -305,8 +303,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, break; } - gtlbe = &vcpu_44x->guest_tlb[gtlb_index]; - gpaddr = tlb_xlate(gtlbe, eaddr); + gpaddr = kvmppc_mmu_xlate(vcpu, gtlb_index, eaddr); gfn = gpaddr >> PAGE_SHIFT; if (kvm_is_visible_gfn(vcpu->kvm, gfn)) { @@ -332,8 +329,6 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, /* XXX move to a 440-specific file. */ case BOOKE_INTERRUPT_ITLB_MISS: { - struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu); - struct kvmppc_44x_tlbe *gtlbe; unsigned long eaddr = vcpu->arch.pc; gpa_t gpaddr; gfn_t gfn; @@ -352,8 +347,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_account_exit(vcpu, ITLB_VIRT_MISS_EXITS); - gtlbe = &vcpu_44x->guest_tlb[gtlb_index]; - gpaddr = tlb_xlate(gtlbe, eaddr); + gpaddr = kvmppc_mmu_xlate(vcpu, gtlb_index, eaddr); gfn = gpaddr >> PAGE_SHIFT; if (kvm_is_visible_gfn(vcpu->kvm, gfn)) { -- cgit v1.2.3 From fa86b8dda2e0faccefbeda61edc02a50bd588f4f Mon Sep 17 00:00:00 2001 From: Hollis Blanchard Date: Sat, 3 Jan 2009 16:23:03 -0600 Subject: KVM: ppc: rename 44x MMU functions used in booke.c e500 will provide its own implementation of these. Signed-off-by: Hollis Blanchard Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_ppc.h | 2 ++ arch/powerpc/kvm/44x_tlb.c | 4 ++-- arch/powerpc/kvm/44x_tlb.h | 2 -- arch/powerpc/kvm/booke.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 9fd70aa916d9..5e80a20f32b8 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -59,6 +59,8 @@ extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr, extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode); extern void kvmppc_mmu_switch_pid(struct kvm_vcpu *vcpu, u32 pid); extern void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu); +extern int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr); +extern int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr); extern gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int gtlb_index, gva_t eaddr); diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c index 2f14671e8a15..e67b7313ffc3 100644 --- a/arch/powerpc/kvm/44x_tlb.c +++ b/arch/powerpc/kvm/44x_tlb.c @@ -218,14 +218,14 @@ gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int gtlb_index, return get_tlb_raddr(gtlbe) | (eaddr & pgmask); } -int kvmppc_44x_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr) +int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr) { unsigned int as = !!(vcpu->arch.msr & MSR_IS); return kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as); } -int kvmppc_44x_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr) +int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr) { unsigned int as = !!(vcpu->arch.msr & MSR_DS); diff --git a/arch/powerpc/kvm/44x_tlb.h b/arch/powerpc/kvm/44x_tlb.h index 05b6f7eef5b6..a9ff80e51526 100644 --- a/arch/powerpc/kvm/44x_tlb.h +++ b/arch/powerpc/kvm/44x_tlb.h @@ -25,8 +25,6 @@ extern int kvmppc_44x_tlb_index(struct kvm_vcpu *vcpu, gva_t eaddr, unsigned int pid, unsigned int as); -extern int kvmppc_44x_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr); -extern int kvmppc_44x_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr); extern int kvmppc_44x_emul_tlbsx(struct kvm_vcpu *vcpu, u8 rt, u8 ra, u8 rb, u8 rc); diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 56d6ed69ed60..1e692ac16e99 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -292,7 +292,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, gfn_t gfn; /* Check the guest TLB. */ - gtlb_index = kvmppc_44x_dtlb_index(vcpu, eaddr); + gtlb_index = kvmppc_mmu_dtlb_index(vcpu, eaddr); if (gtlb_index < 0) { /* The guest didn't have a mapping for it. */ kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DTLB_MISS); @@ -337,7 +337,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, r = RESUME_GUEST; /* Check the guest TLB. */ - gtlb_index = kvmppc_44x_itlb_index(vcpu, eaddr); + gtlb_index = kvmppc_mmu_itlb_index(vcpu, eaddr); if (gtlb_index < 0) { /* The guest didn't have a mapping for it. */ kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ITLB_MISS); -- cgit v1.2.3 From f44353610b584fcbc31e363f35594796c6446d63 Mon Sep 17 00:00:00 2001 From: Hollis Blanchard Date: Sat, 3 Jan 2009 16:23:04 -0600 Subject: KVM: ppc: remove last 44x-specific bits from booke.c Signed-off-by: Hollis Blanchard Signed-off-by: Avi Kivity --- arch/powerpc/kvm/booke.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 1e692ac16e99..a73b3959e41c 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -30,10 +30,8 @@ #include #include "timing.h" #include -#include #include "booke.h" -#include "44x_tlb.h" unsigned long kvmppc_booke_handlers; @@ -284,7 +282,6 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, r = RESUME_GUEST; break; - /* XXX move to a 440-specific file. */ case BOOKE_INTERRUPT_DTLB_MISS: { unsigned long eaddr = vcpu->arch.fault_dear; int gtlb_index; @@ -327,7 +324,6 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, break; } - /* XXX move to a 440-specific file. */ case BOOKE_INTERRUPT_ITLB_MISS: { unsigned long eaddr = vcpu->arch.pc; gpa_t gpaddr; -- cgit v1.2.3 From cea5d8c9de669e30ed6d60930318376d5cc42e9e Mon Sep 17 00:00:00 2001 From: Hollis Blanchard Date: Sat, 3 Jan 2009 16:23:05 -0600 Subject: KVM: ppc: use macros instead of hardcoded literals for instruction decoding Signed-off-by: Hollis Blanchard Signed-off-by: Avi Kivity --- arch/powerpc/kvm/emulate.c | 93 +++++++++++++++++++++++++++++++--------------- 1 file changed, 63 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index d1d38daa93fb..a561d6e8da1c 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -30,6 +30,39 @@ #include #include "timing.h" +#define OP_TRAP 3 + +#define OP_31_XOP_LWZX 23 +#define OP_31_XOP_LBZX 87 +#define OP_31_XOP_STWX 151 +#define OP_31_XOP_STBX 215 +#define OP_31_XOP_STBUX 247 +#define OP_31_XOP_LHZX 279 +#define OP_31_XOP_LHZUX 311 +#define OP_31_XOP_MFSPR 339 +#define OP_31_XOP_STHX 407 +#define OP_31_XOP_STHUX 439 +#define OP_31_XOP_MTSPR 467 +#define OP_31_XOP_DCBI 470 +#define OP_31_XOP_LWBRX 534 +#define OP_31_XOP_TLBSYNC 566 +#define OP_31_XOP_STWBRX 662 +#define OP_31_XOP_LHBRX 790 +#define OP_31_XOP_STHBRX 918 + +#define OP_LWZ 32 +#define OP_LWZU 33 +#define OP_LBZ 34 +#define OP_LBZU 35 +#define OP_STW 36 +#define OP_STWU 37 +#define OP_STB 38 +#define OP_STBU 39 +#define OP_LHZ 40 +#define OP_LHZU 41 +#define OP_STH 44 +#define OP_STHU 45 + void kvmppc_emulate_dec(struct kvm_vcpu *vcpu) { if (vcpu->arch.tcr & TCR_DIE) { @@ -78,7 +111,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) kvmppc_set_exit_type(vcpu, EMULATED_INST_EXITS); switch (get_op(inst)) { - case 3: /* trap */ + case OP_TRAP: vcpu->arch.esr |= ESR_PTR; kvmppc_core_queue_program(vcpu); advance = 0; @@ -87,31 +120,31 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) case 31: switch (get_xop(inst)) { - case 23: /* lwzx */ + case OP_31_XOP_LWZX: rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); break; - case 87: /* lbzx */ + case OP_31_XOP_LBZX: rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1); break; - case 151: /* stwx */ + case OP_31_XOP_STWX: rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs], 4, 1); break; - case 215: /* stbx */ + case OP_31_XOP_STBX: rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs], 1, 1); break; - case 247: /* stbux */ + case OP_31_XOP_STBUX: rs = get_rs(inst); ra = get_ra(inst); rb = get_rb(inst); @@ -126,12 +159,12 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) vcpu->arch.gpr[rs] = ea; break; - case 279: /* lhzx */ + case OP_31_XOP_LHZX: rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1); break; - case 311: /* lhzux */ + case OP_31_XOP_LHZUX: rt = get_rt(inst); ra = get_ra(inst); rb = get_rb(inst); @@ -144,7 +177,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) vcpu->arch.gpr[ra] = ea; break; - case 339: /* mfspr */ + case OP_31_XOP_MFSPR: sprn = get_sprn(inst); rt = get_rt(inst); @@ -185,7 +218,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) } break; - case 407: /* sthx */ + case OP_31_XOP_STHX: rs = get_rs(inst); ra = get_ra(inst); rb = get_rb(inst); @@ -195,7 +228,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) 2, 1); break; - case 439: /* sthux */ + case OP_31_XOP_STHUX: rs = get_rs(inst); ra = get_ra(inst); rb = get_rb(inst); @@ -210,7 +243,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) vcpu->arch.gpr[ra] = ea; break; - case 467: /* mtspr */ + case OP_31_XOP_MTSPR: sprn = get_sprn(inst); rs = get_rs(inst); switch (sprn) { @@ -246,7 +279,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) } break; - case 470: /* dcbi */ + case OP_31_XOP_DCBI: /* Do nothing. The guest is performing dcbi because * hardware DMA is not snooped by the dcache, but * emulated DMA either goes through the dcache as @@ -254,15 +287,15 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) * coherence. */ break; - case 534: /* lwbrx */ + case OP_31_XOP_LWBRX: rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 4, 0); break; - case 566: /* tlbsync */ + case OP_31_XOP_TLBSYNC: break; - case 662: /* stwbrx */ + case OP_31_XOP_STWBRX: rs = get_rs(inst); ra = get_ra(inst); rb = get_rb(inst); @@ -272,12 +305,12 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) 4, 0); break; - case 790: /* lhbrx */ + case OP_31_XOP_LHBRX: rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 2, 0); break; - case 918: /* sthbrx */ + case OP_31_XOP_STHBRX: rs = get_rs(inst); ra = get_ra(inst); rb = get_rb(inst); @@ -293,37 +326,37 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) } break; - case 32: /* lwz */ + case OP_LWZ: rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); break; - case 33: /* lwzu */ + case OP_LWZU: ra = get_ra(inst); rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed; break; - case 34: /* lbz */ + case OP_LBZ: rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1); break; - case 35: /* lbzu */ + case OP_LBZU: ra = get_ra(inst); rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1); vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed; break; - case 36: /* stw */ + case OP_STW: rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs], 4, 1); break; - case 37: /* stwu */ + case OP_STWU: ra = get_ra(inst); rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs], @@ -331,13 +364,13 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed; break; - case 38: /* stb */ + case OP_STB: rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs], 1, 1); break; - case 39: /* stbu */ + case OP_STBU: ra = get_ra(inst); rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs], @@ -345,25 +378,25 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed; break; - case 40: /* lhz */ + case OP_LHZ: rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1); break; - case 41: /* lhzu */ + case OP_LHZU: ra = get_ra(inst); rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1); vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed; break; - case 44: /* sth */ + case OP_STH: rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs], 2, 1); break; - case 45: /* sthu */ + case OP_STHU: ra = get_ra(inst); rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs], -- cgit v1.2.3 From d0c7dc03442f7cbd8740d9a4e7a4e7f84bfa676d Mon Sep 17 00:00:00 2001 From: Hollis Blanchard Date: Sat, 3 Jan 2009 16:23:06 -0600 Subject: KVM: ppc: split out common Book E instruction emulation The Book E code will be shared with e500. I've left PID in kvmppc_core_emulate_op() just so that we don't need to move kvmppc_set_pid() right now. Once we have the e500 implementation, we can probably share that too. Signed-off-by: Hollis Blanchard Signed-off-by: Avi Kivity --- arch/powerpc/kvm/44x_emulate.c | 217 +++----------------------------- arch/powerpc/kvm/Makefile | 1 + arch/powerpc/kvm/booke.h | 6 + arch/powerpc/kvm/booke_emulate.c | 262 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 284 insertions(+), 202 deletions(-) create mode 100644 arch/powerpc/kvm/booke_emulate.c diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c index 82489a743a6f..61af58fcecee 100644 --- a/arch/powerpc/kvm/44x_emulate.c +++ b/arch/powerpc/kvm/44x_emulate.c @@ -27,25 +27,12 @@ #include "booke.h" #include "44x_tlb.h" -#define OP_RFI 19 - -#define XOP_RFI 50 -#define XOP_MFMSR 83 -#define XOP_WRTEE 131 -#define XOP_MTMSR 146 -#define XOP_WRTEEI 163 #define XOP_MFDCR 323 #define XOP_MTDCR 451 #define XOP_TLBSX 914 #define XOP_ICCCI 966 #define XOP_TLBWE 978 -static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu) -{ - vcpu->arch.pc = vcpu->arch.srr0; - kvmppc_set_msr(vcpu, vcpu->arch.srr1); -} - int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int inst, int *advance) { @@ -59,48 +46,9 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, int ws; switch (get_op(inst)) { - case OP_RFI: - switch (get_xop(inst)) { - case XOP_RFI: - kvmppc_emul_rfi(vcpu); - kvmppc_set_exit_type(vcpu, EMULATED_RFI_EXITS); - *advance = 0; - break; - - default: - emulated = EMULATE_FAIL; - break; - } - break; - case 31: switch (get_xop(inst)) { - case XOP_MFMSR: - rt = get_rt(inst); - vcpu->arch.gpr[rt] = vcpu->arch.msr; - kvmppc_set_exit_type(vcpu, EMULATED_MFMSR_EXITS); - break; - - case XOP_MTMSR: - rs = get_rs(inst); - kvmppc_set_exit_type(vcpu, EMULATED_MTMSR_EXITS); - kvmppc_set_msr(vcpu, vcpu->arch.gpr[rs]); - break; - - case XOP_WRTEE: - rs = get_rs(inst); - vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE) - | (vcpu->arch.gpr[rs] & MSR_EE); - kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS); - break; - - case XOP_WRTEEI: - vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE) - | (inst & MSR_EE); - kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS); - break; - case XOP_MFDCR: dcrn = get_dcrn(inst); rt = get_rt(inst); @@ -186,186 +134,51 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, emulated = EMULATE_FAIL; } + if (emulated == EMULATE_FAIL) + emulated = kvmppc_booke_emulate_op(run, vcpu, inst, advance); + return emulated; } int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) { + int emulated = EMULATE_DONE; + switch (sprn) { - case SPRN_MMUCR: - vcpu->arch.mmucr = vcpu->arch.gpr[rs]; break; case SPRN_PID: kvmppc_set_pid(vcpu, vcpu->arch.gpr[rs]); break; + case SPRN_MMUCR: + vcpu->arch.mmucr = vcpu->arch.gpr[rs]; break; case SPRN_CCR0: vcpu->arch.ccr0 = vcpu->arch.gpr[rs]; break; case SPRN_CCR1: vcpu->arch.ccr1 = vcpu->arch.gpr[rs]; break; - case SPRN_DEAR: - vcpu->arch.dear = vcpu->arch.gpr[rs]; break; - case SPRN_ESR: - vcpu->arch.esr = vcpu->arch.gpr[rs]; break; - case SPRN_DBCR0: - vcpu->arch.dbcr0 = vcpu->arch.gpr[rs]; break; - case SPRN_DBCR1: - vcpu->arch.dbcr1 = vcpu->arch.gpr[rs]; break; - case SPRN_TSR: - vcpu->arch.tsr &= ~vcpu->arch.gpr[rs]; break; - case SPRN_TCR: - vcpu->arch.tcr = vcpu->arch.gpr[rs]; - kvmppc_emulate_dec(vcpu); - break; - - /* Note: SPRG4-7 are user-readable. These values are - * loaded into the real SPRGs when resuming the - * guest. */ - case SPRN_SPRG4: - vcpu->arch.sprg4 = vcpu->arch.gpr[rs]; break; - case SPRN_SPRG5: - vcpu->arch.sprg5 = vcpu->arch.gpr[rs]; break; - case SPRN_SPRG6: - vcpu->arch.sprg6 = vcpu->arch.gpr[rs]; break; - case SPRN_SPRG7: - vcpu->arch.sprg7 = vcpu->arch.gpr[rs]; break; - - case SPRN_IVPR: - vcpu->arch.ivpr = vcpu->arch.gpr[rs]; - break; - case SPRN_IVOR0: - vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = vcpu->arch.gpr[rs]; - break; - case SPRN_IVOR1: - vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK] = vcpu->arch.gpr[rs]; - break; - case SPRN_IVOR2: - vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = vcpu->arch.gpr[rs]; - break; - case SPRN_IVOR3: - vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = vcpu->arch.gpr[rs]; - break; - case SPRN_IVOR4: - vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL] = vcpu->arch.gpr[rs]; - break; - case SPRN_IVOR5: - vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT] = vcpu->arch.gpr[rs]; - break; - case SPRN_IVOR6: - vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM] = vcpu->arch.gpr[rs]; - break; - case SPRN_IVOR7: - vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL] = vcpu->arch.gpr[rs]; - break; - case SPRN_IVOR8: - vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = vcpu->arch.gpr[rs]; - break; - case SPRN_IVOR9: - vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = vcpu->arch.gpr[rs]; - break; - case SPRN_IVOR10: - vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER] = vcpu->arch.gpr[rs]; - break; - case SPRN_IVOR11: - vcpu->arch.ivor[BOOKE_IRQPRIO_FIT] = vcpu->arch.gpr[rs]; - break; - case SPRN_IVOR12: - vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG] = vcpu->arch.gpr[rs]; - break; - case SPRN_IVOR13: - vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS] = vcpu->arch.gpr[rs]; - break; - case SPRN_IVOR14: - vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS] = vcpu->arch.gpr[rs]; - break; - case SPRN_IVOR15: - vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = vcpu->arch.gpr[rs]; - break; - default: - return EMULATE_FAIL; + emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, rs); } kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS); - return EMULATE_DONE; + return emulated; } int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) { + int emulated = EMULATE_DONE; + switch (sprn) { - /* 440 */ + case SPRN_PID: + vcpu->arch.gpr[rt] = vcpu->arch.pid; break; case SPRN_MMUCR: vcpu->arch.gpr[rt] = vcpu->arch.mmucr; break; case SPRN_CCR0: vcpu->arch.gpr[rt] = vcpu->arch.ccr0; break; case SPRN_CCR1: vcpu->arch.gpr[rt] = vcpu->arch.ccr1; break; - - /* Book E */ - case SPRN_PID: - vcpu->arch.gpr[rt] = vcpu->arch.pid; break; - case SPRN_IVPR: - vcpu->arch.gpr[rt] = vcpu->arch.ivpr; break; - case SPRN_DEAR: - vcpu->arch.gpr[rt] = vcpu->arch.dear; break; - case SPRN_ESR: - vcpu->arch.gpr[rt] = vcpu->arch.esr; break; - case SPRN_DBCR0: - vcpu->arch.gpr[rt] = vcpu->arch.dbcr0; break; - case SPRN_DBCR1: - vcpu->arch.gpr[rt] = vcpu->arch.dbcr1; break; - - case SPRN_IVOR0: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL]; - break; - case SPRN_IVOR1: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK]; - break; - case SPRN_IVOR2: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]; - break; - case SPRN_IVOR3: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE]; - break; - case SPRN_IVOR4: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL]; - break; - case SPRN_IVOR5: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT]; - break; - case SPRN_IVOR6: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM]; - break; - case SPRN_IVOR7: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL]; - break; - case SPRN_IVOR8: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]; - break; - case SPRN_IVOR9: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL]; - break; - case SPRN_IVOR10: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER]; - break; - case SPRN_IVOR11: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_FIT]; - break; - case SPRN_IVOR12: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG]; - break; - case SPRN_IVOR13: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS]; - break; - case SPRN_IVOR14: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS]; - break; - case SPRN_IVOR15: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]; - break; - default: - return EMULATE_FAIL; + emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, rt); } kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS); - return EMULATE_DONE; + return emulated; } diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index df7ba59e6d53..3ef5261828e2 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -16,6 +16,7 @@ AFLAGS_booke_interrupts.o := -I$(obj) kvm-440-objs := \ booke.o \ + booke_emulate.o \ booke_interrupts.o \ 44x.o \ 44x_tlb.o \ diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h index cf7c94ca24bf..311fdbc23fbe 100644 --- a/arch/powerpc/kvm/booke.h +++ b/arch/powerpc/kvm/booke.h @@ -22,6 +22,7 @@ #include #include +#include #include "timing.h" /* interrupt priortity ordering */ @@ -57,4 +58,9 @@ static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) }; } +int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance); +int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt); +int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs); + #endif /* __KVM_BOOKE_H__ */ diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c new file mode 100644 index 000000000000..8aa78f1a1f87 --- /dev/null +++ b/arch/powerpc/kvm/booke_emulate.c @@ -0,0 +1,262 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright IBM Corp. 2008 + * + * Authors: Hollis Blanchard + */ + +#include +#include + +#include "booke.h" + +#define OP_19_XOP_RFI 50 + +#define OP_31_XOP_MFMSR 83 +#define OP_31_XOP_WRTEE 131 +#define OP_31_XOP_MTMSR 146 +#define OP_31_XOP_WRTEEI 163 + +static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu) +{ + vcpu->arch.pc = vcpu->arch.srr0; + kvmppc_set_msr(vcpu, vcpu->arch.srr1); +} + +int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) +{ + int emulated = EMULATE_DONE; + int rs; + int rt; + + switch (get_op(inst)) { + case 19: + switch (get_xop(inst)) { + case OP_19_XOP_RFI: + kvmppc_emul_rfi(vcpu); + kvmppc_set_exit_type(vcpu, EMULATED_RFI_EXITS); + *advance = 0; + break; + + default: + emulated = EMULATE_FAIL; + break; + } + break; + + case 31: + switch (get_xop(inst)) { + + case OP_31_XOP_MFMSR: + rt = get_rt(inst); + vcpu->arch.gpr[rt] = vcpu->arch.msr; + kvmppc_set_exit_type(vcpu, EMULATED_MFMSR_EXITS); + break; + + case OP_31_XOP_MTMSR: + rs = get_rs(inst); + kvmppc_set_exit_type(vcpu, EMULATED_MTMSR_EXITS); + kvmppc_set_msr(vcpu, vcpu->arch.gpr[rs]); + break; + + case OP_31_XOP_WRTEE: + rs = get_rs(inst); + vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE) + | (vcpu->arch.gpr[rs] & MSR_EE); + kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS); + break; + + case OP_31_XOP_WRTEEI: + vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE) + | (inst & MSR_EE); + kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS); + break; + + default: + emulated = EMULATE_FAIL; + } + + break; + + default: + emulated = EMULATE_FAIL; + } + + return emulated; +} + +int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) +{ + int emulated = EMULATE_DONE; + + switch (sprn) { + case SPRN_DEAR: + vcpu->arch.dear = vcpu->arch.gpr[rs]; break; + case SPRN_ESR: + vcpu->arch.esr = vcpu->arch.gpr[rs]; break; + case SPRN_DBCR0: + vcpu->arch.dbcr0 = vcpu->arch.gpr[rs]; break; + case SPRN_DBCR1: + vcpu->arch.dbcr1 = vcpu->arch.gpr[rs]; break; + case SPRN_TSR: + vcpu->arch.tsr &= ~vcpu->arch.gpr[rs]; break; + case SPRN_TCR: + vcpu->arch.tcr = vcpu->arch.gpr[rs]; + kvmppc_emulate_dec(vcpu); + break; + + /* Note: SPRG4-7 are user-readable. These values are + * loaded into the real SPRGs when resuming the + * guest. */ + case SPRN_SPRG4: + vcpu->arch.sprg4 = vcpu->arch.gpr[rs]; break; + case SPRN_SPRG5: + vcpu->arch.sprg5 = vcpu->arch.gpr[rs]; break; + case SPRN_SPRG6: + vcpu->arch.sprg6 = vcpu->arch.gpr[rs]; break; + case SPRN_SPRG7: + vcpu->arch.sprg7 = vcpu->arch.gpr[rs]; break; + + case SPRN_IVPR: + vcpu->arch.ivpr = vcpu->arch.gpr[rs]; + break; + case SPRN_IVOR0: + vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = vcpu->arch.gpr[rs]; + break; + case SPRN_IVOR1: + vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK] = vcpu->arch.gpr[rs]; + break; + case SPRN_IVOR2: + vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = vcpu->arch.gpr[rs]; + break; + case SPRN_IVOR3: + vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = vcpu->arch.gpr[rs]; + break; + case SPRN_IVOR4: + vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL] = vcpu->arch.gpr[rs]; + break; + case SPRN_IVOR5: + vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT] = vcpu->arch.gpr[rs]; + break; + case SPRN_IVOR6: + vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM] = vcpu->arch.gpr[rs]; + break; + case SPRN_IVOR7: + vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL] = vcpu->arch.gpr[rs]; + break; + case SPRN_IVOR8: + vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = vcpu->arch.gpr[rs]; + break; + case SPRN_IVOR9: + vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = vcpu->arch.gpr[rs]; + break; + case SPRN_IVOR10: + vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER] = vcpu->arch.gpr[rs]; + break; + case SPRN_IVOR11: + vcpu->arch.ivor[BOOKE_IRQPRIO_FIT] = vcpu->arch.gpr[rs]; + break; + case SPRN_IVOR12: + vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG] = vcpu->arch.gpr[rs]; + break; + case SPRN_IVOR13: + vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS] = vcpu->arch.gpr[rs]; + break; + case SPRN_IVOR14: + vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS] = vcpu->arch.gpr[rs]; + break; + case SPRN_IVOR15: + vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = vcpu->arch.gpr[rs]; + break; + + default: + emulated = EMULATE_FAIL; + } + + return emulated; +} + +int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) +{ + int emulated = EMULATE_DONE; + + switch (sprn) { + case SPRN_IVPR: + vcpu->arch.gpr[rt] = vcpu->arch.ivpr; break; + case SPRN_DEAR: + vcpu->arch.gpr[rt] = vcpu->arch.dear; break; + case SPRN_ESR: + vcpu->arch.gpr[rt] = vcpu->arch.esr; break; + case SPRN_DBCR0: + vcpu->arch.gpr[rt] = vcpu->arch.dbcr0; break; + case SPRN_DBCR1: + vcpu->arch.gpr[rt] = vcpu->arch.dbcr1; break; + + case SPRN_IVOR0: + vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL]; + break; + case SPRN_IVOR1: + vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK]; + break; + case SPRN_IVOR2: + vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]; + break; + case SPRN_IVOR3: + vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE]; + break; + case SPRN_IVOR4: + vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL]; + break; + case SPRN_IVOR5: + vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT]; + break; + case SPRN_IVOR6: + vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM]; + break; + case SPRN_IVOR7: + vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL]; + break; + case SPRN_IVOR8: + vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]; + break; + case SPRN_IVOR9: + vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL]; + break; + case SPRN_IVOR10: + vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER]; + break; + case SPRN_IVOR11: + vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_FIT]; + break; + case SPRN_IVOR12: + vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG]; + break; + case SPRN_IVOR13: + vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS]; + break; + case SPRN_IVOR14: + vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS]; + break; + case SPRN_IVOR15: + vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]; + break; + + default: + emulated = EMULATE_FAIL; + } + + return emulated; +} -- cgit v1.2.3 From f7b200af8f1da748cc67993caeff3923d6014070 Mon Sep 17 00:00:00 2001 From: Hollis Blanchard Date: Sat, 3 Jan 2009 16:23:07 -0600 Subject: KVM: ppc: Add dbsr in kvm_vcpu_arch Kernel for E500 need clear dbsr when startup. So add dbsr register in kvm_vcpu_arch for BOOKE. Signed-off-by: Liu Yu Signed-off-by: Hollis Blanchard Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kvm/booke_emulate.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 65c4d492d722..50e5ce1b400a 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -163,6 +163,7 @@ struct kvm_vcpu_arch { u32 ccr1; u32 dbcr0; u32 dbcr1; + u32 dbsr; #ifdef CONFIG_KVM_EXIT_TIMING struct kvmppc_exit_timing timing_exit; diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c index 8aa78f1a1f87..aebc65e93f4b 100644 --- a/arch/powerpc/kvm/booke_emulate.c +++ b/arch/powerpc/kvm/booke_emulate.c @@ -111,6 +111,8 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) vcpu->arch.dbcr0 = vcpu->arch.gpr[rs]; break; case SPRN_DBCR1: vcpu->arch.dbcr1 = vcpu->arch.gpr[rs]; break; + case SPRN_DBSR: + vcpu->arch.dbsr &= ~vcpu->arch.gpr[rs]; break; case SPRN_TSR: vcpu->arch.tsr &= ~vcpu->arch.gpr[rs]; break; case SPRN_TCR: @@ -204,6 +206,8 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) vcpu->arch.gpr[rt] = vcpu->arch.dbcr0; break; case SPRN_DBCR1: vcpu->arch.gpr[rt] = vcpu->arch.dbcr1; break; + case SPRN_DBSR: + vcpu->arch.gpr[rt] = vcpu->arch.dbsr; break; case SPRN_IVOR0: vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL]; -- cgit v1.2.3 From 366d4b9b9fdda282006b97316f8038cd36f8ecb7 Mon Sep 17 00:00:00 2001 From: Hollis Blanchard Date: Sat, 3 Jan 2009 16:23:08 -0600 Subject: KVM: ppc: No need to include core-header for KVM in asm-offsets.c currently Signed-off-by: Liu Yu Signed-off-by: Hollis Blanchard Signed-off-by: Avi Kivity --- arch/powerpc/kernel/asm-offsets.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 19ee491e9e23..42fe4da4e8ae 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -49,7 +49,7 @@ #include #endif #ifdef CONFIG_KVM -#include +#include #endif #if defined(CONFIG_BOOKE) || defined(CONFIG_40x) @@ -361,8 +361,6 @@ int main(void) DEFINE(PTE_SIZE, sizeof(pte_t)); #ifdef CONFIG_KVM - DEFINE(TLBE_BYTES, sizeof(struct kvmppc_44x_tlbe)); - DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack)); DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid)); DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr)); -- cgit v1.2.3 From 17c885eb5c38f39a2bb3143a67687bd905dcd0fa Mon Sep 17 00:00:00 2001 From: Hollis Blanchard Date: Sat, 3 Jan 2009 16:23:09 -0600 Subject: KVM: ppc: ifdef iccci with CONFIG_44x E500 deosn't support this instruction. Signed-off-by: Liu Yu Signed-off-by: Hollis Blanchard Signed-off-by: Avi Kivity --- arch/powerpc/kvm/booke_interrupts.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S index 084ebcd7dd83..4679ec287e62 100644 --- a/arch/powerpc/kvm/booke_interrupts.S +++ b/arch/powerpc/kvm/booke_interrupts.S @@ -347,7 +347,9 @@ lightweight_exit: lwz r3, VCPU_SHADOW_PID(r4) mtspr SPRN_PID, r3 +#ifdef CONFIG_44x iccci 0, 0 /* XXX hack */ +#endif /* Load some guest volatiles. */ lwz r0, VCPU_GPR(r0)(r4) -- cgit v1.2.3 From bc8080cbcc8870178f0910edd537d0cb5706d703 Mon Sep 17 00:00:00 2001 From: Hollis Blanchard Date: Sat, 3 Jan 2009 16:23:10 -0600 Subject: KVM: ppc: E500 core-specific code Signed-off-by: Liu Yu Signed-off-by: Hollis Blanchard Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_e500.h | 67 ++++ arch/powerpc/kvm/Kconfig | 13 + arch/powerpc/kvm/Makefile | 9 + arch/powerpc/kvm/e500.c | 151 ++++++++ arch/powerpc/kvm/e500_emulate.c | 169 +++++++++ arch/powerpc/kvm/e500_tlb.c | 737 ++++++++++++++++++++++++++++++++++++ arch/powerpc/kvm/e500_tlb.h | 184 +++++++++ 7 files changed, 1330 insertions(+) create mode 100644 arch/powerpc/include/asm/kvm_e500.h create mode 100644 arch/powerpc/kvm/e500.c create mode 100644 arch/powerpc/kvm/e500_emulate.c create mode 100644 arch/powerpc/kvm/e500_tlb.c create mode 100644 arch/powerpc/kvm/e500_tlb.h diff --git a/arch/powerpc/include/asm/kvm_e500.h b/arch/powerpc/include/asm/kvm_e500.h new file mode 100644 index 000000000000..9d497ce49726 --- /dev/null +++ b/arch/powerpc/include/asm/kvm_e500.h @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved. + * + * Author: Yu Liu, + * + * Description: + * This file is derived from arch/powerpc/include/asm/kvm_44x.h, + * by Hollis Blanchard . + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#ifndef __ASM_KVM_E500_H__ +#define __ASM_KVM_E500_H__ + +#include + +#define BOOKE_INTERRUPT_SIZE 36 + +#define E500_PID_NUM 3 +#define E500_TLB_NUM 2 + +struct tlbe{ + u32 mas1; + u32 mas2; + u32 mas3; + u32 mas7; +}; + +struct kvmppc_vcpu_e500 { + /* Unmodified copy of the guest's TLB. */ + struct tlbe *guest_tlb[E500_TLB_NUM]; + /* TLB that's actually used when the guest is running. */ + struct tlbe *shadow_tlb[E500_TLB_NUM]; + /* Pages which are referenced in the shadow TLB. */ + struct page **shadow_pages[E500_TLB_NUM]; + + unsigned int guest_tlb_size[E500_TLB_NUM]; + unsigned int shadow_tlb_size[E500_TLB_NUM]; + unsigned int guest_tlb_nv[E500_TLB_NUM]; + + u32 host_pid[E500_PID_NUM]; + u32 pid[E500_PID_NUM]; + + u32 mas0; + u32 mas1; + u32 mas2; + u32 mas3; + u32 mas4; + u32 mas5; + u32 mas6; + u32 mas7; + u32 l1csr1; + u32 hid0; + u32 hid1; + + struct kvm_vcpu vcpu; +}; + +static inline struct kvmppc_vcpu_e500 *to_e500(struct kvm_vcpu *vcpu) +{ + return container_of(vcpu, struct kvmppc_vcpu_e500, vcpu); +} + +#endif /* __ASM_KVM_E500_H__ */ diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 6dbdc4817d80..146570550142 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -43,6 +43,19 @@ config KVM_EXIT_TIMING If unsure, say N. +config KVM_E500 + bool "KVM support for PowerPC E500 processors" + depends on EXPERIMENTAL && E500 + select KVM + ---help--- + Support running unmodified E500 guest kernels in virtual machines on + E500 host processors. + + This module provides access to the hardware capabilities through + a character device node named /dev/kvm. + + If unsure, say N. + config KVM_TRACE bool "KVM trace support" depends on KVM && MARKERS && SYSFS diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 3ef5261828e2..4b2df66c79d8 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -22,3 +22,12 @@ kvm-440-objs := \ 44x_tlb.o \ 44x_emulate.o obj-$(CONFIG_KVM_440) += kvm-440.o + +kvm-e500-objs := \ + booke.o \ + booke_emulate.o \ + booke_interrupts.o \ + e500.o \ + e500_tlb.o \ + e500_emulate.o +obj-$(CONFIG_KVM_E500) += kvm-e500.o diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c new file mode 100644 index 000000000000..7992da497cd4 --- /dev/null +++ b/arch/powerpc/kvm/e500.c @@ -0,0 +1,151 @@ +/* + * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved. + * + * Author: Yu Liu, + * + * Description: + * This file is derived from arch/powerpc/kvm/44x.c, + * by Hollis Blanchard . + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include "e500_tlb.h" + +void kvmppc_core_load_host_debugstate(struct kvm_vcpu *vcpu) +{ +} + +void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu) +{ +} + +void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + kvmppc_e500_tlb_load(vcpu, cpu); +} + +void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +{ + kvmppc_e500_tlb_put(vcpu); +} + +int kvmppc_core_check_processor_compat(void) +{ + int r; + + if (strcmp(cur_cpu_spec->cpu_name, "e500v2") == 0) + r = 0; + else + r = -ENOTSUPP; + + return r; +} + +int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + kvmppc_e500_tlb_setup(vcpu_e500); + + /* Use the same core vertion as host's */ + vcpu->arch.pvr = mfspr(SPRN_PVR); + + return 0; +} + +/* 'linear_address' is actually an encoding of AS|PID|EADDR . */ +int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu, + struct kvm_translation *tr) +{ + int index; + gva_t eaddr; + u8 pid; + u8 as; + + eaddr = tr->linear_address; + pid = (tr->linear_address >> 32) & 0xff; + as = (tr->linear_address >> 40) & 0x1; + + index = kvmppc_e500_tlb_search(vcpu, eaddr, pid, as); + if (index < 0) { + tr->valid = 0; + return 0; + } + + tr->physical_address = kvmppc_mmu_xlate(vcpu, index, eaddr); + /* XXX what does "writeable" and "usermode" even mean? */ + tr->valid = 1; + + return 0; +} + +struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +{ + struct kvmppc_vcpu_e500 *vcpu_e500; + struct kvm_vcpu *vcpu; + int err; + + vcpu_e500 = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + if (!vcpu_e500) { + err = -ENOMEM; + goto out; + } + + vcpu = &vcpu_e500->vcpu; + err = kvm_vcpu_init(vcpu, kvm, id); + if (err) + goto free_vcpu; + + err = kvmppc_e500_tlb_init(vcpu_e500); + if (err) + goto uninit_vcpu; + + return vcpu; + +uninit_vcpu: + kvm_vcpu_uninit(vcpu); +free_vcpu: + kmem_cache_free(kvm_vcpu_cache, vcpu_e500); +out: + return ERR_PTR(err); +} + +void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + kvmppc_e500_tlb_uninit(vcpu_e500); + kvm_vcpu_uninit(vcpu); + kmem_cache_free(kvm_vcpu_cache, vcpu_e500); +} + +static int kvmppc_e500_init(void) +{ + int r; + + r = kvmppc_booke_init(); + if (r) + return r; + + return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), THIS_MODULE); +} + +static void kvmppc_e500_exit(void) +{ + kvmppc_booke_exit(); +} + +module_init(kvmppc_e500_init); +module_exit(kvmppc_e500_exit); diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c new file mode 100644 index 000000000000..a47f44cd2cfd --- /dev/null +++ b/arch/powerpc/kvm/e500_emulate.c @@ -0,0 +1,169 @@ +/* + * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved. + * + * Author: Yu Liu, + * + * Description: + * This file is derived from arch/powerpc/kvm/44x_emulate.c, + * by Hollis Blanchard . + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include +#include +#include + +#include "booke.h" +#include "e500_tlb.h" + +#define XOP_TLBIVAX 786 +#define XOP_TLBSX 914 +#define XOP_TLBRE 946 +#define XOP_TLBWE 978 + +int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) +{ + int emulated = EMULATE_DONE; + int ra; + int rb; + int rs; + int rt; + + switch (get_op(inst)) { + case 31: + switch (get_xop(inst)) { + + case XOP_TLBRE: + emulated = kvmppc_e500_emul_tlbre(vcpu); + break; + + case XOP_TLBWE: + emulated = kvmppc_e500_emul_tlbwe(vcpu); + break; + + case XOP_TLBSX: + rb = get_rb(inst); + emulated = kvmppc_e500_emul_tlbsx(vcpu,rb); + break; + + case XOP_TLBIVAX: + ra = get_ra(inst); + rb = get_rb(inst); + emulated = kvmppc_e500_emul_tlbivax(vcpu, ra, rb); + break; + + default: + emulated = EMULATE_FAIL; + } + + break; + + default: + emulated = EMULATE_FAIL; + } + + if (emulated == EMULATE_FAIL) + emulated = kvmppc_booke_emulate_op(run, vcpu, inst, advance); + + return emulated; +} + +int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int emulated = EMULATE_DONE; + + switch (sprn) { + case SPRN_PID: + vcpu_e500->pid[0] = vcpu->arch.shadow_pid = + vcpu->arch.pid = vcpu->arch.gpr[rs]; + break; + case SPRN_PID1: + vcpu_e500->pid[1] = vcpu->arch.gpr[rs]; break; + case SPRN_PID2: + vcpu_e500->pid[2] = vcpu->arch.gpr[rs]; break; + case SPRN_MAS0: + vcpu_e500->mas0 = vcpu->arch.gpr[rs]; break; + case SPRN_MAS1: + vcpu_e500->mas1 = vcpu->arch.gpr[rs]; break; + case SPRN_MAS2: + vcpu_e500->mas2 = vcpu->arch.gpr[rs]; break; + case SPRN_MAS3: + vcpu_e500->mas3 = vcpu->arch.gpr[rs]; break; + case SPRN_MAS4: + vcpu_e500->mas4 = vcpu->arch.gpr[rs]; break; + case SPRN_MAS6: + vcpu_e500->mas6 = vcpu->arch.gpr[rs]; break; + case SPRN_MAS7: + vcpu_e500->mas7 = vcpu->arch.gpr[rs]; break; + case SPRN_L1CSR1: + vcpu_e500->l1csr1 = vcpu->arch.gpr[rs]; break; + case SPRN_HID0: + vcpu_e500->hid0 = vcpu->arch.gpr[rs]; break; + case SPRN_HID1: + vcpu_e500->hid1 = vcpu->arch.gpr[rs]; break; + + default: + emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, rs); + } + + return emulated; +} + +int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int emulated = EMULATE_DONE; + + switch (sprn) { + case SPRN_PID: + vcpu->arch.gpr[rt] = vcpu_e500->pid[0]; break; + case SPRN_PID1: + vcpu->arch.gpr[rt] = vcpu_e500->pid[1]; break; + case SPRN_PID2: + vcpu->arch.gpr[rt] = vcpu_e500->pid[2]; break; + case SPRN_MAS0: + vcpu->arch.gpr[rt] = vcpu_e500->mas0; break; + case SPRN_MAS1: + vcpu->arch.gpr[rt] = vcpu_e500->mas1; break; + case SPRN_MAS2: + vcpu->arch.gpr[rt] = vcpu_e500->mas2; break; + case SPRN_MAS3: + vcpu->arch.gpr[rt] = vcpu_e500->mas3; break; + case SPRN_MAS4: + vcpu->arch.gpr[rt] = vcpu_e500->mas4; break; + case SPRN_MAS6: + vcpu->arch.gpr[rt] = vcpu_e500->mas6; break; + case SPRN_MAS7: + vcpu->arch.gpr[rt] = vcpu_e500->mas7; break; + + case SPRN_TLB0CFG: + vcpu->arch.gpr[rt] = mfspr(SPRN_TLB0CFG); + vcpu->arch.gpr[rt] &= ~0xfffUL; + vcpu->arch.gpr[rt] |= vcpu_e500->guest_tlb_size[0]; + break; + + case SPRN_TLB1CFG: + vcpu->arch.gpr[rt] = mfspr(SPRN_TLB1CFG); + vcpu->arch.gpr[rt] &= ~0xfffUL; + vcpu->arch.gpr[rt] |= vcpu_e500->guest_tlb_size[1]; + break; + + case SPRN_L1CSR1: + vcpu->arch.gpr[rt] = vcpu_e500->l1csr1; break; + case SPRN_HID0: + vcpu->arch.gpr[rt] = vcpu_e500->hid0; break; + case SPRN_HID1: + vcpu->arch.gpr[rt] = vcpu_e500->hid1; break; + + default: + emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, rt); + } + + return emulated; +} + diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c new file mode 100644 index 000000000000..6a50340f575e --- /dev/null +++ b/arch/powerpc/kvm/e500_tlb.c @@ -0,0 +1,737 @@ +/* + * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved. + * + * Author: Yu Liu, yu.liu@freescale.com + * + * Description: + * This file is based on arch/powerpc/kvm/44x_tlb.c, + * by Hollis Blanchard . + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "e500_tlb.h" + +#define to_htlb1_esel(esel) (tlb1_entry_num - (esel) - 1) + +static unsigned int tlb1_entry_num; + +void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + struct tlbe *tlbe; + int i, tlbsel; + + printk("| %8s | %8s | %8s | %8s | %8s |\n", + "nr", "mas1", "mas2", "mas3", "mas7"); + + for (tlbsel = 0; tlbsel < 2; tlbsel++) { + printk("Guest TLB%d:\n", tlbsel); + for (i = 0; i < vcpu_e500->guest_tlb_size[tlbsel]; i++) { + tlbe = &vcpu_e500->guest_tlb[tlbsel][i]; + if (tlbe->mas1 & MAS1_VALID) + printk(" G[%d][%3d] | %08X | %08X | %08X | %08X |\n", + tlbsel, i, tlbe->mas1, tlbe->mas2, + tlbe->mas3, tlbe->mas7); + } + } + + for (tlbsel = 0; tlbsel < 2; tlbsel++) { + printk("Shadow TLB%d:\n", tlbsel); + for (i = 0; i < vcpu_e500->shadow_tlb_size[tlbsel]; i++) { + tlbe = &vcpu_e500->shadow_tlb[tlbsel][i]; + if (tlbe->mas1 & MAS1_VALID) + printk(" S[%d][%3d] | %08X | %08X | %08X | %08X |\n", + tlbsel, i, tlbe->mas1, tlbe->mas2, + tlbe->mas3, tlbe->mas7); + } + } +} + +static inline unsigned int tlb0_get_next_victim( + struct kvmppc_vcpu_e500 *vcpu_e500) +{ + unsigned int victim; + + victim = vcpu_e500->guest_tlb_nv[0]++; + if (unlikely(vcpu_e500->guest_tlb_nv[0] >= KVM_E500_TLB0_WAY_NUM)) + vcpu_e500->guest_tlb_nv[0] = 0; + + return victim; +} + +static inline unsigned int tlb1_max_shadow_size(void) +{ + return tlb1_entry_num - tlbcam_index; +} + +static inline int tlbe_is_writable(struct tlbe *tlbe) +{ + return tlbe->mas3 & (MAS3_SW|MAS3_UW); +} + +static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode) +{ + /* Mask off reserved bits. */ + mas3 &= MAS3_ATTRIB_MASK; + + if (!usermode) { + /* Guest is in supervisor mode, + * so we need to translate guest + * supervisor permissions into user permissions. */ + mas3 &= ~E500_TLB_USER_PERM_MASK; + mas3 |= (mas3 & E500_TLB_SUPER_PERM_MASK) << 1; + } + + return mas3 | E500_TLB_SUPER_PERM_MASK; +} + +static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode) +{ + return mas2 & MAS2_ATTRIB_MASK; +} + +/* + * writing shadow tlb entry to host TLB + */ +static inline void __write_host_tlbe(struct tlbe *stlbe) +{ + mtspr(SPRN_MAS1, stlbe->mas1); + mtspr(SPRN_MAS2, stlbe->mas2); + mtspr(SPRN_MAS3, stlbe->mas3); + mtspr(SPRN_MAS7, stlbe->mas7); + __asm__ __volatile__ ("tlbwe\n" : : ); +} + +static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500, + int tlbsel, int esel) +{ + struct tlbe *stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel]; + + local_irq_disable(); + if (tlbsel == 0) { + __write_host_tlbe(stlbe); + } else { + unsigned register mas0; + + mas0 = mfspr(SPRN_MAS0); + + mtspr(SPRN_MAS0, MAS0_TLBSEL(1) | MAS0_ESEL(to_htlb1_esel(esel))); + __write_host_tlbe(stlbe); + + mtspr(SPRN_MAS0, mas0); + } + local_irq_enable(); +} + +void kvmppc_e500_tlb_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int i; + unsigned register mas0; + + /* Load all valid TLB1 entries to reduce guest tlb miss fault */ + local_irq_disable(); + mas0 = mfspr(SPRN_MAS0); + for (i = 0; i < tlb1_max_shadow_size(); i++) { + struct tlbe *stlbe = &vcpu_e500->shadow_tlb[1][i]; + + if (get_tlb_v(stlbe)) { + mtspr(SPRN_MAS0, MAS0_TLBSEL(1) + | MAS0_ESEL(to_htlb1_esel(i))); + __write_host_tlbe(stlbe); + } + } + mtspr(SPRN_MAS0, mas0); + local_irq_enable(); +} + +void kvmppc_e500_tlb_put(struct kvm_vcpu *vcpu) +{ + _tlbia(); +} + +/* Search the guest TLB for a matching entry. */ +static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500, + gva_t eaddr, int tlbsel, unsigned int pid, int as) +{ + int i; + + /* XXX Replace loop with fancy data structures. */ + for (i = 0; i < vcpu_e500->guest_tlb_size[tlbsel]; i++) { + struct tlbe *tlbe = &vcpu_e500->guest_tlb[tlbsel][i]; + unsigned int tid; + + if (eaddr < get_tlb_eaddr(tlbe)) + continue; + + if (eaddr > get_tlb_end(tlbe)) + continue; + + tid = get_tlb_tid(tlbe); + if (tid && (tid != pid)) + continue; + + if (!get_tlb_v(tlbe)) + continue; + + if (get_tlb_ts(tlbe) != as && as != -1) + continue; + + return i; + } + + return -1; +} + +static void kvmppc_e500_shadow_release(struct kvmppc_vcpu_e500 *vcpu_e500, + int tlbsel, int esel) +{ + struct tlbe *stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel]; + struct page *page = vcpu_e500->shadow_pages[tlbsel][esel]; + + if (page) { + vcpu_e500->shadow_pages[tlbsel][esel] = NULL; + + if (get_tlb_v(stlbe)) { + if (tlbe_is_writable(stlbe)) + kvm_release_page_dirty(page); + else + kvm_release_page_clean(page); + } + } +} + +static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, + int tlbsel, int esel) +{ + struct tlbe *stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel]; + + kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel); + stlbe->mas1 = 0; + KVMTRACE_5D(STLB_INVAL, &vcpu_e500->vcpu, index_of(tlbsel, esel), + stlbe->mas1, stlbe->mas2, stlbe->mas3, stlbe->mas7, + handler); +} + +static void kvmppc_e500_tlb1_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, + gva_t eaddr, gva_t eend, u32 tid) +{ + unsigned int pid = tid & 0xff; + unsigned int i; + + /* XXX Replace loop with fancy data structures. */ + for (i = 0; i < vcpu_e500->guest_tlb_size[1]; i++) { + struct tlbe *stlbe = &vcpu_e500->shadow_tlb[1][i]; + unsigned int tid; + + if (!get_tlb_v(stlbe)) + continue; + + if (eend < get_tlb_eaddr(stlbe)) + continue; + + if (eaddr > get_tlb_end(stlbe)) + continue; + + tid = get_tlb_tid(stlbe); + if (tid && (tid != pid)) + continue; + + kvmppc_e500_stlbe_invalidate(vcpu_e500, 1, i); + write_host_tlbe(vcpu_e500, 1, i); + } +} + +static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, + unsigned int eaddr, int as) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + unsigned int victim, pidsel, tsized; + int tlbsel; + + /* since we only have tow TLBs, only lower bit is used. */ + tlbsel = (vcpu_e500->mas4 >> 28) & 0x1; + victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0; + pidsel = (vcpu_e500->mas4 >> 16) & 0xf; + tsized = (vcpu_e500->mas4 >> 8) & 0xf; + + vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) + | MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]); + vcpu_e500->mas1 = MAS1_VALID | (as ? MAS1_TS : 0) + | MAS1_TID(vcpu_e500->pid[pidsel]) + | MAS1_TSIZE(tsized); + vcpu_e500->mas2 = (eaddr & MAS2_EPN) + | (vcpu_e500->mas4 & MAS2_ATTRIB_MASK); + vcpu_e500->mas3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3; + vcpu_e500->mas6 = (vcpu_e500->mas6 & MAS6_SPID1) + | (get_cur_pid(vcpu) << 16) + | (as ? MAS6_SAS : 0); + vcpu_e500->mas7 = 0; +} + +static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, + u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, int tlbsel, int esel) +{ + struct page *new_page; + struct tlbe *stlbe; + hpa_t hpaddr; + + stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel]; + + /* Get reference to new page. */ + new_page = gfn_to_page(vcpu_e500->vcpu.kvm, gfn); + if (is_error_page(new_page)) { + printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn); + kvm_release_page_clean(new_page); + return; + } + hpaddr = page_to_phys(new_page); + + /* Drop reference to old page. */ + kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel); + + vcpu_e500->shadow_pages[tlbsel][esel] = new_page; + + /* Force TS=1 IPROT=0 TSIZE=4KB for all guest mappings. */ + stlbe->mas1 = MAS1_TSIZE(BOOKE_PAGESZ_4K) + | MAS1_TID(get_tlb_tid(gtlbe)) | MAS1_TS | MAS1_VALID; + stlbe->mas2 = (gvaddr & MAS2_EPN) + | e500_shadow_mas2_attrib(gtlbe->mas2, + vcpu_e500->vcpu.arch.msr & MSR_PR); + stlbe->mas3 = (hpaddr & MAS3_RPN) + | e500_shadow_mas3_attrib(gtlbe->mas3, + vcpu_e500->vcpu.arch.msr & MSR_PR); + stlbe->mas7 = (hpaddr >> 32) & MAS7_RPN; + + KVMTRACE_5D(STLB_WRITE, &vcpu_e500->vcpu, index_of(tlbsel, esel), + stlbe->mas1, stlbe->mas2, stlbe->mas3, stlbe->mas7, + handler); +} + +/* XXX only map the one-one case, for now use TLB0 */ +static int kvmppc_e500_stlbe_map(struct kvmppc_vcpu_e500 *vcpu_e500, + int tlbsel, int esel) +{ + struct tlbe *gtlbe; + + gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel]; + + kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe), + get_tlb_raddr(gtlbe) >> PAGE_SHIFT, + gtlbe, tlbsel, esel); + + return esel; +} + +/* Caller must ensure that the specified guest TLB entry is safe to insert into + * the shadow TLB. */ +/* XXX for both one-one and one-to-many , for now use TLB1 */ +static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500, + u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe) +{ + unsigned int victim; + + victim = vcpu_e500->guest_tlb_nv[1]++; + + if (unlikely(vcpu_e500->guest_tlb_nv[1] >= tlb1_max_shadow_size())) + vcpu_e500->guest_tlb_nv[1] = 0; + + kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, victim); + + return victim; +} + +/* Invalidate all guest kernel mappings when enter usermode, + * so that when they fault back in they will get the + * proper permission bits. */ +void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode) +{ + if (usermode) { + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int i; + + /* XXX Replace loop with fancy data structures. */ + /* needn't set modified since tlbia will make TLB1 coherent */ + for (i = 0; i < tlb1_max_shadow_size(); i++) + kvmppc_e500_stlbe_invalidate(vcpu_e500, 1, i); + + _tlbia(); + } +} + +static int kvmppc_e500_gtlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, + int tlbsel, int esel) +{ + struct tlbe *gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel]; + + if (unlikely(get_tlb_iprot(gtlbe))) + return -1; + + if (tlbsel == 1) { + kvmppc_e500_tlb1_invalidate(vcpu_e500, get_tlb_eaddr(gtlbe), + get_tlb_end(gtlbe), + get_tlb_tid(gtlbe)); + } else { + kvmppc_e500_stlbe_invalidate(vcpu_e500, tlbsel, esel); + } + + gtlbe->mas1 = 0; + + return 0; +} + +int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + unsigned int ia; + int esel, tlbsel; + gva_t ea; + + ea = ((ra) ? vcpu->arch.gpr[ra] : 0) + vcpu->arch.gpr[rb]; + + ia = (ea >> 2) & 0x1; + + /* since we only have tow TLBs, only lower bit is used. */ + tlbsel = (ea >> 3) & 0x1; + + if (ia) { + /* invalidate all entries */ + for (esel = 0; esel < vcpu_e500->guest_tlb_size[tlbsel]; esel++) + kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); + } else { + ea &= 0xfffff000; + esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, + get_cur_pid(vcpu), -1); + if (esel >= 0) + kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); + } + + _tlbia(); + + return EMULATE_DONE; +} + +int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int tlbsel, esel; + struct tlbe *gtlbe; + + tlbsel = get_tlb_tlbsel(vcpu_e500); + esel = get_tlb_esel(vcpu_e500, tlbsel); + + gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel]; + vcpu_e500->mas0 &= MAS0_NV(0); + vcpu_e500->mas0 |= MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]); + vcpu_e500->mas1 = gtlbe->mas1; + vcpu_e500->mas2 = gtlbe->mas2; + vcpu_e500->mas3 = gtlbe->mas3; + vcpu_e500->mas7 = gtlbe->mas7; + + return EMULATE_DONE; +} + +int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int as = !!get_cur_sas(vcpu_e500); + unsigned int pid = get_cur_spid(vcpu_e500); + int esel, tlbsel; + struct tlbe *gtlbe = NULL; + gva_t ea; + + ea = vcpu->arch.gpr[rb]; + + for (tlbsel = 0; tlbsel < 2; tlbsel++) { + esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as); + if (esel >= 0) { + gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel]; + break; + } + } + + if (gtlbe) { + vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(esel) + | MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]); + vcpu_e500->mas1 = gtlbe->mas1; + vcpu_e500->mas2 = gtlbe->mas2; + vcpu_e500->mas3 = gtlbe->mas3; + vcpu_e500->mas7 = gtlbe->mas7; + } else { + int victim; + + /* since we only have tow TLBs, only lower bit is used. */ + tlbsel = vcpu_e500->mas4 >> 28 & 0x1; + victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0; + + vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) + | MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]); + vcpu_e500->mas1 = (vcpu_e500->mas6 & MAS6_SPID0) + | (vcpu_e500->mas6 & (MAS6_SAS ? MAS1_TS : 0)) + | (vcpu_e500->mas4 & MAS4_TSIZED(~0)); + vcpu_e500->mas2 &= MAS2_EPN; + vcpu_e500->mas2 |= vcpu_e500->mas4 & MAS2_ATTRIB_MASK; + vcpu_e500->mas3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3; + vcpu_e500->mas7 = 0; + } + + return EMULATE_DONE; +} + +int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + u64 eaddr; + u64 raddr; + u32 tid; + struct tlbe *gtlbe; + int tlbsel, esel, stlbsel, sesel; + + tlbsel = get_tlb_tlbsel(vcpu_e500); + esel = get_tlb_esel(vcpu_e500, tlbsel); + + gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel]; + + if (get_tlb_v(gtlbe) && tlbsel == 1) { + eaddr = get_tlb_eaddr(gtlbe); + tid = get_tlb_tid(gtlbe); + kvmppc_e500_tlb1_invalidate(vcpu_e500, eaddr, + get_tlb_end(gtlbe), tid); + } + + gtlbe->mas1 = vcpu_e500->mas1; + gtlbe->mas2 = vcpu_e500->mas2; + gtlbe->mas3 = vcpu_e500->mas3; + gtlbe->mas7 = vcpu_e500->mas7; + + KVMTRACE_5D(GTLB_WRITE, vcpu, vcpu_e500->mas0, + gtlbe->mas1, gtlbe->mas2, gtlbe->mas3, gtlbe->mas7, + handler); + + /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */ + if (tlbe_is_host_safe(vcpu, gtlbe)) { + switch (tlbsel) { + case 0: + /* TLB0 */ + gtlbe->mas1 &= ~MAS1_TSIZE(~0); + gtlbe->mas1 |= MAS1_TSIZE(BOOKE_PAGESZ_4K); + + stlbsel = 0; + sesel = kvmppc_e500_stlbe_map(vcpu_e500, 0, esel); + + break; + + case 1: + /* TLB1 */ + eaddr = get_tlb_eaddr(gtlbe); + raddr = get_tlb_raddr(gtlbe); + + /* Create a 4KB mapping on the host. + * If the guest wanted a large page, + * only the first 4KB is mapped here and the rest + * are mapped on the fly. */ + stlbsel = 1; + sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, + raddr >> PAGE_SHIFT, gtlbe); + break; + + default: + BUG(); + } + write_host_tlbe(vcpu_e500, stlbsel, sesel); + } + + return EMULATE_DONE; +} + +int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr) +{ + unsigned int as = !!(vcpu->arch.msr & MSR_IS); + + return kvmppc_e500_tlb_search(vcpu, eaddr, get_cur_pid(vcpu), as); +} + +int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr) +{ + unsigned int as = !!(vcpu->arch.msr & MSR_DS); + + return kvmppc_e500_tlb_search(vcpu, eaddr, get_cur_pid(vcpu), as); +} + +void kvmppc_mmu_itlb_miss(struct kvm_vcpu *vcpu) +{ + unsigned int as = !!(vcpu->arch.msr & MSR_IS); + + kvmppc_e500_deliver_tlb_miss(vcpu, vcpu->arch.pc, as); +} + +void kvmppc_mmu_dtlb_miss(struct kvm_vcpu *vcpu) +{ + unsigned int as = !!(vcpu->arch.msr & MSR_DS); + + kvmppc_e500_deliver_tlb_miss(vcpu, vcpu->arch.fault_dear, as); +} + +gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index, + gva_t eaddr) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + struct tlbe *gtlbe = + &vcpu_e500->guest_tlb[tlbsel_of(index)][esel_of(index)]; + u64 pgmask = get_tlb_bytes(gtlbe) - 1; + + return get_tlb_raddr(gtlbe) | (eaddr & pgmask); +} + +void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int tlbsel, i; + + for (tlbsel = 0; tlbsel < 2; tlbsel++) + for (i = 0; i < vcpu_e500->guest_tlb_size[tlbsel]; i++) + kvmppc_e500_shadow_release(vcpu_e500, tlbsel, i); + + /* discard all guest mapping */ + _tlbia(); +} + +void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, + unsigned int index) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int tlbsel = tlbsel_of(index); + int esel = esel_of(index); + int stlbsel, sesel; + + switch (tlbsel) { + case 0: + stlbsel = 0; + sesel = esel; + break; + + case 1: { + gfn_t gfn = gpaddr >> PAGE_SHIFT; + struct tlbe *gtlbe + = &vcpu_e500->guest_tlb[tlbsel][esel]; + + stlbsel = 1; + sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn, gtlbe); + break; + } + + default: + BUG(); + break; + } + write_host_tlbe(vcpu_e500, stlbsel, sesel); +} + +int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu, + gva_t eaddr, unsigned int pid, int as) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int esel, tlbsel; + + for (tlbsel = 0; tlbsel < 2; tlbsel++) { + esel = kvmppc_e500_tlb_index(vcpu_e500, eaddr, tlbsel, pid, as); + if (esel >= 0) + return index_of(tlbsel, esel); + } + + return -1; +} + +void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + struct tlbe *tlbe; + + /* Insert large initial mapping for guest. */ + tlbe = &vcpu_e500->guest_tlb[1][0]; + tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOKE_PAGESZ_256M); + tlbe->mas2 = 0; + tlbe->mas3 = E500_TLB_SUPER_PERM_MASK; + tlbe->mas7 = 0; + + /* 4K map for serial output. Used by kernel wrapper. */ + tlbe = &vcpu_e500->guest_tlb[1][1]; + tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOKE_PAGESZ_4K); + tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G; + tlbe->mas3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK; + tlbe->mas7 = 0; +} + +int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + tlb1_entry_num = mfspr(SPRN_TLB1CFG) & 0xFFF; + + vcpu_e500->guest_tlb_size[0] = KVM_E500_TLB0_SIZE; + vcpu_e500->guest_tlb[0] = + kzalloc(sizeof(struct tlbe) * KVM_E500_TLB0_SIZE, GFP_KERNEL); + if (vcpu_e500->guest_tlb[0] == NULL) + goto err_out; + + vcpu_e500->shadow_tlb_size[0] = KVM_E500_TLB0_SIZE; + vcpu_e500->shadow_tlb[0] = + kzalloc(sizeof(struct tlbe) * KVM_E500_TLB0_SIZE, GFP_KERNEL); + if (vcpu_e500->shadow_tlb[0] == NULL) + goto err_out_guest0; + + vcpu_e500->guest_tlb_size[1] = KVM_E500_TLB1_SIZE; + vcpu_e500->guest_tlb[1] = + kzalloc(sizeof(struct tlbe) * KVM_E500_TLB1_SIZE, GFP_KERNEL); + if (vcpu_e500->guest_tlb[1] == NULL) + goto err_out_shadow0; + + vcpu_e500->shadow_tlb_size[1] = tlb1_entry_num; + vcpu_e500->shadow_tlb[1] = + kzalloc(sizeof(struct tlbe) * tlb1_entry_num, GFP_KERNEL); + if (vcpu_e500->shadow_tlb[1] == NULL) + goto err_out_guest1; + + vcpu_e500->shadow_pages[0] = (struct page **) + kzalloc(sizeof(struct page *) * KVM_E500_TLB0_SIZE, GFP_KERNEL); + if (vcpu_e500->shadow_pages[0] == NULL) + goto err_out_shadow1; + + vcpu_e500->shadow_pages[1] = (struct page **) + kzalloc(sizeof(struct page *) * tlb1_entry_num, GFP_KERNEL); + if (vcpu_e500->shadow_pages[1] == NULL) + goto err_out_page0; + + return 0; + +err_out_page0: + kfree(vcpu_e500->shadow_pages[0]); +err_out_shadow1: + kfree(vcpu_e500->shadow_tlb[1]); +err_out_guest1: + kfree(vcpu_e500->guest_tlb[1]); +err_out_shadow0: + kfree(vcpu_e500->shadow_tlb[0]); +err_out_guest0: + kfree(vcpu_e500->guest_tlb[0]); +err_out: + return -1; +} + +void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + kfree(vcpu_e500->shadow_pages[1]); + kfree(vcpu_e500->shadow_pages[0]); + kfree(vcpu_e500->shadow_tlb[1]); + kfree(vcpu_e500->guest_tlb[1]); + kfree(vcpu_e500->shadow_tlb[0]); + kfree(vcpu_e500->guest_tlb[0]); +} diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500_tlb.h new file mode 100644 index 000000000000..d8833f955163 --- /dev/null +++ b/arch/powerpc/kvm/e500_tlb.h @@ -0,0 +1,184 @@ +/* + * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved. + * + * Author: Yu Liu, yu.liu@freescale.com + * + * Description: + * This file is based on arch/powerpc/kvm/44x_tlb.h, + * by Hollis Blanchard . + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#ifndef __KVM_E500_TLB_H__ +#define __KVM_E500_TLB_H__ + +#include +#include +#include +#include + +#define KVM_E500_TLB0_WAY_SIZE_BIT 7 /* Fixed */ +#define KVM_E500_TLB0_WAY_SIZE (1UL << KVM_E500_TLB0_WAY_SIZE_BIT) +#define KVM_E500_TLB0_WAY_SIZE_MASK (KVM_E500_TLB0_WAY_SIZE - 1) + +#define KVM_E500_TLB0_WAY_NUM_BIT 1 /* No greater than 7 */ +#define KVM_E500_TLB0_WAY_NUM (1UL << KVM_E500_TLB0_WAY_NUM_BIT) +#define KVM_E500_TLB0_WAY_NUM_MASK (KVM_E500_TLB0_WAY_NUM - 1) + +#define KVM_E500_TLB0_SIZE (KVM_E500_TLB0_WAY_SIZE * KVM_E500_TLB0_WAY_NUM) +#define KVM_E500_TLB1_SIZE 16 + +#define index_of(tlbsel, esel) (((tlbsel) << 16) | ((esel) & 0xFFFF)) +#define tlbsel_of(index) ((index) >> 16) +#define esel_of(index) ((index) & 0xFFFF) + +#define E500_TLB_USER_PERM_MASK (MAS3_UX|MAS3_UR|MAS3_UW) +#define E500_TLB_SUPER_PERM_MASK (MAS3_SX|MAS3_SR|MAS3_SW) +#define MAS2_ATTRIB_MASK \ + (MAS2_X0 | MAS2_X1 | MAS2_W | MAS2_I | MAS2_M | MAS2_G | MAS2_E) +#define MAS3_ATTRIB_MASK \ + (MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3 \ + | E500_TLB_USER_PERM_MASK | E500_TLB_SUPER_PERM_MASK) + +extern void kvmppc_dump_tlbs(struct kvm_vcpu *); +extern int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *); +extern int kvmppc_e500_emul_tlbre(struct kvm_vcpu *); +extern int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *, int, int); +extern int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *, int); +extern int kvmppc_e500_tlb_search(struct kvm_vcpu *, gva_t, unsigned int, int); +extern void kvmppc_e500_tlb_put(struct kvm_vcpu *); +extern void kvmppc_e500_tlb_load(struct kvm_vcpu *, int); +extern int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *); +extern void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *); +extern void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *); + +/* TLB helper functions */ +static inline unsigned int get_tlb_size(const struct tlbe *tlbe) +{ + return (tlbe->mas1 >> 8) & 0xf; +} + +static inline gva_t get_tlb_eaddr(const struct tlbe *tlbe) +{ + return tlbe->mas2 & 0xfffff000; +} + +static inline u64 get_tlb_bytes(const struct tlbe *tlbe) +{ + unsigned int pgsize = get_tlb_size(tlbe); + return 1ULL << 10 << (pgsize << 1); +} + +static inline gva_t get_tlb_end(const struct tlbe *tlbe) +{ + u64 bytes = get_tlb_bytes(tlbe); + return get_tlb_eaddr(tlbe) + bytes - 1; +} + +static inline u64 get_tlb_raddr(const struct tlbe *tlbe) +{ + u64 rpn = tlbe->mas7; + return (rpn << 32) | (tlbe->mas3 & 0xfffff000); +} + +static inline unsigned int get_tlb_tid(const struct tlbe *tlbe) +{ + return (tlbe->mas1 >> 16) & 0xff; +} + +static inline unsigned int get_tlb_ts(const struct tlbe *tlbe) +{ + return (tlbe->mas1 >> 12) & 0x1; +} + +static inline unsigned int get_tlb_v(const struct tlbe *tlbe) +{ + return (tlbe->mas1 >> 31) & 0x1; +} + +static inline unsigned int get_tlb_iprot(const struct tlbe *tlbe) +{ + return (tlbe->mas1 >> 30) & 0x1; +} + +static inline unsigned int get_cur_pid(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.pid & 0xff; +} + +static inline unsigned int get_cur_spid( + const struct kvmppc_vcpu_e500 *vcpu_e500) +{ + return (vcpu_e500->mas6 >> 16) & 0xff; +} + +static inline unsigned int get_cur_sas( + const struct kvmppc_vcpu_e500 *vcpu_e500) +{ + return vcpu_e500->mas6 & 0x1; +} + +static inline unsigned int get_tlb_tlbsel( + const struct kvmppc_vcpu_e500 *vcpu_e500) +{ + /* + * Manual says that tlbsel has 2 bits wide. + * Since we only have tow TLBs, only lower bit is used. + */ + return (vcpu_e500->mas0 >> 28) & 0x1; +} + +static inline unsigned int get_tlb_nv_bit( + const struct kvmppc_vcpu_e500 *vcpu_e500) +{ + return vcpu_e500->mas0 & 0xfff; +} + +static inline unsigned int get_tlb_esel_bit( + const struct kvmppc_vcpu_e500 *vcpu_e500) +{ + return (vcpu_e500->mas0 >> 16) & 0xfff; +} + +static inline unsigned int get_tlb_esel( + const struct kvmppc_vcpu_e500 *vcpu_e500, + int tlbsel) +{ + unsigned int esel = get_tlb_esel_bit(vcpu_e500); + + if (tlbsel == 0) { + esel &= KVM_E500_TLB0_WAY_NUM_MASK; + esel |= ((vcpu_e500->mas2 >> 12) & KVM_E500_TLB0_WAY_SIZE_MASK) + << KVM_E500_TLB0_WAY_NUM_BIT; + } else { + esel &= KVM_E500_TLB1_SIZE - 1; + } + + return esel; +} + +static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu, + const struct tlbe *tlbe) +{ + gpa_t gpa; + + if (!get_tlb_v(tlbe)) + return 0; + + /* Does it match current guest AS? */ + /* XXX what about IS != DS? */ + if (get_tlb_ts(tlbe) != !!(vcpu->arch.msr & MSR_IS)) + return 0; + + gpa = get_tlb_raddr(tlbe); + if (!gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT)) + /* Mapping is not for RAM. */ + return 0; + + return 1; +} + +#endif /* __KVM_E500_TLB_H__ */ -- cgit v1.2.3 From b52a638c391c5c7b013180f5374274698b8535c8 Mon Sep 17 00:00:00 2001 From: Hollis Blanchard Date: Sat, 3 Jan 2009 16:23:11 -0600 Subject: KVM: ppc: Add kvmppc_mmu_dtlb/itlb_miss for booke When itlb or dtlb miss happens, E500 needs to update some mmu registers. So that the auto-load mechanism can work on E500 when write a tlb entry. Signed-off-by: Liu Yu Signed-off-by: Hollis Blanchard Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_ppc.h | 2 ++ arch/powerpc/kvm/44x_tlb.c | 8 ++++++++ arch/powerpc/kvm/booke.c | 2 ++ 3 files changed, 12 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 5e80a20f32b8..6052779dbb56 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -63,6 +63,8 @@ extern int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr); extern int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr); extern gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int gtlb_index, gva_t eaddr); +extern void kvmppc_mmu_dtlb_miss(struct kvm_vcpu *vcpu); +extern void kvmppc_mmu_itlb_miss(struct kvm_vcpu *vcpu); extern struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id); diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c index e67b7313ffc3..4a16f472cc18 100644 --- a/arch/powerpc/kvm/44x_tlb.c +++ b/arch/powerpc/kvm/44x_tlb.c @@ -232,6 +232,14 @@ int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr) return kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as); } +void kvmppc_mmu_itlb_miss(struct kvm_vcpu *vcpu) +{ +} + +void kvmppc_mmu_dtlb_miss(struct kvm_vcpu *vcpu) +{ +} + static void kvmppc_44x_shadow_release(struct kvmppc_vcpu_44x *vcpu_44x, unsigned int stlb_index) { diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index a73b3959e41c..933c406c7915 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -295,6 +295,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DTLB_MISS); vcpu->arch.dear = vcpu->arch.fault_dear; vcpu->arch.esr = vcpu->arch.fault_esr; + kvmppc_mmu_dtlb_miss(vcpu); kvmppc_account_exit(vcpu, DTLB_REAL_MISS_EXITS); r = RESUME_GUEST; break; @@ -337,6 +338,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, if (gtlb_index < 0) { /* The guest didn't have a mapping for it. */ kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ITLB_MISS); + kvmppc_mmu_itlb_miss(vcpu); kvmppc_account_exit(vcpu, ITLB_REAL_MISS_EXITS); break; } -- cgit v1.2.3 From bdc89f13ec955c14777d5caf02dfca3f51d639bd Mon Sep 17 00:00:00 2001 From: Hollis Blanchard Date: Sat, 3 Jan 2009 16:23:12 -0600 Subject: KVM: ppc: distinguish between interrupts and priorities Although BOOKE_MAX_INTERRUPT has the right value, the meaning is not match. Signed-off-by: Liu Yu Signed-off-by: Hollis Blanchard Signed-off-by: Avi Kivity --- arch/powerpc/kvm/booke.c | 2 +- arch/powerpc/kvm/booke.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 933c406c7915..f192fbee2f29 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -163,7 +163,7 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) unsigned int priority; priority = __ffs(*pending); - while (priority <= BOOKE_MAX_INTERRUPT) { + while (priority <= BOOKE_IRQPRIO_MAX) { if (kvmppc_booke_irqprio_deliver(vcpu, priority)) break; diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h index 311fdbc23fbe..7ceeb3e739b4 100644 --- a/arch/powerpc/kvm/booke.h +++ b/arch/powerpc/kvm/booke.h @@ -42,6 +42,7 @@ #define BOOKE_IRQPRIO_EXTERNAL 13 #define BOOKE_IRQPRIO_FIT 14 #define BOOKE_IRQPRIO_DECREMENTER 15 +#define BOOKE_IRQPRIO_MAX 15 /* Helper function for "full" MSR writes. No need to call this if only EE is * changing. */ -- cgit v1.2.3 From bb3a8a178dec1e46df3138a30f76acf67fe12397 Mon Sep 17 00:00:00 2001 From: Hollis Blanchard Date: Sat, 3 Jan 2009 16:23:13 -0600 Subject: KVM: ppc: Add extra E500 exceptions e500 has additional interrupt vectors (and corresponding IVORs) for SPE and performance monitoring interrupts. Signed-off-by: Liu Yu Signed-off-by: Hollis Blanchard Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_asm.h | 7 ++++++- arch/powerpc/include/asm/kvm_host.h | 2 +- arch/powerpc/kvm/booke.c | 18 ++++++++++++++++++ arch/powerpc/kvm/booke.h | 30 ++++++++++++++++++------------ arch/powerpc/kvm/booke_interrupts.S | 3 +++ arch/powerpc/kvm/e500.c | 20 +++++++++++++++++++- arch/powerpc/kvm/e500_emulate.c | 27 +++++++++++++++++++++++++++ 7 files changed, 92 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h index 2197764796d9..56bfae59837f 100644 --- a/arch/powerpc/include/asm/kvm_asm.h +++ b/arch/powerpc/include/asm/kvm_asm.h @@ -42,7 +42,12 @@ #define BOOKE_INTERRUPT_DTLB_MISS 13 #define BOOKE_INTERRUPT_ITLB_MISS 14 #define BOOKE_INTERRUPT_DEBUG 15 -#define BOOKE_MAX_INTERRUPT 15 + +/* E500 */ +#define BOOKE_INTERRUPT_SPE_UNAVAIL 32 +#define BOOKE_INTERRUPT_SPE_FP_DATA 33 +#define BOOKE_INTERRUPT_SPE_FP_ROUND 34 +#define BOOKE_INTERRUPT_PERFORMANCE_MONITOR 35 #define RESUME_FLAG_NV (1<<0) /* Reload guest nonvolatile state? */ #define RESUME_FLAG_HOST (1<<1) /* Resume host? */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 50e5ce1b400a..1c6187697520 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -150,7 +150,7 @@ struct kvm_vcpu_arch { u32 tbu; u32 tcr; u32 tsr; - u32 ivor[16]; + u32 ivor[64]; ulong ivpr; u32 pir; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index f192fbee2f29..642e4204cf25 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -118,6 +118,9 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, case BOOKE_IRQPRIO_DATA_STORAGE: case BOOKE_IRQPRIO_INST_STORAGE: case BOOKE_IRQPRIO_FP_UNAVAIL: + case BOOKE_IRQPRIO_SPE_UNAVAIL: + case BOOKE_IRQPRIO_SPE_FP_DATA: + case BOOKE_IRQPRIO_SPE_FP_ROUND: case BOOKE_IRQPRIO_AP_UNAVAIL: case BOOKE_IRQPRIO_ALIGNMENT: allowed = 1; @@ -261,6 +264,21 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, r = RESUME_GUEST; break; + case BOOKE_INTERRUPT_SPE_UNAVAIL: + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_UNAVAIL); + r = RESUME_GUEST; + break; + + case BOOKE_INTERRUPT_SPE_FP_DATA: + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_DATA); + r = RESUME_GUEST; + break; + + case BOOKE_INTERRUPT_SPE_FP_ROUND: + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_ROUND); + r = RESUME_GUEST; + break; + case BOOKE_INTERRUPT_DATA_STORAGE: vcpu->arch.dear = vcpu->arch.fault_dear; vcpu->arch.esr = vcpu->arch.fault_esr; diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h index 7ceeb3e739b4..d59bcca1f9d8 100644 --- a/arch/powerpc/kvm/booke.h +++ b/arch/powerpc/kvm/booke.h @@ -31,18 +31,24 @@ #define BOOKE_IRQPRIO_ALIGNMENT 2 #define BOOKE_IRQPRIO_PROGRAM 3 #define BOOKE_IRQPRIO_FP_UNAVAIL 4 -#define BOOKE_IRQPRIO_SYSCALL 5 -#define BOOKE_IRQPRIO_AP_UNAVAIL 6 -#define BOOKE_IRQPRIO_DTLB_MISS 7 -#define BOOKE_IRQPRIO_ITLB_MISS 8 -#define BOOKE_IRQPRIO_MACHINE_CHECK 9 -#define BOOKE_IRQPRIO_DEBUG 10 -#define BOOKE_IRQPRIO_CRITICAL 11 -#define BOOKE_IRQPRIO_WATCHDOG 12 -#define BOOKE_IRQPRIO_EXTERNAL 13 -#define BOOKE_IRQPRIO_FIT 14 -#define BOOKE_IRQPRIO_DECREMENTER 15 -#define BOOKE_IRQPRIO_MAX 15 +#define BOOKE_IRQPRIO_SPE_UNAVAIL 5 +#define BOOKE_IRQPRIO_SPE_FP_DATA 6 +#define BOOKE_IRQPRIO_SPE_FP_ROUND 7 +#define BOOKE_IRQPRIO_SYSCALL 8 +#define BOOKE_IRQPRIO_AP_UNAVAIL 9 +#define BOOKE_IRQPRIO_DTLB_MISS 10 +#define BOOKE_IRQPRIO_ITLB_MISS 11 +#define BOOKE_IRQPRIO_MACHINE_CHECK 12 +#define BOOKE_IRQPRIO_DEBUG 13 +#define BOOKE_IRQPRIO_CRITICAL 14 +#define BOOKE_IRQPRIO_WATCHDOG 15 +#define BOOKE_IRQPRIO_EXTERNAL 16 +#define BOOKE_IRQPRIO_FIT 17 +#define BOOKE_IRQPRIO_DECREMENTER 18 +#define BOOKE_IRQPRIO_PERFORMANCE_MONITOR 19 +#define BOOKE_IRQPRIO_MAX 19 + +extern unsigned long kvmppc_booke_handlers; /* Helper function for "full" MSR writes. No need to call this if only EE is * changing. */ diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S index 4679ec287e62..d0c6f841bbd1 100644 --- a/arch/powerpc/kvm/booke_interrupts.S +++ b/arch/powerpc/kvm/booke_interrupts.S @@ -86,6 +86,9 @@ KVM_HANDLER BOOKE_INTERRUPT_WATCHDOG KVM_HANDLER BOOKE_INTERRUPT_DTLB_MISS KVM_HANDLER BOOKE_INTERRUPT_ITLB_MISS KVM_HANDLER BOOKE_INTERRUPT_DEBUG +KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL +KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA +KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND _GLOBAL(kvmppc_handler_len) .long kvmppc_handler_1 - kvmppc_handler_0 diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index 7992da497cd4..d8067fd81cdd 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -21,6 +21,7 @@ #include #include +#include "booke.h" #include "e500_tlb.h" void kvmppc_core_load_host_debugstate(struct kvm_vcpu *vcpu) @@ -133,12 +134,29 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) static int kvmppc_e500_init(void) { - int r; + int r, i; + unsigned long ivor[3]; + unsigned long max_ivor = 0; r = kvmppc_booke_init(); if (r) return r; + /* copy extra E500 exception handlers */ + ivor[0] = mfspr(SPRN_IVOR32); + ivor[1] = mfspr(SPRN_IVOR33); + ivor[2] = mfspr(SPRN_IVOR34); + for (i = 0; i < 3; i++) { + if (ivor[i] > max_ivor) + max_ivor = ivor[i]; + + memcpy((void *)kvmppc_booke_handlers + ivor[i], + kvmppc_handlers_start + (i + 16) * kvmppc_handler_len, + kvmppc_handler_len); + } + flush_icache_range(kvmppc_booke_handlers, + kvmppc_booke_handlers + max_ivor + kvmppc_handler_len); + return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), THIS_MODULE); } diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index a47f44cd2cfd..d3c0c7c7f209 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -107,6 +107,20 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) case SPRN_HID1: vcpu_e500->hid1 = vcpu->arch.gpr[rs]; break; + /* extra exceptions */ + case SPRN_IVOR32: + vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL] = vcpu->arch.gpr[rs]; + break; + case SPRN_IVOR33: + vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA] = vcpu->arch.gpr[rs]; + break; + case SPRN_IVOR34: + vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND] = vcpu->arch.gpr[rs]; + break; + case SPRN_IVOR35: + vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] = vcpu->arch.gpr[rs]; + break; + default: emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, rs); } @@ -160,6 +174,19 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) case SPRN_HID1: vcpu->arch.gpr[rt] = vcpu_e500->hid1; break; + /* extra exceptions */ + case SPRN_IVOR32: + vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL]; + break; + case SPRN_IVOR33: + vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA]; + break; + case SPRN_IVOR34: + vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND]; + break; + case SPRN_IVOR35: + vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]; + break; default: emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, rt); } -- cgit v1.2.3 From 1872a3f411ffe95c8e92300e0986a3532db55ce9 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 4 Jan 2009 23:26:52 +0200 Subject: KVM: VMX: Fix guest state validity checks The vmx guest state validity checks are full of bugs. Make them conform to the manual. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3312047664a4..be9441056ff2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1784,14 +1784,16 @@ static bool code_segment_valid(struct kvm_vcpu *vcpu) vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); cs_rpl = cs.selector & SELECTOR_RPL_MASK; + if (cs.unusable) + return false; if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK)) return false; if (!cs.s) return false; - if (!(~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK))) { + if (cs.type & AR_TYPE_WRITEABLE_MASK) { if (cs.dpl > cs_rpl) return false; - } else if (cs.type & AR_TYPE_CODE_MASK) { + } else { if (cs.dpl != cs_rpl) return false; } @@ -1810,7 +1812,9 @@ static bool stack_segment_valid(struct kvm_vcpu *vcpu) vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); ss_rpl = ss.selector & SELECTOR_RPL_MASK; - if ((ss.type != 3) || (ss.type != 7)) + if (ss.unusable) + return true; + if (ss.type != 3 && ss.type != 7) return false; if (!ss.s) return false; @@ -1830,6 +1834,8 @@ static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) vmx_get_segment(vcpu, &var, seg); rpl = var.selector & SELECTOR_RPL_MASK; + if (var.unusable) + return true; if (!var.s) return false; if (!var.present) @@ -1851,9 +1857,11 @@ static bool tr_valid(struct kvm_vcpu *vcpu) vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); + if (tr.unusable) + return false; if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */ return false; - if ((tr.type != 3) || (tr.type != 11)) /* TODO: Check if guest is in IA32e mode */ + if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ return false; if (!tr.present) return false; @@ -1867,6 +1875,8 @@ static bool ldtr_valid(struct kvm_vcpu *vcpu) vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); + if (ldtr.unusable) + return true; if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */ return false; if (ldtr.type != 2) -- cgit v1.2.3 From 9fd4a3b7a412f983696b23121413a79d2132fed6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 4 Jan 2009 23:43:42 +0200 Subject: KVM: VMX: don't clobber segment AR if emulating invalid state The ususable bit is important for determining state validity; don't clobber it. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index be9441056ff2..a6598cbaa001 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1649,7 +1649,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, var->limit = vmcs_read32(sf->limit); var->selector = vmcs_read16(sf->selector); ar = vmcs_read32(sf->ar_bytes); - if (ar & AR_UNUSABLE_MASK) + if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) ar = 0; var->type = ar & 15; var->s = (ar >> 4) & 1; -- cgit v1.2.3 From 10f32d84c750ccf8c0afb3a4ea9d4059aa3e9ffc Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 5 Jan 2009 00:53:19 +0200 Subject: KVM: VMX: Prevent exit handler from running if emulating due to invalid state If we've just emulated an instruction, we won't have any valid exit reason and associated information. Fix by moving the clearing of the emulation_required flag to the exit handler. This way the exit handler can notice that we've been emulating and abort early. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a6598cbaa001..a309be6788e7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3130,7 +3130,6 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - struct vcpu_vmx *vmx = to_vmx(vcpu); int err; preempt_enable(); @@ -3155,11 +3154,6 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, local_irq_disable(); preempt_disable(); - - /* Guest state should be valid now except if we need to - * emulate an MMIO */ - if (guest_state_valid(vcpu)) - vmx->emulation_required = 0; } /* @@ -3208,8 +3202,11 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) /* If we need to emulate an MMIO from handle_invalid_guest_state * we just return 0 */ - if (vmx->emulation_required && emulate_invalid_guest_state) + if (vmx->emulation_required && emulate_invalid_guest_state) { + if (guest_state_valid(vcpu)) + vmx->emulation_required = 0; return 0; + } /* Access CR3 don't cause VMExit in paging mode, so we need * to sync with guest real CR3. */ -- cgit v1.2.3 From 350f69dcd169d536307aa4a8c38c480e3a51c0db Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 5 Jan 2009 11:12:40 +0200 Subject: KVM: x86 emulator: Make emulate_pop() a little more generic Allow emulate_pop() to read into arbitrary memory rather than just the source operand. Needed for complicated instructions like far returns. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 54fb09889a80..94459f313f12 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -1136,18 +1136,19 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt) } static int emulate_pop(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) + struct x86_emulate_ops *ops, + void *dest, int len) { struct decode_cache *c = &ctxt->decode; int rc; rc = ops->read_emulated(register_address(c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]), - &c->src.val, c->src.bytes, ctxt->vcpu); + dest, len, ctxt->vcpu); if (rc != 0) return rc; - register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.bytes); + register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); return rc; } @@ -1157,11 +1158,9 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; int rc; - c->src.bytes = c->dst.bytes; - rc = emulate_pop(ctxt, ops); + rc = emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); if (rc != 0) return rc; - c->dst.val = c->src.val; return 0; } @@ -1467,11 +1466,9 @@ special_insn: break; case 0x58 ... 0x5f: /* pop reg */ pop_instruction: - c->src.bytes = c->op_bytes; - rc = emulate_pop(ctxt, ops); + rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); if (rc != 0) goto done; - c->dst.val = c->src.val; break; case 0x63: /* movsxd */ if (ctxt->mode != X86EMUL_MODE_PROT64) -- cgit v1.2.3 From 8b3079a5c0c031de07c8390aa160a4229088274f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 5 Jan 2009 12:10:54 +0200 Subject: KVM: VMX: When emulating on invalid vmx state, don't return to userspace unnecessarily If we aren't doing mmio there's no need to exit to userspace (which will just be confused). Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a309be6788e7..df454de8acfa 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -91,6 +91,7 @@ struct vcpu_vmx { } rmode; int vpid; bool emulation_required; + enum emulation_result invalid_state_emulation_result; /* Support for vnmi-less CPUs */ int soft_vnmi_blocked; @@ -3130,7 +3131,8 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - int err; + struct vcpu_vmx *vmx = to_vmx(vcpu); + enum emulation_result err = EMULATE_DONE; preempt_enable(); local_irq_enable(); @@ -3154,6 +3156,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, local_irq_disable(); preempt_disable(); + + vmx->invalid_state_emulation_result = err; } /* @@ -3205,7 +3209,7 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) if (vmx->emulation_required && emulate_invalid_guest_state) { if (guest_state_valid(vcpu)) vmx->emulation_required = 0; - return 0; + return vmx->invalid_state_emulation_result != EMULATE_DO_MMIO; } /* Access CR3 don't cause VMExit in paging mode, so we need -- cgit v1.2.3 From a77ab5ead5c1fef2c6c5a9b3cf3765e52643a2aa Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 5 Jan 2009 13:27:34 +0200 Subject: KVM: x86 emulator: implement 'ret far' instruction (opcode 0xcb) Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 94459f313f12..ca91749d2083 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -178,7 +178,7 @@ static u32 opcode_table[256] = { 0, ImplicitOps | Stack, 0, 0, ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, /* 0xC8 - 0xCF */ - 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, ImplicitOps | Stack, 0, 0, 0, 0, /* 0xD0 - 0xD7 */ ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, @@ -1278,6 +1278,25 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, return 0; } +static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + unsigned long cs; + + rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes); + if (rc) + return rc; + if (c->op_bytes == 4) + c->eip = (u32)c->eip; + rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); + if (rc) + return rc; + rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS); + return rc; +} + static inline int writeback(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -1735,6 +1754,11 @@ special_insn: mov: c->dst.val = c->src.val; break; + case 0xcb: /* ret far */ + rc = emulate_ret_far(ctxt, ops); + if (rc) + goto done; + break; case 0xd0 ... 0xd1: /* Grp2 */ c->src.val = 1; emulate_grp2(ctxt); -- cgit v1.2.3 From 269e05e48502f1cc06802e9fba90f5100dd6bb0d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 5 Jan 2009 15:21:42 +0200 Subject: KVM: Properly lock PIT creation Otherwise, two threads can create a PIT in parallel and cause a memory leak. Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 2 -- arch/x86/kvm/x86.c | 6 ++++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 528daadeba49..69d1bbff3fd3 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -548,9 +548,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) if (!pit) return NULL; - mutex_lock(&kvm->lock); pit->irq_source_id = kvm_request_irq_source_id(kvm); - mutex_unlock(&kvm->lock); if (pit->irq_source_id < 0) { kfree(pit); return NULL; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a1f14611f4b9..6fbc34603375 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1837,10 +1837,16 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; break; case KVM_CREATE_PIT: + mutex_lock(&kvm->lock); + r = -EEXIST; + if (kvm->arch.vpit) + goto create_pit_unlock; r = -ENOMEM; kvm->arch.vpit = kvm_create_pit(kvm); if (kvm->arch.vpit) r = 0; + create_pit_unlock: + mutex_unlock(&kvm->lock); break; case KVM_IRQ_LINE: { struct kvm_irq_level irq_event; -- cgit v1.2.3 From f5d0906b5bafd7faea553ed1cc92bd06755b66b9 Mon Sep 17 00:00:00 2001 From: Hollis Blanchard Date: Sun, 4 Jan 2009 13:51:09 -0600 Subject: KVM: ppc: remove debug support broken by KVM debug rewrite After the rewrite of KVM's debug support, this code doesn't even build any more. Signed-off-by: Hollis Blanchard Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_host.h | 5 --- arch/powerpc/include/asm/kvm_ppc.h | 3 -- arch/powerpc/kvm/44x.c | 66 ------------------------------------- arch/powerpc/kvm/powerpc.c | 27 ++------------- 4 files changed, 2 insertions(+), 99 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 1c6187697520..dfdf13c9fefd 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -111,11 +111,6 @@ struct kvm_arch { struct kvm_vcpu_arch { u32 host_stack; u32 host_pid; - u32 host_dbcr0; - u32 host_dbcr1; - u32 host_dbcr2; - u32 host_iac[4]; - u32 host_msr; u64 fpr[32]; ulong gpr[32]; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 6052779dbb56..2c6ee349df5e 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -77,9 +77,6 @@ extern int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu, extern void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu); extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu); -extern void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu); -extern void kvmppc_core_load_host_debugstate(struct kvm_vcpu *vcpu); - extern void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu); extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu); extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu); diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c index 8383603dd8a4..0cef809cec21 100644 --- a/arch/powerpc/kvm/44x.c +++ b/arch/powerpc/kvm/44x.c @@ -28,72 +28,6 @@ #include "44x_tlb.h" -/* Note: clearing MSR[DE] just means that the debug interrupt will not be - * delivered *immediately*. Instead, it simply sets the appropriate DBSR bits. - * If those DBSR bits are still set when MSR[DE] is re-enabled, the interrupt - * will be delivered as an "imprecise debug event" (which is indicated by - * DBSR[IDE]. - */ -static void kvm44x_disable_debug_interrupts(void) -{ - mtmsr(mfmsr() & ~MSR_DE); -} - -void kvmppc_core_load_host_debugstate(struct kvm_vcpu *vcpu) -{ - kvm44x_disable_debug_interrupts(); - - mtspr(SPRN_IAC1, vcpu->arch.host_iac[0]); - mtspr(SPRN_IAC2, vcpu->arch.host_iac[1]); - mtspr(SPRN_IAC3, vcpu->arch.host_iac[2]); - mtspr(SPRN_IAC4, vcpu->arch.host_iac[3]); - mtspr(SPRN_DBCR1, vcpu->arch.host_dbcr1); - mtspr(SPRN_DBCR2, vcpu->arch.host_dbcr2); - mtspr(SPRN_DBCR0, vcpu->arch.host_dbcr0); - mtmsr(vcpu->arch.host_msr); -} - -void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu) -{ - struct kvm_guest_debug *dbg = &vcpu->guest_debug; - u32 dbcr0 = 0; - - vcpu->arch.host_msr = mfmsr(); - kvm44x_disable_debug_interrupts(); - - /* Save host debug register state. */ - vcpu->arch.host_iac[0] = mfspr(SPRN_IAC1); - vcpu->arch.host_iac[1] = mfspr(SPRN_IAC2); - vcpu->arch.host_iac[2] = mfspr(SPRN_IAC3); - vcpu->arch.host_iac[3] = mfspr(SPRN_IAC4); - vcpu->arch.host_dbcr0 = mfspr(SPRN_DBCR0); - vcpu->arch.host_dbcr1 = mfspr(SPRN_DBCR1); - vcpu->arch.host_dbcr2 = mfspr(SPRN_DBCR2); - - /* set registers up for guest */ - - if (dbg->bp[0]) { - mtspr(SPRN_IAC1, dbg->bp[0]); - dbcr0 |= DBCR0_IAC1 | DBCR0_IDM; - } - if (dbg->bp[1]) { - mtspr(SPRN_IAC2, dbg->bp[1]); - dbcr0 |= DBCR0_IAC2 | DBCR0_IDM; - } - if (dbg->bp[2]) { - mtspr(SPRN_IAC3, dbg->bp[2]); - dbcr0 |= DBCR0_IAC3 | DBCR0_IDM; - } - if (dbg->bp[3]) { - mtspr(SPRN_IAC4, dbg->bp[3]); - dbcr0 |= DBCR0_IAC4 | DBCR0_IDM; - } - - mtspr(SPRN_DBCR0, dbcr0); - mtspr(SPRN_DBCR1, 0); - mtspr(SPRN_DBCR2, 0); -} - void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { kvmppc_44x_tlb_load(vcpu); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index f30be9ef2314..9057335fdc61 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -221,41 +221,18 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - if (vcpu->guest_debug.enabled) - kvmppc_core_load_guest_debugstate(vcpu); - kvmppc_core_vcpu_load(vcpu, cpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - if (vcpu->guest_debug.enabled) - kvmppc_core_load_host_debugstate(vcpu); - - /* Don't leave guest TLB entries resident when being de-scheduled. */ - /* XXX It would be nice to differentiate between heavyweight exit and - * sched_out here, since we could avoid the TLB flush for heavyweight - * exits. */ - _tlbil_all(); kvmppc_core_vcpu_put(vcpu); } int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, - struct kvm_guest_debug *dbg) + struct kvm_guest_debug *dbg) { - int i; - - vcpu->guest_debug.enabled = dbg->enabled; - if (vcpu->guest_debug.enabled) { - for (i=0; i < ARRAY_SIZE(vcpu->guest_debug.bp); i++) { - if (dbg->breakpoints[i].enabled) - vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address; - else - vcpu->guest_debug.bp[i] = 0; - } - } - - return 0; + return -EINVAL; } static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu, -- cgit v1.2.3 From 67346440e83d2a2f2e9801f370b6240317c7d9bd Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 6 Jan 2009 10:03:01 +0800 Subject: KVM: Remove duplicated prototype of kvm_arch_destroy_vm Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index e92212f970db..3cf0ede3fd73 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -237,7 +237,6 @@ int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, int user_alloc); long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); -void kvm_arch_destroy_vm(struct kvm *kvm); int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); -- cgit v1.2.3 From 17071fe74fe0fbfdb03cd9b82f2490447cf1f986 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 6 Jan 2009 16:25:11 +0800 Subject: KVM: Add support to disable MSI for assigned device MSI is always enabled by default for msi2intx=1. But if msi2intx=0, we have to disable MSI if guest require to do so. The patch also discard unnecessary msi2intx judgment if guest want to update MSI state. Notice KVM_DEV_IRQ_ASSIGN_MSI_ACTION is a mask which should cover all MSI related operations, though we only got one for now. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- include/linux/kvm.h | 1 + virt/kvm/kvm_main.c | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/include/linux/kvm.h b/include/linux/kvm.h index ae7a12c77427..73f348066866 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -550,6 +550,7 @@ struct kvm_assigned_irq { #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) +#define KVM_DEV_IRQ_ASSIGN_MSI_ACTION KVM_DEV_IRQ_ASSIGN_ENABLE_MSI #define KVM_DEV_IRQ_ASSIGN_ENABLE_MSI (1 << 0) #endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f83ef9c7e89b..04401e17c758 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -343,6 +343,14 @@ static int assigned_device_update_msi(struct kvm *kvm, adev->irq_requested_type &= ~KVM_ASSIGNED_DEV_GUEST_MSI; adev->guest_irq = airq->guest_irq; adev->ack_notifier.gsi = airq->guest_irq; + } else { + /* + * Guest require to disable device MSI, we disable MSI and + * re-enable INTx by default again. Notice it's only for + * non-msi2intx. + */ + assigned_device_update_intx(kvm, adev, airq); + return 0; } if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) @@ -379,6 +387,7 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, { int r = 0; struct kvm_assigned_dev_kernel *match; + u32 current_flags = 0, changed_flags; mutex_lock(&kvm->lock); @@ -416,8 +425,13 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, } } - if ((!msi2intx && - (assigned_irq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSI)) || + if ((match->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) && + (match->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_MSI)) + current_flags |= KVM_DEV_IRQ_ASSIGN_ENABLE_MSI; + + changed_flags = assigned_irq->flags ^ current_flags; + + if ((changed_flags & KVM_DEV_IRQ_ASSIGN_MSI_ACTION) || (msi2intx && match->dev->msi_enabled)) { #ifdef CONFIG_X86 r = assigned_device_update_msi(kvm, match, assigned_irq); -- cgit v1.2.3 From c8a73f186bf62235b6fb5dd52601d641917dd50b Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 5 Jan 2009 16:02:47 +0100 Subject: KVM: SVM: Add microcode patch level dummy VMware ESX checks if the microcode level is correct when using a barcelona CPU, in order to see if it actually can use SVM. Let's tell it we're on the safe side... Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b3a7a314d55c..3a60c3f04e65 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1928,6 +1928,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) case MSR_VM_CR: *data = 0; break; + case MSR_IA32_UCODE_REV: + *data = 0x01000065; + break; default: return kvm_get_msr_common(vcpu, ecx, data); } -- cgit v1.2.3 From 4677a3b693e035f186e2875259b9a0bb94c42fbe Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 6 Jan 2009 13:00:27 +0200 Subject: KVM: MMU: Optimize page unshadowing Using kvm_mmu_lookup_page() will result in multiple scans of the hash chains; use hlist_for_each_entry_safe() to achieve a single scan instead. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 15850809b55b..aac0499947d8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1471,11 +1471,20 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) { + unsigned index; + struct hlist_head *bucket; struct kvm_mmu_page *sp; + struct hlist_node *node, *nn; - while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) { - pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word); - kvm_mmu_zap_page(kvm, sp); + index = kvm_page_table_hashfn(gfn); + bucket = &kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) { + if (sp->gfn == gfn && !sp->role.metaphysical + && !sp->role.invalid) { + pgprintk("%s: zap %lx %x\n", + __func__, gfn, sp->role.word); + kvm_mmu_zap_page(kvm, sp); + } } } -- cgit v1.2.3 From 5d9b8e30f543a9f21a968a4cda71e8f6d1c66a61 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 4 Jan 2009 18:04:18 +0200 Subject: KVM: Add CONFIG_HAVE_KVM_IRQCHIP Two KVM archs support irqchips and two don't. Add a Kconfig item to make selecting between the two models easier. Signed-off-by: Avi Kivity --- arch/ia64/kvm/Kconfig | 4 ++++ arch/powerpc/kvm/Kconfig | 3 +++ arch/s390/kvm/Kconfig | 3 +++ arch/x86/kvm/Kconfig | 4 ++++ 4 files changed, 14 insertions(+) diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig index f833a0b4188d..0a2d6b86075a 100644 --- a/arch/ia64/kvm/Kconfig +++ b/arch/ia64/kvm/Kconfig @@ -4,6 +4,10 @@ config HAVE_KVM bool +config HAVE_KVM_IRQCHIP + bool + default y + menuconfig VIRTUALIZATION bool "Virtualization" depends on HAVE_KVM || IA64 diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 146570550142..5a152a52796f 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -2,6 +2,9 @@ # KVM configuration # +config HAVE_KVM_IRQCHIP + bool + menuconfig VIRTUALIZATION bool "Virtualization" ---help--- diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index e051cad1f1e0..3e260b7e37b2 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -4,6 +4,9 @@ config HAVE_KVM bool +config HAVE_KVM_IRQCHIP + bool + menuconfig VIRTUALIZATION bool "Virtualization" default y diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index b81125f0bdee..0a303c3ed11f 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -4,6 +4,10 @@ config HAVE_KVM bool +config HAVE_KVM_IRQCHIP + bool + default y + menuconfig VIRTUALIZATION bool "Virtualization" depends on HAVE_KVM || X86 -- cgit v1.2.3 From 75858a84a6207f5e60196f6bbd18fde4250e5759 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 4 Jan 2009 17:10:50 +0200 Subject: KVM: Interrupt mask notifiers for ioapic Allow clients to request notifications when the guest masks or unmasks a particular irq line. This complements irq ack notifications, as the guest will not ack an irq line that is masked. Currently implemented for the ioapic only. Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 17 +++++++++++++++++ virt/kvm/ioapic.c | 6 ++++++ virt/kvm/irq_comm.c | 24 ++++++++++++++++++++++++ virt/kvm/kvm_main.c | 3 +++ 4 files changed, 50 insertions(+) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3cf0ede3fd73..99963f36a6db 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -127,6 +127,10 @@ struct kvm { struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; #endif +#ifdef CONFIG_HAVE_KVM_IRQCHIP + struct hlist_head mask_notifier_list; +#endif + #ifdef KVM_ARCH_WANT_MMU_NOTIFIER struct mmu_notifier mmu_notifier; unsigned long mmu_notifier_seq; @@ -320,6 +324,19 @@ struct kvm_assigned_dev_kernel { struct pci_dev *dev; struct kvm *kvm; }; + +struct kvm_irq_mask_notifier { + void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked); + int irq; + struct hlist_node link; +}; + +void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn); +void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn); +void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask); + void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level); void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi); void kvm_register_irq_ack_notifier(struct kvm *kvm, diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 23b81cf242af..e85a2bcd2db1 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -101,6 +101,7 @@ static void ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx) static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) { unsigned index; + bool mask_before, mask_after; switch (ioapic->ioregsel) { case IOAPIC_REG_VERSION: @@ -120,6 +121,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) ioapic_debug("change redir index %x val %x\n", index, val); if (index >= IOAPIC_NUM_PINS) return; + mask_before = ioapic->redirtbl[index].fields.mask; if (ioapic->ioregsel & 1) { ioapic->redirtbl[index].bits &= 0xffffffff; ioapic->redirtbl[index].bits |= (u64) val << 32; @@ -128,6 +130,9 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) ioapic->redirtbl[index].bits |= (u32) val; ioapic->redirtbl[index].fields.remote_irr = 0; } + mask_after = ioapic->redirtbl[index].fields.mask; + if (mask_before != mask_after) + kvm_fire_mask_notifiers(ioapic->kvm, index, mask_after); if (ioapic->irr & (1 << index)) ioapic_service(ioapic, index); break; @@ -426,3 +431,4 @@ int kvm_ioapic_init(struct kvm *kvm) kvm_io_bus_register_dev(&kvm->mmio_bus, &ioapic->dev); return 0; } + diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index aa5d1e5c497e..5162a411e4d2 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -99,3 +99,27 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) clear_bit(irq_source_id, &kvm->arch.irq_states[i]); clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); } + +void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn) +{ + kimn->irq = irq; + hlist_add_head(&kimn->link, &kvm->mask_notifier_list); +} + +void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn) +{ + hlist_del(&kimn->link); +} + +void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask) +{ + struct kvm_irq_mask_notifier *kimn; + struct hlist_node *n; + + hlist_for_each_entry(kimn, n, &kvm->mask_notifier_list, link) + if (kimn->irq == irq) + kimn->func(kimn, mask); +} + diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 04401e17c758..786a3ae373b0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -842,6 +842,9 @@ static struct kvm *kvm_create_vm(void) if (IS_ERR(kvm)) goto out; +#ifdef CONFIG_HAVE_KVM_IRQCHIP + INIT_HLIST_HEAD(&kvm->mask_notifier_list); +#endif #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET page = alloc_page(GFP_KERNEL | __GFP_ZERO); -- cgit v1.2.3 From 4780c65904f0fc4e312ee2da9383eacbe04e61ea Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 4 Jan 2009 18:06:06 +0200 Subject: KVM: Reset PIT irq injection logic when the PIT IRQ is unmasked While the PIT is masked the guest cannot ack the irq, so the reinject logic will never allow the interrupt to be injected. Fix by resetting the reinjection counters on unmask. Unbreaks Xen. Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 15 +++++++++++++++ arch/x86/kvm/i8254.h | 1 + 2 files changed, 16 insertions(+) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 69d1bbff3fd3..c13bb92d3157 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -539,6 +539,16 @@ void kvm_pit_reset(struct kvm_pit *pit) pit->pit_state.irq_ack = 1; } +static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask) +{ + struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier); + + if (!mask) { + atomic_set(&pit->pit_state.pit_timer.pending, 0); + pit->pit_state.irq_ack = 1; + } +} + struct kvm_pit *kvm_create_pit(struct kvm *kvm) { struct kvm_pit *pit; @@ -586,6 +596,9 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) kvm_pit_reset(pit); + pit->mask_notifier.func = pit_mask_notifer; + kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); + return pit; } @@ -594,6 +607,8 @@ void kvm_free_pit(struct kvm *kvm) struct hrtimer *timer; if (kvm->arch.vpit) { + kvm_unregister_irq_mask_notifier(kvm, 0, + &kvm->arch.vpit->mask_notifier); mutex_lock(&kvm->arch.vpit->pit_state.lock); timer = &kvm->arch.vpit->pit_state.pit_timer.timer; hrtimer_cancel(timer); diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 76959c4b500e..6acbe4b505d5 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -46,6 +46,7 @@ struct kvm_pit { struct kvm *kvm; struct kvm_kpit_state pit_state; int irq_source_id; + struct kvm_irq_mask_notifier mask_notifier; }; #define KVM_PIT_BASE_ADDRESS 0x40 -- cgit v1.2.3 From a26b73ad5e763db1b01cca6965ac3b65fe36e82a Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Thu, 8 Jan 2009 13:58:48 +0100 Subject: KVM: ia64: expose registers in struct kvm_regs Provide register layout for struct kvm_regs exposed to userland. Signed-off-by: Jes Sorensen Signed-off-by: Avi Kivity --- arch/ia64/include/asm/kvm.h | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h index b5145784233e..0ee5bd7a988f 100644 --- a/arch/ia64/include/asm/kvm.h +++ b/arch/ia64/include/asm/kvm.h @@ -166,7 +166,40 @@ struct saved_vpd { unsigned long vcpuid[5]; unsigned long vpsr; unsigned long vpr; - unsigned long vcr[128]; + union { + unsigned long vcr[128]; + struct { + unsigned long dcr; + unsigned long itm; + unsigned long iva; + unsigned long rsv1[5]; + unsigned long pta; + unsigned long rsv2[7]; + unsigned long ipsr; + unsigned long isr; + unsigned long rsv3; + unsigned long iip; + unsigned long ifa; + unsigned long itir; + unsigned long iipa; + unsigned long ifs; + unsigned long iim; + unsigned long iha; + unsigned long rsv4[38]; + unsigned long lid; + unsigned long ivr; + unsigned long tpr; + unsigned long eoi; + unsigned long irr[4]; + unsigned long itv; + unsigned long pmv; + unsigned long cmcv; + unsigned long rsv5[5]; + unsigned long lrr0; + unsigned long lrr1; + unsigned long rsv6[46]; + }; + }; }; struct kvm_regs { -- cgit v1.2.3 From ff81ff10b4417952919dcc0bd67effa9ceb411c0 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 8 Jan 2009 11:05:17 -0800 Subject: KVM: SVM: Fix typo in has_svm() Signed-off-by: Joe Perches Acked-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 3a60c3f04e65..db5021b2b5a8 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -250,7 +250,7 @@ static int has_svm(void) const char *msg; if (!cpu_has_svm(&msg)) { - printk(KERN_INFO "has_svn: %s\n", msg); + printk(KERN_INFO "has_svm: %s\n", msg); return 0; } -- cgit v1.2.3 From 9903a927a4aea4b1ea42356a8453fca664df0b18 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 8 Jan 2009 17:44:19 -0200 Subject: KVM: MMU: drop zeroing on mmu_memory_cache_alloc Zeroing on mmu_memory_cache_alloc is unnecessary since: - Smaller areas are pre-allocated with kmem_cache_zalloc. - Page pointed by ->spt is overwritten with prefetch_page and entries in page pointed by ->gfns are initialized before reading. [avi: zeroing pages is unnecessary] Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index aac0499947d8..de9a9fbc16ed 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -352,7 +352,6 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, BUG_ON(!mc->nobjs); p = mc->objects[--mc->nobjs]; - memset(p, 0, size); return p; } -- cgit v1.2.3 From f6e2c02b6d28ddabe99377c5640a833407a62632 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 11 Jan 2009 13:02:10 +0200 Subject: KVM: MMU: Rename "metaphysical" attribute to "direct" This actually describes what is going on, rather than alerting the reader that something strange is going on. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 5 +++-- arch/x86/kvm/mmu.c | 32 ++++++++++++++++---------------- arch/x86/kvm/paging_tmpl.h | 12 ++++++------ 3 files changed, 25 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 863ea73431ad..55fd4c5fd388 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -170,7 +170,8 @@ struct kvm_pte_chain { * bits 0:3 - total guest paging levels (2-4, or zero for real mode) * bits 4:7 - page table level for this shadow (1-4) * bits 8:9 - page table quadrant for 2-level guests - * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode) + * bit 16 - direct mapping of virtual to physical mapping at gfn + * used for real mode and two-dimensional paging * bits 17:19 - common access permissions for all ptes in this shadow page */ union kvm_mmu_page_role { @@ -180,7 +181,7 @@ union kvm_mmu_page_role { unsigned level:4; unsigned quadrant:2; unsigned pad_for_nice_hex_output:6; - unsigned metaphysical:1; + unsigned direct:1; unsigned access:3; unsigned invalid:1; unsigned cr4_pge:1; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index de9a9fbc16ed..ef060ec444a4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1066,7 +1066,7 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) index = kvm_page_table_hashfn(gfn); bucket = &kvm->arch.mmu_page_hash[index]; hlist_for_each_entry(sp, node, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.metaphysical + if (sp->gfn == gfn && !sp->role.direct && !sp->role.invalid) { pgprintk("%s: found role %x\n", __func__, sp->role.word); @@ -1200,7 +1200,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gaddr, unsigned level, - int metaphysical, + int direct, unsigned access, u64 *parent_pte) { @@ -1213,7 +1213,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, role = vcpu->arch.mmu.base_role; role.level = level; - role.metaphysical = metaphysical; + role.direct = direct; role.access = access; if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); @@ -1250,7 +1250,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, sp->role = role; sp->global = role.cr4_pge; hlist_add_head(&sp->hash_link, bucket); - if (!metaphysical) { + if (!direct) { if (rmap_write_protect(vcpu->kvm, gfn)) kvm_flush_remote_tlbs(vcpu->kvm); account_shadowed(vcpu->kvm, gfn); @@ -1395,7 +1395,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_page_unlink_children(kvm, sp); kvm_mmu_unlink_parents(kvm, sp); kvm_flush_remote_tlbs(kvm); - if (!sp->role.invalid && !sp->role.metaphysical) + if (!sp->role.invalid && !sp->role.direct) unaccount_shadowed(kvm, sp->gfn); if (sp->unsync) kvm_unlink_unsync_page(kvm, sp); @@ -1458,7 +1458,7 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) index = kvm_page_table_hashfn(gfn); bucket = &kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.metaphysical) { + if (sp->gfn == gfn && !sp->role.direct) { pgprintk("%s: gfn %lx role %x\n", __func__, gfn, sp->role.word); r = 1; @@ -1478,7 +1478,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) index = kvm_page_table_hashfn(gfn); bucket = &kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) { - if (sp->gfn == gfn && !sp->role.metaphysical + if (sp->gfn == gfn && !sp->role.direct && !sp->role.invalid) { pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word); @@ -1638,7 +1638,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) bucket = &vcpu->kvm->arch.mmu_page_hash[index]; /* don't unsync if pagetable is shadowed with multiple roles */ hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { - if (s->gfn != sp->gfn || s->role.metaphysical) + if (s->gfn != sp->gfn || s->role.direct) continue; if (s->role.word != sp->role.word) return 1; @@ -1951,7 +1951,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) int i; gfn_t root_gfn; struct kvm_mmu_page *sp; - int metaphysical = 0; + int direct = 0; root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; @@ -1960,18 +1960,18 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); if (tdp_enabled) - metaphysical = 1; + direct = 1; sp = kvm_mmu_get_page(vcpu, root_gfn, 0, - PT64_ROOT_LEVEL, metaphysical, + PT64_ROOT_LEVEL, direct, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; vcpu->arch.mmu.root_hpa = root; return; } - metaphysical = !is_paging(vcpu); + direct = !is_paging(vcpu); if (tdp_enabled) - metaphysical = 1; + direct = 1; for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; @@ -1985,7 +1985,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) } else if (vcpu->arch.mmu.root_level == 0) root_gfn = 0; sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, - PT32_ROOT_LEVEL, metaphysical, + PT32_ROOT_LEVEL, direct, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; @@ -2487,7 +2487,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { - if (sp->gfn != gfn || sp->role.metaphysical || sp->role.invalid) + if (sp->gfn != gfn || sp->role.direct || sp->role.invalid) continue; pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); @@ -3125,7 +3125,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) gfn_t gfn; list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { - if (sp->role.metaphysical) + if (sp->role.direct) continue; gfn = unalias_gfn(vcpu->kvm, sp->gfn); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 46b68f941f60..7314c0944c5f 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -277,7 +277,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, unsigned access = gw->pt_access; struct kvm_mmu_page *shadow_page; u64 spte, *sptep; - int metaphysical; + int direct; gfn_t table_gfn; int r; int level; @@ -313,17 +313,17 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) { - metaphysical = 1; + direct = 1; if (!is_dirty_pte(gw->ptes[level - 1])) access &= ~ACC_WRITE_MASK; table_gfn = gpte_to_gfn(gw->ptes[level - 1]); } else { - metaphysical = 0; + direct = 0; table_gfn = gw->table_gfn[level - 2]; } shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, - metaphysical, access, sptep); - if (!metaphysical) { + direct, access, sptep); + if (!direct) { r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2], &curr_pte, sizeof(curr_pte)); @@ -512,7 +512,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, pt_element_t pt[256 / sizeof(pt_element_t)]; gpa_t pte_gpa; - if (sp->role.metaphysical + if (sp->role.direct || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) { nonpaging_prefetch_page(vcpu, sp); return; -- cgit v1.2.3 From 5a41accd3f69c6ef1d58f0c148980bae70515937 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 11 Jan 2009 17:19:35 +0200 Subject: KVM: MMU: Only enable cr4_pge role in shadow mode Two dimensional paging is only confused by it. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6fbc34603375..19ccde621cd0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -364,7 +364,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) } kvm_x86_ops->set_cr4(vcpu, cr4); vcpu->arch.cr4 = cr4; - vcpu->arch.mmu.base_role.cr4_pge = !!(cr4 & X86_CR4_PGE); + vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled; kvm_mmu_sync_global(vcpu); kvm_mmu_reset_context(vcpu); } -- cgit v1.2.3 From 87c656b4147cd08c6efba95d8d70c12cba181255 Mon Sep 17 00:00:00 2001 From: Liu Yu Date: Wed, 14 Jan 2009 10:47:36 -0600 Subject: powerpc/fsl-booke: declare tlbcam_index for use in c So, KVM needs to read tlbcam_index to know exactly which TLB1 entry is unused by host. Signed-off-by: Liu Yu Acked-by: Kumar Gala Signed-off-by: Hollis Blanchard Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/mmu-fsl-booke.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/include/asm/mmu-fsl-booke.h b/arch/powerpc/include/asm/mmu-fsl-booke.h index 3f941c0f7e8e..4285b64a65e0 100644 --- a/arch/powerpc/include/asm/mmu-fsl-booke.h +++ b/arch/powerpc/include/asm/mmu-fsl-booke.h @@ -75,6 +75,8 @@ #ifndef __ASSEMBLY__ +extern unsigned int tlbcam_index; + typedef struct { unsigned int id; unsigned int active; -- cgit v1.2.3 From fb2838d446f6ee7e13e4149e08ec138753c5c450 Mon Sep 17 00:00:00 2001 From: Liu Yu Date: Wed, 14 Jan 2009 10:47:37 -0600 Subject: KVM: ppc: Fix e500 warnings and some spelling problems Signed-off-by: Avi Kivity --- arch/powerpc/kvm/e500_emulate.c | 2 -- arch/powerpc/kvm/e500_tlb.c | 6 +++--- arch/powerpc/kvm/e500_tlb.h | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index d3c0c7c7f209..7a98d4a4c1ed 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -30,8 +30,6 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, int emulated = EMULATE_DONE; int ra; int rb; - int rs; - int rt; switch (get_op(inst)) { case 31: diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index 6a50340f575e..e3daf57b5f5e 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -260,7 +260,7 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, unsigned int victim, pidsel, tsized; int tlbsel; - /* since we only have tow TLBs, only lower bit is used. */ + /* since we only have two TLBs, only lower bit is used. */ tlbsel = (vcpu_e500->mas4 >> 28) & 0x1; victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0; pidsel = (vcpu_e500->mas4 >> 16) & 0xf; @@ -402,7 +402,7 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb) ia = (ea >> 2) & 0x1; - /* since we only have tow TLBs, only lower bit is used. */ + /* since we only have two TLBs, only lower bit is used. */ tlbsel = (ea >> 3) & 0x1; if (ia) { @@ -471,7 +471,7 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) } else { int victim; - /* since we only have tow TLBs, only lower bit is used. */ + /* since we only have two TLBs, only lower bit is used. */ tlbsel = vcpu_e500->mas4 >> 28 & 0x1; victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0; diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500_tlb.h index d8833f955163..ab49e9355185 100644 --- a/arch/powerpc/kvm/e500_tlb.h +++ b/arch/powerpc/kvm/e500_tlb.h @@ -126,7 +126,7 @@ static inline unsigned int get_tlb_tlbsel( { /* * Manual says that tlbsel has 2 bits wide. - * Since we only have tow TLBs, only lower bit is used. + * Since we only have two TLBs, only lower bit is used. */ return (vcpu_e500->mas0 >> 28) & 0x1; } -- cgit v1.2.3 From 9aa4dd5e5f47a5feb96fc394fccf0265ab7d85a4 Mon Sep 17 00:00:00 2001 From: Liu Yu Date: Wed, 14 Jan 2009 10:47:38 -0600 Subject: KVM: ppc: Move to new TLB invalidate interface Commit 2a4aca1144394653269720ffbb5a325a77abd5fa removed old method _tlbia(). Signed-off-by: Liu Yu Signed-off-by: Hollis Blanchard Signed-off-by: Avi Kivity --- arch/powerpc/kvm/e500_tlb.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index e3daf57b5f5e..d437160d388c 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -20,6 +20,7 @@ #include #include +#include "../mm/mmu_decl.h" #include "e500_tlb.h" #define to_htlb1_esel(esel) (tlb1_entry_num - (esel) - 1) @@ -158,7 +159,7 @@ void kvmppc_e500_tlb_load(struct kvm_vcpu *vcpu, int cpu) void kvmppc_e500_tlb_put(struct kvm_vcpu *vcpu) { - _tlbia(); + _tlbil_all(); } /* Search the guest TLB for a matching entry. */ @@ -362,11 +363,10 @@ void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode) int i; /* XXX Replace loop with fancy data structures. */ - /* needn't set modified since tlbia will make TLB1 coherent */ for (i = 0; i < tlb1_max_shadow_size(); i++) kvmppc_e500_stlbe_invalidate(vcpu_e500, 1, i); - _tlbia(); + _tlbil_all(); } } @@ -417,7 +417,7 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb) kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); } - _tlbia(); + _tlbil_all(); return EMULATE_DONE; } @@ -604,7 +604,7 @@ void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) kvmppc_e500_shadow_release(vcpu_e500, tlbsel, i); /* discard all guest mapping */ - _tlbia(); + _tlbil_all(); } void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, -- cgit v1.2.3 From 193554750441d91e127dd5066b8aebe0f769101c Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Wed, 14 Jan 2009 16:56:00 +0000 Subject: KVM: x86: Fix typos and whitespace errors Some typos, comments, whitespace errors corrected in the cpuid code Signed-off-by: Amit Shah Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 19ccde621cd0..141a0166e51c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1067,7 +1067,7 @@ long kvm_arch_dev_ioctl(struct file *filp, if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, - cpuid_arg->entries); + cpuid_arg->entries); if (r) goto out; @@ -1165,8 +1165,8 @@ out: } static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) + struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries) { int r; @@ -1185,8 +1185,8 @@ out: } static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) + struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries) { int r; @@ -1195,7 +1195,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, goto out; r = -EFAULT; if (copy_to_user(entries, &vcpu->arch.cpuid_entries, - vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) + vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) goto out; return 0; @@ -1205,12 +1205,12 @@ out: } static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, - u32 index) + u32 index) { entry->function = function; entry->index = index; cpuid_count(entry->function, entry->index, - &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); + &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); entry->flags = 0; } @@ -1249,7 +1249,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY) | bit(X86_FEATURE_SVM); - /* all func 2 cpuid_count() should be called on the same cpu */ + /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); do_cpuid_1_ent(entry, function, index); ++*nent; @@ -1323,7 +1323,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) + struct kvm_cpuid_entry2 __user *entries) { struct kvm_cpuid_entry2 *cpuid_entries; int limit, nent = 0, r = -E2BIG; @@ -1340,7 +1340,7 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, limit = cpuid_entries[0].eax; for (func = 1; func <= limit && nent < cpuid->nent; ++func) do_cpuid_ent(&cpuid_entries[nent], func, 0, - &nent, cpuid->nent); + &nent, cpuid->nent); r = -E2BIG; if (nent >= cpuid->nent) goto out_free; @@ -1349,10 +1349,10 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, limit = cpuid_entries[nent - 1].eax; for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) do_cpuid_ent(&cpuid_entries[nent], func, 0, - &nent, cpuid->nent); + &nent, cpuid->nent); r = -EFAULT; if (copy_to_user(entries, cpuid_entries, - nent * sizeof(struct kvm_cpuid_entry2))) + nent * sizeof(struct kvm_cpuid_entry2))) goto out_free; cpuid->nent = nent; r = 0; @@ -1496,7 +1496,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, - cpuid_arg->entries); + cpuid_arg->entries); if (r) goto out; break; @@ -1509,7 +1509,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, - cpuid_arg->entries); + cpuid_arg->entries); if (r) goto out; r = -EFAULT; @@ -2864,7 +2864,7 @@ static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) return 0; if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && - !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) + !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) return 0; return 1; } @@ -2892,7 +2892,6 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, if (!best || e->function > best->function) best = e; } - return best; } -- cgit v1.2.3 From 399ec807ddc38ecccf8c06dbde04531cbdc63e11 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 19 Nov 2008 13:58:46 +0200 Subject: KVM: Userspace controlled irq routing Currently KVM has a static routing from GSI numbers to interrupts (namely, 0-15 are mapped 1:1 to both PIC and IOAPIC, and 16:23 are mapped 1:1 to the IOAPIC). This is insufficient for several reasons: - HPET requires non 1:1 mapping for the timer interrupt - MSIs need a new method to assign interrupt numbers and dispatch them - ACPI APIC mode needs to be able to reassign the PCI LINK interrupts to the ioapics This patch implements an interrupt routing table (as a linked list, but this can be easily changed) and a userspace interface to replace the table. The routing table is initialized according to the current hardwired mapping. Signed-off-by: Avi Kivity --- arch/ia64/kvm/kvm-ia64.c | 5 ++ arch/x86/kvm/x86.c | 6 ++ include/linux/kvm.h | 33 ++++++++++ include/linux/kvm_host.h | 31 +++++++++ virt/kvm/irq_comm.c | 168 +++++++++++++++++++++++++++++++++++++++++++++-- virt/kvm/kvm_main.c | 36 ++++++++++ 6 files changed, 275 insertions(+), 4 deletions(-) diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 1477f91617a5..dbf527a57341 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -919,6 +919,11 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_ioapic_init(kvm); if (r) goto out; + r = kvm_setup_default_irq_routing(kvm); + if (r) { + kfree(kvm->arch.vioapic); + goto out; + } break; case KVM_IRQ_LINE: { struct kvm_irq_level irq_event; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 141a0166e51c..32e3a7ec6ad2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1835,6 +1835,12 @@ long kvm_arch_vm_ioctl(struct file *filp, } } else goto out; + r = kvm_setup_default_irq_routing(kvm); + if (r) { + kfree(kvm->arch.vpic); + kfree(kvm->arch.vioapic); + goto out; + } break; case KVM_CREATE_PIT: mutex_lock(&kvm->lock); diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 73f348066866..7a5d73a8d4fa 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -398,6 +398,38 @@ struct kvm_trace_rec { #endif #if defined(CONFIG_X86) #define KVM_CAP_REINJECT_CONTROL 24 +#endif +#if defined(CONFIG_X86)||defined(CONFIG_IA64) +#define KVM_CAP_IRQ_ROUTING 25 +#endif + +#ifdef KVM_CAP_IRQ_ROUTING + +struct kvm_irq_routing_irqchip { + __u32 irqchip; + __u32 pin; +}; + +/* gsi routing entry types */ +#define KVM_IRQ_ROUTING_IRQCHIP 1 + +struct kvm_irq_routing_entry { + __u32 gsi; + __u32 type; + __u32 flags; + __u32 pad; + union { + struct kvm_irq_routing_irqchip irqchip; + __u32 pad[8]; + } u; +}; + +struct kvm_irq_routing { + __u32 nr; + __u32 flags; + struct kvm_irq_routing_entry entries[0]; +}; + #endif /* @@ -430,6 +462,7 @@ struct kvm_trace_rec { _IOW(KVMIO, 0x68, struct kvm_coalesced_mmio_zone) #define KVM_ASSIGN_PCI_DEVICE _IOR(KVMIO, 0x69, \ struct kvm_assigned_pci_dev) +#define KVM_SET_GSI_ROUTING _IOW(KVMIO, 0x6a, struct kvm_irq_routing) #define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \ struct kvm_assigned_irq) #define KVM_REINJECT_CONTROL _IO(KVMIO, 0x71) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 99963f36a6db..ce285e01bd57 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -107,6 +107,19 @@ struct kvm_memory_slot { int user_alloc; }; +struct kvm_kernel_irq_routing_entry { + u32 gsi; + void (*set)(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int level); + union { + struct { + unsigned irqchip; + unsigned pin; + } irqchip; + }; + struct list_head link; +}; + struct kvm { struct mutex lock; /* protects the vcpus array and APIC accesses */ spinlock_t mmu_lock; @@ -128,6 +141,7 @@ struct kvm { #endif #ifdef CONFIG_HAVE_KVM_IRQCHIP + struct list_head irq_routing; /* of kvm_kernel_irq_routing_entry */ struct hlist_head mask_notifier_list; #endif @@ -480,4 +494,21 @@ static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_se } #endif +#ifdef CONFIG_HAVE_KVM_IRQCHIP + +#define KVM_MAX_IRQ_ROUTES 1024 + +int kvm_setup_default_irq_routing(struct kvm *kvm); +int kvm_set_irq_routing(struct kvm *kvm, + const struct kvm_irq_routing_entry *entries, + unsigned nr, + unsigned flags); +void kvm_free_irq_routing(struct kvm *kvm); + +#else + +static inline void kvm_free_irq_routing(struct kvm *kvm) {} + +#endif + #endif diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 5162a411e4d2..a797fa5e6420 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -24,9 +24,24 @@ #include "ioapic.h" +static void kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int level) +{ +#ifdef CONFIG_X86 + kvm_pic_set_irq(pic_irqchip(kvm), e->irqchip.pin, level); +#endif +} + +static void kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int level) +{ + kvm_ioapic_set_irq(kvm->arch.vioapic, e->irqchip.pin, level); +} + /* This should be called with the kvm->lock mutex held */ void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) { + struct kvm_kernel_irq_routing_entry *e; unsigned long *irq_state = (unsigned long *)&kvm->arch.irq_states[irq]; /* Logical OR for level trig interrupt */ @@ -39,10 +54,9 @@ void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) * IOAPIC. So set the bit in both. The guest will ignore * writes to the unused one. */ - kvm_ioapic_set_irq(kvm->arch.vioapic, irq, !!(*irq_state)); -#ifdef CONFIG_X86 - kvm_pic_set_irq(pic_irqchip(kvm), irq, !!(*irq_state)); -#endif + list_for_each_entry(e, &kvm->irq_routing, link) + if (e->gsi == irq) + e->set(e, kvm, !!(*irq_state)); } void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi) @@ -123,3 +137,149 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask) kimn->func(kimn, mask); } +static void __kvm_free_irq_routing(struct list_head *irq_routing) +{ + struct kvm_kernel_irq_routing_entry *e, *n; + + list_for_each_entry_safe(e, n, irq_routing, link) + kfree(e); +} + +void kvm_free_irq_routing(struct kvm *kvm) +{ + __kvm_free_irq_routing(&kvm->irq_routing); +} + +int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue) +{ + int r = -EINVAL; + int delta; + + e->gsi = ue->gsi; + switch (ue->type) { + case KVM_IRQ_ROUTING_IRQCHIP: + delta = 0; + switch (ue->u.irqchip.irqchip) { + case KVM_IRQCHIP_PIC_MASTER: + e->set = kvm_set_pic_irq; + break; + case KVM_IRQCHIP_PIC_SLAVE: + e->set = kvm_set_pic_irq; + delta = 8; + break; + case KVM_IRQCHIP_IOAPIC: + e->set = kvm_set_ioapic_irq; + break; + default: + goto out; + } + e->irqchip.irqchip = ue->u.irqchip.irqchip; + e->irqchip.pin = ue->u.irqchip.pin + delta; + break; + default: + goto out; + } + r = 0; +out: + return r; +} + + +int kvm_set_irq_routing(struct kvm *kvm, + const struct kvm_irq_routing_entry *ue, + unsigned nr, + unsigned flags) +{ + struct list_head irq_list = LIST_HEAD_INIT(irq_list); + struct list_head tmp = LIST_HEAD_INIT(tmp); + struct kvm_kernel_irq_routing_entry *e = NULL; + unsigned i; + int r; + + for (i = 0; i < nr; ++i) { + r = -EINVAL; + if (ue->gsi >= KVM_MAX_IRQ_ROUTES) + goto out; + if (ue->flags) + goto out; + r = -ENOMEM; + e = kzalloc(sizeof(*e), GFP_KERNEL); + if (!e) + goto out; + r = setup_routing_entry(e, ue); + if (r) + goto out; + ++ue; + list_add(&e->link, &irq_list); + e = NULL; + } + + mutex_lock(&kvm->lock); + list_splice(&kvm->irq_routing, &tmp); + INIT_LIST_HEAD(&kvm->irq_routing); + list_splice(&irq_list, &kvm->irq_routing); + INIT_LIST_HEAD(&irq_list); + list_splice(&tmp, &irq_list); + mutex_unlock(&kvm->lock); + + r = 0; + +out: + kfree(e); + __kvm_free_irq_routing(&irq_list); + return r; +} + +#define IOAPIC_ROUTING_ENTRY(irq) \ + { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ + .u.irqchip.irqchip = KVM_IRQCHIP_IOAPIC, .u.irqchip.pin = (irq) } +#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq) + +#ifdef CONFIG_X86 +#define SELECT_PIC(irq) \ + ((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE) +# define PIC_ROUTING_ENTRY(irq) \ + { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ + .u.irqchip.irqchip = SELECT_PIC(irq), .u.irqchip.pin = (irq) % 8 } +# define ROUTING_ENTRY2(irq) \ + IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq) +#else +# define ROUTING_ENTRY2(irq) \ + IOAPIC_ROUTING_ENTRY(irq) +#endif + +static const struct kvm_irq_routing_entry default_routing[] = { + ROUTING_ENTRY2(0), ROUTING_ENTRY2(1), + ROUTING_ENTRY2(2), ROUTING_ENTRY2(3), + ROUTING_ENTRY2(4), ROUTING_ENTRY2(5), + ROUTING_ENTRY2(6), ROUTING_ENTRY2(7), + ROUTING_ENTRY2(8), ROUTING_ENTRY2(9), + ROUTING_ENTRY2(10), ROUTING_ENTRY2(11), + ROUTING_ENTRY2(12), ROUTING_ENTRY2(13), + ROUTING_ENTRY2(14), ROUTING_ENTRY2(15), + ROUTING_ENTRY1(16), ROUTING_ENTRY1(17), + ROUTING_ENTRY1(18), ROUTING_ENTRY1(19), + ROUTING_ENTRY1(20), ROUTING_ENTRY1(21), + ROUTING_ENTRY1(22), ROUTING_ENTRY1(23), +#ifdef CONFIG_IA64 + ROUTING_ENTRY1(24), ROUTING_ENTRY1(25), + ROUTING_ENTRY1(26), ROUTING_ENTRY1(27), + ROUTING_ENTRY1(28), ROUTING_ENTRY1(29), + ROUTING_ENTRY1(30), ROUTING_ENTRY1(31), + ROUTING_ENTRY1(32), ROUTING_ENTRY1(33), + ROUTING_ENTRY1(34), ROUTING_ENTRY1(35), + ROUTING_ENTRY1(36), ROUTING_ENTRY1(37), + ROUTING_ENTRY1(38), ROUTING_ENTRY1(39), + ROUTING_ENTRY1(40), ROUTING_ENTRY1(41), + ROUTING_ENTRY1(42), ROUTING_ENTRY1(43), + ROUTING_ENTRY1(44), ROUTING_ENTRY1(45), + ROUTING_ENTRY1(46), ROUTING_ENTRY1(47), +#endif +}; + +int kvm_setup_default_irq_routing(struct kvm *kvm) +{ + return kvm_set_irq_routing(kvm, default_routing, + ARRAY_SIZE(default_routing), 0); +} diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 786a3ae373b0..c65484b471c6 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -843,6 +843,7 @@ static struct kvm *kvm_create_vm(void) if (IS_ERR(kvm)) goto out; #ifdef CONFIG_HAVE_KVM_IRQCHIP + INIT_LIST_HEAD(&kvm->irq_routing); INIT_HLIST_HEAD(&kvm->mask_notifier_list); #endif @@ -926,6 +927,7 @@ static void kvm_destroy_vm(struct kvm *kvm) spin_lock(&kvm_lock); list_del(&kvm->vm_list); spin_unlock(&kvm_lock); + kvm_free_irq_routing(kvm); kvm_io_bus_destroy(&kvm->pio_bus); kvm_io_bus_destroy(&kvm->mmio_bus); #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET @@ -1945,6 +1947,36 @@ static long kvm_vm_ioctl(struct file *filp, goto out; break; } +#endif +#ifdef KVM_CAP_IRQ_ROUTING + case KVM_SET_GSI_ROUTING: { + struct kvm_irq_routing routing; + struct kvm_irq_routing __user *urouting; + struct kvm_irq_routing_entry *entries; + + r = -EFAULT; + if (copy_from_user(&routing, argp, sizeof(routing))) + goto out; + r = -EINVAL; + if (routing.nr >= KVM_MAX_IRQ_ROUTES) + goto out; + if (routing.flags) + goto out; + r = -ENOMEM; + entries = vmalloc(routing.nr * sizeof(*entries)); + if (!entries) + goto out; + r = -EFAULT; + urouting = argp; + if (copy_from_user(entries, urouting->entries, + routing.nr * sizeof(*entries))) + goto out_free_irq_routing; + r = kvm_set_irq_routing(kvm, entries, routing.nr, + routing.flags); + out_free_irq_routing: + vfree(entries); + break; + } #endif default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); @@ -2012,6 +2044,10 @@ static long kvm_dev_ioctl_check_extension_generic(long arg) case KVM_CAP_USER_MEMORY: case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: return 1; +#ifdef CONFIG_HAVE_KVM_IRQCHIP + case KVM_CAP_IRQ_ROUTING: + return 1; +#endif default: break; } -- cgit v1.2.3 From 91b2ae773d3b168b763237fac33f75b13d891f20 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 19 Jan 2009 14:57:52 +0200 Subject: KVM: Avoid using CONFIG_ in userspace visible headers Kconfig symbols are not available in userspace, and are not stripped by headers-install. Avoid their use by adding #defines in to suit each architecture. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm.h | 1 + include/linux/kvm.h | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 54bcf2281526..dc3f6cf11704 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -15,6 +15,7 @@ #define __KVM_HAVE_DEVICE_ASSIGNMENT #define __KVM_HAVE_MSI #define __KVM_HAVE_USER_NMI +#define __KVM_HAVE_GUEST_DEBUG /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 7a5d73a8d4fa..869462ca7625 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -393,13 +393,13 @@ struct kvm_trace_rec { #ifdef __KVM_HAVE_USER_NMI #define KVM_CAP_USER_NMI 22 #endif -#if defined(CONFIG_X86) +#ifdef __KVM_HAVE_GUEST_DEBUG #define KVM_CAP_SET_GUEST_DEBUG 23 #endif -#if defined(CONFIG_X86) +#ifdef __KVM_HAVE_PIT #define KVM_CAP_REINJECT_CONTROL 24 #endif -#if defined(CONFIG_X86)||defined(CONFIG_IA64) +#ifdef __KVM_HAVE_IOAPIC #define KVM_CAP_IRQ_ROUTING 25 #endif -- cgit v1.2.3 From 27d146449ccaad16480888129bb32a000ee33717 Mon Sep 17 00:00:00 2001 From: Xiantao Zhang Date: Thu, 15 Jan 2009 17:58:19 +0800 Subject: KVM: ia64: vTLB change for enabling windows 2008 boot Simply the logic of hash vTLB, and export kvm_gpa_to_mpa. Signed-off-by: Xiantao Zhang Signed-off-by: Avi Kivity --- arch/ia64/kvm/vcpu.h | 4 ++-- arch/ia64/kvm/vtlb.c | 39 +++++++++++++++++---------------------- 2 files changed, 19 insertions(+), 24 deletions(-) diff --git a/arch/ia64/kvm/vcpu.h b/arch/ia64/kvm/vcpu.h index b2f12a562bdf..042af92ced83 100644 --- a/arch/ia64/kvm/vcpu.h +++ b/arch/ia64/kvm/vcpu.h @@ -703,7 +703,7 @@ extern u64 guest_vhpt_lookup(u64 iha, u64 *pte); extern void thash_purge_entries(struct kvm_vcpu *v, u64 va, u64 ps); extern void thash_purge_entries_remote(struct kvm_vcpu *v, u64 va, u64 ps); extern u64 translate_phy_pte(u64 *pte, u64 itir, u64 va); -extern int thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, +extern void thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir, u64 ifa, int type); extern void thash_purge_all(struct kvm_vcpu *v); extern struct thash_data *vtlb_lookup(struct kvm_vcpu *v, @@ -738,7 +738,7 @@ void kvm_init_vhpt(struct kvm_vcpu *v); void thash_init(struct thash_cb *hcb, u64 sz); void panic_vm(struct kvm_vcpu *v, const char *fmt, ...); - +u64 kvm_gpa_to_mpa(u64 gpa); extern u64 ia64_call_vsa(u64 proc, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7); diff --git a/arch/ia64/kvm/vtlb.c b/arch/ia64/kvm/vtlb.c index ac94867f8267..38232b37668b 100644 --- a/arch/ia64/kvm/vtlb.c +++ b/arch/ia64/kvm/vtlb.c @@ -164,11 +164,11 @@ static void vhpt_insert(u64 pte, u64 itir, u64 ifa, u64 gpte) unsigned long ps, gpaddr; ps = itir_ps(itir); + rr.val = ia64_get_rr(ifa); - gpaddr = ((gpte & _PAGE_PPN_MASK) >> ps << ps) | - (ifa & ((1UL << ps) - 1)); + gpaddr = ((gpte & _PAGE_PPN_MASK) >> ps << ps) | + (ifa & ((1UL << ps) - 1)); - rr.val = ia64_get_rr(ifa); head = (struct thash_data *)ia64_thash(ifa); head->etag = INVALID_TI_TAG; ia64_mf(); @@ -412,16 +412,14 @@ u64 translate_phy_pte(u64 *pte, u64 itir, u64 va) /* * Purge overlap TCs and then insert the new entry to emulate itc ops. - * Notes: Only TC entry can purge and insert. - * 1 indicates this is MMIO + * Notes: Only TC entry can purge and insert. */ -int thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir, +void thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir, u64 ifa, int type) { u64 ps; u64 phy_pte, io_mask, index; union ia64_rr vrr, mrr; - int ret = 0; ps = itir_ps(itir); vrr.val = vcpu_get_rr(v, ifa); @@ -441,25 +439,19 @@ int thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir, phy_pte &= ~_PAGE_MA_MASK; } - if (pte & VTLB_PTE_IO) - ret = 1; - vtlb_purge(v, ifa, ps); vhpt_purge(v, ifa, ps); - if (ps == mrr.ps) { - if (!(pte&VTLB_PTE_IO)) { - vhpt_insert(phy_pte, itir, ifa, pte); - } else { - vtlb_insert(v, pte, itir, ifa); - vcpu_quick_region_set(VMX(v, tc_regions), ifa); - } - } else if (ps > mrr.ps) { + if ((ps != mrr.ps) || (pte & VTLB_PTE_IO)) { vtlb_insert(v, pte, itir, ifa); vcpu_quick_region_set(VMX(v, tc_regions), ifa); - if (!(pte&VTLB_PTE_IO)) - vhpt_insert(phy_pte, itir, ifa, pte); - } else { + } + if (pte & VTLB_PTE_IO) + return; + + if (ps >= mrr.ps) + vhpt_insert(phy_pte, itir, ifa, pte); + else { u64 psr; phy_pte &= ~PAGE_FLAGS_RV_MASK; psr = ia64_clear_ic(); @@ -469,7 +461,6 @@ int thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir, if (!(pte&VTLB_PTE_IO)) mark_pages_dirty(v, pte, ps); - return ret; } /* @@ -570,6 +561,10 @@ void thash_init(struct thash_cb *hcb, u64 sz) u64 kvm_get_mpt_entry(u64 gpfn) { u64 *base = (u64 *) KVM_P2M_BASE; + + if (gpfn >= (KVM_P2M_SIZE >> 3)) + panic_vm(current_vcpu, "Invalid gpfn =%lx\n", gpfn); + return *(base + gpfn); } -- cgit v1.2.3 From 4b7bb626e3133eab131328a0770864b322c1bfe6 Mon Sep 17 00:00:00 2001 From: Xiantao Zhang Date: Thu, 15 Jan 2009 18:08:36 +0800 Subject: KVM: ia64: Add the support for translating PAL Call's pointer args Add the support to translate PAL Call's pointer args. Signed-off-by: Xiantao Zhang Signed-off-by: Avi Kivity --- arch/ia64/kvm/process.c | 48 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/arch/ia64/kvm/process.c b/arch/ia64/kvm/process.c index f9c9504144fa..0e727ce0d55c 100644 --- a/arch/ia64/kvm/process.c +++ b/arch/ia64/kvm/process.c @@ -550,18 +550,60 @@ void reflect_interruption(u64 ifa, u64 isr, u64 iim, inject_guest_interruption(vcpu, vector); } +static unsigned long kvm_trans_pal_call_args(struct kvm_vcpu *vcpu, + unsigned long arg) +{ + struct thash_data *data; + unsigned long gpa, poff; + + if (!is_physical_mode(vcpu)) { + /* Depends on caller to provide the DTR or DTC mapping.*/ + data = vtlb_lookup(vcpu, arg, D_TLB); + if (data) + gpa = data->page_flags & _PAGE_PPN_MASK; + else { + data = vhpt_lookup(arg); + if (!data) + return 0; + gpa = data->gpaddr & _PAGE_PPN_MASK; + } + + poff = arg & (PSIZE(data->ps) - 1); + arg = PAGEALIGN(gpa, data->ps) | poff; + } + arg = kvm_gpa_to_mpa(arg << 1 >> 1); + + return (unsigned long)__va(arg); +} + static void set_pal_call_data(struct kvm_vcpu *vcpu) { struct exit_ctl_data *p = &vcpu->arch.exit_data; + unsigned long gr28 = vcpu_get_gr(vcpu, 28); + unsigned long gr29 = vcpu_get_gr(vcpu, 29); + unsigned long gr30 = vcpu_get_gr(vcpu, 30); /*FIXME:For static and stacked convention, firmware * has put the parameters in gr28-gr31 before * break to vmm !!*/ - p->u.pal_data.gr28 = vcpu_get_gr(vcpu, 28); - p->u.pal_data.gr29 = vcpu_get_gr(vcpu, 29); - p->u.pal_data.gr30 = vcpu_get_gr(vcpu, 30); + switch (gr28) { + case PAL_PERF_MON_INFO: + case PAL_HALT_INFO: + p->u.pal_data.gr29 = kvm_trans_pal_call_args(vcpu, gr29); + p->u.pal_data.gr30 = vcpu_get_gr(vcpu, 30); + break; + case PAL_BRAND_INFO: + p->u.pal_data.gr29 = gr29;; + p->u.pal_data.gr30 = kvm_trans_pal_call_args(vcpu, gr30); + break; + default: + p->u.pal_data.gr29 = gr29;; + p->u.pal_data.gr30 = vcpu_get_gr(vcpu, 30); + } + p->u.pal_data.gr28 = gr28; p->u.pal_data.gr31 = vcpu_get_gr(vcpu, 31); + p->exit_reason = EXIT_REASON_PAL_CALL; } -- cgit v1.2.3 From 7d656bd996b68737d5d07643bc57311059038d67 Mon Sep 17 00:00:00 2001 From: Xiantao Zhang Date: Wed, 21 Jan 2009 11:21:27 +0800 Subject: KVM: ia64: Implement some pal calls needed for windows 2008 For windows 2008, it needs more pal calls to implement for booting. In addition, also changes the name of set_{sal, pal}_call_result to get_{sal,pal}_call_result for readability. Signed-off-by: Xiantao Zhang Signed-off-by: Avi Kivity --- arch/ia64/kvm/kvm_fw.c | 151 +++++++++++++++++++++++++++++++++++++++++++++++- arch/ia64/kvm/process.c | 8 +-- 2 files changed, 152 insertions(+), 7 deletions(-) diff --git a/arch/ia64/kvm/kvm_fw.c b/arch/ia64/kvm/kvm_fw.c index cb7600bdff9d..a8ae52ed5635 100644 --- a/arch/ia64/kvm/kvm_fw.c +++ b/arch/ia64/kvm/kvm_fw.c @@ -227,6 +227,18 @@ static struct ia64_pal_retval pal_proc_get_features(struct kvm_vcpu *vcpu) return result; } +static struct ia64_pal_retval pal_register_info(struct kvm_vcpu *vcpu) +{ + + struct ia64_pal_retval result = {0, 0, 0, 0}; + long in0, in1, in2, in3; + + kvm_get_pal_call_data(vcpu, &in0, &in1, &in2, &in3); + result.status = ia64_pal_register_info(in1, &result.v1, &result.v2); + + return result; +} + static struct ia64_pal_retval pal_cache_info(struct kvm_vcpu *vcpu) { @@ -268,8 +280,12 @@ static struct ia64_pal_retval pal_vm_summary(struct kvm_vcpu *vcpu) static struct ia64_pal_retval pal_vm_info(struct kvm_vcpu *vcpu) { struct ia64_pal_retval result; + unsigned long in0, in1, in2, in3; - INIT_PAL_STATUS_UNIMPLEMENTED(result); + kvm_get_pal_call_data(vcpu, &in0, &in1, &in2, &in3); + + result.status = ia64_pal_vm_info(in1, in2, + (pal_tc_info_u_t *)&result.v1, &result.v2); return result; } @@ -292,6 +308,108 @@ static void prepare_for_halt(struct kvm_vcpu *vcpu) vcpu->arch.timer_fired = 0; } +static struct ia64_pal_retval pal_perf_mon_info(struct kvm_vcpu *vcpu) +{ + long status; + unsigned long in0, in1, in2, in3, r9; + unsigned long pm_buffer[16]; + + kvm_get_pal_call_data(vcpu, &in0, &in1, &in2, &in3); + status = ia64_pal_perf_mon_info(pm_buffer, + (pal_perf_mon_info_u_t *) &r9); + if (status != 0) { + printk(KERN_DEBUG"PAL_PERF_MON_INFO fails ret=%ld\n", status); + } else { + if (in1) + memcpy((void *)in1, pm_buffer, sizeof(pm_buffer)); + else { + status = PAL_STATUS_EINVAL; + printk(KERN_WARNING"Invalid parameters " + "for PAL call:0x%lx!\n", in0); + } + } + return (struct ia64_pal_retval){status, r9, 0, 0}; +} + +static struct ia64_pal_retval pal_halt_info(struct kvm_vcpu *vcpu) +{ + unsigned long in0, in1, in2, in3; + long status; + unsigned long res = 1000UL | (1000UL << 16) | (10UL << 32) + | (1UL << 61) | (1UL << 60); + + kvm_get_pal_call_data(vcpu, &in0, &in1, &in2, &in3); + if (in1) { + memcpy((void *)in1, &res, sizeof(res)); + status = 0; + } else{ + status = PAL_STATUS_EINVAL; + printk(KERN_WARNING"Invalid parameters " + "for PAL call:0x%lx!\n", in0); + } + + return (struct ia64_pal_retval){status, 0, 0, 0}; +} + +static struct ia64_pal_retval pal_mem_attrib(struct kvm_vcpu *vcpu) +{ + unsigned long r9; + long status; + + status = ia64_pal_mem_attrib(&r9); + + return (struct ia64_pal_retval){status, r9, 0, 0}; +} + +static void remote_pal_prefetch_visibility(void *v) +{ + s64 trans_type = (s64)v; + ia64_pal_prefetch_visibility(trans_type); +} + +static struct ia64_pal_retval pal_prefetch_visibility(struct kvm_vcpu *vcpu) +{ + struct ia64_pal_retval result = {0, 0, 0, 0}; + unsigned long in0, in1, in2, in3; + kvm_get_pal_call_data(vcpu, &in0, &in1, &in2, &in3); + result.status = ia64_pal_prefetch_visibility(in1); + if (result.status == 0) { + /* Must be performed on all remote processors + in the coherence domain. */ + smp_call_function(remote_pal_prefetch_visibility, + (void *)in1, 1); + /* Unnecessary on remote processor for other vcpus!*/ + result.status = 1; + } + return result; +} + +static void remote_pal_mc_drain(void *v) +{ + ia64_pal_mc_drain(); +} + +static struct ia64_pal_retval pal_get_brand_info(struct kvm_vcpu *vcpu) +{ + struct ia64_pal_retval result = {0, 0, 0, 0}; + unsigned long in0, in1, in2, in3; + + kvm_get_pal_call_data(vcpu, &in0, &in1, &in2, &in3); + + if (in1 == 0 && in2) { + char brand_info[128]; + result.status = ia64_pal_get_brand_info(brand_info); + if (result.status == PAL_STATUS_SUCCESS) + memcpy((void *)in2, brand_info, 128); + } else { + result.status = PAL_STATUS_REQUIRES_MEMORY; + printk(KERN_WARNING"Invalid parameters for " + "PAL call:0x%lx!\n", in0); + } + + return result; +} + int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *run) { @@ -300,14 +418,22 @@ int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *run) int ret = 1; gr28 = kvm_get_pal_call_index(vcpu); - /*printk("pal_call index:%lx\n",gr28);*/ switch (gr28) { case PAL_CACHE_FLUSH: result = pal_cache_flush(vcpu); break; + case PAL_MEM_ATTRIB: + result = pal_mem_attrib(vcpu); + break; case PAL_CACHE_SUMMARY: result = pal_cache_summary(vcpu); break; + case PAL_PERF_MON_INFO: + result = pal_perf_mon_info(vcpu); + break; + case PAL_HALT_INFO: + result = pal_halt_info(vcpu); + break; case PAL_HALT_LIGHT: { INIT_PAL_STATUS_SUCCESS(result); @@ -317,6 +443,16 @@ int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *run) } break; + case PAL_PREFETCH_VISIBILITY: + result = pal_prefetch_visibility(vcpu); + break; + case PAL_MC_DRAIN: + result.status = ia64_pal_mc_drain(); + /* FIXME: All vcpus likely call PAL_MC_DRAIN. + That causes the congestion. */ + smp_call_function(remote_pal_mc_drain, NULL, 1); + break; + case PAL_FREQ_RATIOS: result = pal_freq_ratios(vcpu); break; @@ -346,6 +482,9 @@ int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *run) INIT_PAL_STATUS_SUCCESS(result); result.v1 = (1L << 32) | 1L; break; + case PAL_REGISTER_INFO: + result = pal_register_info(vcpu); + break; case PAL_VM_PAGE_SIZE: result.status = ia64_pal_vm_page_size(&result.v0, &result.v1); @@ -365,12 +504,18 @@ int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *run) result.status = ia64_pal_version( (pal_version_u_t *)&result.v0, (pal_version_u_t *)&result.v1); - break; case PAL_FIXED_ADDR: result.status = PAL_STATUS_SUCCESS; result.v0 = vcpu->vcpu_id; break; + case PAL_BRAND_INFO: + result = pal_get_brand_info(vcpu); + break; + case PAL_GET_PSTATE: + case PAL_CACHE_SHARED_INFO: + INIT_PAL_STATUS_UNIMPLEMENTED(result); + break; default: INIT_PAL_STATUS_UNIMPLEMENTED(result); printk(KERN_WARNING"kvm: Unsupported pal call," diff --git a/arch/ia64/kvm/process.c b/arch/ia64/kvm/process.c index 0e727ce0d55c..b1dc80952d91 100644 --- a/arch/ia64/kvm/process.c +++ b/arch/ia64/kvm/process.c @@ -607,7 +607,7 @@ static void set_pal_call_data(struct kvm_vcpu *vcpu) p->exit_reason = EXIT_REASON_PAL_CALL; } -static void set_pal_call_result(struct kvm_vcpu *vcpu) +static void get_pal_call_result(struct kvm_vcpu *vcpu) { struct exit_ctl_data *p = &vcpu->arch.exit_data; @@ -635,7 +635,7 @@ static void set_sal_call_data(struct kvm_vcpu *vcpu) p->exit_reason = EXIT_REASON_SAL_CALL; } -static void set_sal_call_result(struct kvm_vcpu *vcpu) +static void get_sal_call_result(struct kvm_vcpu *vcpu) { struct exit_ctl_data *p = &vcpu->arch.exit_data; @@ -658,13 +658,13 @@ void kvm_ia64_handle_break(unsigned long ifa, struct kvm_pt_regs *regs, if (iim == DOMN_PAL_REQUEST) { set_pal_call_data(v); vmm_transition(v); - set_pal_call_result(v); + get_pal_call_result(v); vcpu_increment_iip(v); return; } else if (iim == DOMN_SAL_REQUEST) { set_sal_call_data(v); vmm_transition(v); - set_sal_call_result(v); + get_sal_call_result(v); vcpu_increment_iip(v); return; } -- cgit v1.2.3 From 2c411b48af3e3c534b9cfb6a79be1df384d1ca1a Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 22 Jan 2009 14:20:27 +0100 Subject: KVM: s390: Fix printk on SIGP set arch KVM on s390 does not support the ESA/390 architecture. We refuse to change the architecture mode and print a warning. This patch removes the printk for several reasons: o A malicious guest can flood host dmesg o The old message had no newline o there is no connection between the message and the failing guest This patch simply removes the printk. We already set the condition code to 3 - the guest knows that something went wrong. Reported-by: Heiko Carstens Signed-off-by: Christian Borntraeger Signed-off-by: Avi Kivity --- arch/s390/kvm/sigp.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 2a01b9e02801..f27dbedf0866 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -153,8 +153,6 @@ static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter) switch (parameter & 0xff) { case 0: - printk(KERN_WARNING "kvm: request to switch to ESA/390 mode" - " not supported"); rc = 3; /* not operational */ break; case 1: -- cgit v1.2.3 From 70455a36a073cbb83ca17f92d135a6128c73cb3c Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 22 Jan 2009 10:28:29 +0100 Subject: KVM: s390: Fix problem state check for b2 intercepts The kernel handles some priviledged instruction exits. While I was unable to trigger such an exit from guest userspace, the code should check for supervisor state before emulating a priviledged instruction. I also renamed kvm_s390_handle_priv to kvm_s390_handle_b2. After all there are non priviledged b2 instructions like stck (store clock). Signed-off-by: Christian Borntraeger Signed-off-by: Avi Kivity --- arch/s390/kvm/intercept.c | 2 +- arch/s390/kvm/kvm-s390.h | 2 +- arch/s390/kvm/priv.c | 18 +++++++++++++++--- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 61236102203e..9d19803111ba 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -103,7 +103,7 @@ static int handle_lctl(struct kvm_vcpu *vcpu) static intercept_handler_t instruction_handlers[256] = { [0x83] = kvm_s390_handle_diag, [0xae] = kvm_s390_handle_sigp, - [0xb2] = kvm_s390_handle_priv, + [0xb2] = kvm_s390_handle_b2, [0xb7] = handle_lctl, [0xeb] = handle_lctlg, }; diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 3893cf12eacf..00bbe69b78da 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -50,7 +50,7 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code); /* implemented in priv.c */ -int kvm_s390_handle_priv(struct kvm_vcpu *vcpu); +int kvm_s390_handle_b2(struct kvm_vcpu *vcpu); /* implemented in sigp.c */ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 3605df45dd41..4b88834b8dd8 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -304,12 +304,24 @@ static intercept_handler_t priv_handlers[256] = { [0xb1] = handle_stfl, }; -int kvm_s390_handle_priv(struct kvm_vcpu *vcpu) +int kvm_s390_handle_b2(struct kvm_vcpu *vcpu) { intercept_handler_t handler; + /* + * a lot of B2 instructions are priviledged. We first check for + * the priviledges ones, that we can handle in the kernel. If the + * kernel can handle this instruction, we check for the problem + * state bit and (a) handle the instruction or (b) send a code 2 + * program check. + * Anything else goes to userspace.*/ handler = priv_handlers[vcpu->arch.sie_block->ipa & 0x00ff]; - if (handler) - return handler(vcpu); + if (handler) { + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, + PGM_PRIVILEGED_OPERATION); + else + return handler(vcpu); + } return -ENOTSUPP; } -- cgit v1.2.3 From b7e6e4d3602c738b8f61225d9f4514945df52f07 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 22 Jan 2009 10:29:08 +0100 Subject: KVM: s390: Fix SIGP set prefix ioctl This patch fixes the SET PREFIX interrupt if triggered by userspace. Until now, it was not necessary, but life migration will need it. In addition, it helped me creating SMP support for my kvm_crashme tool (lets kvm execute random guest memory content). Signed-off-by: Christian Borntraeger Signed-off-by: Avi Kivity --- arch/s390/kvm/interrupt.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index f4fe28a2521a..0189356fe209 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -555,9 +555,14 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, VCPU_EVENT(vcpu, 3, "inject: program check %d (from user)", s390int->parm); break; + case KVM_S390_SIGP_SET_PREFIX: + inti->prefix.address = s390int->parm; + inti->type = s390int->type; + VCPU_EVENT(vcpu, 3, "inject: set prefix to %x (from user)", + s390int->parm); + break; case KVM_S390_SIGP_STOP: case KVM_S390_RESTART: - case KVM_S390_SIGP_SET_PREFIX: case KVM_S390_INT_EMERGENCY: VCPU_EVENT(vcpu, 3, "inject: type %x", s390int->type); inti->type = s390int->type; -- cgit v1.2.3 From 934d534f8a5a39e20d5682b3a3a45ff351706b59 Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Wed, 21 Jan 2009 15:16:43 +0100 Subject: KVM: ia64: dynamic nr online cpus Account for number of online cpus and use that in loops iterating over the list of vpus instead of scanning the full array unconditionally. This patch is a building block to facilitate allowing to bump up the size of MAX_VCPUS significantly. Signed-off-by: Jes Sorensen Acked-by : Xiantao Zhang Signed-off-by: Avi Kivity --- arch/ia64/include/asm/kvm_host.h | 10 ++++++---- arch/ia64/kvm/kvm-ia64.c | 12 ++++++++---- arch/ia64/kvm/vcpu.c | 5 ++++- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index 46f8b0eb5684..4542651e6acb 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h @@ -157,10 +157,10 @@ struct kvm_vm_data { struct kvm_vcpu_data vcpu_data[KVM_MAX_VCPUS]; }; -#define VCPU_BASE(n) KVM_VM_DATA_BASE + \ - offsetof(struct kvm_vm_data, vcpu_data[n]) -#define VM_BASE KVM_VM_DATA_BASE + \ - offsetof(struct kvm_vm_data, kvm_vm_struct) +#define VCPU_BASE(n) (KVM_VM_DATA_BASE + \ + offsetof(struct kvm_vm_data, vcpu_data[n])) +#define KVM_VM_BASE (KVM_VM_DATA_BASE + \ + offsetof(struct kvm_vm_data, kvm_vm_struct)) #define KVM_MEM_DIRTY_LOG_BASE KVM_VM_DATA_BASE + \ offsetof(struct kvm_vm_data, kvm_mem_dirty_log) @@ -464,6 +464,8 @@ struct kvm_arch { unsigned long metaphysical_rr4; unsigned long vmm_init_rr; + int online_vcpus; + struct kvm_ioapic *vioapic; struct kvm_vm_stat stat; struct kvm_sal_data rdv_sal_data; diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index dbf527a57341..9c77e3939e97 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -314,7 +314,7 @@ static struct kvm_vcpu *lid_to_vcpu(struct kvm *kvm, unsigned long id, union ia64_lid lid; int i; - for (i = 0; i < KVM_MAX_VCPUS; i++) { + for (i = 0; i < kvm->arch.online_vcpus; i++) { if (kvm->vcpus[i]) { lid.val = VCPU_LID(kvm->vcpus[i]); if (lid.id == id && lid.eid == eid) @@ -388,7 +388,7 @@ static int handle_global_purge(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) call_data.ptc_g_data = p->u.ptc_g_data; - for (i = 0; i < KVM_MAX_VCPUS; i++) { + for (i = 0; i < kvm->arch.online_vcpus; i++) { if (!kvm->vcpus[i] || kvm->vcpus[i]->arch.mp_state == KVM_MP_STATE_UNINITIALIZED || vcpu == kvm->vcpus[i]) @@ -788,6 +788,8 @@ struct kvm *kvm_arch_create_vm(void) return ERR_PTR(-ENOMEM); kvm_init_vm(kvm); + kvm->arch.online_vcpus = 0; + return kvm; } @@ -1154,7 +1156,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) /*Initialize itc offset for vcpus*/ itc_offset = 0UL - ia64_getreg(_IA64_REG_AR_ITC); - for (i = 0; i < KVM_MAX_VCPUS; i++) { + for (i = 0; i < kvm->arch.online_vcpus; i++) { v = (struct kvm_vcpu *)((char *)vcpu + sizeof(struct kvm_vcpu_data) * i); v->arch.itc_offset = itc_offset; @@ -1288,6 +1290,8 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, goto fail; } + kvm->arch.online_vcpus++; + return vcpu; fail: return ERR_PTR(r); @@ -1828,7 +1832,7 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, struct kvm_vcpu *lvcpu = kvm->vcpus[0]; int i; - for (i = 1; i < KVM_MAX_VCPUS; i++) { + for (i = 1; i < kvm->arch.online_vcpus; i++) { if (!kvm->vcpus[i]) continue; if (lvcpu->arch.xtp > kvm->vcpus[i]->arch.xtp) diff --git a/arch/ia64/kvm/vcpu.c b/arch/ia64/kvm/vcpu.c index 4d8be4c252fa..d4d280505878 100644 --- a/arch/ia64/kvm/vcpu.c +++ b/arch/ia64/kvm/vcpu.c @@ -807,12 +807,15 @@ static inline void vcpu_set_itm(struct kvm_vcpu *vcpu, u64 val); static void vcpu_set_itc(struct kvm_vcpu *vcpu, u64 val) { struct kvm_vcpu *v; + struct kvm *kvm; int i; long itc_offset = val - ia64_getreg(_IA64_REG_AR_ITC); unsigned long vitv = VCPU(vcpu, itv); + kvm = (struct kvm *)KVM_VM_BASE; + if (vcpu->vcpu_id == 0) { - for (i = 0; i < KVM_MAX_VCPUS; i++) { + for (i = 0; i < kvm->arch.online_vcpus; i++) { v = (struct kvm_vcpu *)((char *)vcpu + sizeof(struct kvm_vcpu_data) * i); VMX(v, itc_offset) = itc_offset; -- cgit v1.2.3 From 44882eed2ebe7f75f8cdae5671ab1d6e0fa40dbc Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 Jan 2009 15:12:38 -0200 Subject: KVM: make irq ack notifications aware of routing table IRQ ack notifications assume an identity mapping between pin->gsi, which might not be the case with, for example, HPET. Translate before acking. Signed-off-by: Marcelo Tosatti Acked-by: Gleb Natapov --- arch/x86/kvm/i8259.c | 5 +++-- arch/x86/kvm/irq.h | 2 ++ include/linux/kvm_host.h | 2 +- virt/kvm/ioapic.c | 10 +++++----- virt/kvm/irq_comm.c | 13 ++++++++++--- 5 files changed, 21 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 179dcb0103fd..93160375c841 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -49,7 +49,8 @@ static void pic_unlock(struct kvm_pic *s) spin_unlock(&s->lock); while (acks) { - kvm_notify_acked_irq(kvm, __ffs(acks)); + kvm_notify_acked_irq(kvm, SELECT_PIC(__ffs(acks)), + __ffs(acks)); acks &= acks - 1; } @@ -232,7 +233,7 @@ int kvm_pic_read_irq(struct kvm *kvm) } pic_update_irq(s); pic_unlock(s); - kvm_notify_acked_irq(kvm, irq); + kvm_notify_acked_irq(kvm, SELECT_PIC(irq), irq); return intno; } diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 82579ee538d0..9f593188129e 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -32,6 +32,8 @@ #include "lapic.h" #define PIC_NUM_PINS 16 +#define SELECT_PIC(irq) \ + ((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE) struct kvm; struct kvm_vcpu; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ce285e01bd57..c03a0a9a8584 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -352,7 +352,7 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask); void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level); -void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi); +void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_register_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian); void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian); diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index e85a2bcd2db1..1c986ac59ad6 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -293,20 +293,20 @@ void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) } } -static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi, +static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int pin, int trigger_mode) { union ioapic_redir_entry *ent; - ent = &ioapic->redirtbl[gsi]; + ent = &ioapic->redirtbl[pin]; - kvm_notify_acked_irq(ioapic->kvm, gsi); + kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, pin); if (trigger_mode == IOAPIC_LEVEL_TRIG) { ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); ent->fields.remote_irr = 0; - if (!ent->fields.mask && (ioapic->irr & (1 << gsi))) - ioapic_service(ioapic, gsi); + if (!ent->fields.mask && (ioapic->irr & (1 << pin))) + ioapic_service(ioapic, pin); } } diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index a797fa5e6420..7aa5086c8622 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -59,10 +59,19 @@ void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) e->set(e, kvm, !!(*irq_state)); } -void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi) +void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) { + struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_ack_notifier *kian; struct hlist_node *n; + unsigned gsi = pin; + + list_for_each_entry(e, &kvm->irq_routing, link) + if (e->irqchip.irqchip == irqchip && + e->irqchip.pin == pin) { + gsi = e->gsi; + break; + } hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list, link) if (kian->gsi == gsi) @@ -237,8 +246,6 @@ out: #define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq) #ifdef CONFIG_X86 -#define SELECT_PIC(irq) \ - ((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE) # define PIC_ROUTING_ENTRY(irq) \ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ .u.irqchip.irqchip = SELECT_PIC(irq), .u.irqchip.pin = (irq) % 8 } -- cgit v1.2.3 From d20626936dd6aa783760e780dae5abb127564316 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 2 Feb 2009 16:23:50 +0100 Subject: x86: Add EFER descriptions for FFXSR AMD k10 includes support for the FFXSR feature, which leaves out XMM registers on FXSAVE/FXSAVE when the EFER_FFXSR bit is set in EFER. The CPUID feature bit exists already, but the EFER bit is missing currently, so this patch adds it to the list of known EFER bits. Signed-off-by: Alexander Graf CC: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/msr-index.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 46e9646e7a66..f4e505f286bc 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -19,12 +19,14 @@ #define _EFER_LMA 10 /* Long mode active (read-only) */ #define _EFER_NX 11 /* No execute enable */ #define _EFER_SVME 12 /* Enable virtualization */ +#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ #define EFER_SCE (1<<_EFER_SCE) #define EFER_LME (1<<_EFER_LME) #define EFER_LMA (1<<_EFER_LMA) #define EFER_NX (1<<_EFER_NX) #define EFER_SVME (1<<_EFER_SVME) +#define EFER_FFXSR (1<<_EFER_FFXSR) /* Intel MSRs. Some also available on other CPUs */ #define MSR_IA32_PERFCTR0 0x000000c1 -- cgit v1.2.3 From 1b2fd70c4eddef53f32639296818c0253e7ca48d Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 2 Feb 2009 16:23:51 +0100 Subject: KVM: Add FFXSR support AMD K10 CPUs implement the FFXSR feature that gets enabled using EFER. Let's check if the virtual CPU description includes that CPUID feature bit and allow enabling it then. This is required for Windows Server 2008 in Hyper-V mode. v2 adds CPUID capability exposure Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 3 +++ arch/x86/kvm/x86.c | 12 ++++++++++++ 2 files changed, 15 insertions(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index db5021b2b5a8..1c4a018bf4bb 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -417,6 +417,9 @@ static __init int svm_hardware_setup(void) if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); + if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) + kvm_enable_efer_bits(EFER_FFXSR); + if (nested) { printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); kvm_enable_efer_bits(EFER_SVME); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 32e3a7ec6ad2..8f83590b47dd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -490,6 +490,17 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) return; } + if (efer & EFER_FFXSR) { + struct kvm_cpuid_entry2 *feat; + + feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { + printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n"); + kvm_inject_gp(vcpu, 0); + return; + } + } + if (efer & EFER_SVME) { struct kvm_cpuid_entry2 *feat; @@ -1240,6 +1251,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, #ifdef CONFIG_X86_64 bit(X86_FEATURE_LM) | #endif + bit(X86_FEATURE_FXSR_OPT) | bit(X86_FEATURE_MMXEXT) | bit(X86_FEATURE_3DNOWEXT) | bit(X86_FEATURE_3DNOW); -- cgit v1.2.3 From 34c33d163fe509da8414a736c6328855f8c164e5 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 8 Feb 2009 13:28:15 +0100 Subject: KVM: Drop unused evaluations from string pio handlers Looks like neither the direction nor the rep prefix are used anymore. Drop related evaluations from SVM's and VMX's I/O exit handlers. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 4 +--- arch/x86/kvm/vmx.c | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1c4a018bf4bb..22e88a4a51c7 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1194,7 +1194,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ - int size, down, in, string, rep; + int size, in, string; unsigned port; ++svm->vcpu.stat.io_exits; @@ -1213,8 +1213,6 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) in = (io_info & SVM_IOIO_TYPE_MASK) != 0; port = io_info >> 16; size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; - rep = (io_info & SVM_IOIO_REP_MASK) != 0; - down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0; skip_emulated_instruction(&svm->vcpu); return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index df454de8acfa..509b35305402 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2698,7 +2698,7 @@ static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { unsigned long exit_qualification; - int size, down, in, string, rep; + int size, in, string; unsigned port; ++vcpu->stat.io_exits; @@ -2714,8 +2714,6 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) size = (exit_qualification & 7) + 1; in = (exit_qualification & 8) != 0; - down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0; - rep = (exit_qualification & 32) != 0; port = exit_qualification >> 16; skip_emulated_instruction(vcpu); -- cgit v1.2.3 From 79950e1073150909619b7c0f9a39a2fea83a42d8 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 10 Feb 2009 13:57:06 +0800 Subject: KVM: Use irq routing API for MSI Merge MSI userspace interface with IRQ routing table. Notice the API have been changed, and using IRQ routing table would be the only interface kvm-userspace supported. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- include/linux/kvm.h | 9 ++++++ include/linux/kvm_host.h | 2 +- virt/kvm/irq_comm.c | 78 +++++++++++++++++++++++++++++++++++++++++++----- virt/kvm/kvm_main.c | 70 ++++--------------------------------------- 4 files changed, 86 insertions(+), 73 deletions(-) diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 869462ca7625..2163b3dd36e7 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -410,8 +410,16 @@ struct kvm_irq_routing_irqchip { __u32 pin; }; +struct kvm_irq_routing_msi { + __u32 address_lo; + __u32 address_hi; + __u32 data; + __u32 pad; +}; + /* gsi routing entry types */ #define KVM_IRQ_ROUTING_IRQCHIP 1 +#define KVM_IRQ_ROUTING_MSI 2 struct kvm_irq_routing_entry { __u32 gsi; @@ -420,6 +428,7 @@ struct kvm_irq_routing_entry { __u32 pad; union { struct kvm_irq_routing_irqchip irqchip; + struct kvm_irq_routing_msi msi; __u32 pad[8]; } u; }; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c03a0a9a8584..339eda3ca6ee 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -116,6 +116,7 @@ struct kvm_kernel_irq_routing_entry { unsigned irqchip; unsigned pin; } irqchip; + struct msi_msg msi; }; struct list_head link; }; @@ -327,7 +328,6 @@ struct kvm_assigned_dev_kernel { int host_irq; bool host_irq_disabled; int guest_irq; - struct msi_msg guest_msi; #define KVM_ASSIGNED_DEV_GUEST_INTX (1 << 0) #define KVM_ASSIGNED_DEV_GUEST_MSI (1 << 1) #define KVM_ASSIGNED_DEV_HOST_INTX (1 << 8) diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 7aa5086c8622..6bc7439eff6e 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -20,6 +20,11 @@ */ #include + +#ifdef CONFIG_X86 +#include +#endif + #include "irq.h" #include "ioapic.h" @@ -38,17 +43,70 @@ static void kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, kvm_ioapic_set_irq(kvm->arch.vioapic, e->irqchip.pin, level); } +static void kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int level) +{ + int vcpu_id; + struct kvm_vcpu *vcpu; + struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); + int dest_id = (e->msi.address_lo & MSI_ADDR_DEST_ID_MASK) + >> MSI_ADDR_DEST_ID_SHIFT; + int vector = (e->msi.data & MSI_DATA_VECTOR_MASK) + >> MSI_DATA_VECTOR_SHIFT; + int dest_mode = test_bit(MSI_ADDR_DEST_MODE_SHIFT, + (unsigned long *)&e->msi.address_lo); + int trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT, + (unsigned long *)&e->msi.data); + int delivery_mode = test_bit(MSI_DATA_DELIVERY_MODE_SHIFT, + (unsigned long *)&e->msi.data); + u32 deliver_bitmask; + + BUG_ON(!ioapic); + + deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic, + dest_id, dest_mode); + /* IOAPIC delivery mode value is the same as MSI here */ + switch (delivery_mode) { + case IOAPIC_LOWEST_PRIORITY: + vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector, + deliver_bitmask); + if (vcpu != NULL) + kvm_apic_set_irq(vcpu, vector, trig_mode); + else + printk(KERN_INFO "kvm: null lowest priority vcpu!\n"); + break; + case IOAPIC_FIXED: + for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { + if (!(deliver_bitmask & (1 << vcpu_id))) + continue; + deliver_bitmask &= ~(1 << vcpu_id); + vcpu = ioapic->kvm->vcpus[vcpu_id]; + if (vcpu) + kvm_apic_set_irq(vcpu, vector, trig_mode); + } + break; + default: + break; + } +} + /* This should be called with the kvm->lock mutex held */ void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) { struct kvm_kernel_irq_routing_entry *e; - unsigned long *irq_state = (unsigned long *)&kvm->arch.irq_states[irq]; + unsigned long *irq_state, sig_level; + + if (irq < KVM_IOAPIC_NUM_PINS) { + irq_state = (unsigned long *)&kvm->arch.irq_states[irq]; - /* Logical OR for level trig interrupt */ - if (level) - set_bit(irq_source_id, irq_state); - else - clear_bit(irq_source_id, irq_state); + /* Logical OR for level trig interrupt */ + if (level) + set_bit(irq_source_id, irq_state); + else + clear_bit(irq_source_id, irq_state); + sig_level = !!(*irq_state); + } else /* Deal with MSI/MSI-X */ + sig_level = 1; /* Not possible to detect if the guest uses the PIC or the * IOAPIC. So set the bit in both. The guest will ignore @@ -56,7 +114,7 @@ void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) */ list_for_each_entry(e, &kvm->irq_routing, link) if (e->gsi == irq) - e->set(e, kvm, !!(*irq_state)); + e->set(e, kvm, sig_level); } void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) @@ -186,6 +244,12 @@ int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e, e->irqchip.irqchip = ue->u.irqchip.irqchip; e->irqchip.pin = ue->u.irqchip.pin + delta; break; + case KVM_IRQ_ROUTING_MSI: + e->set = kvm_set_msi; + e->msi.address_lo = ue->u.msi.address_lo; + e->msi.address_hi = ue->u.msi.address_hi; + e->msi.data = ue->u.msi.data; + break; default: goto out; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c65484b471c6..266bdaf0ce44 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -47,10 +47,6 @@ #include #include -#ifdef CONFIG_X86 -#include -#endif - #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET #include "coalesced_mmio.h" #endif @@ -85,57 +81,6 @@ static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, static bool kvm_rebooting; #ifdef KVM_CAP_DEVICE_ASSIGNMENT - -#ifdef CONFIG_X86 -static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev) -{ - int vcpu_id; - struct kvm_vcpu *vcpu; - struct kvm_ioapic *ioapic = ioapic_irqchip(dev->kvm); - int dest_id = (dev->guest_msi.address_lo & MSI_ADDR_DEST_ID_MASK) - >> MSI_ADDR_DEST_ID_SHIFT; - int vector = (dev->guest_msi.data & MSI_DATA_VECTOR_MASK) - >> MSI_DATA_VECTOR_SHIFT; - int dest_mode = test_bit(MSI_ADDR_DEST_MODE_SHIFT, - (unsigned long *)&dev->guest_msi.address_lo); - int trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT, - (unsigned long *)&dev->guest_msi.data); - int delivery_mode = test_bit(MSI_DATA_DELIVERY_MODE_SHIFT, - (unsigned long *)&dev->guest_msi.data); - u32 deliver_bitmask; - - BUG_ON(!ioapic); - - deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic, - dest_id, dest_mode); - /* IOAPIC delivery mode value is the same as MSI here */ - switch (delivery_mode) { - case IOAPIC_LOWEST_PRIORITY: - vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector, - deliver_bitmask); - if (vcpu != NULL) - kvm_apic_set_irq(vcpu, vector, trig_mode); - else - printk(KERN_INFO "kvm: null lowest priority vcpu!\n"); - break; - case IOAPIC_FIXED: - for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { - if (!(deliver_bitmask & (1 << vcpu_id))) - continue; - deliver_bitmask &= ~(1 << vcpu_id); - vcpu = ioapic->kvm->vcpus[vcpu_id]; - if (vcpu) - kvm_apic_set_irq(vcpu, vector, trig_mode); - } - break; - default: - printk(KERN_INFO "kvm: unsupported MSI delivery mode\n"); - } -} -#else -static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev) {} -#endif - static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, int assigned_dev_id) { @@ -162,13 +107,10 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) * finer-grained lock, update this */ mutex_lock(&assigned_dev->kvm->lock); - if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_INTX) - kvm_set_irq(assigned_dev->kvm, - assigned_dev->irq_source_id, - assigned_dev->guest_irq, 1); - else if (assigned_dev->irq_requested_type & - KVM_ASSIGNED_DEV_GUEST_MSI) { - assigned_device_msi_dispatch(assigned_dev); + kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, + assigned_dev->guest_irq, 1); + + if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_MSI) { enable_irq(assigned_dev->host_irq); assigned_dev->host_irq_disabled = false; } @@ -331,17 +273,15 @@ static int assigned_device_update_msi(struct kvm *kvm, { int r; + adev->guest_irq = airq->guest_irq; if (airq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSI) { /* x86 don't care upper address of guest msi message addr */ adev->irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_MSI; adev->irq_requested_type &= ~KVM_ASSIGNED_DEV_GUEST_INTX; - adev->guest_msi.address_lo = airq->guest_msi.addr_lo; - adev->guest_msi.data = airq->guest_msi.data; adev->ack_notifier.gsi = -1; } else if (msi2intx) { adev->irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_INTX; adev->irq_requested_type &= ~KVM_ASSIGNED_DEV_GUEST_MSI; - adev->guest_irq = airq->guest_irq; adev->ack_notifier.gsi = airq->guest_irq; } else { /* -- cgit v1.2.3 From 49cd7d2238e44f7ee4269481cd8a1261cc8f93a5 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 11 Feb 2009 13:50:40 +0800 Subject: KVM: VMX: Use kvm_mmu_page_fault() handle EPT violation mmio Removed duplicated code. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 509b35305402..cb27ffccf466 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3058,11 +3058,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { u64 exit_qualification; - enum emulation_result er; gpa_t gpa; - unsigned long hva; int gla_validity; - int r; exit_qualification = vmcs_read64(EXIT_QUALIFICATION); @@ -3085,32 +3082,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); - hva = gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT); - if (!kvm_is_error_hva(hva)) { - r = kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); - if (r < 0) { - printk(KERN_ERR "EPT: Not enough memory!\n"); - return -ENOMEM; - } - return 1; - } else { - /* must be MMIO */ - er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); - - if (er == EMULATE_FAIL) { - printk(KERN_ERR - "EPT: Fail to handle EPT violation vmexit!er is %d\n", - er); - printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", - (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), - (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS)); - printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", - (long unsigned int)exit_qualification); - return -ENOTSUPP; - } else if (er == EMULATE_DO_MMIO) - return 0; - } - return 1; + return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); } static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) -- cgit v1.2.3 From c807660407a695f390034e402edfe544a1d2e40c Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Wed, 4 Feb 2009 17:52:04 +0100 Subject: KVM: Fix kvmclock on !constant_tsc boxes kvmclock currently falls apart on machines without constant tsc. This patch fixes it. Changes: * keep tsc frequency in a per-cpu variable. * handle kvmclock update using a new request flag, thus checking whenever we need an update each time we enter guest context. * use a cpufreq notifier to track frequency changes and force kvmclock updates. * send ipis to kick cpu out of guest context if needed to make sure the guest doesn't see stale values. Signed-off-by: Gerd Hoffmann Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 103 ++++++++++++++++++++++++++++++++++++++++++----- include/linux/kvm_host.h | 1 + 2 files changed, 95 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8f83590b47dd..05d7be89b5eb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -617,6 +618,8 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info * hv_clock->tsc_to_system_mul); } +static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); + static void kvm_write_guest_time(struct kvm_vcpu *v) { struct timespec ts; @@ -627,9 +630,9 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) if ((!vcpu->time_page)) return; - if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) { - kvm_set_time_scale(tsc_khz, &vcpu->hv_clock); - vcpu->hv_clock_tsc_khz = tsc_khz; + if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) { + kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock); + vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz); } /* Keep irq disabled to prevent changes to the clock */ @@ -660,6 +663,16 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); } +static int kvm_request_guest_time_update(struct kvm_vcpu *v) +{ + struct kvm_vcpu_arch *vcpu = &v->arch; + + if (!vcpu->time_page) + return 0; + set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); + return 1; +} + static bool msr_mtrr_valid(unsigned msr) { switch (msr) { @@ -790,7 +803,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) vcpu->arch.time_page = NULL; } - kvm_write_guest_time(vcpu); + kvm_request_guest_time_update(vcpu); break; } default: @@ -1000,6 +1013,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: case KVM_CAP_SET_TSS_ADDR: case KVM_CAP_EXT_CPUID: + case KVM_CAP_CLOCKSOURCE: case KVM_CAP_PIT: case KVM_CAP_NOP_IO_DELAY: case KVM_CAP_MP_STATE: @@ -1025,9 +1039,6 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_IOMMU: r = iommu_found(); break; - case KVM_CAP_CLOCKSOURCE: - r = boot_cpu_has(X86_FEATURE_CONSTANT_TSC); - break; default: r = 0; break; @@ -1098,7 +1109,7 @@ out: void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { kvm_x86_ops->vcpu_load(vcpu, cpu); - kvm_write_guest_time(vcpu); + kvm_request_guest_time_update(vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -2642,9 +2653,72 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, } EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); +static void bounce_off(void *info) +{ + /* nothing */ +} + +static unsigned int ref_freq; +static unsigned long tsc_khz_ref; + +static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_freqs *freq = data; + struct kvm *kvm; + struct kvm_vcpu *vcpu; + int i, send_ipi = 0; + + if (!ref_freq) + ref_freq = freq->old; + + if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) + return 0; + if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) + return 0; + per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); + + spin_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) { + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + vcpu = kvm->vcpus[i]; + if (!vcpu) + continue; + if (vcpu->cpu != freq->cpu) + continue; + if (!kvm_request_guest_time_update(vcpu)) + continue; + if (vcpu->cpu != smp_processor_id()) + send_ipi++; + } + } + spin_unlock(&kvm_lock); + + if (freq->old < freq->new && send_ipi) { + /* + * We upscale the frequency. Must make the guest + * doesn't see old kvmclock values while running with + * the new frequency, otherwise we risk the guest sees + * time go backwards. + * + * In case we update the frequency for another cpu + * (which might be in guest context) send an interrupt + * to kick the cpu out of guest context. Next time + * guest context is entered kvmclock will be updated, + * so the guest will not see stale values. + */ + smp_call_function_single(freq->cpu, bounce_off, NULL, 1); + } + return 0; +} + +static struct notifier_block kvmclock_cpufreq_notifier_block = { + .notifier_call = kvmclock_cpufreq_notifier +}; + int kvm_arch_init(void *opaque) { - int r; + int r, cpu; struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; if (kvm_x86_ops) { @@ -2675,6 +2749,15 @@ int kvm_arch_init(void *opaque) kvm_mmu_set_base_ptes(PT_PRESENT_MASK); kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0, 0); + + for_each_possible_cpu(cpu) + per_cpu(cpu_tsc_khz, cpu) = tsc_khz; + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + tsc_khz_ref = tsc_khz; + cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + } + return 0; out: @@ -3010,6 +3093,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (vcpu->requests) { if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) __kvm_migrate_timers(vcpu); + if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) + kvm_write_guest_time(vcpu); if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) kvm_mmu_sync_roots(vcpu); if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 339eda3ca6ee..18b4df8264cf 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -37,6 +37,7 @@ #define KVM_REQ_PENDING_TIMER 5 #define KVM_REQ_UNHALT 6 #define KVM_REQ_MMU_SYNC 7 +#define KVM_REQ_KVMCLOCK_UPDATE 8 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 -- cgit v1.2.3 From fc5659c8c6b6c4e02ac354b369017c1bf231f347 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Feb 2009 14:08:58 +0100 Subject: KVM: MMU: handle compound pages in kvm_is_mmio_pfn The function kvm_is_mmio_pfn is called before put_page is called on a page by KVM. This is a problem when when this function is called on some struct page which is part of a compund page. It does not test the reserved flag of the compound page but of the struct page within the compount page. This is a problem when KVM works with hugepages allocated at boot time. These pages have the reserved bit set in all tail pages. Only the flag in the compount head is cleared. KVM would not put such a page which results in a memory leak. Signed-off-by: Joerg Roedel Acked-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- virt/kvm/kvm_main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 266bdaf0ce44..0ed662dc72d2 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -535,8 +535,10 @@ static inline int valid_vcpu(int n) inline int kvm_is_mmio_pfn(pfn_t pfn) { - if (pfn_valid(pfn)) - return PageReserved(pfn_to_page(pfn)); + if (pfn_valid(pfn)) { + struct page *page = compound_head(pfn_to_page(pfn)); + return PageReserved(page); + } return true; } -- cgit v1.2.3 From 6bed6b9e84bf5e6b468847a50c0751389fdd01d4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Feb 2009 14:08:59 +0100 Subject: KVM: MMU: remove redundant check in mmu_set_spte The following code flow is unnecessary: if (largepage) was_rmapped = is_large_pte(*shadow_pte); else was_rmapped = 1; The is_large_pte() function will always evaluate to one here because the (largepage && !is_large_pte) case is already handled in the first if-clause. So we can remove this check and set was_rmapped to one always here. Signed-off-by: Joerg Roedel Acked-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ef060ec444a4..c90b4b2f2abd 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1791,12 +1791,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, pgprintk("hfn old %lx new %lx\n", spte_to_pfn(*shadow_pte), pfn); rmap_remove(vcpu->kvm, shadow_pte); - } else { - if (largepage) - was_rmapped = is_large_pte(*shadow_pte); - else - was_rmapped = 1; - } + } else + was_rmapped = 1; } if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, dirty, largepage, global, gfn, pfn, speculative, true)) { -- cgit v1.2.3 From 452425dbaa1974e9fc489e64a8de46a47b4c2754 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Feb 2009 14:54:37 +0100 Subject: KVM: MMU: remove assertion in kvm_mmu_alloc_page The assertion no longer makes sense since we don't clear page tables on allocation; instead we clear them during prefetch. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c90b4b2f2abd..4a21b0f8491c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -802,7 +802,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, set_page_private(virt_to_page(sp->spt), (unsigned long)sp); list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&sp->oos_link); - ASSERT(is_empty_shadow_page(sp->spt)); bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); sp->multimapped = 0; sp->parent_pte = parent_pte; -- cgit v1.2.3 From 4925663a079c77d95d8685228ad6675fc5639c8e Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 4 Feb 2009 17:28:14 +0200 Subject: KVM: Report IRQ injection status to userspace. IRQ injection status is either -1 (if there was no CPU found that should except the interrupt because IRQ was masked or ioapic was misconfigured or ...) or >= 0 in that case the number indicates to how many CPUs interrupt was injected. If the value is 0 it means that the interrupt was coalesced and probably should be reinjected. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/ia64/kvm/kvm-ia64.c | 12 ++++++++++-- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/i8259.c | 18 +++++++++++++----- arch/x86/kvm/x86.c | 13 +++++++++++-- include/linux/kvm.h | 7 ++++++- include/linux/kvm_host.h | 4 ++-- virt/kvm/ioapic.c | 23 ++++++++++++++++------- virt/kvm/ioapic.h | 2 +- virt/kvm/irq_comm.c | 41 ++++++++++++++++++++++++++++------------- 9 files changed, 88 insertions(+), 34 deletions(-) diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 9c77e3939e97..076b00d1dbff 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -182,7 +182,7 @@ int kvm_dev_ioctl_check_extension(long ext) switch (ext) { case KVM_CAP_IRQCHIP: case KVM_CAP_MP_STATE: - + case KVM_CAP_IRQ_INJECT_STATUS: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -927,6 +927,7 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; } break; + case KVM_IRQ_LINE_STATUS: case KVM_IRQ_LINE: { struct kvm_irq_level irq_event; @@ -934,10 +935,17 @@ long kvm_arch_vm_ioctl(struct file *filp, if (copy_from_user(&irq_event, argp, sizeof irq_event)) goto out; if (irqchip_in_kernel(kvm)) { + __s32 status; mutex_lock(&kvm->lock); - kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, + status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irq_event.irq, irq_event.level); mutex_unlock(&kvm->lock); + if (ioctl == KVM_IRQ_LINE_STATUS) { + irq_event.status = status; + if (copy_to_user(argp, &irq_event, + sizeof irq_event)) + goto out; + } r = 0; } break; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 55fd4c5fd388..f0faf58044ff 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -616,7 +616,7 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, u32 error_code); -void kvm_pic_set_irq(void *opaque, int irq, int level); +int kvm_pic_set_irq(void *opaque, int irq, int level); void kvm_inject_nmi(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 93160375c841..b4e662e94ddc 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -77,12 +77,13 @@ void kvm_pic_clear_isr_ack(struct kvm *kvm) /* * set irq level. If an edge is detected, then the IRR is set to 1 */ -static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level) +static inline int pic_set_irq1(struct kvm_kpic_state *s, int irq, int level) { - int mask; + int mask, ret = 1; mask = 1 << irq; if (s->elcr & mask) /* level triggered */ if (level) { + ret = !(s->irr & mask); s->irr |= mask; s->last_irr |= mask; } else { @@ -91,11 +92,15 @@ static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level) } else /* edge triggered */ if (level) { - if ((s->last_irr & mask) == 0) + if ((s->last_irr & mask) == 0) { + ret = !(s->irr & mask); s->irr |= mask; + } s->last_irr |= mask; } else s->last_irr &= ~mask; + + return (s->imr & mask) ? -1 : ret; } /* @@ -172,16 +177,19 @@ void kvm_pic_update_irq(struct kvm_pic *s) pic_unlock(s); } -void kvm_pic_set_irq(void *opaque, int irq, int level) +int kvm_pic_set_irq(void *opaque, int irq, int level) { struct kvm_pic *s = opaque; + int ret = -1; pic_lock(s); if (irq >= 0 && irq < PIC_NUM_PINS) { - pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); + ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); pic_update_irq(s); } pic_unlock(s); + + return ret; } /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 05d7be89b5eb..e4db5be7c953 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1019,6 +1019,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_MP_STATE: case KVM_CAP_SYNC_MMU: case KVM_CAP_REINJECT_CONTROL: + case KVM_CAP_IRQ_INJECT_STATUS: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1877,6 +1878,7 @@ long kvm_arch_vm_ioctl(struct file *filp, create_pit_unlock: mutex_unlock(&kvm->lock); break; + case KVM_IRQ_LINE_STATUS: case KVM_IRQ_LINE: { struct kvm_irq_level irq_event; @@ -1884,10 +1886,17 @@ long kvm_arch_vm_ioctl(struct file *filp, if (copy_from_user(&irq_event, argp, sizeof irq_event)) goto out; if (irqchip_in_kernel(kvm)) { + __s32 status; mutex_lock(&kvm->lock); - kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, - irq_event.irq, irq_event.level); + status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, + irq_event.irq, irq_event.level); mutex_unlock(&kvm->lock); + if (ioctl == KVM_IRQ_LINE_STATUS) { + irq_event.status = status; + if (copy_to_user(argp, &irq_event, + sizeof irq_event)) + goto out; + } r = 0; } break; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 2163b3dd36e7..dd48225d1824 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -48,7 +48,10 @@ struct kvm_irq_level { * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47.. * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23.. */ - __u32 irq; + union { + __u32 irq; + __s32 status; + }; __u32 level; }; @@ -402,6 +405,7 @@ struct kvm_trace_rec { #ifdef __KVM_HAVE_IOAPIC #define KVM_CAP_IRQ_ROUTING 25 #endif +#define KVM_CAP_IRQ_INJECT_STATUS 26 #ifdef KVM_CAP_IRQ_ROUTING @@ -465,6 +469,7 @@ struct kvm_irq_routing { #define KVM_CREATE_PIT _IO(KVMIO, 0x64) #define KVM_GET_PIT _IOWR(KVMIO, 0x65, struct kvm_pit_state) #define KVM_SET_PIT _IOR(KVMIO, 0x66, struct kvm_pit_state) +#define KVM_IRQ_LINE_STATUS _IOWR(KVMIO, 0x67, struct kvm_irq_level) #define KVM_REGISTER_COALESCED_MMIO \ _IOW(KVMIO, 0x67, struct kvm_coalesced_mmio_zone) #define KVM_UNREGISTER_COALESCED_MMIO \ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 18b4df8264cf..894a56e365e8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -110,7 +110,7 @@ struct kvm_memory_slot { struct kvm_kernel_irq_routing_entry { u32 gsi; - void (*set)(struct kvm_kernel_irq_routing_entry *e, + int (*set)(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int level); union { struct { @@ -352,7 +352,7 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, struct kvm_irq_mask_notifier *kimn); void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask); -void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level); +int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level); void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_register_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian); diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 1c986ac59ad6..c3b99def9cbc 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -83,19 +83,22 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, return result; } -static void ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx) +static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx) { union ioapic_redir_entry *pent; + int injected = -1; pent = &ioapic->redirtbl[idx]; if (!pent->fields.mask) { - int injected = ioapic_deliver(ioapic, idx); + injected = ioapic_deliver(ioapic, idx); if (injected && pent->fields.trig_mode == IOAPIC_LEVEL_TRIG) pent->fields.remote_irr = 1; } if (!pent->fields.trig_mode) ioapic->irr &= ~(1 << idx); + + return injected; } static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) @@ -207,7 +210,7 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode; u32 deliver_bitmask; struct kvm_vcpu *vcpu; - int vcpu_id, r = 0; + int vcpu_id, r = -1; ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " "vector=%x trig_mode=%x\n", @@ -247,7 +250,9 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) deliver_bitmask &= ~(1 << vcpu_id); vcpu = ioapic->kvm->vcpus[vcpu_id]; if (vcpu) { - r = ioapic_inj_irq(ioapic, vcpu, vector, + if (r < 0) + r = 0; + r += ioapic_inj_irq(ioapic, vcpu, vector, trig_mode, delivery_mode); } } @@ -258,8 +263,10 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) continue; deliver_bitmask &= ~(1 << vcpu_id); vcpu = ioapic->kvm->vcpus[vcpu_id]; - if (vcpu) + if (vcpu) { ioapic_inj_nmi(vcpu); + r = 1; + } else ioapic_debug("NMI to vcpu %d failed\n", vcpu->vcpu_id); @@ -273,11 +280,12 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) return r; } -void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) +int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) { u32 old_irr = ioapic->irr; u32 mask = 1 << irq; union ioapic_redir_entry entry; + int ret = 1; if (irq >= 0 && irq < IOAPIC_NUM_PINS) { entry = ioapic->redirtbl[irq]; @@ -288,9 +296,10 @@ void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) ioapic->irr |= mask; if ((!entry.fields.trig_mode && old_irr != ioapic->irr) || !entry.fields.remote_irr) - ioapic_service(ioapic, irq); + ret = ioapic_service(ioapic, irq); } } + return ret; } static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int pin, diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index 49c9581d2586..a34bd5e6436b 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -83,7 +83,7 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, unsigned long bitmap); void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode); int kvm_ioapic_init(struct kvm *kvm); -void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); +int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); void kvm_ioapic_reset(struct kvm_ioapic *ioapic); u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, u8 dest_mode); diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 6bc7439eff6e..be8aba791554 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -29,22 +29,24 @@ #include "ioapic.h" -static void kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int level) +static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int level) { #ifdef CONFIG_X86 - kvm_pic_set_irq(pic_irqchip(kvm), e->irqchip.pin, level); + return kvm_pic_set_irq(pic_irqchip(kvm), e->irqchip.pin, level); +#else + return -1; #endif } -static void kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int level) +static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int level) { - kvm_ioapic_set_irq(kvm->arch.vioapic, e->irqchip.pin, level); + return kvm_ioapic_set_irq(kvm->arch.vioapic, e->irqchip.pin, level); } -static void kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int level) +static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int level) { int vcpu_id; struct kvm_vcpu *vcpu; @@ -88,13 +90,20 @@ static void kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, default: break; } + return 1; } -/* This should be called with the kvm->lock mutex held */ -void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) +/* This should be called with the kvm->lock mutex held + * Return value: + * < 0 Interrupt was ignored (masked or not delivered for other reasons) + * = 0 Interrupt was coalesced (previous irq is still pending) + * > 0 Number of CPUs interrupt was delivered to + */ +int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) { struct kvm_kernel_irq_routing_entry *e; unsigned long *irq_state, sig_level; + int ret = -1; if (irq < KVM_IOAPIC_NUM_PINS) { irq_state = (unsigned long *)&kvm->arch.irq_states[irq]; @@ -113,8 +122,14 @@ void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) * writes to the unused one. */ list_for_each_entry(e, &kvm->irq_routing, link) - if (e->gsi == irq) - e->set(e, kvm, sig_level); + if (e->gsi == irq) { + int r = e->set(e, kvm, sig_level); + if (r < 0) + continue; + + ret = r + ((ret < 0) ? 0 : ret); + } + return ret; } void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) @@ -232,7 +247,7 @@ int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e, e->set = kvm_set_pic_irq; break; case KVM_IRQCHIP_PIC_SLAVE: - e->set = kvm_set_pic_irq; + e->set = kvm_set_pic_irq; delta = 8; break; case KVM_IRQCHIP_IOAPIC: -- cgit v1.2.3 From 1fbdc7a58512a6283e10fd27108197679db95ffa Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Sun, 11 Jan 2009 22:39:44 +0100 Subject: KVM: SVM: set accessed bit for VMCB segment selectors In the segment descriptor _cache_ the accessed bit is always set (although it can be cleared in the descriptor itself). Since Intel checks for this condition on a VMENTRY, set this bit in the AMD path to enable cross vendor migration. Cc: stable@kernel.org Signed-off-by: Andre Przywara Acked-By: Amit Shah Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 22e88a4a51c7..1821c2078199 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -796,20 +796,37 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; - /* - * SVM always stores 0 for the 'G' bit in the CS selector in - * the VMCB on a VMEXIT. This hurts cross-vendor migration: - * Intel's VMENTRY has a check on the 'G' bit. - */ - if (seg == VCPU_SREG_CS) + switch (seg) { + case VCPU_SREG_CS: + /* + * SVM always stores 0 for the 'G' bit in the CS selector in + * the VMCB on a VMEXIT. This hurts cross-vendor migration: + * Intel's VMENTRY has a check on the 'G' bit. + */ var->g = s->limit > 0xfffff; - - /* - * Work around a bug where the busy flag in the tr selector - * isn't exposed - */ - if (seg == VCPU_SREG_TR) + break; + case VCPU_SREG_TR: + /* + * Work around a bug where the busy flag in the tr selector + * isn't exposed + */ var->type |= 0x2; + break; + case VCPU_SREG_DS: + case VCPU_SREG_ES: + case VCPU_SREG_FS: + case VCPU_SREG_GS: + /* + * The accessed bit must always be set in the segment + * descriptor cache, although it can be cleared in the + * descriptor, the cached bit always remains at 1. Since + * Intel has a check on this, set it here to support + * cross-vendor migration. + */ + if (!var->unusable) + var->type |= 0x1; + break; + } var->unusable = !var->present; } -- cgit v1.2.3 From c5bc22424021cabda862727fb3f5098b866f074d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 19 Feb 2009 12:18:56 +0100 Subject: KVM: MMU: Fix another largepage memory leak In the paging_fetch function rmap_remove is called after setting a large pte to non-present. This causes rmap_remove to not drop the reference to the large page. The result is a memory leak of that page. Cc: stable@kernel.org Signed-off-by: Joerg Roedel Acked-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 7314c0944c5f..0f11792fafa6 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -306,9 +306,9 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, continue; if (is_large_pte(*sptep)) { + rmap_remove(vcpu->kvm, sptep); set_shadow_pte(sptep, shadow_trap_nonpresent_pte); kvm_flush_remote_tlbs(vcpu->kvm); - rmap_remove(vcpu->kvm, sptep); } if (level == PT_DIRECTORY_LEVEL -- cgit v1.2.3 From 71450f78853b82d55cda4e182c9db6e26b631485 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 23 Feb 2009 12:57:11 +0200 Subject: KVM: Report IRQ injection status for MSI delivered interrupts Return number of CPUs interrupt was successfully injected into or -1 if none. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- virt/kvm/irq_comm.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index be8aba791554..c516d618d389 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -48,7 +48,7 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int level) { - int vcpu_id; + int vcpu_id, r = -1; struct kvm_vcpu *vcpu; struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); int dest_id = (e->msi.address_lo & MSI_ADDR_DEST_ID_MASK) @@ -73,7 +73,7 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector, deliver_bitmask); if (vcpu != NULL) - kvm_apic_set_irq(vcpu, vector, trig_mode); + r = kvm_apic_set_irq(vcpu, vector, trig_mode); else printk(KERN_INFO "kvm: null lowest priority vcpu!\n"); break; @@ -83,14 +83,17 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, continue; deliver_bitmask &= ~(1 << vcpu_id); vcpu = ioapic->kvm->vcpus[vcpu_id]; - if (vcpu) - kvm_apic_set_irq(vcpu, vector, trig_mode); + if (vcpu) { + if (r < 0) + r = 0; + r += kvm_apic_set_irq(vcpu, vector, trig_mode); + } } break; default: break; } - return 1; + return r; } /* This should be called with the kvm->lock mutex held -- cgit v1.2.3 From b0a1835d53c57bc38b36867c04436b60454cb610 Mon Sep 17 00:00:00 2001 From: Liu Yu Date: Tue, 17 Feb 2009 16:52:08 +0800 Subject: KVM: ppc: Add emulation of E500 register mmucsr0 Latest kernel flushes TLB via mmucsr0. Signed-off-by: Liu Yu Acked-by: Hollis Blanchard Signed-off-by: Avi Kivity --- arch/powerpc/kvm/e500_emulate.c | 8 ++++++++ arch/powerpc/kvm/e500_tlb.c | 16 ++++++++++++++++ arch/powerpc/kvm/e500_tlb.h | 1 + 3 files changed, 25 insertions(+) diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index 7a98d4a4c1ed..3f760414b9f8 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -105,6 +105,11 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) case SPRN_HID1: vcpu_e500->hid1 = vcpu->arch.gpr[rs]; break; + case SPRN_MMUCSR0: + emulated = kvmppc_e500_emul_mt_mmucsr0(vcpu_e500, + vcpu->arch.gpr[rs]); + break; + /* extra exceptions */ case SPRN_IVOR32: vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL] = vcpu->arch.gpr[rs]; @@ -172,6 +177,9 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) case SPRN_HID1: vcpu->arch.gpr[rt] = vcpu_e500->hid1; break; + case SPRN_MMUCSR0: + vcpu->arch.gpr[rt] = 0; break; + /* extra exceptions */ case SPRN_IVOR32: vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL]; diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index d437160d388c..72386ddbd9d5 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -391,6 +391,22 @@ static int kvmppc_e500_gtlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, return 0; } +int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value) +{ + int esel; + + if (value & MMUCSR0_TLB0FI) + for (esel = 0; esel < vcpu_e500->guest_tlb_size[0]; esel++) + kvmppc_e500_gtlbe_invalidate(vcpu_e500, 0, esel); + if (value & MMUCSR0_TLB1FI) + for (esel = 0; esel < vcpu_e500->guest_tlb_size[1]; esel++) + kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel); + + _tlbil_all(); + + return EMULATE_DONE; +} + int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500_tlb.h index ab49e9355185..4d5cc0f7d796 100644 --- a/arch/powerpc/kvm/e500_tlb.h +++ b/arch/powerpc/kvm/e500_tlb.h @@ -44,6 +44,7 @@ | E500_TLB_USER_PERM_MASK | E500_TLB_SUPER_PERM_MASK) extern void kvmppc_dump_tlbs(struct kvm_vcpu *); +extern int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *, ulong); extern int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *); extern int kvmppc_e500_emul_tlbre(struct kvm_vcpu *); extern int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *, int, int); -- cgit v1.2.3 From 2df8a40bccf5999261d0d3a82eac5a77678e61bd Mon Sep 17 00:00:00 2001 From: Weidong Han Date: Fri, 13 Feb 2009 10:50:56 +0800 Subject: KVM: define KVM_CAP_DEVICE_DEASSIGNMENT define KVM_CAP_DEVICE_DEASSIGNMENT and KVM_DEASSIGN_PCI_DEVICE for device deassignment. the ioctl has been already implemented in the commit: 0a920356748df4fb06e86c21c23d2ed6d31d37ad Acked-by: Mark McLoughlin Signed-off-by: Weidong Han Signed-off-by: Avi Kivity --- include/linux/kvm.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/kvm.h b/include/linux/kvm.h index dd48225d1824..0d94b274c3ae 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -406,6 +406,9 @@ struct kvm_trace_rec { #define KVM_CAP_IRQ_ROUTING 25 #endif #define KVM_CAP_IRQ_INJECT_STATUS 26 +#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT +#define KVM_CAP_DEVICE_DEASSIGNMENT 27 +#endif #ifdef KVM_CAP_IRQ_ROUTING @@ -480,6 +483,8 @@ struct kvm_irq_routing { #define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \ struct kvm_assigned_irq) #define KVM_REINJECT_CONTROL _IO(KVMIO, 0x71) +#define KVM_DEASSIGN_PCI_DEVICE _IOR(KVMIO, 0x72, \ + struct kvm_assigned_pci_dev) /* * ioctls for vcpu fds -- cgit v1.2.3 From 4a906e49f103c2e544148a209ba1db316510799f Mon Sep 17 00:00:00 2001 From: Weidong Han Date: Fri, 13 Feb 2009 17:27:51 +0800 Subject: KVM: fix kvm_vm_ioctl_deassign_device only need to set assigned_dev_id for deassignment, use match->flags to judge and deassign it. Acked-by: Mark McLoughlin Signed-off-by: Weidong Han Signed-off-by: Avi Kivity --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0ed662dc72d2..c4278975c8ca 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -517,7 +517,7 @@ static int kvm_vm_ioctl_deassign_device(struct kvm *kvm, goto out; } - if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) + if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) kvm_deassign_device(kvm, match); kvm_free_assigned_device(kvm, match); -- cgit v1.2.3 From 2fa8937f3af8873e006ce324e7b2d3f9b2077b87 Mon Sep 17 00:00:00 2001 From: Xiantao Zhang Date: Mon, 16 Feb 2009 15:14:48 +0800 Subject: ia64: Move the macro definitions related to MSI to one header file. For kvm's MSI support, it needs these macros defined in ia64_msi.c, and to avoid duplicate them, move them to one header file and share with kvm. Signed-off-by: Xiantao Zhang Acked-by: Tony Luck Signed-off-by: Avi Kivity --- arch/ia64/include/asm/msidef.h | 42 ++++++++++++++++++++++++++++++++ arch/ia64/kernel/msi_ia64.c | 55 +++++++----------------------------------- 2 files changed, 51 insertions(+), 46 deletions(-) create mode 100644 arch/ia64/include/asm/msidef.h diff --git a/arch/ia64/include/asm/msidef.h b/arch/ia64/include/asm/msidef.h new file mode 100644 index 000000000000..592c1047a0c5 --- /dev/null +++ b/arch/ia64/include/asm/msidef.h @@ -0,0 +1,42 @@ +#ifndef _IA64_MSI_DEF_H +#define _IA64_MSI_DEF_H + +/* + * Shifts for APIC-based data + */ + +#define MSI_DATA_VECTOR_SHIFT 0 +#define MSI_DATA_VECTOR(v) (((u8)v) << MSI_DATA_VECTOR_SHIFT) +#define MSI_DATA_VECTOR_MASK 0xffffff00 + +#define MSI_DATA_DELIVERY_MODE_SHIFT 8 +#define MSI_DATA_DELIVERY_FIXED (0 << MSI_DATA_DELIVERY_MODE_SHIFT) +#define MSI_DATA_DELIVERY_LOWPRI (1 << MSI_DATA_DELIVERY_MODE_SHIFT) + +#define MSI_DATA_LEVEL_SHIFT 14 +#define MSI_DATA_LEVEL_DEASSERT (0 << MSI_DATA_LEVEL_SHIFT) +#define MSI_DATA_LEVEL_ASSERT (1 << MSI_DATA_LEVEL_SHIFT) + +#define MSI_DATA_TRIGGER_SHIFT 15 +#define MSI_DATA_TRIGGER_EDGE (0 << MSI_DATA_TRIGGER_SHIFT) +#define MSI_DATA_TRIGGER_LEVEL (1 << MSI_DATA_TRIGGER_SHIFT) + +/* + * Shift/mask fields for APIC-based bus address + */ + +#define MSI_ADDR_DEST_ID_SHIFT 4 +#define MSI_ADDR_HEADER 0xfee00000 + +#define MSI_ADDR_DEST_ID_MASK 0xfff0000f +#define MSI_ADDR_DEST_ID_CPU(cpu) ((cpu) << MSI_ADDR_DEST_ID_SHIFT) + +#define MSI_ADDR_DEST_MODE_SHIFT 2 +#define MSI_ADDR_DEST_MODE_PHYS (0 << MSI_ADDR_DEST_MODE_SHIFT) +#define MSI_ADDR_DEST_MODE_LOGIC (1 << MSI_ADDR_DEST_MODE_SHIFT) + +#define MSI_ADDR_REDIRECTION_SHIFT 3 +#define MSI_ADDR_REDIRECTION_CPU (0 << MSI_ADDR_REDIRECTION_SHIFT) +#define MSI_ADDR_REDIRECTION_LOWPRI (1 << MSI_ADDR_REDIRECTION_SHIFT) + +#endif/* _IA64_MSI_DEF_H */ diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c index 890339339035..368ee4e5266d 100644 --- a/arch/ia64/kernel/msi_ia64.c +++ b/arch/ia64/kernel/msi_ia64.c @@ -7,44 +7,7 @@ #include #include #include - -/* - * Shifts for APIC-based data - */ - -#define MSI_DATA_VECTOR_SHIFT 0 -#define MSI_DATA_VECTOR(v) (((u8)v) << MSI_DATA_VECTOR_SHIFT) -#define MSI_DATA_VECTOR_MASK 0xffffff00 - -#define MSI_DATA_DELIVERY_SHIFT 8 -#define MSI_DATA_DELIVERY_FIXED (0 << MSI_DATA_DELIVERY_SHIFT) -#define MSI_DATA_DELIVERY_LOWPRI (1 << MSI_DATA_DELIVERY_SHIFT) - -#define MSI_DATA_LEVEL_SHIFT 14 -#define MSI_DATA_LEVEL_DEASSERT (0 << MSI_DATA_LEVEL_SHIFT) -#define MSI_DATA_LEVEL_ASSERT (1 << MSI_DATA_LEVEL_SHIFT) - -#define MSI_DATA_TRIGGER_SHIFT 15 -#define MSI_DATA_TRIGGER_EDGE (0 << MSI_DATA_TRIGGER_SHIFT) -#define MSI_DATA_TRIGGER_LEVEL (1 << MSI_DATA_TRIGGER_SHIFT) - -/* - * Shift/mask fields for APIC-based bus address - */ - -#define MSI_TARGET_CPU_SHIFT 4 -#define MSI_ADDR_HEADER 0xfee00000 - -#define MSI_ADDR_DESTID_MASK 0xfff0000f -#define MSI_ADDR_DESTID_CPU(cpu) ((cpu) << MSI_TARGET_CPU_SHIFT) - -#define MSI_ADDR_DESTMODE_SHIFT 2 -#define MSI_ADDR_DESTMODE_PHYS (0 << MSI_ADDR_DESTMODE_SHIFT) -#define MSI_ADDR_DESTMODE_LOGIC (1 << MSI_ADDR_DESTMODE_SHIFT) - -#define MSI_ADDR_REDIRECTION_SHIFT 3 -#define MSI_ADDR_REDIRECTION_CPU (0 << MSI_ADDR_REDIRECTION_SHIFT) -#define MSI_ADDR_REDIRECTION_LOWPRI (1 << MSI_ADDR_REDIRECTION_SHIFT) +#include static struct irq_chip ia64_msi_chip; @@ -65,8 +28,8 @@ static void ia64_set_msi_irq_affinity(unsigned int irq, read_msi_msg(irq, &msg); addr = msg.address_lo; - addr &= MSI_ADDR_DESTID_MASK; - addr |= MSI_ADDR_DESTID_CPU(cpu_physical_id(cpu)); + addr &= MSI_ADDR_DEST_ID_MASK; + addr |= MSI_ADDR_DEST_ID_CPU(cpu_physical_id(cpu)); msg.address_lo = addr; data = msg.data; @@ -98,9 +61,9 @@ int ia64_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *desc) msg.address_hi = 0; msg.address_lo = MSI_ADDR_HEADER | - MSI_ADDR_DESTMODE_PHYS | + MSI_ADDR_DEST_MODE_PHYS | MSI_ADDR_REDIRECTION_CPU | - MSI_ADDR_DESTID_CPU(dest_phys_id); + MSI_ADDR_DEST_ID_CPU(dest_phys_id); msg.data = MSI_DATA_TRIGGER_EDGE | @@ -183,8 +146,8 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DESTID_MASK; - msg.address_lo |= MSI_ADDR_DESTID_CPU(cpu_physical_id(cpu)); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID_CPU(cpu_physical_id(cpu)); dmar_msi_write(irq, &msg); irq_desc[irq].affinity = *mask; @@ -215,9 +178,9 @@ msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) msg->address_hi = 0; msg->address_lo = MSI_ADDR_HEADER | - MSI_ADDR_DESTMODE_PHYS | + MSI_ADDR_DEST_MODE_PHYS | MSI_ADDR_REDIRECTION_CPU | - MSI_ADDR_DESTID_CPU(dest); + MSI_ADDR_DEST_ID_CPU(dest); msg->data = MSI_DATA_TRIGGER_EDGE | -- cgit v1.2.3 From 6b08035f3e64d8e474be166d682b52c95941662e Mon Sep 17 00:00:00 2001 From: Xiantao Zhang Date: Mon, 16 Feb 2009 15:24:05 +0800 Subject: KVM: ia64: Fix the build errors due to lack of macros related to MSI. Include the newly introduced msidef.h to solve the build issues. Signed-off-by: Xiantao Zhang Signed-off-by: Avi Kivity --- arch/ia64/kvm/irq.h | 2 ++ virt/kvm/irq_comm.c | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/ia64/kvm/irq.h b/arch/ia64/kvm/irq.h index c6786e8b1bf4..c0785a728271 100644 --- a/arch/ia64/kvm/irq.h +++ b/arch/ia64/kvm/irq.h @@ -23,6 +23,8 @@ #ifndef __IRQ_H #define __IRQ_H +#include "lapic.h" + static inline int irqchip_in_kernel(struct kvm *kvm) { return 1; diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index c516d618d389..a70d805e0148 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -21,9 +21,7 @@ #include -#ifdef CONFIG_X86 #include -#endif #include "irq.h" -- cgit v1.2.3 From 401d10dee083bda281f2fdcdf654080313ba30ec Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Fri, 20 Feb 2009 22:53:37 +0530 Subject: KVM: VMX: Update necessary state when guest enters long mode setup_msrs() should be called when entering long mode to save the shadow state for the 64-bit guest state. Using vmx_set_efer() in enter_lmode() removes some duplicated code and also ensures we call setup_msrs(). We can safely pass the value of shadow_efer to vmx_set_efer() as no other bits in the efer change while enabling long mode (guest first sets EFER.LME, then sets CR0.PG which causes a vmexit where we activate long mode). With this fix, is_long_mode() can check for EFER.LMA set instead of EFER.LME and 5e23049e86dd298b72e206b420513dbc3a240cd9 can be reverted. Signed-off-by: Amit Shah Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 54 ++++++++++++++++++++++++------------------------------ 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index cb27ffccf466..48063a0aa243 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1430,6 +1430,29 @@ continue_rmode: init_rmode(vcpu->kvm); } +static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); + + vcpu->arch.shadow_efer = efer; + if (!msr) + return; + if (efer & EFER_LMA) { + vmcs_write32(VM_ENTRY_CONTROLS, + vmcs_read32(VM_ENTRY_CONTROLS) | + VM_ENTRY_IA32E_MODE); + msr->data = efer; + } else { + vmcs_write32(VM_ENTRY_CONTROLS, + vmcs_read32(VM_ENTRY_CONTROLS) & + ~VM_ENTRY_IA32E_MODE); + + msr->data = efer & ~EFER_LME; + } + setup_msrs(vmx); +} + #ifdef CONFIG_X86_64 static void enter_lmode(struct kvm_vcpu *vcpu) @@ -1444,13 +1467,8 @@ static void enter_lmode(struct kvm_vcpu *vcpu) (guest_tr_ar & ~AR_TYPE_MASK) | AR_TYPE_BUSY_64_TSS); } - vcpu->arch.shadow_efer |= EFER_LMA; - - find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME; - vmcs_write32(VM_ENTRY_CONTROLS, - vmcs_read32(VM_ENTRY_CONTROLS) - | VM_ENTRY_IA32E_MODE); + vmx_set_efer(vcpu, vcpu->arch.shadow_efer); } static void exit_lmode(struct kvm_vcpu *vcpu) @@ -1609,30 +1627,6 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vmcs_writel(GUEST_CR4, hw_cr4); } -static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); - - vcpu->arch.shadow_efer = efer; - if (!msr) - return; - if (efer & EFER_LMA) { - vmcs_write32(VM_ENTRY_CONTROLS, - vmcs_read32(VM_ENTRY_CONTROLS) | - VM_ENTRY_IA32E_MODE); - msr->data = efer; - - } else { - vmcs_write32(VM_ENTRY_CONTROLS, - vmcs_read32(VM_ENTRY_CONTROLS) & - ~VM_ENTRY_IA32E_MODE); - - msr->data = efer & ~EFER_LME; - } - setup_msrs(vmx); -} - static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; -- cgit v1.2.3 From 41d6af119206e98764b4ae6d264d63acefcf851e Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Thu, 28 Feb 2008 16:06:15 +0530 Subject: KVM: is_long_mode() should check for EFER.LMA is_long_mode currently checks the LongModeEnable bit in EFER instead of the LongModeActive bit. This is wrong, but we survived this till now since it wasn't triggered. This breaks guests that go from long mode to compatibility mode. This is noticed on a solaris guest and fixes bug #1842160 Signed-off-by: Amit Shah Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 258e5d56298e..eaab2145f62b 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -54,7 +54,7 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) static inline int is_long_mode(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 - return vcpu->arch.shadow_efer & EFER_LME; + return vcpu->arch.shadow_efer & EFER_LMA; #else return 0; #endif -- cgit v1.2.3 From d7364a29b305862cbc92b3f8b6ad9968270d068c Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Sat, 21 Feb 2009 02:18:13 +0100 Subject: KVM: fix sparse warnings: context imbalance Impact: Attribute function with __acquires(...) resp. __releases(...). Fix this sparse warnings: arch/x86/kvm/i8259.c:34:13: warning: context imbalance in 'pic_lock' - wrong count at exit arch/x86/kvm/i8259.c:39:13: warning: context imbalance in 'pic_unlock' - unexpected unlock Signed-off-by: Hannes Eder Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index b4e662e94ddc..1ccb50c74f18 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -32,11 +32,13 @@ #include static void pic_lock(struct kvm_pic *s) + __acquires(&s->lock) { spin_lock(&s->lock); } static void pic_unlock(struct kvm_pic *s) + __releases(&s->lock) { struct kvm *kvm = s->kvm; unsigned acks = s->pending_acks; -- cgit v1.2.3 From cded19f396bc3c9d299331709d0a9fbc6ecc148a Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Sat, 21 Feb 2009 02:19:13 +0100 Subject: KVM: fix sparse warnings: Should it be static? Impact: Make symbols static. Fix this sparse warnings: arch/x86/kvm/mmu.c:992:5: warning: symbol 'mmu_pages_add' was not declared. Should it be static? arch/x86/kvm/mmu.c:1124:5: warning: symbol 'mmu_pages_next' was not declared. Should it be static? arch/x86/kvm/mmu.c:1144:6: warning: symbol 'mmu_pages_clear_parents' was not declared. Should it be static? arch/x86/kvm/x86.c:2037:5: warning: symbol 'kvm_read_guest_virt' was not declared. Should it be static? arch/x86/kvm/x86.c:2067:5: warning: symbol 'kvm_write_guest_virt' was not declared. Should it be static? virt/kvm/irq_comm.c:220:5: warning: symbol 'setup_routing_entry' was not declared. Should it be static? Signed-off-by: Hannes Eder Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 11 ++++++----- arch/x86/kvm/x86.c | 8 ++++---- virt/kvm/irq_comm.c | 4 ++-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4a21b0f8491c..2a36f7f7c4c7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -989,8 +989,8 @@ struct kvm_mmu_pages { idx < 512; \ idx = find_next_bit(bitmap, 512, idx+1)) -int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, - int idx) +static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, + int idx) { int i; @@ -1121,8 +1121,9 @@ struct mmu_page_path { i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \ i = mmu_pages_next(&pvec, &parents, i)) -int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents, - int i) +static int mmu_pages_next(struct kvm_mmu_pages *pvec, + struct mmu_page_path *parents, + int i) { int n; @@ -1141,7 +1142,7 @@ int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents, return n; } -void mmu_pages_clear_parents(struct mmu_page_path *parents) +static void mmu_pages_clear_parents(struct mmu_page_path *parents) { struct kvm_mmu_page *sp; unsigned int level = 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e4db5be7c953..8ca100a9ecac 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2043,8 +2043,8 @@ static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, return dev; } -int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu) +static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu) { void *data = val; int r = X86EMUL_CONTINUE; @@ -2073,8 +2073,8 @@ out: return r; } -int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu) +static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu) { void *data = val; int r = X86EMUL_CONTINUE; diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index a70d805e0148..864ac5483baa 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -233,8 +233,8 @@ void kvm_free_irq_routing(struct kvm *kvm) __kvm_free_irq_routing(&kvm->irq_routing); } -int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e, - const struct kvm_irq_routing_entry *ue) +static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue) { int r = -EINVAL; int delta; -- cgit v1.2.3 From 36463146ffb7eee4582ed785a8c8be213b8ed110 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Mon, 16 Mar 2009 16:33:43 +0800 Subject: KVM: Get support IRQ routing entry counts In capability probing ioctl. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c4278975c8ca..605697e9c4dd 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1988,7 +1988,7 @@ static long kvm_dev_ioctl_check_extension_generic(long arg) return 1; #ifdef CONFIG_HAVE_KVM_IRQCHIP case KVM_CAP_IRQ_ROUTING: - return 1; + return KVM_MAX_IRQ_ROUTES; #endif default: break; -- cgit v1.2.3 From 4539b35881ae9664b0e2953438dd83f5ee02c0b4 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 12 Mar 2009 18:18:43 +0100 Subject: KVM: Fix missing smp tlb flush in invlpg When kvm emulates an invlpg instruction, it can drop a shadow pte, but leaves the guest tlbs intact. This can cause memory corruption when swapping out. Without this the other cpu can still write to a freed host physical page. tlb smp flush must happen if rmap_remove is called always before mmu_lock is released because the VM will take the mmu_lock before it can finally add the page to the freelist after swapout. mmu notifier makes it safe to flush the tlb after freeing the page (otherwise it would never be safe) so we can do a single flush for multiple sptes invalidated. Cc: stable@kernel.org Signed-off-by: Andrea Arcangeli Acked-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 0f11792fafa6..6bd70206c561 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -446,6 +446,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) gpa_t pte_gpa = -1; int level; u64 *sptep; + int need_flush = 0; spin_lock(&vcpu->kvm->mmu_lock); @@ -465,6 +466,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) rmap_remove(vcpu->kvm, sptep); if (is_large_pte(*sptep)) --vcpu->kvm->stat.lpages; + need_flush = 1; } set_shadow_pte(sptep, shadow_trap_nonpresent_pte); break; @@ -474,6 +476,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) break; } + if (need_flush) + kvm_flush_remote_tlbs(vcpu->kvm); spin_unlock(&vcpu->kvm->mmu_lock); if (pte_gpa == -1) -- cgit v1.2.3 From bc35cbc85cd78213590761618a13da6a9707652c Mon Sep 17 00:00:00 2001 From: Liu Yu Date: Tue, 17 Mar 2009 16:57:45 +0800 Subject: KVM: ppc: e500: Fix the bug that mas0 update to wrong value when read TLB entry Should clear and then update the next victim area here. Guest kernel only read TLB1 when startup kernel, this bug result in an extra 4K TLB1 mapping in guest from 0x0 to 0x0. As the problem has no impact to bootup a guest, we didn't notice it before. Signed-off-by: Liu Yu Signed-off-by: Avi Kivity --- arch/powerpc/kvm/e500_tlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index 72386ddbd9d5..ec933209e8af 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -448,7 +448,7 @@ int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu) esel = get_tlb_esel(vcpu_e500, tlbsel); gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel]; - vcpu_e500->mas0 &= MAS0_NV(0); + vcpu_e500->mas0 &= ~MAS0_NV(~0); vcpu_e500->mas0 |= MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]); vcpu_e500->mas1 = gtlbe->mas1; vcpu_e500->mas2 = gtlbe->mas2; -- cgit v1.2.3 From 046a48b35baa7c66d0d0331256ba12ca51665411 Mon Sep 17 00:00:00 2001 From: Liu Yu Date: Tue, 17 Mar 2009 16:57:46 +0800 Subject: KVM: ppc: e500: Fix the bug that KVM is unstable in SMP TLB entry should enable memory coherence in SMP. And like commit 631fba9dd3aca519355322cef035730609e91593, remove guard attribute to enable the prefetch of guest memory. Signed-off-by: Liu Yu Signed-off-by: Avi Kivity --- arch/powerpc/kvm/e500_tlb.c | 4 ++++ arch/powerpc/kvm/e500_tlb.h | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index ec933209e8af..0e773fc2d5e4 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -99,7 +99,11 @@ static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode) static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode) { +#ifdef CONFIG_SMP + return (mas2 & MAS2_ATTRIB_MASK) | MAS2_M; +#else return mas2 & MAS2_ATTRIB_MASK; +#endif } /* diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500_tlb.h index 4d5cc0f7d796..45b064b76906 100644 --- a/arch/powerpc/kvm/e500_tlb.h +++ b/arch/powerpc/kvm/e500_tlb.h @@ -38,7 +38,7 @@ #define E500_TLB_USER_PERM_MASK (MAS3_UX|MAS3_UR|MAS3_UW) #define E500_TLB_SUPER_PERM_MASK (MAS3_SX|MAS3_SR|MAS3_SW) #define MAS2_ATTRIB_MASK \ - (MAS2_X0 | MAS2_X1 | MAS2_W | MAS2_I | MAS2_M | MAS2_G | MAS2_E) + (MAS2_X0 | MAS2_X1) #define MAS3_ATTRIB_MASK \ (MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3 \ | E500_TLB_USER_PERM_MASK | E500_TLB_SUPER_PERM_MASK) -- cgit v1.2.3 From bc7a8660df62da3fb5cad025322eda75fbee8731 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 17 Mar 2009 19:27:19 +0800 Subject: KVM: Correct deassign device ioctl to IOW It's IOR by mistake, so fix it before release. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- include/linux/kvm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 0d94b274c3ae..311a073afe8a 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -483,7 +483,7 @@ struct kvm_irq_routing { #define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \ struct kvm_assigned_irq) #define KVM_REINJECT_CONTROL _IO(KVMIO, 0x71) -#define KVM_DEASSIGN_PCI_DEVICE _IOR(KVMIO, 0x72, \ +#define KVM_DEASSIGN_PCI_DEVICE _IOW(KVMIO, 0x72, \ struct kvm_assigned_pci_dev) /* -- cgit v1.2.3 From 16175a796d061833aacfbd9672235f2d2725df65 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 23 Mar 2009 22:13:44 +0200 Subject: KVM: VMX: Don't allow uninhibited access to EFER on i386 vmx_set_msr() does not allow i386 guests to touch EFER, but they can still do so through the default: label in the switch. If they set EFER_LME, they can oops the host. Fix by having EFER access through the normal channel (which will check for EFER_LME) even on i386. Reported-and-tested-by: Benjamin Gilbert Cc: stable@kernel.org Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 48063a0aa243..bb481330716f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -936,11 +936,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) int ret = 0; switch (msr_index) { -#ifdef CONFIG_X86_64 case MSR_EFER: vmx_load_host_state(vmx); ret = kvm_set_msr_common(vcpu, msr_index, data); break; +#ifdef CONFIG_X86_64 case MSR_FS_BASE: vmcs_writel(GUEST_FS_BASE, data); break; -- cgit v1.2.3